def main():
    p1 = Agent()
    p2 = Agent()
    e = Env()
    p1.setSymbol(e.x)
    p2.setSymbol(e.o)
    p1.setV(e.initValues(p1.symbol))
    p2.setV(e.initValues(p2.symbol))

    for i in range(10000):
        if i % 1000 == 0:
            print("epoch: {}".format(i))
        play_game(p1, p2, Env())
    print("Training Complete")

    human = Human()
    human.set_symbol(e.o)
    p1.verbose = True
    while True:
        play_game(p1, human, Env(), draw=True)
        answer = input("Play again? [y/n]: ")
        if answer and answer.lower()[0] == 'n':
            break
def env():
    config = ConfigParser()
    config.read(["data.ini"])
    # Option 1: read via an environment variable. Set the "env" environment variable to the
    # name of a section in data.ini (e.g. env=test_env), then look up api_root_url there.
    api_root_url = config[os.environ['env']]['api_root_url']
    # Option 2: read the section directly
    # api_root_url = config['test_env']['api_root_url']
    yield Env(api_root_url=api_root_url,
              username=os.environ["username"],
              password=os.environ["password"])
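# A minimal sketch of the configuration the fixture above assumes (the section and key
# names here are illustrative, not copied from the real data.ini):
#
#   # data.ini
#   [test_env]
#   api_root_url = https://example.com/api
#
# The selector and credentials are then supplied through environment variables before the
# tests run, e.g. env=test_env, username=tester, password=secret.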
def testEnv():
    env = Env()
    channelThroughPut = 0
    # fraction of time that packets are successfully delivered over the channel,
    # i.e. no collisions or idle time slots
    for iteration in range(config.Iterations):
        for t in range(config.TimeSlots):
            initialState = env.reset()
            for user in range(config.N):
                action = slottedAlohaProtocol()
                env.step(action=action, user=user)
                # each user changes the inner state of the environment; the environment uses
                # that inner state to keep track of the channels and the ACK signal for each user
            nextStateForEachUser, rewardForEachUser = env.getNextState()
            # a reward of one means a packet was successfully delivered over the channel;
            # the sum is bounded by the number of channels -> config.K
            channelThroughPut = channelThroughPut + np.sum(rewardForEachUser)
    # measuring the expected value
    channelThroughPut = channelThroughPut / (config.Iterations * config.TimeSlots)
    print("Channel Utilization average {}".format(channelThroughPut))
    ToPlotX = range(config.Iterations * config.TimeSlots)
    ToPlotY = np.ones_like(ToPlotX) * channelThroughPut
    plot_graph(data=[ToPlotX, ToPlotY], filename="Aloha", title="Aloha",
               xlabel="Time slot", ylabel="Average channel utilization", legend="SlottedAloha")


# def testTimeEnv():
#     env = TimeDependentEnv()
#     channelThroughPut = 0
#     # fraction of time that packets are successfully delivered over the channel,
#     # i.e. no collisions or idle time slots
#     for iteration in range(config.Iterations):
#         TimeSPU = env.reset()
#         for t in range(config.TimeSlots):
#             env.resetTimeStep()
#             # reset the internal state of the environment, which keeps track of the
#             # users' actions throughout the time step
#             for user in range(config.N):
#                 action = slottedAlohaProtocol()
#                 env.step(action=action, user=user)
#                 # each user changes the inner state of the environment; the environment uses
#                 # that inner state to keep track of the channels and the ACK signal for each user
#             nextStateForEachUser, rewardForEachUser = env.tstep(timestep=t)
#             # a reward of one means a packet was successfully delivered over the channel;
#             # the sum is bounded by the number of channels -> config.K
#             channelThroughPut = channelThroughPut + np.sum(rewardForEachUser)
#     # measuring the expected value
#     channelThroughPut = channelThroughPut / (config.Iterations * config.TimeSlots)
#     print("Channel Utilization average {}".format(channelThroughPut))
#     ToPlotX = range(config.Iterations * config.TimeSlots)
#     ToPlotY = np.ones_like(ToPlotX) * channelThroughPut
#     plot_graph(data=[ToPlotX, ToPlotY], filename="Aloha", title="Aloha",
#                xlabel="Time slot", ylabel="Average channel utilization", legend="SlottedAloha")
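# Reference point for the measurement above (this assumes something about
# slottedAlohaProtocol(), which is not shown here): if each of the config.N users transmits
# independently with probability p in a slot, the expected per-slot throughput of a single
# slotted-Aloha channel is
#
#     N * p * (1 - p) ** (N - 1)
#
# which peaks at p = 1/N and tends to 1/e ≈ 0.37 for large N, so the printed
# "Channel Utilization average" can be sanity-checked against that value (times config.K
# when several channels are available).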
def test_agents():
    result = np.zeros([5, 5])
    maps, trials_per_map = 10, 10
    ave_cost_1 = []
    ave_cost_2 = []
    ave_cost_3 = []
    for j in range(maps):
        en = Env(50)
        cost_1 = []
        cost_2 = []
        cost_3 = []
        # cost_4 = []
        for k in range(trials_per_map):
            print(f'map: {j + 1}/{maps}, play: {k + 1}/{trials_per_map}')
            # en.set_target_on_type(i)
            en.set_target()
            en.print_target()

            agent_1 = Agent(en)
            searches_1, distance_1 = agent_1.run(1, False)
            sum_1 = searches_1 + distance_1
            cost_1.append(sum_1)

            agent_2 = Agent(en)
            searches_2, distance_2 = agent_2.run(2, False)
            sum_2 = searches_2 + distance_2
            cost_2.append(sum_2)

            agent_3 = Agent(en)
            searches_3, distance_3 = agent_3.run_improved(10000)
            sum_3 = searches_3 + distance_3
            cost_3.append(sum_3)

        ave_cost_1.append(sum(cost_1) / len(cost_1))
        ave_cost_2.append(sum(cost_2) / len(cost_2))
        ave_cost_3.append(sum(cost_3) / len(cost_3))

    result[0][1] = sum(ave_cost_1) / len(ave_cost_1)
    result[0][2] = sum(ave_cost_2) / len(ave_cost_2)
    result[0][3] = sum(ave_cost_3) / len(ave_cost_3)
    print(result)
        self.model.load_weights('model.h5')
        print("The model loaded")

    def update_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon_decay * self.epsilon)

    def learning_rate_decay(self):
        lr = self.optimizer.lr.numpy()
        lr = max(self.lr_decay * lr, 0.001)
        self.optimizer.lr.assign(lr)


if __name__ == "__main__":
    # create environment
    env = Env()
    agent = DQNAgent()

    total_scores = np.empty(EPISODES)
    iteration = 0
    for e in range(EPISODES):
        state = env.reset()
        check_list = env.check_if_reward(state)
        goal = check_list['if_goal']      # done
        wumpus = check_list['if_wumpus']  # done
        losses = []
        score = 0  # done

        while (not goal) and (not wumpus):
'''
Created on Sep 6, 2018

@author: dabrown
'''

from Cutie_Network import Cutie
from Environment import Env

if __name__ == '__main__':
    # first init the Env
    env = Env()
    cutie = Cutie(env)

    cutie.train_nework(env)
import pandas as pd
from tqdm import tqdm
from collections import namedtuple

from Environment import State, Env

StateVars = namedtuple('state_vars', ['curr_state', 'prev_state_hash', 'reward'])

need = pd.read_csv('../fake_4region_trip_20170510.csv')
# dist = pd.read_csv('fake_4region_distance.csv')
# dist = dist.values

eps_num = 4
car_num = 1
env = Env(initial_region_state=[15, 15, 15, 15], capacity_each_step=10,
          max_episode=eps_num, car_count=car_num, need=need)

history = {i: dict() for i in range(8)}

for region in range(env.region_count):
    state = env.new_state()
    curr_state_hash = state.get_hash()
    state.out_stage()
    for car in range(env.car_num):
        for move in range(-env.capacity_each_step, env.capacity_each_step + 1):
            if state.check_feasible(region, car, move):
                new_state = state.step(region, car, move)
                new_state.in_stage()
                new_state_hash = new_state.get_hash()
            resultLeaf = resultLeaf.left
        if not temp.isLeafNode():
            addNextStep(temp, resultLeaf)

    addNextStep(root, resultLeaf)
    totalReward = numpy.sum(rewards)
    print('The optimal way to traverse the tree with a total reward of '
          + str(totalReward) + ' would be:')
    print(result)


if __name__ == '__main__':
    environment = Env()
    alpha = 0.1
    gamma = 0.6
    epsilon = 0.3
    number_of_episodes = 10000

    q_table = numpy.zeros([environment.normalizedtree.getNumberOfNodes(), 2])

    for i in range(1, number_of_episodes):
        state = environment.reset()
        visited_states = []
        penalties, reward = 0, 0
        done = False
def empezarPrueba():
    env = Env()
    env.width = 10
    env.height = 6
    env.posY = 6

    # Q-table: holds the Q-value for each (state, action) pair
    qtable = np.random.rand(env.stateCount, env.actionCount).tolist()

    epochs = 100  # number of iterations the algorithm will run
    gamma = 0.8   # discount factor that shrinks rewards exponentially as actions accumulate
    epsilon = 0.1
    decay = 0.1

    print("Initial map")
    # Build the map for this particular problem
    env.crearMapaPrueba()

    for i in range(epochs):
        # Reset the environment at the start of each iteration
        state, reward, done = env.reset()
        steps = 0

        # Keep taking actions and moving between states until the proposed final state is reached
        while not done:
            print("epoch #", i + 1, "/", epochs)
            time.sleep(0.05)

            # Draw the new position of the A on the map
            env.modificaMapa(i + 1)

            # Count the steps taken until reaching the goal
            steps += 1

            # If epsilon exceeds the randomly drawn number, take a random action
            if np.random.uniform() < epsilon:
                action = env.randomAction()
            # Otherwise choose the action with the largest value in the table
            else:
                action = qtable[state].index(max(qtable[state]))

            # Compute the next state, the reward obtained, and whether the episode is over
            next_state, reward, done = env.step(action)

            # Make the reward smaller on every step
            reward = reward - (steps * 0.3)

            # Update the Q-table with the values of the Bellman equation
            pos = reward * (gamma**steps / 2) + 0.9 * max(qtable[next_state])
            qtable[state][action] = pos

            # When the episode ends, draw the A at the final position
            if done:
                env.fin(i + 1)
            # Show the final map in Tkinter
            if done and i + 1 == epochs:
                time.sleep(30)
                tk.mainloop()

            # Update the state
            state = next_state

        # Epsilon decays each iteration so the algorithm makes fewer random choices
        epsilon -= decay * epsilon

        print("\nDone in", steps, "steps")
        time.sleep(0.8)
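# For comparison, the textbook Q-learning update with a learning rate alpha would be
# (alpha is not defined in the snippet above, so this is only a reference sketch):
#
#   qtable[state][action] += alpha * (reward + gamma * max(qtable[next_state])
#                                     - qtable[state][action])
#
# The update above instead scales the reward by gamma**steps / 2 and weights the bootstrap
# term by a fixed 0.9, which is a custom variant of that rule.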
import gym
import numpy as np

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, merge
from keras.optimizers import Adam

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

from Environment import Env, status, actions

import matplotlib.pyplot as plt

gym.undo_logger_setup()

ENV_NAME = 'SQ'

env = Env()
np.random.seed(123)
env.seed(123)
assert len(env.action_space.shape) == 1
nb_actions = env.action_space.shape[0]

actor_depth = 4
actor_width = 32
critic_depth = 6
critic_width = 64

# Next, we build a very simple model.
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
for k in range(actor_depth):
        # and do the model fit
        pass

    # save the model which is under training
    def save_model(self):
        self.model.save_weights('model.h5')

    # load the saved model
    def load_model(self):
        self.model.load_weights('model.h5')


if __name__ == "__main__":
    # create environment
    env = Env()
    agent = DQNAgent()

    # code
    for e in range(EPISODES):
        state = env.reset()
        # code
        while (not goal) and (not wumpus):
            if agent.render:
                env.render()
            # code
import gym
import numpy as np

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess

from Environment import Env

ENV_NAME = 'Pendulum-v0'
gym.undo_logger_setup()

# Get the environment and extract the number of actions.
# env = gym.make(ENV_NAME)
env = Env()
np.random.seed(123)
env.seed(123)
assert len(env.action_space.shape) == 1
nb_actions = env.action_space.shape[0]

# Next, we build a very simple model.
actor = Sequential()
actor.add(Flatten(input_shape=(1,) + env.observation_space.shape))
actor.add(Dense(16))
actor.add(Activation('relu'))
actor.add(Dense(16))
actor.add(Activation('relu'))
actor.add(Dense(16))
actor.add(Activation('relu'))
    utility_next = utility_matrix[next_state]
    delta = reward + gamma * utility_next - utility
    utility_matrix[state] += alpha * delta
    return utility_matrix, delta


def updateActor(state_action_matrix, state, action, delta):
    beta = 1
    state_action_matrix[state, action] += beta * delta
    return state_action_matrix


if __name__ == '__main__':
    environment = Env()
    alpha = 0.1
    gamma = 0.99
    epsilon = 0.01
    number_of_episodes = 10000

    state_action_pairs = numpy.full(
        (environment.normalizedtree.getNumberOfNodes(), 2), 0.5)
    utility_matrix = numpy.zeros(
        [environment.normalizedtree.getNumberOfNodes()])
    softmax = lambda vals: numpy.exp(vals - numpy.max(vals)) / numpy.sum(
        numpy.exp(vals - numpy.max(vals)))
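# A rough sketch of how the two updates above are usually wired together inside an episode
# loop. The critic update's name and exact signature are truncated above, so updateCritic(...)
# is an assumed placeholder rather than this project's real function, and the environment's
# step interface is likewise assumed here:
#
#   probs = softmax(state_action_pairs[state, :])
#   action = numpy.random.choice(2, p=probs)
#   next_state, reward, done = environment.step(action)
#   utility_matrix, delta = updateCritic(utility_matrix, state, next_state, reward)
#   state_action_pairs = updateActor(state_action_pairs, state, action, delta)
#   state = next_state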
__author__ = 'Maira'

from Environment import Env
from Learning import Example, Effect
from OIlogic import AtomSet, Atom, Term

env = Env(4)
model = env.get_model()
state = env.generateState()
actions = env.getAllActions()

a, b, c, d, floor = Term('a'), Term('b'), Term('c'), Term('d'), Term('floor')

ex_1 = Example(state, actions[1],
               Effect(AtomSet([Atom("ON\\2", [a, b])]),
                      AtomSet([Atom("ON\\2", [a, floor])])))
ex_2 = Example(state, actions[11],
               Effect(AtomSet([Atom("ON\\2", [c, d])]),
                      AtomSet([Atom("ON\\2", [c, floor])])))

print(ex_1)
print(ex_2)

model.memorizeEx(ex_1)
model.memorizeEx(ex_2)

print(model)

rules = model.get_rules()
for r in rules:
    s = model.specialize(r)
    print(s)
    uex = model.getUncovEx(r)
    print('Examples:')
    for e in uex:
        print('\t' + str(e))
    c = model.contradicted(r)
    print(c)

exs = model.get_exMem()
import random

from Environment import MetaEnvironment as Env

f = open("query.txt")
queryList = []
for line in f.readlines():
    line = line.strip()
    queryList.append(line)

env = Env(5)
for i in range(5):
    traceList = queryList[i * 10:(i + 1) * 10]
    state = env.state(traceList)
    print(state)
    moveList = [
        random.randint(0, env.server_num - 1) for _ in range(len(env.nodes))
    ]
    env.take_actions(moveList)
    print('Loc:', env.locality())
    print('Load:', env.load())
import numpy as np

from Agent import Agent
from utils import plotLearning
from Environment import Env

if __name__ == '__main__':
    env = Env()
    num_games = 250
    load_checkpoint = False

    agent = Agent(gamma=0.99, epsilon=1.0, lr=5e-4,
                  input_dims=[8], n_actions=4, mem_size=100000,
                  eps_min=0.01, batch_size=64, eps_dec=1e-3, replace=100)

    if load_checkpoint:
        agent.load_models()

    filename = 'DDQN.png'
    scores = []
    eps_history = []
    n_steps = 0

    for i in range(num_games):
        done = False
def transcate_PG(self):
    total_steps = 0           # step counter; one day is one step
    profit_list = []          # total profit of each game
    profitAdvanced_list = []
    actions = 2               # number of actions
    brain = PolicyGradient(
        n_actions=2,
        n_features=87,
        learning_rate=0.1,
        reward_decay=1,
    )
    gameNum = 0               # number of games played
    ex_steps = 500            # number of rounds over which exploration decays
    epsilon = self.epsilon
    last_remainder = 0
    reward_list = [0]         # store each episode's profit, used to compute the baseline
    Loss_list = []            # store the loss values during training
    wait_list = []            # record the number of waiting days
    gameSplit = 500           # plot every this many games

    while total_steps < 60000:
        # initialize the game
        # routeId = random.randrange(0, 49, 1)
        routeId = 21
        self.routeline = self.allRoute[routeId]
        # print(self.routeline)
        env = Env(self.routeline)
        gameNum += 1
        # state = env.getState()  # accessed as state[0], state[1]
        today = env.getToday()
        terminal = False
        order_accepted = False
        isExploration = False
        create_date = 1
        end_date = 0
        stay_num = 0

        # one game
        # print("GAME#:", gameNum)
        baseline = 0
        tao_prob = []
        tao_reward = 0
        wait_day = []
        while today < self.routeline[-1] and terminal == False:
            # a new order arrives (no new orders are received here once there are already 10 orders)
            if order_accepted == False:
                self.orderSelect(self.routeline, 60)
                # print(self.order)
                env.setOrder(self.order)
                order_accepted = True
                # print(self.order[1])

            # iterate over the self.orders dict (i.e. state[0]) and operate on each order
            state = env.getState()  # current state
            state_tf = np.mat(state)
            # print(state_tf, len(state_tf))

            # let the neural network choose the action
            if random.random() < epsilon and isExploration == False:
                isExploration = True
                end_date = random.randrange(env.getTodayIndex(), 87, 1)
                # end_date = 60
            if isExploration:
                if env.getTodayIndex() == end_date:
                    action_model = 1
                    if ex_steps > 0:
                        ex_steps -= 1
                else:
                    action_model = 0
            else:
                # action from learning
                action_model, p = brain.choose_action(state_tf, env.getTodayIndex())
                tao_prob.append(p)

            if action_model == 0:
                action_finishOrder = [1, 0]
            else:
                action_finishOrder = [0, 1]

            # order dict, history curve, reward
            reward = env.getReward(action_model)
            # the order is completed or the last day is reached
            terminal = env.isTerminal(action_model)
            if terminal:
                tmp = reward
                baseline = np.mean(reward_list)
                profitAdvanced_list.append(baseline)
                reward -= baseline
                reward_list.append(tmp)
                # print("END_REWARD:", reward, ",reward_list:", reward_list)

            # save the transition to the memory buffer
            # print("this is store arg:", state_tf, ";", action_model, ";", reward, ";", env.getTodayIndex())
            brain.store_transition(state_tf, action_model, reward, env.getTodayIndex())
            # print(action_model)

            total_steps += 1
            if terminal:
                loss, wait_day, tao_reward = brain.learn()
                Loss_list.append(loss)
                wait_list.append(wait_day[-1])
                break

            # step: advance one day
            env.nextStep()

        # total profit for the episode
        epsilon = self.epsilon * (ex_steps / 500)
        print("epsilon:", epsilon)
        print("Baseline:", baseline)
        profit = env.getTotalReward()
        profit_list.append(profit)
        print("total_steps:", total_steps)
        print("profit_list", profit_list)
        print("profit:", profit, "profitAvg:", np.mean(profit_list))
        print("action-prob:", tao_prob)
        print("Reward:", tao_reward)
        print("wait_day:", wait_day)
        self.writeHistory('./picture/history.txt', epsilon, baseline, total_steps,
                          profit_list, profit, tao_prob, tao_reward, wait_day, gameNum)
        print("########################" + str(gameNum) + "###########################")

        if len(profit_list) >= gameSplit:
            plt.figure()
            plt.plot(profit_list, 'r-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_profit_PG.jpg')
            plt.figure()
            plt.scatter(np.arange(gameSplit), profit_list)
            plt.savefig('./picture/' + str(gameNum) + 'scatter_profit_PG.jpg')
            plt.figure()
            plt.plot(profitAdvanced_list, 'g-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_advanced_PG.jpg')
            plt.figure()
            plt.plot(Loss_list, 'y-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_loss_PG.jpg')
            plt.figure()
            plt.scatter(np.arange(gameSplit), wait_list, c='r')
            plt.savefig('./picture/' + str(gameNum) + 'scatter_waitDay_PG.jpg')
            profit_list.clear()
            wait_list.clear()
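# Note on the baseline used above: subtracting the running mean of past episode returns
# (reward -= np.mean(reward_list)) before storing the final transition is the standard
# variance-reduction trick for REINFORCE-style policy gradients,
#
#     grad J ≈ sum_t grad log pi(a_t | s_t) * (R_t - b),
#
# where any baseline b that does not depend on the chosen action leaves the gradient unbiased.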
        self.nb_episodes_random = 100
        self.nb_episodes = 100
        self.batch_size = 64
        self.mission_file = './maze.xml'
        self.memory_capacity = 100000
        self.gamma = 0.99
        self.learning_rate = 0.001
        self.epsilon = 0.2
        self.huber_loss_delta = 2.0
        self.update_target_frequency = 25
        self.max_epsilon = 0.7
        self.min_epsilon = 0.1
        self.decreasing_rate = -math.log(0.01) / self.nb_episodes


hps = HPS()

plt.plot(hps.min_epsilon + (hps.max_epsilon - hps.min_epsilon)
         * np.exp(-hps.decreasing_rate * np.arange(hps.nb_episodes)))

env = Env(hps.mission_file)

randomAgent = RandomAgent(hps)
play(env, hps, randomAgent, hps.nb_episodes_random, train=True)

Agent = DDQNPER_Agent(hps)
Agent.memory = randomAgent.memory
##Agent.load()
##Agent.save()
#Agent.epsilon = 0.15

play(env, hps, Agent, hps.nb_episodes, train=True, save_victory=False)
#play(env, hps, Agent, 40, train=False, save_victory=True)

#plt.plot(Agent.losses)
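# With decreasing_rate = -ln(0.01) / nb_episodes, the exponential term above equals 0.01 after
# nb_episodes episodes, so the plotted schedule decays from max_epsilon = 0.7 down to
# min_epsilon + 0.01 * (max_epsilon - min_epsilon) ≈ 0.106 over the 100 training episodes.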
def transcate_DDPG(self):
    BATCH_SIZE = 32
    total_steps = 0           # step counter; one day is one step
    profit_list = []          # total profit of each game
    profitAdvanced_list = []
    actions = 2               # number of actions
    s_dim = 87
    a_dim = 1
    brain = DDPG(
        a_dim=a_dim,
        s_dim=s_dim,
        a_bound=1.,
        LR_A=0.001,
        LR_C=0.001,
        GAMMA=.99,
        TAU=0.01,
        # replacement=REPLACEMENT,
    )
    gameNum = 0               # number of games played
    ex_steps = 500            # number of rounds over which exploration decays
    epsilon = self.epsilon
    last_remainder = 0
    reward_list = [0]         # store each episode's profit, used to compute the baseline
    Loss_list = []            # store the loss values during training
    wait_list = []            # record the waiting days for each of the N games
    gameSplit = 5000          # plot every this many games

    while total_steps < 60000:
        # initialize the game
        # routeId = random.randrange(0, 49, 1)
        routeId = 21
        self.routeline = self.allRoute[routeId]
        # print(self.routeline)
        env = Env(self.routeline)
        gameNum += 1
        # state = env.getState()  # accessed as state[0], state[1]
        today = env.getToday()
        terminal = False
        order_accepted = False
        isExploration = False
        create_date = 1
        end_date = 0
        stay_num = 0

        # one game
        # print("GAME#:", gameNum)
        baseline = 0
        tao_prob = []
        tao_reward = []
        wait_day = []         # record which days were waited in one game
        while today < self.routeline[-1] and terminal == False:
            # a new order arrives (no new orders are received here once there are already 10 orders)
            if order_accepted == False:
                self.orderSelect(self.routeline, 60)
                # print(self.order)
                env.setOrder(self.order)
                order_accepted = True

            # iterate over the self.orders dict (i.e. state[0]) and operate on each order
            state = env.getState()  # current state
            state_tf = np.mat(state)
            # print(state_tf, len(state_tf))

            # let the neural network choose the action
            if random.random() < epsilon and isExploration == False:
                isExploration = True
                # end_date = random.randrange(env.getTodayIndex(), 87, 1)
                end_date = 60
            if isExploration:
                if env.getTodayIndex() == end_date:
                    action_model = 1
                    if ex_steps > 0:
                        ex_steps -= 1
                else:
                    action_model = 0
            else:
                # action from learning
                action_model = brain.choose_action(state_tf)
                # print(action_model)

            wait_day.append(env.getTodayIndex())

            # order dict, history curve, reward
            reward = env.getReward(action_model)
            tao_reward.append(reward)
            # the order is completed or the last day is reached
            terminal = env.isTerminal(action_model)
            state_ = env.getNextState(action_model)
            if len(state_) == 1:
                state_ = copy.deepcopy(state)

            brain.store_transition(state, action_model, reward, state_)
            # profitAdvanced_list.append(td_error[0][0])

            if brain.pointer > brain.MEMORY_CAPACITY:
                # print(b_s_)
                brain.learn()

            total_steps += 1
            if terminal:
                # wait_list.append(wait_day[-1])
                # loss = brain.learn()
                # Loss_list.append(loss)
                break

            # step: advance one day
            env.nextStep()

        # total profit for the episode
        epsilon = self.epsilon * (ex_steps / 500)
        print("epsilon:", epsilon)
        print("TD_Error:", baseline)
        profit = env.getTotalReward()
        profit_list.append(profit)
        print("total_steps:", total_steps)
        print("profit_list", profit_list)
        print("profit:", profit, "profitAvg:", np.mean(profit_list))
        print("action-prob:", tao_prob)
        print("Reward:", tao_reward)
        print("wait_day:", wait_day)
        self.writeHistory('./picture/history.txt', epsilon, baseline, total_steps,
                          profit_list, profit, tao_prob, tao_reward, wait_day, gameNum)
        print("########################" + str(gameNum) + "###########################")

        if len(profit_list) >= gameSplit:
            plt.figure()
            plt.plot(profit_list, 'r-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_profit_PG.jpg')
            plt.figure()
            plt.scatter(np.arange(gameSplit), profit_list)
            plt.savefig('./picture/' + str(gameNum) + 'scatter_profit_PG.jpg')
            plt.figure()
            plt.plot(profitAdvanced_list, 'g-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_advanced_PG.jpg')
            plt.figure()
            plt.plot(Loss_list, 'y-')
            plt.savefig('./picture/' + str(gameNum) + 'liner_loss_PG.jpg')
            plt.figure()
            plt.scatter(np.arange(gameSplit), wait_list, c='r')
            plt.savefig('./picture/' + str(gameNum) + 'scatter_waitDay_PG.jpg')
            if len(profit_list) >= 500:
                profit_list.clear()
                wait_list.clear()
import copy
import pylab
import numpy as np
import tensorflow as tf

from Environment import Env
from Agent import PG
from Agent import TUC

import pickle

np.random.seed(0)

EPISODES = 50

env = Env()
agent = PG()
EP_reward_sums, episodes = [], []

#agent.save_model("./model_init/PG1")
agent.load_model("./model_init/PG1")

# Session settings
GPU_mem_ratio = 0.2
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=GPU_mem_ratio)
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

# Create recomposed transition critic
state_dim = 22
hidden_dim = 3
critic_hidden_dim = 2
action_dim = 5
tuc = TUC(sess, "TUC", state_dim, action_dim, 0.003)
#tuc.save_model("./model_init/TUC1")
from Environment import Env
import numpy as np
import pandas as pd

initial_region_state = [15, 15, 15, 15]
capacity_each_step = 10
max_episode = 5
car_count = 1
need = pd.read_csv('../fake_4region_trip_20170510.csv')

env = Env(initial_region_state, capacity_each_step, max_episode, car_count, need)

NUM_ACTIONS = (2 * env.capacity_each_step + 1) * env.region_count  # one move in [-capacity, capacity] for each of the 4 regions
NUM_STATES = 2 * env.region_count + 7  # MountainCar-v0: (2,)

history_dict = {i: dict() for i in range(8)}
history_action = {i: dict() for i in range(8)}

state = env.init()
print(state)

for action in range(NUM_ACTIONS):
    env.reset()
    env.pre_step()
    move = action % (2 * env.capacity_each_step + 1) - env.capacity_each_step
    region = int(np.floor(action / (2 * env.capacity_each_step + 1)))
    if env.check_feasible(env.state, region, 0, move):
        state, reward, recent_R = env.step(region, 0, move)
        # compare against the stored reward (first element of the saved tuple)
        if (state in history_dict[0] and history_dict[0][state][0] < reward) \
                or state not in history_dict[0]:
            history_dict[0][state] = (reward, recent_R)        # record state -> (reward, recent_R)
            history_action[0][state] = (move, region, reward)  # record state -> (move, region, reward)
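# Worked example of the flat-action decoding above: with capacity_each_step = 10 there are
# 2 * 10 + 1 = 21 possible moves per region, so action = 45 decodes to
# move = 45 % 21 - 10 = -7 and region = floor(45 / 21) = 2, i.e. a transfer of 7 units
# for region 2 (the sign convention for moves is an assumption, it is not spelled out here).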