def get_demonstration(self, fold):
    state1 = self.get_instance(fold)
    action1 = (0, 2)
    state2, r1, _ = state1.step(action1)
    action2 = (0, state1.target)
    state3, r2, _ = state2.step(action2)
    assert r2 == 1
    ep = []
    ep.append(Experience(state1, None, action1, state2, None, r1, False))
    ep.append(Experience(state2, None, action2, state3, None, r2, True))
    return ep
def generate_experience(self, nr, serialize=False):
    result = []
    for x in range(nr):
        domain = self.__domains[random.randint(0, len(self.__domains) - 1)]
        projects = self.generate_projects(random.randint(1, 5), False)
        exp = Experience(domain, random.randint(1, 15), projects)
        result.append(exp)
    if serialize:
        return [exp.serialize() for exp in result]
    else:
        return result
def __init__(self, env=None, agent=None, logdir=None, should_render=None,
             should_load=None):
    self.env = env
    self.agent = agent
    self.config = self.agent.config
    self.logdir = logdir
    self.should_render = should_render
    self.experience = Experience(self.config)
    if should_load:
        self.load()
    else:
        self.step = 0
        self.epsilon = 0.3
        self.train_rewards = [0] * 100
        self.current_episode = 0
def add_grains(self, grains: List[List[Image.Image]]):
    """Add new grains to memory.

    Params:
        grains: List[List[Image.Image]]
            2D list of new grains.

    Returns:
        2D list of novelty scores for the new grains.
    """
    # print("Adding new grains to memory...")
    assert len(grains) == 2      # Currently we only allow a 2x2 grid (4 grains)
    assert len(grains[0]) == 2   # Currently we only allow a 2x2 grid (4 grains)
    nov_list = []
    for row in grains:
        temp_nov = []
        for g in row:
            grain_tf = self._grain_to_tensor(g)
            # Reshape to (1, H, W, C)
            grain_tf = tf.reshape(
                grain_tf,
                (1, grain_tf.shape[0], grain_tf.shape[1], grain_tf.shape[2]))
            predicted_grain = self._network(grain_tf)
            nov = self.novelty_function(grain_tf, predicted_grain).numpy()
            temp_nov.append(nov)
            self._memory.push(Experience(nov, g))
        nov_list.append(temp_nov)
    return nov_list
def run_episode(self, max_steps, train=True):
    """
    Executes a single episode.

    Params
    ======
        max_steps (integer): The maximum time steps to run in a single episode.
        train (Boolean): If true, run episode in train mode. If false, run in eval mode.
    """
    env_info = self.env.reset(train_mode=train)[self.brain_name]
    states = env_info.vector_observations
    scores = np.zeros(len(states))
    for i in range(max_steps):
        actions = self.agent.act(states, noise=train)
        env_info = self.env.step(actions)[self.brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += rewards
        self.agent.learn(
            Experience(states, actions, rewards, next_states, dones))
        if dones[0]:
            break
        states = next_states
    self.agent.end_episode()
    return scores.max()
def get_experience(self, key):
    with dbapi2.connect(self.app.config['dsn']) as connection:
        cursor = connection.cursor()
        query = ("SELECT TITLE, USERNAME, START, FINISH, PERIOD, LENGTH "
                 "FROM EXPERIENCE WHERE (ID = %s)")
        cursor.execute(query, (key,))
        title, username, start, finish, period, length = cursor.fetchone()
        return Experience(title, username, start, finish, period, length)
def _do_tr_rollout(code_agent, desc_agent, task, rollout_ph, model, desc_model,
                   desc_to_code, code_to_desc, session, config, h0, z0, fold,
                   mode):
    worlds = [
        task.get_instance(fold)
        for _ in range(config.trainer.n_rollout_episodes)
    ]
    done = [False] * config.trainer.n_rollout_episodes
    episodes = [[] for i in range(config.trainer.n_rollout_episodes)]
    hs, zs = h0, z0
    dhs = h0
    for t in range(config.trainer.n_timeout):
        hs_, zs_, qs = session.run(
            [model.tt_rollout_h, model.tt_rollout_z, model.tt_rollout_q],
            rollout_ph.feed(hs, zs, dhs, worlds, task, config))
        dhs_, dqs = session.run(
            [desc_model.tt_rollout_h, desc_model.tt_rollout_q],
            rollout_ph.feed(hs, zs, dhs, worlds, task, config))
        for i in range(config.trainer.n_rollout_episodes):
            if done[i]:
                continue
            actions = [None, None]
            actions[code_agent] = np.argmax(qs[code_agent][i, :])
            actions[desc_agent] = np.argmax(dqs[desc_agent][i, :])
            world_, reward, done_ = worlds[i].step(actions)
            code = desc_to_code(world_.l_msg[code_agent], mode)[0]
            zs_[desc_agent][i, :] = code
            l_words = code_to_desc(zs_[code_agent][i, :], mode)[:5]
            l_msg = np.zeros(len(task.lexicon))
            for l_word in l_words:
                l_msg[task.lexicon.index(l_word)] += 1
            l_msg /= np.sum(l_msg)
            world_.l_msg = list(world_.l_msg)
            world_.l_msg[desc_agent] = l_msg
            world_.l_msg = tuple(world_.l_msg)
            episodes[i].append(
                Experience(worlds[i], None, tuple(actions), world_, None,
                           reward, done_))
            worlds[i] = world_
            done[i] = done_
        if config.evaluator.simulate_l:
            assert False
        hs = hs_
        zs = zs_
        dhs = dhs_
        if all(done):
            break
    return (sum(e.r for ep in episodes for e in ep) * 1. /
            config.trainer.n_rollout_episodes,
            sum(ep[-1].s2.success for ep in episodes) * 1. /
            config.trainer.n_rollout_episodes)
def main():
    from experience import Experience
    from visualization import hist_classes, scatter_classes

    class_count_list = []
    agents = [
        'RainbowAgent', 'SimpleAgent', 'SecondAgent', 'ProbabilisticAgent'
    ]
    for agent in agents:
        exp = Experience(agent, load=True)
        labels, _, examples, _ = exp.load()
        class_count, _ = divide_and_count(examples, labels)
        class_count_list.append(class_count)
    scatter_classes(class_count_list, agents)
def get_myexperiences(self, name):
    with dbapi2.connect(self.app.config['dsn']) as connection:
        cursor = connection.cursor()
        query = "SELECT * FROM EXPERIENCE WHERE (USERNAME = %s)"
        cursor.execute(query, (name,))
        experiences = [(key, Experience(title, username, start, finish,
                                        period, length))
                       for key, title, username, start, finish, period,
                       length, userid, date in cursor]
        return experiences
def run(env, num_episodes, num_time_steps, replay_batch_size,
        scores_filename=None):
    exploration = EpsilonGreedy(epsilon_start=1.0,
                                epsilon_min=0.01,
                                epsilon_decay=0.999)

    # [Mnih 2015] used:
    #   - replay over 2% of the total experience
    #   - batch size of 32
    #   - minimum replay start size of 0.1%
    experience_max_size = int(num_episodes * num_time_steps * 0.02)
    replay_start_size = int(num_episodes * num_time_steps * 0.001)
    experience_replay = Experience(max_size=experience_max_size,
                                   batch_size=replay_batch_size,
                                   replay_start_size=replay_start_size)
    # experience_replay = PrioritizedExperience(
    #     max_size=experience_max_size, batch_size=replay_batch_size,
    #     replay_start_size=replay_start_size, initial_td_error=10,
    #     alpha=0.4, beta=0.4, anneal_rate=0.95, epsilon=0.001)

    model = ExampleModel(state_size=env.state_size,
                         action_size=env.action_size,
                         learning_rate=0.001)
    model.build()
    target_model = ExampleModel(state_size=env.state_size,
                                action_size=env.action_size,
                                learning_rate=0.001)
    target_model.build()
    qmodel = QModel(model=model,
                    target_model=target_model,
                    experience_replay=experience_replay,
                    tau=0.1,
                    use_double_q=True)
    agent = QAgent(state_size=env.state_size,
                   action_size=env.action_size,
                   model=qmodel,
                   exploration=exploration,
                   discount_rate=0.95)

    scores = np.empty((num_episodes,))
    time_start = time.time()
    for e in range(num_episodes):
        scores[e] = agent.train(env=env, episode_length=num_time_steps)
        print('episode: {}/{}, score: {}, e: {:.2}'.format(
            e + 1, num_episodes, scores[e], agent.exploration.epsilon))
    time_end = time.time()

    print('Average score for last 10% of episodes:',
          np.mean(scores[int(np.floor(num_episodes * 0.1)):]))
    print('Time taken:', time_end - time_start, 'seconds')

    if scores_filename is not None:
        np.savetxt(scores_filename, scores, delimiter=',')
def search_experience(self, keyword):
    with dbapi2.connect(self.app.config['dsn']) as connection:
        cursor = connection.cursor()
        query = ("SELECT * FROM EXPERIENCE "
                 "WHERE (TITLE ILIKE %s OR START ILIKE %s OR FINISH ILIKE %s) "
                 "ORDER BY ID")
        keyword = '%' + keyword + '%'
        cursor.execute(query, (keyword, keyword, keyword))
        experiences = [(key, Experience(title, username, start, finish,
                                        period, length))
                       for key, title, username, start, finish, period,
                       length, userid, date in cursor]
        return experiences
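# The three database helpers above all pass the fetched row into a
# six-argument Experience constructor. A minimal sketch of a value object with
# that constructor, assuming plain attribute storage; the project's real
# definition may differ.
class Experience:
    def __init__(self, title, username, start, finish, period, length):
        self.title = title
        self.username = username
        self.start = start
        self.finish = finish
        self.period = period
        self.length = length

    def __repr__(self):
        return "Experience(title=%r, username=%r)" % (self.title, self.username)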
def extract_tensors(experiences):
    # Convert a batch of Experiences to an Experience of batches
    batch = Experience(*zip(*experiences))

    t1 = torch.cat(batch.state)
    t2 = torch.cat(batch.action)
    t3 = torch.cat(batch.reward)
    t4 = torch.cat(batch.next_state)

    return t1, t2, t3, t4
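# extract_tensors above only works if Experience is a namedtuple whose fields
# hold tensors with a leading batch dimension. A minimal sketch of that
# assumption; the field names come from the attribute accesses above, the
# sample data is hypothetical.
from collections import namedtuple

import torch

Experience = namedtuple('Experience', ('state', 'action', 'reward', 'next_state'))

e = Experience(state=torch.zeros(1, 4),
               action=torch.tensor([[0]]),
               reward=torch.tensor([[1.0]]),
               next_state=torch.ones(1, 4))
# zip(*experiences) groups each field across the batch, and torch.cat stacks
# the per-sample tensors into one (batch, ...) tensor per field.
states, actions, rewards, next_states = extract_tensors([e, e])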
def __init__(self, model, env, action_size, args, state):
    self.model = model
    self.env = env
    self.action_size = action_size
    self.state = state
    self.hx = None
    self.cx = None
    self.eps_len = 0
    self.args = args
    self.values = []
    self.log_probs = []
    self.rewards = []
    self.entropies = []
    self.done = True  # Initialized to True so a fresh state can be set once
    self.info = None
    self.reward = 0
    self.gpu_id = -1  # parameter
    self.memory = Experience(history_size=2000)
def to_experiences(self, states, actions, rewards, next_states, dones):
    """
    Turns vectors representing components of multiple experiences into a
    vector of Experience objects.
    """
    experiences = []
    for (state, action, reward, next_state, done) in zip(
            states, actions, rewards, next_states, dones):
        experiences.append(
            Experience(state, action, reward, next_state, done))
    return experiences
def __init__(self, rank, args, shared_model, optimizer, lr):
    # CUDA-related setup
    self.gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if self.gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    self.replay_buffer = Experience(history_size=2000)
    self.cx = None  # todo: still forward-propagates after every single step
    self.hx = None
    self.episodic_score = 0
    self.rank = rank
    self.args = args
    self.shared_model = shared_model
    self.optimizer = optimizer
    self.local_t = 0  # initialization

    # Initialize the environment
    print('Training Agent: {}'.format(self.rank))
    # todo: the gym environment still needs pixel control (pc) etc.
    # Agent model
    model = UNREAL(in_channels=3, action_size=6, enable_pixel_control=True)
    if self.gpu_id >= 0:
        with torch.cuda.device(self.gpu_id):
            model = model.cuda()
    model.train()

    # Learning rate
    self.initial_learning_rate = lr
    self.max_global_time_step = 10 * 10**7

    # Timekeeping / for log output
    self.prev_local_t = 0
    self.model = model
    self.env = None
    self.reset()  # cx hx
def __init__(self, num_action, frame_height, frame_width, rng, network_type,
             algorithm, network_file=None, num_ignore=0, exp_file=None):
    self.rng = rng
    self.num_action = num_action
    self.mbsize = Agent.MINIBATCH_SIZE
    self.validate_size = Agent.VALID_SIZE
    self.num_train_obs = 0
    self.network_type = network_type
    self.eps_decay = (Agent.FINAL_EXPLORE - Agent.INIT_EXPLORE) \
        / Agent.EXPLORE_FRAMES
    self.validate_states = None
    self.exp_file = exp_file
    if exp_file is not None:
        with open(exp_file, 'rb') as f:
            npz = np.load(exp_file)
            self.num_train_obs = np.sum(npz['num_train_obs'])
            self.validate_states = npz['validate_states']
            self.exp_train = Experience(Agent.REPLAY_SIZE, frame_height,
                                        frame_width, Agent.HISTORY, rng, npz)
    else:
        self.exp_train = Experience(Agent.REPLAY_SIZE, frame_height,
                                    frame_width, Agent.HISTORY, rng)
    self.exp_eval = Experience(Agent.HISTORY + 1, frame_height, frame_width,
                               Agent.HISTORY, rng)
    self.network = Network(num_action, self.mbsize, Agent.HISTORY,
                           frame_height, frame_width, Agent.DISCOUNT,
                           Agent.UPDATE_FREQ, rng, network_type, algorithm,
                           network_file, num_ignore)
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, env_type, env_name,
             use_pixel_change, use_value_replay, use_reward_prediction,
             pixel_change_lambda, entropy_beta, local_t_max, gamma, gamma_pc,
             experience_history_size, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.env_type = env_type
    self.env_name = env_name
    self.use_pixel_change = use_pixel_change
    self.use_value_replay = use_value_replay
    self.use_reward_prediction = use_reward_prediction
    self.local_t_max = local_t_max
    self.gamma = gamma
    self.gamma_pc = gamma_pc
    self.experience_history_size = experience_history_size
    self.max_global_time_step = max_global_time_step
    self.action_size = Environment.get_action_size(env_type, env_name)
    self.local_network = UnrealModel(self.action_size, thread_index,
                                     use_pixel_change, use_value_replay,
                                     use_reward_prediction,
                                     pixel_change_lambda, entropy_beta,
                                     device)
    self.local_network.prepare_loss()
    self.apply_gradients = grad_applier.minimize_local(
        self.local_network.total_loss, global_network.get_vars(),
        self.local_network.get_vars())
    self.sync = self.local_network.sync_from(global_network)
    self.experience = Experience(self.experience_history_size)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    # For log output
    self.prev_local_t = 0
def read_experience():
    experience = Experience()
    experience.employer = input("Enter name of employer: ")
    experience.title = input("Enter title: ")
    experience.responsibilities = input("Enter responsibilities: ")
    experience.duration_years = int(
        input("Enter number of years of experience: "))
    return experience
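# read_experience above assigns attributes to a freshly constructed
# Experience, while the candidate example further down passes the same fields
# as keyword arguments. A minimal sketch of a record class compatible with
# both usages, assuming a simple dataclass; this is not the project's actual
# definition.
from dataclasses import dataclass
from typing import Optional, Union


@dataclass
class Experience:
    employer: Optional[str] = None
    title: Optional[str] = None
    responsibilities: Optional[str] = None
    # duration_years appears both as an int (read_experience) and as a string
    # such as "2018-present" elsewhere, so the type is kept loose here.
    duration_years: Optional[Union[int, str]] = None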
def run_episode(self, train=True):
    env_info = self.env.reset(train_mode=train)[self.brain_name]
    state = env_info.vector_observations[0]
    score = 0
    while True:
        action = self.agent.act(state, explore=train)
        env_info = self.env.step(action)[self.brain_name]
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        score += reward
        if train:
            self.agent.learn(
                Experience(state, action, reward, next_state, done))
        state = next_state
        if done:
            break
    return score
def run_episode(self):
    self.env.reset()
    self.env.random_start()
    t = 0
    experiences = []
    while not self.env.terminal:
        # Predict action, value
        action, value = self.predict(self.env.state)
        self.env.step(action)
        experience = Experience(self.env.state, action, self.env.reward, None,
                                self.env.terminal)
        experiences.append(experience)
        yield experience
        t += 1
def main(args):
    with tf.device(args['device']):
        # tf
        tf.set_random_seed(args['rand_seed'])
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

        # env
        env = gym.make('TestEnv-v0')
        env.seed(args['rand_seed'])
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]
        concat_dim = 2
        batched_s_dim = [None, s_dim, concat_dim]
        batched_a_dim = [None, a_dim]

        # agents
        actor = Actor(sess, args['actor_lr'], args['tau'], args['batch_size'],
                      args['clip_val'], batched_s_dim, batched_a_dim)
        critic = Critic(sess, args['critic_lr'], args['tau'],
                        args['clip_val'], batched_s_dim, batched_a_dim)

        # experience
        exp = Experience(args['buffer_size'], args['batch_size'],
                         args['rand_seed'])

        # noise
        actor_noise = ActorNoise(actor.predict, a_dim,
                                 noise_type=args['noise_type'])

        # initialize
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = Model(sess, args['restore_path'])
        saver.restore_model()

        # training
        her = HER(saver, exp, env, actor, critic, actor_noise)
        if args['mode'] == 'train':
            her.train(args['gamma'], args['her_k'], args['max_episodes'],
                      args['max_episode_len'], args['replay_len'])
        else:
            her.play(args['max_episodes'], args['max_episode_len'])
def run_episode(self):
    self.env.reset()
    self.history.add(self.env.state)
    random_start_steps = max(self.config.history_length,
                             self.env.random_start_steps)
    for _ in range(random_start_steps):
        self.env.step(self.env.random_step())
        self.history.add(self.env.state)
    t = 0
    while not self.env.terminal:
        # Predict action, value
        prev_state = self.env.state
        action = self.predict(self.history.get())
        self.env.step(action)
        experience = Experience(prev_state, action, self.env.reward,
                                self.env.state, self.env.terminal)
        yield experience
        t += 1
def index():
    app.logger.info("Connection from %s" % str(request.environ['REMOTE_ADDR']))
    database = databaseconnector.databaseObject(app)
    skills = dict(database.query("SELECT title, rating FROM skills"))
    tools = dict(database.query("SELECT title, rating FROM tools"))
    languages = dict(database.query("SELECT title, rating FROM languages"))
    experience = database.query(
        "SELECT title, dates, shortDescription, longDescription, image, id "
        "FROM experience")
    experienceStructs = []
    for line in experience:
        newStruct = Experience(line[0], line[1], line[2], line[3], line[4],
                               'a' + str(line[5]))
        experienceStructs.append(newStruct)
    database.close()
    return render_template('index.html',
                           skills=skills,
                           tools=tools,
                           languages=languages,
                           experiences=experienceStructs)
def main():
    candidate_database = CandidateDatabase()
    candidate = Candidate()
    candidate.name = "Alicia Toomtest"
    candidate.title = "Python Developer"
    candidate.address = "Gothenburg, Sweden"
    candidate.phone = "0722879879"
    candidate.email = "*****@*****.**"
    candidate.hobbies = "Gardening"
    candidate.education = [
        Education(name="Education", school="School", level="Level"),
        Education(name="Education2", school="School2", level="Level2")
    ]
    candidate.experience = [
        Experience(employer="Volvo",
                   title="Python developer",
                   responsibilities="code",
                   duration_years="2018-present")
    ]
    candidate.note = Note()
    candidate.note.summary = "Gslf9ehdlsdfnjslsleofjfms,"
    candidate.note.comment = "dki9eufsklwodudndjskwoeifjdk"
    # print_candidate(candidate)
    candidate_database.add_candidate(candidate)
    find_result = candidate_database.find_candidates("toom")
    if len(find_result) > 0:
        print_candidates(find_result)
    else:
        print("No result found")
    print_candidate(candidate)
    return
def qtest(model_eval, model_target,epsilon=0.05, n_vehicles=20, **opt): # global epsilon epoch = 0 n_epoch = opt.get('n_epoch', 1500) max_memory = opt.get('max_memory', 1000) data_size = opt.get('data_size', 50) weights_file = opt.get('weights_file', "") start_time = datetime.datetime.now() n_vehicles = n_vehicles get_dest_count = 0 failed_count = 0 eval_train_time = 0 # If you want to continue training from a previous model, # just supply the h5 file name to weights_file option if weights_file: print("loading weights from file: %s" % (weights_file,)) model_eval.load_weights(weights_file) # Initialize experience replay object experience = Experience(model_eval, model_target, max_memory=max_memory) # records : 记录各种指标 # reward部分: records_of_total_reward = [] records_of_veh_reward = [] records_of_se_reward = [] # 这一项totalreward是veh与se_reward的单纯相加,不带权值。 records_of_pure_total_reward = [] # 需要输出的部分: records_of_veh_drive = [] records_of_veh_drive_speed = [] records_of_se_delay = [] records_of_se_SR = [] veh_actions_record = [ [] for veh in range(n_vehicles) ] se_actions_record = [ [] for veh in range(n_vehicles) ] for epoch in range(n_epoch): qmaze.reset() # print("训练一轮结束,重置qmaze") # print(qmaze.vehs_og_list) seenv.reset(qmaze)#SE相关环境信息的重置 game_over = False # get initial envstate envstates_list = [qmaze.observe(veh) for veh in range(n_vehicles)] se_envstates_list = [seenv.SE_observe(veh, qmaze) for veh in range(n_vehicles)] #reward_list 是用来计算SE部分和Veh行车部分两个reward的累加值,即“总的reward”。 reward_list = np.zeros((n_vehicles)) veh_step_reward_list = np.zeros((n_vehicles)) se_step_reward_list = np.zeros((n_vehicles)) n_episodes = 0 while not game_over: # 开始前都将需要记录这个tag置零 qmaze.should_save = [0] * n_vehicles # 开始前把每一步经历的step_cost_list全部置零 veh_step_cost_list = np.zeros((n_vehicles)) se_step_cost_list = np.zeros((n_vehicles)) pure_reward_list = np.zeros((n_vehicles)) prev_envstates_list = envstates_list prev_se_envstates_list = se_envstates_list # 获取当前各个网格车辆的信息,汽车行动前的网格汽车数目 # _2nd_channel = background_vehs[np.newaxis,:] # background_vehs = background_vehs[np.newaxis,:] vehsnum_before_act = qmaze.count_cells_vehsnum() sesnum_before_act = seenv.SE_count(qmaze) actions = -1 * np.ones((n_vehicles)) # 默认 -1,表示不采取行动 se_actions = -1 * np.ones((n_vehicles)) # 默认 -1,表示不采取行 nVehsCell_list = qmaze.count_cell_vehsnum_list() nSEsCell_list = seenv.SE_count_list(qmaze) # print(nVehsCell_list) # print(veh_actions_record) # print(nSEsCell_list) # print(se_actions_record) # 每个时段的的开头部分 # 在时段的开头先统计各个网格的车辆, # 从而得知,在这一个时段内,汽车们、SE们可能得到的拥挤开销 # 这等于是在哥哥时刻开始之前计算拥挤开销被计算入reward之前的拥挤开销的积累 qmaze.get_veh_cost(veh_step_cost_list) seenv.get_se_cost(se_step_cost_list, qmaze) # 计算一个评判行车指标drive_cost qmaze.get_drive_cost(background_vehs,nVehsCell_list) # 每个时段的的开头部分 for veh in range(n_vehicles): if qmaze.status_flag[veh] != 0: # 说明汽车已经1或者-1了,不用在对其进行动作。 continue else: # 进入这一分支说明该车还没结束 if qmaze.last_act_done[veh] == 1: # 汽车是刚开始的状态或者完成了上一个动作的执行,需要委派新的动作。 valid_actions = qmaze.valid_actions(qmaze.vehs_cur_list[veh]) qmaze.vehs_change_act[veh] = actions[veh] = np.argmax( experience.predict_e(prev_envstates_list[veh], vehsnum_before_act, prev_se_envstates_list[veh], sesnum_before_act)) seenv.SEs_next_mrg_list[veh] = se_actions[veh] = np.argmax( experience.predict_e_se(prev_envstates_list[veh], vehsnum_before_act, prev_se_envstates_list[veh], sesnum_before_act)) veh_actions_record[veh].append(qmaze.vehs_change_act[veh]) se_actions_record[veh].append(seenv.SEs_next_mrg_list[veh]) # 重新指定time_remain_list # 获取reward_will_get # 还得记录一下vehs的nexttogo 
qmaze.get_some_will(actions[veh], veh) if qmaze.last_act_done[veh] == 0: # 未完成上一个迁移动作,SEs和Vehs的状态都是GOING actions[veh] = GOING se_actions[veh] = GOING continue # 每个时段的结尾部分 nVehsCell_list = qmaze.count_cell_vehsnum_list() nSEsCell_list = seenv.SE_count_list(qmaze) vehcurpos = qmaze.vehs_cur_list for veh in range(n_vehicles): # 查询汽车是否为“完成状态”,即到达终点或者累计reward过大; if qmaze.status_flag[veh] != 0: # 说明该车状态为完成,即已到达终点或者累计reward过大 continue else: # 说明汽车不是“完成”状态。 qmaze.time_remain_list[veh] -= 1 qmaze.action_times_list[veh] += 1 nVehsCell = nVehsCell_list[transfer_dict[vehcurpos[veh]]] nSEsCell = nSEsCell_list[transfer_dict[seenv.SE_curpos_list[veh]]] veh_pos = vehcurpos[veh] se_pos = seenv.SE_curpos_list[veh] bv = seenv.SE_data_list[veh] cpuneed = seenv.CPUNeedList[veh] if qmaze.time_remain_list[veh] != 0: # 首先把last_act_done设置成0,表示上一个动作未做完 qmaze.last_act_done[veh] = 0 # 说明未发生迁移,所以得到的reward都为0,只是记录服务时延 veh_step_reward_list[veh] = 0 # 计算一下服务时延, # 记录一下时延——后面看一下总的时延时间; # 记录一下服务时延的成功率; delay_time = seenv.sum_delay(nVehsCell, nSEsCell, veh_pos, se_pos, bv, cpuneed) if delay_time < Delay_Threshold: seenv.record_success_rate[veh].append(1) else: seenv.record_success_rate[veh].append(0) seenv.record_delay_list[veh].append(delay_time) seenv.SE_delay_list[veh] += delay_time se_step_reward_list[veh] = 0 continue if qmaze.time_remain_list[veh] == 0: # 说明将要完成迁移 # 首先把last_act_done设置成1,表示上一个动作已经做完 qmaze.last_act_done[veh] = 1 # qmaze.should_change[veh] = 1 # veh的rewrd是直接读取will_get即可 # se部分计算时延、计算reward veh_step_reward_list[veh] = qmaze.reward_will_get[veh] + veh_step_cost_list[veh] qmaze.reward_will_get[veh] = 0 veh_step_cost_list[veh] = 0 # 计算一下服务时延, # 记录一下时延——后面看一下总的时延时间; # 记录一下服务时延的成功率; delay_time = seenv.sum_delay(nVehsCell, nSEsCell, veh_pos, se_pos, bv, cpuneed) if delay_time < Delay_Threshold: seenv.record_success_rate[veh].append(1) else: seenv.record_success_rate[veh].append(0) seenv.record_delay_list[veh].append(delay_time) seenv.SE_delay_list[veh] += delay_time avg_delay = seenv.SE_delay_list[veh] / qmaze.action_times_list[veh] if avg_delay > Delay_Threshold: delay_cost = - 0.20 else: delay_cost = 0 seenv.SE_delay_list[veh] = 0 # 若计算了delaycost 要重新清零delay累加 se_step_reward_list[veh] = delay_cost + -1 * CostMrgS + se_step_cost_list[veh] se_step_cost_list[veh] = 0 # 被计入reward过后的拥挤开销需要清零 # 计算累计的reward,加入了拥挤开销的reward qmaze.game_acc_veh_reward[veh] += veh_step_reward_list[veh] seenv.game_acc_se_reward[veh] += se_step_reward_list[veh] pure_reward_list[veh] = veh_step_reward_list[veh] + se_step_reward_list[veh] reward_list[veh] = vr_weights * veh_step_reward_list[veh] + (1 - vr_weights) * se_step_reward_list[veh] qmaze.game_acc_reward[veh] += reward_list[veh] qmaze.game_acc_pure_reward[veh] += pure_reward_list[veh] # 判断行车的accreward是否小于最低要求而游戏失败 if qmaze.game_acc_veh_reward[veh] < qmaze.min_reward: # 行车失败 qmaze.status_flag[veh] = -1 else: # 尚未达到失败的标准,继续行车: # 根据车辆的will的得到的动作信息,更新veh与se状态。 # 判断一个是否到达终点的信息,即用来更新汽车状态,又用来在experience的判定因素。 qmaze.update_state_single__(veh) seenv.update_se_pos_(veh) # 维护一个should_save,表示有必要存储的经验过程数据 qmaze.should_save[veh] = 1 print("E:",n_episodes) print("cur:",qmaze.vehs_cur_list) print("remain:",qmaze.time_remain_list) print("next:",qmaze.vehs_next_go_list) envstates_list = [qmaze.observe(veh) for veh in range(n_vehicles)] se_envstates_list = [seenv.SE_observe(veh, qmaze) for veh in range(n_vehicles)] # 获取当前各个网格车辆的信息,汽车行动后的网格汽车数目 vehsnum_after_act = qmaze.count_cells_vehsnum() sesnum_after_act = seenv.SE_count(qmaze) # print(qmaze.count_cell_vehsnum_list()) # print(qmaze.acc_drive_cost) for 
veh in range(n_vehicles): if qmaze.should_save[veh] == 1: # 记录一个episode经验 if qmaze.status_flag[veh] == 1: get_dest = True else: get_dest = False episode = [prev_envstates_list[veh], prev_se_envstates_list[veh], qmaze.vehs_change_act[veh], seenv.SEs_next_mrg_list[veh], reward_list[veh], veh_step_reward_list[veh], se_step_reward_list[veh], envstates_list[veh], se_envstates_list[veh], get_dest, vehsnum_before_act, vehsnum_after_act, sesnum_before_act, sesnum_after_act] experience.remember(episode) else: continue # 重置两个计数器 get_dest_count = 0 failed_count = 0 # todo 判断条件要改 # 计算已经完成的 actor 数量,包括 1(成功),2(失败) for veh in range(n_vehicles): if qmaze.status_flag[veh] == 1: get_dest_count += 1 elif qmaze.status_flag[veh] == -1: failed_count += 1 # 当所有的 actor 都已达到终点(不一定是最优解)或者未能达终点(失败),该 episode 训练结束 if get_dest_count + failed_count == n_vehicles: game_over = True else: game_over = False n_episodes += 1 # 一系列reward信息的统计 sum_pure_total_reward = 0 sum_total_reward = 0 sum_veh_reward = 0 sum_se_reward = 0 for veh in qmaze.done_list: sum_total_reward += qmaze.game_acc_reward[veh] sum_veh_reward += qmaze.game_acc_veh_reward[veh] sum_se_reward += seenv.game_acc_se_reward[veh] sum_pure_total_reward += qmaze.game_acc_pure_reward[veh] if len(qmaze.done_list) == 0: records_veh = 0 records_total = 0 records_se = 0 records_total_pure = 0 else: records_veh = sum_veh_reward / len(qmaze.done_list) records_total = sum_total_reward / len(qmaze.done_list) records_se = sum_se_reward / len(qmaze.done_list) records_total_pure = sum_pure_total_reward / len(qmaze.done_list) records_of_total_reward.append(records_total) records_of_veh_reward.append(records_veh) records_of_se_reward.append(records_se) records_of_pure_total_reward.append(records_total_pure) # 统计汽车的行车代价 sum_veh_drive = 0 sum_veh_drive_speed = 0 # print(qmaze.acc_drive_cost) for veh in qmaze.done_list: qmaze.game_drive_cost[veh] = qmaze.acc_drive_cost[veh] / qmaze.action_times_list[veh] qmaze.game_drive_speed[veh] = 1/qmaze.game_drive_cost[veh] sum_veh_drive += qmaze.acc_drive_cost[veh] / qmaze.action_times_list[veh] sum_veh_drive_speed += qmaze.game_drive_speed[veh] if len(qmaze.done_list) == 0: records_drive = 0 records_drive_speed = 0 else: records_drive = sum_veh_drive / len(qmaze.done_list) records_drive_speed = sum_veh_drive_speed / len(qmaze.done_list) records_of_veh_drive.append(records_drive) records_of_veh_drive_speed.append(records_drive_speed) # 统计各个车辆的在每一局(每一个训练轮次)的平均delay与服务成功率 se_delay_avg = [0]*n_vehicles se_success_rate = [0]*n_vehicles for veh in qmaze.done_list: se_delay_avg[veh] = sum(seenv.record_delay_list[veh])/ len(seenv.record_delay_list[veh]) se_success_rate[veh] = sum(seenv.record_success_rate[veh])/ len(seenv.record_success_rate[veh]) # 在一个轮次中对于各个车辆的delay与服务成功率进行求平均 AvgDelayforAll = 0 SRforAll = 0 SRcount = 0 for veh in qmaze.done_list: AvgDelayforAll += se_delay_avg[veh] SRforAll += sum(seenv.record_success_rate[veh]) SRcount += len(seenv.record_success_rate[veh]) if len(qmaze.done_list) == 0: records_delay = 0 else: records_delay = AvgDelayforAll / len(qmaze.done_list) if SRcount == 0: records_SR =0 else: records_SR = SRforAll / SRcount records_of_se_delay.append(records_delay) records_of_se_SR.append(records_SR) dt = datetime.datetime.now() - start_time t = format_time(dt.total_seconds()) template = "Epoch: {:03d}/{:d} | Episodes: {:d} | GetDestCount: {:d}/{:d} |FailedCount: {:d}/{:d}| time: {},| loss_weight:{}" print(template.format(epoch, n_epoch - 1,n_episodes ,get_dest_count, n_vehicles ,failed_count, n_vehicles, t, 
model_eval.loss_weights)) print("Arrived vehs:", qmaze.done_list) print("SE_delay_avg:",se_delay_avg) print("SE_success_rate:",se_success_rate) print("veh_drive", qmaze.game_drive_speed) print("【AVG_Veh_drive】:", records_drive) print("【AVG_Veh_drive_speed】:", records_drive_speed) end_time = datetime.datetime.now() dt = datetime.datetime.now() - start_time seconds = dt.total_seconds() t = format_time(seconds) weight_rate = '_'+ str(vr_weights) + '_'+str(seenv.datasize_base)+'_'+str(n_vehicles) +'_' # 保存训练过程中的指标 parent_path = 'save_res0108/' method_name = 'merge_test' + weight_rate print("Saving Reward:") with open(parent_path + method_name + "total_reward.pl", 'wb') as f: print("recording total_reward..") pickle.dump(records_of_total_reward, f) with open(parent_path + method_name +"pure_total_reward.pl", 'wb') as f: print("recording pure total_reward..") pickle.dump(records_of_pure_total_reward, f) with open(parent_path + method_name + "veh_reward.pl", 'wb') as f: print("recording veh_reward..") pickle.dump(records_of_veh_reward, f) with open(parent_path + method_name + "se_reward.pl", 'wb') as f: print("recording se_reward..") pickle.dump(records_of_se_reward, f) print("Saving Index:") with open(parent_path + method_name + "veh_drive.pl", 'wb') as f: print("recording veh_drive..") pickle.dump(records_of_veh_drive, f) with open(parent_path + method_name + "veh_drive_speed.pl", 'wb') as f: print("recording veh_drive_speed..") pickle.dump(records_of_veh_drive_speed, f) with open(parent_path + method_name + "se_delay.pl", 'wb') as f: print("recording se_delay..") pickle.dump(records_of_se_delay, f) with open(parent_path + method_name + "se_SR.pl", 'wb') as f: print("recording se_SR..") pickle.dump(records_of_se_SR, f) # print(veh_actions_record) # print(se_actions_record) print(seenv.SE_data_list)
class Agent(object): def __init__(self, model, env, action_size, args, state): self.model = model self.env = env self.action_size = action_size self.state = state self.hx = None self.cx = None self.eps_len = 0 self.args = args self.values = [] self.log_probs = [] self.rewards = [] self.entropies = [] self.done = True # 初始化,可以设置一次新的状态值 self.info = None self.reward = 0 self.gpu_id = -1 # 参数 self.memory = Experience(history_size=2000) def fill_experience(self): prev_state = self.env.last_state last_action = self.env.last_action last_reward = self.env.last_reward last_action_reward = ExperienceFrame.concat_action_and_reward( last_action, self.action_size, last_reward) with torch.no_grad(): state = torch.from_numpy(self.env.last_state).unsqueeze(0) lar = torch.from_numpy(last_action_reward).unsqueeze(0) _, pi, (self.hx, self.cx) = self.model(task_type='a3c', states=state, hx=self.hx, cx=self.cx, last_action_rewards=lar) action_index = pi.max(1)[1].view(1, 1).item() new_state, reward, terminal, pixel_change = self.env.step( action_index) # 存储为数组 frame = ExperienceFrame(prev_state, reward, action_index, terminal, pixel_change, last_action, last_reward) self.memory.add_frame(frame) if terminal: self.env.reset() if self.memory.is_full(): self.env.reset() print("Replay buffer filled") self.done = terminal def a3c_process(self): """ 在 on-policy 下运行程序 :return: """ states = [] last_action_rewards = [] actions = [] # rewards = [] values = [] # V actions_prob = [] terminal_end = False # t_max times loop for _ in range(self.args.num_steps): # Prepare last action reward last_action = self.env.last_action last_reward = self.env.last_reward last_action_reward = ExperienceFrame.concat_action_and_reward( last_action, self.action_size, last_reward) state = torch.from_numpy(self.env.last_state).unsqueeze(0) lar = torch.from_numpy(last_action_reward) v, pi, (self.hx, self.cx) = self.model(task_type='a3c', states=state, hx=self.hx, cx=self.cx, last_action_rewards=lar.unsqueeze(0)) action_index = pi.max(1)[1].view(1, 1).item() states.append(torch.from_numpy(self.env.last_state)) actions_prob.append(torch.squeeze(pi, dim=0)) last_action_rewards.append(lar) actions.append(action_index) values.append(v) prev_state = self.env.last_state new_state, reward, terminal, pixel_change = self.env.step( action_index) frame = ExperienceFrame(prev_state, reward, action_index, terminal, pixel_change, last_action, last_reward) # Store to experience self.memory.add_frame(frame) # self.episode_reward += reward rewards.append(reward) self.update_lstm_state() if terminal: self.env.reset() break R = torch.zeros(1, 1) if not terminal_end: state = torch.from_numpy(new_state).unsqueeze(0) lar = torch.from_numpy(frame.get_action_reward( self.action_size)).unsqueeze(0) value, _, _ = self.model(task_type='a3c', states=state, hx=self.hx, cx=self.cx, last_action_rewards=lar) R = value.data # 构造误差项 actions.reverse() rewards.reverse() values.reverse() batch_a = [] batch_adv = [] batch_R = [] for (ai, ri, Vi) in zip(actions, rewards, values): R = ri + self.args.gamma * R adv = R - Vi a = np.zeros([self.action_size], dtype=np.float32) a[ai] = 1.0 batch_a.append(torch.from_numpy(a)) batch_adv.append(adv) batch_R.append(R) batch_a.reverse() batch_adv.reverse() batch_R.reverse() # 转换为张量 return batch_a, batch_adv, batch_R, last_action_rewards, states, actions_prob, values def a3c_loss(self, batch_a, batch_adv, batch_R, last_action_rewards, states, actions_prob, values): batch_a = torch.stack(batch_a) # batch, 6 batch_adv = torch.stack(batch_adv) # 
batch,1,1 last_action_rewards = torch.stack(last_action_rewards) # batch,7 batch_R = torch.stack(batch_R) # batch,1,1 states = torch.stack(states) # batch,3,84,84 actions_prob = torch.stack(actions_prob) # batch,6 values = torch.stack(values) # 损失函数 log_pi = torch.log(torch.clamp(actions_prob, min=1e-20, max=1.0)) entropy = -torch.sum(log_pi * actions_prob, dim=1) # 对应的 a_i 的概率 log_pi_a_i = torch.sum(torch.mul(log_pi, batch_a), dim=1) policy_loss = torch.sum(log_pi_a_i * batch_adv + entropy * 0.001) # value_loss value_loss = 0.5 * F.mse_loss(batch_R, values) return policy_loss + value_loss def action_train(self): value, logit, (self.hx, self.cx) = self.model( (Variable(self.state.unsqueeze(0)), (self.hx, self.cx))) prob = F.softmax(logit, dim=1) log_prob = F.log_softmax(logit, dim=1) entropy = -(log_prob * prob).sum(1) self.entropies.append(entropy) action = prob.multinomial(1).data log_prob = log_prob.gather(1, Variable(action)) state, self.reward, self.done, self.info = self.env.step( action.cpu().numpy()) self.state = torch.from_numpy(state).float() if self.gpu_id >= 0: with torch.cuda.device(self.gpu_id): self.state = self.state.cuda() self.reward = max(min(self.reward, 1), -1) self.values.append(value) self.log_probs.append(log_prob) self.rewards.append(self.reward) return self def action_test(self): with torch.no_grad(): self.update_lstm_state() state = torch.from_numpy(self.env.last_state).unsqueeze(0) last_action = self.env.last_action last_reward = np.clip(self.env.last_reward, -1, 1) last_action_reward = ExperienceFrame.concat_action_and_reward( last_action, self.action_size, last_reward) lar = torch.from_numpy(last_action_reward) v, pi, (self.hx, self.cx) = self.model(task_type='a3c', states=state, hx=self.hx, cx=self.cx, last_action_rewards=lar.unsqueeze(0)) prob = F.softmax(pi, dim=1) action = prob.max(1)[1].data.cpu().numpy() state, self.reward, self.done, pixel_change = self.env.step(action[0]) self.info = 5 self.state = torch.from_numpy(state).float() if self.gpu_id >= 0: with torch.cuda.device(self.gpu_id): self.state = self.state.cuda() self.eps_len += 1 return self def update_lstm_state(self): if self.done: if self.gpu_id >= 0: with torch.cuda.device(self.gpu_id): self.cx = Variable(torch.zeros(1, 256).cuda()) self.hx = Variable(torch.zeros(1, 256).cuda()) else: self.cx = Variable(torch.zeros(1, 256)) self.hx = Variable(torch.zeros(1, 256)) else: self.cx = Variable(self.cx.data) self.hx = Variable(self.hx.data) def clear_actions(self): self.values = [] self.log_probs = [] self.rewards = [] self.entropies = [] return self
class Trainer(object): def __init__(self, thread_index, global_network, initial_learning_rate, learning_rate_input, grad_applier, env_type, env_name, use_pixel_change, use_value_replay, use_reward_prediction, pixel_change_lambda, entropy_beta, local_t_max, gamma, gamma_pc, experience_history_size, max_global_time_step, device): self.thread_index = thread_index self.learning_rate_input = learning_rate_input self.env_type = env_type self.env_name = env_name self.use_pixel_change = use_pixel_change self.use_value_replay = use_value_replay self.use_reward_prediction = use_reward_prediction self.local_t_max = local_t_max self.gamma = gamma self.gamma_pc = gamma_pc self.experience_history_size = experience_history_size self.max_global_time_step = max_global_time_step self.action_size = Environment.get_action_size(env_type, env_name) self.local_network = UnrealModel(self.action_size, thread_index, use_pixel_change, use_value_replay, use_reward_prediction, pixel_change_lambda, entropy_beta, device) self.local_network.prepare_loss() self.apply_gradients = grad_applier.minimize_local( self.local_network.total_loss, global_network.get_vars(), self.local_network.get_vars()) self.sync = self.local_network.sync_from(global_network) self.experience = Experience(self.experience_history_size) self.local_t = 0 self.initial_learning_rate = initial_learning_rate self.episode_reward = 0 # For log output self.prev_local_t = 0 def prepare(self): print('') print('trainer creating env...') print('') self.environment = Environment.create_environment( self.env_type, self.env_name) def stop(self): self.environment.stop() def _anneal_learning_rate(self, global_time_step): learning_rate = self.initial_learning_rate * ( self.max_global_time_step - global_time_step) / self.max_global_time_step if learning_rate < 0.0: learning_rate = 0.0 return learning_rate def choose_action(self, pi_values): return np.random.choice(range(len(pi_values)), p=pi_values) def _record_score(self, sess, summary_writer, summary_op, score_input, score, global_t): summary_str = sess.run(summary_op, feed_dict={score_input: score}) summary_writer.add_summary(summary_str, global_t) summary_writer.flush() def set_start_time(self, start_time): self.start_time = start_time def _fill_experience(self, sess): """ Fill experience buffer until buffer is full. """ prev_state = self.environment.last_state last_action = self.environment.last_action last_reward = self.environment.last_reward last_action_reward = ExperienceFrame.concat_action_and_reward( last_action, self.action_size, last_reward) pi_, _ = self.local_network.run_base_policy_and_value( sess, self.environment.last_state, last_action_reward) action = self.choose_action(pi_) new_state, reward, terminal, pixel_change = self.environment.process( action) #print('action:', action, terminal) frame = ExperienceFrame(prev_state, reward, action, terminal, pixel_change, last_action, last_reward) self.experience.add_frame(frame) if terminal: self.environment.reset() if self.experience.is_full(): self.environment.reset() print("Replay buffer filled") def _print_log(self, global_t): if (self.thread_index == 0) and (self.local_t - self.prev_local_t >= PERFORMANCE_LOG_INTERVAL): self.prev_local_t += PERFORMANCE_LOG_INTERVAL elapsed_time = time.time() - self.start_time steps_per_sec = global_t / elapsed_time print( "### Performance : {} STEPS in {:.0f} sec. {:.0f} STEPS/sec. 
{:.2f}M STEPS/hour" .format(global_t, elapsed_time, steps_per_sec, steps_per_sec * 3600 / 1000000.)) def _process_base(self, sess, global_t, summary_writer, summary_op, score_input): # [Base A3C] states = [] last_action_rewards = [] actions = [] rewards = [] values = [] terminal_end = False start_lstm_state = self.local_network.base_lstm_state_out # t_max times loop for _ in range(self.local_t_max): # Prepare last action reward last_action = self.environment.last_action last_reward = self.environment.last_reward last_action_reward = ExperienceFrame.concat_action_and_reward( last_action, self.action_size, last_reward) pi_, value_ = self.local_network.run_base_policy_and_value( sess, self.environment.last_state, last_action_reward) action = self.choose_action(pi_) states.append(self.environment.last_state) last_action_rewards.append(last_action_reward) actions.append(action) values.append(value_) if (self.thread_index == 0) and (self.local_t % LOG_INTERVAL == 0): print("pi={}".format(pi_)) print(" V={}".format(value_)) prev_state = self.environment.last_state # Process game new_state, reward, terminal, pixel_change = self.environment.process( action) frame = ExperienceFrame(prev_state, reward, action, terminal, pixel_change, last_action, last_reward) # Store to experience self.experience.add_frame(frame) self.episode_reward += reward rewards.append(reward) self.local_t += 1 if terminal: terminal_end = True print("score={}".format(self.episode_reward)) self._record_score(sess, summary_writer, summary_op, score_input, self.episode_reward, global_t) self.episode_reward = 0 self.environment.reset() self.local_network.reset_state() break R = 0.0 if not terminal_end: R = self.local_network.run_base_value( sess, new_state, frame.get_action_reward(self.action_size)) actions.reverse() states.reverse() rewards.reverse() values.reverse() batch_si = [] batch_a = [] batch_adv = [] batch_R = [] for (ai, ri, si, Vi) in zip(actions, rewards, states, values): R = ri + self.gamma * R adv = R - Vi a = np.zeros([self.action_size]) a[ai] = 1.0 batch_si.append(si) batch_a.append(a) batch_adv.append(adv) batch_R.append(R) batch_si.reverse() batch_a.reverse() batch_adv.reverse() batch_R.reverse() return batch_si, last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state def _process_pc(self, sess): # [pixel change] # Sample 20+1 frame (+1 for last next state) pc_experience_frames = self.experience.sample_sequence( self.local_t_max + 1) # Reverse sequence to calculate from the last pc_experience_frames.reverse() batch_pc_si = [] batch_pc_a = [] batch_pc_R = [] batch_pc_last_action_reward = [] pc_R = np.zeros([20, 20], dtype=np.float32) if not pc_experience_frames[1].terminal: pc_R = self.local_network.run_pc_q_max( sess, pc_experience_frames[0].state, pc_experience_frames[0].get_last_action_reward( self.action_size)) for frame in pc_experience_frames[1:]: pc_R = frame.pixel_change + self.gamma_pc * pc_R a = np.zeros([self.action_size]) a[frame.action] = 1.0 last_action_reward = frame.get_last_action_reward(self.action_size) batch_pc_si.append(frame.state) batch_pc_a.append(a) batch_pc_R.append(pc_R) batch_pc_last_action_reward.append(last_action_reward) batch_pc_si.reverse() batch_pc_a.reverse() batch_pc_R.reverse() batch_pc_last_action_reward.reverse() return batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R def _process_vr(self, sess): # [Value replay] # Sample 20+1 frame (+1 for last next state) vr_experience_frames = self.experience.sample_sequence( self.local_t_max + 1) # Reverse 
sequence to calculate from the last vr_experience_frames.reverse() batch_vr_si = [] batch_vr_R = [] batch_vr_last_action_reward = [] vr_R = 0.0 if not vr_experience_frames[1].terminal: vr_R = self.local_network.run_vr_value( sess, vr_experience_frames[0].state, vr_experience_frames[0].get_last_action_reward( self.action_size)) # t_max times loop for frame in vr_experience_frames[1:]: vr_R = frame.reward + self.gamma * vr_R batch_vr_si.append(frame.state) batch_vr_R.append(vr_R) last_action_reward = frame.get_last_action_reward(self.action_size) batch_vr_last_action_reward.append(last_action_reward) batch_vr_si.reverse() batch_vr_R.reverse() batch_vr_last_action_reward.reverse() return batch_vr_si, batch_vr_last_action_reward, batch_vr_R def _process_rp(self): # [Reward prediction] rp_experience_frames = self.experience.sample_rp_sequence() # 4 frames batch_rp_si = [] batch_rp_c = [] for i in range(3): batch_rp_si.append(rp_experience_frames[i].state) # one hot vector for target reward r = rp_experience_frames[3].reward rp_c = [0.0, 0.0, 0.0] if r == 0: rp_c[0] = 1.0 # zero elif r > 0: rp_c[1] = 1.0 # positive else: rp_c[2] = 1.0 # negative batch_rp_c.append(rp_c) return batch_rp_si, batch_rp_c def process(self, sess, global_t, summary_writer, summary_op, score_input): # Fill experience replay buffer if not self.experience.is_full(): self._fill_experience(sess) return 0 start_local_t = self.local_t cur_learning_rate = self._anneal_learning_rate(global_t) # Copy weights from shared to local sess.run(self.sync) # [Base] batch_si, batch_last_action_rewards, batch_a, batch_adv, batch_R, start_lstm_state = \ self._process_base(sess, global_t, summary_writer, summary_op, score_input) feed_dict = { self.local_network.base_input: batch_si, self.local_network.base_last_action_reward_input: batch_last_action_rewards, self.local_network.base_a: batch_a, self.local_network.base_adv: batch_adv, self.local_network.base_r: batch_R, self.local_network.base_initial_lstm_state: start_lstm_state, # [common] self.learning_rate_input: cur_learning_rate } # [Pixel change] if self.use_pixel_change: batch_pc_si, batch_pc_last_action_reward, batch_pc_a, batch_pc_R = self._process_pc( sess) pc_feed_dict = { self.local_network.pc_input: batch_pc_si, self.local_network.pc_last_action_reward_input: batch_pc_last_action_reward, self.local_network.pc_a: batch_pc_a, self.local_network.pc_r: batch_pc_R } feed_dict.update(pc_feed_dict) # [Value replay] if self.use_value_replay: batch_vr_si, batch_vr_last_action_reward, batch_vr_R = self._process_vr( sess) vr_feed_dict = { self.local_network.vr_input: batch_vr_si, self.local_network.vr_last_action_reward_input: batch_vr_last_action_reward, self.local_network.vr_r: batch_vr_R } feed_dict.update(vr_feed_dict) # [Reward prediction] if self.use_reward_prediction: batch_rp_si, batch_rp_c = self._process_rp() rp_feed_dict = { self.local_network.rp_input: batch_rp_si, self.local_network.rp_c_target: batch_rp_c } feed_dict.update(rp_feed_dict) # Calculate gradients and copy them to global network. sess.run(self.apply_gradients, feed_dict=feed_dict) self._print_log(global_t) # Return advanced local step size diff_local_t = self.local_t - start_local_t return diff_local_t
from visdom import Visdom

viz = Visdom()

# Build Environment Template -> lazily evaluated callable for spawning environments
env_template = build_env(args.env)

# Build Distributed Environments
envs = get_distributed_backend(env_template, args.num_processes,
                               backend=args.distributed_backend)

# Obtain Environment metadata
metadata = envs.get_metadata()

# Instantiate Policy
policy = get_policy(args.policy, metadata)

# Create agent, with the given training algorithm
agent = get_algorithm(args.algorithm, policy, envs, args, visdom=viz)

# Create Experience Buffer, with the environment metadata
experience = Experience(metadata['max_episode_length'], args.num_processes,
                        metadata['obs_shape'], metadata['action_type'],
                        metadata['action_shape'])

# Train agent
agent.train(num_frames=args.num_frames)

import IPython
IPython.embed()
def update_learner(self):
    sample_idxs, weights, sample = self.replay_buffer.sample(self.sample_size)
    loss, learner_info = self.learner.update(
        Experience.training_items(sample), weights)
    self.replay_buffer.update(sample_idxs, loss)
    return loss, learner_info
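# update_learner above relies on a prioritized replay buffer whose sample()
# returns (indices, importance weights, experiences) and whose update() writes
# new priorities back for those indices. A minimal sketch of that interface;
# the class, its proportional-priority scheme, and the uniform use of a scalar
# loss are assumptions, not the project's actual buffer.
import random
from typing import List, Tuple


class SimplePrioritizedBuffer:
    def __init__(self):
        self.items: List[object] = []
        self.priorities: List[float] = []

    def add(self, item, priority: float = 1.0) -> None:
        self.items.append(item)
        self.priorities.append(priority)

    def sample(self, k: int) -> Tuple[List[int], List[float], List[object]]:
        # Draw indices proportionally to priority; the returned importance
        # weights (1 / (N * P(i)), beta = 1) correct the sampling bias.
        idxs = random.choices(range(len(self.items)),
                              weights=self.priorities, k=k)
        total = sum(self.priorities)
        weights = [total / (len(self.items) * self.priorities[i]) for i in idxs]
        return idxs, weights, [self.items[i] for i in idxs]

    def update(self, idxs: List[int], loss) -> None:
        # Feed the new loss back in as the priority of the sampled items
        # (a per-sample loss could be indexed here instead of a scalar).
        for i in idxs:
            self.priorities[i] = float(loss) + 1e-6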
def null_experience_list(self, count=100):
    return [Experience(None, None, None, None, None) for _ in range(count)]