def get_demonstration(self, fold):
    state1 = self.get_instance(fold)
    action1 = (0, 2)
    state2, r1, _ = state1.step(action1)
    action2 = (0, state1.target)
    state3, r2, _ = state2.step(action2)
    assert r2 == 1
    ep = []
    ep.append(Experience(state1, None, action1, state2, None, r1, False))
    ep.append(Experience(state2, None, action2, state3, None, r2, True))
    return ep
def get_experience(self, key):
    with dbapi2.connect(self.app.config['dsn']) as connection:
        cursor = connection.cursor()
        query = ("SELECT TITLE, USERNAME, START, FINISH, PERIOD, LENGTH "
                 "FROM EXPERIENCE WHERE (ID = %s)")
        cursor.execute(query, (key,))
        title, username, start, finish, period, length = cursor.fetchone()
        return Experience(title, username, start, finish, period, length)
def _do_tr_rollout(code_agent, desc_agent, task, rollout_ph, model,
                   desc_model, desc_to_code, code_to_desc, session, config,
                   h0, z0, fold, mode):
    worlds = [
        task.get_instance(fold)
        for _ in range(config.trainer.n_rollout_episodes)
    ]
    done = [False] * config.trainer.n_rollout_episodes
    episodes = [[] for _ in range(config.trainer.n_rollout_episodes)]
    hs, zs = h0, z0
    dhs = h0
    for t in range(config.trainer.n_timeout):
        hs_, zs_, qs = session.run(
            [model.tt_rollout_h, model.tt_rollout_z, model.tt_rollout_q],
            rollout_ph.feed(hs, zs, dhs, worlds, task, config))
        dhs_, dqs = session.run(
            [desc_model.tt_rollout_h, desc_model.tt_rollout_q],
            rollout_ph.feed(hs, zs, dhs, worlds, task, config))
        for i in range(config.trainer.n_rollout_episodes):
            if done[i]:
                continue
            actions = [None, None]
            actions[code_agent] = np.argmax(qs[code_agent][i, :])
            actions[desc_agent] = np.argmax(dqs[desc_agent][i, :])
            world_, reward, done_ = worlds[i].step(actions)
            code = desc_to_code(world_.l_msg[code_agent], mode)[0]
            zs_[desc_agent][i, :] = code
            l_words = code_to_desc(zs_[code_agent][i, :], mode)[:5]
            l_msg = np.zeros(len(task.lexicon))
            for l_word in l_words:
                l_msg[task.lexicon.index(l_word)] += 1
            l_msg /= np.sum(l_msg)
            world_.l_msg = list(world_.l_msg)
            world_.l_msg[desc_agent] = l_msg
            world_.l_msg = tuple(world_.l_msg)
            episodes[i].append(
                Experience(worlds[i], None, tuple(actions), world_, None,
                           reward, done_))
            worlds[i] = world_
            done[i] = done_
        if config.evaluator.simulate_l:
            assert False
        hs = hs_
        zs = zs_
        dhs = dhs_
        if all(done):
            break
    return (sum(e.r for ep in episodes for e in ep) * 1. /
            config.trainer.n_rollout_episodes,
            sum(ep[-1].s2.success for ep in episodes) * 1. /
            config.trainer.n_rollout_episodes)
def add_grains(self, grains: List[List[Image.Image]]):
    """Add new grains to memory.

    Params:
        grains: List[List[Image.Image]] 2D list of new grains

    Returns:
        2D list of novelty scores for the new grains
    """
    # Currently we only allow a 2x2 grid, i.e. 4 grains in total.
    assert len(grains) == 2
    assert len(grains[0]) == 2
    nov_list = []
    for row in grains:
        temp_nov = []
        for g in row:
            grain_tf = self._grain_to_tensor(g)
            # Reshape to (1, H, W, C) to form a single-element batch.
            grain_tf = tf.reshape(
                grain_tf,
                (1, grain_tf.shape[0], grain_tf.shape[1], grain_tf.shape[2]))
            predicted_grain = self._network(grain_tf)
            nov = self.novelty_function(grain_tf, predicted_grain).numpy()
            temp_nov.append(nov)
            self._memory.push(Experience(nov, g))
        nov_list.append(temp_nov)
    return nov_list
def run_episode(self, max_steps, train=True):
    """
    Executes a single episode.

    Params
    ======
        max_steps (integer): The maximum time steps to run in a single
            episode.
        train (Boolean): If true, run the episode in train mode. If false,
            run in eval mode.
    """
    env_info = self.env.reset(train_mode=train)[self.brain_name]
    states = env_info.vector_observations
    scores = np.zeros(len(states))
    for _ in range(max_steps):
        actions = self.agent.act(states, noise=train)
        env_info = self.env.step(actions)[self.brain_name]
        next_states = env_info.vector_observations
        rewards = env_info.rewards
        dones = env_info.local_done
        scores += rewards
        self.agent.learn(
            Experience(states, actions, rewards, next_states, dones))
        if dones[0]:
            break
        states = next_states
    self.agent.end_episode()
    return scores.max()
def get_myexperiences(self, name):
    with dbapi2.connect(self.app.config['dsn']) as connection:
        cursor = connection.cursor()
        query = "SELECT * FROM EXPERIENCE WHERE (USERNAME = %s)"
        cursor.execute(query, (name,))
        experiences = [(key, Experience(title, username, start, finish,
                                        period, length))
                       for key, title, username, start, finish, period,
                       length, userid, date in cursor]
        return experiences
def run(env, num_episodes, num_time_steps, replay_batch_size,
        scores_filename=None):
    exploration = EpsilonGreedy(epsilon_start=1.0, epsilon_min=0.01,
                                epsilon_decay=0.999)
    # [Mnih 2015] used:
    # - replay over 2% of the total experience
    # - batch size of 32
    # - minimum replay start size of 0.1% of experience
    experience_max_size = int(num_episodes * num_time_steps * 0.02)
    replay_start_size = int(num_episodes * num_time_steps * 0.001)
    experience_replay = Experience(max_size=experience_max_size,
                                   batch_size=replay_batch_size,
                                   replay_start_size=replay_start_size)
    # experience_replay = PrioritizedExperience(
    #     max_size=experience_max_size, batch_size=replay_batch_size,
    #     replay_start_size=replay_start_size, initial_td_error=10,
    #     alpha=0.4, beta=0.4, anneal_rate=0.95, epsilon=0.001)
    model = ExampleModel(state_size=env.state_size,
                         action_size=env.action_size,
                         learning_rate=0.001)
    model.build()
    target_model = ExampleModel(state_size=env.state_size,
                                action_size=env.action_size,
                                learning_rate=0.001)
    target_model.build()
    qmodel = QModel(model=model, target_model=target_model,
                    experience_replay=experience_replay, tau=0.1,
                    use_double_q=True)
    agent = QAgent(state_size=env.state_size, action_size=env.action_size,
                   model=qmodel, exploration=exploration, discount_rate=0.95)
    scores = np.empty((num_episodes,))
    time_start = time.time()
    for e in range(num_episodes):
        scores[e] = agent.train(env=env, episode_length=num_time_steps)
        print('episode: {}/{}, score: {}, e: {:.2}'.format(
            e + 1, num_episodes, scores[e], agent.exploration.epsilon))
    time_end = time.time()
    # mean over the final 10% of episodes, i.e. from the 90% mark onward
    print('Average score for last 10% of episodes:',
          np.mean(scores[int(np.floor(num_episodes * 0.9)):]))
    print('Time taken:', time_end - time_start, 'seconds')
    if scores_filename is not None:
        np.savetxt(scores_filename, scores, delimiter=',')
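# A hedged usage sketch for run() above, not from the original source:
# `ExampleEnv` is a placeholder for any environment exposing state_size and
# action_size as the function assumes, and the argument values are
# illustrative.
if __name__ == '__main__':
    env = ExampleEnv()  # hypothetical environment
    run(env, num_episodes=500, num_time_steps=200, replay_batch_size=32,
        scores_filename='scores.csv')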
def search_experience(self, keyword):
    with dbapi2.connect(self.app.config['dsn']) as connection:
        cursor = connection.cursor()
        query = ("SELECT * FROM EXPERIENCE "
                 "WHERE (TITLE ILIKE %s OR START ILIKE %s OR FINISH ILIKE %s) "
                 "ORDER BY ID")
        keyword = '%' + keyword + '%'
        cursor.execute(query, (keyword, keyword, keyword))
        experiences = [(key, Experience(title, username, start, finish,
                                        period, length))
                       for key, title, username, start, finish, period,
                       length, userid, date in cursor]
        return experiences
def read_experience():
    experience = Experience()
    experience.employer = input("Enter name of employer: ")
    experience.title = input("Enter title: ")
    experience.responsibilities = input("Enter responsibilities: ")
    experience.duration_years = int(
        input("Enter number of years of experience: "))
    return experience
def extract_tensors(experiences):
    # Convert a batch of Experiences to an Experience of batches
    batch = Experience(*zip(*experiences))
    t1 = torch.cat(batch.state)
    t2 = torch.cat(batch.action)
    t3 = torch.cat(batch.reward)
    t4 = torch.cat(batch.next_state)
    return t1, t2, t3, t4
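# A minimal Experience definition consistent with extract_tensors above; this
# is a sketch, not necessarily the original class. The
# Experience(*zip(*experiences)) transpose only works for a namedtuple-like
# type, and the four field names are inferred from the attribute accesses in
# the function.
from collections import namedtuple

Experience = namedtuple('Experience',
                        ('state', 'action', 'reward', 'next_state'))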
def to_experiences(self, states, actions, rewards, next_states, dones):
    """
    Turns vectors representing components of multiple experiences into a
    vector of Experience objects.
    """
    experiences = []
    for (state, action, reward, next_state,
         done) in zip(states, actions, rewards, next_states, dones):
        experiences.append(
            Experience(state, action, reward, next_state, done))
    return experiences
def generate_experience(self, nr, serialize=False):
    result = []
    for x in range(nr):
        domain = self.__domains[random.randint(0, len(self.__domains) - 1)]
        projects = self.generate_projects(random.randint(1, 5), False)
        exp = Experience(domain, random.randint(1, 15), projects)
        result.append(exp)
    if serialize:
        return [exp.serialize() for exp in result]
    else:
        return result
def __init__(self, num_action, frame_height, frame_width, rng, network_type,
             algorithm, network_file=None, num_ignore=0, exp_file=None):
    self.rng = rng
    self.num_action = num_action
    self.mbsize = Agent.MINIBATCH_SIZE
    self.validate_size = Agent.VALID_SIZE
    self.num_train_obs = 0
    self.network_type = network_type
    self.eps_decay = (Agent.FINAL_EXPLORE - Agent.INIT_EXPLORE) \
        / Agent.EXPLORE_FRAMES
    self.validate_states = None
    self.exp_file = exp_file
    if exp_file is not None:
        with open(exp_file, 'rb') as f:
            npz = np.load(f)  # load from the already-open handle
            self.num_train_obs = np.sum(npz['num_train_obs'])
            self.validate_states = npz['validate_states']
            self.exp_train = Experience(Agent.REPLAY_SIZE, frame_height,
                                        frame_width, Agent.HISTORY, rng, npz)
    else:
        self.exp_train = Experience(Agent.REPLAY_SIZE, frame_height,
                                    frame_width, Agent.HISTORY, rng)
    self.exp_eval = Experience(Agent.HISTORY + 1, frame_height, frame_width,
                               Agent.HISTORY, rng)
    self.network = Network(num_action, self.mbsize, Agent.HISTORY,
                           frame_height, frame_width, Agent.DISCOUNT,
                           Agent.UPDATE_FREQ, rng, network_type, algorithm,
                           network_file, num_ignore)
def __init__(self, env=None, agent=None, logdir=None, should_render=None,
             should_load=None):
    self.env = env
    self.agent = agent
    self.config = self.agent.config
    self.logdir = logdir
    self.should_render = should_render
    self.experience = Experience(self.config)
    if should_load:
        self.load()
    else:
        self.step = 0
        self.epsilon = 0.3
        self.train_rewards = [0] * 100
        self.current_episode = 0
def main():
    from experience import Experience
    from visualization import hist_classes, scatter_classes

    class_count_list = []
    agents = [
        'RainbowAgent', 'SimpleAgent', 'SecondAgent', 'ProbabilisticAgent'
    ]
    for agent in agents:
        exp = Experience(agent, load=True)
        labels, _, examples, _ = exp.load()
        class_count, _ = divide_and_count(examples, labels)
        class_count_list.append(class_count)
    scatter_classes(class_count_list, agents)
def run_episode(self):
    self.env.reset()
    self.env.random_start()
    t = 0
    experiences = []
    while not self.env.terminal:
        # predict action, value
        action, value = self.predict(self.env.state)
        prev_state = self.env.state  # capture the state before stepping
        self.env.step(action)
        experience = Experience(prev_state, action, self.env.reward, None,
                                self.env.terminal)
        experiences.append(experience)
        yield experience
        t += 1
def run_episode(self, train=True):
    env_info = self.env.reset(train_mode=train)[self.brain_name]
    state = env_info.vector_observations[0]
    score = 0
    while True:
        action = self.agent.act(state, explore=train)
        env_info = self.env.step(action)[self.brain_name]
        next_state = env_info.vector_observations[0]
        reward = env_info.rewards[0]
        done = env_info.local_done[0]
        score += reward
        if train:
            self.agent.learn(
                Experience(state, action, reward, next_state, done))
        state = next_state
        if done:
            break
    return score
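# A hedged usage sketch, not from the original source: a driver that calls
# run_episode above for a fixed number of episodes and tracks the scores.
# `runner` is any object exposing run_episode; the name and episode count
# are illustrative.
def train_loop(runner, n_episodes=300):
    scores = []
    for _ in range(n_episodes):
        scores.append(runner.run_episode(train=True))  # learn while acting
    return scores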
def main(args):
    with tf.device(args['device']):
        # tf
        tf.set_random_seed(args['rand_seed'])
        sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        # env
        env = gym.make('TestEnv-v0')
        env.seed(args['rand_seed'])
        s_dim = env.observation_space.shape[0]
        a_dim = env.action_space.shape[0]
        concat_dim = 2
        batched_s_dim = [None, s_dim, concat_dim]
        batched_a_dim = [None, a_dim]
        # agents
        actor = Actor(sess, args['actor_lr'], args['tau'],
                      args['batch_size'], args['clip_val'], batched_s_dim,
                      batched_a_dim)
        critic = Critic(sess, args['critic_lr'], args['tau'],
                        args['clip_val'], batched_s_dim, batched_a_dim)
        # experience
        exp = Experience(args['buffer_size'], args['batch_size'],
                         args['rand_seed'])
        # noise
        actor_noise = ActorNoise(actor.predict, a_dim,
                                 noise_type=args['noise_type'])
        # initialize
        init = tf.global_variables_initializer()
        sess.run(init)
        saver = Model(sess, args['restore_path'])
        saver.restore_model()
        # training
        her = HER(saver, exp, env, actor, critic, actor_noise)
        if args['mode'] == 'train':
            her.train(args['gamma'], args['her_k'], args['max_episodes'],
                      args['max_episode_len'], args['replay_len'])
        else:
            her.play(args['max_episodes'], args['max_episode_len'])
def run_episode(self):
    self.env.reset()
    self.history.add(self.env.state)
    random_start_steps = max(self.config.history_length,
                             self.env.random_start_steps)
    for _ in range(random_start_steps):
        self.env.step(self.env.random_step())
        self.history.add(self.env.state)
    t = 0
    while not self.env.terminal:
        # predict action, value
        prev_state = self.env.state
        action = self.predict(self.history.get())
        self.env.step(action)
        experience = Experience(prev_state, action, self.env.reward,
                                self.env.state, self.env.terminal)
        yield experience
        t += 1
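# Sketch under assumptions, not original code: because this run_episode is a
# generator, a caller can drain it into a replay buffer as the episode
# unfolds. `replay` is a hypothetical buffer exposing an add() method.
def collect_episode(runner, replay):
    for experience in runner.run_episode():
        replay.add(experience)  # store each Experience as it is produced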
def __init__(self, model, env, action_size, args, state):
    self.model = model
    self.env = env
    self.action_size = action_size
    self.state = state
    self.hx = None
    self.cx = None
    self.eps_len = 0
    self.args = args
    self.values = []
    self.log_probs = []
    self.rewards = []
    self.entropies = []
    self.done = True  # initial value; lets a fresh state be set once
    self.info = None
    self.reward = 0
    self.gpu_id = -1
    # replay memory
    self.memory = Experience(history_size=2000)
def index():
    app.logger.info("Connection from %s" %
                    str(request.environ['REMOTE_ADDR']))
    database = databaseconnector.databaseObject(app)
    skills = dict(database.query("SELECT title, rating FROM skills"))
    tools = dict(database.query("SELECT title, rating FROM tools"))
    languages = dict(database.query("SELECT title, rating FROM languages"))
    experience = database.query(
        "SELECT title, dates, shortDescription, longDescription, image, id "
        "FROM experience")
    experienceStructs = []
    for line in experience:
        newStruct = Experience(line[0], line[1], line[2], line[3], line[4],
                               'a' + str(line[5]))
        experienceStructs.append(newStruct)
    database.close()
    return render_template('index.html',
                           skills=skills,
                           tools=tools,
                           languages=languages,
                           experiences=experienceStructs)
def main():
    candidate_database = CandidateDatabase()
    candidate = Candidate()
    candidate.name = "Alicia Toomtest"
    candidate.title = "Python Developer"
    candidate.address = "Gothenburg, Sweden"
    candidate.phone = "0722879879"
    candidate.email = "*****@*****.**"
    candidate.hobbies = "Gardening"
    candidate.education = [
        Education(name="Education", school="School", level="Level"),
        Education(name="Education2", school="School2", level="Level2")
    ]
    candidate.experience = [
        Experience(employer="Volvo", title="Python developer",
                   responsibilities="code", duration_years="2018-present")
    ]
    candidate.note = Note()
    candidate.note.summary = "Gslf9ehdlsdfnjslsleofjfms,"
    candidate.note.comment = "dki9eufsklwodudndjskwoeifjdk"
    # print_candidate(candidate)
    candidate_database.add_candidate(candidate)
    find_result = candidate_database.find_candidates("toom")
    if len(find_result) > 0:
        print_candidates(find_result)
    else:
        print("No result found")
    print_candidate(candidate)
    return
def __init__(self, rank, args, shared_model, optimizer, lr):
    # CUDA-related setup
    self.gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if self.gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)
    self.replay_buffer = Experience(history_size=2000)
    self.cx = None  # TODO: still runs a forward pass on every single step
    self.hx = None
    self.episodic_score = 0
    self.rank = rank
    self.args = args
    self.shared_model = shared_model
    self.optimizer = optimizer
    self.local_t = 0  # initialization
    # initialize the environment
    print('Training Agent: {}'.format(self.rank))
    # TODO: the gym environment still needs pixel control, etc.
    # agent model
    model = UNREAL(in_channels=3, action_size=6, enable_pixel_control=True)
    if self.gpu_id >= 0:
        with torch.cuda.device(self.gpu_id):
            model = model.cuda()
    model.train()
    # learning rate
    self.initial_learning_rate = lr
    self.max_global_time_step = 10 * 10**7
    # for log output
    self.prev_local_t = 0
    self.model = model
    self.env = None
    self.reset()  # cx, hx
def __init__(self, thread_index, global_network, initial_learning_rate,
             learning_rate_input, grad_applier, env_type, env_name,
             use_pixel_change, use_value_replay, use_reward_prediction,
             pixel_change_lambda, entropy_beta, local_t_max, gamma, gamma_pc,
             experience_history_size, max_global_time_step, device):
    self.thread_index = thread_index
    self.learning_rate_input = learning_rate_input
    self.env_type = env_type
    self.env_name = env_name
    self.use_pixel_change = use_pixel_change
    self.use_value_replay = use_value_replay
    self.use_reward_prediction = use_reward_prediction
    self.local_t_max = local_t_max
    self.gamma = gamma
    self.gamma_pc = gamma_pc
    self.experience_history_size = experience_history_size
    self.max_global_time_step = max_global_time_step
    self.action_size = Environment.get_action_size(env_type, env_name)
    self.local_network = UnrealModel(self.action_size, thread_index,
                                     use_pixel_change, use_value_replay,
                                     use_reward_prediction,
                                     pixel_change_lambda, entropy_beta,
                                     device)
    self.local_network.prepare_loss()
    self.apply_gradients = grad_applier.minimize_local(
        self.local_network.total_loss, global_network.get_vars(),
        self.local_network.get_vars())
    self.sync = self.local_network.sync_from(global_network)
    self.experience = Experience(self.experience_history_size)
    self.local_t = 0
    self.initial_learning_rate = initial_learning_rate
    self.episode_reward = 0
    # For log output
    self.prev_local_t = 0
def qtest(model_eval, model_target, epsilon=0.05, n_vehicles=20, **opt):
    # global epsilon
    epoch = 0
    n_epoch = opt.get('n_epoch', 1500)
    max_memory = opt.get('max_memory', 1000)
    data_size = opt.get('data_size', 50)
    weights_file = opt.get('weights_file', "")
    start_time = datetime.datetime.now()
    n_vehicles = n_vehicles
    get_dest_count = 0
    failed_count = 0
    eval_train_time = 0
    # If you want to continue training from a previous model,
    # just supply the h5 file name to the weights_file option.
    if weights_file:
        print("loading weights from file: %s" % (weights_file,))
        model_eval.load_weights(weights_file)
    # Initialize experience replay object
    experience = Experience(model_eval, model_target, max_memory=max_memory)
    # records: track the various metrics.
    # reward metrics:
    records_of_total_reward = []
    records_of_veh_reward = []
    records_of_se_reward = []
    # the plain sum of veh_reward and se_reward, without weighting
    records_of_pure_total_reward = []
    # metrics to report:
    records_of_veh_drive = []
    records_of_veh_drive_speed = []
    records_of_se_delay = []
    records_of_se_SR = []
    veh_actions_record = [[] for veh in range(n_vehicles)]
    se_actions_record = [[] for veh in range(n_vehicles)]
    for epoch in range(n_epoch):
        qmaze.reset()
        # print("round finished, resetting qmaze")
        # print(qmaze.vehs_og_list)
        seenv.reset(qmaze)  # reset the SE-side environment state
        game_over = False
        # get initial envstate
        envstates_list = [qmaze.observe(veh) for veh in range(n_vehicles)]
        se_envstates_list = [
            seenv.SE_observe(veh, qmaze) for veh in range(n_vehicles)
        ]
        # reward_list accumulates the SE reward and the vehicle-driving
        # reward, i.e. the "total reward".
        reward_list = np.zeros((n_vehicles))
        veh_step_reward_list = np.zeros((n_vehicles))
        se_step_reward_list = np.zeros((n_vehicles))
        n_episodes = 0
        while not game_over:
            # zero the bookkeeping tag before each step
            qmaze.should_save = [0] * n_vehicles
            # zero the per-step cost lists before each step
            veh_step_cost_list = np.zeros((n_vehicles))
            se_step_cost_list = np.zeros((n_vehicles))
            pure_reward_list = np.zeros((n_vehicles))
            prev_envstates_list = envstates_list
            prev_se_envstates_list = se_envstates_list
            # per-cell vehicle counts before the vehicles act
            # _2nd_channel = background_vehs[np.newaxis, :]
            # background_vehs = background_vehs[np.newaxis, :]
            vehsnum_before_act = qmaze.count_cells_vehsnum()
            sesnum_before_act = seenv.SE_count(qmaze)
            actions = -1 * np.ones((n_vehicles))  # default -1: no action
            se_actions = -1 * np.ones((n_vehicles))  # default -1: no action
            nVehsCell_list = qmaze.count_cell_vehsnum_list()
            nSEsCell_list = seenv.SE_count_list(qmaze)
            # print(nVehsCell_list)
            # print(veh_actions_record)
            # print(nSEsCell_list)
            # print(se_actions_record)
            # Start of each time slot: count the vehicles in each cell first
            # so we know the congestion cost the vehicles and SEs may incur
            # during this slot. This accumulates the congestion cost before
            # it is folded into the reward.
            qmaze.get_veh_cost(veh_step_cost_list)
            seenv.get_se_cost(se_step_cost_list, qmaze)
            # compute the driving-quality metric drive_cost
            qmaze.get_drive_cost(background_vehs, nVehsCell_list)
            for veh in range(n_vehicles):
                if qmaze.status_flag[veh] != 0:
                    # vehicle already finished (1) or failed (-1);
                    # no further action is needed
                    continue
                else:
                    # this vehicle has not finished yet
                    if qmaze.last_act_done[veh] == 1:
                        # the vehicle just started or completed its previous
                        # action, so a new action must be assigned
                        valid_actions = qmaze.valid_actions(
                            qmaze.vehs_cur_list[veh])
                        qmaze.vehs_change_act[veh] = actions[veh] = np.argmax(
                            experience.predict_e(prev_envstates_list[veh],
                                                 vehsnum_before_act,
                                                 prev_se_envstates_list[veh],
                                                 sesnum_before_act))
                        seenv.SEs_next_mrg_list[veh] = se_actions[veh] = \
                            np.argmax(experience.predict_e_se(
                                prev_envstates_list[veh], vehsnum_before_act,
                                prev_se_envstates_list[veh],
                                sesnum_before_act))
                        veh_actions_record[veh].append(
                            qmaze.vehs_change_act[veh])
                        se_actions_record[veh].append(
                            seenv.SEs_next_mrg_list[veh])
                        # refresh time_remain_list, fetch reward_will_get,
                        # and record each vehicle's next destination
                        qmaze.get_some_will(actions[veh], veh)
                    if qmaze.last_act_done[veh] == 0:
                        # previous migration not finished: both SEs and
                        # vehicles stay in the GOING state
                        actions[veh] = GOING
                        se_actions[veh] = GOING
                        continue
            # end of each time slot
            nVehsCell_list = qmaze.count_cell_vehsnum_list()
            nSEsCell_list = seenv.SE_count_list(qmaze)
            vehcurpos = qmaze.vehs_cur_list
            for veh in range(n_vehicles):
                # check whether the vehicle is "finished": it reached its
                # destination or its accumulated reward is out of bounds
                if qmaze.status_flag[veh] != 0:
                    # finished: already at the destination or over the limit
                    continue
                else:
                    # the vehicle is not in the "finished" state
                    qmaze.time_remain_list[veh] -= 1
                    qmaze.action_times_list[veh] += 1
                    nVehsCell = nVehsCell_list[transfer_dict[vehcurpos[veh]]]
                    nSEsCell = nSEsCell_list[transfer_dict[
                        seenv.SE_curpos_list[veh]]]
                    veh_pos = vehcurpos[veh]
                    se_pos = seenv.SE_curpos_list[veh]
                    bv = seenv.SE_data_list[veh]
                    cpuneed = seenv.CPUNeedList[veh]
                    if qmaze.time_remain_list[veh] != 0:
                        # set last_act_done = 0: the previous action is
                        # still in progress
                        qmaze.last_act_done[veh] = 0
                        # no migration happened, so the rewards are 0;
                        # only the service delay is recorded
                        veh_step_reward_list[veh] = 0
                        # compute the service delay, record it (to inspect
                        # the total delay later), and record the service
                        # success rate
                        delay_time = seenv.sum_delay(nVehsCell, nSEsCell,
                                                     veh_pos, se_pos, bv,
                                                     cpuneed)
                        if delay_time < Delay_Threshold:
                            seenv.record_success_rate[veh].append(1)
                        else:
                            seenv.record_success_rate[veh].append(0)
                        seenv.record_delay_list[veh].append(delay_time)
                        seenv.SE_delay_list[veh] += delay_time
                        se_step_reward_list[veh] = 0
                        continue
                    if qmaze.time_remain_list[veh] == 0:
                        # the migration is about to complete: set
                        # last_act_done = 1 (previous action finished)
                        qmaze.last_act_done[veh] = 1
                        # qmaze.should_change[veh] = 1
                        # the vehicle reward is read directly from will_get;
                        # the SE side computes delay and reward
                        veh_step_reward_list[veh] = (
                            qmaze.reward_will_get[veh] +
                            veh_step_cost_list[veh])
                        qmaze.reward_will_get[veh] = 0
                        veh_step_cost_list[veh] = 0
                        # compute the service delay, record it, and record
                        # the service success rate
                        delay_time = seenv.sum_delay(nVehsCell, nSEsCell,
                                                     veh_pos, se_pos, bv,
                                                     cpuneed)
                        if delay_time < Delay_Threshold:
                            seenv.record_success_rate[veh].append(1)
                        else:
                            seenv.record_success_rate[veh].append(0)
                        seenv.record_delay_list[veh].append(delay_time)
                        seenv.SE_delay_list[veh] += delay_time
                        avg_delay = (seenv.SE_delay_list[veh] /
                                     qmaze.action_times_list[veh])
                        if avg_delay > Delay_Threshold:
                            delay_cost = -0.20
                        else:
                            delay_cost = 0
                        # once delay_cost has been computed, reset the
                        # accumulated delay
                        seenv.SE_delay_list[veh] = 0
                        se_step_reward_list[veh] = (delay_cost +
                                                    -1 * CostMrgS +
                                                    se_step_cost_list[veh])
                        # congestion cost counted into the reward is cleared
                        se_step_cost_list[veh] = 0
                        # accumulate the rewards, congestion cost included
                        qmaze.game_acc_veh_reward[veh] += \
                            veh_step_reward_list[veh]
                        seenv.game_acc_se_reward[veh] += \
                            se_step_reward_list[veh]
                        pure_reward_list[veh] = (veh_step_reward_list[veh] +
                                                 se_step_reward_list[veh])
                        reward_list[veh] = (
                            vr_weights * veh_step_reward_list[veh] +
                            (1 - vr_weights) * se_step_reward_list[veh])
                        qmaze.game_acc_reward[veh] += reward_list[veh]
                        qmaze.game_acc_pure_reward[veh] += \
                            pure_reward_list[veh]
                        # fail the game if the accumulated driving reward
                        # drops below the minimum requirement
                        if qmaze.game_acc_veh_reward[veh] < qmaze.min_reward:
                            # driving failed
                            qmaze.status_flag[veh] = -1
                        else:
                            # not failed yet, keep driving: update the veh
                            # and SE states from the chosen action. The
                            # arrival flag both updates the vehicle state
                            # and feeds the experience bookkeeping.
                            qmaze.update_state_single__(veh)
                            seenv.update_se_pos_(veh)
                        # should_save marks experience worth storing
                        qmaze.should_save[veh] = 1
            print("E:", n_episodes)
            print("cur:", qmaze.vehs_cur_list)
            print("remain:", qmaze.time_remain_list)
            print("next:", qmaze.vehs_next_go_list)
            envstates_list = [qmaze.observe(veh) for veh in range(n_vehicles)]
            se_envstates_list = [
                seenv.SE_observe(veh, qmaze) for veh in range(n_vehicles)
            ]
            # per-cell vehicle counts after the vehicles act
            vehsnum_after_act = qmaze.count_cells_vehsnum()
            sesnum_after_act = seenv.SE_count(qmaze)
            # print(qmaze.count_cell_vehsnum_list())
            # print(qmaze.acc_drive_cost)
            for veh in range(n_vehicles):
                if qmaze.should_save[veh] == 1:
                    # record one episode of experience
                    if qmaze.status_flag[veh] == 1:
                        get_dest = True
                    else:
                        get_dest = False
                    episode = [
                        prev_envstates_list[veh],
                        prev_se_envstates_list[veh],
                        qmaze.vehs_change_act[veh],
                        seenv.SEs_next_mrg_list[veh], reward_list[veh],
                        veh_step_reward_list[veh], se_step_reward_list[veh],
                        envstates_list[veh], se_envstates_list[veh],
                        get_dest, vehsnum_before_act, vehsnum_after_act,
                        sesnum_before_act, sesnum_after_act
                    ]
                    experience.remember(episode)
                else:
                    continue
            # reset the two counters
            get_dest_count = 0
            failed_count = 0
            # TODO: the termination condition needs revising.
            # Count the actors that have finished, whether 1 (success)
            # or -1 (failure).
            for veh in range(n_vehicles):
                if qmaze.status_flag[veh] == 1:
                    get_dest_count += 1
                elif qmaze.status_flag[veh] == -1:
                    failed_count += 1
            # The episode ends once every actor has either reached the
            # destination (not necessarily optimally) or failed to reach it.
            if get_dest_count + failed_count == n_vehicles:
                game_over = True
            else:
                game_over = False
            n_episodes += 1
        # collect the reward statistics
        sum_pure_total_reward = 0
        sum_total_reward = 0
        sum_veh_reward = 0
        sum_se_reward = 0
        for veh in qmaze.done_list:
            sum_total_reward += qmaze.game_acc_reward[veh]
            sum_veh_reward += qmaze.game_acc_veh_reward[veh]
            sum_se_reward += seenv.game_acc_se_reward[veh]
            sum_pure_total_reward += qmaze.game_acc_pure_reward[veh]
        if len(qmaze.done_list) == 0:
            records_veh = 0
            records_total = 0
            records_se = 0
            records_total_pure = 0
        else:
            records_veh = sum_veh_reward / len(qmaze.done_list)
            records_total = sum_total_reward / len(qmaze.done_list)
            records_se = sum_se_reward / len(qmaze.done_list)
            records_total_pure = sum_pure_total_reward / len(qmaze.done_list)
        records_of_total_reward.append(records_total)
        records_of_veh_reward.append(records_veh)
        records_of_se_reward.append(records_se)
        records_of_pure_total_reward.append(records_total_pure)
        # aggregate the vehicles' driving cost
        sum_veh_drive = 0
        sum_veh_drive_speed = 0
        # print(qmaze.acc_drive_cost)
        for veh in qmaze.done_list:
            qmaze.game_drive_cost[veh] = (qmaze.acc_drive_cost[veh] /
                                          qmaze.action_times_list[veh])
            qmaze.game_drive_speed[veh] = 1 / qmaze.game_drive_cost[veh]
            sum_veh_drive += (qmaze.acc_drive_cost[veh] /
                              qmaze.action_times_list[veh])
            sum_veh_drive_speed += qmaze.game_drive_speed[veh]
        if len(qmaze.done_list) == 0:
            records_drive = 0
            records_drive_speed = 0
        else:
            records_drive = sum_veh_drive / len(qmaze.done_list)
            records_drive_speed = sum_veh_drive_speed / len(qmaze.done_list)
        records_of_veh_drive.append(records_drive)
        records_of_veh_drive_speed.append(records_drive_speed)
        # per-vehicle average delay and service success rate for this round
        se_delay_avg = [0] * n_vehicles
        se_success_rate = [0] * n_vehicles
        for veh in qmaze.done_list:
            se_delay_avg[veh] = (sum(seenv.record_delay_list[veh]) /
                                 len(seenv.record_delay_list[veh]))
            se_success_rate[veh] = (sum(seenv.record_success_rate[veh]) /
                                    len(seenv.record_success_rate[veh]))
        # average the delay and success rate over the vehicles in this round
        AvgDelayforAll = 0
        SRforAll = 0
        SRcount = 0
        for veh in qmaze.done_list:
            AvgDelayforAll += se_delay_avg[veh]
            SRforAll += sum(seenv.record_success_rate[veh])
            SRcount += len(seenv.record_success_rate[veh])
        if len(qmaze.done_list) == 0:
            records_delay = 0
        else:
            records_delay = AvgDelayforAll / len(qmaze.done_list)
        if SRcount == 0:
            records_SR = 0
        else:
            records_SR = SRforAll / SRcount
        records_of_se_delay.append(records_delay)
        records_of_se_SR.append(records_SR)
        dt = datetime.datetime.now() - start_time
        t = format_time(dt.total_seconds())
        template = ("Epoch: {:03d}/{:d} | Episodes: {:d} | "
                    "GetDestCount: {:d}/{:d} | FailedCount: {:d}/{:d} | "
                    "time: {} | loss_weight: {}")
        print(template.format(epoch, n_epoch - 1, n_episodes,
                              get_dest_count, n_vehicles, failed_count,
                              n_vehicles, t, model_eval.loss_weights))
        print("Arrived vehs:", qmaze.done_list)
        print("SE_delay_avg:", se_delay_avg)
        print("SE_success_rate:", se_success_rate)
        print("veh_drive", qmaze.game_drive_speed)
        print("[AVG_Veh_drive]:", records_drive)
        print("[AVG_Veh_drive_speed]:", records_drive_speed)
    end_time = datetime.datetime.now()
    dt = datetime.datetime.now() - start_time
    seconds = dt.total_seconds()
    t = format_time(seconds)
    weight_rate = ('_' + str(vr_weights) + '_' + str(seenv.datasize_base) +
                   '_' + str(n_vehicles) + '_')
    # save the metrics collected during training
    parent_path = 'save_res0108/'
    method_name = 'merge_test' + weight_rate
    print("Saving Reward:")
    with open(parent_path + method_name + "total_reward.pl", 'wb') as f:
        print("recording total_reward..")
        pickle.dump(records_of_total_reward, f)
    with open(parent_path + method_name + "pure_total_reward.pl", 'wb') as f:
        print("recording pure total_reward..")
        pickle.dump(records_of_pure_total_reward, f)
    with open(parent_path + method_name + "veh_reward.pl", 'wb') as f:
        print("recording veh_reward..")
        pickle.dump(records_of_veh_reward, f)
    with open(parent_path + method_name + "se_reward.pl", 'wb') as f:
        print("recording se_reward..")
        pickle.dump(records_of_se_reward, f)
    print("Saving Index:")
    with open(parent_path + method_name + "veh_drive.pl", 'wb') as f:
        print("recording veh_drive..")
        pickle.dump(records_of_veh_drive, f)
    with open(parent_path + method_name + "veh_drive_speed.pl", 'wb') as f:
        print("recording veh_drive_speed..")
        pickle.dump(records_of_veh_drive_speed, f)
    with open(parent_path + method_name + "se_delay.pl", 'wb') as f:
        print("recording se_delay..")
        pickle.dump(records_of_se_delay, f)
    with open(parent_path + method_name + "se_SR.pl", 'wb') as f:
        print("recording se_SR..")
        pickle.dump(records_of_se_SR, f)
    # print(veh_actions_record)
    # print(se_actions_record)
    print(seenv.SE_data_list)
def experiences_page():
    if 'username' in session:
        if request.method == 'GET':
            experiences = app.store.get_experiences()
            now = datetime.datetime.now()
            with dbapi2.connect(app.config['dsn']) as connection:
                cursor = connection.cursor()
                cursor.execute("DELETE FROM TOPMEMBERS")
                connection.commit()
            with dbapi2.connect(app.config['dsn']) as connection:
                cursor = connection.cursor()
                cursor.execute(
                    "SELECT userid, COUNT(userid) FROM members "
                    "INNER JOIN experience ON (userid = memberid) "
                    "GROUP BY userid ORDER BY COUNT(userid) DESC LIMIT 5")
                cr = cursor.fetchall()
                topmembers = [(row[0], row[1]) for row in cr]
                connection.commit()
            with dbapi2.connect(app.config['dsn']) as connection:
                cursor = connection.cursor()
                for userid, count in topmembers:
                    query = ("INSERT INTO TOPMEMBERS (USERID, COUNT) "
                             "VALUES (%s, %s)")
                    cursor.execute(query, (userid, count))
                counter = 0
                for userid, count in topmembers:
                    cursor.execute(
                        "SELECT username FROM members WHERE memberid = %s",
                        (userid,))
                    user = cursor.fetchone()
                    topmembers[counter] = user[0], count * 10
                    counter = counter + 1
                connection.commit()
            return render_template('experiences.html',
                                   experiences=experiences,
                                   topmembers=topmembers,
                                   current_time=now.ctime())
            # unreachable: the handler returns above
            with dbapi2.connect(app.config['dsn']) as connection:
                cursor = connection.cursor()
                cursor.execute("""DROP TABLE TOPMEMBERS""")
                connection.commit()
        elif ('experiences_to_delete' in request.form
              or 'search' in request.form):
            if request.form['submit'] == 'Delete':
                keys = request.form.getlist('experiences_to_delete')
                for key in keys:
                    app.store.delete_experience(int(key))
                return redirect(url_for('experiences_page'))
            elif request.form['submit'] == 'Search':
                keyword = request.form['search']
                experiences = app.store.search_experience(keyword)
                now = datetime.datetime.now()
                return render_template('experiences.html',
                                       experiences=experiences,
                                       current_time=now.ctime())
        else:
            title = request.form['title']
            start = request.form['start']
            finish = request.form['finish']
            period = request.form['period']
            length = request.form['length']
            name = session['username']
            experience = Experience(title, name, start, finish, period,
                                    length)
            app.store.add_experience(experience)
            with dbapi2.connect(app.config['dsn']) as connection:
                cursor = connection.cursor()
                cursor.execute(
                    "UPDATE MEMBERS SET SCORE = SCORE + 10 "
                    "WHERE username = %s", (name,))
                connection.commit()
            with dbapi2.connect(app.config['dsn']) as connection:
                cursor = connection.cursor()
                cursor.execute(
                    "SELECT memberid FROM MEMBERS WHERE username = %s",
                    (name,))
                id = cursor.fetchone()
                connection.commit()
            with dbapi2.connect(app.config['dsn']) as connection:
                cursor = connection.cursor()
                query = "UPDATE EXPERIENCE SET userid = %s WHERE (id = %s)"
                key = app.store.exp_key
                cursor.execute(query, (id[0], key))
                connection.commit()
            now = datetime.datetime.now()
            return redirect(url_for('experience_page', key=app.store.exp_key))
    else:
        return redirect(url_for('guest_page'))
from visdom import Visdom

viz = Visdom()

# Build environment template -> lazily evaluated callable for spawning
# environments
env_template = build_env(args.env)

# Build distributed environments
envs = get_distributed_backend(env_template, args.num_processes,
                               backend=args.distributed_backend)

# Obtain environment metadata
metadata = envs.get_metadata()

# Instantiate policy
policy = get_policy(args.policy, metadata)

# Create agent with the given training algorithm
agent = get_algorithm(args.algorithm, policy, envs, args, visdom=viz)

# Create experience buffer with the environment metadata
experience = Experience(metadata['max_episode_length'], args.num_processes,
                        metadata['obs_shape'], metadata['action_type'],
                        metadata['action_shape'])

# Train agent
agent.train(num_frames=args.num_frames)

import IPython
IPython.embed()
def null_experience_list(self, count=100):
    return [Experience(None, None, None, None, None) for _ in range(count)]
def __init__(self, env, task, visualise):
    self.env = env
    self.task = task
    self.ob_shape = [HEIGHT, WIDTH, CHANNEL]
    self.action_n = Environment.get_action_size()
    # define the network stored in ps, which is used to sync
    worker_device = '/job:worker/task:{}'.format(task)
    with tf.device(
            tf.train.replica_device_setter(1, worker_device=worker_device)):
        with tf.variable_scope('global'):
            self.experience = Experience(
                EXPERIENCE_HISTORY_SIZE)  # experience replay pool
            self.network = UnrealModel(self.action_n, self.env,
                                       self.experience)
            self.global_step = tf.get_variable(
                'global_step',
                dtype=tf.int32,
                initializer=tf.constant(0, dtype=tf.int32),
                trainable=False)
    # define the local network which is used to calculate the gradient
    with tf.device(worker_device):
        with tf.variable_scope('local'):
            self.local_network = net = UnrealModel(self.action_n, self.env,
                                                   self.experience)
            net.global_step = self.global_step
        # add summaries for losses and norms
        self.batch_size = tf.to_float(tf.shape(net.base_input)[0])
        base_loss = self.local_network.base_loss
        pc_loss = self.local_network.pc_loss
        rp_loss = self.local_network.rp_loss
        vr_loss = self.local_network.vr_loss
        entropy = tf.reduce_sum(self.local_network.entropy)
        self.loss = base_loss + pc_loss + rp_loss + vr_loss
        grads = tf.gradients(self.loss, net.var_list)
        tf.summary.scalar('model/a3c_loss', base_loss / self.batch_size)
        tf.summary.scalar('model/pc_loss', pc_loss / self.batch_size)
        tf.summary.scalar('model/rp_loss', rp_loss / self.batch_size)
        tf.summary.scalar('model/vr_loss', vr_loss / self.batch_size)
        tf.summary.scalar('model/grad_global_norm', tf.global_norm(grads))
        tf.summary.scalar('model/var_global_norm',
                          tf.global_norm(net.var_list))
        tf.summary.scalar('model/entropy', entropy / self.batch_size)
        tf.summary.image('model/state', net.base_input)
        self.summary_op = tf.summary.merge_all()
        # clip the gradients to avoid gradient explosion
        grads, _ = tf.clip_by_global_norm(grads, GRAD_NORM_CLIP)
        self.sync = tf.group(*[
            v1.assign(v2)
            for v1, v2 in zip(net.var_list, self.network.var_list)
        ])
        grads_and_vars = list(zip(grads, self.network.var_list))
        inc_step = self.global_step.assign_add(tf.to_int32(self.batch_size))
        lr = log_uniform(LR_LOW, LR_HIGH)
        opt = tf.train.RMSPropOptimizer(learning_rate=lr,
                                        decay=RMSP_ALPHA,
                                        momentum=0.0,
                                        epsilon=RMSP_EPSILON)
        self.train_op = tf.group(opt.apply_gradients(grads_and_vars),
                                 inc_step)
    self.summary_writer = None
    self.local_step = 0
def train(self, name_scope, action_verbose=False, update_interval=20,
          graph_interval=50, monitor_interval=100, discount_rate=0.99,
          obs_verbose=False, reward_verbose=False):
    online_q_values, online_vars, online_input = self.q_network(
        name_scope="online")
    target_q_values, target_vars, target_input = self.q_network(
        name_scope="target")
    copy_ops = [
        target_var.assign(online_vars[var_name])
        for var_name, target_var in target_vars.items()
    ]
    copy_online_to_target = tf.group(*copy_ops)
    with self.graph.as_default() as graph:
        with tf.variable_scope('train', reuse=tf.AUTO_REUSE):
            X_action = tf.placeholder(dtype=tf.int32, shape=[None, 1],
                                      name='X_action')
            max_reward = tf.placeholder(dtype=tf.float32, shape=[None, 1],
                                        name='max_reward')
            # expected Bellman reward; q takes a one-hot encoding of the
            # output action and multiplies it by the policy net output to
            # get the q-value for the given state(s) over all n actions
            q = tf.reduce_sum(online_q_values *
                              tf.one_hot(X_action, self.n_actions),
                              axis=1, keep_dims=True)
            loss = tf.square(max_reward - q)
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.learning_rate)
            # important to specify var_list
            train_opt = optimizer.minimize(
                loss, var_list=tf.trainable_variables(scope='online'))
            tf.add_to_collection(tf.GraphKeys.TRAIN_OP, train_opt)
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
    # Some initialization steps
    train_sess = tf.InteractiveSession(graph=self.graph)
    train_sess.run(init)
    self.replay.clear()
    assert tf.get_default_graph() is self.graph
    # Monitor game-by-game progress
    agg_rewards = []
    step_agg_rewards = []
    game_steps = []
    # Outer loop over the number of games to play
    for i in range(self.num_games_train):
        print('==========================\nGame #%d begun' % (i + 1))
        game_step = 0
        init_state = self.env.reset()
        init_state = preprocess_observation(init_state)
        next_state = None
        done = False
        agg_reward = 0
        frame_count = 0  # total number of frames encountered in the game
        last_frame = None  # used for the pixelwise max operation
        maxop = True  # take the framewise max over num_frames neighbors
        self.frame_cache = FrameCache(size=self.num_frames)
        # This while loop is *like* each step in the game (not exactly,
        # because we use stacks of multiple frames). Frames go
        # x1,x2,x3,x4 then x2,x3,x4,x5 ... to the end of the game, with
        # skipping in between.
        while not done:
            game_step += 1
            if i % render_interval == 0:
                self.env.render()
            frame_skip = False  # initialize skipping to False
            # q_values = online_q_values.eval(
            #     feed_dict={online_input: [next_state], target_input: None})
            # action = self.epsilon_greedy(q_values)
            init_state = np.reshape(init_state, [
                1, init_state.shape[0], init_state.shape[1],
                init_state.shape[2]
            ])
            qs = q.eval(
                feed_dict={
                    online_input: init_state,
                    target_input: np.zeros_like(init_state),
                    X_action: [[0]],
                    max_reward: [[0]]
                })
            action = self.epsilon_greedy(qs, epsilon=0.6, iteration=i,
                                         k=1000, show_eps=True)
            next_state, reward, done, info = self.env.step(action)
            next_state = preprocess_observation(next_state)
            # metrics for monitoring improvement
            agg_reward += reward
            step_agg_reward = agg_reward / game_step
            if game_step % monitor_interval == 0:
                print('Aggregate reward at step %d: %f' %
                      (game_step, agg_reward))
                print('Step Aggregate Reward: %f' % step_agg_reward)
            # experience replay addition
            if maxop and last_frame is not None:
                # take the element-wise max of the two frames
                next_state = np.maximum(last_frame, next_state)
            if not frame_skip:
                # this becomes the last index of the frame stack
                self.frame_cache.add_base_frame(next_state)
                # maybe not the most efficient way to do this
                for k in range(0, self.frame_cache.len_stacks() - 1):
                    # add the frame to every preceding stack that is not
                    # full yet
                    self.frame_cache.add_to_stack(next_state, k)
                frame_count += 1
            # Flip frame_skip every num_frames interval (it is possible to
            # change this interval, but keep it simple)
            if frame_count % self.num_frames == 0:
                frame_skip = not frame_skip
            last_frame = next_state  # now this frame is the last state
            init_state = next_state
            if self.frame_cache.len_stacks() > self.num_frames:
                # at least num_frames actions must be taken before one
                # stack of frames is full
                first_state_frames, second_state_frames = \
                    self.frame_cache.get_last_fulls()
                if (first_state_frames is not None
                        and second_state_frames is not None):
                    exp_input = Experience(first_state_frames, action,
                                           reward, second_state_frames,
                                           done)
                    self.replay.add(exp_input)
        if i % render_interval == 0:
            self.env.close()
        agg_rewards.append(agg_reward)
        step_agg_rewards.append(step_agg_reward)
        game_steps.append(game_step)
        # try different batch sizes (like Ryan said) for parallelization;
        # if batch_size > 1, flatten this loop
        samples = self.replay.sample_batch(batch_size=1)
        for exp in samples:
            # Experience fields: first_state, action, reward,
            # second_state, terminal
            fed = {target_input: exp.first_state}
            target_qs = target_q_values.eval(feed_dict=fed)
            max_next_q_values = np.max(target_qs, axis=1, keepdims=True)
            if exp.terminal:  # done == True
                y_val = exp.reward
            else:
                y_val = exp.reward + discount_rate * max_next_q_values
            print('\n\nGame ', i)
            print('Step ', game_step)
            print('y_val')
            print(y_val)
            print('\ntarget_qs')
            print(target_qs)
            print('\nmax_next_qs')
            print(max_next_q_values)
            print('\n\n')
            outs, out_loss, _ = train_sess.run(
                [q, loss, train_opt],
                feed_dict={
                    max_reward: max_next_q_values,
                    X_action: [[exp.action]],
                    online_input: exp.first_state,
                    target_input: exp.first_state
                })
            if action_verbose:
                print('Mean output loss at terminal step %d: %f' %
                      (game_step, np.mean(out_loss)))
        if i % update_interval == 0:
            train_sess.run(copy_online_to_target)
            print('Online to target copy complete after game %d' % i)
        if i % graph_interval == 0 and i > 1:
            f, (ax1, ax2, ax3) = plt.subplots(3)
            xs = np.arange(0, len(agg_rewards))
            ax1.set_title('Aggregate Reward Update at Game %d' % i)
            ax2.set_title('Step Aggregate Reward Update at Game %d' % i)
            ax3.set_title('Game Steps Used Update at Game %d' % i)
            ax1.plot(xs, agg_rewards, 'r', label='Agg Rewards')
            ax2.plot(xs, step_agg_rewards, 'b', label='Step Agg Rewards')
            ax3.plot(xs, game_steps, 'y', label='Game Steps')
            ax1.set_xlabel('Game Played')
            ax1.set_ylabel('Agg Reward')
            ax2.set_xlabel('Game Played')
            ax2.set_ylabel('Step Agg Reward')
            ax3.set_xlabel('Game Played')
            ax3.set_ylabel('Game Steps Used')
            f.tight_layout()
            plt.savefig('Metrics Measurement.png')
            plt.show(block=False)
            time.sleep(15)
            plt.close()
    self.saver.save(train_sess, self.ckptdir)
    print('Model saved at %s' % self.ckptdir)