def test(opts):
    env = create_env(opts.env_name, 712)
    action_space = env.action_space
    state_space = env.state_space
    tp_matrix = env.tp_matrix
    agent = QLearningAgent(opts.alpha, opts.epsilon, opts.discount, action_space,
                           state_space, tp_matrix, env.blocked_positions, 712)
    qvalues = np.load(os.path.join(opts.policy_dir, opts.env_name + '.npy'))
    agent.qvalues = qvalues
    env.render(agent.qvalues)

    state = env.get_state()
    for i in range(200):
        possible_actions = env.get_possible_actions()
        action = agent.get_best_action(state, possible_actions)
        time.sleep(0.1)
        next_state, reward, done, next_possible_states = env.step(action)
        env.render(agent.qvalues)
        next_state_possible_actions = env.get_possible_actions()
        state = next_state
        print(reward)
        if done:
            env.reset_state()
            env.render(agent.qvalues)
            time.sleep(0.5)
            state = env.get_state()
def __init__(self, render=False, eps_start=EPS_START, eps_end=EPS_STOP,
             eps_steps=EPS_STEPS, id_num=0):
    threading.Thread.__init__(self)
    self.render = render
    # self.env = gym.make(ENV)
    self.env = create_env('mario', id_num, None)
    self.agent = Agent(eps_start, eps_end, eps_steps)
def main():
    args = parser.parse_args()

    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        # Disable nondeterministic ops (not sure if critical, but better safe than sorry)
        # torch.backends.cudnn.enabled = False
    else:
        args.device = torch.device('cpu')

    # Environment
    env = create_env(args.environment_filename, custom=False, skip_frames=1,
                     realtime=args.render, worker_id=args.worker, device=args.device)
    agent = AgentEval(args, env)

    if env.unwrapped.is_grading():
        print('grading...')
        run_evaluation(env, agent)
    else:
        print('testing...')
        rewards = []
        for _ in range(10):
            episode_rewards = run_episode(env, agent)
            rewards.append(episode_rewards)
        env.close()
        print(sum(rewards) / len(rewards))
def train(opts):
    env = create_env(opts.env_name)
    action_space = env.action_space
    state_space = env.state_space
    tp_matrix = env.tp_matrix
    agent = QLearningAgent(opts.alpha, opts.epsilon, opts.discount, action_space,
                           state_space, tp_matrix, env.blocked_positions)
    env.render(agent.qvalues)

    state = env.get_state()
    for i in range(opts.num_iters):
        possible_actions = env.get_possible_actions()
        action = agent.get_action(state, possible_actions)
        next_state, reward, done, next_possible_states = env.step(action)
        env.render(agent.qvalues)
        next_state_possible_actions = env.get_possible_actions()
        agent.update(state, action, reward, next_state, next_state_possible_actions,
                     next_possible_states, done)
        state = next_state
        if done:
            env.reset_state()
            env.render(agent.qvalues)
            state = env.get_state()

    if not os.path.exists(opts.policy_dir):
        os.makedirs(opts.policy_dir)
    np.save(os.path.join(opts.policy_dir, opts.env_name + '.npy'),
            np.asarray(agent.qvalues))
def global_test(global_model, device, args, model_type, delay=0.03):
    world = args.world
    stage = args.stage
    env = create_env(world, stage)

    state = env.reset().to(device, dtype=torch.float)
    state = state.view(1, 1, 80, 80)
    done = True

    if model_type == "LSTM":
        model = ActorCritic_LSTM().to(device)
    else:
        model = ActorCritic().to(device)
    model.eval()
    model.load_state_dict(global_model.state_dict())

    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        h_0 = h_0.to(device)
        c_0 = c_0.to(device)

        env.render()
        p, _, h_0, c_0 = model(state, h_0, c_0)
        policy = F.softmax(p, dim=1)
        action = torch.argmax(policy)
        next_state, _, done, info = env.step(action.item())
        next_state = next_state.to(device, dtype=torch.float)
        next_state = next_state.view(1, 1, 80, 80)
        state = next_state

        if done:
            if info['flag_get']:
                break
            state = env.reset().to(device)
            state = state.view(1, 1, 80, 80)
            model.load_state_dict(global_model.state_dict())
        time.sleep(delay)

    print('Successfully cleared {}-{}'.format(world, stage))
def main():
    args = parser.parse_args()
    print(' ' * 26 + 'Options')
    for k, v in vars(args).items():
        print(' ' * 26 + k + ': ' + str(v))

    np.random.seed(args.seed)
    torch.manual_seed(np.random.randint(1, 10000))
    if torch.cuda.is_available() and not args.disable_cuda:
        args.device = torch.device('cuda')
        torch.cuda.manual_seed(np.random.randint(1, 10000))
        # Disable nondeterministic ops (not sure if critical, but better safe than sorry)
        # torch.backends.cudnn.enabled = False
    else:
        args.device = torch.device('cpu')

    args.large = False
    args.skip_frames = 0
    args.random_aug = 0.

    # Environment
    train_env = create_env(args.environment_filename, custom=True, large=args.large,
                           skip_frames=args.skip_frames, random_aug=args.random_aug,
                           docker=args.docker_training, device=args.device)
    action_space = train_env.action_space
    test_env = create_env(
        args.environment_filename,
        custom=True,
        large=args.large,
        custom_reward=False,
        skip_frames=args.skip_frames,
        docker=args.docker_training,
        device=args.device,
        worker_id=1,
    )

    mem = ReplayMemory(args, args.memory_capacity, obs_space=train_env.observation_space)
    val_mem = ReplayMemory(args, args.evaluation_size, obs_space=test_env.observation_space)

    # For debugging environment issues
    if args.timeout_monitor:
        train_env = TimeoutMonitor(train_env, mem)
        test_env = TimeoutMonitor(test_env, val_mem)

    # Agent
    dqn = Agent(args, train_env)
    priority_weight_increase = (1 - args.priority_weight) / (args.T_max - args.learn_start)

    time_step = 0
    done = True
    state = None
    while time_step < args.evaluation_size:
        if done:
            state = train_env.reset()
            done = False
        next_state, _, done, _ = train_env.step(action_space.sample())
        val_mem.append(state, None, None, done)
        state = next_state
        time_step += 1

    if args.evaluate:
        dqn.eval()  # Set DQN (online network) to evaluation mode
        avg_reward, avg_Q = test(args, 0, dqn, val_mem, evaluate=True)  # Test
        print('Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
    else:
        # Training loop
        dqn.train()
        done = True
        for time_step in tqdm(range(args.T_max)):
            if done:
                state = train_env.reset()
                done = False

            if time_step % args.replay_frequency == 0:
                dqn.reset_noise()  # Draw a new set of noisy weights

            action = dqn.act(state)  # Choose an action greedily (with noisy weights)
            next_state, reward, done, info = train_env.step(action)  # Step
            if args.reward_clip > 0:
                reward = max(min(reward, args.reward_clip), -args.reward_clip)  # Clip rewards
            mem.append(state, action, reward, done)  # Append transition to memory

            # Train and test
            if time_step >= args.learn_start:
                # Anneal importance sampling weight β to 1
                mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)

                if time_step % args.replay_frequency == 0:
                    dqn.learn(mem)  # Train with n-step distributional double-Q learning

                if time_step % args.evaluation_interval == 0:
                    dqn.eval()  # Set DQN (online network) to evaluation mode
                    avg_reward, avg_Q = test(args, time_step, dqn, val_mem, env=test_env)  # Test
                    log('T = ' + str(time_step) + ' / ' + str(args.T_max) +
                        ' | Avg. reward: ' + str(avg_reward) + ' | Avg. Q: ' + str(avg_Q))
                    dqn.train()  # Set DQN (online network) back to training mode

                # Update target network
                if time_step % args.target_update == 0:
                    dqn.update_target_net()

            state = next_state

    train_env.close()
# * compute the minimal initial load needed in each state.
#
# In other words, all the solvers provide the same guarantees. Where they differ is in the strategies they produce. Our overall goal is to provide strategies that not only deliver these guarantees but are also **usable in practical control case studies**.
#
# We use a simple gridworld underwater environment generated by [FiMDPEnv] to demonstrate the behavior we can obtain using different solvers of the [FiMDP] package. Some pre-defined environments are available in the file [env.py](env.py). The goal of the agent is to reach the green target with sufficient energy so that it can reach it again and again.
#
# In each cell of the gridworld, the agent can choose one of eight possible actions. For each of the 4 directions (`NORTH`, `SOUTH`, `WEST`, `EAST`) the agent chooses whether to play a *weak* or a *strong* action. A strong action costs more energy, while a weak action has an uncertain outcome: the resulting direction of movement can be affected by pre-defined currents. For example, in most cases, picking `EAST` can, with a small probability, end up with the agent going `SOUTH` or `NORTH` instead.
#
# [FiMDP]: https://github.com/xblahoud/FiMDP
# [FiMDPEnv]: https://github.com/pthangeda/FiMDPEnv

import fimdpenv
fimdpenv.setup()
from env import create_env

e = create_env('2R-1T-simple', heading_sd=0.32, agent_capacity=40)
e

# The colors of the gridworld cells have the following semantics:
# * <font color='blue'>Blue Cell</font>: Current location of the agent
# * <font color='gray'>Gray Cells</font>: Trajectory of the agent
# * <font color='green'>Green Cells</font>: Target states
# * <font color='orange'>Orange Cells</font>: Reload states

# This package offers 2 solvers that generate strategies (a minimal usage sketch follows below):
# * Basic solver (class `BasicES`)
# * Goal-leaning solver (class `GoalLeaningES`)

# +
import fimdp
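# A minimal usage sketch of the two solvers named above. This is an illustration only:
# the import path `fimdp.energy_solvers`, the objective constant `BUCHI`, the environment
# method `get_consmdp()`, and the constructor/`get_strategy` signatures below are assumptions,
# not verified against the installed FiMDP/FiMDPEnv API.
from fimdp.energy_solvers import BasicES, GoalLeaningES  # assumed import path
from fimdp.objectives import BUCHI                       # assumed objective constant

cap = 40                                            # same capacity as passed to create_env above
mdp, targets = e.get_consmdp()                      # assumed: export the gridworld as a ConsMDP + target set
basic_solver = BasicES(mdp, cap, targets)           # assumed constructor signature
leaning_solver = GoalLeaningES(mdp, cap, targets)   # assumed constructor signature
strategy = leaning_solver.get_strategy(BUCHI)       # assumed: strategy for repeatedly reaching the targets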
def train(bisimulation, opts):
    log_path = 'output_logs/Softmax-Logs/{}'.format(opts.env_name)
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    curr_log_dir = os.path.join(log_path, str(datetime.datetime.now())[:-7])
    os.makedirs(curr_log_dir)
    with open(os.path.join(curr_log_dir, 'command.txt'), 'w') as f:
        json.dump(opts.__dict__, f, indent=2)

    # Lower bound on transferred Q-values derived from the bisimulation distance
    lower_bound = np.zeros(
        (bisimulation.src_env.state_space, bisimulation.tgt_env.state_space))
    for t in range(bisimulation.tgt_env.state_space):
        for s in range(bisimulation.src_env.state_space):
            lower_bound[s, t] = np.max(
                bisimulation.src_agent.qvalues[s]) - bisimulation.dist_matrix_final[s, t]

    plt_path = os.path.join(curr_log_dir, opts.env_name + '_softmax_qlearn.png')
    policy_path = os.path.join(curr_log_dir, opts.env_name + '_softmax_qlearn.npy')
    df_path = os.path.join(curr_log_dir, opts.env_name + '_softmax_qlearn.csv')
    heatmap_path = os.path.join(curr_log_dir, opts.env_name + '_explore_softmax.png')

    seeds = random.sample(range(1, 10000), opts.num_seeds)
    print(seeds)
    # seeds = [1981, 8702, 3497, 8058, 4931, 1968, 6555, 8390, 8711, 7212]  # np.arange(opts.num_seeds)

    cr_allvec = []
    tp_all = []
    pi_val_all = []
    avg_reward_all = []
    exp_q_all = []
    pbar = tqdm(total=opts.num_seeds)
    dummy_env = create_env(opts.env_name, 712)
    count_matrix_avg = np.zeros((dummy_env.state_space, dummy_env.action_space))

    for seed in seeds:
        np.random.seed(seed)
        random.seed(seed)
        env = create_env(opts.env_name, seed)
        action_space = env.action_space
        state_space = env.state_space
        tp_matrix = env.tp_matrix
        count_matrix = np.zeros((env.state_space, env.action_space))

        gt_agent = QLearningAgent(opts.alpha, opts.epsilon, opts.discount, action_space,
                                  state_space, tp_matrix, env.blocked_positions, seed)
        gt_agent.qvalues = np.load(
            os.path.join(opts.policy_dir, opts.env_name + '.npy'))
        gt_policy_val = np.mean(gt_agent.qvalues)

        agent = QLearningAgent(opts.alpha, opts.epsilon, opts.discount, action_space,
                               state_space, tp_matrix, env.blocked_positions, seed)
        agent.qvalues = np.random.rand(state_space, action_space)
        # env.render(agent.qvalues)

        cr_vec = []
        pi_val_vec = []
        exp_q_vec = []
        c_reward = 0
        avg_reward_vec = []
        state = env.get_state()
        for i in range(opts.num_iters):
            possible_actions = env.get_possible_actions()
            action = agent.get_action_softmax(state, possible_actions, opts.temp)
            # action = agent.get_action_trexsoftmax(state, possible_actions, lower_bound,
            #                                       bisimulation.d_sa_final, i, opts.temp)
            next_state, reward, done, next_possible_states = env.step(action)
            if i < exp_check_thresh:
                count_matrix[state][action] = 1
            # env.render(agent.qvalues)
            next_state_possible_actions = env.get_possible_actions()
            agent.update(state, action, reward, next_state,
                         next_state_possible_actions, done)
            state = next_state
            c_reward += reward
            # pival_diff = match_actions(gt_agent, agent, env)
            if i % plot_freq == 0:
                avg_r = evaluate_mean_avg_reward(dummy_env, agent)
                exp_q = exploration_quality(gt_agent.qvalues, count_matrix)
                avg_reward_vec.append(avg_r)
                exp_q_vec.append(exp_q)
            if done:
                env.reset_state()
                # env.render(agent.qvalues)
                state = env.get_state()

        # for _ in range(100):
        #     env.render(agent.qvalues)
        timesteps = np.arange(start=0, stop=opts.num_iters, step=plot_freq)
        tp_all.append(timesteps)
        avg_reward_all.append(avg_reward_vec)
        exp_q_all.append(exp_q_vec)
        pbar.update(1)
        count_matrix_avg += count_matrix

    temp_tp = np.arange(start=0, stop=opts.num_iters, step=plot_freq)
    count_matrix_avg /= opts.num_seeds
    timesteps = np.asarray(tp_all)
    avg_reward_array = np.asarray(avg_reward_all)
    exp_q_array = np.asarray(exp_q_all)
    timesteps = timesteps.reshape((opts.num_seeds * temp_tp.shape[0]))
    avg_reward_all = avg_reward_array.reshape((opts.num_seeds * temp_tp.shape[0]))
    exp_q_all = exp_q_array.reshape((opts.num_seeds * temp_tp.shape[0]))
    df = pd.DataFrame({
        'Timesteps': timesteps,
        'Mean Average Reward': avg_reward_all,
        'Exp Quality': exp_q_all
    })

    mean_avg_reward = np.mean(avg_reward_array, axis=0)
    exp_q = np.mean(exp_q_array, axis=0)
    tp = timesteps[:temp_tp.shape[0]]
    poly = np.polyfit(tp, mean_avg_reward, 5)
    poly_y = np.poly1d(poly)(tp)

    plt.subplot(2, 1, 1)
    plt.title('Softmax')
    plt.grid()
    plt.xlabel('Timesteps')
    plt.ylabel('Mean Average Reward')
    plt.plot(tp, poly_y)
    plt.subplot(2, 1, 2)
    plt.grid()
    plt.xlabel('Timesteps')
    plt.ylabel('Exploration Quality')
    plt.plot(tp[:int(exp_check_thresh / plot_freq)],
             exp_q[:int(exp_check_thresh / plot_freq)])
    plt.savefig(plt_path)

    df.to_csv(df_path)
    np.save(policy_path, agent.qvalues)
    env.generate_heatmap(count_matrix, heatmap_path)
    type=str)
argparser.add_argument(
    '-ma',
    '--match-action',
    action='store_true',
    dest='debug',
    help='Match actions with ground truths and generate plots')
argparser.add_argument('-v',
                       '--verbose',
                       action='store_true',
                       dest='debug',
                       help='print debug information')
args = argparser.parse_args()

env = create_env(args.env_name, 712)
agent = QLearningAgent(args.alpha, args.epsilon, args.discount, env.action_space,
                       env.state_space, env.tp_matrix, env.blocked_positions, 712)
agent.qvalues = np.load(
    os.path.join(args.policy_dir, args.env_name + '.npy'))

avg_reward_vec = []
# for _ in range(args.num_iters):
avg_r = evaluate_mean_avg_reward(env, agent)
print(avg_r)
# avg_reward_vec.append(avg_r)
avg_reward_vec = np.asarray((avg_r)).repeat(int(args.num_iters / plot_freq))
timesteps = np.arange(start=0, stop=args.num_iters, step=plot_freq)
avg_reward_array = np.asarray(avg_reward_vec)
        return history.history["loss"][0]

    def eval2target(self):
        self.model_target.set_weights(self.model_eval.get_weights())

    def save(self, filepath):
        self.model_eval.save(filepath)

    def load(self, filepath):
        self.model_eval = load_model(filepath)
        self.eval2target()


if __name__ == "__main__":
    # load the gym env
    env = create_env('MsPacman-ram-v0')
    # set random seeds to get a reproducible result (recommended)
    set_random_seed(0)
    # get size of state and action from environment
    state_size = env.observation_space.shape[0] * env.observation_space.shape[1]
    action_size = env.action_space.n
    # create the agent
    agent = DQNAgent(state_size, action_size)
    if os.path.exists("dqn.h5"):
        agent.load("dqn.h5")
    # log the training result
    scores, episodes = [], []
    graph_episodes = []
    graph_score = []
    avg_length = 10
    sum_score = 0
def train(bisimulation, opts):
    log_path = 'output_logs/Count-Based-Q-Logs/{}'.format(opts.env_name)
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    curr_log_dir = os.path.join(log_path, str(datetime.datetime.now())[:-7])
    os.makedirs(curr_log_dir)
    with open(os.path.join(curr_log_dir, 'command.txt'), 'w') as f:
        json.dump(opts.__dict__, f, indent=2)

    lower_bound = np.zeros(
        (bisimulation.src_env.state_space, bisimulation.tgt_env.state_space))
    for t in range(bisimulation.tgt_env.state_space):
        for s in range(bisimulation.src_env.state_space):
            lower_bound[s, t] = np.max(
                bisimulation.src_agent.qvalues[s]) - bisimulation.dist_matrix_final[s, t]

    plt_path = os.path.join(curr_log_dir, opts.env_name + '_countbased_qlearn.png')
    policy_path = os.path.join(curr_log_dir, opts.env_name + '_countbased_qlearn.npy')
    df_path = os.path.join(curr_log_dir, opts.env_name + '_countbased_qlearn.csv')
    heatmap_path = os.path.join(curr_log_dir, opts.env_name + '_explore_countbased.png')

    epsilon_bisim = 0.5
    seeds = np.arange(opts.num_seeds)
    cr_allvec = []
    tp_all = []
    pi_val_all = []
    avg_reward_all = []
    exp_q_all = []
    pbar = tqdm(total=opts.num_seeds)
    dummy_env = create_env(opts.env_name, 712)
    temp = 1
    count_matrix_avg = np.zeros((dummy_env.state_space, dummy_env.action_space))

    for s in seeds:
        np.random.seed(s)
        random.seed(s)
        env = create_env(opts.env_name, s)
        action_space = env.action_space
        state_space = env.state_space
        tp_matrix = env.tp_matrix

        gt_agent = QLearningAgent(opts.alpha, opts.epsilon, opts.discount, action_space,
                                  state_space, tp_matrix, env.blocked_positions, s)
        gt_agent.qvalues = np.load(
            os.path.join(opts.policy_dir, opts.env_name + '.npy'))
        gt_policy_val = np.mean(gt_agent.qvalues)

        agent = QLearningAgent(opts.alpha, opts.epsilon, opts.discount, action_space,
                               state_space, tp_matrix, env.blocked_positions, s)
        agent.qvalues = np.random.rand(state_space, action_space)

        cr_vec = []
        pi_val_vec = []
        exp_q_vec = []
        c_reward = 0
        avg_reward_vec = []
        count_matrix = np.zeros((state_space, action_space))
        count_matrix_t = np.zeros((state_space, action_space))
        state = env.get_state()
        for i in range(opts.num_iters):
            possible_actions = env.get_possible_actions()
            action = agent.get_action(state, possible_actions)
            # action = agent.get_action_trail(state, possible_actions, lower_bound,
            #                                 bisimulation.d_sa_final, i, epsilon_bisim, temp)
            count_matrix[state][action] += 1
            if i < exp_check_thresh:
                count_matrix_t[state][action] = 1
            next_state, reward, done, next_possible_states = env.step(action)
            c_reward += copy.deepcopy(reward)
            # MBIE-EB style count-based exploration bonus
            reward += opts.cb_beta / np.sqrt(count_matrix[state][action])
            # if opts.render:
            #     env.render(agent.qvalues)
            next_state_possible_actions = env.get_possible_actions()
            agent.update(state, action, reward, next_state,
                         next_state_possible_actions, done)
            state = next_state
            if i % plot_freq == 0:
                avg_r = evaluate_mean_avg_reward(dummy_env, agent)
                exp_q = exploration_quality(gt_agent.qvalues, count_matrix_t)
                avg_reward_vec.append(avg_r)
                exp_q_vec.append(exp_q)
            if done:
                env.reset_state()
                # if opts.render:
                #     env.render(agent.qvalues)
                state = env.get_state()

        timesteps = np.arange(start=0, stop=opts.num_iters, step=plot_freq)
        tp_all.append(timesteps)
        avg_reward_all.append(avg_reward_vec)
        exp_q_all.append(exp_q_vec)
        pbar.update(1)
        # count_matrix_avg += count_matrix

    temp_tp = np.arange(start=0, stop=opts.num_iters, step=plot_freq)
    # count_matrix_avg /= opts.num_seeds
    timesteps = np.asarray(tp_all)
    avg_reward_array = np.asarray(avg_reward_all)
    exp_q_array = np.asarray(exp_q_all)
    timesteps = timesteps.reshape((opts.num_seeds * temp_tp.shape[0]))
    avg_reward_all = avg_reward_array.reshape((opts.num_seeds * temp_tp.shape[0]))
    exp_q_all = exp_q_array.reshape((opts.num_seeds * temp_tp.shape[0]))
    df = pd.DataFrame({
        'Timesteps': timesteps,
        'Mean Average Reward': avg_reward_all,
        'Exp Quality': exp_q_all
    })

    mean_avg_reward = np.mean(avg_reward_array, axis=0)
    exp_q = np.mean(exp_q_array, axis=0)
    tp = timesteps[:temp_tp.shape[0]]
    poly = np.polyfit(tp, mean_avg_reward, 5)
    poly_y = np.poly1d(poly)(tp)

    plt.subplot(2, 1, 1)
    plt.title('MBIE-EB')
    plt.grid()
    plt.xlabel('Timesteps')
    plt.ylabel('Mean Average Reward')
    plt.plot(tp, poly_y)
    plt.subplot(2, 1, 2)
    plt.grid()
    plt.xlabel('Timesteps')
    plt.ylabel('Exploration Quality')
    plt.plot(tp[:int(exp_check_thresh / plot_freq)],
             exp_q[:int(exp_check_thresh / plot_freq)])
    plt.savefig(plt_path)

    df.to_csv(df_path)
    np.save(policy_path, agent.qvalues)
    env.generate_heatmap(count_matrix, heatmap_path)
def run(self):
    # self.global_model = self.global_model.to(self.device)
    if self.args.model_type == "LSTM":
        self.AC = ActorCritic_LSTM()
    else:
        self.AC = ActorCritic()
    # optimizer_to(self.optimizer, self.device)
    env = create_env(self.world, self.stage)
    state = env.reset()
    # state = state.reshape(1, 1, 80, 80)
    state = state.to(self.device, dtype=torch.float)
    # state = self.imageProcess(state)
    i_epoch = self.epoch
    done = True

    while True:
        if done:
            h_0 = torch.zeros((1, 512), dtype=torch.float)
            c_0 = torch.zeros((1, 512), dtype=torch.float)
        else:
            h_0 = h_0.detach()
            c_0 = c_0.detach()
        h_0 = h_0.to(self.device)
        c_0 = c_0.to(self.device)

        Timestamp = 50
        for i in range(Timestamp):
            env.render()
            p, value, h_0, c_0 = self.AC(state, h_0, c_0)
            policy = F.softmax(p, dim=1)
            log_prob = F.log_softmax(p, dim=1)
            entropy = -(policy * log_prob).sum(1, keepdim=True)
            m = Categorical(policy)
            action = m.sample()
            next_state, reward, done, info = env.step(action.item())
            # reward = reward / 15
            # next_state = next_state.view(1, 1, 80, 80)
            next_state = next_state.to(self.device, dtype=torch.float)
            # self.states.append(state)
            self.log_probs.append(log_prob[0, action])
            self.rewards.append(reward)
            self.values.append(value)
            self.entropies.append(entropy)
            state = next_state
            if done:
                state = env.reset()
                # state = state.reshape(1, 1, 80, 80)
                state = state.to(self.device)
                # state = self.imageProcess(state)
                break

        """
        actor_loss = 0
        critic_loss = 0
        returns = []
        R = 0
        for reward in self.rewards[::-1]:
            R = reward + self.GAMMA * R
            returns.insert(0, R)
        """
        # td = torch.tensor([1], dtype=torch.float).to(device)
        R = torch.zeros((1, 1), dtype=torch.float)
        if not done:
            _, R, _, _ = self.AC(state, h_0, c_0)
        R = R.to(self.device)

        actor_loss = 0
        critic_loss = 0
        entropy_loss = 0
        advantage = torch.zeros((1, 1), dtype=torch.float)
        advantage = advantage.to(self.device)
        next_value = R
        for log_prob, reward, value, entropy in list(
                zip(self.log_probs, self.rewards, self.values, self.entropies))[::-1]:
            advantage = advantage * self.GAMMA
            advantage = advantage + reward + self.GAMMA * next_value.detach() - value.detach()
            next_value = value
            actor_loss = actor_loss + (-log_prob * advantage)
            R = R * self.GAMMA + reward
            critic_loss = critic_loss + (R - value) ** 2 / 2
            entropy_loss = entropy_loss + entropy
        total_loss = actor_loss + critic_loss - 0.01 * entropy_loss

        push_and_pull(self.optimizer, self.AC, self.global_model, total_loss)
        # for name, parms in self.C.named_parameters():
        #     print('-->name:', name, '-->grad_requirs:', parms.requires_grad,
        #           ' -->grad_value:', parms.grad)

        if i_epoch % 10 == 0:
            print(self.name + " Episode %d  Actor loss: %f  Critic loss: %f  Total loss: %f"
                  % (i_epoch, actor_loss.item(), critic_loss.item(), total_loss.item()))
        """
        y.append(critic_loss.item())
        x.append(i_epoch)
        plt.plot(x, y)   # draw the curve
        plt.show()       # display the plotted figure
        """
        i_epoch += 1

        del self.log_probs[:]
        del self.rewards[:]
        del self.values[:]
        del self.entropies[:]

        if self.save:
            if i_epoch % 100 == 0:
                PATH = './model/{}/A3C_{}_{}.pkl'.format(self.level, self.level,
                                                         self.args.model_type)
                torch.save({
                    'epoch': i_epoch,
                    'model_state_dict': self.global_model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'loss': total_loss,
                    'type': self.args.model_type,
                }, PATH)
        if i_epoch == Max_epoch:
            return
def run(args, server):
    env = create_env(client_id=str(args.task), remotes=args.remotes)
    trainer = A3C(env, args.task, args.visualise)

    # Variables whose names start with 'local' (worker-local variables) are not saved in the checkpoint
    variables_to_save = [
        v for v in tf.global_variables() if not v.name.startswith("local")
    ]
    init_op = tf.variables_initializer(variables_to_save)
    init_all_op = tf.global_variables_initializer()
    # Saver that writes the variables to the checkpoint file
    saver = FastSaver(variables_to_save)

    # Collect the trainable variables
    var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                 tf.get_variable_scope().name)
    logger.info('Trainable variables:')
    for v in var_list:
        logger.info('  %s %s', v.name, v.get_shape())

    def init_fn(ses):
        logger.info("Initializing all parameters.")
        ses.run(init_all_op)

    config = tf.ConfigProto(device_filters=[
        "/job:ps", "/job:worker/task:{}/cpu:0".format(args.task)
    ])
    logdir = os.path.join(args.log_dir, 'train')

    # FileWriter for the TensorBoard logs
    summary_writer = tf.summary.FileWriter(logdir + "_%d" % args.task)
    logger.info("Directory for TensorBoard event files: %s_%s", logdir, args.task)

    # A high-level wrapper that handles saving TensorBoard logs, checkpoint files, and so on
    sv = tf.train.Supervisor(
        is_chief=(args.task == 0),
        logdir=logdir,  # directory for the checkpoint files
        saver=saver,  # Saver used to write the checkpoint files
        summary_op=None,
        init_op=init_op,
        init_fn=init_fn,
        summary_writer=summary_writer,  # FileWriter for the TensorBoard logs
        ready_op=tf.report_uninitialized_variables(variables_to_save),
        global_step=trainer.global_step,
        save_model_secs=30,
        save_summaries_secs=30)

    # Total number of steps to run; adjustable
    num_global_steps = 100000000

    logger.info("Starting session...")
    with sv.managed_session(server.target, config=config) as sess, sess.as_default():
        sess.run(trainer.sync)
        trainer.start(sess, summary_writer)
        global_step = sess.run(trainer.global_step)
        logger.info("Starting training at step %d", global_step)
        while not sv.should_stop() and (not num_global_steps
                                        or global_step < num_global_steps):
            trainer.process(sess)
            global_step = sess.run(trainer.global_step)

    # Stop all services
    sv.stop()
    logger.info('Reached %s steps; worker stopped.', global_step)
# Memory params
pretrain_length = batch_size  # length for pretraining filling of memory
memory_size = 50000           # memory size

### Preprocessing params
stack_size = 4
# stacked_frames = deque([np.zeros((84, 84), dtype=np.int) for i in range(stack_size)], maxlen=4)

##################################################
##################################################
##################################################

net = CNN()
mem = Memory(memory_size)
game, possible_actions = create_env()

# We need to pre-populate memory by taking some random actions
game.new_episode()
for i in range(pretrain_length):
    if i == 0:
        state = game.get_state().screen_buffer
        state = stacked_frames(stacked_frames, state)
    # take a random action
    action = random.choice(possible_actions)
    # observe the reward from that action
    reward = game.make_action(action)
def __init__(self, opts):
    self.opts = opts
    self.src_env = create_env(opts.src_env, 712)
    self.tgt_env = create_env(opts.tgt_env, 712)
    self.src_agent = QLearningAgent(alpha, epsilon, discount, 4,
                                    self.src_env.state_space, self.src_env.tp_matrix,
                                    self.src_env.blocked_positions, 712)
    self.tgt_agent = QLearningAgent(alpha, epsilon, discount, 4,
                                    self.tgt_env.state_space, self.tgt_env.tp_matrix,
                                    self.tgt_env.blocked_positions, 712)
    self.transferred_agent = QLearningAgent(alpha, epsilon, discount, 4,
                                            self.tgt_env.state_space,
                                            self.tgt_env.tp_matrix,
                                            self.tgt_env.blocked_positions, 712)
    self.action_space = self.src_env.action_space
    self.solver = opts.solver
    self.lfp_iters = opts.lfp_iters
    self.threshold = opts.threshold
    self.discount_kd = opts.discount_kd
    self.discount_r = opts.discount_r

    if self.solver == 'lp':
        # Constraint matrix for the LP formulation
        m = self.src_env.state_space
        n = self.tgt_env.state_space
        A_r = np.zeros((m, m, n))
        A_t = np.zeros((n, m, n))
        for i in range(m):
            for j in range(n):
                A_r[i, i, j] = 1
        for i in range(n):
            for j in range(m):
                A_t[i, j, i] = 1
        self.A = np.concatenate((A_r.reshape((m, m * n)), A_t.reshape((n, m * n))),
                                axis=0)

    self.src_possible_actions = self.src_env.get_possible_actions()
    self.tgt_possible_actions = self.tgt_env.get_possible_actions()

    # Initialize Q-values
    self.src_agent.qvalues = np.load(
        os.path.join(opts.policy_dir, opts.src_env + '.npy'))
    self.tgt_agent.qvalues = np.load(
        os.path.join(opts.policy_dir, opts.tgt_env + '.npy'))

    # Distance and reward matrices
    self.d_sa_final = np.zeros((self.src_env.state_space, self.action_space,
                                self.tgt_env.state_space, self.action_space))
    self.dist_matrix_final = np.zeros(
        (self.src_env.state_space, self.tgt_env.state_space))
    self.reward_matrix_tmp = np.zeros((self.src_env.state_space, self.action_space,
                                       self.tgt_env.state_space, self.action_space))
    self.reward_matrix = np.zeros(
        (self.src_env.state_space, self.tgt_env.state_space))
    self.init_reward_matix()
    self.accuracy = None
def test(args, T, dqn, val_mem, skip_frames=1, evaluate=False, realtime=False, env=None):
    global Ts, rewards, Qs, best_avg_reward
    if env is None:
        env = create_env(args.environment_filename, custom=False, skip_frames=skip_frames,
                         realtime=realtime, docker=args.docker_training, worker_id=1,
                         device=args.device)
        own_env = True
    else:
        own_env = False

    Ts.append(T)
    T_rewards = []
    T_Qs = []

    # Test performance over several episodes
    done = True
    for _ in range(args.evaluation_episodes):
        while True:
            if done:
                state = env.reset()
                reward_sum = 0
                done = False

            action = dqn.act_e_greedy(state)  # Choose an action ε-greedily
            state, reward, done, _ = env.step(action)  # Step
            reward_sum += reward
            if args.render:
                env.render()

            if done:
                T_rewards.append(reward_sum)
                break
    if own_env:
        env.close()

    # Test Q-values over validation memory
    for state in val_mem:  # Iterate over valid states
        T_Qs.append(dqn.evaluate_q(state))

    avg_reward = sum(T_rewards) / len(T_rewards)
    avg_Q = sum(T_Qs) / len(T_Qs)
    if not evaluate:
        # Append to results
        rewards.append(T_rewards)
        Qs.append(T_Qs)

        # Plot
        _plot_line(Ts, rewards, 'Reward', path='results')
        _plot_line(Ts, Qs, 'Q', path='results')

        # Save model parameters if improved
        if avg_reward > best_avg_reward:
            best_avg_reward = avg_reward
            dqn.save('results')

    # Return average reward and Q-value
    return avg_reward, avg_Q