def start_training_dqn(is_prioritized):
    if is_prioritized:
        prio = "with_priority"
    else:
        prio = "no_priority"

    env = gym.make(hyperparams['environment'])
    state_spec = len(env.observation_space.sample())
    action_spec = env.action_space.n

    # TensorBoard logging
    log_name = 'final_build' + prio
    log_dir = 'logs/acrobot/' + log_name
    log_writer = tf.summary.create_file_writer(log_dir)

    epsilon = hyperparams['epsilon']
    buffer = PrioritizedReplay(hyperparams['max_experiences']) if is_prioritized \
        else UniformReplay(hyperparams['max_experiences'])
    agent = DQNAgent(hyperparams['hidden_layer_dqn'], state_spec, action_spec,
                     buffer, hyperparams['learning_rate_dqn'], is_prioritized)
    total_rewards = np.empty(hyperparams['episodes'])

    for episode in range(hyperparams['episodes']):
        episode_reward = 0
        epsilon = max(hyperparams['min_epsilon'], epsilon * hyperparams['decay'])
        done = False
        state = env.reset()

        while not done:
            # epsilon-greedy action selection and environment step
            action = agent.play_action(state, epsilon)
            next_state, reward, done, _ = env.step(action)
            episode_reward += reward
            buffer.add((state, action, reward, next_state, done))
            state = next_state
            # train only once the buffer holds enough transitions
            if len(buffer.experiences) > hyperparams['min_experiences']:
                agent.train(hyperparams['gamma'], hyperparams['batch_size'])

        total_rewards[episode] = episode_reward
        avg_rewards = total_rewards[max(0, episode - 20):(episode + 1)].mean()
        with log_writer.as_default():
            tf.summary.scalar('episode reward', episode_reward, step=episode)
            tf.summary.scalar('avg for 20 episodes', avg_rewards, step=episode)

    agent.network.save_weights('dqn_{}_network.h5'.format(prio))
    env.close()
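# The replay classes themselves are not shown above; the training loop only
# relies on an `add(transition)` method and an `experiences` container. A
# minimal uniform buffer consistent with that interface could look like the
# sketch below (everything beyond `add`/`experiences` is illustrative, not
# the original implementation).
import random
from collections import deque

class UniformReplay:
    """Fixed-size FIFO buffer with uniform sampling (illustrative sketch)."""

    def __init__(self, max_experiences):
        self.experiences = deque(maxlen=max_experiences)

    def add(self, transition):
        # transition = (state, action, reward, next_state, done)
        self.experiences.append(transition)

    def sample(self, batch_size):
        # uniform random minibatch for the DQN update
        return random.sample(self.experiences, batch_size)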
def run_exp(cfg=None):
    logger = Logger(cfg)
    agent = DQNAgent(cfg)
    env = Env(cfg)
    trainer = Trainer(env, agent, cfg)

    cfg = cfg.exp
    n_training_steps = cfg.n_episodes // cfg.train_after
    global_step = 0
    state = env.reset()
    joint_angles = np.empty(cfg.n_episodes)

    for step in range(cfg.n_episodes):
        state = trainer.single_step(state)

        # agent training
        if global_step % cfg.train_after == (cfg.train_after - 1):
            print(f"step: {step}")
            print("Training agents")
            # NOTE: the forward-model warmup gate (originally 2000 steps) is
            # effectively disabled here, since `global_step >= 0` is always true.
            metrics_dict = agent.train(
                cfg.train_iv, cfg.train_fw,
                cfg.train_policy if global_step >= 0 else False)
            logger.log_metrics(metrics_dict, global_step)
            logger.log_all_network_weights(agent.joint_agents[0], step)
            agent.decrease_eps(n_training_steps)

        # video logging
        if global_step % cfg.video_after == 0:
            print("logging video")
            vis, debug0, debug1 = trainer.record_frames(debug_cams=True)
            logger.log_vid_debug_cams(vis, debug0, debug1, global_step)

        # distractor toggling
        if global_step % cfg.toggle_table_after == (cfg.toggle_table_after - 1):
            env.toggle_table()

        global_step += 1
        pos = env.get_joint_positions()[0]
        joint_angles[step] = pos

    # histogram of the joint angles visited during the run
    joint_angles = np.degrees(-joint_angles)
    plt.hist(joint_angles, bins=20, range=(0, 170))
    plt.savefig(os.path.join("plots", "explored_angles.png"))
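# `decrease_eps` is not shown above. A common choice, and a plausible reading
# of the call with `n_training_steps`, is a linear decay of the exploration
# rate toward a floor over the total number of training rounds. The sketch
# below is an assumption about that schedule, not the original implementation
# (the class, attribute names, and default values are illustrative).
class EpsilonSchedule:
    """Linear epsilon decay from eps_start to eps_min (illustrative sketch)."""

    def __init__(self, eps_start=1.0, eps_min=0.05):
        self.eps = eps_start
        self.eps_start = eps_start
        self.eps_min = eps_min

    def decrease_eps(self, n_training_steps):
        # shrink epsilon by a fixed fraction of the range per training round
        step = (self.eps_start - self.eps_min) / n_training_steps
        self.eps = max(self.eps_min, self.eps - step)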
# new best episode: log it and save the model
if episode_reward > max_reward:
    max_reward = episode_reward
    logger.log_best(episode, max_reward)
    # save the model because we have reached a new max reward
    name = str(episode_reward) + "_max_agent_.h5"
    logger.save_model(agent.model, name)

# train the DQN agent only once enough replay memory has been collected,
# otherwise continue with the next iteration of the while loop
if len(replay_memory.memory) < min_replay_memory:
    continue
agent.train()

# dynamic epsilon decay
if dynamic_epsilon and epsilon > min_epsilon and replay_memory_size > min_replay_memory:
    epsilon *= epsilon_decay * epsilon_reward_scale
elif epsilon > min_epsilon and len(replay_memory.memory) > min_replay_memory:
    epsilon *= epsilon_decay

done = True
parameter_list = {}
parameter_list['end_epsilon'] = epsilon  # not right yet
parameter_list['max_reward'] = max_reward
parameter_list['done'] = done
        h4 = F.relu(self.l4(h3))
        output = self.l5(h4)
        return output

net = Linear()

print('Initializing the learner...')
learner = Learner(settings)
learner.load_net(net)

print('Initializing the agent framework...')
agent = DQNAgent(settings)

print('Training...')
agent.train(learner, memory, simulator)

print('Loading the net...')
learner = agent.load(settings['save_dir'] + '/learner_final.p')

# pick the checkpoint with the best validation reward
ind_max = learner.val_rewards.index(max(learner.val_rewards))
ind_net = settings['initial_exploration'] + ind_max * settings['eval_every']
agent.load_net(learner, settings['save_dir'] + '/net_%d.p' % int(ind_net))

np.random.seed(settings["seed_general"])

print('Evaluating DQN agent...')
print('(reward, MSE loss, mean Q-value, episodes - NA, time)')
reward, MSE_loss, mean_Q_value, episodes, time, paths, actions, rewards = agent.evaluate(
    learner, simulator, 50000)
print(reward, MSE_loss, mean_Q_value, episodes, time)
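# The class whose forward pass ends above (layers l1-l5 with ReLU activations)
# is not included in this excerpt. A PyTorch-style sketch of a five-layer
# fully connected network with that shape is given below; the framework
# choice, layer widths, and input/output sizes are assumptions, not the
# original definition.
import torch.nn as nn
import torch.nn.functional as F

class Linear(nn.Module):
    """Illustrative five-layer MLP matching the l1-l5 / ReLU forward tail."""

    def __init__(self, n_in=4, n_hidden=128, n_out=2):
        super().__init__()
        self.l1 = nn.Linear(n_in, n_hidden)
        self.l2 = nn.Linear(n_hidden, n_hidden)
        self.l3 = nn.Linear(n_hidden, n_hidden)
        self.l4 = nn.Linear(n_hidden, n_hidden)
        self.l5 = nn.Linear(n_hidden, n_out)

    def forward(self, x):
        h1 = F.relu(self.l1(x))
        h2 = F.relu(self.l2(h1))
        h3 = F.relu(self.l3(h2))
        h4 = F.relu(self.l4(h3))
        output = self.l5(h4)
        return output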