import gym
from agents import experienceReplayBuffer, DDQNAgent, QNetwork
import torch
from agents import evaluate
from copy import deepcopy

if __name__ == "__main__":
    n_iter = 100000
    env = gym.make('gym_pvz:pvz-env-v2')
    nn_name = input("Save name: ")
    buffer = experienceReplayBuffer(memory_size=100000, burn_in=10000)
    net = QNetwork(env, device='cpu', use_zombienet=False, use_gridnet=False)
    # old_agent = torch.load("agents/benchmark/dfq5_znet_epslinear")
    # net.zombienet.load_state_dict(old_agent.zombienet.state_dict())
    # for p in net.zombienet.parameters():
    #     p.requires_grad = False
    # net.optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()),
    #                                  lr=net.learning_rate)
    agent = DDQNAgent(env, net, buffer, n_iter=n_iter, batch_size=200)
    agent.train(max_episodes=n_iter, evaluate_frequency=5000, evaluate_n_iter=1000)
    torch.save(agent.network, nn_name)
    agent._save_training_data(nn_name)
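# The experienceReplayBuffer used above is imported from the project's `agents`
# package and is not shown here. The class below is a hypothetical stand-in,
# assuming the memory_size/burn_in constructor seen above and a uniform
# sample_batch() method (the method names are guesses, not the project's API).
import random
from collections import deque, namedtuple

Experience = namedtuple('Experience',
                        ['state', 'action', 'reward', 'done', 'next_state'])


class ExperienceReplayBufferSketch:
    """Uniform replay buffer sketch; illustrative only."""

    def __init__(self, memory_size=100000, burn_in=10000):
        self.memory_size = memory_size
        self.burn_in = burn_in                      # transitions to collect before learning starts
        self.replay_memory = deque(maxlen=memory_size)

    def append(self, state, action, reward, done, next_state):
        self.replay_memory.append(Experience(state, action, reward, done, next_state))

    def sample_batch(self, batch_size=32):
        # Uniform random sample of stored transitions.
        return random.sample(self.replay_memory, batch_size)

    def burn_in_capacity(self):
        # Fraction of the burn-in already collected; training waits until this reaches 1.0.
        return len(self.replay_memory) / self.burn_in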
import os

import numpy as np
import tensorflow as tf  # this function uses the TF1.x graph/session API

# DDQNAgent / QAgent, `fig`, `path` and `refresh_chart` are defined elsewhere in
# the same module.


def train(config, env):
    all_rewards = []
    steps_taken = []
    all_losses = []
    epsilon = 1.0
    tf.reset_default_graph()
    annealing_rate = (epsilon - config.epsilon_min) / config.total_episodes
    tfConfig = tf.ConfigProto()
    tfConfig.gpu_options.per_process_gpu_memory_fraction = config.gpu
    with tf.Session(config=tfConfig) as sess:
        # agent = QAgent(sess, config)
        agent = DDQNAgent(sess, config)
        sess.run(tf.global_variables_initializer())
        fig.show()
        fig.canvas.draw()

        # Create folder to store the model in, if it doesn't exist.
        if config.save_model and not os.path.exists(path):
            os.makedirs(path)

        total_step_count = 0
        if config.load_model:
            print('Loading latest saved model...')
            agent.load_agent_state()

        for episode_count in range(1, config.total_episodes + 1):
            step_count = 0
            episode_buffer = []
            running_reward = 0
            episode_loss = []
            done = False
            s = env.reset()

            while step_count < config.max_episode_length and not done:
                if config.render_env:
                    env.render()
                step_count += 1
                total_step_count += 1

                # Epsilon-greedy exploration. Note: np.random.rand draws from a
                # uniform distribution; the original used np.random.randn (a
                # normal draw), which does not explore with probability epsilon.
                if np.random.rand(1) < epsilon or \
                        total_step_count < config.pretrain_steps:
                    action = np.random.randint(0, config.a_size)
                else:
                    action = agent.take_action(s)
                    print(action)

                next_state, reward, done, _ = env.step(action)
                if config.verbose:
                    print("Post Action", action, " on step count", step_count,
                          "total_step_count", total_step_count,
                          "next_state", next_state, "reward", reward, "done", done)

                d_int = 1 if done else 0
                running_reward += reward
                episode_buffer.append([s, action, reward, next_state, d_int])
                s = next_state

                if total_step_count > config.pretrain_steps and \
                        total_step_count % config.update_freq == 0:
                    episode_loss.append(np.mean(agent.update_agent()))

            agent.add_experiences(episode_buffer)
            all_rewards.append(running_reward)
            if len(episode_loss) != 0:
                all_losses.append(np.mean(episode_loss))
            steps_taken.append(step_count)

            if total_step_count > config.pretrain_steps:
                epsilon -= annealing_rate

            # Save model.
            if config.save_model and total_step_count > config.pretrain_steps and \
                    episode_count % config.save_model_episode_interval == 0:
                print('Saving model...')
                agent.save_agent_state(
                    path + '/model-' + str(episode_count) + '.ckpt',
                    total_step_count)

            # Refresh charts.
            if total_step_count > config.pretrain_steps and \
                    episode_count % config.chart_refresh_interval == 0:
                refresh_chart(all_rewards, all_losses)
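# The double-DQN update itself happens inside DDQNAgent.update_agent(), which is
# not shown. For reference, this is the target such an update typically computes
# (plain NumPy sketch; the function and argument names are illustrative, not the
# agent's actual API): the online network selects the next action, the target
# network evaluates it.
import numpy as np


def double_dqn_targets(rewards, dones, q_online_next, q_target_next, gamma=0.99):
    # q_online_next / q_target_next: (batch, n_actions) Q-values for the next
    # states from the online and target networks respectively.
    best_actions = np.argmax(q_online_next, axis=1)                      # action selection: online net
    q_next = q_target_next[np.arange(len(best_actions)), best_actions]   # action evaluation: target net
    return rewards + gamma * q_next * (1.0 - dones)                      # no bootstrap on terminal steps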
def ddqn_train(model_name, load_model=False, model_filename=None,
               optimizer_filename=None):
    print("DDQN -- Training")
    env = make('hungry_geese')
    trainer = env.train(
        ['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])
    agent = DDQNAgent(rows=11, columns=11, num_actions=3)
    buffer = ReplayBuffer()
    strategy = EpsilonGreedyStrategy(start=0.5, end=0.0, decay=0.00001)

    if load_model:
        agent.load_model_weights(model_filename)
        agent.load_optimizer_weights(optimizer_filename)

    start_episode = 0
    end_episode = 50000
    epochs = 32
    batch_size = 128

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        epsilon = strategy.get_epsilon(episode - start_episode)
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0

        while not done:
            ep_steps += 1
            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_epsilon_greedy_action(state, epsilon)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(
                env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            buffer.add(state, action, reward, next_state, done)
            obs_dict = next_obs_dict
            prev_direction = direction
            ep_reward += reward

        if len(buffer) >= batch_size:
            for _ in range(epochs):
                states, actions, rewards, next_states, dones = buffer.get_samples(
                    batch_size)
                agent.fit(states, actions, rewards, next_states, dones)

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) +
              " - STEPS: " + str(ep_steps))

        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

        if episode % 10 == 0:
            agent.update_target_network()

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))
            print('Epsilon: ' + str(round(epsilon, 3)))
            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' +
                  str(last_1000_ep_reward_mean))
            print()

        if episode % 1000 == 0:
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                epsilon = 0
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_epsilon_greedy_action(state, epsilon)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(
                        env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

        if episode % 5000 == 0:
            agent.save_model_weights('models/ddqn_' + model_name + '_' +
                                     str(episode) + '.h5')
            agent.save_optimizer_weights('models/ddqn_' + model_name + '_' +
                                         str(episode) + '_optimizer.npy')

    agent.save_model_weights('models/ddqn_' + model_name + '_' +
                             str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/ddqn_' + model_name + '_' +
                                 str(end_episode) + '_optimizer.npy')

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)],
             training_rewards)
    plt.title('Reward')
    plt.show()
    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)],
             evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
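# EpsilonGreedyStrategy(start, end, decay) is not defined in this snippet. A
# common implementation, and a plausible reading of those constructor arguments,
# is an exponential decay from `start` toward `end`; the class below is an
# assumption, not the project's code. Under this schedule, start=0.5 with
# decay=1e-5 still leaves epsilon at roughly 0.30 after 50,000 episodes.
import math


class EpsilonGreedyStrategySketch:
    def __init__(self, start=0.5, end=0.0, decay=0.00001):
        self.start = start
        self.end = end
        self.decay = decay

    def get_epsilon(self, current_step):
        # Decays exponentially from `start` toward `end` as training progresses.
        return self.end + (self.start - self.end) * math.exp(-self.decay * current_step)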
agent0 = DDQNAgentWithPER(action_space_size=gs.get_action_space_size(),
                          neurons_per_hidden_layer=128,
                          hidden_layers=5)
agent0.alpha = 0.1
agent0.epsilon = 0.005

agent1 = RandomAgent()

agent2 = DDQNAgentWithER(action_space_size=gs.get_action_space_size(),
                         neurons_per_hidden_layer=128,
                         hidden_layers=5)
agent2.alpha = 0.1
agent2.epsilon = 0.005

agent3 = DDQNAgent(action_space_size=gs.get_action_space_size(),
                   neurons_per_hidden_layer=128,
                   hidden_layers=5)
agent3.alpha = 0.1
agent3.epsilon = 0.005

agent4 = DeepQLearningAgent(action_space_size=gs.get_action_space_size(),
                            neurons_per_hidden_layer=128,
                            hidden_layers=5)
agent4.alpha = 0.1
agent4.epsilon = 0.005

agent5 = PPOAgent(state_space_size=gs.get_vectorized_state().shape[0],
                  action_space_size=gs.get_action_space_size())

agent6 = RandomRolloutAgent(100, False)
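# The main difference between agent0 (DDQNAgentWithPER), agent2 (DDQNAgentWithER)
# and agent3 (DDQNAgent) is how experience is replayed: prioritized experience
# replay (PER) samples transitions in proportion to their TD error instead of
# uniformly. The class below is a minimal proportional-PER sketch (Schaul et al.,
# 2015) for reference only; the real DDQNAgentWithPER manages its own buffer, and
# a flat array is used here instead of a sum-tree for clarity.
import numpy as np


class PrioritizedReplayBufferSketch:
    def __init__(self, capacity=10000, alpha=0.6, eps=1e-6):
        self.capacity = capacity
        self.alpha = alpha      # how strongly priorities skew sampling (0 = uniform)
        self.eps = eps          # keeps zero-error transitions sampleable
        self.data = []
        self.priorities = []

    def add(self, transition, td_error=1.0):
        if len(self.data) >= self.capacity:
            self.data.pop(0)
            self.priorities.pop(0)
        self.data.append(transition)
        self.priorities.append((abs(td_error) + self.eps) ** self.alpha)

    def sample(self, batch_size, beta=0.4):
        probs = np.asarray(self.priorities)
        probs = probs / probs.sum()
        idx = np.random.choice(len(self.data), batch_size, p=probs)
        # Importance-sampling weights correct the bias from non-uniform sampling.
        weights = (len(self.data) * probs[idx]) ** (-beta)
        weights = weights / weights.max()
        return [self.data[i] for i in idx], idx, weights

    def update_priorities(self, idx, td_errors):
        for i, err in zip(idx, td_errors):
            self.priorities[i] = (abs(err) + self.eps) ** self.alpha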
# Command-line flags arrive as strings; convert them to booleans.
args.double_dqn = args.double_dqn == 'True'
args.duelling = args.duelling == 'True'

print("Double DQN {}, Duelling Architecture {}".format(
    args.double_dqn, args.duelling))

# Instantiate the appropriate agent.
if args.double_dqn and args.duelling:
    agent = DDQNAgent(state_size=37, action_size=4,
                      model=DuelingQNetwork, seed=0)
    agent_name = 'duel_ddqn'
elif args.double_dqn and not args.duelling:
    agent = DDQNAgent(state_size=37, action_size=4, model=QNetwork, seed=0)
    agent_name = 'ddqn'
elif not args.double_dqn and args.duelling:
    agent = DQNAgent(state_size=37, action_size=4,
                     model=DuelingQNetwork, seed=0)
    agent_name = 'duel_dqn'
else:
    agent = DQNAgent(state_size=37, action_size=4, model=QNetwork, seed=0)
    agent_name = 'dqn'
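# The snippet above receives --double_dqn and --duelling as strings, which is why
# it compares them against the literal 'True'. A hypothetical parser matching
# those flags (the actual script's argument definitions are not shown) might look
# like this; using action='store_true' instead would remove the need for the
# manual string-to-bool conversion.
import argparse

parser = argparse.ArgumentParser(description='Train a (double / duelling) DQN agent.')
parser.add_argument('--double_dqn', type=str, default='False',
                    help="'True' to use the double-DQN update")
parser.add_argument('--duelling', type=str, default='False',
                    help="'True' to use the duelling network architecture")
args = parser.parse_args()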