import datetime

import gym
import numpy as np

from tools.plot_tool import plot_with_avg_std
# DQNAgent, DRQNAgent and A2CAgent are imported from the repo's agent modules (paths not shown in this excerpt).


def DQNTrain(scenario_args, observation_space_args, action_space_args,
             reward_args, data_args, almgren_chriss_args):
    """Train the DQNAgent by sampling trajectories from the trading_environment."""
    EPISODES = 10000
    env = gym.make('hwenv-v0',
                   scenario_args=scenario_args,
                   observation_space_args=observation_space_args,
                   action_space_args=action_space_args,
                   reward_args=reward_args,
                   data_args=data_args,
                   almgren_chriss_args=almgren_chriss_args)

    # Get the sizes of the state and action spaces from the trading_environment.
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n
    agent = DQNAgent(ob_dim, ac_dim, batch_size=64, initial_exploration_steps=10000)

    scores = []
    avg_step = 10
    for eps in range(EPISODES):
        eps_rew = agent.sample_trajectory(env)
        scores.append(eps_rew)
        if eps % avg_step == 0:
            # Average reward over the most recent avg_step episodes.
            avg = sum(scores[-avg_step:]) / avg_step
            print('{} episode: {}/{}, average reward: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), eps, EPISODES, avg))
        agent.train_model()
        if eps % 10 == 0:
            agent.update_target_model()

    plot_with_avg_std(scores, 1, xlabel='Number of Episodes')
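# Hedged usage sketch: DQNTrain is driven entirely by the configuration dictionaries consumed
# by the hwenv-v0 environment. Their exact keys are defined elsewhere in the repo, so the empty
# dicts below are placeholders for illustration only, not the real configuration schema.
def run_dqn_example():
    """Illustrative wrapper showing how DQNTrain is invoked (placeholder configs)."""
    scenario_args = {}            # e.g. trading horizon / initial inventory (hypothetical keys)
    observation_space_args = {}   # features exposed to the agent
    action_space_args = {}        # discretisation of the trade size
    reward_args = {}              # reward / implementation-shortfall settings
    data_args = {}                # market data source
    almgren_chriss_args = {}      # Almgren-Chriss model parameters
    DQNTrain(scenario_args, observation_space_args, action_space_args,
             reward_args, data_args, almgren_chriss_args)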
def DRQNTrain(scenario_args, observation_space_args, action_space_args,
              reward_args, data_args, almgren_chriss_args, double):
    """Train the DRQNAgent (optionally as double DQN) on the trading_environment."""
    EPISODES = 30000
    env = gym.make('hwenv-v0',
                   scenario_args=scenario_args,
                   observation_space_args=observation_space_args,
                   action_space_args=action_space_args,
                   reward_args=reward_args,
                   data_args=data_args,
                   almgren_chriss_args=almgren_chriss_args)

    # Get the sizes of the state and action spaces from the trading_environment.
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n
    agent = DRQNAgent(ob_dim, ac_dim, lookback=30, batch_size=256,
                      initial_exploration_steps=1000, double=double)

    scores = []
    avgs = []
    avg_step = 100
    for eps in range(EPISODES):
        eps_rew = agent.sample_trajectory(env)
        scores.append(eps_rew)
        if eps % avg_step == 0 and eps != 0:
            # Average reward over the most recent avg_step episodes.
            avg = sum(scores[-avg_step:]) / avg_step
            avgs.append(avg)
            print('{} episode: {}/{}, average reward: {}'.format(
                datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), eps, EPISODES, avg))
        # env.render()
        agent.train_model()
        if eps % 5 == 0:
            agent.update_target_model()
            agent.target_model.save('model.h5')
            print('Saved model to disk.')

    plot_with_avg_std(avgs, 10)
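# Hedged sketch of reloading the checkpoint written by DRQNTrain: 'model.h5' is a full Keras
# model saved with Model.save, so keras.models.load_model restores it. The assumed input layout
# of a single observation window, (lookback, ob_dim), follows from lookback=30 above; the real
# DRQNAgent interface may differ.
def greedy_action(model, ob_window):
    """Pick the argmax-Q action for one (lookback, ob_dim) observation window."""
    q_values = model.predict(ob_window[np.newaxis, ...], verbose=0)  # add batch dimension
    return int(np.argmax(q_values[0]))

# Example (hypothetical):
# trained_model = keras.models.load_model('model.h5')
# action = greedy_action(trained_model, recent_observations)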
def A2CTrain(scenario_args, observation_space_args, action_space_args,
             reward_args, data_args, almgren_chriss_args):
    """Train the A2CAgent by sampling trajectories from the trading_environment."""
    N_ITERATION = 200

    # Initialize the gym trading_environment.
    env = gym.make('hwenv-v0',
                   scenario_args=scenario_args,
                   observation_space_args=observation_space_args,
                   action_space_args=action_space_args,
                   reward_args=reward_args,
                   data_args=data_args,
                   almgren_chriss_args=almgren_chriss_args)

    # Initialize the A2CAgent.
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n
    agent = A2CAgent(ob_dim, ac_dim)

    # Iterate: repeatedly sample trajectories, then update the neural network parameters.
    avg_rews = []
    for itr in range(N_ITERATION):
        # Sample trajectories.
        print("********** Iteration %i ************" % itr)
        paths, timesteps_this_batch, avg_rew, avg_info = agent.sample_trajectories(
            itr, env, info_name='shortfall')
        avg_rews.append(avg_rew)
        print("Total rewards per trajectory in this iteration: ", avg_rew)

        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = np.concatenate([path["reward"] for path in paths])
        next_ob_no = np.concatenate([path["next_observation"] for path in paths])
        terminal_n = np.concatenate([path["terminal"] for path in paths])

        # Update the critic model and the actor model.
        agent.update_critic(ob_no, next_ob_no, re_n, terminal_n)
        adv_n = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        agent.update_actor(ob_no, ac_na, adv_n)

    # Visualize the training results.
    plot_with_avg_std(avg_rews, 1, xlabel='Number of Iterations')
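# For reference, a minimal sketch of the one-step advantage that agent.estimate_advantage is
# expected to compute (standard A2C bootstrapping). The real implementation, the critic's
# interface, and the value of gamma live inside A2CAgent, so this is an illustration under
# those assumptions rather than the repo's exact code.
def one_step_advantage(critic, ob_no, next_ob_no, re_n, terminal_n, gamma=0.99):
    """adv = r + gamma * V(s') * (1 - done) - V(s), estimated with a Keras critic network."""
    v_s = critic.predict(ob_no, verbose=0).flatten()
    v_next = critic.predict(next_ob_no, verbose=0).flatten()
    return re_n + gamma * v_next * (1.0 - terminal_n) - v_s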
import datetime

import gym
import keras

from tools.plot_tool import plot_with_avg_std
# DRQN_Cartpole_Agent and LinearSchedule come from the repo's cartpole agent modules.

EPISODES = 100  # number of evaluation episodes; value assumed, not given in the original excerpt

env = gym.make('CartPole-v1')

# Get the sizes of the state and action spaces from the environment.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# Load a previously trained DRQN model and run it greedily (exploration fixed at 0).
model = keras.models.load_model(
    '/Users/mmw/Documents/GitHub/rl_for_optimal_exec/drqn_cartpole/drqn_cartpole_v0_10000_eps.h5')
agent = DRQN_Cartpole_Agent(state_size, action_size,
                            lookback=5,
                            initial_exploration_eps=0,
                            exploration=LinearSchedule(1, 0, initial_p=0),
                            model=model)

scores, episodes = [], []
avg_step = 1
for eps in range(EPISODES):
    eps_rew = agent.sample_transition_pairs(env, render=(eps % avg_step == 0), max_step=500)
    scores.append(eps_rew)
    if eps % avg_step == 0:
        avg = sum(scores[-avg_step:]) / avg_step
        print('{} episode: {}/{}, average reward: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), eps, EPISODES, avg))
    env.reset()

plot_with_avg_std(scores, 1, xlabel='Number of Episodes')
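# LinearSchedule(1, 0, initial_p=0) above pins the exploration rate at 0, i.e. the loaded policy
# is evaluated greedily. A minimal sketch of such a schedule, assuming the baselines-style
# signature LinearSchedule(schedule_timesteps, final_p, initial_p); the repo's own class may
# differ in detail.
class LinearScheduleSketch:
    """Linearly interpolate from initial_p to final_p over schedule_timesteps, then hold."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)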
import gym
import keras

from cartpole_agents.a2c_rnn_cartpole import ACRnnAgent
from tools.plot_tool import plot_with_avg_std

env = gym.make('CartPole-v1')
discrete = isinstance(env.action_space, gym.spaces.Discrete)
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

agent = ACRnnAgent(ob_dim, ac_dim, batch_size=100)
agent.actor_model = keras.models.load_model('dra2c_cartpole_actor_70_itr.h5')
agent.critic_model = keras.models.load_model('dra2c_cartpole_critic_70_itr.h5')

avg_rews = []
n_iter = 100
total_timesteps = 0
for itr in range(n_iter):
    print("********** Iteration %i ************" % itr)
    ob_seq, next_ob_seq, ac_na, re_n, terminal_n, avg_rew = agent.sample_trajectories(
        env, render=True, animate_eps_frequency=1)
    avg_rews.append(avg_rew)
    print(avg_rew)

plot_with_avg_std(avg_rews, 1, xlabel='Number of Iterations')
import gym
import keras

from cartpole_agents.a2c_cartpole.ac_agent import ACAgent
from tools.plot_tool import plot_with_avg_std

env = gym.make('CartPole-v1')
discrete = isinstance(env.action_space, gym.spaces.Discrete)
ob_dim = env.observation_space.shape[0]
ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

agent = ACAgent(ob_dim, ac_dim)
agent.actor_model = keras.models.load_model('a2c_cartpole_actor_50_eps.h5')
agent.critic_model = keras.models.load_model('a2c_cartpole_critic_50_eps.h5')

# # build computation graph
# agent.build_computation_graph()
#
# # tensorflow: config, session, variable initialization
# agent.init_tf_sess()

avg_rews = []
n_iter = 100
total_timesteps = 0
for itr in range(n_iter):
    print("********** Iteration %i ************" % itr)
    paths, timesteps_this_batch, avg_rew = agent.sample_trajectories(
        env, render=True, animate_eps_frequency=1)
    avg_rews.append(avg_rew)
    print(avg_rew)
    total_timesteps += timesteps_this_batch

plot_with_avg_std(avg_rews, 1, xlabel='Number of Iterations')
import datetime

import gym

from tools.plot_tool import plot_with_avg_std
# DRQN_Cartpole_Agent comes from the repo's cartpole agent modules.

EPISODES = 10000  # number of training episodes; value assumed, not given in the original excerpt

env = gym.make('CartPole-v1')

# Get the sizes of the state and action spaces from the environment.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

agent = DRQN_Cartpole_Agent(state_size, action_size,
                            lookback=5,
                            batch_size=64,
                            initial_exploration_eps=1000,
                            buffer_size=int(2e5))

scores, episodes = [], []
avg_step = 100
for eps in range(EPISODES):
    eps_rew = agent.sample_transition_pairs(env, render=(eps % avg_step == 0), max_step=500)
    scores.append(eps_rew)
    if eps % avg_step == 0:
        # Average reward over the most recent avg_step episodes.
        avg = sum(scores[-avg_step:]) / avg_step
        print('{} episode: {}/{}, average reward: {}'.format(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"), eps, EPISODES, avg))
    agent.train_model()
    if eps % 1 == 0:
        # Update the target network every episode.
        agent.update_target_model()
    if eps % 100 == 0:
        env.render()
    env.reset()
    if eps % 10000 == 0:
        # Periodic checkpoint of the online network.
        agent.model.save(f'drqn_cartpole_tanh_v1_{int(eps)}_eps.h5')

plot_with_avg_std(scores, 500, xlabel='Number of Episodes in 500')
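# plot_with_avg_std is the repo's plotting helper; its implementation is not shown in this
# excerpt. A hedged sketch of what the call plot_with_avg_std(scores, 500, ...) is assumed to
# do: average the raw episode rewards over non-overlapping windows of 500 and shade +/- one
# standard deviation around the windowed mean.
import numpy as np
import matplotlib.pyplot as plt


def plot_with_avg_std_sketch(values, window, xlabel='Number of Episodes'):
    """Plot windowed means of `values` with a +/- 1 std band (illustrative only)."""
    values = np.asarray(values, dtype=float)
    n_windows = len(values) // window
    chunks = values[:n_windows * window].reshape(n_windows, window)
    means, stds = chunks.mean(axis=1), chunks.std(axis=1)
    x = np.arange(1, n_windows + 1)
    plt.plot(x, means)
    plt.fill_between(x, means - stds, means + stds, alpha=0.3)
    plt.xlabel(xlabel)
    plt.ylabel('Average reward')
    plt.show()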