import datetime

from scipy.io import savemat

# TRPO is the agent class defined elsewhere in this project; its import is not
# shown in the original excerpt.


class LEARNER():
    def __init__(self, args, sess, simulator):
        self.args = args
        self.sess = sess
        self.simulator = simulator

        # Define learning agent (TRPO)
        self.agent = TRPO(self.args, self.simulator, self.sess)

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0
        all_logs = list()

        while True:
            # Train the TRPO agent
            train_index += 1
            train_log = self.agent.train(train_index)
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]
            all_logs.append(train_log)
            print(train_log['Episode_Avg_Reward'])
            print(train_index)

            # Stop once the step budget is exhausted and dump all logs to a .mat file
            if total_steps > self.args.total_train_step:
                savemat('data1_' + datetime.datetime.now().strftime("%y-%m-%d-%H-%M") + '.mat',
                        dict(data=all_logs, args=self.args))
                break
import datetime

import gym
from scipy.io import savemat

# TRPO is the agent class defined elsewhere in this project; its import is not
# shown in the original excerpt.


class LEARNER():
    def __init__(self, args, sess, simulator):
        self.args = args
        self.sess = sess
        self.simulator = simulator

        # Construct simulation environment (this replaces the simulator passed in above)
        self.simulator = gym.make('Pendulum-v0')

        # Define learning agent (TRPO)
        self.agent = TRPO(self.args, self.simulator, self.sess)

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0
        all_logs = list()

        while True:
            # Train the TRPO agent
            train_index += 1
            train_log = self.agent.train()
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]
            all_logs.append(train_log)

            # Simulate system w/ new parameters every 20 training iterations
            if train_index % 20 == 0:
                self.agent.sim()

            # Stop once the step budget is exhausted; save logs and network weights
            if total_steps > self.args.total_train_step:
                nn_weights = {
                    'policy_network': self.agent.get_value(),
                    'advantage_network': self.agent.gae.get_value()
                }
                savemat('data_' + datetime.datetime.now().strftime("%y-%m-%d-%H-%M") + '.mat',
                        dict(data=all_logs, args=self.args))
                savemat('weights_' + datetime.datetime.now().strftime("%y-%m-%d-%H-%M") + '.mat',
                        dict(policy_weights=nn_weights, args=self.args))
                break
import datetime

import gym
import numpy as np
from gym import spaces
from scipy.io import savemat

# TRPO is the agent class defined elsewhere in this project; its import is not
# shown in the original excerpt.


class LEARNER():
    def __init__(self, args, sess, simulator):
        self.args = args
        self.sess = sess
        self.simulator = simulator

        # Construct simulation environment: Pendulum-v0 with enlarged torque and
        # speed limits (this replaces the simulator passed in above)
        self.simulator = gym.make('Pendulum-v0')
        self.simulator.unwrapped.max_torque = 15.
        self.simulator.unwrapped.max_speed = 60.
        self.simulator.unwrapped.action_space = spaces.Box(
            low=-self.simulator.unwrapped.max_torque,
            high=self.simulator.unwrapped.max_torque,
            shape=(1,))
        high = np.array([1., 1., self.simulator.unwrapped.max_speed])
        self.simulator.unwrapped.observation_space = spaces.Box(low=-high, high=high)

        # Define learning agent (TRPO)
        self.agent = TRPO(self.args, self.simulator, self.sess)

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0
        all_logs = list()

        while True:
            # Train the TRPO agent
            train_index += 1
            train_log = self.agent.train()
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]
            all_logs.append(train_log)

            # Simulate system w/ new parameters every 5 training iterations
            if train_index % 5 == 0:
                self.agent.sim()
            print(train_index)

            # Stop once the step budget is exhausted and dump all logs to a .mat file
            if total_steps > self.args.total_train_step:
                savemat('data4_' + datetime.datetime.now().strftime("%y-%m-%d-%H-%M") + '.mat',
                        dict(data=all_logs, args=self.args))
                break
# Fragment of a config-driven training routine: config, env_name, policy,
# value_fun, n_trajectories, max_timesteps, state_filter, env_args,
# simulator_type, model_name and continue_from_file are all bound earlier in
# the original script.
env_args['fixed_length'] = True
env_args['trajectory_length'] = 5

if simulator_type == 'single-path':
    simulator = SinglePathSimulator(env_name, policy, n_trajectories, max_timesteps,
                                    state_filter=state_filter, **env_args)
elif simulator_type == 'vine':
    raise NotImplementedError

try:
    trpo_args = config['trpo_args']
except KeyError:
    trpo_args = {}

trpo = TRPO(policy, value_fun, simulator, model_name=model_name,
            continue_from_file=continue_from_file, **trpo_args)

print(f'Training policy {model_name} on {env_name} environment...\n')

trpo.train(config['n_episodes'])

print('\nTraining complete.\n')
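The fragment above reads only 'trpo_args' and 'n_episodes' from its config mapping; the other names it uses are unpacked earlier in the original script. A minimal sketch of what such a config might contain, where every key and value is an assumption rather than something taken from the source:

# Hypothetical config for the fragment above; all values are placeholders.
config = {
    'env_name': 'Pendulum-v0',        # assumed environment id
    'simulator_type': 'single-path',  # the only simulator branch implemented above
    'n_trajectories': 32,
    'max_timesteps': 200,
    'n_episodes': 1000,
    'env_args': {},                   # extended with fixed_length / trajectory_length above
    'trpo_args': {},                  # forwarded verbatim to the TRPO constructor
}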
# Leftover keyword arguments from an earlier, commented-out agent configuration:
# env=env,
# learning_rate=0.02,
# gamma=0.995,
# output_graph=False,
# seed=1,
# ep_max=3000,
# ep_steps_max=8000,
# hidden_sizes=(30,)
# )

RL = TRPO(env=env,
          lr_pi=0.01,
          lr_v=0.01,
          gamma=0.99,
          lam=0.97,
          delta=0.01,
          output_graph=False,
          seed=1,
          ep_max=100,
          ep_steps_max=4000,
          hidden_sizes=(64, 64),
          train_v_iters=80,
          damping_coeff=0.1,
          cg_iters=10,
          backtrack_iters=10,
          backtrack_coeff=0.8,
          algo='npg')

# RL.train(env, render_threshold_reward=-500, render=False)
RL.train(env, render_threshold_reward=-1000, render=False)
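The call above relies on an env object created earlier in the original script. A minimal sketch of that setup, assuming a standard Gym continuous-control environment (the environment id is a guess, not taken from the source):

import gym

# Hypothetical environment setup for the snippet above; any Gym environment
# with a Box action space would fit. Pendulum-v0 is only an assumption.
env = gym.make('Pendulum-v0')
env.seed(1)  # mirror the seed=1 passed to the TRPO constructor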
import os
import time

import gym
import numpy as np
import tensorflow as tf

# TRPO is the agent class defined elsewhere in this project; its import is not
# shown in the original excerpt.


class LEARNER():
    def __init__(self, args, sess):
        self.args = args
        self.sess = sess
        self.env = gym.make(self.args.env_name)
        self.args.max_path_length = self.env.spec.timestep_limit
        self.agent = TRPO(self.args, self.env, self.sess)
        self.saver = tf.train.Saver()

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0

        while True:
            train_index += 1
            start_time = time.time()

            # Run one TRPO training iteration and accumulate step/episode counters
            train_log = self.agent.train()
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]

            self.write_logs(train_index, total_episode, total_steps, start_time, train_log)

            # Periodically checkpoint the model
            if np.mod(train_index, self.args.save_interval) == 0:
                self.save(train_index)

            # Stop once the step budget is exhausted
            if total_steps > self.args.total_train_step:
                break

    def write_logs(self, train_index, total_episode, total_steps, start_time, log_info):
        log_path = os.path.join(self.args.log_dir, self.model_dir + '.csv')
        if not os.path.exists(log_path):
            log_file = open(log_path, 'w')
            log_file.write("Train step\t," + "Surrogate\t," + "KL divergence\t," +
                           "Number of steps trained\t," + "Number of episodes trained\t," +
                           "Episode.Avg.reward\t," + "Elapsed time\n")
        else:
            log_file = open(log_path, 'a')

        print("Train step %d => Surrogate loss : %3.3f, KL div : %3.8f, "
              "Number of Episode/steps trained : %d/%d, Episode.Avg.reward : %3.3f, Time : %3.3f"
              % (train_index, log_info["Surrogate loss"], log_info["KL_DIV"],
                 total_episode, total_steps, log_info["Episode Avg.reward"],
                 time.time() - start_time))

        log_file.write(str(train_index) + '\t,' + str(log_info["Surrogate loss"]) + '\t,' +
                       str(log_info["KL_DIV"]) + '\t,' + str(total_steps) + '\t,' +
                       str(total_episode) + '\t,' + str(log_info["Episode Avg.reward"]) + '\t,' +
                       str(time.time() - start_time) + '\n')
        log_file.flush()

    def save(self, steps):
        model_name = 'TRPO_GAE'
        checkpoint_dir = os.path.join(self.args.checkpoint_dir, self.model_dir)
        if not os.path.exists(checkpoint_dir):
            os.mkdir(checkpoint_dir)
        self.saver.save(self.sess, os.path.join(checkpoint_dir, model_name), global_step=steps)
        print('Checkpoint saved at %d train step' % steps)

    @property
    def model_dir(self):
        return '{}_{}lambda'.format(self.args.env_name, self.args.lamda)
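None of the LEARNER variants above show how they are constructed. A minimal driver sketch for the last variant, assuming a TensorFlow 1.x session and an argparse namespace carrying the fields the class reads; every default value below is a placeholder, only the attribute names come from the class itself:

import argparse

import tensorflow as tf

# Hypothetical entry point for the LEARNER class above. The flag names match
# the attributes the class accesses (env_name, lamda, save_interval,
# total_train_step, log_dir, checkpoint_dir); the defaults are placeholders.
parser = argparse.ArgumentParser()
parser.add_argument('--env_name', default='Pendulum-v0')
parser.add_argument('--lamda', type=float, default=0.97)
parser.add_argument('--save_interval', type=int, default=10)
parser.add_argument('--total_train_step', type=int, default=1000000)
parser.add_argument('--log_dir', default='./logs')
parser.add_argument('--checkpoint_dir', default='./checkpoints')
args = parser.parse_args()

with tf.Session() as sess:
    learner = LEARNER(args, sess)
    # The original TRPO class may initialize its own variables; if not, do it here.
    sess.run(tf.global_variables_initializer())
    learner.learn()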