Example #1
import datetime

from scipy.io import savemat

# TRPO is the project's agent class, imported elsewhere in the original file.


class LEARNER():
    def __init__(self, args, sess, simulator):
        self.args = args
        self.sess = sess
        self.simulator = simulator

        #Define learning agent (TRPO)
        self.agent = TRPO(self.args, self.simulator, self.sess)

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0
        all_logs = list()
        while True:
            #Train the TRPO agent
            train_index += 1
            train_log = self.agent.train(train_index)
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]

            all_logs.append(train_log)

            print(train_log['Episode_Avg_Reward'])
            print(train_index)

            if total_steps > self.args.total_train_step:
                savemat(
                    'data1_' +
                    datetime.datetime.now().strftime("%y-%m-%d-%H-%M") +
                    '.mat', dict(data=all_logs, args=self.args))
                break
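A class like this is normally driven by a small entry script that builds the argument namespace, the TensorFlow session, and the environment before calling learn(). The sketch below is purely illustrative: only total_train_step is read by the snippet above, and the environment name and TF1-style session handling are assumptions.

import argparse

import gym
import tensorflow as tf

# Hypothetical driver for the LEARNER above; field and environment names are assumptions.
parser = argparse.ArgumentParser()
parser.add_argument('--total_train_step', type=int, default=1000000)
args = parser.parse_args()

simulator = gym.make('Pendulum-v0')
with tf.Session() as sess:
    learner = LEARNER(args, sess, simulator)
    sess.run(tf.global_variables_initializer())
    learner.learn()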
Example #2
import datetime

import gym
from scipy.io import savemat


class LEARNER():
    def __init__(self, args, sess, simulator):
        self.args = args
        self.sess = sess
        self.simulator = simulator

        #Construct simulation environment
        self.simulator = gym.make('Pendulum-v0')

        #Define learning agent (TRPO)
        self.agent = TRPO(self.args, self.simulator, self.sess)

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0
        all_logs = list()
        while True:
            #Train the TRPO agent
            train_index += 1
            train_log = self.agent.train()
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]

            all_logs.append(train_log)

            #Simulate system w/ new parameters
            if train_index % 20 == 0:
                self.agent.sim()

            if total_steps > self.args.total_train_step:
                nn_weights = {
                    'policy_network': self.agent.get_value(),
                    'advantage_network': self.agent.gae.get_value()
                }
                savemat(
                    'data_' +
                    datetime.datetime.now().strftime("%y-%m-%d-%H-%M") +
                    '.mat', dict(data=all_logs, args=self.args))
                savemat(
                    'weights_' +
                    datetime.datetime.now().strftime("%y-%m-%d-%H-%M") +
                    '.mat', dict(policy_weights=nn_weights, args=self.args))
                break
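Since savemat here comes from scipy.io, the two .mat files written at the end of training can be inspected with scipy.io.loadmat. A minimal sketch (the glob pattern simply picks up whatever file names the run produced):

import glob

from scipy.io import loadmat

# Load the most recently written weights file from the current directory.
weights_file = sorted(glob.glob('weights_*.mat'))[-1]
weights = loadmat(weights_file)
print(weights.keys())   # includes 'policy_weights' and 'args'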
Example #3
import datetime

import gym
import numpy as np
from gym import spaces
from scipy.io import savemat


class LEARNER():
    def __init__(self, args, sess, simulator):
        self.args = args
        self.sess = sess
        self.simulator = simulator

        #Construct simulation environment
        self.simulator = gym.make('Pendulum-v0')
        self.simulator.unwrapped.max_torque = 15.
        self.simulator.unwrapped.max_speed = 60.
        self.simulator.unwrapped.action_space = spaces.Box(
            low=-self.simulator.unwrapped.max_torque,
            high=self.simulator.unwrapped.max_torque,
            shape=(1,))
        high = np.array([1., 1., self.simulator.unwrapped.max_speed])
        self.simulator.unwrapped.observation_space = spaces.Box(low=-high,
                                                                 high=high)

        
        #Define learning agent (TRPO)
        self.agent = TRPO(self.args, self.simulator, self.sess)

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0
        all_logs = list()
        while True:
            #Train the TRPO agent
            train_index += 1
            train_log = self.agent.train()
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]

            all_logs.append(train_log)
            
            #Simulate system w/ new parameters
            if train_index % 5 == 0:
                self.agent.sim()
                print(train_index)

            if total_steps > self.args.total_train_step:
                savemat(
                    'data4_' +
                    datetime.datetime.now().strftime("%y-%m-%d-%H-%M") +
                    '.mat', dict(data=all_logs, args=self.args))
                break
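A quick, purely illustrative check that the widened Pendulum limits take effect (the stock Pendulum-v0 uses max_torque = 2.0 and max_speed = 8.0; the classic 4-tuple gym step API used by these snippets is assumed):

sim = gym.make('Pendulum-v0')
sim.unwrapped.max_torque = 15.
sim.unwrapped.max_speed = 60.
print(sim.unwrapped.max_torque, sim.unwrapped.max_speed)   # 15.0 60.0

sim.reset()
obs, reward, done, info = sim.step([15.])   # no longer clipped to the stock 2.0 torque limit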
Example #4
env_args['fixed_length'] = True
env_args['trajectory_length'] = 5

if simulator_type == 'single-path':
    simulator = SinglePathSimulator(env_name,
                                    policy,
                                    n_trajectories,
                                    max_timesteps,
                                    state_filter=state_filter,
                                    **env_args)
elif simulator_type == 'vine':
    raise NotImplementedError

try:
    trpo_args = config['trpo_args']
except KeyError:
    trpo_args = {}

trpo = TRPO(policy,
            value_fun,
            simulator,
            model_name=model_name,
            continue_from_file=continue_from_file,
            **trpo_args)

print(f'Training policy {model_name} on {env_name} environment...\n')

trpo.train(config['n_episodes'])

print('\nTraining complete.\n')
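The fragment above pulls most of its settings from a config mapping plus a handful of loose variables defined earlier in the script. A purely illustrative sketch of what those might look like; only the key and variable names are taken from the snippet, every value is a placeholder:

# Hypothetical settings that would be defined before the fragment runs.
config = {
    'n_episodes': 1000,
    'trpo_args': {'max_kl_div': 0.01, 'discount': 0.995},  # forwarded as **trpo_args (placeholder keys)
}
env_name = 'Pendulum-v0'
simulator_type = 'single-path'    # 'vine' is declared above but not implemented
model_name = 'pendulum-trpo'
continue_from_file = False
n_trajectories = 32
max_timesteps = 200
state_filter = None
env_args = {}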
Example #5

#     env=env,
#     learning_rate=0.02,
#     gamma=0.995,
#     output_graph=False,
#     seed=1,
#     ep_max=3000,
#     ep_steps_max=8000,
#     hidden_sizes=(30,)
# )

RL = TRPO(env=env,
          lr_pi=0.01,
          lr_v=0.01,
          gamma=0.99,
          lam=0.97,
          delta=0.01,
          output_graph=False,
          seed=1,
          ep_max=100,
          ep_steps_max=4000,
          hidden_sizes=(64, 64),
          train_v_iters=80,
          damping_coeff=0.1,
          cg_iters=10,
          backtrack_iters=10,
          backtrack_coeff=0.8,
          algo='npg')

# RL.train(env, render_threshold_reward=-500, render=False)
RL.train(env, render_threshold_reward=-1000, render=False)
Example #6
import os
import time

import gym
import numpy as np
import tensorflow as tf


class LEARNER():
    def __init__(self, args, sess):
        self.args = args
        self.sess = sess

        self.env = gym.make(self.args.env_name)
        self.args.max_path_length = self.env.spec.timestep_limit
        self.agent = TRPO(self.args, self.env, self.sess)
        self.saver = tf.train.Saver()

    def learn(self):
        train_index = 0
        total_episode = 0
        total_steps = 0
        while True:
            train_index += 1
            start_time = time.time()
            train_log = self.agent.train()
            total_steps += train_log["Total Step"]
            total_episode += train_log["Num episode"]
            self.write_logs(train_index, total_episode, total_steps,
                            start_time, train_log)
            if np.mod(train_index, self.args.save_interval) == 0:
                self.save(train_index)

            if total_steps > self.args.total_train_step:
                break

    def write_logs(self, train_index, total_episode, total_steps, start_time,
                   log_info):
        log_path = os.path.join(self.args.log_dir, self.model_dir + '.csv')
        if not os.path.exists(log_path):
            log_file = open(log_path, 'w')
            log_file.write("Train step\t," + "Surrogate\t," +
                           "KL divergence\t," + "Number of steps trained\t," +
                           "Number of episodes trained\t," +
                           "Episode.Avg.reward\t," + "Elapsed time\n")
        else:
            log_file = open(log_path, 'a')
        print(
            "Train step %d => Surrogate loss : %3.3f, KL div : %3.8f, Number of Episode/steps trained : %d/%d, Episode.Avg.reward : %3.3f, Time : %3.3f"
            % (train_index, log_info["Surrogate loss"], log_info["KL_DIV"],
               total_episode, total_steps, log_info["Episode Avg.reward"],
               time.time() - start_time))
        log_file.write(
            str(train_index) + '\t,' + str(log_info["Surrogate loss"]) +
            '\t,' + str(log_info["KL_DIV"]) + '\t,' + str(total_steps) +
            '\t,' + str(total_episode) + '\t,' +
            str(log_info["Episode Avg.reward"]) + '\t,' +
            str(time.time() - start_time) + '\n')
        log_file.flush()
        log_file.close()

    def save(self, steps):
        model_name = 'TRPO_GAE'
        checkpoint_dir = os.path.join(self.args.checkpoint_dir, self.model_dir)
        if not os.path.exists(checkpoint_dir):
            os.mkdir(checkpoint_dir)
        self.saver.save(self.sess,
                        os.path.join(checkpoint_dir, model_name),
                        global_step=steps)
        print('Checkpoint saved at %d train step' % steps)

    @property
    def model_dir(self):
        return '{}_{}lambda'.format(self.args.env_name, self.args.lamda)
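For reference, the attributes this LEARNER reads from args are env_name, save_interval, total_train_step, log_dir, checkpoint_dir, and lamda (max_path_length it sets itself). A hypothetical argparse setup covering those fields; the default values are placeholders, not taken from the original project:

import argparse

# Hypothetical argument definitions; only the attribute names come from the
# LEARNER code above, every default is a placeholder.
parser = argparse.ArgumentParser()
parser.add_argument('--env_name', default='Pendulum-v0')
parser.add_argument('--total_train_step', type=int, default=1000000)
parser.add_argument('--save_interval', type=int, default=10)
parser.add_argument('--log_dir', default='./logs')
parser.add_argument('--checkpoint_dir', default='./checkpoints')
parser.add_argument('--lamda', type=float, default=0.97)   # GAE lambda (spelling follows the code)
args = parser.parse_args()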