Example #1
def train_AC(
        exp_name,
        env_name,
        n_iter, 
        gamma, 
        min_timesteps_per_batch, 
        max_path_length,
        learning_rate,
        num_target_updates,
        num_grad_steps_per_target_update,
        animate, 
        logdir, 
        normalize_advantages,
        seed,
        n_layers,
        size):

    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    env = gym.make(env_name)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
        'num_target_updates': num_target_updates,
        'num_grad_steps_per_target_update': num_grad_steps_per_target_update,
        }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_advantage_args = {
        'gamma': gamma,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_advantage_args)

    # build computation graph
    agent.build_computation_graph()

    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = np.concatenate([path["reward"] for path in paths])
        next_ob_no = np.concatenate([path["next_observation"] for path in paths])
        terminal_n = np.concatenate([path["terminal"] for path in paths])

        # Call tensorflow operations to:
        # (1) update the critic, by calling agent.update_critic
        # (2) use the updated critic to compute the advantage, by calling agent.estimate_advantage
        # (3) use the estimated advantage values to update the actor, by calling agent.update_actor
        # (The argument lists below are assumed from the batch arrays built above.)
        agent.update_critic(ob_no, next_ob_no, re_n, terminal_n)
        adv_n = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        agent.update_actor(ob_no, ac_na, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #2
    def log_progress(self):
        episode_rewards = get_wrapper_by_name(self.env,
                                              "Monitor").get_episode_rewards()

        if len(episode_rewards) > 0:
            self.mean_episode_reward = np.mean(episode_rewards[-100:])
            self.std_episode_reward = np.std(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            self.best_mean_episode_reward = \
                max(self.best_mean_episode_reward, self.mean_episode_reward)

        # See the `log.txt` file for where these statistics are stored.
        if self.t % self.log_every_n_steps == 0:
            lr = self.optimizer_spec.lr_schedule.value(self.t)
            hours = (time.time() - self.start_time) / (60. * 60.)
            logz.log_tabular("Steps", self.t)
            logz.log_tabular("Avg_Last_100_Episodes", self.mean_episode_reward)
            logz.log_tabular("Std_Last_100_Episodes", self.std_episode_reward)
            logz.log_tabular("Best_Avg_100_Episodes",
                             self.best_mean_episode_reward)
            logz.log_tabular("Num_Episodes", len(episode_rewards))
            logz.log_tabular("Exploration_Epsilon",
                             self.exploration.value(self.t))
            logz.log_tabular("Adam_Learning_Rate", lr)
            logz.log_tabular("Elapsed_Time_Hours", hours)
            logz.dump_tabular()
Example #3
def train_MAPG(
        exp_name='',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        learning_rate=5e-3,
        logdir=None,
        normalize_advantages=True,
        seed=101,
        # network arguments
        n_layers=1,
        size=32):
    #========================================================================================#
    # Logfile setup
    #========================================================================================#
    start = time.time()
    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_MAPG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)
    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    #========================================================================================#
    # Env setup
    #========================================================================================#
    nAgent = 2  # hard coded!
    env1 = Simulator(seed=101,
                     N_agent=nAgent,
                     N_prod=3,
                     Tstamp=10,
                     costQ=np.array([[0.3, 0.3, 0.3]]),
                     costInv=np.array([[0.2, 0.2, 0.2]]),
                     costLastInv=np.array([[2, 2, 2]]),
                     costBack=np.array([[0.75, 0.75, 0.75]]))

    env2 = Simulator(seed=202,
                     N_agent=nAgent,
                     N_prod=3,
                     Tstamp=10,
                     costQ=np.array([[0.3, 0.3, 0.3]]),
                     costInv=np.array([[0.2, 0.2, 0.2]]),
                     costLastInv=np.array([[2, 2, 2]]),
                     costBack=np.array([[0.75, 0.75, 0.75]]))
    # Observation and action sizes
    ob_dim = env1.obs_dim()
    ac_dim = env1.act_dim()

    print('observation dimension is: ', ob_dim)
    print('action dimension is: ', ac_dim)
    print('critic network input dimension is:',
          ob_dim[0] + ac_dim[0] * ac_dim[1] * nAgent)

    #========================================================================================#
    # PG Networks
    #========================================================================================#

    def PGNet(sy_ob_no, sy_ac_na, sy_adv_n, agent_id):

        sy_mean = build_mlp(input_placeholder=sy_ob_no,
                            output_size=ac_dim[0] * ac_dim[1],
                            scope=str(seed) + 'MA_' + str(agent_id),
                            n_layers=n_layers,
                            output_activation=tf.sigmoid,
                            size=size,
                            scale=10.)

        sy_logstd = tf.Variable(tf.truncated_normal(
            shape=[1, ac_dim[0] * ac_dim[1]], stddev=0.1),
                                name='var_std' + str(agent_id))
        sy_sampled_ac = sy_mean + tf.multiply(
            tf.random_normal(shape=tf.shape(sy_mean)), tf.exp(sy_logstd))
        MVN_dist = tf.contrib.distributions.MultivariateNormalDiag(
            sy_mean, tf.exp(sy_logstd))
        sy_logprob_n = MVN_dist.log_prob(sy_ac_na)

        # Loss function for PG network
        loss = -tf.reduce_mean(
            tf.multiply(sy_logprob_n, sy_adv_n)
        )  # Loss function that we'll differentiate to get the policy gradient.
        update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

        return sy_sampled_ac, loss, update_op

    #========================================================================================#
    # Critic network
    #========================================================================================#

    def CriticNet(sy_ob_critic, baseline_target, agent_id):
        baseline_prediction = tf.squeeze(
            build_mlp(sy_ob_critic,
                      output_size=1,
                      scope=str(seed) + "critic_" + str(agent_id),
                      n_layers=n_layers,
                      size=size))

        baseline_loss = tf.nn.l2_loss(baseline_target - baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            baseline_loss)
        return baseline_prediction, baseline_loss, baseline_update_op

    #========================================================================================#
    # Add networks in a loop
    #========================================================================================#

    sy_ob_no_1 = tf.placeholder(shape=[None, ob_dim[0]],
                                name='ob' + str(1),
                                dtype=tf.float32)
    sy_ac_na_1 = tf.placeholder(shape=[None, ac_dim[0] * ac_dim[1]],
                                name='ac' + str(1),
                                dtype=tf.float32)
    sy_adv_n_1 = tf.placeholder(shape=[None],
                                name='adv' + str(1),
                                dtype=tf.float32)
    sy_ob_critic_1 = tf.placeholder(
        shape=[None, ob_dim[0] + ac_dim[0] * ac_dim[1] * nAgent],
        name='critic_ob' + str(1),
        dtype=tf.float32)
    baseline_target_1 = tf.placeholder(shape=[None],
                                       name='baseline_target_qn' + str(1),
                                       dtype=tf.float32)

    sy_sampled_ac_1, loss_1, update_op_1 = PGNet(sy_ob_no_1, sy_ac_na_1,
                                                 sy_adv_n_1, 1)
    baseline_prediction_1, baseline_loss_1, baseline_update_op_1 = CriticNet(
        sy_ob_critic_1, baseline_target_1, 1)

    sy_ob_no_2 = tf.placeholder(shape=[None, ob_dim[0]],
                                name='ob' + str(2),
                                dtype=tf.float32)
    sy_ac_na_2 = tf.placeholder(shape=[None, ac_dim[0] * ac_dim[1]],
                                name='ac' + str(2),
                                dtype=tf.float32)
    sy_adv_n_2 = tf.placeholder(shape=[None],
                                name='adv' + str(2),
                                dtype=tf.float32)
    sy_ob_critic_2 = tf.placeholder(
        shape=[None, ob_dim[0] + ac_dim[0] * ac_dim[1] * nAgent],
        name='critic_ob' + str(2),
        dtype=tf.float32)
    baseline_target_2 = tf.placeholder(shape=[None],
                                       name='baseline_target_qn' + str(2),
                                       dtype=tf.float32)

    sy_sampled_ac_2, loss_2, update_op_2 = PGNet(sy_ob_no_2, sy_ac_na_2,
                                                 sy_adv_n_2, 2)
    baseline_prediction_2, baseline_loss_2, baseline_update_op_2 = CriticNet(
        sy_ob_critic_2, baseline_target_2, 2)

    # exec("sy_sampled_ac_%s, loss_%s, update_op_%s = PGNet(sy_ob_no_%s, sy_ac_na_%s, sy_adv_n_%s, agent)"%(agent, agent, agent, agent, agent, agent))
    # exec("baseline_prediction_%s, baseline_loss_%s, baseline_update_op_%s = CriticNet(sy_ob_critic_%s, baseline_target_%s, agent)"%(agent, agent, agent, agent, agent))
    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#
    num_gpu = 0
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               device_count={'GPU': num_gpu})
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0
    total_numpaths = 0
    demand_cov = np.array([[0.1, -0.5 * 0.3, -0.5 * 0.3],
                           [-0.5 * 0.3, 0.1, 0.5 * 0.3],
                           [-0.5 * 0.3, 0.5 * 0.3, 0.1]])
    for itr in range(n_iter):
        #========================#
        # Sampling
        #========================#
        randk1 = 0 + itr * seed
        randk2 = 12306 + itr * seed
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        num_path = 0
        paths1 = []
        paths2 = []

        while True:
            steps = 0
            last = False

            ob1 = env1.randomInitialStateGenerator()
            obs1, acs1, rewards1, criticObs1 = [], [], [], []

            ob2 = env2.randomInitialStateGenerator()
            obs2, acs2, rewards2, criticObs2 = [], [], [], []

            while steps < env1.Tstamp:
                if steps == env1.Tstamp - 1:
                    last = True

                obs1.append(ob1.flatten())
                obs2.append(ob2.flatten())

                ac1 = sess.run(sy_sampled_ac_1, feed_dict={sy_ob_no_1: ob1})
                ac2 = sess.run(sy_sampled_ac_2, feed_dict={sy_ob_no_2: ob2})
                acs1.append(ac1.flatten())
                acs2.append(ac2.flatten())

                criticObs1.append(
                    np.append(np.append(ob1.flatten(), ac1.flatten()),
                              ac2.flatten()).flatten())
                criticObs2.append(
                    np.append(np.append(ob2.flatten(), ac2.flatten()),
                              ac1.flatten()).flatten())

                actList = [ac1.reshape(-1, 2), ac2.reshape(-1, 2)]

                demand = env1.demandGenerator_p(
                    actList,
                    M=np.array([10, 10, 10]).reshape(-1, 1),
                    V=np.array([5, 5, 5]).reshape(-1, 1),
                    sens=np.array([1.5, 1.5, 1.5]).reshape(-1, 1),
                    cov=demand_cov,
                    seed=randk1)
                demand1 = demand[:, 0]
                demand2 = demand[:, 1]

                # demand2 = env2.demandGenerator_p(actList,
                #                                  M = np.array([3, 3, 3]).reshape(-1,1),
                #                                  V = np.array([5,5,5]).reshape(-1,1),
                #                                  sens = np.array([1, 1, 1]).reshape(-1,1),
                #                                  cov = np.diag(np.array([0.25, 0.25, 0.25])),
                #                                  seed = randk2)

                ob1, rew1 = env1.step(actList[0], ob1.flatten(), demand1, last)
                ob2, rew2 = env2.step(actList[1], ob2.flatten(), demand2, last)

                randk1 += 1
                randk2 += 1

                rewards1.append(rew1)
                rewards2.append(rew2)
                steps += 1

            path1 = {
                "observation": np.array(obs1),
                "reward": np.array(rewards1),
                "action": np.array(acs1),
                "criticObservation": np.array(criticObs1)
            }

            path2 = {
                "observation": np.array(obs2),
                "reward": np.array(rewards2),
                "action": np.array(acs2),
                "criticObservation": np.array(criticObs2)
            }

            paths1.append(path1)
            paths2.append(path2)
            num_path += 1
            timesteps_this_batch += pathlength(path1)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_numpaths += num_path
        total_timesteps += timesteps_this_batch
        if last and itr == n_iter - 1:
            pickle.dump(path1,
                        open(logdir + '/trained_path1_sample.pkl', 'wb'),
                        protocol=2)
            pickle.dump(path2,
                        open(logdir + '/trained_path2_sample.pkl', 'wb'),
                        protocol=2)

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no1 = np.concatenate([path["observation"] for path in paths1])
        ac_na1 = np.concatenate([path["action"] for path in paths1])
        critic_ob_no1 = np.concatenate(
            [path["criticObservation"] for path in paths1])

        ob_no2 = np.concatenate([path["observation"] for path in paths2])
        ac_na2 = np.concatenate([path["action"] for path in paths2])
        critic_ob_no2 = np.concatenate(
            [path["criticObservation"] for path in paths2])
        # print(ob_no.shape)
        # print(ac_na.shape)
        # print(path['reward'].shape)

        #========================#
        # Compute Q value
        #========================#
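        # np.npv(rate, r[i:]) computes sum_k r[i + k] / (1 + rate)**k, so with
        # rate = 1/gamma - 1 this is the discounted reward-to-go sum_k gamma**k * r[i + k].
        # Note: np.npv was removed in NumPy 1.20 and now lives in numpy_financial.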
        q_n1 = np.concatenate([[
            np.npv((1 / gamma - 1), path["reward"][i:])
            for i in range(len(path["reward"]))
        ] for path in paths1])
        q_n2 = np.concatenate([[
            np.npv((1 / gamma - 1), path["reward"][i:])
            for i in range(len(path["reward"]))
        ] for path in paths2])

        #========================#
        # Compute Baselines
        #========================#

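        # Note: q_n1/q_n2 are NumPy arrays while baseline_prediction_1/2 are TF tensors,
        # so each `q_n - b_n` below embeds the batch as a constant and adds new ops to
        # the graph every iteration (the graph grows with n_iter).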
        q_n_mean1 = q_n1.mean()
        q_n_std1 = q_n1.std()
        q_n1 = (q_n1 - q_n_mean1) / q_n_std1
        b_n1 = baseline_prediction_1
        adv_n_baseline1 = q_n1 - b_n1

        q_n_mean2 = q_n2.mean()
        q_n_std2 = q_n2.std()
        q_n2 = (q_n2 - q_n_mean2) / q_n_std2
        b_n2 = baseline_prediction_2
        adv_n_baseline2 = q_n2 - b_n2

        # if bootstrap:
        #     last_critic_ob_no1 = np.concatenate([path["criticObservation"] for path in paths1])
        #     lastFit1 = sess.run(baseline_prediction_1,
        #                         feed_dict = {sy_ob_critic_1: critic_ob_no1[]})

        #====================================#
        # Optimizing Neural Network Baseline
        #====================================#
        _, adv_n1 = sess.run([baseline_update_op_1, adv_n_baseline1],
                             feed_dict={
                                 baseline_target_1: q_n1,
                                 sy_ob_critic_1: critic_ob_no1
                             })
        adv_n1 = adv_n1 * q_n_std1 + q_n_mean1

        _, adv_n2 = sess.run([baseline_update_op_2, adv_n_baseline2],
                             feed_dict={
                                 baseline_target_2: q_n2,
                                 sy_ob_critic_2: critic_ob_no2
                             })
        adv_n2 = adv_n2 * q_n_std2 + q_n_mean2

        #====================================================================================#
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            adv_n1 = (adv_n1 - adv_n1.mean()) / adv_n1.std()
            adv_n2 = (adv_n2 - adv_n2.mean()) / adv_n2.std()

        #====================================================================================#
        # Performing the Policy Update
        #====================================================================================#
        _, train_loss1 = sess.run([update_op_1, loss_1],
                                  feed_dict={
                                      sy_adv_n_1: adv_n1,
                                      sy_ac_na_1: ac_na1,
                                      sy_ob_no_1: ob_no1
                                  })
        _, train_loss2 = sess.run([update_op_2, loss_2],
                                  feed_dict={
                                      sy_adv_n_2: adv_n2,
                                      sy_ac_na_2: ac_na2,
                                      sy_ob_no_2: ob_no2
                                  })
        print("PG Network 1 training loss: %.5f" % train_loss1)
        print("PG Network 2 training loss: %.5f" % train_loss2)

        # Log diagnostics
        returns1 = np.array([path["reward"].sum() for path in paths1])
        returns2 = np.array([path["reward"].sum() for path in paths2])
        totalReturn = returns1 + returns2

        ep_lengths = [pathlength(path) for path in paths1]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)

        logz.log_tabular("AverageReturn1", np.mean(returns1))
        logz.log_tabular("StdReturn1", np.std(returns1))
        logz.log_tabular("MaxReturn1", np.max(returns1))
        logz.log_tabular("MinReturn1", np.min(returns1))
        logz.log_tabular("AverageReturn2", np.mean(returns2))
        logz.log_tabular("StdReturn2", np.std(returns2))
        logz.log_tabular("MaxReturn2", np.max(returns2))
        logz.log_tabular("MinReturn2", np.min(returns2))

        logz.log_tabular("AverageTotalReturn", np.mean(totalReturn))
        logz.log_tabular("StdReturn", np.std(totalReturn))
        logz.log_tabular("MaxReturn", np.max(totalReturn))
        logz.log_tabular("MinReturn", np.min(totalReturn))

        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("NumPathsThisBatch", num_path)
        logz.log_tabular("NumPathsSoFar", total_numpaths)
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #4
def train_PG(
        exp_name,
        env_name,
        n_iter, 
        gamma, 
        min_timesteps_per_batch, 
        max_path_length,
        learning_rate, 
        reward_to_go, 
        animate, 
        logdir, 
        normalize_advantages,
        nn_baseline, 
        seed,
        n_layers,
        size):

    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the environment
    env = get_random_env()

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Maximum length for episodes
    max_path_length = 24

    # Is this env continuous, or discrete?
    discrete = True

    # Observation and action sizes
    ob_dim = env.get_obs_shape()
    ac_dim = 1

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
        }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_return_args = {
        'gamma': gamma,
        'reward_to_go': reward_to_go,
        'nn_baseline': nn_baseline,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args)

    # build computation graph
    agent.build_computation_graph()

    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = [path["reward"] for path in paths]

        q_n, adv_n = agent.estimate_return(ob_no, re_n)
        agent.update_parameters(ob_no, ac_na, q_n, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #5
def train_PG(
    exp_name,
    env_name,
    n_iter,
    gamma,
    min_timesteps_per_batch,
    mini_batch_size,
    max_path_length,
    learning_rate,
    num_ppo_updates,
    num_value_iters,
    animate,
    logdir,
    normalize_advantages,
    nn_critic,
    seed,
    n_layers,
    size,
    gru_size,
    history,
    num_tasks,
    l2reg,
    recurrent,
):

    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    envs = {
        'pm': PointEnv,
        'pm-obs': ObservedPointEnv,
    }
    env = envs[env_name](num_tasks)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    task_dim = len(env._goal)  # rude, sorry

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'task_dim': task_dim,
        'size': size,
        'gru_size': gru_size,
        'learning_rate': learning_rate,
        'history': history,
        'num_value_iters': num_value_iters,
        'l2reg': l2reg,
        'recurrent': recurrent,
    }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_return_args = {
        'gamma': gamma,
        'nn_critic': nn_critic,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args,
                  estimate_return_args)

    # build computation graph
    agent.build_computation_graph()

    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    #========================================================================================#
    # Training Loop
    #========================================================================================#
    def unpack_sample(data):
        '''
        unpack a sample from the replay buffer
        '''
        ob = data["observations"]
        ac = data["actions"]
        re = data["rewards"]
        hi = data["hiddens"]
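        # masks are 1 for non-terminal steps and 0 where the episode ended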
        ma = 1 - data["terminals"]
        return ob, ac, re, hi, ma

    # construct PPO replay buffer, perhaps rude to do outside the agent
    ppo_buffer = PPOReplayBuffer(agent.replay_buffer)

    total_timesteps = 0
    for itr in range(n_iter):
        # for PPO: flush the replay buffer!
        ppo_buffer.flush()

        # sample trajectories to fill agent's replay buffer
        print("********** Iteration %i ************" % itr)
        stats = []
        for _ in range(num_tasks):
            s, timesteps_this_batch = agent.sample_trajectories(
                itr, env, min_timesteps_per_batch)
            total_timesteps += timesteps_this_batch
            stats += s

        # compute the log probs, advantages, and returns for all data in agent's buffer
        # store in ppo buffer for use in multiple ppo updates
        # TODO: should move inside the agent probably
        data = agent.replay_buffer.all_batch()
        ob_no, ac_na, re_n, hidden, masks = unpack_sample(data)
        fixed_log_probs = agent.sess.run(agent.sy_lp_n,
                                         feed_dict={
                                             agent.sy_ob_no: ob_no,
                                             agent.sy_hidden: hidden,
                                             agent.sy_ac_na: ac_na
                                         })
        q_n, adv_n = agent.estimate_return(ob_no, re_n, hidden, masks)

        ppo_buffer.add_samples(fixed_log_probs, adv_n, q_n)

        # update with mini-batches sampled from ppo buffer
        for _ in range(num_ppo_updates):

            data = ppo_buffer.random_batch(mini_batch_size)

            ob_no, ac_na, re_n, hidden, masks = unpack_sample(data)
            fixed_log_probs = data["log_probs"]
            adv_n = data["advantages"]
            q_n = data["returns"]

            log_probs = agent.sess.run(agent.sy_lp_n,
                                       feed_dict={
                                           agent.sy_ob_no: ob_no,
                                           agent.sy_hidden: hidden,
                                           agent.sy_ac_na: ac_na
                                       })

            agent.update_parameters(ob_no, hidden, ac_na, fixed_log_probs, q_n,
                                    adv_n)

        # compute validation statistics
        print('Validating...')
        val_stats = []
        for _ in range(num_tasks):
            vs, timesteps_this_batch = agent.sample_trajectories(
                itr, env, min_timesteps_per_batch // 10, is_evaluation=True)
            val_stats += vs

        # save trajectories for viz
        with open("output/{}-epoch{}.pkl".format(exp_name, itr), 'wb') as f:
            pickle.dump(agent.val_replay_buffer.all_batch(), f,
                        pickle.HIGHEST_PROTOCOL)
        agent.val_replay_buffer.flush()

        # Log TRAIN diagnostics
        returns = [sum(s["rewards"]) for s in stats]
        final_rewards = [s["rewards"][-1] for s in stats]
        ep_lengths = [s['ep_len'] for s in stats]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("FinalReward", np.mean(final_rewards))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)

        # Log VAL diagnostics
        val_returns = [sum(s["rewards"]) for s in val_stats]
        val_final_rewards = [s["rewards"][-1] for s in val_stats]
        logz.log_tabular("ValAverageReturn", np.mean(val_returns))
        logz.log_tabular("ValFinalReward", np.mean(val_final_rewards))

        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #6
def train_PG(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch,
             max_path_length, learning_rate, reward_to_go, animate, logdir,
             normalize_advantages, nn_baseline, seed, n_layers, size):

    start = time.time()

    # ======================================================================================= #
    # Set Up Logger
    # ======================================================================================= #
    setup_logger(logdir, locals())

    # ======================================================================================= #
    # Set Up Env
    # ======================================================================================= #

    # Make the gym environment
    env = gym.make(env_name)

    # Set random seeds
    tf.random.set_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # ======================================================================================== #
    # Initialize Agent
    # ======================================================================================== #
    neural_net_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
    }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_return_args = {
        'gamma': gamma,
        'reward_to_go': reward_to_go,
        'nn_baseline': nn_baseline,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(neural_net_args, sample_trajectory_args,
                  estimate_return_args)

    # ========================================================================================#
    # Training Loop
    # ========================================================================================#

    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr,
                                                                env)  # Fixed
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = [path["reward"] for path in paths]

        # ob_no = (sum_length_paths, ob_dim)
        # ac_na = (sum_length_paths, ac_dim)
        # re_n = (num_paths,) where re_n[i] = (path_len_i, 1)

        q_n, adv_n = agent.estimate_return(ob_no, re_n)  # Fixed
        agent.update_parameters(ob_no, ac_na, q_n, adv_n)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #7
    def on_epoch_end(self, epoch, logs={}):
        # Save training and validation losses
        logz.log_tabular('train_loss', logs.get('loss'))
        logz.log_tabular('val_loss', logs.get('val_loss'))
        logz.dump_tabular()
Example #8
File: ars.py  Project: zhan0903/ARS
    def train(self, num_iter):

        start = time.time()
        for i in range(num_iter):
            
            t1 = time.time()
            self.train_step()
            t2 = time.time()
            print('total time of one step', t2 - t1)           
            print('iter ', i,' done')

            # record statistics every 10 iterations
            if ((i + 1) % 10 == 0):
                
                rewards = self.aggregate_rollouts(num_rollouts = 100, evaluate = True)
                w = ray.get(self.workers[0].get_weights_plus_stats.remote())
                np.savez(self.logdir + "/lin_policy_plus", w)
                
                print(sorted(self.params.items()))
                logz.log_tabular("Time", time.time() - start)
                logz.log_tabular("Iteration", i + 1)
                logz.log_tabular("AverageReward", np.mean(rewards))
                logz.log_tabular("StdRewards", np.std(rewards))
                logz.log_tabular("MaxRewardRollout", np.max(rewards))
                logz.log_tabular("MinRewardRollout", np.min(rewards))
                logz.log_tabular("timesteps", self.timesteps)
                logz.dump_tabular()
                
            t1 = time.time()
            # get statistics from all workers
            for j in range(self.num_workers):
                self.policy.observation_filter.update(ray.get(self.workers[j].get_filter.remote()))
            self.policy.observation_filter.stats_increment()

            # make sure master filter buffer is clear
            self.policy.observation_filter.clear_buffer()
            # sync all workers
            filter_id = ray.put(self.policy.observation_filter)
            setting_filters_ids = [worker.sync_filter.remote(filter_id) for worker in self.workers]
            # waiting for sync of all workers
            ray.get(setting_filters_ids)
         
            increment_filters_ids = [worker.stats_increment.remote() for worker in self.workers]
            # waiting for increment of all workers
            ray.get(increment_filters_ids)            
            t2 = time.time()
            print('Time to sync statistics:', t2 - t1)
                        
        return 
Example #9
    def train(self, train_db, val_db, test_db):
        ##################################################################
        ## LOG
        ##################################################################
        logz.configure_output_dir(self.cfg.model_dir)
        logz.save_config(self.cfg)

        ##################################################################
        ## Main loop
        ##################################################################
        start = time()
        min_val_loss = 1000.0
        max_val_recall = -1.0
        train_loaddb = region_loader(train_db)
        val_loaddb = region_loader(val_db)
        #TODO
        train_loader = DataLoader(train_loaddb,
                                  batch_size=self.cfg.batch_size,
                                  shuffle=True,
                                  num_workers=self.cfg.num_workers,
                                  collate_fn=region_collate_fn)
        val_loader = DataLoader(val_loaddb,
                                batch_size=self.cfg.batch_size,
                                shuffle=False,
                                num_workers=self.cfg.num_workers,
                                collate_fn=region_collate_fn)

        for epoch in range(self.epoch, self.cfg.n_epochs):
            ##################################################################
            ## Training
            ##################################################################
            if self.cfg.coco_mode >= 0:
                self.cfg.coco_mode = np.random.randint(0, self.cfg.max_turns)
            torch.cuda.empty_cache()
            train_losses = self.train_epoch(train_loaddb, train_loader, epoch)

            ##################################################################
            ## Validation
            ##################################################################
            if self.cfg.coco_mode >= 0:
                self.cfg.coco_mode = 0
            torch.cuda.empty_cache()
            val_losses, val_metrics, caches_results = self.validate_epoch(
                val_loaddb, val_loader, epoch)

            #################################################################
            # Logging
            #################################################################

            # update optim scheduler
            current_val_loss = np.mean(val_losses)
            self.optimizer.update(current_val_loss, epoch)
            logz.log_tabular("Time", time() - start)
            logz.log_tabular("Iteration", epoch)
            logz.log_tabular("TrainAverageLoss", np.mean(train_losses))
            logz.log_tabular("ValAverageLoss", current_val_loss)

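            # Average the five retrieval metrics (R@1, R@5, R@10, median rank,
            # mean rank) over the validation entries before logging.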
            mmm = np.zeros((5, ), dtype=np.float64)
            for k, v in val_metrics.items():
                mmm = mmm + np.array(v)
            mmm /= len(val_metrics)
            logz.log_tabular("t2i_R1", mmm[0])
            logz.log_tabular("t2i_R5", mmm[1])
            logz.log_tabular("t2i_R10", mmm[2])
            logz.log_tabular("t2i_medr", mmm[3])
            logz.log_tabular("t2i_meanr", mmm[4])
            logz.dump_tabular()
            current_val_recall = np.mean(mmm[:3])

            ##################################################################
            ## Checkpoint
            ##################################################################
            if self.cfg.rl_finetune == 0 and self.cfg.coco_mode < 0:
                if min_val_loss > current_val_loss:
                    min_val_loss = current_val_loss
                    self.save_checkpoint(epoch)
                    with open(
                            osp.join(self.cfg.model_dir,
                                     'val_metrics_%d.json' % epoch),
                            'w') as fp:
                        json.dump(val_metrics, fp, indent=4, sort_keys=True)
                    with open(
                            osp.join(self.cfg.model_dir,
                                     'val_top5_inds_%d.pkl' % epoch),
                            'wb') as fid:
                        pickle.dump(caches_results, fid,
                                    pickle.HIGHEST_PROTOCOL)
            else:
                if max_val_recall < current_val_recall:
                    max_val_recall = current_val_recall
                    self.save_checkpoint(epoch)
                    with open(
                            osp.join(self.cfg.model_dir,
                                     'val_metrics_%d.json' % epoch),
                            'w') as fp:
                        json.dump(val_metrics, fp, indent=4, sort_keys=True)
                    with open(
                            osp.join(self.cfg.model_dir,
                                     'val_top5_inds_%d.pkl' % epoch),
                            'wb') as fid:
                        pickle.dump(caches_results, fid,
                                    pickle.HIGHEST_PROTOCOL)
Example #10
def trainPG(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, max_path_length,\
            learning_rate, reward_to_go, animate, logdir, normalize_advantages, nn_baseline,\
            seed, n_layers, size):
    tic = time.time()
    setup_logger(logdir, locals())

    env = gym.make(env_name)
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    max_path_length = max_path_length or env.spec.max_episode_steps
    discrete = isinstance(env.action_space, gym.spaces.Discrete)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n if discrete else env.action_space.shape[0]

    ## Define Placeholders
    obs_ph = tf.placeholder(shape=[None, obs_dim],
                            dtype=tf.float32,
                            name='obs')
    if discrete:
        act_ph = tf.placeholder(shape=[None], dtype=tf.int32, name='act')
    else:
        act_ph = tf.placeholder(shape=[None, act_dim],
                                dtype=tf.float32,
                                name='act')
    adv_ph = tf.placeholder(shape=[None], dtype=tf.float32, name='adv')

    ## Build computation graph, define forward pass
    nn_out = build_mlp(input_ph=obs_ph,
                       output_size=act_dim,
                       scope='policy_model',
                       n_layers=n_layers,
                       size=size)
    if discrete:
        logits_ph = nn_out
        sampled_action_ph = tf.multinomial(logits=logits_ph, num_samples=1)[0]
        logprob_ph = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=act_ph, logits=logits_ph)
    else:
        mu_ph = nn_out
        logstd_ph = tf.get_variable('logstd', [act_dim], dtype=tf.float32)
        # Reparameterized sample: mu + sigma * eps, using the dynamic batch shape.
        sampled_action_ph = mu_ph + tf.exp(logstd_ph) * tf.random_normal(
            tf.shape(mu_ph))
        # Log-probability of the taken actions under the diagonal Gaussian policy.
        logprob_ph = tf.contrib.distributions.MultivariateNormalDiag(
            loc=mu_ph, scale_diag=tf.exp(logstd_ph)).log_prob(act_ph)

    ## Define Loss Function and Training Operation
    loss = tf.reduce_mean(-logprob_ph * adv_ph)
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    if nn_baseline:
        baseline_pred_ph = tf.squeeze(
            build_mlp(input_ph=obs_ph,
                      output_size=1,
                      scope='nn_baseline',
                      n_layers=n_layers,
                      size=size))
        baseline_target_ph = tf.placeholder(shape=[None],
                                            dtype=tf.float32,
                                            name='baseline')
        baseline_loss = tf.nn.l2_loss(baseline_pred_ph - baseline_target_ph)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            baseline_loss)

    ## Initialize Tensorflow Configs
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to "with self.sess:"
    tf.global_variables_initializer().run()

    ## Training Loop
    total_time_steps = 0
    for itr in range(n_iter):
        print("********* Iteration %i *********" % itr)
        ### Sample_Trajectories
        timesteps_this_batch = 0
        paths = []
        while True:
            #### Sample a trajectory
            observations, actions, rewards = [], [], []
            animate_this_episode = (animate and len(paths) == 0
                                    and itr % 10 == 0)
            s = env.reset()
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.1)
                a = sess.run(sampled_action_ph, feed_dict={obs_ph: s[None]})
                a = a[0]
                sp, r, done, _ = env.step(a)
                observations.append(s)
                actions.append(a)
                rewards.append(r)
                steps += 1
                if done or steps > max_path_length:
                    break
                s = sp
            #### End of Sample a trajectory
            path = {
                'observation':
                np.array(observations, dtype=np.float32),
                'action':
                np.array(actions, dtype=np.int32 if discrete else np.float32),
                'reward':
                np.array(rewards, dtype=np.float32)
            }
            paths.append(path)
            timesteps_this_batch += steps
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_time_steps += timesteps_this_batch

        ## Build arrays for observation, action for the policy gradient update by concatenating across paths
        obs = np.concatenate([path['observation'] for path in paths])
        act = np.concatenate([path['action'] for path in paths])
        rew = [path['reward'] for path in paths]

        ## Estimate Return
        ### Compute Q-values
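        # Discounted returns are accumulated backwards; with reward_to_go each step
        # keeps its own tail return, otherwise every step gets the full-trajectory return.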
        qvals = []
        for path_rewards in rew:
            q_path = []
            q = 0
            for r in reversed(path_rewards):
                q = r + gamma * q
                q_path.append(q)
            if reward_to_go:
                q_path.reverse()
            else:
                q_path = [q for _ in range(len(path_rewards))]
            qvals.extend(q_path)
        ### Compute Advantages
        if nn_baseline:
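            # The baseline network is trained on normalized targets, so rescale its
            # predictions back to the raw Q scale before computing advantages.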
            bl = sess.run(baseline_pred_ph, feed_dict={obs_ph: obs})
            bl = bl * np.std(qvals) + np.mean(qvals)
            adv = qvals - bl
            #### TODO: GAE implementation
        else:
            adv = qvals.copy()
        if normalize_advantages:
            adv = (adv - np.mean(adv)) / (np.std(adv) + 1e-8)

        ## Policy Network Parameters Update
        if nn_baseline:
            # Fit the baseline to the normalized Q-values, matching the rescaling
            # applied to its predictions above.
            sess.run([baseline_update_op],
                     feed_dict={
                         baseline_target_ph:
                         (qvals - np.mean(qvals)) / (np.std(qvals) + 1e-8),
                         obs_ph: obs
                     })
        _, loss_policy = sess.run([update_op, loss], \
                    feed_dict={obs_ph: obs, act_ph: act, adv_ph: adv})

        # Log diagnostics
        returns = [path['reward'].sum() for path in paths]
        ep_lengths = [len(path['reward']) for path in paths]
        logz.log_tabular('Time', time.time() - tic)
        logz.log_tabular('Iteration', itr)
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenSt", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_time_steps)
        logz.log_tabular("PolicyLoss", loss_policy)

        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #11
    def log_progress(self):
        episode_rewards = get_wrapper_by_name(self.env,
                                              "Monitor").get_episode_rewards()

        if len(episode_rewards) > 0:
            self.mean_episode_reward = np.mean(episode_rewards[-100:])

        if len(episode_rewards) > 100:
            self.best_mean_episode_reward = max(self.best_mean_episode_reward,
                                                self.mean_episode_reward)

        if self.t % self.log_every_n_steps == 0 and self.model_initialized:
            logz.log_tabular("TimeStep", self.t)
            logz.log_tabular("MeanReturn", self.mean_episode_reward)
            logz.log_tabular(
                "BestMeanReturn",
                max(self.best_mean_episode_reward, self.mean_episode_reward))
            logz.log_tabular("Episodes", len(episode_rewards))
            logz.log_tabular("Exploration", self.exploration.value(self.t))
            logz.log_tabular("LearningRate",
                             self.optimizer_spec.lr_lambda(self.t))
            logz.log_tabular("Time", (time.time() - self.start_time) / 60.)
            logz.dump_tabular()
            logz.save_pytorch_model(self.q_net)
Example #12
def learn(env,
          q_func,
          optimizer_spec,
          session,
          exploration=LinearSchedule(1000000, 0.1),
          stopping_criterion=None,
          replay_buffer_size=1000000,
          batch_size=32,
          gamma=0.99,
          learning_starts=50000,
          learning_freq=4,
          frame_history_len=4,
          target_update_freq=10000,
          grad_norm_clipping=10):

    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    # Log the progress during the training
    start = time.time()
    logdir = 'pacman_hra_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join('hra_result', logdir)
    logz.configure_output_dir(logdir)
    args = inspect.getargspec(q_func)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)
    time_name = path.join(logdir, "rha_t.dat")
    mean_name = path.join(logdir, "rha_mean.dat")
    best_name = path.join(logdir, "rha_best.dat")
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    times, mean_ep_rewards, best_ep_rewards = [], [], []

    img_h, img_w, img_c = env.observation_space.shape
    input_shape = (img_h, img_w, frame_history_len * img_c)

    num_actions = env.action_space.n

    # set up placeholders
    # placeholder for current observation (or state)
    obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    # placeholder for current action
    act_t_ph = tf.placeholder(tf.int32, [None])
    # placeholders for the current decomposed rewards (food / fruit / avoid / eat heads)
    rew_food_t_ph = tf.placeholder(tf.float32, [None])
    rew_fruit_t_ph = tf.placeholder(tf.float32, [None])
    rew_avoid_t_ph = tf.placeholder(tf.float32, [None])
    rew_eat_t_ph = tf.placeholder(tf.float32, [None])
    # placeholder for next observation (or state)
    obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    # placeholder for end of episode mask
    # this value is 1 if the next state corresponds to the end of an episode,
    # in which case there is no Q-value at the next state; at the end of an
    # episode, only the current state reward contributes to the target, not the
    # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1)
    done_mask_ph = tf.placeholder(tf.float32, [None])

    # casting to float on GPU ensures lower data transfer times.
    obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0
    obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0

    # Here, you should fill in your own code to compute the Bellman error. This requires
    # evaluating the current and next Q-values and constructing the corresponding error.
    # TensorFlow will differentiate this error for you, you just need to pass it to the
    # optimizer. See assignment text for details.
    # Your code should produce one scalar-valued tensor: total_error
    # This will be passed to the optimizer in the provided code below.
    # Your code should also produce two collections of variables:
    # q_func_vars
    # target_q_func_vars
    # These should hold all of the variables of the Q-function network and target network,
    # respectively. A convenient way to get these is to make use of TF's "scope" feature.
    # For example, you can create your Q-function network with the scope "q_func" like this:
    # <something> = q_func(obs_t_float, num_actions, scope="q_func", reuse=False)
    # And then you can obtain the variables like this:
    # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func')
    # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES"
    ######

    q_val = q_func(obs_t_float, num_actions, scope="q_func", reuse=False)
    q_food, q_avoid, q_fruit, q_eat = q_val
    target_val = q_func(obs_tp1_float,
                        num_actions,
                        scope="target_q_func",
                        reuse=False)
    target_food, target_avoid, target_fruit, target_eat = target_val

    q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                    scope='q_func')
    target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_q_func')

    q_all = 1 / 4 * (q_food + q_avoid + q_fruit + q_eat)
    action_selected = tf.argmax(q_all, 1)

    q_act_food_t_val = tf.reduce_sum(q_food *
                                     tf.one_hot(act_t_ph, num_actions),
                                     axis=1)
    q_act_avoid_t_val = tf.reduce_sum(q_avoid *
                                      tf.one_hot(act_t_ph, num_actions),
                                      axis=1)
    q_act_fruit_t_val = tf.reduce_sum(q_fruit *
                                      tf.one_hot(act_t_ph, num_actions),
                                      axis=1)
    q_act_eat_t_val = tf.reduce_sum(q_eat * tf.one_hot(act_t_ph, num_actions),
                                    axis=1)

    y_food_t_val = rew_food_t_ph + (1 - done_mask_ph) * gamma * tf.reduce_max(
        target_food, axis=1)
    y_avoid_t_val = rew_avoid_t_ph + (
        1 - done_mask_ph) * gamma * tf.reduce_max(target_avoid, axis=1)
    y_fruit_t_val = rew_fruit_t_ph + (
        1 - done_mask_ph) * gamma * tf.reduce_max(target_fruit, axis=1)
    y_eat_t_val = rew_eat_t_ph + (1 - done_mask_ph) * gamma * tf.reduce_max(
        target_eat, axis=1)

    food_error = tf.reduce_mean(
        tf.losses.huber_loss(y_food_t_val, q_act_food_t_val))
    avoid_error = tf.reduce_mean(
        tf.losses.huber_loss(y_avoid_t_val, q_act_avoid_t_val))
    fruit_error = tf.reduce_mean(
        tf.losses.huber_loss(y_fruit_t_val, q_act_fruit_t_val))
    eat_error = tf.reduce_mean(
        tf.losses.huber_loss(y_eat_t_val, q_act_eat_t_val))

    ######

    # construct optimization op (with gradient clipping)
    learning_rate = tf.placeholder(tf.float32, (), name="learning_rate")
    optimizer = optimizer_spec.constructor(learning_rate=learning_rate,
                                           **optimizer_spec.kwargs)
    train_food_fn = minimize_and_clip(optimizer,
                                      food_error,
                                      var_list=q_func_vars,
                                      clip_val=grad_norm_clipping)
    train_avoid_fn = minimize_and_clip(optimizer,
                                       avoid_error,
                                       var_list=q_func_vars,
                                       clip_val=grad_norm_clipping)
    train_fruit_fn = minimize_and_clip(optimizer,
                                       fruit_error,
                                       var_list=q_func_vars,
                                       clip_val=grad_norm_clipping)
    train_eat_fn = minimize_and_clip(optimizer,
                                     eat_error,
                                     var_list=q_func_vars,
                                     clip_val=grad_norm_clipping)

    # update_target_fn will be called periodically to copy Q network to target Q network
    update_target_fn = []
    for var, var_target in zip(
            sorted(q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_fn.append(var_target.assign(var))
    update_target_fn = tf.group(*update_target_fn)

    # construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    model_initialized = False
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in itertools.count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env, t):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)

        #####

        idx = replay_buffer.store_frame(last_obs, rha_shape=4)
        epsilon = exploration.value(t)

        if not model_initialized or np.random.rand(1) < epsilon:
            action = env.action_space.sample()
        else:
            obs_input = replay_buffer.encode_recent_observation()[None, :]
            action = session.run(action_selected,
                                 feed_dict={obs_t_ph: obs_input})
        obs, reward, done, info = env.step(action)
        replay_buffer.store_effect(idx, action, reward, done)
        if done: obs = env.reset()
        last_obs = obs

        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # 3.b: initialize the model if it has not been initialized yet; to do
            # that, call
            #    initialize_interdependent_variables(session, tf.global_variables(), {
            #        obs_t_ph: obs_t_batch,
            #        obs_tp1_ph: obs_tp1_batch,
            #    })
            # where obs_t_batch and obs_tp1_batch are the batches of observations at
            # the current and next time step. The boolean variable model_initialized
            # indicates whether or not the model has been initialized.
            # Remember that you have to update the target network too (see 3.d)!
            # 3.c: train the model. To do this, you'll need to use the per-head train ops
            # (train_food_fn, train_avoid_fn, train_fruit_fn, train_eat_fn) and the error
            # tensors that were created earlier: each *_error is the Bellman error for one
            # reward head, and each train op performs a gradient step on the shared Q-network
            # to reduce it. When calling session.run on these you'll need to populate the
            # following placeholders:
            # obs_t_ph
            # act_t_ph
            # rew_food_t_ph / rew_fruit_t_ph / rew_avoid_t_ph / rew_eat_t_ph
            # obs_tp1_ph
            # done_mask_ph
            # (these are needed for computing the per-head errors)
            # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t)
            # (this is needed by the optimizer to choose the learning rate)
            # 3.d: periodically update the target network by calling
            # session.run(update_target_fn)
            # you should update every target_update_freq steps, and you may find the
            # variable num_param_updates useful for this (it was initialized to 0)
            #####

            obs_t_batch, act_t_batch, rew_t_batch, obs_tp1_batch, done_mask_batch = replay_buffer.sample(
                batch_size)
            rew_food_t_batch = rew_t_batch[:, 0]
            rew_fruit_t_batch = rew_t_batch[:, 1]
            rew_avoid_t_batch = rew_t_batch[:, 2]
            rew_eat_t_batch = rew_t_batch[:, 3]
            if not model_initialized:
                initialize_interdependent_variables(
                    session, tf.global_variables(), {
                        obs_t_ph: obs_t_batch,
                        obs_tp1_ph: obs_tp1_batch
                    })
                session.run(update_target_fn)
                model_initialized = True

            session.run(train_food_fn,
                        feed_dict={
                            obs_t_ph: obs_t_batch,
                            act_t_ph: act_t_batch,
                            rew_food_t_ph: rew_food_t_batch,
                            obs_tp1_ph: obs_tp1_batch,
                            done_mask_ph: done_mask_batch,
                            learning_rate: optimizer_spec.lr_schedule.value(t)
                        })
            session.run(train_avoid_fn,
                        feed_dict={
                            obs_t_ph: obs_t_batch,
                            act_t_ph: act_t_batch,
                            rew_avoid_t_ph: rew_avoid_t_batch,
                            obs_tp1_ph: obs_tp1_batch,
                            done_mask_ph: done_mask_batch,
                            learning_rate: optimizer_spec.lr_schedule.value(t)
                        })
            session.run(train_fruit_fn,
                        feed_dict={
                            obs_t_ph: obs_t_batch,
                            act_t_ph: act_t_batch,
                            rew_fruit_t_ph: rew_fruit_t_batch,
                            obs_tp1_ph: obs_tp1_batch,
                            done_mask_ph: done_mask_batch,
                            learning_rate: optimizer_spec.lr_schedule.value(t)
                        })
            session.run(train_eat_fn,
                        feed_dict={
                            obs_t_ph: obs_t_batch,
                            act_t_ph: act_t_batch,
                            rew_eat_t_ph: rew_eat_t_batch,
                            obs_tp1_ph: obs_tp1_batch,
                            done_mask_ph: done_mask_batch,
                            learning_rate: optimizer_spec.lr_schedule.value(t)
                        })

            if num_param_updates % target_update_freq == 0:
                session.run(update_target_fn)
                train_food_loss = session.run(food_error,
                                              feed_dict={
                                                  obs_t_ph: obs_t_batch,
                                                  act_t_ph: act_t_batch,
                                                  rew_food_t_ph:
                                                  rew_food_t_batch,
                                                  obs_tp1_ph: obs_tp1_batch,
                                                  done_mask_ph: done_mask_batch
                                              })
                train_avoid_loss = session.run(avoid_error,
                                               feed_dict={
                                                   obs_t_ph: obs_t_batch,
                                                   act_t_ph: act_t_batch,
                                                   rew_avoid_t_ph:
                                                   rew_avoid_t_batch,
                                                   obs_tp1_ph: obs_tp1_batch,
                                                   done_mask_ph:
                                                   done_mask_batch
                                               })
                train_fruit_loss = session.run(fruit_error,
                                               feed_dict={
                                                   obs_t_ph: obs_t_batch,
                                                   act_t_ph: act_t_batch,
                                                   rew_fruit_t_ph:
                                                   rew_fruit_t_batch,
                                                   obs_tp1_ph: obs_tp1_batch,
                                                   done_mask_ph:
                                                   done_mask_batch
                                               })
                train_eat_loss = session.run(eat_error,
                                             feed_dict={
                                                 obs_t_ph: obs_t_batch,
                                                 act_t_ph: act_t_batch,
                                                 rew_eat_t_ph: rew_eat_t_batch,
                                                 obs_tp1_ph: obs_tp1_batch,
                                                 done_mask_ph: done_mask_batch
                                             })

                train_loss = (train_food_loss + train_avoid_loss +
                              train_fruit_loss + train_eat_loss) / 4.
                print("Loss at iteration {} is: {}".format(t, train_loss))
            num_param_updates += 1

            #####

        ### 4. Log progress
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)
        if t % LOG_EVERY_N_STEPS == 0 and model_initialized:
            times.append(t)
            mean_ep_rewards.append(mean_episode_reward)
            best_ep_rewards.append(best_mean_episode_reward)

            joblib.dump(value=times, filename=time_name, compress=3)
            joblib.dump(value=mean_ep_rewards, filename=mean_name, compress=3)
            joblib.dump(value=best_ep_rewards, filename=best_name, compress=3)

            logz.log_tabular("Training Time", time.time() - start)
            logz.log_tabular("Loss", train_loss)
            logz.log_tabular("Iteration", t)
            logz.log_tabular("Mean Reward (/100ep)", mean_episode_reward)
            logz.log_tabular("Best Mean Reward", best_mean_episode_reward)
            logz.log_tabular("Episodes", len(episode_rewards))
            logz.log_tabular("Exploration", exploration.value(t))
            logz.log_tabular("Learning Rate",
                             optimizer_spec.lr_schedule.value(t))
            logz.dump_tabular()
            sys.stdout.flush()

    return times, mean_ep_rewards, best_ep_rewards
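The example above relies on two helpers that are not shown. `q_func` is expected to return four per-objective Q-value heads (food, avoid, fruit, eat) computed from a shared trunk; a minimal sketch of such a network, under an assumed architecture, could look like this:

import tensorflow as tf

def hra_q_func(img_in, num_actions, scope, reuse=False):
    # Hedged sketch only: a shared convolutional trunk with one linear head per reward
    # component, compatible with how q_func is called above. Not the original model.
    with tf.variable_scope(scope, reuse=reuse):
        out = tf.layers.conv2d(img_in, filters=32, kernel_size=8, strides=4, activation=tf.nn.relu)
        out = tf.layers.conv2d(out, filters=64, kernel_size=4, strides=2, activation=tf.nn.relu)
        out = tf.layers.conv2d(out, filters=64, kernel_size=3, strides=1, activation=tf.nn.relu)
        out = tf.layers.flatten(out)
        out = tf.layers.dense(out, units=512, activation=tf.nn.relu)
        q_food = tf.layers.dense(out, units=num_actions)
        q_avoid = tf.layers.dense(out, units=num_actions)
        q_fruit = tf.layers.dense(out, units=num_actions)
        q_eat = tf.layers.dense(out, units=num_actions)
    return q_food, q_avoid, q_fruit, q_eat

`minimize_and_clip` appears to come from the assignment's dqn_utils; a helper of this kind typically clips each gradient by norm before applying it:

def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
    # Compute gradients of the objective w.r.t. var_list, clip each gradient by norm,
    # and return the op that applies them.
    gradients = optimizer.compute_gradients(objective, var_list=var_list)
    for i, (grad, var) in enumerate(gradients):
        if grad is not None:
            gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
    return optimizer.apply_gradients(gradients)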
Example #13
def train_PG(exp_name,
             env_name,
             n_iter,
             gamma,
             min_timesteps_per_batch,
             max_path_length,
             learning_rate,
             reward_to_go,
             animate,
             logdir,
             normalize_advantages,
             nn_baseline,
             seed,
             n_layers,
             size,
             epoch,
             evaluate=False):
    start = time.time()

    # ========================================================================================#
    # Set Up Logger
    # ========================================================================================#
    setup_logger(logdir, locals())

    # ========================================================================================#
    # Set Up Env
    # ========================================================================================#

    # Make the gym environment
    env = gym.make(env_name)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # ========================================================================================#
    # Initialize Agent
    # ========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
    }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_return_args = {
        'gamma': gamma,
        'reward_to_go': reward_to_go,
        'nn_baseline': nn_baseline,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args,
                  estimate_return_args)

    # build computation graph
    agent.build_computation_graph(
        '/tmp/hw2/%s/seed_%d_lr_%f_batch_%d_epoch_%d' %
        (env_name, seed, learning_rate, min_timesteps_per_batch, epoch))

    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    # ========================================================================================#
    # Training Loop
    # ========================================================================================#

    if evaluate:
        total_reward = 0
        agent.load_model(799)
        for _ in range(10):
            path = agent.sample_trajectory(env, True)
            total_reward += path['reward'].sum()

        print("Mean Reward:", total_reward / 10)

    else:
        total_timesteps = 0
        for itr in range(n_iter):
            print("********** Iteration %i ************" % itr)
            paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
            total_timesteps += timesteps_this_batch

            # Build arrays for observation, action for the policy gradient update by concatenating
            # across paths
            ob_no = np.concatenate([path["observation"] for path in paths])
            ac_na = np.concatenate([path["action"] for path in paths])
            re_n = [path["reward"] for path in paths]

            q_n, adv_n = agent.estimate_return(ob_no, re_n)
            agent.update_parameters(ob_no, ac_na, q_n, adv_n, itr, epoch)
            agent.copy_new_to_old()

            # Log diagnostics
            returns = [path["reward"].sum() for path in paths]
            ep_lengths = [pathlength(path) for path in paths]
            logz.log_tabular("Time", time.time() - start)
            logz.log_tabular("Iteration", itr)
            logz.log_tabular("AverageReturn", np.mean(returns))
            logz.log_tabular("StdReturn", np.std(returns))
            logz.log_tabular("MaxReturn", np.max(returns))
            logz.log_tabular("MinReturn", np.min(returns))
            logz.log_tabular("EpLenMean", np.mean(ep_lengths))
            logz.log_tabular("EpLenStd", np.std(ep_lengths))
            logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
            logz.log_tabular("TimestepsSoFar", total_timesteps)
            logz.dump_tabular()
            logz.pickle_tf_vars()
            agent.add_to_tensorboard(returns, ep_lengths, itr)

            if (itr + 1) % 100 == 0:
                agent.save_model(itr)
Example #14
def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False):
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env = gym.make("Pendulum-v0")
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    logz.configure_output_dir(logdir)
    if vf_type == 'linear':
        vf = LinearValueFunction(**vf_params)
    elif vf_type == 'nn':
        vf = NnValueFunction(ob_dim=ob_dim, **vf_params)


    YOUR_CODE_HERE


    sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss")

    sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    sess = tf.Session()
    sess.__enter__() # equivalent to `with sess:`
    tf.global_variables_initializer().run() #pylint: disable=E1101

    total_timesteps = 0
    stepsize = initial_stepsize

    for i in range(n_iter):
        print("********** Iteration %i ************"%i)

        YOUR_CODE_HERE

        if kl > desired_kl * 2: 
            stepsize /= 1.5
            print('stepsize -> %s'%stepsize)
        elif kl < desired_kl / 2: 
            stepsize *= 1.5
            print('stepsize -> %s'%stepsize)
        else:
            print('stepsize OK')


        # Log diagnostics
        logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias
        logz.dump_tabular()
Example #15
    def fit(self, dataset):
        """Fit the model to `dataset`, logging training and validation error after each epoch."""

        self.graph = self.build_computation_graph()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(graph=self.graph, config = config)
        self.sess.run(self.initializer)

        num_steps_in_epoch = dataset.num_train // self.n_batch
        n_step = num_steps_in_epoch * self.n_epoch

        start = time.time()
        np.random.seed(self.seed)

        try:
            self.learning_curve['train'].clear()
            self.learning_curve['val'].clear()
            loss_train = 0.
            
            for step in range(n_step):
                local_batch = dataset.next_batch(self.n_batch)
                loss_train += self.compute_loss(self.sess, batch = local_batch, optimize=True)

                
                
                if (step + 1) % num_steps_in_epoch == 0:
                    train_error = self.n_batch / dataset.num_train * loss_train
                    val_error = self.compute_loss(self.sess, batch = dataset.testdata(), optimize = False)
                    # Return the negative error to allow monitoring for the ELBO.
                    self.learning_curve['train'] += [train_error]
                    self.learning_curve['val'] += [val_error]
                    loss_train = 0.

                    logz.log_tabular("Time", time.time() - start)
                    logz.log_tabular("Fold", dataset.test_fold)
                    logz.log_tabular("Epoch", dataset.epochs_completed)
                    logz.log_tabular("BatchStep", step)
                    logz.log_tabular("TrainError", train_error)
                    logz.log_tabular("ValError", val_error)
                    logz.dump_tabular()
                    # print('epoch: {:2d}, step: {:5d}, training error: {:03.4f}, '
                    #       'validation error: {:03.4f}, time elapsed: {:4.0f} s'
                    #       .format(dataset.epochs_completed, step, train_error, val_error, time.time() - start))
        except KeyboardInterrupt:
            print('ending training')
        finally:
            # If interrupted or stopped, store the progress of the model.
            self.saver.save(self.sess, self.checkpoint_path)
            self.sess.close()
            #coord.request_stop()
            #coord.join(threads)
            print('finished training')
        return self                  
Example #16
def main_pendulum(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=False, logfile=None):
    env = gym.make("Pendulum-v0")
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]

    logz.configure_output_file(logfile)
    #vf = LinearValueFunction()
    vf = NeuralValueFunction(ob_dim)

    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in these functions
    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations
    sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.float32) # batch of actions taken by the policy, used for policy gradient computation
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate
    sy_h1 = tf.nn.relu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer
    sy_mean_n = dense(sy_h1, ac_dim, "final", weight_init=normc_initializer(0.05)) # Mean control output
    sy_logstd_n = tf.Variable(tf.zeros([ac_dim]))
    sy_std_n = tf.exp(sy_logstd_n)

    # Get probabilities from normal distribution and sample from distribution
    dist = tf.contrib.distributions.Normal(mu=tf.reshape(sy_mean_n,[-1]), sigma=sy_std_n)
    sy_logprob_n = tf.reshape(tf.log(dist.pdf(sy_ac_n)),[-1])
    sy_n = tf.shape(sy_ob_no)[0]
    sy_sampled_ac = dist.sample(sy_n) # sampled actions, used for defining the policy (NOT computing the policy gradient)

    # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>>
    sy_mean_n_old = tf.placeholder(shape=[None, ac_dim], name='old_mean', dtype=tf.float32)
    sy_std_n_old = tf.placeholder(shape=[ac_dim], name='old_std', dtype=tf.float32)
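    # Closed-form diagnostics for a diagonal Gaussian, per action dimension:
    #   KL(old || new) = log(sigma_new / sigma_old) + (sigma_old^2 + (mu_old - mu_new)^2) / (2 * sigma_new^2) - 1/2
    #   entropy        = 1/2 * (1 + log(2 * pi * sigma^2))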

    sy_kl = tf.reduce_sum(tf.log(sy_std_n/sy_std_n_old) + (sy_std_n_old**2 + (sy_mean_n_old - sy_mean_n)**2) / (2 * sy_std_n**2) - 0.5) / tf.to_float(sy_n)
    sy_ent = tf.reduce_sum(0.5 * (1 + tf.log(2 * math.pi * sy_std_n**2)))
    # <<<<<<<<<<<<<

    sy_surr = -tf.reduce_mean(sy_adv_n*sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss")

    sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    sess = tf.Session()
    sess.__enter__()
    sess.run(tf.global_variables_initializer())

    total_timesteps = 0
    obs_mean = np.zeros(ob_dim)
    obs_std = np.ones(ob_dim)  # identity normalization at the start, to avoid dividing by ~0 on the first iteration

    for i in range(n_iter):
        print("********** Iteration %i ************"%i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                acs.append(ac.flatten())
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew.flatten())
                ob = ob.flatten()
                if done:
                    break
            path = {"observation" : np.array(obs), "terminated" : terminated,
                    "reward" : np.array(rewards), "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        # Estimate advantage function
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vpred_t = vf.predict((path["observation"]-obs_mean)/(obs_std+1e-8))
            adv_t = return_t.flatten() - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n-adv_n.mean())/(adv_n.std()+1e-8)
        vtarg_n = np.concatenate(vtargs).flatten()
        vpred_n = np.concatenate(vpreds)
        obs_mean = np.average(ob_no,axis=0)
        obs_std = np.std(ob_no,axis=0)
        vf.fit((ob_no-obs_mean)/(obs_std+1e-8), vtarg_n)

        # Policy update
        _, mean_n, std_n = sess.run([update_op, sy_mean_n, sy_std_n], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n.flatten(), sy_adv_n:standardized_adv_n, sy_stepsize:stepsize})
        kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_mean_n_old: mean_n, sy_std_n_old: std_n})

        desired_kl = 2e-3
        if kl > desired_kl * 2: 
            stepsize /= 1.5
            print('stepsize -> %s'%stepsize)
        elif kl < desired_kl / 2: 
            stepsize *= 1.5
            print('stepsize -> %s'%stepsize)
        else:
            print('stepsize OK')

        # Log diagnostics
        logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias
        logz.dump_tabular()
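`discount` and `explained_variance_1d` are small helpers used above but not shown; minimal sketches consistent with that usage (assumptions, not the assignment's originals):

import numpy as np
import scipy.signal

def discount(x, gamma):
    # Discounted cumulative sum along the time axis: y[t] = sum_k gamma**k * x[t + k].
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1], axis=0)[::-1]

def explained_variance_1d(ypred, y):
    # 1 - Var[y - ypred] / Var[y]: 1 is a perfect fit, 0 is no better than predicting the mean.
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary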
Example #17
def train(pub_cmd,
          pub_act,
          rate,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
    |                           by a random agent. We use these to train our
    |_                          initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments.

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    rand_controller = RandomController()
    paths = sample(pub_cmd, pub_act, rate, rand_controller, num_paths_random,
                   env_horizon, render)
    data = paths_to_array(paths)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each iteration, refit the
    # dynamics model to the current dataset, then take on-policy samples and aggregate
    # them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        # Fit dynamics model
        print('Training dynamics model...')
        dyn_model.fit(data)
        mpc_controller.dyn_model = dyn_model
        costs = []
        returns = []
        # Do MPC
        for i in range(num_paths_onpol):
            print('On policy path: %i' % i)
            obs_t, obs_tp1, acs_t, rews_t = [], [], [], []
            s_t = reset(pub_cmd, rate)
            total_return = 0

            for j in range(env_horizon):
                # print('Timestep: %i, Return: %g' % (j,total_return))
                a_t = mpc_controller.get_action(s_t)
                s_tp1, _ = step(a_t, pub_act, pub_cmd, rate)
                r_t = 0
                for k in range(9):
                    r_t += s_tp1[k * 12] - s_t[k * 12]
                total_return += r_t

                if render:
                    env.render()
                    time.sleep(0.05)

                obs_t.append(s_t)
                obs_tp1.append(s_tp1)
                acs_t.append(a_t)
                rews_t.append(r_t)

                s_t = s_tp1

            path = {
                "observations": np.array(obs_t),
                "next_observations": np.array(obs_tp1),
                "actions": np.array(acs_t),
                "rewards": np.array(rews_t)
            }
            total_cost = path_cost(cost_fn, path)

            paths.append(path)
            returns.append(total_return)
            costs.append(total_cost)
            print('Total cost: %g, Total reward: %g' %
                  (total_cost, total_return))

        data = paths_to_array(paths)
        normalization = compute_normalization(data)
        # Set new normalization statistics for dynamics model
        dyn_model.normalization = normalization

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
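The docstring above describes what the MPC policy does each time it is queried; `MPCcontroller.get_action` is where those fictitious rollouts happen. A hedged sketch of the usual random-shooting scheme, assuming a batched `dyn_model.predict(states, actions)`, a batched `cost_fn(states, actions, next_states)`, and a zero-argument `sample_action` callable (none of these interfaces are confirmed by the example):

import numpy as np

def random_shooting_action(state, sample_action, dyn_model, cost_fn, horizon, num_simulated_paths):
    # Sample num_simulated_paths random action sequences of length `horizon`, roll each one
    # out through the learned dynamics model, score the imagined trajectories with cost_fn,
    # and return the first action of the cheapest sequence.
    states = np.tile(state, (num_simulated_paths, 1))
    actions = np.array([[sample_action() for _ in range(num_simulated_paths)]
                        for _ in range(horizon)])              # (horizon, N, ac_dim)
    costs = np.zeros(num_simulated_paths)
    for t in range(horizon):
        next_states = dyn_model.predict(states, actions[t])    # assumed batched predict
        costs += cost_fn(states, actions[t], next_states)      # assumed batched cost_fn
        states = next_states
    return actions[0, np.argmin(costs)]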
Example #18
File: main.py  Project: zhenghaoz/cs294
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    del params['cost_fn']
    del params['activation']
    del params['output_activation']
    del params['env']
    logz.save_params(params)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    """ YOUR CODE HERE """
    # Sample from random controller
    paths = sample(env, random_controller, num_paths_random, env_horizon,
                   render, True)
    # Build data set
    data = dict()
    data['observations'] = np.concatenate(
        [path['observations'] for path in paths])
    data['actions'] = np.concatenate([path['actions'] for path in paths])
    next_observations = np.concatenate(
        [path['next_observations'] for path in paths])
    data['deltas'] = next_observations - data['observations']

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(data)

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each iteration, refit the
    # dynamics model to the current dataset, then take on-policy samples and aggregate
    # them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        # Refit dynamic model
        dyn_model.fit(data)
        # Sample on-policy trajectories
        paths = sample(env, mpc_controller, num_paths_onpol, env_horizon,
                       render, True)
        # Summarize trajectories
        costs = [path_cost(cost_fn, path) for path in paths]
        returns = [np.sum(path['rewards']) for path in paths]
        # Aggregate data
        onpol_observations = np.concatenate(
            [path['observations'] for path in paths])
        onpol_actions = np.concatenate([path['actions'] for path in paths])
        onpol_next_observations = np.concatenate(
            [path['next_observations'] for path in paths])
        onpol_deltas = onpol_next_observations - onpol_observations
        data['observations'] = np.append(data['observations'],
                                         onpol_observations, 0)
        data['actions'] = np.append(data['actions'], onpol_actions, 0)
        data['deltas'] = np.append(data['deltas'], onpol_deltas, 0)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
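`compute_normalization` is applied above to the `data` dictionary of observations, actions, and deltas. A minimal sketch consistent with that usage (an assumption; the real helper may return a different structure):

import numpy as np

def compute_normalization(data):
    # Per-key mean and standard deviation over the dataset; the epsilon guards against
    # zero-variance dimensions when normalizing dynamics-model inputs.
    eps = 1e-8
    return {key: (np.mean(arr, axis=0), np.std(arr, axis=0) + eps)
            for key, arr in data.items()}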
Example #19
    def train_mf(self):
        self.start_worker()
        self.init_opt()
        logz.configure_output_dir(
            "/home/hendawy/Desktop/2DOF_Robotic_Arm_withSphereObstacle/Rr",
            1807)
        for itr in range(self.current_itr, self.n_itr):
            with logger.prefix('itr #%d | ' % itr):
                paths = self.sampler.obtain_samples(itr, Constrained=True)
                samples_data, analysis_data = self.sampler.process_samples(
                    itr, paths)
                self.log_diagnostics(paths)
                optimization_data = self.optimize_policy(itr, samples_data)
                logz.log_tabular('Iteration', analysis_data["Iteration"])
                # Return statistics from the processed samples
                logz.log_tabular('AverageDiscountedReturn',
                                 analysis_data["AverageDiscountedReturn"])
                logz.log_tabular('AverageReturns',
                                 analysis_data["AverageReturn"])
                logz.log_tabular('violation_cost',
                                 np.mean(samples_data["violation_cost"]))
                logz.log_tabular(
                    'boundary_violation_cost',
                    np.mean(samples_data["boundary_violation_cost"]))
                logz.log_tabular('success_rate', samples_data["success_rate"])
                logz.log_tabular(
                    'successful_AverageReturn',
                    np.mean(samples_data["successful_AverageReturn"]))
                logz.log_tabular('ExplainedVariance',
                                 analysis_data["ExplainedVariance"])
                logz.log_tabular('NumTrajs', analysis_data["NumTrajs"])
                logz.log_tabular('Entropy', analysis_data["Entropy"])
                logz.log_tabular('Perplexity', analysis_data["Perplexity"])
                logz.log_tabular('StdReturn', analysis_data["StdReturn"])
                logz.log_tabular('MaxReturn', analysis_data["MaxReturn"])
                logz.log_tabular('MinReturn', analysis_data["MinReturn"])
                logz.log_tabular('LossBefore', optimization_data["LossBefore"])
                logz.log_tabular('LossAfter', optimization_data["LossAfter"])
                logz.log_tabular('MeanKLBefore',
                                 optimization_data["MeanKLBefore"])
                logz.log_tabular('MeanKL', optimization_data["MeanKL"])
                logz.log_tabular('dLoss', optimization_data["dLoss"])
                logz.dump_tabular()
                logger.log("saving snapshot...")
                params = self.get_itr_snapshot(itr, samples_data)
                self.current_itr = itr + 1
                params["algo"] = self
                if self.store_paths:
                    params["paths"] = samples_data["paths"]
                logger.save_itr_params(itr, params)
                logger.log("saved")
                logger.dump_tabular(with_prefix=False)
                if self.plot:
                    self.update_plot()
                    if self.pause_for_plot:
                        input("Plotting evaluation run: Press Enter to "
                              "continue...")

        self.shutdown_worker()
Example #20
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run.

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated
    |                           by a random agent. We use these to train our
    |_                          initial dynamics model.

    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments.

    """

    logz.configure_output_dir(logdir)

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    """ YOUR CODE HERE """
    paths = sample(env=env,
                   controller=random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   verbose=False)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = {
        "observations":
        compute_normalization(paths["observations"]),
        "actions":
        compute_normalization(paths["actions"]),
        "deltas":
        compute_normalization(paths["next_observations"] -
                              paths["observations"])
    }

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Take multiple iterations of on-policy aggregation: at each iteration, refit the
    # dynamics model to the current dataset, then take on-policy samples and aggregate
    # them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """
        shuffle_indexes = np.random.permutation(paths["observations"].shape[0])
        for key in ['observations', 'actions', 'next_observations', 'rewards']:
            paths[key] = paths[key][shuffle_indexes]

        dyn_model.fit(paths)

        newpaths = sample(env=env,
                          controller=mpc_controller,
                          num_paths=num_paths_onpol,
                          horizon=env_horizon,
                          verbose=False)

        # NOTE: as written, this example never computes `costs`/`returns` and never merges
        # `newpaths` back into `paths`, so the logging below would fail. Assuming `sample`
        # returns flat arrays spanning num_paths_onpol rollouts of length env_horizon, and
        # that cost_fn accepts batched (state, action, next_state) inputs, one possible
        # completion is:
        returns = newpaths["rewards"].reshape(num_paths_onpol, -1).sum(axis=1)
        costs = cost_fn(newpaths["observations"], newpaths["actions"],
                        newpaths["next_observations"]).reshape(num_paths_onpol, -1).sum(axis=1)
        for key in paths:
            paths[key] = np.concatenate([paths[key], newpaths[key]], axis=0)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
Example #21
    def train(self):
        args = self.args
        mnist = self.data
        t_start = time.time()

        for ee in range(args.epochs):
            # Resample the hyperparameters if we're doing HMC.
            if args.algo == 'hmc':
                hparams = self.hmc_updater.update_hparams()
                hmc_info = defaultdict(list)

            for ii in range(self.num_train_mbs):
                xs = self.data_mb_list['X_train'][ii]
                ys = self.data_mb_list['y_train'][ii]
                if args.algo == 'hmc':
                    info = self.hmc_updater.hmc(xs, ys, hparams)
                    for key in info:
                        hmc_info[key].append(info[key])
                else:
                    feed = {self.x_BO: xs, self.y_targ_B: ys}
                    _, grads, loss = self.sess.run(
                        [self.train_op, self.grads, self.loss], feed)

            # Log after each epoch, if desired and test on validation.
            if (ee % args.log_every_t_epochs == 0):
                acc_valid, loss_valid = self._check_validation()

                print("\n  ************ Epoch %i ************" % (ee + 1))
                elapsed_time_hours = (time.time() - t_start) / (60.0**2)

                if args.algo == 'hmc':
                    for ww, hp in zip(self.weights, hparams):
                        print("{:10} -- plambda={:.3f}".format(
                            str(ww.get_shape().as_list()), hp))
                    logz.log_tabular("HMCAcceptRateEpoch",
                                     np.mean(hmc_info['accept']))
                    logz.log_tabular("KineticOldMean",
                                     np.mean(hmc_info['K_old']))
                    logz.log_tabular("KineticNewMean",
                                     np.mean(hmc_info['K_new']))
                    logz.log_tabular("PotentialOldMean",
                                     np.mean(hmc_info['U_old']))
                    logz.log_tabular("PotentialNewMean",
                                     np.mean(hmc_info['U_new']))
                    logz.log_tabular("HamiltonianOldMean",
                                     np.mean(hmc_info['H_old']))
                    logz.log_tabular("HamiltonianNewMean",
                                     np.mean(hmc_info['H_new']))

                logz.log_tabular("ValidAcc", acc_valid)
                logz.log_tabular("ValidLoss", loss_valid)
                logz.log_tabular("Temperature", args.temperature)
                logz.log_tabular("TimeHours", elapsed_time_hours)
                logz.log_tabular("Epochs", ee)
                logz.dump_tabular()
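The kinetic, potential, and Hamiltonian quantities logged above are tied together by the standard Metropolis accept step at the end of each HMC proposal (H = U + K). A minimal sketch of that rule; the actual hmc_updater implementation is not shown in this snippet:

import numpy as np

def hmc_accept(U_old, K_old, U_new, K_new):
    # Accept the proposal with probability min(1, exp(H_old - H_new)).
    H_old = U_old + K_old
    H_new = U_new + K_new
    accept_prob = min(1.0, np.exp(H_old - H_new))
    accepted = np.random.uniform() < accept_prob
    return accepted, accept_prob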
Example #22
    def train(self, num_iter):

        log_name = "seed_{0}".format(self.seed)
        logger = Logger(logname=self.env_name, now=log_name)

        start = time.time()
        for i in range(num_iter):

            t1 = time.time()
            reward_avg_loss = self.train_step()
            t2 = time.time()
            print('total time of one step', t2 - t1)
            print('iter ', i, ' done')

            # record statistics every 10 iterations
            if ((i + 1) % 10 == 0):

                rewards = self.aggregate_rollouts(num_rollouts=100,
                                                  evaluate=True)
                w = ray.get(self.workers[0].get_weights_plus_stats.remote())
                np.savez(self.logdir + "/lin_policy_plus", w)

                # # output reward function loss
                # test_loss_list = []
                # test_size = len(test_dataset_x)
                # assert len(test_dataset_x) == len(test_dataset_y)
                # test_dataset_x = np.array(test_dataset_x)
                # test_dataset_y = np.array(test_dataset_y).reshape(-1,1)
                # num_batch = int(test_size / self.batch_size)

                # for idx in range(num_batch):
                #     test_loss_list.append(self.reward_func.sess.run(self.reward_func.loss, feed_dict={self.reward_func.input_ph: test_dataset_x[idx*self.batch_size: (idx+1)*self.batch_size],
                #                                                     self.reward_func.reward_ph: test_dataset_y[idx*self.batch_size: (idx+1)*self.batch_size]}))
                # test_avg_loss = np.mean(test_loss_list)
                print(sorted(self.params.items()))
                logz.log_tabular("Time", time.time() - start)
                logz.log_tabular("Iteration", i + 1)
                logz.log_tabular("AverageReward", np.mean(rewards))
                logz.log_tabular("StdRewards", np.std(rewards))
                logz.log_tabular("MaxRewardRollout", np.max(rewards))
                logz.log_tabular("MinRewardRollout", np.min(rewards))
                logz.log_tabular("timesteps", self.timesteps)
                logz.log_tabular("AvgRewardFunctionLoss", reward_avg_loss)
                # logz.log_tabular("AvgRewardTestLoss", test_avg_loss)
                logz.dump_tabular()

                logger.log({
                    "Time": time.time() - start,
                    "Iteration": i + 1,
                    "AverageReward": np.mean(rewards),
                    "StdRewards": np.std(rewards),
                    "MaxRewardRollout": np.max(rewards),
                    "MinRewardRollout": np.min(rewards),
                    "timesteps": self.timesteps
                })
                logger.write(display=False)

            t1 = time.time()
            # get statistics from all workers
            for j in range(self.num_workers):
                self.policy.observation_filter.update(
                    ray.get(self.workers[j].get_filter.remote()))
            self.policy.observation_filter.stats_increment()

            # make sure master filter buffer is clear
            self.policy.observation_filter.clear_buffer()
            # sync all workers
            filter_id = ray.put(self.policy.observation_filter)
            setting_filters_ids = [
                worker.sync_filter.remote(filter_id) for worker in self.workers
            ]
            # waiting for sync of all workers
            ray.get(setting_filters_ids)

            increment_filters_ids = [
                worker.stats_increment.remote() for worker in self.workers
            ]
            # waiting for increment of all workers
            ray.get(increment_filters_ids)
            t2 = time.time()
            print('Time to sync statistics:', t2 - t1)

        np.savetxt("dataset_x.txt", self.dataset_x)
        np.savetxt("dataset_y.txt", self.dataset_y)
        logger.close()
        return
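The filter-sync block above assumes an observation filter with update / stats_increment / clear_buffer semantics: workers accumulate per-rollout statistics in a buffer, the master merges those buffers, commits them, clears its own buffer, and broadcasts the result. A minimal sketch of such a filter (the real ARS MeanStdFilter is more elaborate):

import numpy as np

class RunningObsFilter:
    def __init__(self, shape):
        # committed statistics
        self.n = 0
        self.sum = np.zeros(shape)
        self.sumsq = np.zeros(shape)
        # per-rollout buffer, merged across workers before being committed
        self.buf_n = 0
        self.buf_sum = np.zeros(shape)
        self.buf_sumsq = np.zeros(shape)

    def observe(self, x):
        self.buf_n += 1
        self.buf_sum += x
        self.buf_sumsq += np.square(x)

    def update(self, other):
        # merge another worker's buffer into ours
        self.buf_n += other.buf_n
        self.buf_sum += other.buf_sum
        self.buf_sumsq += other.buf_sumsq

    def stats_increment(self):
        # commit the buffered statistics into the running totals
        self.n += self.buf_n
        self.sum += self.buf_sum
        self.sumsq += self.buf_sumsq

    def clear_buffer(self):
        self.buf_n = 0
        self.buf_sum = np.zeros_like(self.sum)
        self.buf_sumsq = np.zeros_like(self.sumsq)

    def mean_std(self):
        n = max(self.n, 1)
        mean = self.sum / n
        var = np.maximum(self.sumsq / n - np.square(mean), 1e-8)
        return mean, np.sqrt(var)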
Example #23
def train_SAC(env_name, exp_name, seed, logdir, extra_params=None):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, 0.2)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': 256,
        'discount': 0.99,
        'learning_rate': 1e-3,
        'reparameterize': get_extra_param(extra_params, 'reparameterize',
                                          False),
        'tau': 0.01,
        'epoch_length': 1000,
        'n_epochs': 500,
        'two_qf': get_extra_param(extra_params, 'two_qf', False),
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': 1000,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }

    value_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    policy_params = {
        'hidden_layer_sizes': (128, 128),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)
    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(name='value_function',
                                      **value_function_params)
    target_value_function = nn.ValueFunction(name='target_value_function',
                                             **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        **policy_params)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True  # may need if using GPU
    with tf.Session(config=tf_config):
        algorithm.build(env=env,
                        policy=policy,
                        q_function=q_function,
                        q_function2=q_function2,
                        value_function=value_function,
                        target_value_function=target_value_function)

        for epoch in algorithm.train(sampler,
                                     n_epochs=algorithm_params.get(
                                         'n_epochs', 1000)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
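A hypothetical invocation of the train_SAC entry point above (the argument values are illustrative; the real driver script is not shown):

if __name__ == '__main__':
    train_SAC(env_name='HalfCheetah-v2',
              exp_name='sac_reparam_2qf',
              seed=0,
              logdir='data/sac_HalfCheetah-v2_seed0',
              extra_params={'reparameterize': True, 'two_qf': True})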
Example #24
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gamma=1.0,
        min_timesteps=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        to_animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32,
        video_dir=None):

    start = time.time()

    nn_params = {"n_layers": n_layers, "size": size, "lr": learning_rate}

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    #env._max_episode_steps = 4000

    to_animate = ToAnimate(False)
    to_animate.animate = False

    if video_dir is not None:
        env = gym.wrappers.Monitor(env,
                                   video_dir,
                                   force=True,
                                   video_callable=to_animate)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    (sy_sampled_ac, sy_ob_no, sy_ac_na,
     sy_adv_n), (update_op,
                 loss) = get_policy_gradient_NN(ob_dim, ac_dim, discrete,
                                                nn_params)

    if nn_baseline:
        baseline_predictor = BaselinePredictor(sy_ob_no,
                                               epoch_num=500,
                                               nn_params=nn_params)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    # Training Loop
    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps for one batch
        paths, num_collected_timesteps = collect_paths(
            sess, sy_sampled_ac, sy_ob_no, env, min_timesteps, max_path_length,
            to_animate, itr, discrete)
        total_timesteps += num_collected_timesteps

        # Build arrays for observation, action for the policy gradient update
        #  by concatenating  across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        q_n = get_reward(paths, gamma, reward_to_go)

        if nn_baseline:
            # Get the baseline prediction for each timestep
            b_n = baseline_predictor.predict(ob_no)[0]

            # Rescale the output to match the statistics of the Q-values
            b_n = (b_n - np.mean(b_n)) / np.std(b_n)
            b_n = np.mean(q_n) + (b_n * np.std(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            adv_n = (adv_n - np.mean(adv_n)) / np.std(adv_n)

        if nn_baseline:
            baseline_predictor.fit(inputs=ob_no,
                                   labels=(q_n - np.mean(q_n)) / np.std(q_n),
                                   n_iter=1)

        if discrete: ac_na = ac_na.flatten()  # FIXME

        loss_before = sess.run(
            loss,
            feed_dict={
                sy_ob_no: ob_no,  # observation
                sy_ac_na: ac_na,  # taken actions
                sy_adv_n: adv_n  # advantages
            })

        sess.run(
            update_op,
            feed_dict={
                sy_ob_no: ob_no,  # observation
                sy_ac_na: ac_na,  # taken actions
                sy_adv_n: adv_n  # advantages
            })

        loss_after = sess.run(
            loss,
            feed_dict={
                sy_ob_no: ob_no,  # observation
                sy_ac_na: ac_na,  # taken actions
                sy_adv_n: adv_n  # advantages
            })

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]

        #logz.log_tabular("Loss_before", loss_before)
        logz.log_tabular("Loss_after", loss_after)
        logz.log_tabular("delta_loss", loss_after - loss_before)

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", len(ac_na))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
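get_reward above is not shown; a sketch consistent with how it is used here (discounted reward-to-go per timestep when reward_to_go is True, otherwise the full discounted return repeated along each path; the actual helper may differ):

import numpy as np

def get_reward(paths, gamma, reward_to_go):
    q_n = []
    for path in paths:
        rewards = path["reward"]
        q_path = np.zeros(len(rewards))
        running = 0.0
        for t in reversed(range(len(rewards))):
            running = rewards[t] + gamma * running   # discounted suffix sum
            q_path[t] = running
        if not reward_to_go:
            q_path[:] = q_path[0]                    # same full return at every t
        q_n.append(q_path)
    return np.concatenate(q_n)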
Example #25
def train_PG(
             exp_name='',
             env_name='',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=False, 
             animate=True, 
             logdir=None, 
             normalize_advantages=False,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32,

             # mb mpc arguments
             model_learning_rate=1e-3,
             onpol_iters=10,
             dynamics_iters=260,
             batch_size=512,
             num_paths_random=10, 
             num_paths_onpol=10, 
             num_simulated_paths=1000,
             env_horizon=1000, 
             mpc_horizon=10,
             m_n_layers=2,
             m_size=500,
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    # env = gym.make(env_name)
    env = HalfCheetahEnvNew()
    cost_fn = cheetah_cost_fn
    activation = tf.nn.relu
    output_activation = None

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    # max_path_length = max_path_length or env.spec.max_episode_steps
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Print environment information
    print("-------- env info --------")
    print("Environment name: ", env_name)
    print("Action space is discrete: ", discrete)
    print("Action space dim: ", ac_dim)
    print("Observation space dim: ", ob_dim)
    print("Max_path_length ", max_path_length)




    #========================================================================================#
    # Random data collection
    #========================================================================================#

    random_controller = RandomController(env)
    data_buffer_model = DataBuffer()
    data_buffer_ppo = DataBuffer_general(10000, 4)

    # sample path
    print("collecting random data .....  ")
    paths = sample(env,
                   random_controller,
                   num_paths=num_paths_random,
                   horizon=env_horizon,
                   render=False,
                   verbose=False)

    # add into buffer
    for path in paths:
        for n in range(len(path['observations'])):
            data_buffer_model.add(path['observations'][n], path['actions'][n], path['next_observations'][n])

    print("data buffer size: ", data_buffer_model.size)

    normalization = compute_normalization(data_buffer_model)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#
    tf_config = tf.ConfigProto()
    tf_config.allow_soft_placement = True
    tf_config.intra_op_parallelism_threads = 4
    tf_config.inter_op_parallelism_threads = 1
    sess = tf.Session(config=tf_config)

    dyn_model = NNDynamicsModel(env=env, 
                                n_layers=n_layers, 
                                size=size, 
                                activation=activation, 
                                output_activation=output_activation, 
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env, 
                                   dyn_model=dyn_model, 
                                   horizon=mpc_horizon, 
                                   cost_fn=cost_fn, 
                                   num_simulated_paths=num_simulated_paths)


    policy_nn = policy_network_ppo(sess, ob_dim, ac_dim, discrete, n_layers, size, learning_rate)

    if nn_baseline:
        value_nn = value_network(sess, ob_dim, n_layers, size, learning_rate)

    sess.__enter__() # equivalent to `with sess:`

    tf.global_variables_initializer().run()


    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        if MPC:
            dyn_model.fit(data_buffer_model)
        returns = []
        costs = []

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []

        while True:
            # print("data buffer size: ", data_buffer_model.size)
            current_path = {'observations': [], 'actions': [], 'reward': [], 'next_observations':[]}

            ob = env.reset()
            obs, acs, mpc_acs, rewards = [], [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            return_ = 0
 
            while True:
                # print("steps ", steps)
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)

                if MPC:
                    mpc_ac = mpc_controller.get_action(ob)
                else:
                    mpc_ac = random_controller.get_action(ob)

                ac = policy_nn.predict(ob, mpc_ac)

                ac = ac[0]

                if not PG:
                    ac = mpc_ac

                acs.append(ac)
                mpc_acs.append(mpc_ac)

                current_path['observations'].append(ob)

                ob, rew, done, _ = env.step(ac)

                current_path['reward'].append(rew)
                current_path['actions'].append(ac)
                current_path['next_observations'].append(ob)

                return_ += rew
                rewards.append(rew)

                steps += 1
                if done or steps > max_path_length:
                    break


            if MPC:
                # cost & return
                cost = path_cost(cost_fn, current_path)
                costs.append(cost)
                returns.append(return_)
                print("total return: ", return_)
                print("costs: ", cost)

                # add into buffers
                for n in range(len(current_path['observations'])):
                    data_buffer_model.add(current_path['observations'][n], current_path['actions'][n], current_path['next_observations'][n])

            for n in range(len(current_path['observations'])):
                data_buffer_ppo.add(current_path['observations'][n], current_path['actions'][n], current_path['reward'][n], current_path['next_observations'][n])
        
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs),
                    "mpc_action" : np.array(mpc_acs)}



            paths.append(path)
            timesteps_this_batch += pathlength(path)
            # print("timesteps_this_batch", timesteps_this_batch)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch


        print("data_buffer_ppo.size:", data_buffer_ppo.size)


        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        mpc_ac_na = np.concatenate([path["mpc_action"] for path in paths])


        # Computing Q-values
     
        if reward_to_go:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        if t_ >= t:
                            q += gamma**(t_-t) * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)

        else:
            q_n = []
            for path in paths:
                for t in range(len(path["reward"])):
                    t_ = 0
                    q = 0
                    while t_ < len(path["reward"]):
                        q += gamma**t_ * path["reward"][t_]
                        t_ += 1
                    q_n.append(q)
            q_n = np.asarray(q_n)
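        # Note: both branches above recompute the discounted sum from scratch for
        # every t, which is O(T^2) per path. An equivalent O(T) form accumulates a
        # running sum backwards over each path:
        #     running = 0
        #     for t in reversed(range(len(path["reward"]))):
        #         running = path["reward"][t] + gamma * running   # reward-to-go at t
        # (and the value at t = 0 is the full discounted return used in the else branch).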


        # Computing Baselines
        if nn_baseline:

            # b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no :ob_no})
            b_n = value_nn.predict(ob_no)
            b_n = normalize(b_n)
            b_n = denormalize(b_n, np.std(q_n), np.mean(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        # Advantage Normalization
        if normalize_advantages:
            adv_n = normalize(adv_n)

        # Optimizing Neural Network Baseline
        if nn_baseline:
            b_n_target = normalize(q_n)
            value_nn.fit(ob_no, b_n_target)
                # sess.run(baseline_update_op, feed_dict={sy_ob_no :ob_no, sy_baseline_target_n:b_n_target})


        # Performing the Policy Update

        # policy_nn.fit(ob_no, ac_na, adv_n)
        policy_nn.fit(ob_no, ac_na, adv_n, mpc_ac_na)

        # sess.run(update_op, feed_dict={sy_ob_no :ob_no, sy_ac_na:ac_na, sy_adv_n:adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
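normalize and denormalize above are small utilities that are not shown; a sketch matching the way they are called here (zero-mean/unit-std normalization, then rescaling to a target std and mean; the original utilities may differ):

import numpy as np

def normalize(x, eps=1e-8):
    return (x - np.mean(x)) / (np.std(x) + eps)

def denormalize(x, std, mean):
    return x * std + mean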
Example #26
def train(sess, env, args, actor, critic, actor_noise, logdir):
    logz.configure_output_dir(logdir)
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    print('params: ', params)
    params['env'] = 'InvertedPendulum'
    params['exp_name'] = '3layer'
    logz.save_params(params)
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()
    checkpoint_actor_dir = os.path.join(os.curdir, 'Actor_InvertedPendulum')
    if not os.path.exists(checkpoint_actor_dir):
        os.makedirs(checkpoint_actor_dir)
    actor_prefix = os.path.join(checkpoint_actor_dir, "model.ckpt")
    ckpt_1 = tf.train.get_checkpoint_state(checkpoint_actor_dir)

    checkpoint_critic_dir = os.path.join(os.curdir, 'Critic_InvertedPendulum')
    if not os.path.exists(checkpoint_critic_dir):
        os.makedirs(checkpoint_critic_dir)
    critic_prefix = os.path.join(checkpoint_critic_dir, "model.ckpt")
    ckpt_2 = tf.train.get_checkpoint_state(checkpoint_critic_dir)

    if ckpt_1 and tf.train.checkpoint_exists(ckpt_1.model_checkpoint_path):
        print("Reading actor parameters from %s" % ckpt_1.model_checkpoint_path)
        actor.saver.restore(sess, ckpt_1.model_checkpoint_path)

    if ckpt_2 and tf.train.checkpoint_exists(ckpt_2.model_checkpoint_path):
        print("Reading critic parameters from %s" % ckpt_2.model_checkpoint_path)
        critic.saver.restore(sess, ckpt_2.model_checkpoint_path)

    uninitialized_vars = []
    for var in tf.all_variables():
        try:
            sess.run(var)
        except tf.errors.FailedPreconditionError:
            uninitialized_vars.append(var)

    if len(uninitialized_vars) > 0:
        init_new_vars_op = tf.variables_initializer(uninitialized_vars)
        sess.run(init_new_vars_op)

    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']), int(args['random_seed']))

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    # tflearn.is_training(True)

    def testing():
        env1 = gym.make(args['env'])
        s = env1.reset()
        done = False
        total_reward = 0
        max_steps = env1.spec.timestep_limit
        step = 0

        while not done:
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))
            s2, r, done, _ = env1.step(a[0])
            total_reward += r
            step += 1
            s = s2
            # env.render()
            if step > max_steps:
                break
        print('total steps: ', step)
        print('total reward: ', total_reward)
        return step, total_reward

    iter = 0
    start = time.time()
    best_step, best_rew = testing()
    for i in range(int(args['max_episodes'])):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):

            if args['render_env']:
                env.render()

            # Added exploration noise
            # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
            num = np.random.uniform()
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim,)), np.reshape(a, (actor.a_dim,)), r,
                              terminal, np.reshape(s2, (actor.s_dim,)))

            # Start learning only once the replay buffer holds enough
            # experience (100000 transitions here); then sample minibatches
            batch_size = int(args['minibatch_size'])
            if replay_buffer.size() > 100000:
                iter += 1
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(batch_size)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                # critic will be trained to minimise the mean square error of the predicted Q value
                # and the target value.
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (int(args['minibatch_size']), 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled policy gradient:
                # the gradient of the critic's Q value with respect to the
                # action gives the action gradients.
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)  # del_a Q(s,a)
                actor.train(s_batch, grads[0])  # del_a Q(s,a) * del_theta mu_theta(s) --> actor gradient
                # These gradients are applied to the actor parameters directly;
                # there is no separate loss to minimize.

                if iter%20 == 0:
                    new_steps, new_rew = testing()
                    if new_rew > best_rew:
                        best_rew = new_rew
                        actor.saver.save(sess, actor_prefix)
                        critic.saver.save(sess, critic_prefix)
                        print('model saved to disk.')
                        actor.saver.restore(sess, ckpt_1.model_checkpoint_path)
                        critic.saver.restore(sess, ckpt_2.model_checkpoint_path)
                        best_step, best_rew = testing()
                    # print('actor model saved to: ', actor_prefix)
                    # print('critic model saved to: ', critic_prefix)

                if iter%10 == 0:
                    new_steps, new_rew = testing()
                    logz.log_tabular("Time", time.time() - start)
                    logz.log_tabular('Iteration', iter/10)
                    logz.log_tabular('Reward', new_rew)
                    logz.log_tabular('Steps', new_steps)
                    logz.dump_tabular()

                # Update target networks
                if iter%50 == 0:
                    replay_buffer.update()
                    print('updating buffer')
                    print('updating target networks..')
                    actor.update_target_network()
                    critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                summary_str = sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(j)
                })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward), \
                                                                             i, (ep_ave_max_q / float(j))))
                break
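actor_noise above is a zero-argument callable whose output is added to the deterministic action; DDPG implementations commonly use Ornstein-Uhlenbeck noise for this. A sketch of such a class (the noise object actually passed in is not shown):

import numpy as np

class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2):
        self.mu = np.asarray(mu, dtype=np.float64)
        self.sigma, self.theta, self.dt = sigma, theta, dt
        self.x = np.zeros_like(self.mu)

    def __call__(self):
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, I)
        self.x = (self.x
                  + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.randn(*self.mu.shape))
        return self.x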
Example #27
def train_PG(exp_name, env_name, n_iter, \
             gamma, min_timesteps_per_batch, max_path_length, learning_rate, \
             reward_to_go, animate, logdir, normalize_advantages, nn_baseline, \
             seed, n_layers, size):
    start = time.time()
    setup_logger(logdir, locals())  ## Set up Logger

    env = gym.make(env_name)
    tf.set_random_seed(seed)
    env.seed(seed)

    max_path_length = max_path_length or env.spec.max_episode_steps
    discrete = isinstance(env.action_space, gym.spaces.Discrete)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n if discrete else env.action_space.shape[0]

    ## Initialize Agent
    computation_graph_args = {'n_layers': n_layers, 'obs_dim': obs_dim, 'act_dim': act_dim, \
                              'discrete': discrete, 'size': size, 'learning_rate': learning_rate}
    sample_trajectory_args = {'animate': animate, 'max_path_length': max_path_length, \
                              'min_timesteps_per_batch': min_timesteps_per_batch}
    estimate_return_args = {'gamma': gamma, 'reward_to_go': reward_to_go, \
                            'nn_baseline': nn_baseline, 'normalize_advantages': normalize_advantages}
    agent = Agent(computation_graph_args, sample_trajectory_args,
                  estimate_return_args)
    agent.build_computation_graph()
    agent.init_tf_sess()

    ## Training Loop
    total_time_steps = 0
    for itr in range(n_iter):
        print("********* Iteration %i *********" % itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_time_steps += timesteps_this_batch

        obs_no = np.concatenate([path['observation'] for path in paths])
        act_na = np.concatenate([path['action'] for path in paths])
        ret_n = [path['reward'] for path in paths]

        q_n, adv_n = agent.estimate_return(obs_no, ret_n)
        agent.update_parameters(obs_no, act_na, q_n, adv_n)

        # Log diagnostics
        returns = [path['reward'].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenSt", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_time_steps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #28
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name='adv', dtype=tf.float32)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(sy_ob_no,
                                 ac_dim,
                                 'network',
                                 n_layers=n_layers,
                                 size=size)

        # Sample actions from the logits with tf.multinomial
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1), [-1])
        sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=sy_ac_na, logits=sy_logits_na)

    else:
        # YOUR_CODE_HERE
        sy_mean = build_mlp(sy_ob_no, ac_dim, 'network',
                            n_layers=n_layers, size=size)

        # logstd should just be a trainable variable, not a network output.
        sy_logstd = tf.Variable(tf.zeros([1, ac_dim]),
                                name='sy_logstd',
                                dtype=tf.float32)
        sy_std = tf.exp(sy_logstd)

        sy_sampled_ac = tf.random_normal(tf.shape(sy_mean),
                                         mean=sy_mean,
                                         stddev=sy_std)

        sy_z = (sy_ac_na - sy_mean) / sy_std
        sy_logprob_n = 0.5 * tf.reduce_sum(tf.square(sy_z), axis=1)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    # Loss function that we'll differentiate to get the policy gradient.
    loss = tf.reduce_mean(sy_logprob_n * sy_adv_n)
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
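    # Note: in both branches above sy_logprob_n is the *negative* log-probability
    # of the taken action (softmax cross-entropy in the discrete case; in the
    # continuous case 0.5 * ||z||^2, the Gaussian negative log-likelihood up to
    # the log-std term and additive constants). Minimizing this loss therefore
    # ascends the advantage-weighted log-probability, which is the intended
    # policy gradient step.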

    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(
            build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers,
                      size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        baseline_target = tf.placeholder(shape=[None],
                                         name='baseline_target',
                                         dtype=tf.float32)
        baseline_loss = tf.reduce_sum(
            tf.losses.mean_squared_error(labels=baseline_target,
                                         predictions=baseline_prediction))
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            baseline_loss)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                    and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        #====================================================================================#

        # YOUR_CODE_HERE
        discounted_rewards = []
        for path in paths:
            r = 0
            path_rewards = [0.0] * pathlength(path)
            for t in reversed(range(pathlength(path))):
                r = path['reward'][t] + gamma * r
                path_rewards[t] = r
            if reward_to_go:
                discounted_rewards.append(path_rewards)
            else:
                discounted_rewards.append([path_rewards[0]] * pathlength(path))
        q_n = np.concatenate(discounted_rewards)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            b_n = (b_n - b_n.mean(axis=0)) / (b_n.std(axis=0) + 1e-8)
            q_mean = q_n.mean(axis=0)
            q_std = q_n.std(axis=0)

            b_n = q_mean + q_std * b_n
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            adv_n = (adv_n - adv_n.mean(axis=0)) / (adv_n.std(axis=0) + 1e-8)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            q_n = (q_n - q_n.mean(axis=0)) / (q_n.std(axis=0) + 1e-8)

            sess.run([baseline_update_op],
                     feed_dict={
                         sy_ob_no: ob_no,
                         baseline_target: q_n
                     })

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        # YOUR_CODE_HERE
        sess.run(update_op,
                 feed_dict={
                     sy_ob_no: ob_no,
                     sy_ac_na: ac_na,
                     sy_adv_n: adv_n
                 })

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #29
    def train(self, num_iter):

        start = time.time()
        for i in range(num_iter):

            t1 = time.time()
            self.train_step()
            t2 = time.time()
            print("total time of one step", t2 - t1)
            print("iter ", i, " done")

            # record statistics every 10 iterations
            if (i + 1) % 10 == 0:

                rewards = self.aggregate_rollouts(num_rollouts=100,
                                                  evaluate=True)
                w = ray.get(self.workers[0].get_weights_plus_stats.remote())
                np.savez(self.logdir + "/lin_policy_plus", w)

                print(sorted(self.params.items()))
                logz.log_tabular("Time", time.time() - start)
                logz.log_tabular("Iteration", i + 1)
                logz.log_tabular("AverageReward", np.mean(rewards))
                logz.log_tabular("StdRewards", np.std(rewards))
                logz.log_tabular("MaxRewardRollout", np.max(rewards))
                logz.log_tabular("MinRewardRollout", np.min(rewards))
                logz.log_tabular("timesteps", self.timesteps)
                logz.dump_tabular()

            t1 = time.time()
            # get statistics from all workers
            for j in range(self.num_workers):
                self.policy.observation_filter.update(
                    ray.get(self.workers[j].get_filter.remote()))
            self.policy.observation_filter.stats_increment()

            # make sure master filter buffer is clear
            self.policy.observation_filter.clear_buffer()
            # sync all workers
            filter_id = ray.put(self.policy.observation_filter)
            setting_filters_ids = [
                worker.sync_filter.remote(filter_id) for worker in self.workers
            ]
            # waiting for sync of all workers
            ray.get(setting_filters_ids)

            increment_filters_ids = [
                worker.stats_increment.remote() for worker in self.workers
            ]
            # waiting for increment of all workers
            ray.get(increment_filters_ids)
            t2 = time.time()
            print("Time to sync statistics:", t2 - t1)

        return
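The np.savez call above stores whatever get_weights_plus_stats returns (in ARS-style code, typically the linear policy weights together with the observation filter's mean and std). A hypothetical way to load it back for later evaluation, assuming the file sits in the run's log directory:

import numpy as np

data = np.load("lin_policy_plus.npz", allow_pickle=True)
w = data["arr_0"]   # the single unnamed array saved above
# w is commonly [weights, obs_mean, obs_std]; check its layout before unpacking.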
Example #30
def train_AC(
        exp_name,
        env_name,
        n_iter,
        gamma,
        min_timesteps_per_batch,
        max_path_length,
        learning_rate,
        num_target_updates,
        num_grad_steps_per_target_update,
        animate,
        logdir,
        normalize_advantages,
        seed,
        n_layers,
        size,
        ########################################################################
        # Exploration args
        bonus_coeff,
        kl_weight,
        density_lr,
        density_train_iters,
        density_batch_size,
        density_hiddim,
        dm,
        replay_size,
        sigma,
        ########################################################################
        ):
    start = time.time()

    #========================================================================================#
    # Set Up Logger
    #========================================================================================#
    setup_logger(logdir, locals())

    #========================================================================================#
    # Set Up Env
    #========================================================================================#

    # Make the gym environment
    ########################################################################
    # Exploration
    if env_name == 'PointMass-v0':
        from pointmass import PointMass
        env = PointMass()
    else:
        env = gym.make(env_name)
    dirname = logz.G.output_dir
    ########################################################################

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    # Initialize Agent
    #========================================================================================#
    computation_graph_args = {
        'n_layers': n_layers,
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
        'size': size,
        'learning_rate': learning_rate,
        'num_target_updates': num_target_updates,
        'num_grad_steps_per_target_update': num_grad_steps_per_target_update,
        }

    sample_trajectory_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }

    estimate_advantage_args = {
        'gamma': gamma,
        'normalize_advantages': normalize_advantages,
    }

    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_advantage_args) #estimate_return_args

    # build computation graph
    agent.build_computation_graph()

    ########################################################################
    # Initialize exploration density model
    if dm != 'none':
        if env_name == 'PointMass-v0' and dm == 'hist':
            density_model = Histogram(
                nbins=env.grid_size,
                preprocessor=env.preprocess)
            exploration = DiscreteExploration(
                density_model=density_model,
                bonus_coeff=bonus_coeff)
        elif dm == 'rbf':
            density_model = RBF(sigma=sigma)
            exploration = RBFExploration(
                density_model=density_model,
                bonus_coeff=bonus_coeff,
                replay_size=int(replay_size))
        elif dm == 'ex2':
            density_model = Exemplar(
                ob_dim=ob_dim,
                hid_dim=density_hiddim,
                learning_rate=density_lr,
                kl_weight=kl_weight)
            exploration = ExemplarExploration(
                density_model=density_model,
                bonus_coeff=bonus_coeff,
                train_iters=density_train_iters,
                bsize=density_batch_size,
                replay_size=int(replay_size))
            exploration.density_model.build_computation_graph()
        else:
            raise NotImplementedError

    ########################################################################

    # tensorflow: config, session, variable initialization
    agent.init_tf_sess()

    ########################################################################
    if dm != 'none':
        exploration.receive_tf_sess(agent.sess)
    ########################################################################

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)
        paths, timesteps_this_batch = agent.sample_trajectories(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = np.concatenate([path["reward"] for path in paths])
        next_ob_no = np.concatenate([path["next_observation"] for path in paths])
        terminal_n = np.concatenate([path["terminal"] for path in paths])

        ########################################################################
        # Modify the reward to include exploration bonus
        """
            1. Fit density model
                if dm == 'ex2':
                    the call to exploration.fit_density_model should return ll, kl, elbo
                else:
                    the call to exploration.fit_density_model should return nothing
            2. Modify the re_n with the reward bonus by calling exploration.modify_reward
        """
        old_re_n = re_n
        if dm == 'none':
            pass
        else:
            # 1. Fit density model
            if dm == 'ex2':
                ### PROBLEM 3
                ### YOUR CODE HERE
                ll, kl, elbo = exploration.fit_density_model(ob_no)
            elif dm == 'hist' or dm == 'rbf':
                ### PROBLEM 1
                ### YOUR CODE HERE
                exploration.fit_density_model(ob_no)
            else:
                assert False

            # 2. Modify the reward
            ### PROBLEM 1
            ### YOUR CODE HERE
            # raise NotImplementedError
            re_n = exploration.modify_reward(old_re_n, ob_no)

            print('average state', np.mean(ob_no, axis=0))
            print('average action', np.mean(ac_na, axis=0))

            # Logging stuff.
            # Only works for point mass.
            if env_name == 'PointMass-v0':
                np.save(os.path.join(dirname, '{}'.format(itr)), ob_no)
        ########################################################################
        agent.update_critic(ob_no, next_ob_no, re_n, terminal_n)
        adv_n = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        agent.update_actor(ob_no, ac_na, adv_n)

        if n_iter - itr < 10:
            max_reward_path_idx = np.argmax(np.array([path["reward"].sum() for path in paths]))
            print(paths[max_reward_path_idx]['reward'])

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        ########################################################################
        logz.log_tabular("Unmodified Rewards Mean", np.mean(old_re_n))
        logz.log_tabular("Unmodified Rewards Std", np.mean(old_re_n))
        logz.log_tabular("Modified Rewards Mean", np.mean(re_n))
        logz.log_tabular("Modified Rewards Std", np.mean(re_n))
        if dm == 'ex2':
            logz.log_tabular("Log Likelihood Mean", np.mean(ll))
            logz.log_tabular("Log Likelihood Std", np.std(ll))
            logz.log_tabular("KL Divergence Mean", np.mean(kl))
            logz.log_tabular("KL Divergence Std", np.std(kl))
            logz.log_tabular("Negative ELBo", -elbo)
        ########################################################################
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
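The Histogram/RBF/Exemplar exploration classes used above are not included in this snippet. As a rough, self-contained sketch of the same two-step interface (fit a density model, then add a bonus to the rewards), here is a count-based variant; the class name, the binning scheme, and the 1/sqrt(N(s)) bonus are illustrative assumptions, not the code used above.

import numpy as np

class CountBonus:
    """Count-based exploration bonus on discretized observations (illustrative only)."""
    def __init__(self, bonus_coeff=0.1, bin_width=0.25):
        self.bonus_coeff = bonus_coeff
        self.bin_width = bin_width
        self.counts = {}

    def _key(self, ob):
        return tuple(np.floor(ob / self.bin_width).astype(int))

    def fit_density_model(self, ob_no):
        # Update visit counts for each discretized observation in the batch.
        for ob in ob_no:
            k = self._key(ob)
            self.counts[k] = self.counts.get(k, 0) + 1

    def modify_reward(self, re_n, ob_no):
        # Add a bonus_coeff / sqrt(N(s)) bonus, a common count-based heuristic.
        bonus = np.array([self.bonus_coeff / np.sqrt(self.counts[self._key(ob)])
                          for ob in ob_no])
        return re_n + bonus

# Usage sketch:
#   expl = CountBonus(bonus_coeff=0.1)
#   expl.fit_density_model(ob_no)
#   re_n = expl.modify_reward(re_n, ob_no)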
Example #31
    def train(self, num_iter, state_filter=False):

        start = time.time()
        for i in range(num_iter):

            t1 = time.time()
            rewards = self.train_step(state_filter=state_filter)
            t2 = time.time()
            print('total time of one step', t2 - t1)
            print('Iteration', i + 1, 'done')
            print('AverageReward:', np.mean(rewards))
            print('StdRewards:', np.std(rewards))
            print('MaxRewardRollout:', np.max(rewards))
            print('MinRewardRollout:', np.min(rewards))

            # record weights and stats every n iterations
            if ((i + 1) % self.log_every == 0):
                rewards = self.aggregate_rollouts(
                    num_rollouts=self.eval_rollouts, evaluate=True)
                #w = ray.get(self.workers[0].get_weights.remote())
                if state_filter:
                    w = ray.get(
                        self.workers[0].get_weights_plus_stats.remote())
                else:
                    w = ray.get(self.workers[0].get_weights.remote())
                np.savez(self.logdir + "/lin_policy_plus", w)

                #print(sorted(self.params.items()))
                logz.log_tabular("Time", time.time() - start)
                logz.log_tabular("Iteration", i + 1)
                logz.log_tabular("AverageReward", np.mean(rewards))
                logz.log_tabular("StdRewards", np.std(rewards))
                logz.log_tabular("MaxRewardRollout", np.max(rewards))
                logz.log_tabular("MinRewardRollout", np.min(rewards))
                logz.log_tabular("Timesteps", self.timesteps)
                logz.log_tabular("LearningRate", self.optimizer.learning_rate)
                logz.log_tabular("DeltaStd", self.delta_std)
                logz.dump_tabular()

            if state_filter:
                t1 = time.time()
                # get statistics from all workers
                for j in range(self.num_workers):
                    self.policy.observation_filter.update(
                        ray.get(self.workers[j].get_filter.remote()))
                self.policy.observation_filter.stats_increment()

                # make sure master filter buffer is clear
                self.policy.observation_filter.clear_buffer()
                # sync all workers
                filter_id = ray.put(self.policy.observation_filter)
                setting_filters_ids = [
                    worker.sync_filter.remote(filter_id)
                    for worker in self.workers
                ]
                # waiting for sync of all workers
                ray.get(setting_filters_ids)

                increment_filters_ids = [
                    worker.stats_increment.remote() for worker in self.workers
                ]
                # waiting for increment of all workers
                ray.get(increment_filters_ids)
                t2 = time.time()
                print('Time to sync statistics:', t2 - t1)

        return
Example #32
def main_cartpole(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=False, logfile=None):
    env = gym.make("CartPole-v0")
    ob_dim = env.observation_space.shape[0]
    num_actions = env.action_space.n
    logz.configure_output_file(logfile)
    #vf = LinearValueFunction()
    vf = NeuralValueFunction(ob_dim)

    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in this function
    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations
    sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) # batch of actions taken by the policy, used for policy gradient computation
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate
    sy_h1 = tf.nn.relu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer
    sy_logits_na = dense(sy_h1, num_actions, "final", weight_init=normc_initializer(0.05)) # "logits", describing probability distribution of final layer
    # we use a small initialization for the last layer, so the initial policy has maximal entropy
    sy_oldlogits_na = tf.placeholder(shape=[None, num_actions], name='oldlogits', dtype=tf.float32) # logits BEFORE update (just used for KL diagnostic)
    sy_logp_na = tf.nn.log_softmax(sy_logits_na) # logprobability of actions
    sy_sampled_ac = categorical_sample_logits(sy_logits_na)[0] # sampled actions, used for defining the policy (NOT computing the policy gradient)
    sy_n = tf.shape(sy_ob_no)[0]
    sy_logprob_n = fancy_slice_2d(sy_logp_na, tf.range(sy_n), sy_ac_n) # log-prob of actions taken -- used for policy gradient calculation

    # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>>
    sy_oldlogp_na = tf.nn.log_softmax(sy_oldlogits_na)
    sy_oldp_na = tf.exp(sy_oldlogp_na) 
    sy_kl = tf.reduce_sum(sy_oldp_na * (sy_oldlogp_na - sy_logp_na)) / tf.to_float(sy_n)
    sy_p_na = tf.exp(sy_logp_na)
    sy_ent = tf.reduce_sum( - sy_p_na * sy_logp_na) / tf.to_float(sy_n)
    # <<<<<<<<<<<<<

    sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss")

    sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    sess = tf.Session()
    sess.__enter__()
    sess.run(tf.global_variables_initializer())

    total_timesteps = 0
    obs_mean = np.zeros(ob_dim)
    obs_std = np.ones(ob_dim)  # start from unit std so the first-iteration normalization is a no-op

    for i in range(n_iter):
        print("********** Iteration %i ************"%i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                if done:
                    break                    
            path = {"observation" : np.array(obs), "terminated" : terminated,
                    "reward" : np.array(rewards), "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        # Estimate advantage function
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vpred_t = vf.predict((path["observation"]-obs_mean)/(obs_std+1e-8))
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n-adv_n.mean())/(adv_n.std()+1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        obs_mean = np.average(ob_no,axis=0)
        obs_std = np.std(ob_no,axis=0)
        vf.fit((ob_no-obs_mean)/(obs_std+1e-8), vtarg_n)

        # Policy update
        _, oldlogits_na = sess.run([update_op, sy_logits_na], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n, sy_adv_n:standardized_adv_n, sy_stepsize:stepsize})
        kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_oldlogits_na:oldlogits_na})

        # Log diagnostics
        logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias
        logz.dump_tabular()
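The discount helper called above is not defined in this snippet; a minimal sketch of the discounted-return computation it presumably performs (an assumption based on how it is used) is:

import numpy as np

def discount(rewards, gamma):
    """out[t] = sum_{t' >= t} gamma**(t' - t) * rewards[t'], via one backward pass."""
    out = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out

# Example: discount(np.array([1.0, 1.0, 1.0]), 0.9) -> array([2.71, 1.9 , 1.  ])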
Example #33
def run_experiment(exp_params, learner_params, discriminator_params):
    # Experiment parameters
    file_location = exp_params.get('expert_samples_location', 'expert_data')
    prior_file_location = exp_params.get('prior_samples_location',
                                         'prior_data')
    env_name = exp_params.get('env_name', 'InvertedPendulum-v2')
    env_type = exp_params.get('env_type', 'expert')
    exp_name = exp_params.get('exp_name', '{}_{}'.format(env_name, env_type))
    exp_num = exp_params.get('exp_num', 0)
    epochs = exp_params.get('epochs', 100)
    test_runs_per_epoch = exp_params.get('test_runs_per_epoch', 10)
    steps_per_epoch = exp_params.get('steps_per_epoch', 1000)
    init_random_samples = exp_params.get('init_random_samples', 5000)
    training_starts = exp_params.get('training_starts', 0)
    episode_limit = exp_params.get('episode_limit', 200)
    return_threshold = exp_params.get('return_threshold', 1e4)
    return_agent_buffer = exp_params.get('return_agent_buffer', False)
    visualize_collected_observations = exp_params.get(
        'visualize_collected_observations', False)
    save_weights_checkpoints = exp_params.get('save_weights_checkpoints',
                                              False)

    # Learner parameters
    l_type = learner_params.get('l_type', 'TD3')
    l_buffer_size = learner_params.get('l_buffer_size', 10000)
    l_exploration_noise = learner_params.get('l_exploration_noise', 0.2)
    l_learning_rate = learner_params.get('l_learning_rate', 1e-3)
    l_batch_size = learner_params.get('l_batch_size', 128)
    l_updates_per_step = learner_params.get('l_updates_per_step', 1)
    l_act_delay = learner_params.get('l_act_delay', 2)
    l_gamma = learner_params.get('l_gamma', 0.99)
    l_polyak = learner_params.get('l_polyak', 0.995)
    l_train_actor_noise = learner_params.get('l_train_actor_noise', 0.1)
    l_entropy_coefficient = learner_params.get('l_entropy_coefficient', 0.2)
    l_tune_entropy_coefficient = learner_params.get(
        'l_tune_entropy_coefficient', True)
    l_target_entropy = learner_params.get('l_target_entropy', None)
    l_clip_actor_gradients = learner_params.get('l_clip_actor_gradients',
                                                False)

    # Discriminator parameters
    d_type = discriminator_params.get('d_type', 'latent')
    d_loss = discriminator_params.get('d_loss', 'ce')
    d_rew = discriminator_params.get('d_rew', 'mixed')
    d_rew_noise = discriminator_params.get('d_rew_noise', True)
    d_learning_rate = discriminator_params.get('d_learning_rate', 1e-3)
    d_mi_learning_rate = discriminator_params.get('d_mi_learning_rate', 1e-3)
    d_updates_per_step = discriminator_params.get('d_updates_per_step', 1)
    d_mi_updates_per_step = discriminator_params.get('d_mi_updates_per_step',
                                                     1)
    d_e_batch_size = discriminator_params.get('d_e_batch_size', 64)
    d_l_batch_size = discriminator_params.get('d_l_batch_size', 64)
    d_label_smoothing = discriminator_params.get('d_label_smoothing', 0.0)
    d_stability_constant = discriminator_params.get('d_stability_constant',
                                                    0.0)
    d_sn_discriminator = discriminator_params.get('d_sn_discriminator', False)
    d_mi_constant = discriminator_params.get('d_mi_constant', 0.0)
    d_adaptive_mi = discriminator_params.get('d_adaptive_mi', False)
    d_double_mi = discriminator_params.get('d_double_mi', False)
    d_use_min_double_mi = discriminator_params.get('d_use_min_double_mi',
                                                   False)
    d_max_mi = discriminator_params.get('d_max_mi', 1)
    d_min_mi = discriminator_params.get('d_min_mi', d_max_mi / 2)
    d_use_dual_mi = discriminator_params.get('d_use_dual_mi', False)
    d_mi_lagrangian_lr = discriminator_params.get('d_mi_lagrangian_lr', 1e-3)
    d_max_mi_constant = discriminator_params.get('d_max_mi_constant', 10)
    d_min_mi_constant = discriminator_params.get('d_min_mi_constant', 1e-4)
    d_unbiased_mi = discriminator_params.get('d_unbiased_mi', False)
    d_unbiased_mi_decay = discriminator_params.get('d_unbiased_mi_decay', 0.99)
    d_prior_mi_constant = discriminator_params.get('d_prior_mi_constant', 0.0)
    d_negative_priors = discriminator_params.get('d_negative_priors', False)
    d_max_mi_prior = discriminator_params.get('d_max_mi_prior', 0.05)
    d_min_mi_prior_constant = discriminator_params.get(
        'd_min_mi_prior_constant', 1e-4)
    d_clip_mi_predictions = discriminator_params.get('d_clip_mi_predictions',
                                                     False)
    d_pre_filters = discriminator_params.get('d_pre_filters', [32, 32, 1])
    d_hidden_units = discriminator_params.get('d_hidden_units', [32])
    d_mi_hidden_units = discriminator_params.get('d_mi_hidden_units', [32, 32])
    d_mi2_hidden_units = discriminator_params.get('d_mi2_hidden_units',
                                                  d_mi_hidden_units)
    d_pre_scale_stddev = discriminator_params.get('d_pre_scale_stddev', 1.0)
    n_expert_demos = discriminator_params.get('n_expert_demos', None)
    n_expert_prior_demos = discriminator_params.get('n_expert_prior_demos',
                                                    None)
    n_agent_prior_demos = discriminator_params.get('n_agent_prior_demos',
                                                   n_expert_prior_demos)

    if env_name == 'InvertedPendulum-v2':
        im_side = 32
        im_shape = [im_side, im_side]
        expert_prior_location = 'Expert' + env_name
        if env_type == 'expert':
            env = ExpertInvertedPendulumEnv()
            agent_prior_location = 'Expert' + env_name
        elif env_type == 'agent' or env_type == 'colored' or env_type == 'to_colored':
            env = AgentInvertedPendulumEnv()
            agent_prior_location = 'Agent' + env_name
        elif env_type == 'to_two':
            env = ExpertInvertedDoublePendulumEnv()
            agent_prior_location = 'ExpertInvertedDoublePendulum-v2'
        elif env_type == 'to_colored_two':
            env = AgentInvertedDoublePendulumEnv()
            agent_prior_location = 'AgentInvertedDoublePendulum-v2'
        else:
            raise NotImplementedError
    elif env_name == 'InvertedDoublePendulum-v2':
        im_side = 32
        im_shape = [im_side, im_side]
        expert_prior_location = 'ExpertInvertedDoublePendulum-v2'
        if env_type == 'expert':
            agent_prior_location = 'ExpertInvertedDoublePendulum-v2'
            env = ExpertInvertedDoublePendulumEnv()
        elif env_type == 'colored' or env_type == 'to_colored':
            env = AgentInvertedDoublePendulumEnv()
            agent_prior_location = 'AgentInvertedDoublePendulum-v2'
        elif env_type == 'to_one':
            agent_prior_location = 'ExpertInvertedPendulum-v2'
            env = ExpertInvertedPendulumEnv()
        elif env_type == 'agent' or env_type == 'to_colored_one':
            agent_prior_location = 'AgentInvertedPendulum-v2'
            env = AgentInvertedPendulumEnv()
        else:
            raise NotImplementedError
    elif env_name == 'ThreeReacherEasy-v2':
        im_side = 48
        im_shape = [im_side, im_side]
        expert_prior_location = 'Expert' + env_name
        if env_type == 'expert':
            env = ThreeReacherEasyEnv()
            agent_prior_location = 'Expert' + env_name
        elif env_type == 'agent' or env_type == 'to_two':
            agent_prior_location = 'ExpertReacherEasy-v2'
            env = ReacherEasyEnv()
        elif env_type == 'tilted' or env_type == 'to_tilted':
            agent_prior_location = 'AgentThreeReacherEasy-v2'
            env = Tilted3ReacherEasyEnv()
        elif env_type == 'to_tilted_two':
            env = TiltedReacherEasyEnv()
            agent_prior_location = 'AgentReacherEasy-v2'
        else:
            raise NotImplementedError
    elif env_name == 'ReacherEasy-v2':
        im_side = 48
        im_shape = [im_side, im_side]
        expert_prior_location = 'ExpertReacherEasy-v2'
        if env_type == 'expert':
            env = ReacherEasyEnv()
            agent_prior_location = 'ExpertReacherEasy-v2'
        elif env_type == 'agent' or env_type == 'tilted' or env_type == 'to_tilted':
            env = TiltedReacherEasyEnv()
            agent_prior_location = 'AgentReacherEasy-v2'
        elif env_type == 'to_three':
            env = ThreeReacherEasyEnv()
            agent_prior_location = 'ExpertThreeReacherEasy-v2'
        elif env_type == 'to_tilted_three':
            agent_prior_location = 'AgentThreeReacherEasy-v2'
            env = Tilted3ReacherEasyEnv()
        else:
            raise NotImplementedError
    elif env_name == 'Hopper-v2':
        im_side = 64
        im_shape = [im_side, im_side]
        expert_prior_location = 'Hopper-v2'
        if env_type == 'expert':
            env = HopperEnv()
            agent_prior_location = 'Hopper-v2'
        elif env_type == 'flexible':
            env = HopperFlexibleEnv()
            agent_prior_location = 'HopperFlexible-v2'
        else:
            raise NotImplementedError
    elif env_name == 'HalfCheetah-v2':
        im_side = 64
        im_shape = [im_side, im_side]
        expert_prior_location = 'HalfCheetah-v2'
        if env_type == 'expert':
            env = ExpertHalfCheetahEnv()
            agent_prior_location = 'HalfCheetah-v2'
        elif env_type == 'locked_legs':
            env = LockedLegsHalfCheetahEnv()
            agent_prior_location = 'LockedLegsHalfCheetah-v2'
        else:
            raise NotImplementedError
    elif env_name == 'Striker-v2':
        im_side = 48
        im_shape = [im_side, im_side]
        expert_prior_location = 'Striker-v2'
        if env_type == 'expert':
            env = StrikerEnv()
            agent_prior_location = 'Striker-v2'
        elif env_type == 'to_human':
            env = StrikerHumanSimEnv()
            agent_prior_location = 'StrikerHuman-v2'
        else:
            raise NotImplementedError
    elif env_name == 'StrikerHumanSim-v2':
        im_side = 48
        im_shape = [im_side, im_side]
        expert_prior_location = 'StrikerHumanSim-v2'
        if env_type == 'expert':
            env = StrikerHumanSimEnv()
            agent_prior_location = 'StrikerHumanSim-v2'
        elif env_type == 'to_robot':
            env = StrikerEnv()
            agent_prior_location = 'Striker-v2'
        else:
            raise NotImplementedError
    elif env_name == 'Pusher-v2':
        im_side = 48
        im_shape = [im_side, im_side]
        expert_prior_location = 'Pusher-v2'
        if env_type == 'expert':
            env = PusherEnv()
            agent_prior_location = 'Pusher-v2'
        elif env_type == 'to_human':
            env = PusherHumanSimEnv()
            agent_prior_location = 'PusherHuman-v2'
        else:
            raise NotImplementedError
    elif env_name == 'PusherHumanSim-v2':
        im_side = 48
        im_shape = [im_side, im_side]
        expert_prior_location = 'PusherHumanSim-v2'
        if env_type == 'expert':
            env = PusherHumanSimEnv()
            agent_prior_location = 'PusherHumanSim-v2'
        elif env_type == 'to_robot':
            env = PusherEnv()
            agent_prior_location = 'Pusher-v2'
        else:
            raise NotImplementedError
    else:
        raise NotImplementedError
    expert_buffer = DemonstrationsReplayBuffer(
        load_expert_trajectories(env_name,
                                 file_location,
                                 visual_data=True,
                                 load_ids=True,
                                 max_demos=n_expert_demos))
    expert_visual_data_shape = expert_buffer.get_random_batch(
        1)['ims'][0].shape
    print('Visual data shape: {}'.format(expert_visual_data_shape))
    past_frames = expert_visual_data_shape[0]
    print('Past frames: {}'.format(past_frames))
    if d_prior_mi_constant > 0.0 or d_negative_priors:
        prior_expert_buffer = DemonstrationsReplayBuffer(
            load_expert_trajectories(agent_prior_location,
                                     prior_file_location,
                                     visual_data=True,
                                     load_ids=True,
                                     max_demos=n_expert_prior_demos))
        prior_agent_buffer = DemonstrationsReplayBuffer(
            load_expert_trajectories(expert_prior_location,
                                     prior_file_location,
                                     visual_data=True,
                                     load_ids=True,
                                     max_demos=n_agent_prior_demos))
    else:
        prior_expert_buffer, prior_agent_buffer = None, None

    if d_type == 'latent':
        im_shape += [3]
    else:
        im_shape += [3 * past_frames]

    action_size = env.action_space.shape[0]
    if exp_num == -1:
        logz.configure_output_dir(None, True)
    else:
        log_dir = osp.join('experiments_data/',
                           '{}/{}'.format(exp_name, exp_num))
        logz.configure_output_dir(log_dir, True)

    params = {
        'exp': exp_params,
        'learner': learner_params,
        'discriminator': discriminator_params,
    }
    print(params)
    logz.save_params(params)
    if l_type == 'TD3':

        def make_actor():
            actor = Actor([
                tf.keras.layers.Dense(400,
                                      'relu',
                                      kernel_initializer='orthogonal'),
                tf.keras.layers.Dense(300,
                                      'relu',
                                      kernel_initializer='orthogonal'),
                tf.keras.layers.Dense(
                    action_size,
                    'tanh',
                    kernel_initializer=tf.keras.initializers.Orthogonal(0.01))
            ])
            return actor

        def make_critic():
            critic = Critic([
                tf.keras.layers.Dense(400,
                                      'relu',
                                      kernel_initializer='orthogonal'),
                tf.keras.layers.Dense(300,
                                      'relu',
                                      kernel_initializer='orthogonal'),
                tf.keras.layers.Dense(
                    1,
                    kernel_initializer=tf.keras.initializers.Orthogonal(0.01))
            ])
            return critic
    elif l_type == 'SAC':

        def make_actor():
            actor = StochasticActor([
                tf.keras.layers.Dense(256,
                                      'relu',
                                      kernel_initializer='orthogonal'),
                tf.keras.layers.Dense(256,
                                      'relu',
                                      kernel_initializer='orthogonal'),
                tf.keras.layers.Dense(
                    action_size * 2,
                    kernel_initializer=tf.keras.initializers.Orthogonal(0.01))
            ])
            return actor

        def make_critic():
            critic = Critic([
                tf.keras.layers.Dense(256,
                                      'relu',
                                      kernel_initializer='orthogonal'),
                tf.keras.layers.Dense(256,
                                      'relu',
                                      kernel_initializer='orthogonal'),
                tf.keras.layers.Dense(
                    1,
                    kernel_initializer=tf.keras.initializers.Orthogonal(0.01))
            ])
            return critic

        if l_target_entropy is None:
            l_target_entropy = -1 * (np.prod(env.action_space.shape))
    else:
        raise NotImplementedError

    d_optimizer = tf.keras.optimizers.Adam(learning_rate=d_learning_rate)
    d_mi_optimizer = tf.keras.optimizers.Adam(learning_rate=d_mi_learning_rate)
    d_mi_lagrangian_optimizer = tf.keras.optimizers.Adam(
        learning_rate=d_mi_lagrangian_lr)
    tfl = tf.keras.layers
    if d_type == 'latent':
        pre_layers = [tfl.Reshape(im_shape)]
    else:
        pre_layers = [tfl.Permute((2, 3, 1, 4)), tfl.Reshape(im_shape)]
    if (d_type == 'latent') or (not d_sn_discriminator):
        for filters in d_pre_filters[:-1]:
            pre_layers += [
                tfl.Conv2D(filters, 3, activation='tanh', padding='same'),
                tfl.MaxPooling2D(2, padding='same')
            ]
        pre_layers += [
            tfl.Conv2D(d_pre_filters[-1], 3, padding='same'),
            tfl.MaxPooling2D(2, padding='same'),
            tfl.Reshape([-1])
        ]
    else:
        for filters in d_pre_filters[:-1]:
            pre_layers += [
                SpectralNormalization(tfl.Conv2D(filters, 3, padding='same')),
                tfl.LeakyReLU(),
                tfl.MaxPooling2D(2, padding='same')
            ]
        pre_layers += [
            SpectralNormalization(
                tfl.Conv2D(d_pre_filters[-1], 3, padding='same')),
            tfl.MaxPooling2D(2, padding='same'),
            tfl.Reshape([-1])
        ]
    if d_sn_discriminator:
        disc_layers = [
            SpectralNormalization(tfl.Dense(units, activation='relu'))
            for units in d_hidden_units
        ]
        disc_layers.append(SpectralNormalization(tfl.Dense(1)))
    else:
        disc_layers = [
            tfl.Dense(units, activation='tanh') for units in d_hidden_units
        ]
        disc_layers.append(tfl.Dense(1))
    if d_type == 'latent':

        def make_pre():
            pre = GaussianPreprocessor(pre_layers, d_pre_scale_stddev)
            return pre

        def make_disc():
            disc = InvariantDiscriminator(disc_layers, d_stability_constant,
                                          d_rew)
            return disc
    else:

        def make_pre():
            pre = DeterministicPreprocessor(pre_layers)
            return pre

        def make_disc():
            disc = InvariantDiscriminator(disc_layers, d_stability_constant,
                                          d_rew)
            return disc

    mi_layers = [
        tfl.Dense(units, activation='tanh') for units in d_mi_hidden_units
    ]
    mi_layers.append(tfl.Dense(1))

    def make_mi_est():
        mi_est = MIEstimator(mi_layers)
        return mi_est

    if d_double_mi:
        mi2_layers = [
            tfl.Dense(units, activation='tanh') for units in d_mi2_hidden_units
        ]
        mi2_layers.append(tfl.Dense(1))

        def make_mi2_est():
            mi2_est = MIEstimator(mi2_layers)
            return mi2_est
    else:
        make_mi2_est = None

    l_optimizer = tf.keras.optimizers.Adam(l_learning_rate)
    if l_type == 'TD3':
        l_agent = DDPG(
            make_actor=make_actor,
            make_critic=make_critic,
            make_critic2=make_critic,
            actor_optimizer=l_optimizer,
            critic_optimizer=l_optimizer,
            gamma=l_gamma,
            polyak=l_polyak,
            train_actor_noise=l_train_actor_noise,
            clip_actor_gradients=l_clip_actor_gradients,
        )
    elif l_type == 'SAC':
        l_agent = SAC(
            make_actor=make_actor,
            make_critic=make_critic,
            make_critic2=make_critic,
            actor_optimizer=l_optimizer,
            critic_optimizer=l_optimizer,
            gamma=l_gamma,
            polyak=l_polyak,
            entropy_coefficient=l_entropy_coefficient,
            tune_entropy_coefficient=l_tune_entropy_coefficient,
            target_entropy=l_target_entropy,
            clip_actor_gradients=l_clip_actor_gradients,
        )
    else:
        raise NotImplementedError
    sampler = Sampler(env, episode_limit, init_random_samples, visual_env=True)

    gail = DisentanGAIL(
        agent=l_agent,
        make_discriminator=make_disc,
        make_preprocessing=make_pre,
        expert_buffer=expert_buffer,
        prior_expert_buffer=prior_expert_buffer,
        prior_agent_buffer=prior_agent_buffer,
        make_mi_estimator=make_mi_est,
        make_mi2_estimator=make_mi2_est,
        use_min_double_mi=d_use_min_double_mi,
        d_loss=d_loss,
        d_optimizer=d_optimizer,
        mi_optimizer=d_mi_optimizer,
        label_smoothing=d_label_smoothing,
        stab_const=d_stability_constant,
        mi_constant=d_mi_constant,
        adaptive_mi=d_adaptive_mi,
        max_mi=d_max_mi,
        min_mi=d_min_mi,
        prior_mi_constant=d_prior_mi_constant,
        negative_priors=d_negative_priors,
        max_mi_prior=d_max_mi_prior,
        use_dual_mi=d_use_dual_mi,
        mi_lagrangian_optimizer=d_mi_lagrangian_optimizer,
        max_mi_constant=d_max_mi_constant,
        min_mi_constant=d_min_mi_constant,
        min_mi_prior_constant=d_min_mi_prior_constant,
        unbiased_mi=d_unbiased_mi,
        clip_mi_predictions=d_clip_mi_predictions,
        unbiased_mi_decay=d_unbiased_mi_decay,
        im_side=im_side,
        past_frames=past_frames,
    )

    agent_buffer = LearnerAgentReplayBuffer(gail,
                                            l_buffer_size,
                                            reward_noise=d_rew_noise)
    test_input = expert_buffer.get_random_batch(1)
    test_input['obs'] = np.expand_dims((env.reset()['obs']).astype('float32'),
                                       axis=0)
    gail(test_input)
    gail.summary()

    mean_test_returns = []
    mean_test_std = []
    steps = []

    step_counter = 0
    logz.log_tabular('Iteration', 0)
    logz.log_tabular('Steps', step_counter)
    print('Epoch {}/{} - total steps {}'.format(0, epochs, step_counter))
    out = sampler.evaluate(l_agent, test_runs_per_epoch, False)
    mean_test_returns.append(out['mean'])
    mean_test_std.append(out['std'])
    steps.append(step_counter)
    for k, v in out.items():
        logz.log_tabular(k, v)
    logz.dump_tabular()
    for e in range(epochs):
        while step_counter < (e + 1) * steps_per_epoch:
            traj_data = sampler.sample_trajectory(l_agent, l_exploration_noise)
            agent_buffer.add(traj_data)
            n = traj_data['n']
            step_counter += traj_data['n']
            if step_counter > training_starts:
                gail.train(
                    agent_buffer=agent_buffer,
                    l_batch_size=l_batch_size,
                    l_updates=l_updates_per_step * n,
                    l_act_delay=l_act_delay,
                    d_updates=d_updates_per_step * n,
                    mi_updates=d_mi_updates_per_step * n,
                    d_e_batch_size=d_e_batch_size,
                    d_l_batch_size=d_l_batch_size,
                )

        logz.log_tabular('Iteration', e + 1)
        logz.log_tabular('Steps', step_counter)
        print('Epoch {}/{} - total steps {}'.format(e + 1, epochs,
                                                    step_counter))
        traj_test = sampler.sample_test_trajectories(l_agent, 0.0,
                                                     test_runs_per_epoch)
        out = log_trajectory_statistics(traj_test['ret'], False)
        mean_test_returns.append(out['mean'])
        mean_test_std.append(out['std'])
        steps.append(step_counter)
        for k, v in out.items():
            logz.log_tabular(k, v)
        logz.dump_tabular()
        if save_weights_checkpoints:
            weights_log_dir = 'experiments_data/{}/{}/{}/{}.h5'.format(
                exp_name, exp_num, 'weights', e)
            l_agent.save_weights(weights_log_dir)

        if visualize_collected_observations:
            training_sample = traj_data['ims'][-1, 0]
            print('Visualization of latest training sample')
            plt.imshow(training_sample)
            plt.show()
            test_sample = traj_test['ims'][-1, 0]
            print('Visualization of latest test sample')
            plt.imshow(test_sample)
            plt.show()
        if out['mean'] >= return_threshold:
            print('Early termination due to reaching return threshold')
            break

    if return_agent_buffer:
        return gail, sampler, agent_buffer
    else:
        return gail, sampler,
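A hedged usage sketch for run_experiment: the dictionary keys mirror the .get() lookups above, and every value is illustrative rather than taken from the source.

# Hypothetical invocation; values are placeholders, not the authors' settings.
exp_params = {
    'env_name': 'InvertedPendulum-v2',
    'env_type': 'expert',
    'epochs': 100,
    'steps_per_epoch': 1000,
    'episode_limit': 200,
}
learner_params = {'l_type': 'SAC', 'l_buffer_size': 10000, 'l_batch_size': 128}
discriminator_params = {'d_type': 'latent', 'd_rew': 'mixed', 'd_mi_constant': 0.1}

gail, sampler = run_experiment(exp_params, learner_params, discriminator_params)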
Example #34
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32,
             network_activation='tanh'
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getfullargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    
    #activation function for the network
    if network_activation=='relu':
        activation=torch.nn.functional.relu
    elif network_activation=='leaky_relu':
        activation=torch.nn.functional.leaky_relu
    else:
        activation=torch.nn.functional.tanh
    #todo: create policy
    actor=build_mlp(ob_dim, ac_dim, "actor",\
                             n_layers=n_layers, size=size, activation=activation, discrete=discrete)
    actor_loss=reinforce_loss
    actor_optimizer=torch.optim.Adam(actor.parameters(), lr=learning_rate)
    
    #todo: initialize Agent:
    
    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#
    if nn_baseline:
        critic=build_mlp(ob_dim,1,"nn_baseline",\
                                    n_layers=n_layers,size=size, discrete=discrete)
        critic_loss=nn.MSELoss()
        critic_optimizer=torch.optim.Adam(critic.parameters(), lr=learning_rate)
        

    #========================================================================================#
    # Training Loop
    #========================================================================================#
    
    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards, log_probs = [], [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                ob = torch.from_numpy(ob).float().unsqueeze(0)
                obs.append(ob)
                ac, log_prob = actor.run(ob)
                acs.append(ac)
                log_probs.append(log_prob)
                #format the action from policy
                if discrete:
                    ac = int(ac)
                else:
                    ac = ac.squeeze(0).numpy()
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : torch.cat(obs, 0),
                    "reward" : torch.Tensor(rewards),
                    "action" : torch.cat(acs, 0),
                    "log_prob" : torch.cat(log_probs, 0)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        ob_no = torch.cat([path["observation"] for path in paths], 0)
        ac_na = torch.cat([path["action"] for path in paths], 0)
                                   
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above). 
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where 
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t. 
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG 
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
        #       entire trajectory (regardless of which time step the Q-value should be for). 
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#
        q_n = []
        for path in paths:
            rewards = path['reward']
            num_steps = pathlength(path)
            R=[]
            if reward_to_go:
                for t in range(num_steps):
                    R.append((torch.pow(gamma, torch.arange(num_steps-t))*rewards[t:]).sum().view(-1,1))
                q_n.append(torch.cat(R))
            else:
                q_n.append((torch.pow(gamma, torch.arange(num_steps)) * rewards).sum() * torch.ones(num_steps, 1))
        q_n = torch.cat(q_n, 0)
        
        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#
        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            b_n = critic(ob_no)
            q_n_std = q_n.std()
            q_n_mean = q_n.mean()
            b_n_scaled = b_n * q_n_std + q_n_mean
            adv_n = (q_n - b_n_scaled).detach()
        else:
            adv_n = q_n
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
            # YOUR_CODE_HERE
            adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + np.finfo(np.float32).eps.item())
        
        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            target = (q_n - q_n_mean) / (q_n_std + np.finfo(np.float32).eps.item())
            critic_optimizer.zero_grad()
            c_loss = critic_loss(b_n, target)
            c_loss.backward()
            critic_optimizer.step()
            
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on 
        # the current batch of rollouts.
        # 
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below. 

        # YOUR_CODE_HERE
        log_probs = torch.cat([path["log_prob"] for path in paths], 0)
        actor_optimizer.zero_grad()
        loss = actor_loss(log_probs, adv_n, len(paths))
        print(loss)
        loss.backward()
        actor_optimizer.step()

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
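The reward-to-go loop above rebuilds each discounted tail sum from scratch, which is quadratic in the episode length. An equivalent linear-time backward recursion (a sketch, not the code used above) looks like this:

import torch

def reward_to_go(rewards, gamma):
    """q[t] = sum_{t' >= t} gamma**(t' - t) * rewards[t'], in one backward pass."""
    q = torch.zeros(len(rewards), 1)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q

# Example: reward_to_go(torch.tensor([1.0, 1.0, 1.0]), 0.9).squeeze()
# -> tensor([2.7100, 1.9000, 1.0000])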
Example #35
def run_vanilla_policy_gradient_experiment(args, vf_params, logdir, env, sess, continuous_control):
    """
    General purpose method to run vanilla policy gradients.
    Works for both continuous and discrete environments.

    Roughly inspired by starter code for this homework and
    https://github.com/DanielTakeshi/rl_algorithms/blob/master/vpg/main.py

    Thanks!

    Params
    ------
    args: arguments for vanilla policy gradient.
    vf_params: dict of params for value function
    logdir: where to store outputs or None if you don't want to store anything
    env: openai gym env
    sess: TF session
    continuous_control: boolean, if true then we do gaussian continuous control
    """

    ob_dim = env.observation_space.shape[0]

    if args.vf_type == 'linear':
        value_function = LinearValueFunction(**vf_params)
    elif args.vf_type == 'nn':
        value_function = NnValueFunction(session=sess, ob_dim=ob_dim)
    #value_function = LinearValueFunction()

    if continuous_control:
        ac_dim = env.action_space.shape[0]
        policy_fn = policies.GaussianPolicy(sess, ob_dim, ac_dim)
    else:
        ac_dim = env.action_space.n
        policy_fn = policies.DisceretePolicy(sess, ob_dim, ac_dim)


    sess.__enter__()  # equivalent to with sess, to reduce indentation
    tf.global_variables_initializer().run()
    total_timesteps = 0
    stepsize = args.initial_stepsize

    filterAction = 0.1
    stepMax = 100
    for i in range(args.n_iter):
        print("\n********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps.
        timesteps_this_batch = 0
        paths = []
        step = 0

        #if(filterAction > 1.0):
        #    filterAction = 1.0
        #else:
        #    filterAction = filterAction*1.1
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode = (
                len(paths) == 0 and (i % 10 == 0) and args.render)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = policy_fn.sample_action(ob)
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                step = step + 1
                if done:
                    step = 0
                    #print "done "
                    break
                #if done or step > stepMax:
                #    print "max steps: {}".format(stepMax)
                #    step = 0
                #    stepMax = stepMax + 2
                #    break
            path = {"observation": np.array(obs), "terminated": terminated,
                    "reward": np.array(rewards), "action": np.array(acs)}
            paths.append(path)
            timesteps_this_batch += utils.pathlength(path)
            if timesteps_this_batch > args.min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Estimate advantage function using baseline vf (these are lists!).
        # return_t: list of sum of discounted rewards (to end of
        # episode), one per time
        # vpred_t: list of value function's predictions of components of
        # return_t
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = utils.discount(rew_t, args.gamma)
            vpred_t = value_function.predict(path["observation"])
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update and **re-fit the baseline**.
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        std_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        value_function.fit(ob_no, vtarg_n)

        # Policy update, plus diagnostics stuff. Is there a better way to
        # handle the continuous vs discrete control cases?
        if continuous_control:
            surr_loss, oldmean_na, oldlogstd_a = policy_fn.update_policy(
                ob_no, ac_n, std_adv_n, stepsize)

            kl, ent = policy_fn.kldiv_and_entropy(
                ob_no, oldmean_na, oldlogstd_a
            )
        else:
            surr_loss, oldlogits_na = policy_fn.update_policy(
                ob_no, ac_n, std_adv_n, stepsize)
            kl, ent = policy_fn.kldiv_and_entropy(ob_no, oldlogits_na)

        # Step size heuristic to ensure that we don't take too large steps.
        if args.use_kl_heuristic:
            if kl > args.desired_kl * 2:
                stepsize /= 1.5
                print('PG stepsize -> %s' % stepsize)
            elif kl < args.desired_kl / 2:
                stepsize *= 1.5
                print('PG stepsize -> %s' % stepsize)
            else:
                print('PG stepsize OK')

        # Log diagnostics
        if i % args.log_every_t_iter == 0:
            logz.log_tabular("EpRewMean", np.mean(
                [path["reward"].sum() for path in paths]))
            logz.log_tabular("EpLenMean", np.mean(
                [utils.pathlength(path) for path in paths]))
            logz.log_tabular("KLOldNew", kl)
            logz.log_tabular("Entropy", ent)
            logz.log_tabular("EVBefore",
                             utils.explained_variance_1d(vpred_n, vtarg_n))
            logz.log_tabular("EVAfter",
                             utils.explained_variance_1d(value_function.predict(ob_no),
                                                         vtarg_n))
            logz.log_tabular("SurrogateLoss", surr_loss)
            logz.log_tabular("TimestepsSoFar", total_timesteps)
            # If you're overfitting, EVAfter will be much larger than EVBefore.
            # Note that we fit the value function AFTER using it to compute the
            # advantage, to avoid introducing bias.
            logz.dump_tabular()
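
# Hedged sketch of the explained-variance diagnostic logged above as EVBefore/EVAfter.
# utils.explained_variance_1d presumably computes 1 - Var[y - ypred] / Var[y]; this
# illustrative version is an assumption, not the project's actual helper:
def _explained_variance_1d_sketch(ypred, y):
    vary = np.var(y)
    if vary == 0:
        return np.nan
    return 1.0 - np.var(y - ypred) / vary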
Example #36
0
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # TODO: create the Agent

    # TODO: initialize the Agent

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`



    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = actor.run(ob)
                # TODO: type-check / convert the sampled action before stepping the env.
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            # One episode finished; perform the per-episode update here.
            finish_episode(actor, actor_optimizer, critic=None, critic_optimizer=None)
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch



        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
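
# The loop above calls finish_episode but the example does not show it. Below is a
# hedged sketch of a typical per-episode REINFORCE-style update in PyTorch; the name,
# signature, and the assumption that per-step log-probabilities were collected during
# the rollout are illustrative, not the original author's code:
def _finish_episode_sketch(actor_optimizer, log_probs, rewards, gamma=0.99):
    # Discounted returns, computed backwards over the episode.
    returns, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.insert(0, running)
    returns = torch.tensor(returns, dtype=torch.float32)
    # Normalizing returns within the episode is a common variance-reduction trick.
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    # REINFORCE objective: minimize -sum_t log pi(a_t|s_t) * R_t.
    loss = -(torch.stack(log_probs) * returns).sum()
    actor_optimizer.zero_grad()
    loss.backward()
    actor_optimizer.step()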
Example #37
0
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    # 
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) 
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) 

    # Define a placeholder for advantages
    sy_adv_n = TODO
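    # One standard choice for this placeholder (a hedged sketch; the skeleton
    # intentionally leaves the line above as TODO):
    #     sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)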


    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    # 
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over 
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken, 
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the 
    #      policy network output ops.
    #   
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = TODO
        sy_sampled_ac = TODO # Hint: Use the tf.multinomial op
        sy_logprob_n = TODO
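        # A hedged sketch of one common way to fill in the three TODOs above
        # (the "policy" scope name is an assumption):
        #     sy_logits_na = build_mlp(sy_ob_no, ac_dim, "policy", n_layers=n_layers, size=size)
        #     sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), axis=1)
        #     sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        #         labels=sy_ac_na, logits=sy_logits_na)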

    else:
        # YOUR_CODE_HERE
        sy_mean = TODO
        sy_logstd = TODO # logstd should just be a trainable variable, not a network output.
        sy_sampled_ac = TODO
        sy_logprob_n = TODO  # Hint: Use the log probability under a multivariate gaussian. 
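        # A hedged sketch for the continuous case (scope and variable names are assumptions):
        #     sy_mean = build_mlp(sy_ob_no, ac_dim, "policy", n_layers=n_layers, size=size)
        #     sy_logstd = tf.get_variable("logstd", shape=[ac_dim], dtype=tf.float32)
        #     sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal(tf.shape(sy_mean))
        #     sy_logprob_n = (-0.5 * tf.reduce_sum(
        #         tf.square((sy_ac_na - sy_mean) / tf.exp(sy_logstd)), axis=1)
        #         - tf.reduce_sum(sy_logstd))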



    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    loss = TODO # Loss function that we'll differentiate to get the policy gradient.
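    # Hedged sketch: the standard surrogate whose gradient is the policy gradient is
    #     loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n)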
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)


    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(
                                sy_ob_no, 
                                1, 
                                "nn_baseline",
                                n_layers=n_layers,
                                size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a 
        # neural network baseline. These will be used to fit the neural network baseline. 
        # YOUR_CODE_HERE
        baseline_update_op = TODO
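        # Hedged sketch (the target placeholder name is an assumption):
        #     sy_target_n = tf.placeholder(shape=[None], name="baseline_target", dtype=tf.float32)
        #     baseline_loss = tf.losses.mean_squared_error(labels=sy_target_n,
        #                                                  predictions=baseline_prediction)
        #     baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss)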


    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`
    tf.global_variables_initializer().run() #pylint: disable=E1101



    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above). 
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where 
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t. 
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG 
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
        #       entire trajectory (regardless of which time step the Q-value should be for). 
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#

        # YOUR_CODE_HERE
        q_n = TODO
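        # A hedged sketch covering both cases described above (illustrative only):
        #     q_n = []
        #     for path in paths:
        #         rews = path["reward"]
        #         T = len(rews)
        #         if reward_to_go:
        #             q = np.array([sum(gamma ** (tp - t) * rews[tp] for tp in range(t, T))
        #                           for t in range(T)])
        #         else:
        #             ret = sum(gamma ** tp * rews[tp] for tp in range(T))
        #             q = np.full(T, ret)
        #         q_n.append(q)
        #     q_n = np.concatenate(q_n)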

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = TODO
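            # Hedged sketch (rescaled to the Q-value statistics, per Hint #bl1):
            #     b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            #     b_n = b_n * (q_n.std() + 1e-8) + q_n.mean()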
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
            # YOUR_CODE_HERE
            pass
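            # Hedged sketch of the standard normalization:
            #     adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)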


        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            pass
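            # Hedged sketch (pairs with the baseline_update_op sketch above; targets
            # are rescaled per Hint #bl2):
            #     target_n = (q_n - q_n.mean()) / (q_n.std() + 1e-8)
            #     sess.run(baseline_update_op,
            #              feed_dict={sy_ob_no: ob_no, sy_target_n: target_n})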

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on 
        # the current batch of rollouts.
        # 
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below. 

        # YOUR_CODE_HERE
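        # Hedged sketch of the update call (optionally evaluate `loss` before and
        # after for the diagnostics mentioned above):
        #     sess.run(update_op,
        #              feed_dict={sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n})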


        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()