Example #1
File: ars.py  Project: zhan0903/ARS
    def __init__(self, env_name='HalfCheetah-v1',
                 policy_params=None,
                 num_workers=32, 
                 num_deltas=320, 
                 deltas_used=320,
                 delta_std=0.02, 
                 logdir=None, 
                 rollout_length=1000,
                 step_size=0.01,
                 shift='constant zero',
                 params=None,
                 seed=123):

        logz.configure_output_dir(logdir)
        logz.save_params(params)
        
        env = gym.make(env_name)
        
        self.timesteps = 0
        self.action_size = env.action_space.shape[0]
        self.ob_size = env.observation_space.shape[0]
        self.num_deltas = num_deltas
        self.deltas_used = deltas_used
        self.rollout_length = rollout_length
        self.step_size = step_size
        self.delta_std = delta_std
        self.logdir = logdir
        self.shift = shift
        self.params = params
        self.max_past_avg_reward = float('-inf')
        self.num_episodes_used = float('inf')

        
        # create shared table for storing noise
        print("Creating deltas table.")
        deltas_id = create_shared_noise.remote()
        self.deltas = SharedNoiseTable(ray.get(deltas_id), seed = seed + 3)
        print('Created deltas table.')

        # initialize workers with different random seeds
        print('Initializing workers.') 
        self.num_workers = num_workers
        self.workers = [Worker.remote(seed + 7 * i,
                                      env_name=env_name,
                                      policy_params=policy_params,
                                      deltas=deltas_id,
                                      rollout_length=rollout_length,
                                      delta_std=delta_std) for i in range(num_workers)]


        # initialize policy 
        if policy_params['type'] == 'linear':
            self.policy = LinearPolicy(policy_params)
            self.w_policy = self.policy.get_weights()
        else:
            raise NotImplementedError
            
        # initialize optimization algorithm
        self.optimizer = optimizers.SGD(self.w_policy, self.step_size)        
        print("Initialization of ARS complete.")
Example #2
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getargspec(Supervisor.__init__)[0]
    params = {k: locals_[k] if k in locals_ and not isinstance(locals_[k], types.FunctionType) and k != "self" else None for k in args}
    logz.save_params(params)
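The setup_logger variants in Examples #2 through #8 are meant to be called at the very top of the function whose arguments they record, with locals() passed in so the comprehension can look each argument value up by name. A minimal calling-pattern sketch, assuming the setup_logger above; the Supervisor signature shown here is illustrative only:

class Supervisor:
    def __init__(self, env_name='CartPole-v0', n_iter=100, logdir=None):
        # Capture the constructor arguments before anything else mutates the local scope.
        setup_logger(logdir, locals())
        ...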
Example #3
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)
Example #4
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    # args = inspect.getargspec(learn)[0]
    # params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(locals_.get("kwargs"))
Example #5
    def __init__(self, env_name='HalfCheetah-v1',
                 policy_params=None,
                 num_workers=32, 
                 num_deltas=320, 
                 deltas_used=320,
                 delta_std=0.02, 
                 logdir=None, 
                 rollout_length=1000,
                 step_size=0.01,
                 shift='constant zero',
                 params=None,
                 seed=123):

        logz.configure_output_dir(logdir)
        logz.save_params(params)
        
        env = minitaur_gym_env.MinitaurBulletEnv() #gym.make(env_name)
        
        self.timesteps = 0
        self.action_size = env.action_space.shape[0]
        self.ob_size = env.observation_space.shape[0]
        self.num_deltas = num_deltas
        self.deltas_used = deltas_used
        self.rollout_length = rollout_length
        self.step_size = step_size
        self.delta_std = delta_std
        self.logdir = logdir
        self.shift = shift
        self.params = params
        self.max_past_avg_reward = float('-inf')
        self.num_episodes_used = float('inf')

        
        # create shared table for storing noise
        print("Creating deltas table.")
        deltas_id = create_shared_noise.remote()

        self.deltas = SharedNoiseTable(ray.get(deltas_id), seed = seed + 3)
        print('Created deltas table.')

        # initialize workers with different random seeds
        print('Initializing workers.') 
        self.num_workers = num_workers
        self.workers = [Worker.remote(seed + 7 * i,
                                      env_name=env_name,
                                      policy_params=policy_params,
                                      deltas=deltas_id,
                                      rollout_length=rollout_length,
                                      delta_std=delta_std) for i in range(num_workers)]

        # initialize policy 
        if policy_params['type'] == 'linear':
            self.policy = LinearPolicy(policy_params)
            self.w_policy = self.policy.get_weights()
        else:
            raise NotImplementedError
            
        # initialize optimization algorithm
        self.optimizer = optimizers.SGD(self.w_policy, self.step_size)        
        print("Initialization of ARS complete.")
Example #6
def setup_logger(logdir, locals_):
  # Configure output directory for logging
  logz.configure_output_dir(logdir)
  # Log experimental parameters
  args = inspect.getargspec(QLearner)[0]
  params = {k: str(locals_[k]) if k in locals_ else None for k in args}
  params['exp_name'] = locals_['q_func'].__name__ + locals_['double_q'] * '_doubleQ'
  logz.save_params(params)
Example #7
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    seed = np.random.get_state()[1][0]
    logz.configure_output_dir(logdir + '/%s/' % seed)
    # Log experimental parameters
    params = {k: str(locals_[k]) for k in locals_ if '__' not in k}
    params['seed'] = str(seed)
    logz.save_params(params)
Example #8
def setup_logger(logdir, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    params = {k: locals_[k] if k in locals_ else None for k in args}
    # print(params.items())
    # print(json.dumps(list(params.values())))
    logz.save_params(params)
Example #9
    def __init__(
        self,
        organism_builder=None,
        logdir=None,
        params=None,
        master_organism=None,
        sampler_builder=None,
    ):

        logz.configure_output_dir(logdir)
        logz.save_params(params)

        # env = env_registry.get_env_constructor(params['env_name'])()

        self.logdir = logdir
        self.params = params
        self.max_past_avg_reward = float('-inf')
        self.num_episodes_used = float('inf')

        # create shared table for storing noise
        print("Creating deltas table.")
        deltas_id = create_shared_noise_serial()
        self.deltas = SharedNoiseTable(deltas_id, seed=params['seed'] + 3)
        print('Created deltas table.')

        ########################################################

        self.master_organism = master_organism

        self.sampler = sampler_builder(
            num_deltas=params['n_directions'],
            shift=params['shift'],
            num_workers=params['n_workers'],
            seed=params['seed'],
            env_name=params['env_name'],
            organism_builder=
            organism_builder,  #lambda: ARS_LinearAgent(agent_args)
            deltas_id=deltas_id,
            rollout_length=params['rollout_length'],
            delta_std=params['delta_std'],
        )

        # Design note: Sampler and Agent may eventually need to be merged.
        # The agent holds the parameters, while the sampler takes the agent and runs the
        # parallel rollouts. So the agent should not own the workers at all -- it should
        # just contain the parameters -- and the sampler is the component that takes a
        # single agent and creates a set of workers modeled after it.

        self.rl_alg = ARS_RL_Alg(
            deltas=self.deltas,  # noise table
            num_deltas=params['n_directions'],  # N
            deltas_used=params['deltas_used']  # b
        )
Example #10
def run_model(session, predict, loss, train_step, saver, images, labels, X, y,
              epochs=1, batch_size=64, print_every=100, is_test=False):
    if not is_test:
        # Configure output directory for logging
        logz.configure_output_dir('logs')

        # Log experimental parameters
        args = inspect.getargspec(main)[0] # Get the names and default values of a function's parameters.
        locals_ = locals() # Return a dictionary containing the current scope's local variables
        params = {k: locals_[k] if k in locals_ else None for k in args}
        logz.save_params(params)

    # have tensorflow compute accuracy
    correct_prediction = tf.equal(tf.argmax(predict, axis=1), tf.argmax(y, axis=1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
    # counter
    iter_cnt = 0
    iters_each_epoch = len(images)//batch_size - 1
    for e in range(epochs):
        # keep track of losses and accuracy
        correct = 0
        losses = []
        # make sure we iterate over the dataset once
        images, labels = shuffle_dataset(images, labels)
        for i in range(iters_each_epoch):
            # slice out the current minibatch, starting at batch 0 so the first batch is not skipped
            batch_X = images[i*batch_size:(i+1)*batch_size]
            batch_y = labels[i*batch_size:(i+1)*batch_size]
            feed_dict = {X: batch_X, y: batch_y}
            
            # have tensorflow compute loss and correct predictions
            # and (if given) perform a training step
            l, corr, _ = session.run([loss, correct_prediction, train_step],feed_dict=feed_dict)

            # aggregate performance stats
            losses.append(l*batch_size)
            correct += np.sum(corr)
            
            # print every now and then
            if (iter_cnt % print_every) == 0 and not is_test:
                logz.log_tabular("Iteration", iter_cnt)
                logz.log_tabular("minibatch_loss", l)
                logz.log_tabular("minibatch_accuracy", np.sum(corr)/batch_size)
                logz.dump_tabular()
                logz.pickle_tf_vars()

            iter_cnt += 1
        if is_test:
            total_correct = correct/len(images)
            total_loss = np.sum(losses)/len(images)
            print('acc:', total_correct)
            print('los:', total_loss)
        else:
            saver.save(session, 'checkpoints/mnist_plus', iter_cnt)
Example #11
def setup_logger(logdir, params):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experimental parameters
    # args = inspect.getargspec(learn)[0]
    check_params = params.copy()
    log_params = params.copy()
    for param in check_params.keys():
        try:
            json.dumps(check_params[param])
        except:
            del log_params[param]
    logz.save_params(log_params)
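Example #11 keeps only the parameters that json.dumps can serialize, since logz.save_params typically writes the dictionary out as JSON. A small sketch of the same filter written as a helper plus a comprehension, offered as an equivalent rather than the author's code:

import json

def json_safe_params(params):
    """Return only the entries that json.dumps can serialize (equivalent to the filter above)."""
    def serializable(value):
        try:
            json.dumps(value)
            return True
        except (TypeError, ValueError):
            return False
    return {k: v for k, v in params.items() if serializable(v)}

# logz.save_params(json_safe_params(params))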
Example #12
def main():
    # Create the environment (LunarLander-v2).
    task = gym.make('LunarLander-v2')

    file_dir = osp.dirname(osp.abspath(__file__))
    unique_name = datetime.datetime.now(dateutil.tz.tzlocal()).strftime(
        '%Y_%m_%d_%H_%M_%S_%f_%Z') + '__' + str(uuid.uuid4())
    result_dir = osp.join(file_dir, unique_name)

    logz.configure_output_dir(result_dir)
    logz.save_params(dict(exp_name=unique_name, ))

    # Run training
    seed = 1
    print('random seed = %d' % seed)
    env = get_env(task, seed, result_dir)
    session = get_session()
    atari_learn(env, session, num_timesteps=5e5, result_dir=result_dir)
Example #13
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)  # a log directory must be supplied here

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    # seed: fixing the random seed makes the sequence of random numbers reproducible across runs (ref: https://en.wikipedia.org/wiki/Random_seed)
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    # 
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) 
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) 

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) 
    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    # 
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over 
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken, 
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the 
    #      policy network output ops.
    #   
    #========================================================================================#

    if discrete:
        sy_logits_na = build_mlp(sy_ob_no, ac_dim, "discrete", n_layers, size, activation=tf.nn.relu, output_activation=tf.nn.relu)
        #print(sy_logits_na.shape)
        #env_actions=tf.concat(axis=1,values=[sy_logits_na,1-sy_logits_na])
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1, seed), [-1])
        sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na)

    else:
        # YOUR_CODE_HERE
        #sy_mean =-tf.reduce_mean(build_mlp(sy_ob_no,ac_dim,"cont",n_layers,size,activation=tf.tanh))
        #sy_logstd = tf.Variable(tf.random_uniform([None, ac_dim])) # logstd should just be a trainable variable, not a network output.
        #sy_sampled_ac = tf.random_normal([None, ac_dim],sy_mean,sy_logstd,dtype=tf.float32,seed=seed)
        #sy_logprob_n = -0.5*(sy_sampled_ac-sy_ac_na)^2  # Hint: Use the log probability under a multivariate gaussian. 
        print("Continous System")
        


    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#
    
    loss = tf.reduce_mean(tf.multiply(sy_logprob_n,sy_adv_n)) # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)


    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(
                                sy_ob_no, 
                                1, 
                                "nn_baseline",
                                n_layers=n_layers,
                                size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a 
        # neural network baseline. These will be used to fit the neural network baseline. 
        # YOUR_CODE_HERE
        baseline_target=tf.placeholder(shape=[None], name="tr", dtype=tf.float32)
        baseline_loss=tf.placeholder(shape=[None], name="lo", dtype=tf.float32)
        #baseline_update_op=tf.placeholder(shape=[None], name="up", dtype=tf.float32)
        b_loss=tf.losses.mean_squared_error(labels=baseline_target,predictions=baseline_prediction)
        baseline_update_op=tf.train.AdamOptimizer(learning_rate).minimize(b_loss)
        #baseline_loss=(baseline_prediction-baseline_target)**2
        #baseline_update_op=tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss)
        

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    with sess:  # enter the configured session rather than opening a second, unconfigured one
        sess.run(tf.global_variables_initializer()) #pylint: disable=E1101



        #========================================================================================#
        # Training Loop
        #========================================================================================#

        total_timesteps = 0

        for itr in range(n_iter):
            print("********** Iteration %i ************"%itr)

            # Collect paths until we have enough timesteps
            timesteps_this_batch = 0
            paths = []
            while True:
                ob = env.reset()
                obs, acs, rewards = [], [], []
                animate_this_episode=( (itr % 10 == 0) and animate)
                steps = 0
                while True:
                    if animate_this_episode:
                        env.render()
                        time.sleep(0.05)
                    obs.append(ob)
                    ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no :[ob]})
                    ac = ac[0]
                    acs.append(ac)
                    ob, rew, done, _ = env.step(ac)
                    rewards.append(rew)
                    steps += 1
                    if done or steps > max_path_length:
                        break
                path = {"observation" : np.array(obs), 
                        "reward" : np.array(rewards), 
                        "action" : np.array(acs)}
                paths.append(path)
                timesteps_this_batch += pathlength(path)
                if timesteps_this_batch > min_timesteps_per_batch:
                    break
            total_timesteps += timesteps_this_batch

            # Build arrays for observation, action for the policy gradient update by concatenating 
            # across paths
            ob_no = np.concatenate([path["observation"] for path in paths])
            ac_na = np.concatenate([path["action"] for path in paths])
            print("action batch shape:", ac_na.shape)

            #====================================================================================#
            #                           ----------SECTION 4----------
            # Computing Q-values
            #
            # Your code should construct numpy arrays for Q-values which will be used to compute
            # advantages (which will in turn be fed to the placeholder you defined above). 
            #
            # Recall that the expression for the policy gradient PG is
            #
            #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
            #
            # where 
            #
            #       tau=(s_0, a_0, ...) is a trajectory,
            #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
            #       and b_t is a baseline which may depend on s_t. 
            #
            # You will write code for two cases, controlled by the flag 'reward_to_go':
            #
            #   Case 1: trajectory-based PG 
            #
            #       (reward_to_go = False)
            #
            #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
            #       entire trajectory (regardless of which time step the Q-value should be for). 
            #
            #       For this case, the policy gradient estimator is
            #
            #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
            #
            #       where
            #
            #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
            #
            #       Thus, you should compute
            #
            #           Q_t = Ret(tau)
            #
            #   Case 2: reward-to-go PG 
            #
            #       (reward_to_go = True)
            #
            #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
            #       from time step t. Thus, you should compute
            #
            #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
            #
            #
            # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
            # like the 'ob_no' and 'ac_na' above. 
            #
            #====================================================================================#

            # YOUR_CODE_HERE
            print(total_timesteps)
            Q_t=[]
            if(reward_to_go): # Case 2: reward-to-go PG (note: as written, every timestep gets the same reverse-discounted full-trajectory sum)
                for no_traj in range(len(paths)):
                    for _ in range(np.size((paths[no_traj])["reward"])):
                        temp_rew=0
                        t_=np.size((paths[no_traj])["reward"])-1
                        for no_rew in range(t_+1):
                            temp_rew+=(math.pow(gamma,t_-no_rew)*(((paths[no_traj])["reward"])[no_rew,]))
                            
                        Q_t.append(temp_rew)


            else:#   Case 1: trajectory-based PG 
                count =0
                index=len(paths)
                i=0
                t_=0
                while(count<=total_timesteps and i <index):
                    for _ in range (np.size((paths[i])["reward"])):
                        Q_t.append((math.pow(gamma,total_timesteps-t_)*((paths[i])["reward"])[_,]))
                        t_+=1
                    count+=np.size((paths[i])["reward"])
                    i+=1

            q_n=Q_t
            print(len(q_n))

            
                
            #====================================================================================#
            #                           ----------SECTION 5----------
            # Computing Baselines
            #====================================================================================#

            if nn_baseline:
                # If nn_baseline is True, use your neural network to predict reward-to-go
                # at each timestep for each trajectory, and save the result in a variable 'b_n'
                # like 'ob_no', 'ac_na', and 'q_n'.
                #
                # Hint #bl1: rescale the output from the nn_baseline to match the statistics
                # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
                # #bl2 below.)

                b_n = sess.run(baseline_prediction,feed_dict={sy_ob_no:ob_no})
                b_n = preprocessing.scale(b_n)
                adv_n = q_n - b_n
            else:
                adv_n = q_n.copy()

            #====================================================================================#
            #                           ----------SECTION 4----------
            # Advantage Normalization
            #====================================================================================#

            if normalize_advantages:
                # On the next line, implement a trick which is known empirically to reduce variance
                # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
                # YOUR_CODE_HERE
                adv_n = preprocessing.scale(adv_n)


            #====================================================================================#
            #                           ----------SECTION 5----------
            # Optimizing Neural Network Baseline
            #====================================================================================#
            if nn_baseline:
                # ----------SECTION 5----------
                # If a neural network baseline is used, set up the targets and the inputs for the 
                # baseline. 
                # 
                # Fit it to the current batch in order to use for the next iteration. Use the 
                # baseline_update_op you defined earlier.
                #
                # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
                # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

                # YOUR_CODE_HERE
                target_tmp=1+gamma*b_n
                target_tmp=preprocessing.scale(target_tmp)
                sess.run(b_loss,feed_dict={sy_ob_no:ob_no,baseline_target:target_tmp})
                sess.run(baseline_update_op,feed_dict={sy_ob_no:ob_no,baseline_target:target_tmp})


                

            #====================================================================================#
            #                           ----------SECTION 4----------
            # Performing the Policy Update
            #====================================================================================#

            # Call the update operation necessary to perform the policy gradient update based on 
            # the current batch of rollouts.
            # 
            # For debug purposes, you may wish to save the value of the loss function before
            # and after an update, and then log them below. 

            # YOUR_CODE_HERE
            #print(sess.run(sy_logits_na,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n}))
            #print(sess.run(sy_sampled_ac,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n}))
            loss_=sess.run(loss,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n})
            sess.run(update_op,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n})
            loss_=sess.run(loss,feed_dict={sy_ob_no:ob_no,sy_ac_na:ac_na,sy_adv_n:adv_n})

            
            



            # Log diagnostics
            returns = [path["reward"].sum() for path in paths]
            ep_lengths = [pathlength(path) for path in paths]
            logz.log_tabular("Time", time.time() - start)
            logz.log_tabular("Iteration", itr)
            logz.log_tabular("AverageReturn", np.mean(returns))
            logz.log_tabular("StdReturn", np.std(returns))
            logz.log_tabular("MaxReturn", np.max(returns))
            logz.log_tabular("MinReturn", np.min(returns))
            logz.log_tabular("EpLenMean", np.mean(ep_lengths))
            logz.log_tabular("EpLenStd", np.std(ep_lengths))
            logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) 
            logz.log_tabular("TimestepsSoFar", total_timesteps)
            logz.log_tabular("loss_",loss_)
            logz.dump_tabular()
            logz.pickle_tf_vars()
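The Q-value block in Example #13 is hard to follow and, as noted in the inline comment, its reward-to-go branch assigns every timestep the same reverse-discounted full-trajectory sum instead of starting the sum at t. A compact NumPy sketch of the two estimators exactly as the comment block specifies them; the function and variable names here are my own, not from the assignment code:

import numpy as np

def trajectory_q_values(rewards, gamma, reward_to_go):
    """Per-timestep Q estimates for one trajectory, following the two documented cases."""
    rewards = np.asarray(rewards, dtype=np.float64)
    T = len(rewards)
    if reward_to_go:
        # Case 2: Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        q = np.zeros(T)
        running = 0.0
        for t in reversed(range(T)):
            running = rewards[t] + gamma * running
            q[t] = running
        return q
    # Case 1: every timestep gets the full discounted return Ret(tau) = sum_t gamma^t * r_t
    discounts = gamma ** np.arange(T)
    return np.full(T, np.sum(discounts * rewards))

# q_n = np.concatenate([trajectory_q_values(p["reward"], gamma, reward_to_go) for p in paths])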
Example #14
def train_SAC(env_name, exp_name, n_iter, ep_len, seed, logdir, alpha,
              prefill_steps, discount, batch_size, learning_rate, tau, two_qf):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, alpha)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': batch_size,
        'discount': discount,
        'learning_rate': learning_rate,
        'reparameterize': True,
        'tau': tau,
        'epoch_length': ep_len,
        'n_epochs': n_iter,
        'two_qf': two_qf,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': prefill_steps,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }

    value_function_params = {
        'hidden_layer_sizes': (64, 64),
    }

    q_function_params = {
        'hidden_layer_sizes': (64, 64),
    }

    policy_params = {
        'hidden_layer_sizes': (64, 64),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)
    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(
        name='value_function', **value_function_params)
    target_value_function = nn.ValueFunction(
        name='target_value_function', **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        **policy_params)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True  # may need if using GPU
    with tf.Session(config=tf_config):
        algorithm.build(
            env=env,
            policy=policy,
            q_function=q_function,
            q_function2=q_function2,
            value_function=value_function,
            target_value_function=target_value_function)

        for epoch in algorithm.train(sampler, n_epochs=algorithm_params.get('n_epochs', 1000)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
Example #15
def train_PG(
        exp_name='',
        batch_size=250,
        n_episodes=25000,
        learning_rate=1e-3,
        logdir=None,
        seed=0,
        # network arguments
        n_layers=2,
        size=64):

    env = Environment()
    agent1 = Agent(env, n_layers, size, learning_rate, "agent1")
    agent2 = Agent(env, n_layers, size, learning_rate, "agent2")
    agent1_Nash = Agent(env, 3, 32, 1e-2, "agent1_Nash")
    agent2_Nash = Agent(env, 3, 32, 1e-2, "agent2_Nash")

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    n_iter = n_episodes // batch_size

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)
        #simulate a batch of temperature-gas price states
        s = env.samplestatess(batch_size)

        ag1_prices, _ = agent1.sample_actions(sess, s)
        ag2_prices, _ = agent2.sample_actions(sess, s)

        #====================================================================================#
        # Feed agents' actions into the market simulator and obtain corresponding rewards
        #====================================================================================#
        #Convert agent RTM actions to corresponding prices
        ag1_rewards, ag2_rewards = get_rewards(env, ag1_prices, ag2_prices)

        #====================================================================================#
        #
        # Advantage Normalization
        #====================================================================================#
        ag1_adv = normalize(ag1_rewards)
        ag2_adv = normalize(ag2_rewards)

        #====================================================================================#
        #
        # Performing the Policy Update
        #====================================================================================#
        #update policy parameters for agent1
        #if (itr % 20 < 10):
        loss1 = agent1.improve_policy(sess, s, ag1_adv, ag1_prices)
        #update policy parameters for agent2
        #else:
        loss2 = agent2.improve_policy(sess, s, ag2_adv, ag2_prices)

        # Log diagnostics
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageProfit_agt1", np.mean(ag1_rewards))
        logz.log_tabular("AverageProfit_agt2", np.mean(ag2_rewards))

        logz.log_tabular("Agt1_StdReturn", np.std(ag1_rewards))
        logz.log_tabular("Agt2_StdReturn", np.std(ag2_rewards))

        logz.log_tabular("Agt1_MaxReturn", np.max(ag1_rewards))
        logz.log_tabular("Agt2_MaxReturn", np.max(ag2_rewards))

        logz.log_tabular("Agt1_MinReturn", np.min(ag1_rewards))
        logz.log_tabular("Agt2_MinReturn", np.min(ag2_rewards))

        logz.dump_tabular()
        logz.pickle_tf_vars()

    m1, m2, m1_m, m2_m, ag1_p, ag2_p = get_smart_rewards(
        sess, agent1, agent2, env)
    print("Agent1 Stochastic Profit: " + repr(m1))
    print("Agent2 Stochastic Profit: " + repr(m2))

    print("Agent1 Deterministic Profit: " + repr(m1_m))
    print("Agent2 Deterministic Profit: " + repr(m2_m))

    print("Agent1 Mean Price")
    print(ag1_p)
    print("Agent2 Prices")
    print(ag2_p)

    print("Assessing degree of deviation from Nash Eq")
    ag1_imp, ag2_imp = assess_policy_accuracy(sess, agent1, agent1_Nash,
                                              agent2, agent2_Nash, env)
    print("Agent1 Accuracy: " + repr(ag1_imp))
    print("Agent2 Accuracy: " + repr(ag2_imp))
Example #16
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    # 
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) 
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) 

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name = "adv", dtype=tf.float32)


    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    # 
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over 
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken, 
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the 
    #      policy network output ops.
    #   
    #========================================================================================#

    if discrete:
        sy_logits_na = build_mlp(sy_ob_no, ac_dim, "scope", n_layers, size)
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1, seed = seed), [-1])
        sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na, logits=sy_logits_na) #[None]

    else:
        sy_mean = build_mlp(sy_ob_no, ac_dim, "scope", n_layers, size)               # [None, ac_dim]
        # logstd should just be a trainable variable, not a network output.
        sy_logstd = tf.get_variable("logstd", shape = [ac_dim, 1], trainable = True, initializer = tf.contrib.layers.xavier_initializer())
        z = tf.random_normal(tf.shape(sy_mean), mean = 0.0, stddev = 1.0, seed = seed)  # [None, ac_dim]
        sigma = tf.reshape(tf.exp(sy_logstd), [1, ac_dim])                           # [1, ac_dim] STANDARD DEVIATION
        sy_sampled_ac = (sigma * z) + sy_mean                                        # [None, ac_dim]
        # Hint: Use the log probability under a multivariate gaussian. 
        # diff = sy_ac_na - sy_mean
        # # the implementation below is by hand and assumes that sigma is covariance, though i've changed it to be SD instead.
        # first_term = -0.5 * tf.diag_part(tf.matmul(diff, tf.matmul(tf.matrix_inverse(sigma), tf.transpose(diff))))
        # second_term = -0.5 * ac_dim * tf.log(tf.norm(sigma))
        # third_term = -0.5 * ac_dim * tf.log(2*math.pi)
        # sy_logprob_n = first_term + second_term + third_term # [None, 1]
        sy_logprob_n = -tf.contrib.distributions.MultivariateNormalDiag(loc=sy_mean, scale_diag=sigma).log_prob(sy_ac_na) # [None]


    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    loss =  tf.reduce_mean(sy_logprob_n * sy_adv_n) # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)


    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a 
        # neural network baseline. These will be used to fit the neural network baseline. 
        sy_value_n = tf.placeholder(shape=[None], name = "V", dtype=tf.float32)
        baseline_loss = tf.losses.mean_squared_error(sy_value_n, baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(baseline_loss)


    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`
    tf.global_variables_initializer().run() #pylint: disable=E1101



    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above). 
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where 
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t. 
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG 
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
        #       entire trajectory (regardless of which time step the Q-value should be for). 
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#

        rewards_n = [path["reward"] for path in paths]
        if not reward_to_go:
            weighted_rewards = np.array([[(gamma**i)*r for i, r in enumerate(row)] for row in rewards_n])
            q_sums = [sum(row) for row in weighted_rewards]
            q_n = np.hstack(np.array([[q_sums[i]]*len(weighted_rewards[i]) for i in range(len(weighted_rewards))])) # [None]
        else:
            q_n = np.hstack(np.array([[sum(map_gamma(row[i:], gamma)) for i in range(len(row))] for row in rewards_n])) # [None]


        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            
            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no : ob_no})
            b_n = tf.nn.l2_normalize(b_n, 0)
            q_mean = np.mean(q_n)
            q_std = np.std(q_n)
            b_n = b_n * q_std
            b_n = b_n + q_mean
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
            adv_n = tf.nn.l2_normalize(adv_n, 0)  # note: l2_normalize divides by the L2 norm, not a mean-0/std-1 standardization


        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            q_n = tf.nn.l2_normalize(q_n, 0)
            sess.run(baseline_update_op, feed_dict={sy_ob_no : ob_no, sy_value_n : q_n.eval()})

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on 
        # the current batch of rollouts.
        # 
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below. 

        _, l_next = sess.run([update_op, loss], feed_dict = {sy_ob_no : ob_no, sy_ac_na : ac_na, sy_adv_n : adv_n.eval()})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.log_tabular("Loss After Update", l_next)
        logz.dump_tabular()
        logz.pickle_tf_vars()
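In Example #16 the advantage normalization and the baseline-target rescaling use tf.nn.l2_normalize, which divides by the vector's L2 norm rather than producing the zero-mean, unit-std values the comments ask for, and it also turns NumPy arrays into TensorFlow tensors, which is why the later feed_dict calls need .eval(). A small NumPy sketch of the standardization the comments describe, offered as an alternative rather than the author's method:

import numpy as np

def standardize(x, eps=1e-8):
    """Rescale to mean 0 and std 1, which is the trick the comments refer to."""
    x = np.asarray(x, dtype=np.float64)
    return (x - x.mean()) / (x.std() + eps)

# adv_n   = standardize(adv_n)   # advantage normalization, stays a NumPy array
# targets = standardize(q_n)     # rescaled targets for fitting the nn_baseline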
Example #17
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32,
        num_threads_gen=1,
        multi_steps_gd=1,
        reuse_nn_bl=False):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    tf.reset_default_graph()
    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(sy_ob_no,
                                 ac_dim,
                                 "nn",
                                 n_layers=n_layers,
                                 size=size)

        # Hint: Use the tf.multinomial op
        # reshape with -1 flattens the (batch, 1) sample from tf.multinomial to shape [batch]
        sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1), shape=[-1])
        # the cross entropy is the negative log-likelihood, so negate it to recover log pi(a|s)
        sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=sy_logits_na, labels=sy_ac_na)

    else:
        # YOUR_CODE_HERE
        sy_mean = build_mlp(sy_ob_no,
                            ac_dim,
                            "nn",
                            n_layers=n_layers,
                            size=size)
        # logstd should just be a trainable variable, not a network output.
        sy_logstd = tf.get_variable('logstd',
                                    shape=[1, ac_dim],
                                    dtype=tf.float32,
                                    initializer=tf.zeros_initializer)

        sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal(
            tf.shape(sy_mean))

        # Hint: Use the log probability under a multivariate Gaussian.
        # The -sum(logstd) term is needed for the gradient w.r.t. sy_logstd; the constant
        # -0.5 * ac_dim * log(2*pi) is dropped since it does not affect gradients.
        sy_z = (sy_ac_na - sy_mean) / tf.exp(sy_logstd)
        sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(sy_z), axis=1) \
                       - tf.reduce_sum(sy_logstd, axis=1)

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    # Loss function that we'll differentiate to get the policy gradient.
    # The negative sign lets the optimizer minimize this loss while maximizing the policy objective.
    loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n)
    update_op = tf.train.AdamOptimizer(learning_rate,
                                       name='AdamPolicy').minimize(loss)

    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        if not reuse_nn_bl:
            baseline_prediction = tf.squeeze(
                build_mlp(sy_ob_no,
                          1,
                          "nn_baseline",
                          n_layers=n_layers,
                          size=size))
        else:
            baseline_prediction = tf.squeeze(
                build_mlp(sy_ob_no,
                          1,
                          "nn_baseline",
                          n_layers=n_layers,
                          size=size,
                          reuse_hidden_layers=True,
                          reuse_scope_name="nn"))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        sy_target_bn = tf.placeholder(tf.float32,
                                      shape=[None],
                                      name='target_bn')
        loss_bn = tf.nn.l2_loss(sy_target_bn - baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(
            learning_rate, name='AdamBL').minimize(loss_bn)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        paths = []
        gen_start_time = time.time()
        if num_threads_gen == 1:
            # Collect paths until we have enough timesteps
            timesteps_this_batch = 0
            while True:
                ob = env.reset()
                obs, acs, rewards = [], [], []
                animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                        and animate)
                steps = 0
                while True:
                    if animate_this_episode:
                        env.render()
                        time.sleep(0.05)
                    obs.append(ob)
                    ac = sess.run(sy_sampled_ac,
                                  feed_dict={sy_ob_no: ob[None]})
                    ac = ac[0]
                    acs.append(ac)
                    ob, rew, done, _ = env.step(ac)
                    rewards.append(rew)
                    steps += 1
                    if done or steps > max_path_length:
                        break
                path = {
                    "observation": np.array(obs),
                    "reward": np.array(rewards),
                    "action": np.array(acs)
                }
                paths.append(path)
                timesteps_this_batch += pathlength(path)
                if timesteps_this_batch > min_timesteps_per_batch:
                    break
            total_timesteps += timesteps_this_batch
        else:
            # Multithread approach using tf coordinator

            coord = tf.train.Coordinator()

            workers = [
                TrajectionRunner(sess, sy_sampled_ac, sy_ob_no, env_name,
                                 max_path_length,
                                 min_timesteps_per_batch // num_threads_gen)
                for _ in range(num_threads_gen)
            ]

            for wrk in workers:
                wrk.start()

            coord.join(workers)

            # After here, all workers should be ready, let's collect their data

            timesteps_this_batch = 0
            for wrk in workers:
                paths.extend(wrk.paths)
                timesteps_this_batch += wrk.total_timesteps
                total_timesteps += wrk.total_timesteps

        gen_total_time = time.time() - gen_start_time
        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        # ====================================================================================#

        # YOUR_CODE_HERE
        # Note: compute the discounted returns per path; each path has its own rewards.

        def discount_rewards(rwds, rtg):
            q = np.zeros_like(rwds)
            s = 0
            for t in reversed(range(rwds.shape[0])):
                s = s * gamma + rwds[t]
                q[t] = s

            if not rtg:
                q[:] = q[0]
            return q

        q_n = np.concatenate(
            [discount_rewards(path["reward"], reward_to_go) for path in paths])

        # ====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        # ====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            b_n = rescale(normalize(b_n), q_n.mean(axis=0, keepdims=True),
                          q_n.std(axis=0, keepdims=True))

            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            adv_n = normalize(adv_n)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            norm_q_n = normalize(q_n)
            total_bn_loss = 0
            for _ in range(multi_steps_gd):
                _, bn_loss = sess.run([baseline_update_op, loss_bn],
                                      feed_dict={
                                          sy_ob_no: ob_no,
                                          sy_target_bn: norm_q_n
                                      })
                total_bn_loss += bn_loss
            total_bn_loss /= multi_steps_gd
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        # YOUR_CODE_HERE
        total_loss = 0
        for _ in range(multi_steps_gd):
            _, current_loss = sess.run([update_op, loss],
                                       feed_dict={
                                           sy_ob_no: ob_no,
                                           sy_ac_na: ac_na,
                                           sy_adv_n: adv_n
                                       })
            total_loss += current_loss
        total_loss /= multi_steps_gd

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("GenTime", gen_total_time)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("Loss", total_loss)
        if nn_baseline:
            logz.log_tabular("BNLoss", total_bn_loss)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
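
The Q-value comment block in the example above describes both the trajectory-based and the reward-to-go estimators, which the nested discount_rewards helper implements. A self-contained NumPy sketch of the same computation, with a small numeric check (the function name and values here are illustrative):

import numpy as np

def discounted_returns(rewards, gamma, reward_to_go=True):
    # reward_to_go=True : Q_t = sum_{t'>=t} gamma^(t'-t) * r_{t'}
    # reward_to_go=False: every timestep gets the full discounted trajectory return
    q = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    if not reward_to_go:
        q[:] = q[0]
    return q

# Usage sketch: rewards [1, 1, 1] with gamma = 0.5
print(discounted_returns([1.0, 1.0, 1.0], 0.5))         # [1.75, 1.5, 1.0]
print(discounted_returns([1.0, 1.0, 1.0], 0.5, False))  # [1.75, 1.75, 1.75]
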
Example #18
0
def learn(env,
          q_func,
          optimizer_spec,
          session,
          exploration=LinearSchedule(1000000, 0.1),
          stopping_criterion=None,
          replay_buffer_size=1000000,
          batch_size=32,
          gamma=0.99,
          learning_starts=50000,
          learning_freq=4,
          frame_history_len=4,
          target_update_freq=10000,
          grad_norm_clipping=10):

    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    # Log the progress during training
    start = time.time()
    logdir = 'pacman_hra_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join('hra_result', logdir)
    logz.configure_output_dir(logdir)
    args = inspect.getargspec(q_func)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)
    time_name = os.path.join(logdir, "rha_t.dat")
    mean_name = os.path.join(logdir, "rha_mean.dat")
    best_name = os.path.join(logdir, "rha_best.dat")
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    times, mean_ep_rewards, best_ep_rewards = [], [], []

    img_h, img_w, img_c = env.observation_space.shape
    input_shape = (img_h, img_w, frame_history_len * img_c)

    num_actions = env.action_space.n

    # set up placeholders
    # placeholder for current observation (or state)
    obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    # placeholder for current action
    act_t_ph = tf.placeholder(tf.int32, [None])
    # placeholder for current reward
    rew_food_t_ph = tf.placeholder(tf.float32, [None])
    rew_fruit_t_ph = tf.placeholder(tf.float32, [None])
    rew_avoid_t_ph = tf.placeholder(tf.float32, [None])
    rew_eat_t_ph = tf.placeholder(tf.float32, [None])
    # placeholder for next observation (or state)
    obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    # placeholder for end of episode mask
    done_mask_ph = tf.placeholder(tf.float32, [None])
    # casting to float on GPU ensures lower data transfer times.
    obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0
    obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0

    q_val, img_val = q_func(obs_t_float,
                            num_actions,
                            scope="q_func",
                            reuse=False)
    q_food, q_avoid, q_fruit, q_eat = q_val
    target_val, target_img_val = q_func(obs_tp1_float,
                                        num_actions,
                                        scope="target_q_func",
                                        reuse=False)
    target_food, target_avoid, target_fruit, target_eat = target_val

    q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                    scope='q_func')
    target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_q_func')

    # q_all = 1/4 * (q_food + q_avoid + q_fruit + q_eat)
    # action_select = tf.argmax(q_all, 1)

    q_all = tf.concat([q_food, q_avoid, q_fruit, q_eat], 1)
    q_total = aggregator(img_val,
                         q_all,
                         num_actions,
                         scope="q_agg",
                         reuse=False)
    action_selected = tf.argmax(q_total, 1)  # potential problem
    agg_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_agg')

    q_act_food_t_val = tf.reduce_sum(q_food *
                                     tf.one_hot(act_t_ph, num_actions),
                                     axis=1)
    q_act_avoid_t_val = tf.reduce_sum(q_avoid *
                                      tf.one_hot(act_t_ph, num_actions),
                                      axis=1)
    q_act_fruit_t_val = tf.reduce_sum(q_fruit *
                                      tf.one_hot(act_t_ph, num_actions),
                                      axis=1)
    q_act_eat_t_val = tf.reduce_sum(q_eat * tf.one_hot(act_t_ph, num_actions),
                                    axis=1)

    y_food_t_val = rew_food_t_ph + (1 - done_mask_ph) * gamma * tf.reduce_max(
        target_food, axis=1)
    y_avoid_t_val = rew_avoid_t_ph + (
        1 - done_mask_ph) * gamma * tf.reduce_max(target_avoid, axis=1)
    y_fruit_t_val = rew_fruit_t_ph + (
        1 - done_mask_ph) * gamma * tf.reduce_max(target_fruit, axis=1)
    y_eat_t_val = rew_eat_t_ph + (1 - done_mask_ph) * gamma * tf.reduce_max(
        target_eat, axis=1)

    food_error = tf.reduce_mean(
        tf.losses.huber_loss(y_food_t_val, q_act_food_t_val))
    avoid_error = tf.reduce_mean(
        tf.losses.huber_loss(y_avoid_t_val, q_act_avoid_t_val))
    fruit_error = tf.reduce_mean(
        tf.losses.huber_loss(y_fruit_t_val, q_act_fruit_t_val))
    eat_error = tf.reduce_mean(
        tf.losses.huber_loss(y_eat_t_val, q_act_eat_t_val))

    # Aggregate the target heads with the shared aggregator weights so the aggregator
    # can be trained against a one-step Bellman target on the summed reward.
    target_q_all = tf.concat([target_food, target_avoid, target_fruit, target_eat], 1)
    target_q_total = aggregator(target_img_val, target_q_all, num_actions,
                                scope="q_agg", reuse=True)
    q_weight_val = tf.reduce_sum(q_total * tf.one_hot(act_t_ph, num_actions), axis=1)
    q_weight_y = rew_food_t_ph + rew_avoid_t_ph + rew_fruit_t_ph + rew_eat_t_ph
    q_weight_y += (1 - done_mask_ph) * gamma * tf.reduce_max(target_q_total, axis=1)

    weight_error = tf.reduce_mean(
        tf.losses.huber_loss(q_weight_y, q_weight_val))

    ######

    # construct optimization op (with gradient clipping)
    learning_rate = tf.placeholder(tf.float32, (), name="learning_rate")
    optimizer = optimizer_spec.constructor(learning_rate=learning_rate,
                                           **optimizer_spec.kwargs)

    train_food_fn = minimize_and_clip(optimizer,
                                      food_error,
                                      var_list=q_func_vars,
                                      clip_val=grad_norm_clipping)
    train_avoid_fn = minimize_and_clip(optimizer,
                                       avoid_error,
                                       var_list=q_func_vars,
                                       clip_val=grad_norm_clipping)
    train_fruit_fn = minimize_and_clip(optimizer,
                                       fruit_error,
                                       var_list=q_func_vars,
                                       clip_val=grad_norm_clipping)
    train_eat_fn = minimize_and_clip(optimizer,
                                     eat_error,
                                     var_list=q_func_vars,
                                     clip_val=grad_norm_clipping)
    train_weight = minimize_and_clip(optimizer,
                                     weight_error,
                                     var_list=agg_vars,
                                     clip_val=grad_norm_clipping)

    # update_target_fn will be called periodically to copy Q network to target Q network
    update_target_fn = []
    for var, var_target in zip(
            sorted(q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_fn.append(var_target.assign(var))
    update_target_fn = tf.group(*update_target_fn)

    # construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    model_initialized = False
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in itertools.count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env, t):
            break

        ### 2. Step the env and store the transition
        idx = replay_buffer.store_frame(last_obs, rha_shape=4)
        epsilon = exploration.value(t)

        if not model_initialized or np.random.rand(1) < epsilon:
            action = env.action_space.sample()
        else:
            obs_input = replay_buffer.encode_recent_observation()[None, :]
            # action_selected is built on obs_t_ph, so feed the recent observation there
            action = session.run(action_selected,
                                 feed_dict={obs_t_ph: obs_input})
        obs, reward, done, info = env.step(action)
        replay_buffer.store_effect(idx, action, reward, done)
        if done: obs = env.reset()
        last_obs = obs

        ### 3. Perform experience replay and train the network.
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):

            obs_t_batch, act_t_batch, rew_t_batch, obs_tp1_batch, done_mask_batch = replay_buffer.sample(
                batch_size)
            rew_food_t_batch = rew_t_batch[:, 0]
            rew_fruit_t_batch = rew_t_batch[:, 1]
            rew_avoid_t_batch = rew_t_batch[:, 2]
            rew_eat_t_batch = rew_t_batch[:, 3]

            if not model_initialized:
                initialize_interdependent_variables(
                    session, tf.global_variables(), {
                        obs_t_ph: obs_t_batch,
                        obs_tp1_ph: obs_tp1_batch
                    })
                session.run(update_target_fn)
                model_initialized = True

            session.run(train_food_fn,
                        feed_dict={
                            obs_t_ph: obs_t_batch,
                            act_t_ph: act_t_batch,
                            rew_food_t_ph: rew_food_t_batch,
                            obs_tp1_ph: obs_tp1_batch,
                            done_mask_ph: done_mask_batch,
                            learning_rate: optimizer_spec.lr_schedule.value(t)
                        })
            session.run(train_avoid_fn,
                        feed_dict={
                            obs_t_ph: obs_t_batch,
                            act_t_ph: act_t_batch,
                            rew_avoid_t_ph: rew_avoid_t_batch,
                            obs_tp1_ph: obs_tp1_batch,
                            done_mask_ph: done_mask_batch,
                            learning_rate: optimizer_spec.lr_schedule.value(t)
                        })
            session.run(train_fruit_fn,
                        feed_dict={
                            obs_t_ph: obs_t_batch,
                            act_t_ph: act_t_batch,
                            rew_fruit_t_ph: rew_fruit_t_batch,
                            obs_tp1_ph: obs_tp1_batch,
                            done_mask_ph: done_mask_batch,
                            learning_rate: optimizer_spec.lr_schedule.value(t)
                        })
            session.run(train_eat_fn,
                        feed_dict={
                            obs_t_ph: obs_t_batch,
                            act_t_ph: act_t_batch,
                            rew_eat_t_ph: rew_eat_t_batch,
                            obs_tp1_ph: obs_tp1_batch,
                            done_mask_ph: done_mask_batch,
                            learning_rate: optimizer_spec.lr_schedule.value(t)
                        })

            session.run(train_weight,
                        feed_dict={
                            obs_t_ph: obs_t_batch,
                            act_t_ph: act_t_batch,
                            rew_food_t_ph: rew_food_t_batch,
                            rew_avoid_t_ph: rew_avoid_t_batch,
                            rew_fruit_t_ph: rew_fruit_t_batch,
                            rew_eat_t_ph: rew_eat_t_batch,
                            obs_tp1_ph: obs_tp1_batch,
                            done_mask_ph: done_mask_batch,
                            learning_rate: optimizer_spec.lr_schedule.value(t)
                        })

            if num_param_updates % target_update_freq == 0:
                session.run(update_target_fn)
                train_food_loss = session.run(food_error,
                                              feed_dict={
                                                  obs_t_ph: obs_t_batch,
                                                  act_t_ph: act_t_batch,
                                                  rew_food_t_ph:
                                                  rew_food_t_batch,
                                                  obs_tp1_ph: obs_tp1_batch,
                                                  done_mask_ph: done_mask_batch
                                              })
                train_avoid_loss = session.run(avoid_error,
                                               feed_dict={
                                                   obs_t_ph: obs_t_batch,
                                                   act_t_ph: act_t_batch,
                                                   rew_avoid_t_ph:
                                                   rew_avoid_t_batch,
                                                   obs_tp1_ph: obs_tp1_batch,
                                                   done_mask_ph:
                                                   done_mask_batch
                                               })
                train_fruit_loss = session.run(fruit_error,
                                               feed_dict={
                                                   obs_t_ph: obs_t_batch,
                                                   act_t_ph: act_t_batch,
                                                   rew_fruit_t_ph:
                                                   rew_fruit_t_batch,
                                                   obs_tp1_ph: obs_tp1_batch,
                                                   done_mask_ph:
                                                   done_mask_batch
                                               })
                train_eat_loss = session.run(eat_error,
                                             feed_dict={
                                                 obs_t_ph: obs_t_batch,
                                                 act_t_ph: act_t_batch,
                                                 rew_eat_t_ph: rew_eat_t_batch,
                                                 obs_tp1_ph: obs_tp1_batch,
                                                 done_mask_ph: done_mask_batch
                                             })

                train_loss = 0.25 * (train_food_loss + train_avoid_loss +
                                     train_fruit_loss + train_eat_loss)
                # print("Loss at iteration {} is: {}".format(t, train_loss))
                print("\n \
                       Food loss: {}\n \
                       Avoid loss: {}\n \
                       Fruit loss: {}\n \
                       Eat loss: {}".format(train_food_loss, train_avoid_loss,
                                            train_fruit_loss, train_eat_loss))
                print("Average loss at iteration {} is: {}".format(
                    t, train_loss))
            num_param_updates += 1

            #####

        ### 4. Log progress
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)
        if t % LOG_EVERY_N_STEPS == 0 and model_initialized:
            times.append(t)
            mean_ep_rewards.append(mean_episode_reward)
            best_ep_rewards.append(best_mean_episode_reward)

            joblib.dump(value=times, filename=time_name, compress=3)
            joblib.dump(value=mean_ep_rewards, filename=mean_name, compress=3)
            joblib.dump(value=best_ep_rewards, filename=best_name, compress=3)

            logz.log_tabular("Training Time", time.time() - start)
            logz.log_tabular("Loss", train_loss)
            logz.log_tabular("Iteration", t)
            logz.log_tabular("Mean Reward (/100ep)", mean_episode_reward)
            logz.log_tabular("Best Mean Reward", best_mean_episode_reward)
            logz.log_tabular("Episodes", len(episode_rewards))
            logz.log_tabular("Exploration", exploration.value(t))
            logz.log_tabular("Learning Rate",
                             optimizer_spec.lr_schedule.value(t))
            logz.dump_tabular()
            sys.stdout.flush()

    return times, mean_ep_rewards, best_ep_rewards
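
Each reward head in the example above is regressed onto its own one-step Bellman target, y = r + (1 - done) * gamma * max_a Q_target(s', a). A minimal NumPy sketch of that target computation for a batch; the array names below are illustrative, not from the code above.

import numpy as np

def bellman_targets(rewards, done_mask, target_q_values, gamma=0.99):
    # rewards:         (batch,)              per-head reward for this transition
    # done_mask:       (batch,)              1.0 if the episode ended here, else 0.0
    # target_q_values: (batch, num_actions)  target-network Q-values at the next state
    return rewards + (1.0 - done_mask) * gamma * target_q_values.max(axis=1)

# Usage sketch with hypothetical values:
r = np.array([1.0, 0.0])
done = np.array([0.0, 1.0])
tq = np.array([[0.5, 2.0], [3.0, 1.0]])
print(bellman_targets(r, done, tq))  # [1.0 + 0.99 * 2.0, 0.0] = [2.98, 0.0]
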
Example #19
0
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32,
             network_activation='tanh'
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    
    # Activation function for the network
    if network_activation == 'relu':
        activation = torch.nn.functional.relu
    elif network_activation == 'leaky_relu':
        activation = torch.nn.functional.leaky_relu
    else:
        activation = torch.tanh  # torch.nn.functional.tanh is deprecated
    # Create the policy network, its loss, and its optimizer
    actor = build_mlp(ob_dim, ac_dim, "actor",
                      n_layers=n_layers, size=size,
                      activation=activation, discrete=discrete)
    actor_loss = reinforce_loss
    actor_optimizer = torch.optim.Adam(actor.parameters(), lr=learning_rate)
    
    # TODO: initialize Agent
    
    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#
    if nn_baseline:
        critic = build_mlp(ob_dim, 1, "nn_baseline",
                           n_layers=n_layers, size=size, discrete=discrete)
        critic_loss = nn.MSELoss()
        critic_optimizer = torch.optim.Adam(critic.parameters(), lr=learning_rate)
        

    #========================================================================================#
    # Training Loop
    #========================================================================================#
    
    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards, log_probs = [], [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                ob = torch.from_numpy(ob).float().unsqueeze(0)
                obs.append(ob)
                ac, log_prob = actor.run(ob)
                acs.append(ac)
                log_probs.append(log_prob)
                #format the action from policy
                if discrete:
                    ac = int(ac)
                else:
                    ac = ac.squeeze(0).numpy()
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : torch.cat(obs, 0),
                    "reward" : torch.Tensor(rewards),
                    "action" : torch.cat(acs, 0),
                    "log_prob" : torch.cat(log_probs, 0)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch
        ob_no = torch.cat([path["observation"] for path in paths], 0)
        ac_na = torch.cat([path["action"] for path in paths], 0)
                                   
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above). 
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where 
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t. 
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG 
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
        #       entire trajectory (regardless of which time step the Q-value should be for). 
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#
        q_n = []
        for path in paths:
            rewards = path['reward']
            num_steps = pathlength(path)
            R=[]
            if reward_to_go:
                for t in range(num_steps):
                    R.append((torch.pow(gamma, torch.arange(num_steps-t))*rewards[t:]).sum().view(-1,1))
                q_n.append(torch.cat(R))
            else:
                q_n.append((torch.pow(gamma, torch.arange(num_steps)) * rewards).sum() * torch.ones(num_steps, 1))
        q_n = torch.cat(q_n, 0)
        
        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#
        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            b_n = critic(ob_no)
            q_n_std = q_n.std()
            q_n_mean = q_n.mean()
            b_n_scaled = b_n * q_n_std + q_n_mean
            adv_n = (q_n - b_n_scaled).detach()
        else:
            adv_n = q_n
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
            # YOUR_CODE_HERE
            adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + np.finfo(np.float32).eps.item())
        
        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            target = (q_n - q_n_mean) / (q_n_std + np.finfo(np.float32).eps.item())
            critic_optimizer.zero_grad()
            c_loss = critic_loss(b_n, target)
            c_loss.backward()
            critic_optimizer.step()
            
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on 
        # the current batch of rollouts.
        # 
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below. 

        # YOUR_CODE_HERE
        log_probs = torch.cat([path["log_prob"] for path in paths], 0)
        actor_optimizer.zero_grad()
        loss = actor_loss(log_probs, adv_n, len(paths))
        print(loss)
        loss.backward()
        actor_optimizer.step()

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
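
The PyTorch example above calls reinforce_loss without showing it. As a rough sketch under that assumption, a REINFORCE-style loss over the concatenated per-step log-probabilities and advantages might look like the following; the averaging over the number of paths mirrors how the example calls it, but this is an illustration, not the author's implementation.

import torch

def reinforce_loss(log_probs, adv_n, num_paths):
    # log_probs: (N,) log pi(a_t | s_t) for every step in the batch (carries gradients)
    # adv_n:     (N,) advantage estimates, treated as constants
    # Dividing by the number of trajectories keeps the scale comparable across batch sizes.
    return -(log_probs.view(-1) * adv_n.view(-1).detach()).sum() / num_paths

# Usage sketch with hypothetical values:
lp = torch.log(torch.tensor([0.5, 0.25, 0.8], requires_grad=True))
adv = torch.tensor([1.0, -0.5, 2.0])
loss = reinforce_loss(lp, adv, num_paths=1)
loss.backward()
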
Example #20
0
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='HalfCheetah-v1')
    # Experiment meta-params
    parser.add_argument('--exp_name', type=str, default='mb_mpc')
    parser.add_argument('--seed', type=int, default=3)
    parser.add_argument('--render', action='store_true')
    # Training args
    parser.add_argument('--learning_rate', '-lr', type=float, default=1e-3)
    parser.add_argument('--onpol_iters', '-n', type=int, default=1)
    parser.add_argument('--dyn_iters', '-nd', type=int, default=60)
    parser.add_argument('--batch_size', '-b', type=int, default=512)
    # Data collection
    parser.add_argument('--random_paths', '-r', type=int, default=10)
    parser.add_argument('--onpol_paths', '-d', type=int, default=10)
    parser.add_argument('--ep_len', '-ep', type=int, default=1000)
    # Neural network architecture args
    parser.add_argument('--n_layers', '-l', type=int, default=2)
    parser.add_argument('--size', '-s', type=int, default=500)
    # MPC Controller
    parser.add_argument('--simulated_paths', '-sp', type=int, default=1000)
    parser.add_argument('--mpc_horizon', '-m', type=int, default=15)
    # Debug
    parser.add_argument('--quiet', '-q', action='count', default=0)
    args = parser.parse_args()

    logging.basicConfig(level=args.quiet * 10)

    # Set seed
    np.random.seed(args.seed)
    tf.set_random_seed(args.seed)

    # Make data directory if it does not already exist
    if not (os.path.exists('data')):
        os.makedirs('data')
    logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join('data', logdir)
    if not (os.path.exists(logdir)):
        os.makedirs(logdir)

    logz.configure_output_dir(logdir)
    logz.save_params(vars(args))

    # Make env
    if args.env_name == "HalfCheetah-v1":
        env = HalfCheetahEnvNew()
        cost_fn = cheetah_cost_fn
    else:
        raise NotImplementedError("Only HalfCheetah-v1 is supported in this script.")
    train(
        env=env,
        cost_fn=cost_fn,
        render=args.render,
        learning_rate=args.learning_rate,
        onpol_iters=args.onpol_iters,
        dynamics_iters=args.dyn_iters,
        batch_size=args.batch_size,
        num_paths_random=args.random_paths,
        num_paths_onpol=args.onpol_paths,
        num_simulated_paths=args.simulated_paths,
        env_horizon=args.ep_len,
        mpc_horizon=args.mpc_horizon,
        n_layers=args.n_layers,
        size=args.size,
        activation=tf.nn.relu,
        output_activation=None,
    )
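
The main() above wires up a model-based run with an MPC controller (onpol_iters, mpc_horizon, simulated_paths), but the train() and controller code are not shown here. A minimal NumPy sketch of the random-shooting MPC idea those arguments suggest; dynamics_fn, cost_fn, and every name below are illustrative assumptions.

import numpy as np

def mpc_action(state, dynamics_fn, cost_fn, action_low, action_high,
               horizon=15, num_paths=1000):
    # Sample `num_paths` random action sequences of length `horizon`, roll each
    # through the learned dynamics model, and return the first action of the
    # cheapest imagined trajectory.
    ac_dim = len(action_low)
    actions = np.random.uniform(action_low, action_high,
                                size=(num_paths, horizon, ac_dim))
    states = np.repeat(state[None, :], num_paths, axis=0)
    costs = np.zeros(num_paths)
    for t in range(horizon):
        next_states = dynamics_fn(states, actions[:, t])      # batched one-step prediction
        costs += cost_fn(states, actions[:, t], next_states)  # batched per-step cost
        states = next_states
    return actions[np.argmin(costs), 0]
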
def train_PG(exp_name='',
             env_name='HalfCheetah',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=False, 
             animate=True, 
             logdir=None, 
             normalize_advantages=False,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32,
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = HalfCheetahEnvNew()
    # env = gym.make("RoboschoolHalfCheetah-v1")

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Print environment information
    print("Environment name: ",  "HalfCheetah")
    print("Action space is discrete: ", discrete)
    print("Action space dim: ", ac_dim)
    print("Observation space dim: ", ob_dim)
    print("Max_path_length ", max_path_length)



    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#


    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=4) 

    sess = tf.Session(config=tf_config)

    sess.__enter__() # equivalent to `with sess:`

    data_buffer_ppo = DataBuffer_general(10000, 4)


    timesteps_per_actorbatch=1000
    max_timesteps = 10000000
    clip_param=0.2
    entcoeff=0.0
    optim_epochs=10
    optim_stepsize=3e-4 
    optim_batchsize=64
    gamma=0.99
    lam=0.95
    schedule='linear'
    callback=None # you can do anything in the callback, since it takes locals(), globals()
    adam_epsilon=1e-5

    policy_nn = MlpPolicy_bc(sess=sess, env=env, hid_size=128, num_hid_layers=2, clip_param=clip_param , entcoeff=entcoeff)
    # policy_nn = MlpPolicy(sess=sess, env=env, hid_size=64, num_hid_layers=2, clip_param=clip_param , entcoeff=entcoeff, adam_epsilon=adam_epsilon)

    tf.global_variables_initializer().run() #pylint: disable=E1101


    # Prepare for rollouts
    # ----------------------------------------

    # seg_gen = traj_segment_generator_old(policy_nn, env, timesteps_per_actorbatch)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards


    while True:

        if callback: callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            break

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult =  max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i ************"%iters_so_far)

        data_buffer_ppo.clear()
        seg = traj_segment_generator(policy_nn, env, timesteps_per_actorbatch)
        # seg = seg_gen.__next__()

        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"] # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std() # standardized advantage function estimate
        # d = Dataset(dict(ob=ob, ac=ac, atarg=atarg, vtarg=tdlamret), shuffle=not policy_nn.recurrent)

        for n in range(len(ob)):
            data_buffer_ppo.add([ob[n], ac[n], atarg[n], tdlamret[n]])
        print("data_buffer_ppo", data_buffer_ppo.size)

        optim_batchsize = optim_batchsize or ob.shape[0]

        if hasattr(policy_nn, "ob_rms"): policy_nn.ob_rms.update(ob) # update running mean/std for policy

        policy_nn.assign_old_eq_new() # set old parameter values to new parameter values

        # logger.log("Optimizing...")
        # logger.log(fmt_row(13, policy_nn.loss_names))

        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = [] # list of tuples, each of which gives the loss for a minibatch
            for i in range(int(timesteps_per_actorbatch/optim_batchsize)):
                sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = data_buffer_ppo.sample(optim_batchsize)

                newlosses = policy_nn.lossandupdate_ppo(sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target, cur_lrmult, optim_stepsize*cur_lrmult)
                losses.append(newlosses)

            # logger.log(fmt_row(13, np.mean(losses, axis=0)))



        # logger.log("Evaluating losses...")
        # losses = []
        # # for batch in d.iterate_once(optim_batchsize):
        # sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = data_buffer_ppo.sample(optim_batchsize)

        # newlosses = policy_nn.compute_losses(sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target, cur_lrmult)
        # losses.append(newlosses)
        # meanlosses,_,_ = mpi_moments(losses, axis=0)
        # logger.log(fmt_row(13, meanlosses))
        # for (lossval, name) in zipsame(meanlosses, policy_nn.loss_names):
        #     logger.record_tabular("loss_"+name, lossval)
        # logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        # logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        # logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        # logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        # logger.record_tabular("EpisodesSoFar", episodes_so_far)
        # logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        # logger.record_tabular("TimeElapsed", time.time() - tstart)
        # if MPI.COMM_WORLD.Get_rank()==0:
        #     logger.dump_tabular()




        # Log diagnostics
        # returns = [path["reward"].sum() for path in paths]
        # ep_lengths = [pathlength(path) for path in paths]

        ep_lengths = seg["ep_lens"]
        returns =  seg["ep_rets"]

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", iters_so_far)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", timesteps_so_far)
        logz.dump_tabular()
        logz.pickle_tf_vars()
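
The PPO-style loop above relies on add_vtarg_and_adv(seg, gamma, lam) to attach advantages and value targets to the rollout segment, but that helper is not shown. The baselines-style GAE(lambda) computation it usually corresponds to is sketched below; the "new" (episode-start flags) and "nextvpred" fields are assumptions about the segment dict, not guaranteed by the code above.

import numpy as np

def add_vtarg_and_adv(seg, gamma, lam):
    # Generalized Advantage Estimation:
    #   delta_t = r_t + gamma * V(s_{t+1}) * (1 - done) - V(s_t)
    #   A_t     = delta_t + gamma * lam * (1 - done) * A_{t+1}
    new = np.append(seg["new"], 0)                      # 1 where a new episode starts
    vpred = np.append(seg["vpred"], seg["nextvpred"])   # bootstrap value for the last step
    T = len(seg["rew"])
    adv = np.zeros(T, dtype=np.float32)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - new[t + 1]
        delta = seg["rew"][t] + gamma * vpred[t + 1] * nonterminal - vpred[t]
        adv[t] = lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
    seg["adv"] = adv
    seg["tdlamret"] = adv + seg["vpred"]
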
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gamma=1.0,
        test=False,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        seed=0,
        # network arguments
        n_layers=1,
        size=32):
    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    # ========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    # ========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    print('observation dim: ', ob_dim)
    print('action dim: ', ac_dim)
    print('action space: ', discrete)
    # print("hellooooooo",ac_dim,env.action_space.shape)
    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    # ========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)

        # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(dtype=tf.float32, shape=[None], name="adv")

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    # ========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(sy_ob_no,
                                 ac_dim,
                                 scope="build_nn",
                                 n_layers=n_layers,
                                 size=size,
                                 activation=tf.nn.relu)
        sy_sampled_ac = tf.one_hot(tf.squeeze(tf.multinomial(sy_logits_na, 1)),
                                   ac_dim)  # Hint: Use the tf.multinomial op
        # batch_size x ac_dim

        sy_logprob_n = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=sy_ac_na, logits=sy_logits_na)
        # batch_size ---> negative log probability (cross-entropy) of the chosen action;
        # multiplying by the advantage and minimizing gives the policy gradient

        # Learned from https://github.com/InnerPeace-Wu/
        # # Another way to do it
        # N = tf.shape(sy_ob_no)[0]
        # sy_prob_na = tf.nn.softmax(sy_logits_na)
        # sy_logprob_n = tf.log(tf.gather_nd(sy_prob_na, tf.stack((tf.range(N), sy_ac_na), axis=1)))
    else:
        # YOUR_CODE_HERE
        sy_mean = build_mlp(sy_ob_no,
                            ac_dim,
                            scope="build_nn",
                            n_layers=n_layers,
                            size=size,
                            activation=tf.nn.relu)
        sy_logstd = tf.Variable(tf.zeros(ac_dim),
                                name='logstd',
                                dtype=tf.float32)
        sy_std = tf.exp(sy_logstd)
        sy_sampled_ac = sy_mean + tf.multiply(
            sy_std, tf.random_normal(tf.shape(sy_mean)))
        sy_z = (sy_ac_na - sy_mean) / sy_std

        sy_logprob_n = 0.5 * tf.reduce_sum(tf.square(sy_z), axis=1)
        # sy_logprob_n = 0.5*tf.reduce_sum(tf.squared_difference(tf.div(sy_mean,sy_std),
        # tf.div(sy_ac_na,sy_std)))  # Hint: Use the log probability under a multivariate gaussian.

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    # ========================================================================================#

    # loss = tf.reduce_sum(tf.multiply(tf.nn.softmax_cross_entropy_with_logits_v2(labels=sy_ac_na,logits=sy_logits_na),sy_adv_n))
    # Loss function that we'll differentiate to get the policy gradient.

    loss = tf.reduce_sum(tf.multiply(sy_logprob_n, sy_adv_n))
    actor_update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    actor_params = tf.trainable_variables()

    # ========================================================================================#
    # critic graph
    # Loss and training operations
    # ========================================================================================#

    predict_value = critic(sy_ob_no)
    sy_target_value = tf.placeholder(dtype=tf.float32,
                                     shape=[None],
                                     name="target_value")
    predict_value = tf.squeeze(predict_value)
    rms_loss = tf.reduce_mean(
        tf.squared_difference(predict_value, sy_target_value))
    critic_update_op = tf.train.AdamOptimizer(learning_rate).minimize(rms_loss)
    critic_params = tf.trainable_variables()[len(actor_params):]

    # ========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    # ========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`

    actor_saver = tf.train.Saver(actor_params, max_to_keep=1)
    critic_saver = tf.train.Saver(critic_params, max_to_keep=1)

    checkpoint_actor_dir = os.path.join(os.curdir,
                                        'Actor_GAE_0.7' + str(env_name))
    if not os.path.exists(checkpoint_actor_dir):
        os.makedirs(checkpoint_actor_dir)
    actor_prefix = os.path.join(checkpoint_actor_dir, "model.ckpt")
    ckpt_1 = tf.train.get_checkpoint_state(checkpoint_actor_dir)

    checkpoint_critic_dir = os.path.join(os.curdir,
                                         'Critic_GAE_0.7' + str(env_name))
    if not os.path.exists(checkpoint_critic_dir):
        os.makedirs(checkpoint_critic_dir)
    critic_prefix = os.path.join(checkpoint_critic_dir, "model.ckpt")
    ckpt_2 = tf.train.get_checkpoint_state(checkpoint_critic_dir)

    if ckpt_1 and tf.train.checkpoint_exists(ckpt_1.model_checkpoint_path):
        print("Reading actor parameters from %s" %
              ckpt_1.model_checkpoint_path)
        actor_saver.restore(sess, ckpt_1.model_checkpoint_path)

    if ckpt_2 and tf.train.checkpoint_exists(ckpt_2.model_checkpoint_path):
        print("Reading critic parameters from %s" %
              ckpt_2.model_checkpoint_path)
        critic_saver.restore(sess, ckpt_2.model_checkpoint_path)

    uninitialized_vars = []
    for var in tf.global_variables():
        try:
            sess.run(var)
        except tf.errors.FailedPreconditionError:
            uninitialized_vars.append(var)

    if len(uninitialized_vars) > 0:
        init_new_vars_op = tf.variables_initializer(uninitialized_vars)
        sess.run(init_new_vars_op)

    def testing():
        print('testing..')
        ob = env.reset()
        steps = 0
        total_r = 0
        while True:
            one_hot_ac = sess.run(sy_sampled_ac,
                                  feed_dict={sy_ob_no: ob[None]})
            if discrete:
                ac = int(np.argmax(one_hot_ac))
            else:
                ac = one_hot_ac
            ob, rew, done, _ = env.step(ac)
            env.render()
            total_r += rew
            steps += 1
            if done or steps > max_path_length:
                break
        print(steps, total_r)
        return steps, total_r

    # ========================================================================================#
    # Training Loop
    # ========================================================================================#

    if test:
        testing()
        return

    total_timesteps = 0

    best_steps, best_rew = testing()
    # best_rew = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)
        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            next_obs = []
            animate_this_episode = (len(paths) == 0 and (itr % 30 == 0)
                                    and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                one_hot_ac = sess.run(sy_sampled_ac,
                                      feed_dict={sy_ob_no: ob[None]})

                if discrete:
                    ac = int(np.argmax(one_hot_ac))
                else:
                    ac = one_hot_ac
                    # print("helloooo",ac)
                acs.append(one_hot_ac)
                next_ob, rew, done, _ = env.step(
                    ac
                )  # transition dynamics P(s_t+1/s_t,a_t), r(s_t+1/s_t,a_t)
                next_obs.append(next_ob)
                ob = next_ob
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs),
                "next_observation": np.array(next_obs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        next_ob_no = np.concatenate(
            [path["next_observation"] for path in paths])
        rew_no = np.concatenate([path["reward"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        ac_na = ac_na.reshape([-1, ac_dim])
        print("helloooo", ac_na.shape)

        # ======================== Finding target values ===================================#
        # target = r(s,a) + gamma* V(s') - V(s)
        # This estimate has less variance but is biased. Alternatively
        # we can go for n-step returns or GAE(Generalised Advantage Estimation)
        # ==================================================================================#

        next_values = sess.run(predict_value, feed_dict={sy_ob_no: next_ob_no})
        target_values = rew_no + gamma * next_values

        # fit critic with target r(s,a) + gamma*V(s')
        print('updating the critic params..')
        sess.run(critic_update_op,
                 feed_dict={
                     sy_ob_no: ob_no,
                     sy_target_value: target_values
                 })

        current_values = sess.run(predict_value, feed_dict={sy_ob_no: ob_no})
        next_values = sess.run(predict_value, feed_dict={sy_ob_no: next_ob_no})
        adv_n = rew_no + gamma * next_values - current_values

        # ====================== Generalized Advantage Estimation =========================== #

        # A(s_t, a_t) = sum_{t'=t}^{t'=inf} (gamma*lambda)^{t'-t} delta_{t'}, where
        # delta_{t} = r(s_t, a_t) + gamma*V(s_{t+1}) - V(s_t)
        # ================================================================================== #

        q_n = list()
        GAE = True

        if GAE:
            ind = 0
            lam = 0.7
            for path in paths:
                pLen = pathlength(path)
                q_p = np.zeros(pLen)
                q_p[pLen - 1] = adv_n[ind + pLen - 1]
                for t in reversed(range(pLen - 1)):
                    q_p[t] = adv_n[ind + t] + (gamma * lam) * q_p[t + 1]
                q_p = np.array(q_p)
                q_n.append(q_p)
                ind += pLen
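            # Worked example of the recursion above with gamma*lam = 0.7 and
            # per-step TD errors delta = [2.0, 1.0, 3.0]:
            #   A_2 = 3.0
            #   A_1 = 1.0 + 0.7 * 3.0  = 3.10
            #   A_0 = 2.0 + 0.7 * 3.10 = 4.17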

        # =========================== n-step returns =========================================#
        # Consider only the n-step returns instead of until the end of episode.
        # Variance reduction technique
        # adv(s_t) = sum_{t'=t}^(t+n) gamma^{t'-t}*r(t') + gamma^{n} V(s_{t+n}) - V(s_t)
        # ====================================================================================#

        n_step_returns = False

        if n_step_returns:
            n = 100
            value_paths = []
            for path in paths:
                ob = path['observation']
                pLen = pathlength(path)
                values = sess.run(predict_value, feed_dict={sy_ob_no: ob})
                x = {}
                x['value'] = values
                value_paths.append(x)

            for ind, path in enumerate(paths):
                pLen = pathlength(path)
                q_p = np.zeros(pLen)
                rew = path['reward']
                values = value_paths[ind]['value']
                for i in range(pLen):
                    start = i
                    end = min(start + n, pLen - 1)
                    for j, r in enumerate(rew[start:end]):
                        q_p[i] += pow(gamma, j) * r
                    # bootstrap with the value estimate n steps ahead (or at the
                    # truncated horizon `end`) and subtract the current value estimate
                    q_p[i] += pow(gamma, end - start) * values[end]
                    q_p[i] -= values[start]
                q_p = np.array(q_p)
                q_n.append(q_p)
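            # Worked example of the n-step advantage with n = 2, gamma = 0.9,
            # rewards r = [1.0, 2.0, ...] and value estimates V = [0.5, 0.4, 0.3, ...]:
            #   adv(s_0) = r_0 + gamma * r_1 + gamma^2 * V(s_2) - V(s_0)
            #            = 1.0 + 0.9 * 2.0 + 0.81 * 0.3 - 0.5 = 2.543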

        q_n = np.concatenate(q_n)
        adv_n = q_n.copy()

        # ====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        # ====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]

        if np.mean(returns) > best_rew:
            best_rew = np.mean(returns)
            print('saving actor to ', actor_prefix)
            actor_saver.save(sess, actor_prefix)
            print('saving critic to ', critic_prefix)
            critic_saver.save(sess, critic_prefix)

        sess.run(actor_update_op,
                 feed_dict={
                     sy_ac_na: ac_na,
                     sy_ob_no: ob_no,
                     sy_adv_n: adv_n
                 })

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
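
# The `critic(sy_ob_no)` call above relies on a value-network builder that is not
# shown in this example. A minimal sketch of such a critic, reusing the same
# `build_mlp` helper as the policy network (the scope name and layer sizes are
# assumptions, not the original code):
def critic_sketch(sy_ob_no, n_layers=2, size=64):
    # maps a batch of observations to a single state-value estimate per row
    return build_mlp(sy_ob_no,
                     1,
                     scope="critic_nn",
                     n_layers=n_layers,
                     size=size,
                     activation=tf.nn.relu)
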
Example #23
0
def train(sess, env, args, actor, critic, actor_noise, logdir):
    logz.configure_output_dir(logdir)
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    print('params: ', params)
    params['env'] = 'InvertedPendulum'
    params['exp_name'] = '3layer'
    logz.save_params(params)
    # Set up summary Ops
    summary_ops, summary_vars = build_summaries()
    checkpoint_actor_dir = os.path.join(os.curdir, 'Actor_InvertedPendulum')
    if not os.path.exists(checkpoint_actor_dir):
        os.makedirs(checkpoint_actor_dir)
    actor_prefix = os.path.join(checkpoint_actor_dir, "model.ckpt")
    ckpt_1 = tf.train.get_checkpoint_state(checkpoint_actor_dir)

    checkpoint_critic_dir = os.path.join(os.curdir, 'Critic_InvertedPendulum')
    if not os.path.exists(checkpoint_critic_dir):
        os.makedirs(checkpoint_critic_dir)
    critic_prefix = os.path.join(checkpoint_critic_dir, "model.ckpt")
    ckpt_2 = tf.train.get_checkpoint_state(checkpoint_critic_dir)

    if ckpt_1 and tf.train.checkpoint_exists(ckpt_1.model_checkpoint_path):
        print("Reading actor parameters from %s" %
              ckpt_1.model_checkpoint_path)
        actor.saver.restore(sess, ckpt_1.model_checkpoint_path)

    if ckpt_2 and tf.train.checkpoint_exists(ckpt_2.model_checkpoint_path):
        print("Reading critic parameters from %s" %
              ckpt_2.model_checkpoint_path)
        critic.saver.restore(sess, ckpt_2.model_checkpoint_path)

    uninitialized_vars = []
    for var in tf.all_variables():
        try:
            sess.run(var)
        except tf.errors.FailedPreconditionError:
            uninitialized_vars.append(var)

    if len(uninitialized_vars) > 0:
        init_new_vars_op = tf.variables_initializer(uninitialized_vars)
        sess.run(init_new_vars_op)

    writer = tf.summary.FileWriter(args['summary_dir'], sess.graph)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay memory
    replay_buffer = ReplayBuffer(int(args['buffer_size']),
                                 int(args['random_seed']))

    # Needed to enable BatchNorm.
    # This hurts the performance on Pendulum but could be useful
    # in other environments.
    # tflearn.is_training(True)

    def testing():
        env1 = gym.make(args['env'])
        s = env1.reset()
        done = False
        total_reward = 0
        max_steps = env1.spec.timestep_limit
        step = 0

        while not done:
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))
            s2, r, done, _ = env1.step(a[0])
            total_reward += r
            step += 1
            s = s2
            # env.render()
            if step > max_steps:
                break
        print('total steps: ', step)
        print('total reward: ', total_reward)
        return step, total_reward

    iter = 0
    start = time.time()
    best_step, best_rew = testing()
    for i in range(int(args['max_episodes'])):

        s = env.reset()

        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(int(args['max_episode_len'])):

            if args['render_env']:
                env.render()

            # Add exploration noise to the deterministic policy's action
            # a = actor.predict(np.reshape(s, (1, 3))) + (1. / (1. + i))
            a = actor.predict(np.reshape(s, (1, actor.s_dim))) + actor_noise()

            s2, r, terminal, info = env.step(a[0])

            replay_buffer.add(np.reshape(s, (actor.s_dim, )),
                              np.reshape(a, (actor.a_dim, )), r, terminal,
                              np.reshape(s2, (actor.s_dim, )))

            # Start training only once the replay buffer holds enough
            # experience (100000 transitions here, well above the minibatch size)
            batch_size = int(args['minibatch_size'])
            if replay_buffer.size() > 100000:
                iter += 1
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(batch_size)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(int(args['minibatch_size'])):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + critic.gamma * target_q[k])

                # Update the critic given the targets
                # critic will be trained to minimise the mean square error of the predicted Q value
                # and the target value.
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch,
                    np.reshape(y_i, (int(args['minibatch_size']), 1)))

                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient:
                # gradient of the critic's Q value with respect to the action --> action gradients
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch,
                                                a_outs)  # del_a Q(s,a)
                actor.train(
                    s_batch, grads[0]
                )  # del_a Q(s,a) * del_theta Mu_theta(s) ---> actor gradients
                # these gradients are applied directly to the actor parameters; no separate loss is minimized

                if iter % 20 == 0:
                    new_steps, new_rew = testing()
                    if new_rew > best_rew:
                        best_rew = new_rew
                        actor.saver.save(sess, actor_prefix)
                        critic.saver.save(sess, critic_prefix)
                        print('model saved to disk.')
                        actor.saver.restore(sess, ckpt_1.model_checkpoint_path)
                        critic.saver.restore(sess,
                                             ckpt_2.model_checkpoint_path)
                        best_step, best_rew = testing()
                    # print('actor model saved to: ', actor_prefix)
                    # print('critic model saved to: ', critic_prefix)

                if iter % 10 == 0:
                    new_steps, new_rew = testing()
                    logz.log_tabular("Time", time.time() - start)
                    logz.log_tabular('Iteration', iter / 10)
                    logz.log_tabular('Reward', new_rew)
                    logz.log_tabular('Steps', new_steps)
                    logz.dump_tabular()

                # Update target networks
                if iter % 50 == 0:
                    replay_buffer.update()
                    print('updating buffer')
                    print('updating target networks..')
                    actor.update_target_network()
                    critic.update_target_network()

            s = s2
            ep_reward += r

            if terminal:
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: ep_reward,
                                           summary_vars[1]:
                                           ep_ave_max_q / float(j)
                                       })

                writer.add_summary(summary_str, i)
                writer.flush()

                print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(int(ep_reward), \
                                                                             i, (ep_ave_max_q / float(j))))
                break
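
# Two pieces used above are supplied by the caller and not shown here:
# `actor_noise` (exploration noise) and the target-network update performed by
# `actor.update_target_network()` / `critic.update_target_network()`.
# The sketches below show common choices in DDPG-style code; the class/function
# names and the `theta`, `sigma`, and `tau` values are assumptions, not the
# original implementation.
import numpy as np

class OrnsteinUhlenbeckNoiseSketch(object):
    """Temporally correlated exploration noise, as commonly used in DDPG."""
    def __init__(self, mu, theta=0.15, sigma=0.2):
        self.mu = np.asarray(mu, dtype=np.float64)
        self.theta, self.sigma = theta, sigma
        self.state = np.copy(self.mu)

    def __call__(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); the noise drifts back toward mu
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(*self.mu.shape)
        self.state = self.state + dx
        return self.state

def soft_update_sketch(target_weights, online_weights, tau=0.001):
    """Polyak averaging: target <- tau * online + (1 - tau) * target."""
    return [tau * w + (1.0 - tau) * w_t for w, w_t in zip(online_weights, target_weights)]
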
Example #24
0
def train(env,
          cost_fn,
          logdir=None,
          render=False,
          learning_rate=1e-3,
          onpol_iters=10,
          dynamics_iters=60,
          batch_size=512,
          num_paths_random=10,
          num_paths_onpol=10,
          num_simulated_paths=10000,
          env_horizon=1000,
          mpc_horizon=15,
          n_layers=2,
          size=500,
          activation=tf.nn.relu,
          output_activation=None):
    """

    Arguments:

    onpol_iters                 Number of iterations of onpolicy aggregation for the loop to run. 

    dynamics_iters              Number of iterations of training for the dynamics model
    |_                          which happen per iteration of the aggregation loop.

    batch_size                  Batch size for dynamics training.

    num_paths_random            Number of paths/trajectories/rollouts generated 
    |                           by a random agent. We use these to train our 
    |_                          initial dynamics model.
    
    num_paths_onpol             Number of paths to collect at each iteration of
    |_                          aggregation, using the Model Predictive Control policy.

    num_simulated_paths         How many fictitious rollouts the MPC policy
    |                           should generate each time it is asked for an
    |_                          action.

    env_horizon                 Number of timesteps in each path.

    mpc_horizon                 The MPC policy generates actions by imagining 
    |                           fictitious rollouts, and picking the first action
    |                           of the best fictitious rollout. This argument is
    |                           how many timesteps should be in each fictitious
    |_                          rollout.

    n_layers/size/activations   Neural network architecture arguments. 

    """

    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    print(params)
    # the three lines below are to override the functions passed in, which aren't serializable
    params["activation"] = "relu"
    params["cost_fn"] = "cheetah_cost_fn"
    params["env"] = "HalfCheetahEnvNew"
    logz.save_params(params)

    returns_file = "returns.csv"
    returns_array = []

    #========================================================
    #
    # First, we need a lot of data generated by a random
    # agent, with which we'll begin to train our dynamics
    # model.

    random_controller = RandomController(env)
    """ YOUR CODE HERE """
    data = sample(env,
                  random_controller,
                  num_paths=num_paths_random,
                  horizon=env_horizon,
                  render=False,
                  verbose=False)

    #========================================================
    #
    # The random data will be used to get statistics (mean
    # and std) for the observations, actions, and deltas
    # (where deltas are o_{t+1} - o_t). These will be used
    # for normalizing inputs and denormalizing outputs
    # from the dynamics network.
    #
    """ YOUR CODE HERE """
    normalization = compute_normalization(data)
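    # compute_normalization (not shown here) is expected to return the per-dimension
    # mean and std of the observations, actions, and deltas (o_{t+1} - o_t) across
    # `data`, which the dynamics network uses to normalize inputs and denormalize outputs.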

    #========================================================
    #
    # Build dynamics model and MPC controllers.
    #
    sess = tf.Session()

    dyn_model = NNDynamicsModel(env=env,
                                n_layers=n_layers,
                                size=size,
                                activation=activation,
                                output_activation=output_activation,
                                normalization=normalization,
                                batch_size=batch_size,
                                iterations=dynamics_iters,
                                learning_rate=learning_rate,
                                sess=sess)

    mpc_controller = MPCcontroller(env=env,
                                   dyn_model=dyn_model,
                                   horizon=mpc_horizon,
                                   cost_fn=cost_fn,
                                   num_simulated_paths=num_simulated_paths)

    #========================================================
    #
    # Tensorflow session building.
    #
    sess.__enter__()
    tf.global_variables_initializer().run()

    #========================================================
    #
    # Run multiple iterations of on-policy aggregation: at each iteration, refit the
    # dynamics model to the current dataset, then collect on-policy samples with the
    # MPC controller and aggregate them into the dataset.
    # Note: You don't need to use a mixing ratio in this assignment for new and old data as described in
    # https://arxiv.org/abs/1708.02596
    #
    for itr in range(onpol_iters):
        """ YOUR CODE HERE """

        print(itr)
        # learn/fit dynamics model using the Adam optimization algorithm
        l = dyn_model.fit(data)
        print(l)

        # sample a set of on-policy trajectories from the environment
        new_data = sample(env,
                          mpc_controller,
                          num_paths=num_paths_onpol,
                          horizon=env_horizon,
                          render=render,
                          verbose=False)

        # append transition to dataset
        data += new_data

        # compute costs
        costs = np.array([path_cost(cost_fn, path) for path in new_data])
        print(costs)

        # compute returns
        returns = np.array(
            [new_data[i]["returns"] for i in range(len(new_data))])
        print(returns)

        returns_array.append(returns)
        np.array(returns_array).dump(returns_file)

        # LOGGING
        # Statistics for performance of MPC policy using
        # our learned dynamics model
        logz.log_tabular('Iteration', itr)
        # In terms of cost function which your MPC controller uses to plan
        logz.log_tabular('AverageCost', np.mean(costs))
        logz.log_tabular('StdCost', np.std(costs))
        logz.log_tabular('MinimumCost', np.min(costs))
        logz.log_tabular('MaximumCost', np.max(costs))
        # In terms of true environment reward of your rolled out trajectory using the MPC controller
        logz.log_tabular('AverageReturn', np.mean(returns))
        logz.log_tabular('StdReturn', np.std(returns))
        logz.log_tabular('MinimumReturn', np.min(returns))
        logz.log_tabular('MaximumReturn', np.max(returns))

        logz.dump_tabular()
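
# The MPCcontroller used above is not defined in this example. The docstring
# describes random-shooting MPC: imagine `num_simulated_paths` fictitious
# rollouts of length `mpc_horizon` under the learned dynamics model and pick the
# first action of the cheapest one. A hedged sketch of that action-selection
# step (the `dyn_model.predict` and batched `cost_fn` signatures are assumptions):
import numpy as np

def mpc_random_shooting_sketch(state, env, dyn_model, cost_fn, horizon=15, num_paths=1000):
    # sample candidate action sequences uniformly from the action space
    acs = np.stack([[env.action_space.sample() for _ in range(horizon)]
                    for _ in range(num_paths)])          # (num_paths, horizon, ac_dim)
    obs = np.tile(state, (num_paths, 1))                 # (num_paths, ob_dim)
    costs = np.zeros(num_paths)
    for t in range(horizon):
        next_obs = dyn_model.predict(obs, acs[:, t, :])  # assumed batched one-step prediction
        costs += cost_fn(obs, acs[:, t, :], next_obs)    # assumed batched per-step cost
        obs = next_obs
    return acs[np.argmin(costs), 0, :]                   # first action of the cheapest rollout
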
Example #25
0
def train_PG(
        exp_name='',
        env_name='ProstheticsEnv',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32,
        test=False):
    start = time.time()

    logz.configure_output_dir(logdir)
    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    params['env_name'] = 'Prosthetic_3D'
    print('params: ', params)
    logz.save_params(params)

    args = inspect.getargspec(train_PG)[0]

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Here env_name is expected to be an already-constructed environment instance
    # (not a string id), so it is used directly instead of calling gym.make
    env = env_name

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.timestep_limit

    # ========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    # ========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    print('observation dim: ', ob_dim)
    print('action dim: ', ac_dim)
    print('action space: ', discrete)
    # print("hellooooooo",ac_dim,env.action_space.shape)
    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    # ========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)

        # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(dtype=tf.float32, shape=[None], name="adv")

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    # ========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(env.action_space.high,
                                 sy_ob_no,
                                 ac_dim,
                                 scope="build_nn",
                                 n_layers=n_layers,
                                 size=size,
                                 activation=tf.nn.relu)
        sy_sampled_ac = tf.one_hot(tf.squeeze(tf.multinomial(sy_logits_na, 1)),
                                   ac_dim)  # Hint: Use the tf.multinomial op
        # batch_size x ac_dim

        sy_logprob_n = tf.nn.softmax_cross_entropy_with_logits(
            labels=sy_ac_na, logits=sy_logits_na)
        # batch_size ---> negative log probability (cross-entropy) of the chosen action

        # Learned from https://github.com/InnerPeace-Wu/
        # # Another way to do it
        # N = tf.shape(sy_ob_no)[0]
        # sy_prob_na = tf.nn.softmax(sy_logits_na)
        # sy_logprob_n = tf.log(tf.gather_nd(sy_prob_na, tf.stack((tf.range(N), sy_ac_na), axis=1)))
    else:
        # YOUR_CODE_HERE
        sy_mean = build_mlp(env.action_space.high,
                            sy_ob_no,
                            ac_dim,
                            scope="build_nn",
                            n_layers=n_layers,
                            size=size,
                            activation=tf.nn.relu)
        sy_logstd = tf.Variable(tf.zeros(ac_dim),
                                name='logstd',
                                dtype=tf.float32)
        sy_std = tf.exp(sy_logstd)
        sy_sampled_ac = sy_mean + tf.multiply(
            sy_std, tf.random_normal(tf.shape(sy_mean)))
        sy_z = (sy_ac_na - sy_mean) / sy_std

        sy_logprob_n = 0.5 * tf.reduce_sum(tf.square(sy_z), axis=1)
        # sy_logprob_n = 0.5*tf.reduce_sum(tf.squared_difference(tf.div(sy_mean,sy_std),
        # tf.div(sy_ac_na,sy_std)))  # Hint: Use the log probability under a multivariate gaussian.

    # ========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    # ========================================================================================#

    # loss = tf.reduce_sum(tf.multiply(tf.nn.softmax_cross_entropy_with_logits_v2(labels=sy_ac_na,logits=sy_logits_na),sy_adv_n)) # Loss function that we'll differentiate to get the policy gradient.
    loss = tf.reduce_sum(tf.multiply(sy_logprob_n, sy_adv_n))
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    # ========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline - Defining Second Graph
    # ========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(
            build_mlp(1,
                      sy_ob_no,
                      1,
                      "nn_baseline",
                      n_layers=n_layers,
                      size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        sy_rew_n = tf.placeholder(shape=[None], name="rew", dtype=tf.int32)
        loss2 = tf.losses.mean_squared_error(labels=sy_rew_n,
                                             predictions=baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            loss2)

    # ========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    # ========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    # pylint: disable=E1101

    network_params = tf.trainable_variables()
    saver = tf.train.Saver(network_params, max_to_keep=1)

    checkpoint_actor_dir = os.path.join(os.curdir, 'PG_MODEL_CONT_TANH')
    if not os.path.exists(checkpoint_actor_dir):
        os.makedirs(checkpoint_actor_dir)
    model_prefix = os.path.join(checkpoint_actor_dir, "model.ckpt")
    ckpt_1 = tf.train.get_checkpoint_state(checkpoint_actor_dir)

    if ckpt_1 and tf.train.checkpoint_exists(ckpt_1.model_checkpoint_path):
        print("Reading actor parameters from %s" %
              ckpt_1.model_checkpoint_path)
        saver.restore(sess, ckpt_1.model_checkpoint_path)

    uninitialized_vars = []
    for var in tf.global_variables():
        try:
            sess.run(var)
        except tf.errors.FailedPreconditionError:
            uninitialized_vars.append(var)

    if len(uninitialized_vars) > 0:
        init_new_vars_op = tf.variables_initializer(uninitialized_vars)
        sess.run(init_new_vars_op)

    # ========================================================================================#
    # Training Loop
    # ========================================================================================#

    total_timesteps = 0
    t = 0

    def testing():
        print('testing the model..')
        ob = env.reset()
        steps = 0
        done = False
        total_r = 0
        one_hot_ac = env.action_space.sample()
        while not done:
            k = np.reshape(np.array(ob), newshape=(-1, len(ob)))
            # print('sampling an action...')
            if steps % 1 == 0:
                one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: k})
            ac = np.reshape(one_hot_ac, newshape=(one_hot_ac.shape[1]))
            # print('getting observations from env ..')
            # ac = np.clip(ac, -1.0, 1.0)
            ob, rew, done, _ = env.step(ac)
            total_r += rew
            env.render()
            steps += 1
            if steps > max_path_length:
                break
        print('steps, rew', steps, total_r)
        return steps, total_r

    test = False
    if test:
        steps, rew = testing()
        return

    exp = False
    if exp:
        print('generating exp data..')
        import pickle as pkl
        paths = []
        timesteps_this_batch = 0
        while True:
            ob = env.reset()
            obs, acs = [], []
            total_r = 0
            while True:
                obs.append(ob)
                k = np.reshape(np.array(ob), newshape=(-1, len(ob)))
                one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: k})
                ac = np.reshape(one_hot_ac, newshape=(one_hot_ac.shape[1]))
                ac = np.clip(ac, 0.0, 1.0)
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                total_r += rew
                if done:
                    done = False
                    break
            path = {
                "observation": np.array(obs[:-15]),
                "action": np.array(acs[:-15])
            }

            if total_r > 50:
                timesteps_this_batch += len(path['action'])
                timesteps_this_batch -= 15
                paths.append(path)

            print(timesteps_this_batch, total_r)
            if timesteps_this_batch > 1000:
                break
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        pkl.dump(ob_no, open('./simulation_0_1/obs_pg.p', 'wb'))
        pkl.dump(ac_na, open('./simulation_0_1/acts_pg.p', 'wb'))
        return

    _, best_rew = testing()
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 30 == 0)
                                    and animate)
            steps = 0
            total_r = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                k = np.reshape(np.array(ob), newshape=(-1, len(ob)))
                # print(k.shape)
                # print('sampling an action...')
                one_hot_ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: k})

                if discrete:
                    ac = int(np.argmax(one_hot_ac))
                else:
                    ac = one_hot_ac

                acs.append(one_hot_ac)
                max_action = env.action_space.high
                ac = np.reshape(ac, newshape=(ac.shape[1]))
                # print('getting observations from env ..')
                ob, rew, done, _ = env.step(
                    ac
                )  # transition dynamics P(s_t+1/s_t,a_t), r(s_t+1/s_t,a_t)
                total_r += rew
                rew = rew * 4
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs)
            }

            if total_r > 0:
                paths.append(path)
                timesteps_this_batch += pathlength(path)
                print(total_r)

            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        ac_na = ac_na.reshape([-1, ac_dim])

        import pickle as pkl
        # pkl.dump(ob_no, open('./simulation_data/obs_'+str(itr)+'.p', 'wb'))
        # pkl.dump(ac_na, open('./simulation_data/act_'+str(itr)+'.p', 'wb'))

        print("hello..", ac_na.shape)
        # ====================================================================================#
        #                           ----------..----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        # ====================================================================================#

        # DYNAMIC PROGRAMMING
        if reward_to_go:
            q_n = list()
            for path in paths:
                pLen = pathlength(path)
                q_p = np.zeros(pLen)
                q_p[pLen - 1] = path['reward'][pLen - 1]
                for t in reversed(range(pLen - 1)):
                    q_p[t] = path['reward'][t] + gamma * q_p[t + 1]
                q_p = np.array(q_p)
                q_n.append(q_p)
        else:
            q_n = list()
            for path in paths:
                pLen = pathlength(path)
                q_p = 0
                for t in range(pLen):
                    q_p = q_p + (gamma**t) * (path['reward'][t])
                q_n.append(q_p * np.ones(pLen))
        q_n = np.concatenate(q_n)
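        # Worked example with rewards [1.0, 1.0, 1.0] and gamma = 0.9:
        #   reward_to_go = True  -> Q = [1 + 0.9 + 0.81, 1 + 0.9, 1] = [2.71, 1.90, 1.00]
        #   reward_to_go = False -> Q = [2.71, 2.71, 2.71]  (full discounted return at every step)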
        # print(q_n.shape)
        # ====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        # ====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            b_n = normalize(b_n, np.mean(q_n), np.std(q_n))
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        # ====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        # ====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            adv_n = normalize(adv_n)

        # ====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        # ====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            # Hint #bl2: fit the baseline to standardized targets (mean 0, std 1),
            # matching the rescaling of its predictions under Hint #bl1 above.
            sess.run(baseline_update_op,
                     feed_dict={
                         sy_ob_no: ob_no,
                         sy_rew_n: normalize(q_n)
                     })

        # ====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        # ====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        t += 1

        for i in range(1):
            print('updating model params..')
            sess.run(update_op,
                     feed_dict={
                         sy_ac_na: ac_na,
                         sy_ob_no: ob_no,
                         sy_adv_n: adv_n
                     })

            _, new_r = testing()
            if new_r > best_rew:
                print('saving model params to, ', model_prefix)
                best_rew = new_r
                saver.save(sess, model_prefix)

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
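
# The `normalize` helper used for the baseline and the advantages above is not
# shown in this example. A minimal sketch of the usual behavior (standardize,
# then rescale to a target mean/std; the epsilon is an assumption):
import numpy as np

def normalize_sketch(x, target_mean=0.0, target_std=1.0, eps=1e-8):
    # shift/scale x to zero mean and unit std, then rescale to the requested statistics
    standardized = (x - np.mean(x)) / (np.std(x) + eps)
    return standardized * target_std + target_mean
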
Example #26
0
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gamma=0.99,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=2e-2,
        reward_to_go=True,
        animate=False,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32,
        activation='Tanh',

        #baseline_network arguments
        bl_learning_rate=1e-3,
        bl_n_layers=1,
        bl_size=32,
        bl_activation='Tanh',
        bl_n_iter=1):
    start = time.time()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    env.seed(seed)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    ''' Do not need in PyTorch
    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    # 
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) 
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) 

    # Define a placeholder for advantages
    sy_adv_n = TODO
    
    '''

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    #========================================================================================#

    def sampling(ob, sy_logstd):

        sy_logit = mlp(ob)

        if discrete:
            # YOUR_CODE_HERE
            sy_probs = F.softmax(sy_logit, dim=-1)
            sy_sampled_ac = torch.multinomial(sy_probs, 1)
        else:
            # YOUR_CODE_HERE
            sy_std = torch.exp(sy_logstd)
            z = torch.randn(sy_logit.size(), device=device)
            sy_sampled_ac = sy_logit + z * sy_std
        return sy_sampled_ac

    '''Loss is defined in last section : "Performing the Policy Update" 
    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    loss = TODO # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) 
    
    '''
    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = build_mlp(ob_dim, 1, bl_n_layers, bl_size,
                                        bl_activation).to(device)
        bl_optimizer = Adam(baseline_prediction.parameters(),
                            lr=bl_learning_rate)

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    mlp = build_mlp(ob_dim, ac_dim, n_layers, size, activation).to(device)
    sy_logstd = nn.Parameter(torch.zeros(1, ac_dim).to(device))
    optimizer = Adam(list(mlp.parameters()) + [sy_logstd], lr=learning_rate)
    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                    and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sampling(torch.FloatTensor(ob).to(device), sy_logstd)
                ac = ac.cpu().detach().numpy()
                # In the discrete case torch.multinomial returns shape [1];
                # continuous actions keep their full [ac_dim] vector.
                ac = int(ac[0]) if discrete else ac
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        returns = [path["reward"].sum() for path in paths]
        average_returns = (np.mean(returns))

        print("average_rewards : ", average_returns)
        print("\n")

        if (env.spec.reward_threshold is not None
                and average_returns > env.spec.reward_threshold):
            print("task solved")

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        #====================================================================================#

        # YOUR_CODE_HERE
        q_n = []
        if reward_to_go:
            for path in paths:
                qs = []
                q = 0
                for reward in reversed(path["reward"]):
                    q = reward + q * gamma
                    qs.append(q)

                q_n = q_n + qs[::-1]
        else:
            for path in paths:
                discounted_reward = [
                    path["reward"][i] * (gamma**i)
                    for i in range(pathlength(path))
                ]
                q_n = q_n + [np.sum(discounted_reward)] * pathlength(path)

        q_n = torch.FloatTensor(q_n).to(device)
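        # Quick sanity check on the two cases above (illustrative numbers, not
        # from the original code): for a path with rewards [1, 1, 1] and
        # gamma = 0.9, reward_to_go=True gives Q = [2.71, 1.9, 1.0], while
        # reward_to_go=False gives Q = [2.71, 2.71, 2.71] (the full discounted
        # return at every timestep).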

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = baseline_prediction(
                Variable(torch.FloatTensor(ob_no)).to(device)).squeeze(1)
            b_n = torch.mean(q_n) + (
                (b_n - torch.mean(b_n)) / torch.std(b_n)) * torch.std(q_n)
            adv_n = q_n - b_n
        else:
            adv_n = q_n.clone()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.

            adv_n = (adv_n - torch.mean(adv_n)) / torch.std(adv_n)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            normalize_q_n = (q_n - torch.mean(q_n)) / torch.std(q_n)

            for i in range(bl_n_iter):
                b_n = baseline_prediction(
                    Variable(torch.FloatTensor(ob_no)).to(device)).squeeze(1)
                bl_loss = F.mse_loss(b_n, normalize_q_n)
                bl_optimizer.zero_grad()
                bl_loss.backward()
                bl_optimizer.step()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        sy_logit = mlp(Variable(torch.FloatTensor(ob_no)).to(device))
        if discrete:
            sy_logprob_n = -F.cross_entropy(
                sy_logit,
                torch.LongTensor(ac_na).to(device),
                reduction='none')
        else:
            # Log probability under a diagonal multivariate Gaussian; the
            # additive constant is dropped, but the log-std term is kept so
            # that gradients also flow into sy_logstd.
            sy_std = torch.exp(sy_logstd)
            sy_logprob_n = -0.5 * torch.sum(
                ((sy_logit - torch.FloatTensor(ac_na).to(device)) / sy_std)**2,
                dim=1) - torch.sum(sy_logstd)

        weighted_negative_likelihoods = sy_logprob_n * adv_n
        loss = -torch.mean(weighted_negative_likelihoods)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
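        # Optional debugging sketch (an assumption, not part of the original
        # code): re-evaluate the surrogate loss on the same batch after the
        # step to see how much the update changed it, e.g.
        #   with torch.no_grad():
        #       logit_after = mlp(torch.FloatTensor(ob_no).to(device))
        #   then rebuild sy_logprob_n and the loss from logit_after and log
        #   both values alongside the diagnostics below.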

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        if itr == 0:
            logz.G.first_row = True
        logz.dump_tabular()
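
A minimal way to drive the function above, shown as a hedged sketch: the argument values and the logdir path are illustrative, not taken from the original repository, and the real code is normally invoked from an argparse-based main.

if __name__ == '__main__':
    # Hypothetical direct invocation of the train_PG defined above.
    train_PG(exp_name='vpg_cartpole',
             env_name='CartPole-v0',
             n_iter=100,
             gamma=0.99,
             min_timesteps_per_batch=1000,
             learning_rate=2e-2,
             reward_to_go=True,
             animate=False,
             logdir='data/vpg_cartpole_seed0',
             normalize_advantages=True,
             nn_baseline=False,
             seed=0,
             n_layers=1,
             size=32)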
Example #27
0
def train_PG(
        exp_name='',
        env_name='CartPole-v0',
        n_iter=100,
        gamma=1.0,
        min_timesteps_per_batch=1000,
        max_path_length=None,
        learning_rate=5e-3,
        reward_to_go=True,
        animate=True,
        logdir=None,
        normalize_advantages=True,
        nn_baseline=False,
        seed=0,
        # network arguments
        n_layers=1,
        size=32,
        bootstrap=False):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]
    print(ob_dim, ac_dim)
    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim],
                              name="ob",
                              dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim],
                                  name="ac",
                                  dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = tf.placeholder(shape=[None], name="advantage", dtype=tf.float32)
    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = build_mlp(sy_ob_no,
                                 ac_dim,
                                 "discrete_mlp",
                                 n_layers=n_layers,
                                 size=size,
                                 activation=tf.nn.relu,
                                 output_activation=None)
        # print sy_logits_na
        sy_logprob_na = tf.nn.log_softmax(sy_logits_na)
        sy_sampled_ac = tf.multinomial(sy_logprob_na,
                                       1)  # Hint: Use the tf.multinomial op
        # print sy_sampled_ac
        batch_n = tf.shape(sy_ob_no)[0]
        act_index = tf.stack([tf.range(0, batch_n), sy_ac_na], axis=1)
        # sy_sampled_ac = tf.gather_nd(sy_sampled_ac,tf.range(0,batch_n))
        # sy_sampled_ac = sy_sampled_ac[0]
        sy_logprob_n = tf.gather_nd(sy_logprob_na, act_index)

    else:
        # YOUR_CODE_HERE
        sy_mean = build_mlp(sy_ob_no,
                            ac_dim,
                            "continuous_mlp",
                            n_layers=2,
                            size=32,
                            activation=tf.nn.relu,
                            output_activation=None)
        # logstd should just be a trainable variable, not a network output.
        sy_logstd = tf.Variable(tf.zeros([ac_dim]), name="logstd")
        # Reparameterization trick: sample = mean + std * z, z ~ N(0, I).
        sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal(
            tf.shape(sy_mean))
        sy_logprob_n = normal_log_prob(
            sy_ac_na, sy_mean, sy_logstd, ac_dim
        )  # Hint: Use the log probability under a multivariate gaussian.
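    # `normal_log_prob` is assumed to be a helper defined elsewhere in this
    # repository; a plausible sketch of what it computes (diagonal Gaussian
    # log-density with the additive constant dropped):
    #   def normal_log_prob(ac_na, mean, logstd, ac_dim):
    #       z = (ac_na - mean) / tf.exp(logstd)
    #       return -0.5 * tf.reduce_sum(tf.square(z), axis=1) - tf.reduce_sum(logstd)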

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    loss = -tf.reduce_mean(
        sy_logprob_n * sy_adv_n
    )  # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(
            build_mlp(sy_ob_no,
                      1,
                      "nn_baseline",
                      n_layers=1,
                      size=32,
                      activation=tf.nn.relu,
                      output_activation=None))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        v_t = tf.placeholder("float", [None])
        l_2 = 0.5 * tf.nn.l2_loss(v_t - baseline_prediction)
        baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
            l_2)

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)

    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  #pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards, obs_2 = [], [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0)
                                    and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                pi = sess.run(sy_logits_na, feed_dict={sy_ob_no: ob[None]})
                # print pi
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                # print ac
                ac = ac[0][0]
                # print ac
                acs.append(ac)
                # print ac
                ob, rew, done, _ = env.step(ac)
                obs_2.append(ob)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    terminated = done
                    break
            path = {
                "observation": np.array(obs),
                "reward": np.array(rewards),
                "action": np.array(acs),
                "obs_next": np.array(obs_2)
            }
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        ob_next_no = np.concatenate([path["obs_next"] for path in paths])
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       entire trajectory (regardless of which time step the Q-value should be for).
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #
        #====================================================================================#

        # q_n = np.zero(q_n.shape)
        # YOUR_CODE_HERE
        if reward_to_go:
            q_n = []
            # for path in paths.reverse():
            #     q_t = 0
            #     r_path = path["reward"].reverse()
            #     path_len = pathlength(r_path)
            #     for r in enumerate(r_path):
            #         q_t = r + gamma*q_t
            #         q_n[i] = q_t
            #         i += 1
            # q_n.reverse()
            if not bootstrap:
                for path in paths:
                    rew_t = path["reward"]
                    return_t = discount(rew_t, gamma)
                    q_n.append(return_t)
            else:
                for path in paths:
                    v_nxt = sess.run(baseline_prediction,
                                     feed_dict={sy_ob_no: path["obs_next"]})
                    q_target = v_nxt + path["reward"]
                    q_n.append(q_target)
            q_n = np.concatenate(q_n)
        else:
            # Every timestep in a trajectory gets the same full discounted
            # return Ret(tau), as described in the comment block above.
            q_n = []
            for path in paths:
                ret_tau = np.sum(
                    [gamma**idx * r for idx, r in enumerate(path["reward"])])
                q_n.append(np.full(pathlength(path), ret_tau))
            q_n = np.concatenate(q_n)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no})
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()
            # print q_n

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            v_target = []
            for path in paths:
                rew_t = path["reward"]
                return_t = discount(rew_t, gamma)
                v_target.append(return_t)
            v_target = np.concatenate(v_target)
            print(v_target.shape)
            for _ in range(40):
                sess.run(baseline_update_op,
                         feed_dict={
                             sy_ob_no: ob_no,
                             v_t: v_target
                         })
        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.

        # YOUR_CODE_HERE
        sess.run(update_op,
                 feed_dict={
                     sy_ob_no: ob_no,
                     sy_ac_na: ac_na,
                     sy_adv_n: adv_n
                 })

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #28
0
    def __init__(self,
                 env_params=None,
                 policy_params=None,
                 num_workers=16,
                 num_deltas=60,
                 deltas_used=60,
                 delta_std=0.003,
                 logdir=None,
                 model_path=None,
                 save_path=None,
                 rollout_length=1000,
                 step_size=0.01,
                 shift='constant zero',
                 params=None,
                 seed=123,
                 eval_num=5):

        logz.configure_output_dir(logdir)
        logz.save_params(params)

        self.timesteps = 0
        self.ob_size = policy_params["ob_dim"]
        self.action_size = policy_params["ac_dim"]
        self.num_deltas = num_deltas
        self.deltas_used = deltas_used
        self.rollout_length = rollout_length
        self.step_size = step_size
        self.delta_std = delta_std
        self.logdir = logdir
        self.model_path = model_path
        self.save_path = save_path
        self.shift = shift
        self.params = params
        self.max_past_avg_reward = float('-inf')
        self.num_episodes_used = float('inf')
        self.eval_num = eval_num
        self.best_score = -np.inf

        # create shared table for storing noise
        print("Creating deltas table.")
        deltas_id = create_shared_noise.remote()
        self.deltas = SharedNoiseTable(ray.get(deltas_id), seed=seed + 3)
        print('Created deltas table.')

        # initialize workers with different random seeds
        print('Initializing workers.')
        self.num_workers = num_workers
        self.workers = [
            Worker.remote(seed + 7 * i,
                          env_params=env_params,
                          policy_params=policy_params,
                          deltas=deltas_id,
                          rollout_length=rollout_length,
                          delta_std=delta_std) for i in range(num_workers)
        ]

        # initialize policy
        if policy_params['type'] == 'linear':
            self.policy = LinearPolicy(policy_params)

        else:
            self.policy = MLPPolicy("policy", policy_params["ob_dim"],
                                    policy_params["ac_dim"],
                                    policy_params["layer_norm"], tf.nn.selu,
                                    policy_params["layer_depth"],
                                    policy_params["layer_width"],
                                    self.save_path)
            # load model
            self.load_model()
        self.w_policy = self.policy.get_weights()

        # initialize optimization algorithm
        # self.optimizer = optimizers.SGD(self.w_policy, self.step_size)
        print("Initialization of ARS complete.")
Example #29
0
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #todo: create Agent
    
    #todo: initilize Agent:

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`



    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = actor.run(ob)
                print("need to type-check action here:(two lines)")
                print(ac)
                print(ac.size())
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            #One episode finishes; perform update here
            finish_episode(actor, actor_optimizer, critic=None, critic_optimizer=None, )
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch



        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
Example #30
0
def setup_logger(logdir, env, locals_):
    # Configure output directory for logging
    logz.configure_output_dir(logdir)
    # Log experiment title based on env
    params = {"exp_name": env.spec.id}
    logz.save_params(params)
def train_SAC(env_name, exp_name, seed, reparametrize, two_qf, old_funct,
              logdir, debug, gpu):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
    }.get(env_name, 0.2)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': 256,
        'discount': 0.99,
        'learning_rate': 1e-3,
        'reparameterize': reparametrize,
        'tau': 0.01,
        'epoch_length': 1000,
        'n_epochs': 500,
        'two_qf': two_qf,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': 1000,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }

    value_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }

    policy_params = {
        'hidden_layer_sizes': (128, 128),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    env = gym.envs.make(env_name)
    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=env.action_space.shape,
        **replay_pool_params)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(name='value_function',
                                      **value_function_params)
    target_value_function = nn.ValueFunction(name='target_value_function',
                                             **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=env.action_space.shape[0],
        reparameterize=algorithm_params['reparameterize'],
        old_funct=old_funct,
        **policy_params)

    sampler.initialize(env, policy, replay_pool)

    algorithm = SAC(**algorithm_params)

    gpu_options = tf.GPUOptions(allow_growth=True, visible_device_list=gpu)
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1,
                               gpu_options=gpu_options)
    with tf.Session(config=tf_config) as sess:

        if debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)

        algorithm.build(env=env,
                        policy=policy,
                        q_function=q_function,
                        q_function2=q_function2,
                        value_function=value_function,
                        target_value_function=target_value_function)

        for epoch in algorithm.train(sampler,
                                     session=sess,
                                     n_epochs=algorithm_params.get(
                                         'n_epochs', 1000)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
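
For reference, a hedged sketch of how train_SAC above might be called directly; every argument value here (including the logdir path, the old_funct flag, and the gpu string) is illustrative rather than taken from the original launcher script.

train_SAC(env_name='HalfCheetah-v2',
          exp_name='sac_halfcheetah',
          seed=0,
          reparametrize=True,
          two_qf=True,
          old_funct=False,
          logdir='data/sac_halfcheetah_seed0',
          debug=False,
          gpu='0')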
Example #32
0
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100, 
             gamma=1.0, 
             min_timesteps_per_batch=1000, 
             max_path_length=None,
             learning_rate=5e-3, 
             reward_to_go=True, 
             animate=True, 
             logdir=None, 
             normalize_advantages=True,
             nn_baseline=False, 
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)
    
    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    # 
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    # 
    # Prefixes and suffixes:
    # ob - observation 
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    # 
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    # 
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) 
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) 

    # Define a placeholder for advantages
    sy_adv_n = TODO
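    # One possible completion (mirroring the filled-in TensorFlow example
    # earlier in this file):
    #   sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)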


    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    # 
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over 
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #
    #          Should have shape [None]
    #
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,         z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken, 
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the 
    #      policy network output ops.
    #   
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = TODO
        sy_sampled_ac = TODO # Hint: Use the tf.multinomial op
        sy_logprob_n = TODO

    else:
        # YOUR_CODE_HERE
        sy_mean = TODO
        sy_logstd = TODO # logstd should just be a trainable variable, not a network output.
        sy_sampled_ac = TODO
        sy_logprob_n = TODO  # Hint: Use the log probability under a multivariate gaussian. 
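    # A hedged sketch of how these ops could be filled in, following the
    # completed TensorFlow example earlier in this file (names are the
    # template's own):
    #   discrete:
    #     sy_logits_na  = build_mlp(sy_ob_no, ac_dim, "policy",
    #                               n_layers=n_layers, size=size)
    #     sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), axis=1)
    #     sy_logprob_n  = -tf.nn.sparse_softmax_cross_entropy_with_logits(
    #                         labels=sy_ac_na, logits=sy_logits_na)
    #   continuous:
    #     sy_mean       = build_mlp(sy_ob_no, ac_dim, "policy",
    #                               n_layers=n_layers, size=size)
    #     sy_logstd     = tf.Variable(tf.zeros([ac_dim]), name="logstd")
    #     sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal(
    #                         tf.shape(sy_mean))
    #     sy_logprob_n  = -0.5 * tf.reduce_sum(
    #                         tf.square((sy_ac_na - sy_mean) / tf.exp(sy_logstd)),
    #                         axis=1) - tf.reduce_sum(sy_logstd)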



    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    loss = TODO # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
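    # One possible completion of the loss, matching the filled-in example
    # earlier in this file:
    #   loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n)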


    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(
                                sy_ob_no, 
                                1, 
                                "nn_baseline",
                                n_layers=n_layers,
                                size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a 
        # neural network baseline. These will be used to fit the neural network baseline. 
        # YOUR_CODE_HERE
        baseline_update_op = TODO
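        # A possible completion (the placeholder name is illustrative): fit the
        # baseline to normalized reward-to-go targets fed in at runtime.
        #   sy_target_n = tf.placeholder(shape=[None], name="baseline_target",
        #                                dtype=tf.float32)
        #   baseline_loss = tf.losses.mean_squared_error(sy_target_n,
        #                                                baseline_prediction)
        #   baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize(
        #       baseline_loss)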


    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) 

    sess = tf.Session(config=tf_config)
    sess.__enter__() # equivalent to `with sess:`
    tf.global_variables_initializer().run() #pylint: disable=E1101



    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************"%itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation" : np.array(obs), 
                    "reward" : np.array(rewards), 
                    "action" : np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating 
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above). 
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )]
        #
        # where 
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t. 
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG 
        #
        #       (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over 
        #       entire trajectory (regardless of which time step the Q-value should be for). 
        #
        #       For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where
        #
        #           Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #
        #       Thus, you should compute
        #
        #           Q_t = Ret(tau)
        #
        #   Case 2: reward-to-go PG 
        #
        #       (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#

        # YOUR_CODE_HERE
        q_n = TODO
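        # A possible completion (reward-to-go vs. full-trajectory returns):
        #   q_n = []
        #   for path in paths:
        #       rews = path["reward"]
        #       if reward_to_go:
        #           q = np.zeros_like(rews, dtype=np.float64)
        #           running = 0.0
        #           for t in reversed(range(len(rews))):
        #               running = rews[t] + gamma * running
        #               q[t] = running
        #       else:
        #           ret = sum(gamma**t * r for t, r in enumerate(rews))
        #           q = np.full(len(rews), ret)
        #       q_n.append(q)
        #   q_n = np.concatenate(q_n)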

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)

            b_n = TODO
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
            # YOUR_CODE_HERE
            pass
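            # e.g. (one possible completion):
            #   adv_n = (adv_n - np.mean(adv_n)) / (np.std(adv_n) + 1e-8)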


        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#
        if nn_baseline:
            # ----------SECTION 5----------
            # If a neural network baseline is used, set up the targets and the inputs for the 
            # baseline. 
            # 
            # Fit it to the current batch in order to use for the next iteration. Use the 
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the 
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)

            # YOUR_CODE_HERE
            pass
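            # e.g. (one possible completion, pairing with the baseline_update_op
            # sketch above; sy_target_n is the illustrative placeholder name):
            #   target_n = (q_n - np.mean(q_n)) / (np.std(q_n) + 1e-8)
            #   sess.run(baseline_update_op,
            #            feed_dict={sy_ob_no: ob_no, sy_target_n: target_n})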

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on 
        # the current batch of rollouts.
        # 
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below. 

        # YOUR_CODE_HERE
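        # e.g. (one possible completion, as in the filled-in example earlier in
        # this file):
        #   sess.run(update_op,
        #            feed_dict={sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n})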


        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()