def runner(env,
           policy_func,
           load_model_path,
           timesteps_per_batch,
           number_trajs,
           stochastic_policy,
           save=False,
           reuse=False):

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    policy = build_policy(env, 'mlp', value_network='copy')
    ob = observation_placeholder(ob_space)
    with tf.variable_scope('pi'):
        pi = policy(observ_placeholder=ob)
    U.initialize()

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state(load_model_path)
    saver.restore(U.get_session(), ckpt.model_checkpoint_path)

    obs_list = []
    acs_list = []
    len_list = []
    ret_list = []
    from tqdm import tqdm
    for _ in tqdm(range(number_trajs)):
        traj = traj_1_generator(pi,
                                env,
                                timesteps_per_batch,
                                stochastic=stochastic_policy)
        obs, acs, ep_len, ep_ret = traj['ob'], traj['ac'], traj['ep_len'], traj['ep_ret']
        obs_list.append(obs)
        acs_list.append(acs)
        len_list.append(ep_len)
        ret_list.append(ep_ret)
    if stochastic_policy:
        print('stochastic policy:')
    else:
        print('deterministic policy:')
    if save:
        filename = load_model_path.split('/')[-1] + '.' + env.spec.id
        np.savez(filename,
                 obs=np.array(obs_list),
                 acs=np.array(acs_list),
                 lens=np.array(len_list),
                 rets=np.array(ret_list))
    avg_len = sum(len_list) / len(len_list)
    avg_ret = sum(ret_list) / len(ret_list)
    # print("Average length:", avg_len)
    # print("Average return:", avg_ret)
    return avg_len, avg_ret
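
# A hedged sketch of reading back the expert trajectories that runner() writes when
# save=True. The filename is illustrative (np.savez appends '.npz' to
# '<checkpoint name>.<env id>'); allow_pickle is needed because episodes of different
# lengths are stored as an object array.
data = np.load('trpo_model.Hopper-v2.npz', allow_pickle=True)
obs_list, acs_list = data['obs'], data['acs']
print('trajectories:', len(obs_list))
print('mean episode return:', data['rets'].mean())
print('mean episode length:', data['lens'].mean())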
Example #2
    def add_all_summary(self, writer, values, iter):
        # Note that the order of the incoming ```values``` should be the same as that of the
        # ```scalar_keys``` given in ```__init__```
        if np.any(np.isnan(values)):
            return
        sess = U.get_session()
        keys = self.scalar_summaries_ph + self.histogram_summaries_ph
        feed_dict = {}
        for k, v in zip(keys, values):
            feed_dict.update({k: v})
        summaries_str = sess.run(self.summaries, feed_dict)
        writer.add_summary(summaries_str, iter)
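
# For context, add_all_summary() assumes an __init__ that builds one placeholder and one
# summary op per key. A minimal sketch under that assumption (hypothetical class and
# attribute layout, TF1 API):
import tensorflow as tf

class Stats(object):
    def __init__(self, scalar_keys=(), histogram_keys=()):
        self.scalar_keys = scalar_keys
        self.scalar_summaries_ph, self.histogram_summaries_ph = [], []
        summaries = []
        with tf.variable_scope('summary'):
            for k in scalar_keys:
                ph = tf.placeholder('float32', None, name=k + '.scalar')
                self.scalar_summaries_ph.append(ph)
                summaries.append(tf.summary.scalar(k, ph))
            for k in histogram_keys:
                ph = tf.placeholder('float32', None, name=k + '.hist')
                self.histogram_summaries_ph.append(ph)
                summaries.append(tf.summary.histogram(k, ph))
        # merged op run by add_all_summary(); values must be fed in the order
        # scalar_keys + histogram_keys, as the comment above notes
        self.summaries = tf.summary.merge(summaries)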
Example #3
def run_eval(config):
    def create_loss():
        test_dataset_args = create_dataset(config,
                                           split="test",
                                           shuffle=True,
                                           repeat=True)
        test_inputs, test_targets, test_lengths, _, _, _ = test_dataset_args

        cell = create_cell(config, test_dataset_args)

        test_ll_per_seq, kl, log_weight, log_ess, trajectories = \
            basic_bounds.iwae(cell, (test_inputs, test_targets), test_lengths, num_samples=config.num_samples)

        test_ll_per_t = tf.reduce_mean(test_ll_per_seq /
                                       tf.to_float(test_lengths))

        return test_ll_per_t, trajectories, cell, log_weight, test_lengths

    def create_graph():
        global_step = tf.train.get_or_create_global_step()
        test_bound, trajectories, cell, log_weight, test_lengths = create_loss()
        return cell, test_bound, global_step, trajectories, test_lengths

    cell, test_bound, global_step, trajectories, test_lengths = create_graph()

    sess = U.get_session()
    U.initialize()
    cur_step = 0

    # saver
    saver = tf.train.Saver(max_to_keep=1)

    assert os.path.exists(config.logdir), 'logdir %s does not exist' % config.logdir
    ckpt = tf.train.get_checkpoint_state(config.logdir + '/valid_best')
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        cur_step = int(ckpt.model_checkpoint_path.split('-')[-1])
        print('Model and log loaded! (checkpoint_path=%s, cur_step=%d)' %
              (ckpt.model_checkpoint_path, cur_step))
    test_bound_value = sess.run([test_bound])

    print("##################################")
    print("VALID_BEST_STEP: %s" % cur_step)
    print("PARTICLE_NUM: %s" % config.num_samples)
    print("TEST_BOUND: %s" % test_bound_value[0])
    print("##################################")
Example #4
def main(_):
    # create visualizer
    #visualizer = TensorboardVisualizer()
    monitor = Monitor(FLAGS)
    #log_dir = monitor.log_dir
    #visualizer.initialize(log_dir, None)
    saved_mean_reward = None
    # openAI logger
    L.configure(monitor.log_dir, format_strs=['stdout', 'csv'])

    # initialize env
    atari_env = AtariEnv(monitor)
    #screen_shot_subgoal(atari_env)

    # we should probably follow deepmind style env
    # stack 4 frames and scale float
    env = wrapper.wrap_deepmind(atari_env, frame_stack=True, scale=True)

    # get default tf_session
    sess = U.get_session()

    # create q networks for controller
    controller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    controller_network = Q_network(env.observation_space, env.action_space.n, controller_optimizer, scope='controller')
    controller = Controller(controller_network, env.action_space.n)

    # create q networks for meta-controller
    num_goals = env.unwrapped.goals_space.n
    metacontroller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    metacontroller_network = Q_network(env.observation_space, num_goals, metacontroller_optimizer, scope='metacontroller')
    metacontroller = MetaController(metacontroller_network, num_goals)
    # Create the schedule for exploration starting from 1.
    exploration2 = LinearSchedule(schedule_timesteps=int(EXPLORATION_FRACTION * monitor.num_timesteps),
                                  initial_p=1.0,
                                  final_p=EXPLORATION_FINAL_EPS)
    # initialize experience replay
    controller_replay_buffer = ReplayBuffer(D1_MEMORY_SIZE)
    metacontroller_replay_buffer = ReplayBuffer(D2_MEMORY_SIZE)
    
    # initialize critic
    critic = Critic(env.unwrapped)

    total_extrinsic_reward = []
    # for success rate
    total_goal_reached = np.zeros(num_goals, dtype=np.int32) 
    total_goal_sampled = np.zeros(num_goals, dtype=np.int32)
    total_goal_epsilon = np.ones(num_goals, dtype=np.float32)
    ep = 0
    total_step = 0
    init_ob = env.reset()

    U.initialize()
    # initialize target network in both controller and meta
    sess.run(metacontroller.network.update_target_op)
    sess.run(controller.network.update_target_op)

    # load ckpt if presence 
    model_path = tf.train.latest_checkpoint(monitor.ckpt_dir)
    model_saved = False
    model_file = os.path.join(monitor.ckpt_dir, 'model')
    if model_path is not None:
        U.load_variables(model_file)
        L.log('loaded model from %s' % model_file)
        model_saved = True

    while ep < MAX_EPISODE: # loop until MAX_EPISODE episodes have been played
        # init environment game play variables
        
        init_ob = env.reset()
        observation = np.reshape(init_ob['observation'], (1, )+init_ob['observation'].shape)
        desired_goal = metacontroller.sample_act(sess, observation, update_eps=1.0)[0]
        env.unwrapped.desired_goal = desired_goal
        total_goal_sampled[desired_goal] += 1

        # given predicted goal, we encode this goal bounding mask to the observation np array
        ob_with_g = env.unwrapped._add_goal_mask(init_ob['observation'], desired_goal)

        # NOTE: Below code verify added mask correctly
        # for i in range(ob_with_g.shape[-1]):
        #     ob = ob_with_g[:,:,i]
        #     image = Image.fromarray(ob)
        #     image = image.convert('RGB')
        #     image.save('test_%i.png' % i)

        done = False
        reached_goal = False

        while not done:
            extrinsic_rewards = 0
            s0 = init_ob['observation']

            while not (done or reached_goal):
                update_eps1_with_respect_to_g = get_epsilon(total_goal_epsilon, total_goal_reached, total_goal_sampled, desired_goal, total_step, EXPLORATION_WARM_UP)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1, )+ob_with_g.shape)
                primitive_action_t = controller.sample_act(sess, ob_with_g_reshaped, update_eps=update_eps1_with_respect_to_g)[0]
                # obtain extrinsic reward from environment
                ob_tp1, extrinsic_reward_t, done_t, info = env.step(primitive_action_t)
                reached_goal = env.unwrapped.reached_goal(desired_goal)
                ob_with_g_tp1 = env.unwrapped._add_goal_mask(ob_tp1['observation'], desired_goal)
                
                intrinsic_reward_t = critic.criticize(desired_goal, reached_goal, primitive_action_t, done_t)
                controller_replay_buffer.add(ob_with_g, primitive_action_t, intrinsic_reward_t, ob_with_g_tp1, done_t)
                
                # sample from replay_buffer1 to train controller
                obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t = controller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                weights, batch_idxes = np.ones_like(intrinsic_rewards_t), None
                # get q estimate for tp1 as 'supervised'
                ob_with_g_tp1_reshaped = np.reshape(ob_with_g_tp1, (1, )+ob_with_g.shape)
                q_tp1 = controller.get_q(sess, ob_with_g_tp1_reshaped)[0]
                td_error = controller.train(sess, obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t, weights, q_tp1)
                # joint training: after warm-up, also sample from replay_buffer2 to train the meta-controller
                if total_step >= WARMUP_STEPS:
                    L.log('joint training has started ----- step %d' % total_step)
                    # sample from replay_buffer2 to train meta-controller
                    init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t = metacontroller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                    weights, batch_idxes = np.ones_like(extrinsic_rewards_t), None
                    # get q estimate for tp1 as 'supervised'
                    obs_terminate_in_g_reshaped = np.reshape(obs_terminate_in_g, (1, )+obs_terminate_in_g.shape)
                    q_tp1 = metacontroller.get_q(sess, obs_terminate_in_g_reshaped)[0]
                    td_error = metacontroller.train(sess, init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t, weights, q_tp1)

                if total_step % UPDATE_TARGET_NETWORK_FREQ == 0:
                    #L.log('UPDATE BOTH CONTROLLER Q NETWORKS ----- step %d', step)
                    sess.run(controller.network.update_target_op)
                    # it's fine; we aren't really training the meta DQN until after the warm-up steps.
                    sess.run(metacontroller.network.update_target_op)

                extrinsic_rewards += extrinsic_reward_t
                ob_with_g = ob_with_g_tp1
                done = done_t
                total_step += 1
            # we are done / reached_goal
            # store transitions of init_ob, goal, all the extrinsic rewards, current ob in D2
            # print("ep %d : step %d, goal extrinsic total %d" % (ep, step, extrinsic_rewards))
            # clean observation without goal encoded
            metacontroller_replay_buffer.add(init_ob['observation'], desired_goal, extrinsic_rewards, ob_tp1['observation'], done)

            # if we are here then we have finished the desired goal
            if not done:
                #print("ep %d : goal %d reached, not yet done, extrinsic %d" % (ep, desired_goal, extrinsic_rewards))
                exploration_ep = 1.0
                total_goal_reached[env.unwrapped.achieved_goal] += 1
                if total_step >= WARMUP_STEPS:
                    t = total_step - WARMUP_STEPS
                    exploration_ep = exploration2.value(t)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1, )+ob_with_g.shape)
                
                while env.unwrapped.achieved_goal == desired_goal:
                    desired_goal = metacontroller.sample_act(sess, ob_with_g_reshaped, update_eps=exploration_ep)[0]

                env.unwrapped.desired_goal = desired_goal
                total_goal_sampled[desired_goal] += 1
                L.log('ep %d : achieved goal was %d ----- new goal --- %d' % (ep, env.unwrapped.achieved_goal, desired_goal))

                # start again
                reached_goal = False
        
        # finish an episode
        total_extrinsic_reward.append(extrinsic_rewards)
        ep += 1

        mean_100ep_reward = round(np.mean(total_extrinsic_reward[-101:-1]), 1)
        if ep % monitor.print_freq == 0 :
            L.record_tabular("steps", total_step)
            L.record_tabular("episodes", ep)
            L.record_tabular("mean 100 episode reward", mean_100ep_reward)
            L.dump_tabular()

        if total_step % monitor.ckpt_freq == 0:
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                L.log("Saving model due to mean reward increase: {} -> {}".format(
                    saved_mean_reward, mean_100ep_reward))
                U.save_variables(model_file)
                model_saved = True
                saved_mean_reward = mean_100ep_reward
    
    # if training improved the mean reward, restore the best saved weights
    if model_saved:
        L.log('restored model with mean reward: %.1f' % saved_mean_reward)
        U.load_variables(model_file)
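
# The Critic above supplies the controller's intrinsic reward. A hypothetical stand-in
# consistent with how criticize() is called in the loop (the project's actual Critic
# may implement richer logic):
class SimpleCritic(object):
    def __init__(self, env):
        self.env = env

    def criticize(self, desired_goal, reached_goal, action, done):
        # h-DQN-style intrinsic reward: 1 when the meta-controller's goal was reached
        return 1.0 if reached_goal else 0.0
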
def learn(
        *,
        network,
        env,
        eval_policy,
        total_timesteps,
        timesteps_per_batch=1024,  # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        max_episodes=0,
        max_iters=0,  # time constraint
        callback=None,
        load_path=None,
        checkpoint_path_in=None,
        checkpoint_dir_out=None,
        checkpoint_freq=100,  # in iterations
        from_iter=0,
        eval_episodes=20,
        **network_kwargs):
    '''
    learn a policy function with TRPO algorithm

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments, or wrapped via a baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for the adam optimizer used to optimize the value function loss

    vf_iters                number of value function optimization iterations per policy optimization step

    total_timesteps           max number of timesteps

    max_episodes            max number of episodes

    max_iters               maximum number of policy optimization iterations

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()

    cpus_per_worker = 1
    U.get_session(
        config=tf.ConfigProto(allow_soft_placement=True,
                              inter_op_parallelism_threads=cpus_per_worker,
                              intra_op_parallelism_threads=cpus_per_worker))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)

    set_global_seeds(seed)

    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    # ob_space = Box(low=-np.inf, high=np.inf, shape=(env.observation_space.n,))
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)
    with tf.variable_scope("pi"):
        pi = policy(observ_placeholder=ob)
    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)
    # Loading checkpoint
    if checkpoint_path_in is not None and os.path.isfile(checkpoint_path_in):
        pi.load(checkpoint_path_in)
        logger.log('Loaded policy weights from %s' % checkpoint_path_in)

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)
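    # fvp is the Fisher-vector product: the gradient of <dKL/dtheta, flat_tangent>,
    # i.e. the Hessian of the mean KL applied to the tangent vector, which the
    # conjugate-gradient solver below uses instead of forming the full matrix.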

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    # s = env.reset()
    # start = time.time()
    # for i in range(10000):
    #     pi.step(s, stochastic=True)
    # duration = time.time() - start
    # print(duration)
    # return
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True,
                                     gamma=gamma)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    iters_eval = 0
    all_logs = []
    best_rew = -np.inf

    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    online_scores = []
    offline_scores = []
    if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0:
        # nothing to be done
        return pi

    assert sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) < 2, \
        'out of max_iters, total_timesteps, and max_episodes only one should be specified'

    while True:
        if callback: callback(locals(), globals())
        if total_timesteps and timesteps_so_far >= total_timesteps:
            break
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        logger.log("********** Iteration %i ************" % iters_so_far)

        if iters_so_far % checkpoint_freq == 0 and checkpoint_dir_out is not None:
            if not os.path.exists(checkpoint_dir_out):
                os.makedirs(checkpoint_dir_out)
            pi.save(
                os.path.join(checkpoint_dir_out,
                             'checkpoint_%d' % iters_so_far))
            logger.log('Saved policy weights as %s' % os.path.join(
                checkpoint_dir_out, 'checkpoint_%d.npy' % iters_so_far))

            def pi_wrapper(ob):
                ac, vpred, _, _ = pi.step(ob, stochastic=True)
                return ac

            rew, _, logs, disc_rets, num_stops, avg_damages = eval_policy(
                pi=pi_wrapper, n_episodes=eval_episodes, verbose=True)
            offline_scores.append(
                [np.mean(disc_rets),
                 np.mean(num_stops),
                 np.mean(avg_damages)])
            np.save(os.path.join(checkpoint_dir_out, 'offline_scores.npy'),
                    offline_scores)
            for log in logs:
                log['iter'] = iters_eval
            all_logs = all_logs + logs

            iters_eval += 1

        with timed("sampling"):
            seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before udpate

        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

        if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
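            # Fisher-vector product with damping: returns H p + cg_damping * p, where H
            # is the Hessian of the mean KL (the Fisher matrix at the old policy).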
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
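            # scale so that 0.5 * fullstep^T H fullstep = max_kl, i.e. the proposed step
            # reaches the trust-region boundary under the quadratic KL approximation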
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        for (lossname, lossval) in zip(loss_names, meanlosses):
            logger.record_tabular(lossname, lossval)

        with timed("vf"):

            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        logger.record_tabular("ev_tdlam_before",
                              explained_variance(vpredbefore, tdlamret))

        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)

        logger.record_tabular("EpLenMean", np.mean(lenbuffer))
        logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1

        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)

        ep_rew_mean = np.mean(rewbuffer)
        online_scores.append(ep_rew_mean)
        if checkpoint_dir_out is not None:
            np.save(os.path.join(checkpoint_dir_out, 'online_scores.npy'),
                    online_scores)
        # Saving best
        if iters_so_far % checkpoint_freq == 0 and ep_rew_mean > best_rew and checkpoint_dir_out is not None:
            pi.save(os.path.join(checkpoint_dir_out, 'best'))
            best_rew = ep_rew_mean
            logger.log('Saved policy weights as %s' %
                       os.path.join(checkpoint_dir_out, 'best.npy'))

        if rank == 0:
            logger.dump_tabular()

    return pi
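
# A hedged usage sketch for learn(). The environment, the eval_policy stub, and the
# hyperparameters are illustrative only; the project's real eval_policy presumably runs
# proper evaluation episodes and returns meaningful statistics.
import gym

def dummy_eval_policy(pi, n_episodes, verbose=False):
    # stub returning the 6-tuple learn() unpacks:
    # (rew, _, logs, disc_rets, num_stops, avg_damages)
    logs = [{} for _ in range(n_episodes)]
    return 0.0, None, logs, [0.0] * n_episodes, [0] * n_episodes, [0.0] * n_episodes

env = gym.make('CartPole-v1')
pi = learn(network='mlp', env=env, eval_policy=dummy_eval_policy,
           total_timesteps=20000, timesteps_per_batch=512,
           checkpoint_dir_out='./trpo_checkpoints', eval_episodes=2)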
Example #6
def run(config):
    def create_loss():
        train_dataset_args = create_dataset(config,
                                            split="train",
                                            shuffle=True,
                                            repeat=True)
        test_dataset_args = create_dataset(config,
                                           split="test",
                                           shuffle=True,
                                           repeat=True)
        valid_dataset_args = create_dataset(config,
                                            split="valid",
                                            shuffle=True,
                                            repeat=True)
        inputs, targets, lengths, params, _, _ = train_dataset_args
        test_inputs, test_targets, test_lengths, _, _, _ = test_dataset_args
        valid_inputs, valid_targets, valid_lengths, _, _, _ = valid_dataset_args

        cell = create_cell(config, train_dataset_args)

        if config.bound == "iwae":
            ll_per_seq, kl, log_weight, log_ess, trajectories = \
                basic_bounds.iwae(cell, (inputs, targets), lengths, num_samples=config.num_samples)
        else:
            raise ValueError("Undefined bound %s" % config.bound)

        if config.test_bound == "iwae":
            valid_ll_per_seq, _, _, _, _ = \
                basic_bounds.iwae(cell, (valid_inputs, valid_targets), valid_lengths, num_samples=config.test_num_samples)
        else:
            raise ValueError("Undefined bound %s" % config.test_bound)

        ll_per_t = tf.reduce_mean(ll_per_seq / tf.to_float(lengths))
        valid_ll_per_t = tf.reduce_mean(valid_ll_per_seq /
                                        tf.to_float(valid_lengths))

        return cell, ll_per_t, valid_ll_per_t, trajectories, lengths

    def create_graph():
        global_step = tf.train.get_or_create_global_step()
        cell, bound, valid_bound, trajectories, lengths = create_loss()
        loss = -bound
        opt = tf.train.AdamOptimizer(config.learning_rate)
        if config.model_train:
            grad_theta = opt.compute_gradients(loss,
                                               var_list=tf.trainable_variables(
                                                   "%s/theta" % config.cell))
            train_op_theta = opt.apply_gradients(grad_theta,
                                                 global_step=global_step)
        else:
            train_op_theta = tf.constant(1)
        if config.algorithm == 'reparam':
            grad_phi = opt.compute_gradients(
                loss, var_list=tf.trainable_variables('prop_phi'))
            train_op_phi = opt.apply_gradients(grad_phi,
                                               global_step=global_step)
        else:
            train_op_phi = tf.constant(1)
        return cell, bound, valid_bound, trajectories, lengths, train_op_theta, train_op_phi, global_step

    valid_best = -1000000

    cell, bound, valid_bound, trajectories, lengths, train_op_theta, train_op_phi, global_step = create_graph()
    sess = U.get_session()
    U.initialize()
    cur_step = 0

    saver = tf.train.Saver(max_to_keep=1)
    valid_saver = tf.train.Saver(max_to_keep=1)
    model_savepath = config.logdir + '/model.ckpt'
    valid_best_model_savepath = config.logdir + '/valid_best/valid_best_model.ckpt'

    if not os.path.exists(config.logdir):
        os.makedirs(config.logdir)
        os.makedirs(config.logdir + '/valid_best')
    ckpt = tf.train.get_checkpoint_state(config.logdir)

    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        valid_saver.restore(sess, ckpt.model_checkpoint_path)
        cur_step = int(ckpt.model_checkpoint_path.split('-')[-1])
        print('Model and log loaded! (checkpoint_path=%s, cur_step=%d)' %
              (ckpt.model_checkpoint_path, cur_step))

    while cur_step < config.max_iter + 1:
        if config.algorithm == 'reparam':
            _, _, bound_value, valid_bound_value = sess.run(
                [train_op_theta, train_op_phi, bound, valid_bound])
        elif "reinforce" in config.algorithm or "vimco" in config.algorithm or "vifle" in config.algorithm or "fr" in config.algorithm:
            _, bound_value, raw_seg, valid_bound_value, run_lengths = sess.run(
                [train_op_theta, bound, trajectories, valid_bound, lengths])
            cell.prop_update.update(raw_seg, run_lengths)
        else:
            raise ValueError("Undefined algorithm %s" % config.algorithm)

        if valid_bound_value > valid_best and cur_step > config.init_steps:
            valid_best = valid_bound_value
            valid_best_model_saved_path = valid_saver.save(
                sess, valid_best_model_savepath, global_step=cur_step)
            print('Model saved: %s' % valid_best_model_saved_path)
        # for save - current work
        if cur_step % config.save_every == 0:
            model_saved_path = saver.save(sess,
                                          model_savepath,
                                          global_step=cur_step)
            print('Model saved: %s' % model_saved_path)
        cur_step += 1
def main():
    L.configure('/home/metalabadmin/exp/freeway',
                format_strs=['stdout', 'csv', 'tensorboard'])
    env = gym.make('Freeway-v0')
    env = wrapper.wrap_deepmind(env, frame_stack=True, scale=True)

    optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)
    network = Q_network(env.observation_space,
                        env.action_space.n,
                        optimizer,
                        gamma=0.99,
                        scope='freeway')
    m_controller = MetaController(network, env.action_space.n)
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(0.1 * 1e7),
                                 initial_p=1.0,
                                 final_p=0.02)
    replay = ReplayBuffer(50000)
    # get default tf_session
    sess = U.get_session()
    U.initialize()
    sess.run(m_controller.network.update_target_op)
    step = 0
    episodes = 0
    rewards = 0
    mean_100ep_reward = 0
    total_reward = []
    saved_mean_reward = None
    ob = env.reset()

    while step <= 1e7:
        ep = exploration.value(step)
        ob_reshaped = np.reshape(ob, (1, ) + env.observation_space.shape)
        act = m_controller.sample_act(sess, ob_reshaped, update_eps=ep)[0]
        ob_tp1, reward_t, done_t, info = env.step(act)
        env.render()
        rewards += reward_t
        replay.add(ob, act, reward_t, ob_tp1, float(done_t))
        ob = ob_tp1

        # train every 4 steps
        if step >= 1000 and step % 4 == 0:
            obs, acts, rewards_t, obs_tp1, dones_t = replay.sample(64)
            weights, batch_idxes = np.ones_like(rewards_t), None
            # get q estimate for tp1 as 'supervised'
            obs_tp1_reshaped = np.reshape(obs_tp1,
                                          (64, ) + env.observation_space.shape)
            q_tp1 = m_controller.get_q(sess, obs_tp1_reshaped)[0]
            td_error = m_controller.train(sess, obs, acts, rewards_t, obs_tp1,
                                          dones_t, weights, q_tp1)

        step += 1

        if step >= 1000 and step % 1000 == 0:
            sess.run(m_controller.network.update_target_op)

        if done_t:
            ob = env.reset()
            total_reward.append(rewards)
            episodes += 1
            rewards = 0
            print('step %d done %s, ep %.2f' % (step, str(done_t), ep))
            mean_100ep_reward = round(np.mean(total_reward[-101:-1]), 1)
            if episodes % 10 == 0 and episodes != 0:
                print('date time %s' % str(datetime.now()))
                L.record_tabular("steps", step)
                L.record_tabular("episodes", episodes)
                L.record_tabular("mean 100 episode reward", mean_100ep_reward)
                L.dump_tabular()

        if step % 1000 == 0:
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                L.log("Saving model due to mean reward increase: {} -> {}".
                      format(saved_mean_reward, mean_100ep_reward))
                U.save_variables('./freewaymodel.ckpt')
                model_saved = True
                saved_mean_reward = mean_100ep_reward
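
# For reference, the LinearSchedule used above decays epsilon linearly from initial_p to
# final_p over schedule_timesteps steps and then holds final_p (assuming the
# baselines.common.schedules.LinearSchedule implementation):
from baselines.common.schedules import LinearSchedule

sched = LinearSchedule(schedule_timesteps=int(0.1 * 1e7), initial_p=1.0, final_p=0.02)
print(sched.value(0))           # 1.0 at the start
print(sched.value(500000))      # 0.51, halfway through the decay window
print(sched.value(5000000))     # 0.02, clamped once the window has passed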