Example #1
def run(args):
    logger.configure(
        f'logs/{args["dataset"]}/pam/{datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")}'
    )
    logger.info(args)

    pool = mp.Pool(mp.cpu_count())
    pam_arg = args.copy()

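    # If no margin is given, evaluate every candidate in MARGINS in parallel and
    # keep the one with the highest mean accuracy.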
    if 'margin' not in pam_arg.keys():
        best_margin = pool.map(find_best_margin, make_arg_list(pam_arg))
        best_margin = np.mean(best_margin, 0)
        if 'verbose' in pam_arg.keys() and pam_arg['verbose']:
            for i in range(len(best_margin)):
                logger.record_tabular(f'[PAM] margin = {MARGINS[i]}',
                                      best_margin[i])
            logger.dump_tabular()
        best_margin = MARGINS[best_margin.argmax()]
        logger.record_tabular('[PAM] best margin', best_margin)
        pam_arg['margin'] = best_margin

    results_pam = pool.map(run_pam, make_arg_list(pam_arg))

    logger.record_tabular('[PAM] accuracy mean', np.mean(results_pam))
    logger.record_tabular('[PAM] accuracy max', np.max(results_pam))
    logger.record_tabular('[PAM] accuracy min', np.min(results_pam))
    logger.record_tabular('[PAM] accuracy std', np.std(results_pam))
    logger.dump_tabular()
Example #2
def main():
    parser = atari_arg_parser()
    parser.add_argument('--policy',
                        help='Policy architecture',
                        choices=['cnn', 'lstm', 'lnlstm'],
                        default='cnn')

    parser.add_argument('--lrschedule',
                        help='Learning rate schedule',
                        choices=['constant', 'linear'],
                        default='constant')

    args = parser.parse_args()
    logger.configure()

    if args.policy == 'cnn':
        policy_fn = Convnet
    elif args.policy == 'lstm':
        policy_fn = Lstm
    elif args.policy == 'lnlstm':
        policy_fn = LnLstm

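    # 16 parallel Atari environments with the usual 4-frame observation stack.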
    num_env = 16
    env = VecFrameStack(make_atari_env(args.env, num_env, args.seed), 4)
    fit(policy_fn,
        env,
        args.seed,
        total_timesteps=int(args.num_timesteps * 1.1),
        lrschedule=args.lrschedule)
    env.close()
Example #3
def configure_log_info(env_name, seed):
    """ Configure Log Information """
    cwd = os.path.join(os.getcwd(), 'log')
    now = datetime.utcnow().strftime("%b-%d_%H:%M:%S")  # create unique log file
    run_name = "{0}-{1}-{2}".format(env_name, seed, now)
    cwd = os.path.join(cwd, run_name)
    logger.configure(dir=cwd)
Example #4
def main():
    parser = atari_arg_parser()
    parser.add_argument('--policy', help='Policy architecture',
                        choices=['cnn', 'lstm', 'lnlstm'], default='cnn')
    parser.add_argument('--lrschedule', help='Learning rate schedule',
                        choices=['constant', 'linear'], default='constant')
    parser.add_argument('--logdir', help='Directory for logging')
    args = parser.parse_args()
    logger.configure(args.logdir)

    num_cpu = 16
    env = make_atari_env(args.env, num_cpu, args.seed)
    if args.policy == 'cnn':
        policy_fn = AcerConvnet
    elif args.policy == 'lstm':
        policy_fn = AcerLstm
    else:
        print("Policy {} not implemented".format(args.policy))
        return
    fit(
        policy_fn,
        env,
        args.seed,
        total_timesteps=int(args.num_timesteps * 1.1),
        lrschedule=args.lrschedule
    )
    env.close()
Example #5
def run(args):
    logger.configure(
        f'logs/{args["dataset"]}/svm/{datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")}'
    )
    logger.info(args)

    pool = mp.Pool(mp.cpu_count())
    svm_arg = args.copy()

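    # If no class weight C1 is given, evaluate every candidate in CLASS_WEIGHTS in
    # parallel and keep the one with the highest mean accuracy.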
    if 'C1' not in svm_arg.keys():
        best_c1 = pool.map(find_best_c1, make_arg_list(svm_arg))
        best_c1 = np.mean(best_c1, 0)
        if 'verbose' in svm_arg.keys() and svm_arg['verbose']:
            for i in range(len(best_c1)):
                logger.record_tabular(f'[C-SVM] C1 = {CLASS_WEIGHTS[i]}',
                                      best_c1[i])
            logger.dump_tabular()
        best_c1 = CLASS_WEIGHTS[best_c1.argmax()]
        logger.record_tabular('[C-SVM] best C1', best_c1)
        svm_arg['C1'] = best_c1

    results_svm = pool.map(run_c_svm, make_arg_list(svm_arg))

    logger.record_tabular('[C-SVM] accuracy mean', np.mean(results_svm))
    logger.record_tabular('[C-SVM] accuracy max', np.max(results_svm))
    logger.record_tabular('[C-SVM] accuracy min', np.min(results_svm))
    logger.record_tabular('[C-SVM] accuracy std', np.std(results_svm))
    logger.dump_tabular()
Example #6
def main(_):
    # Configure parameters for each run and perform training/testing
    if params:
        for key, values in params.items():
            for param in values:
                try:
                    # Set parameter
                    setattr(FLAGS, key, param)

                    # Set log directory
                    log_dir = DIR_LOG + '_' + ALGO + '_' + key + '_' + str(param) + '_' + FLAGS.level_name

                    # Configure loggers
                    logger.configure(dir=log_dir)
                    setattr(FLAGS, 'logdir', log_dir)
                    write(FLAGS.logdir)

                    # Run algorithm
                    with Xvfb(width=1400, height=900, colordepth=24) as xvfb:
                        run()
                except Exception as e:
                    print(e)
                    pass
    else:
        run()
Example #7
def make_mujoco_env(env_id, seed):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed + 10000 * rank)
    env = gym.make(env_id)
    logger.configure()
    env = Monitor(env, os.path.join(logger.get_dir(), str(rank)))
    env.seed(seed)
    return env
Example #8
def __init__(self, encoder, decoder, datasets, optimizer, logdir):
    self.encoder = encoder
    self.decoder = decoder
    self.datasets = datasets
    self.optimizer = optimizer
    self._create_datasets()
    self._create_loss()
    self._create_optimizer()
    self._create_summary()
    self._create_evaluation(encoder, decoder)
    self._create_session(logdir)
    logger.configure(logdir, format_strs=['stdout', 'log'])
Example #9
def main():
    args = pybullet_arg_parser().parse_args()
    logger.configure(
        format_strs=['stdout', 'log', 'csv'],
        log_suffix="EAC-{}-Seed_{}-sr_{}-se_{}-nbt_{}-START-".format(
            args.env, args.seed, args.scale_reward, args.scale_entropy, args.num_of_train))
    logger.log("Algorithm: EAC")
    logger.log("Environment: {}".format(args.env))
    logger.log("Seed: {}".format(args.seed))
    logger.log("scale-reward: {}".format(args.scale_reward))
    logger.log("numberOfTrain: {}".format(args.num_of_train))
    run_experiment(env=args.env, seed=args.seed, scale_reward=args.scale_reward,
                   scale_entropy=args.scale_entropy, num_of_train=args.num_of_train)
Example #10
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID',
                        default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default=None)

    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = Monitor(env, logger.get_dir())
    env = wrap_deepmind(env)
    model = cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )

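    # Train the Q-network; prioritized replay and the dueling head are toggled
    # from the command line.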
    fit(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_path=args.checkpoint_path,
    )

    env.close()
    sess = tf.get_default_session()
    del sess
Example #11
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import ppo1.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    def policy_fn(name, ob_space, ac_space):  #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=int(num_timesteps * 1.1),
                        timesteps_per_actorbatch=256,
                        clip_param=0.2,
                        entcoeff=0.01,
                        optim_epochs=4,
                        optim_stepsize=1e-3,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear')
    env.close()
Example #12
def main():
    parser = arg_parser()
    parser.add_argument('--platform', help='environment choice',
                        choices=['atari', 'mujoco', 'humanoid', 'robotics'],
                        default='atari')
    platform_args, environ_args = parser.parse_known_args()
    platform = platform_args.platform
    logger.configure()

    # atari
    if platform == 'atari':
        parser = atari_arg_parser()
        parser.add_argument('--policy', help='Policy architecture',
                            choices=['cnn', 'lstm', 'lnlstm', 'mlp'],
                            default='cnn')
        args = parser.parse_known_args()[0]

        # fit(
        #     args.env,
        #     num_timesteps=args.num_timesteps,
        #     seed=args.seed,
        #     policy=args.policy
        # )
        sess = Agent().init_session().__enter__()
        env = VecFrameStack(make_atari_env(args.env, 8, args.seed), 4)
        policy = {'cnn' : Convnet,
                  'lstm' : Lstm,
                  'lnlstm' : LnLstm,
                  'mlp': Mlp}[args.policy]

        fit(
            policy=policy,
            env=env,
            nsteps=128,
            nminibatches=8,
            lam=0.95,
            gamma=0.99,
            noptepochs=4,
            log_interval=1,
            ent_coef=.01,
            lr=lambda f: f * 2.5e-4,
            cliprange=lambda f: f * 0.1,
            total_timesteps=int(args.num_timesteps * 1.1)
        )

        sess.close()
        env.close()
        del sess

    # mujoco
    if platform == 'mujoco':
        args = mujoco_arg_parser().parse_known_args()[0]

        sess = Agent().init_session().__enter__()
        from utils.monitor import Monitor

        def make_env():
            env = make_mujoco_env(args.env, args.seed)
            # env = gym.make(env_id)
            env = Monitor(env, logger.get_dir(), allow_early_resets=True)
            return env

        env = DummyVecEnv([make_env])
        env = VecNormalize(env)

        model = fit(
            policy=Mlp,
            env=env,
            nsteps=2048,
            nminibatches=32,
            lam=0.95,
            gamma=0.99,
            noptepochs=10,
            log_interval=1,
            ent_coef=0.0,
            lr=3e-4,
            cliprange=0.2,
            total_timesteps=args.num_timesteps
        )

        # return model, env

        if args.play:
            logger.log("Running trained model")
            obs = np.zeros((env.num_envs,) + env.observation_space.shape)
            obs[:] = env.reset()
            while True:
                actions = model.step(obs)[0]
                obs[:]  = env.step(actions)[0]
                env.render()

        sess.close()
        env.close()
        del sess
Example #13
def main(_):
    # create visualizer
    #visualizer = TensorboardVisualizer()
    monitor = Monitor(FLAGS)
    #log_dir = monitor.log_dir
    #visualizer.initialize(log_dir, None)
    saved_mean_reward = None
    # openAI logger
    L.configure(monitor.log_dir, format_strs=['stdout', 'csv'])

    # initialize env
    atari_env = AtariEnv(monitor)
    #screen_shot_subgoal(atari_env)

    # we should probably follow deepmind style env
    # stack 4 frames and scale float
    env = wrapper.wrap_deepmind(atari_env, frame_stack=True, scale=True)

    # get default tf_session
    sess = U.get_session()

    # create q networks for controller
    controller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    controller_network = Q_network(env.observation_space, env.action_space.n, controller_optimizer, scope='controller')
    controller = Controller(controller_network, env.action_space.n)

    # create q networks for meta-controller
    num_goals = env.unwrapped.goals_space.n
    metacontroller_optimizer = tf.train.AdamOptimizer(learning_rate=LEARNING_RATE)
    metacontroller_network = Q_network(env.observation_space, num_goals, metacontroller_optimizer, scope='metacontroller')
    metacontroller = MetaController(metacontroller_network, num_goals)
    # Create the schedule for exploration starting from 1.
    exploration2 = LinearSchedule(schedule_timesteps=int(EXPLORATION_FRACTION * monitor.num_timesteps),
                                 initial_p=1.0,
                                 final_p=EXPLORATION_FINAL_EPS)
    # initialize experience replay
    controller_replay_buffer = ReplayBuffer(D1_MEMORY_SIZE)
    metacontroller_replay_buffer = ReplayBuffer(D2_MEMORY_SIZE)
    
    # initialize critic
    critic = Critic(env.unwrapped)

    total_extrinsic_reward = []
    # for success rate
    total_goal_reached = np.zeros(num_goals, dtype=np.int32) 
    total_goal_sampled = np.zeros(num_goals, dtype=np.int32)
    total_goal_epsilon = np.ones(num_goals, dtype=np.float32)
    ep = 0
    total_step = 0
    init_ob = env.reset()

    U.initialize()
    # initialize target network in both controller and meta
    sess.run(metacontroller.network.update_target_op)
    sess.run(controller.network.update_target_op)

    # load the latest checkpoint if present
    model_path = tf.train.latest_checkpoint(monitor.ckpt_dir)
    model_saved = False
    model_file = os.path.join(monitor.ckpt_dir, 'model')
    if model_path is not None:
        U.load_variables(model_file)
        L.log('loaded model from %s' % model_file)
        model_saved = True

    while ep < MAX_EPISODE:  # loop over episodes
        # init environment game play variables
        
        init_ob = env.reset()
        observation = np.reshape(init_ob['observation'], (1, )+init_ob['observation'].shape)
        desired_goal = metacontroller.sample_act(sess, observation, update_eps=1.0)[0]
        env.unwrapped.desired_goal = desired_goal
        total_goal_sampled[desired_goal] += 1

        # given predicted goal, we encode this goal bounding mask to the observation np array
        ob_with_g = env.unwrapped._add_goal_mask(init_ob['observation'], desired_goal)

        # NOTE: Below code verify added mask correctly
        # for i in range(ob_with_g.shape[-1]):
        #     ob = ob_with_g[:,:,i]
        #     image = Image.fromarray(ob)
        #     image = image.convert('RGB')
        #     image.save('test_%i.png' % i)

        done = False
        reached_goal = False

        while not done:
            extrinsic_rewards = 0
            s0 = init_ob['observation']

            while not (done or reached_goal):
                update_eps1_with_respect_to_g = get_epsilon(total_goal_epsilon, total_goal_reached, total_goal_sampled, desired_goal, total_step, EXPLORATION_WARM_UP)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1, )+ob_with_g.shape)
                primitive_action_t = controller.sample_act(sess, ob_with_g_reshaped, update_eps=update_eps1_with_respect_to_g)[0]
                # obtain extrinsic reward from environment
                ob_tp1, extrinsic_reward_t, done_t, info = env.step(primitive_action_t)
                reached_goal = env.unwrapped.reached_goal(desired_goal)
                ob_with_g_tp1 = env.unwrapped._add_goal_mask(ob_tp1['observation'], desired_goal)
                
                intrinsic_reward_t = critic.criticize(desired_goal, reached_goal, primitive_action_t, done_t)
                controller_replay_buffer.add(ob_with_g, primitive_action_t, intrinsic_reward_t, ob_with_g_tp1, done_t)
                
                # sample from replay_buffer1 to train controller
                obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t = controller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                weights, batch_idxes = np.ones_like(intrinsic_rewards_t), None
                # get q estimate for tp1 as 'supervised'
                ob_with_g_tp1_reshaped = np.reshape(ob_with_g_tp1, (1, )+ob_with_g.shape)
                q_tp1 = controller.get_q(sess, ob_with_g_tp1_reshaped)[0]
                td_error = controller.train(sess, obs_with_g_t, primitive_actions_t, intrinsic_rewards_t, obs_with_g_tp1, dones_t, weights, q_tp1)
                # Joint training: after warm-up, also sample from replay_buffer2 to train the meta-controller
                if total_step >= WARMUP_STEPS:
                    L.log('joint training has started ----- step %d' % total_step)
                    # sample from replay_buffer2 to train meta-controller
                    init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t = metacontroller_replay_buffer.sample(TRAIN_BATCH_SIZE)
                    weights, batch_idxes = np.ones_like(extrinsic_rewards_t), None
                    # get q estimate for tp1 as 'supervised'
                    obs_terminate_in_g_reshaped = np.reshape(obs_terminate_in_g, (1, )+obs_terminate_in_g.shape)
                    q_tp1 = metacontroller.get_q(sess, obs_terminate_in_g_reshaped)[0]
                    td_error = metacontroller.train(sess, init_obs, goals_t, extrinsic_rewards_t, obs_terminate_in_g, dones_t, weights, q_tp1)

                if total_step % UPDATE_TARGET_NETWORK_FREQ == 0:
                    #L.log('UPDATE BOTH CONTROLLER Q NETWORKS ----- step %d', step)
                    sess.run(controller.network.update_target_op)
                    # it's fine; the meta-controller isn't really trained until after the warm-up steps
                    sess.run(metacontroller.network.update_target_op)

                extrinsic_rewards += extrinsic_reward_t
                ob_with_g = ob_with_g_tp1
                done = done_t
                total_step += 1
            # we are done / reached_goal
            # store transitions of init_ob, goal, all the extrinsic rewards, current ob in D2
            # print("ep %d : step %d, goal extrinsic total %d" % (ep, step, extrinsic_rewards))
            # clean observation without goal encoded
            metacontroller_replay_buffer.add(init_ob['observation'], desired_goal, extrinsic_rewards, ob_tp1['observation'], done)

            # if we are here then we have finished the desired goal
            if not done:
                #print("ep %d : goal %d reached, not yet done, extrinsic %d" % (ep, desired_goal, extrinsic_rewards))
                exploration_ep = 1.0
                total_goal_reached[env.unwrapped.achieved_goal] += 1
                if total_step >= WARMUP_STEPS:
                    t = total_step - WARMUP_STEPS
                    exploration_ep = exploration2.value(t)
                ob_with_g_reshaped = np.reshape(ob_with_g, (1, )+ob_with_g.shape)
                
                while env.unwrapped.achieved_goal == desired_goal:
                    desired_goal = metacontroller.sample_act(sess, ob_with_g_reshaped, update_eps=exploration_ep)[0]

                env.unwrapped.desired_goal = desired_goal
                total_goal_sampled[desired_goal] += 1
                L.log('ep %d : achieved goal was %d ----- new goal --- %d' % (ep, env.unwrapped.achieved_goal, desired_goal))

                # start again
                reached_goal = False
        
        # finish an episode
        total_extrinsic_reward.append(extrinsic_rewards)
        ep += 1

        mean_100ep_reward = round(np.mean(total_extrinsic_reward[-101:-1]), 1)
        if ep % monitor.print_freq == 0 :
            L.record_tabular("steps", total_step)
            L.record_tabular("episodes", ep)
            L.record_tabular("mean 100 episode reward", mean_100ep_reward)
            L.dump_tabular()

        if total_step % monitor.ckpt_freq == 0:
            if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                L.log("Saving model due to mean reward increase: {} -> {}".format(
                    saved_mean_reward, mean_100ep_reward))
                U.save_variables(model_file)
                model_saved = True
                saved_mean_reward = mean_100ep_reward
    
    # verified our model was saved
    if model_saved:
        L.log('restored model with mean reward: %d' % saved_mean_reward)
        U.load_variables(model_file)
Example #14
def run(args):
    prefix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    logger.configure(f'logs/{args["dataset"]}/nn/{prefix}')
    logger.info(args)

    pool = mp.Pool(mp.cpu_count())

    nn_arg = args.copy()
    nn_arg.update(find_best_params(nn_arg))
    nn_arg.update(find_best_alpha_val(nn_arg))
    logger.record_tabular('[PEER] batchsize', nn_arg['batchsize'])
    logger.record_tabular('[PEER] learning rate', nn_arg['lr'])
    logger.record_tabular('[PEER] hidsize', nn_arg['hidsize'])
    logger.record_tabular('[PEER] alpha', nn_arg['alpha'])
    logger.dump_tabular()

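    # Train one model per loss variant (cross entropy, peer, surrogate, symmetric,
    # DMI) with the tuned hyper-parameters, one worker process per run.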
    nn_arg['seed'] = 1
    run_nn_dmi(nn_arg)
    results_dmi = pool.map(run_nn_dmi, make_arg_list(nn_arg))
    results_surr = pool.map(run_nn_surr, make_arg_list(nn_arg))
    results_nn = pool.map(run_nn, make_arg_list(nn_arg))
    results_peer = pool.map(run_nn_peer, make_arg_list(nn_arg))
    results_symm = pool.map(run_nn_symm, make_arg_list(nn_arg))
    pool.close()
    pool.join()

    test_acc_bce = [res['val_acc'] for res in results_nn]
    test_acc_peer = [res['val_acc'] for res in results_peer]
    test_acc_surr = [res['val_acc'] for res in results_surr]
    test_acc_symm = [res['val_acc'] for res in results_symm]
    test_acc_dmi = [res['val_acc'] for res in results_dmi]

    plot([
        test_acc_bce, test_acc_peer, test_acc_surr, test_acc_symm, test_acc_dmi
    ], [
        'cross entropy loss', 'peer loss', 'surrogate loss', 'symmetric loss',
        'dmi loss'
    ],
         title='Accuracy During Testing',
         path=f'logs/{args["dataset"]}/nn/{prefix}')

    train_acc_bce = [res['train_acc'] for res in results_nn]
    train_acc_peer = [res['train_acc'] for res in results_peer]
    train_acc_surr = [res['train_acc'] for res in results_surr]
    train_acc_symm = [res['train_acc'] for res in results_symm]
    train_acc_dmi = [res['train_acc'] for res in results_dmi]

    plot([
        train_acc_bce, train_acc_peer, train_acc_surr, train_acc_symm,
        train_acc_dmi
    ], [
        'cross entropy loss', 'peer loss', 'surrogate loss', 'symmetric loss',
        'dmi loss'
    ],
         title='Accuracy During Training',
         path=f'logs/{args["dataset"]}/nn/{prefix}')

    loss_acc_surr = [res['loss'] for res in results_surr]
    loss_acc_bce = [res['loss'] for res in results_nn]
    loss_acc_peer = [res['loss'] for res in results_peer]
    loss_acc_symm = [res['loss'] for res in results_symm]
    loss_acc_dmi = [res['loss'] for res in results_dmi]

    plot([
        loss_acc_bce, loss_acc_peer, loss_acc_surr, loss_acc_symm, loss_acc_dmi
    ], [
        'cross entropy loss', 'peer loss', 'surrogate loss', 'symmetric loss',
        'dmi loss'
    ],
         title='Loss',
         path=f'logs/{args["dataset"]}/nn/{prefix}')

    logger.record_tabular('[NN] with peer loss', np.mean(test_acc_peer, 0)[-1])
    logger.record_tabular('[NN] with surrogate loss',
                          np.mean(test_acc_surr, 0)[-1])
    logger.record_tabular('[NN] with symmetric loss',
                          np.mean(test_acc_symm, 0)[-1])
    logger.record_tabular('[NN] with dmi loss', np.mean(test_acc_dmi, 0)[-1])
    logger.record_tabular(f'[NN] with {args["loss"]} loss',
                          np.mean(test_acc_bce, 0)[-1])
    logger.dump_tabular()
Example #15
def main():
    # Parse input parameters
    args, unknown_args = parser.parse_known_args()
    args.num_steps = int(args.num_steps)
    unknown_args = parse_cmdline_kwargs(unknown_args)

    # Load config file
    load_yaml_config(args, 'learner')

    # Expose socket to actor(s)
    context = zmq.Context()
    weights_socket = context.socket(zmq.PUB)
    weights_socket.bind(f'tcp://*:{args.param_port}')

    _, agent = init_components(args, unknown_args)

    # Configure experiment directory
    create_experiment_dir(args, 'LEARNER-')
    save_yaml_config(args.exp_path / 'config.yaml', args, 'learner', agent)
    args.log_path = args.exp_path / 'log'
    args.ckpt_path = args.exp_path / 'ckpt'
    args.ckpt_path.mkdir()
    args.log_path.mkdir()

    logger.configure(str(args.log_path))

    # Record commit hash
    with open(args.exp_path / 'hash', 'w') as f:
        f.write(
            str(
                subprocess.run('git rev-parse HEAD'.split(),
                               stdout=subprocess.PIPE).stdout.decode('utf-8')))

    # Variables to control the frequency of training
    receiving_condition = multiprocessing.Condition()
    num_receptions = multiprocessing.Value('i', 0)

    # Start memory pool in another process
    manager = MemPoolManager()
    manager.start()
    mem_pool = manager.MemPool(capacity=args.pool_size)
    Process(target=recv_data,
            args=(args.data_port, mem_pool, receiving_condition,
                  num_receptions, args.keep_training)).start()

    # Print throughput statistics
    Process(target=MultiprocessingMemPool.record_throughput,
            args=(mem_pool, args.record_throughput_interval)).start()

    freq = 0
    learn_flag = 0
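    # Main loop: keep broadcasting the initial weights until the first learner update,
    # then alternate between syncing weights to actors, checkpointing and training.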
    while True:
        if learn_flag == 0:
            weights_socket.send(pickle.dumps(agent.get_weights()))

        if len(mem_pool) >= args.batch_size:

            # Sync weights to actor
            weights = agent.get_weights()
            if hvd.rank() == 0:
                weights_socket.send(pickle.dumps(weights))

            if freq % args.ckpt_save_freq == 0:
                if args.ckpt_save_type == 'checkpoint':
                    agent.save(args.ckpt_path / 'ckpt')
                elif args.ckpt_save_type == 'weight':
                    with open(args.ckpt_path / 'weight.ckpt', 'wb') as f:
                        pickle.dump(weights, f)

            if args.keep_training:
                agent.learn(mem_pool.sample(size=args.batch_size))
            else:
                with receiving_condition:
                    while num_receptions.value < args.training_freq:
                        receiving_condition.wait()
                    data = mem_pool.sample(size=args.batch_size)
                    num_receptions.value -= args.training_freq
                # Training
                stat = agent.learn(data)
                learn_flag = 1
                if stat is not None:
                    for k, v in stat.items():
                        logger.record_tabular(k, v)
                logger.dump_tabular()

            freq += 1
Example #16
            avg_ret.append(avg_reward)

        return avg_ret, avg_pg_loss, avg_vf_loss, avg_latencies


if __name__ == "__main__":
    from env.mec_offloaing_envs.offloading_env import Resources
    from env.mec_offloaing_envs.offloading_env import OffloadingEnvironment
    from policies.meta_seq2seq_policy import Seq2SeqPolicy
    from samplers.seq2seq_sampler import Seq2SeqSampler
    from samplers.seq2seq_sampler_process import Seq2SeSamplerProcessor
    from baselines.vf_baseline import ValueFunctionBaseline
    from meta_algos.ppo_offloading import PPO
    from utils import utils, logger

    logger.configure(dir="./meta_evaluate_ppo_log/task_offloading",
                     format_strs=['stdout', 'log', 'csv'])

    resource_cluster = Resources(mec_process_capable=(10.0 * 1024 * 1024),
                                 mobile_process_capable=(1.0 * 1024 * 1024),
                                 bandwidth_up=7.0,
                                 bandwidth_dl=7.0)

    env = OffloadingEnvironment(
        resource_cluster=resource_cluster,
        batch_size=100,
        graph_number=100,
        graph_file_paths=[
            "./env/mec_offloaing_envs/data/meta_offloading_20/offload_random20_12/random.20."
        ],
        time_major=False)
Example #17
def main():
    parser = arg_parser()
    parser.add_argument('--platform',
                        help='environment choice',
                        choices=['atari', 'mujoco', 'humanoid', 'robotics'],
                        default='atari')
    platform_args, environ_args = parser.parse_known_args()
    platform = platform_args.platform

    # atari
    if platform == 'atari':
        args = atari_arg_parser().parse_known_args()[0]
        pi = fit(platform,
                 args.env,
                 num_timesteps=args.num_timesteps,
                 seed=args.seed)

    # mujoco
    if platform == 'mujoco':
        args = mujoco_arg_parser().parse_known_args()[0]
        logger.configure()
        pi = fit(platform,
                 args.env,
                 num_timesteps=args.num_timesteps,
                 seed=args.seed)

    # robotics
    if platform == 'robotics':
        args = robotics_arg_parser().parse_known_args()[0]
        pi = fit(platform,
                 args.env,
                 num_timesteps=args.num_timesteps,
                 seed=args.seed)

    # humanoids
    if platform == 'humanoid':
        logger.configure()
        parser = mujoco_arg_parser()
        parser.add_argument('--model-path',
                            default=os.path.join(logger.get_dir(),
                                                 'humanoid_policy'))
        parser.set_defaults(num_timesteps=int(2e7))

        args = parser.parse_known_args()[0]

        if not args.play:
            # train the model
            pi = fit(platform,
                     args.env,
                     num_timesteps=args.num_timesteps,
                     seed=args.seed,
                     model_path=args.model_path)
        else:
            # construct the model object, load pre-trained model and render
            from utils.cmd import make_mujoco_env
            pi = fit(platform, args.env, num_timesteps=1, seed=args.seed)
            Model().load_state(args.model_path)
            env = make_mujoco_env('Humanoid-v2', seed=0)

            ob = env.reset()
            while True:
                action = pi.act(stochastic=False, ob=ob)[0]
                ob, _, done, _ = env.step(action)
                env.render()
                if done:
                    ob = env.reset()
Example #18
def train():
    processes = []
    if os.path.isdir(args.log_dir):
        ans = input('{} exists\ncontinue and overwrite? y/n: '.format(args.log_dir))
        if ans == 'n':
            return

    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    json.dump(vars(args), open(os.path.join(args.log_dir, 'params.json'), 'w'))

    torch.set_num_threads(2)

    start = time.time()
    policy_update_time, policy_forward_time = 0, 0
    step_time_env, step_time_total, step_time_rewarder = 0, 0, 0
    visualize_time = 0
    rewarder_fit_time = 0

    envs = RL2EnvInterface(args)
    if args.look:
        looker = Looker(args.log_dir)

    actor_critic = Policy(envs.obs_shape, envs.action_space,
                          base=RL2Base, base_kwargs={'recurrent': True,
                                                     'num_act_dim': envs.action_space.shape[0]})
    actor_critic.to(args.device)
    agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch,
                     args.value_loss_coef, args.entropy_coef, lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                        envs.obs_shape, envs.action_space,
                        actor_critic.recurrent_hidden_state_size)
    rollouts.to(args.device)

    def copy_obs_into_beginning_of_storage(obs):
        obs_raw, obs_act, obs_rew, obs_flag = obs
        rollouts.obs[0].copy_(obs_raw)
        rollouts.obs_act[0].copy_(obs_act)
        rollouts.obs_rew[0].copy_(obs_rew)
        rollouts.obs_flag[0].copy_(obs_flag)

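    # Outer training loop: collect a rollout of args.num_steps steps, run the PPO
    # update, log metrics, and periodically refit the rewarder and save the model.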
    for j in range(args.num_updates):
        obs = envs.reset()
        copy_obs_into_beginning_of_storage(obs)

        if args.use_linear_lr_decay:
            update_linear_schedule(agent.optimizer, j, args.num_updates, args.lr)

        if args.algo == 'ppo' and args.use_linear_clip_decay:
            agent.clip_param = args.clip_param  * (1 - j / float(args.num_updates))

        episode_returns = [0 for i in range(args.trial_length)]
        episode_final_reward = [0 for i in range(args.trial_length)]
        i_episode = 0

        log_marginal = 0
        lambda_log_s_given_z = 0

        for step in range(args.num_steps):
            # Sample actions
            policy_forward_start = time.time()
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        rollouts.get_obs(step),
                        rollouts.recurrent_hidden_states[step],
                        rollouts.masks[step])
            policy_forward_time += time.time() - policy_forward_start

            # Observe reward and next obs
            step_total_start = time.time()
            obs, reward, done, info = envs.step(action)
            step_time_total += time.time() - step_total_start
            step_time_env += info['step_time_env']
            step_time_rewarder += info['reward_time']
            log_marginal += info['log_marginal'].sum().item()
            lambda_log_s_given_z += info['lambda_log_s_given_z'].sum().item()

            episode_returns[i_episode] += reward.sum().item()
            if all(done['episode']):
                episode_final_reward[i_episode] += reward.sum().item()
                i_episode = (i_episode + 1) % args.trial_length

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done['trial']])
            rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks)

        assert all(done['trial'])

        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.get_obs(-1),
                                                rollouts.recurrent_hidden_states[-1],
                                                rollouts.masks[-1]).detach()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau)

        policy_update_start = time.time()
        if args.rewarder != 'supervised' and envs.rewarder.fit_counter == 0 and not args.vae_load:
            value_loss, action_loss, dist_entropy = 0, 0, 0
        else:
            value_loss, action_loss, dist_entropy = agent.update(rollouts)
        policy_update_time += time.time() - policy_update_start
        rollouts.after_update()

        # metrics
        trajectories_pre = envs.trajectories_pre_current_update
        state_entropy_pre = calculate_state_entropy(args, trajectories_pre)

        trajectories_post = envs.trajectories_post_current_update
        state_entropy_post = calculate_state_entropy(args, trajectories_post)

        return_avg = rollouts.rewards.sum() / args.trials_per_update
        reward_avg = return_avg / (args.trial_length * args.episode_length)
        log_marginal_avg = log_marginal / args.trials_per_update / (args.trial_length * args.episode_length)
        lambda_log_s_given_z_avg = lambda_log_s_given_z / args.trials_per_update / (args.trial_length * args.episode_length)

        num_steps = (j + 1) * args.num_steps * args.num_processes
        num_episodes = num_steps // args.episode_length
        num_trials = num_episodes // args.trial_length

        logger.logkv('state_entropy_pre', state_entropy_pre)
        logger.logkv('state_entropy_post', state_entropy_post)
        logger.logkv('value_loss', value_loss)
        logger.logkv('action_loss', action_loss)
        logger.logkv('dist_entropy', dist_entropy)
        logger.logkv('return_avg', return_avg.item())
        logger.logkv('reward_avg', reward_avg.item())
        logger.logkv('steps', (j + 1) * args.num_steps * args.num_processes)
        logger.logkv('episodes', num_episodes)
        logger.logkv('trials', num_trials)
        logger.logkv('policy_updates', (j + 1))
        logger.logkv('time', time.time() - start)
        logger.logkv('policy_forward_time', policy_forward_time)
        logger.logkv('policy_update_time', policy_update_time)
        logger.logkv('step_time_rewarder', step_time_rewarder)
        logger.logkv('step_time_env', step_time_env)
        logger.logkv('step_time_total', step_time_total)
        logger.logkv('visualize_time', visualize_time)
        logger.logkv('rewarder_fit_time', rewarder_fit_time)
        logger.logkv('log_marginal_avg', log_marginal_avg)
        logger.logkv('lambda_log_s_given_z_avg', lambda_log_s_given_z_avg)
        for i_episode in range(args.trial_length):
            logger.logkv('episode_return_avg_{}'.format(i_episode),
                         episode_returns[i_episode] / args.trials_per_update)
            logger.logkv('episode_final_reward_{}'.format(i_episode),
                         episode_final_reward[i_episode] / args.trials_per_update)

        if (j % args.save_period == 0 or j == args.num_updates - 1) and args.log_dir != '':
            save_model(args, actor_critic, envs, iteration=j)

        if not args.vae_freeze and j % args.rewarder_fit_period == 0:
            rewarder_fit_start = time.time()
            envs.fit_rewarder()
            rewarder_fit_time += time.time() - rewarder_fit_start

        if (j % args.vis_period == 0 or j == args.num_updates - 1) and args.log_dir != '':
            visualize_start = time.time()
            if args.look:
                eval_return_avg, eval_episode_returns, eval_episode_final_reward = looker.look(iteration=j)
                logger.logkv('eval_return_avg', eval_return_avg)
                for i_episode in range(args.trial_length):
                    logger.logkv('eval_episode_return_avg_{}'.format(i_episode),
                                 eval_episode_returns[i_episode] / args.trials_per_update)
                    logger.logkv('eval_episode_final_reward_{}'.format(i_episode),
                                 eval_episode_final_reward[i_episode] / args.trials_per_update)

            if args.plot:
                p = Popen('python visualize.py --log-dir {}'.format(args.log_dir), shell=True)
                processes.append(p)
            visualize_time += time.time() - visualize_start

        logger.dumpkvs()
Example #19
def run_one_agent(index, args, unknown_args, actor_status):
    from tensorflow.keras.backend import set_session
    import tensorflow.compat.v1 as tf

    # Set 'allow_growth'
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    set_session(tf.Session(config=config))

    # Connect to learner
    context = zmq.Context()
    context.linger = 0  # For removing linger behavior
    socket = context.socket(zmq.REQ)
    socket.connect(f'tcp://{args.ip}:{args.data_port}')

    # Initialize environment and agent instance
    env, agent = init_components(args, unknown_args)

    # Configure logging only in one process
    if index == 0:
        logger.configure(str(args.log_path))
        save_yaml_config(args.exp_path / 'config.yaml', args, 'actor', agent)
    else:
        logger.configure(str(args.log_path), format_strs=[])

    # Create local queues for collecting data
    transitions = []  # A list to store raw transitions within an episode
    mem_pool = MemPool()  # A pool to store prepared training data

    # Initialize values
    model_id = -1
    episode_rewards = [0.0]
    episode_lengths = [0]
    num_episodes = 0
    mean_10ep_reward = 0
    mean_10ep_length = 0
    send_time_start = time.time()

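    # Sampling loop: interact with the environment, batch transitions into the local
    # memory pool, and periodically send prepared training data to the learner.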
    state = env.reset()
    for step in range(args.num_steps):
        # Do some updates
        agent.update_sampling(step, args.num_steps)

        # Sample action
        action, extra_data = agent.sample(state)
        next_state, reward, done, info = env.step(action)

        # Record current transition
        transitions.append(
            (state, action, reward, next_state, done, extra_data))
        episode_rewards[-1] += reward
        episode_lengths[-1] += 1

        state = next_state

        is_terminal = done or episode_lengths[-1] >= args.max_episode_length > 0
        if is_terminal or len(mem_pool) + len(
                transitions) >= args.max_steps_per_update:
            # Current episode is terminated or a trajectory of enough training data is collected
            data = agent.prepare_training_data(transitions)
            transitions.clear()
            mem_pool.push(data)

            if is_terminal:
                # Log information at the end of episode
                num_episodes = len(episode_rewards)
                mean_10ep_reward = round(np.mean(episode_rewards[-10:]), 2)
                mean_10ep_length = round(np.mean(episode_lengths[-10:]), 2)
                episode_rewards.append(0.0)
                episode_lengths.append(0)

                # Reset environment
                state = env.reset()

        if len(mem_pool) >= args.max_steps_per_update:
            # Send the collected data once at least args.max_steps_per_update steps are available
            post_processed_data = agent.post_process_training_data(
                mem_pool.sample())
            socket.send(serialize(post_processed_data).to_buffer())
            socket.recv()
            mem_pool.clear()

            send_data_interval = time.time() - send_time_start
            send_time_start = time.time()

            if num_episodes > 0:
                # Log information
                logger.record_tabular("iteration",
                                      (step + 1) // args.max_steps_per_update)
                logger.record_tabular("steps", step)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean 10 episode reward",
                                      mean_10ep_reward)
                logger.record_tabular("mean 10 episode length",
                                      mean_10ep_length)
                logger.record_tabular(
                    "send data fps",
                    args.max_steps_per_update // send_data_interval)
                logger.record_tabular("send data interval", send_data_interval)
                logger.dump_tabular()

        # Update weights
        new_weights, model_id = find_new_weights(model_id, args.ckpt_path)
        if new_weights is not None:
            agent.set_weights(new_weights)

    actor_status[index] = 1
Example #20
def fit(environ, env_id, num_timesteps, seed, model_path=None):
    # atari
    if environ == 'atari':
        rank = MPI.COMM_WORLD.Get_rank()
        sess = Model().single_threaded_session()
        sess.__enter__()
        if rank == 0:
            logger.configure()
        else:
            logger.configure(format_strs=[])
        workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed \
            is not None else None
        set_global_seeds(workerseed)
        env = make_atari(env_id)

        def policy_fn(name, ob_space, ac_space):
            return PPO1Cnn(name=name, ob_space=ob_space, ac_space=ac_space)

        env = Monitor(
            env,
            logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
        env.seed(workerseed)

        env = wrap_deepmind(env)
        env.seed(workerseed)

        pi = PPOSGD(env,
                    policy_fn,
                    env.observation_space,
                    env.action_space,
                    timesteps_per_actorbatch=256,
                    clip_param=0.2,
                    entcoeff=0.01,
                    optim_epochs=4,
                    optim_stepsize=1e-3,
                    optim_batchsize=64,
                    gamma=0.99,
                    lam=0.95,
                    max_timesteps=int(num_timesteps * 1.1),
                    schedule='linear')

        env.close()
        sess.close()
        return pi

    # mujoco
    if environ == 'mujoco':
        from utils.cmd import make_mujoco_env

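        # Single-threaded session and a small 2-layer MLP policy (64 units per layer)
        # for MuJoCo control.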
        sess = Model().init_session(num_cpu=1).__enter__()

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name,
                           ob_space=ob_space,
                           ac_space=ac_space,
                           hid_size=64,
                           num_hid_layers=2)

        env = make_mujoco_env(env_id, seed)
        pi = PPOSGD(
            env,
            policy_fn,
            env.observation_space,
            env.action_space,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
        )
        env.close()
        sess.close()
        return pi

    if environ == 'humanoid':
        import gym
        from utils.cmd import make_mujoco_env

        env_id = 'Humanoid-v2'

        class RewScale(gym.RewardWrapper):
            def __init__(self, env, scale):
                gym.RewardWrapper.__init__(self, env)
                self.scale = scale

            def reward(self, r):
                return r * self.scale

        sess = Model().init_session(num_cpu=1).__enter__()

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name,
                           ob_space=ob_space,
                           ac_space=ac_space,
                           hid_size=64,
                           num_hid_layers=2)

        env = make_mujoco_env(env_id, seed)

        # The parameters below were the best found in a simple random search;
        # they are good enough to make the humanoid walk, but whether they are
        # an absolute best is not certain.
        env = RewScale(env, 0.1)
        pi = PPOSGD(
            env,
            policy_fn,
            env.observation_space,
            env.action_space,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=10,
            optim_stepsize=3e-4,
            optim_batchsize=64,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
        )
        env.close()
        if model_path:
            Model().save_state(model_path)

        sess.close()
        return pi

    if environ == 'robotics':
        import mujoco_py
        from utils.cmd import make_robotics_env
        rank = MPI.COMM_WORLD.Get_rank()
        sess = Model().single_threaded_session()
        sess.__enter__()
        mujoco_py.ignore_mujoco_warnings().__enter__()
        workerseed = seed + 10000 * rank
        set_global_seeds(workerseed)
        env = make_robotics_env(env_id, workerseed, rank=rank)

        def policy_fn(name, ob_space, ac_space):
            return PPO1Mlp(name=name,
                           ob_space=ob_space,
                           ac_space=ac_space,
                           hid_size=256,
                           num_hid_layers=3)

        pi = PPOSGD(
            env,
            policy_fn,
            env.observation_space,
            env.action_space,
            max_timesteps=num_timesteps,
            timesteps_per_actorbatch=2048,
            clip_param=0.2,
            entcoeff=0.0,
            optim_epochs=5,
            optim_stepsize=3e-4,
            optim_batchsize=256,
            gamma=0.99,
            lam=0.95,
            schedule='linear',
        )
        env.close()
        sess.close()
        return pi
Example #21
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'Point2DWalls-corner-v0',
        'Ant-v0', 'HalfCheetah-v0'
    ])

    writer = SummaryWriter(log_dir=args.log_dir)

    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    json.dump(vars(args),
              open(os.path.join(
                  args.log_dir,
                  'params.json',
              ), 'w'),
              indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)

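    # Meta-training loop: sample a batch of tasks, collect pre- and post-adaptation
    # episodes, then take a conjugate-gradient / line-search meta-update step.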
    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)

        logger.logkv('return_avg_pre',
                     total_rewards([ep.rewards for ep, _ in episodes]))
        logger.logkv('return_avg_post',
                     total_rewards([ep.rewards for _, ep in episodes]))
        logger.dumpkvs()