Example #1
def train(env_id, num_timesteps, seed):
    import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1, num_gpu=0).__enter__()
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=128,
                                    num_hid_layers=2)

    env.seed(seed)
    pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=1e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='constant',
    )
    env.close()
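The train() above never runs on its own; something has to call it. A minimal, hypothetical entry point is sketched below (the flag names, the Hopper-v2 default, and the __main__ guard are my additions, not part of the original example):

import argparse

if __name__ == '__main__':
    # Hypothetical CLI wrapper for the train() function defined above.
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Hopper-v2')
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    parser.add_argument('--seed', type=int, default=0)
    args = parser.parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)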
Example #2
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e12))
    parser.add_argument('--num_env', type=int, default=32)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='cnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)


    args = parser.parse_args()
    logger.configure(dir=logger.get_dir(), format_strs=['stdout', 'log', 'csv'] if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()

    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus = args.dynamics_bonus
    )

    tf_util.make_session(make_default=True)
    train(env_id=args.env, num_env=args.num_env, seed=seed,
        num_timesteps=args.num_timesteps, hps=hps)
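Note how the seed is derived: each MPI worker uses 10000 * args.seed + rank, so a single base seed gives every worker a distinct but reproducible random stream. The arithmetic is easy to check without MPI:

base_seed = 3
n_workers = 4
worker_seeds = [10000 * base_seed + rank for rank in range(n_workers)]
print(worker_seeds)  # [30000, 30001, 30002, 30003]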
Example #3
def load(path, num_cpus=1):
    with open(path, "rb") as f:
        model_data, act_params = dill.load(f)
    act = build_act(**act_params)
    sess = U.make_session(num_cpus=num_cpus)
    sess.__enter__()
    with tempfile.TemporaryDirectory() as td:
        filepath = os.path.join(td, "packed.zip")
        with open(filepath, "wb") as f:
            f.write(model_data)
        zipfile.ZipFile(filepath, 'r', zipfile.ZIP_DEFLATED).extractall(td)
        U.load_state(os.path.join(td, "model"))
    return ActWrapper(act, act_params)
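A load() helper in this style is typically called once at startup, after which the restored policy drives an environment loop. The snippet below is only a sketch: the checkpoint path is made up, and it assumes the returned ActWrapper is callable on a batch of observations, as in baselines' deepq:

import gym
import numpy as np

act = load("cartpole_model.pkl", num_cpus=1)   # hypothetical checkpoint path
env = gym.make("CartPole-v0")
obs, done = env.reset(), False
while not done:
    action = act(np.array(obs)[None])[0]       # ActWrapper assumed callable like act()
    obs, reward, done, _ = env.step(action)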
Example #4
def run():
    import mlp_policy_robo
    U.make_session(num_cpu=1).__enter__()
    env = gym.make("RoboschoolHumanoid-v1")

    #env = wrappers.Monitor(env, directory="./video/HalfCheeta-v1", force=True)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy_robo.MlpPolicy(name=name,
                                         ob_space=ob_space,
                                         ac_space=ac_space,
                                         hid_size=128,
                                         num_hid_layers=2)

    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_fn("pi", ob_space, ac_space)
    oldpi = policy_fn("oldpi", ob_space, ac_space)

    U.load_state("save/Humanoid-v1")
    for epi in range(100):
        ob = env.reset()

        total_reward = 0
        step = 0
        while True:
            env.render("human")
            ac, v = pi.act(True, ob)

            ob, rew, new, info = env.step(ac)
            step += 1

            total_reward += rew

            if new:
                print("Reward: {}, Step: {}".format(total_reward, step))
                break
Example #5
def bernstein_error_partition_cuda(
    nn,
    f,
    degree_bound,
    input_box,
    output_index,
    activation,
    filename,
):
    global step
    step += 1
    import error_bound
    eps = error_bound.error_bound
    input_dim = len(degree_bound)
    lips, network_output_range = lipschitz(nn, input_box, output_index,
                                           activation)

    distance_estimate = 0
    for idxState in range(input_dim):
        diff = np.diff(input_box[idxState])[0]
        if diff > distance_estimate:
            distance_estimate = diff

    LD_estimate = lips * distance_estimate * np.sqrt(input_dim)
    num_partition = int(np.ceil(LD_estimate // eps + 1))

    partition = [num_partition] * input_dim

    print('---------------' + filename + '-------------------')
    print('step: {}'.format(step))
    print('degree bound: {}'.format(degree_bound))
    print('number of partition: {}'.format(num_partition))
    print('Lipschitz constant: {}'.format(lips))

    all_comb_lists = sample_points_list(partition, input_dim)

    if isinstance(lips, np.ndarray):
        lips = lips[0]

    sample_times = (num_partition + 1)**input_dim
    large_sample_times = False
    if sample_times < 1e7:
        all_sample_points = np.zeros(
            ((num_partition + 1)**input_dim, input_dim), dtype=np.float32)
        all_shift_points = np.zeros(
            ((num_partition + 1)**input_dim, input_dim), dtype=np.float32)
    else:
        large_sample_times = True
        os.system('rm ./cach.hdf5')
        hdf5_store = h5py.File('./cach.hdf5', 'a')
        all_sample_points = hdf5_store.create_dataset(
            "all_sample_points", (sample_times, input_dim), compression='gzip')
        all_shift_points = hdf5_store.create_dataset("all_shift_points",
                                                     (sample_times, input_dim),
                                                     compression='gzip')

    partition_box = np.zeros(input_dim, dtype=np.float64)
    for j in range(input_dim):
        alpha_j = np.float64(input_box[j][0])
        beta_j = np.float64(input_box[j][1])
        partition_box[j] = (beta_j - alpha_j) / num_partition

    for idxState in range(input_dim):
        alpha_j = np.float64(input_box[idxState][0])
        beta_j = np.float64(input_box[idxState][1])
        all_sample_points[:, idxState] = (
            (beta_j - alpha_j) *
            (points_list(all_comb_lists, idxState) / num_partition) + alpha_j)
        all_shift_points = point_shift_all(all_sample_points, input_box,
                                           large_sample_times,
                                           all_shift_points)
    if large_sample_times:
        hdf5_store.close()

    order_list, coeffs_list = nn_poly_approx_bernstein_cuda(
        f, degree_bound, input_box, output_index)
    poly = polyval(order_list, degree_bound, coeffs_list, 'test')

    if large_sample_times:
        with h5py.File('./cach.hdf5', 'r') as hdf5_store:
            all_sample_points = hdf5_store['all_sample_points'][:]
            all_shift_points = hdf5_store['all_shift_points'][:]

    if filename[:4] == 'nn_5' or filename[:4] == 'nn_2':
        batch_size = 1e5
    else:
        batch_size = 1e7
    batch_num = math.ceil(all_sample_points.shape[0] / batch_size)
    batch_idx = np.arange(1, batch_num) * batch_size
    batch_idx = batch_idx.astype(int)
    all_sample_points_batches = np.split(all_sample_points, batch_idx, axis=0)
    all_shift_points_batches = np.split(all_shift_points, batch_idx, axis=0)

    poly_results = np.zeros((all_sample_points.shape[0], 1))
    nn_results = np.zeros((all_sample_points.shape[0], 1))

    with U.make_session() as sess:
        sess.run(tf.global_variables_initializer())
        batch_pointer = 0
        print('number of sampling points: {}'.format(
            all_sample_points.shape[0]))
        for sample_points, shift_points in zip(all_sample_points_batches,
                                               all_shift_points_batches):
            batch_range = range(batch_pointer,
                                batch_pointer + sample_points.shape[0])
            print('batch_range: {}'.format(batch_range))
            poly_results[batch_range, :] = poly(sess, shift_points)
            nn_results[batch_range, :] = nn(sess, sample_points)
            batch_pointer += sample_points.shape[0]

    sample_error = np.max(np.absolute(poly_results[:, 0] - nn_results[:, 0]))
    error = sample_error + lips * LA.norm(partition_box)
    print('bp to nn error: {}'.format(error))

    return error
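The returned value combines the largest deviation seen on the sampled grid with a Lipschitz slack term, lips * LA.norm(partition_box), that accounts for points inside each grid cell. A toy calculation with made-up numbers shows the shape of the bound:

import numpy as np

lips = 2.5                              # hypothetical Lipschitz constant
partition_box = np.array([0.01, 0.01])  # per-dimension cell widths
sample_error = 3e-3                     # max |poly - nn| over the sampled grid

# Same combination as the return value above: sampled error plus Lipschitz
# slack over one cell diagonal.
error = sample_error + lips * np.linalg.norm(partition_box)
print(error)  # ~0.0384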
Example #6
def main():
    parser = arg_parser()
    parser.add_argument('--env',
                        help='environment ID',
                        default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--max_episode_steps', type=int, default=4500)

    parser.add_argument('--num-timesteps', type=int, default=int(1e8))
    parser.add_argument('--num_env', type=int, default=128)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu',
                        type=int,
                        default=0)
    parser.add_argument('--update_ob_stats_from_random_agent',
                        type=int,
                        default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update',
                        type=float,
                        default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy',
                        type=str,
                        default='cnn',
                        choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=0.)
    parser.add_argument('--beta', type=float, default=1e-3)
    parser.add_argument('--exploration_type', type=str, default='bottleneck')
    parser.add_argument('--noise_type',
                        type=str,
                        default='none',
                        choices=['none', 'box'])
    parser.add_argument('--noise_p', type=float, default=0.1)
    parser.add_argument('--use_sched', type=int, default=0)
    parser.add_argument('--exp_name', type=str, default='none')

    args = parser.parse_args()
    if args.policy == 'rnn':
        args.gamma_ext = 0.999
    else:
        args.gamma_ext = 0.99

    logger_dir = './results/' + args.env.replace("NoFrameskip-v4", "")
    logger_dir += datetime.datetime.now().strftime("-%m-%d-%H-%M-%S")
    logger.configure(dir=logger_dir,
                     format_strs=['stdout', 'log', 'csv']
                     if MPI.COMM_WORLD.Get_rank() == 0 else [])

    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'),
                  'w') as f:
            f.write(args.tag)

    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.
        update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.
        update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.
        proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        exploration_type=args.exploration_type,
        beta=args.beta,
        noise_type=args.noise_type,
        noise_p=args.noise_p,
        use_sched=args.use_sched,
        exp_name=args.exp_name,
    )

    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps)
Example #7
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument(
        "--num-timesteps",
        type=int,
        default=int(1e12),
    )
    parser.add_argument(
        "--num_env",
        type=int,
        default=32,
    )
    parser.add_argument(
        "--use_news",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--gamma",
        type=float,
        default=0.99,
    )
    parser.add_argument(
        "--gamma_ext",
        type=float,
        default=0.999,
    )
    parser.add_argument(
        "--lam",
        type=float,
        default=0.95,
    )
    parser.add_argument(
        "--update_ob_stats_every_step",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--update_ob_stats_independently_per_gpu",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--update_ob_stats_from_random_agent",
        type=int,
        default=1,
    )
    parser.add_argument(
        "--proportion_of_exp_used_for_predictor_update",
        type=float,
        default=1.0,
    )
    parser.add_argument(
        "--tag",
        type=str,
        default="",
    )
    parser.add_argument(
        "--policy",
        type=str,
        default="cnn",
        choices=["cnn", "rnn", "ffnn"],
    )
    parser.add_argument(
        "--int_coeff",
        type=float,
        default=1.0,
    )
    parser.add_argument(
        "--ext_coeff",
        type=float,
        default=2.0,
    )
    parser.add_argument(
        "--dynamics_bonus",
        type=int,
        default=0,
    )
    parser.add_argument(
        "--meta_rl",
        type=lambda x: True if x.lower() in {'true', 't'} else False,
        default=False,
    )

    args = parser.parse_args()
    logger.configure(
        dir=logger.get_dir(),
        format_strs=["stdout", "log", "csv"]
        if MPI.COMM_WORLD.Get_rank() == 0 else [],
    )
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), "experiment_tag.txt"),
                  "w") as f:
            f.write(args.tag)

    mpi_util.setup_mpi_gpus()

    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.
        update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.
        update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.
        proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
        meta_rl=args.meta_rl,
    )

    tf_util.make_session(make_default=True)
    train(
        env_id=args.env,
        num_env=args.num_env,
        seed=seed,
        num_timesteps=args.num_timesteps,
        hps=hps,
    )
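The --meta_rl flag above parses booleans with a lambda because argparse's type=bool is a well-known trap: bool('False') is True, since any non-empty string is truthy. A reusable helper in the same spirit (my own sketch, not part of the original script) makes the accepted spellings explicit and rejects everything else:

import argparse

def str2bool(value):
    """Parse common textual spellings of a boolean command-line flag."""
    lowered = value.lower()
    if lowered in {'true', 't', '1', 'yes', 'y'}:
        return True
    if lowered in {'false', 'f', '0', 'no', 'n'}:
        return False
    raise argparse.ArgumentTypeError("expected a boolean, got %r" % value)

# Usage: parser.add_argument("--meta_rl", type=str2bool, default=False)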
Example #8
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num_timesteps', type=float, default=100e6)
    parser.add_argument('--num_env', type=int, default=128)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--gamma_div', type=float, default=0.999)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu',
                        type=int,
                        default=1)
    parser.add_argument('--update_ob_stats_from_random_agent',
                        type=int,
                        default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_updated',
                        type=float,
                        default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy',
                        type=str,
                        default='cnn',
                        choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    parser.add_argument('--save_dir',
                        help="dir to save and log",
                        type=str,
                        default="save_dir")
    parser.add_argument('--load_path',
                        help="dir to load model",
                        type=str,
                        default=None)
    parser.add_argument('--base_load_path',
                        help="dir to load model",
                        type=str,
                        default=None)
    parser.add_argument('--r_path',
                        help="dir to load r network",
                        type=str,
                        default=None)

    parser.add_argument('--play', default=False, action='store_true')
    parser.add_argument('--only_train_r', default=False, action='store_true')
    parser.add_argument('--online_train_r', default=False, action='store_true')
    #parser.add_argument('--ec_type', type=str, default='episodic_curiosity', choices=['episodic_curiosity', 'none','oracle'])
    parser.add_argument('--rnd_type',
                        type=str,
                        default='rnd',
                        choices=['rnd', 'oracle'])
    parser.add_argument('--reset', default=False, action='store_true')
    parser.add_argument('--dynamics_sample',
                        default=False,
                        action='store_true')

    parser.add_argument('--num_agents', type=int, default=1)

    parser.add_argument('--div_type',
                        type=str,
                        default='oracle',
                        choices=['oracle', 'cls', 'rnd'])
    parser.add_argument('--load_ram', default=False, action='store_true')
    parser.add_argument('--debug', default=False, action='store_true')
    parser.add_argument('--rnd_mask_prob', type=float, default=1.)
    parser.add_argument('--rnd_mask_type',
                        type=str,
                        default='indep',
                        choices=['prog', 'indep', 'shared'])
    parser.add_argument('--indep_rnd', default=False, action='store_true')
    parser.add_argument('--indep_policy', default=True, action='store_true')
    parser.add_argument('--sd_type',
                        type=str,
                        default='oracle',
                        choices=['oracle', 'sd'])
    parser.add_argument('--from_scratch', default=False, action='store_true')

    parser.add_argument('--kl', default=False, action='store_true')

    args = parser.parse_args()

    log_path = os.path.join(args.save_dir, 'logs')
    save_path = os.path.join(args.save_dir, 'models')

    logger.configure(dir=log_path,
                     format_strs=['stdout', 'log', 'csv']
                     if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'),
                  'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()

    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        gamma_div=args.gamma_div,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.
        update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.
        update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.
        proportion_of_exp_used_for_predictor_updated,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
        log_interval=10,
        save_path=save_path,
        load_path=args.load_path,
        r_path=args.r_path,
        play=args.play,
        only_train_r=args.only_train_r,
        online_train_r=args.online_train_r,
        #ec_type = args.ec_type,
        rnd_type=args.rnd_type,
        reset=args.reset,
        dynamics_sample=args.dynamics_sample,
        num_agents=args.num_agents,
        div_type=args.div_type,
        load_ram=args.load_ram,
        debug=args.debug,
        rnd_mask_prob=args.rnd_mask_prob,
        rnd_mask_type=args.rnd_mask_type,
        indep_rnd=args.indep_rnd,
        indep_policy=args.indep_policy,
        sd_type=args.sd_type,
        from_scratch=args.from_scratch,
        base_load_path=args.base_load_path,
        use_kl=args.kl)

    if args.play:
        args.num_env = 1

    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps)
Example #9
def main():
    default_log_dir = "/tmp/rnd_log"
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(4.2e7))  # 10k
    parser.add_argument('--num_env', type=int, default=32)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    # parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.999)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu',
                        type=int,
                        default=0)
    parser.add_argument('--update_ob_stats_from_random_agent',
                        type=int,
                        default=1)
    # parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update',
                        type=float,
                        default=0.25)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy',
                        type=str,
                        default='cnn',
                        choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)

    parser.add_argument('--logdir', type=str, default=default_log_dir)
    parser.add_argument('--action_balance_coef',
                        '--abc',
                        type=float,
                        default=None)
    parser.add_argument('--array_action', type=int, default=1)

    parser.add_argument('--num_minibatches', type=int, default=4)

    args = parser.parse_args()

    if args.logdir != default_log_dir and os.path.isdir(
            args.logdir) and os.listdir(args.logdir):
        raise ValueError("logdir not empty!")
    logger.configure(dir=args.logdir,
                     format_strs=['stdout', 'log', 'csv', 'tensorboard']
                     if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'),
                  'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()

    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(frame_stack=4,
               nminibatches=args.num_minibatches,
               nepochs=4,
               lr=0.0001,
               max_grad_norm=0.0,
               use_news=args.use_news,
               gamma=args.gamma,
               gamma_ext=args.gamma_ext,
               max_episode_steps=args.max_episode_steps,
               lam=args.lam,
               update_ob_stats_every_step=args.update_ob_stats_every_step,
               update_ob_stats_independently_per_gpu=args.
               update_ob_stats_independently_per_gpu,
               update_ob_stats_from_random_agent=args.
               update_ob_stats_from_random_agent,
               proportion_of_exp_used_for_predictor_update=args.
               proportion_of_exp_used_for_predictor_update,
               policy=args.policy,
               int_coeff=args.int_coeff,
               ext_coeff=args.ext_coeff,
               dynamics_bonus=args.dynamics_bonus,
               action_balance_coef=args.action_balance_coef,
               array_action=args.array_action)

    logger.info('args: {}'.format(args))

    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps)
Example #10
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e12))
    parser.add_argument('--num_env', type=int, default=32)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu',
                        type=int,
                        default=0)
    parser.add_argument('--update_ob_stats_from_random_agent',
                        type=int,
                        default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update',
                        type=float,
                        default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy',
                        type=str,
                        default='rnn',
                        choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)
    parser.add_argument(
        '--save_dir',
        type=str,
        default='/home/hxu/PriorRL/random-network-distillation/ckpts/')
    parser.add_argument(
        '--load_dir',
        type=str,
        default='/home/hxu/PriorRL/random-network-distillation/ckpts/')
    parser.add_argument('--test', type=int, default=0)
    parser.add_argument('--save_image', type=int, default=0)
    parser.add_argument('--exp_name', type=str, default='tmp')
    parser.add_argument('--logdir', type=str, default='./logs/')
    parser.add_argument('--clip_rewards', type=int, default=1)
    parser.add_argument('--e_greedy', type=int, default=0)
    parser.add_argument('--action_space', type=str, default='RIGHT_ONLY')
    parser.add_argument('--load_mtype', type=str, default='latest')

    args = parser.parse_args()
    logdir = os.path.join(
        args.logdir, args.exp_name + '_' +
        datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S"))
    logger.configure(folder=logdir,
                     format_strs=['stdout', 'log', 'csv']
                     if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'),
                  'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()

    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(frame_stack=4,
               nminibatches=4,
               nepochs=4,
               lr=0.0001,
               max_grad_norm=0.0,
               use_news=args.use_news,
               gamma=args.gamma,
               gamma_ext=args.gamma_ext,
               max_episode_steps=args.max_episode_steps,
               lam=args.lam,
               update_ob_stats_every_step=args.update_ob_stats_every_step,
               update_ob_stats_independently_per_gpu=args.
               update_ob_stats_independently_per_gpu,
               update_ob_stats_from_random_agent=args.
               update_ob_stats_from_random_agent,
               proportion_of_exp_used_for_predictor_update=args.
               proportion_of_exp_used_for_predictor_update,
               policy=args.policy,
               int_coeff=args.int_coeff,
               ext_coeff=args.ext_coeff,
               dynamics_bonus=args.dynamics_bonus)

    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps,
          load_dir=args.load_dir,
          save_dir=args.save_dir,
          test=args.test,
          exp_name=args.exp_name,
          clip_rewards=args.clip_rewards,
          save_image=args.save_image,
          action_space=args.action_space,
          e_greedy=args.e_greedy,
          load_mtype=args.load_mtype)
Example #11
def learn(env,
          q_func,
          alpha=1e-5,
          num_cpu=1,
          n_steps=100000,
          update_target_every=500,
          train_main_every=1,
          print_every=50,
          checkpoint_every=10000,
          buffer_size=50000,
          gamma=1.0,
          batch_size=32,
          param_noise=False,
          pre_run_steps=1000,
          exploration_fraction=0.1,
          final_epsilon=0.1,
          callback=None):
    """
    :param env: gym.Env, environment from OpenAI
    :param q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the q function takes the following inputs:
        input_ph: tf.placeholder, network input
        n_actions: int, number of possible actions
        scope: str, specifying the variable scope
        reuse: bool, whether to reuse the variable given in `scope`
    :param alpha: learning rate
    :param num_cpu: number of cpu to use
    :param n_steps: number of training steps
    :param update_target_every: frequency to update the target network
    :param train_main_every: frequency to update (train) the main network
    :param print_every: how often to print message to console
    :param checkpoint_every: how often to save the model.
    :param buffer_size: size of the replay buffer
    :param gamma: float, discount factor
    :param batch_size: int, size of the input batch
    :param param_noise: bool, whether to use parameter noise
    :param pre_run_steps: int, number of warm-up steps used to fill the replay buffer;
        the main and target networks only begin to update after `pre_run_steps` steps.
    :param exploration_fraction: float, between 0 and 1. Fraction of the `n_steps` to
        linearly decrease the epsilon. After that, the epsilon will remain unchanged.
    :param final_epsilon: float, final epsilon value, usually a very small number
        towards zero.
    :param callback: (dict, dict) -> bool
        a function to decide whether it's time to stop training, takes following inputs:
        local_vars: dict, the local variables in the current scope
        global_vars: dict, the global variables in the current scope
    :return: ActWrapper, a callable function
    """
    n_actions = env.action_space.n
    sess = U.make_session(num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput(env.observation_space.shape, name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph,
        q_func,
        n_actions,
        optimizer=tf.train.AdamOptimizer(alpha),
        gamma=gamma,
        param_noise=param_noise,
        grad_norm_clipping=10)
    act_params = {
        "q_func": q_func,
        "n_actions": env.action_space.n,
        "make_obs_ph": make_obs_ph,
    }
    buffer = ReplayBuffer(buffer_size)
    exploration = LinearSchedule(schedule_steps=int(exploration_fraction *
                                                    n_steps),
                                 final_p=final_epsilon,
                                 initial_p=1.0)
    # writer = tf.summary.FileWriter("./log", sess.graph)

    U.initialize()
    # writer.close()
    update_target()  # copy from the main network
    episode_rewards = []
    current_episode_reward = 0.0
    model_saved = False
    saved_mean_reward = 0.0
    obs_t = env.reset()
    with tempfile.TemporaryDirectory() as td:
        model_file_path = os.path.join(td, "model")
        for step in range(n_steps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            kwargs = {}
            if not param_noise:
                epsilon = exploration.value(step)
            else:
                assert False, "Not implemented"
            action = act(np.array(obs_t)[None], epsilon=epsilon, **kwargs)[0]
            obs_tp1, reward, done, _ = env.step(action)
            current_episode_reward += reward
            buffer.add(obs_t, action, reward, obs_tp1, done)
            obs_t = obs_tp1
            if done:
                obs_t = env.reset()
                episode_rewards.append(current_episode_reward)
                current_episode_reward = 0.0
            # given sometime to fill in the buffer
            if step < pre_run_steps:
                continue
            # q_value = debug["q_values"]
            # if step % 1000 == 0:
            #     print(q_value(np.array(obs_t)[None]))
            if step % train_main_every == 0:
                obs_ts, actions, rewards, obs_tp1s, dones = buffer.sample(
                    batch_size)
                weights = np.ones_like(dones)
                td_error = train(obs_ts, actions, rewards, obs_tp1s, dones,
                                 weights)
            if step % update_target_every == 0:
                update_target()
            mean_100eps_reward = float(np.mean(episode_rewards[-101:-1]))
            if done and print_every is not None and len(
                    episode_rewards) % print_every == 0:
                print(
                    "step %d, episode %d, epsilon %.2f, running mean reward %.2f"
                    %
                    (step, len(episode_rewards), epsilon, mean_100eps_reward))
            if checkpoint_every is not None and step % checkpoint_every == 0:
                if saved_mean_reward is None or mean_100eps_reward > saved_mean_reward:
                    U.save_state(model_file_path)
                    model_saved = True
                    if print_every is not None:
                        print(
                            "Dump model to file due to mean reward increase: %.2f -> %.2f"
                            % (saved_mean_reward, mean_100eps_reward))
                    saved_mean_reward = mean_100eps_reward
        if model_saved:
            U.load_state(model_file_path)
            if print_every:
                print("Restore model from file with mean reward %.2f" %
                      (saved_mean_reward, ))
    return ActWrapper(act, act_params)
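The docstring spells out the q_func contract: it maps (input_ph, n_actions, scope, reuse) to a batch of Q-values. A minimal way to drive this learn() might look like the sketch below, assuming a TF1-style setup; the network architecture and the CartPole choice are illustrative, not taken from the original module:

import gym
import tensorflow as tf

def mlp_q_func(input_ph, n_actions, scope, reuse=False):
    # Minimal dense Q-network matching the (input_ph, n_actions, scope, reuse) contract.
    with tf.variable_scope(scope, reuse=reuse):
        hidden = tf.layers.dense(input_ph, 64, activation=tf.nn.relu)
        return tf.layers.dense(hidden, n_actions, activation=None)

env = gym.make("CartPole-v0")
act = learn(env, q_func=mlp_q_func, n_steps=50000, print_every=10)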
Example #12
def bernstein_error_partition_cuda(f_details, f, d, box, output_index, activation, filename, eps=1e-2):
    # Per-network error tolerance; unknown filenames fall back to the eps default.
    eps_table = {
        'nn_12_relu': 1e-2, 'nn_12_sigmoid': 1e-2, 'nn_12_tanh': 1e-2,
        'nn_12_relu_tanh': 1e-3, 'nn_13_relu': 1e-3, 'nn_13_sigmoid': 5e-4,
        'nn_13_tanh': 1e-2, 'nn_13_relu_tanh': 1e-2, 'nn_13_relu_tanh_1': 1e-2,
        'nn_13_relu_tanh_100': 1e-2, 'nn_13_relu_tanh_origin': 1e-2,
        'nn_14_relu': 1e-2, 'nn_14_sigmoid': 5e-3, 'nn_14_tanh': 1e-2,
        'nn_14_relu_sigmoid': 5e-3, 'nn_tora_relu_retrained': 1e-2,
        'nn_tora_tanh': 2e-2, 'nn_tora_relu_tanh': 1e-2, 'nn_tora_sigmoid': 1e-2,
        'nn_16_relu': 5e-3, 'nn_16_sigmoid': 1e-2, 'nn_16_tanh': 1e-2,
        'nn_16_relu_tanh': 1e-2, 'nn_18_relu': 4e-3, 'nn_18_relu_tanh': 4e-3,
        'nn_18_sigmoid': 4e-3, 'nn_18_tanh_new': 4e-3,
    }
    eps = eps_table.get(filename, eps)

    m = len(d)
    lips, network_output_range = lipschitz(f_details, box, output_index, activation)

    distance_estimate = 0
    for j in range(m):
        diff = np.diff(box[j])[0]
        if diff > distance_estimate:
            distance_estimate = diff

    LD_estimate = lips * distance_estimate * np.sqrt(m)
    num_partition = int(np.ceil(LD_estimate // eps + 1))

    partition = [num_partition]*m
    all_comb_lists = degree_comb_lists(partition, m)

    if isinstance(lips, np.ndarray):
        lips = lips[0]

    all_sample_points = np.zeros((len(all_comb_lists),m), dtype=np.float64)
    all_shift_points = np.zeros((len(all_comb_lists),m), dtype=np.float64)
    partition_box = np.zeros(m, dtype=np.float64)
    for j in range(m):
        alpha_j = np.float64(box[j][0])
        beta_j = np.float64(box[j][1])
        partition_box[j] = (beta_j - alpha_j) / num_partition

    all_comb_lists = np.array(all_comb_lists)
    for idxState in range(m):
        alpha_j = np.float64(box[idxState][0])
        beta_j = np.float64(box[idxState][1])  # upper end of the input interval
        all_sample_points[:, idxState] = (
            (beta_j - alpha_j) * (all_comb_lists[:, idxState]/num_partition)
            + alpha_j
        )
        all_shift_points = point_shift_all(all_sample_points, box)

    degree_list, coef_list = nn_poly_approx_bernstein_cuda(f, d, box, output_index)
    poly = polyval(degree_list, d, coef_list, 'test')
    with U.make_session() as sess:
        sess.run(tf.global_variables_initializer())
        poly_results = poly(sess, all_shift_points)
        nn_results = f_details(sess, all_sample_points)

    # nn_results = np.zeros(len(all_sample_points), dtype=np.float64)
    # for index in range(all_sample_points.shape[0]):
    #     point = all_sample_points[index,:]
    #     nn_results[index] = f(point)[output_index]


    sample_error = np.max(np.absolute(poly_results[:,0] - nn_results[:,0]))
    # max_index = np.argmax(np.absolute(poly_results - nn_results))
    # print(max_index)
    # print(all_sample_points[max_index, :])
    # print(nn_results[max_index])
    # print(all_shift_points[max_index, :])
    # print(poly_results[max_index])
    error = sample_error + lips * LA.norm(partition_box)

    return error
Example #13
def learn(env,
          q_func,
          lr=5e-4,
          max_timesteps=100000,
          buffer_size=50000,
          exploration_fraction=0.1,
          exploration_final_eps=0.02,
          train_freq=1,
          batch_size=32,
          print_freq=1,
          checkpoint_freq=10000,
          learning_starts=1000,
          gamma=1.0,
          target_network_update_freq=500,
          prioritized_replay=False,
          prioritized_replay_alpha=0.6,
          prioritized_replay_beta0=0.4,
          prioritized_replay_beta_iters=None,
          prioritized_replay_eps=1e-6,
          num_cpu=16,
          callback=None):
    """Train a deepq model.
    Parameters
    -------
    env : gym.Env
        environment to train on
    q_func: (tf.Variable, int, str, bool) -> tf.Variable
        the model that takes the following inputs:
            observation_in: object
                the output of observation placeholder
            num_actions: int
                number of actions
            scope: str
            reuse: bool
                should be passed to outer variable scope
        and returns a tensor of shape (batch_size, num_actions) with values of every action.
    lr: float
        learning rate for adam optimizer
    max_timesteps: int
        number of env steps to optimizer for
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every `train_freq` steps.
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress.
        set to None to disable printing.
    checkpoint_freq: int
        how often to save the model. This is so that the best version is restored
        at the end of the training. If you do not wish to restore the best version at
        the end of the training set this variable to None.
    learning_starts: int
        how many steps of the model to collect transitions for before learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    prioritized_replay: bool
        if True prioritized replay buffer will be used.
    prioritized_replay_alpha: float
        alpha parameter for prioritized replay buffer
    prioritized_replay_beta0: float
        initial value of beta for prioritized replay buffer
    prioritized_replay_beta_iters: int
        number of iterations over which beta will be annealed from initial value
        to 1.0. If set to None equals to max_timesteps.
    prioritized_replay_eps: float
        epsilon to add to the TD errors when updating priorities.
    num_cpu: int
        number of cpus to use for training
    callback: (locals, globals) -> bool
        function called at every steps with state of the algorithm.
        If callback returns true training stops.
    Returns
    -------
    act: ActWrapper
        Wrapper over act function. Adds ability to save it and load it.
        See header of baselines/deepq/categorical.py for details on the act function.
    """
    # Create all the functions necessary to train the model

    sess = U.make_session(num_cpu=num_cpu)
    sess.__enter__()

    def make_obs_ph(name):
        return U.BatchInput([84, 84], name=name)

    act, train, update_target, debug = build_train(
        make_obs_ph=make_obs_ph,
        q_func=q_func,
        num_actions=2,
        optimizer=tf.train.AdamOptimizer(learning_rate=lr),
        gamma=gamma,
        grad_norm_clipping=10)
    act_params = {
        'make_obs_ph': make_obs_ph,
        'q_func': q_func,
        'num_actions': 2,
    }
    # Create the replay buffer
    if prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(buffer_size,
                                                alpha=prioritized_replay_alpha)
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = max_timesteps
        beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                       initial_p=prioritized_replay_beta0,
                                       final_p=1.0)
    else:
        replay_buffer = ReplayBuffer(buffer_size)
        beta_schedule = None
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(schedule_timesteps=int(exploration_fraction *
                                                        max_timesteps),
                                 initial_p=1.0,
                                 final_p=exploration_final_eps)

    # Initialize the parameters and copy them to the target network.
    U.initialize()
    update_target()

    episode_rewards = [0.0]
    saved_mean_reward = None
    obs = env.step(0)
    with tempfile.TemporaryDirectory() as td:
        model_saved = False
        model_file = os.path.join(td, "model")
        for t in range(max_timesteps):
            if callback is not None:
                if callback(locals(), globals()):
                    break
            # Take action and update exploration to the newest value
            action = act(np.array(obs)[None],
                         update_eps=exploration.value(t))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                #obs = env.reset()
                episode_rewards.append(0)

            if t > learning_starts and t % train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if prioritized_replay:
                    experience = replay_buffer.sample(
                        batch_size, beta=beta_schedule.value(t))
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None
                td_errors = train(obses_t, actions, rewards, obses_tp1, dones,
                                  weights)
                if prioritized_replay:
                    new_priorities = np.abs(td_errors) + prioritized_replay_eps
                    replay_buffer.update_priorities(batch_idxes,
                                                    new_priorities)

            if t > learning_starts and t % target_network_update_freq == 0:
                # Update target network periodically.
                update_target()

            mean_100ep_reward = round(np.mean(episode_rewards[-101:-1]), 1)
            num_episodes = len(episode_rewards)
            if done and print_freq is not None and len(
                    episode_rewards) % print_freq == 0:
                #logger.record_tabular("steps", t)
                #logger.record_tabular("episodes", num_episodes)
                #logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                #logger.record_tabular("% time spent exploring", int(100 * exploration.value(t)))
                #logger.dump_tabular()
                print("steps: {}".format(t))
                print("episodes: {}".format(num_episodes))
                print("mean 100 episode reward: {}".format(mean_100ep_reward))
                print("% time spent exploring: {}".format(
                    int(100 * exploration.value(t))))

            if (checkpoint_freq is not None and t > learning_starts
                    and num_episodes > 100 and t % checkpoint_freq == 0):
                if saved_mean_reward is None or mean_100ep_reward > saved_mean_reward:
                    #if print_freq is not None:
                    #logger.log("Saving model due to mean reward increase: {} -> {}".format(
                    #           saved_mean_reward, mean_100ep_reward))
                    U.save_state(model_file)
                    model_saved = True
                    saved_mean_reward = mean_100ep_reward
        if model_saved:
            #if print_freq is not None:
            #logger.log("Restored model with mean reward: {}".format(saved_mean_reward))
            U.load_state(model_file)

    return ActWrapper(act, act_params)
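The prioritized-replay beta described in the docstring is annealed linearly from prioritized_replay_beta0 to 1.0 over prioritized_replay_beta_iters steps, which is exactly how the LinearSchedule above is configured. A standalone re-implementation of that schedule (a sketch with no baselines dependency) shows the values it produces:

class LinearScheduleSketch:
    """Minimal linear annealing, mirroring the beta schedule used above."""
    def __init__(self, schedule_timesteps, final_p, initial_p):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

beta = LinearScheduleSketch(schedule_timesteps=100000, final_p=1.0, initial_p=0.4)
print(beta.value(0), beta.value(50000), beta.value(100000))  # 0.4 0.7 1.0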
Example #14
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e7))
    parser.add_argument('--num_env', type=int, default=16)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu',
                        type=int,
                        default=0)
    parser.add_argument('--update_ob_stats_from_random_agent',
                        type=int,
                        default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update',
                        type=float,
                        default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy',
                        type=str,
                        default='rnn',
                        choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=0)
    parser.add_argument('--dynamics_bonus', type=int, default=0)

    parser.add_argument('--clear-run',
                        action='store_true',
                        default=False,
                        help='if clear the save folder')
    parser.add_argument('--mega-wrapper',
                        type=int,
                        default=0,
                        help='if use the same wrapper as mega')

    args = parser.parse_args()
    args.save_dir = '../rnd_results/'
    args.save_dir = os.path.join(args.save_dir, 'e_n-{}/'.format(args.env))
    args.save_dir = os.path.join(
        args.save_dir, 'mega_wrapper-{}'.format(str(args.mega_wrapper)))
    args.save_dir = os.path.join(args.save_dir,
                                 'num_env-{}'.format(str(args.num_env)))
    args.save_dir = os.path.join(args.save_dir,
                                 'int_coeff-{}'.format(str(args.int_coeff)))

    if args.clear_run:
        '''if clear_run, clear the path before create the path'''
        input('You have set clear_run, is that what you want?')
        subprocess.call(["rm", "-r", args.save_dir])

    try:
        os.makedirs(args.save_dir)
    except Exception as e:
        print('file exists')

    try:
        os.makedirs('../rnd_log_results/' + args.env + '/')
    except Exception as e:
        print('log file exists')

    args.summary_writer = tf.summary.FileWriter(args.save_dir)

    logger.configure(dir='../rnd_log_results/' + args.env + '/',
                     format_strs=['stdout', 'log', 'csv']
                     if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'),
                  'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()

    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(frame_stack=4,
               nminibatches=4,
               nepochs=4,
               lr=0.0001,
               max_grad_norm=0.0,
               use_news=args.use_news,
               gamma=args.gamma,
               gamma_ext=args.gamma_ext,
               max_episode_steps=args.max_episode_steps,
               lam=args.lam,
               update_ob_stats_every_step=args.update_ob_stats_every_step,
               update_ob_stats_independently_per_gpu=args.
               update_ob_stats_independently_per_gpu,
               update_ob_stats_from_random_agent=args.
               update_ob_stats_from_random_agent,
               proportion_of_exp_used_for_predictor_update=args.
               proportion_of_exp_used_for_predictor_update,
               policy=args.policy,
               int_coeff=args.int_coeff,
               ext_coeff=args.ext_coeff,
               dynamics_bonus=args.dynamics_bonus)

    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps,
          args=args)
Example #15
def main():
    import neptune

    parser = argparse.ArgumentParser(argument_default=None)
    parser.add_argument('--config', action='append', help='Gin config files.')
    parser.add_argument('--debug', action='store_true', default=False)
    cmd_args, unknown = parser.parse_known_args()
    debug = cmd_args.debug
    spec_path = cmd_args.config[0]

    if not debug:
        try:
            with open(spec_path, 'rb') as f:
                import cloudpickle
                specification = cloudpickle.load(f)
        except pickle.UnpicklingError:
            with open(spec_path) as f:
                vars_ = {'script': os.path.basename(spec_path)}
                exec(f.read(), vars_)  # pylint: disable=exec-used
                specification = vars_['experiments_list'][0].to_dict()
                print(
                    'NOTE: Only the first experiment from the list will be run!'
                )
        parameters = specification['parameters']
    else:
        print("debug run")
        parameters = dict(env_id="toy_mr", env_size=None)

    class MockArgs(object):
        def add(self, key, value):
            setattr(self, key, value)

    args = MockArgs()

    args.add('env', parameters["env_id"])  # 'chain_env' 'toy_mr'
    args.add('env_size', parameters["env_size"])
    args.add('seed', 0)
    args.add('max_episode_steps', 300)

    args.add('num_timesteps', int(1e12))
    args.add('num_env', 32)
    args.add('use_news', 0)
    args.add('gamma', 0.99)
    args.add('gamma_ext', 0.999)
    args.add('lam', 0.95)
    args.add('update_ob_stats_every_step', 0)
    args.add('update_ob_stats_independently_per_gpu', 0)
    args.add('update_ob_stats_from_random_agent', 1)
    args.add('proportion_of_exp_used_for_predictor_update', 1.)
    args.add('tag', '')
    args.add(
        'policy',
        'cnn',
    )
    args.add('int_coeff', 1.)
    args.add('ext_coeff', 2.)
    args.add('dynamics_bonus', 0)

    if not debug:
        # TODO read more from specification
        print("running with neptune")
        neptune.init(
            project_qualified_name="pmtest/planning-with-learned-models")
        neptune.create_experiment(
            name=specification['name'],
            tags=specification['tags'],
            params=specification['parameters'],
            upload_stdout=False,
            upload_stderr=False,
        )
        neptune.send_metric("test", 777)
        baselines_format_strs = ['log', 'csv']
    else:
        print("running without neptune")
        baselines_format_strs = ['stdout', 'log', 'csv']

    logger.configure(dir="out", format_strs=baselines_format_strs)

    seed = 10000 * args.seed  # + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(frame_stack=4,
               nminibatches=4,
               nepochs=4,
               lr=0.0001,
               max_grad_norm=0.0,
               env_size=args.env_size,
               use_news=args.use_news,
               gamma=args.gamma,
               gamma_ext=args.gamma_ext,
               max_episode_steps=args.max_episode_steps,
               lam=args.lam,
               update_ob_stats_every_step=args.update_ob_stats_every_step,
               update_ob_stats_independently_per_gpu=args.
               update_ob_stats_independently_per_gpu,
               update_ob_stats_from_random_agent=args.
               update_ob_stats_from_random_agent,
               proportion_of_exp_used_for_predictor_update=args.
               proportion_of_exp_used_for_predictor_update,
               policy=args.policy,
               int_coeff=args.int_coeff,
               ext_coeff=args.ext_coeff,
               dynamics_bonus=args.dynamics_bonus)

    tf_util.make_session(make_default=True)
    train(env_id=args.env,
          num_env=args.num_env,
          seed=seed,
          num_timesteps=args.num_timesteps,
          hps=hps,
          use_neptune=(not debug))
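The MockArgs class above only mimics an argparse.Namespace so that parameters loaded from a specification file can be passed around like parsed CLI arguments. The standard library already provides the same thing; a brief sketch of that alternative (my suggestion, not in the original, reusing the parameters dict from the snippet):

from types import SimpleNamespace

# Equivalent to instantiating MockArgs and calling args.add(...) repeatedly.
args = SimpleNamespace(
    env=parameters["env_id"],
    env_size=parameters["env_size"],
    seed=0,
    max_episode_steps=300,
    num_timesteps=int(1e12),
)
print(args.env, args.seed)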