Example #1
# Module-level imports assumed by these snippets (old-style OpenAI Baselines layout):
# import logging, gym, tensorflow as tf, os.path as osp
# from baselines import bench, logger
# import baselines.common.tf_util as U
# from baselines.common import set_global_seeds
def train(env_id, num_timesteps, seed):
    from baselines.pposgd import mlp_policy, pposgd_simple
    sess = U.make_session(num_cpu=1)
    sess.__enter__()
    logger.session().__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)

    env = bench.Monitor(env, "monitor.json")
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
            max_timesteps=num_timesteps,
            timesteps_per_batch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95,
            schedule='linear'
        )
    env.close()

    # save model
    saver = tf.train.Saver()
    saver.save(sess, "model/model.ckpt")
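
These train functions are usually driven by a small command-line entry point. A minimal sketch of one, assuming hypothetical flag names (--env, --seed, and --num-timesteps are illustrative, not part of the original snippet):

import argparse

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', default='Hopper-v1')        # any Gym environment id
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--num-timesteps', type=int, default=int(1e6))
    args = parser.parse_args()
    train(args.env, num_timesteps=args.num_timesteps, seed=args.seed)

if __name__ == '__main__':
    main()
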
Example #2
def train(env_id, num_timesteps, seed, num_cpu):
    from baselines.pposgd import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    whoami = mpi_fork(num_cpu)
    if whoami == "parent": return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0: logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps //= 4  # integer-divide: the frame-skip wrapper consumes 4 raw frames per env step
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=256,
        clip_param=0.2, entcoeff=0.01,
        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
        gamma=0.99, lam=0.95,
        schedule='linear'
    )
    env.close()
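
The "parent" check above relies on the mpi_fork helper, which re-executes the script under mpirun and returns "parent" in the launching process so it can exit immediately while the MPI children run train(). A minimal sketch of that idiom (the real helper lived in baselines.common at the time; treat this as an approximation, not its exact code):

import os, subprocess, sys

def mpi_fork(n):
    # already inside an MPI child, or no parallelism requested: just run
    if n <= 1 or os.getenv("IN_MPI"):
        return "child"
    env = os.environ.copy()
    env["IN_MPI"] = "1"  # mark the children so they skip the re-exec
    subprocess.check_call(["mpirun", "-np", str(n), sys.executable] + sys.argv, env=env)
    return "parent"
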
Example #3
def train(env_id, num_timesteps, seed):
    from baselines.pposgd import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    logger.session().__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=2)

    env = bench.Monitor(env, osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=num_timesteps,
        timesteps_per_batch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
    )
    env.close()
Example #4
def train(env_id, num_timesteps, seed, num_cpu):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space): #pylint: disable=W0613
        return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json"%rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps //= 4  # integer-divide: the frame-skip wrapper consumes 4 raw frames per env step
    env.seed(workerseed)

    trpo_mpi.learn(env, policy_fn, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3,
        max_timesteps=num_timesteps, gamma=0.98, lam=1.0, vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
    env.close()
Example #5
def train(env_id, num_timesteps, seed):
    from baselines.pposgd import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    logger.session().__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn, 
            max_timesteps=num_timesteps,
            timesteps_per_batch=2048,
            clip_param=0.2, entcoeff=0.0,
            optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
            gamma=0.99, lam=0.95,
        )
    env.close()
Example #6
def train(env_id, num_timesteps, seed, num_cpu):
    from baselines.pposgd import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    whoami = mpi_fork(num_cpu)
    if whoami == "parent": return
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session().__enter__()
    if rank != 0: logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  #pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space)

    env = bench.Monitor(env,
                        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    env = wrap_train(env)
    num_timesteps //= 4  # integer-divide: the frame-skip wrapper consumes 4 raw frames per env step
    env.seed(workerseed)

    pposgd_simple.learn(env,
                        policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=256,
                        clip_param=0.2,
                        entcoeff=0.01,
                        optim_epochs=4,
                        optim_stepsize=1e-3,
                        optim_batchsize=64,
                        gamma=0.99,
                        lam=0.95,
                        schedule='linear')
    env.close()
Example #7
def enjoy(env_id, num_timesteps, seed):
    from baselines.pposgd import mlp_policy, pposgd_simple
    sess = U.make_session(num_cpu=1)
    sess.__enter__()
    logger.session().__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)
    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
            hid_size=64, num_hid_layers=2)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "monitor.json"))
    obs = env.reset()
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pi = policy_fn('pi', env.observation_space, env.action_space)
    tf.train.Saver().restore(sess, '/tmp/model')
    done = False
    while not done:
        action = pi.act(True, obs)[0]  # act(stochastic, ob) returns (action, vpred)
        obs, reward, done, info = env.step(action)
        env.render()
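
One caveat on the restore above: tf.train.Saver().restore() must be given the same checkpoint prefix that save() wrote during training (Example #1 saves to "model/model.ckpt", while this snippet reads '/tmp/model'). A minimal sketch of a matching pair, with the prefix path as an assumption:

saver = tf.train.Saver()
saver.save(sess, "/tmp/model")                 # at the end of a training run
# ... later, in a fresh process, after rebuilding the same graph:
tf.train.Saver().restore(sess, "/tmp/model")   # before rolling out, as in enjoy()
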
Example #8
def train(env_id, num_timesteps, seed, num_cpu):
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    import baselines.common.tf_util as U
    logger.session().__enter__()
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=32,
                         num_hid_layers=2)

    env = bench.Monitor(env,
                        osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi.learn(env,
                   policy_fn,
                   timesteps_per_batch=1024,
                   max_kl=0.01,
                   cg_iters=10,
                   cg_damping=0.1,
                   max_timesteps=num_timesteps,
                   gamma=0.99,
                   lam=0.98,
                   vf_iters=5,
                   vf_stepsize=1e-3)
    env.close()
Example #9
def train(env_id, num_timesteps, seed, model_name, model_path, para, load_model,
          timesteps_per_batch, hidden_units, hidden_layers):
    # num_cpu and the learn() hyperparameters below are assumed to be module-level settings
    whoami = mpi_fork(num_cpu)
    if whoami == "parent":
        return
    import baselines.common.tf_util as U
    logger.session().__enter__()
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)
    env = SubsetWrapper(env, para)
    #env = gym_kidney.LogWrapper(env, NN, EXP, OUT, FREQ, PARAM)
    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name,
                         ob_space=env.observation_space,
                         ac_space=env.action_space,
                         hid_size=hidden_units,
                         num_hid_layers=hidden_layers)
    env = bench.Monitor(env, osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    # env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_indi.learn(env, policy_fn,
                    timesteps_per_batch=timesteps_per_batch,
                    max_kl=max_kl, cg_iters=cg_iters,
                    cg_damping=cg_damping,
                    max_episodes=num_timesteps,
                    gamma=gamma, lam=lam,
                    vf_iters=vf_iters,
                    vf_stepsize=vf_stepsize,
                    load_model=load_model,
                    model_path=model_path)
    env.close()
Example #10
def train(args):
    from baselines.pposgd import mlp_policy, pposgd_simple
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    logger.session(dir=args.exp_path,
                   format_strs=None if rank == 0 and not args.test_only
                   and not args.evaluate else []).__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = args.seed + 10000 * rank
    set_global_seeds(workerseed)

    if args.submit:
        env = SubmitRunEnv(visualize=args.render)
    elif args.submit_round2:
        from turnips.submit_round2_env import SubmitRunEnv2
        submit_env = env = SubmitRunEnv2()
    elif args.simwalker:
        env = SimWalker(visualize=args.render)
    else:
        env = IsolatedMyRunEnv(visualize=args.render,
                               run_logs_dir=args.run_logs_dir,
                               additional_info={'exp_name': args.exp_name},
                               step_timeout=args.step_timeout,
                               n_obstacles=args.n_obstacles,
                               higher_pelvis=args.higher_pelvis)

    env = RunEnvWrapper(env, args.diff)
    if args.simwalker and args.log_simwalker:
        cls = type(
            "h5pyEnvLoggerClone", (gym.Wrapper, ),
            dict(h5pyEnvLogger.__dict__))  # workaround for double wrap problem
        env = cls(env,
                  log_dir=args.run_logs_dir,
                  filename_prefix='simwalker_',
                  additional_info={
                      'exp_name': args.exp_name,
                      'difficulty': args.diff,
                      'seed': args.seed
                  })

    env = env_walker = Walker(env,
                              shaping_mode=args.shaping,
                              transform_inputs=args.transform_inputs,
                              obstacle_hack=not args.noobsthack,
                              max_steps=args.max_env_steps,
                              memory_size=args.memory_size,
                              swap_legs_mode=args.swap_legs_mode,
                              filter_obs=args.filter_obs,
                              add_time=args.add_time,
                              fall_penalty=args.fall_penalty,
                              fall_penalty_value=args.fall_penalty_val,
                              print_action=args.print_action,
                              new8_fix=args.new8_fix,
                              pause=args.pause,
                              noisy_obstacles=args.noisy_obstacles,
                              noisy_obstacles2=args.noisy_obstacles2,
                              noisy_fix=args.noisy_fix)

    if args.log_walker:
        env = h5pyEnvLogger(env,
                            log_dir=args.run_logs_dir,
                            filename_prefix='walker_',
                            additional_info={
                                'exp_name': args.exp_name,
                                'difficulty': args.diff,
                                'seed': args.seed
                            })
    if args.muscles:
        env = MuscleWalker(env)
    if args.repeats > 1:
        env = RepeatActionsWalker(env, args.repeats)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(
            name=name,
            ob_space=ob_space,
            ac_space=ac_space,
            hid_size=args.hid_size,
            num_hid_layers=args.num_hid_layers,
            bound_by_sigmoid=args.bound_by_sigmoid,
            sigmoid_coef=args.sigmoid_coef,
            activation=args.activation,
            normalize_obs=not args.nonormalize_obs,
            gaussian_fixed_var=not args.nogaussian_fixed_var,
            avg_norm_symmetry=args.avg_norm_symmetry,
            symmetric_interpretation=args.symmetric_interpretation,
            stdclip=args.stdclip,
            actions=args.actions,
            gaussian_bias=args.gaussian_bias,
            gaussian_from_binary=args.gaussian_from_binary,
            parallel_value=args.parallel_value,
            pv_layers=args.pv_layers,
            pv_hid_size=args.pv_hid_size,
            three=args.three)

    if not args.test_only and not args.evaluate:
        env = bench.Monitor(env,
                            path.join(args.exp_path, "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    current_best = float('-inf')
    current_best_completed = float('-inf')
    current_best_perc_completed = float('-inf')
    stats_f = None

    start = time.time()

    def callback(local_, global_):
        nonlocal current_best
        nonlocal current_best_completed
        nonlocal current_best_perc_completed
        nonlocal stats_f
        if rank != 0: return
        if args.test_only or args.evaluate: return

        print('ELAPSED', time.time() - start)
        print(f'{socket.gethostname()}:{args.exp_path}')

        iter_no = local_['iters_so_far']
        if iter_no % args.save_every == 0:
            U.save_state(
                path.join(args.exp_path, 'models', f'{iter_no:04d}', 'model'))

        if local_['iters_so_far'] == 0:
            stats_f = open(path.join(args.exp_path, 'simple_stats.csv'), 'w')
            cols = [
                "Iter", "EpLenMean", "EpRewMean", "EpOrigRewMean",
                "EpThisIter", "EpisodesSoFar", "TimestepsSoFar", "TimeElapsed",
                "AvgCompleted", "PercCompleted"
            ]
            for name in local_['loss_names']:
                cols.append("loss_" + name)
            stats_f.write(",".join(cols) + '\n')
        else:
            current_orig_reward = np.mean(local_['origrew_buffer'])
            if current_best < current_orig_reward:
                print(
                    f'Found better {current_best:.2f} -> {current_orig_reward:.2f}'
                )
                current_best = current_orig_reward
                U.save_state(path.join(args.exp_path, 'best', 'model'))
            U.save_state(path.join(args.exp_path, 'last', 'model'))

            avg_completed = local_["avg_completed"]
            if current_best_completed < avg_completed:
                print(
                    f'Found better completed {current_best_completed:.2f} -> {avg_completed:.2f}'
                )
                current_best_completed = avg_completed
                U.save_state(
                    path.join(args.exp_path, 'best_completed', 'model'))

            perc_completed = local_["perc_completed"]
            if current_best_perc_completed < perc_completed:
                print(
                    f'Found better perc completed {current_best_perc_completed:.2f} -> {perc_completed:.2f}'
                )
                current_best_perc_completed = perc_completed
                U.save_state(
                    path.join(args.exp_path, 'perc_completed', 'model'))

            data = [
                local_['iters_so_far'],
                np.mean(local_['len_buffer']),
                np.mean(local_['rew_buffer']),
                np.mean(local_['origrew_buffer']),
                len(local_['lens']),
                local_['episodes_so_far'],
                local_['timesteps_so_far'],
                time.time() - local_['tstart'],
                avg_completed,
                perc_completed,
            ]
            if 'meanlosses' in local_:
                for lossval in local_['meanlosses']:
                    data.append(lossval)

            stats_f.write(",".join([str(x) for x in data]) + '\n')
            stats_f.flush()

    if args.load_model is not None:
        args.load_model += '/model'
    if args.submit_round2:
        submit_round2(env,
                      submit_env,
                      policy_fn,
                      load_model_path=args.load_model,
                      stochastic=False,
                      actions=args.actions)
        #submit_env.submit()   # submit_round2(...) submits already
        sys.exit()
    if args.evaluate:
        pposgd_simple.evaluate(env,
                               policy_fn,
                               load_model_path=args.load_model,
                               n_episodes=args.n_eval_episodes,
                               stochastic=not args.nostochastic,
                               actions=args.actions,
                               execute_just=args.execute_just)
    else:
        pposgd_simple.learn(
            env,
            policy_fn,
            max_timesteps=args.max_timesteps,
            timesteps_per_batch=args.timesteps_per_batch,
            clip_param=args.clip_param,
            entcoeff=args.entcoeff,
            optim_epochs=args.optim_epochs,
            optim_stepsize=args.optim_stepsize,
            optim_batchsize=args.optim_batchsize,
            gamma=args.gamma,
            lam=args.lam,
            callback=callback,
            load_model_path=args.load_model,
            test_only=args.test_only,
            stochastic=not args.nostochastic,
            symmetric_training=args.symmetric_training,
            obs_names=env_walker.obs_names,
            single_episode=args.single_episode,
            horizon_hack=args.horizon_hack,
            running_avg_len=args.running_avg_len,
            init_three=args.init_three,
            actions=args.actions,
            symmetric_training_trick=args.symmetric_training_trick,
            bootstrap_seeds=args.bootstrap_seeds,
            seeds_fn=args.seeds_fn,
        )
    env.close()
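
The long callback above follows the contract the old pposgd_simple.learn used: the callback is invoked once per iteration with the function's locals() and globals(), so any training variable (iters_so_far, rew_buffer, loss_names, ...) can be read out by name. A minimal sketch of the same hook, assuming those local names and the module imports noted in Example #1; the checkpoints/ directory is hypothetical:

def simple_callback(local_, global_):
    it = local_['iters_so_far']
    if it % 10 == 0:
        print("iter", it, "mean reward", np.mean(local_['rew_buffer']))
        U.save_state(osp.join("checkpoints", "model_%04d" % it, "model"))
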
Example #11
        # (tail of a deepq q-function such as model(inpt, num_actions, scope, reuse):
        # run an LSTM over the observation sequence and read Q-values off the last output)
        cell = tf.nn.rnn_cell.BasicLSTMCell(64)
        rnn_out, last_state = tf.nn.dynamic_rnn(cell, out, dtype=tf.float32)
        out = rnn_out[:, -1]

        #out = layers.fully_connected(out, num_outputs=16, activation_fn=tf.nn.tanh)
        out = layers.fully_connected(out,
                                     num_outputs=num_actions,
                                     activation_fn=None)
        return out


if __name__ == '__main__':
    np.random.seed(7)

    batch_size = 1  # must be 1 here: the sequences have different lengths
    log_session = logger.session(dir='logs')

    with U.make_session(8):
        # Create all the functions necessary to train the model
        act, train, update_target, debug = deepq.build_train(
            make_obs_ph=lambda name: tf.placeholder(tf.int32, [None, None],
                                                    name=name),
            q_func=model,
            num_actions=s.action_space,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
        )
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000,
Example #12
def on_iteration_start(local_vars, global_vars):
    on_iteration_start.iteration += 1
    load_model(on_iteration_start.iteration)
    plot_history(local_vars['history'], on_iteration_start.iteration)
    save_model(on_iteration_start.iteration)


# the function attribute serves as persistent state across callback invocations
on_iteration_start.iteration = 0

whoami = mpi_fork(args.cores)
if whoami == 'parent':
    exit(0)

session = U.single_threaded_session()
session.__enter__()
logger.session().__enter__()

env = RunEnv(args.visualize, max_obstacles=args.obstacles, original_reward=args.original)
env.spec.timestep_limit = args.max_steps
if args.visualize:
    vis = env.osim_model.model.updVisualizer().updSimbodyVisualizer()
    vis.setBackgroundType(vis.GroundAndSky)
    vis.setShowFrameNumber(True)
    vis.zoomCameraToShowAllGeometry()
    vis.setCameraFieldOfView(1)

if args.train:
    history = pposgd_simple.learn(
        env,
        policy_fn,
        max_timesteps=args.steps,