Example #1: training a PPO agent on a PyBullet environment, with distributed data collection and gradient workers via Ray.

# NOTE: the pytorchrl import paths below are assumed from a typical layout of the
# library and may need adjusting to the installed version. get_args() is assumed
# to be an argparse helper defined in the same script (a hypothetical sketch is
# given after Example #2).
import os
import sys
import json
import time

import ray

from pytorchrl import utils
from pytorchrl.learner import Learner
from pytorchrl.scheme import Scheme
from pytorchrl.agent.env import VecEnv
from pytorchrl.agent.algorithms import PPO
from pytorchrl.agent.storages import GAEBuffer
from pytorchrl.agent.actors import OnPolicyActor, get_feature_extractor
from pytorchrl.envs import pybullet_train_env_factory, pybullet_test_env_factory


def main():

    args = get_args()
    utils.cleanup_log_dir(args.log_dir)
    args_dict = vars(args)
    with open(os.path.join(args.log_dir, "training_arguments.json"), "w") as f:
        json.dump(args_dict, f, indent=4)

    if args.cluster:
        ray.init(address="auto")
    else:
        ray.init()

    # Print the resources available to the Ray cluster
    resources = ", ".join(
        "{} {}".format(k, v) for k, v in ray.cluster_resources().items())
    print(resources, flush=True)

    # 1. Define Train Vector of Envs
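    # VecEnv.create_factory returns a factory callable for the vectorized
    # environments together with the action and observation spaces, which are
    # passed to the actor factory in step 4.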
    train_envs_factory, action_space, obs_space = VecEnv.create_factory(
        vec_env_size=args.num_env_processes,
        log_dir=args.log_dir,
        env_fn=pybullet_train_env_factory,
        env_kwargs={
            "env_id": args.env_id,
            "frame_skip": args.frame_skip,
            "frame_stack": args.frame_stack
        })

    # 2. Define Test Vector of Envs (Optional)
    test_envs_factory, _, _ = VecEnv.create_factory(
        vec_env_size=args.num_env_processes,
        log_dir=args.log_dir,
        env_fn=pybullet_test_env_factory,
        env_kwargs={
            "env_id": args.env_id,
            "frame_skip": args.frame_skip,
            "frame_stack": args.frame_stack
        })

    # 3. Define RL training algorithm
    algo_factory = PPO.create_factory(
        lr=args.lr,
        eps=args.eps,
        num_epochs=args.ppo_epoch,
        clip_param=args.clip_param,
        entropy_coef=args.entropy_coef,
        value_loss_coef=args.value_loss_coef,
        max_grad_norm=args.max_grad_norm,
        num_mini_batch=args.num_mini_batch,
        use_clipped_value_loss=args.use_clipped_value_loss,
        gamma=args.gamma)

    # 4. Define RL Policy
    actor_factory = OnPolicyActor.create_factory(
        obs_space,
        action_space,
        feature_extractor_network=get_feature_extractor(args.nn),
        recurrent_policy=args.recurrent_policy,
        restart_model=args.restart_model)

    # 5. Define rollouts storage
    storage_factory = GAEBuffer.create_factory(size=args.num_steps,
                                               gae_lambda=args.gae_lambda)

    # 6. Define scheme
    params = {}

    # add core modules
    params.update({
        "algo_factory": algo_factory,
        "actor_factory": actor_factory,
        "storage_factory": storage_factory,
        "train_envs_factory": train_envs_factory,
        "test_envs_factory": test_envs_factory,
    })

    # add collection specs
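    # NOTE: col_worker_resources below reserves 1 CPU and half a GPU per
    # collection worker; setting fraction_samples and fraction_workers to 1.0
    # presumably makes collection fully synchronous (each step waits for all
    # samples from all workers).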
    params.update({
        "num_col_workers": args.num_col_workers,
        "col_communication": args.com_col_workers,
        "col_worker_resources": {
            "num_cpus": 1,
            "num_gpus": 0.5
        },
        "sync_col_specs": {
            "fraction_samples": 1.0,
            "fraction_workers": 1.0
        }
    })

    # add gradient specs
    params.update({
        "num_grad_workers": args.num_grad_workers,
        "grad_communication": args.com_grad_workers,
        "grad_worker_resources": {
            "num_cpus": 1.0,
            "num_gpus": 0.5
        },
    })

    scheme = Scheme(**params)

    # 7. Define learner
    learner = Learner(scheme,
                      target_steps=args.num_env_steps,
                      log_dir=args.log_dir)

    # 8. Define train loop
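    # Each call to learner.step() advances training by one iteration. Progress
    # is printed every log_interval iterations, the model (and the updated
    # argument file) is saved every save_interval iterations, and the loop ends
    # once target_steps environment steps are reached or, if max_time is set,
    # the time budget is exceeded.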
    iterations = 0
    start_time = time.time()
    while not learner.done():

        learner.step()

        if iterations % args.log_interval == 0:
            learner.print_info()

        if iterations % args.save_interval == 0:
            save_name = learner.save_model()
            args_dict.update({"latest_model": save_name})
            args_path = os.path.join(args.log_dir, "training_arguments.json")
            with open(args_path, "w") as f:
                json.dump(args_dict, f, indent=4)

        if args.max_time != -1 and (time.time() - start_time) > args.max_time:
            break

        iterations += 1

    print("Finished!")
    sys.exit()
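To run this on a Ray cluster, start the cluster first (for example with ray start --head on the head node) and launch the script with the cluster flag enabled, so that ray.init(address="auto") attaches to the existing cluster; without the flag, Ray starts a local instance inside the script.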
Example #2: training a SAC agent on an Atari environment, with distributed data collection and gradient workers via Ray.

# NOTE: as in Example #1, the pytorchrl import paths are assumed and may need
# adjusting to the installed version of the library.
import os
import sys
import json
import time

import ray

from pytorchrl import utils
from pytorchrl.learner import Learner
from pytorchrl.scheme import Scheme
from pytorchrl.agent.env import VecEnv
from pytorchrl.agent.algorithms import SAC
from pytorchrl.agent.storages import ReplayBuffer
from pytorchrl.agent.actors import OffPolicyActor, get_feature_extractor
from pytorchrl.envs import atari_train_env_factory, atari_test_env_factory


def main():

    args = get_args()
    utils.cleanup_log_dir(args.log_dir)
    args_dict = vars(args)
    with open(os.path.join(args.log_dir, "training_arguments.json"), "w") as f:
        json.dump(args_dict, f, indent=4)

    if args.cluster:
        ray.init(address="auto")
    else:
        ray.init()

    # Print the resources available to the Ray cluster
    resources = ", ".join(
        "{} {}".format(k, v) for k, v in ray.cluster_resources().items())
    print(resources, flush=True)

    # 1. Define Train Vector of Envs
    train_envs_factory, action_space, obs_space = VecEnv.create_factory(
        env_fn=atari_train_env_factory,
        env_kwargs={
            "env_id": args.env_id,
            "frame_stack": args.frame_stack
        },
        vec_env_size=args.num_env_processes,
        log_dir=args.log_dir,
        info_keywords=('rr', 'rrr', 'lives'))

    # 2. Define Test Vector of Envs (Optional)
    test_envs_factory, _, _ = VecEnv.create_factory(
        env_fn=atari_test_env_factory,
        env_kwargs={
            "env_id": args.env_id,
            "frame_stack": args.frame_stack
        },
        vec_env_size=args.num_env_processes,
        log_dir=args.log_dir)

    # 3. Define RL training algorithm
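    # SAC is off-policy, so this example pairs it with an OffPolicyActor
    # (step 4) and a ReplayBuffer (step 5) instead of the OnPolicyActor and
    # GAEBuffer used in the PPO example above.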
    algo_factory = SAC.create_factory(lr_pi=args.lr,
                                      lr_q=args.lr,
                                      lr_alpha=args.lr,
                                      initial_alpha=args.alpha,
                                      gamma=args.gamma,
                                      polyak=args.polyak,
                                      num_updates=args.num_updates,
                                      update_every=args.update_every,
                                      start_steps=args.start_steps,
                                      mini_batch_size=args.mini_batch_size)

    # 4. Define RL Policy
    actor_factory = OffPolicyActor.create_factory(
        obs_space,
        action_space,
        feature_extractor_network=get_feature_extractor(args.nn),
        recurrent_policy=args.recurrent_policy,
        restart_model=args.restart_model)

    # 5. Define rollouts storage
    storage_factory = ReplayBuffer.create_factory(size=args.buffer_size)

    # 6. Define scheme
    params = {}

    # add core modules
    params.update({
        "algo_factory": algo_factory,
        "actor_factory": actor_factory,
        "storage_factory": storage_factory,
        "train_envs_factory": train_envs_factory,
        "test_envs_factory": test_envs_factory,
    })

    # add collection specs
    params.update({
        "num_col_workers": args.num_col_workers,
        "col_communication": args.com_col_workers,
        "col_worker_resources": {
            "num_cpus": 1,
            "num_gpus": 0.5
        },
        "sync_col_specs": {
            "fraction_samples": 1.0,
            "fraction_workers": 1.0
        }
    })

    # add gradient specs
    params.update({
        "num_grad_workers": args.num_grad_workers,
        "grad_communication": args.com_grad_workers,
        "grad_worker_resources": {
            "num_cpus": 1.0,
            "num_gpus": 0.5
        },
    })

    scheme = Scheme(**params)

    # 7. Define learner
    learner = Learner(scheme,
                      target_steps=args.num_env_steps,
                      log_dir=args.log_dir)

    # 8. Define train loop
    iterations = 0
    start_time = time.time()
    while not learner.done():

        learner.step()

        if iterations % args.log_interval == 0:
            learner.print_info()

        if iterations % args.save_interval == 0:
            save_name = learner.save_model()
            args_dict.update({"latest_model": save_name})
            args_path = os.path.join(args.log_dir, "training_arguments.json")
            with open(args_path, "w") as f:
                json.dump(args_dict, f, indent=4)

        if args.max_time != -1 and (time.time() - start_time) > args.max_time:
            break

        iterations += 1

    print("Finished!")
    sys.exit()
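Both examples call get_args(), which is not shown above. Below is a minimal, hypothetical sketch of what such an argparse helper could look like for Example #2, with one flag per attribute accessed in main(). The flag names rely on argparse's dash-to-underscore mapping, and every default (including "CNN" for --nn and "synchronous" for the worker communication flags) is an illustrative placeholder rather than a value taken from the original code; valid choices depend on the pytorchrl version. If used, it should be defined before main() is called.

import argparse


def get_args():
    # Hypothetical CLI definition: flag names and defaults are placeholders.
    parser = argparse.ArgumentParser(description="Distributed SAC training example")
    parser.add_argument("--log-dir", default="/tmp/sac_example")
    parser.add_argument("--cluster", action="store_true")
    parser.add_argument("--env-id", default="BreakoutNoFrameskip-v4")
    parser.add_argument("--frame-stack", type=int, default=4)
    parser.add_argument("--num-env-processes", type=int, default=8)
    parser.add_argument("--lr", type=float, default=3e-4)
    parser.add_argument("--alpha", type=float, default=0.2)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--polyak", type=float, default=0.995)
    parser.add_argument("--num-updates", type=int, default=50)
    parser.add_argument("--update-every", type=int, default=50)
    parser.add_argument("--start-steps", type=int, default=10000)
    parser.add_argument("--mini-batch-size", type=int, default=64)
    parser.add_argument("--buffer-size", type=int, default=100000)
    parser.add_argument("--nn", default="CNN")
    parser.add_argument("--recurrent-policy", action="store_true")
    parser.add_argument("--restart-model", default=None)
    parser.add_argument("--num-col-workers", type=int, default=1)
    parser.add_argument("--com-col-workers", default="synchronous")
    parser.add_argument("--num-grad-workers", type=int, default=1)
    parser.add_argument("--com-grad-workers", default="synchronous")
    parser.add_argument("--num-env-steps", type=int, default=10000000)
    parser.add_argument("--log-interval", type=int, default=1)
    parser.add_argument("--save-interval", type=int, default=100)
    parser.add_argument("--max-time", type=int, default=-1)
    return parser.parse_args()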