Example #1
def build_and_train(game="pong",
                    run_ID=0,
                    cuda_idx=None,
                    mid_batch_reset=False,
                    n_parallel=2):
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)))
    Collector = GpuResetCollector if mid_batch_reset else GpuWaitResetCollector
    print(f"To satisfy mid_batch_reset=={mid_batch_reset}, using {Collector}.")

    sampler = GpuSampler(
        EnvCls=AtariEnv,
        env_kwargs=dict(game=game,
                        num_img_obs=1),  # Learn on individual frames.
        CollectorCls=Collector,
        batch_T=20,  # Longer sampling/optimization horizon for recurrence.
        batch_B=16,  # 16 parallel environments.
        max_decorrelation_steps=400,
    )
    algo = A2C()  # Run with defaults.
    agent = AtariLstmAgent()
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e5,
        affinity=affinity,
    )
    config = dict(game=game)
    name = "a2c_" + game
    log_dir = "example_4"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
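
These rlpyt example functions are normally driven from the command line. Below is a minimal sketch of such an entry point for Example #1; it is an illustration only, and the flag names simply mirror build_and_train's parameters rather than any particular script.

# Sketch of a command-line entry point (hypothetical; flags mirror the
# function signature above).
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--game", default="pong")
    parser.add_argument("--run_ID", type=int, default=0)
    parser.add_argument("--cuda_idx", type=int, default=None)
    parser.add_argument("--mid_batch_reset", action="store_true")
    parser.add_argument("--n_parallel", type=int, default=2)
    args = parser.parse_args()
    build_and_train(
        game=args.game,
        run_ID=args.run_ID,
        cuda_idx=args.cuda_idx,
        mid_batch_reset=args.mid_batch_reset,
        n_parallel=args.n_parallel,
    )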
Example #2
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    eval_env_config = config["env"].copy()
    eval_env_config["start_level"] = config["env"]["num_levels"] + 100
    eval_env_config["num_levels"] = 100
    sampler = GpuSampler(EnvCls=make,
                         env_kwargs=config["env"],
                         CollectorCls=GpuResetCollector,
                         eval_env_kwargs=eval_env_config,
                         **config["sampler"])

    if config["checkpoint"]:
        model_state_dict = torch.load(config["checkpoint"])
        print("Loaded.")
    else:
        model_state_dict = None

    algo = PPO_AUG_VAE(optim_kwargs=config["optim"], **config["algo"])
    agent = RADPgVaeAgent(ModelCls=RadVaePolicy,
                          model_kwargs=config["model"],
                          initial_model_state_dict=model_state_dict,
                          **config["agent"])
    runner = MinibatchRlEval(algo=algo,
                             agent=agent,
                             sampler=sampler,
                             affinity=affinity,
                             **config["runner"])
    name = config["env"]["id"]

    with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'):
        runner.train()
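
Examples #2 through #11 all read a nested dictionary out of a central configs store keyed by config_key, then let load_variant/update_config override parts of it. The exact contents are repo-specific; the sketch below only illustrates the shape such an entry might take to satisfy the lookups above (every key and value here is hypothetical).

# Hypothetical shape of one entry in the central `configs` store used by the
# scripts above; real repos define their own keys and defaults.
configs = dict(
    ppo_procgen=dict(
        env=dict(id="coinrun", num_levels=200, start_level=0),
        eval_env=dict(),  # filled in / overridden at launch time
        sampler=dict(batch_T=256, batch_B=32, eval_n_envs=4,
                     eval_max_steps=int(25e3), eval_max_trajectories=100),
        algo=dict(discount=0.999, minibatches=8, epochs=3),
        optim=dict(),  # passed through as optim_kwargs
        model=dict(),
        agent=dict(),
        runner=dict(n_steps=25e6, log_interval_steps=1e5),
        checkpoint=None,
    ),
)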
Example #3
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    eval_env_config = config["env"].copy()
    eval_env_config["start_level"] = config["env"]["num_levels"] + 100
    eval_env_config["num_levels"] = 100
    sampler = GpuSampler(
        EnvCls=make,
        env_kwargs=config["env"],
        CollectorCls=GpuResetCollector,
        eval_env_kwargs=eval_env_config,
        **config["sampler"]
    )
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = RADPgAgent(ModelCls=RADModel, model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["id"]

    with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'):
        runner.train()
Example #4
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = make_affinity(
        run_slot=0,
        n_cpu_core=os.cpu_count(),  # Use all available cores for this experiment.
        n_gpu=1,  # One GPU for this run.
        gpu_per_run=1,
        sample_gpu_per_run=1,
        async_sample=True,
        optim_sample_share_gpu=True)

    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    config["eval_env"]["game"] = config["env"]["game"]

    sampler = GpuSampler(EnvCls=AtariEnv,
                         env_kwargs=config["env"],
                         CollectorCls=GpuWaitResetCollector,
                         TrajInfoCls=AtariTrajInfo,
                         eval_env_kwargs=config["eval_env"],
                         **config["sampler"])
    algo = CategoricalDQN(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariCatDqnAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRlEval(algo=algo,
                             agent=agent,
                             sampler=sampler,
                             affinity=affinity,
                             **config["runner"])
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #5
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    config["eval_env"]["game"] = config["env"]["game"]

    sampler = GpuSampler(
        EnvCls=AtariEnv,
        env_kwargs=config["env"],
        CollectorCls=WaitResetCollector,
        TrajInfoCls=AtariTrajInfo,
        eval_env_kwargs=config["eval_env"],
        **config["sampler"]
    )
    algo = DQN(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariDqnAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #6
def build_and_train(slot_affinity_code, log_dir, run_ID):
    # (Or load from a central store of configs.)
    config = dict(
        env=dict(game="pong"),
        algo=dict(learning_rate=7e-4),
        sampler=dict(batch_B=16),
    )

    affinity = affinity_from_code(slot_affinity_code)
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    sampler = GpuSampler(
        EnvCls=AtariEnv,
        env_kwargs=config["env"],
        CollectorCls=GpuWaitResetCollector,
        batch_T=5,
        # batch_B=16,  # Get from config.
        max_decorrelation_steps=400,
        **config["sampler"])
    algo = A2C(**config["algo"])  # Run with defaults.
    agent = AtariFfAgent()
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e5,
        affinity=affinity,
    )
    name = "a2c_" + config["env"]["game"]
    log_dir = "example_6"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
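
Several of these scripts rely on load_variant(log_dir) to read the variant written by the launcher and on update_config to fold it into the defaults. The merge behaves essentially like a recursive dictionary update; the helper below is only an illustration of that behavior, not rlpyt's actual implementation.

# Illustration of a recursive variant merge: values in `variant` override the
# matching entries in `default`, descending into nested dicts.
def merge_variant(default, variant):
    merged = dict(default)
    for key, value in variant.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = merge_variant(merged[key], value)
        else:
            merged[key] = value
    return merged

# merge_variant(dict(algo=dict(learning_rate=7e-4), sampler=dict(batch_B=16)),
#               dict(algo=dict(learning_rate=1e-3)))
# -> {'algo': {'learning_rate': 0.001}, 'sampler': {'batch_B': 16}}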
Example #7
def build_and_train(game="pong", run_ID=0, cuda_idx=None, n_parallel=2):
    config = dict(
        env=dict(game=game),
        algo=dict(batch_size=128),
        sampler=dict(batch_T=2, batch_B=32),
    )
    sampler = GpuSampler(
        EnvCls=AtariEnv,
        env_kwargs=dict(game=game),
        CollectorCls=GpuWaitResetCollector,
        eval_env_kwargs=dict(game=game),
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
        # batch_T=4,  # Get from config.
        # batch_B=1,
        **config["sampler"]  # More parallel environments for batched forward-pass.
    )
    algo = DQN(**config["algo"])  # Run with defaults.
    agent = AtariDqnAgent()
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel))),
    )
    name = "dqn_" + game
    log_dir = "example_5"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #8
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    assert isinstance(affinity, list)  # One for each GPU.
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    sampler = GpuSampler(
        EnvCls=AtariEnv,
        env_kwargs=config["env"],
        CollectorCls=GpuWaitResetCollector,
        TrajInfoCls=AtariTrajInfo,
        **config["sampler"]
    )
    algo = A2C(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = SyncRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
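
The slot_affinity_code string consumed by affinity_from_code is normally generated by rlpyt's launch utilities rather than typed by hand. The sketch below shows roughly how a code for one run slot could be produced and decoded; the helper names are from rlpyt.utils.launching.affinity as remembered here, so verify them against your installed rlpyt version.

# Sketch (assumed helper names): build an affinity code for a 16-core,
# 8-GPU machine with two GPUs per run, then decode run slot 0.
from rlpyt.utils.launching.affinity import (affinity_from_code,
    encode_affinity, prepend_run_slot)

affinity_code = encode_affinity(
    n_cpu_core=16,
    n_gpu=8,
    gpu_per_run=2,
    n_socket=2,
)
slot_affinity_code = prepend_run_slot(0, affinity_code)  # run slot 0
affinity = affinity_from_code(slot_affinity_code)  # a list, one entry per GPU here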
Example #9
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    
    sampler = GpuSampler(
        EnvCls=gym.make,
        env_kwargs=config["env"],
        eval_env_kwargs=config["eval_env"],
        **config["sampler"]
    )
    algo = DiscreteSACAE(optim_kwargs=config["optim"], ae_optim_kwargs=config["ae_optim"], **config["algo"])
    agent = DiscreteSacAEAgent(**config["agent"], encoder_kwargs=config["encoder"], model_kwargs=config["actor"], critic_kwargs=config["critic"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["id"]
    
    with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'):
        runner.train()
Example #10
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    config["algo_name"] = 'A2OC'
    t_env = pomdp_interface(**config["env"])
    config["algo"]["discount"] = t_env.discount

    sampler = GpuSampler(
        EnvCls=pomdp_interface,
        env_kwargs=config["env"],
        **config["sampler"]
    )
    algo = A2OC(optim_kwargs=config["optim"], **config["algo"])
    agent = PomdpOcFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #11
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    sampler = GpuSampler(EnvCls=gym_make,
                         env_kwargs=config["env"],
                         CollectorCls=GpuResetCollector,
                         **config["sampler"])
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = MujocoFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo,
                         agent=agent,
                         sampler=sampler,
                         affinity=affinity,
                         **config["runner"])
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #12
def build_and_train(game="pong", run_ID=0):
    # It seems like we should be able to skip the intermediate affinity-code step,
    # but so far it has always been run this way.
    # Change these inputs to match local machine and desired parallelism.
    affinity = make_affinity(
        run_slot=0,
        n_cpu_core=16,  # Use 16 cores across all experiments.
        n_gpu=8,  # Use 8 gpus across all experiments.
        hyperthread_offset=24,  # If machine has 24 cores.
        n_socket=2,  # Presume CPU socket affinity to lower/upper half GPUs.
        gpu_per_run=2,  # How many GPUs to parallelize one run across.
        # cpu_per_run=1,
    )

    sampler = GpuSampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,
        env_kwargs=dict(game=game),
        CollectorCls=GpuWaitResetCollector,
        batch_T=5,
        batch_B=16,
        max_decorrelation_steps=400,
    )
    algo = A2C()  # Run with defaults.
    agent = AtariFfAgent()
    runner = SyncRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e5,
        affinity=affinity,
    )
    config = dict(game=game)
    name = "a2c_" + game
    log_dir = "example_7"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #13
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    sampler = GpuSampler(
        EnvCls=gym.make,
        env_kwargs=config["env"],
        CollectorCls=GpuResetCollector,
        eval_env_kwargs=config["eval_env"],
        **config["sampler"]
    )
    if config["checkpoint"]:
        model_state_dict = torch.load(config["checkpoint"])
    else:
        model_state_dict = None

    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = CategoricalPgAgent(
        ModelCls=BaselinePolicy,
        model_kwargs=config["model"],
        initial_model_state_dict=model_state_dict,
        **config["agent"]
    )
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["id"]

    with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'):
        runner.train()
Example #14
def start_experiment(args):

    args_json = json.dumps(vars(args), indent=4)
    if not os.path.isdir(args.log_dir):
        os.makedirs(args.log_dir)
    with open(args.log_dir + '/arguments.json', 'w') as jsonfile:
        jsonfile.write(args_json)
    with open(args.log_dir + '/git.txt', 'w') as git_file:
        branch = subprocess.check_output(['git', 'rev-parse', '--abbrev-ref', 'HEAD']).strip().decode('utf-8')
        commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']).strip().decode('utf-8')
        git_file.write('{}/{}'.format(branch, commit))

    config = dict(env_id=args.env)
    
    if args.sample_mode == 'gpu':
        # affinity = dict(num_gpus=args.num_gpus, workers_cpus=list(range(args.num_cpus)))
        if args.num_gpus > 0:
            # import ipdb; ipdb.set_trace()
            affinity = make_affinity(
                run_slot=0,
                n_cpu_core=args.num_cpus,  # Number of CPU cores to use across all experiments.
                n_gpu=args.num_gpus,  # Number of GPUs to use across all experiments.
                # contexts_per_gpu=2,
                # hyperthread_offset=72,  # If machine has 24 cores.
                # n_socket=2,  # Presume CPU socket affinity to lower/upper half GPUs.
                gpu_per_run=args.gpu_per_run,  # How many GPUs to parallelize one run across.

                # cpu_per_run=1,
            )
            print('Make multi-gpu affinity')
        else:
            affinity = dict(cuda_idx=0, workers_cpus=list(range(args.num_cpus)))
            os.environ['CUDA_VISIBLE_DEVICES'] = str(0)
    else:
        affinity = dict(workers_cpus=list(range(args.num_cpus)))
    
    # potentially reload models
    initial_optim_state_dict = None
    initial_model_state_dict = None
    if args.pretrain != 'None':
        os.system(f"find {args.log_dir} -name '*.json' -delete") # clean up json files for video recorder
        checkpoint = torch.load(os.path.join(_RESULTS_DIR, args.pretrain, 'params.pkl'))
        initial_optim_state_dict = checkpoint['optimizer_state_dict']
        initial_model_state_dict = checkpoint['agent_state_dict']

    # ----------------------------------------------------- POLICY ----------------------------------------------------- #
    model_args = dict(curiosity_kwargs=dict(curiosity_alg=args.curiosity_alg), curiosity_step_kwargs=dict())
    if args.curiosity_alg =='icm':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
        model_args['curiosity_kwargs']['forward_model'] = args.forward_model
        model_args['curiosity_kwargs']['feature_space'] = args.feature_space
    elif args.curiosity_alg == 'micm':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
        model_args['curiosity_kwargs']['forward_model'] = args.forward_model
        model_args['curiosity_kwargs']['ensemble_mode'] = args.ensemble_mode
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    elif args.curiosity_alg == 'disagreement':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['ensemble_size'] = args.ensemble_size
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
        model_args['curiosity_kwargs']['device'] = args.sample_mode
        model_args['curiosity_kwargs']['forward_model'] = args.forward_model
    elif args.curiosity_alg == 'ndigo':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['pred_horizon'] = args.pred_horizon
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    elif args.curiosity_alg == 'rnd':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['drop_probability'] = args.drop_probability
        model_args['curiosity_kwargs']['gamma'] = args.discount
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    
    if args.curiosity_alg != 'none':
        model_args['curiosity_step_kwargs']['curiosity_step_minibatches'] = args.curiosity_step_minibatches

    if args.env in _MUJOCO_ENVS:
        if args.lstm:
            agent = MujocoLstmAgent(initial_model_state_dict=initial_model_state_dict)
        else:
            agent = MujocoFfAgent(initial_model_state_dict=initial_model_state_dict)
    else:
        if args.lstm:            
            agent = AtariLstmAgent(
                        initial_model_state_dict=initial_model_state_dict,
                        model_kwargs=model_args,
                        no_extrinsic=args.no_extrinsic,
                        dual_model=args.dual_model,
                        )
        else:
            agent = AtariFfAgent(initial_model_state_dict=initial_model_state_dict,
                model_kwargs=model_args,
                no_extrinsic=args.no_extrinsic,
                dual_model=args.dual_model)

    # ----------------------------------------------------- LEARNING ALG ----------------------------------------------------- #
    if args.alg == 'ppo':
        algo = PPO(
                discount=args.discount,
                learning_rate=args.lr,
                value_loss_coeff=args.v_loss_coeff,
                entropy_loss_coeff=args.entropy_loss_coeff,
                OptimCls=torch.optim.Adam,
                optim_kwargs=None,
                clip_grad_norm=args.grad_norm_bound,
                initial_optim_state_dict=initial_optim_state_dict, # is None if not reloading a checkpoint
                gae_lambda=args.gae_lambda,
                minibatches=args.minibatches, # if recurrent, batch_B must be at least this; otherwise batch_B * batch_T must be at least this
                epochs=args.epochs,
                ratio_clip=args.ratio_clip,
                linear_lr_schedule=args.linear_lr,
                normalize_advantage=args.normalize_advantage,
                normalize_reward=args.normalize_reward,
                curiosity_type=args.curiosity_alg,
                policy_loss_type=args.policy_loss_type
                )
    elif args.alg == 'a2c':
        algo = A2C(
                discount=args.discount,
                learning_rate=args.lr,
                value_loss_coeff=args.v_loss_coeff,
                entropy_loss_coeff=args.entropy_loss_coeff,
                OptimCls=torch.optim.Adam,
                optim_kwargs=None,
                clip_grad_norm=args.grad_norm_bound,
                initial_optim_state_dict=initial_optim_state_dict,
                gae_lambda=args.gae_lambda,
                normalize_advantage=args.normalize_advantage
                )

    # ----------------------------------------------------- SAMPLER ----------------------------------------------------- #

    # environment setup
    traj_info_cl = TrajInfo # environment specific - potentially overridden below
    if 'mario' in args.env.lower():
        env_cl = mario_make
        env_args = dict(
            game=args.env,  
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=args.normalize_obs,
            normalize_obs_steps=10000
            )
    elif args.env in _PYCOLAB_ENVS:
        env_cl = deepmind_make
        traj_info_cl = PycolabTrajInfo
        env_args = dict(
            game=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=args.normalize_obs,
            normalize_obs_steps=10000,
            log_heatmaps=args.log_heatmaps,
            logdir=args.log_dir,
            obs_type=args.obs_type,
            grayscale=args.grayscale,
            max_steps_per_episode=args.max_episode_steps
            )
    elif args.env in _MUJOCO_ENVS:
        env_cl = gym_make
        env_args = dict(
            id=args.env, 
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=False,
            normalize_obs_steps=10000
            )
    elif args.env in _ATARI_ENVS:
        env_cl = AtariEnv
        traj_info_cl = AtariTrajInfo
        env_args = dict(
            game=args.env, 
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=args.normalize_obs,
            normalize_obs_steps=10000,
            downsampling_scheme='classical',
            record_freq=args.record_freq,
            record_dir=args.log_dir,
            horizon=args.max_episode_steps,
            score_multiplier=args.score_multiplier,
            repeat_action_probability=args.repeat_action_probability,
            fire_on_reset=args.fire_on_reset
            )

    if args.sample_mode == 'gpu':
        if args.lstm:
            collector_class = GpuWaitResetCollector
        else:
            collector_class = GpuResetCollector
        sampler = GpuSampler(
            EnvCls=env_cl,
            env_kwargs=env_args,
            eval_env_kwargs=env_args,
            batch_T=args.timestep_limit,
            batch_B=args.num_envs,
            max_decorrelation_steps=0,
            TrajInfoCls=traj_info_cl,
            eval_n_envs=args.eval_envs,
            eval_max_steps=args.eval_max_steps,
            eval_max_trajectories=args.eval_max_traj,
            record_freq=args.record_freq,
            log_dir=args.log_dir,
            CollectorCls=collector_class
        )
    else:
        if args.lstm:
            collector_class = CpuWaitResetCollector
        else:
            collector_class = CpuResetCollector
        sampler = CpuSampler(
            EnvCls=env_cl,
            env_kwargs=env_args,
            eval_env_kwargs=env_args,
            batch_T=args.timestep_limit, # timesteps in a trajectory episode
            batch_B=args.num_envs, # environments distributed across workers
            max_decorrelation_steps=0,
            TrajInfoCls=traj_info_cl,
            eval_n_envs=args.eval_envs,
            eval_max_steps=args.eval_max_steps,
            eval_max_trajectories=args.eval_max_traj,
            record_freq=args.record_freq,
            log_dir=args.log_dir,
            CollectorCls=collector_class
            )

    # ----------------------------------------------------- RUNNER ----------------------------------------------------- #     
    if args.eval_envs > 0:
        runner = (MinibatchRlEval if args.num_gpus <= 1 else SyncRlEval)(
            algo=algo,
            agent=agent,
            sampler=sampler,
            n_steps=args.iterations,
            affinity=affinity,
            log_interval_steps=args.log_interval,
            log_dir=args.log_dir,
            pretrain=args.pretrain
            )
    else:
        runner = (MinibatchRl if args.num_gpus <= 1 else SyncRl)(
            algo=algo,
            agent=agent,
            sampler=sampler,
            n_steps=args.iterations,
            affinity=affinity,
            log_interval_steps=args.log_interval,
            log_dir=args.log_dir,
            pretrain=args.pretrain
            )

    with logger_context(args.log_dir, config, snapshot_mode="last", use_summary_writer=True):
        runner.train()
Example #15
def start_experiment(args):

    args_json = json.dumps(vars(args), indent=4)
    if not os.path.isdir(args.log_dir):
        os.makedirs(args.log_dir)
    with open(args.log_dir + '/arguments.json', 'w') as jsonfile:
        jsonfile.write(args_json)

    config = dict(env_id=args.env)

    if args.sample_mode == 'gpu':
        assert args.num_gpus > 0
        affinity = dict(cuda_idx=0, workers_cpus=list(range(args.num_cpus)))
        os.environ['CUDA_VISIBLE_DEVICES'] = str(0)
    else:
        affinity = dict(workers_cpus=list(range(args.num_cpus)))

    # potentially reload models
    initial_optim_state_dict = None
    initial_model_state_dict = None
    if args.pretrain != 'None':
        os.system(f"find {args.log_dir} -name '*.json' -delete"
                  )  # clean up json files for video recorder
        checkpoint = torch.load(
            os.path.join(_RESULTS_DIR, args.pretrain, 'params.pkl'))
        initial_optim_state_dict = checkpoint['optimizer_state_dict']
        initial_model_state_dict = checkpoint['agent_state_dict']

    # ----------------------------------------------------- POLICY ----------------------------------------------------- #
    model_args = dict(curiosity_kwargs=dict(curiosity_alg=args.curiosity_alg))
    if args.curiosity_alg == 'icm':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
    elif args.curiosity_alg == 'disagreement':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['ensemble_size'] = args.ensemble_size
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    elif args.curiosity_alg == 'ndigo':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['pred_horizon'] = args.pred_horizon
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['num_predictors'] = args.num_predictors
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    elif args.curiosity_alg == 'rnd':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['drop_probability'] = args.drop_probability
        model_args['curiosity_kwargs']['gamma'] = args.discount
        model_args['curiosity_kwargs']['device'] = args.sample_mode

    if args.env in _MUJOCO_ENVS:
        if args.lstm:
            agent = MujocoLstmAgent(
                initial_model_state_dict=initial_model_state_dict)
        else:
            agent = MujocoFfAgent(
                initial_model_state_dict=initial_model_state_dict)
    else:
        if args.lstm:
            agent = AtariLstmAgent(
                initial_model_state_dict=initial_model_state_dict,
                model_kwargs=model_args,
                no_extrinsic=args.no_extrinsic)
        else:
            agent = AtariFfAgent(
                initial_model_state_dict=initial_model_state_dict)

    # ----------------------------------------------------- LEARNING ALG ----------------------------------------------------- #
    if args.alg == 'ppo':
        if args.kernel_mu == 0.:
            kernel_params = None
        else:
            kernel_params = (args.kernel_mu, args.kernel_sigma)
        algo = PPO(
            discount=args.discount,
            learning_rate=args.lr,
            value_loss_coeff=args.v_loss_coeff,
            entropy_loss_coeff=args.entropy_loss_coeff,
            OptimCls=torch.optim.Adam,
            optim_kwargs=None,
            clip_grad_norm=args.grad_norm_bound,
            initial_optim_state_dict=initial_optim_state_dict,  # is None if not reloading a checkpoint
            gae_lambda=args.gae_lambda,
            minibatches=args.minibatches,  # if recurrent, batch_B must be at least this; otherwise batch_B * batch_T must be at least this
            epochs=args.epochs,
            ratio_clip=args.ratio_clip,
            linear_lr_schedule=args.linear_lr,
            normalize_advantage=args.normalize_advantage,
            normalize_reward=args.normalize_reward,
            kernel_params=kernel_params,
            curiosity_type=args.curiosity_alg)
    elif args.alg == 'a2c':
        algo = A2C(discount=args.discount,
                   learning_rate=args.lr,
                   value_loss_coeff=args.v_loss_coeff,
                   entropy_loss_coeff=args.entropy_loss_coeff,
                   OptimCls=torch.optim.Adam,
                   optim_kwargs=None,
                   clip_grad_norm=args.grad_norm_bound,
                   initial_optim_state_dict=initial_optim_state_dict,
                   gae_lambda=args.gae_lambda,
                   normalize_advantage=args.normalize_advantage)

    # ----------------------------------------------------- SAMPLER ----------------------------------------------------- #

    # environment setup
    traj_info_cl = TrajInfo  # environment specific - potentially overridden below
    if 'mario' in args.env.lower():
        env_cl = mario_make
        env_args = dict(game=args.env,
                        no_extrinsic=args.no_extrinsic,
                        no_negative_reward=args.no_negative_reward,
                        normalize_obs=args.normalize_obs,
                        normalize_obs_steps=10000)
    elif 'deepmind' in args.env.lower():  # pycolab deepmind environments
        env_cl = deepmind_make
        traj_info_cl = PycolabTrajInfo
        env_args = dict(game=args.env,
                        no_extrinsic=args.no_extrinsic,
                        no_negative_reward=args.no_negative_reward,
                        normalize_obs=args.normalize_obs,
                        normalize_obs_steps=10000,
                        log_heatmaps=args.log_heatmaps,
                        logdir=args.log_dir,
                        obs_type=args.obs_type,
                        max_steps_per_episode=args.max_episode_steps)
    elif args.env in _MUJOCO_ENVS:
        env_cl = gym_make
        env_args = dict(id=args.env,
                        no_extrinsic=args.no_extrinsic,
                        no_negative_reward=args.no_negative_reward,
                        normalize_obs=False,
                        normalize_obs_steps=10000)
    elif args.env in _ATARI_ENVS:
        env_cl = AtariEnv
        traj_info_cl = AtariTrajInfo
        env_args = dict(
            game=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=args.normalize_obs,
            normalize_obs_steps=10000,
            downsampling_scheme='classical',
            record_freq=args.record_freq,
            record_dir=args.log_dir,
            horizon=args.max_episode_steps,
        )

    if args.sample_mode == 'gpu':
        if args.lstm:
            collector_class = GpuWaitResetCollector
        else:
            collector_class = GpuResetCollector
        sampler = GpuSampler(EnvCls=env_cl,
                             env_kwargs=env_args,
                             eval_env_kwargs=env_args,
                             batch_T=args.timestep_limit,
                             batch_B=args.num_envs,
                             max_decorrelation_steps=0,
                             TrajInfoCls=traj_info_cl,
                             eval_n_envs=args.eval_envs,
                             eval_max_steps=args.eval_max_steps,
                             eval_max_trajectories=args.eval_max_traj,
                             record_freq=args.record_freq,
                             log_dir=args.log_dir,
                             CollectorCls=collector_class)
    else:
        if args.lstm:
            collector_class = CpuWaitResetCollector
        else:
            collector_class = CpuResetCollector
        sampler = CpuSampler(
            EnvCls=env_cl,
            env_kwargs=env_args,
            eval_env_kwargs=env_args,
            batch_T=args.timestep_limit,  # timesteps in a trajectory episode
            batch_B=args.num_envs,  # environments distributed across workers
            max_decorrelation_steps=0,
            TrajInfoCls=traj_info_cl,
            eval_n_envs=args.eval_envs,
            eval_max_steps=args.eval_max_steps,
            eval_max_trajectories=args.eval_max_traj,
            record_freq=args.record_freq,
            log_dir=args.log_dir,
            CollectorCls=collector_class)

    # ----------------------------------------------------- RUNNER ----------------------------------------------------- #
    if args.eval_envs > 0:
        runner = MinibatchRlEval(algo=algo,
                                 agent=agent,
                                 sampler=sampler,
                                 n_steps=args.iterations,
                                 affinity=affinity,
                                 log_interval_steps=args.log_interval,
                                 log_dir=args.log_dir,
                                 pretrain=args.pretrain)
    else:
        runner = MinibatchRl(algo=algo,
                             agent=agent,
                             sampler=sampler,
                             n_steps=args.iterations,
                             affinity=affinity,
                             log_interval_steps=args.log_interval,
                             log_dir=args.log_dir,
                             pretrain=args.pretrain)

    with logger_context(args.log_dir,
                        config,
                        snapshot_mode="last",
                        use_summary_writer=True):
        runner.train()
Example #16
def build_and_train(log_dir,
                    game="pong",
                    run_ID=0,
                    cuda_idx=None,
                    eval=False,
                    save_model='last',
                    load_model_path=None,
                    n_parallel=2,
                    CumSteps=0):
    device = 'cpu' if cuda_idx is None else 'cuda'
    params = torch.load(
        load_model_path,
        map_location=torch.device(device)) if load_model_path else {}
    agent_state_dict = params.get('agent_state_dict')
    optimizer_state_dict = params.get('optimizer_state_dict')

    ##--- wu ---##
    log_interval_steps = 5e4
    prefill = 5e4
    train_every = 16
    batch_B = 16
    n_steps = 1e4 if eval else 5e6
    itr_start = max(0, CumSteps - prefill) // train_every
    ##--- wu ---##

    action_repeat = 4  # 2
    env_kwargs = dict(
        name=game,
        action_repeat=action_repeat,
        size=(64, 64),
        grayscale=True,  # False
        life_done=True,
        sticky_actions=True,
    )
    factory_method = make_wapper(
        AtariEnv, [OneHotAction, TimeLimit],
        [dict(), dict(duration=1000000 / action_repeat)])  # 1000

    sampler = GpuSampler(
        EnvCls=factory_method,
        TrajInfoCls=AtariTrajInfo,
        env_kwargs=env_kwargs,
        eval_env_kwargs=env_kwargs,
        batch_T=1,
        batch_B=batch_B,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e5),
        eval_max_trajectories=5,
    )

    algo = Dreamer(
        initial_optim_state_dict=optimizer_state_dict,
        horizon=10,
        use_pcont=True,
        replay_size=int(2e6),  # int(5e6)
        kl_scale=0.1,
        batch_size=50,
        batch_length=50,
        C=1,  # 100,
        train_every=train_every // batch_B,  # 1000
        pretrain=100,
        world_lr=2e-4,  # 6e-4,
        value_lr=1e-4,  # 8e-5,
        actor_lr=4e-5,  # 8e-5,
        discount=0.999,  # 0.99,
        expl_amount=0.0,  # 0.3,
        prefill=prefill // batch_B,  # 5000
        discount_scale=5.,  # 10.
        video_every=int(2e4 // 16 * 16 // batch_B),  # int(10)
    )

    if eval:
        # for eval - all versions
        agent = AtariDreamerAgent(train_noise=0.0,
                                  eval_noise=0,
                                  expl_type="epsilon_greedy",
                                  itr_start=itr_start,
                                  the_expl_mode='eval',
                                  expl_min=0.0,
                                  expl_decay=11000,
                                  initial_model_state_dict=agent_state_dict,
                                  model_kwargs=dict(use_pcont=True))
    else:
        # for train - all versions
        # agent = AtariDreamerAgent(train_noise=0.4, eval_noise=0, expl_type="epsilon_greedy", itr_start=itr_start, the_expl_mode='train',
        #                           expl_min=0.1, expl_decay=11000, initial_model_state_dict=agent_state_dict,
        #                           model_kwargs=dict(use_pcont=True))

        # for train - dreamer_V2
        agent = AtariDreamerAgent(train_noise=0.0,
                                  eval_noise=0,
                                  expl_type="epsilon_greedy",
                                  itr_start=itr_start,
                                  the_expl_mode='train',
                                  expl_min=0.0,
                                  expl_decay=11000,
                                  initial_model_state_dict=agent_state_dict,
                                  model_kwargs=dict(use_pcont=True))

    my_seed = 0  # reproducibility
    set_seed(my_seed)
    runner_cls = MinibatchRlEval if eval else MinibatchRl
    runner = runner_cls(
        algo=algo,  # Uses gathered samples to train the agent (e.g. defines a loss function and performs gradient descent).
        agent=agent,  # Chooses control action to the environment in sampler; trained by the algorithm. Interface to model.
        sampler=sampler,
        n_steps=n_steps,
        log_interval_steps=log_interval_steps,
        affinity=dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel))),
        seed=my_seed,
    )
    config = dict(game=game)
    name = "dreamer_" + game
    with logger_context(log_dir,
                        run_ID,
                        name,
                        config,
                        snapshot_mode=save_model,
                        override_prefix=True,
                        use_summary_writer=True):
        runner.train()
Example #17
class RandomDiscreteModel(torch.nn.Module):
    def __init__(self, num_actions):
        super().__init__()
        self.num_actions = num_actions

    def forward(self, observation, prev_action, prev_reward):
        lead_dim, T, B, img_shape = infer_leading_dims(observation, 3)
        action = torch.randint(low=0, high=self.num_actions, size=(T * B, ))
        action = restore_leading_dims(action, lead_dim, T, B)
        return action


# Setup the data collection pipeline
sampler = GpuSampler(EnvCls=gym.make,
                     env_kwargs=config["env"],
                     CollectorCls=GpuResetCollector,
                     eval_env_kwargs=config["env"],
                     **config["sampler"])
agent = RandomAgent(ModelCls=RandomDiscreteModel,
                    model_kwargs={"num_actions": 15})
seed = make_seed()
set_seed(seed)
sampler.initialize(agent=agent, affinity=affinity, seed=seed + 1, rank=0)
steps = config["train_steps"]

# Create the model
model = BiGAN(**config["model"])
if config["load_path"]:
    model.load_state_dict(torch.load(config["load_path"]))
# Setup the optimizers
lr = config["optim"]["lr"]
Example #18
def build_and_train(args, game="", run_ID=0, config=None):
    """
    1. Parse the args object into dictionaries understood by rlpyt
    """
    config['env']['id'] = args.env_name
    config["eval_env"]["id"] = args.env_name

    config["eval_env"]["horizon"] = args.horizon
    config["env"]["horizon"] = args.horizon

    if 'procgen' in args.env_name:
        for k, v in vars(args).items():
            if args.env_name.split('-')[1] in k:
                config['env'][k] = v

    config['model']['frame_stack'] = args.frame_stack
    config['model']['nce_loss'] = args.nce_loss
    config['model']['algo'] = args.algo
    config['model']['env_name'] = args.env_name
    config['model']['dueling'] = args.dueling == 1
    config['algo']['double_dqn'] = args.double_dqn == 1
    config['algo']['prioritized_replay'] = args.prioritized_replay == 1
    config['algo']['n_step_return'] = args.n_step_return
    config['algo']['learning_rate'] = args.learning_rate

    config['runner']['log_interval_steps'] = args.log_interval_steps
    config['cmd_args'] = vars(args)
    """
    2. Create the CatDQN (C51) agent from custom implementation
    """

    agent = AtariCatDqnAgent(ModelCls=AtariCatDqnModel_nce,
                             model_kwargs=config["model"],
                             **config["agent"])
    algo = CategoricalDQN_nce(args=config['cmd_args'],
                              ReplayBufferCls=None,
                              optim_kwargs=config["optim"],
                              **config["algo"])

    if args.mode == 'parallel':
        affinity = make_affinity(n_cpu_core=args.n_cpus,
                                 n_gpu=args.n_gpus,
                                 n_socket=1
                                 # hyperthread_offset=0
                                 )
        """
        Some architectures require the following block to be uncommented. Try with and without.
        This is here to allow scheduling of non-sequential CPU IDs.
        """
        # import psutil
        # psutil.Process().cpu_affinity([])
        # cpus = tuple(psutil.Process().cpu_affinity())
        # affinity['all_cpus'] = affinity['master_cpus'] = cpus
        # affinity['workers_cpus'] = tuple([tuple([x]) for x in cpus+cpus])
        # env_kwargs = config['env']

        sampler = GpuSampler(EnvCls=make_env,
                             env_kwargs=config["env"],
                             CollectorCls=GpuWaitResetCollector,
                             TrajInfoCls=AtariTrajInfo,
                             eval_env_kwargs=config["eval_env"],
                             **config["sampler"])
        """
        If you don't have a GPU, use the CpuSampler
        """
        # sampler = CpuSampler(
        #             EnvCls=AtariEnv if args.game is not None else make_env,
        #             env_kwargs=config["env"],
        #             CollectorCls=CpuWaitResetCollector,
        #             TrajInfoCls=AtariTrajInfo,
        #             eval_env_kwargs=config["eval_env"],
        #             **config["sampler"]
        #         )

    elif args.mode == 'serial':
        affinity = make_affinity(
            n_cpu_core=1,  # Single CPU core in serial mode.
            n_gpu=args.n_gpus,  # Number of GPUs to use.
            n_socket=1,
        )
        """
        Some architectures require the following block to be uncommented. Try with and without.
        """
        # import psutil
        # psutil.Process().cpu_affinity([])
        # cpus = tuple(psutil.Process().cpu_affinity())
        # affinity['all_cpus'] = affinity['master_cpus'] = cpus
        # affinity['workers_cpus'] = tuple([tuple([x]) for x in cpus+cpus])
        # env_kwargs = config['env']

        sampler = SerialSampler(
            EnvCls=make_env,
            env_kwargs=config["env"],
            # CollectorCls=SerialEvalCollector,
            TrajInfoCls=AtariTrajInfo,
            eval_env_kwargs=config["eval_env"],
            **config["sampler"])
    """
    3. Bookkeeping, setting up Comet.ml experiments, etc
    """
    folders_name = [args.output_dir, args.env_name, 'run_' + args.run_ID]
    path = os.path.join(*folders_name)
    os.makedirs(path, exist_ok=True)

    experiment = Experiment(api_key='your_key',
                            auto_output_logging=False,
                            project_name='driml',
                            workspace="your_workspace",
                            disabled=True)
    experiment.add_tag('C51+DIM' if (
        args.lambda_LL > 0 or args.lambda_LG > 0 or args.lambda_GL > 0
        or args.lambda_GG > 0) else 'C51')
    experiment.set_name(args.experiment_name)
    experiment.log_parameters(config)

    MinibatchRlEval.TF_logger = Logger(path,
                                       use_TFX=True,
                                       params=config,
                                       comet_experiment=experiment,
                                       disable_local=True)
    MinibatchRlEval.log_diagnostics = log_diagnostics_custom
    MinibatchRlEval._log_infos = _log_infos
    MinibatchRlEval.evaluate_agent = evaluate_agent
    """
    4. Define the runner as minibatch
    """
    runner = MinibatchRlEval(algo=algo,
                             agent=agent,
                             sampler=sampler,
                             affinity=affinity,
                             **config["runner"])

    runner.algo.opt_info_fields = tuple(
        list(runner.algo.opt_info_fields) + ['lossNCE'] +
        ['action%d' % i for i in range(15)])
    name = args.mode + "_value_based_nce_" + args.env_name
    log_dir = os.path.join(args.output_dir, args.env_name)
    logger.set_snapshot_gap(args.weight_save_interval //
                            config['runner']['log_interval_steps'])
    """
    5. Run the experiment and optionally save network weights
    """

    with experiment.train():
        with logger_context(
                log_dir,
                run_ID,
                name,
                config,
                snapshot_mode=(
                    'last' if args.weight_save_interval == -1 else 'gap'
                )):  # 'all' saves every iteration, 'gap' saves every X iterations
            runner.train()
Example #19
def build_and_train(env_id="POMDP-hallway-episodic-v0", run_ID=0, cuda_idx=None, n_parallel=1, fomdp=False):
    EnvCls = pomdp_interface
    env_args = dict(fomdp=fomdp, id=env_id, time_limit=100)
    test_instance = EnvCls(**env_args)
    gamma = test_instance.discount
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)), alternating=True)
    lr = 1e-3
    po = np.array([1,0,0,1,0], dtype=bool)
    # Model kwargs
    # model_kwargs = dict()
    # model_kwargs = dict(hidden_sizes=[64, 64], shared_processor=False)
    model_kwargs = dict(hidden_sizes=[64, 64], rnn_type='gru', rnn_size=256, rnn_placement=1, shared_processor=False, layer_norm=True, prev_action=3, prev_reward=3)
    # model_kwargs = dict(hidden_sizes=[64, 64], option_size=4, shared_processor=False, use_interest=False, use_diversity=False, use_attention=False)
    # model_kwargs = dict(hidden_sizes=[64, 64], option_size=4, use_interest=True, use_diversity=False,
    #                     use_attention=False, rnn_type='gru', rnn_size=256, rnn_placement=1, shared_processor=False, layer_norm=True, prev_option=po)

    # Samplers
    sampler = GpuSampler(
        EnvCls=EnvCls,
        env_kwargs=env_args,
        eval_env_kwargs=env_args,
        batch_T=20,  # Time-steps per sampler iteration.
        batch_B=30,  # Parallel environments (i.e. sampler Batch dimension).
        max_decorrelation_steps=0,
        eval_n_envs=5,
        eval_max_steps=int(25e3),
        eval_max_trajectories=30
    )
    # sampler = AlternatingSampler(
    #     EnvCls=EnvCls,
    #     env_kwargs=env_args,
    #     eval_env_kwargs=env_args,
    #     batch_T=20,  # One time-step per sampler iteration.
    #     batch_B=30,  # One environment (i.e. sampler Batch dimension).
    #     max_decorrelation_steps=0,
    #     eval_n_envs=5,
    #     eval_max_steps=int(25e3),
    #     eval_max_trajectories=30
    # )
    #
    # sampler = SerialSampler(
    #     EnvCls=EnvCls,
    #     env_kwargs=env_args,
    #     eval_env_kwargs=env_args,
    #     batch_T=20,  # One time-step per sampler iteration.
    #     batch_B=30,  # One environment (i.e. sampler Batch dimension).
    #     max_decorrelation_steps=0,
    #     # eval_n_envs=2,
    #     # eval_max_steps=int(51e2),
    #     # eval_max_trajectories=5,
    # )

    # Algos (swapping out discount)
    algo = A2C(discount=gamma, learning_rate=lr, clip_grad_norm=2.)
    # algo = A2OC(discount=gamma, learning_rate=lr, clip_grad_norm=2.)
    # algo = PPO(discount=gamma, learning_rate=lr, clip_grad_norm=2.)
    # algo = PPOC(discount=gamma, learning_rate=lr, clip_grad_norm=2.)

    # Agents
    # agent = PomdpFfAgent(model_kwargs=model_kwargs)
    agent = PomdpRnnAgent(model_kwargs=model_kwargs)
    # agent = PomdpOcFfAgent(model_kwargs=model_kwargs)
    # agent = PomdpOcRnnAgent(model_kwargs=model_kwargs)
    # agent = AlternatingPomdpRnnAgent(model_kwargs=model_kwargs)
    # agent = AlternatingPomdpOcRnnAgent(model_kwargs=model_kwargs)
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=1e6,
        log_interval_steps=1e3,
        affinity=affinity,
    )
    config = dict(env_id=env_id, fomdp=fomdp, algo_name=algo.__class__.__name__, learning_rate=lr, sampler=sampler.__class__.__name__, model=model_kwargs)
    name = algo.NAME + '_' + env_id
    log_dir = "pomdps"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #20
    def run(self, run_ID=0):
        config = self.getConfig()
        sampler = GpuSampler(EnvCls=make_env,
                             env_kwargs={
                                 "num_levels": config["num_levels"],
                                 "env": config['env']
                             },
                             CollectorCls=GpuResetCollector,
                             batch_T=256,
                             batch_B=config["envs_per_worker"],
                             max_decorrelation_steps=1000)

        optim_args = dict(weight_decay=config["l2_penalty"]) if "l2_penalty" in config else None

        algo = PPO(value_loss_coeff=0.5,
                   clip_grad_norm=0.5,
                   discount=config["discount"],
                   entropy_loss_coeff=config["entropy_bonus"],
                   gae_lambda=config["lambda"],
                   minibatches=config["minibatches_per_epoch"],
                   epochs=config["epochs_per_rollout"],
                   ratio_clip=config["ppo_clip"],
                   learning_rate=config["learning_rate"],
                   normalize_advantage=True,
                   optim_kwargs=optim_args)

        if config["arch"] == 'impala':
            agent = ImpalaAgent(
                model_kwargs={
                    "in_channels": [3, 16, 32],
                    "out_channels": [16, 32, 32],
                    "hidden_size": 256
                })
        elif config["arch"] == 'lstm':
            agent = NatureRecurrentAgent(model_kwargs={
                "hidden_sizes": [512],
                "lstm_size": 256
            })
        else:
            agent = OriginalNatureAgent(
                model_kwargs={
                    "batchNorm": config["batchNorm"],
                    "dropout": config["dropout"],
                    "augment_obs": config["augment_obs"],
                    "use_maxpool": config["maxpool"],
                    "hidden_sizes": config["hidden_sizes"],
                    "arch": config["arch"]
                })

        affinity = dict(cuda_idx=0, workers_cpus=list(range(8)))

        runner = MinibatchRl(algo=algo,
                             agent=agent,
                             sampler=sampler,
                             n_steps=config["total_timesteps"],
                             log_interval_steps=500,
                             affinity=affinity)
        log_dir = "./logs"
        name = config["name"]
        with logger_context(log_dir,
                            run_ID,
                            name,
                            config,
                            use_summary_writer=True,
                            override_prefix=False):
            runner.train()
        torch.save(agent.state_dict(), "./" + name + ".pt")
        wandb.save("./" + name + ".pt")
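
Example #20 saves the trained agent's weights with torch.save(agent.state_dict(), ...). A later run could feed those weights back in the same way Examples #13 and #16 do, via the agent's initial_model_state_dict argument. A rough sketch, assuming the agent class forwards that argument to rlpyt's base agent as the agents above do:

# Sketch: reload the weights saved above into a fresh agent for evaluation or
# continued training. Agent class and model_kwargs must match the trained run.
state_dict = torch.load("./" + name + ".pt", map_location="cpu")
agent = OriginalNatureAgent(
    model_kwargs={
        "batchNorm": config["batchNorm"],
        "dropout": config["dropout"],
        "augment_obs": config["augment_obs"],
        "use_maxpool": config["maxpool"],
        "hidden_sizes": config["hidden_sizes"],
        "arch": config["arch"],
    },
    initial_model_state_dict=state_dict,
)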