Example #1
def build_and_train(
        slot_affinity_code="0slt_0gpu_1cpu_1cpr",
        log_dir="test",
        run_ID="0",
        config_key="LSTM",
        ):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    pprint.pprint(config)

    sampler = CpuSampler(
        EnvCls=safety_gym_make,
        env_kwargs=config["env"],
        TrajInfoCls=SafetyGymTrajInfo,
        **config["sampler"]
    )
    algo = CppoPID(**config["algo"])
    agent = CppoLstmAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"],
    )
    name = "cppo_" + config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
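Scripts like the one above are normally started by a launcher that builds the slot_affinity_code, writes per-run variant files, and spawns one process per run. Below is a minimal launcher-side sketch using rlpyt's launching utilities; the script path, experiment title, and swept values are illustrative assumptions, not part of the original example.

# Launcher-side sketch (assumed setup): generates the slot_affinity_code,
# variant log_dirs, and run_IDs that build_and_train() consumes via
# affinity_from_code(), load_variant(), and update_config().
from rlpyt.utils.launching.affinity import encode_affinity
from rlpyt.utils.launching.exp_launcher import run_experiments
from rlpyt.utils.launching.variant import VariantLevel, make_variants

affinity_code = encode_affinity(n_cpu_core=1, n_gpu=0, cpu_per_run=1)
variant_levels = [VariantLevel(
    keys=[("algo", "learning_rate")],   # path into the nested config dict
    values=[(1e-4,), (3e-4,)],          # one tuple per variant
    dir_names=["lr_1e-4", "lr_3e-4"],   # sub-directories under log_dir
)]
variants, log_dirs = make_variants(*variant_levels)
run_experiments(
    script="train_cppo_lstm.py",        # hypothetical path to the script above
    affinity_code=affinity_code,
    experiment_title="cppo_lstm",
    runs_per_setting=1,
    variants=variants,
    log_dirs=log_dirs,
    common_args=("LSTM",),              # forwarded to build_and_train as config_key
)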
Example #2
def build_and_train(run_id=0, greedy_eval=False):
    sampler = SerialSampler(
        EnvCls=MyEnv,
        env_kwargs=dict(),
        eval_env_kwargs=dict(),
        batch_T=horizon,
        batch_B=64,
        max_decorrelation_steps=0,
        eval_n_envs=64,
        eval_max_steps=int(1e6),
        eval_max_trajectories=64,
    )
    runner = MinibatchRl(
        algo=PPO(entropy_loss_coeff=0., learning_rate=3e-4),
        agent=MyAgent(greedy_eval),
        sampler=sampler,
        n_steps=int(400 * horizon * 64),
        log_interval_steps=int(10 * horizon * 64),
    )
    log_params = dict()

    log_dir = "data/rl_example_2/{}".format(
        datetime.datetime.today().strftime("%Y%m%d_%H%M"))
    with logger_context(log_dir,
                        run_id,
                        'Reacher2D',
                        log_params=log_params,
                        snapshot_mode="last",
                        use_summary_writer=True,
                        override_prefix=True):
        runner.train()
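A script like this usually gets a small command-line entry point; a minimal sketch in the style of rlpyt's bundled examples (argument names simply mirror the function's parameters):

if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--run_id", type=int, default=0)
    parser.add_argument("--greedy_eval", action="store_true")
    args = parser.parse_args()
    build_and_train(run_id=args.run_id, greedy_eval=args.greedy_eval)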
Example #3
def build_and_train(env_id="HalfCheetah-Directional-v0", run_ID=0, cuda_idx=None, n_parallel=6):
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)), alternating=True)
    env_args = dict(id=env_id)
    env_args[RLPYT_WRAPPER_KEY] = [ClipActionsWrapper]
    # sampler = GpuSampler(
    #     EnvCls=gym_make,
    #     env_kwargs=env_args,
    #     eval_env_kwargs=env_args,
    #     batch_T=256,  # One time-step per sampler iteration.
    #     batch_B=8,  # One environment (i.e. sampler Batch dimension).
    #     max_decorrelation_steps=100,
    #     eval_n_envs=5,
    #     eval_max_steps=int(25e3),
    #     eval_max_trajectories=30
    # )
    # agent = MujocoFfOcAgent(model_kwargs={'option_size': 2})

    # sampler = AlternatingSampler(
    #     EnvCls=gym_make,
    #     env_kwargs=env_args,
    #     eval_env_kwargs=env_args,
    #     batch_T=256,  # One time-step per sampler iteration.
    #     batch_B=8,  # One environment (i.e. sampler Batch dimension).
    #     max_decorrelation_steps=100,
    #     eval_n_envs=5,
    #     eval_max_steps=int(25e3),
    #     eval_max_trajectories=30
    # )
    # agent = AlternatingMujocoFfOcAgent(model_kwargs={'option_size': 2})

    sampler = SerialSampler(
        EnvCls=gym_make,
        env_kwargs=env_args,
        eval_env_kwargs=env_args,
        batch_T=256,  # 256 time-steps per sampler iteration.
        batch_B=8,  # Eight parallel environments (i.e. sampler batch dimension).
        max_decorrelation_steps=0,
        # eval_n_envs=2,
        # eval_max_steps=int(51e2),
        # eval_max_trajectories=5,
    )
    agent = MujocoFfOcAgent(model_kwargs={'option_size': 2})

    algo = PPOC(clip_vf_loss=False, normalize_rewards='return')  # Run with defaults.
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=1e6,
        log_interval_steps=1e3,
        affinity=affinity,
        transfer=True,
        transfer_iter=150,
        log_traj_window=10
    )
    config = dict(env_id=env_id)
    name = "ppoc_" + env_id
    log_dir = "example_2a_ppoc"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #4
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    config["algo_name"] = 'A2OC'
    t_env = pomdp_interface(**config["env"])
    config["algo"]["discount"] = t_env.discount

    sampler = GpuSampler(
        EnvCls=pomdp_interface,
        env_kwargs=config["env"],
        **config["sampler"]
    )
    algo = A2OC(optim_kwargs=config["optim"], **config["algo"])
    agent = PomdpOcFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #5
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = get_affinity(slot_affinity_code)
    config = configs[config_key]
    # variant = load_variant(log_dir)
    # config = update_config(config, variant)

    sampler = CpuParallelSampler(
        EnvCls=AtariEnv,
        env_kwargs=config["env"],
        CollectorCls=EpisodicLivesWaitResetCollector,
        TrajInfoCls=AtariTrajInfo,
        **config["sampler"]
    )
    algo = A2C(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):  # Might have to flatten config
        runner.train()
Example #6
def build_and_train(
    slot_affinity_code="0slt_1gpu_1cpu",
    log_dir="test",
    run_ID="0",
    config_key="ppo_ul_16env",
):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    pprint.pprint(config)

    sampler = SerialSampler(
        EnvCls=AtariEnv84,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        TrajInfoCls=AtariTrajInfo,
        eval_env_kwargs=config["env"],  # Same args!
        **config["sampler"])
    algo = PpoUl(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariPgAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo,
                         agent=agent,
                         sampler=sampler,
                         affinity=affinity,
                         **config["runner"])
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #7
File: example_4.py Project: zikkat/rlpyt
def build_and_train(game="pong",
                    run_ID=0,
                    cuda_idx=None,
                    mid_batch_reset=False,
                    n_parallel=2):
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)))
    Collector = GpuResetCollector if mid_batch_reset else GpuWaitResetCollector
    print(f"To satisfy mid_batch_reset=={mid_batch_reset}, using {Collector}.")

    sampler = GpuParallelSampler(
        EnvCls=AtariEnv,
        env_kwargs=dict(game=game,
                        num_img_obs=1),  # Learn on individual frames.
        CollectorCls=Collector,
        batch_T=20,  # Longer sampling/optimization horizon for recurrence.
        batch_B=16,  # 16 parallel environments.
        max_decorrelation_steps=400,
    )
    algo = A2C()  # Run with defaults.
    agent = AtariLstmAgent()
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e5,
        affinity=affinity,
    )
    config = dict(game=game)
    name = "a2c_" + game
    log_dir = "example_4"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #8
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    if slot_affinity_code == 'None':
        # affinity = affinity_from_code(run_slot_affinity_code)
        slot_affinity_code = prepend_run_slot(0, affinity_code)
        affinity = affinity_from_code(slot_affinity_code)
    else:
        affinity = affinity_from_code(slot_affinity_code)

    config = configs[config_key]

    # load variant of experiment (there may not be a variant, though)
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    sampler = CpuSampler(
        EnvCls=make_env,
        env_kwargs={},
        CollectorCls=CpuResetCollector,
        **config["sampler"]
    )
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = MujocoFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #9
def test_rlpyt_simple():
    """ partially copied from example 1 """
    game = "pong"
    run_ID = 0
    cuda_idx = None
    n_steps = 1
    sampler = SerialSampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,  # default traj info + GameScore
        env_kwargs=dict(game=game),
        eval_env_kwargs=dict(game=game),
        batch_T=4,  # Four time-steps per sampler iteration.
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = DQN(min_steps_learn=1e3, replay_size=int(1e3))  # Small replay buffer to avoid memory issues.
    agent = AtariDqnAgent()
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=n_steps,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(game=game)
    name = "dqn_" + game
    log_dir = "test_example_1"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
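The test above can also be invoked directly as a script; a trivial entry point (assumed, not part of the original test file):

if __name__ == "__main__":
    test_rlpyt_simple()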
Example #10
def build_and_train(slot_affinity_code, log_dir, run_ID):
    # (Or load from a central store of configs.)
    config = dict(
        env=dict(game="pong"),
        algo=dict(learning_rate=7e-4),
        sampler=dict(batch_B=16),
    )

    affinity = get_affinity(slot_affinity_code)
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    sampler = GpuParallelSampler(
        EnvCls=AtariEnv,
        env_kwargs=config["env"],
        CollectorCls=WaitResetCollector,
        batch_T=5,
        # batch_B=16,  # Get from config.
        max_decorrelation_steps=400,
        **config["sampler"])
    algo = A2C(**config["algo"])  # Run with defaults.
    agent = AtariFfAgent()
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e5,
        affinity=affinity,
    )
    name = "a2c_" + config["env"]["game"]
    log_dir = "example_6"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
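The comment at the top of this example mentions loading from a central store of configs; a minimal sketch of that pattern (key name and values are illustrative), matching the `configs[config_key]` lookups used by the other examples:

# Hypothetical central config store: a module-level dict of nested config
# dicts, indexed by the config_key string passed to build_and_train().
configs = dict(
    a2c_pong_16env=dict(
        env=dict(game="pong"),
        algo=dict(learning_rate=7e-4),
        sampler=dict(batch_B=16),
    ),
)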
Example #11
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    sampler = GpuSampler(
        EnvCls=AtariEnv,
        env_kwargs=config["env"],
        CollectorCls=WaitResetCollector,
        TrajInfoCls=AtariTrajInfo,
        **config["sampler"]
    )
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariLstmAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #12
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    sampler = CpuSampler(
        EnvCls=gym_make,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        **config["sampler"]
    )
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = MujocoFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        seed=int(run_ID) * 1000,
        **config["runner"]
    )
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #13
def build_and_train(run_ID=0, cuda_idx=None, n_parallel=2, serial_sampling=False):
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)))
    device = "cpu" if cuda_idx is None else f"gpu {cuda_idx}"
    if serial_sampling:
        Sampler = SerialSampler  # ignores workers_cpus
        print(f"Using serial sampler w/ {device} for action sampling and optimization")
    else:
        Sampler = CpuSampler if cuda_idx is None else GpuSampler
        print(f"Using parallel sampler w/ {device} for action sampling and optimization")

    game = "pong"

    sampler = Sampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,  # default traj info + GameScore
        env_kwargs=dict(game=game),
        eval_env_kwargs=dict(game=game),
        batch_T=1,
        batch_B=8,  # number of games running in parallel
        max_decorrelation_steps=0
    )

    # load target observations along with respective thresholds and target actions
    target_obs = []
    target_info = {}
    for idx, (tpath, tthresh, ttarg) in enumerate(TARGET_META):
        with open(tpath, "rb") as f:
            tob = pickle.load(f)
            target_obs.append(tob)
            target_info[idx] = (tthresh, ttarg)
    target_obs = np.asarray(target_obs).transpose(0, 3, 1, 2)  # N, H, W, C --> N, C, H, W

    # adversary algorithm (subsumes agent DQN algorithm)
    algo = FixedAttackerDQN(
        target_obs,
        target_info,
        contrast_sd_path,
        dqn_oracle_sd_path,
        delta_bound=1.0,
        first_poison_itr=1,
        min_steps_learn=1e3
    )

    agent = AtariDqnAgent()

    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e3,
        affinity=affinity
    )

    config = dict(game=game)
    name = "rp_fixed_attack_dqn_" + game
    log_dir = "rp_fixed_attack"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
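TARGET_META is defined elsewhere in the project; from the loading loop above it can be inferred to be a sequence of (pickle path, threshold, target action) tuples. A hypothetical example, with placeholder values:

# Hypothetical TARGET_META, matching how the loop above unpacks it;
# paths, thresholds, and target actions are placeholders.
TARGET_META = [
    ("targets/target_obs_0.pkl", 0.9, 2),
    ("targets/target_obs_1.pkl", 0.9, 3),
]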
Example #14
def build_and_train(game="fruitbot", run_ID=0, cuda_idx=None, n_parallel=6):
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)), alternating=True)
    env_args = dict(game=game, start_level=0, num_levels=1)
    # sampler = AlternatingSampler(
    #     EnvCls=ProcgenEnv,
    #     env_kwargs=env_args,
    #     eval_env_kwargs=env_args,
    #     batch_T=256,  # One time-step per sampler iteration.
    #     batch_B=12,  # One environment (i.e. sampler Batch dimension).
    #     max_decorrelation_steps=100,
    #     # eval_n_envs=5,
    #     # eval_max_steps=int(25e3),
    #     # eval_max_trajectories=30
    # )
    # sampler = GpuSampler(
    #     EnvCls=ProcgenEnv,
    #     env_kwargs=env_args,
    #     eval_env_kwargs=env_args,
    #     batch_T=256,  # One time-step per sampler iteration.
    #     batch_B=12,  # One environment (i.e. sampler Batch dimension).
    #     max_decorrelation_steps=100,
    #     # eval_n_envs=5,
    #     # eval_max_steps=int(25e3),
    #     # eval_max_trajectories=30
    # )
    #
    sampler = SerialSampler(
        EnvCls=ProcgenEnv,
        env_kwargs=env_args,
        eval_env_kwargs=env_args,
        batch_T=256,  # 256 time-steps per sampler iteration.
        batch_B=8,  # Eight parallel environments (i.e. sampler batch dimension).
        max_decorrelation_steps=0,
        # eval_n_envs=2,
        # eval_max_steps=int(51e2),
        # eval_max_trajectories=5,
    )

    algo = PPOC(clip_vf_loss=False, normalize_rewards=None)  # Run with defaults.
    agent = Agent(model_kwargs={'option_size': 2})
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=1e6,
        log_interval_steps=1e3,
        affinity=affinity,
        # transfer=True,
        # transfer_iter=150,
        # log_traj_window=10
    )
    config = dict(game=game)
    name = "ppo_" + game
    log_dir = "example_2a_fruitbot"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #15
def build_and_train(game="doom_benchmark", run_ID=0, cuda_idx=None, n_parallel=-1,
                    n_env=-1, n_timestep=-1, sample_mode=None):
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)))

    gpu_cpu = "CPU" if cuda_idx is None else f"GPU {cuda_idx}"
    if sample_mode == "serial":
        Sampler = SerialSampler  # (Ignores workers_cpus.)
        print(f"Using serial sampler, {gpu_cpu} for sampling and optimizing.")
    elif sample_mode == "cpu":
        Sampler = CpuSampler
        print(f"Using CPU parallel sampler (agent in workers), {gpu_cpu} for optimizing.")
    elif sample_mode == "gpu":
        Sampler = GpuSampler
        print(f"Using GPU parallel sampler (agent in master), {gpu_cpu} for sampling and optimizing.")
    elif sample_mode == "alternating":
        Sampler = AlternatingSampler
        affinity["workers_cpus"] += affinity["workers_cpus"]  # (Double list)
        affinity["alternating"] = True  # Sampler will check for this.
        print(f"Using Alternating GPU parallel sampler, {gpu_cpu} for sampling and optimizing.")

        # !!!
        # COMMENT: to use alternating sampler here we had to comment lines 126-127 in action_server.py
        # if "bootstrap_value" in self.samples_np.agent:
        #     self.bootstrap_value_pair[alt][:] = self.agent.value(*agent_inputs_pair[alt])
        # otherwise it crashes
        # !!!

    sampler = Sampler(
        EnvCls=VizdoomEnv,
        env_kwargs=dict(game=game),
        batch_T=n_timestep,
        batch_B=n_env,
        max_decorrelation_steps=0,
    )
    algo = PPO(minibatches=1, epochs=1)

    agent = DoomLstmAgent()

    # Maybe AsyncRL could give better performance?
    # In the current version, however, PPO + AsyncRL does not seem to work (not implemented).
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e5,
        affinity=affinity,
    )
    config = dict(game=game)
    name = "ppo_" + game + str(n_env)
    log_dir = "doom_ppo"

    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #16
def build_and_train(
    slot_affinity_code="0slt_1gpu_1cpu",
    log_dir="test",
    run_ID="0",
    config_key="ppo_16env",
    experiment_title="exp",
    snapshot_mode="none",
    snapshot_gap=None,
):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)

    # Hack so that the first part of the log_dir matches the source of the model.
    model_base_dir = config["pretrain"]["model_dir"]
    if model_base_dir is not None:
        # Get rid of ~/GitRepos/adam/rlpyt/data/local/<timestamp>/
        raw_log_dir = log_dir.split(experiment_title)[-1].lstrip("/")
        # Keep the UL part, which comes first.
        model_sub_dir = raw_log_dir.split("/RlFromUl/")[0]
        config["agent"]["state_dict_filename"] = osp.join(
            model_base_dir, model_sub_dir, "run_0/params.pkl")
    pprint.pprint(config)

    sampler = AlternatingSampler(
        EnvCls=DmlabEnv,
        env_kwargs=config["env"],
        CollectorCls=GpuWaitResetCollector,
        # TrajInfoCls=AtariTrajInfo,
        # eval_env_kwargs=config["env"],  # Same args!
        **config["sampler"])
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = DmlabPgLstmAlternatingAgent(model_kwargs=config["model"],
                                        **config["agent"])
    runner = MinibatchRl(algo=algo,
                         agent=agent,
                         sampler=sampler,
                         affinity=affinity,
                         **config["runner"])
    name = config["env"]["level"]
    if snapshot_gap is not None:
        snapshot_gap = int(snapshot_gap)
    with logger_context(
            log_dir,
            run_ID,
            name,
            config,
            snapshot_mode=snapshot_mode,
            snapshot_gap=snapshot_gap,
    ):
        runner.train()
Example #17
def build_and_train(game="montezuma_revenge",
                    run_ID=0,
                    cuda_idx=None,
                    n_parallel=6):
    affinity = dict(cuda_idx=cuda_idx,
                    workers_cpus=list(range(n_parallel)),
                    alternating=True)
    env_args = dict(id=game)
    # env_args[RLPYT_WRAPPER_KEY] = [ClipActionsWrapper]
    # sampler = AlternatingSampler(
    #     EnvCls=AtariEnv,
    #     TrajInfoCls=AtariTrajInfo,
    #     env_kwargs=dict(game=game),
    #     batch_T=64,  # One time-step per sampler iteration.
    #     batch_B=36,  # One environment (i.e. sampler Batch dimension).
    #     max_decorrelation_steps=1000,
    #     # eval_n_envs=5,
    #     # eval_max_steps=int(25e3),
    #     # eval_max_trajectories=30
    # )
    #
    sampler = SerialSampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,
        env_kwargs=dict(game=game),
        batch_T=256,  # 256 time-steps per sampler iteration.
        batch_B=8,  # Eight parallel environments (i.e. sampler batch dimension).
        max_decorrelation_steps=1000,
        # eval_n_envs=2,
        # eval_max_steps=int(51e2),
        # eval_max_trajectories=5,
    )

    # algo = PPO(clip_vf_loss=False, normalize_rewards=None)  # Run with defaults.
    algo = A2OC(normalize_rewards=None)
    agent = AtariOcAgent(model_kwargs={'option_size': 4})
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=1e6,
        log_interval_steps=1e3,
        affinity=affinity,
        # transfer=True,
        # transfer_iter=150,
        # log_traj_window=10
    )
    config = dict(game=game)
    name = "ppo_" + game
    log_dir = "example_2a_atari"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #18
def build_and_train(game="pong",
                    run_ID=0,
                    cuda_idx=None,
                    sample_mode="serial",
                    n_parallel=2):
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)))
    gpu_cpu = "CPU" if cuda_idx is None else f"GPU {cuda_idx}"
    if sample_mode == "serial":
        Sampler = SerialSampler  # (Ignores workers_cpus.)
        print(f"Using serial sampler, {gpu_cpu} for sampling and optimizing.")
    elif sample_mode == "cpu":
        Sampler = CpuSampler
        print(
            f"Using CPU parallel sampler (agent in workers), {gpu_cpu} for optimizing."
        )
    elif sample_mode == "gpu":
        Sampler = GpuSampler
        print(
            f"Using GPU parallel sampler (agent in master), {gpu_cpu} for sampling and optimizing."
        )
    elif sample_mode == "alternating":
        Sampler = AlternatingSampler
        affinity["workers_cpus"] += affinity["workers_cpus"]  # (Double list)
        affinity["alternating"] = True  # Sampler will check for this.
        print(
            f"Using Alternating GPU parallel sampler, {gpu_cpu} for sampling and optimizing."
        )

    sampler = Sampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,
        env_kwargs=dict(game=game),
        batch_T=5,  # 5 time-steps per sampler iteration.
        batch_B=16,  # 16 parallel environments.
        max_decorrelation_steps=400,
    )
    algo = A2C()  # Run with defaults.
    agent = AtariFfAgent()
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e5,
        affinity=affinity,
    )
    config = dict(game=game)
    name = "a2c_" + game
    log_dir = "example_3"
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #19
def build_and_train():
    p = psutil.Process()
    cpus = p.cpu_affinity()
    affinity = dict(cuda_idx=None,
                    master_cpus=cpus,
                    workers_cpus=list([x] for x in cpus),
                    set_affinity=True)
    sampler = CpuSampler(
        EnvCls=_make_env,
        env_kwargs=dict(rank=0),
        max_decorrelation_steps=0,
        batch_T=6000,
        batch_B=len(cpus),  # One environment per available CPU core.
    )

    model_kwargs = dict(model_kwargs=dict(hidden_sizes=[256, 256]))
    ppo_config = {
        "discount": 0.98,
        "entropy_loss_coeff": 0.01,
        "learning_rate": 0.00025,
        "value_loss_coeff": 0.5,
        "clip_grad_norm": 0.5,
        "minibatches": 40,
        "gae_lambda": 0.95,
        "ratio_clip": 0.2,
        "epochs": 4
    }

    algo = PPO(**ppo_config)
    agent = MujocoFfAgent(**model_kwargs)

    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=int(60e6),
        log_interval_steps=int(1e6),
        affinity=affinity,
    )
    config = dict(rank=0, env_id='picking')
    name = "ppo_rlpyt_pushing"
    log_dir = os.path.join(os.path.dirname(__file__), name)
    with logger_context(log_dir,
                        0,
                        name,
                        config,
                        use_summary_writer=True,
                        snapshot_mode='all'):
        runner.train()
Example #20
    def run(self):
        config = self.getConfig()
        sampler = CpuSampler(EnvCls=make_env,
                             env_kwargs={"num_levels": config["num_levels"]},
                             batch_T=256,
                             batch_B=8,
                             max_decorrelation_steps=0)

        optim_args = (dict(weight_decay=config["l2_penalty"])
                      if "l2_penalty" in config else None)

        algo = PPO(discount=config["discount"],
                   entropy_loss_coeff=config["entropy_bonus"],
                   gae_lambda=config["lambda"],
                   minibatches=config["minibatches_per_epoch"],
                   epochs=config["epochs_per_rollout"],
                   ratio_clip=config["ppo_clip"],
                   learning_rate=config["learning_rate"],
                   normalize_advantage=True,
                   optim_kwargs=optim_args)
        agent = ImpalaAgent(
            model_kwargs={
                "in_channels": config["in_channels"],
                "out_channels": config["out_channels"],
                "hidden_size": config["hidden_size"]
            })

        affinity = dict(cuda_idx=0,
                        workers_cpus=list(range(config["workers"])))

        runner = MinibatchRl(algo=algo,
                             agent=agent,
                             sampler=sampler,
                             n_steps=25e6,
                             log_interval_steps=500,
                             affinity=affinity,
                             seed=42069)
        log_dir = "./logs"
        name = config["name"]
        run_ID = name
        with logger_context(log_dir,
                            run_ID,
                            name,
                            config,
                            use_summary_writer=True):
            runner.train()
        torch.save(agent.state_dict(), "./" + name + ".pt")
        wandb.save("./" + name + ".pt")
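self.getConfig() is not shown above; the sketch below lists only the keys that run() actually reads, with placeholder values (the optional "l2_penalty" key enables weight decay in the optimizer):

# Illustrative getConfig() return value; every key below is read by run(),
# and all values are placeholders.
config = {
    "num_levels": 200,
    "discount": 0.999,
    "entropy_bonus": 0.01,
    "lambda": 0.95,
    "minibatches_per_epoch": 8,
    "epochs_per_rollout": 3,
    "ppo_clip": 0.2,
    "learning_rate": 5e-4,
    "in_channels": [3, 16, 32],
    "out_channels": [16, 32, 32],
    "hidden_size": 256,
    "workers": 8,
    "name": "impala_ppo_procgen",
    # "l2_penalty": 1e-4,  # optional: adds weight decay to the optimizer
}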
Example #21
def build_and_train():
    p = psutil.Process()
    cpus = p.cpu_affinity()
    affinity = dict(cuda_idx=None,
                    master_cpus=cpus,
                    workers_cpus=list([x] for x in cpus),
                    set_affinity=True)
    sampler = CpuSampler(EnvCls=_make_env,
                         env_kwargs=dict(rank=0),
                         batch_T=1,
                         batch_B=4,
                         max_decorrelation_steps=0,
                         CollectorCls=CpuResetCollector)
    algo = SAC(batch_size=256,
               min_steps_learn=10000,
               replay_size=1000000,
               replay_ratio=256 / 4,
               target_update_interval=1,
               target_entropy=-9,
               target_update_tau=0.01,
               learning_rate=0.00025,
               action_prior="uniform",
               reward_scale=1,
               reparameterize=True,
               clip_grad_norm=1e9,
               n_step_return=1,
               updates_per_sync=1,
               bootstrap_timelimit=False)  # Run with defaults.
    agent = SacAgent(model_kwargs={'hidden_sizes': [256, 256]})
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=10000,
        affinity=affinity,
    )
    config = dict(env_id='picking')
    name = "sac_rlpyt_picking"
    log_dir = os.path.join(os.path.dirname(__file__), "sac_rlpyt_picking")
    with logger_context(log_dir,
                        0,
                        name,
                        config,
                        use_summary_writer=True,
                        snapshot_mode='all'):
        runner.train()
Example #22
def build_and_train(slot_affinity_code="0slt_0gpu_4cpu_4cpr",
                    log_dir="test",
                    run_ID="0",
                    config_key="ppo_ul_16env"):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    # variant = load_variant(log_dir)
    # config = update_config(config, variant)

    # config["sampler"]["batch_B"] = 4
    # config["sampler"]["batch_T"] = 5
    # config["runner"]["log_interval_steps"] = 100
    # config["runner"]["n_steps"] = 1000
    config["algo"]["ul_update_schedule"] = "constant_1"
    config["algo"]["min_steps_rl"] = 1e3
    config["algo"]["min_steps_ul"] = 200
    config["algo"]["max_steps_ul"] = 20e6
    config["model"]["stop_conv_grad"] = True
    config["sampler"]["max_decorrelation_steps"] = 0
    config["sampler"]["batch_B"] = 3
    config["sampler"]["batch_T"] = 20
    config["algo"]["ul_pri_alpha"] = 1.
    config["algo"]["ul_pri_n_step_return"] = 10
    config["algo"]["ul_replay_size"] = 900

    pprint.pprint(config)

    sampler = SerialSampler(
        EnvCls=AtariEnv84,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        TrajInfoCls=AtariTrajInfo,
        eval_env_kwargs=config["env"],  # Same args!
        **config["sampler"])
    algo = PpoUl(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariPgRlWithUlAgent(model_kwargs=config["model"],
                                 **config["agent"])
    runner = MinibatchRl(algo=algo,
                         agent=agent,
                         sampler=sampler,
                         affinity=affinity,
                         **config["runner"])
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #23
def build_and_train(run_ID=0,
                    cuda_idx=None,
                    n_parallel=2,
                    serial_sampling=False):
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)))
    device = "CPU" if cuda_idx is None else f"GPU {cuda_idx}"
    if serial_sampling:
        Sampler = SerialSampler  # ignores workers_cpus
        print(
            f"Using serial sampler w/ {device} for action sampling and optimization"
        )
    else:
        Sampler = CpuSampler if cuda_idx is None else GpuSampler
        print(
            f"Using parallel sampler w/ {device} for action sampling and optimization"
        )

    game = "pong"

    sampler = Sampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,  # default traj info + GameScore
        env_kwargs=dict(game=game),
        eval_env_kwargs=dict(game=game),
        batch_T=1,
        batch_B=8,  # number of games running in parallel
        max_decorrelation_steps=0)

    ### ALGO GOES HERE
    algo = None  # Placeholder: fill in the algorithm before running (see the stand-in sketch after this example).

    agent = AtariDqnAgent()

    runner = MinibatchRl(algo=algo,
                         agent=agent,
                         sampler=sampler,
                         n_steps=50e6,
                         log_interval_steps=1e3,
                         affinity=affinity)

    config = dict(game=game)
    name = "rp_attack_dqn_" + game
    log_dir = "rp_attack"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
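The ### ALGO GOES HERE placeholder above must be filled before this script will run; the original project's attack algorithm is not shown. As a stand-in only, a plain rlpyt DQN could be dropped in at that point:

# Stand-in for the omitted algorithm (not the original project's attacker DQN):
from rlpyt.algos.dqn.dqn import DQN
algo = DQN(min_steps_learn=1e3)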
Example #24
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    # variant = load_variant(log_dir)
    # config = update_config(config, variant)

    sampler = CpuSampler(EnvCls=AtariEnv,
                         env_kwargs=config["env"],
                         CollectorCls=EpisodicLivesWaitResetCollector,
                         **config["sampler"])
    algo = A2C(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariLstmAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo,
                         agent=agent,
                         sampler=sampler,
                         affinity=affinity,
                         **config["runner"])
    name = config["env"]["game"] + str(config["algo"]["entropy_loss_coeff"])
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #25
def build_and_train(log_dir, run_ID, config_key):
    # affinity = affinity_from_code(run_slot_affinity_code)
    slot_affinity_code = prepend_run_slot(0, affinity_code)
    affinity = affinity_from_code(slot_affinity_code)

    config = configs[config_key]

    sampler = CpuSampler(EnvCls=make_env,
                         env_kwargs={},
                         CollectorCls=CpuResetCollector,
                         **config["sampler"])
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = MultiFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo,
                         agent=agent,
                         sampler=sampler,
                         affinity=affinity,
                         **config["runner"])
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #26
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    env = IsaacGymEnv(config['env']['task'])  # Make env
    import torch.nn as nn
    config["model"]["hidden_nonlinearity"] = getattr(
        nn, config["model"]
        ["hidden_nonlinearity"])  # Replace string with proper activation
    sampler = IsaacSampler(env, **config["sampler"])
    algo = PPOC(optim_kwargs=config["optim"], **config["algo"])
    agent = MujocoFfOcAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo,
                         agent=agent,
                         sampler=sampler,
                         affinity=affinity,
                         **config["runner"])
    name = "ppo_nv_" + config["env"]["task"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #27
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    config["algo_name"] = 'A2OC_D_RNN'

    env = BatchPOMDPEnv(batch_B=config["sampler"]["batch_B"], **config["env"])
    config["algo"]["discount"] = env.discount
    sampler = BatchPOMDPSampler(env=env, **config["sampler"])

    algo = A2OC(optim_kwargs=config["optim"], **config["algo"])
    agent = PomdpOcRnnAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(algo=algo,
                         agent=agent,
                         sampler=sampler,
                         affinity=affinity,
                         **config["runner"])
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
Example #28
File: run_rlpyt.py Project: zivzone/spr
def debug_build_and_train(game="pong", run_ID=0, cuda_idx=0):
    config = configs['ernbw']
    config['runner']['log_interval_steps'] = 1e5
    config['env']['game'] = game
    config["eval_env"]["game"] = config["env"]["game"]
    config["algo"]["n_step_return"] = 5
    config["algo"]["prioritized_replay"] = True
    config["algo"]["min_steps_learn"] = 1e3
    wandb.config.update(config)
    sampler = SerialSampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,  # default traj info + GameScore
        env_kwargs=dict(game=game),
        eval_env_kwargs=dict(game=game),
        batch_T=4,  # Four time-steps per sampler iteration.
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = PizeroCategoricalDQN(optim_kwargs=config["optim"],
                                **config["algo"])  # Run with defaults.
    agent = AtariCatDqnAgent(ModelCls=PizeroCatDqnModel,
                             model_kwargs=config["model"],
                             **config["agent"])
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(game=game)
    name = "dqn_" + game
    log_dir = "example_1"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
Example #29
def build_and_train(game="pong", run_ID=0, cuda_idx=None):
    sampler = SerialSampler(
        # EnvCls=MyEnv,
        # env_kwargs=dict(),
        # batch_T=4,  # Four time-steps per sampler iteration.
        # batch_B=1,
        # max_decorrelation_steps=0,
        # eval_n_envs=10,
        # eval_env_kwargs=dict(),
        # eval_max_steps=int(10e3),
        # eval_max_trajectories=5,
        EnvCls=CanvasEnv,
        env_kwargs=dict(),
        batch_T=1,  # One time-step per sampler iteration.
        batch_B=16,  # 16 parallel environments.
        max_decorrelation_steps=400,
    )
    algo = PPO()
    agent = CategoricalPgAgent(
        ModelCls=MyModel,
        model_kwargs=dict(image_shape=(1, CANVAS_WIDTH, CANVAS_WIDTH),
                          output_size=N_ACTIONS),
        initial_model_state_dict=None,
    )
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cuda_idx),
    )

    config = dict()
    name = "dqn_" + game
    log_dir = "example_1"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
Example #30
def findOptimalAgent(reward, run_ID=0):
    """
    Find the optimal agent for the MDP (see Config for 
    specification) under a custom reward function
    using rlpyt's implementation of A2C.
    """
    cpus = list(range(C.N_PARALLEL))
    affinity = dict(cuda_idx=C.CUDA_IDX, workers_cpus=cpus)
    sampler = SerialSampler(EnvCls=rlpyt_make,
                            env_kwargs=dict(id=C.ENV, reward=reward),
                            batch_T=C.BATCH_T,
                            batch_B=C.BATCH_B,
                            max_decorrelation_steps=400,
                            eval_env_kwargs=dict(id=C.ENV),
                            eval_n_envs=5,
                            eval_max_steps=2500)
    algo = A2C(discount=C.DISCOUNT,
               learning_rate=C.LR,
               value_loss_coeff=C.VALUE_LOSS_COEFF,
               entropy_loss_coeff=C.ENTROPY_LOSS_COEFF)
    agent = CategoricalPgAgent(AcrobotNet)
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=C.N_STEPS,
        log_interval_steps=C.LOG_STEP,
        affinity=affinity,
    )
    name = "a2c_" + C.ENV.lower()
    log_dir = name
    with logger_context(log_dir,
                        run_ID,
                        name,
                        snapshot_mode='last',
                        override_prefix=True):
        runner.train()
    return agent
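A hypothetical call to findOptimalAgent; the expected format of the reward argument depends on the custom rlpyt_make wrapper and the C config module, both defined elsewhere, so the signature below is an assumption for illustration only:

# Hypothetical usage; the reward callable's signature is an assumption.
custom_reward = lambda obs, action: -1.0  # e.g. a constant step cost
trained_agent = findOptimalAgent(custom_reward, run_ID=1)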