def test_rlpyt_simple():
    """Partially copied from example 1."""
    game = "pong"
    run_ID = 0
    cuda_idx = None
    n_steps = 1
    sampler = SerialSampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,  # default traj info + GameScore
        env_kwargs=dict(game=game),
        eval_env_kwargs=dict(game=game),
        batch_T=4,  # Four time-steps per sampler iteration.
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = DQN(min_steps_learn=1e3, replay_size=1e3)  # Small replay to avoid memory issues.
    agent = AtariDqnAgent()
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=n_steps,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(game=game)
    name = "dqn_" + game
    log_dir = "test_example_1"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
def build_and_train(log_dir, game="pong", run_ID=0, cuda_idx=None, eval=False): sampler = SerialSampler( EnvCls=AtariEnv, TrajInfoCls=AtariTrajInfo, # default traj info + GameScore env_kwargs=dict(game=game), eval_env_kwargs=dict(game=game), batch_T=4, # Four time-steps per sampler iteration. batch_B=1, max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(10e3), eval_max_trajectories=5, ) algo = Dreamer() # Run with defaults. agent = AtariDreamerAgent() runner_cls = MinibatchRlEval if eval else MinibatchRl runner = runner_cls( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e3, affinity=dict(cuda_idx=cuda_idx), ) config = dict(game=game) name = "dreamer_" + game with logger_context(log_dir, run_ID, name, config, snapshot_mode="last", override_prefix=True, use_summary_writer=True): runner.train()
def build_and_train(game="academy_empty_goal_close", run_ID=0, cuda_idx=None): sampler = SerialSampler( EnvCls=create_single_football_env, env_kwargs=dict(game=game), eval_env_kwargs=dict(game=game), batch_T=4, # Four time-steps per sampler iteration. batch_B=1, max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(10e3), eval_max_trajectories=5, ) algo = DQN(min_steps_learn=1e3) # Run with defaults. agent = AtariDqnAgent() runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e3, affinity=dict(cuda_idx=cuda_idx), ) config = dict(game=game) name = "dqn_" + game log_dir = "example_1" with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"): runner.train()
def build_and_train(env_id="CartPole-v0", run_ID=0, cuda_idx=None): sampler = SerialSampler( EnvCls=gym_make, env_kwargs=dict(id=env_id), eval_env_kwargs=dict(id=env_id), batch_T=1, # One time-step per sampler iteration. batch_B=1, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(51e3), eval_max_trajectories=50, ) algo = PPO() # Run with defaults. agent = RecurrentCategoricalPgAgent() runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=1e6, log_interval_steps=1e4, affinity=dict(cuda_idx=cuda_idx), ) config = dict(env_id=env_id) name = "ppo_" + env_id log_dir = "ppo_test" with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(run_id=0, greedy_eval=False):
    sampler = SerialSampler(
        EnvCls=MyEnv,
        env_kwargs=dict(),
        eval_env_kwargs=dict(),
        batch_T=horizon,  # `horizon` is assumed defined at module level.
        batch_B=64,
        max_decorrelation_steps=0,
        eval_n_envs=64,
        eval_max_steps=int(1e6),
        eval_max_trajectories=64,
    )
    runner = MinibatchRl(
        algo=PPO(entropy_loss_coeff=0., learning_rate=3e-4),
        agent=MyAgent(greedy_eval),
        sampler=sampler,
        n_steps=int(400 * horizon * 64),
        log_interval_steps=int(10 * horizon * 64),
    )
    log_params = dict()
    log_dir = "data/rl_example_2/{}".format(
        datetime.datetime.today().strftime("%Y%m%d_%H%M"))
    with logger_context(log_dir, run_id, 'Reacher2D', log_params=log_params,
                        snapshot_mode="last", use_summary_writer=True,
                        override_prefix=True):
        runner.train()
def build_and_train(env_id="HalfCheetah-Directional-v0", run_ID=0, cuda_idx=None, n_parallel=6): affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)), alternating=True) env_args = dict(id=env_id) env_args[RLPYT_WRAPPER_KEY] = [ClipActionsWrapper] # sampler = GpuSampler( # EnvCls=gym_make, # env_kwargs=env_args, # eval_env_kwargs=env_args, # batch_T=256, # One time-step per sampler iteration. # batch_B=8, # One environment (i.e. sampler Batch dimension). # max_decorrelation_steps=100, # eval_n_envs=5, # eval_max_steps=int(25e3), # eval_max_trajectories=30 # ) # agent = MujocoFfOcAgent(model_kwargs={'option_size': 2}) # sampler = AlternatingSampler( # EnvCls=gym_make, # env_kwargs=env_args, # eval_env_kwargs=env_args, # batch_T=256, # One time-step per sampler iteration. # batch_B=8, # One environment (i.e. sampler Batch dimension). # max_decorrelation_steps=100, # eval_n_envs=5, # eval_max_steps=int(25e3), # eval_max_trajectories=30 # ) # agent = AlternatingMujocoFfOcAgent(model_kwargs={'option_size': 2}) sampler = SerialSampler( EnvCls=gym_make, env_kwargs=env_args, eval_env_kwargs=env_args, batch_T=256, # One time-step per sampler iteration. batch_B=8, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=0, # eval_n_envs=2, # eval_max_steps=int(51e2), # eval_max_trajectories=5, ) agent = MujocoFfOcAgent(model_kwargs={'option_size': 2}) algo = PPOC(clip_vf_loss=False, normalize_rewards='return') # Run with defaults. runner = MinibatchRl( algo=algo, agent=agent, sampler=sampler, n_steps=1e6, log_interval_steps=1e3, affinity=affinity, transfer=True, transfer_iter=150, log_traj_window=10 ) config = dict(env_id=env_id) name = "ppoc_" + env_id log_dir = "example_2a_ppoc" with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(level="nav_maze_random_goal_01", run_ID=0, cuda_idx=None):
    affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(8)))
    sampler = SerialSampler(
        EnvCls=DeepmindLabEnv,
        env_kwargs=dict(level=level),
        eval_env_kwargs=dict(level=level),
        batch_T=4,  # Four time-steps per sampler iteration.
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=5,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = PPO()
    agent = AtariFfAgent()
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e3,
        affinity=affinity,
    )
    config = dict(level=level)
    name = "lab_ppo"
    log_dir = "lab_example_3"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    print('Variant', variant)
    config = update_config(config, variant)
    sampler = SerialSampler(
        EnvCls=DMControlEnv,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        eval_env_kwargs=config["eval_env"],
        **config["sampler"],
    )
    algo = SAC(optim_kwargs=config["optim"], **config["algo"])
    agent = SacAgent(**config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"],
    )
    name = "sac_{}_{}".format(config['env']['domain'], config['env']['task'])
    with logger_context(log_dir, run_ID, name, log_params=config, snapshot_mode='last'):
        runner.train()
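# Scripts with this (slot_affinity_code, log_dir, run_ID, config_key) signature
# are normally launched through rlpyt's experiment launcher, which also writes
# the variant file that load_variant() reads back. A hedged sketch of such a
# launch script (the train-script path, experiment title, config key, and
# variant values are illustrative assumptions):
from rlpyt.utils.launching.affinity import encode_affinity
from rlpyt.utils.launching.exp_launcher import run_experiments
from rlpyt.utils.launching.variant import VariantLevel, make_variants

affinity_code = encode_affinity(n_cpu_core=4, n_gpu=1)
variant_levels = [VariantLevel(
    keys=[("env", "domain"), ("env", "task")],
    values=[("cartpole", "swingup"), ("cheetah", "run")],
    dir_names=["cartpole_swingup", "cheetah_run"],
)]
variants, log_dirs = make_variants(*variant_levels)
run_experiments(
    script="train_dmcontrol_sac.py",  # hypothetical path to the script above
    affinity_code=affinity_code,
    experiment_title="sac_dmcontrol",
    runs_per_setting=1,
    variants=variants,
    log_dirs=log_dirs,
    common_args=("sac_1M",),  # hypothetical config_key
)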
def build_and_train(
        slot_affinity_code="0slt_1gpu_1cpu",
        log_dir="test",
        run_ID="0",
        config_key="ppo_ul_16env",
        ):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    pprint.pprint(config)
    sampler = SerialSampler(
        EnvCls=AtariEnv84,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        TrajInfoCls=AtariTrajInfo,
        eval_env_kwargs=config["env"],  # Same args!
        **config["sampler"],
    )
    algo = PpoUl(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariPgAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"],
    )
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(cfg, game="ftwc", run_ID=0):
    # GVS NOTE: for ftwc/qait, consider CpuWaitResetCollector (or CpuResetCollector).
    sampler = SerialSampler(
        EnvCls=AtariEnv,
        TrajInfoCls=AtariTrajInfo,  # default traj info + GameScore
        env_kwargs=dict(game=game),
        eval_env_kwargs=dict(game=game),
        batch_T=4,  # Four time-steps per sampler iteration.
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e2),
        eval_max_trajectories=5,
    )
    algo = DQN(min_steps_learn=1e2)  # Run with defaults.
    agent = AtariDqnAgent()
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cfg.cuda_idx),
    )
    config = dict(game=game)
    name = "dqn_" + game
    log_dir = "ftwc"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    # config["eval_env"]["id"] = config["env"]["id"]
    sampler = SerialSampler(
        EnvCls=gym_make,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        eval_env_kwargs=config["env"],
        **config["sampler"],
    )
    algo = SAC(optim_kwargs=config["optim"], **config["algo"])
    agent = SacAgent(**config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"],
    )
    name = "sac_" + config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(
        slot_affinity_code="0slt_1gpu_1cpu",
        log_dir="test",
        run_ID="0",
        config_key="sac_with_ul",
        ):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    pprint.pprint(config)
    sampler = SerialSampler(
        EnvCls=make,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        # TrajInfoCls=AtariTrajInfo,
        eval_env_kwargs=config["env"],  # Same args!
        **config["sampler"],
    )
    algo = SacWithUl(**config["algo"])
    agent = SacAgent(
        conv_kwargs=config["conv"],
        fc1_kwargs=config["fc1"],
        pi_model_kwargs=config["pi_model"],
        q_model_kwargs=config["q_model"],
        **config["agent"],
    )
    runner = MinibatchRlEvalEnvStep(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        frame_skip=config["env"]["frame_skip"],
        **config["runner"],
    )
    name = config["env"]["domain_name"] + "_" + config["env"]["task_name"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(env_id="Hopper-v3", run_ID=0, cuda_idx=None): env_args = dict(id=env_id) env_args[RLPYT_WRAPPER_KEY] = [ClipActionsWrapper] sampler = SerialSampler( EnvCls=gym_make, env_kwargs=env_args, eval_env_kwargs=env_args, batch_T=1, # One time-step per sampler iteration. batch_B=1, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(51e3), eval_max_trajectories=50, ) algo = SAC() # Run with defaults. agent = SacAgent() runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=1e6, log_interval_steps=1e4, affinity=dict(cuda_idx=cuda_idx), ) config = dict(env_id=env_id) name = "sac_" + env_id log_dir = "example_2" with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(game="pong", run_ID=0, cuda_idx=None): sampler = SerialSampler( EnvCls=AtariEnv, TrajInfoCls=AtariTrajInfo, # default traj info + GameScore env_kwargs=dict(game=game), eval_env_kwargs=dict(game=game), batch_T=4, # Four time-steps per sampler iteration. batch_B=1, max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(10e3), eval_max_trajectories=5, ) algo = DQN(min_steps_learn=1e3) # Run with defaults. agent = AtariDqnAgent() runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=50e6, log_interval_steps=1e3, affinity=dict(cuda_idx=cuda_idx), ) config = dict(game=game) name = "dqn_" + game #log_dir = "example_1" log_dir = get_outputs_path() with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"): runner.train()
def build_and_train(level="nav_maze_random_goal_01", run_ID=0, cuda_idx=None):
    sampler = SerialSampler(
        EnvCls=DeepmindLabEnv,
        env_kwargs=dict(level=level),
        eval_env_kwargs=dict(level=level),
        batch_T=4,  # Four time-steps per sampler iteration.
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=5,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = DQN(min_steps_learn=1e3)  # Run with defaults.
    agent = AtariDqnAgent()
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=50e6,
        log_interval_steps=1e5,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(level=level)
    name = "lab_dqn"
    log_dir = "lab_example_1"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
def build_and_train(env_id="Hopper-v3", run_ID=0, cuda_idx=None): sampler = SerialSampler( EnvCls=gym_make, env_kwargs=dict(id=env_id), eval_env_kwargs=dict(id=env_id), batch_T=50, # One time-step per sampler iteration. batch_B=1, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=0, eval_n_envs=2, eval_max_steps=int(51e3), eval_max_trajectories=200, ) # The cost function for InvertedPendulumBulletEnv def obs_cost_fn(x): target = torch.FloatTensor([0,0,1,0,0]) c = (x - target)**2 c = -c.sum(dim=1) return -c.exp() algo = GP_Mlp(obs_cost_fn=obs_cost_fn) # Run with defaults. agent = GP_MlpAgent() runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=1e6, log_interval_steps=200, affinity=dict(cuda_idx=cuda_idx), ) config = dict(env_id=env_id) name = "gp_mlp_" + env_id log_dir = "example_1" with logger_context(log_dir, run_ID, name, config, snapshot_mode='last'): runner.train()
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('snapshot_dir', type=str)
    parser.add_argument('max_q_eval_mode', type=str)
    parser.add_argument('--n_rollouts', type=int, default=10)
    args = parser.parse_args()

    snapshot_file = join(args.snapshot_dir, 'params.pkl')
    config_file = join(args.snapshot_dir, 'params.json')

    params = torch.load(snapshot_file, map_location='cpu')
    with open(config_file, 'r') as f:
        config = json.load(f)
    config['sampler']['batch_B'] = 1
    config['sampler']['eval_n_envs'] = 1
    config['sampler']['eval_max_trajectories'] = args.n_rollouts
    config['env']['task_kwargs']['maxq'] = True

    itr, cum_steps = params['itr'], params['cum_steps']
    print(f'Loading experiment at itr {itr}, cum_steps {cum_steps}')
    agent_state_dict = params['agent_state_dict']

    sac_agent_module = 'rlpyt.agents.qpg.{}'.format(config['sac_agent_module'])
    sac_agent_module = importlib.import_module(sac_agent_module)
    SacAgent = sac_agent_module.SacAgent

    agent = SacAgent(max_q_eval_mode=args.max_q_eval_mode, **config["agent"])
    sampler = SerialSampler(
        EnvCls=DMControlEnv,
        env_kwargs=config["env"],
        eval_env_kwargs=config["env"],
        **config["sampler"],
    )
    sampler.initialize(agent)
    agent.load_state_dict(agent_state_dict)
    agent.to_device(cuda_idx=0)
    agent.eval_mode(0)

    traj_infos = sampler.evaluate_agent(0)
    returns = [traj_info.Return for traj_info in traj_infos]
    lengths = [traj_info.Length for traj_info in traj_infos]
    print('Returns', returns)
    print(f'Average Return {np.mean(returns)}, Average Length {np.mean(lengths)}')
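# Example invocation of the loader above (the script name and snapshot path
# are placeholders; max_q_eval_mode takes whatever modes the agent supports):
#   python eval_snapshot.py data/local/<timestamp>/run_0 <max_q_eval_mode> --n_rollouts 20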
def build_and_train(log_dir, game="cartpole_balance", run_ID=0, cuda_idx=None,
                    eval=False, save_model='last', load_model_path=None):
    params = torch.load(load_model_path) if load_model_path else {}
    agent_state_dict = params.get('agent_state_dict')
    optimizer_state_dict = params.get('optimizer_state_dict')

    action_repeat = 2
    factory_method = make_wapper(  # (sic) helper name as defined in this codebase.
        DeepMindControl,
        [ActionRepeat, NormalizeActions, TimeLimit],
        [dict(amount=action_repeat), dict(), dict(duration=1000 / action_repeat)])
    sampler = SerialSampler(
        EnvCls=factory_method,
        TrajInfoCls=TrajInfo,
        env_kwargs=dict(name=game, use_state=args.state),  # `args` is assumed defined at module level.
        eval_env_kwargs=dict(name=game, use_state=args.state),
        batch_T=1,
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = Dreamer(initial_optim_state_dict=optimizer_state_dict)  # Run with defaults.
    agent = DMCDreamerAgent(train_noise=0.3, eval_noise=0, expl_type="additive_gaussian",
                            expl_min=None, expl_decay=None,
                            initial_model_state_dict=agent_state_dict)
    runner_cls = MinibatchRlEval if eval else MinibatchRl
    runner = runner_cls(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=5e6,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(game=game)
    name = "dreamer_" + game
    with logger_context(log_dir, run_ID, name, config, snapshot_mode=save_model,
                        override_prefix=True, use_summary_writer=True):
        runner.train()
def build_and_train(game="fruitbot", run_ID=0, cuda_idx=None, n_parallel=6): affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)), alternating=True) env_args = dict(game=game, start_level=0, num_levels=1) # sampler = AlternatingSampler( # EnvCls=ProcgenEnv, # env_kwargs=env_args, # eval_env_kwargs=env_args, # batch_T=256, # One time-step per sampler iteration. # batch_B=12, # One environment (i.e. sampler Batch dimension). # max_decorrelation_steps=100, # # eval_n_envs=5, # # eval_max_steps=int(25e3), # # eval_max_trajectories=30 # ) # sampler = GpuSampler( # EnvCls=ProcgenEnv, # env_kwargs=env_args, # eval_env_kwargs=env_args, # batch_T=256, # One time-step per sampler iteration. # batch_B=12, # One environment (i.e. sampler Batch dimension). # max_decorrelation_steps=100, # # eval_n_envs=5, # # eval_max_steps=int(25e3), # # eval_max_trajectories=30 # ) # sampler = SerialSampler( EnvCls=ProcgenEnv, env_kwargs=env_args, eval_env_kwargs=env_args, batch_T=256, # One time-step per sampler iteration. batch_B=8, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=0, # eval_n_envs=2, # eval_max_steps=int(51e2), # eval_max_trajectories=5, ) algo = PPOC(clip_vf_loss=False, normalize_rewards=None) # Run with defaults. agent = Agent(model_kwargs={'option_size': 2}) runner = MinibatchRl( algo=algo, agent=agent, sampler=sampler, n_steps=1e6, log_interval_steps=1e3, affinity=affinity, # transfer=True, # transfer_iter=150, # log_traj_window=10 ) config = dict(game=game) name = "ppo_" + game log_dir = "example_2a_fruitbot" with logger_context(log_dir, run_ID, name, config): runner.train()
def estimateForState(s):
    cpus = list(range(C.N_PARALLEL))
    affinity = dict(cuda_idx=C.CUDA_IDX, workers_cpus=cpus)
    agent_ = CategoricalPgAgent(
        AcrobotNet, initial_model_state_dict=agent.state_dict())
    sampler = SerialSampler(
        EnvCls=rlpyt_make,
        env_kwargs=dict(id=C.ENV, reward=rewardFn,
                        internalStateFn=C.INTERNAL_STATE_FN, s0=s),
        batch_T=C.HORIZON,
        batch_B=C.BATCH_B,
        max_decorrelation_steps=0,
    )
    sampler.initialize(agent=agent_, affinity=affinity, seed=C.SEED)
    _, traj_info = sampler.obtain_samples(0)
    returns = [t['DiscountedReturn'] for t in traj_info]
    return np.mean(returns)
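# A hedged usage sketch: estimateForState relies on module-level names
# (C, agent, rewardFn, AcrobotNet) defined elsewhere in this script.
# Assuming those exist, the per-state estimates can be averaged over a
# batch of candidate start states:
def estimateForStates(states):
    """Mean discounted-return estimate over an iterable of start states."""
    return np.mean([estimateForState(s) for s in states])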
def build_and_train(game="pong", run_ID=0, cuda_idx=None, eval=False): action_repeat = 2 env_kwargs = dict( name=game, action_repeat=action_repeat, size=(64, 64), grayscale=False, life_done=True, sticky_actions=True, ) factory_method = make_wapper( AtariEnv, [OneHotAction, TimeLimit], [dict(), dict(duration=1000 / action_repeat)]) sampler = SerialSampler( EnvCls=factory_method, TrajInfoCls=AtariTrajInfo, # default traj info + GameScore env_kwargs=env_kwargs, eval_env_kwargs=env_kwargs, batch_T=1, batch_B=1, max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(10e3), eval_max_trajectories=5, ) algo = Dreamer( batch_size=1, batch_length=5, train_every=10, train_steps=2, prefill=10, horizon=5, replay_size=100, log_video=False, kl_scale=0.1, use_pcont=True, ) agent = AtariDreamerAgent(train_noise=0.4, eval_noise=0, expl_type="epsilon_greedy", expl_min=0.1, expl_decay=2000 / 0.3, model_kwargs=dict(use_pcont=True)) runner_cls = MinibatchRlEval if eval else MinibatchRl runner = runner_cls( algo=algo, agent=agent, sampler=sampler, n_steps=20, log_interval_steps=10, affinity=dict(cuda_idx=cuda_idx), ) runner.train()
def build_and_train(game="montezuma_revenge", run_ID=0, cuda_idx=None, n_parallel=6): affinity = dict(cuda_idx=cuda_idx, workers_cpus=list(range(n_parallel)), alternating=True) env_args = dict(id=game) # env_args[RLPYT_WRAPPER_KEY] = [ClipActionsWrapper] # sampler = AlternatingSampler( # EnvCls=AtariEnv, # TrajInfoCls=AtariTrajInfo, # env_kwargs=dict(game=game), # batch_T=64, # One time-step per sampler iteration. # batch_B=36, # One environment (i.e. sampler Batch dimension). # max_decorrelation_steps=1000, # # eval_n_envs=5, # # eval_max_steps=int(25e3), # # eval_max_trajectories=30 # ) # sampler = SerialSampler( EnvCls=AtariEnv, TrajInfoCls=AtariTrajInfo, env_kwargs=dict(game=game), batch_T=256, # One time-step per sampler iteration. batch_B=8, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=1000, # eval_n_envs=2, # eval_max_steps=int(51e2), # eval_max_trajectories=5, ) # algo = PPO(clip_vf_loss=False, normalize_rewards=None) # Run with defaults. algo = A2OC(normalize_rewards=None) agent = AtariOcAgent(model_kwargs={'option_size': 4}) runner = MinibatchRl( algo=algo, agent=agent, sampler=sampler, n_steps=1e6, log_interval_steps=1e3, affinity=affinity, # transfer=True, # transfer_iter=150, # log_traj_window=10 ) config = dict(game=game) name = "ppo_" + game log_dir = "example_2a_atari" with logger_context(log_dir, run_ID, name, config): runner.train()
def build_and_train(
        slot_affinity_code="0slt_1gpu_1cpu",
        log_dir="test",
        run_ID="0",
        config_key="serial_radsac",
        experiment_title="exp",
        ):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    # Hack so that the first part of the log_dir matches the source of the model.
    model_base_dir = config["pretrain"]["model_dir"]
    if model_base_dir is not None:
        raw_log_dir = log_dir.split(experiment_title)[-1].lstrip("/")  # get rid of ~/GitRepos/adam/rlpyt/data/local/<timestamp>/
        model_sub_dir = raw_log_dir.split("/RlFromUl/")[0]  # keep the UL part, which comes first
        pretrain_ID = config["pretrain"]["run_ID"]
        config["agent"]["state_dict_filename"] = osp.join(
            model_base_dir, model_sub_dir, f"run_{pretrain_ID}/params.pkl")
    pprint.pprint(config)
    sampler = SerialSampler(
        EnvCls=make,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        eval_env_kwargs=config["env"],  # Same args!
        **config["sampler"],
    )
    algo = RadSacFromUl(**config["algo"])
    agent = SacAgent(
        conv_kwargs=config["conv"],
        fc1_kwargs=config["fc1"],
        pi_model_kwargs=config["pi_model"],
        q_model_kwargs=config["q_model"],
        **config["agent"],
    )
    runner = MinibatchRlEvalEnvStep(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        frame_skip=config["env"]["frame_skip"],
        **config["runner"],
    )
    name = config["env"]["domain_name"] + "_" + config["env"]["task_name"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(
        slot_affinity_code="0slt_0gpu_4cpu_4cpr",
        log_dir="test",
        run_ID="0",
        config_key="sac_ul_compress",
        ):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    # variant = load_variant(log_dir)
    # config = update_config(config, variant)
    config["algo"]["min_steps_rl"] = 100
    config["algo"]["min_steps_ul"] = 150
    config["algo"]["replay_size"] = 1e4
    config["algo"]["batch_size"] = 64
    config["algo"]["ul_batch_size"] = 32
    config["runner"]["n_steps"] = 1e3
    config["runner"]["log_interval_steps"] = 1e2
    config["sampler"]["eval_n_envs"] = 1
    config["sampler"]["eval_max_steps"] = 500
    config["algo"]["stop_rl_conv_grad"] = True
    config["algo"]["ul_update_schedule"] = "cosine_8"
    pprint.pprint(config)
    sampler = SerialSampler(
        EnvCls=make,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        # TrajInfoCls=AtariTrajInfo,
        eval_env_kwargs=config["env"],  # Same args!
        **config["sampler"],
    )
    algo = SacUl(**config["algo"])
    agent = SacWithUlAgent(
        conv_kwargs=config["conv"],
        fc1_kwargs=config["fc1"],
        pi_model_kwargs=config["pi_model"],
        q_model_kwargs=config["q_model"],
        **config["agent"],
    )
    runner = MinibatchRlEvalEnvStep(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        frame_skip=config["env"]["frame_skip"],
        **config["runner"],
    )
    name = config["env"]["domain_name"] + "_" + config["env"]["task_name"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(level="nav_maze_random_goal_01", run_ID=0, cuda_idx=None):
    config = configs['r2d1']
    config['eval_env'] = dict(level=level)
    config['env'] = dict(level=level)
    affinity = make_affinity(
        run_slot=0,
        n_cpu_core=4,  # Use 4 CPU cores for this run.
        n_gpu=1,  # Use 1 GPU for this run.
        hyperthread_offset=6,  # If machine has 6 physical cores (12 hyperthreads).
        n_socket=2,  # Presume CPU socket affinity to lower/upper half GPUs.
        gpu_per_run=1,  # How many GPUs to parallelize one run across.
    )
    # sampler = GpuSampler(
    #     EnvCls=DeepmindLabEnv,
    #     env_kwargs=config['env'],
    #     eval_env_kwargs=config['eval_env'],
    #     CollectorCls=GpuWaitResetCollector,
    #     TrajInfoCls=LabTrajInfo,
    #     **config["sampler"]
    # )
    sampler = SerialSampler(
        EnvCls=DeepmindLabEnv,
        env_kwargs=config['env'],
        eval_env_kwargs=config['env'],
        batch_T=16,  # 16 time-steps per sampler iteration.
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = R2D1(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariR2d1Agent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRlEval(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"],
    )
    name = "lab_dqn_" + level
    log_dir = "lab_example_2"
    with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"):
        runner.train()
def build_and_train(log_dir, level="Level_GoToLocalAvoidLava", run_ID=0, cuda_idx=None,
                    eval=False, save_model='last', load_model_path=None):
    params = torch.load(load_model_path) if load_model_path else {}
    agent_state_dict = params.get('agent_state_dict')
    optimizer_state_dict = params.get('optimizer_state_dict')

    env_kwargs = dict(
        level=level,
        slipperiness=0.0,
        one_hot_obs=True,
    )
    factory_method = make_wapper(
        Minigrid,
        [OneHotAction, TimeLimit],
        [dict(), dict(duration=64)])
    sampler = SerialSampler(
        EnvCls=factory_method,
        TrajInfoCls=TrajInfo,
        env_kwargs=env_kwargs,
        eval_env_kwargs=env_kwargs,
        batch_T=1,
        batch_B=1,
        max_decorrelation_steps=0,
        eval_n_envs=10,
        eval_max_steps=int(10e3),
        eval_max_trajectories=5,
    )
    algo = Dreamer(horizon=10, kl_scale=0.1, use_pcont=True,
                   initial_optim_state_dict=optimizer_state_dict,
                   env=OneHotAction(TimeLimit(Minigrid(**env_kwargs), 64)),
                   save_env_videos=True)
    agent = MinigridDreamerAgent(train_noise=0.4, eval_noise=0, expl_type="epsilon_greedy",
                                 expl_min=0.1, expl_decay=2000 / 0.3,
                                 initial_model_state_dict=agent_state_dict,
                                 model_kwargs=dict(use_pcont=True, stride=1, shape=(20, 7, 7),
                                                   depth=1, padding=2, full_conv=False))
    runner_cls = MinibatchRlEval if eval else MinibatchRl
    runner = runner_cls(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=5e6,
        log_interval_steps=1e3,
        affinity=dict(cuda_idx=cuda_idx),
    )
    config = dict(level=level)
    name = "dreamer_" + level
    with logger_context(log_dir, run_ID, name, config, snapshot_mode=save_model,
                        override_prefix=True, use_summary_writer=True):
        runner.train()
def build_and_train(slot_affinity_code="0slt_0gpu_4cpu_4cpr", log_dir="test",
                    run_ID="0", config_key="ppo_ul_16env"):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    # variant = load_variant(log_dir)
    # config = update_config(config, variant)
    # config["sampler"]["batch_B"] = 4
    # config["sampler"]["batch_T"] = 5
    # config["runner"]["log_interval_steps"] = 100
    # config["runner"]["n_steps"] = 1000
    config["algo"]["ul_update_schedule"] = "constant_1"
    config["algo"]["min_steps_rl"] = 1e3
    config["algo"]["min_steps_ul"] = 200
    config["algo"]["max_steps_ul"] = 20e6
    config["model"]["stop_conv_grad"] = True
    config["sampler"]["max_decorrelation_steps"] = 0
    config["sampler"]["batch_B"] = 3
    config["sampler"]["batch_T"] = 20
    config["algo"]["ul_pri_alpha"] = 1.
    config["algo"]["ul_pri_n_step_return"] = 10
    config["algo"]["ul_replay_size"] = 900
    pprint.pprint(config)
    sampler = SerialSampler(
        EnvCls=AtariEnv84,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        TrajInfoCls=AtariTrajInfo,
        eval_env_kwargs=config["env"],  # Same args!
        **config["sampler"],
    )
    algo = PpoUl(optim_kwargs=config["optim"], **config["algo"])
    agent = AtariPgRlWithUlAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"],
    )
    name = config["env"]["game"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train(game="pong", run_ID=0, cuda_idx=0, args=None): np.random.seed(args.seed) torch.manual_seed(args.seed) env = AtariEnv config = set_config(args, game) sampler = SerialSampler( EnvCls=env, TrajInfoCls=AtariTrajInfo, # default traj info + GameScore env_kwargs=config["env"], eval_env_kwargs=config["eval_env"], batch_T=config['sampler']['batch_T'], batch_B=config['sampler']['batch_B'], max_decorrelation_steps=0, eval_CollectorCls=OneToOneSerialEvalCollector, eval_n_envs=config["sampler"]["eval_n_envs"], eval_max_steps=config['sampler']['eval_max_steps'], eval_max_trajectories=config["sampler"]["eval_max_trajectories"], ) args.discount = config["algo"]["discount"] algo = SPRCategoricalDQN(optim_kwargs=config["optim"], jumps=args.jumps, **config["algo"]) # Run with defaults. agent = SPRAgent(ModelCls=SPRCatDqnModel, model_kwargs=config["model"], **config["agent"]) wandb.config.update(config) runner = MinibatchRlEvalWandb(algo=algo, agent=agent, sampler=sampler, n_steps=args.n_steps, affinity=dict(cuda_idx=cuda_idx), log_interval_steps=args.n_steps // args.num_logs, seed=args.seed, final_eval_only=args.final_eval_only, skip_init_eval=args.skip_init_eval) config = dict(game=game) name = "dqn_" + game log_dir = "logs" with logger_context(log_dir, run_ID, name, config, snapshot_mode="last"): runner.train() return None
def build_and_train(log_dir, game="traffic", run_ID=0, cuda_idx=None, eval=False, save_model='last', load_model_path=None, action_repeat=1, **kwargs): params = torch.load(load_model_path) if load_model_path else {} agent_state_dict = params.get('agent_state_dict') optimizer_state_dict = params.get('optimizer_state_dict') env_kwargs = dict( name=game, render=False, **kwargs ) factory_method = make_wapper( TrafficEnv, [ActionRepeat, OneHotAction, TimeLimit], [dict(amount=action_repeat), dict(), dict(duration=1000 / action_repeat)]) sampler = SerialSampler( EnvCls=factory_method, TrajInfoCls=AtariTrajInfo, # default traj info + GameScore env_kwargs=env_kwargs, eval_env_kwargs=env_kwargs, batch_T=1, batch_B=1, max_decorrelation_steps=0, eval_n_envs=10, eval_max_steps=int(10e3), eval_max_trajectories=5, ) algo = Dreamer(horizon=10, kl_scale=0.1, use_pcont=True, initial_optim_state_dict=optimizer_state_dict) agent = AtariDreamerAgent(train_noise=0.4, eval_noise=0, expl_type="epsilon_greedy", expl_min=0.1, expl_decay=2000 / 0.3, initial_model_state_dict=agent_state_dict, model_kwargs=dict(use_pcont=True)) runner_cls = MinibatchRlEval if eval else MinibatchRl runner = runner_cls( algo=algo, agent=agent, sampler=sampler, n_steps=5e6, log_interval_steps=1e3, affinity=dict(cuda_idx=cuda_idx), ) config = dict(game=game) name = "dreamer_" + game with logger_context(log_dir, run_ID, name, config, snapshot_mode=save_model, override_prefix=True, use_summary_writer=True): runner.train()
def build_and_train(env_id="Cassie-v0", run_ID=0, cuda_idx=None, snapshot_file=None): if snapshot_file is None: initial_optim_state_dict = None initial_model_state_dict = None else: snapshot = torch.load(snapshot_file) initial_optim_state_dict=snapshot['optimizer_state_dict'] initial_model_state_dict=snapshot['agent_state_dict'] sampler = SerialSampler( EnvCls=gym_make, env_kwargs=dict(id=env_id, xml_file=get_full_path('resources/cassie.xml')), eval_env_kwargs=dict(id=env_id, xml_file=get_full_path('resources/cassie.xml')), batch_T=1, # One time-step per sampler iteration. batch_B=1, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps=0, eval_n_envs=1, eval_max_steps=int(1000), eval_max_trajectories=50, # 50 ) algo = SAC( initial_optim_state_dict=initial_optim_state_dict) agent = SacAgent( initial_model_state_dict=initial_model_state_dict) runner = MinibatchRlEval( algo=algo, agent=agent, sampler=sampler, n_steps=1e6, log_interval_steps=5e4, #5e4 affinity=dict(cuda_idx=cuda_idx), ) other_param = dict( env_id=env_id, forward_reward_weight=0, shift_cost=True, cum_steps='1M') name = "sac_" + env_id log_dir = "Cassie_stand" with logger_context(log_dir, run_ID, name, other_param, snapshot_mode='last', use_summary_writer=True): runner.train()