def record_test(directory, video_path, n):
    make_Dirs(video_path + n + '/')
    env = gym_make(ENV_ID)
    env = wrappers.Monitor(env, video_path + n + '/',
                           video_callable=lambda episode_id: True, force=True)
    env.seed(int(n) * 7)
    np.random.seed(int(n) * 7)
    torch.manual_seed(int(n) * 7)
    agent = MujocoFfAgent()
    agent.initialize(env.spaces)
    network_state_dict = None
    try:
        network_state_dict = torch.load(directory + 'agent_model.pth')
    except FileNotFoundError:
        print("No data found for the PPO agent (no existing model).")
        return
    if network_state_dict is not None:
        agent.load_state_dict(network_state_dict)
    else:
        return
    agent.to_device(0)
    frame_idx = 0
    print("Start test episode for {}".format(n))
    done = False

    # Interaction
    step = 0
    state = env.reset()
    prev_action = env.action_space.sample()
    prev_reward = 0.
    while not done:  # or step < MAX_STEPS:
        env.render()
        state = torch.FloatTensor(state)
        prev_action = torch.FloatTensor(prev_action)
        prev_reward = torch.FloatTensor([prev_reward])
        # agent.eval_mode(step)  # Deterministic distribution; the std is ignored.
        action = agent.step(state, prev_action, prev_reward).action
        action = action.detach().cpu().numpy()
        next_state, reward, done, _ = env.step(action)
        state = next_state
        prev_action = action
        prev_reward = reward
        frame_idx += 1
        step += 1
    env.close()
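# A minimal sketch of how record_test might be invoked, assuming ENV_ID and
# the make_Dirs helper are defined elsewhere in the script and that
# agent_model.pth was written by a prior training run. The paths below are
# hypothetical placeholders, not taken from the example above.
EXP_DIR = "./data/local/ppo_run/"   # directory containing agent_model.pth (assumed)
VIDEO_DIR = "./videos/"             # where wrappers.Monitor writes recordings (assumed)

for n in ["0", "1", "2"]:           # each index seeds env/numpy/torch differently
    record_test(EXP_DIR, VIDEO_DIR, n)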
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    sampler = CpuSampler(
        EnvCls=gym_make,
        env_kwargs=config["env"],
        CollectorCls=CpuResetCollector,
        **config["sampler"]
    )
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = MujocoFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        seed=int(run_ID) * 1000,
        **config["runner"]
    )
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
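# In rlpyt's experiment-script convention, a build_and_train with this
# signature is invoked by the experiment launcher through command-line
# arguments. A sketch of the usual entry point at the bottom of such a
# script, assuming it is run as:
#   python <script>.py <slot_affinity_code> <log_dir> <run_ID> <config_key>
if __name__ == "__main__":
    import sys
    build_and_train(*sys.argv[1:])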
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    if slot_affinity_code == 'None':
        # affinity = affinity_from_code(run_slot_affinity_code)
        slot_affinity_code = prepend_run_slot(0, affinity_code)
        affinity = affinity_from_code(slot_affinity_code)
    else:
        affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    # Load the variant of the experiment (there may not be a variant, though).
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    sampler = CpuSampler(
        EnvCls=make_env,
        env_kwargs={},
        CollectorCls=CpuResetCollector,
        **config["sampler"]
    )
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = MujocoFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = config["env"]["id"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
def build_and_train():
    p = psutil.Process()
    cpus = p.cpu_affinity()
    affinity = dict(
        cuda_idx=None,
        master_cpus=cpus,
        workers_cpus=list([x] for x in cpus),
        set_affinity=True,
    )
    sampler = CpuSampler(
        EnvCls=_make_env,
        env_kwargs=dict(rank=0),
        max_decorrelation_steps=0,
        batch_T=6000,
        batch_B=len(cpus),  # One parallel environment per available CPU core.
    )
    model_kwargs = dict(model_kwargs=dict(hidden_sizes=[256, 256]))
    ppo_config = {
        "discount": 0.98,
        "entropy_loss_coeff": 0.01,
        "learning_rate": 0.00025,
        "value_loss_coeff": 0.5,
        "clip_grad_norm": 0.5,
        "minibatches": 40,
        "gae_lambda": 0.95,
        "ratio_clip": 0.2,
        "epochs": 4,
    }
    algo = PPO(**ppo_config)
    agent = MujocoFfAgent(**model_kwargs)
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        n_steps=int(60e6),
        log_interval_steps=int(1e6),
        affinity=affinity,
    )
    config = dict(rank=0, env_id='picking')
    name = "ppo_rlpyt_pushing"
    log_dir = os.path.join(os.path.dirname(__file__), name)
    with logger_context(log_dir, 0, name, config,
                        use_summary_writer=True, snapshot_mode='all'):
        runner.train()
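# Instead of assembling the affinity dict by hand from psutil, rlpyt's
# make_affinity helper (used as-is in a later example below) can build an
# equivalent structure. A minimal sketch, assuming an 8-core, CPU-only
# machine; adjust n_cpu_core to the actual core count:
from rlpyt.utils.launching.affinity import make_affinity

affinity = make_affinity(
    run_slot=0,
    n_cpu_core=8,  # assumed core count
    n_gpu=0,       # CPU-only sampling, as in the psutil version above
)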
def build_and_train(slot_affinity_code, log_dir, run_ID, config_key):
    affinity = affinity_from_code(slot_affinity_code)
    config = configs[config_key]
    variant = load_variant(log_dir)
    config = update_config(config, variant)
    env = IsaacGymEnv(config['env']['task'])  # Make the environment.
    import torch.nn as nn
    # Replace the activation's string name with the actual class.
    config["model"]["hidden_nonlinearity"] = getattr(
        nn, config["model"]["hidden_nonlinearity"])
    sampler = IsaacSampler(env, **config["sampler"])
    algo = PPO(optim_kwargs=config["optim"], **config["algo"])
    agent = MujocoFfAgent(model_kwargs=config["model"], **config["agent"])
    runner = MinibatchRl(
        algo=algo,
        agent=agent,
        sampler=sampler,
        affinity=affinity,
        **config["runner"]
    )
    name = "ppo_nv_" + config["env"]["task"]
    with logger_context(log_dir, run_ID, name, config):
        runner.train()
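# The getattr(nn, ...) line above resolves an activation stored as a string
# (experiment variants are serialized to JSON, which cannot hold classes)
# back into the torch.nn class. A standalone sketch of that conversion, with
# a hypothetical config value:
import torch.nn as nn

config = {"model": {"hidden_nonlinearity": "Tanh"}}  # hypothetical variant entry
config["model"]["hidden_nonlinearity"] = getattr(nn, config["model"]["hidden_nonlinearity"])
assert config["model"]["hidden_nonlinearity"] is nn.Tanh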
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--path', help='path to params.pkl',
                    default='/home/alex/parkour-learning/data/params.pkl')
parser.add_argument(
    '--env', default='HumanoidPrimitivePretraining-v0',
    choices=['HumanoidPrimitivePretraining-v0', 'TrackEnv-v0'])
parser.add_argument('--algo', default='ppo', choices=['sac', 'ppo'])
args = parser.parse_args()
snapshot = torch.load(args.path, map_location=torch.device('cpu'))
agent_state_dict = snapshot['agent_state_dict']
env = GymEnvWrapper(gym.make(args.env, render=True))
if args.algo == 'ppo':
    if args.env == 'TrackEnv-v0':
        agent = MujocoFfAgent(ModelCls=PpoMcpVisionModel)
    else:
        agent = MujocoFfAgent(ModelCls=PPOMcpModel)
else:
    if args.env == 'TrackEnv-v0':
        agent = SacAgent(ModelCls=PiVisionModel, QModelCls=QofMuVisionModel)
    else:
        agent = SacAgent(ModelCls=PiMCPModel, QModelCls=QofMCPModel)
agent.initialize(env_spaces=env.spaces)
agent.load_state_dict(agent_state_dict)
agent.eval_mode(0)
simulate_policy(env, agent)
def start_experiment(args):
    args_json = json.dumps(vars(args), indent=4)
    if not os.path.isdir(args.log_dir):
        os.makedirs(args.log_dir)
    with open(args.log_dir + '/arguments.json', 'w') as jsonfile:
        jsonfile.write(args_json)
    with open(args.log_dir + '/git.txt', 'w') as git_file:
        branch = subprocess.check_output(
            ['git', 'rev-parse', '--abbrev-ref', 'HEAD']).strip().decode('utf-8')
        commit = subprocess.check_output(
            ['git', 'rev-parse', 'HEAD']).strip().decode('utf-8')
        git_file.write('{}/{}'.format(branch, commit))

    config = dict(env_id=args.env)

    if args.sample_mode == 'gpu':
        # affinity = dict(num_gpus=args.num_gpus, workers_cpus=list(range(args.num_cpus)))
        if args.num_gpus > 0:
            affinity = make_affinity(
                run_slot=0,
                n_cpu_core=args.num_cpus,
                n_gpu=args.num_gpus,
                # contexts_per_gpu=2,
                # hyperthread_offset=72,  # If machine has 24 cores.
                # n_socket=2,  # Presume CPU socket affinity to lower/upper half GPUs.
                gpu_per_run=args.gpu_per_run,  # How many GPUs to parallelize one run across.
                # cpu_per_run=1,
            )
            print('Make multi-gpu affinity')
        else:
            affinity = dict(cuda_idx=0, workers_cpus=list(range(args.num_cpus)))
            os.environ['CUDA_VISIBLE_DEVICES'] = str(0)
    else:
        affinity = dict(workers_cpus=list(range(args.num_cpus)))

    # Potentially reload models.
    initial_optim_state_dict = None
    initial_model_state_dict = None
    if args.pretrain != 'None':
        os.system(f"find {args.log_dir} -name '*.json' -delete")  # Clean up json files for the video recorder.
        checkpoint = torch.load(os.path.join(_RESULTS_DIR, args.pretrain, 'params.pkl'))
        initial_optim_state_dict = checkpoint['optimizer_state_dict']
        initial_model_state_dict = checkpoint['agent_state_dict']

    # ------------------------------- POLICY ------------------------------- #
    model_args = dict(curiosity_kwargs=dict(curiosity_alg=args.curiosity_alg),
                      curiosity_step_kwargs=dict())
    if args.curiosity_alg == 'icm':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
        model_args['curiosity_kwargs']['forward_model'] = args.forward_model
        model_args['curiosity_kwargs']['feature_space'] = args.feature_space
    elif args.curiosity_alg == 'micm':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
        model_args['curiosity_kwargs']['forward_model'] = args.forward_model
        model_args['curiosity_kwargs']['ensemble_mode'] = args.ensemble_mode
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    elif args.curiosity_alg == 'disagreement':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['ensemble_size'] = args.ensemble_size
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
        model_args['curiosity_kwargs']['device'] = args.sample_mode
        model_args['curiosity_kwargs']['forward_model'] = args.forward_model
    elif args.curiosity_alg == 'ndigo':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['pred_horizon'] = args.pred_horizon
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    elif args.curiosity_alg == 'rnd':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['drop_probability'] = args.drop_probability
        model_args['curiosity_kwargs']['gamma'] = args.discount
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    if args.curiosity_alg != 'none':
        model_args['curiosity_step_kwargs']['curiosity_step_minibatches'] = \
            args.curiosity_step_minibatches

    if args.env in _MUJOCO_ENVS:
        if args.lstm:
            agent = MujocoLstmAgent(initial_model_state_dict=initial_model_state_dict)
        else:
            agent = MujocoFfAgent(initial_model_state_dict=initial_model_state_dict)
    else:
        if args.lstm:
            agent = AtariLstmAgent(
                initial_model_state_dict=initial_model_state_dict,
                model_kwargs=model_args,
                no_extrinsic=args.no_extrinsic,
                dual_model=args.dual_model,
            )
        else:
            agent = AtariFfAgent(
                initial_model_state_dict=initial_model_state_dict,
                model_kwargs=model_args,
                no_extrinsic=args.no_extrinsic,
                dual_model=args.dual_model,
            )

    # ---------------------------- LEARNING ALG ---------------------------- #
    if args.alg == 'ppo':
        algo = PPO(
            discount=args.discount,
            learning_rate=args.lr,
            value_loss_coeff=args.v_loss_coeff,
            entropy_loss_coeff=args.entropy_loss_coeff,
            OptimCls=torch.optim.Adam,
            optim_kwargs=None,
            clip_grad_norm=args.grad_norm_bound,
            initial_optim_state_dict=initial_optim_state_dict,  # None if not reloading a checkpoint.
            gae_lambda=args.gae_lambda,
            minibatches=args.minibatches,  # If recurrent, batch_B must be at least this; otherwise batch_B * batch_T must be.
            epochs=args.epochs,
            ratio_clip=args.ratio_clip,
            linear_lr_schedule=args.linear_lr,
            normalize_advantage=args.normalize_advantage,
            normalize_reward=args.normalize_reward,
            curiosity_type=args.curiosity_alg,
            policy_loss_type=args.policy_loss_type,
        )
    elif args.alg == 'a2c':
        algo = A2C(
            discount=args.discount,
            learning_rate=args.lr,
            value_loss_coeff=args.v_loss_coeff,
            entropy_loss_coeff=args.entropy_loss_coeff,
            OptimCls=torch.optim.Adam,
            optim_kwargs=None,
            clip_grad_norm=args.grad_norm_bound,
            initial_optim_state_dict=initial_optim_state_dict,
            gae_lambda=args.gae_lambda,
            normalize_advantage=args.normalize_advantage,
        )

    # ------------------------------ SAMPLER ------------------------------ #
    # Environment setup.
    traj_info_cl = TrajInfo  # Environment-specific; potentially overridden below.
    if 'mario' in args.env.lower():
        env_cl = mario_make
        env_args = dict(
            game=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=args.normalize_obs,
            normalize_obs_steps=10000,
        )
    elif args.env in _PYCOLAB_ENVS:
        env_cl = deepmind_make
        traj_info_cl = PycolabTrajInfo
        env_args = dict(
            game=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=args.normalize_obs,
            normalize_obs_steps=10000,
            log_heatmaps=args.log_heatmaps,
            logdir=args.log_dir,
            obs_type=args.obs_type,
            grayscale=args.grayscale,
            max_steps_per_episode=args.max_episode_steps,
        )
    elif args.env in _MUJOCO_ENVS:
        env_cl = gym_make
        env_args = dict(
            id=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=False,
            normalize_obs_steps=10000,
        )
    elif args.env in _ATARI_ENVS:
        env_cl = AtariEnv
        traj_info_cl = AtariTrajInfo
        env_args = dict(
            game=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=args.normalize_obs,
            normalize_obs_steps=10000,
            downsampling_scheme='classical',
            record_freq=args.record_freq,
            record_dir=args.log_dir,
            horizon=args.max_episode_steps,
            score_multiplier=args.score_multiplier,
            repeat_action_probability=args.repeat_action_probability,
            fire_on_reset=args.fire_on_reset,
        )

    if args.sample_mode == 'gpu':
        if args.lstm:
            collector_class = GpuWaitResetCollector
        else:
            collector_class = GpuResetCollector
        sampler = GpuSampler(
            EnvCls=env_cl,
            env_kwargs=env_args,
            eval_env_kwargs=env_args,
            batch_T=args.timestep_limit,
            batch_B=args.num_envs,
            max_decorrelation_steps=0,
            TrajInfoCls=traj_info_cl,
            eval_n_envs=args.eval_envs,
            eval_max_steps=args.eval_max_steps,
            eval_max_trajectories=args.eval_max_traj,
            record_freq=args.record_freq,
            log_dir=args.log_dir,
            CollectorCls=collector_class,
        )
    else:
        if args.lstm:
            collector_class = CpuWaitResetCollector
        else:
            collector_class = CpuResetCollector
        sampler = CpuSampler(
            EnvCls=env_cl,
            env_kwargs=env_args,
            eval_env_kwargs=env_args,
            batch_T=args.timestep_limit,  # Timesteps in a trajectory episode.
            batch_B=args.num_envs,  # Environments distributed across workers.
            max_decorrelation_steps=0,
            TrajInfoCls=traj_info_cl,
            eval_n_envs=args.eval_envs,
            eval_max_steps=args.eval_max_steps,
            eval_max_trajectories=args.eval_max_traj,
            record_freq=args.record_freq,
            log_dir=args.log_dir,
            CollectorCls=collector_class,
        )

    # ------------------------------ RUNNER ------------------------------ #
    if args.eval_envs > 0:
        runner = (MinibatchRlEval if args.num_gpus <= 1 else SyncRlEval)(
            algo=algo,
            agent=agent,
            sampler=sampler,
            n_steps=args.iterations,
            affinity=affinity,
            log_interval_steps=args.log_interval,
            log_dir=args.log_dir,
            pretrain=args.pretrain,
        )
    else:
        runner = (MinibatchRl if args.num_gpus <= 1 else SyncRl)(
            algo=algo,
            agent=agent,
            sampler=sampler,
            n_steps=args.iterations,
            affinity=affinity,
            log_interval_steps=args.log_interval,
            log_dir=args.log_dir,
            pretrain=args.pretrain,
        )

    with logger_context(args.log_dir, config, snapshot_mode="last",
                        use_summary_writer=True):
        runner.train()
    'normalize_observation': False
}
PPO_kwargs = {
    'learning_rate': 3e-4,
    'clip_vf_loss': False,
    'entropy_loss_coeff': 0.,
    'discount': 0.99,
    'linear_lr_schedule': False,
    'epochs': 10,
    'clip_grad_norm': 2.,
    'minibatches': 2,
    'normalize_rewards': None,
    'value_loss_coeff': 2.
}
agent = MujocoFfAgent(model_kwargs=model_kwargs)
algo = PPO(**PPO_kwargs)
runner = MinibatchRl(
    algo=algo,
    agent=agent,
    sampler=sampler,
    n_steps=1e8,
    log_interval_steps=1e4,
    affinity=affinity,
    transfer=True,
    transfer_iter=transfer_iter,
    # log_traj_window=10
)
config = dict(task=task)
name = "ppo_nt_nv_" + task
log_dir = "example_2a"
def start_experiment(args):
    args_json = json.dumps(vars(args), indent=4)
    if not os.path.isdir(args.log_dir):
        os.makedirs(args.log_dir)
    with open(args.log_dir + '/arguments.json', 'w') as jsonfile:
        jsonfile.write(args_json)

    config = dict(env_id=args.env)

    if args.sample_mode == 'gpu':
        assert args.num_gpus > 0
        affinity = dict(cuda_idx=0, workers_cpus=list(range(args.num_cpus)))
        os.environ['CUDA_VISIBLE_DEVICES'] = str(0)
    else:
        affinity = dict(workers_cpus=list(range(args.num_cpus)))

    # Potentially reload models.
    initial_optim_state_dict = None
    initial_model_state_dict = None
    if args.pretrain != 'None':
        os.system(f"find {args.log_dir} -name '*.json' -delete")  # Clean up json files for the video recorder.
        checkpoint = torch.load(os.path.join(_RESULTS_DIR, args.pretrain, 'params.pkl'))
        initial_optim_state_dict = checkpoint['optimizer_state_dict']
        initial_model_state_dict = checkpoint['agent_state_dict']

    # ------------------------------- POLICY ------------------------------- #
    model_args = dict(curiosity_kwargs=dict(curiosity_alg=args.curiosity_alg))
    if args.curiosity_alg == 'icm':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
    elif args.curiosity_alg == 'disagreement':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['ensemble_size'] = args.ensemble_size
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['forward_loss_wt'] = args.forward_loss_wt
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    elif args.curiosity_alg == 'ndigo':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['pred_horizon'] = args.pred_horizon
        model_args['curiosity_kwargs']['batch_norm'] = args.batch_norm
        model_args['curiosity_kwargs']['num_predictors'] = args.num_predictors
        model_args['curiosity_kwargs']['device'] = args.sample_mode
    elif args.curiosity_alg == 'rnd':
        model_args['curiosity_kwargs']['feature_encoding'] = args.feature_encoding
        model_args['curiosity_kwargs']['prediction_beta'] = args.prediction_beta
        model_args['curiosity_kwargs']['drop_probability'] = args.drop_probability
        model_args['curiosity_kwargs']['gamma'] = args.discount
        model_args['curiosity_kwargs']['device'] = args.sample_mode

    if args.env in _MUJOCO_ENVS:
        if args.lstm:
            agent = MujocoLstmAgent(initial_model_state_dict=initial_model_state_dict)
        else:
            agent = MujocoFfAgent(initial_model_state_dict=initial_model_state_dict)
    else:
        if args.lstm:
            agent = AtariLstmAgent(
                initial_model_state_dict=initial_model_state_dict,
                model_kwargs=model_args,
                no_extrinsic=args.no_extrinsic,
            )
        else:
            agent = AtariFfAgent(initial_model_state_dict=initial_model_state_dict)

    # ---------------------------- LEARNING ALG ---------------------------- #
    if args.alg == 'ppo':
        if args.kernel_mu == 0.:
            kernel_params = None
        else:
            kernel_params = (args.kernel_mu, args.kernel_sigma)
        algo = PPO(
            discount=args.discount,
            learning_rate=args.lr,
            value_loss_coeff=args.v_loss_coeff,
            entropy_loss_coeff=args.entropy_loss_coeff,
            OptimCls=torch.optim.Adam,
            optim_kwargs=None,
            clip_grad_norm=args.grad_norm_bound,
            initial_optim_state_dict=initial_optim_state_dict,  # None if not reloading a checkpoint.
            gae_lambda=args.gae_lambda,
            minibatches=args.minibatches,  # If recurrent, batch_B must be at least this; otherwise batch_B * batch_T must be.
            epochs=args.epochs,
            ratio_clip=args.ratio_clip,
            linear_lr_schedule=args.linear_lr,
            normalize_advantage=args.normalize_advantage,
            normalize_reward=args.normalize_reward,
            kernel_params=kernel_params,
            curiosity_type=args.curiosity_alg,
        )
    elif args.alg == 'a2c':
        algo = A2C(
            discount=args.discount,
            learning_rate=args.lr,
            value_loss_coeff=args.v_loss_coeff,
            entropy_loss_coeff=args.entropy_loss_coeff,
            OptimCls=torch.optim.Adam,
            optim_kwargs=None,
            clip_grad_norm=args.grad_norm_bound,
            initial_optim_state_dict=initial_optim_state_dict,
            gae_lambda=args.gae_lambda,
            normalize_advantage=args.normalize_advantage,
        )

    # ------------------------------ SAMPLER ------------------------------ #
    # Environment setup.
    traj_info_cl = TrajInfo  # Environment-specific; potentially overridden below.
    if 'mario' in args.env.lower():
        env_cl = mario_make
        env_args = dict(
            game=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=args.normalize_obs,
            normalize_obs_steps=10000,
        )
    elif 'deepmind' in args.env.lower():  # Pycolab deepmind environments.
        env_cl = deepmind_make
        traj_info_cl = PycolabTrajInfo
        env_args = dict(
            game=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=args.normalize_obs,
            normalize_obs_steps=10000,
            log_heatmaps=args.log_heatmaps,
            logdir=args.log_dir,
            obs_type=args.obs_type,
            max_steps_per_episode=args.max_episode_steps,
        )
    elif args.env in _MUJOCO_ENVS:
        env_cl = gym_make
        env_args = dict(
            id=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=False,
            normalize_obs_steps=10000,
        )
    elif args.env in _ATARI_ENVS:
        env_cl = AtariEnv
        traj_info_cl = AtariTrajInfo
        env_args = dict(
            game=args.env,
            no_extrinsic=args.no_extrinsic,
            no_negative_reward=args.no_negative_reward,
            normalize_obs=args.normalize_obs,
            normalize_obs_steps=10000,
            downsampling_scheme='classical',
            record_freq=args.record_freq,
            record_dir=args.log_dir,
            horizon=args.max_episode_steps,
        )

    if args.sample_mode == 'gpu':
        if args.lstm:
            collector_class = GpuWaitResetCollector
        else:
            collector_class = GpuResetCollector
        sampler = GpuSampler(
            EnvCls=env_cl,
            env_kwargs=env_args,
            eval_env_kwargs=env_args,
            batch_T=args.timestep_limit,
            batch_B=args.num_envs,
            max_decorrelation_steps=0,
            TrajInfoCls=traj_info_cl,
            eval_n_envs=args.eval_envs,
            eval_max_steps=args.eval_max_steps,
            eval_max_trajectories=args.eval_max_traj,
            record_freq=args.record_freq,
            log_dir=args.log_dir,
            CollectorCls=collector_class,
        )
    else:
        if args.lstm:
            collector_class = CpuWaitResetCollector
        else:
            collector_class = CpuResetCollector
        sampler = CpuSampler(
            EnvCls=env_cl,
            env_kwargs=env_args,
            eval_env_kwargs=env_args,
            batch_T=args.timestep_limit,  # Timesteps in a trajectory episode.
            batch_B=args.num_envs,  # Environments distributed across workers.
            max_decorrelation_steps=0,
            TrajInfoCls=traj_info_cl,
            eval_n_envs=args.eval_envs,
            eval_max_steps=args.eval_max_steps,
            eval_max_trajectories=args.eval_max_traj,
            record_freq=args.record_freq,
            log_dir=args.log_dir,
            CollectorCls=collector_class,
        )

    # ------------------------------ RUNNER ------------------------------ #
    if args.eval_envs > 0:
        runner = MinibatchRlEval(
            algo=algo,
            agent=agent,
            sampler=sampler,
            n_steps=args.iterations,
            affinity=affinity,
            log_interval_steps=args.log_interval,
            log_dir=args.log_dir,
            pretrain=args.pretrain,
        )
    else:
        runner = MinibatchRl(
            algo=algo,
            agent=agent,
            sampler=sampler,
            n_steps=args.iterations,
            affinity=affinity,
            log_interval_steps=args.log_interval,
            log_dir=args.log_dir,
            pretrain=args.pretrain,
        )

    with logger_context(args.log_dir, config, snapshot_mode="last",
                        use_summary_writer=True):
        runner.train()
def build_and_train(env_id="Pendulum-v0", run_ID=0, cuda_idx=None, method="adam", trial=0): sampler = SerialSampler( EnvCls=gym_make, env_kwargs=dict(id=env_id), batch_T=HORIZON, # Time-step per sampler iteration, T. batch_B=1, # One environment (i.e. sampler Batch dimension). max_decorrelation_steps= 400 #(int): if taking random number of steps before start of training, to decorrelate batch states ) if method == "adam": algo = ALGO(learning_rate=LEARNING_RATE, value_loss_coeff=C1, entropy_loss_coeff=C2, gae_lambda=GAE_PARAM, minibatches=NUM_MINIBATCHES, epochs=NUM_EPOCHS, ratio_clip=CLIPPING, linear_lr_schedule=False) elif method == "tadam": algo = ALGO(learning_rate=TLEARNING_RATE, value_loss_coeff=C1, entropy_loss_coeff=C2, OptimCls=TAdam, gae_lambda=GAE_PARAM, minibatches=NUM_MINIBATCHES, epochs=NUM_EPOCHS, ratio_clip=CLIPPING, linear_lr_schedule=False) elif method == "amsgrad": algo = ALGO(learning_rate=LEARNING_RATE, value_loss_coeff=C1, entropy_loss_coeff=C2, optim_kwargs={'amsgrad': True}, gae_lambda=GAE_PARAM, minibatches=NUM_MINIBATCHES, epochs=NUM_EPOCHS, ratio_clip=CLIPPING, linear_lr_schedule=False) elif method == "tamsgrad": algo = ALGO(learning_rate=TLEARNING_RATE, value_loss_coeff=C1, entropy_loss_coeff=C2, OptimCls=TAdam, optim_kwargs={'amsgrad': True}, gae_lambda=GAE_PARAM, minibatches=NUM_MINIBATCHES, epochs=NUM_EPOCHS, ratio_clip=CLIPPING, linear_lr_schedule=False) agent = MujocoFfAgent() runner = MinibatchRl(algo=algo, agent=agent, sampler=sampler, n_steps=MAX_FRAMES, log_interval_steps=1e3, affinity=dict(cuda_idx=cuda_idx), seed=trial) config = dict(env_id=env_id) name = "ppo_" + env_id log_dir = method + "_" + str(trial) DATA_DIRECTORY = "/home/isc-lab/Documents/rlpyt/data/local/" + datetime.now( ).strftime("%Y%m%d") with logger_context(log_dir, run_ID, name, config): runner.train() log_dir = osp.join(log_dir, f"run_{run_ID}") exp_dir = osp.join(DATA_DIRECTORY, log_dir) ### Save Model torch.save(agent.state_dict(), exp_dir + 'agent_model.pth') return exp_dir