json.dump(vars(args), f) pprint(vars(args)) if not os.path.exists(os.path.join(args.log, 'models')): os.mkdir(os.path.join(args.log, 'models')) np.random.seed(args.seed) torch.manual_seed(args.seed) device_name = 'cpu' if args.cuda < 0 else "cuda:{}".format(args.cuda) device = torch.device(device_name) set_device(device) score_file = os.path.join(args.log, 'progress.csv') logger.add_tabular_output(score_file) logger.add_tensorboard_output(args.log) env = GymEnv( args.env_name, log_dir=os.path.join( args.log, 'movie'), record_video=args.record) env.env.seed(args.seed) if args.c2d: env = C2DEnv(env) observation_space = env.observation_space action_space = env.action_space # Generate teacher (t) policy and student (s) policy and load teacher policy
def _build_policy(observation_space, action_space):
    """Return a policy whose output distribution fits the action space type.

    Args:
        observation_space: Observation space of the environment.
        action_space: Action space of the environment. Must be an instance
            of ``gym.spaces.Box``, ``gym.spaces.Discrete`` or
            ``gym.spaces.MultiDiscrete``.

    Returns:
        A ``GaussianPol`` (Box), ``CategoricalPol`` (Discrete) or
        ``MultiCategoricalPol`` (MultiDiscrete) built on a new ``PolNet``.

    Raises:
        ValueError: If ``action_space`` is of any other type.
    """
    pol_net = PolNet(observation_space, action_space)
    # Feed-forward policy by default. For a recurrent policy, build
    # ``pol_net`` with PolNetLSTM instead and set ``rnn = True``.
    rnn = False
    if isinstance(action_space, gym.spaces.Box):
        return GaussianPol(observation_space, action_space, pol_net, rnn=rnn)
    if isinstance(action_space, gym.spaces.Discrete):
        return CategoricalPol(observation_space, action_space, pol_net)
    if isinstance(action_space, gym.spaces.MultiDiscrete):
        return MultiCategoricalPol(observation_space, action_space, pol_net)
    raise ValueError('Only Box, Discrete, and MultiDiscrete are supported')


def _save_states(states, model_dir, tag):
    """Save every entry of ``states`` to ``<model_dir>/<name>_<tag>.pkl``."""
    for name, state in states.items():
        torch.save(
            state, os.path.join(model_dir, '{}_{}.pkl'.format(name, tag)))


def main(args):
    """Run the distributed sample/train loop configured by ``args``.

    Sets up logging under ``args.log``, builds the environment, policy and
    value function, then alternates sampling episodes and training until
    ``args.max_epis`` episodes have been consumed. After every iteration
    the latest states are written to ``<args.log>/models/*_last.pkl``; when
    the mean episode return exceeds the best seen so far they are also
    written to ``*_max.pkl``.

    Args:
        args: Namespace of run options. The attributes read here are
            ``num_cpus``, ``num_gpus``, ``ray_redis_address``, ``log``,
            ``env_name``, ``seed``, ``c2d``, ``num_trainer``,
            ``master_address``, ``num_parallel``, ``max_epis`` and
            ``max_steps_per_iter``. The whole namespace is also dumped to
            ``args.json`` and forwarded to ``TrainManager``.

    Raises:
        ValueError: If the environment's action space is not Box, Discrete
            or MultiDiscrete.
    """
    init_ray(args.num_cpus, args.num_gpus, args.ray_redis_address)

    # ``exist_ok=True`` replaces the check-then-create pattern, which can
    # fail when several processes start with the same log directory.
    # Creating ``models`` also creates ``args.log`` itself.
    model_dir = os.path.join(args.log, 'models')
    os.makedirs(model_dir, exist_ok=True)

    score_file = os.path.join(args.log, 'progress.csv')
    logger.add_tabular_output(score_file)
    logger.add_tensorboard_output(args.log)

    # Keep a record of the exact options used for this run.
    with open(os.path.join(args.log, 'args.json'), 'w') as f:
        json.dump(vars(args), f)
    pprint(vars(args))

    # when doing the distributed training, disable video recordings
    env = GymEnv(args.env_name)
    env.env.seed(args.seed)
    if args.c2d:
        env = C2DEnv(env)

    observation_space = env.observation_space
    action_space = env.action_space

    pol = _build_policy(observation_space, action_space)

    vf_net = VNet(observation_space)
    vf = DeterministicSVfunc(observation_space, vf_net)

    trainer = TrainManager(
        Trainer, args.num_trainer, args.master_address,
        args=args, vf=vf, pol=pol)
    sampler = EpiSampler(env, pol, args.num_parallel, seed=args.seed)

    # Names accepted by ``trainer.get_state``; they double as the file name
    # prefixes of the saved checkpoints.
    state_names = ('pol', 'vf', 'optim_pol', 'optim_vf')

    total_epi = 0
    total_step = 0
    max_rew = -1e6
    start_time = time.time()

    while args.max_epis > total_epi:
        with measure('sample'):
            # Sample with the policy state currently held by the trainer.
            sampler.set_pol_state(trainer.get_state("pol"))
            epis = sampler.sample(max_steps=args.max_steps_per_iter)
        with measure('train'):
            result_dict = trainer.train(epis=epis)

        step = result_dict["traj_num_step"]
        total_step += step
        total_epi += result_dict["traj_num_epi"]

        rewards = [np.sum(epi['rews']) for epi in epis]
        mean_rew = np.mean(rewards)
        elapsed_time = time.time() - start_time
        logger.record_tabular('ElapsedTime', elapsed_time)
        logger.record_results(args.log, result_dict, score_file,
                              total_epi, step, total_step,
                              rewards,
                              plot_title=args.env_name)

        with measure('save'):
            states = {name: trainer.get_state(name) for name in state_names}
            _save_states(states, model_dir, 'last')
            if mean_rew > max_rew:
                # New best mean return: keep a separate copy of this state.
                _save_states(states, model_dir, 'max')
                max_rew = mean_rew

    # Drop the references explicitly once training is finished; presumably
    # this lets the sampler and trainer release their workers -- confirm
    # against their implementations.
    del sampler
    del trainer