def render_growspace_with_ddpg(): seed = 123 num_processes = 1 gamma = 0.99 log_dir = "." custom_gym = "growspace" recurrent_policy = False cuda = True device = torch.device("cuda:0" if cuda else "cpu") envs = make_vec_envs("GrowSpaceEnv-Continuous-v0", seed, num_processes, gamma, log_dir, device, False, custom_gym) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': recurrent_policy}) actor_critic.to(device) eval_recurrent_hidden_states = torch.zeros( num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(num_processes, 1, device=device) obs = envs.reset() while True: with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) obs, rewards, dones, info = envs.step(action) envs.render()
def initialize_warm_up_batch(args, device): # using evenly distributed weights for warm-up stage weights_batch = [] generate_weights_batch_dfs(0, args.obj_num, args.min_weight, args.max_weight, args.delta_weight, [], weights_batch) sample_batch = [] scalarization_batch = [] temp_env = gym.make( args.env_name) # temp_env is only used for initialization for weights in weights_batch: actor_critic = Policy(temp_env.observation_space.shape, temp_env.action_space, base_kwargs={'layernorm': args.layernorm}, obj_num=args.obj_num) actor_critic.to(device).double() if args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=1e-5, max_grad_norm=args.max_grad_norm) else: # NOTE: other algorithms are not supported yet raise NotImplementedError envs = make_vec_envs(env_name=args.env_name, seed=args.seed, num_processes=args.num_processes, \ gamma=args.gamma, log_dir=None, device=device, allow_early_resets=False, \ obj_rms = args.obj_rms, ob_rms = args.ob_rms) env_params = {} env_params['ob_rms'] = deepcopy( envs.ob_rms) if envs.ob_rms is not None else None env_params['ret_rms'] = deepcopy( envs.ret_rms) if envs.ret_rms is not None else None env_params['obj_rms'] = deepcopy( envs.obj_rms) if envs.obj_rms is not None else None envs.close() scalarization = WeightedSumScalarization(num_objs=args.obj_num, weights=weights) sample = Sample(env_params, actor_critic, agent, optgraph_id=-1) objs = evaluation(args, sample) sample.objs = objs sample_batch.append(sample) scalarization_batch.append(scalarization) temp_env.close() return sample_batch, scalarization_batch
def ppo_experiment(args): torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) device = torch.device("cuda:0" if args.cuda else "cpu") if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True env_kwargs = dict() env_kwargs['timestep'] = args.timestep if "push" in args.env_name: env_kwargs['params'] = 'random_goal_unconstrained' if "soccer" in args.env_name: env_kwargs['params'] = 'random_goal_unconstrained' if "faucet" in args.env_name: secondary_output = True envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, env_kwargs=env_kwargs) test_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, None, args.log_dir, device, False, env_kwargs=env_kwargs) actor_critic = Policy( envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) agent = algo.PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) train_ppo(actor_critic, agent, rollouts, envs, test_envs, args)
def main(): args = extend_arguments(get_parser()).parse_args() configs = common.config.get_config(args.env, args.experiment_name) assert args.alg in ['a2c', 'ppo', 'acktr', 'sac'] if args.recurrent_policy: assert args.alg in ['a2c', 'ppo' ], 'Recurrent policy is not implemented for ACKTR' if args.test: args.num_processes = 1 args.use_wandb = False logger = setup_logger(args.verbose, args.model_name, configs.log_directory) torch.set_num_threads(1) # set seed values seed = args.seed torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) if args.use_wandb: import wandb resume_wandb = True if args.wandb_resume_id is not None else False wandb.init(config=args, resume=resume_wandb, id=args.wandb_resume_id, project='rl', name=args.experiment_name) # make environements (envs[0] is used for evaluation) envs, env_vector = make_vec_envs_pytorch(args.env, return_evn_vector=True, device=device, log_dir=configs.log_directory, **vars(args)) eval_envs = wrap_env_pytorch(env_vector[0], args.gamma, device) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={ 'recurrent': args.recurrent_policy, 'hidden_size': args.hidden_layer_size }) # load model if args.load_path is not None: logger.info("loading model: {}".format(args.load_path)) actor_critic = torch.load(args.load_path) actor_critic.to(device) if args.test: test(eval_envs, actor_critic, args, logger) else: train(envs, env_vector, eval_envs, actor_critic, args, configs, logger)
def job(rank, args, device, shared_model): episode_rewards = deque(maxlen=10) envs = gym.make(args.env_name) envs.seed(args.seed + rank) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) actor_critic.load_state_dict(shared_model.state_dict()) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() obs = torch.from_numpy(obs) rollouts.obs[0].copy_(obs) rollouts.to(device) acc_r = 0 done = [False] for step in range(args.num_steps): if done[0]: episode_rewards.append(acc_r) obs = envs.reset() obs = torch.from_numpy(obs) rollouts.obs[0].copy_(obs) rollouts.to(device) acc_r = 0 # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs target_action = action.numpy()[0] obs, reward, done, infos = envs.step(target_action) acc_r += reward obs = torch.from_numpy(obs).float().to(device) reward = torch.from_numpy(np.array([reward])).unsqueeze(dim=1).float() done = [done] masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor([[1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) print(rank, np.mean(episode_rewards)) return rollouts
def initialize_policy(envs): actor_critic = Policy(envs.obs_shape, envs.action_space, base_kwargs=dict(recurrent=args.recurrent_policy, hidden_size=args.policy_hidden_size, init_gain=args.init_gain)) actor_critic.to(args.device) agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) return actor_critic, agent
def get_agent(args, envs, encoder, device): actor_critic = Policy( [encoder.feature_size], envs.action_space, base=SimpleBase, base_kwargs={"encoder": encoder} ) actor_critic.to(device) agent = algo.PPO( actor_critic, args.ppo_clip_param, args.ppo_epoch, args.ppo_num_mini_batch, args.ppo_value_loss_coef, args.ppo_entropy_coef, lr=args.ppo_lr, eps=args.ppo_eps, max_grad_norm=args.ppo_max_grad_norm, ) return agent, actor_critic
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") #envs = make_vec_envs(args.env_name, args.seed, args.num_processes, # args.gamma, args.log_dir, device, False) envs = make_parallel_env(args.env_name, args.num_processes, args.seed, True) ''' actor_critic = Policy( envs.observation_space[0].shape, envs.action_space[0], agent_num=args.agent_num, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) ''' actor_critic = [] for i in range(args.agent_num): ac = Policy( envs.observation_space[0].shape, envs.action_space[0], agent_num=args.agent_num, agent_i = i, base_kwargs={'recurrent': args.recurrent_policy}) ac.to(device) actor_critic.append(ac) if args.algo == 'a2c': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': ''' agent = algo.PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) ''' agent = [] for i in range(args.agent_num): agent.append(algo.PPO( actor_critic[i], i, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, model_dir = args.model_dir)) elif args.algo == 'acktr': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format( args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset( file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) ''' rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space[0].shape, envs.action_space[0], actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(torch.tensor(obs[:,0,:])) rollouts.to(device) ''' rollouts = [] for i in range(args.agent_num): rollout = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space[0].shape, envs.action_space[0], actor_critic[i].recurrent_hidden_state_size, args.agent_num, i) rollouts.append(rollout) obs = envs.reset() # pdb.set_trace() for i in range(args.agent_num): rollouts[i].share_obs[0].copy_(torch.tensor(obs.reshape(args.num_processes, -1))) rollouts[i].obs[0].copy_(torch.tensor(obs[:,i,:])) rollouts[i].to(device) episode_rewards = deque(maxlen=10) num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes print(num_updates) for j in range(num_updates): #pdb.set_trace() if args.use_linear_lr_decay: # decrease learning rate linearly for i in range(args.agent_num): utils.update_linear_schedule(agent[i].optimizer, j, num_updates, agent[i].optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions value_list, action_list, action_log_prob_list, recurrent_hidden_states_list = [], [], [], [] with torch.no_grad(): for i in range(args.agent_num): #pdb.set_trace() value, action, action_log_prob, recurrent_hidden_states = actor_critic[i].act( rollouts[i].share_obs[step], rollouts[i].obs[step], rollouts[i].recurrent_hidden_states[step], rollouts[i].masks[step]) # import pdb; pdb.set_trace() value_list.append(value) action_list.append(action) action_log_prob_list.append(action_log_prob) recurrent_hidden_states_list.append(recurrent_hidden_states) # Obser reward and next obs action = [] for i in range(args.num_processes): one_env_action = [] for k in range(args.agent_num): one_hot_action = np.zeros(envs.action_space[0].n) one_hot_action[action_list[k][i]] = 1 one_env_action.append(one_hot_action) action.append(one_env_action) #start = time.time() #pdb.set_trace() obs, reward, done, infos = envs.step(action) # print(obs[0][0]) # pdb.set_trace() #end = time.time() #print("step time: ", end-start) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. ''' masks = torch.FloatTensor( [[0.0] if done_ else [1.0] for done_ in done[0]]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos[0]]) ''' masks = torch.ones(args.num_processes, 1) bad_masks = torch.ones(args.num_processes, 1) ''' rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) ''' #import pdb; pdb.set_trace() for i in range(args.agent_num): rollouts[i].insert(torch.tensor(obs.reshape(args.num_processes, -1)), torch.tensor(obs[:,i,:]), recurrent_hidden_states, action_list[i], action_log_prob_list[i], value_list[i], torch.tensor(reward[:, i].reshape(-1,1)), masks, bad_masks) #import pdb; pdb.set_trace() with torch.no_grad(): next_value_list = [] for i in range(args.agent_num): next_value = actor_critic[i].get_value( rollouts[i].share_obs[-1], rollouts[i].obs[-1], rollouts[i].recurrent_hidden_states[-1], rollouts[i].masks[-1]).detach() next_value_list.append(next_value) if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) for i in range(args.agent_num): rollouts[i].compute_returns(next_value_list[i], args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) #import pdb; pdb.set_trace() for i in range(args.agent_num): value_loss, action_loss, dist_entropy = agent[i].update(rollouts[i]) if (i == 0): print("value loss: " + str(value_loss)) # print(value_loss) # pdb.set_trace() #rollouts.after_update() obs = envs.reset() # pdb.set_trace() for i in range(args.agent_num): rollouts[i].share_obs[0].copy_(torch.tensor(obs.reshape(args.num_processes, -1))) rollouts[i].obs[0].copy_(torch.tensor(obs[:,i,:])) rollouts[i].to(device) # save for every interval-th episode or for the last epoch #pdb.set_trace() if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) if not os.path.exists(save_path + args.model_dir): os.makedirs(save_path + args.model_dir) for i in range(args.agent_num): torch.save([ actor_critic[i], getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], save_path + args.model_dir + '/agent_%i' % (i+1) + ".pt") ''' if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) ''' '''
def __init__(self, learner, envs, maxat, maxupd, targ_policy, args, device="cpu", hidden_sizes=(64, 64), activation=nn.Tanh, rand_select=False): super(TargAttacker, self).__init__() self.args = args self.targ_policy = targ_policy self.learner = learner self.gamma = args.gamma self.device = device self.radius = args.radius self.frac = args.frac self.stepsize = args.stepsize self.maxiter = args.maxiter self.maxat = maxat self.maxupd = maxupd self.delta = args.delta self.dist_thres = args.dist_thres self.rand_select = rand_select self.disc_action = isinstance(envs.action_space, Discrete) if self.disc_action: self.action_dim = envs.action_space.n attack_policy = Policy( envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) attack_policy.to(device) if isinstance(learner, algo.A2C_ACKTR): self.im_learner = algo.A2C_ACKTR(attack_policy, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, acktr=learner.acktr) elif isinstance(learner, algo.PPO): self.im_learner = algo.PPO(attack_policy, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) self.cp_net() self.critic = Value(envs.observation_space.shape[0], hidden_sizes, activation).to(device) self.critic_optim = optim.Adam(self.critic.parameters(), lr=args.lr) self.dist_list = np.array([]) self.attack_num = 0 self.update_num = 0 self.state_buffer = None self.state_buffer_limit = 100
def onpolicy_main(): print("onpolicy main") torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") summary_name = args.log_dir + '{0}_{1}' writer = SummaryWriter(summary_name.format(args.env_name, args.save_name)) # Make vector env envs = make_vec_envs( args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, env_kwargs=env_kwargs, ) # agly ways to access to the environment attirubutes if args.env_name.find('doorenv') > -1: if args.num_processes > 1: visionnet_input = envs.venv.venv.visionnet_input nn = envs.venv.venv.nn env_name = envs.venv.venv.xml_path else: visionnet_input = envs.venv.venv.envs[ 0].env.env.env.visionnet_input nn = envs.venv.venv.envs[0].env.env.env.nn env_name = envs.venv.venv.envs[0].env.env.env.xml_path dummy_obs = np.zeros(nn * 2 + 3) else: dummy_obs = envs.observation_space visionnet_input = None nn = None if pretrained_policy_load: print("loading", pretrained_policy_load) actor_critic, ob_rms = torch.load(pretrained_policy_load) else: actor_critic = Policy(dummy_obs.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) if visionnet_input: visionmodel = load_visionmodel(env_name, args.visionmodel_path, VisionModelXYZ()) actor_critic.visionmodel = visionmodel.eval() actor_critic.nn = nn actor_critic.to(device) #disable normalizer vec_norm = get_vec_normalize(envs) vec_norm.eval() if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, dummy_obs.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) full_obs = envs.reset() initial_state = full_obs[:, :envs.action_space.shape[0]] if args.env_name.find('doorenv') > -1 and visionnet_input: obs = actor_critic.obs2inputs(full_obs, 0) else: if knob_noisy: obs = add_noise(full_obs, 0) else: obs = full_obs rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr) pos_control = False total_switches = 0 prev_selection = "" for step in range(args.num_steps): with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) next_action = action if pos_control: frame_skip = 2 if step % (512 / frame_skip - 1) == 0: current_state = initial_state next_action = current_state + next_action for kk in range(frame_skip): full_obs, reward, done, infos = envs.step(next_action) current_state = full_obs[:, :envs.action_space.shape[0]] else: full_obs, reward, done, infos = envs.step(next_action) # convert img to obs if door_env and using visionnet if args.env_name.find('doorenv') > -1 and visionnet_input: obs = actor_critic.obs2inputs(full_obs, j) else: if knob_noisy: obs = add_noise(full_obs, j) else: obs = full_obs for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() writer.add_scalar("Value loss", value_loss, j) writer.add_scalar("action loss", action_loss, j) writer.add_scalar("dist entropy loss", dist_entropy, j) writer.add_scalar("Episode rewards", np.mean(episode_rewards), j) # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join( save_path, args.env_name + "_{}.{}.pt".format(args.save_name, j))) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device) DR = True #Domain Randomization ################## for multiprocess world change ###################### if DR: print("changing world") envs.close_extras() envs.close() del envs envs = make_vec_envs( args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, env_kwargs=env_kwargs, ) full_obs = envs.reset() if args.env_name.find('doorenv') > -1 and visionnet_input: obs = actor_critic.obs2inputs(full_obs, j) else: obs = full_obs
def main(): tb_path = os.path.join(os.path.expanduser(args.log_dir), "tensorboard_log") makedir_if_not_exists(tb_path) writer = SummaryWriter(tb_path) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") # p = multiprocessing.Process(target=_tb_task,args=(tb_path,5013) ,daemon=True) # p.start() if args.start_tb: _tb_task(tb_path, port=5013) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_eps = 0 # num training eps num_steps = 0 # num training eps for j in range(num_updates): # list of all values all eps in num updates num_steps_basline_info = defaultdict(list) if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_clip_decay: agent.clip_param = args.clip_param * (1 - j / float(num_updates)) env_basline_info = defaultdict(list) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: # episode is done # add addisiotnal baseline rw info in infos: if 'basline_rw_mse' in info: env_basline_info['rw_mse'].append(info['basline_rw_mse']) env_basline_info['rw_rec'].append(info['basline_rw_rec']) if 'basline_rw_tcn' in info: env_basline_info['rw_tcn'].append(info['basline_rw_tcn']) if 'episode' in info.keys(): # end of episode episode_rewards.append(info['episode']['r']) num_steps_basline_info['len_episode'].append( info['episode']['l']) # distance of the pushed block num_steps_basline_info['push_distance'].append( info['basline_rw_push_dist']) # take mean over eps for k, step_vals in env_basline_info.items(): num_steps_basline_info[k].append(np.sum(step_vals)) # add baseline infos num_eps += 1 env_basline_info = defaultdict(list) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps # write baseline finfos for tcn writer_step = total_num_steps for k, vals_step_eps in num_steps_basline_info.items(): writer.add_scalar('basline/' + k, np.mean(vals_step_eps), writer_step) writer.add_scalar('basline/episodes', num_eps, writer_step) len_eps = np.mean(num_steps_basline_info['len_episode']) if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() log.info( "Updates {}, num timesteps {}, FPS {} Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}, len eps {}" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), int(len_eps), dist_entropy, value_loss, action_loss)) if j == num_updates or (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): vid_log_dir = os.getenv('TCN_ENV_VID_LOG_FOLDER', '/tmp/env_tcn/train_vid') vid_log_inter = os.getenv('TCN_ENV_VID_LOG_INTERVAL', train_vid_log_iter) os.environ[ 'TCN_ENV_VID_LOG_FOLDER'] = "eval_vid" # os.path.join(vid_log_dir,"../eval_vid/","interval_"+str(j)) os.environ['TCN_ENV_VID_LOG_INTERVAL'] = '1' os.environ['TCN_ENV_EVAL_EPISODE'] = '1' with redirect_stdout(open(os.devnull, "w")): # no stdout with suppress_logging(): # eval envs eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, 1, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 1: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done], dtype=torch.float32, device=device) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append( info['episode']['r']) eval_envs.close() os.environ['TCN_ENV_VID_LOG_FOLDER'] = vid_log_dir os.environ['TCN_ENV_EVAL_EPISODE'] = '0' os.environ['TCN_ENV_VID_LOG_INTERVAL'] = vid_log_inter writer.add_scalar('eval/rw', np.mean(eval_episode_rewards), j) log.info( " Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) if j % args.vis_interval == 0: try: td_plot(writer, args.log_dir) # Sometimes monitor doesn't properly flush the outputs # win = visdom_plot(viz, win, args.log_dir, args.env_name, # args.algo, args.num_env_steps) except IOError: print("plt error") pass
def main(): args = get_args() # Record trajectories if args.record_trajectories: record_trajectories() return print(args) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True # Append the model name log_dir = os.path.expanduser(args.log_dir) log_dir = os.path.join(log_dir, args.model_name, str(args.seed)) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, log_dir, device, False) # Take activation for carracing print("Loaded env...") activation = None if args.env_name == 'CarRacing-v0' and args.use_activation: activation = torch.tanh print(activation) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={ 'recurrent': args.recurrent_policy, 'env': args.env_name }, activation=activation) actor_critic.to(device) # Load from previous model if args.load_model_name: state = torch.load( os.path.join(args.save_dir, args.load_model_name, args.load_model_name + '_{}.pt'.format(args.seed)))[0] try: actor_critic.load_state_dict(state) except: actor_critic = state if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: if len(envs.observation_space.shape) == 1: discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=3, subsample_frequency=1) expert_dataset_test = gail.ExpertDataset(file_name, num_trajectories=1, start=3, subsample_frequency=1) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) gail_test_loader = torch.utils.data.DataLoader( dataset=expert_dataset_test, batch_size=args.gail_batch_size, shuffle=False, drop_last=False) print(len(expert_dataset), len(expert_dataset_test)) else: # env observation shape is 3 => its an image assert len(envs.observation_space.shape) == 3 discr = gail.CNNDiscriminator(envs.observation_space.shape, envs.action_space, 100, device) file_name = os.path.join(args.gail_experts_dir, 'expert_data.pkl') expert_dataset = gail.ExpertImageDataset(file_name, train=True) test_dataset = gail.ExpertImageDataset(file_name, train=False) gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=len(expert_dataset) > args.gail_batch_size, ) gail_test_loader = torch.utils.data.DataLoader( dataset=test_dataset, batch_size=args.gail_batch_size, shuffle=False, drop_last=len(test_dataset) > args.gail_batch_size, ) print('Dataloader size', len(gail_train_loader)) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() #num_updates = int( #args.num_env_steps) // args.num_steps // args.num_processes num_updates = args.num_steps print(num_updates) # count the number of times validation loss increases val_loss_increase = 0 prev_val_action = np.inf best_val_loss = np.inf for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Observe reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: try: envs.venv.eval() except: pass gail_epoch = args.gail_epoch #if j < 10: #gail_epoch = 100 # Warm up for _ in range(gail_epoch): #discr.update(gail_train_loader, rollouts, #None) pass for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) #value_loss, action_loss, dist_entropy = agent.update(rollouts) value_loss = 0 dist_entropy = 0 for data in gail_train_loader: expert_states, expert_actions = data expert_states = Variable(expert_states).to(device) expert_actions = Variable(expert_actions).to(device) loss = agent.update_bc(expert_states, expert_actions) action_loss = loss.data.cpu().numpy() print("Epoch: {}, Loss: {}".format(j, action_loss)) with torch.no_grad(): cnt = 0 val_action_loss = 0 for data in gail_test_loader: expert_states, expert_actions = data expert_states = Variable(expert_states).to(device) expert_actions = Variable(expert_actions).to(device) loss = agent.get_action_loss(expert_states, expert_actions) val_action_loss += loss.data.cpu().numpy() cnt += 1 val_action_loss /= cnt print("Val Loss: {}".format(val_action_loss)) #rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": if val_action_loss < best_val_loss: val_loss_increase = 0 best_val_loss = val_action_loss save_path = os.path.join(args.save_dir, args.model_name) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic.state_dict(), getattr(utils.get_vec_normalize(envs), 'ob_rms', None), getattr(utils.get_vec_normalize(envs), 'ret_rms', None) ], os.path.join( save_path, args.model_name + "_{}.pt".format(args.seed))) elif val_action_loss > prev_val_action: val_loss_increase += 1 if val_loss_increase == 10: print("Val loss increasing too much, breaking here...") break elif val_action_loss < prev_val_action: val_loss_increase = 0 # Update prev val action prev_val_action = val_action_loss # log interval if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
def main(): all_episode_rewards = [] ### 记录 6/29 all_temp_rewards = [] ### 记录 6/29 args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) gail_train_loader = torch.utils.data.DataLoader( gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20), batch_size=args.gail_batch_size, shuffle=True, drop_last=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes print('num_updates ', num_updates) print('num_steps ', args.num_steps) count = 0 h5_path = './data/' + args.env_name if not os.path.exists(h5_path): os.makedirs(h5_path) h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % (count) data = {} data['states'] = [] data['actions'] = [] data['rewards'] = [] data['done'] = [] data['lengths'] = [] episode_step = 0 for j in range(num_updates): ### num-steps temp_states = [] temp_actions = [] temp_rewards = [] temp_done = [] temp_lenthgs = [] if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) if j == 0 and step == 0: print('obs ', type(rollouts.obs[step]), rollouts.obs[step].shape) print('hidden_states ', type(rollouts.recurrent_hidden_states[step]), rollouts.recurrent_hidden_states[step].shape) print('action ', type(action), action.shape) print('action prob ', type(action_log_prob), action_log_prob.shape) print('-' * 20) # Obser reward and next obs obs, reward, done, infos = envs.step(action) #print(infos) #print(reward) temp_states += [np.array(rollouts.obs[step].cpu())] temp_actions += [np.array(action.cpu())] #temp_rewards += [np.array(reward.cpu())] temp_rewards += [np.array([infos[0]['myrewards']]) ] ### for halfcheetah不能直接用 reward !! 6/29 temp_done += [np.array(done)] if j == 0 and step == 0: print('obs ', type(obs), obs.shape) print('reward ', type(reward), reward.shape) print('done ', type(done), done.shape) print('infos ', len(infos)) for k, v in infos[0].items(): print(k, v.shape) print() for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) all_episode_rewards += [info['episode']['r']] ### 记录 6/29 # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) temp_lengths = len(temp_states) temp_states = np.concatenate(temp_states) temp_actions = np.concatenate(temp_actions) temp_rewards = np.concatenate(temp_rewards) temp_done = np.concatenate(temp_done) #print('temp_lengths',temp_lengths) #print('temp_states', temp_states.shape) #print('temp_actions', temp_actions.shape) #print('temp_rewards', temp_rewards.shape) if j > int(0.4 * num_updates): data['states'] += [temp_states] data['actions'] += [temp_actions] data['rewards'] += [temp_rewards] data['lengths'] += [temp_lengths] data['done'] += [temp_done] #print('temp_lengths',data['lengths'].shape) #print('temp_states', data['states'].shape) #print('temp_actions', data['actions'].shape) #print('temp_rewards', data['rewards'].shape) if args.save_expert and len(data['states']) >= 100: with h5py.File(h5_filename, 'w') as f: f['states'] = np.array(data['states']) f['actions'] = np.array(data['actions']) f['rewards'] = np.array(data['rewards']) f['done'] = np.array(data['done']) f['lengths'] = np.array(data['lengths']) #print('f_lengths',f['lengths'].shape) #print('f_states', f['states'].shape) #print('f_actions', f['actions'].shape) #print('f_rewards', f['rewards'].shape) count += 1 h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % ( count) data['states'] = [] data['actions'] = [] data['rewards'] = [] data['done'] = [] data['lengths'] = [] with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + "_%d.pt" % (args.seed))) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) #np.save(os.path.join(save_path, args.env_name+"_%d"%(args.seed)), all_episode_rewards) ### 保存记录 6/29 #print(temp_rewards) print("temp rewards size", temp_rewards.shape, "mean", np.mean(temp_rewards), "min", np.min(temp_rewards), "max", np.max(temp_rewards)) all_temp_rewards += [temp_rewards] np.savez(os.path.join(save_path, args.env_name + "_%d" % (args.seed)), episode=all_episode_rewards, timestep=all_temp_rewards) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device) '''data['states'] = np.array(data['states'])
def train_ppo_from_scratch(args): torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(2) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, True) actor_critic = Policy( # 2-layer fully connected network envs.observation_space.shape, envs.action_space, base_kwargs={ 'recurrent': False, 'hidden_size': 32 }) actor_critic.to(device) agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes episode_reward_means = [] episode_reward_times = [] for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) episode_reward_means.append(np.mean(episode_rewards)) episode_reward_times.append(total_num_steps) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device) print(episode_reward_means, episode_reward_times) return episode_reward_means, episode_reward_times
def main(): args = get_args() use_ppo = args.algo == 'ppo' torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) actor_critic = Policy( envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy, 'share_parameter': args.share_parameter}) actor_critic.to(device) return_distributions = False if args.algo == 'ppo': agent = algo.PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo_rb': agent = algo.PPO_RB( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, args.rb_alpha, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm ) elif args.algo == 'tr_ppo': agent = algo.TR_PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, ppo_clip_param=args.ppo_clip_param ) return_distributions = True elif args.algo == 'tr_ppo_rb': agent = algo.TR_PPO_RB( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, args.rb_alpha, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, ppo_clip_param=args.ppo_clip_param ) return_distributions = True if not return_distributions: rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) else: if actor_critic.dist_name == 'DiagGaussian': rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, distribution_param_dim=envs.action_space.shape[0]*2 ) elif actor_critic.dist_name == 'Bernoulli' or actor_critic.dist_name == 'Categorical': rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, distribution_param_dim=envs.action_space.n ) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes prev_mean_reward = None for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states, parameters = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step], return_distribution=True) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor( [[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks, parameters) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts, use_ppo) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) mean_rewards = np.mean(episode_rewards) if (prev_mean_reward is not None) and (mean_rewards < prev_mean_reward) and \ (use_ppo == False) and args.revert_to_ppo and j > 3: use_ppo = True print('Revert Back to PPO Training') # args.lr = 3e-4 prev_mean_reward = mean_rewards if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
def main(args): try: os.makedirs(args.log_dir) except OSError: files = glob.glob(os.path.join(args.log_dir, '*.monitor.csv')) for f in files: os.remove(f) eval_log_dir = args.log_dir + "_eval" try: os.makedirs(eval_log_dir) except OSError: files = glob.glob(os.path.join(eval_log_dir, '*.monitor.csv')) for f in files: os.remove(f) assert args.algo in ['a2c', 'ppo', 'acktr'] if args.recurrent_policy: assert args.algo in ['a2c', 'ppo'], \ 'Recurrent policy is not implemented for ACKTR' if args.eval_render: render_env = make_vec_envs(args.env_name, args.seed, 1, None, None, args.add_timestep, device='cpu', allow_early_resets=False) torch.set_num_threads(1) num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True # Uses gpu/cuda by default device = torch.device("cuda:0" if args.cuda else "cpu") # Only if running visdoom if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) # Set up actor_critic actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) # Set algorithm with actor critic and use to learn if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_clip_decay: agent.clip_param = args.clip_param * (1 - j / float(num_updates)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save( save_model, os.path.join( save_path, args.env_name + "-AvgRwrd" + str(int(np.mean(episode_rewards))) + ".pt")) print("Saving Model") total_num_steps = (j + 1) * args.num_processes * args.num_steps # Logs every log_interval steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) if args.eval_render: show_model(render_env, actor_critic) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_env_steps) except IOError: pass
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.env_name.startswith("lab_"): gym_name, flow_json = make_lab_env(args.env_name) args.env_name = gym_name envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) actor_critic = Policy( envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format( args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset( file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor( [[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: " "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
def train(): processes = [] if os.path.isdir(args.log_dir): ans = input('{} exists\ncontinue and overwrite? y/n: '.format(args.log_dir)) if ans == 'n': return logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv']) logger.log(args) json.dump(vars(args), open(os.path.join(args.log_dir, 'params.json'), 'w')) torch.set_num_threads(2) start = time.time() policy_update_time, policy_forward_time = 0, 0 step_time_env, step_time_total, step_time_rewarder = 0, 0, 0 visualize_time = 0 rewarder_fit_time = 0 envs = RL2EnvInterface(args) if args.look: looker = Looker(args.log_dir) actor_critic = Policy(envs.obs_shape, envs.action_space, base=RL2Base, base_kwargs={'recurrent': True, 'num_act_dim': envs.action_space.shape[0]}) actor_critic.to(args.device) agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.obs_shape, envs.action_space, actor_critic.recurrent_hidden_state_size) rollouts.to(args.device) def copy_obs_into_beginning_of_storage(obs): obs_raw, obs_act, obs_rew, obs_flag = obs rollouts.obs[0].copy_(obs_raw) rollouts.obs_act[0].copy_(obs_act) rollouts.obs_rew[0].copy_(obs_rew) rollouts.obs_flag[0].copy_(obs_flag) for j in range(args.num_updates): obs = envs.reset() copy_obs_into_beginning_of_storage(obs) if args.use_linear_lr_decay: update_linear_schedule(agent.optimizer, j, args.num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_clip_decay: agent.clip_param = args.clip_param * (1 - j / float(args.num_updates)) episode_returns = [0 for i in range(args.trial_length)] episode_final_reward = [0 for i in range(args.trial_length)] i_episode = 0 log_marginal = 0 lambda_log_s_given_z = 0 for step in range(args.num_steps): # Sample actions policy_forward_start = time.time() with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.get_obs(step), rollouts.recurrent_hidden_states[step], rollouts.masks[step]) policy_forward_time += time.time() - policy_forward_start # Obser reward and next obs step_total_start = time.time() obs, reward, done, info = envs.step(action) step_time_total += time.time() - step_total_start step_time_env += info['step_time_env'] step_time_rewarder += info['reward_time'] log_marginal += info['log_marginal'].sum().item() lambda_log_s_given_z += info['lambda_log_s_given_z'].sum().item() episode_returns[i_episode] += reward.sum().item() if all(done['episode']): episode_final_reward[i_episode] += reward.sum().item() i_episode = (i_episode + 1) % args.trial_length # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done['trial']]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) assert all(done['trial']) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.get_obs(-1), rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) policy_update_start = time.time() if args.rewarder != 'supervised' and envs.rewarder.fit_counter == 0 and not args.vae_load: value_loss, action_loss, dist_entropy = 0, 0, 0 else: value_loss, action_loss, dist_entropy = agent.update(rollouts) policy_update_time += time.time() - policy_update_start rollouts.after_update() # metrics trajectories_pre = envs.trajectories_pre_current_update state_entropy_pre = calculate_state_entropy(args, trajectories_pre) trajectories_post = envs.trajectories_post_current_update state_entropy_post = calculate_state_entropy(args, trajectories_post) return_avg = rollouts.rewards.sum() / args.trials_per_update reward_avg = return_avg / (args.trial_length * args.episode_length) log_marginal_avg = log_marginal / args.trials_per_update / (args.trial_length * args.episode_length) lambda_log_s_given_z_avg = lambda_log_s_given_z / args.trials_per_update / (args.trial_length * args.episode_length) num_steps = (j + 1) * args.num_steps * args.num_processes num_episodes = num_steps // args.episode_length num_trials = num_episodes // args.trial_length logger.logkv('state_entropy_pre', state_entropy_pre) logger.logkv('state_entropy_post', state_entropy_post) logger.logkv('value_loss', value_loss) logger.logkv('action_loss', action_loss) logger.logkv('dist_entropy', dist_entropy) logger.logkv('return_avg', return_avg.item()) logger.logkv('reward_avg', reward_avg.item()) logger.logkv('steps', (j + 1) * args.num_steps * args.num_processes) logger.logkv('episodes', num_episodes) logger.logkv('trials', num_trials) logger.logkv('policy_updates', (j + 1)) logger.logkv('time', time.time() - start) logger.logkv('policy_forward_time', policy_forward_time) logger.logkv('policy_update_time', policy_update_time) logger.logkv('step_time_rewarder', step_time_rewarder) logger.logkv('step_time_env', step_time_env) logger.logkv('step_time_total', step_time_total) logger.logkv('visualize_time', visualize_time) logger.logkv('rewarder_fit_time', rewarder_fit_time) logger.logkv('log_marginal_avg', log_marginal_avg) logger.logkv('lambda_log_s_given_z_avg', lambda_log_s_given_z_avg) for i_episode in range(args.trial_length): logger.logkv('episode_return_avg_{}'.format(i_episode), episode_returns[i_episode] / args.trials_per_update) logger.logkv('episode_final_reward_{}'.format(i_episode), episode_final_reward[i_episode] / args.trials_per_update) if (j % args.save_period == 0 or j == args.num_updates - 1) and args.log_dir != '': save_model(args, actor_critic, envs, iteration=j) if not args.vae_freeze and j % args.rewarder_fit_period == 0: rewarder_fit_start = time.time() envs.fit_rewarder() rewarder_fit_time += time.time() - rewarder_fit_start if (j % args.vis_period == 0 or j == args.num_updates - 1) and args.log_dir != '': visualize_start = time.time() if args.look: eval_return_avg, eval_episode_returns, eval_episode_final_reward = looker.look(iteration=j) logger.logkv('eval_return_avg', eval_return_avg) for i_episode in range(args.trial_length): logger.logkv('eval_episode_return_avg_{}'.format(i_episode), eval_episode_returns[i_episode] / args.trials_per_update) logger.logkv('eval_episode_final_reward_{}'.format(i_episode), eval_episode_final_reward[i_episode] / args.trials_per_update) if args.plot: p = Popen('python visualize.py --log-dir {}'.format(args.log_dir), shell=True) processes.append(p) visualize_time += time.time() - visualize_start logger.dumpkvs()
def main(): if not os.path.exists("./plots"): os.makedirs("./plots") gbench = read_gbench('./data/gbench.txt') args = my_get_args() print(args) config = dict(sigma=args.sim_sigma, momentum=args.sim_momentum, pump_bins=args.sim_bins, lag=1000 // args.num_steps, rshift=args.sim_rshift, pump_scale=args.sim_scale, reward_kind=args.sim_reward, continuous=args.sim_continuous, span=args.sim_span, percentile=args.sim_percentile, last_runs=args.sim_perc_len, add_linear=not args.sim_no_linear, start_pump=args.sim_start, static_features=not args.sim_no_static, extra_features=not args.sim_no_extra, curiosity_num=args.curiosity) base_kwargs = { 'hidden_size': args.hidden_size, 'film_size': 800 * (not args.sim_no_static) } if args.relu: base_kwargs['activation'] = 'relu' base = FILMBase #FILMBase if args.gset > 0: test_graphs = [args.gset] else: test_graphs = [1, 2, 3, 4, 5] #--------------------------------------------------------- assert args.algo in ['a2c', 'ppo', 'acktr'] if args.recurrent_policy: assert args.algo in ['a2c', 'ppo' ], 'Recurrent policy is not implemented for ACKTR' num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes print('Num updates: ', num_updates) if args.dry_run: return random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") logdata = defaultdict(list) if args.gset > 0: envs = [] for g in test_graphs: g_ = read_gset('./data/G{}.txt'.format(g), negate=True) s = SIMCIM(g_, device=device, batch_size=args.num_processes, **config) s.runpump() envs.append(s) envs = SIMCollection(envs, [gbench[g] for g in test_graphs]) logdata['bls_bench'] = [gbench[g] for g in test_graphs] else: envs = SIMGeneratorRandom(800, 0.06, args.num_processes, config, keep=args.sim_keep, n_sims=args.sim_nsim, device=device) if args.snapshot is None: actor_critic = Policy(envs.observation_space.shape, envs.action_space, base=base, base_kwargs=base_kwargs) else: actor_critic, _ = torch.load( os.path.join(args.save_dir, args.algo, args.snapshot + ".pt")) actor_critic.to(device) print(actor_critic) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() print(rollouts.obs.shape, obs.shape) rollouts.obs[0].copy_(obs) rollouts.to(device) eval_envs = [] for g in test_graphs: g_ = read_gset('./data/G{}.txt'.format(g), negate=True) s = SIMCIM(g_, device=device, batch_size=args.num_val_processes, **config) s.runpump() eval_envs.append(s) eval_envs = SIMCollection(eval_envs, [gbench[g] for g in test_graphs]) ref_cuts = [s.lastcuts for s in eval_envs.envs] logdata['ref_cuts'] = [e.tolist() for e in ref_cuts] stoch_cuts = None start = time.time() for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_clip_decay: agent.clip_param = args.clip_param * (1 - j / float(num_updates)) # ROLLOUT DATA for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) if 'episode' in infos[0].keys(): rw = np.mean([e['episode']['r'] for e in infos]) logdata['episode_rewards'].append(rw.item()) if args.gset > 0: cuts = [e.lastcuts for e in envs.envs] logdata['train_median'].append( [np.median(e).item() for e in cuts]) logdata['train_max'].append( [np.max(e).item() for e in cuts]) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) #UPDATE AGENT with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, _ = agent.update(rollouts) logdata['alosses'].append(action_loss) logdata['vlosses'].append(value_loss) logdata['train_percentiles'].append(envs.perc.tolist()) rollouts.after_update() #CHECKPOINTS # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save( save_model, os.path.join(save_path, args.env_name + '-' + str(j) + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps #LOGGING if j % args.log_interval == 0 and len(logdata['episode_rewards']) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: \ mean/median reward {:.3f}/{:.3f}, min/max reward {:.3f}/{:.3f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(logdata['episode_rewards']), np.mean(logdata['episode_rewards'][-10:]), np.median(logdata['episode_rewards'][-10:]), np.min(logdata['episode_rewards'][-10:]), np.max(logdata['episode_rewards'][-10:]))) #EVALUATION if (args.eval_interval is not None and j % args.eval_interval == 0): logdata['spumps'] = [] vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args.num_val_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_val_processes, 1, device=device) eval_done = False while not eval_done: p = eval_envs.envs[0].old_p logdata['spumps'].append(p[:10].cpu().numpy().tolist()) with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=False) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_done = np.all(done) eval_masks = torch.tensor([[0.0] if done_ else [1.0] for done_ in done], dtype=torch.float32, device=device) stoch_cuts = [e.lastcuts for e in eval_envs.envs] logdata['stoch_cuts'] = [e.tolist() for e in stoch_cuts] logdata['eval_median'].append( [np.median(e).item() for e in stoch_cuts]) logdata['eval_max'].append([np.max(e).item() for e in stoch_cuts]) logdata['test_percentiles'].append(eval_envs.perc.tolist()) rw = np.mean([e['episode']['r'] for e in infos]) logdata['eval_episode_rewards'].append(rw.item()) print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(logdata['eval_episode_rewards']), np.mean(logdata['eval_episode_rewards']))) if j % args.log_interval == 0: fn = os.path.join(save_path, args.env_name + ".res") with open(fn, 'w') as f: json.dump(logdata, f, sort_keys=True, indent=2) #VISUALIZATION if j % args.vis_interval == 0: #if False: plt.figure(figsize=(15, 10)) plt.subplot(231) plt.title('Rewards') plt.xlabel('SIM runs') plt.plot(logdata['episode_rewards'], c='r', label='mean train') plt.plot(np.linspace(0, len(logdata['episode_rewards']), len(logdata['eval_episode_rewards'])), logdata['eval_episode_rewards'], 'b', label='mean eval') plt.legend() plt.subplot(232) plt.plot(logdata['alosses']) plt.title('Policy loss') plt.subplot(233) plt.plot(logdata['vlosses']) plt.title('Value loss') plt.subplot(234) plt.title('Pumps') plt.xlabel('SIM iterations / 10') plt.plot(np.array(logdata['spumps'])) plt.ylim(-0.05, 1.1) plt.subplot(235) plt.plot(logdata['train_percentiles']) plt.title('Train average percentile') plt.subplot(236) plt.title('Test percentiles') plt.plot(logdata['test_percentiles']) plt.legend([str(e) for e in test_graphs]) plt.tight_layout() plt.savefig('./plots/agent_' + args.env_name + '.pdf') plt.clf() plt.close() gc.collect() #plt.show() if stoch_cuts is not None: fig, axs = plt.subplots(len(ref_cuts), 1, sharex=False, tight_layout=True) if len(ref_cuts) == 1: axs = [axs] for gi in range(len(ref_cuts)): mn = min(ref_cuts[gi]) axs[gi].hist(ref_cuts[gi], bins=100, alpha=0.7) dc = stoch_cuts[gi][stoch_cuts[gi] >= mn] if dc.size > 0: axs[gi].hist(dc, bins=100, alpha=0.7) plt.savefig('./plots/cuts_' + args.env_name + '.pdf') plt.clf() plt.close() gc.collect()
def record_trajectories(): args = get_args() print(args) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True # Append the model name log_dir = os.path.expanduser(args.log_dir) log_dir = os.path.join(log_dir, args.model_name, str(args.seed)) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, 1, args.gamma, log_dir, device, True, training=False) # Take activation for carracing print("Loaded env...") activation = None if args.env_name == 'CarRacing-v0' and args.use_activation: activation = torch.tanh print(activation) actor_critic = Policy( envs.observation_space.shape, envs.action_space, base_kwargs={ 'recurrent': args.recurrent_policy, 'env': args.env_name }, activation=activation, ) actor_critic.to(device) # Load from previous model if args.load_model_name: loaddata = torch.load( os.path.join(args.save_dir, args.load_model_name, args.load_model_name + '_{}.pt'.format(args.seed))) state = loaddata[0] try: obs_rms, ret_rms = loaddata[1:] # Feed it into the env envs.obs_rms = None envs.ret_rms = None except: print("Couldnt load obsrms") obs_rms = ret_rms = None try: actor_critic.load_state_dict(state) except: actor_critic = state else: raise NotImplementedError # Record trajectories actions = [] rewards = [] observations = [] episode_starts = [] for eps in range(args.num_episodes): obs = envs.reset() # Init variables for storing episode_starts.append(True) reward = 0 while True: # Take action act = actor_critic.act(obs, None, None, None)[1] next_state, rew, done, info = envs.step(act) #print(obs.shape, act.shape, rew.shape, done) reward += rew # Add the current observation and act observations.append(obs.data.cpu().numpy()[0]) # [C, H, W] actions.append(act.data.cpu().numpy()[0]) # [A] rewards.append(rew[0, 0].data.cpu().numpy()) if done[0]: break episode_starts.append(False) obs = next_state + 0 print("Total reward: {}".format(reward[0, 0].data.cpu().numpy())) # Save these values save_trajectories_images(observations, actions, rewards, episode_starts)
def main(): args = get_args() torch.manual_seed(args.seed) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") actor_critic = Policy(STATE_DIM, ACTION_DIM, USER_DIM) actor_critic.to(device) agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) if args.cgail: discr = cgail.Discriminator(STATE_DIM, ACTION_DIM, USER_DIM, device, lr=args.D_lr) train_file_name = os.path.join(args.experts_dir, "expert_traj.pkl") test_file_name = os.path.join(args.experts_dir, "test_traj.pkl") ground_file_name = os.path.join(args.experts_dir, "exp_loc.pkl") expert_st, expert_ur, expert_ac = pickle.load(open(train_file_name, 'rb')) train_load = data_utils.TensorDataset( torch.from_numpy(np.asarray(expert_st)), torch.from_numpy(np.asarray(expert_ur)), torch.from_numpy(np.asarray(expert_ac))) gail_train_loader = torch.utils.data.DataLoader( train_load, batch_size=args.gail_batch_size, shuffle=True) test_st, test_ur, test_ac = pickle.load(open(test_file_name, 'rb')) test_load = data_utils.TensorDataset(torch.from_numpy(np.asarray(test_st)), torch.from_numpy(np.asarray(test_ur)), torch.from_numpy(np.asarray(test_ac))) test_loader = torch.utils.data.DataLoader(test_load, batch_size=args.gail_batch_size, shuffle=True) exp_loc = pickle.load(open(ground_file_name, 'rb')) envs = make_vec_envs(expert_st, expert_ur, args.seed, args.num_processes, args.gamma, device) rollouts = RolloutStorage(args.num_steps, args.num_processes, STATE_DIM * 5, USER_DIM, ACTION_DIM) obs, user = envs.reset() rollouts.obs[0].copy_(obs[0]) rollouts.user[0].copy_(user[0]) rollouts.to(device) result_log = [] start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule(agent.optimizer, j, num_updates, args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob = actor_critic.act( rollouts.obs[step], rollouts.user[step]) # Obser reward and next obs if action.item() != 9: obs = decide_next_state(action, rollouts.obs[step][0], 1) if obs != None: rollouts.insert(obs, rollouts.user[step], action, action_log_prob, value) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.user[-1]).detach() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts) for step in range(args.num_steps): if cgail: rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.user[step], rollouts.actions[step], args.gamma) else: rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, str(args.lr), str(args.gail_batch_size), "entropy_" + str(args.entropy_coef), "D_lr" + str(args.D_lr)) try: os.makedirs(save_path) except OSError: pass torch.save(actor_critic, os.path.join(save_path, "ac_{}.pt".format(j))) torch.save(discr, os.path.join(save_path, "D_{}.pt".format(j))) if j % args.log_interval == 0: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print("Updates {}, num timesteps {}, FPS {}".format( j, total_num_steps, int(total_num_steps / (end - start)))) out_loc = {} for i, data in enumerate(test_loader, 0): inputs, user, labels = data inputs = inputs.float() user = user.float() labels = labels.long() output = actor_critic.act(inputs, user)[1].tolist() for i in range(inputs.size(0)): x = int(inputs[i][0].item()) y = int(inputs[i][1].item()) if (x, y) not in out_loc: out_loc[(x, y)] = np.zeros(10) out_loc[(x, y)][output[i]] += 1 else: out_loc[(x, y)][output[i]] += 1 target = [] ground = [] for key in out_loc: o1 = out_loc[key].copy() o1 /= sum(o1) if key in exp_loc: o2 = np.zeros(10) for b, w in exp_loc[key].items(): o2[b] += w o2 /= sum(o2) target.append(o1) ground.append(o2) k, kls = cross_entropy(target, ground) print(k)
if args.mobilenet: base = MobilenetBase if args.efficientnet: base = EfficientnetBase obs_shape = envs.observation_space.shape actor_critic = Policy( obs_shape, envs.action_space, base_kwargs={"recurrent": args.recurrent_policy}, navi=args.navi, base=base, hidden_size=args.hidden_size, n_layers=args.n_layers, ) actor_critic.to(device) if args.algo == "a2c": agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, ) elif args.algo == "ppo": agent = algo.PPO( actor_critic, args.clip_param,
def main(): args = get_args() torch.manual_seed(config.seed) torch.cuda.manual_seed_all(config.seed) if config.cuda and torch.cuda.is_available() and config.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True logger, final_output_dir, tb_log_dir = create_logger(config, args.cfg, 'train', seed=config.seed) eval_log_dir = final_output_dir + "_eval" utils.cleanup_log_dir(final_output_dir) utils.cleanup_log_dir(eval_log_dir) logger.info(pprint.pformat(args)) logger.info(pprint.pformat(config)) writer = SummaryWriter(tb_log_dir) torch.set_num_threads(1) device = torch.device("cuda:" + config.GPUS if config.cuda else "cpu") width = height = 84 envs = make_vec_envs(config.env_name, config.seed, config.num_processes, config.gamma, final_output_dir, device, False, width=width, height=height, ram_wrapper=False) # create agent actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={ 'recurrent': config.recurrent_policy, 'hidden_size': config.hidden_size, 'feat_from_selfsup_attention': config.feat_from_selfsup_attention, 'feat_add_selfsup_attention': config.feat_add_selfsup_attention, 'feat_mul_selfsup_attention_mask': config.feat_mul_selfsup_attention_mask, 'selfsup_attention_num_keypoints': config.SELFSUP_ATTENTION.NUM_KEYPOINTS, 'selfsup_attention_gauss_std': config.SELFSUP_ATTENTION.GAUSS_STD, 'selfsup_attention_fix': config.selfsup_attention_fix, 'selfsup_attention_fix_keypointer': config.selfsup_attention_fix_keypointer, 'selfsup_attention_pretrain': config.selfsup_attention_pretrain, 'selfsup_attention_keyp_maps_pool': config.selfsup_attention_keyp_maps_pool, 'selfsup_attention_image_feat_only': config.selfsup_attention_image_feat_only, 'selfsup_attention_feat_masked': config.selfsup_attention_feat_masked, 'selfsup_attention_feat_masked_residual': config.selfsup_attention_feat_masked_residual, 'selfsup_attention_feat_load_pretrained': config.selfsup_attention_feat_load_pretrained, 'use_layer_norm': config.use_layer_norm, 'selfsup_attention_keyp_cls_agnostic': config.SELFSUP_ATTENTION.KEYPOINTER_CLS_AGNOSTIC, 'selfsup_attention_feat_use_ln': config.SELFSUP_ATTENTION.USE_LAYER_NORM, 'selfsup_attention_use_instance_norm': config.SELFSUP_ATTENTION.USE_INSTANCE_NORM, 'feat_mul_selfsup_attention_mask_residual': config.feat_mul_selfsup_attention_mask_residual, 'bottom_up_form_objects': config.bottom_up_form_objects, 'bottom_up_form_num_of_objects': config.bottom_up_form_num_of_objects, 'gaussian_std': config.gaussian_std, 'train_selfsup_attention': config.train_selfsup_attention, 'block_selfsup_attention_grad': config.block_selfsup_attention_grad, 'sep_bg_fg_feat': config.sep_bg_fg_feat, 'mask_threshold': config.mask_threshold, 'fix_feature': config.fix_feature }) # init / load parameter if config.MODEL_FILE: logger.info('=> loading model from {}'.format(config.MODEL_FILE)) state_dict = torch.load(config.MODEL_FILE) state_dict = OrderedDict( (_k, _v) for _k, _v in state_dict.items() if 'dist' not in _k) actor_critic.load_state_dict(state_dict, strict=False) elif config.RESUME: checkpoint_file = os.path.join(final_output_dir, 'checkpoint.pth') if os.path.exists(checkpoint_file): logger.info("=> loading checkpoint '{}'".format(checkpoint_file)) checkpoint = torch.load(checkpoint_file) actor_critic.load_state_dict(checkpoint['state_dict']) logger.info("=> loaded checkpoint '{}' (epoch {})".format( checkpoint_file, checkpoint['epoch'])) actor_critic.to(device) if config.algo == 'a2c': agent = algo.A2C_ACKTR( actor_critic, config.value_loss_coef, config.entropy_coef, lr=config.lr, eps=config.eps, alpha=config.alpha, max_grad_norm=config.max_grad_norm, train_selfsup_attention=config.train_selfsup_attention) elif config.algo == 'ppo': agent = algo.PPO(actor_critic, config.clip_param, config.ppo_epoch, config.num_mini_batch, config.value_loss_coef, config.entropy_coef, lr=config.lr, eps=config.eps, max_grad_norm=config.max_grad_norm) elif config.algo == 'acktr': agent = algo.A2C_ACKTR( actor_critic, config.value_loss_coef, config.entropy_coef, acktr=True, train_selfsup_attention=config.train_selfsup_attention, max_grad_norm=config.max_grad_norm) # rollouts: environment rollouts = RolloutStorage( config.num_steps, config.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, keep_buffer=config.train_selfsup_attention, buffer_size=config.train_selfsup_attention_buffer_size) if config.RESUME: if os.path.exists(checkpoint_file): agent.optimizer.load_state_dict(checkpoint['optimizer']) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( config.num_env_steps) // config.num_steps // config.num_processes best_perf = 0.0 best_model = False print('num updates', num_updates, 'num steps', config.num_steps) for j in range(num_updates): if config.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if config.algo == "acktr" else config.lr) for step in range(config.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) recurrent_hidden_states, meta = recurrent_hidden_states # Obser reward and next obs obs, reward, done, infos = envs.step(action) objects_locs = [] for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) if objects_locs: objects_locs = torch.FloatTensor(objects_locs) objects_locs = objects_locs * 2 - 1 # -1, 1 else: objects_locs = None rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks, objects_loc=objects_locs) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1], ).detach() rollouts.compute_returns(next_value, config.use_gae, config.gamma, config.gae_lambda, config.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() if config.train_selfsup_attention and j > 15: for _iter in range(config.num_steps // 5): frame_x, frame_y = rollouts.generate_pair_image() selfsup_attention_loss, selfsup_attention_output, image_b_keypoints_maps = \ agent.update_selfsup_attention(frame_x, frame_y, config.SELFSUP_ATTENTION) if j % config.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * config.num_processes * config.num_steps end = time.time() msg = 'Updates {}, num timesteps {}, FPS {} \n' \ 'Last {} training episodes: mean/median reward {:.1f}/{:.1f} ' \ 'min/max reward {:.1f}/{:.1f} ' \ 'dist entropy {:.1f}, value loss {:.1f}, action loss {:.1f}\n'. \ format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss) if config.train_selfsup_attention and j > 15: msg = msg + 'selfsup attention loss {:.5f}\n'.format( selfsup_attention_loss) logger.info(msg) if (config.eval_interval is not None and len(episode_rewards) > 1 and j % config.eval_interval == 0): total_num_steps = (j + 1) * config.num_processes * config.num_steps ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None) eval_mean_score, eval_max_score, eval_scores = evaluate( actor_critic, ob_rms, config.env_name, config.seed, config.num_processes, eval_log_dir, device, width=width, height=height) perf_indicator = eval_mean_score if perf_indicator > best_perf: best_perf = perf_indicator best_model = True else: best_model = False # record test scores with open(os.path.join(final_output_dir, 'test_scores'), 'a+') as f: out_s = "TEST: {}, {}, {}, {}\n".format( str(total_num_steps), str(eval_mean_score), str(eval_max_score), [str(_eval_scores) for _eval_scores in eval_scores]) print(out_s, end="", file=f) logger.info(out_s) writer.add_scalar('data/mean_score', eval_mean_score, total_num_steps) writer.add_scalar('data/max_score', eval_max_score, total_num_steps) writer.add_scalars('test', {'mean_score': eval_mean_score}, total_num_steps) # save for every interval-th episode or for the last epoch if (j % config.save_interval == 0 or j == num_updates - 1) and config.save_dir != "": logger.info( "=> saving checkpoint to {}".format(final_output_dir)) epoch = j / config.save_interval save_checkpoint( { 'epoch': epoch + 1, 'model': get_model_name(config), 'state_dict': actor_critic.state_dict(), 'perf': perf_indicator, 'optimizer': agent.optimizer.state_dict(), 'ob_rms': getattr(utils.get_vec_normalize(envs), 'ob_rms', None) }, best_model, final_output_dir) final_model_state_file = os.path.join(final_output_dir, 'final_state.pth') logger.info( '=> saving final model state to {}'.format(final_model_state_file)) torch.save(actor_critic.state_dict(), final_model_state_file) # export_scalars_to_json needs results from add scalars writer.export_scalars_to_json(os.path.join(tb_log_dir, 'all_scalars.json')) writer.close()
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, True) frame_skip = 4 # frame skip if args.tb_dir[-1] != '/': args.tb_dir = args.tb_dir + '/' logger = Logger(args.tb_dir) logger.write_settings(args) if args.use_tdm: # beta scheduler if args.beta_schedule == 'const': beta_func = lambda x: float(args.beta_int) elif args.beta_schedule == 'sqrt': beta_func = lambda x: 1. / np.sqrt(x + 2) elif args.beta_schedule == 'log': beta_func = lambda x: 1. / np.log(x + 2) elif args.beta_schedule == 'linear': beta_func = lambda x: 1. / (x + 2) # bonus function variations if args.bonus_func == 'linear': bonus_func = lambda x: x + 1 elif args.bonus_func == 'square': bonus_func = lambda x: (x + 1)**2 elif args.bonus_func == 'sqrt': bonus_func = lambda x: (x + 1)**(1 / 2) elif args.bonus_func == 'log': bonus_func = lambda x: np.log(x + 1) # temporal difference module tdm = TemporalDifferenceModule( inputSize=2 * int(envs.observation_space.shape[0]), outputSize=args.time_intervals, num_fc_layers=int(args.num_layers), depth_fc_layers=int(args.fc_width), lr=float(args.opt_lr), buffer_max_length=args.buffer_max_length, buffer_RL_ratio=args.buffer_RL_ratio, frame_skip=frame_skip, tdm_epoch=args.tdm_epoch, tdm_batchsize=args.tdm_batchsize, logger=logger, bonus_func=bonus_func).to(device) #collect random trajectories sample_collector = CollectSamples(envs, args.num_processes, initial=True) tdm.buffer_rand = sample_collector.collect_trajectories( args.num_rollouts, args.steps_per_rollout) # initial training tdm.update() actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_clip_decay: agent.clip_param = args.clip_param * (1 - j / float(num_updates)) # acting for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs # envs.render() obs_old = obs.clone() obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) #compute intrinsic bonus if args.use_tdm: tdm.symm_eval = True if step == args.num_steps - 1 else False reward_int = tdm.compute_bonus(obs_old, obs).float() reward += beta_func( step + j * args.num_steps) * reward_int.cpu().unsqueeze(1) if (j % args.log_interval == 0) and (step == args.num_steps - 1): logger.add_reward_intrinsic(reward_int, (j + 1) * args.num_steps * args.num_processes) #saving to buffer. rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) # saving to buffer and periodic updating parameters if (args.use_tdm): tdm.buffer_RL_temp.append((rollouts.obs, rollouts.masks)) if (j % args.num_steps == 0 and j > 0): tdm.update() with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch # no # save every 1-million steps if (((j + 1) * args.num_steps * args.num_processes) % 1e6 == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] if j == num_updates - 1: save_here = os.path.join( save_path, args.env_name + "_step_{}M.pt".format( (j + 1) * args.num_steps * args.num_processes // 1e6)) else: save_here = os.path.join(save_path, args.env_name + "_final.pt") torch.save(save_model, save_here) # saved policy. total_num_steps = (j + 1) * args.num_processes * args.num_steps # printing outputs if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) logger.add_reward(episode_rewards, (j + 1) * args.num_steps * args.num_processes) # # if j % args.tb_interval == 0: # # mean/std or median/1stqt? # logger.add_tdm_loss(loss, self.epoch_count*i) # evaluation process # if (args.eval_interval is not None # and len(episode_rewards) > 1 # and j % args.eval_interval == 0): # eval_envs = make_vec_envs( # args.env_name, args.seed + args.num_processes, args.num_processes, # args.gamma, eval_log_dir, args.add_timestep, device, True) # # vec_norm = get_vec_normalize(eval_envs) # if vec_norm is not None: # vec_norm.eval() # vec_norm.ob_rms = get_vec_normalize(envs).ob_rms # # eval_episode_rewards = [] # # obs = eval_envs.reset() # eval_recurrent_hidden_states = torch.zeros(args.num_processes, # actor_critic.recurrent_hidden_state_size, device=device) # eval_masks = torch.zeros(args.num_processes, 1, device=device) # # while len(eval_episode_rewards) < 10: # with torch.no_grad(): # _, action, _, eval_recurrent_hidden_states = actor_critic.act( # obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # # # Obser reward and next obs # # envs.render() # obs, reward, done, infos = eval_envs.step(action) # # eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] # for done_ in done]) # for info in infos: # if 'episode' in info.keys(): # eval_episode_rewards.append(info['episode']['r']) # # eval_envs.close() # # print(" Evaluation using {} episodes: mean reward {:.5f}\n". # format(len(eval_episode_rewards), # np.mean(eval_episode_rewards))) # # plotting # if args.vis and j % args.vis_interval == 0: # try: # # Sometimes monitor doesn't properly flush the outputs # win = visdom_plot(viz, win, args.log_dir, args.env_name, # args.algo, args.num_env_steps) # except IOError: # pass #if done save::::::::::: logger.save()
def main(): ARGUMENTS.update(vars(args)) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_lr_decay: agent.clip_param = args.clip_param * (1 - j / float(num_updates)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value(rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [save_model, getattr(get_vec_normalize(envs), 'ob_rms', None)] torch.save(save_model, os.path.join(save_path, args.env_name + ".pt")) total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() print("Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n". format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) ALL_UPDATES.append(j) ALL_TIMESTEPS.append(total_num_steps) ALL_FPS.append(int(total_num_steps / (end - start))) ALL_MEAN_REWARDS.append(np.mean(episode_rewards)) ALL_MEDIAN_REWARDS.append(np.median(episode_rewards)) ALL_MIN_REWARDS.append(np.min(episode_rewards)) ALL_MAX_REWARDS.append(np.max(episode_rewards)) ALL_DIST_ENTROPY.append(dist_entropy) ALL_VALUE_LOSS.append(value_loss) ALL_ACTION_LOSS.append(action_loss) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs( args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros(args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n". format(len(eval_episode_rewards), np.mean(eval_episode_rewards))) if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_env_steps) except IOError: pass # Save the results name = ARGUMENTS['env_name'] + '-' + ARGUMENTS['algo'] + '-' + ARGUMENTS['experiment'] + '-grad_noise' + str(ARGUMENTS['gradient_noise']) experiment = ro.Experiment(name, directory='results') data = { 'updates': ALL_UPDATES, 'timesteps': ALL_TIMESTEPS, 'fps': ALL_FPS, 'mean_rewards': ALL_MEAN_REWARDS, 'median_rewards': ALL_MEDIAN_REWARDS, 'min_rewards': ALL_MIN_REWARDS, 'max_rewards': ALL_MAX_REWARDS, 'dist_entropy': ALL_DIST_ENTROPY, 'value_loss': ALL_VALUE_LOSS, 'action_loss': ALL_ACTION_LOSS, } data.update(ARGUMENTS) result = data['mean_rewards'][-1] experiment.add_result(result, data)
def main(): args = get_args() torch.set_num_threads(1) # device = torch.device("cuda:0" if args.cuda else "cpu") device = torch.device("cpu") # args.env_name = 'Pong-ramNoFrameskip-v4' args.env_name = 'Pong-ram-v0' args.num_processes = 2 envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) # ss('here') actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) print(args.recurrent_policy) print(args.clip_param) print(args.ppo_epoch) print('ccccccccc') print(args.num_mini_batch) print(args.value_loss_coef) print(args.entropy_coef) print('dddddddddddd') print(args.lr) print(args.eps) print(args.max_grad_norm) ss('in main, after actor_critic') args.num_mini_batch = 2 agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) # ss('out of define ppo') args.num_steps = 4 rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) # ss('rollouts') obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes # print(args.num_env_steps) # print() # ss('pp') sum_re = torch.zeros(args.num_processes, 1) # print(sum_re.shape) for j in range(num_updates): # ss('pp') is_any_done = False for step in range(args.num_steps): # for step in range(50000): # print(step) # ss('pp') # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # print(value) # print(action_log_prob) # print(action) # ss('runner') # Obser reward and next obs obs, reward, done, infos = envs.step(action) sum_re += reward # print('- --'*20) # print(reward) # print(sum_re) # print() # print(reward.shape) if any(done): # print(sum_re) # print(done) # input('hi') # is_any_done = True for i in range(len(done)): if done[i]: # print(i) # print(*sum_re[i]) # print(sum_re[i].item()) episode_rewards.append(sum_re[i].item()) # print(sum_re[i]) sum_re[i] *= 0 # pass # episode_rewards.append(reward.item()) # ss('make reward') # print(infos) # ss('runner') for info in infos: # print(info) if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) print('what env info with episode do?', info.keys()) # ss('break') # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) # ss('runner') with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, is_any_done, args.use_proper_time_limits) # ss('runner1') value_loss, action_loss, dist_entropy = agent.update(rollouts) # ss('runner1') rollouts.after_update() # ss('runner2') # save for every interval-th episode or for the last epoch # if (j % args.save_interval == 0 # or j == num_updates - 1) and args.save_dir != "": # save_path = os.path.join(args.save_dir, args.algo) # try: # os.makedirs(save_path) # except OSError: # pass # # torch.save([ # actor_critic, # getattr(utils.get_vec_normalize(envs), 'ob_rms', None) # ], os.path.join(save_path, args.env_name + ".pt")) # print(args.log_interval) args.log_interval = 100 if j % args.log_interval == 0 and len(episode_rewards) > 1: # if j % args.log_interval == 0: # and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n Ent {},V {},A {}" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss))
def main(): torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, args.add_timestep, device, False) if args.load_policy is not None: actor_critic, ob_rms = torch.load(args.load_policy) vec_norm = get_vec_normalize(envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = ob_rms else: actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque( maxlen=(args.num_processes if args.num_processes > 10 else 10)) start = time.time() snapshot_counter = 0 last_delete = -1 try: os.makedirs(os.path.join(args.save_dir, args.algo)) except OSError: pass log_out_file = open(os.path.join(args.save_dir, args.algo, 'log_info.txt'), 'w') for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly if args.algo == "acktr": # use optimizer's learning rate since it's hard-coded in kfac.py update_linear_schedule(agent.optimizer, j, num_updates, agent.optimizer.lr) else: update_linear_schedule(agent.optimizer, j, num_updates, args.lr) if args.algo == 'ppo' and args.use_linear_clip_decay: agent.clip_param = args.clip_param * (1 - j / float(num_updates)) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, getattr(get_vec_normalize(envs), 'ob_rms', None) ] torch.save( save_model, os.path.join(save_path, args.env_name + "epoch_{:07d}.pt".format(j))) snapshot_counter += 1 last_delete += 1 if snapshot_counter > 100: os.system('rm ' + os.path.join( save_path, args.env_name + 'epoch_{:07d}.py'.format(last_delete))) snapshot_counter -= 1 total_num_steps = (j + 1) * args.num_processes * args.num_steps if j % args.log_interval == 0 and len(episode_rewards) > 1: end = time.time() log_info = "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n".\ format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss) print(log_info) sys.stdout.flush() log_out_file.write(log_info) log_out_file.flush() if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): eval_envs = make_vec_envs(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, args.add_timestep, device, True) vec_norm = get_vec_normalize(eval_envs) if vec_norm is not None: vec_norm.eval() vec_norm.ob_rms = get_vec_normalize(envs).ob_rms eval_episode_rewards = [] obs = eval_envs.reset() eval_recurrent_hidden_states = torch.zeros( args.num_processes, actor_critic.recurrent_hidden_state_size, device=device) eval_masks = torch.zeros(args.num_processes, 1, device=device) while len(eval_episode_rewards) < 10: with torch.no_grad(): _, action, _, eval_recurrent_hidden_states = actor_critic.act( obs, eval_recurrent_hidden_states, eval_masks, deterministic=True) # Obser reward and next obs obs, reward, done, infos = eval_envs.step(action) eval_masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) for info in infos: if 'episode' in info.keys(): eval_episode_rewards.append(info['episode']['r']) eval_envs.close() print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) log_out_file.write( " Evaluation using {} episodes: mean reward {:.5f}\n".format( len(eval_episode_rewards), np.mean(eval_episode_rewards))) log_out_file.flush() sys.stdout.flush() if args.vis and j % args.vis_interval == 0: try: # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.log_dir, args.env_name, args.algo, args.num_env_steps) except IOError: pass
def train(train_states, run_dir, num_env_steps, eval_env_steps, writer, writer_name, args, init_model=None): envs = make_vec_envs(train_states, args.seed, args.num_processes, args.gamma, 'cpu', 'train', args) if init_model: actor_critic, env_step, model_name = init_model obs_space = actor_critic.obs_space obs_process = actor_critic.obs_process obs_module = actor_critic.obs_module print(f" [load] Loaded model {model_name} at step {env_step}") else: obs_space = envs.observation_space actor_critic = Policy(obs_space, args.obs_process, args.obs_module, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) env_step = 0 actor_critic.to(args.device) #print(actor_critic) run_name = run_dir.replace('/', '_') vid_save_dir = f"{run_dir}/videos/" try: os.makedirs(vid_save_dir) except OSError: pass ckpt_save_dir = f"{run_dir}/ckpts/" try: os.makedirs(ckpt_save_dir) except OSError: pass if args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, args.device, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, acktr=False) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, acktr=True) else: raise NotImplementedError rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() actor_critic.eval() """ try: writer.add_graph(actor_critic, obs) except ValueError: print("Unable to write model graph to tensorboard.") """ actor_critic.train() for k in rollouts.obs.keys(): rollouts.obs[k][0].copy_(obs[k][0]) episode_rewards = deque(maxlen=10) num_updates = num_env_steps // args.num_steps // args.num_processes batch_size = args.num_steps * args.num_processes start = time.time() while env_step < num_env_steps: s = time.time() if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states, _ = actor_critic.act( { k: rollouts.obs[k][step].float().to(args.device) for k in rollouts.obs.keys() }, rollouts.recurrent_hidden_states[step].to(args.device), rollouts.masks[step].to(args.device)) value = value.cpu() action = action.cpu() action_log_prob = action_log_prob.cpu() recurrent_hidden_states = recurrent_hidden_states.cpu() # Observe reward and next obs obs, reward, dones, infos = envs.step(action) for done, info in zip(dones, infos): env_state = info['env_state'][1] if done: writer.add_scalar(f'train_episode_x/{env_state}', info['max_x'], env_step) writer.add_scalar(f'train_episode_%/{env_state}', info['max_x'] / info['lvl_max_x'] * 100, env_step) writer.add_scalar(f'train_episode_r/{env_state}', info['sum_r'], env_step) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done else [1.0] for done in dones]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( { k: rollouts.obs[k][-1].float().to(args.device) for k in rollouts.obs.keys() }, rollouts.recurrent_hidden_states[-1].to(args.device), rollouts.masks[-1].to(args.device)).detach().cpu() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() env_step += batch_size fps = batch_size / (time.time() - s) #res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle) #writer.add_scalar(f'gpu_usage/{writer_name}', res.gpu, env_step) #writer.add_scalar(f'gpu_mem/{writer_name}', res.memory, env_step) total_norm = 0 for p in list( filter(lambda p: p.grad is not None, actor_critic.parameters())): param_norm = p.grad.data.norm(2) total_norm += param_norm.item()**2 total_norm = total_norm**(1. / 2) obs_norm = {} for obs_name in args.obs_keys: t_norm = 0 if obs_name == 'video': md = actor_critic.base.video_module elif obs_name == 'audio': md = actor_critic.base.audio_module else: raise NotImplementedError for p in list(filter(lambda p: p.grad is not None, md.parameters())): param_norm = p.grad.data.norm(2) t_norm += param_norm.item()**2 obs_norm[obs_name] = t_norm**(1. / 2) prev_env_step = max(0, env_step + 1 - batch_size) # write training metrics for this batch, usually takes 0.003s if (env_step + 1 ) // args.write_interval > prev_env_step // args.write_interval: writer.add_scalar(f'grad_norm/{writer_name}', total_norm, env_step) writer.add_scalar(f'fps/{writer_name}', fps, env_step) writer.add_scalar(f'value_loss/{writer_name}', value_loss / batch_size, env_step) writer.add_scalar(f'action_loss/{writer_name}', action_loss / batch_size, env_step) writer.add_scalar(f'dist_entropy/{writer_name}', dist_entropy / batch_size, env_step) writer.add_scalar(f'cpu_usage/{writer_name}', psutil.cpu_percent(), env_step) writer.add_scalar(f'cpu_mem/{writer_name}', psutil.virtual_memory()._asdict()['percent'], env_step) for obs_name in args.obs_keys: writer.add_scalar(f'grad_norm_{obs_name}/{writer_name}', obs_norm[obs_name], env_step) # print log to console if (env_step + 1) // args.log_interval > prev_env_step // args.log_interval: end = time.time() print(" [log] Env step {} of {}: {:.1f}s, {:.1f}fps".format( env_step + 1, num_env_steps, end - start, fps)) if len(episode_rewards) > 0: print( " Last {} episodes: mean/med reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}" .format(len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards))) print( " dist_entropy {:.5f}, value_loss {:.6f}, action_loss {:.6f}, grad_norm {:.6f}" .format(dist_entropy, value_loss, action_loss, total_norm)) start = time.time() # save model to ckpt if ((env_step + 1) // args.save_interval > prev_env_step // args.save_interval): torch.save([ actor_critic, env_step, run_name, ], os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt")) print(f" [save] Saved model at step {env_step+1}.") # save model to ckpt and run evaluation if eval_interval and not final iteration in training loop if ((env_step + 1) // args.eval_interval > prev_env_step // args.eval_interval ) and env_step < num_env_steps and eval_env_steps > 0: torch.save([ actor_critic, env_step, run_name, ], os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt")) print(f" [save] Saved model at step {env_step+1}.") envs.close() del envs # close does not actually get rid of envs, need to del actor_critic.eval() eval_score, e_dict = evaluate(train_states, actor_critic, eval_env_steps, env_step, writer, vid_save_dir, args.vid_tb_steps, args.vid_file_steps, args.obs_viz_layer, args) print(f" [eval] Evaluation score: {eval_score}") writer.add_scalar('eval_score', eval_score, env_step) actor_critic.train() envs = make_vec_envs(train_states, args.seed, args.num_processes, args.gamma, 'cpu', 'train', args) obs = envs.reset() # TODO: does this work? do we need to increment env step or something? whydden_states insert at 0 for k in rollouts.obs.keys(): rollouts.obs[k][0].copy_(obs[k][0]) # final model save final_model_path = os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt") torch.save([ actor_critic, env_step, run_name, ], final_model_path) print( f" [save] Final model saved at step {env_step+1} to {final_model_path}" ) # final model eval envs.close() del envs eval_score = None eval_dict = None if eval_env_steps > 0: eval_score, eval_dict = evaluate(train_states, actor_critic, eval_env_steps, env_step, writer, vid_save_dir, args.vid_tb_steps, args.vid_file_steps, args.obs_viz_layer, args) print(f" [eval] Final model evaluation score: {eval_score:.3f}") return (actor_critic, env_step, run_name), eval_score, eval_dict
def main(): args = get_args() writer = SummaryWriter(os.path.join('logs', args.save_name), ) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs( basic_env.BasicFlatDiscreteEnv, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, task='lift', gripper_type='RobotiqThreeFingerDexterousGripper', robot='Panda', controller='JOINT_TORQUE' if args.vel else 'JOINT_POSITION', horizon=1000, reward_shaping=True) actor_critic = Policy( envs.observation_space.shape, envs.action_space, base=Surreal, # base=OpenAI, # base=MLP_ATTN, base_kwargs={ 'recurrent': args.recurrent_policy, # 'dims': basic_env.BasicFlatEnv().modality_dims 'config': dict(act='relu' if args.relu else 'tanh', rec=args.rec, fc=args.fc) }) print(actor_critic) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=100) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes best_reward = 0 for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) writer.add_scalar('lr', agent.optimizer.param_groups[0]['lr']) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) end = time.time() total_num_steps = (j + 1) * args.num_processes * args.num_steps if len(episode_rewards) > 1: writer.add_scalar('loss/value', value_loss, total_num_steps) writer.add_scalar('loss/policy', action_loss, total_num_steps) writer.add_scalar('experiment/num_updates', j, total_num_steps) writer.add_scalar('experiment/FPS', int(total_num_steps / (end - start)), total_num_steps) writer.add_scalar('experiment/EPISODE MEAN', np.mean(episode_rewards), total_num_steps) writer.add_scalar('experiment/EPISODE MEDIAN', np.median(episode_rewards), total_num_steps) writer.add_scalar('experiment/EPISODE MIN', np.min(episode_rewards), total_num_steps) writer.add_scalar('experiment/EPSIDOE MAX', np.max(episode_rewards), total_num_steps) rollouts.after_update() # save for every interval-th episode or for the last epoch if len(episode_rewards) > 1 and args.save_dir != "": rew = np.mean(episode_rewards) if rew > best_reward: best_reward = rew print('saved with best reward', rew) save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'obs_rms', None) ], os.path.join(save_path, args.save_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): obs_rms = utils.get_vec_normalize(envs).obs_rms evaluate(actor_critic, obs_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device) writer.close()
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir + args.env_name) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) log_dir2 = os.path.expanduser(args.log_dir2 + args.env_name2) eval_log_dir2 = log_dir + "_eval" utils.cleanup_log_dir(log_dir2) utils.cleanup_log_dir(eval_log_dir2) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") import json file_path = "config.json" setup_json = json.load(open(file_path, 'r')) env_conf = setup_json["Default"] for i in setup_json.keys(): if i in args.env_name: env_conf = setup_json[i] # 1 game envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, env_conf, False) # 2 game envs2 = make_vec_envs(args.env_name2, args.seed, args.num_processes, args.gamma, args.log_dir2, device, env_conf, False) save_model, ob_rms = torch.load('./trained_models/PongNoFrameskip-v4.pt') from a2c_ppo_acktr.cnn import CNNBase a = CNNBase(envs.observation_space.shape[0], recurrent=False) actor_critic = Policy( envs.observation_space.shape, envs.action_space, #(obs_shape[0], ** base_kwargs) base=a, #base_kwargs={'recurrent': args.recurrent_policy} ) #actor_critic.load_state_dict(save_model.state_dict()) actor_critic.to(device) actor_critic2 = Policy(envs2.observation_space.shape, envs2.action_space, base=a) #base_kwargs={'recurrent': args.recurrent_policy}) #actor_critic2.load_state_dict(save_model.state_dict()) actor_critic2.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, actor_critic2, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) rollouts2 = RolloutStorage(args.num_steps, args.num_processes, envs2.observation_space.shape, envs2.action_space, actor_critic2.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) obs2 = envs2.reset() rollouts2.obs[0].copy_(obs2) rollouts2.to(device) episode_rewards = deque(maxlen=10) episode_rewards2 = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): # if args.use_linear_lr_decay: # # decrease learning rate linearly # utils.update_linear_schedule( # agent.optimizer, j, num_updates, # agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states, _ = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) value2, action2, action_log_prob2, recurrent_hidden_states2, _ = actor_critic2.act( rollouts2.obs[step], rollouts2.recurrent_hidden_states[step], rollouts2.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) obs2, reward2, done2, infos2 = envs2.step(action2) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) for info2 in infos2: if 'episode' in info2.keys(): episode_rewards2.append(info2['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) masks2 = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done2]) bad_masks2 = torch.FloatTensor( [[0.0] if 'bad_transition' in info2.keys() else [1.0] for info2 in infos2]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) rollouts2.insert(obs2, recurrent_hidden_states2, action2, action_log_prob2, value2, reward2, masks2, bad_masks2) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() next_value2 = actor_critic2.get_value( rollouts2.obs[-1], rollouts2.recurrent_hidden_states[-1], rollouts2.masks[-1]).detach() rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) rollouts2.compute_returns(next_value2, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy, value_loss2, action_loss2, dist_entropy2 = agent.update( rollouts, rollouts2) rollouts.after_update() rollouts2.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) torch.save([ actor_critic2, getattr(utils.get_vec_normalize(envs2), 'ob_rms2', None) ], os.path.join(save_path, args.env_name2 + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards2), np.mean(episode_rewards2), np.median(episode_rewards2), np.min(episode_rewards2), np.max(episode_rewards2), dist_entropy2, value_loss2, action_loss2)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device) ob_rms2 = utils.get_vec_normalize(envs2).ob_rms evaluate(actor_critic2, ob_rms2, args.env_name2, args.seed, args.num_processes, eval_log_dir2, device)