def main(): realEval = True #False gettrace = getattr(sys, 'gettrace', None) parser = argparse.ArgumentParser(description='RL') parser.add_argument('--action-type', type=int, default=-1, help='action type to play (default: -1)') parser.add_argument('--tasks-difficulty-from', type=int, default=0, help='tasks_difficulty_from') parser.add_argument('--tasks-difficulty-to', type=int, default=100000, help='tasks-difficulty-to') parser.add_argument('--verboseLevel', type=int, default=5, help='verboseLevel') parser.add_argument('--filesNamesSuffix', default="", help='filesNamesSuffix') parser.add_argument('--nobest-exit', type=int, default=10000, help='nobest_exit') args = get_args(parser) args.algo = 'ppo' args.env_name = 'QuadruppedWalk-v1' #'RoboschoolAnt-v1' #'QuadruppedWalk-v1' #'RoboschoolAnt-v1' #'QuadruppedWalk-v1' args.use_gae = True args.num_steps = 2048 #args.num_processes = 4 args.num_processes = 4 if gettrace(): args.num_processes = 1 args.lr = 0.0001 args.entropy_coef = 0.0 args.value_loss_coef = 0.5 args.ppo_epoch = 4 args.num_mini_batch = 256 args.gamma = 0.99 args.gae_lambda = 0.95 args.clip_param = 0.2 args.use_linear_lr_decay = True #True #True #True #True args.use_proper_time_limits = True args.save_dir = "./trained_models/" + args.env_name + "/" args.load_dir = "./trained_models/" + args.env_name + "/" args.log_dir = "./logs/robot" if gettrace(): args.save_dir = "./trained_models/" + args.env_name + "debug/" args.load_dir = "./trained_models/" + args.env_name + "debug/" args.log_dir = "./logs/robot_d" args.log_interval = 30 args.hidden_size = 64 args.last_hidden_size = args.hidden_size args.recurrent_policy = False #True args.save_interval = 20 #args.seed = 1 reward_shaping = 0.01 allowMutate = False if args.seed == -1: args.seed = time.clock_gettime_ns(time.CLOCK_REALTIME) quadruppedEnv.settings.tasks_difficulty_from = args.tasks_difficulty_from quadruppedEnv.settings.tasks_difficulty_to = args.tasks_difficulty_to # 0 is a walk # 1 is a balance # 2 multitasks # 3 multitask experiments trainType = 14 filesNamesSuffix = "" if args.action_type >= 0: trainType = args.action_type makeEnvFunction = makeEnv.make_env_with_best_settings if trainType == 1: filesNamesSuffix = "balance_" makeEnvFunction = makeEnv.make_env_for_balance if trainType == 2: filesNamesSuffix = "analytical_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical if trainType == 3: filesNamesSuffix = "analytical2_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical2 if trainType == 4: filesNamesSuffix = "frontback_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_front_back if trainType == 5: filesNamesSuffix = "leftright_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_left_right if trainType == 6: filesNamesSuffix = "all_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_all if trainType == 7: filesNamesSuffix = "rotate_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_rotate if trainType == 8: filesNamesSuffix = "compound_" makeEnvFunction = make_env_multinetwork if trainType == 9: import pickle realEval = False allowMutate = False args.use_linear_lr_decay = True #False args.num_env_steps = 5000000 filesNamesSuffix = "test_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_test if trainType == 10: import pickle realEval = False allowMutate = False args.use_linear_lr_decay = True #False args.num_env_steps = 5000000 filesNamesSuffix = "zoo_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_test_zoo if trainType == 11: 
args.hidden_size = 128 #64 #128 args.last_hidden_size = args.hidden_size import pickle if gettrace(): args.num_processes = 1 else: args.num_processes = 8 realEval = False allowMutate = False args.lr = 0.00001 args.use_linear_lr_decay = True #False args.num_env_steps = 10000000 filesNamesSuffix = "zigote2_updown_" print("Samples preload") global samplesEnvData samplesEnvData = pickle.load( open("./QuadruppedWalk-v1_MoveNoPhys.samples", "rb")) # samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1.samples", "rb" ) ) makeEnvFunction = makeSamplesEnv if trainType == 12: import pickle args.lr = 0.00001 args.hidden_size = 64 args.last_hidden_size = args.hidden_size filesNamesSuffix = "zigote2_front_back_" args.clip_param = 0.9 args.value_loss_coef = 0.9 makeEnvFunction = makeEnv.make_env_with_best_settings_for_train #makeEnvFunction = makeEnv.make_env_with_best_settings_for_record #makeEnv.samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1_MoveNoPhys.samples", "rb" ) ) if trainType == 13: filesNamesSuffix = "all_bytasks_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_all if trainType == 14: #args.lr = 0.00001 #args.num_env_steps = 000000 #args.clip_param = 0.5 #args.value_loss_coef =0.8 #random.seed(time.clock_gettime_ns(time.CLOCK_REALTIME)) #args.num_steps = random.choice([256,512,1024,2048,4096]) #args.num_mini_batch = random.choice([32,64,256,512]) #args.ppo_epoch = random.choice([2,4,8,10]) #args.clip_param = random.choice([0.2,0.4,0.6,0.8]) #args.value_loss_coef =random.choice([0.4,0.5,0.6,0.8]) #args.lr = random.choice([0.00001,0.0001,0.00005,0.0005]) args.num_steps = 2048 args.num_mini_batch = 64 args.ppo_epoch = 8 args.lr = 0.0001 args.hidden_size = 64 args.last_hidden_size = args.hidden_size # filesNamesSuffix = args.filesNamesSuffix makeEnvFunction = makeEnv.make_env_with_best_settings_for_all ''' num_steps: 1024 num_mini_batch 64 ppo_epoch 2 clip_param: 0.2 value_loss_coef 0.6 lr 0.0001 ''' if trainType == 15: args.num_env_steps = 5000000 filesNamesSuffix = "zigote_updown_" makeEnvFunction = makeEnv.make_env_with_best_settings_for_train_analytic if trainType == 16: args.lr = 0.00001 filesNamesSuffix = "compound_tasks_" makeEnvFunction = make_env_multinetwork reward_shaper = DefaultRewardsShaper(scale_value=reward_shaping) print("ActionType ", trainType, " ", filesNamesSuffix, "seed", args.seed, "num env steps:", args.num_env_steps, " tasks_dif", args.tasks_difficulty_from, args.tasks_difficulty_to) print("Num processes:", args.num_processes) print("num_steps:", args.num_steps, "num_mini_batch", args.num_mini_batch, "ppo_epoch", args.ppo_epoch) print("clip_param:", args.clip_param, "value_loss_coef", args.value_loss_coef, "lr", args.lr) random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True args.log_dir = "/tmp/tensorboard/" #TesnorboardX writer = SummaryWriter(log_dir=args.log_dir + 'runs/{}_PPO_{}_{}'.format( datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name, "ppo")) writer.add_scalar('options/num_steps', args.num_steps, 0) writer.add_scalar('options/num_mini_batch', args.num_mini_batch, 0) writer.add_scalar('options/ppo_epoch', args.ppo_epoch, 0) writer.add_scalar('options/clip_param', args.clip_param, 0) writer.add_scalar('options/value_loss_coef', args.value_loss_coef, 0) writer.add_scalar('options/lr', args.lr, 0) device = torch.device("cuda:0" if 
args.cuda else "cpu") torch.set_num_threads(1) load_dir = os.path.join(args.load_dir, args.algo) multiNetworkName = ["frontback_", "all_", "leftright_", "rotate_"] if trainType == 8: for net in multiNetworkName: bestFilename = os.path.join( load_dir, "{}_{}{}_best.pt".format(args.env_name, net, args.hidden_size)) ac, _ = torch.load(bestFilename) policies.append(PPOPlayer(ac, device)) print("Policy multi loaded: ", bestFilename) multiNetworkName2 = [ "all_bytasks_0_", "all_bytasks_1_", "all_bytasks_2_", "all_bytasks_3_", "all_bytasks_4_", "all_bytasks_5_", "all_bytasks_6_", "all_bytasks_7_", "all_bytasks_8_", "all_bytasks_9_", "all_bytasks_10_", "all_bytasks_11_", "all_bytasks_12_", ] if trainType == 16: for net in multiNetworkName2: bestFilename = os.path.join( load_dir, "{}_{}{}_best.pt".format(args.env_name, net, args.hidden_size)) ac, _ = torch.load(bestFilename) policies.append(PPOPlayer(ac, device)) print("Policy multi loaded: ", bestFilename) envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, None, device, False, normalizeOb=False, normalizeReturns=False, max_episode_steps=args.num_steps, makeEnvFunc=makeEnvFunction, num_frame_stack=1, info_keywords=( 'episode_steps', 'episode_reward', 'progress', 'servo', 'distToTarget', )) #print(envs.observation_space.shape,envs.action_space) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={ 'recurrent': args.recurrent_policy, 'hidden_size': args.hidden_size, 'last_hidden_size': args.last_hidden_size, 'activation_layers_type': "Tanh" }) ''' # if args.load_dir not None: load_path = os.path.join(args.load_dir, args.algo) actor_critic, ob_rms = torch.load(os.path.join(load_path, args.env_name + ".pt")) ''' load_path = os.path.join( load_dir, "{}_{}{}_best.pt".format(args.env_name, filesNamesSuffix, args.hidden_size)) #load_path = os.path.join(load_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size)) preptrained_path = "../Train/trained_models/QuadruppedWalk-v1/Train_QuadruppedWalk-v1_256.pth" loadPretrained = False if loadPretrained and os.path.isfile(preptrained_path): print("Load preptrained") abj = torch.load(preptrained_path) print(abj) print(actor_critic.base) actor_critic.base.load_state_dict() actor_critic.base.eval() if os.path.isfile(load_path) and not loadPretrained: actor_critic, ob_rms = torch.load(load_path) actor_critic.eval() print("----NN loaded: ", load_path, " -----") else: bestFilename = os.path.join( load_dir, "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix, args.hidden_size)) if os.path.isfile(bestFilename): actor_critic, ob_rms = torch.load(bestFilename) actor_critic.eval() print("----NN loaded: ", bestFilename, " -----") maxReward = -10000.0 maxSteps = 0 minDistance = 50000.0 actor_critic.to(device) agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) gail_train_loader = torch.utils.data.DataLoader( gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20), batch_size=args.gail_batch_size, shuffle=True, drop_last=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, 
envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) deque_maxLen = 10 episode_rewards = deque(maxlen=deque_maxLen) episode_steps = deque(maxlen=deque_maxLen) episode_rewards_alive = deque(maxlen=deque_maxLen) episode_rewards_progress = deque(maxlen=deque_maxLen) episode_rewards_servo = deque(maxlen=deque_maxLen) episode_dist_to_target = deque(maxlen=deque_maxLen) ''' load_path = os.path.join(args.load_dir, args.algo) load_path = os.path.join(load_path, args.env_name + ".pt") actor_critic, ob_rms = torch.load(load_path) actor_critic.to(device) actor_critic.eval() #ob_rms.eval() ''' ''' args.use_gym_monitor = 1 args.monitor_dir = "./results/" monitor_path = os.path.join(args.monitor_dir, args.algo) monitor_path = os.path.join(monitor_path, args.env_name) args. if args.use_gym_monitor: env = wrappers.Monitor( env, monitor_path, video_callable=False, force=True) ''' i_episode = 0 save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass trainOnSamplesAndExit = False #False if trainOnSamplesAndExit: import pickle print("---------------------------------------") print("Samples preload") data = pickle.load(open("./QuadruppedWalk-v1_UpDown.samples", "rb")) #data = pickle.load( open( "../QuadruppedWalk-v1_NN.samples", "rb" ) ) learning_rate = 0.0001 max_episodes = 100 max_timesteps = 4000 betas = (0.9, 0.999) log_interval = 1 envSamples = SamplesEnv(data) envSamples.numSteps = max_timesteps # create a stochastic gradient descent optimizer optimizer = torch.optim.Adam(actor_critic.base.actor.parameters(), lr=learning_rate, betas=betas) #optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9) # create a loss function criterion = nn.MSELoss(reduction="sum") # run the main training loop for epoch in range(max_episodes): state = envSamples.reset() time_step = 0 testReward = 0 testSteps = 0 loss_sum = 0 loss_max = 0 for t in range(max_timesteps): time_step += 1 nn_state = torch.FloatTensor((state).reshape(1, -1)).to(device) optimizer.zero_grad() net_out = actor_critic.base.forwardActor(nn_state) net_out = actor_critic.dist.fc_mean(net_out) state, reward, done, info = envSamples.step( net_out.detach().numpy()) sim_action = envSamples.recordedActions sim_action_t = torch.FloatTensor([sim_action]).to(device) loss = criterion(net_out, sim_action_t) loss.backward() optimizer.step() loss_sum += loss.mean() loss_max = max(loss_max, loss.max()) testReward += reward testSteps += 1 if done: if epoch % log_interval == 0: #print(best_action_t*scaleActions-net_out*scaleActions) if args.verboseLevel > 0: print( 'Train Episode: {} t:{} Reward:{} Loss: mean:{:.6f} max: {:.6f}' .format(epoch, t, testReward, loss_sum / t, loss_max)) print(info) reward = 0 break bestFilename = os.path.join( save_path, "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix, args.hidden_size)) torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], bestFilename) exit(0) skipWriteBest = True if args.verboseLevel > 0: printNetwork(actor_critic.base.actor) lock(actor_critic, first=False, last=False) #if trainType==9: #allowMutate = False #lock(actor_critic,first=True,last=False) #mutate(actor_critic,power=0.00,powerLast=0.3) if args.verboseLevel > 0: printNetwork(actor_critic.base.actor) #from torchsummary import summary #summary(actor_critic.base.actor, (1, 48, 64)) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes 
episodeBucketIndex = 0 maxReward = -10000000000 numEval = 10 if realEval: envEval = makeEnvFunction(args.env_name) if hasattr(envEval.env, "tasks") and len(envEval.env.tasks): numEval = max(numEval, len(envEval.env.tasks)) maxReward = evaluate_policy(envEval, actor_critic, numEval * 2, render=False, device=device, verbose=args.verboseLevel) print("MaxReward on start", maxReward) noMaxRewardCount = 0 updateIndex = 0 for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) episode_r = 0.0 stepsDone = 0 for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) #envs.venv.venv.venv.envs[0].render() if args.verboseLevel > 0: index = 0 for d in done: if d: print(infos[index], flush=True) index += 1 episodeDone = False ''' index = 0 for d in done: if d: print("") print(infos[index]) index+=1 ''' for info in infos: if 'reward' in info.keys(): episodeDone = True i_episode += 1 episode_rewards.append(info['reward']) writer.add_scalar('reward/episode', info['reward'], i_episode) #print("E:",i_episode," T:",info['episode_steps'], " R:", info['episode_reward'], " D:",info['distToTarget']) if 'steps' in info.keys(): episode_steps.append(info['steps']) writer.add_scalar('reward/steps', info['steps'], i_episode) if 'alive' in info.keys(): episode_rewards_alive.append(info['alive']) writer.add_scalar('reward/alive', info['alive'], i_episode) if 'prog' in info.keys(): episode_rewards_progress.append(info['prog']) writer.add_scalar('reward/progress', info['prog'], i_episode) if 'servo' in info.keys(): episode_rewards_servo.append(info['servo']) writer.add_scalar('reward/servo', info['servo'], i_episode) if 'd2T' in info.keys(): episode_dist_to_target.append(info['d2T']) writer.add_scalar('reward/distToTarget', info['d2T'], i_episode) for val in info.keys(): if val not in [ "reward", "steps", "alive", "prog", "servo", "d2T", 'epos', 't' ]: writer.add_scalar('reward/' + val, info[val], i_episode) #if episodeDone and i_episode%10==0: # print(i_episode,"({:.1f}/{}/{:.2f}) ".format(episode_rewards[-1],episode_steps[-1],episode_dist_to_target[-1]),end='',flush=True) if episodeDone: episodeBucketIndex += 1 if args.verboseLevel > 0: print("Mean:", Fore.WHITE, np.mean(episode_rewards), Style.RESET_ALL, " Median:", Fore.WHITE, np.median(episode_rewards), Style.RESET_ALL, " max reward:", maxReward) #'''len(episode_rewards) and np.mean(episode_rewards)>maxReward and''' if realEval: if episodeBucketIndex % args.log_interval == 0 and episodeBucketIndex > args.log_interval: print("Step:", (j + 1) * args.num_processes * args.num_steps) if skipWriteBest == False: evalReward = evaluate_policy( envEval, actor_critic, numEval, device=device, verbose=args.verboseLevel) writer.add_scalar('reward/eval', evalReward, i_episode) if evalReward > maxReward: maxReward = evalReward #maxReward = np.mean(episode_rewards) bestFilename = os.path.join( save_path, "{}_{}{}_best.pt".format( args.env_name, filesNamesSuffix, args.hidden_size)) print( "Writing best reward:", Fore.GREEN, "({:.1f}/{:.1f}/{:.1f}/{}/{:.2f}) ".format( maxReward, np.mean(episode_rewards), np.median(episode_rewards), np.mean(episode_steps), episode_dist_to_target[-1]), 
Style.RESET_ALL, bestFilename) torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], bestFilename) noMaxRewardCount = 0 else: noMaxRewardCount += 1 if allowMutate: if noMaxRewardCount == 5: print("Mutation low last layer") lock(actor_critic, first=False, last=False) mutate(actor_critic, power=0.00, powerLast=0.01) if noMaxRewardCount == 8: print("Mutation low non last") lock(actor_critic, first=False, last=False) mutate(actor_critic, power=0.01, powerLast=0.0) if noMaxRewardCount == 11: print("Mutation low all") lock(actor_critic, first=False, last=False) mutate(actor_critic, power=0.02, powerLast=0.2) if noMaxRewardCount == 14: print("Mutation hi all") lock(actor_critic, first=False, last=False) mutate(actor_critic, power=0.03, powerLast=0.03) noMaxRewardCount = 0 if noMaxRewardCount == args.nobest_exit: exit(0) else: skipWriteBest = False else: if len(episode_rewards) and np.mean( episode_rewards ) > maxReward and j > args.log_interval: if skipWriteBest == False: maxReward = np.mean(episode_rewards) writer.add_scalar('reward/maxReward', maxReward, i_episode) bestFilename = os.path.join( save_path, "{}_{}{}_best.pt".format( args.env_name, filesNamesSuffix, args.hidden_size)) if len(episode_dist_to_target): print( "Writing best reward:", Fore.GREEN, "({:.1f}/{:.1f}/{}/{:.2f}) ".format( np.mean(episode_rewards), np.median(episode_rewards), np.mean(episode_steps), episode_dist_to_target[-1]), Style.RESET_ALL, bestFilename) else: print( "Writing best reward:", Fore.GREEN, "({:.1f}/{:.1f}/{}) ".format( np.mean(episode_rewards), np.median(episode_rewards), np.mean(episode_steps)), Style.RESET_ALL, bestFilename) torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], bestFilename) else: skipWriteBest = False # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) shaped_reward = reward_shaper(reward) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, shaped_reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) writer.add_scalar('reward/value_loss', value_loss, updateIndex) writer.add_scalar('reward/action_loss', action_loss, updateIndex) writer.add_scalar('reward/dist_entropy', dist_entropy, updateIndex) updateIndex += 1 rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": ''' fileName = os.path.join(save_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size)) torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], fileName) print("Saved:",fileName, " cur avg rewards:",np.mean(episode_rewards)) fileName = os.path.join(save_path, "{}_{}{}_actor.pt".format(args.env_name,filesNamesSuffix,args.hidden_size)) torch.save(actor_critic.state_dict, fileName) print("Saved:",fileName) ''' if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() if args.verboseLevel > 0: print("") print("Updates {}, num timesteps {}, FPS {}".format( j, total_num_steps, int(total_num_steps / (end - start)))) print(" Last {} training episodes:".format( len(episode_rewards))) print( " reward mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}". format(np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards))) print(" steps mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}". 
                      format(np.mean(episode_steps), np.median(episode_steps),
                             np.min(episode_steps), np.max(episode_steps)))
                if len(episode_rewards_alive):
                    print(
                        " alive mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_alive),
                                np.median(episode_rewards_alive),
                                np.min(episode_rewards_alive),
                                np.max(episode_rewards_alive)))
                if len(episode_rewards_progress):
                    print(
                        " progress mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_progress),
                                np.median(episode_rewards_progress),
                                np.min(episode_rewards_progress),
                                np.max(episode_rewards_progress)))
                if len(episode_rewards_servo):
                    print(
                        " servo mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_servo),
                                np.median(episode_rewards_servo),
                                np.min(episode_rewards_servo),
                                np.max(episode_rewards_servo)))
                if len(episode_dist_to_target):
                    print(
                        " dist to target mean/median {:.3f}/{:.3f} min/max {:.3f}/{:.3f}"
                        .format(np.mean(episode_dist_to_target),
                                np.median(episode_dist_to_target),
                                np.min(episode_dist_to_target),
                                np.max(episode_dist_to_target)))
                print(
                    " Reward/Steps {:.3f} Progress/Steps: {:.3f} entropy {:.1f} value_loss {:.5f} action_loss {:.5f}\n"
                    .format(
                        np.mean(episode_rewards) / np.mean(episode_steps),
                        (0 if len(episode_rewards_progress) == 0 else
                         np.mean(episode_rewards_progress) / np.mean(episode_steps)),
                        dist_entropy, value_loss, action_loss))
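# ---------------------------------------------------------------------------
# Illustrative sketch (added for clarity, not part of the original sources).
# The loop above rescales environment rewards with
# DefaultRewardsShaper(scale_value=reward_shaping) before storing them in the
# rollout buffer.  The stand-in below only reproduces that scale-by-constant
# behaviour; the real shaper may also support shifting or clipping, so treat
# the class name and its minimal interface as assumptions.
# ---------------------------------------------------------------------------
import torch


class ScaleOnlyRewardsShaper:
    """Callable that multiplies every reward by a constant factor."""

    def __init__(self, scale_value=1.0):
        self.scale_value = scale_value

    def __call__(self, reward):
        # Works the same for Python floats, numpy arrays and torch tensors.
        return reward * self.scale_value


# Example: with reward_shaping = 0.01 a per-process reward tensor is scaled
# down before rollouts.insert(..., shaped_reward, ...).
_example_shaper = ScaleOnlyRewardsShaper(scale_value=0.01)
_example_rewards = torch.tensor([[10.0], [-2.5]])
assert torch.allclose(_example_shaper(_example_rewards),
                      torch.tensor([[0.1000], [-0.0250]]))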
def __init__(self, env_def, processes=1, dir='.', version=0, lr=2e-4, architecture='base', dropout=0, reconstruct=None, r_weight=.05): self.env_def = env_def self.num_processes = processes #cpu processes self.lr = lr self.version = version self.save_dir = dir + '/trained_models/' #Setup pathlib.Path(self.save_dir).mkdir(parents=True, exist_ok=True) if (self.num_mini_batch > processes): self.num_mini_batch = processes self.writer = SummaryWriter() self.total_steps = 0 #State torch.manual_seed(self.seed) torch.cuda.manual_seed_all(self.seed) if not self.no_cuda and torch.cuda.is_available( ) and self.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True utils.cleanup_log_dir(self.log_dir) utils.cleanup_log_dir(self.eval_log_dir) torch.set_num_threads(1) self.level_path = None self.envs = None self.num_envs = -1 self.set_envs(num_envs=1) if (version > 0): self.actor_critic = self.load(path, version) else: self.actor_critic = Policy( self.envs.observation_space.shape, self.envs.action_space, base_kwargs={ 'recurrent': self.recurrent_policy, 'shapes': list(reversed(self.env_def.model_shape)), 'dropout': dropout }, model=architecture) self.actor_critic.to(self.device) #Reconstruction self.reconstruct = reconstruct is not None if (self.reconstruct): #layers = self.envs.observation_space.shape[0] #shapes = list(self.env_def.model_shape) #self.r_model = Decoder(layers, shapes=shapes).to(self.device) reconstruct.to(self.device) self.r_model = lambda x: reconstruct.adapter(reconstruct(x)) #self.r_model = lambda x: reconstruct.adapter(reconstruct(x)).clamp(min=1e-6).log() #self.r_loss = nn.L1Loss() #nn.NLLLoss() #nn.MSELoss() self.r_loss = lambda pred, true: -r_weight * (true * torch.log( pred.clamp(min=1e-7, max=1 - 1e-7))).sum(dim=1).mean() self.r_optimizer = reconstruct.optimizer #optim.Adam(reconstruct.parameters(), lr = .0001) if self.algo == 'a2c': self.agent = A2C_ACKTR(self.actor_critic, self.value_loss_coef, self.entropy_coef, lr=self.lr, eps=self.eps, alpha=self.alpha, max_grad_norm=self.max_grad_norm) elif self.algo == 'ppo': self.agent = PPO(self.actor_critic, self.clip_param, self.ppo_epoch, self.num_mini_batch, self.value_loss_coef, self.entropy_coef, lr=self.lr, eps=self.eps, max_grad_norm=self.max_grad_norm, use_clipped_value_loss=False) elif self.algo == 'acktr': self.agent = algo.A2C_ACKTR(self.actor_critic, self.value_loss_coef, self.entropy_coef, acktr=True) self.gail = False self.gail_experts_dir = './gail_experts' if self.gail: assert len(self.envs.observation_space.shape) == 1 self.gail_discr = gail.Discriminator( self.envs.observation_space.shape[0] + self.envs.action_space.shape[0], 100, self.device) file_name = os.path.join( self.gail_experts_dir, "trajs_{}.pt".format(env_name.split('-')[0].lower())) self.gail_train_loader = torch.utils.data.DataLoader( gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20), batch_size=self.gail_batch_size, shuffle=True, drop_last=True) self.rollouts = RolloutStorage( self.num_steps, self.num_processes, self.envs.observation_space.shape, self.envs.action_space, self.actor_critic.recurrent_hidden_state_size)
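# ---------------------------------------------------------------------------
# Illustrative sketch (an addition, not code from the repo).  The r_loss
# lambda defined in __init__ above is a weighted cross-entropy between the
# reconstructed per-cell class probabilities `pred` and a one-hot target
# `true`, both shaped (batch, channels, H, W).  The helper below spells out
# that lambda with the same clamping; the tensor shapes in the example are
# assumptions.
# ---------------------------------------------------------------------------
import torch
import torch.nn.functional as F


def reconstruction_loss(pred, true, r_weight=0.05):
    # Clamp avoids log(0); sum over the class/channel axis, then average.
    pred = pred.clamp(min=1e-7, max=1 - 1e-7)
    return -r_weight * (true * torch.log(pred)).sum(dim=1).mean()


# Example: batch of 4 observations, 8 tile classes on a 16x16 grid.
_pred = torch.softmax(torch.randn(4, 8, 16, 16), dim=1)
_true = F.one_hot(torch.randint(0, 8, (4, 16, 16)),
                  num_classes=8).permute(0, 3, 1, 2).float()
print(reconstruction_loss(_pred, _true))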
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:" + str(args.cuda_id) if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) ########## file related filename = args.env_name + "_" + args.algo + "_n" + str(args.max_episodes) if args.attack: filename += "_" + args.type + "_" + args.aim filename += "_s" + str(args.stepsize) + "_m" + str( args.maxiter) + "_r" + str(args.radius) + "_f" + str(args.frac) if args.run >= 0: filename += "_run" + str(args.run) logger = get_log(args.logdir + filename + "_" + current_time) logger.info(args) rew_file = open(args.resdir + filename + ".txt", "w") if args.compute: radius_file = open( args.resdir + filename + "_radius" + "_s" + str(args.stepsize) + "_m" + str(args.maxiter) + "_th" + str(args.dist_thres) + ".txt", "w") if args.type == "targ" or args.type == "fgsm": targ_file = open(args.resdir + filename + "_targ.txt", "w") num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes if args.type == "wb": attack_net = WbAttacker(agent, envs, int(args.frac * num_updates), num_updates, args, device=device) if args.type == "bb": attack_net = BbAttacker(agent, envs, int(args.frac * num_updates), num_updates, args, device=device) elif args.type == "rand": attack_net = RandAttacker(envs, radius=args.radius, frac=args.frac, maxat=int(args.frac * num_updates), device=device) elif args.type == "semirand": attack_net = WbAttacker(agent, envs, int(args.frac * num_updates), num_updates, args, device, rand_select=True) elif args.type == "targ": if isinstance(envs.action_space, Discrete): action_dim = envs.action_space.n target_policy = action_dim - 1 elif isinstance(envs.action_space, Box): action_dim = envs.action_space.shape[0] target_policy = torch.zeros(action_dim) # target_policy[-1] = 1 print("target policy is", target_policy) attack_net = TargAttacker(agent, envs, int(args.frac * num_updates), 
num_updates, target_policy, args, device=device) elif args.type == "fgsm": if isinstance(envs.action_space, Discrete): action_dim = envs.action_space.n target_policy = action_dim - 1 elif isinstance(envs.action_space, Box): action_dim = envs.action_space.shape[0] target_policy = torch.zeros(action_dim) def targ_policy(obs): return target_policy attack_net = FGSMAttacker(envs, agent, targ_policy, radius=args.radius, frac=args.frac, maxat=int(args.frac * num_updates), device=device) # if args.aim == "obs" or aim == "hybrid": # obs_space = gym.make(args.env_name).observation_space # attack_net.set_obs_range(obs_space.low, obs_space.high) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) episode = 0 start = time.time() for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions if args.type == "fgsm": # print("before", rollouts.obs[step]) rollouts.obs[step] = attack_net.attack( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]).clone() # print("after", rollouts.obs[step]) with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) if args.type == "targ" or args.type == "fgsm": if isinstance(envs.action_space, Discrete): num_target = ( action == target_policy).nonzero()[:, 0].size()[0] targ_file.write( str(num_target / args.num_processes) + "\n") print("percentage of target:", num_target / args.num_processes) elif isinstance(envs.action_space, Box): target_action = target_policy.repeat(action.size()[0], 1) targ_file.write( str( torch.norm(action - target_action).item() / args.num_processes) + "\n") print("percentage of target:", torch.sum(action).item() / args.num_processes) # Obser reward and next obs obs, reward, done, infos = envs.step(action.cpu()) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # rew_file.write("episode: {}, total reward: {}\n".format(episode, info['episode']['r'])) episode += 1 # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) if args.attack and args.type != "fgsm": if args.aim == "reward": logger.info(rollouts.rewards.flatten()) rollouts.rewards = attack_net.attack_r_general( rollouts, next_value).clone().detach() logger.info("after attack") logger.info(rollouts.rewards.flatten()) elif args.aim == "obs": origin = rollouts.obs.clone() rollouts.obs = attack_net.attack_s_general( rollouts, next_value).clone().detach() logger.info(origin) logger.info("after") logger.info(rollouts.obs) elif args.aim == "action": origin = torch.flatten(rollouts.actions).clone() rollouts.actions = attack_net.attack_a_general( rollouts, next_value).clone().detach() logger.info("attack value") logger.info(torch.flatten(rollouts.actions) - origin) elif args.aim == "hybrid": res_aim, attack = attack_net.attack_hybrid( rollouts, next_value, args.radius_s, args.radius_a, args.radius_r) print("attack ", res_aim) if res_aim == "obs": origin = rollouts.obs.clone() rollouts.obs = attack.clone().detach() logger.info(origin) logger.info("attack obs") logger.info(rollouts.obs) elif res_aim == "action": origin = torch.flatten(rollouts.actions).clone() rollouts.actions = attack.clone().detach() logger.info("attack action") logger.info(torch.flatten(rollouts.actions) - origin) elif res_aim == "reward": logger.info(rollouts.rewards.flatten()) rollouts.rewards = attack.clone().detach() logger.info("attack reward") logger.info(rollouts.rewards.flatten()) if args.compute: stable_radius = attack_net.compute_radius(rollouts, next_value) print("stable radius:", stable_radius) radius_file.write("update: {}, radius: {}\n".format( j, np.round(stable_radius, decimals=3))) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) if args.attack and args.type == "bb": attack_net.learning(rollouts) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) >= 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), 
                        dist_entropy, value_loss, action_loss))
            rew_file.write("updates: {}, mean reward: {}\n".format(
                j, np.mean(episode_rewards)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

        # if episode > args.max_episodes:
        #     print("reach episodes limit")
        #     break

    if args.attack:
        logger.info("total attacks: {}\n".format(attack_net.attack_num))
        print("total attacks: {}\n".format(attack_net.attack_num))
    rew_file.close()
    if args.compute:
        radius_file.close()
    if args.type == "targ" or args.type == "fgsm":
        targ_file.close()
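# ---------------------------------------------------------------------------
# Illustrative sketch (an assumption about what FGSMAttacker.attack does, not
# its actual implementation).  The "fgsm" branch above overwrites
# rollouts.obs[step] before the agent acts; a common choice is a single
# signed-gradient step that pushes the policy toward a fixed target action
# while staying inside an L-infinity ball of the given radius.  The helper
# name and argument order here are hypothetical.
# ---------------------------------------------------------------------------
import torch


def fgsm_perturb_obs(actor_critic, obs, rnn_hxs, masks, target_action, radius):
    obs_adv = obs.clone().detach().requires_grad_(True)
    # Log-probability of the target action under the current policy
    # (Policy.evaluate_actions returns value, log_probs, entropy, rnn_hxs).
    _, action_log_probs, _, _ = actor_critic.evaluate_actions(
        obs_adv, rnn_hxs, masks, target_action)
    loss = -action_log_probs.sum()  # maximize target log-probability
    loss.backward()
    # Step against the gradient of the loss and clip to the L-inf ball.
    perturbed = obs_adv - radius * obs_adv.grad.sign()
    perturbed = torch.max(torch.min(perturbed, obs + radius), obs - radius)
    return perturbed.detach()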
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") # coinrun environments need to be treated differently. coinrun_envs = { 'CoinRun': 'standard', 'CoinRun-Platforms': 'platform', 'Random-Mazes': 'maze' } envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, coin_run_level=args.num_levels, difficulty=args.high_difficulty, coin_run_seed=args.seed) if args.env_name in coinrun_envs.keys(): observation_space_shape = (3, 64, 64) args.save_dir = args.save_dir + "/NUM_LEVELS_{}".format( args.num_levels) # Save the level info in the else: observation_space_shape = envs.observation_space.shape # trained model name if args.continue_ppo_training: actor_critic, _ = torch.load(os.path.join(args.check_point, args.env_name + ".pt"), map_location=torch.device(device)) elif args.cor_gail: embed_size = args.embed_size actor_critic = Policy(observation_space_shape, envs.action_space, hidden_size=args.hidden_size, embed_size=embed_size, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) correlator = Correlator(observation_space_shape, envs.action_space, hidden_dim=args.hidden_size, embed_dim=embed_size, lr=args.lr, device=device) correlator.to(device) embeds = torch.zeros(1, embed_size) else: embed_size = 0 actor_critic = Policy(observation_space_shape, envs.action_space, hidden_size=args.hidden_size, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) embeds = None if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, use_clipped_value_loss=True, ftrl_mode=args.cor_gail or args.no_regret_gail, correlated_mode=args.cor_gail) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail or args.no_regret_gail or args.cor_gail: file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset( file_name, num_trajectories=50, subsample_frequency=1) #if subsample set to a different number, # grad_pen might need adjustment drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) if args.gail: discr = gail.Discriminator(observation_space_shape, envs.action_space, device=device) if args.no_regret_gail or args.cor_gail: queue = deque( maxlen=args.queue_size ) # Strategy Queues: Each element of a queue is a dicr strategy agent_queue = deque( maxlen=args.queue_size ) # Strategy Queues: Each element of a queue is an agent strategy pruning_frequency = 1 if args.no_regret_gail: discr = regret_gail.NoRegretDiscriminator(observation_space_shape, envs.action_space, device=device) if args.cor_gail: discr = 
cor_gail.CorDiscriminator(observation_space_shape, envs.action_space, hidden_size=args.hidden_size, embed_size=embed_size, device=device) discr.to(device) rollouts = RolloutStorage(args.num_steps, args.num_processes, observation_space_shape, envs.action_space, actor_critic.recurrent_hidden_state_size, embed_size) obs = envs.reset() rollouts.obs[0].copy_(obs) if args.cor_gail: rollouts.embeds[0].copy_(embeds) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions # Roll-out with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step], rollouts.embeds[step]) obs, reward, done, infos = envs.step(action.to('cpu')) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) # Sample mediating/correlating actions # Correlated Roll-out if args.cor_gail: embeds, embeds_log_prob, mean = correlator.act( rollouts.obs[step], rollouts.actions[step]) rollouts.insert_embedding(embeds, embeds_log_prob) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1], rollouts.embeds[-1]).detach() if args.gail or args.no_regret_gail or args.cor_gail: if args.env_name not in {'CoinRun', 'Random-Mazes'}: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if args.gail: if j < 10: gail_epoch = 100 # Warm up # no need for gail epoch or warm up in the no-regret case and cor_gail. 
for _ in range(gail_epoch): if utils.get_vec_normalize(envs): obfilt = utils.get_vec_normalize(envs)._obfilt else: obfilt = None if args.gail: discr.update(gail_train_loader, rollouts, obfilt) if args.no_regret_gail or args.cor_gail: last_strategy = discr.update(gail_train_loader, rollouts, queue, args.max_grad_norm, obfilt, j) for step in range(args.num_steps): if args.gail: rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) if args.no_regret_gail: rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step], queue) if args.cor_gail: rollouts.rewards[ step], correlator_reward = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], rollouts.embeds[step], args.gamma, rollouts.masks[step], queue) rollouts.correlated_reward[step] = correlator_reward rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) if args.gail: value_loss, action_loss, dist_entropy = agent.update(rollouts, j) elif args.no_regret_gail or args.cor_gail: value_loss, action_loss, dist_entropy, agent_gains, agent_strategy = \ agent.mixed_update(rollouts, agent_queue, j) if args.cor_gail: correlator.update(rollouts, agent_gains, args.max_grad_norm) if args.no_regret_gail or args.cor_gail: queue, _ = utils.queue_update(queue, pruning_frequency, args.queue_size, j, last_strategy) agent_queue, pruning_frequency = utils.queue_update( agent_queue, pruning_frequency, args.queue_size, j, agent_strategy) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass if not args.cor_gail: torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) else: print("saving models in {}".format( os.path.join(save_path, args.env_name))) torch.save( correlator.state_dict(), os.path.join(save_path, args.env_name + "correlator.pt")) torch.save([ actor_critic.state_dict(), getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + "actor.pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}," " value loss/action loss {:.1f}/{}".format( j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
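# ---------------------------------------------------------------------------
# Illustrative sketch (a guess at utils.queue_update, not its real code).
# The no_regret_gail / cor_gail branches above keep bounded deques of past
# discriminator and agent strategies.  A minimal update appends a snapshot of
# the newest strategy every `pruning_frequency` iterations and coarsens the
# snapshot rate once the deque is full.
# ---------------------------------------------------------------------------
import copy
from collections import deque


def queue_update_sketch(queue, pruning_frequency, queue_size, update_idx,
                        strategy):
    appended = False
    if update_idx % pruning_frequency == 0:
        queue.append(copy.deepcopy(strategy))
        appended = True
    if appended and len(queue) == queue_size:
        # Deque saturated: keep future snapshots at a coarser resolution.
        pruning_frequency *= 2
    return queue, pruning_frequency


_queue = deque(maxlen=4)
_freq = 1
for _j in range(10):
    _queue, _freq = queue_update_sketch(_queue, _freq, 4, _j, {"update": _j})
print(len(_queue), _freq)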
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") if args.env_name.startswith("lab_"): gym_name, flow_json = make_lab_env(args.env_name) args.env_name = gym_name envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) actor_critic = Policy( envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format( args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset( file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor( [[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: " "mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
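# ---------------------------------------------------------------------------
# Illustrative sketch (not the RolloutStorage implementation itself).  Every
# loop in this file finishes an update with
# rollouts.compute_returns(next_value, use_gae, gamma, gae_lambda, ...);
# with use_gae=True this is the standard GAE(lambda) recursion, written out
# below on plain tensors of shape (num_steps, num_processes, 1).  masks[t] is
# 0.0 where the episode ended at transition t.
# ---------------------------------------------------------------------------
import torch


def gae_returns(rewards, values, next_value, masks, gamma=0.99,
                gae_lambda=0.95):
    num_steps = rewards.size(0)
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for step in reversed(range(num_steps)):
        delta = (rewards[step] +
                 gamma * values[step + 1] * masks[step] - values[step])
        gae = delta + gamma * gae_lambda * masks[step] * gae
        returns[step] = gae + values[step]
    return returns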
def main(arglist): FLAGS = flags.FLAGS FLAGS([""]) # get all arguments from command line - there are lots! args = get_args(arglist) tb_log = LogRLAlgorithm(num_processes=args.num_processes) # set manual seeds for reproducability torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True # set up logging directories log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) # set singlethreaded + gpu if applicable. # Not sure how this interacts with num processes. torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") # make the gym environment envs = make_vec_envs( args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, 1, ) # Creates the policy network. Of note it the observation and action spaces used to # set input and output sizes actor_critic = Policy( envs.observation_space.shape, envs.action_space, base_kwargs={"recurrent": args.recurrent_policy}, ) actor_critic.to(device) # Set up agent details if args.algo == "a2c": agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm, ) elif args.algo == "ppo": agent = algo.PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, ) elif args.algo == "acktr": agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) # set up imitation learning if relevant if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split("-")[0].lower()), ) gail_train_loader = torch.utils.data.DataLoader( gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20), batch_size=args.gail_batch_size, shuffle=True, drop_last=True, ) # setup experience buffer rollouts = RolloutStorage( args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, ) # initialise environment obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) # perform main learning loop for fixed duration. start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr, ) # for each episode for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step], ) # Observe reward and next obs obs, reward, done, infos = envs.step(action) tb_log.expl_data_collector.add_step(action, action_log_prob, reward, done, value) # for each thread for info in infos: if "episode" in info.keys(): # if episode ended episode_rewards.append(info["episode"]["r"]) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if "bad_transition" in info.keys() else [1.0] for info in infos]) rollouts.insert( obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks, ) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1], ).detach() # do imitation learning if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step], ) # compute returns rollouts.compute_returns( next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits, ) # update agent value_loss, action_loss, dist_entropy = agent.update(rollouts) # re-initialise experience buffer with current state rollouts.after_update() # save model for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save( [ actor_critic, getattr(utils.get_vec_normalize(envs), "ob_rms", None) ], os.path.join(save_path, args.env_name + ".pt"), ) # log each log interval & print results to cmd. if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format( j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss, )) # histogram parameters # layer = 0 # for l in actor_critic.base.main: # try: # b = l.bias # w = l.weight # logger.record_histogram_dict( # {f"{layer}/bias": b, f"{layer}/weight": w}, prefix="layer" # ) # layer += 1 # except AttributeError: # pass # tb_log._log_stats(int(j / args.log_interval)) # tb_log.expl_data_collector.end_epoch(int(j / args.log_interval)) # evaluate if requested if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate( actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device, )
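# ---------------------------------------------------------------------------
# Illustrative sketch: the commented-out block above hints at logging
# per-layer weight/bias histograms.  With the standard TensorBoard
# SummaryWriter that can be done as below; treating actor_critic.base.main as
# an nn.Sequential (as the commented code does) is an assumption.
# ---------------------------------------------------------------------------
from torch.utils.tensorboard import SummaryWriter


def log_layer_histograms(writer, module, global_step, prefix="layer"):
    for idx, layer in enumerate(module):
        # Only parameterised layers (Linear/Conv) expose weight and bias.
        if getattr(layer, "weight", None) is not None:
            writer.add_histogram("{}{}/weight".format(prefix, idx),
                                 layer.weight, global_step)
        if getattr(layer, "bias", None) is not None:
            writer.add_histogram("{}{}/bias".format(prefix, idx),
                                 layer.bias, global_step)


# Usage (assumed call site, once per log interval):
#     log_layer_histograms(SummaryWriter(), actor_critic.base.main, j)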
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}, dimh=args.dimh) actor_critic.to(device) exp_name = "%s_%s_seed%d_dimh%d_" % (args.env_name, args.algo, args.seed, args.dimh) if args.gail: exp_name += '_gail_' if args.split: exp_name += 'splitevery' + str(args.split_every) if args.random_split: exp_name += '_rsplit' else: exp_name += 'baseline' writer = SummaryWriter('./runs/' + exp_name) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) gail_train_loader = torch.utils.data.DataLoader( gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20), batch_size=args.gail_batch_size, shuffle=True, drop_last=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes print(num_updates) stats = { 'seed': args.seed, 'experiment': exp_name, 'env': args.env_name, 'dimh': args.dimh, 'split every': args.split_every, 'random split': args.random_split, 'steps': [], 'mean reward': [], 'actor neurons': [], 'critic neurons': [], } save_dir = './experiment_results/%s/' % args.env_name stats_save_path = save_dir + exp_name check_path(save_dir) print('start') count = -1 num_updates = 488 * 2 meanreward = [] for j in range(num_updates): #if j % 50 == 0: # print('STEP', j) if args.use_linear_lr_decay: # decrease learning rate linearly count += 1 if j % 488 == 0: count = 0 total = 488 * 2 else: total = 488 * 2 if args.split: utils.update_linear_schedule( agent.optimizer, count, total, agent.optimizer.lr if args.algo == "acktr" else args.lr) else: utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = 
envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) # splitting if args.split and (j + 1) % args.split_every == 0 and j < 200: print("[INFO] split on iteration %d..." % j) agent.split(rollouts, args.random_split) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) meanreward.append(np.mean(episode_rewards)) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() if True: print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) stats['mean reward'].append(np.mean(episode_rewards)) stats['steps'].append(j) if args.split: a, c = agent.actor_critic.get_num_params() stats['actor neurons'].append(a) stats['critic neurons'].append(c) if j % 10 == 0: print("[INFO] saving to ", stats_save_path) np.save(stats_save_path, stats) if j % 5 == 0: s = (j + 1) * args.num_processes * args.num_steps if args.split: a, c = agent.actor_critic.get_num_params() writer.add_scalar('A neurons', a, s) writer.add_scalar('C neurons', c, s) writer.add_scalar('mean reward', np.mean(episode_rewards), s) writer.add_scalar('entropy loss', dist_entropy, s) writer.add_scalar('value loss', value_loss, s) writer.add_scalar('action loss', action_loss, s) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device) writer.close() import pickle pickle.dump(meanreward, open(stats_save_path + '.pkl', 'wb'))
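# --- Editorial sketch (assumption): the run above persists its metrics with
# np.save(stats_save_path, stats) (a pickled dict) and pickle.dump(meanreward, ...).
# Reading them back for offline plotting could look like this; the path below is
# hypothetical, and np.save appends the ".npy" extension automatically.
import pickle

import numpy as np

stats_path = "./experiment_results/HalfCheetah-v2/example_experiment"  # hypothetical
stats = np.load(stats_path + ".npy", allow_pickle=True).item()
print(stats["steps"][:5], stats["mean reward"][:5])

with open(stats_path + ".pkl", "rb") as f:
    meanreward = pickle.load(f)
print("updates logged:", len(meanreward))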
def main():
    all_episode_rewards = []  ### record 6/29
    all_temp_rewards = []  ### record 6/29
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    log_dir = os.path.expanduser(args.log_dir)
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)
    utils.cleanup_log_dir(eval_log_dir)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))

        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    print('num_updates ', num_updates)
    print('num_steps ', args.num_steps)

    count = 0
    h5_path = './data/' + args.env_name
    if not os.path.exists(h5_path):
        os.makedirs(h5_path)
    h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % (count)

    data = {}
    data['states'] = []
    data['actions'] = []
    data['rewards'] = []
    data['done'] = []
    data['lengths'] = []
    episode_step = 0

    for j in range(num_updates):  ### num-steps
        temp_states = []
        temp_actions = []
        temp_rewards = []
        temp_done = []
        temp_lengths = []

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
                if j == 0 and step == 0:
                    print('obs ', type(rollouts.obs[step]),
                          rollouts.obs[step].shape)
                    print('hidden_states ',
                          type(rollouts.recurrent_hidden_states[step]),
                          rollouts.recurrent_hidden_states[step].shape)
                    print('action ', type(action), action.shape)
                    print('action prob ', type(action_log_prob),
                          action_log_prob.shape)
                    print('-' * 20)

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)
            #print(infos)
            #print(reward)
            temp_states += [np.array(rollouts.obs[step].cpu())]
            temp_actions += [np.array(action.cpu())]
            #temp_rewards += [np.array(reward.cpu())]
            temp_rewards += [np.array([infos[0]['myrewards']])]  ### for halfcheetah the env reward cannot be used directly !! 6/29
            temp_done += [np.array(done)]

            if j == 0 and step == 0:
                print('obs ', type(obs), obs.shape)
                print('reward ', type(reward), reward.shape)
                print('done ', type(done), done.shape)
                print('infos ', len(infos))
                for k, v in infos[0].items():
                    print(k, v.shape)
                print()

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    all_episode_rewards += [info['episode']['r']]  ### record 6/29

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        temp_lengths = len(temp_states)
        temp_states = np.concatenate(temp_states)
        temp_actions = np.concatenate(temp_actions)
        temp_rewards = np.concatenate(temp_rewards)
        temp_done = np.concatenate(temp_done)
        #print('temp_lengths',temp_lengths)
        #print('temp_states', temp_states.shape)
        #print('temp_actions', temp_actions.shape)
        #print('temp_rewards', temp_rewards.shape)

        if j > int(0.4 * num_updates):
            data['states'] += [temp_states]
            data['actions'] += [temp_actions]
            data['rewards'] += [temp_rewards]
            data['lengths'] += [temp_lengths]
            data['done'] += [temp_done]
            #print('temp_lengths',data['lengths'].shape)
            #print('temp_states', data['states'].shape)
            #print('temp_actions', data['actions'].shape)
            #print('temp_rewards', data['rewards'].shape)

            if args.save_expert and len(data['states']) >= 100:
                with h5py.File(h5_filename, 'w') as f:
                    f['states'] = np.array(data['states'])
                    f['actions'] = np.array(data['actions'])
                    f['rewards'] = np.array(data['rewards'])
                    f['done'] = np.array(data['done'])
                    f['lengths'] = np.array(data['lengths'])
                    #print('f_lengths',f['lengths'].shape)
                    #print('f_states', f['states'].shape)
                    #print('f_actions', f['actions'].shape)
                    #print('f_rewards', f['rewards'].shape)

                count += 1
                h5_filename = h5_path + '/trajs_' + args.env_name + '_%05d.h5' % (
                    count)
                data['states'] = []
                data['actions'] = []
                data['rewards'] = []
                data['done'] = []
                data['lengths'] = []

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass

            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], os.path.join(save_path, args.env_name + "_%d.pt" % (args.seed)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
            #np.save(os.path.join(save_path, args.env_name+"_%d"%(args.seed)), all_episode_rewards)  ### save the record 6/29
            #print(temp_rewards)
            print("temp rewards size", temp_rewards.shape, "mean",
                  np.mean(temp_rewards), "min", np.min(temp_rewards), "max",
                  np.max(temp_rewards))
            all_temp_rewards += [temp_rewards]
            np.savez(os.path.join(save_path,
                                  args.env_name + "_%d" % (args.seed)),
                     episode=all_episode_rewards,
                     timestep=all_temp_rewards)

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(actor_critic, ob_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

    '''data['states'] = np.array(data['states'])
def main(): args = get_args() # Record trajectories if args.record_trajectories: record_trajectories() return print(args) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True # Append the model name log_dir = os.path.expanduser(args.log_dir) log_dir = os.path.join(log_dir, args.model_name, str(args.seed)) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, log_dir, device, False) obs_shape = len(envs.observation_space.shape) # Take activation for carracing print("Loaded env...") activation = None if args.env_name == 'CarRacing-v0' and args.use_activation: activation = torch.tanh print(activation) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={ 'recurrent': args.recurrent_policy, 'env': args.env_name }, activation=activation) actor_critic.to(device) # Load from previous model if args.load_model_name: state = torch.load( os.path.join(args.save_dir, args.load_model_name, args.load_model_name + '_{}.pt'.format(args.seed)))[0] try: actor_critic.load_state_dict(state) except: actor_critic = state # If BCGAIL, then decay factor and gamma should be float if args.bcgail: assert type(args.decay) == float assert type(args.gailgamma) == float if args.decay < 0: args.decay = 1 elif args.decay > 1: args.decay = 0.5**(1. / args.decay) print('Gamma: {}, decay: {}'.format(args.gailgamma, args.decay)) print('BCGAIL used') if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, gamma=args.gailgamma, decay=args.decay, act_space=envs.action_space, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: if len(envs.observation_space.shape) == 1: # Load RED here red = None if args.red: red = gail.RED( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device, args.redsigma, args.rediters) discr = gail.Discriminator(envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device, red=red, sail=args.sail, learn=args.learn) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=args.num_traj, subsample_frequency=1) args.gail_batch_size = min(args.gail_batch_size, len(expert_dataset)) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) print("Data loader size", len(expert_dataset)) else: # env observation shape is 3 => its an image assert len(envs.observation_space.shape) == 3 discr = gail.CNNDiscriminator(envs.observation_space.shape, envs.action_space, 100, device) file_name = os.path.join(args.gail_experts_dir, 'expert_data.pkl') expert_dataset = gail.ExpertImageDataset(file_name, act=envs.action_space) gail_train_loader = 
torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=len(expert_dataset) > args.gail_batch_size, ) print('Dataloader size', len(gail_train_loader)) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes print(num_updates) for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Observe reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: try: envs.venv.eval() except: pass gail_epoch = args.gail_epoch if j < 10 and obs_shape == 1: gail_epoch = 100 # Warm up for _ in range(gail_epoch): if obs_shape == 1: discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) else: discr.update(gail_train_loader, rollouts, None) if obs_shape == 3: obfilt = None else: obfilt = utils.get_vec_normalize(envs)._rev_obfilt for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step], obfilt ) # The reverse function is passed down for RED to receive unnormalized obs which it is trained on rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) if args.bcgail: if obs_shape == 3: obfilt = None else: obfilt = utils.get_vec_normalize(envs)._obfilt value_loss, action_loss, dist_entropy = agent.update( rollouts, gail_train_loader, obfilt) else: value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.model_name) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic.state_dict(), getattr(utils.get_vec_normalize(envs), 'ob_rms', None), getattr(utils.get_vec_normalize(envs), 'ret_rms', None) ], os.path.join( save_path, args.model_name + "_{}.pt".format(args.seed))) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / 
(end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
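# --- Editorial sketch (assumption): in the GAIL branch above, the environment
# reward stored in rollouts is overwritten by discr.predict_reward. The exact
# formula inside gail.Discriminator is not reproduced in this file; a common
# choice is r = log D(s, a) - log(1 - D(s, a)), illustrated below with a toy
# MLP discriminator on random tensors.
import torch
import torch.nn as nn


class ToyDiscriminator(nn.Module):
    def __init__(self, obs_dim, act_dim, hidden=100):
        super().__init__()
        self.trunk = nn.Sequential(
            nn.Linear(obs_dim + act_dim, hidden), nn.Tanh(),
            nn.Linear(hidden, 1))

    def predict_reward(self, obs, action):
        with torch.no_grad():
            d = torch.sigmoid(self.trunk(torch.cat([obs, action], dim=1)))
            d = d.clamp(1e-7, 1 - 1e-7)
            # Higher reward when (obs, action) looks expert-like to D.
            return d.log() - (1 - d).log()


# Usage sketch with HalfCheetah-like shapes (17-dim obs, 6-dim action).
_disc = ToyDiscriminator(obs_dim=17, act_dim=6)
_r = _disc.predict_reward(torch.randn(8, 17), torch.randn(8, 6))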
def main(): args = get_args() if args.state: assert args.algo == "ppo" torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) if args.load_dir: actor_critic, obsrms = torch.load(args.load_dir) vec_norm = utils.get_vec_normalize(envs) if vec_norm is not None: vec_norm.train() vec_norm.ob_rms = obsrms actor_critic.base.deterministic = args.deterministic actor_critic.base.humanoid = args.env_name.startswith("SH") else: if args.state: actor_critic = StatePolicy(envs.observation_space.shape, envs.action_space, base_kwargs={ 'recurrent': args.recurrent_policy, 'deterministic': args.deterministic, 'hidden_size': args.code_size, 'humanoid': args.env_name.startswith("SH") }) else: actor_critic = Policy( envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}, ) actor_critic.to(device) if args.state: agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, beta=args.beta, beta_end=args.beta_end, state=True) elif args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) # A bunch of tensors; circular buffer if args.state: rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size, code_size=args.code_size) mis = [] else: rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) # Populate the first observation in rollouts obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) # Rewards is a deque episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): # print(j) agent.ratio = j / num_updates if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" 
else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): if args.state: value, action, action_log_prob, recurrent_hidden_states, eps, code = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # import ipdb; ipdb.set_trace() else: value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obs reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) if args.state: rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks, eps=eps, code=code) else: rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy, mi_loss = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + "_" + str(j) + ".pt")) torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() if args.state: print("DKL loss " + str(mi_loss)) mis.append(mi_loss) print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms if args.env_name.startswith("SH"): masses = [ 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.15, 1.25, 1.35, 1.45, 1.55 ] damps = [ 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.15, 1.25, 1.35, 1.45, 1.55 ] means = np.zeros((len(masses), len(damps))) stds = np.zeros((len(masses), len(damps))) for m_i in range(len(masses)): for d_i in range(len(damps)): m = masses[m_i] d = masses[d_i] u, s = evaluate( actor_critic, ob_rms, 'OracleSHTest' + str(m) + "_" + str(d) + '-v0', args.seed, 
args.num_processes, eval_log_dir, device) means[m_i, d_i] = u stds[m_i, d_i] = s a, _ = args.load_dir.split(".") a = a.split("_")[-1] with open("sh_means_" + str(a) + ".npz", "wb") as f: np.save(f, means) with open("sh_stds_" + str(a) + ".npz", "wb") as f: np.save(f, stds) elif args.env_name.startswith("Oracle"): fs = [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40 ] ls = [ 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.00, 1.05, 1.10, 1.15, 1.20, 1.25, 1.30, 1.35, 1.40, 1.45, 1.50, 1.55, 1.60, 1.65, 1.70 ] a, _ = args.load_dir.split(".") a = a.split("_")[-1] means = np.zeros((len(fs), len(ls))) stds = np.zeros((len(fs), len(ls))) for f_i in range(len(fs)): for l_i in range(len(ls)): f = fs[f_i] l = ls[l_i] u, s = evaluate( actor_critic, ob_rms, 'OracleCartpoleTest' + str(f) + "_" + str(l) + '-v0', args.seed, args.num_processes, eval_log_dir, device) means[f_i, l_i] = u stds[f_i, l_i] = s with open("cp_means" + str(a) + ".npz", "wb") as f: np.save(f, means) with open("cp_stds" + str(a) + ".npz", "wb") as f: np.save(f, stds) elif args.env_name.startswith("HC"): ds = [ 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1750, 1800, 1850, 1900, 1950, 2000 ] us = np.zeros_like(ds) ss = np.zeros_like(ds) for i in range(len(ds)): d = ds[i] u, s = evaluate(actor_critic, ob_rms, "OracleHalfCheetahTest_" + str(d) + "-v0", args.seed, args.num_processes, eval_log_dir, device) us[i] = u ss[i] = s a, _ = args.load_dir.split(".") a = a.split("_")[-1] with open("hc_means" + str(a) + ".npz", "wb") as f: np.save(f, us) with open("hc_stds" + str(a) + ".npz", "wb") as f: np.save(f, ss) assert False, "Evaluation Ended"
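# --- Editorial sketch (assumption): the checkpoints written above are
# [actor_critic, ob_rms] pairs, so any consumer has to restore both the policy
# and the observation-normalisation statistics. The path below is hypothetical.
import torch

ckpt_path = "./trained_models/ppo/OracleCartpole-v0.pt"  # hypothetical
actor_critic_loaded, ob_rms_loaded = torch.load(ckpt_path, map_location="cpu")
actor_critic_loaded.eval()

# Before rolling out, copy the saved running statistics back into the
# VecNormalize wrapper of the (re)created envs, mirroring the pattern above:
#     vec_norm = utils.get_vec_normalize(envs)
#     if vec_norm is not None:
#         vec_norm.eval()
#         vec_norm.ob_rms = ob_rms_loaded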
def main():
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    # Fix the random seed args.seed:
    # seeds the CPU RNG used for random number generation so results are deterministic
    #torch.manual_seed(args.seed)  # sets the seed for all GPUs
    #torch.cuda.manual_seed_all(args.seed)  # fixes the generated random numbers when no GPU is used

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False  # (benchmark searches for the fastest convolution algorithm for every conv layer to speed up the network)
        torch.backends.cudnn.deterministic = True  # always use the same convolution algorithm

    log_dir = os.path.expanduser(args.log_dir)  # expand the Linux "~"
    eval_log_dir = log_dir + "_eval"
    utils.cleanup_log_dir(log_dir)  # create the directory at the given path; if it already exists, delete it and recreate it
    utils.cleanup_log_dir(eval_log_dir)  # create the directory at the given path; if it already exists, delete it and recreate it

    torch.set_num_threads(1)  # keep PyTorch from using too many CPU resources
    device = torch.device("cuda:0" if args.cuda else "cpu")  # choose CPU or GPU

    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={'recurrent': args.recurrent_policy})
    # 'main' attribute: convolutional network used to convolve the input image;
    # 'critic_linear' attribute: fully connected network, probably used to record the Q value.
    actor_critic.to(device)

    if args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm)
    elif args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               acktr=True)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))
        # file_name: ./gail_experts/trajs_halfcheetah.pt
        # ------------start----------------
        # args.gail_experts_dir: ./gail_experts
        # "trajs_{}.pt".format(args.env_name.split('-')[0].lower()): trajs_halfcheetah.pt
        # args.env_name.split('-'): ['HalfCheetah', 'v2']
        # args.env_name.split('-')[0].lower(): halfcheetah
        # file_name: ./gail_experts/trajs_halfcheetah.pt
        # -------------end-----------------
        # print("------------start----------------")
        # print("args.gail_experts_dir:", args.gail_experts_dir)
        # print("trajs_{}.pt.format(args.env_name.split('-')[0].lower())):", "trajs_{}.pt".format(
        #     args.env_name.split('-')[0].lower()))
        # print("args.env_name.split('-'):", args.env_name.split('-'))
        # print("args.env_name.split('-')[0].lower():", args.env_name.split('-')[0].lower())
        # print("file_name:", file_name)
        # print("-------------end-----------------")

        expert_dataset = gail.ExpertDataset(file_name,
                                            num_trajectories=4,
                                            subsample_frequency=20)
        drop_last = len(expert_dataset) > args.gail_batch_size
        gail_train_loader = torch.utils.data.DataLoader(
            dataset=expert_dataset,
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=drop_last)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    episode_rewards = deque(maxlen=10)

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    episode_len = 0
    episode_return = 0
    episode_num = 0
    total_steps = 0

    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) #print("value:", value, "action:", action, "action_log_prob:", action_log_prob, "recurrent_hidden_stes:", recurrent_hidden_states) # value: tensor([[-1.6006]], device='cuda:0') # action: tensor([[0.2846, 0.4442, 0.1657, -1.0094, -1.7039, 0.6084]], # device='cuda:0') # action_log_prob: tensor([[-7.2011]], device='cuda:0') # recurrent_hidden_stes: tensor([[0.]], device='cuda:0') # Obser reward and next obs obs, reward, done, infos = envs.step(action) episode_len += 1 episode_return += reward total_steps += 1 if done: data = [episode_return, episode_len, total_steps] cav_path = "data/csv/GAIL_pytorch_Ant.csv" with open(cav_path, "a+", newline='') as f: # print("-----------{Ant_sam.csv} added a new line!------------".format()) csv_writer = csv.writer(f) csv_writer.writerow(data) episode_return, episode_len = 0, 0 episode_len = 0 episode_num += 1 episode_return = 0 #print("obs:", obs,"rewards:", reward,"donne:", done,"infos:", infos) # obs: tensor([[-0.2471, 0.5996, -0.4484, 1.2435, 0.1895, 1.3830, -0.6217, 0.7217, # -0.6454, 2.1233, -0.8465, -0.5543, 1.2418, -0.9192, 2.0461, -0.7358, # 0.9339]], device='cuda:0') # rewards: tensor([[-0.0649]]) donne: [False] # infos: [{'reward_run': -0.6749512241805, 'reward_ctrl': -0.7481081485748291}] for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), 
np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
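# --- Editorial sketch (assumption): rollouts.compute_returns is not defined in
# this file. With use_gae=True it is assumed to implement standard Generalized
# Advantage Estimation, where masks zero the bootstrap across episode
# boundaries (the bad_masks correction for time limits is omitted here).
import torch


def gae_returns_sketch(rewards, values, next_value, masks, gamma=0.99, lam=0.95):
    """rewards, values, masks: [T, N, 1]; next_value: [N, 1]."""
    T = rewards.size(0)
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * lam * masks[t] * gae
        returns[t] = gae + values[t]
    return returns


# Usage sketch on random tensors (T=5 steps, N=2 workers).
_ret = gae_returns_sketch(torch.randn(5, 2, 1), torch.randn(5, 2, 1),
                          torch.randn(2, 1), torch.ones(5, 2, 1))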
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) # import pdb; pdb.set_trace() save_path = os.path.join(args.save_dir, args.algo) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) # import pdb; pdb.set_trace() actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) # transforms = [ hl.transforms.Prune('Constant') ] # Removes Constant nodes from graph. # graph = hl.build_graph(actor_critic, torch.zeros([1, 1, 64, 64]), transforms=transforms) # graph.theme = hl.graph.THEMES['blue'].copy() # graph.save('rnn_hiddenlayer2', format='png') # print(args.re) # import pdb; pdb.set_trace() my_model_state_dict = actor_critic.state_dict() count = 0 pretrained_weights = torch.load('net_main_4rh_v2_64.pth') # pretrained_weights = torch.load(os.path.join(save_path, args.env_name + "_ft.pt")) # pretrained_weights[''] old_names = list(pretrained_weights.items()) pretrained_weights_items = list(pretrained_weights.items()) for key, value in my_model_state_dict.items(): layer_name, weights = pretrained_weights_items[count] my_model_state_dict[key] = weights print(count) print(layer_name) count += 1 if layer_name == 'enc_dense.bias': break # pretrained_weights = torch.load(os.path.join(save_path, args.env_name + "_random.pt"))[1] actor_critic.load_state_dict(my_model_state_dict) start_epoch = 0 ka = 0 # for param in actor_critic.parameters(): # ka += 1 # # import pdb; pdb.set_trace() # param.requires_grad = False # if ka == 14: # break count = 0 # import pdb; pdb.set_trace()n actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() rewards_mean = [] rewards_median = [] val_loss = [] act_loss = [] num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in 
range(start_epoch, num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, actor_critic.state_dict(), getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + "_finetune.pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) rewards_mean.append(np.mean(episode_rewards)) rewards_median.append(np.median(episode_rewards)) val_loss.append(value_loss) act_loss.append(action_loss) torch.save( rewards_mean, "./plot_data/" + args.env_name + "_avg_rewards_finetune.pt") torch.save( rewards_median, "./plot_data/" + args.env_name + "_median_rewards_finetune.pt") # torch.save(val_loss, "./plot_data/"+args.env_name+"_val_loss_enc_weights.pt") # torch.save(act_loss, "./plot_data/"+args.env_name+"_act_loss_enc_weights.pt") plt.plot(rewards_mean) # print(plt_points2) plt.savefig("./imgs/" + args.env_name + "avg_reward_finetune.png") # plt.show(block = False) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
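# --- Editorial sketch (assumption): the positional weight copy above (indexing
# pretrained_weights_items with a running count until 'enc_dense.bias') only
# works when both state dicts enumerate layers in the same order. A more
# defensive alternative, shown here and not the author's method, matches
# entries by name and shape instead.
import torch


def load_matching_weights(model, pretrained_path, device="cpu"):
    pretrained = torch.load(pretrained_path, map_location=device)
    own = model.state_dict()
    matched = {k: v for k, v in pretrained.items()
               if k in own and own[k].shape == v.shape}
    own.update(matched)
    model.load_state_dict(own)
    print("loaded %d/%d tensors from %s" % (len(matched), len(own),
                                            pretrained_path))
    return model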
def run(args): args.cuda = not args.no_cuda and torch.cuda.is_available() assert args.algo in ['a2c', 'ppo', 'acktr'] if args.model.recurrent: assert args.algo in ['a2c', 'ppo'], \ 'Recurrent policy is not implemented for ACKTR' use_wandb = args.use_wandb eval_eps = args.eval_eps if use_wandb: experiment_name = f"{args.full_title}_{args.run_id}" from wandb_key import WANDB_API_KEY os.environ['WANDB_API_KEY'] = WANDB_API_KEY wandb.init(project="atari_ppo", name=experiment_name) wandb.config.update(dict(flatten_cfg(args))) if args.seed == 0: args.seed = args.run_id + 1 print(f"SEED: {args.seed}") torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = args.out_dir os.environ['OPENAI_LOGDIR'] = args.out_dir eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) flog = open(log_dir + "/logs.csv", 'w') log_writer = csv.DictWriter(flog, LOG_HEADER.keys()) log_writer.writeheader() torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, log_dir, device, False) base_model = get_model(args.model, envs.observation_space.shape, envs.action_space) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_model, base_kwargs=args.model) actor_critic.to(device) print("Neural Network:") print(actor_critic) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes eval_episodes = args.eval_episodes repeat_eval_eps = getattr(args, "repeat_eval_eps", 1) eval_env_max_steps = 6000 * (eval_episodes // args.num_processes + 1) # load eval envs checkpoints num_batch_eval_ckpt_envs = 10 eval_checkpoint_envs = np.load("env_test_states.npy", allow_pickle=True).item()[args.env_name] eval_checkpoint_envs = eval_checkpoint_envs[:(num_batch_eval_ckpt_envs * args.num_processes)] # 80 for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr 
if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, args.env_name + ".pt")) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {} | num timesteps {} | FPS {} | Last {} training episodes: mean/median " "reward {:.1f}/{:.1f} | min/max reward {:.1f}/{:.1f}".format( j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) data_plot = { "update": j, "timesteps": total_num_steps, "reward": np.mean(episode_rewards), "median": np.median(episode_rewards), "min": np.min(episode_rewards), "max": np.max(episode_rewards), "dist_entropy": dist_entropy, "value_loss": value_loss, "action_loss": action_loss, } log_writer.writerow(data_plot) if use_wandb: wandb.log(data_plot, step=total_num_steps) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): total_num_steps = (j + 1) * args.num_processes * args.num_steps ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None) determinitistic = args.eval_determinitistic make_env_args = (args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, device, True) eval_args = (actor_critic, ob_rms, args.num_processes, device) eval_kwargs = dict({ "deterministic": determinitistic, "eval_ep": eval_episodes, "max_steps": eval_env_max_steps, "eval_envs": None, }) # -------------------------------------------------------------------------------------- # Evaluate simple 1 step eps greedy eval_inf = dict() base_score = 0 evaluations = [ ("train", dict({ "eps": 0., 
"repeat_eps": 1, "use_rand_actions": True })), ("eps", dict({ "eps": eval_eps, "repeat_eps": 1, "use_rand_actions": True })), ("eps_rp_10", dict({ "eps": eval_eps, "repeat_eps": 10, "use_rand_actions": True })), ("eps_rp_10_la", dict({ "eps": eval_eps, "repeat_eps": 10, "use_rand_actions": False })), ("eps10_rp_20", dict({ "eps": eval_eps * 2, "repeat_eps": 20, "use_rand_actions": True })), ] for eval_name, eval_custom_args in evaluations: eval_envs = make_vec_envs(*make_env_args) eval_kwargs["eval_envs"] = eval_envs eval_info = evaluate_same_env(*eval_args, **eval_kwargs, **eval_custom_args) eval_envs.close() if eval_info is not None: for k, v in eval_info.items(): eval_inf[f"{eval_name}_{k}"] = v if eval_name == "train": if eval_info is None: base_score = None else: base_score = eval_info["eval_reward"] elif base_score is not None: eval_inf[f"{eval_name}_gap"] = base_score - eval_info[ "eval_reward"] # -------------------------------------------------------------------------------------- # Eval from checkpoint ckpt_r = [] for batch_i in range(0, len(eval_checkpoint_envs), args.num_processes): eval_states = eval_checkpoint_envs[batch_i:batch_i + args.num_processes] eval_envs = make_vec_envs_state(args.env_name, args.seed + args.num_processes, args.num_processes, args.gamma, eval_log_dir, device, True, eval_states) eval_kwargs["eval_envs"] = eval_envs eval_info = evaluate_first_ep(actor_critic, ob_rms, args.num_processes, device, eval_envs=eval_envs) eval_envs.close() ckpt_r += eval_info["eval_reward"] eval_inf[f"eval_ckpt_reward"] = np.mean(ckpt_r) if base_score != 0 and base_score is not None: eval_inf[f"eval_ckpt_reward_gap"] = base_score - np.mean( ckpt_r) print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( len(ckpt_r), np.mean(ckpt_r))) # -------------------------------------------------------------------------------------- if use_wandb and len(eval_inf) > 0: wandb.log(eval_inf, step=total_num_steps)
lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, ) elif args.algo == "random": agent = algo.RANDOM_AGENT(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) actor_critic = RandomPolicy( obs_shape, envs.action_space, base_kwargs={"recurrent": args.recurrent_policy}, navi=args.navi, base=base, ) elif args.algo == "acktr": agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator(envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join(args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split("-")[0].lower())) gail_train_loader = torch.utils.data.DataLoader( gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20), batch_size=args.gail_batch_size, shuffle=True, drop_last=True, ) rollouts = RolloutStorage( args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size,
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) save_name = '%s_%s' % (args.env_name, args.algo) if args.postfix != '': save_name += ('_' + args.postfix) logger_filename = os.path.join(log_dir, save_name) logger = utils.create_logger(logger_filename) torch.set_num_threads(1) device = torch.device("cuda:%d" % args.gpu if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, 4, obs_type="grid" if args.grid else "image", skip_frames=args.num_skip_frames) if args.load_dir != None: actor_critic, ob_rms = \ torch.load(os.path.join(args.load_dir), map_location=lambda storage, loc: storage) vec_norm = utils.get_vec_normalize(envs) if vec_norm is not None: vec_norm.ob_rms = ob_rms print("load pretrained...") else: actor_critic = Policy(envs.observation_space.shape, envs.action_space, base="grid" if args.grid else None, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) gail_train_loader = torch.utils.data.DataLoader( gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20), batch_size=args.gail_batch_size, shuffle=True, drop_last=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) lines = deque(maxlen=10) start = time.time() kk = 0 num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes # learning_start = 1000 learning_start = 0 best_reward = -100 for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) explore = exploration_rate(j - learning_start, 'exp') # print(j) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # if j < learning_start: # action[0, 0] = random.randint(0, envs.action_space.n - 1) # elif random.uniform(0, 1) < explore: # action[0, 0] = random.randint(0, envs.action_space.n - 1) # else: # pass # Obser reward and next obs # action[0, 0] = 1 # envs.take_turns() obs, reward, done, infos = envs.step(action) # 
print(obs) # im = Image.fromarray(obs[0].reshape(224 * 4, -1).cpu().numpy().astype(np.uint8)) # im.save("samples/%d.png" % kk) # kk += 1 # info = infos[0] # if len(info) > 0: # print(info) # print(done) # print(infos) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) if 'sent' in info.keys(): lines.append(info['sent']) # kk += 1 # print(action.shape) # print(obs.shape) # print(done.shape) # if done[0]: # print(time.time() - start) # print(kk) # exit() # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "" \ and np.mean(episode_rewards) > best_reward: save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass best_reward = np.mean(episode_rewards) torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, save_name + ".pt")) # print(episode_rewards) if j % args.log_interval == 0 and len(episode_rewards) > 1: if j < learning_start: logger.info("random action") total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() logger.info( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) logger.info( ' lines sent: mean/median lines {:.1f}/{:.1f}, min/max lines {:.1f}/{:.1f}\n' .format(np.mean(lines), np.median(lines), np.min(lines), np.max(lines))) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
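# utils.update_linear_schedule above anneals the optimizer's learning rate toward
# zero over training. A minimal equivalent sketch (assuming every param group
# should receive the same decayed rate):
def _sketch_update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    """Linearly decay the learning rate from initial_lr to 0."""
    lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr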
def main(): args = get_args() # Record trajectories if args.record_trajectories: record_trajectories() return print(args) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True # Append the model name log_dir = os.path.expanduser(args.log_dir) log_dir = os.path.join(log_dir, args.model_name, str(args.seed)) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, log_dir, device, False) # Take activation for carracing print("Loaded env...") activation = None if args.env_name == 'CarRacing-v0' and args.use_activation: activation = torch.tanh print(activation) actor_critic = Policy(envs.observation_space.shape, envs.action_space, base_kwargs={ 'recurrent': args.recurrent_policy, 'env': args.env_name }, activation=activation) actor_critic.to(device) # Load from previous model if args.load_model_name: state = torch.load( os.path.join(args.save_dir, args.load_model_name, args.load_model_name + '_{}.pt'.format(args.seed)))[0] try: actor_critic.load_state_dict(state) except: actor_critic = state if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: if len(envs.observation_space.shape) == 1: discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=3, subsample_frequency=1) expert_dataset_test = gail.ExpertDataset(file_name, num_trajectories=1, start=3, subsample_frequency=1) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) gail_test_loader = torch.utils.data.DataLoader( dataset=expert_dataset_test, batch_size=args.gail_batch_size, shuffle=False, drop_last=False) print(len(expert_dataset), len(expert_dataset_test)) else: # env observation shape is 3 => its an image assert len(envs.observation_space.shape) == 3 discr = gail.CNNDiscriminator(envs.observation_space.shape, envs.action_space, 100, device) file_name = os.path.join(args.gail_experts_dir, 'expert_data.pkl') expert_dataset = gail.ExpertImageDataset(file_name, train=True) test_dataset = gail.ExpertImageDataset(file_name, train=False) gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=len(expert_dataset) > args.gail_batch_size, ) gail_test_loader = torch.utils.data.DataLoader( dataset=test_dataset, batch_size=args.gail_batch_size, shuffle=False, drop_last=len(test_dataset) > args.gail_batch_size, ) print('Dataloader size', len(gail_train_loader)) rollouts = RolloutStorage(args.num_steps, 
args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() #num_updates = int( #args.num_env_steps) // args.num_steps // args.num_processes num_updates = args.num_steps print(num_updates) # count the number of times validation loss increases val_loss_increase = 0 prev_val_action = np.inf best_val_loss = np.inf for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Observe reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: try: envs.venv.eval() except: pass gail_epoch = args.gail_epoch #if j < 10: #gail_epoch = 100 # Warm up for _ in range(gail_epoch): #discr.update(gail_train_loader, rollouts, #None) pass for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) #value_loss, action_loss, dist_entropy = agent.update(rollouts) value_loss = 0 dist_entropy = 0 for data in gail_train_loader: expert_states, expert_actions = data expert_states = Variable(expert_states).to(device) expert_actions = Variable(expert_actions).to(device) loss = agent.update_bc(expert_states, expert_actions) action_loss = loss.data.cpu().numpy() print("Epoch: {}, Loss: {}".format(j, action_loss)) with torch.no_grad(): cnt = 0 val_action_loss = 0 for data in gail_test_loader: expert_states, expert_actions = data expert_states = Variable(expert_states).to(device) expert_actions = Variable(expert_actions).to(device) loss = agent.get_action_loss(expert_states, expert_actions) val_action_loss += loss.data.cpu().numpy() cnt += 1 val_action_loss /= cnt print("Val Loss: {}".format(val_action_loss)) #rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": if val_action_loss < best_val_loss: val_loss_increase = 0 best_val_loss = val_action_loss save_path = os.path.join(args.save_dir, args.model_name) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic.state_dict(), getattr(utils.get_vec_normalize(envs), 'ob_rms', None), getattr(utils.get_vec_normalize(envs), 'ret_rms', None) ], os.path.join( save_path, args.model_name + "_{}.pt".format(args.seed))) elif val_action_loss > prev_val_action: val_loss_increase += 1 if val_loss_increase == 10: 
print("Val loss increasing too much, breaking here...") break elif val_action_loss < prev_val_action: val_loss_increase = 0 # Update prev val action prev_val_action = val_action_loss # log interval if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() print( "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" .format(j, total_num_steps, int(total_num_steps / (end - start)), len(episode_rewards), np.mean(episode_rewards), np.median(episode_rewards), np.min(episode_rewards), np.max(episode_rewards), dist_entropy, value_loss, action_loss)) if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): ob_rms = utils.get_vec_normalize(envs).ob_rms evaluate(actor_critic, ob_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
def main(): #wandb.run = config.tensorboard.run wandb.init(settings=wandb.Settings(start_method="fork"), project='growspaceenv_baselines', entity='growspace') #torch.manual_seed(config.seed) #torch.cuda.manual_seed_all(config.seed) if config.cuda and torch.cuda.is_available() and config.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(config.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if config.cuda else "cpu") envs = make_vec_envs(config.env_name, config.seed, config.num_processes, config.gamma, config.log_dir, device, False, config.custom_gym) if "Mnist" in config.env_name: base = 'Mnist' else: base = None actor_critic = Policy(envs.observation_space.shape, envs.action_space, base, base_kwargs={'recurrent': config.recurrent_policy}) actor_critic.to(device) if config.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, config.value_loss_coef, config.entropy_coef, lr=config.lr, eps=config.eps, alpha=config.alpha, max_grad_norm=config.max_grad_norm) elif config.algo == 'ppo': agent = algo.PPO(actor_critic, config.clip_param, config.ppo_epoch, config.num_mini_batch, config.value_loss_coef, config.entropy_coef, lr=config.lr, eps=config.eps, max_grad_norm=config.max_grad_norm, optimizer=config.optimizer, momentum=config.momentum) elif config.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, config.value_loss_coef, config.entropy_coef, acktr=True) if config.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( config.gail_experts_dir, "trajs_{}.pt".format(config.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > config.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=config.gail_batch_size, shuffle=True, drop_last=drop_last) rollouts = RolloutStorage(config.num_steps, config.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = [] episode_length = [] episode_branches = [] episode_branch1 = [] episode_branch2 = [] episode_light_width = [] episode_light_move = [] episode_success = [] episode_plantpixel = [] start = time.time() num_updates = int( config.num_env_steps) // config.num_steps // config.num_processes x = 0 action_space_type = envs.action_space for j in range(num_updates): if isinstance(action_space_type, Discrete): action_dist = np.zeros(envs.action_space.n) total_num_steps = (j + 1) * config.num_processes * config.num_steps if config.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if config.algo == "acktr" else config.lr) #new_branches = [] for step in range(config.num_steps): with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) if isinstance(action_space_type, Discrete): action_dist[action] += 1 for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) 
episode_length.append(info['episode']['l']) wandb.log({"Episode_Reward": info['episode']['r']}, step=total_num_steps) if 'new_branches' in info.keys(): episode_branches.append(info['new_branches']) if 'new_b1' in info.keys(): episode_branch1.append(info['new_b1']) if 'new_b2' in info.keys(): episode_branch2.append(info['new_b2']) if 'light_width' in info.keys(): episode_light_width.append(info['light_width']) if 'light_move' in info.keys(): episode_light_move.append(info['light_move']) if 'success' in info.keys(): episode_success.append(info['success']) if 'plant_pixel' in info.keys(): episode_plantpixel.append(info['plant_pixel']) if j == x: if 'img' in info.keys(): img = info['img'] path = './hittiyas/growspaceenv_braselines/scripts/imgs/' cv2.imwrite( os.path.join(path, 'step' + str(step) + '.png'), img) x += 1000 # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if config.gail: if j >= 10: envs.venv.eval() gail_epoch = config.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(config.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], config.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, config.use_gae, config.gamma, config.gae_lambda, config.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % config.save_interval == 0 or j == num_updates - 1) and config.save_dir != "": save_path = os.path.join(config.save_dir, config.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'ob_rms', None) ], os.path.join(save_path, config.env_name + ".pt")) if j % config.log_interval == 0 and len(episode_rewards) > 1: if isinstance(action_space_type, Discrete): np_hist = np.histogram(np.arange(action_dist.shape[0]), weights=action_dist) wandb.log( { "Discrete Actions": wandb.Histogram(np_histogram=np_hist) }, step=total_num_steps) wandb.log({"Reward Min": np.min(episode_rewards)}, step=total_num_steps) wandb.log({"Summed Reward": np.sum(episode_rewards)}, step=total_num_steps) wandb.log({"Reward Mean": np.mean(episode_rewards)}, step=total_num_steps) wandb.log({"Reward Max": np.max(episode_rewards)}, step=total_num_steps) wandb.log( {"Number of Mean New Branches": np.mean(episode_branches)}, step=total_num_steps) wandb.log({"Number of Max New Branches": np.max(episode_branches)}, step=total_num_steps) wandb.log({"Number of Min New Branches": np.min(episode_branches)}, step=total_num_steps) wandb.log( { "Number of Mean New Branches of Plant 1": np.mean(episode_branch1) }, step=total_num_steps) wandb.log( { "Number of Mean New Branches of Plant 2": np.mean(episode_branch2) }, step=total_num_steps) wandb.log( { "Number of Total Displacement of Light": np.sum(episode_light_move) }, step=total_num_steps) wandb.log({"Mean Light Displacement": episode_light_move}, step=total_num_steps) wandb.log({"Mean Light 
Width": episode_light_width}, step=total_num_steps) wandb.log( { "Number of Steps in Episode with Tree is as close as possible": np.sum(episode_success) }, step=total_num_steps) wandb.log({"Entropy": dist_entropy}, step=total_num_steps) wandb.log( { "Displacement of Light Position": wandb.Histogram(episode_light_move) }, step=total_num_steps) wandb.log( { "Displacement of Beam Width": wandb.Histogram(episode_light_width) }, step=total_num_steps) wandb.log({"Mean Plant Pixel": np.mean(episode_plantpixel)}, step=total_num_steps) wandb.log({"Summed Plant Pixel": np.sum(episode_plantpixel)}, step=total_num_steps) wandb.log( {"Plant Pixel Histogram": wandb.Histogram(episode_plantpixel)}, step=total_num_steps) episode_rewards.clear() episode_length.clear() episode_branches.clear() episode_branch2.clear() episode_branch1.clear() episode_light_move.clear() episode_light_width.clear() episode_success.clear() episode_plantpixel.clear() if (config.eval_interval is not None and len(episode_rewards) > 1 and j % config.eval_interval == 0): ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None) evaluate(actor_critic, ob_rms, config.env_name, config.seed, config.num_processes, eval_log_dir, device, config.custom_gym) ob_rms = getattr(utils.get_vec_normalize(envs), 'ob_rms', None) evaluate(actor_critic, ob_rms, config.env_name, config.seed, config.num_processes, eval_log_dir, device, config.custom_gym, gif=True)
def main(): args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") #envs = make_vec_envs(args.env_name, args.seed, args.num_processes, # args.gamma, args.log_dir, device, False) envs = make_parallel_env(args.env_name, args.num_processes, args.seed, True) ''' actor_critic = Policy( envs.observation_space[0].shape, envs.action_space[0], agent_num=args.agent_num, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) ''' actor_critic = [] for i in range(args.agent_num): ac = Policy( envs.observation_space[0].shape, envs.action_space[0], agent_num=args.agent_num, agent_i = i, base_kwargs={'recurrent': args.recurrent_policy}) ac.to(device) actor_critic.append(ac) if args.algo == 'a2c': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': ''' agent = algo.PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) ''' agent = [] for i in range(args.agent_num): agent.append(algo.PPO( actor_critic[i], i, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm, model_dir = args.model_dir)) elif args.algo == 'acktr': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format( args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset( file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) ''' rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space[0].shape, envs.action_space[0], actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(torch.tensor(obs[:,0,:])) rollouts.to(device) ''' rollouts = [] for i in range(args.agent_num): rollout = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space[0].shape, envs.action_space[0], actor_critic[i].recurrent_hidden_state_size, args.agent_num, i) rollouts.append(rollout) obs = envs.reset() # pdb.set_trace() for i in range(args.agent_num): rollouts[i].share_obs[0].copy_(torch.tensor(obs.reshape(args.num_processes, -1))) rollouts[i].obs[0].copy_(torch.tensor(obs[:,i,:])) rollouts[i].to(device) episode_rewards = deque(maxlen=10) num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes print(num_updates) for j in range(num_updates): #pdb.set_trace() if args.use_linear_lr_decay: # decrease learning rate linearly for i in range(args.agent_num): utils.update_linear_schedule(agent[i].optimizer, j, num_updates, agent[i].optimizer.lr if args.algo == 
"acktr" else args.lr) for step in range(args.num_steps): # Sample actions value_list, action_list, action_log_prob_list, recurrent_hidden_states_list = [], [], [], [] with torch.no_grad(): for i in range(args.agent_num): #pdb.set_trace() value, action, action_log_prob, recurrent_hidden_states = actor_critic[i].act( rollouts[i].share_obs[step], rollouts[i].obs[step], rollouts[i].recurrent_hidden_states[step], rollouts[i].masks[step]) # import pdb; pdb.set_trace() value_list.append(value) action_list.append(action) action_log_prob_list.append(action_log_prob) recurrent_hidden_states_list.append(recurrent_hidden_states) # Obser reward and next obs action = [] for i in range(args.num_processes): one_env_action = [] for k in range(args.agent_num): one_hot_action = np.zeros(envs.action_space[0].n) one_hot_action[action_list[k][i]] = 1 one_env_action.append(one_hot_action) action.append(one_env_action) #start = time.time() #pdb.set_trace() obs, reward, done, infos = envs.step(action) # print(obs[0][0]) # pdb.set_trace() #end = time.time() #print("step time: ", end-start) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. ''' masks = torch.FloatTensor( [[0.0] if done_ else [1.0] for done_ in done[0]]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos[0]]) ''' masks = torch.ones(args.num_processes, 1) bad_masks = torch.ones(args.num_processes, 1) ''' rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) ''' #import pdb; pdb.set_trace() for i in range(args.agent_num): rollouts[i].insert(torch.tensor(obs.reshape(args.num_processes, -1)), torch.tensor(obs[:,i,:]), recurrent_hidden_states, action_list[i], action_log_prob_list[i], value_list[i], torch.tensor(reward[:, i].reshape(-1,1)), masks, bad_masks) #import pdb; pdb.set_trace() with torch.no_grad(): next_value_list = [] for i in range(args.agent_num): next_value = actor_critic[i].get_value( rollouts[i].share_obs[-1], rollouts[i].obs[-1], rollouts[i].recurrent_hidden_states[-1], rollouts[i].masks[-1]).detach() next_value_list.append(next_value) if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) for i in range(args.agent_num): rollouts[i].compute_returns(next_value_list[i], args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) #import pdb; pdb.set_trace() for i in range(args.agent_num): value_loss, action_loss, dist_entropy = agent[i].update(rollouts[i]) if (i == 0): print("value loss: " + str(value_loss)) # print(value_loss) # pdb.set_trace() #rollouts.after_update() obs = envs.reset() # pdb.set_trace() for i in range(args.agent_num): rollouts[i].share_obs[0].copy_(torch.tensor(obs.reshape(args.num_processes, -1))) rollouts[i].obs[0].copy_(torch.tensor(obs[:,i,:])) rollouts[i].to(device) # save for every interval-th episode or for the last epoch #pdb.set_trace() if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) if not os.path.exists(save_path + args.model_dir): os.makedirs(save_path + args.model_dir) for i in range(args.agent_num): 
                torch.save([
                    actor_critic[i],
                    getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
                ], save_path + args.model_dir + '/agent_%i' % (i + 1) + ".pt")

        '''
        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))
        '''
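# Each agent's policy is saved above as a [model, ob_rms] pair, one file per agent.
# A sketch of the matching load side (paths and variable names are assumptions that
# simply mirror the save call above):
def _sketch_load_agents(save_path, model_dir, agent_num, device):
    import torch
    actor_critics, ob_rms_list = [], []
    for i in range(agent_num):
        ckpt = save_path + model_dir + '/agent_%i' % (i + 1) + ".pt"
        model, ob_rms = torch.load(ckpt, map_location=device)
        actor_critics.append(model)
        ob_rms_list.append(ob_rms)
    return actor_critics, ob_rms_list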
def run(self): args = self.args torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) print("CUDA is available: ", torch.cuda.is_available()) if args.cuda: print("CUDA enabled") torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True else: if args.cuda_deterministic: print("Warning CUDA is requested but is not available") else: print("CUDA disabled") log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) print("get_num_thread", torch.get_num_threads()) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, self.config_parameters, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) actor_critic = create_IAM_model(envs, args, self.config_parameters) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) # This algorithm should be used for the reproduction project. elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) # Always return the average of the last 100 steps. This means the average is sampled. episode_rewards = deque(maxlen=100) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) bad_masks = torch.FloatTensor( [[0.0] if 'bad_transition' in info.keys() else [1.0] for info in infos]) rollouts.insert(obs, recurrent_hidden_states, action, action_log_prob, value, reward, masks, bad_masks) with torch.no_grad(): next_value = actor_critic.get_value( rollouts.obs[-1], rollouts.recurrent_hidden_states[-1], rollouts.masks[-1]).detach() if args.gail: if j >= 10: envs.venv.eval() gail_epoch = args.gail_epoch if j < 10: gail_epoch = 100 # Warm up for _ in range(gail_epoch): discr.update(gail_train_loader, rollouts, utils.get_vec_normalize(envs)._obfilt) for step in range(args.num_steps): rollouts.rewards[step] = discr.predict_reward( rollouts.obs[step], rollouts.actions[step], args.gamma, rollouts.masks[step]) rollouts.compute_returns(next_value, args.use_gae, args.gamma, args.gae_lambda, args.use_proper_time_limits) value_loss, action_loss, dist_entropy = agent.update(rollouts) rollouts.after_update() # save for every interval-th episode or for the last epoch if (j % args.save_interval == 0 or j == num_updates - 1) and args.save_dir != "": save_path = os.path.join(args.save_dir, args.algo) try: os.makedirs(save_path) except OSError: pass torch.save([ actor_critic, getattr(utils.get_vec_normalize(envs), 'obs_rms', None) ], os.path.join(save_path, self.model_file_name)) if j % args.log_interval == 0 and len(episode_rewards) > 1: total_num_steps = (j + 1) * args.num_processes * args.num_steps end = time.time() elapsed_time = end - start data = [ j, # Updates total_num_steps, # timesteps int(total_num_steps / elapsed_time), # FPS len(episode_rewards), # Only useful for print statement np.mean(episode_rewards), # mean of rewards np.median(episode_rewards), # median of rewards np.min(episode_rewards), # min rewards np.max(episode_rewards), # max rewards dist_entropy, value_loss, action_loss, elapsed_time ] output = ''.join([str(x) + ',' for x in data]) self.data_saver.append(output) print( f"Updates {data[0]}, num timesteps {data[1]}, FPS {data[2]}, elapsed time {int(data[11])} sec. Last {data[3]} training episodes: mean/median reward {data[4]:.2f}/{data[5]:.2f}, min/max reward {data[6]:.1f}/{data[7]:.1f}", end="\r") if (args.eval_interval is not None and len(episode_rewards) > 1 and j % args.eval_interval == 0): obs_rms = utils.get_vec_normalize(envs).obs_rms evaluate(actor_critic, obs_rms, args.env_name, args.seed, args.num_processes, eval_log_dir, device)
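# rollouts.compute_returns above is the GAE pass over the collected batch. A
# self-contained sketch of the same recursion, assuming rewards/values/masks are
# [T, N, 1] tensors and masks[t] is 0 where step t ended an episode:
def _sketch_gae_returns(rewards, values, masks, next_value, gamma=0.99, lam=0.95):
    import torch
    values = torch.cat([values, next_value.unsqueeze(0)], dim=0)   # [T+1, N, 1]
    returns = torch.zeros_like(rewards)
    gae = torch.zeros_like(next_value)
    for t in reversed(range(rewards.size(0))):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * lam * masks[t] * gae
        returns[t] = gae + values[t]
    return returns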
def main(): args = get_args() writer = SummaryWriter(os.path.join('logs', args.save_name), ) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs( basic_env.BasicFlatDiscreteEnv, args.seed, args.num_processes, args.gamma, args.log_dir, device, False, task='lift', gripper_type='RobotiqThreeFingerDexterousGripper', robot='Panda', controller='JOINT_TORQUE' if args.vel else 'JOINT_POSITION', horizon=1000, reward_shaping=True) actor_critic = Policy( envs.observation_space.shape, envs.action_space, base=Surreal, # base=OpenAI, # base=MLP_ATTN, base_kwargs={ 'recurrent': args.recurrent_policy, # 'dims': basic_env.BasicFlatEnv().modality_dims 'config': dict(act='relu' if args.relu else 'tanh', rec=args.rec, fc=args.fc) }) print(actor_critic) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR(actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format(args.env_name.split('-')[0].lower())) expert_dataset = gail.ExpertDataset(file_name, num_trajectories=4, subsample_frequency=20) drop_last = len(expert_dataset) > args.gail_batch_size gail_train_loader = torch.utils.data.DataLoader( dataset=expert_dataset, batch_size=args.gail_batch_size, shuffle=True, drop_last=drop_last) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=100) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes best_reward = 0 for j in range(num_updates): if args.use_linear_lr_decay: # decrease learning rate linearly utils.update_linear_schedule( agent.optimizer, j, num_updates, agent.optimizer.lr if args.algo == "acktr" else args.lr) writer.add_scalar('lr', agent.optimizer.param_groups[0]['lr']) for step in range(args.num_steps): # Sample actions with torch.no_grad(): value, action, action_log_prob, recurrent_hidden_states = actor_critic.act( rollouts.obs[step], rollouts.recurrent_hidden_states[step], rollouts.masks[step]) # Obser reward and next obs obs, reward, done, infos = envs.step(action) for info in infos: if 'episode' in info.keys(): episode_rewards.append(info['episode']['r']) # If done then clean the history of observations. 
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()

            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)

            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)

        end = time.time()
        total_num_steps = (j + 1) * args.num_processes * args.num_steps
        if len(episode_rewards) > 1:
            writer.add_scalar('loss/value', value_loss, total_num_steps)
            writer.add_scalar('loss/policy', action_loss, total_num_steps)
            writer.add_scalar('experiment/num_updates', j, total_num_steps)
            writer.add_scalar('experiment/FPS',
                              int(total_num_steps / (end - start)),
                              total_num_steps)
            writer.add_scalar('experiment/EPISODE MEAN',
                              np.mean(episode_rewards), total_num_steps)
            writer.add_scalar('experiment/EPISODE MEDIAN',
                              np.median(episode_rewards), total_num_steps)
            writer.add_scalar('experiment/EPISODE MIN',
                              np.min(episode_rewards), total_num_steps)
            writer.add_scalar('experiment/EPISODE MAX',
                              np.max(episode_rewards), total_num_steps)

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if len(episode_rewards) > 1 and args.save_dir != "":
            rew = np.mean(episode_rewards)
            if rew > best_reward:
                best_reward = rew
                print('saved with best reward', rew)
                save_path = os.path.join(args.save_dir, args.algo)
                try:
                    os.makedirs(save_path)
                except OSError:
                    pass

                torch.save([
                    actor_critic,
                    getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
                ], os.path.join(save_path, args.save_name + ".pt"))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            obs_rms = utils.get_vec_normalize(envs).obs_rms
            evaluate(actor_critic, obs_rms, args.env_name, args.seed,
                     args.num_processes, eval_log_dir, device)

    writer.close()
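# Checkpointing above is gated on the running mean episode reward, so only an
# improving policy overwrites the saved file. The same pattern isolated as a
# sketch; the checkpoint path and argument names here are illustrative assumptions:
def _sketch_save_if_best(actor_critic, obs_rms, episode_rewards, best_reward, ckpt_path):
    import numpy as np
    import torch
    rew = float(np.mean(episode_rewards))
    if rew > best_reward:
        torch.save([actor_critic, obs_rms], ckpt_path)  # keep only the best policy
        best_reward = rew
    return best_reward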
def main(): ##TensorboardX summary = SummaryWriter() args = get_args() torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) if args.cuda and torch.cuda.is_available() and args.cuda_deterministic: torch.backends.cudnn.benchmark = False torch.backends.cudnn.deterministic = True log_dir = os.path.expanduser(args.log_dir) eval_log_dir = log_dir + "_eval" utils.cleanup_log_dir(log_dir) utils.cleanup_log_dir(eval_log_dir) torch.set_num_threads(1) device = torch.device("cuda:0" if args.cuda else "cpu") envs = make_vec_envs(args.env_name, args.seed, args.num_processes, args.gamma, args.log_dir, device, False) actor_critic = Policy( envs.observation_space.shape, envs.action_space, base_kwargs={'recurrent': args.recurrent_policy}) actor_critic.to(device) if args.algo == 'a2c': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, alpha=args.alpha, max_grad_norm=args.max_grad_norm) elif args.algo == 'ppo': agent = algo.PPO( actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) elif args.algo == 'acktr': agent = algo.A2C_ACKTR( actor_critic, args.value_loss_coef, args.entropy_coef, acktr=True) if args.gail: assert len(envs.observation_space.shape) == 1 discr = gail.Discriminator( envs.observation_space.shape[0] + envs.action_space.shape[0], 100, device) file_name = os.path.join( args.gail_experts_dir, "trajs_{}.pt".format( args.env_name.split('-')[0].lower())) gail_train_loader = torch.utils.data.DataLoader( gail.ExpertDataset( file_name, num_trajectories=4, subsample_frequency=20), batch_size=args.gail_batch_size, shuffle=True, drop_last=True) rollouts = RolloutStorage(args.num_steps, args.num_processes, envs.observation_space.shape, envs.action_space, actor_critic.recurrent_hidden_state_size) obs = envs.reset() rollouts.obs[0].copy_(obs) rollouts.to(device) episode_rewards = deque(maxlen=10) start = time.time() num_updates = int( args.num_env_steps) // args.num_steps // args.num_processes print(num_updates)
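# num_updates above is the number of optimization iterations: total environment
# steps divided by the steps gathered per update across all workers. A quick worked
# example with assumed values (not necessarily this run's settings):
_example_num_env_steps, _example_num_steps, _example_num_processes = 10_000_000, 2048, 4
_example_num_updates = _example_num_env_steps // _example_num_steps // _example_num_processes  # -> 1220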