def train(train_states, run_dir, num_env_steps, eval_env_steps, writer,
          writer_name, args, init_model=None):
    """Run the PPO/A2C/ACKTR training loop.

    Args:
        train_states: level/state identifiers passed to ``make_vec_envs``.
        run_dir: output directory; videos and checkpoints are written beneath it.
        num_env_steps: total number of environment steps to train for.
        eval_env_steps: steps per evaluation run (0 disables evaluation).
        writer: TensorBoard ``SummaryWriter``.
        writer_name: tag suffix used for the per-run scalar names.
        args: parsed hyperparameter namespace.
        init_model: optional ``(actor_critic, env_step, model_name)`` tuple to
            resume from; when ``None`` a fresh ``Policy`` is created.

    Returns:
        ``((actor_critic, env_step, run_name), eval_score, eval_dict)`` where the
        eval values are ``None`` when ``eval_env_steps <= 0``.
    """
    envs = make_vec_envs(train_states, args.seed, args.num_processes,
                         args.gamma, 'cpu', 'train', args)

    if init_model:
        # Resume: the checkpoint carries the model plus its step counter.
        actor_critic, env_step, model_name = init_model
        obs_space = actor_critic.obs_space
        obs_process = actor_critic.obs_process
        obs_module = actor_critic.obs_module
        print(f" [load] Loaded model {model_name} at step {env_step}")
    else:
        obs_space = envs.observation_space
        actor_critic = Policy(obs_space,
                              args.obs_process,
                              args.obs_module,
                              envs.action_space,
                              base_kwargs={'recurrent': args.recurrent_policy})
        env_step = 0
    actor_critic.to(args.device)
    #print(actor_critic)

    run_name = run_dir.replace('/', '_')
    vid_save_dir = f"{run_dir}/videos/"
    try:
        os.makedirs(vid_save_dir)
    except OSError:
        pass  # directory already exists
    ckpt_save_dir = f"{run_dir}/ckpts/"
    try:
        os.makedirs(ckpt_save_dir)
    except OSError:
        pass  # directory already exists

    if args.algo == 'ppo':
        agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         args.device,
                         lr=args.lr,
                         eps=args.eps,
                         max_grad_norm=args.max_grad_norm)
    elif args.algo == 'a2c':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm,
                               acktr=False)
    elif args.algo == 'acktr':
        agent = algo.A2C_ACKTR(actor_critic,
                               args.value_loss_coef,
                               args.entropy_coef,
                               lr=args.lr,
                               eps=args.eps,
                               alpha=args.alpha,
                               max_grad_norm=args.max_grad_norm,
                               acktr=True)
    else:
        raise NotImplementedError

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    actor_critic.eval()
    """
    try:
        writer.add_graph(actor_critic, obs)
    except ValueError:
        print("Unable to write model graph to tensorboard.")
    """
    actor_critic.train()

    # Seed the rollout buffer with the initial observation.
    # NOTE(review): only index [0] of each observation tensor is copied here —
    # looks like it should be the whole batch (obs[k]); confirm against
    # RolloutStorage's layout.
    for k in rollouts.obs.keys():
        rollouts.obs[k][0].copy_(obs[k][0])

    episode_rewards = deque(maxlen=10)
    num_updates = num_env_steps // args.num_steps // args.num_processes
    batch_size = args.num_steps * args.num_processes

    start = time.time()
    while env_step < num_env_steps:
        s = time.time()
        if args.use_linear_lr_decay:
            # decrease learning rate linearly.
            # BUG FIX: the original passed an undefined name `j` here (this loop
            # is step-driven, not `for j in range(...)`), which raised NameError
            # whenever linear LR decay was enabled. The update index is derived
            # from the step counter instead.
            update_index = env_step // batch_size
            utils.update_linear_schedule(
                agent.optimizer, update_index, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions (inference only; rollouts live on CPU, the model on
            # args.device, hence the to()/cpu() shuffling).
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states, _ = actor_critic.act(
                    {
                        k: rollouts.obs[k][step].float().to(args.device)
                        for k in rollouts.obs.keys()
                    }, rollouts.recurrent_hidden_states[step].to(args.device),
                    rollouts.masks[step].to(args.device))
                value = value.cpu()
                action = action.cpu()
                action_log_prob = action_log_prob.cpu()
                recurrent_hidden_states = recurrent_hidden_states.cpu()

            # Observe reward and next obs
            obs, reward, dones, infos = envs.step(action)

            for done, info in zip(dones, infos):
                env_state = info['env_state'][1]
                if done:
                    # Per-episode progress metrics, tagged by level name.
                    writer.add_scalar(f'train_episode_x/{env_state}',
                                      info['max_x'], env_step)
                    writer.add_scalar(f'train_episode_%/{env_state}',
                                      info['max_x'] / info['lvl_max_x'] * 100,
                                      env_step)
                    writer.add_scalar(f'train_episode_r/{env_state}',
                                      info['sum_r'], env_step)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done else [1.0]
                                       for done in dones])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                {
                    k: rollouts.obs[k][-1].float().to(args.device)
                    for k in rollouts.obs.keys()
                }, rollouts.recurrent_hidden_states[-1].to(args.device),
                rollouts.masks[-1].to(args.device)).detach().cpu()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)
        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        env_step += batch_size
        fps = batch_size / (time.time() - s)
        #res = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
        #writer.add_scalar(f'gpu_usage/{writer_name}', res.gpu, env_step)
        #writer.add_scalar(f'gpu_mem/{writer_name}', res.memory, env_step)

        # Global L2 gradient norm over all parameters (post-update diagnostics).
        total_norm = 0
        for p in list(
                filter(lambda p: p.grad is not None,
                       actor_critic.parameters())):
            param_norm = p.grad.data.norm(2)
            total_norm += param_norm.item()**2
        total_norm = total_norm**(1. / 2)

        # Per-modality gradient norms (video / audio encoder sub-modules).
        obs_norm = {}
        for obs_name in args.obs_keys:
            t_norm = 0
            if obs_name == 'video':
                md = actor_critic.base.video_module
            elif obs_name == 'audio':
                md = actor_critic.base.audio_module
            else:
                raise NotImplementedError
            for p in list(filter(lambda p: p.grad is not None,
                                 md.parameters())):
                param_norm = p.grad.data.norm(2)
                t_norm += param_norm.item()**2
            obs_norm[obs_name] = t_norm**(1. / 2)

        # Interval triggers compare integer "buckets" before/after this batch so
        # an interval is honored even when batch_size does not divide it.
        prev_env_step = max(0, env_step + 1 - batch_size)

        # write training metrics for this batch, usually takes 0.003s
        if (env_step + 1
            ) // args.write_interval > prev_env_step // args.write_interval:
            writer.add_scalar(f'grad_norm/{writer_name}', total_norm, env_step)
            writer.add_scalar(f'fps/{writer_name}', fps, env_step)
            writer.add_scalar(f'value_loss/{writer_name}',
                              value_loss / batch_size, env_step)
            writer.add_scalar(f'action_loss/{writer_name}',
                              action_loss / batch_size, env_step)
            writer.add_scalar(f'dist_entropy/{writer_name}',
                              dist_entropy / batch_size, env_step)
            writer.add_scalar(f'cpu_usage/{writer_name}', psutil.cpu_percent(),
                              env_step)
            writer.add_scalar(f'cpu_mem/{writer_name}',
                              psutil.virtual_memory()._asdict()['percent'],
                              env_step)
            for obs_name in args.obs_keys:
                writer.add_scalar(f'grad_norm_{obs_name}/{writer_name}',
                                  obs_norm[obs_name], env_step)

        # print log to console
        if (env_step +
                1) // args.log_interval > prev_env_step // args.log_interval:
            end = time.time()
            print(" [log] Env step {} of {}: {:.1f}s, {:.1f}fps".format(
                env_step + 1, num_env_steps, end - start, fps))
            if len(episode_rewards) > 0:
                print(
                    " Last {} episodes: mean/med reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}"
                    .format(len(episode_rewards), np.mean(episode_rewards),
                            np.median(episode_rewards),
                            np.min(episode_rewards), np.max(episode_rewards)))
            print(
                " dist_entropy {:.5f}, value_loss {:.6f}, action_loss {:.6f}, grad_norm {:.6f}"
                .format(dist_entropy, value_loss, action_loss, total_norm))
            start = time.time()

        # save model to ckpt
        if ((env_step + 1) // args.save_interval >
                prev_env_step // args.save_interval):
            torch.save([
                actor_critic,
                env_step,
                run_name,
            ], os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt"))
            print(f" [save] Saved model at step {env_step+1}.")

        # save model to ckpt and run evaluation if eval_interval and not final
        # iteration in training loop
        if ((env_step + 1) // args.eval_interval >
                prev_env_step // args.eval_interval
            ) and env_step < num_env_steps and eval_env_steps > 0:
            torch.save([
                actor_critic,
                env_step,
                run_name,
            ], os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt"))
            print(f" [save] Saved model at step {env_step+1}.")

            envs.close()
            del envs  # close does not actually get rid of envs, need to del
            actor_critic.eval()
            eval_score, e_dict = evaluate(train_states, actor_critic,
                                          eval_env_steps, env_step, writer,
                                          vid_save_dir, args.vid_tb_steps,
                                          args.vid_file_steps,
                                          args.obs_viz_layer, args)
            print(f" [eval] Evaluation score: {eval_score}")
            writer.add_scalar('eval_score', eval_score, env_step)
            actor_critic.train()

            # Rebuild the training envs that were torn down for evaluation and
            # re-seed the rollout buffer with their first observation.
            envs = make_vec_envs(train_states, args.seed, args.num_processes,
                                 args.gamma, 'cpu', 'train', args)
            obs = envs.reset()
            # TODO: verify this is sufficient — should the recurrent hidden
            # states also be reset / inserted at index 0 here?
            for k in rollouts.obs.keys():
                rollouts.obs[k][0].copy_(obs[k][0])

    # final model save
    final_model_path = os.path.join(ckpt_save_dir, f"{run_name}-{env_step}.pt")
    torch.save([
        actor_critic,
        env_step,
        run_name,
    ], final_model_path)
    print(
        f" [save] Final model saved at step {env_step+1} to {final_model_path}"
    )

    # final model eval
    envs.close()
    del envs
    eval_score = None
    eval_dict = None
    if eval_env_steps > 0:
        eval_score, eval_dict = evaluate(train_states, actor_critic,
                                         eval_env_steps, env_step, writer,
                                         vid_save_dir, args.vid_tb_steps,
                                         args.vid_file_steps,
                                         args.obs_viz_layer, args)
        print(f" [eval] Final model evaluation score: {eval_score:.3f}")

    return (actor_critic, env_step, run_name), eval_score, eval_dict
def main():
    """Entry point: configure, (optionally) pre-train/evaluate, and PPO-train a
    quadruped-walking policy.

    Behavior is selected by ``trainType`` (overridable via ``--action-type``):
    each type picks a checkpoint-name suffix, an env factory, and hyperparameter
    overrides. The best-scoring model (by live evaluation when ``realEval`` is
    on, else by rolling mean training reward) is checkpointed to
    ``<save_dir>/<algo>/<env>_<suffix><hidden>_best.pt``.
    """
    realEval = True  #False
    # sys.gettrace is fetched defensively; it is non-None when a debugger is
    # attached, which downgrades the run to a single process / debug dirs.
    gettrace = getattr(sys, 'gettrace', None)

    parser = argparse.ArgumentParser(description='RL')
    parser.add_argument('--action-type',
                        type=int,
                        default=-1,
                        help='action type to play (default: -1)')
    parser.add_argument('--tasks-difficulty-from',
                        type=int,
                        default=0,
                        help='tasks_difficulty_from')
    parser.add_argument('--tasks-difficulty-to',
                        type=int,
                        default=100000,
                        help='tasks-difficulty-to')
    parser.add_argument('--verboseLevel',
                        type=int,
                        default=5,
                        help='verboseLevel')
    parser.add_argument('--filesNamesSuffix',
                        default="",
                        help='filesNamesSuffix')
    parser.add_argument('--nobest-exit',
                        type=int,
                        default=10000,
                        help='nobest_exit')
    args = get_args(parser)

    # Hard-coded experiment configuration (overrides whatever get_args parsed).
    args.algo = 'ppo'
    args.env_name = 'QuadruppedWalk-v1'
    args.use_gae = True
    args.num_steps = 2048
    args.num_processes = 4
    # BUG FIX: `gettrace` may be None (that is why getattr used a default);
    # the original called it unconditionally.
    if gettrace and gettrace():
        args.num_processes = 1
    args.lr = 0.0001
    args.entropy_coef = 0.0
    args.value_loss_coef = 0.5
    args.ppo_epoch = 4
    args.num_mini_batch = 256
    args.gamma = 0.99
    args.gae_lambda = 0.95
    args.clip_param = 0.2
    args.use_linear_lr_decay = True
    args.use_proper_time_limits = True
    args.save_dir = "./trained_models/" + args.env_name + "/"
    args.load_dir = "./trained_models/" + args.env_name + "/"
    args.log_dir = "./logs/robot"
    if gettrace and gettrace():
        # Debugger attached: keep debug artifacts separate from real runs.
        args.save_dir = "./trained_models/" + args.env_name + "debug/"
        args.load_dir = "./trained_models/" + args.env_name + "debug/"
        args.log_dir = "./logs/robot_d"
    args.log_interval = 30
    args.hidden_size = 64
    args.last_hidden_size = args.hidden_size
    args.recurrent_policy = False  #True
    args.save_interval = 20
    #args.seed = 1

    reward_shaping = 0.01
    allowMutate = False

    if args.seed == -1:
        args.seed = time.clock_gettime_ns(time.CLOCK_REALTIME)

    quadruppedEnv.settings.tasks_difficulty_from = args.tasks_difficulty_from
    quadruppedEnv.settings.tasks_difficulty_to = args.tasks_difficulty_to

    # 0 is a walk
    # 1 is a balance
    # 2 multitasks
    # 3 multitask experiments
    trainType = 14
    filesNamesSuffix = ""
    if args.action_type >= 0:
        trainType = args.action_type

    makeEnvFunction = makeEnv.make_env_with_best_settings
    if trainType == 1:
        filesNamesSuffix = "balance_"
        makeEnvFunction = makeEnv.make_env_for_balance
    if trainType == 2:
        filesNamesSuffix = "analytical_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical
    if trainType == 3:
        filesNamesSuffix = "analytical2_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_analytical2
    if trainType == 4:
        filesNamesSuffix = "frontback_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_front_back
    if trainType == 5:
        filesNamesSuffix = "leftright_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_left_right
    if trainType == 6:
        filesNamesSuffix = "all_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all
    if trainType == 7:
        filesNamesSuffix = "rotate_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_rotate
    if trainType == 8:
        filesNamesSuffix = "compound_"
        makeEnvFunction = make_env_multinetwork
    if trainType == 9:
        import pickle
        realEval = False
        allowMutate = False
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 5000000
        filesNamesSuffix = "test_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_test
    if trainType == 10:
        import pickle
        realEval = False
        allowMutate = False
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 5000000
        filesNamesSuffix = "zoo_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_test_zoo
    if trainType == 11:
        args.hidden_size = 128  #64 #128
        args.last_hidden_size = args.hidden_size
        import pickle
        if gettrace and gettrace():
            args.num_processes = 1
        else:
            args.num_processes = 8
        realEval = False
        allowMutate = False
        args.lr = 0.00001
        args.use_linear_lr_decay = True  #False
        args.num_env_steps = 10000000
        filesNamesSuffix = "zigote2_updown_"
        print("Samples preload")
        global samplesEnvData
        samplesEnvData = pickle.load(
            open("./QuadruppedWalk-v1_MoveNoPhys.samples", "rb"))
        # samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1.samples", "rb" ) )
        makeEnvFunction = makeSamplesEnv
    if trainType == 12:
        import pickle
        args.lr = 0.00001
        args.hidden_size = 64
        args.last_hidden_size = args.hidden_size
        filesNamesSuffix = "zigote2_front_back_"
        args.clip_param = 0.9
        args.value_loss_coef = 0.9
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_train
        #makeEnvFunction = makeEnv.make_env_with_best_settings_for_record
        #makeEnv.samplesEnvData = pickle.load( open( "./QuadruppedWalk-v1_MoveNoPhys.samples", "rb" ) )
    if trainType == 13:
        filesNamesSuffix = "all_bytasks_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all
    if trainType == 14:
        # Hyperparameters below were picked from a (previously random) search;
        # the commented alternatives record the searched space.
        #random.seed(time.clock_gettime_ns(time.CLOCK_REALTIME))
        #args.num_steps = random.choice([256,512,1024,2048,4096])
        #args.num_mini_batch = random.choice([32,64,256,512])
        #args.ppo_epoch = random.choice([2,4,8,10])
        #args.clip_param = random.choice([0.2,0.4,0.6,0.8])
        #args.value_loss_coef = random.choice([0.4,0.5,0.6,0.8])
        #args.lr = random.choice([0.00001,0.0001,0.00005,0.0005])
        args.num_steps = 2048
        args.num_mini_batch = 64
        args.ppo_epoch = 8
        args.lr = 0.0001
        args.hidden_size = 64
        args.last_hidden_size = args.hidden_size
        filesNamesSuffix = args.filesNamesSuffix
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_all
        '''
        num_steps: 1024
        num_mini_batch 64
        ppo_epoch 2
        clip_param: 0.2
        value_loss_coef 0.6
        lr 0.0001
        '''
    if trainType == 15:
        args.num_env_steps = 5000000
        filesNamesSuffix = "zigote_updown_"
        makeEnvFunction = makeEnv.make_env_with_best_settings_for_train_analytic
    if trainType == 16:
        args.lr = 0.00001
        filesNamesSuffix = "compound_tasks_"
        makeEnvFunction = make_env_multinetwork

    reward_shaper = DefaultRewardsShaper(scale_value=reward_shaping)

    print("ActionType ", trainType, " ", filesNamesSuffix, "seed", args.seed,
          "num env steps:", args.num_env_steps, " tasks_dif",
          args.tasks_difficulty_from, args.tasks_difficulty_to)
    print("Num processes:", args.num_processes)
    print("num_steps:", args.num_steps, "num_mini_batch", args.num_mini_batch,
          "ppo_epoch", args.ppo_epoch)
    print("clip_param:", args.clip_param, "value_loss_coef",
          args.value_loss_coef, "lr", args.lr)

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    args.log_dir = "/tmp/tensorboard/"
    # TensorBoardX writer; run name embeds the launch timestamp.
    writer = SummaryWriter(log_dir=args.log_dir + 'runs/{}_PPO_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
        "ppo"))
    writer.add_scalar('options/num_steps', args.num_steps, 0)
    writer.add_scalar('options/num_mini_batch', args.num_mini_batch, 0)
    writer.add_scalar('options/ppo_epoch', args.ppo_epoch, 0)
    writer.add_scalar('options/clip_param', args.clip_param, 0)
    writer.add_scalar('options/value_loss_coef', args.value_loss_coef, 0)
    writer.add_scalar('options/lr', args.lr, 0)

    device = torch.device("cuda:0" if args.cuda else "cpu")
    torch.set_num_threads(1)

    load_dir = os.path.join(args.load_dir, args.algo)

    # Compound/multi-network modes preload per-task expert policies.
    multiNetworkName = ["frontback_", "all_", "leftright_", "rotate_"]
    if trainType == 8:
        for net in multiNetworkName:
            bestFilename = os.path.join(
                load_dir, "{}_{}{}_best.pt".format(args.env_name, net,
                                                   args.hidden_size))
            ac, _ = torch.load(bestFilename)
            policies.append(PPOPlayer(ac, device))
            print("Policy multi loaded: ", bestFilename)

    multiNetworkName2 = [
        "all_bytasks_0_",
        "all_bytasks_1_",
        "all_bytasks_2_",
        "all_bytasks_3_",
        "all_bytasks_4_",
        "all_bytasks_5_",
        "all_bytasks_6_",
        "all_bytasks_7_",
        "all_bytasks_8_",
        "all_bytasks_9_",
        "all_bytasks_10_",
        "all_bytasks_11_",
        "all_bytasks_12_",
    ]
    if trainType == 16:
        for net in multiNetworkName2:
            bestFilename = os.path.join(
                load_dir, "{}_{}{}_best.pt".format(args.env_name, net,
                                                   args.hidden_size))
            ac, _ = torch.load(bestFilename)
            policies.append(PPOPlayer(ac, device))
            print("Policy multi loaded: ", bestFilename)

    envs = make_vec_envs(args.env_name,
                         args.seed,
                         args.num_processes,
                         args.gamma,
                         None,
                         device,
                         False,
                         normalizeOb=False,
                         normalizeReturns=False,
                         max_episode_steps=args.num_steps,
                         makeEnvFunc=makeEnvFunction,
                         num_frame_stack=1,
                         info_keywords=(
                             'episode_steps',
                             'episode_reward',
                             'progress',
                             'servo',
                             'distToTarget',
                         ))
    #print(envs.observation_space.shape,envs.action_space)

    actor_critic = Policy(envs.observation_space.shape,
                          envs.action_space,
                          base_kwargs={
                              'recurrent': args.recurrent_policy,
                              'hidden_size': args.hidden_size,
                              'last_hidden_size': args.last_hidden_size,
                              'activation_layers_type': "Tanh"
                          })
    '''
    # if args.load_dir not None:
    load_path = os.path.join(args.load_dir, args.algo)
    actor_critic, ob_rms = torch.load(os.path.join(load_path, args.env_name + ".pt"))
    '''

    # Resume from the best checkpoint for this suffix when available; otherwise
    # fall back to a "_best_pretrain" checkpoint produced by the supervised
    # pre-training path below.
    load_path = os.path.join(
        load_dir, "{}_{}{}_best.pt".format(args.env_name, filesNamesSuffix,
                                           args.hidden_size))
    #load_path = os.path.join(load_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))

    preptrained_path = "../Train/trained_models/QuadruppedWalk-v1/Train_QuadruppedWalk-v1_256.pth"
    loadPretrained = False
    if loadPretrained and os.path.isfile(preptrained_path):
        print("Load preptrained")
        abj = torch.load(preptrained_path)
        print(abj)
        print(actor_critic.base)
        # BUG FIX: load_state_dict() was called without the loaded state dict.
        actor_critic.base.load_state_dict(abj)
        actor_critic.base.eval()

    if os.path.isfile(load_path) and not loadPretrained:
        actor_critic, ob_rms = torch.load(load_path)
        actor_critic.eval()
        print("----NN loaded: ", load_path, " -----")
    else:
        bestFilename = os.path.join(
            load_dir,
            "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix,
                                              args.hidden_size))
        if os.path.isfile(bestFilename):
            actor_critic, ob_rms = torch.load(bestFilename)
            actor_critic.eval()
            print("----NN loaded: ", bestFilename, " -----")

    maxReward = -10000.0
    actor_critic.to(device)

    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     max_grad_norm=args.max_grad_norm)

    if args.gail:
        assert len(envs.observation_space.shape) == 1
        discr = gail.Discriminator(
            envs.observation_space.shape[0] + envs.action_space.shape[0], 100,
            device)
        file_name = os.path.join(
            args.gail_experts_dir,
            "trajs_{}.pt".format(args.env_name.split('-')[0].lower()))
        gail_train_loader = torch.utils.data.DataLoader(
            gail.ExpertDataset(file_name,
                               num_trajectories=4,
                               subsample_frequency=20),
            batch_size=args.gail_batch_size,
            shuffle=True,
            drop_last=True)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape, envs.action_space,
                              actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    deque_maxLen = 10
    episode_rewards = deque(maxlen=deque_maxLen)
    episode_steps = deque(maxlen=deque_maxLen)
    episode_rewards_alive = deque(maxlen=deque_maxLen)
    episode_rewards_progress = deque(maxlen=deque_maxLen)
    episode_rewards_servo = deque(maxlen=deque_maxLen)
    episode_dist_to_target = deque(maxlen=deque_maxLen)
    '''
    load_path = os.path.join(args.load_dir, args.algo)
    load_path = os.path.join(load_path, args.env_name + ".pt")
    actor_critic, ob_rms = torch.load(load_path)
    actor_critic.to(device)
    actor_critic.eval()
    #ob_rms.eval()
    '''
    '''
    args.use_gym_monitor = 1
    args.monitor_dir = "./results/"
    monitor_path = os.path.join(args.monitor_dir, args.algo)
    monitor_path = os.path.join(monitor_path, args.env_name)
    args.
    if args.use_gym_monitor:
        env = wrappers.Monitor(
            env, monitor_path, video_callable=False, force=True)
    '''

    i_episode = 0
    save_path = os.path.join(args.save_dir, args.algo)
    try:
        os.makedirs(save_path)
    except OSError:
        pass  # directory already exists

    # Optional supervised pre-training on recorded samples, then exit.
    trainOnSamplesAndExit = False
    if trainOnSamplesAndExit:
        import pickle
        print("---------------------------------------")
        print("Samples preload")
        data = pickle.load(open("./QuadruppedWalk-v1_UpDown.samples", "rb"))
        #data = pickle.load( open( "../QuadruppedWalk-v1_NN.samples", "rb" ) )

        learning_rate = 0.0001
        max_episodes = 100
        max_timesteps = 4000
        betas = (0.9, 0.999)
        log_interval = 1

        envSamples = SamplesEnv(data)
        envSamples.numSteps = max_timesteps

        # Stochastic-gradient optimizer over the actor only; the critic is
        # untouched by this behavior-cloning phase.
        optimizer = torch.optim.Adam(actor_critic.base.actor.parameters(),
                                     lr=learning_rate,
                                     betas=betas)
        criterion = nn.MSELoss(reduction="sum")

        # run the main training loop
        for epoch in range(max_episodes):
            state = envSamples.reset()
            time_step = 0
            testReward = 0
            testSteps = 0
            loss_sum = 0
            loss_max = 0
            for t in range(max_timesteps):
                time_step += 1
                nn_state = torch.FloatTensor((state).reshape(1,
                                                             -1)).to(device)
                optimizer.zero_grad()
                net_out = actor_critic.base.forwardActor(nn_state)
                net_out = actor_critic.dist.fc_mean(net_out)
                state, reward, done, info = envSamples.step(
                    net_out.detach().numpy())
                # Regression target: the action recorded in the sample set.
                sim_action = envSamples.recordedActions
                sim_action_t = torch.FloatTensor([sim_action]).to(device)
                loss = criterion(net_out, sim_action_t)
                loss.backward()
                optimizer.step()
                # NOTE(review): loss_sum/loss_max accumulate 0-d tensors, not
                # floats — prints work but show tensor reprs; .item() would be
                # cleaner. Left unchanged to preserve output exactly.
                loss_sum += loss.mean()
                loss_max = max(loss_max, loss.max())
                testReward += reward
                testSteps += 1
                if done:
                    if epoch % log_interval == 0:
                        #print(best_action_t*scaleActions-net_out*scaleActions)
                        if args.verboseLevel > 0:
                            print(
                                'Train Episode: {} t:{} Reward:{} Loss: mean:{:.6f} max: {:.6f}'
                                .format(epoch, t, testReward, loss_sum / t,
                                        loss_max))
                            print(info)
                    reward = 0
                    break

        bestFilename = os.path.join(
            save_path,
            "{}_{}{}_best_pretrain.pt".format(args.env_name, filesNamesSuffix,
                                              args.hidden_size))
        torch.save([
            actor_critic,
            getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
        ], bestFilename)
        exit(0)

    # The first "best" write is skipped so a freshly loaded model is not
    # immediately re-saved over the existing best checkpoint.
    skipWriteBest = True

    if args.verboseLevel > 0:
        printNetwork(actor_critic.base.actor)
    lock(actor_critic, first=False, last=False)
    #if trainType==9:
    #allowMutate = False
    #lock(actor_critic,first=True,last=False)
    #mutate(actor_critic,power=0.00,powerLast=0.3)
    if args.verboseLevel > 0:
        printNetwork(actor_critic.base.actor)
    #from torchsummary import summary
    #summary(actor_critic.base.actor, (1, 48, 64))

    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes

    episodeBucketIndex = 0
    maxReward = -10000000000
    numEval = 10
    if realEval:
        # Baseline score of the (possibly loaded) model before any training.
        envEval = makeEnvFunction(args.env_name)
        if hasattr(envEval.env, "tasks") and len(envEval.env.tasks):
            numEval = max(numEval, len(envEval.env.tasks))
        maxReward = evaluate_policy(envEval,
                                    actor_critic,
                                    numEval * 2,
                                    render=False,
                                    device=device,
                                    verbose=args.verboseLevel)
        print("MaxReward on start", maxReward)

    noMaxRewardCount = 0
    updateIndex = 0
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(
                agent.optimizer, j, num_updates,
                agent.optimizer.lr if args.algo == "acktr" else args.lr)

        for step in range(args.num_steps):
            # Sample actions
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step], rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])

            # Obser reward and next obs
            obs, reward, done, infos = envs.step(action)
            #envs.venv.venv.venv.envs[0].render()
            if args.verboseLevel > 0:
                index = 0
                for d in done:
                    if d:
                        print(infos[index], flush=True)
                    index += 1

            episodeDone = False
            for info in infos:
                if 'reward' in info.keys():
                    episodeDone = True
                    i_episode += 1
                    episode_rewards.append(info['reward'])
                    writer.add_scalar('reward/episode', info['reward'],
                                      i_episode)
                    #print("E:",i_episode," T:",info['episode_steps'], " R:", info['episode_reward'], " D:",info['distToTarget'])
                if 'steps' in info.keys():
                    episode_steps.append(info['steps'])
                    writer.add_scalar('reward/steps', info['steps'], i_episode)
                if 'alive' in info.keys():
                    episode_rewards_alive.append(info['alive'])
                    writer.add_scalar('reward/alive', info['alive'], i_episode)
                if 'prog' in info.keys():
                    episode_rewards_progress.append(info['prog'])
                    writer.add_scalar('reward/progress', info['prog'],
                                      i_episode)
                if 'servo' in info.keys():
                    episode_rewards_servo.append(info['servo'])
                    writer.add_scalar('reward/servo', info['servo'], i_episode)
                if 'd2T' in info.keys():
                    episode_dist_to_target.append(info['d2T'])
                    writer.add_scalar('reward/distToTarget', info['d2T'],
                                      i_episode)
                # Any other scalar the env reports goes to tensorboard too.
                for val in info.keys():
                    if val not in [
                            "reward", "steps", "alive", "prog", "servo", "d2T",
                            'epos', 't'
                    ]:
                        writer.add_scalar('reward/' + val, info[val],
                                          i_episode)

            if episodeDone:
                episodeBucketIndex += 1
                if args.verboseLevel > 0:
                    print("Mean:", Fore.WHITE, np.mean(episode_rewards),
                          Style.RESET_ALL, " Median:", Fore.WHITE,
                          np.median(episode_rewards), Style.RESET_ALL,
                          " max reward:", maxReward)

                if realEval:
                    # Periodically evaluate and persist a new best checkpoint;
                    # stagnation optionally triggers weight mutation.
                    if episodeBucketIndex % args.log_interval == 0 and episodeBucketIndex > args.log_interval:
                        print("Step:",
                              (j + 1) * args.num_processes * args.num_steps)
                        if skipWriteBest == False:
                            evalReward = evaluate_policy(
                                envEval,
                                actor_critic,
                                numEval,
                                device=device,
                                verbose=args.verboseLevel)
                            writer.add_scalar('reward/eval', evalReward,
                                              i_episode)
                            if evalReward > maxReward:
                                maxReward = evalReward
                                #maxReward = np.mean(episode_rewards)
                                bestFilename = os.path.join(
                                    save_path, "{}_{}{}_best.pt".format(
                                        args.env_name, filesNamesSuffix,
                                        args.hidden_size))
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{:.1f}/{}/{:.2f}) ".format(
                                        maxReward, np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps),
                                        episode_dist_to_target[-1]),
                                    Style.RESET_ALL, bestFilename)
                                torch.save([
                                    actor_critic,
                                    getattr(utils.get_vec_normalize(envs),
                                            'ob_rms', None)
                                ], bestFilename)
                                noMaxRewardCount = 0
                            else:
                                noMaxRewardCount += 1
                                if allowMutate:
                                    if noMaxRewardCount == 5:
                                        print("Mutation low last layer")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.00,
                                               powerLast=0.01)
                                    if noMaxRewardCount == 8:
                                        print("Mutation low non last")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.01,
                                               powerLast=0.0)
                                    if noMaxRewardCount == 11:
                                        print("Mutation low all")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.02,
                                               powerLast=0.2)
                                    if noMaxRewardCount == 14:
                                        print("Mutation hi all")
                                        lock(actor_critic,
                                             first=False,
                                             last=False)
                                        mutate(actor_critic,
                                               power=0.03,
                                               powerLast=0.03)
                                        noMaxRewardCount = 0
                                if noMaxRewardCount == args.nobest_exit:
                                    exit(0)
                        else:
                            skipWriteBest = False
                else:
                    # No live eval: "best" is judged by rolling mean reward.
                    if len(episode_rewards) and np.mean(
                            episode_rewards
                    ) > maxReward and j > args.log_interval:
                        if skipWriteBest == False:
                            maxReward = np.mean(episode_rewards)
                            writer.add_scalar('reward/maxReward', maxReward,
                                              i_episode)
                            bestFilename = os.path.join(
                                save_path, "{}_{}{}_best.pt".format(
                                    args.env_name, filesNamesSuffix,
                                    args.hidden_size))
                            if len(episode_dist_to_target):
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{}/{:.2f}) ".format(
                                        np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps),
                                        episode_dist_to_target[-1]),
                                    Style.RESET_ALL, bestFilename)
                            else:
                                print(
                                    "Writing best reward:", Fore.GREEN,
                                    "({:.1f}/{:.1f}/{}) ".format(
                                        np.mean(episode_rewards),
                                        np.median(episode_rewards),
                                        np.mean(episode_steps)),
                                    Style.RESET_ALL, bestFilename)
                            torch.save([
                                actor_critic,
                                getattr(utils.get_vec_normalize(envs),
                                        'ob_rms', None)
                            ], bestFilename)
                        else:
                            skipWriteBest = False

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            shaped_reward = reward_shaper(reward)
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, shaped_reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()

        if args.gail:
            if j >= 10:
                envs.venv.eval()
            gail_epoch = args.gail_epoch
            if j < 10:
                gail_epoch = 100  # Warm up
            for _ in range(gail_epoch):
                discr.update(gail_train_loader, rollouts,
                             utils.get_vec_normalize(envs)._obfilt)
            for step in range(args.num_steps):
                rollouts.rewards[step] = discr.predict_reward(
                    rollouts.obs[step], rollouts.actions[step], args.gamma,
                    rollouts.masks[step])

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda, args.use_proper_time_limits)

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        writer.add_scalar('reward/value_loss', value_loss, updateIndex)
        writer.add_scalar('reward/action_loss', action_loss, updateIndex)
        writer.add_scalar('reward/dist_entropy', dist_entropy, updateIndex)
        updateIndex += 1

        rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        if (j % args.save_interval == 0
                or j == num_updates - 1) and args.save_dir != "":
            '''
            fileName = os.path.join(save_path, "{}_{}{}.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'ob_rms', None)
            ], fileName)
            print("Saved:",fileName, " cur avg rewards:",np.mean(episode_rewards))
            fileName = os.path.join(save_path, "{}_{}{}_actor.pt".format(args.env_name,filesNamesSuffix,args.hidden_size))
            torch.save(actor_critic.state_dict, fileName)
            print("Saved:",fileName)
            '''

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            if args.verboseLevel > 0:
                print("")
                print("Updates {}, num timesteps {}, FPS {}".format(
                    j, total_num_steps, int(total_num_steps / (end - start))))
                print(" Last {} training episodes:".format(
                    len(episode_rewards)))
                print(
                    " reward mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}".
                    format(np.mean(episode_rewards),
                           np.median(episode_rewards), np.min(episode_rewards),
                           np.max(episode_rewards)))
                print(" steps mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}".
                      format(np.mean(episode_steps), np.median(episode_steps),
                             np.min(episode_steps), np.max(episode_steps)))
                if len(episode_rewards_alive):
                    print(
                        " alive mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_alive),
                                np.median(episode_rewards_alive),
                                np.min(episode_rewards_alive),
                                np.max(episode_rewards_alive)))
                if len(episode_rewards_progress):
                    print(
                        " progress mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_progress),
                                np.median(episode_rewards_progress),
                                np.min(episode_rewards_progress),
                                np.max(episode_rewards_progress)))
                if len(episode_rewards_servo):
                    print(
                        " servo mean/median {:.1f}/{:.1f} min/max {:.1f}/{:.1f}"
                        .format(np.mean(episode_rewards_servo),
                                np.median(episode_rewards_servo),
                                np.min(episode_rewards_servo),
                                np.max(episode_rewards_servo)))
                if len(episode_dist_to_target):
                    print(
                        " dist to target mean/median {:.3f}/{:.3f} min/max {:.3f}/{:.3f}"
                        .format(np.mean(episode_dist_to_target),
                                np.median(episode_dist_to_target),
                                np.min(episode_dist_to_target),
                                np.max(episode_dist_to_target)))
                print(
                    " Reward/Steps {:.3f} Progress/Steps: {:.3f} entropy {:.1f} value_loss {:.5f} action_loss {:.5f}\n"
                    .format(
                        np.mean(episode_rewards) / np.mean(episode_steps),
                        (0 if len(episode_rewards_progress) == 0 else
                         np.mean(episode_rewards_progress) /
                         np.mean(episode_steps)), dist_entropy, value_loss,
                        action_loss))
def main():
    """Train a multi-task PPO agent with an attention-based policy.

    Reads all configuration from command-line arguments (``get_args``),
    seeds every RNG source, builds training / validation / evaluation
    vectorized environments, then runs the PPO update loop.  A second
    ("validation") PPO agent shares the same network but updates only the
    attention policy.  The model is periodically checkpointed and
    evaluated; when the ``many_arms`` evaluation fails to improve by at
    least ``args.val_improvement_threshold`` tasks, the weights and both
    optimizer states are reverted to the last accepted snapshot.
    """
    args = get_args()

    # Seed every RNG source for reproducibility.
    import random
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)

    if args.cuda and torch.cuda.is_available() and args.cuda_deterministic:
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True

    # Build a descriptive run-directory name that encodes which gradient
    # variant (privacy / noisygrad / pcgrad / testgrad / mediangrad) is on.
    logdir = args.env_name + '_' + args.algo + '_num_arms_' + str(
        args.num_processes) + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    if args.use_privacy:
        logdir = logdir + '_privacy'
    elif args.use_noisygrad:
        logdir = logdir + '_noisygrad'
    elif args.use_pcgrad:
        logdir = logdir + '_pcgrad'
    elif args.use_testgrad:
        logdir = logdir + '_testgrad'
    elif args.use_median_grad:
        logdir = logdir + '_mediangrad'
    logdir = os.path.join('runs', logdir)
    logdir = os.path.join(os.path.expanduser(args.log_dir), logdir)
    utils.cleanup_log_dir(logdir)

    # Ugly but simple logging
    log_dict = {
        'task_steps': args.task_steps,
        'grad_noise_ratio': args.grad_noise_ratio,
        'max_task_grad_norm': args.max_task_grad_norm,
        'use_noisygrad': args.use_noisygrad,
        'use_pcgrad': args.use_pcgrad,
        'use_testgrad': args.use_testgrad,
        'use_testgrad_median': args.use_testgrad_median,
        'testgrad_quantile': args.testgrad_quantile,
        'median_grad': args.use_median_grad,
        'use_meanvargrad': args.use_meanvargrad,
        'meanvar_beta': args.meanvar_beta,
        'no_special_grad_for_critic': args.no_special_grad_for_critic,
        'use_privacy': args.use_privacy,
        'seed': args.seed,
        'recurrent': args.recurrent_policy,
        'obs_recurrent': args.obs_recurrent,
        'cmd': ' '.join(sys.argv[1:])
    }
    # One running list of eval results per evaluation environment.
    for eval_disp_name, eval_env_name in EVAL_ENVS.items():
        log_dict[eval_disp_name] = []

    summary_writer = SummaryWriter()
    summary_writer.add_hparams(
        {
            'task_steps': args.task_steps,
            'grad_noise_ratio': args.grad_noise_ratio,
            'max_task_grad_norm': args.max_task_grad_norm,
            'use_noisygrad': args.use_noisygrad,
            'use_pcgrad': args.use_pcgrad,
            'use_testgrad': args.use_testgrad,
            'use_testgrad_median': args.use_testgrad_median,
            'testgrad_quantile': args.testgrad_quantile,
            'median_grad': args.use_median_grad,
            'use_meanvargrad': args.use_meanvargrad,
            'meanvar_beta': args.meanvar_beta,
            'no_special_grad_for_critic': args.no_special_grad_for_critic,
            'use_privacy': args.use_privacy,
            'seed': args.seed,
            'recurrent': args.recurrent_policy,
            'obs_recurrent': args.obs_recurrent,
            'cmd': ' '.join(sys.argv[1:])
        }, {})

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    print('making envs...')
    envs = make_vec_envs(args.env_name, args.seed, args.num_processes,
                         args.gamma, args.log_dir, device, False,
                         steps=args.task_steps,
                         free_exploration=args.free_exploration,
                         recurrent=args.recurrent_policy,
                         obs_recurrent=args.obs_recurrent,
                         multi_task=True)
    val_envs = make_vec_envs(args.val_env_name, args.seed, args.num_processes,
                             args.gamma, args.log_dir, device, False,
                             steps=args.task_steps,
                             free_exploration=args.free_exploration,
                             recurrent=args.recurrent_policy,
                             obs_recurrent=args.obs_recurrent,
                             multi_task=True)
    # Evaluation envs are built once and reused (no reward normalization:
    # gamma=None, allow_early_resets=True).
    eval_envs_dic = {}
    for eval_disp_name, eval_env_name in EVAL_ENVS.items():
        eval_envs_dic[eval_disp_name] = make_vec_envs(
            eval_env_name[0], args.seed, args.num_processes, None, logdir,
            device, True, steps=args.task_steps,
            recurrent=args.recurrent_policy,
            obs_recurrent=args.obs_recurrent, multi_task=True,
            free_exploration=args.free_exploration)
    prev_eval_r = {}
    print('done')

    # Hard vs. soft attention base network; recurrence is enabled by either
    # flag.
    if args.hard_attn:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base=MLPHardAttnBase,
                              base_kwargs={
                                  'recurrent': args.recurrent_policy
                                  or args.obs_recurrent
                              })
    else:
        actor_critic = Policy(envs.observation_space.shape,
                              envs.action_space,
                              base=MLPAttnBase,
                              base_kwargs={
                                  'recurrent': args.recurrent_policy
                                  or args.obs_recurrent
                              })
    actor_critic.to(device)

    # Optionally resume: load only the network weights from the checkpoint
    # (the saved obs_rms is ignored here).
    if (args.continue_from_epoch > 0) and args.save_dir != "":
        save_path = os.path.join(args.save_dir, args.algo)
        actor_critic_, loaded_obs_rms_ = torch.load(
            os.path.join(
                save_path,
                args.env_name +
                "-epoch-{}.pt".format(args.continue_from_epoch)))
        actor_critic.load_state_dict(actor_critic_.state_dict())

    if args.algo != 'ppo':
        # BUGFIX: the original `raise "only PPO is supported"` raises a
        # plain string, which is itself a TypeError in Python 3 and masks
        # the intended message.  Raise a real exception instead.
        raise ValueError("only PPO is supported")

    # Main agent: updates everything except the attention policy.
    agent = algo.PPO(actor_critic,
                     args.clip_param,
                     args.ppo_epoch,
                     args.num_mini_batch,
                     args.value_loss_coef,
                     args.entropy_coef,
                     lr=args.lr,
                     eps=args.eps,
                     num_tasks=args.num_processes,
                     attention_policy=False,
                     max_grad_norm=args.max_grad_norm,
                     weight_decay=args.weight_decay)
    # Validation agent: shares the network, updates only the attention
    # policy, with its own learning rate.
    val_agent = algo.PPO(actor_critic,
                         args.clip_param,
                         args.ppo_epoch,
                         args.num_mini_batch,
                         args.value_loss_coef,
                         args.entropy_coef,
                         lr=args.val_lr,
                         eps=args.eps,
                         num_tasks=args.num_processes,
                         attention_policy=True,
                         max_grad_norm=args.max_grad_norm,
                         weight_decay=args.weight_decay)

    rollouts = RolloutStorage(args.num_steps, args.num_processes,
                              envs.observation_space.shape,
                              envs.action_space,
                              actor_critic.recurrent_hidden_state_size)
    val_rollouts = RolloutStorage(args.num_steps, args.num_processes,
                                  val_envs.observation_space.shape,
                                  val_envs.action_space,
                                  actor_critic.recurrent_hidden_state_size)

    obs = envs.reset()
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    val_obs = val_envs.reset()
    val_rollouts.obs[0].copy_(val_obs)
    val_rollouts.to(device)

    episode_rewards = deque(maxlen=10)
    start = time.time()
    num_updates = int(
        args.num_env_steps) // args.num_steps // args.num_processes
    save_copy = True
    for j in range(args.continue_from_epoch,
                   args.continue_from_epoch + num_updates):

        # policy rollouts
        for step in range(args.num_steps):
            # Sample actions (eval mode so e.g. normalization layers run
            # deterministically while acting).
            actor_critic.eval()
            with torch.no_grad():
                value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                    rollouts.obs[step],
                    rollouts.recurrent_hidden_states[step],
                    rollouts.masks[step])
            actor_critic.train()

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])
                    for k, v in info['episode'].items():
                        summary_writer.add_scalar(
                            f'training/{k}', v,
                            j * args.num_processes * args.num_steps +
                            args.num_processes * step)

            # If done then clean the history of observations.
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done])
            # bad_masks distinguishes time-limit truncation from real
            # episode termination (proper time limit handling).
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.insert(obs, recurrent_hidden_states, action,
                            action_log_prob, value, reward, masks, bad_masks)

        actor_critic.eval()
        with torch.no_grad():
            next_value = actor_critic.get_value(
                rollouts.obs[-1], rollouts.recurrent_hidden_states[-1],
                rollouts.masks[-1]).detach()
        actor_critic.train()

        rollouts.compute_returns(next_value, args.use_gae, args.gamma,
                                 args.gae_lambda,
                                 args.use_proper_time_limits)

        # Snapshot weights and both optimizer states so a failed
        # validation check below can revert to this point.
        if save_copy:
            prev_weights = copy.deepcopy(actor_critic.state_dict())
            prev_opt_state = copy.deepcopy(agent.optimizer.state_dict())
            prev_val_opt_state = copy.deepcopy(
                val_agent.optimizer.state_dict())
            save_copy = False

        value_loss, action_loss, dist_entropy = agent.update(rollouts)
        rollouts.after_update()

        # validation rollouts (attention-policy updates only)
        for val_iter in range(args.val_agent_steps):
            for step in range(args.num_steps):
                # Sample actions
                actor_critic.eval()
                with torch.no_grad():
                    value, action, action_log_prob, recurrent_hidden_states = actor_critic.act(
                        val_rollouts.obs[step],
                        val_rollouts.recurrent_hidden_states[step],
                        val_rollouts.masks[step])
                actor_critic.train()

                # Observe reward and next obs
                obs, reward, done, infos = val_envs.step(action)

                # If done then clean the history of observations.
                masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                           for done_ in done])
                bad_masks = torch.FloatTensor(
                    [[0.0] if 'bad_transition' in info.keys() else [1.0]
                     for info in infos])
                val_rollouts.insert(obs, recurrent_hidden_states, action,
                                    action_log_prob, value, reward, masks,
                                    bad_masks)

            actor_critic.eval()
            with torch.no_grad():
                next_value = actor_critic.get_value(
                    val_rollouts.obs[-1],
                    val_rollouts.recurrent_hidden_states[-1],
                    val_rollouts.masks[-1]).detach()
            actor_critic.train()

            val_rollouts.compute_returns(next_value, args.use_gae,
                                         args.gamma, args.gae_lambda,
                                         args.use_proper_time_limits)
            val_value_loss, val_action_loss, val_dist_entropy = val_agent.update(
                val_rollouts)
            val_rollouts.after_update()

        # save for every interval-th episode or for the last epoch
        # BUGFIX: the last epoch index is continue_from_epoch + num_updates
        # - 1; the original compared against num_updates - 1, skipping the
        # final checkpoint whenever training resumed from a checkpoint.
        if (j % args.save_interval == 0
                or j == args.continue_from_epoch + num_updates - 1) \
                and args.save_dir != "":
            save_path = os.path.join(args.save_dir, args.algo)
            try:
                os.makedirs(save_path)
            except OSError:
                pass
            torch.save([
                actor_critic,
                getattr(utils.get_vec_normalize(envs), 'obs_rms', None)
            ], os.path.join(save_path,
                            args.env_name + "-epoch-{}.pt".format(j)))

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_processes * args.num_steps
            end = time.time()
            # NOTE: the trailing dist_entropy / value_loss / action_loss
            # arguments have no placeholders in the format string;
            # str.format silently ignores extra positional arguments.
            print(
                "Updates {}, num timesteps {}, FPS {} \n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps,
                        int(total_num_steps / (end - start)),
                        len(episode_rewards), np.mean(episode_rewards),
                        np.median(episode_rewards), np.min(episode_rewards),
                        np.max(episode_rewards), dist_entropy, value_loss,
                        action_loss))

        revert = False
        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            actor_critic.eval()
            obs_rms = utils.get_vec_normalize(envs).obs_rms
            eval_r = {}
            printout = f'Seed {args.seed} Iter {j} '
            for eval_disp_name, eval_env_name in EVAL_ENVS.items():
                eval_r[eval_disp_name] = evaluate(
                    actor_critic, obs_rms, eval_envs_dic, eval_disp_name,
                    args.seed, args.num_processes, eval_env_name[1], logdir,
                    device, steps=args.task_steps,
                    recurrent=args.recurrent_policy,
                    obs_recurrent=args.obs_recurrent, multi_task=True,
                    free_exploration=args.free_exploration)

                if eval_disp_name in prev_eval_r:
                    diff = np.array(eval_r[eval_disp_name]) - np.array(
                        prev_eval_r[eval_disp_name])
                    # Revert unless enough 'many_arms' tasks improved
                    # (improved-count minus worsened-count must reach the
                    # threshold).
                    if eval_disp_name == 'many_arms':
                        if np.sum(diff > 0) - np.sum(
                                diff < 0) < args.val_improvement_threshold:
                            print('no update')
                            revert = True

                summary_writer.add_scalar(
                    f'eval/{eval_disp_name}',
                    np.mean(eval_r[eval_disp_name]),
                    (j + 1) * args.num_processes * args.num_steps)
                log_dict[eval_disp_name].append([
                    (j + 1) * args.num_processes * args.num_steps,
                    eval_r[eval_disp_name]
                ])
                printout += eval_disp_name + ' ' + str(
                    np.mean(eval_r[eval_disp_name])) + ' '
            # summary_writer.add_scalars('eval_combined', eval_r, (j+1) * args.num_processes * args.num_steps)
            if revert:
                # Roll back model and both optimizers to the last accepted
                # snapshot.
                actor_critic.load_state_dict(prev_weights)
                agent.optimizer.load_state_dict(prev_opt_state)
                val_agent.optimizer.load_state_dict(prev_val_opt_state)
            else:
                print(printout)
                prev_eval_r = eval_r.copy()
                save_copy = True
            actor_critic.train()

    save_obj(log_dict, os.path.join(logdir, 'log_dict.pkl'))
    envs.close()
    val_envs.close()
    for eval_disp_name, eval_env_name in EVAL_ENVS.items():
        eval_envs_dic[eval_disp_name].close()
# Render a trained policy: load a checkpoint and replay it in a single
# (non-vectorized) environment.
# NOTE(review): `args` is bound outside this fragment — presumably parsed
# earlier in the enclosing script; confirm against the full file.
if not os.path.isfile(args.model_path):
    # NOTE(review): execution continues past this message; torch.load below
    # would then fail on the missing file — confirm whether print_error exits.
    print_error('Model file does not exist')
# Deterministic, CPU-only playback.
torch.manual_seed(0)
torch.set_num_threads(1)
device = torch.device('cpu')
# One plain gym env for on-screen rendering; the vectorized envs are only
# used here to obtain spaces and the observation-normalization stats.
render_env = gym.make(args.env_name, args = args)
render_env.seed(0)
envs = make_vec_envs(args.env_name, 0, 4, 0.995, None, device, False, args = args)
actor_critic = Policy(
    envs.observation_space.shape,
    envs.action_space,
    base_kwargs={'recurrent': False})
actor_critic.to(device)
ob_rms = utils.get_vec_normalize(envs).ob_rms
# NOTE(review): the actor_critic and ob_rms constructed above are both
# immediately replaced by the checkpoint contents, so that setup is dead
# work — kept as-is pending confirmation.
actor_critic, ob_rms = torch.load(args.model_path)
actor_critic.eval()
envs.close()
# Replay with deterministic actions, looping forever.
render_full(render_env, actor_critic, ob_rms, deterministic = True, repeat = True)
record_video_filename), force=True) policy = Policy(env.observation_space.shape, env.action_space, base_kwargs={ 'recurrent': False, 'layernorm': args.layernorm }, obj_num=env.obj_dim) state_dict = torch.load(state_dict_path) policy.load_state_dict(state_dict) policy = policy.to(device) policy.double() policy.eval() ob_rms = None if args.env_params is not None and os.path.exists(args.env_params): with open(args.env_params, 'rb') as fp: env_params = pickle.load(fp) ob_rms = env_params['ob_rms'] while True: obs = env.reset() obj = np.zeros(env.obj_dim) t = time() done = False iter = 0 while not done: if ob_rms is not None: