def main(path):
    env = gym.make('CarRacing-v0')
    model = SAC(env.observation_space, env.action_space)
    actor, critic = torch.load(path)
    model.load_model(actor, critic)

    while True:
        obs = env.reset(random_position=False)
        done = False
        rews = []
        while not done:
            act = model.select_action(obs, evaluate=True)
            obs, rew, done, _ = env.step(act)
            rews.append(rew)
        print(np.sum(rews))
def main(args=None):
    if args is None:
        args = readParser()

    # Initial environment
    env = gym.make(args.env_name)

    job_name = 'MBPO_{}_{}_{}'.format(args.env_name, args.model_type, args.seed)
    writer = SummaryWriter("tensorboard/{}".format(job_name))
    writer.add_text(
        'hyperparameters', "|param|value|\n|-|-|\n%s" %
        ('\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initial agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Predict environments
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool, writer)
def test_policy_dependent_models(args, env, state_size, action_size,
                                 env_sampler, writer):
    save_freq = args.save_model_freq
    checkpoint_epochs = np.arange(0, args.num_epoch, save_freq)
    model_policy_return_dict = {}

    for model_epoch in checkpoint_epochs:
        dynamics_model_checkpoint = torch.load(args.save_model_path +
                                               'EnsembleDynamicsModel_' +
                                               str(int(model_epoch)) + '.pt')
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
        env_model.ensemble_model.load_state_dict(
            dynamics_model_checkpoint['dynamics_model_state_dict'])

        for policy_epoch in checkpoint_epochs:
            policy_network_checkpoint = torch.load(args.save_policy_path +
                                                   'PolicyNetwork_' +
                                                   str(int(policy_epoch)) +
                                                   '.pt')
            agent = SAC(env.observation_space.shape[0], env.action_space, args)
            agent.policy.load_state_dict(
                policy_network_checkpoint['policy_model_state_dict'])

            avg_episode_reward = []
            for i in range(args.num_eval_episode):
                env_sampler.current_state = None
                sum_reward = 0
                done = False
                counter = 0
                while not done:
                    cur_state, action, next_state, reward, done, info = env_sampler.sample(
                        agent, eval_t=True)
                    sum_reward += reward
                    counter += 1
                logging.info(
                    'Policy epoch{} | DynamicsModel epoch{} | number of steps: {} | inner eval num: {} | sum reward: {}'
                    .format(policy_epoch, model_epoch, counter, i, sum_reward))
                avg_episode_reward.append(sum_reward)
                writer.add_scalar(
                    'returns/mean_eval_return_model_{}_policy_{}'.format(
                        model_epoch, policy_epoch), sum_reward, i)

            mean_episode_reward = torch.mean(torch.tensor(avg_episode_reward))
            std_episode_reward = torch.std(torch.tensor(avg_episode_reward))
            # store plain floats so the dict stays JSON-serializable
            model_policy_return_dict['model_{}_policy_{}'.format(
                model_epoch, policy_epoch)] = [
                    mean_episode_reward.item(),
                    std_episode_reward.item()
                ]

    with open('test_policy_dependent_results_2/mean_std_evaluated_policy.json',
              'w') as f:
        json.dump(model_policy_return_dict, f)
def main():
    logging.basicConfig(filename=time.strftime("%Y%m%d-%H%M%S") + '_train.log',
                        level=logging.INFO)
    args = readParser()

    # Initial environment
    env = gym.make(args.env_name)

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initial agents ensemble
    agents = []
    for _ in range(args.num_agents):
        agent = SAC(env.observation_space.shape[0], env.action_space, args)
        agents.append(agent)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = Ensemble_Model(args.num_networks, args.num_elites,
                                   state_size, action_size, args.reward_size,
                                   args.pred_hidden_size)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Predict environments
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ModelReplayMemory(args.replay_size)
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ModelReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agents, env_pool, model_pool)
def main(args=None):
    if args is None:
        args = readParser()

    # Initial environment
    env = gym.make(args.env_name)

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initial agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Predict environments
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool)
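# --- Hedged example (not part of the original code): the MBPO-style main()
# functions above expect an `args` object produced by readParser(). Below is a
# minimal sketch of building one by hand, using only the attribute names that
# the code above actually reads; every value is an illustrative placeholder,
# not the project's defaults.
from argparse import Namespace

example_args = Namespace(
    env_name='Hopper-v2',            # any id accepted by gym.make
    seed=0,
    model_type='pytorch',            # 'pytorch' selects EnsembleDynamicsModel
    num_networks=7, num_elites=5,    # ensemble size / number of elite members
    reward_size=1,
    pred_hidden_size=200,
    use_decay=True,
    replay_size=1000000,             # environment replay buffer capacity
    rollout_batch_size=100000,
    epoch_length=1000,
    model_train_freq=250,
    model_retain_epochs=1,
    # ...plus whatever SAC/train() hyperparameters readParser() normally adds
)
# main(example_args)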
def train(
        seed: int = 69,
        batch_size: int = 256,
        num_steps: int = 5000000,
        updates_per_step: int = 1,
        start_steps: int = 100000,
        replay_size: int = 1000000,
        eval: bool = True,
        eval_interval: int = 50,
        accelerated_exploration: bool = True,
        save_models: bool = True,
        load_models: bool = True,
        save_memory: bool = True,
        load_memory: bool = False,
        path_to_actor: str = "./models/sac_actor_carracer_klein_6_24_18.pt",
        path_to_critic: str = "./models/sac_critic_carracer_klein_6_24_18.pt",
        path_to_buffer: str = "./memory/buffer_klein_6_24_18.pkl"):
    """
    ## The train function consists of:

    - Setting up the environment, agent and replay buffer
    - Logging hyperparameters and training results
    - Loading previously saved actor and critic models
    - Training loop
    - Evaluation (every *eval_interval* episodes)
    - Saving actor and critic models

    ## Parameters:

    - **seed** *(int)*: Seed value to generate random numbers.
    - **batch_size** *(int)*: Number of samples that will be propagated through the Q, V, and policy networks.
    - **num_steps** *(int)*: Number of steps that the agent takes in the environment. Determines the training duration.
    - **updates_per_step** *(int)*: Number of network parameter updates per step in the environment.
    - **start_steps** *(int)*: Number of steps for which a random action is sampled. After reaching *start_steps*, an action according to the learned policy is chosen.
    - **replay_size** *(int)*: Size of the replay buffer.
    - **eval** *(bool)*: If *True*, the trained policy is evaluated every *eval_interval* episodes.
    - **eval_interval** *(int)*: Interval of episodes after which to evaluate the trained policy.
    - **accelerated_exploration** *(bool)*: If *True*, an action with acceleration bias is sampled.
    - **save_memory** *(bool)*: If *True*, the experience replay buffer is saved to the hard drive.
    - **save_models** *(bool)*: If *True*, actor and critic models are saved to the hard drive.
    - **load_models** *(bool)*: If *True*, actor and critic models are loaded from *path_to_actor* and *path_to_critic*.
    - **load_memory** *(bool)*: If *True*, a previously saved replay buffer is loaded from *path_to_buffer*.
    - **path_to_actor** *(str)*: Path to actor model.
    - **path_to_critic** *(str)*: Path to critic model.
    - **path_to_buffer** *(str)*: Path to saved replay buffer.
    """
    # Environment
    env = gym.make("CarRacing-v0")
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # NOTE: ALWAYS CHECK PARAMETERS BEFORE TRAINING
    agent = SAC(env.action_space,
                policy="Gaussian",
                gamma=0.99,
                tau=0.005,
                lr=0.0003,
                alpha=0.2,
                automatic_temperature_tuning=True,
                batch_size=batch_size,
                hidden_size=512,
                target_update_interval=2,
                input_dim=32)

    # Memory
    memory = ReplayMemory(replay_size)
    if load_memory:
        # load memory and deactivate random exploration
        memory.load(path_to_buffer)
    if load_memory or load_models:
        start_steps = 0

    # Training Loop
    total_numsteps = 0
    updates = 0

    # Log settings and training results
    date = datetime.now()
    log_dir = Path(f"runs/{date.year}_SAC_{date.month}_{date.day}_{date.hour}")
    writer = SummaryWriter(log_dir=log_dir)

    settings_msg = (
        f"Training SAC for {num_steps} steps"
        "\n\nTRAINING SETTINGS:\n"
        f"Seed={seed}, Batch size: {batch_size}, Updates per step: {updates_per_step}\n"
        f"Accelerated exploration: {accelerated_exploration}, Start steps: {start_steps}, Replay size: {replay_size}"
        "\n\nALGORITHM SETTINGS:\n"
        f"Policy: {agent.policy_type}, Automatic temperature tuning: {agent.automatic_temperature_tuning}\n"
        f"Gamma: {agent.gamma}, Tau: {agent.tau}, Alpha: {agent.alpha}, LR: {agent.lr}\n"
        f"Target update interval: {agent.target_update_interval}, Latent dim: {agent.input_dim}, Hidden size: {agent.hidden_size}"
    )
    with open(log_dir / "settings.txt", "w") as file:
        file.write(settings_msg)

    if load_models:
        try:
            agent.load_model(path_to_actor, path_to_critic)
        except FileNotFoundError:
            warnings.warn(
                "Couldn't locate models in the specified paths. Training from scratch.",
                RuntimeWarning)

    for i_episode in itertools.count(1):
        episode_reward = 0
        episode_steps = 0
        done = False
        state = env.reset()
        state = process_observation(state)
        state = encoder.sample(state)

        # choose random starting position for the car
        position = np.random.randint(len(env.track))
        env.car = Car(env.world, *env.track[position][1:4])

        if accelerated_exploration:
            # Sample random action; later steps add an acceleration bias to it
            action = env.action_space.sample()

        while not done:
            if total_numsteps < start_steps and not load_models:
                # sample action with acceleration bias if accelerated_exploration is True
                if accelerated_exploration:
                    action = generate_action(action)
                else:
                    action = env.action_space.sample()
            else:
                action = agent.select_action(state)

            if len(memory) > batch_size:
                # Number of updates per step in environment
                for _ in range(updates_per_step):
                    # Update parameters of all the networks
                    critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(
                        memory, batch_size, updates)

                    writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                    writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                    writer.add_scalar('loss/policy', policy_loss, updates)
                    writer.add_scalar('loss/entropy_loss', ent_loss, updates)
                    writer.add_scalar('entropy_temperature/alpha', alpha,
                                      updates)
                    updates += 1

            next_state, reward, done, _ = env.step(action)  # Step
            next_state = process_observation(next_state)
            next_state = encoder.sample(next_state)
            episode_steps += 1
            total_numsteps += 1
            episode_reward += reward

            # Ignore the "done" signal if it comes from hitting the time horizon.
            # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
            mask = 1 if episode_steps == env._max_episode_steps else float(
                not done)

            memory.push(state, action, reward, next_state,
                        mask)  # Append transition to memory

            state = next_state

        if total_numsteps > num_steps:
            break

        writer.add_scalar('reward/train', episode_reward, i_episode)
        print(
            f"Episode: {i_episode}, total numsteps: {total_numsteps}, episode steps: {episode_steps}, reward: {round(episode_reward, 2)}"
        )

        if i_episode % eval_interval == 0 and eval == True:
            avg_reward = 0.
            episodes = 10
            if save_models:
                agent.save_model(
                    "carracer",
                    f"{getuser()}_{date.month}_{date.day}_{date.hour}")

            for _ in range(episodes):
                state = env.reset()
                state = process_observation(state)
                state = encoder.sample(state)
                episode_reward = 0
                done = False
                while not done:
                    action = agent.select_action(state, eval=True)
                    next_state, reward, done, _ = env.step(action)
                    next_state = process_observation(next_state)
                    next_state = encoder.sample(next_state)
                    episode_reward += reward
                    state = next_state
                avg_reward += episode_reward
            avg_reward /= episodes

            if save_models:
                agent.save_model(
                    "carracer",
                    f"{getuser()}_{date.month}_{date.day}_{date.hour}")
            if save_memory:
                memory.save(
                    f"buffer_{getuser()}_{date.month}_{date.day}_{date.hour}")

            writer.add_scalar("avg_reward/test", avg_reward, i_episode)
            print("-" * 40)
            print(
                f"Test Episodes: {episodes}, Avg. Reward: {round(avg_reward, 2)}"
            )
            print("-" * 40)

    env.close()
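# --- Hedged example (not in the original file): a hypothetical entry point that
# runs the train() function above with reduced step counts as a quick smoke
# test; every value here is illustrative, not a recommended configuration.
if __name__ == "__main__":
    train(
        seed=69,
        batch_size=64,       # smaller batch for a short local run
        num_steps=10000,     # far fewer environment steps than the 5M default
        start_steps=1000,    # shorter random-exploration phase
        load_models=False,   # start from scratch instead of loading checkpoints
        save_models=False,
        save_memory=False,
    )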
def main(args=None):
    if args is None:
        args = readParser()

    save_model_dir = os.path.join(args.save_dir, args.env_name,
                                  'dynamics_model')
    save_policy_dir = os.path.join(args.save_dir, args.env_name,
                                   'policy_network')
    save_env_buffer_dir = os.path.join(args.save_dir, args.env_name,
                                       'env_buffer')
    save_dynamics_buffer_dir = os.path.join(args.save_dir, args.env_name,
                                            'dynamics_buffer')
    if not os.path.exists(save_model_dir):
        os.makedirs(save_model_dir)
    if not os.path.exists(save_policy_dir):
        os.makedirs(save_policy_dir)
    if not os.path.exists(save_env_buffer_dir):
        os.makedirs(save_env_buffer_dir)
    if not os.path.exists(save_dynamics_buffer_dir):
        os.makedirs(save_dynamics_buffer_dir)

    # Initial environment
    if 'Ant' in args.env_name:
        args.env_name = new_env.register_mbpo_environments()[0]
        print('Loaded TruncatedObs-version of the Ant environment: {}'.format(
            args.env_name))
    # else:
    #     env_name = args.env_name

    env = gym.make(args.env_name)

    job_name = 'MBPO_test_policy_dependent_models_{}_{}_{}'.format(
        args.env_name, args.model_type, args.seed)
    writer = SummaryWriter(
        str(os.path.join(args.save_dir, 'tensorboard', job_name)))
    writer.add_text(
        'hyperparameters', "|param|value|\n|-|-|\n%s" %
        ('\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initial agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Predict environments
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool, writer,
          save_model_dir, save_policy_dir, save_env_buffer_dir,
          save_dynamics_buffer_dir)

    print('Training complete!')
    print('---------------------------------------------------------------------')
    print('Start evaluating different policies at different model checkpoints...')
    print('---------------------------------------------------------------------')

    test_policy_dependent_models(args, env, state_size, action_size,
                                 args.save_model_freq,
                                 args.save_model_freq * 6, save_model_dir,
                                 save_policy_dir)
def test_policy_dependent_models(args, env, state_size, action_size,
                                 start_eval, end_eval, save_model_dir,
                                 save_policy_dir):
    save_freq = args.save_model_freq
    checkpoint_epochs = np.arange(start_eval, end_eval, save_freq)
    # checkpoint_epochs = np.arange(20, 40, 2)
    # checkpoint_epochs = [20, 26, 32, 38]
    # checkpoint_epochs = np.append(checkpoint_epochs, args.num_epoch-1)
    model_policy_return_dict = {}
    state_error_dict = {}
    reward_error_dict = {}

    with open(
            os.path.join(
                args.save_dir,
                'scaler_mu_std_{}.pkl'.format(str(int(args.num_epoch - 1)))),
            'rb') as f:
        mean, std = pickle.load(f)

    for model_epoch in checkpoint_epochs:
        dynamics_model_checkpoint = torch.load(
            str(
                os.path.join(
                    save_model_dir,
                    'EnsembleDynamicsModel_{}.pt'.format(model_epoch))))
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
        env_model.ensemble_model.load_state_dict(
            dynamics_model_checkpoint['dynamics_model_state_dict'])
        env_model.scaler.mu = mean
        env_model.scaler.std = std
        print('dynamics_model_{} loaded'.format(model_epoch))

        predict_env = PredictEnv(env_model, args.env_name, args.model_type)
        predict_env_sampler = Predict_EnvSample(env, predict_env)

        for policy_epoch in checkpoint_epochs:
            policy_network_checkpoint = torch.load(
                str(
                    os.path.join(
                        save_policy_dir,
                        'PolicyNetwork_{}.pt'.format(policy_epoch))))
            agent = SAC(env.observation_space.shape[0], env.action_space, args)
            agent.policy.load_state_dict(
                policy_network_checkpoint['policy_model_state_dict'])

            avg_episode_reward = []
            for i in range(args.num_eval_episode):
                predict_env_sampler.current_state = None
                sum_reward = 0
                done = False
                counter = 0
                state_error = []
                reward_error = []
                while not done and counter < args.epoch_length:
                    cur_state, action, next_state, reward, done, info, model_error = predict_env_sampler.sample(
                        agent, eval_t=True, ret_true_reward=False)
                    sum_reward += reward
                    counter += 1
                    state_error.append(model_error[0])
                    reward_error.append(model_error[1])
                # logging.info('Policy epoch{} | DynamicsModel epoch{} | number of steps: {} | inner eval num: {} | sum reward: {} | model_error: {}'.format(policy_epoch, model_epoch, counter, i, sum_reward, np.sum(model_error_list)))
                avg_episode_reward.append(sum_reward)
                # writer.add_scalar('returns/mean_eval_return_model_{}_policy_{}'.format(model_epoch, policy_epoch), sum_reward, i)

            mean_episode_reward = torch.mean(
                torch.tensor(avg_episode_reward) * 1.)
            std_episode_reward = torch.std(
                torch.tensor(avg_episode_reward) * 1.)

            model_policy_return_dict['model_{}_policy_{}'.format(
                model_epoch, policy_epoch)] = [
                    mean_episode_reward.item(),
                    std_episode_reward.item()
                ]
            state_error_dict['model_{}_policy_{}'.format(
                model_epoch, policy_epoch)] = state_error
            reward_error_dict['model_{}_policy_{}'.format(
                model_epoch, policy_epoch)] = reward_error

            print(
                'model epoch: {} | policy epoch: {} | mean return: {:.3f} | state error: {:.2f} | reward error: {:.2f} | total steps: {} | Done'
                .format(model_epoch, policy_epoch, mean_episode_reward,
                        np.mean(state_error), np.mean(reward_error), counter))

    with open(
            str(
                os.path.join(
                    args.save_dir, args.env_name,
                    'model_policy_return_dict_{}_{}_{}.json'.format(
                        start_eval, save_freq, end_eval))), 'w') as f:
        json.dump(model_policy_return_dict, f)
    with open(
            str(
                os.path.join(
                    args.save_dir, args.env_name,
                    'state_error_dict_{}_{}_{}.json'.format(
                        start_eval, save_freq, end_eval))), 'w') as f:
        json.dump(state_error_dict, f)
    with open(
            str(
                os.path.join(
                    args.save_dir, args.env_name,
                    'reward_error_dict_{}_{}_{}.json'.format(
                        start_eval, save_freq, end_eval))), 'w') as f:
        json.dump(
            {
                k: np.array(v).astype(np.float64).tolist()
                for k, v in reward_error_dict.items()
            }, f)
def test():
    # Environment
    # env = NormalizedActions(gym.make(args.env_name))
    # env = gym.make(args.env_name)
    args = get_args()
    args.eval = True
    set_env_arg(t_type=args.t_type,
                n_type=args.n_type,
                r_type=args.r_type,
                proj=str_to_bool(args.proj),
                cam_r_noise=str_to_bool(args.cam_r_noise),
                cam_t_noise=str_to_bool(args.cam_t_noise),
                cam_in_noise=str_to_bool(args.cam_in_noise),
                test=str_to_bool(args.test))

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # env.seed(args.seed)

    # Agent
    agent = SAC(env.state_dim, env.action_space, args)
    agent.load_model('models/sac_actor_crane_', 'models/sac_critic_crane_')

    # TensorboardX
    writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
        args.policy, "autotune" if args.automatic_entropy_tuning else ""))

    # Memory
    memory = ReplayMemory(args.replay_size)

    # Training Loop
    total_numsteps = 0
    updates = 0

    for ep in range(MAX_EP_STEPS):
        state, gt = env.reset()
        episode_reward = 0
        for t in range(MAX_STEP):
            # while True:
            env.render()
            action = agent.select_action(state)

            next_state, reward, done, _ = env.step(action)  # Step
            if done:
                mask = 1
            else:
                mask = 0
            memory.push(state, action, reward, next_state,
                        mask)  # Append transition to memory
            """# store experience
            trans = np.hstack((s, a, [r], s_))
            outfile = exp_path + '/' + str(ep) + '_' + str(t)
            np.save(outfile, trans)
            """

            state = next_state
            episode_reward += reward

            if t == MAX_STEP - 1 or done:
                # if done:
                result = '| done' if done else '| ----'
                print(
                    'Ep:', ep,
                    result,
                    '| R: %i' % int(episode_reward),
                    '| Explore: %.2f' % var,
                )
                out_s = 'Ep: ' + str(ep) + ' result: ' + str(done) + \
                    " R: " + str(episode_reward) + " Explore " + str(var) + " \n"
                    help='Value target update per no. of updates per step (default: 1)')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                    help='size of replay buffer (default: 1000000)')
parser.add_argument('--cuda', action="store_true",
                    help='run on CUDA (default: False)')
args = parser.parse_args()

# Environment
# env = NormalizedActions(gym.make(args.env_name))
env = gym.make(args.env_name)
torch.manual_seed(args.seed)
np.random.seed(args.seed)
env.seed(args.seed)

# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)

# TensorboardX
writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(
    datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
    args.policy, "autotune" if args.automatic_entropy_tuning else ""))

# Memory
memory = ReplayMemory(args.replay_size)

# Training Loop
total_numsteps = 0
updates = 0

for i_episode in itertools.count(1):
    episode_reward = 0
    episode_steps = 0
def main(args=None):
    if args is None:
        args = readParser()

    # if not os.path.exists(args.save_model_path):
    #     os.makedirs(args.save_model_path)
    # if not os.path.exists(args.save_policy_path):
    #     os.makedirs(args.save_policy_path)

    # Initial environment
    env = gym.make(args.env_name)

    # job_name = 'MBPO_test_policy_dependent_models_{}_{}_{}'.format(args.env_name, args.model_type, args.seed)
    # writer = SummaryWriter("test_policy_dependent_results_2/tensorboard/{}".format(job_name))
    # writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % (
    #     '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))

    # Set random seed
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initial agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)
    policy_network_checkpoint = torch.load(
        './test_policy_dependent_results_2/policy/PolicyNetwork_20.pt')
    agent.policy.load_state_dict(
        policy_network_checkpoint['policy_model_state_dict'])

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size,
                                    act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    dynamics_model_checkpoint = torch.load(
        './test_policy_dependent_results_2/dynamics_model/EnsembleDynamicsModel_20.pt'
    )
    env_model.ensemble_model.load_state_dict(
        dynamics_model_checkpoint['dynamics_model_state_dict'])

    # Predict environments
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    env_pool = ReplayMemory(args.replay_size)
    env_pool.load(
        './test_policy_dependent_results_2/env_buffer/env_buffer_20.pkl')
    env_pool.position = len(env_pool.buffer)
    # env_pool.buffer = np.array(env_pool.buffer)[~np.where(np.array(env_pool.buffer)==None)[0]]

    # Initial pool for model
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)
    model_pool.load(
        './test_policy_dependent_results_2/model_buffer/model_buffer_20.pkl')
    model_pool.position = len(model_pool.buffer)
    # model_pool.buffer = np.array(model_pool.buffer)[~np.where(np.array(model_pool.buffer)==None)[0]]

    # Sampler of environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool)
def main():
    if sys.platform.startswith('win'):
        # Add the _win_handler function to the windows console's handler function list
        win32api.SetConsoleCtrlHandler(_win_handler, True)

    if os.path.exists(
            os.path.join(config_file.config['config_file'], 'config.yaml')):
        config = sth.load_config(config_file.config['config_file'])
    else:
        config = config_file.config
        print(f'load config from config.')
    hyper_config = config['hyper parameters']
    train_config = config['train config']
    record_config = config['record config']

    basic_dir = record_config['basic_dir']
    last_name = record_config['project_name'] + '/' \
        + record_config['remark'] \
        + record_config['run_id']
    cp_dir = record_config['checkpoint_basic_dir'] + last_name
    cp_file = cp_dir + '/rb'
    log_dir = record_config['log_basic_dir'] + last_name
    excel_dir = record_config['excel_basic_dir'] + last_name
    config_dir = record_config['config_basic_dir'] + last_name
    sth.check_or_create(basic_dir, 'basic')
    sth.check_or_create(cp_dir, 'checkpoints')
    sth.check_or_create(log_dir, 'logs(summaries)')
    sth.check_or_create(excel_dir, 'excel')
    sth.check_or_create(config_dir, 'config')
    logger = create_logger(
        name='logger',
        console_level=logging.INFO,
        console_format='%(levelname)s : %(message)s',
        logger2file=record_config['logger2file'],
        file_name=log_dir + '\log.txt',
        file_level=logging.WARNING,
        file_format=
        '%(lineno)d - %(asctime)s - %(module)s - %(funcName)s - %(levelname)s - %(message)s'
    )
    if train_config['train']:
        sth.save_config(config_dir, config)

    if train_config['unity_mode']:
        env = UnityEnvironment()
    else:
        env = UnityEnvironment(
            file_name=train_config['unity_file'],
            no_graphics=True if train_config['train'] else False,
            base_port=train_config['port'])
    brain_name = env.external_brain_names[0]
    brain = env.brains[brain_name]

    # set the memory use proportion of GPU
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5
    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default() as g:
        with tf.Session(graph=g, config=tf_config) as sess:
            logger.info('Algorithm: {0}'.format(
                train_config['algorithm'].name))
            if train_config['algorithm'] == config_file.algorithms.ppo_sep_ac:
                from ppo.ppo_base import PPO_SEP
                model = PPO_SEP(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                logger.info('PPO_SEP initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.ppo_com:
                from ppo.ppo_base import PPO_COM
                model = PPO_COM(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                logger.info('PPO_COM initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.sac:
                from sac.sac import SAC
                model = SAC(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                logger.info('SAC initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.sac_no_v:
                from sac.sac_no_v import SAC_NO_V
                model = SAC_NO_V(sess=sess,
                                 s_dim=brain.vector_observation_space_size,
                                 a_counts=brain.vector_action_space_size[0],
                                 hyper_config=hyper_config)
                logger.info('SAC_NO_V initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.ddpg:
                from ddpg.ddpg import DDPG
                model = DDPG(sess=sess,
                             s_dim=brain.vector_observation_space_size,
                             a_counts=brain.vector_action_space_size[0],
                             hyper_config=hyper_config)
                logger.info('DDPG initialize success.')
            elif train_config['algorithm'] == config_file.algorithms.td3:
                from td3.td3 import TD3
                model = TD3(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                logger.info('TD3 initialize success.')

            recorder = Recorder(log_dir,
                                excel_dir,
                                record_config,
                                logger,
                                max_to_keep=5,
                                pad_step_number=True,
                                graph=g)
            episode = init_or_restore(cp_dir, sess, recorder, cp_file)
            try:
                if train_config['train']:
                    train_OnPolicy(
                        sess=sess,
                        env=env,
                        brain_name=brain_name,
                        begin_episode=episode,
                        model=model,
                        recorder=recorder,
                        cp_file=cp_file,
                        hyper_config=hyper_config,
                        train_config=train_config
                    ) if not train_config['use_replay_buffer'] else train_OffPolicy(
                        sess=sess,
                        env=env,
                        brain_name=brain_name,
                        begin_episode=episode,
                        model=model,
                        recorder=recorder,
                        cp_file=cp_file,
                        hyper_config=hyper_config,
                        train_config=train_config)
                    tf.train.write_graph(g,
                                         cp_dir,
                                         'raw_graph_def.pb',
                                         as_text=False)
                    export_model(cp_dir, g)
                else:
                    inference(env, brain_name, model, train_config)
            except Exception as e:
                logger.error(e)
            finally:
                env.close()
                recorder.close()
                sys.exit()
def main(args=None):
    if args is None:
        args = readParser()

    if not os.path.exists(args.save_model_path):
        os.makedirs(args.save_model_path)
    if not os.path.exists(args.save_policy_path):
        os.makedirs(args.save_policy_path)

    # Initial environment
    env = gym.make(args.env_name)

    # job_name = 'MBPO_test_policy_dependent_models_{}_{}_{}'.format(args.env_name, args.model_type, args.seed)
    # writer = SummaryWriter("test_policy_dependent_results/tensorboard/{}".format(job_name))
    # writer.add_text('hyperparameters', "|param|value|\n|-|-|\n%s" % (
    #     '\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))

    # # Set random seed
    # torch.manual_seed(args.seed)
    # np.random.seed(args.seed)
    # env.seed(args.seed)

    # Initial agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initial ensemble model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks,
                                          args.num_elites,
                                          state_size,
                                          action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    # else:
    #     env_model = construct_model(obs_dim=state_size, act_dim=action_size, hidden_dim=args.pred_hidden_size,
    #                                 num_networks=args.num_networks, num_elites=args.num_elites)

    # Predict environments
    # predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Initial pool for env
    # env_pool = ReplayMemory(args.replay_size)
    # # Initial pool for model
    # rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    # model_steps_per_epoch = int(1 * rollouts_per_epoch)
    # new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    # model_pool = ReplayMemory(new_pool_size)

    # Sampler of environment
    env_sampler = EnvSampler(env)

    # train(args, env_sampler, predict_env, agent, env_pool, model_pool, writer)

    print('Training complete!')
    print('---------------------------------------------------------------------')
    print('Start evaluating different policies at different model checkpoints...')
    print('---------------------------------------------------------------------')

    test_policy_dependent_models(args, env, state_size, action_size,
                                 env_sampler)
env.close()

env = gym.make('LunarLanderContinuous-v2')
torch.manual_seed(1)
np.random.seed(1)

s_size = env.observation_space.shape[0]
a_size = env.action_space.shape[0]
print("State Size:", s_size)
print("Action Size:", a_size)

sac = SAC(in_dim=s_size, out_dim=a_size, p_alpha=1e-3, q_alpha=1e-3)
reward = train(env,
               sac,
               epochs=300,
               episodes=1,
               steps=100,
               render=False,
               graphing=True,
               run=False)
run(sac, env, episodes=3, steps=100)
"""
plt.plot(reward/np.max(reward), label="Reward")
plt.plot(np.array(sac.q_loss)/np.max(sac.q_loss), label="Q loss")
plt.plot(np.array(sac.p_loss)/np.max(sac.p_loss), label="P loss")
plt.legend()
def main():
    if sys.platform.startswith('win'):
        win32api.SetConsoleCtrlHandler(_win_handler, True)

    if train_config['unity_mode']:
        env = UnityEnvironment()
    else:
        env = UnityEnvironment(
            file_name=train_config['unity_file'],
            no_graphics=True if train_config['train'] else False,
            base_port=train_config['port'])
    brain_name = env.external_brain_names[0]
    brain = env.brains[brain_name]

    # set the memory use proportion of GPU
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # tf_config.gpu_options.per_process_gpu_memory_fraction = 0.5
    tf.reset_default_graph()
    graph = tf.Graph()
    with graph.as_default() as g:
        with tf.Session(graph=g, config=tf_config) as sess:
            print('Algorithm: {0}'.format(train_config['algorithm'].name))
            if train_config['algorithm'] == algorithms.ppo_sep_ac:
                from ppo.ppo_base import PPO_SEP
                model = PPO_SEP(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                print('PPO_SEP initialize success.')
            elif train_config['algorithm'] == algorithms.ppo_com:
                from ppo.ppo_base import PPO_COM
                model = PPO_COM(sess=sess,
                                s_dim=brain.vector_observation_space_size,
                                a_counts=brain.vector_action_space_size[0],
                                hyper_config=hyper_config)
                print('PPO_COM initialize success.')
            elif train_config['algorithm'] == algorithms.sac:
                from sac.sac import SAC
                model = SAC(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                print('SAC initialize success.')
            elif train_config['algorithm'] == algorithms.sac_no_v:
                from sac.sac_no_v import SAC_NO_V
                model = SAC_NO_V(sess=sess,
                                 s_dim=brain.vector_observation_space_size,
                                 a_counts=brain.vector_action_space_size[0],
                                 hyper_config=hyper_config)
                print('SAC_NO_V initialize success.')
            elif train_config['algorithm'] == algorithms.ddpg:
                from ddpg.ddpg import DDPG
                model = DDPG(sess=sess,
                             s_dim=brain.vector_observation_space_size,
                             a_counts=brain.vector_action_space_size[0],
                             hyper_config=hyper_config)
                print('DDPG initialize success.')
            elif train_config['algorithm'] == algorithms.td3:
                from td3.td3 import TD3
                model = TD3(sess=sess,
                            s_dim=brain.vector_observation_space_size,
                            a_counts=brain.vector_action_space_size[0],
                            hyper_config=hyper_config)
                print('TD3 initialize success.')

            sess.run(tf.global_variables_initializer())
            try:
                if train_config['train']:
                    train_OnPolicy(
                        sess=sess,
                        env=env,
                        brain_name=brain_name,
                        begin_episode=0,
                        model=model,
                        hyper_config=hyper_config,
                        train_config=train_config
                    ) if not train_config['use_replay_buffer'] else train_OffPolicy(
                        sess=sess,
                        env=env,
                        brain_name=brain_name,
                        begin_episode=0,
                        model=model,
                        hyper_config=hyper_config,
                        train_config=train_config)
                else:
                    inference(env, brain_name, model, train_config)
            except Exception as e:
                print(e)
            finally:
                env.close()
                sys.exit()
def train_SAC(env_name,
              exp_name,
              seed,
              logdir,
              two_qf=False,
              reparam=False,
              nepochs=100,
              paras={}):
    alpha = {
        'Ant-v2': 0.1,
        'HalfCheetah-v2': 0.2,
        'Hopper-v2': 0.2,
        'Humanoid-v2': 0.05,
        'Walker2d-v2': 0.2,
        'Toddler': 0.05,
        'Adult': 0.05,
        'LunarLander': 0.1
    }.get(env_name, 0.2)

    algorithm_params = {
        'alpha': alpha,
        'batch_size': 256,
        'discount': 0.99,
        'learning_rate': 1e-3,
        'reparameterize': reparam,
        'tau': 0.01,
        'epoch_length': 1000,
        'n_epochs': nepochs,  # 500
        'two_qf': two_qf,
    }
    sampler_params = {
        'max_episode_length': 1000,
        'prefill_steps': 1000,
    }
    replay_pool_params = {
        'max_size': 1e6,
    }

    value_function_params = {
        'hidden_layer_sizes': (128, 128),
    }
    q_function_params = {
        'hidden_layer_sizes': (128, 128),
    }
    policy_params = {
        'hidden_layer_sizes': (128, 128),
    }

    logz.configure_output_dir(logdir)
    params = {
        'exp_name': exp_name,
        'env_name': env_name,
        'algorithm_params': algorithm_params,
        'sampler_params': sampler_params,
        'replay_pool_params': replay_pool_params,
        'value_function_params': value_function_params,
        'q_function_params': q_function_params,
        'policy_params': policy_params
    }
    logz.save_params(params)

    if env_name == 'Toddler' or env_name == 'Adult':
        env = CustomHumanoidEnv(template=env_name)
    elif env_name == 'LunarLander':
        env = LunarLanderContinuous(**paras)
    else:
        env = gym.envs.make(env_name)

    # Observation and action sizes
    ac_dim = env.action_space.n \
        if isinstance(env.action_space, gym.spaces.Discrete) \
        else env.action_space.shape[0]

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    q_function = nn.QFunction(name='q_function', **q_function_params)
    if algorithm_params.get('two_qf', False):
        q_function2 = nn.QFunction(name='q_function2', **q_function_params)
    else:
        q_function2 = None
    value_function = nn.ValueFunction(name='value_function',
                                      **value_function_params)
    target_value_function = nn.ValueFunction(name='target_value_function',
                                             **value_function_params)
    policy = nn.GaussianPolicy(
        action_dim=ac_dim,
        reparameterize=algorithm_params['reparameterize'],
        **policy_params)

    samplers = []
    replay_pools = []

    sampler = utils.SimpleSampler(**sampler_params)
    replay_pool = utils.SimpleReplayPool(
        observation_shape=env.observation_space.shape,
        action_shape=(ac_dim, ),
        **replay_pool_params)
    sampler.initialize(env, policy, replay_pool)
    samplers.append(sampler)
    replay_pools.append(replay_pool)

    algorithm = SAC(**algorithm_params)

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True  # may need if using GPU
    with tf.Session(config=tf_config):
        algorithm.build(env=env,
                        policy=policy,
                        q_function=q_function,
                        q_function2=q_function2,
                        value_function=value_function,
                        target_value_function=target_value_function)

        # algorithm_params.get('n_epochs', 1000)
        for epoch in algorithm.train(sampler,
                                     n_epochs=algorithm_params.get(
                                         'n_epochs', 100)):
            logz.log_tabular('Iteration', epoch)
            for k, v in algorithm.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in replay_pool.get_statistics().items():
                logz.log_tabular(k, v)
            for k, v in sampler.get_statistics().items():
                logz.log_tabular(k, v)
            logz.dump_tabular()
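# --- Hedged example (not in the original source): a hypothetical driver for
# train_SAC() above; the experiment name, seed, and log directory are
# placeholders rather than values used by the project.
if __name__ == '__main__':
    train_SAC(env_name='HalfCheetah-v2',
              exp_name='sac_demo',
              seed=1,
              logdir='data/sac_demo_HalfCheetah-v2',
              two_qf=True,       # use the clipped double-Q variant
              reparam=True,      # reparameterized policy gradient
              nepochs=50)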
def train():
    # Environment
    # env = NormalizedActions(gym.make(args.env_name))
    # env = gym.make(args.env_name)
    global var
    args = get_args()
    set_env_arg(t_type=args.t_type,
                n_type=args.n_type,
                r_type=args.r_type,
                proj=str_to_bool(args.proj),
                cam_r_noise=str_to_bool(args.cam_r_noise),
                cam_t_noise=str_to_bool(args.cam_t_noise),
                cam_in_noise=str_to_bool(args.cam_in_noise),
                test=str_to_bool(args.test))

    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    # env.seed(args.seed)

    # Agent
    agent = SAC(env.state_dim, env.action_space, args)
    agent.load_model('models/sac_actor_crane70_', 'models/sac_critic_crane70_')

    # TensorboardX
    writer = SummaryWriter(logdir='runs/{}_SAC_{}_{}_{}'.format(
        datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env_name,
        args.policy, "autotune" if args.automatic_entropy_tuning else ""))

    # Memory
    memory = ReplayMemory(args.replay_size)

    # Training Loop
    total_numsteps = 0
    updates = 0

    for ep in range(MAX_EP_STEPS):
        state, gt = env.reset()
        episode_reward = 0
        for t in range(MAX_STEP):
            # while True:
            env.render()

            # Added exploration noise
            if ep < sample_numsteps:
                print('sample')
                action = env.action_space.sample()  # Sample random action
            else:
                # Sample action from policy
                action = agent.select_action(state)
                # add randomness to action selection for exploration
                action = np.clip(np.random.normal(action, var), *ACTION_BOUND)

            next_state, reward, done, _ = env.step(action)  # Step
            if done:
                mask = 1
            else:
                mask = 0
            memory.push(state, action, reward, next_state,
                        mask)  # Append transition to memory
            """# store experience
            trans = np.hstack((s, a, [r], s_))
            outfile = exp_path + '/' + str(ep) + '_' + str(t)
            np.save(outfile, trans)
            """

            if len(memory) > sample_numsteps * MAX_STEP:
                # Number of updates per step in environment
                var = max([var * .9999, VAR_MIN])
                for i in range(1):
                    # Update parameters of all the networks
                    critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(
                        memory, 512, updates)
                    writer.add_scalar('loss/critic_1', critic_1_loss, updates)
                    writer.add_scalar('loss/critic_2', critic_2_loss, updates)
                    writer.add_scalar('loss/policy', policy_loss, updates)
                    writer.add_scalar('loss/entropy_loss', ent_loss, updates)
                    writer.add_scalar('entropy_temprature/alpha', alpha,
                                      updates)
                    updates += 1

            state = next_state
            episode_reward += reward

            if t == MAX_STEP - 1 or done:
                if len(memory) > sample_numsteps * MAX_STEP:
                    for i in range(10):
                        # Update parameters of all the networks
                        critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(
                            memory, 512, updates)
                        writer.add_scalar('loss/critic_1', critic_1_loss,
                                          updates)
                        writer.add_scalar('loss/critic_2', critic_2_loss,
                                          updates)
                        writer.add_scalar('loss/policy', policy_loss, updates)
                        writer.add_scalar('loss/entropy_loss', ent_loss,
                                          updates)
                        writer.add_scalar('entropy_temprature/alpha', alpha,
                                          updates)
                        updates += 1
                # if done:
                result = '| done' if done else '| ----'
                print(
                    'Ep:', ep,
                    result,
                    '| R: %i' % int(episode_reward),
                    '| Explore: %.2f' % var,
                )
                out_s = 'Ep: ' + str(ep) + ' result: ' + str(done) + \
                    " R: " + str(episode_reward) + " Explore " + str(var) + " \n"
                break
        """
        f = open(log_path, "a+")
        f.write(out_s)
        f.close()
        """
        if ep % 10 == 0:
            agent.save_model(env_name='crane' + str(ep))

    agent.save_model(env_name='crane')
def main(seed: int = 111,
         batch_size: int = 512,
         episodes: int = 100,
         path_to_actor: str = "models/sac_actor_carracer_klein_6_24_18.pt",
         path_to_critic: str = "models/sac_critic_carracer_klein_6_24_18.pt"):
    """
    Function for displaying a trained Soft Actor-Critic agent.

    ## Parameters:

    - **seed** *(int=111)*: RNG seed determining the generated tracks.
    - **batch_size** *(int=512)*: Batch size needed for SAC algorithm initialization.
    - **episodes** *(int=100)*: Number of episodes in the evaluation run.
    - **path_to_actor** *(str)*: Path to saved SAC actor model.
    - **path_to_critic** *(str)*: Path to saved SAC critic model.
    """
    # Environment
    env = gym.make("CarRacing-v0")
    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Agent
    agent = SAC(env.action_space,
                policy="Gaussian",
                gamma=0.99,
                tau=0.005,
                lr=0.0003,
                alpha=0.2,
                automatic_temperature_tuning=False,
                batch_size=batch_size,
                hidden_size=256,
                target_update_interval=1,
                input_dim=32)

    # load models; raises an error if the paths are wrong
    agent.load_model(path_to_actor, path_to_critic)

    avg_reward = 0.
    rewards = []

    for i_episode in range(episodes):
        episode_reward = 0
        # Get initial observation
        state = env.reset()
        state = process_observation(state)
        state = encoder.sample(state)
        done = False

        for t in range(1000):
            # render the environment at each step
            env.render()
            # move the car using the policy actions
            action = agent.select_action(state, eval=True)
            state, reward, done, _ = env.step(action)
            state = process_observation(state)
            state = encoder.sample(state)
            episode_reward += reward

            if done:
                print("Episode finished after {} timesteps".format(t + 1))
                break

        rewards.append(episode_reward)
        avg_reward += episode_reward

    # Close the rendering
    np.save("rewards.npy", rewards)
    env.close()

    avg_reward /= episodes
    print("-" * 40)
    print(f"Test Episodes: {episodes}, Avg. Reward: {round(avg_reward, 2)}")
    print("-" * 40)
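# --- Hedged example (not in the original snippet): a hypothetical entry point
# that renders a few evaluation episodes with the default checkpoint paths; the
# episode count is illustrative only.
if __name__ == "__main__":
    main(episodes=3)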