def main(args=None):
    if args is None:
        args = readParser()

    # Initialize environment
    env = gym.make(args.env_name)

    job_name = 'MBPO_{}_{}_{}'.format(args.env_name, args.model_type, args.seed)
    writer = SummaryWriter("tensorboard/{}".format(job_name))
    writer.add_text(
        'hyperparameters',
        "|param|value|\n|-|-|\n%s" % ('\n'.join(
            [f"|{key}|{value}|" for key, value in vars(args).items()])))

    # Set random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initialize agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initialize ensemble dynamics model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks, args.num_elites,
                                          state_size, action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size, act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Wrap the learned model as a predictive environment
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Replay pool for real-environment transitions
    env_pool = ReplayMemory(args.replay_size)

    # Replay pool for model rollouts, sized to retain the last
    # model_retain_epochs epochs of rollout data
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler for the real environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool, writer)
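# A minimal sketch of the readParser() interface these entry points assume.
# Only arguments referenced in the scripts above and below are listed; the
# defaults are illustrative assumptions, not the repository's real values,
# and the name readParser_sketch is hypothetical to avoid shadowing the
# actual parser defined elsewhere in the repo.
def readParser_sketch():
    import argparse
    parser = argparse.ArgumentParser(description='MBPO')
    parser.add_argument('--env_name', default='Hopper-v2')
    parser.add_argument('--seed', type=int, default=123456)
    parser.add_argument('--model_type', default='pytorch')
    parser.add_argument('--num_networks', type=int, default=7)
    parser.add_argument('--num_elites', type=int, default=5)
    parser.add_argument('--reward_size', type=int, default=1)
    parser.add_argument('--pred_hidden_size', type=int, default=200)
    parser.add_argument('--use_decay', default=True)
    parser.add_argument('--replay_size', type=int, default=1000000)
    parser.add_argument('--rollout_batch_size', type=int, default=100000)
    parser.add_argument('--epoch_length', type=int, default=1000)
    parser.add_argument('--model_train_freq', type=int, default=250)
    parser.add_argument('--model_retain_epochs', type=int, default=1)
    return parser.parse_args()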
def main():
    logging.basicConfig(filename=time.strftime("%Y%m%d-%H%M%S") + '_train.log',
                        level=logging.INFO)
    args = readParser()

    # Initialize environment
    env = gym.make(args.env_name)

    # Set random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initialize the ensemble of agents
    agents = []
    for _ in range(args.num_agents):
        agent = SAC(env.observation_space.shape[0], env.action_space, args)
        agents.append(agent)

    # Initialize ensemble dynamics model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = Ensemble_Model(args.num_networks, args.num_elites,
                                   state_size, action_size, args.reward_size,
                                   args.pred_hidden_size)
    else:
        env_model = construct_model(obs_dim=state_size, act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Wrap the learned model as a predictive environment
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Replay pool for real-environment transitions
    env_pool = ModelReplayMemory(args.replay_size)

    # Replay pool for model rollouts
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ModelReplayMemory(new_pool_size)

    # Sampler for the real environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agents, env_pool, model_pool)
def main(args=None):
    if args is None:
        args = readParser()

    # Initialize environment
    env = gym.make(args.env_name)

    # Set random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initialize agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initialize ensemble dynamics model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks, args.num_elites,
                                          state_size, action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size, act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Wrap the learned model as a predictive environment
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Replay pool for real-environment transitions
    env_pool = ReplayMemory(args.replay_size)

    # Replay pool for model rollouts
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler for the real environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool)
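# Worked example of the model-pool sizing used in each main() above, with
# illustrative MBPO-style values (not necessarily this repository's defaults):
# rollout_batch_size=100000, epoch_length=1000 and model_train_freq=250 give
# 100000 * 1000 / 250 = 400000 rollout transitions per epoch, so with
# model_retain_epochs=1 the model pool holds 400000 transitions.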
def main(args=None):
    if args is None:
        args = readParser()

    save_model_dir = os.path.join(args.save_dir, args.env_name, 'dynamics_model')
    save_policy_dir = os.path.join(args.save_dir, args.env_name, 'policy_network')
    save_env_buffer_dir = os.path.join(args.save_dir, args.env_name, 'env_buffer')
    save_dynamics_buffer_dir = os.path.join(args.save_dir, args.env_name, 'dynamics_buffer')
    for directory in (save_model_dir, save_policy_dir, save_env_buffer_dir,
                      save_dynamics_buffer_dir):
        if not os.path.exists(directory):
            os.makedirs(directory)

    # Initialize environment
    if 'Ant' in args.env_name:
        args.env_name = new_env.register_mbpo_environments()[0]
        print('Loaded TruncatedObs-version of the Ant environment: {}'.format(
            args.env_name))
    env = gym.make(args.env_name)

    job_name = 'MBPO_test_policy_dependent_models_{}_{}_{}'.format(
        args.env_name, args.model_type, args.seed)
    writer = SummaryWriter(
        str(os.path.join(args.save_dir, 'tensorboard', job_name)))
    writer.add_text(
        'hyperparameters',
        "|param|value|\n|-|-|\n%s" % ('\n'.join(
            [f"|{key}|{value}|" for key, value in vars(args).items()])))

    # Set random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initialize agent
    agent = SAC(env.observation_space.shape[0], env.action_space, args)

    # Initialize ensemble dynamics model
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks, args.num_elites,
                                          state_size, action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size, act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)

    # Wrap the learned model as a predictive environment
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Replay pool for real-environment transitions
    env_pool = ReplayMemory(args.replay_size)

    # Replay pool for model rollouts
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)

    # Sampler for the real environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool, writer,
          save_model_dir, save_policy_dir, save_env_buffer_dir,
          save_dynamics_buffer_dir)

    print('Training complete!')
    print('---------------------------------------------------------------------')
    print('Start evaluating different policies at different model checkpoints...')
    print('---------------------------------------------------------------------')
    test_policy_dependent_models(args, env, state_size, action_size,
                                 args.save_model_freq,
                                 args.save_model_freq * 6, save_model_dir,
                                 save_policy_dir)
def test_policy_dependent_models(args, env, state_size, action_size,
                                 start_eval, end_eval, save_model_dir,
                                 save_policy_dir):
    save_freq = args.save_model_freq
    checkpoint_epochs = np.arange(start_eval, end_eval, save_freq)

    model_policy_return_dict = {}
    state_error_dict = {}
    reward_error_dict = {}

    # Load the input-scaler statistics saved at the end of training
    with open(
            os.path.join(
                args.save_dir,
                'scaler_mu_std_{}.pkl'.format(str(int(args.num_epoch - 1)))),
            'rb') as f:
        mean, std = pickle.load(f)

    for model_epoch in checkpoint_epochs:
        # Restore the dynamics-model checkpoint for this model_epoch
        dynamics_model_checkpoint = torch.load(
            str(
                os.path.join(
                    save_model_dir,
                    'EnsembleDynamicsModel_{}.pt'.format(model_epoch))))
        env_model = EnsembleDynamicsModel(args.num_networks, args.num_elites,
                                          state_size, action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
        env_model.ensemble_model.load_state_dict(
            dynamics_model_checkpoint['dynamics_model_state_dict'])
        env_model.scaler.mu = mean
        env_model.scaler.std = std
        print('dynamics_model_{} loaded'.format(model_epoch))

        predict_env = PredictEnv(env_model, args.env_name, args.model_type)
        predict_env_sampler = Predict_EnvSample(env, predict_env)

        for policy_epoch in checkpoint_epochs:
            # Restore the policy checkpoint matching this policy_epoch
            # (the flattened original mistakenly loaded model_epoch here)
            policy_network_checkpoint = torch.load(
                str(
                    os.path.join(save_policy_dir,
                                 'PolicyNetwork_{}.pt'.format(policy_epoch))))
            agent = SAC(env.observation_space.shape[0], env.action_space, args)
            agent.policy.load_state_dict(
                policy_network_checkpoint['policy_model_state_dict'])

            avg_episode_reward = []
            for i in range(args.num_eval_episode):
                predict_env_sampler.current_state = None
                sum_reward = 0
                done = False
                counter = 0
                state_error = []
                reward_error = []
                while not done and counter < args.epoch_length:
                    (cur_state, action, next_state, reward, done, info,
                     model_error) = predict_env_sampler.sample(
                         agent, eval_t=True, ret_true_reward=False)
                    sum_reward += reward
                    counter += 1
                    state_error.append(model_error[0])
                    reward_error.append(model_error[1])
                avg_episode_reward.append(sum_reward)

            mean_episode_reward = torch.mean(
                torch.tensor(avg_episode_reward) * 1.)
            std_episode_reward = torch.std(
                torch.tensor(avg_episode_reward) * 1.)
            model_policy_return_dict['model_{}_policy_{}'.format(
                model_epoch, policy_epoch)] = [
                    mean_episode_reward.item(),
                    std_episode_reward.item()
                ]
            state_error_dict['model_{}_policy_{}'.format(
                model_epoch, policy_epoch)] = state_error
            reward_error_dict['model_{}_policy_{}'.format(
                model_epoch, policy_epoch)] = reward_error
            print('model epoch: {} | policy epoch: {} | mean return: {:.3f} | '
                  'state error: {:.2f} | reward error: {:.2f} | '
                  'total steps: {} | Done'.format(model_epoch, policy_epoch,
                                                  mean_episode_reward,
                                                  np.mean(state_error),
                                                  np.mean(reward_error),
                                                  counter))

    # Save the evaluation results to disk
    with open(
            str(
                os.path.join(
                    args.save_dir, args.env_name,
                    'model_policy_return_dict_{}_{}_{}'.format(
                        start_eval, save_freq, end_eval))), 'w') as f:
        json.dump(model_policy_return_dict, f)
    with open(
            str(
                os.path.join(
                    args.save_dir, args.env_name,
                    'state_error_dict_{}_{}_{}.json'.format(
                        start_eval, save_freq, end_eval))), 'w') as f:
        json.dump(state_error_dict, f)
    with open(
            str(
                os.path.join(
                    args.save_dir, args.env_name,
                    'reward_error_dict_{}_{}_{}.json'.format(
                        start_eval, save_freq, end_eval))), 'w') as f:
        # Cast numpy values to plain floats so they are JSON-serializable
        json.dump(
            {
                k: np.array(v).astype(np.float64).tolist()
                for k, v in reward_error_dict.items()
            }, f)
def main(args=None):
    if args is None:
        args = readParser()

    # Initialize environment
    env = gym.make(args.env_name)

    # Set random seeds
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    env.seed(args.seed)

    # Initialize agent and restore a saved policy checkpoint
    agent = SAC(env.observation_space.shape[0], env.action_space, args)
    policy_network_checkpoint = torch.load(
        './test_policy_dependent_results_2/policy/PolicyNetwork_20.pt')
    agent.policy.load_state_dict(
        policy_network_checkpoint['policy_model_state_dict'])

    # Initialize ensemble dynamics model and restore a saved checkpoint
    state_size = np.prod(env.observation_space.shape)
    action_size = np.prod(env.action_space.shape)
    if args.model_type == 'pytorch':
        env_model = EnsembleDynamicsModel(args.num_networks, args.num_elites,
                                          state_size, action_size,
                                          args.reward_size,
                                          args.pred_hidden_size,
                                          use_decay=args.use_decay)
    else:
        env_model = construct_model(obs_dim=state_size, act_dim=action_size,
                                    hidden_dim=args.pred_hidden_size,
                                    num_networks=args.num_networks,
                                    num_elites=args.num_elites)
    dynamics_model_checkpoint = torch.load(
        './test_policy_dependent_results_2/dynamics_model/EnsembleDynamicsModel_20.pt'
    )
    env_model.ensemble_model.load_state_dict(
        dynamics_model_checkpoint['dynamics_model_state_dict'])

    # Wrap the learned model as a predictive environment
    predict_env = PredictEnv(env_model, args.env_name, args.model_type)

    # Replay pool for real-environment transitions, restored from disk
    env_pool = ReplayMemory(args.replay_size)
    env_pool.load(
        './test_policy_dependent_results_2/env_buffer/env_buffer_20.pkl')
    env_pool.position = len(env_pool.buffer)

    # Replay pool for model rollouts, restored from disk
    rollouts_per_epoch = args.rollout_batch_size * args.epoch_length / args.model_train_freq
    model_steps_per_epoch = int(1 * rollouts_per_epoch)
    new_pool_size = args.model_retain_epochs * model_steps_per_epoch
    model_pool = ReplayMemory(new_pool_size)
    model_pool.load(
        './test_policy_dependent_results_2/model_buffer/model_buffer_20.pkl')
    model_pool.position = len(model_pool.buffer)

    # Sampler for the real environment
    env_sampler = EnvSampler(env)

    train(args, env_sampler, predict_env, agent, env_pool, model_pool)
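# Standard entry-point guard, assuming each main() variant above lives in its
# own script file with a single main() definition.
if __name__ == '__main__':
    main()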