def get_multinomial_action(state, action_space_size):
    if len(state.size()) != 2:
        policy = policies.CategoricalPolicy(
            MLP_factory(1, output_size=action_space_size))
    else:
        policy = policies.CategoricalPolicy(
            MLP_factory(state.size()[1], output_size=action_space_size))
    action = policy(state)
    return action
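# Minimal usage sketch (not part of the original source): assumes `policies`,
# `MLP_factory`, `torch`, and the torch-0.3 `Variable` API used in the tests
# below are already imported; shapes and the action-space size are illustrative.
def _demo_get_multinomial_action():
    state = Variable(torch.randn(10, 4), volatile=True)  # batch of 10 states, 4 features each
    action, log_prob = get_multinomial_action(state, action_space_size=3)
    assert tuple(action.size()) == (10,)  # one sampled discrete action per state
    return action, log_prob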
def setup_baseline(baseline_type, env=None):
    if baseline_type == 'moving_average':
        return MovingAverageBaseline(0.9)
    elif baseline_type == 'neural_network':
        val_approximator = MLP_factory(env.observation_space_info['shape'][0],
                                       [16, 16],
                                       output_size=1,
                                       hidden_non_linearity=nn.ReLU)
        val_optimizer = torch.optim.SGD(val_approximator.parameters(), lr=0.001)
        return NeuralNetworkBaseline(val_approximator, val_optimizer,
                                     bootstrap=False)
    else:
        return None
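# Minimal usage sketch (assumption, not from the original source): `env` is
# expected to expose `observation_space_info` as in setup_policy below; only
# the 'neural_network' branch needs it, any unrecognised type falls back to None.
def _demo_setup_baseline(env):
    baseline = setup_baseline('neural_network', env=env)
    assert setup_baseline('no_baseline') is None
    return baseline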
def test_categorial_policy():
    fn_approximator = MLP_factory(input_size=4, output_size=3)
    policy = policies.CategoricalPolicy(fn_approximator)
    action, log_prob = policy(Variable(torch.randn(1, 4), volatile=True))
    assert type(action.data[0]) is int
    assert log_prob.data[0] <= np.log(1)
def setup_policy(env, hidden_sizes=[16], hidden_non_linearity=None):
    if env.observation_space_info['type'] == 'continuous':
        input_size = env.observation_space_info['shape'][0]
    elif env.observation_space_info['type'] == 'discrete':
        input_size = env.observation_space_info['possible_values']
    else:
        raise ValueError('Unknown observation space type {}!'.format(
            env.observation_space_info['type']))

    if env.action_space_info['type'] == 'continuous':
        output_size = env.action_space_info['shape'][0]
        approximator = MLP_factory_two_heads(
            input_size,
            hidden_sizes,
            output_size=output_size,
            hidden_non_linearity=hidden_non_linearity)
        policy = GaussianPolicy(approximator)
    elif env.action_space_info['type'] == 'discrete':
        output_size = env.action_space_info['possible_values']
        approximator = MLP_factory(input_size,
                                   hidden_sizes,
                                   output_size=output_size,
                                   hidden_non_linearity=hidden_non_linearity)
        policy = CategoricalPolicy(approximator)
    else:
        raise ValueError('Unknown action space type {}!'.format(
            env.action_space_info['type']))
    return approximator, policy
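# Minimal usage sketch (assumption, not from the original source): builds a
# categorical policy for a discrete-action gym environment via the interfaces
# wrapper used elsewhere in this file; env name, seed, worker count, and the
# learning rate are illustrative values only.
def _demo_setup_policy():
    env = interfaces.make_parallelized_gym_env('CartPole-v0', seed=0, n_workers=2)
    fn_approximator, policy = setup_policy(env,
                                           hidden_sizes=[16, 16],
                                           hidden_non_linearity=nn.ReLU)
    optimizer = torch.optim.SGD(fn_approximator.parameters(), lr=0.01)
    return policy, optimizer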
def test_bernoulli_policy_batched():
    fn_approximator = MLP_factory(input_size=4, output_size=1)
    policy = policies.BernoulliPolicy(fn_approximator)
    action, log_prob = policy(Variable(torch.randn(10, 4), volatile=True))
    assert tuple(action.size()) == (10, 1)  # we must get back 10 actions.
    assert torch.sum(
        log_prob.data <= torch.log(torch.ones_like(log_prob.data)))
    assert all(action.data[i, 0] in [0, 1]
               for i in range(action.size()[0])), 'Actions are not 0 or 1'
def test_categorial_policy_batched():
    fn_approximator = MLP_factory(input_size=4, output_size=3)
    policy = policies.CategoricalPolicy(fn_approximator)
    action, log_prob = policy(Variable(torch.randn(10, 4), volatile=True))
    assert tuple(action.size()) == (10, )  # we must get back 10 actions.
    assert all(type(action.data[i]) is int for i in range(action.size()[0]))
    assert torch.sum(
        log_prob.data <= torch.log(torch.ones_like(log_prob.data)))
def test_multi_bernoulli_policy_batched():
    """
    Simulates when each action consists of 5 bernoulli choices.
    """
    fn_approximator = MLP_factory(input_size=4, output_size=5)
    policy = policies.BernoulliPolicy(fn_approximator)
    action, log_prob = policy(Variable(torch.randn(10, 4), volatile=True))
    assert tuple(action.size()) == (10, 5)  # we must get back 10 actions with 5 switches.
    assert torch.sum(
        log_prob.data <= torch.log(torch.ones_like(log_prob.data)))
    assert all(action.data[i, 0] in [0, 1]
               for i in range(action.size()[0])), 'Actions are not 0 or 1'
class REINFORCEAlgorithm(AlgorithmWrapper):
    def __init__(self, environment_details, seeds=None, **algo_args):
        super().__init__(environment_details, seeds, **algo_args)
        self.env = interfaces.make_parallelized_gym_env(
            environment_details['env_name'], 0, algo_args['cpu_count'])

        if algo_args['baseline'] == 'moving_average':
            self.baseline = MovingAverageBaseline(0.9)
        elif algo_args['baseline'] == 'neural_network':
            self.val_approximator = MLP_factory(
                self.env.observation_space_info['shape'][0], [16, 16],
                output_size=1,
                hidden_non_linearity=nn.ReLU)
            self.val_optimizer = torch.optim.SGD(
                self.val_approximator.parameters(), lr=algo_args['value_lr'])
            self.baseline = NeuralNetworkBaseline(self.val_approximator,
                                                  self.val_optimizer,
                                                  bootstrap=False)
        else:
            self.baseline = None

        fn_approximator, policy = experiment.setup_policy(
            self.env, hidden_non_linearity=nn.ReLU, hidden_sizes=[16, 16])
        self.fn_approximator = fn_approximator
        self.policy = policy
        self.optimizer = torch.optim.SGD(fn_approximator.parameters(),
                                         lr=algo_args['policy_lr'])
        self.algorithm = VanillaPolicyGradient(self.env,
                                               self.policy,
                                               self.optimizer,
                                               gamma=environment_details['gamma'],
                                               baseline=self.baseline)

    def act(self, state):
        # torch 0.3 sign...
        # TODO: upgrade here
        state = Variable(self.env.observation_processor.gym2pytorch(state),
                         volatile=True)
        action, _ = self.policy(state)
        return self.env.action_processor.pytorch2gym(action.data)

    def train_step(self):
        self.algorithm.run(1, verbose=False)
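# Minimal usage sketch (assumption, not from the original source): the keys of
# environment_details and algo_args are inferred from __init__ above; the env
# name, learning rates, and cpu_count are illustrative values only.
def _demo_reinforce_wrapper():
    environment_details = {'env_name': 'CartPole-v0', 'gamma': 0.99}
    algo = REINFORCEAlgorithm(environment_details,
                              seeds=None,
                              cpu_count=2,
                              baseline='moving_average',
                              policy_lr=0.01,
                              value_lr=0.001)
    for _ in range(10):
        algo.train_step()  # one VanillaPolicyGradient update per call
    return algo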
folder_name = args.env_name
algorithm_name = 'VPG' if args.name == '' else 'VPG_' + args.name
experiment_logger = experiment.Experiment({'algorithm_name': algorithm_name},
                                          os.path.join('./', folder_name))
experiment_logger.start()

hidden_sizes = [16] * args.n_hidden_layers

for replicate in range(args.n_replicates):
    if args.baseline == 'moving_average':
        baseline = MovingAverageBaseline(0.9)
    elif args.baseline == 'neural_network':
        val_approximator = MLP_factory(env.observation_space_info['shape'][0],
                                       [16, 16],
                                       output_size=1,
                                       hidden_non_linearity=nn.ReLU)
        val_optimizer = torch.optim.SGD(val_approximator.parameters(),
                                        lr=args.value_lr)
        baseline = NeuralNetworkBaseline(val_approximator, val_optimizer,
                                         bootstrap=False)
    else:
        baseline = None

    fn_approximator, policy = experiment.setup_policy(
        env, hidden_non_linearity=nn.ReLU, hidden_sizes=hidden_sizes)
    optimizer = torch.optim.SGD(fn_approximator.parameters(),
                                lr=args.policy_lr)
def get_bernoulli_action(state, *args):
    policy = policies.BernoulliPolicy(
        MLP_factory(state.size()[1], output_size=1))
    action = policy(state)
    return action
def run_algorithm(MODE, alpha, baseline):
    gamma = 0.99
    n_episodes = 500
    logging.info('Mode: %s', MODE)
    objective = PolicyGradientObjective()
    # env = parallelized_gym.SubprocVecEnv([lambda: PointEnv() for _ in range(5)])
    # dist_get = get_distribution_gaussian
    env_name = 'MountainCar-v0'
    # env_name = 'CartPole-v0'
    env = interfaces.make_parallelized_gym_env(env_name,
                                               seed=int(time.time()),
                                               n_workers=BATCH_SIZE)
    dist_get = get_distribution_discrete
    fn_approximator, policy = experiment.setup_policy(
        env, hidden_non_linearity=torch.nn.ReLU, hidden_sizes=[16, 16])

    if baseline == 'moving_average':
        baseline = MovingAverageBaseline(0.99)
    elif baseline == 'function_approximator':
        input_size = env.observation_space_info['shape'][0]
        value_function = MLP_factory(input_size,
                                     hidden_sizes=[16, 16],
                                     output_size=1,
                                     hidden_non_linearity=torch.nn.ReLU)
        optimizer = torch.optim.RMSprop(value_function.parameters(), lr=0.001)
        baseline = FunctionApproximatorBaseline(value_function, optimizer)
    else:
        raise ValueError('Unknown baseline.')

    accum_rewards = []
    for i in range(n_episodes):
        trajectories = obtain_trajectories(env,
                                           policy,
                                           200,
                                           reset=True,
                                           value_function=baseline)
        trajectories.torchify()
        returns = gradients.calculate_returns(trajectories.rewards, gamma,
                                              trajectories.masks)
        advantages = returns - trajectories.values
        baseline_loss = baseline.update_baseline(trajectories, returns)
        loss = objective(advantages, trajectories)
        policy.zero_grad()
        vpg_grad = torch.autograd.grad(loss, policy.parameters(),
                                       create_graph=True)
        vpg_grad = parameters_to_vector(vpg_grad).detach().numpy()
        curr_params = parameters_to_vector(policy.parameters())

        if MODE == 'npg':
            # print('vpg_grad', vpg_grad)
            # Last state is just the state after getting our done, so we leave it out.
            states_to_process = trajectories.states[:-1]
            traj_len, batch_size, state_shape = states_to_process.size()
            states_to_process = states_to_process.view(traj_len * batch_size,
                                                       state_shape)

            def hvp_fn(vector):
                return compute_hessian_vector_product(policy,
                                                      states_to_process,
                                                      vector, dist_get)

            npg_grad = conjugate_gradient_algorithm(
                hvp_fn,
                vpg_grad,
                # x_0=vpg_grad.copy(),
                cg_iters=CG_ITERS)
            # if alpha is not None:
            #     n_step_size = (alpha ** 2) * np.dot(vpg_grad.T, npg_grad)
            # else:
            #     n_step_size = self.n_step_size
            eff_alpha = np.sqrt(
                np.abs(alpha / (np.dot(vpg_grad.T, npg_grad) + 1e-20)))

            if np.allclose(npg_grad, vpg_grad):
                raise ValueError('No change in npg, vpg')

            new_params = curr_params - eff_alpha * torch.from_numpy(npg_grad)
            accum_rewards_npg = accum_rewards
        elif MODE == 'vpg':
            new_params = curr_params - alpha * torch.from_numpy(vpg_grad)
            accum_rewards_vpg = accum_rewards
        else:
            raise ValueError('Unknown algorithm')

        vector_to_parameters(new_params, policy.parameters())
        reward_summary = torch.sum(trajectories.rewards *
                                   trajectories.masks.float(), dim=0)
        # print(reward_summary.mean())
        accum_rewards.append(reward_summary.mean())
    return accum_rewards
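# Minimal usage sketch (assumption, not from the original source): relies on the
# module-level BATCH_SIZE and CG_ITERS constants that run_algorithm references;
# the step sizes below are illustrative, not tuned values from the experiments.
def _demo_run_algorithm():
    vpg_rewards = run_algorithm('vpg', alpha=0.01, baseline='moving_average')
    npg_rewards = run_algorithm('npg', alpha=0.05, baseline='function_approximator')
    # Each call returns a list of per-iteration mean episode rewards.
    return vpg_rewards, npg_rewards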