Example #1
def get_multinomial_action(state, action_space_size):
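    """
    Construct a categorical policy (an MLP with `action_space_size` outputs) on
    the fly and query it with `state`; the policy returns the sampled action
    together with its log-probability.
    """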
    if len(state.size()) != 2:
        policy = policies.CategoricalPolicy(MLP_factory(1, output_size=action_space_size))
    else:
        policy = policies.CategoricalPolicy(MLP_factory(state.size()[1], output_size=action_space_size))
    action = policy(state)
    return action
Example #2
    def __init__(self,
                 environment_details,
                 seeds=None,
                 **algo_args):
        super().__init__(environment_details, seeds, **algo_args)
        self.env = interfaces.make_parallelized_gym_env(environment_details['env_name'], 0,
                                                        algo_args['cpu_count'])
        if algo_args['baseline'] == 'moving_average':
            self.baseline = MovingAverageBaseline(0.9)
        elif algo_args['baseline'] == 'neural_network':
            self.val_approximator = MLP_factory(self.env.observation_space_info['shape'][0],
                                                [16, 16],
                                                output_size=1,
                                                hidden_non_linearity=nn.ReLU)
            self.val_optimizer = torch.optim.SGD(self.val_approximator.parameters(), lr=algo_args['value_lr'])
            self.baseline = NeuralNetworkBaseline(self.val_approximator, self.val_optimizer, bootstrap=False)
        else:
            self.baseline = None

        fn_approximator, policy = experiment.setup_policy(self.env,
                                                          hidden_non_linearity=nn.ReLU,
                                                          hidden_sizes=[16, 16])

        self.fn_approximator = fn_approximator
        self.policy = policy
        self.optimizer = torch.optim.SGD(fn_approximator.parameters(), lr=algo_args['policy_lr'])

        self.algorithm = VanillaPolicyGradient(self.env,
                                               self.policy,
                                               self.optimizer,
                                               gamma=environment_details['gamma'],
                                               baseline=self.baseline)
Example #3
def setup_baseline(baseline_type, env=None):
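    """
    Return a baseline for variance reduction: a moving-average baseline, an
    MLP value-function baseline (uses `env` for the input size), or None.
    """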
    if baseline_type == 'moving_average':
        return MovingAverageBaseline(0.9)
    elif baseline_type == 'neural_network':
        val_approximator = MLP_factory(env.observation_space_info['shape'][0],
                                       [16, 16],
                                       output_size=1,
                                       hidden_non_linearity=nn.ReLU)
        val_optimizer = torch.optim.SGD(val_approximator.parameters(),
                                        lr=0.001)
        return NeuralNetworkBaseline(val_approximator,
                                     val_optimizer,
                                     bootstrap=False)
    else:
        return None
Example #4
def test_categorial_policy():
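    """
    A single 4-dimensional state should yield an integer action with a
    non-positive log-probability.
    """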
    fn_approximator = MLP_factory(input_size=4, output_size=3)
    policy = policies.CategoricalPolicy(fn_approximator)

    action, log_prob = policy(Variable(torch.randn(1, 4), volatile=True))
    assert type(action.data[0]) is int
    assert log_prob.data[0] <= np.log(1)
Example #5
def setup_policy(env, hidden_sizes=[16], hidden_non_linearity=None):
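    """
    Build a function approximator and policy suited to `env`: a Gaussian policy
    for continuous action spaces, a categorical policy for discrete ones.
    """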

    if env.observation_space_info['type'] == 'continuous':
        input_size = env.observation_space_info['shape'][0]
    elif env.observation_space_info['type'] == 'discrete':
        input_size = env.observation_space_info['possible_values']
    else:
        raise ValueError('Unknown observation space type {}!'.format(
            env.observation_space_info['type']))

    if env.action_space_info['type'] == 'continuous':
        output_size = env.action_space_info['shape'][0]
        approximator = MLP_factory_two_heads(
            input_size,
            hidden_sizes,
            output_size=output_size,
            hidden_non_linearity=hidden_non_linearity)
        policy = GaussianPolicy(approximator)

    elif env.action_space_info['type'] == 'discrete':
        output_size = env.action_space_info['possible_values']

        approximator = MLP_factory(input_size,
                                   hidden_sizes,
                                   output_size=output_size,
                                   hidden_non_linearity=hidden_non_linearity)
        policy = CategoricalPolicy(approximator)

    else:
        raise ValueError('Unknown action space type {}!'.format(
            env.action_space_info['type']))

    return approximator, policy
Example #6
def test_bernoulli_policy_batched():
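    """
    A batch of 10 states should yield 10 single-switch Bernoulli actions with
    non-positive log-probabilities.
    """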
    fn_approximator = MLP_factory(input_size=4, output_size=1)
    policy = policies.BernoulliPolicy(fn_approximator)

    action, log_prob = policy(Variable(torch.randn(10, 4), volatile=True))
    assert tuple(action.size()) == (10, 1)  # we must get back 10 actions.
    assert torch.sum(
        log_prob.data <= torch.log(torch.ones_like(log_prob.data)))
    assert sum([action.data[i, 0] in [0, 1] for i in range(action.size()[0])
                ]), 'Actions are not between 0 and 1'
Example #7
def test_categorial_policy_batched():
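    """
    A batch of 10 states should yield 10 integer categorical actions with
    non-positive log-probabilities.
    """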
    fn_approximator = MLP_factory(input_size=4, output_size=3)
    policy = policies.CategoricalPolicy(fn_approximator)

    action, log_prob = policy(Variable(torch.randn(10, 4), volatile=True))

    assert tuple(action.size()) == (10, )  # we must get back 10 actions.
    assert sum([type(action.data[i]) is int for i in range(action.size()[0])])
    assert torch.sum(
        log_prob.data <= torch.log(torch.ones_like(log_prob.data)))
Example #8
def test_multi_bernoulli_policy_batched():
    """
    Simulates when each action consists of 5 bernoulli choices.
    """
    fn_approximator = MLP_factory(input_size=4, output_size=5)
    policy = policies.BernoulliPolicy(fn_approximator)

    action, log_prob = policy(Variable(torch.randn(10, 4), volatile=True))

    assert tuple(action.size()) == (
        10, 5)  # we must get back 10 actions with 5 switches.
    assert torch.sum(
        log_prob.data <= torch.log(torch.ones_like(log_prob.data)))
    assert sum([action.data[i, 0] in [0, 1] for i in range(action.size()[0])
                ]), 'Actions are not between 0 and 1'
Example #9
class REINFORCEAlgorithm(AlgorithmWrapper):
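    """
    REINFORCE / vanilla policy gradient wrapper: builds the parallelized gym
    environment, an optional baseline, the policy and its SGD optimizer.
    """
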
    def __init__(self,
                 environment_details,
                 seeds=None,
                 **algo_args):
        super().__init__(environment_details, seeds, **algo_args)
        self.env = interfaces.make_parallelized_gym_env(environment_details['env_name'], 0,
                                                        algo_args['cpu_count'])
        if algo_args['baseline'] == 'moving_average':
            self.baseline = MovingAverageBaseline(0.9)
        elif algo_args['baseline'] == 'neural_network':
            self.val_approximator = MLP_factory(self.env.observation_space_info['shape'][0],
                                                [16, 16],
                                                output_size=1,
                                                hidden_non_linearity=nn.ReLU)
            self.val_optimizer = torch.optim.SGD(self.val_approximator.parameters(), lr=algo_args['value_lr'])
            self.baseline = NeuralNetworkBaseline(self.val_approximator, self.val_optimizer, bootstrap=False)
        else:
            self.baseline = None

        fn_approximator, policy = experiment.setup_policy(self.env,
                                                          hidden_non_linearity=nn.ReLU,
                                                          hidden_sizes=[16, 16])

        self.fn_approximator = fn_approximator
        self.policy = policy
        self.optimizer = torch.optim.SGD(fn_approximator.parameters(), lr=algo_args['policy_lr'])

        self.algorithm = VanillaPolicyGradient(self.env,
                                               self.policy,
                                               self.optimizer,
                                               gamma=environment_details['gamma'],
                                               baseline=self.baseline)

    def act(self, state):
        # torch 0.3-style Variable/volatile API
        # TODO: upgrade here
        state = Variable(self.env.observation_processor.gym2pytorch(state), volatile=True)
        action, _ = self.policy(state)
        return self.env.action_processor.pytorch2gym(action.data)


    def train_step(self):
        self.algorithm.run(1, verbose=False)
Example #10
folder_name = args.env_name
algorithm_name = 'VPG' if args.name == '' else 'VPG_' + args.name

experiment_logger = experiment.Experiment({'algorithm_name': algorithm_name},
                                          os.path.join('./', folder_name))
experiment_logger.start()

hidden_sizes = [16] * args.n_hidden_layers

for replicate in range(args.n_replicates):

    if args.baseline == 'moving_average':
        baseline = MovingAverageBaseline(0.9)
    elif args.baseline == 'neural_network':
        val_approximator = MLP_factory(env.observation_space_info['shape'][0],
                                       [16, 16],
                                       output_size=1,
                                       hidden_non_linearity=nn.ReLU)
        val_optimizer = torch.optim.SGD(val_approximator.parameters(),
                                        lr=args.value_lr)
        baseline = NeuralNetworkBaseline(val_approximator,
                                         val_optimizer,
                                         bootstrap=False)
    else:
        baseline = None

    fn_approximator, policy = experiment.setup_policy(
        env, hidden_non_linearity=nn.ReLU, hidden_sizes=[16, 16])

    optimizer = torch.optim.SGD(fn_approximator.parameters(),
                                lr=args.policy_lr)
def get_bernoulli_action(state, *args):
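    """
    Construct a single-output Bernoulli policy on the fly and query it with the
    (batched, 2-D) `state`; extra positional arguments are ignored.
    """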
    policy = policies.BernoulliPolicy(
        MLP_factory(state.size()[1], output_size=1))
    action = policy(state)
    return action
def run_algorithm(MODE, alpha, baseline):
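    """
    Train on MountainCar-v0 with either vanilla ('vpg') or natural ('npg')
    policy gradient updates, using step size `alpha` and the given baseline
    type, and return the per-episode mean accumulated reward.
    """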
    gamma = 0.99
    n_episodes = 500
    logging.info('Mode: %s', MODE)

    objective = PolicyGradientObjective()
    # env = parallelized_gym.SubprocVecEnv([lambda : PointEnv() for _ in range(5)])
    # dist_get = get_distribution_gaussian

    env_name = 'MountainCar-v0'
    # env_name = 'CartPole-v0'
    env = interfaces.make_parallelized_gym_env(env_name,
                                               seed=int(time.time()),
                                               n_workers=BATCH_SIZE)
    dist_get = get_distribution_discrete

    fn_approximator, policy = experiment.setup_policy(
        env, hidden_non_linearity=torch.nn.ReLU, hidden_sizes=[16, 16])
    if baseline == 'moving_average':
        baseline = MovingAverageBaseline(0.99)
    elif baseline == 'function_approximator':
        input_size = env.observation_space_info['shape'][0]
        value_function = MLP_factory(input_size,
                                     hidden_sizes=[16, 16],
                                     output_size=1,
                                     hidden_non_linearity=torch.nn.ReLU)
        optimizer = torch.optim.RMSprop(value_function.parameters(), lr=0.001)
        baseline = FunctionApproximatorBaseline(value_function, optimizer)
    else:
        raise ValueError('Unknown baseline.')

    accum_rewards = []
    for i in range(n_episodes):
        trajectories = obtain_trajectories(env,
                                           policy,
                                           200,
                                           reset=True,
                                           value_function=baseline)
        trajectories.torchify()
        returns = gradients.calculate_returns(trajectories.rewards, gamma,
                                              trajectories.masks)
        advantages = returns - trajectories.values
        baseline_loss = baseline.update_baseline(trajectories, returns)
        loss = objective(advantages, trajectories)
        policy.zero_grad()
        vpg_grad = torch.autograd.grad(loss,
                                       policy.parameters(),
                                       create_graph=True)
        vpg_grad = parameters_to_vector(vpg_grad).detach().numpy()

        curr_params = parameters_to_vector(policy.parameters())
        if MODE == 'npg':
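            # Natural gradient step: approximately solve F x = g with conjugate
            # gradients, where g is the vanilla policy gradient and F-vector
            # products come from compute_hessian_vector_product, then move by
            # eff_alpha along the resulting direction.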
            #   print('vpg_grad',vpg_grad)
            # Last state is just the state after getting our done so we leave it out.
            states_to_process = trajectories.states[:-1]
            traj_len, batch_size, state_shape = states_to_process.size()
            states_to_process = states_to_process.view(traj_len * batch_size,
                                                       state_shape)

            def hvp_fn(vector):
                return compute_hessian_vector_product(policy,
                                                      states_to_process,
                                                      vector, dist_get)

            npg_grad = conjugate_gradient_algorithm(
                hvp_fn,
                vpg_grad,
                # x_0=vpg_grad.copy(),
                cg_iters=CG_ITERS)

            # if alpha is not None:
            #     n_step_size = (alpha ** 2) * np.dot(vpg_grad.T, npg_grad)
            # else:
            #     n_step_size = self.n_step_size
            eff_alpha = np.sqrt(
                np.abs(alpha / (np.dot(vpg_grad.T, npg_grad) + 1e-20)))
            if np.allclose(npg_grad, vpg_grad):
                raise ValueError('No change in npg, vpg')
            new_params = curr_params - eff_alpha * torch.from_numpy(npg_grad)
            accum_rewards_npg = accum_rewards
        elif MODE == 'vpg':
            new_params = curr_params - alpha * torch.from_numpy(vpg_grad)
            accum_rewards_vpg = accum_rewards
        else:
            raise ValueError('Unknown algorithm')
        vector_to_parameters(new_params, policy.parameters())
        reward_summary = torch.sum(trajectories.rewards *
                                   trajectories.masks.float(),
                                   dim=0)
        #   print(reward_summary.mean())
        accum_rewards.append(reward_summary.mean())

    return accum_rewards