Example #1
def main(env='Pendulum-v0'):
    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)
    replay = ch.ExperienceReplay()

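    # Build and wrap the Gym environment: Torch converts observations and
    # actions to/from tensors, Logger tracks episode statistics, and Runner
    # provides the env.run() rollout interface used below.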
    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(agent, episodes=1)

        if len(replay) >= BATCH_SIZE:
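            # Compute update targets over the whole rollout without gradients:
            # GAE advantages (bootstrapped with a zero terminal value),
            # discounted returns for the critic, and the rollout policy's
            # log-probabilities for the PPO ratio.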
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT,
                                      replay.reward(),
                                      replay.done())
                old_log_probs = replay.log_prob()

            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(replay.state())
                    masses = infos['mass']
                    new_values = infos['value'].view(-1, 1)
                    new_log_probs = masses.log_prob(replay.action())

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values,
                                                                returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()

            replay.empty()
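
The snippets on this page omit their imports, hyperparameters, and the ActorCritic model, which is defined elsewhere in each example's source. A minimal sketch of what Example #1 would need is shown below; the constant values are illustrative placeholders, not the original settings.

import gym
import torch
from torch import optim

import cherry as ch
import cherry.envs as envs
import cherry.pg as pg
import cherry.td as td

SEED = 42
HIDDEN_SIZE = 128
LEARNING_RATE = 3e-4
BATCH_SIZE = 2048
MAX_STEPS = 10000
DISCOUNT = 0.99
TRACE_DECAY = 0.97
PPO_EPOCHS = 10
PPO_CLIP_RATIO = 0.2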
Example #2
    def test_discount(self):
        vector = th.randn(VECTOR_SIZE)
        for i in range(4):
            self.replay.append(vector, vector, 8.0, vector, False)
        self.replay.append(vector, vector, 8.0, vector, True)
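        # The expected values follow from GAMMA = 0.5, accumulating backwards
        # from the terminal step: 8, 8 + 0.5*8 = 12, 8 + 0.5*12 = 14,
        # 8 + 0.5*14 = 15, 8 + 0.5*15 = 15.5.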
        discounted = discount(GAMMA,
                              self.replay.reward(),
                              self.replay.done(),
                              bootstrap=0)
        ref = th.Tensor([15.5, 15.0, 14.0, 12.0, 8.0]).view(-1, 1)
        self.assertTrue(close(discounted, ref))

        # Test overlapping episodes with bootstrap
        overlap = self.replay[2:] + self.replay[:3]
        overlap_discounted = discount(GAMMA,
                                      overlap.reward(),
                                      overlap.done(),
                                      bootstrap=discounted[3])
        ref = th.cat((discounted[2:], discounted[:3]), dim=0)
        self.assertTrue(close(overlap_discounted, ref))
Example #3
def update(replay, optimizer):
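    # REINFORCE-style update with a learned state-value baseline: each policy
    # term is weighted by (return - value), and the critic is regressed onto
    # the normalized discounted returns.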
    policy_loss = []
    value_loss = []
    rewards = discount(GAMMA, replay.reward(), replay.done())
    rewards = ch.normalize(rewards)
    for sars, reward in zip(replay, rewards):
        log_prob = sars.log_prob
        value = sars.value
        policy_loss.append(-log_prob * (reward - value.item()))
        value_loss.append(F.mse_loss(value, reward.detach()))
    optimizer.zero_grad()
    loss = th.stack(policy_loss).sum() + V_WEIGHT * th.stack(value_loss).sum()
    loss.backward()
    optimizer.step()
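
A minimal usage sketch for the update above, assuming env, get_action, optimizer, GAMMA, V_WEIGHT, and MAX_STEPS are set up as in the surrounding examples:

replay = ch.ExperienceReplay()
for step in range(1, MAX_STEPS + 1):
    replay += env.run(get_action, episodes=1)  # collect one on-policy episode
    update(replay, optimizer)                  # one combined policy/value step
    replay.empty()                             # on-policy data is discarded after use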
Example #4
def main(env='Pendulum-v0'):
    agent = ActorCritic(HIDDEN_SIZE).to(device)
    agent.apply(weights_init)

    actor_optimizer = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimizer = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)
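    # The StepLR schedulers below halve both learning rates every 2000
    # scheduler steps; each is stepped once per PPO update at the end of
    # the loop.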
    actor_scheduler = torch.optim.lr_scheduler.StepLR(actor_optimizer,
                                                      step_size=2000,
                                                      gamma=0.5)
    critic_scheduler = torch.optim.lr_scheduler.StepLR(critic_optimizer,
                                                       step_size=2000,
                                                       gamma=0.5)
    replay = ch.ExperienceReplay()

    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)

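    # Runner calls get_action with each state; the wrapper only moves the
    # state onto the training device before the agent's forward pass.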
    def get_action(state):
        return agent(state.to(device))

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)

        if len(replay) >= BATCH_SIZE:
            #batch = replay.sample(BATCH_SIZE).to(device)
            batch = replay.to(device)
            with torch.no_grad():
                advantages = pg.generalized_advantage(
                    DISCOUNT, TRACE_DECAY, batch.reward(), batch.done(),
                    batch.value(),
                    torch.zeros(1).to(device))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, batch.reward(), batch.done())
                old_log_probs = batch.log_prob()

            new_values = batch.value()
            new_log_probs = batch.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(batch.state())
                    masses = infos['mass']
                    new_values = infos['value'].view(-1, 1)
                    new_log_probs = masses.log_prob(batch.action())

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(
                    new_log_probs,
                    old_log_probs,
                    advantages,
                    clip=PPO_CLIP_RATIO)
                actor_optimizer.zero_grad()
                policy_loss.backward()
                #nn.utils.clip_grad_norm_(agent.actor.parameters(), 1.0)
                actor_optimizer.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(
                    new_values, returns)
                critic_optimizer.zero_grad()
                value_loss.backward()
                #nn.utils.clip_grad_norm_(agent.critic.parameters(), 1.0)
                critic_optimizer.step()

            actor_scheduler.step()
            critic_scheduler.step()

            replay.empty()
Example #5
def main():
    order_book_id_number = 10
    toy_data = create_toy_data(order_book_ids_number=order_book_id_number,
                               feature_number=20,
                               start="2019-05-01",
                               end="2019-12-12",
                               frequency="D")
    env = PortfolioTradingGym(data_df=toy_data,
                              sequence_window=5,
                              add_cash=True)
    env = Numpy(env)
    env = ch.envs.Logger(env, interval=1000)
    env = ch.envs.Torch(env)
    env = ch.envs.Runner(env)

    # create net
    action_size = env.action_space.shape[0]
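    # Observations are shaped (asset, sequence window, feature); only the
    # per-step feature count is used as the network's input size.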
    number_asset, seq_window, features_number = env.observation_space.shape
    input_size = features_number

    agent = ActorCritic(input_size=input_size,
                        hidden_size=HIDDEN_SIZE,
                        action_size=action_size)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    replay = ch.ExperienceReplay()

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(agent, episodes=1)

        if len(replay) >= BATCH_SIZE:
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT, TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, replay.reward(), replay.done())
                old_log_probs = replay.log_prob()

            # Initialised from the rollout for readability; recomputed after the first epoch
            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(replay.state())
                    masses = infos['mass']
                    new_values = infos['value']
                    new_log_probs = masses.log_prob(
                        replay.action()).unsqueeze(-1)

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(
                    new_log_probs,
                    old_log_probs,
                    advantages,
                    clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(
                    new_values, returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()

            replay.empty()
Example #6
def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

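    # Sample an action from the policy distribution and return its
    # log-probability and the critic's value estimate; Runner stores this
    # info dict in the replay, which is what replay.log_prob() and
    # replay.value() read back during the update.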
    def get_action(state):
        mass, value = agent(state)
        action = mass.sample()
        log_prob = mass.log_prob(action)
        return action, {
            'log_prob': log_prob,
            'value': value,
        }

    result = {
        'rewards': [],
        'policy_losses': [],
        'value_losses': [],
        'weights': [],
    }

    for step in range(1, CHERRY_MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)

        if len(replay) >= BATCH_SIZE:
            for r in replay.reward():
                result['rewards'].append(r.item())
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT, TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, replay.reward(), replay.done())
                old_log_probs = replay.log_prob()

            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    masses, new_values = agent(replay.state())
                    new_log_probs = masses.log_prob(replay.action())
                    new_values = new_values.view(-1, 1)

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(
                    new_log_probs,
                    old_log_probs,
                    advantages,
                    clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()
                result['policy_losses'].append(policy_loss.item())

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(
                    new_values, returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()
                result['value_losses'].append(value_loss.item())
            replay.empty()

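    # Keep the final parameters alongside the logged rewards and losses,
    # e.g. for comparison against a reference implementation.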
    result['weights'] = list(agent.parameters())
    return result