Example No. 1
    def test_vectorized_discount(self):
        state = th.randn(TIME_STEPS, NUM_ENVS, VECTOR_SIZE)
        action = th.randn(TIME_STEPS, NUM_ENVS)
        reward = th.randn(TIME_STEPS, NUM_ENVS)
        bootstrap = th.randn(NUM_ENVS)
        done = th.zeros_like(reward)
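        # Mark one environment as done at each of the last four timesteps.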
        for i in list(reversed(range(TIME_STEPS)))[:4]:
            done[i, i % NUM_ENVS] = 1

        # Computing the discounted rewards
        # as non-vectorized environment
        nonvec_discounted_rewards = []
        for i in range(NUM_ENVS):
            replay = ch.ExperienceReplay()
            for t in range(TIME_STEPS):
                replay.append(state[t, i, :], action[t, i], reward[t, i],
                              state[t, i, :], done[t, i])
            nonvec_discounted_rewards.append(
                ch.td.discount(GAMMA, replay.reward(), replay.done(),
                               bootstrap[i]))
        # Computing the discounted rewards
        # as vectorized environment
        replay = ch.ExperienceReplay()
        for t in range(TIME_STEPS):
            replay.append(state[t, :, :], action[t, :], reward[t, :],
                          state[t, :, :], done[t, :])
        vec_discounted_rewards = ch.td.discount(GAMMA, replay.reward(),
                                                replay.done(), bootstrap)

        for i in range(NUM_ENVS):
            assert th.all(
                nonvec_discounted_rewards[i][:, 0] == vec_discounted_rewards[:, i]
            )
Example No. 2
def main(env='Pendulum-v0'):
    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(agent, episodes=1)

        if len(replay) >= BATCH_SIZE:
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT,
                                                      TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, replay.reward(), replay.done())
                old_log_probs = replay.log_prob()

            new_values = replay.value()
            new_log_probs = replay.log_prob()
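            # For the first PPO epoch the cached rollout outputs are reused, so
            # new_log_probs == old_log_probs and the initial ratio is 1.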
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(replay.state())
                    masses = infos['mass']
                    new_values = infos['value'].view(-1, 1)
                    new_log_probs = masses.log_prob(replay.action())

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(new_log_probs,
                                                            old_log_probs,
                                                            advantages,
                                                            clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(new_values,
                                                                returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()

            replay.empty()
Example No. 3
def main(env):
    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.ActionLambda(env, convert_discrete_to_continuous_action)
    env = envs.Logger(env)
    env = envs.Runner(env)

    replay = ch.ExperienceReplay()
    agent = DQN(HIDDEN_SIZE, ACTION_DISCRETISATION)
    target_agent = create_target_network(agent)
    optimiser = optim.Adam(agent.parameters(), lr=LEARNING_RATE)

    def get_random_action(state):
        action = torch.tensor([[random.randint(0, ACTION_DISCRETISATION - 1)]])
        return action

    def get_action(state):
        # Original sampling (for unit test)
        #if random.random() < EPSILON:
        #  action = torch.tensor([[random.randint(0, ACTION_DISCRETISATION - 1)]])
        #else:
        #  action = agent(state)[1].argmax(dim=1, keepdim=True)
        #return action
        return agent(state)[0]

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)

            replay = replay[-REPLAY_SIZE:]

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            # Randomly sample a batch of experience
            batch = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(batch)

            # Compute targets
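            # y = reward + DISCOUNT * (1 - done) * max_a' Q_target(next_state, a')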
            target_values = target_agent(batch.next_state())[1].max(
                dim=1, keepdim=True)[0]
            target_values = batch.reward() + DISCOUNT * (
                1 - batch.done()) * target_values

            # Update Q-function by one step of gradient descent
            pred_values = agent(batch.state())[1].gather(1, batch.action())
            value_loss = F.mse_loss(pred_values, target_values)
            optimiser.zero_grad()
            value_loss.backward()
            optimiser.step()

        if step > UPDATE_START and step % TARGET_UPDATE_INTERVAL == 0:
            # Update target network
            target_agent = create_target_network(agent)
Example No. 4
def flatten_episodes(replay, episodes, num_workers):
    """
    TODO: This implementation is not efficient.

    NOTE: Additional info (other than a transition's default fields) is simply copied.
    To know from which worker the data was gathered, you can access sars.runner_id
    TODO: This is not great. What is the best behaviour with infos here?
    """
    flat_replay = ch.ExperienceReplay()
    worker_replays = [ch.ExperienceReplay() for w in range(num_workers)]
    flat_episodes = 0
    for sars in replay:
        state = sars.state.view(_min_size(sars.state))
        action = sars.action.view(_min_size(sars.action))
        reward = sars.reward.view(_min_size(sars.reward))
        next_state = sars.next_state.view(_min_size(sars.next_state))
        done = sars.done.view(_min_size(sars.done))
        fields = set(sars._Transition__fields) - {
            'state', 'action', 'reward', 'next_state', 'done'
        }
        infos = {f: getattr(sars, f) for f in fields}
        for worker in range(num_workers):
            # Populate infos per worker
            worker_infos = {'runner_id': worker}
            for key, value in infos.items():
                worker_infos[key] = value[worker]

            # The following attempts to split additional infos. (WIP. Remove?)
            # infos = {}
            # for f in fields:
            #     value = getattr(sars, f)
            #     if isinstance(value, Iterable) and len(value) == num_workers:
            #         value = value[worker]
            #     elif _istensorable(value):
            #         tvalue = ch.totensor(value)
            #         tvalue = tvalue.view(_min_size(tvalue))
            #         if tvalue.size(0) == num_workers:
            #             value = tvalue[worker]
            #     infos[f] = value
            worker_replays[worker].append(
                state[worker],
                action[worker],
                reward[worker],
                next_state[worker],
                done[worker],
                **worker_infos,
            )
            if bool(done[worker]):
                flat_replay += worker_replays[worker]
                worker_replays[worker] = ch.ExperienceReplay()
                flat_episodes += 1
            if flat_episodes >= episodes:
                break
        if flat_episodes >= episodes:
            break
    return flat_replay
Example No. 5
def main(env='Pendulum-v0'):
    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)

    actor = Actor(HIDDEN_SIZE, stochastic=False, layer_norm=True)
    critic = Critic(HIDDEN_SIZE, state_action=True, layer_norm=True)
    target_actor = create_target_network(actor)
    target_critic = create_target_network(critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(critic.parameters(), lr=LEARNING_RATE)
    replay = ch.ExperienceReplay()

    get_action = lambda s: (actor(s) + ACTION_NOISE * torch.randn(1, 1)).clamp(-1, 1)
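    # `get_random_action` is not defined in this excerpt; a minimal sketch for
    # Pendulum's single action in [-1, 1], matching the other Pendulum examples
    # in this collection:
    get_random_action = lambda state: torch.tensor([[2 * random.random() - 1]])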

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)

        replay = replay[-REPLAY_SIZE:]
        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            next_values = target_critic(batch.next_state(),
                                        target_actor(batch.next_state())).view(
                                            -1, 1)
            values = critic(batch.state(), batch.action()).view(-1, 1)
            value_loss = ch.algorithms.ddpg.state_value_loss(
                values, next_values.detach(), batch.reward(), batch.done(),
                DISCOUNT)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()

            # Update policy by one step of gradient ascent
            policy_loss = -critic(batch.state(), actor(batch.state())).mean()
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()

            # Update target networks
            ch.models.polyak_average(target_critic, critic, POLYAK_FACTOR)
            ch.models.polyak_average(target_actor, actor, POLYAK_FACTOR)
Example No. 6
    def test_save_and_load(self):
        old_replay = self.replay
        vector = np.random.rand(VECTOR_SIZE)
        for i in range(NUM_SAMPLES):
            old_replay.append(vector, vector, i, vector, False, vector=vector)
        # save the old file
        old_replay.save('testing_temp_file.pt')

        # load the saved file to a new file
        new_replay = ch.ExperienceReplay()
        new_replay.load('testing_temp_file.pt')

        # check size
        self.assertEqual(len(old_replay._storage), len(new_replay._storage))
        self.assertEqual(len(old_replay.state()), len(new_replay.state()))
        self.assertEqual(len(old_replay.action()), len(new_replay.action()))
        self.assertEqual(len(old_replay.reward()), len(new_replay.reward()))
        self.assertEqual(len(old_replay.next_state()),
                         len(new_replay.next_state()))
        self.assertEqual(len(old_replay.done()), len(new_replay.done()))
        self.assertEqual(len(old_replay.vector()), len(new_replay.vector()))

        # check content
        for a, b in zip(old_replay, new_replay):
            self.assertTrue(close(a.state, b.state))
            self.assertTrue(close(a.action, b.action))
            self.assertTrue(close(a.reward, b.reward))
            self.assertTrue(close(a.next_state, b.next_state))
            self.assertTrue(close(a.done, b.done))
            self.assertTrue(close(a.vector, b.vector))

        os.remove('testing_temp_file.pt')
Example No. 7
def flatten_episodes(replay, episodes, num_workers, extra_info=False):
    """
    NOTE: Additional info (other than a transition's default fields) is simply copied.
    To know from which worker the data was gathered, you can access sars.runner_id
    """
    flat_replay = ch.ExperienceReplay()
    worker_replays = [ch.ExperienceReplay() for w in range(num_workers)]
    flat_episodes = 0
    for sars in replay:
        state = sars.state.view(_min_size(sars.state))
        action = sars.action.view(_min_size(sars.action))
        reward = sars.reward.view(_min_size(sars.reward))
        next_state = sars.next_state.view(_min_size(sars.next_state))
        done = sars.done.view(_min_size(sars.done))
        fields = set(sars._Transition__fields) - {
            'state', 'action', 'reward', 'next_state', 'done'
        }
        infos = {f: getattr(sars, f) for f in fields}
        for worker in range(num_workers):
            # Populate infos per worker
            worker_infos = {'runner_id': worker}

            # This slightly slows down the runner,
            # e.g. from 1.15 to 1.25 seconds per iteration.
            if extra_info:
                for key, value in infos.items():
                    worker_infos[key] = value[worker]

            worker_replays[worker].append(
                state[worker],
                action[worker],
                reward[worker],
                next_state[worker],
                done[worker],
                **worker_infos,
            )
            if bool(done[worker]):
                flat_replay += worker_replays[worker]
                worker_replays[worker] = ch.ExperienceReplay()
                flat_episodes += 1
            if flat_episodes >= episodes:
                break
        if flat_episodes >= episodes:
            break
    return flat_replay
Example No. 8
    def test_append(self):
        new_replay = ch.ExperienceReplay()
        vector = np.random.rand(VECTOR_SIZE)
        for i in range(NUM_SAMPLES):
            self.replay.append(vector, vector, i, vector, False, vector=vector)
            new_replay.append(vector, vector, i, vector, False, vector=vector)
        self.assertEqual(len(self.replay), len(new_replay))
        new_replay = self.replay + new_replay
        self.assertEqual(NUM_SAMPLES * 2, len(new_replay))
        self.replay += new_replay
        self.assertEqual(NUM_SAMPLES * 3, len(self.replay))
Example No. 9
def main(env='HalfCheetahBulletEnv-v0'):
    random.seed(SEED)
    np.random.seed(SEED)
    th.manual_seed(SEED)
    env = gym.make(env)
    env = envs.VisdomLogger(env, interval=1000)
    env = envs.ActionSpaceScaler(env)
    env = envs.Torch(env)
    env = envs.Runner(env)
    env.seed(SEED)

    log_alpha = th.zeros(1, requires_grad=True)
    if USE_AUTOMATIC_ENTROPY_TUNING:
        # Heuristic target entropy
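        # (the common SAC default: minus the number of action dimensions)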
        target_entropy = -np.prod(env.action_space.shape).item()
    else:
        target_entropy = TARGET_ENTROPY

    state_size = env.state_size
    action_size = env.action_size

    policy = Policy(input_size=state_size, output_size=action_size)
    critic_qf1 = MLP(input_size=state_size + action_size, output_size=1)
    critic_qf2 = MLP(input_size=state_size + action_size, output_size=1)
    target_qf1 = copy.deepcopy(critic_qf1)
    target_qf2 = copy.deepcopy(critic_qf2)

    policy_opt = optim.Adam(policy.parameters(), lr=ALL_LR)
    qf1_opt = optim.Adam(critic_qf1.parameters(), lr=ALL_LR)
    qf2_opt = optim.Adam(critic_qf2.parameters(), lr=ALL_LR)
    alpha_opt = optim.Adam([log_alpha], lr=ALL_LR)

    replay = ch.ExperienceReplay()
    get_action = lambda state: policy(state).rsample()

    for step in range(TOTAL_STEPS):
        # Collect next step
        ep_replay = env.run(get_action, steps=1, render=RENDER)

        # Update policy
        replay += ep_replay
        replay = replay[-REPLAY_SIZE:]
        if len(replay) > MIN_REPLAY:
            update(env, replay, policy, critic_qf1, critic_qf2, target_qf1,
                   target_qf2, log_alpha, policy_opt, qf1_opt, qf2_opt,
                   alpha_opt, target_entropy)
Example No. 10
def main(num_steps=10000000,
         env_name='PongNoFrameskip-v4',
#         env_name='BreakoutNoFrameskip-v4',
         seed=42):
    th.set_num_threads(1)
    random.seed(seed)
    th.manual_seed(seed)
    np.random.seed(seed)

    env = gym.make(env_name)
    env = envs.Logger(env, interval=1000)
    env = envs.OpenAIAtari(env)
    env = envs.Torch(env)
    env = envs.Runner(env)
    env.seed(seed)

    dqn = DQN(env)
    target_dqn = copy.deepcopy(dqn)
    optimizer = optim.RMSprop(dqn.parameters(), lr=LR, alpha=0.95,
                              eps=0.01, centered=True)
    replay = ch.ExperienceReplay()
    epsilon = EPSILON
    get_action = lambda state: epsilon_greedy(dqn(state), epsilon)

    for step in range(num_steps // UPDATE_FREQ + 1):
        # Sample some transitions
        ep_replay = env.run(get_action, steps=UPDATE_FREQ)
        replay += ep_replay

        if step * UPDATE_FREQ < 1e6:
            # Update epsilon
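            # (linear annealing: epsilon decreases by ~0.99 over the first 1M steps)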
            epsilon -= 9.9e-7 * UPDATE_FREQ

        if step * UPDATE_FREQ > EXPLORATION_STEPS:
            # Only keep the last 1M transitions
            replay = replay[-REPLAY_SIZE:]

            # Update Q-function
            update(replay, optimizer, dqn, target_dqn, env=env)

            if step % TARGET_UPDATE_FREQ == 0:
                target_dqn.load_state_dict(dqn.state_dict())
Example No. 11
def run_trpo():
    ch.debug.debug()
    for i, env_name in enumerate(sweep.SWEEP):
        dm_env = bsuite.load_and_record_to_csv(env_name,
                                               results_dir=TRPO_RESULTS_PATH,
                                               overwrite=True)

        #  Instantiate the env and agent
        env = gym_wrapper.GymWrapper(dm_env)
        env = ch.envs.Torch(env)
        env = ch.envs.Runner(env)
        policy = Policy(env)
        baseline = LinearValue(env.state_size)

        #  Generate the results
        replay = ch.ExperienceReplay()
        for episode in tqdm(range(1, 1 + env.bsuite_num_episodes),
                            desc=env_name):
            replay += env.run(policy, episodes=1)
            if episode % 10 == 0:
                trpo_update(replay, policy, baseline)
                replay.empty()
Example No. 12
number_asset, seq_window, features_all = env.observation_space.shape
assert action_size == number_asset + 1
input_size = features_all - 1

net = ActorCritic(input_size=input_size,
                  hidden_size=50,
                  action_size=action_size)
net_tgt = ActorCritic(input_size=input_size,
                      hidden_size=50,
                      action_size=action_size)
net_tgt.eval()
print(net_tgt)
net_tgt.load_state_dict(net.state_dict())

# create replay
replay = ch.ExperienceReplay()

# create loss function
criterion_mse = nn.MSELoss()

# create optimizer
optimizer_actor = torch.optim.Adam(net.actor.parameters(), lr=0.001)
optimizer_critic = torch.optim.Adam(net.critic.parameters(), lr=0.001)


def update(replay):
    # batch-data
    state_batch = replay.state()
    next_state_batch = replay.next_state()
    action_batch = replay.action()
    reward_batch = replay.reward()
Example No. 13
def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    result = {
        'rewards': [],
        'plosses': [],
        'vlosses': [],
        'qlosses': [],
        'pweights': [],
        'vweights': [],
        'vweights_target': [],
        'qweights1': [],
        'qweights2': [],
    }

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    actor = SoftActor(HIDDEN_SIZE)
    critic_1 = Critic(HIDDEN_SIZE, state_action=True)
    critic_2 = Critic(HIDDEN_SIZE, state_action=True)
    value_critic = Critic(HIDDEN_SIZE)
    target_value_critic = create_target_network(value_critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critics_optimiser = optim.Adam(
        list(critic_1.parameters()) + list(critic_2.parameters()),
        lr=LEARNING_RATE)
    value_critic_optimiser = optim.Adam(value_critic.parameters(),
                                        lr=LEARNING_RATE)

    def get_random_action(state):
        return torch.tensor([[2 * random.random() - 1]])

    def get_action(state):
        return actor(state).sample()

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)
        replay = replay[-REPLAY_SIZE:]
        result['rewards'].append(replay.reward()[-1].item())

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            # Pre-compute some quantities
            masses = actor(batch.state())
            actions = masses.rsample()
            log_probs = masses.log_prob(actions)
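            # Clipped double-Q: take the pointwise minimum of the two critics.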
            q_values = torch.min(critic_1(batch.state(), actions.detach()),
                                 critic_2(batch.state(), actions.detach())).view(-1, 1)

            # Compute Q losses
            v_next = target_value_critic(batch.next_state()).view(-1, 1)
            q_old_pred1 = critic_1(batch.state(), batch.action().detach()).view(-1, 1)
            q_old_pred2 = critic_2(batch.state(), batch.action().detach()).view(-1, 1)
            qloss1 = ch.algorithms.sac.action_value_loss(q_old_pred1,
                                                         v_next.detach(),
                                                         batch.reward(),
                                                         batch.done(),
                                                         DISCOUNT)
            qloss2 = ch.algorithms.sac.action_value_loss(q_old_pred2,
                                                         v_next.detach(),
                                                         batch.reward(),
                                                         batch.done(),
                                                         DISCOUNT)

            # Update Q-functions by one step of gradient descent
            qloss = qloss1 + qloss2
            critics_optimiser.zero_grad()
            qloss.backward()
            critics_optimiser.step()
            result['qlosses'].append(qloss.item())

            # Update V-function by one step of gradient descent
            v_pred = value_critic(batch.state()).view(-1, 1)
            vloss = ch.algorithms.sac.state_value_loss(v_pred,
                                                       log_probs.detach(),
                                                       q_values.detach(),
                                                       alpha=ENTROPY_WEIGHT)
            value_critic_optimiser.zero_grad()
            vloss.backward()
            value_critic_optimiser.step()
            result['vlosses'].append(vloss.item())

            # Update policy by one step of gradient ascent
            q_actions = critic_1(batch.state(), actions).view(-1, 1)
            policy_loss = ch.algorithms.sac.policy_loss(log_probs,
                                                        q_actions,
                                                        alpha=ENTROPY_WEIGHT)
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()
            result['plosses'].append(policy_loss.item())

            # Update target value network
            ch.models.polyak_average(target_value_critic,
                                     value_critic,
                                     POLYAK_FACTOR)
    result['pweights'] = list(actor.parameters())
    result['vweights'] = list(value_critic.parameters())
    result['vweights_target'] = list(target_value_critic.parameters())
    result['qweights1'] = list(critic_1.parameters())
    result['qweights2'] = list(critic_2.parameters())
    return result
Example No. 14
def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    result = {
        'rewards': [],
        'plosses': [],
        'vlosses': [],
        'pweights': [],
        'vweights': [],
        'target_vweights': [],
        'target_pweights': [],
    }

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)

    actor = Actor(HIDDEN_SIZE, stochastic=False, layer_norm=True)
    critic = Critic(HIDDEN_SIZE, state_action=True, layer_norm=True)
    target_actor = create_target_network(actor)
    target_critic = create_target_network(critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(critic.parameters(), lr=LEARNING_RATE)
    replay = ch.ExperienceReplay()

    def get_random_action(state):
        return torch.tensor([[2 * random.random() - 1]])

    def get_action(state):
        action = actor(state) + ACTION_NOISE * torch.randn(1, 1)
        return torch.clamp(action, min=-1, max=1)

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)

        result['rewards'].append(replay.reward()[-1].item())
        replay = replay[-REPLAY_SIZE:]
        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            next_values = target_critic(batch.next_state(),
                                        target_actor(batch.next_state())).view(
                                            -1, 1)
            values = critic(batch.state(), batch.action()).view(-1, 1)
            value_loss = ch.algorithms.ddpg.state_value_loss(
                values, next_values.detach(), batch.reward(), batch.done(),
                DISCOUNT)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()
            result['vlosses'].append(value_loss.item())

            # Update policy by one step of gradient ascent
            policy_loss = -critic(batch.state(), actor(batch.state())).mean()
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()
            result['plosses'].append(policy_loss.item())

            # Update target networks
            ch.models.polyak_average(target_critic, critic, POLYAK_FACTOR)
            ch.models.polyak_average(target_actor, actor, POLYAK_FACTOR)

    result['pweights'] = list(actor.parameters())
    result['target_pweights'] = list(target_actor.parameters())
    result['vweights'] = list(critic.parameters())
    result['target_vweights'] = list(target_critic.parameters())
    return result
Example No. 15
    def run(self, get_action, steps=None, episodes=None, render=False):
        """
        Runner wrapper's run method.
        """

        if steps is None:
            steps = float('inf')
            if self.is_vectorized:
                self._needs_reset = True
        elif episodes is None:
            episodes = float('inf')
        else:
            msg = 'Only one of steps or episodes can be set.'
            raise Exception(msg)

        replay = ch.ExperienceReplay()
        collected_episodes = 0
        collected_steps = 0
        while True:
            if collected_steps >= steps or collected_episodes >= episodes:
                if self.is_vectorized and collected_episodes >= episodes:
                    replay = flatten_episodes(replay, episodes, self.num_envs)
                    self._needs_reset = True
                return replay
            if self._needs_reset:
                self.reset()
            info = {}
            action = get_action(self._current_state)
            if isinstance(action, tuple):
                skip_unpack = False
                if self.is_vectorized:
                    if len(action) > 2:
                        skip_unpack = True
                    elif len(action) == 2 and \
                            self.env.num_envs == 2 and \
                            not isinstance(action[1], dict):
                        # action[1] is not info but an action
                        action = (action, )

                if not skip_unpack:
                    if len(action) == 2:
                        info = action[1]
                        action = action[0]
                    elif len(action) == 1:
                        action = action[0]
                    else:
                        msg = 'get_action should return 1 or 2 values.'
                        raise NotImplementedError(msg)
            old_state = self._current_state
            state, reward, done, info = self.env.step(action)
            if not self.is_vectorized and done:
                collected_episodes += 1
                self._needs_reset = True
            elif self.is_vectorized:
                collected_episodes += sum(done)
                # Merge the tuple of info dictionaries (one per worker) into a
                # single dictionary mapping each key to a list of per-worker
                # values, e.g. from this
                # ({key_0: value_0, key_1:value_1}, // worker_0 values
                #  {key_0: value_0, key_1:value_1}) // worker_1 values
                # we get this
                # {key_0: [value_0,  // value of worker 0
                #          value_0], // value of worker 1
                #  key_1: [value_1,  // value of worker 0
                #          value_1], // value of worker 1}
                tmp_info = defaultdict(list)
                for info_worker in info:
                    for key, value in info_worker.items():
                        # Ignore types that cannot be converted to tensors
                        if _istensorable(value):
                            tmp_info[key] += [value]
                info = tmp_info
            replay.append(old_state, action, reward, state, done, **info)
            self._current_state = state
            if render:
                self.env.render()
            collected_steps += 1
Example No. 16
    optimizer.zero_grad()
    policy_loss = th.stack(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()


if __name__ == '__main__':
    env = gym.make('CartPole-v0')
    env = envs.Logger(env, interval=1000)
    env = envs.Torch(env)
    env.seed(SEED)

    policy = PolicyNet()
    optimizer = optim.Adam(policy.parameters(), lr=1e-2)
    running_reward = 10.0
    replay = ch.ExperienceReplay()

    for i_episode in count(1):
        state = env.reset()
        for t in range(10000):  # Don't infinite loop while learning
            mass = Categorical(policy(state))
            action = mass.sample()
            old_state = state
            state, reward, done, _ = env.step(action)
            replay.append(
                old_state,
                action,
                reward,
                state,
                done,
                # Cache log_prob for later
Example No. 17
import gym
import torch as th
import torch.optim as optim
from torch.distributions import Categorical

import cherry as ch

# PolicyNet is assumed to be defined elsewhere (a small policy network).

# Wrap environments
env = gym.make('CartPole-v0')
env = ch.envs.Logger(env, interval=1000)
env = ch.envs.Torch(env)

policy = PolicyNet()
optimizer = optim.Adam(policy.parameters(), lr=1e-2)
replay = ch.ExperienceReplay()  # Manage transitions

for step in range(1000):
    state = env.reset()
    while True:
        mass = Categorical(policy(state))
        action = mass.sample()
        log_prob = mass.log_prob(action)
        next_state, reward, done, _ = env.step(action)

        # Build the ExperienceReplay
        replay.append(state, action, reward, next_state, done, log_prob=log_prob)
        if done:
            break
        else:
            state = next_state

    # Discounting and normalizing rewards
    rewards = ch.td.discount(0.99, replay.reward(), replay.done())
    rewards = ch.normalize(rewards)

    loss = -th.sum(replay.log_prob() * rewards)
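    # The excerpt stops at the loss; a typical policy-gradient update (not part
    # of the original snippet) would then be:
    #   optimizer.zero_grad()
    #   loss.backward()
    #   optimizer.step()
    #   replay.empty()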
Example No. 18
def main():
    order_book_id_number = 10
    toy_data = create_toy_data(order_book_ids_number=order_book_id_number,
                               feature_number=20,
                               start="2019-05-01",
                               end="2019-12-12",
                               frequency="D")
    env = PortfolioTradingGym(data_df=toy_data,
                              sequence_window=5,
                              add_cash=True)
    env = Numpy(env)
    env = ch.envs.Logger(env, interval=1000)
    env = ch.envs.Torch(env)
    env = ch.envs.Runner(env)

    # create net
    action_size = env.action_space.shape[0]
    number_asset, seq_window, features_number = env.observation_space.shape
    input_size = features_number

    agent = ActorCritic(input_size=input_size,
                        hidden_size=HIDDEN_SIZE,
                        action_size=action_size)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    replay = ch.ExperienceReplay()

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(agent, episodes=1)

        if len(replay) >= BATCH_SIZE:
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT, TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, replay.reward(), replay.done())
                old_log_probs = replay.log_prob()

            # Initialise from the cached rollout outputs (for readability)
            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(replay.state())
                    masses = infos['mass']
                    new_values = infos['value']
                    new_log_probs = masses.log_prob(
                        replay.action()).unsqueeze(-1)

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(
                    new_log_probs,
                    old_log_probs,
                    advantages,
                    clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(
                    new_values, returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()

            replay.empty()
Example No. 19
def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    def get_action(state):
        mass, value = agent(state)
        action = mass.sample()
        log_prob = mass.log_prob(action)
        return action, {
            'log_prob': log_prob,
            'value': value,
        }

    result = {
        'rewards': [],
        'policy_losses': [],
        'value_losses': [],
        'weights': [],
    }

    for step in range(1, CHERRY_MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)

        if len(replay) >= BATCH_SIZE:
            for r in replay.reward():
                result['rewards'].append(r.item())
            with torch.no_grad():
                advantages = pg.generalized_advantage(DISCOUNT, TRACE_DECAY,
                                                      replay.reward(),
                                                      replay.done(),
                                                      replay.value(),
                                                      torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, replay.reward(), replay.done())
                old_log_probs = replay.log_prob()

            new_values = replay.value()
            new_log_probs = replay.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    masses, new_values = agent(replay.state())
                    new_log_probs = masses.log_prob(replay.action())
                    new_values = new_values.view(-1, 1)

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(
                    new_log_probs,
                    old_log_probs,
                    advantages,
                    clip=PPO_CLIP_RATIO)
                actor_optimiser.zero_grad()
                policy_loss.backward()
                actor_optimiser.step()
                result['policy_losses'].append(policy_loss.item())

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(
                    new_values, returns)
                critic_optimiser.zero_grad()
                value_loss.backward()
                critic_optimiser.step()
                result['value_losses'].append(value_loss.item())
            replay.empty()

    result['weights'] = list(agent.parameters())
    return result
Example No. 20
    def run(self,
            get_action,
            steps=None,
            episodes=None,
            render=False):
        """
        Runner wrapper's run method.
        """

        if steps is None:
            steps = float('inf')
            if self.is_vectorized:
                self._needs_reset = True
        elif episodes is None:
            episodes = float('inf')
        else:
            msg = 'Only one of steps or episodes can be set.'
            raise Exception(msg)

        replay = ch.ExperienceReplay()
        collected_episodes = 0
        collected_steps = 0
        while True:
            if collected_steps >= steps or collected_episodes >= episodes:
                if self.is_vectorized and collected_episodes >= episodes:
                    replay = flatten_episodes(replay, episodes, self.num_envs)
                    self._needs_reset = True
                return replay
            if self._needs_reset:
                self.reset()
            info = {}
            action = get_action(self._current_state)
            if isinstance(action, tuple):
                skip_unpack = False
                if self.is_vectorized:
                    if len(action) > 2:
                        skip_unpack = True
                    elif len(action) == 2 and \
                            self.env.num_envs == 2 and \
                            not isinstance(action[1], dict):
                        # action[1] is not info but an action
                        action = (action, )

                if not skip_unpack:
                    if len(action) == 2:
                        info = action[1]
                        action = action[0]
                    elif len(action) == 1:
                        action = action[0]
                    else:
                        msg = 'get_action should return 1 or 2 values.'
                        raise NotImplementedError(msg)
            old_state = self._current_state
            state, reward, done, _ = self.env.step(action)
            if not self.is_vectorized and done:
                collected_episodes += 1
                self._needs_reset = True
            elif self.is_vectorized:
                collected_episodes += sum(done)
            replay.append(old_state, action, reward, state, done, **info)
            self._current_state = state
            if render:
                self.env.render()
            collected_steps += 1
Example No. 21
    def setUp(self):
        self.replay = ch.ExperienceReplay()
Example No. 22
def train_cherry():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

    agent = ActorCritic(HIDDEN_SIZE)
    actor_optimiser = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimiser = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)

    def get_action(state):
        mass, value = agent(state)
        action = mass.sample()
        log_prob = mass.log_prob(action)
        return action, {
            'log_prob': log_prob,
            'value': value,
        }

    env = gym.make('Pendulum-v0')
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    result = {
        'rewards': [],
        'policy_losses': [],
        'value_losses': [],
        'weights': [],
    }

    for step in range(1, CHERRY_MAX_STEPS + 1):

        replay += env.run(get_action, episodes=1)
        if len(replay) > BATCH_SIZE:
            for r in replay.reward():
                result['rewards'].append(r.item())
            with torch.no_grad():
                advantages = ch.pg.generalized_advantage(DISCOUNT,
                                                         TRACE_DECAY,
                                                         replay.reward(),
                                                         replay.done(),
                                                         replay.value(),
                                                         torch.zeros(1))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = ch.td.discount(DISCOUNT,
                                         replay.reward(),
                                         replay.done())

            # Policy loss
            log_probs = replay.log_prob()
            policy_loss = ch.algorithms.a2c.policy_loss(log_probs, advantages)
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()
            result['policy_losses'].append(policy_loss.item())

            # Value loss
            value_loss = ch.algorithms.a2c.state_value_loss(replay.value(),
                                                            returns)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()
            result['value_losses'].append(value_loss.item())

            replay.empty()

    result['weights'] = list(agent.parameters())
    return result
Example No. 23
    def run(self, get_action, steps=None, episodes=None, render=False):
        """
        Runner wrapper's run method.
        """

        if steps is None:
            steps = float('inf')
            if self.is_vectorized:
                self._needs_reset = True
        elif episodes is None:
            episodes = float('inf')
        else:
            msg = 'Only one of steps or episodes can be set.'
            raise Exception(msg)

        steps = 1000

        replay = ch.ExperienceReplay()
        collected_episodes = 0
        collected_steps = 0
        while True:
            print("collected_steps", collected_steps)
            if collected_steps >= steps or collected_episodes >= episodes:
                if self.is_vectorized and collected_episodes >= episodes:
                    replay = flatten_episodes(replay, episodes, self.num_envs)
                    self._needs_reset = True
                return replay
            if self._needs_reset:
                self.reset()
            info = {}
            action = get_action(self._current_state)
            print("action", action)

            if isinstance(action, tuple):
                skip_unpack = False
                if self.is_vectorized:
                    if len(action) > 2:
                        skip_unpack = True
                    elif len(action) == 2 and \
                            self.env.num_envs == 2 and \
                            not isinstance(action[1], dict):
                        # action[1] is not info but an action
                        action = (action, )

                if not skip_unpack:
                    if len(action) == 2:
                        info = action[1]
                        action = action[0]
                    elif len(action) == 1:
                        action = action[0]
                    else:
                        msg = 'get_action should return 1 or 2 values.'
                        raise NotImplementedError(msg)
            old_state = self._current_state
            state, reward, done, _ = self.env.step(action)
            #print("reward: ", reward)
            #print("state.shape", state.shape)
            #print("state", state)
            #state = rgb2gray(state)
            state = self.full_obs_to_smol_boi(state)
            #reward = reward.to(ptu.get_device())
            #print("INNER LOOPS")
            #print(state)
            #print(reward)
            # print("gray.shape", gray.shape)
            # print("gray", gray)
            # if collected_steps >= 0:
            #     collected_episodes += 1
            #     self._needs_reset = True
            if not self.is_vectorized and done:
                collected_episodes += 1
                self._needs_reset = True
            elif self.is_vectorized:
                collected_episodes += sum(done)
            replay.append(old_state, action, reward, state, done, **info)
            self._current_state = state
            if render:
                self.env.render()
            collected_steps += 1
Example No. 24
def main(env='Pendulum-v0'):
    agent = ActorCritic(HIDDEN_SIZE).to(device)
    agent.apply(weights_init)

    actor_optimizer = optim.Adam(agent.actor.parameters(), lr=LEARNING_RATE)
    critic_optimizer = optim.Adam(agent.critic.parameters(), lr=LEARNING_RATE)
    actor_scheduler = torch.optim.lr_scheduler.StepLR(actor_optimizer,
                                                      step_size=2000,
                                                      gamma=0.5)
    critic_scheduler = torch.optim.lr_scheduler.StepLR(critic_optimizer,
                                                       step_size=2000,
                                                       gamma=0.5)

    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    def get_action(state):
        return agent(state.to(device))

    for step in range(1, MAX_STEPS + 1):
        replay += env.run(get_action, episodes=1)

        if len(replay) >= BATCH_SIZE:
            #batch = replay.sample(BATCH_SIZE).to(device)
            batch = replay.to(device)
            with torch.no_grad():
                advantages = pg.generalized_advantage(
                    DISCOUNT, TRACE_DECAY, batch.reward(), batch.done(),
                    batch.value(),
                    torch.zeros(1).to(device))
                advantages = ch.normalize(advantages, epsilon=1e-8)
                returns = td.discount(DISCOUNT, batch.reward(), batch.done())
                old_log_probs = batch.log_prob()

            new_values = batch.value()
            new_log_probs = batch.log_prob()
            for epoch in range(PPO_EPOCHS):
                # Recalculate outputs for subsequent iterations
                if epoch > 0:
                    _, infos = agent(batch.state())
                    masses = infos['mass']
                    new_values = infos['value'].view(-1, 1)
                    new_log_probs = masses.log_prob(batch.action())

                # Update the policy by maximising the PPO-Clip objective
                policy_loss = ch.algorithms.ppo.policy_loss(
                    new_log_probs,
                    old_log_probs,
                    advantages,
                    clip=PPO_CLIP_RATIO)
                actor_optimizer.zero_grad()
                policy_loss.backward()
                #nn.utils.clip_grad_norm_(agent.actor.parameters(), 1.0)
                actor_optimizer.step()

                # Fit value function by regression on mean-squared error
                value_loss = ch.algorithms.a2c.state_value_loss(
                    new_values, returns)
                critic_optimizer.zero_grad()
                value_loss.backward()
                #nn.utils.clip_grad_norm_(agent.critic.parameters(), 1.0)
                critic_optimizer.step()

            actor_scheduler.step()
            critic_scheduler.step()

            replay.empty()
Example No. 25
def main():
    order_book_id_number = 10
    toy_data = create_toy_data(order_book_ids_number=order_book_id_number,
                               feature_number=20,
                               start="2019-05-01",
                               end="2019-12-12",
                               frequency="D")
    env = PortfolioTradingGym(data_df=toy_data,
                              sequence_window=5,
                              add_cash=True)
    env = Numpy(env)
    env = ch.envs.Logger(env, interval=1000)
    env = ch.envs.Torch(env)
    env = ch.envs.Runner(env)

    # create net
    action_size = env.action_space.shape[0]
    number_asset, seq_window, features_number = env.observation_space.shape

    input_size = features_number

    actor = Actor(input_size=input_size,
                  hidden_size=50,
                  action_size=action_size)
    critic = Critic(input_size=input_size,
                    hidden_size=50,
                    action_size=action_size)

    target_actor = create_target_network(actor)
    target_critic = create_target_network(critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE_ACTOR)
    critic_optimiser = optim.Adam(critic.parameters(), lr=LEARNING_RATE_CRITIC)
    replay = ch.ExperienceReplay()
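    # Ornstein-Uhlenbeck noise provides temporally correlated exploration,
    # as in the original DDPG setup.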
    ou_noise = OrnsteinUhlenbeckNoise(mu=np.zeros(action_size))

    def get_action(state):
        action = actor(state)
        action = action + ou_noise()[0]
        return action

    def get_random_action(state):
        action = torch.softmax(torch.randn(action_size), dim=0)
        return action

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():

            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)

        replay = replay[-REPLAY_SIZE:]
        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            next_values = target_critic(batch.next_state(),
                                        target_actor(batch.next_state())).view(
                                            -1, 1)
            values = critic(batch.state(), batch.action()).view(-1, 1)
            rewards = ch.normalize(batch.reward())
            # rewards = batch.reward() / 100.0  # this scaling changes convergence a lot
            value_loss = ch.algorithms.ddpg.state_value_loss(
                values, next_values.detach(), rewards, batch.done(), DISCOUNT)
            critic_optimiser.zero_grad()
            value_loss.backward()
            critic_optimiser.step()

            # Update policy by one step of gradient ascent
            policy_loss = -critic(batch.state(), actor(batch.state())).mean()
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()

            # Update target networks
            ch.models.polyak_average(target_critic, critic, POLYAK_FACTOR)
            ch.models.polyak_average(target_actor, actor, POLYAK_FACTOR)
Example No. 26
def main(env='Pendulum-v0'):
    env = gym.make(env)
    env.seed(SEED)
    env = envs.Torch(env)
    env = envs.Logger(env)
    env = envs.Runner(env)
    replay = ch.ExperienceReplay()

    actor = SoftActor(HIDDEN_SIZE)
    critic_1 = Critic(HIDDEN_SIZE, state_action=True)
    critic_2 = Critic(HIDDEN_SIZE, state_action=True)
    value_critic = Critic(HIDDEN_SIZE)
    target_value_critic = create_target_network(value_critic)
    actor_optimiser = optim.Adam(actor.parameters(), lr=LEARNING_RATE)
    critics_optimiser = optim.Adam(
        (list(critic_1.parameters()) + list(critic_2.parameters())),
        lr=LEARNING_RATE)
    value_critic_optimiser = optim.Adam(value_critic.parameters(),
                                        lr=LEARNING_RATE)
    get_action = lambda state: actor(state).sample()
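    # `get_random_action` (used below) is not defined in this excerpt; a minimal
    # sketch for Pendulum's single action in [-1, 1], matching the other Pendulum
    # examples in this collection:
    get_random_action = lambda state: torch.tensor([[2 * random.random() - 1]])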

    for step in range(1, MAX_STEPS + 1):
        with torch.no_grad():
            if step < UPDATE_START:
                replay += env.run(get_random_action, steps=1)
            else:
                replay += env.run(get_action, steps=1)
        replay = replay[-REPLAY_SIZE:]

        if step > UPDATE_START and step % UPDATE_INTERVAL == 0:
            sample = random.sample(replay, BATCH_SIZE)
            batch = ch.ExperienceReplay(sample)

            # Pre-compute some quantities
            states = batch.state()
            rewards = batch.reward()
            old_actions = batch.action()
            dones = batch.done()
            masses = actor(states)
            actions = masses.rsample()
            log_probs = masses.log_prob(actions)
            q_values = torch.min(critic_1(states, actions.detach()),
                                 critic_2(states,
                                          actions.detach())).view(-1, 1)

            # Compute Q losses
            v_next = target_value_critic(batch.next_state()).view(-1, 1)
            q_old_pred1 = critic_1(states, old_actions.detach()).view(-1, 1)
            q_old_pred2 = critic_2(states, old_actions.detach()).view(-1, 1)
            qloss1 = ch.algorithms.sac.action_value_loss(
                q_old_pred1, v_next.detach(), rewards, dones, DISCOUNT)
            qloss2 = ch.algorithms.sac.action_value_loss(
                q_old_pred2, v_next.detach(), rewards, dones, DISCOUNT)

            # Update Q-functions by one step of gradient descent
            qloss = qloss1 + qloss2
            critics_optimiser.zero_grad()
            qloss.backward()
            critics_optimiser.step()

            # Update V-function by one step of gradient descent
            v_pred = value_critic(batch.state()).view(-1, 1)
            vloss = ch.algorithms.sac.state_value_loss(v_pred,
                                                       log_probs.detach(),
                                                       q_values.detach(),
                                                       alpha=ENTROPY_WEIGHT)
            value_critic_optimiser.zero_grad()
            vloss.backward()
            value_critic_optimiser.step()

            # Update policy by one step of gradient ascent
            q_actions = critic_1(batch.state(), actions).view(-1, 1)
            policy_loss = ch.algorithms.sac.policy_loss(log_probs,
                                                        q_actions,
                                                        alpha=ENTROPY_WEIGHT)
            actor_optimiser.zero_grad()
            policy_loss.backward()
            actor_optimiser.step()

            # Update target value network
            ch.models.polyak_average(target_value_critic, value_critic,
                                     POLYAK_FACTOR)