Example 1
    def train(self,
              n_steps: int = None,
              n_episodes: int = None,
              save_every: int = None,
              save_path: str = None,
              callback: callable = None,
              **kwargs):
        batch_size: int = kwargs.get('batch_size', 128)
        discount_factor: float = kwargs.get('discount_factor', 0.9999)
        learning_rate: float = kwargs.get('learning_rate', 0.0001)
        eps_start: float = kwargs.get('eps_start', 0.9)
        eps_end: float = kwargs.get('eps_end', 0.05)
        eps_decay_steps: int = kwargs.get('eps_decay_steps', 200)
        entropy_c: float = kwargs.get('entropy_c', 0.0001)
        memory_capacity: int = kwargs.get('memory_capacity', 1000)

        memory = ReplayMemory(memory_capacity, transition_type=A2CTransition)
        episode = 0
        steps_done = 0
        stop_training = False

        if n_steps and not n_episodes:
            n_episodes = np.iinfo(np.int32).max

        while episode < n_episodes and not stop_training:
            self.episode_id = str(uuid.uuid4())
            state = self.env.reset()
            done = False

            print('====      EPISODE ID: {}      ===='.format(self.episode_id))

            while not done:
                threshold = eps_end + (eps_start - eps_end) * np.exp(
                    -steps_done / eps_decay_steps)
                action = self.get_action(state, threshold=threshold)
                next_state, reward, done, _ = self.env.step(action)

                value = self.critic_network(state[None, :], training=False)
                value = tf.squeeze(value, axis=1)

                memory.push(state, action, reward, done, value)

                state = next_state
                steps_done += 1

                if len(memory) < batch_size:
                    continue

                self._apply_gradient_descent(memory, batch_size, learning_rate,
                                             discount_factor, entropy_c)

                if n_steps and steps_done >= n_steps:
                    done = True
                    stop_training = True

            is_checkpoint = save_every and episode % save_every == 0

            if save_path and (is_checkpoint or episode == n_episodes - 1):
                self.save(save_path, episode=episode)

            episode += 1
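The `ReplayMemory`, `A2CTransition`, and `DQNTransition` types used throughout these examples are not shown. As a rough sketch of what the calls assume (a capacity-bounded buffer exposing `push`, `sample`, `tail`, and `len`, plus namedtuple transitions whose fields follow the attribute names accessed in the snippets), something like the following would fit; it is an assumption, not the original implementation:

import random
from collections import deque, namedtuple

# Field order matches the push(...) calls in the examples (assumed).
A2CTransition = namedtuple('A2CTransition',
                           ['state', 'action', 'reward', 'done', 'value'])
DQNTransition = namedtuple('DQNTransition',
                           ['state', 'action', 'reward', 'next_state', 'done'])


class ReplayMemory:
    """Capacity-bounded transition buffer; minimal sketch of the interface used above."""

    def __init__(self, capacity: int, transition_type=DQNTransition):
        self.transition_type = transition_type
        self._buffer = deque(maxlen=capacity)

    def push(self, *args):
        self._buffer.append(self.transition_type(*args))

    def sample(self, batch_size: int):
        return random.sample(self._buffer, batch_size)

    def tail(self, batch_size: int):
        return list(self._buffer)[-batch_size:]

    def __len__(self):
        return len(self._buffer)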
Example 2
    def _apply_gradient_descent(self, memory: ReplayMemory, batch_size: int,
                                learning_rate: float, discount_factor: float):
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        loss = tf.keras.losses.Huber()

        transitions = memory.sample(batch_size)
        batch = DQNTransition(*zip(*transitions))

        state_batch = tf.convert_to_tensor(batch.state)
        action_batch = tf.convert_to_tensor(batch.action)
        reward_batch = tf.convert_to_tensor(batch.reward, dtype=tf.float32)
        next_state_batch = tf.convert_to_tensor(batch.next_state)
        done_batch = tf.convert_to_tensor(batch.done)

        with tf.GradientTape() as tape:
            state_action_values = tf.math.reduce_sum(
                self.policy_network(state_batch) *
                tf.one_hot(action_batch, self.n_actions),
                axis=1)

            next_state_values = tf.where(
                done_batch, tf.zeros(batch_size),
                tf.math.reduce_max(self.target_network(next_state_batch),
                                   axis=1))

            expected_state_action_values = reward_batch + (discount_factor *
                                                           next_state_values)
            loss_value = loss(expected_state_action_values,
                              state_action_values)

        variables = self.policy_network.trainable_variables
        gradients = tape.gradient(loss_value, variables)
        optimizer.apply_gradients(zip(gradients, variables))
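`get_action(state, threshold=...)` is also not shown. For the DQN examples it presumably implements epsilon-greedy selection over the policy network's Q-values, roughly as below (the A2C agents would instead sample from the actor); the function name and signature here are illustrative only:

import numpy as np
import tensorflow as tf

def epsilon_greedy_action(policy_network, state, n_actions: int,
                          threshold: float) -> int:
    """Return a random action with probability `threshold`, else the greedy one."""
    if np.random.random() < threshold:
        return int(np.random.randint(n_actions))
    q_values = policy_network(state[None, :], training=False)
    return int(tf.argmax(q_values[0]).numpy())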
Example 3
    def _apply_gradient_descent(
        self,
        memory: ReplayMemory,
        batch_size: int,
        learning_rate: float,
        discount_factor: float,
        entropy_c: float,
    ):
        huber_loss = tf.keras.losses.Huber()
        wsce_loss = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True)
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)

        transitions = memory.tail(batch_size)
        batch = A2CTransition(*zip(*transitions))

        states = tf.convert_to_tensor(batch.state)
        actions = tf.convert_to_tensor(batch.action)
        rewards = tf.convert_to_tensor(batch.reward, dtype=tf.float32)
        dones = tf.convert_to_tensor(batch.done)
        values = tf.convert_to_tensor(batch.value)

        returns = []
        exp_weighted_return = 0

        for reward, done in zip(rewards[::-1], dones[::-1]):
            exp_weighted_return = reward + discount_factor * exp_weighted_return * (
                1 - int(done))
            returns += [exp_weighted_return]

        returns = returns[::-1]

        with tf.GradientTape() as tape:
            state_values = self.critic_network(states)
            critic_loss_value = huber_loss(returns, state_values)

        gradients = tape.gradient(critic_loss_value,
                                  self.critic_network.trainable_variables)
        optimizer.apply_gradients(
            zip(gradients, self.critic_network.trainable_variables))

        with tf.GradientTape() as tape:
            returns = tf.reshape(returns, [batch_size, 1])
            advantages = returns - values

            actions = tf.cast(actions, tf.int32)
            logits = self.actor_network(states)
            policy_loss_value = wsce_loss(actions,
                                          logits,
                                          sample_weight=advantages)

            probs = tf.nn.softmax(logits)
            entropy_loss_value = tf.keras.losses.categorical_crossentropy(
                probs, probs)
            policy_total_loss_value = policy_loss_value - entropy_c * entropy_loss_value

        gradients = tape.gradient(policy_total_loss_value,
                                  self.actor_network.trainable_variables)
        optimizer.apply_gradients(
            zip(gradients, self.actor_network.trainable_variables))
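The backward loop over `(reward, done)` pairs in the snippet above computes discounted returns G_t = r_t + discount_factor * G_{t+1}, zeroing the bootstrap whenever `done` is set. A standalone check of that recursion with plain Python floats:

discount_factor = 0.99
rewards = [1.0, 0.0, 2.0]
dones = [False, False, True]

returns = []
exp_weighted_return = 0.0
for reward, done in zip(rewards[::-1], dones[::-1]):
    exp_weighted_return = reward + discount_factor * exp_weighted_return * (1 - int(done))
    returns.append(exp_weighted_return)
returns = returns[::-1]

print(returns)  # [2.9602, 1.98, 2.0]: G2 = 2.0, G1 = 0.99 * 2.0, G0 = 1.0 + 0.99 * G1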
Example 4
    def run(self):
        memory = ReplayMemory(self.memory_capacity,
                              transition_type=DQNTransition)

        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        loss_fn = tf.keras.losses.Huber()

        while self.done_queue.qsize() < self.n_envs:
            while self.memory_queue.qsize() > 0:
                sample = self.memory_queue.get()
                memory.push(*sample)

            if len(memory) < self.batch_size:
                continue

            transitions = memory.sample(self.batch_size)
            batch = DQNTransition(*zip(*transitions))

            state_batch = tf.convert_to_tensor(batch.state)
            action_batch = tf.convert_to_tensor(batch.action)
            reward_batch = tf.convert_to_tensor(batch.reward, dtype=tf.float32)
            next_state_batch = tf.convert_to_tensor(batch.next_state)
            done_batch = tf.convert_to_tensor(batch.done)

            with tf.GradientTape() as tape:
                state_action_values = tf.math.reduce_sum(
                    self.model.policy_network(state_batch) *
                    tf.one_hot(action_batch, self.model.n_actions),
                    axis=1)

                next_state_values = tf.where(
                    done_batch, tf.zeros(self.batch_size),
                    tf.math.reduce_max(
                        self.model.target_network(next_state_batch), axis=1))

                expected_state_action_values = reward_batch + \
                    (self.discount_factor * next_state_values)
                loss_value = loss_fn(expected_state_action_values,
                                     state_action_values)

            variables = self.model.policy_network.trainable_variables
            gradients = tape.gradient(loss_value, variables)
            optimizer.apply_gradients(zip(gradients, variables))

            self.model_update_queue.put(self.model)
Example 5
    def train(self,
              n_steps: int = None,
              n_episodes: int = None,
              save_every: int = None,
              save_path: str = None,
              callback: callable = None,
              **kwargs) -> float:
        batch_size: int = kwargs.get('batch_size', 128)
        discount_factor: float = kwargs.get('discount_factor', 0.9999)
        learning_rate: float = kwargs.get('learning_rate', 0.0001)
        eps_start: float = kwargs.get('eps_start', 0.9)
        eps_end: float = kwargs.get('eps_end', 0.05)
        eps_decay_steps: int = kwargs.get('eps_decay_steps', 200)
        update_target_every: int = kwargs.get('update_target_every', 1000)
        memory_capacity: int = kwargs.get('memory_capacity', 1000)
        render_interval: int = kwargs.get(
            'render_interval',
            50)  # in steps, None for episode-end render only

        memory = ReplayMemory(memory_capacity, transition_type=DQNTransition)
        episode = 0
        total_steps_done = 0
        total_reward = 0
        stop_training = False

        if n_steps and not n_episodes:
            n_episodes = np.iinfo(np.int32).max

        print('====      AGENT ID: {}      ===='.format(self.id))

        while episode < n_episodes and not stop_training:
            state = self.env.reset()
            done = False
            steps_done = 0

            while not done:
                threshold = eps_end + (eps_start - eps_end) * np.exp(
                    -total_steps_done / eps_decay_steps)
                action = self.get_action(state, threshold=threshold)
                next_state, reward, done, _ = self.env.step(action)

                memory.push(state, action, reward, next_state, done)

                state = next_state
                total_reward += reward
                steps_done += 1
                total_steps_done += 1

                if len(memory) < batch_size:
                    continue

                self._apply_gradient_descent(memory, batch_size, learning_rate,
                                             discount_factor)

                if n_steps and steps_done >= n_steps:
                    done = True

                if render_interval is not None and steps_done % render_interval == 0:
                    self.env.render(episode=episode,
                                    max_episodes=n_episodes,
                                    max_steps=n_steps)

                if steps_done % update_target_every == 0:
                    self.target_network = tf.keras.models.clone_model(
                        self.policy_network)
                    self.target_network.set_weights(
                        self.policy_network.get_weights())
                    self.target_network.trainable = False

            is_checkpoint = save_every and episode % save_every == 0

            if save_path and (is_checkpoint or episode == n_episodes - 1):
                self.save(save_path, episode=episode)

            if not render_interval or (n_steps and steps_done < n_steps):
                self.env.render(
                    episode=episode,
                    max_episodes=n_episodes,
                    max_steps=n_steps
                )  # render final state at episode end if not rendered earlier

            self.env.save()

            episode += 1

        mean_reward = total_reward / total_steps_done

        return mean_reward
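The exploration threshold used in these loops, eps_end + (eps_start - eps_end) * exp(-steps_done / eps_decay_steps), decays from eps_start toward eps_end. A quick standalone look at the schedule with the defaults above:

import numpy as np

eps_start, eps_end, eps_decay_steps = 0.9, 0.05, 200

for steps_done in (0, 200, 400, 1000):
    threshold = eps_end + (eps_start - eps_end) * np.exp(-steps_done / eps_decay_steps)
    print(steps_done, round(float(threshold), 3))
# 0 -> 0.9, 200 -> 0.363, 400 -> 0.165, 1000 -> 0.056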
Example 6
    def train(self,
              n_steps: int = None,
              n_episodes: int = None,
              save_every: int = None,
              save_path: str = None,
              callback: callable = None,
              **kwargs) -> float:
        batch_size: int = kwargs.get('batch_size', 128)
        discount_factor: float = kwargs.get('discount_factor', 0.9999)
        learning_rate: float = kwargs.get('learning_rate', 0.0001)
        eps_start: float = kwargs.get('eps_start', 0.9)
        eps_end: float = kwargs.get('eps_end', 0.05)
        eps_decay_steps: int = kwargs.get('eps_decay_steps', 400)
        entropy_c: float = kwargs.get('entropy_c', 0.0001)
        memory_capacity: int = kwargs.get('memory_capacity', 1000)
        train_end: float = kwargs.get('train_end', 0.3)

        memory = ReplayMemory(memory_capacity,
                              transition_type=A2C_LSTM_Transition)
        episode = 0
        steps_done = 0
        total_reward = 0
        stop_training = False

        if n_steps and not n_episodes:
            n_episodes = np.iinfo(np.int32).max

        print('====      AGENT ID: {}      ===='.format(self.id))

        while episode < n_episodes and not stop_training:
            if not episode or not self.env.feed.has_next():
                state = self.env.reset()
            done = False
            steps_done = 0
            if episode:
                #self.LSTM.reset_states()
                memory = ReplayMemory(memory_capacity,
                                      transition_type=A2C_LSTM_Transition)

            self.env.portfolio.reset()
            print(self.env.portfolio.balances)
            print('====      TRAIN EPISODE ID ({}/{}): {}      ===='.format(
                episode + 1, n_episodes, self.env.episode_id))

            while not done:
                if steps_done % 24 == 0:  #each day
                    print("step {}/{}".format(steps_done, n_steps))
                    print(self.env.portfolio.balances)
                    print(self.env.portfolio.net_worth)
                    print('exchange:', state[-1][0])

                if not self.env.feed.has_next():
                    done = True
                    continue
                threshold = eps_end + (eps_start - eps_end) * np.exp(
                    -steps_done / eps_decay_steps)
                action = self.get_action(state, threshold=threshold)
                next_state, reward, done, _ = self.env.step(action)
                value = self.critic_network(state[None, None, :],
                                            training=False)
                value = tf.squeeze(value, axis=-1)

                memory.push(state, action, reward, done, value)

                state = next_state
                total_reward += reward
                steps_done += 1

                if self.env.portfolio.net_worth < self.env.portfolio.initial_net_worth * train_end:
                    done = True
                    continue

                if len(memory) < batch_size:
                    continue

                # The `steps_done % batch_size` guard is short-circuited by `True or`,
                # so a gradient update runs on every environment step.
                self._apply_gradient_descent(memory, batch_size,
                                             learning_rate, discount_factor,
                                             entropy_c)

                if n_steps and steps_done >= n_steps:
                    done = True

            # VALIDATION

            test_state = self.test_env.reset()
            self.LSTM.reset_states()
            done = False
            steps_done = 0
            threshold = 0.1

            print(self.env.portfolio.balances)
            print('====      TEST EPISODE ID ({}/{}): {}      ===='.format(
                episode + 1, n_episodes, self.test_env.episode_id))

            while not done:
                if steps_done % 24 == 0:  #each day
                    print("step {}/{}".format(steps_done, n_steps))
                    print(self.test_env.portfolio.balances)
                    print(self.test_env.portfolio.net_worth)
                    print('exchange:', test_state[-1][0])

                if not self.test_env.feed.has_next():
                    done = True
                    continue

                action = self.get_action(test_state, threshold=threshold)
                next_state, reward, done, _ = self.test_env.step(action)
                value = self.critic_network(test_state[None, None, :],
                                            training=False)
                value = tf.squeeze(value, axis=-1)
                test_state = next_state
                steps_done += 1
                if self.test_env.portfolio.net_worth < self.test_env.portfolio.initial_net_worth * train_end:
                    done = True
                    continue

            portfolio_perf = self.test_env.portfolio.performance.values
            np.savetxt(save_path + '/test{}.csv'.format(episode + 1),
                       portfolio_perf,
                       delimiter=',',
                       fmt='%s')

            self.LSTM.reset_states()

            is_checkpoint = save_every and episode % save_every == 0

            if save_path and (is_checkpoint or episode == n_episodes - 1):
                self.save(save_path, episode=episode)

            episode += 1

        mean_reward = total_reward / steps_done

        return mean_reward
Example 7
    def train(self,
              n_steps: int = None,
              n_episodes: int = None,
              save_every: int = None,
              save_path: str = None,
              callback: callable = None,
              **kwargs) -> float:
        batch_size: int = kwargs.get('batch_size', 64)
        discount_factor: float = kwargs.get('discount_factor', 0.9999)
        learning_rate: float = kwargs.get('learning_rate', 0.0001)
        eps_start: float = kwargs.get('eps_start', 0.9)
        eps_end: float = kwargs.get('eps_end', 0.05)
        eps_decay_steps: int = kwargs.get('eps_decay_steps', 600)
        update_target_every: int = kwargs.get('update_target_every', 500)
        memory_capacity: int = kwargs.get('memory_capacity', 1000)
        render_interval: int = kwargs.get(
            'render_interval',
            50)  # in steps, None for episode end render only

        memory = ReplayMemory(memory_capacity, transition_type=DQNTransition)
        episode = 0
        total_reward = 0
        stop_training = False
        # State for the reward plot below; the `plot_rewards` kwarg and a `plt` import are assumed.
        plot_rewards: bool = kwargs.get('plot_rewards', False)
        episode_rewards = []
        episode_step_sequences = []

        if n_steps and not n_episodes:
            n_episodes = np.iinfo(np.int32).max

        print('====      AGENT ID: {}      ===='.format(self.id))
        self.env.max_episodes = n_episodes
        self.env.max_steps = n_steps

        #Do a certain number of episodes
        while episode < n_episodes and not stop_training:
            #Reset the state at the beginning of every episode
            state = self.env.reset()
            done = False
            steps_done = 0
            episode_reward = 0

            #Repeat until we reach a terminal state
            while not done:
                #Pick some action, take it, and store the SARSA in our memory
                threshold = eps_end + (eps_start - eps_end) * np.exp(
                    -steps_done / eps_decay_steps)
                action = self.get_action(state, threshold=threshold)
                next_state, reward, done, _ = self.env.step(action)

                memory.push(state, action, reward, next_state, done)

                state = next_state
                total_reward += reward
                episode_reward += reward
                steps_done += 1

                #Don't train until we get enough things in our memory to train with
                if len(memory) < batch_size:
                    continue

                self._apply_gradient_descent(memory, batch_size, learning_rate,
                                             discount_factor)

                if n_steps and steps_done >= n_steps:
                    done = True

                if render_interval is not None and steps_done % render_interval == 0:
                    self.env.render(episode)

                if steps_done % update_target_every == 0:
                    self.target_network = tf.keras.models.clone_model(
                        self.policy_network)
                    self.target_network.set_weights(
                        self.policy_network.get_weights())
                    self.target_network.trainable = False

            is_checkpoint = save_every and episode % save_every == 0

            if save_path and (is_checkpoint or episode == n_episodes - 1):
                self.save(save_path, episode=episode)

            if not render_interval or (n_steps and steps_done < n_steps):
                self.env.render(
                    episode
                )  # render final state at episode end if not rendered earlier

            #Plot the rewards:
            if plot_rewards:
                episode_rewards.append(episode_reward)
                episode_step_sequences.append(steps_done)
                episode_reward = 0
                #Now plot
                plt.clf()
                plt.xlabel('Step')
                plt.ylabel('Reward')
                for ed, steps in zip(episode_rewards, episode_step_sequences):
                    plt.plot(steps, ed)
                plt.show() if done else plt.pause(
                    0.001)  # Pause a bit so that the graph is updated

            self.env.save()

            episode += 1

        mean_reward = total_reward / steps_done

        return mean_reward
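The target-network refresh in the DQN loops relies on the weights actually being copied: `tf.keras.models.clone_model` only clones the architecture and re-initializes the weights, which is why an explicit `set_weights` call is needed. A small standalone illustration with a toy model:

import numpy as np
import tensorflow as tf

policy_network = tf.keras.Sequential([
    tf.keras.Input(shape=(3,)),
    tf.keras.layers.Dense(4),
    tf.keras.layers.Dense(2),
])

# clone_model copies the architecture but instantiates fresh weights ...
target_network = tf.keras.models.clone_model(policy_network)
print(np.allclose(target_network.get_weights()[0],
                  policy_network.get_weights()[0]))  # almost certainly False

# ... so the weights still have to be copied over explicitly.
target_network.set_weights(policy_network.get_weights())
target_network.trainable = False
print(np.allclose(target_network.get_weights()[0],
                  policy_network.get_weights()[0]))  # True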
Example 8
    def train(self,
              n_steps: int = None,
              n_episodes: int = None,
              save_every: int = None,
              save_path: str = None,
              callback: callable = None,
              **kwargs) -> float:
        batch_size: int = kwargs.get('batch_size', 128)
        discount_factor: float = kwargs.get('discount_factor', 0.9999)
        learning_rate: float = kwargs.get('learning_rate', 0.0001)
        eps_start: float = kwargs.get('eps_start', 0.9)
        eps_end: float = kwargs.get('eps_end', 0.05)
        eps_decay_steps: int = kwargs.get('eps_decay_steps', 200)
        entropy_c: float = kwargs.get('entropy_c', 0.0001)
        memory_capacity: int = kwargs.get('memory_capacity', 1000)
        render_interval: int = kwargs.get(
            'render_interval',
            50)  # in steps, None for episode end render only

        memory = ReplayMemory(memory_capacity, transition_type=A2CTransition)
        episode = 0
        total_steps_done = 0
        total_reward = 0

        if n_steps and not n_episodes:
            n_episodes = np.iinfo(np.int32).max

        print('====      AGENT ID: {}      ===='.format(self.id))

        while episode < n_episodes:
            state = self.env.reset()
            done = False
            steps_done = 0

            print('====      EPISODE ID ({}/{}): {}      ===='.format(
                episode + 1, n_episodes, self.env.episode_id))

            while not done:
                threshold = eps_end + (eps_start - eps_end) * np.exp(
                    -total_steps_done / eps_decay_steps)
                action = self.get_action(state, threshold=threshold)
                next_state, reward, done, _ = self.env.step(action)

                value = self.critic_network(state[None, :], training=False)
                value = tf.squeeze(value, axis=-1)

                memory.push(state, action, reward, done, value)

                state = next_state
                total_reward += reward
                steps_done += 1
                total_steps_done += 1

                if len(memory) < batch_size:
                    continue

                self._apply_gradient_descent(memory, batch_size, learning_rate,
                                             discount_factor, entropy_c)

                if n_steps and steps_done >= n_steps:
                    done = True

                if render_interval is not None and steps_done % render_interval == 0:
                    self.env.render(episode)

            is_checkpoint = save_every and episode % save_every == 0

            if not render_interval or (n_steps and steps_done < n_steps):
                self.env.render(
                    episode
                )  # render final state at episode end if not rendered earlier

            self.env.save()

            if save_path and (is_checkpoint or episode == n_episodes - 1):
                self.save(save_path, episode=episode)

            episode += 1

        mean_reward = total_reward / total_steps_done

        return mean_reward
Example 9
    def run(self):
        episode = 0

        for i in range(self.n_envs):
            self.model_update_queue.put(self.model.policy_network.to_json())

        memory = ReplayMemory(self.memory_capacity,
                              transition_type=DQNTransition)

        optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        loss_fn = tf.keras.losses.Huber()

        while self.done_queue.qsize() < self.n_envs:
            while self.sync_queue.qsize() < self.n_envs:
                while self.memory_queue.qsize() > 0:
                    sample = self.memory_queue.get()
                    memory.push(*sample)

            self.empty_queue(self.sync_queue)

            transitions = memory.sample(self.batch_size)
            batch = DQNTransition(*zip(*transitions))

            state_batch = tf.convert_to_tensor(batch.state)
            action_batch = tf.convert_to_tensor(batch.action)
            reward_batch = tf.convert_to_tensor(batch.reward, dtype=tf.float32)
            next_state_batch = tf.convert_to_tensor(batch.next_state)
            done_batch = tf.convert_to_tensor(batch.done)

            with tf.GradientTape() as tape:
                state_action_values = tf.math.reduce_sum(
                    self.model.policy_network(state_batch) *
                    tf.one_hot(action_batch, self.model.n_actions),
                    axis=1)

                next_state_values = tf.where(
                    done_batch, tf.zeros(self.batch_size),
                    tf.math.reduce_max(
                        self.model.target_network(next_state_batch), axis=1))

                expected_state_action_values = reward_batch + \
                                               (self.discount_factor * next_state_values)
                loss_value = loss_fn(expected_state_action_values,
                                     state_action_values)

            variables = self.model.policy_network.trainable_variables
            gradients = tape.gradient(loss_value, variables)
            optimizer.apply_gradients(zip(gradients, variables))

            self.model.update_target_network()

            json_model = self.model.policy_network.to_json()
            for n in range(self.n_envs):
                self.model_update_queue.put(json_model)

            for n in range(self.n_envs * 2):
                self.sync_queue.put(1)

            episode += 1
            if episode == self.n_episodes:
                break

            while self.model_update_queue.qsize() > 0:
                time.sleep(.1)

            print("")
            self.empty_queue(self.sync_queue)

        self.empty_queue(self.model_update_queue)

        time.sleep(2)

        return self.model
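Examples 4 and 9 are the learner half of a multi-process setup; the environment workers that fill `memory_queue` and read `model_update_queue` are not shown. Purely as an assumption about that missing half, a worker could look roughly like this (note that `to_json()` serializes only the architecture, so a real implementation would also need to ship weights separately):

import tensorflow as tf

def rollout_worker(env, memory_queue, model_update_queue, done_queue, n_steps: int):
    """Hypothetical worker process feeding the learner's queues."""
    policy_network = tf.keras.models.model_from_json(model_update_queue.get())

    state = env.reset()
    for _ in range(n_steps):
        q_values = policy_network(state[None, :], training=False)
        action = int(tf.argmax(q_values[0]).numpy())

        next_state, reward, done, _ = env.step(action)
        # Field order mirrors DQNTransition as consumed by the learner.
        memory_queue.put((state, action, reward, next_state, done))
        state = env.reset() if done else next_state

        # Pick up a refreshed model whenever the learner publishes one.
        if model_update_queue.qsize() > 0:
            policy_network = tf.keras.models.model_from_json(
                model_update_queue.get())

    done_queue.put(1)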