Example #1
    def save(self, fld):
        makedirs(fld)
        with open(os.path.join(fld, 'model.json'), 'w') as json_file:
            json_file.write(self.model.to_json())
        self.model.save_weights(os.path.join(fld, 'weights.hdf5'))

        attr = dict()
        for a in self.attr2save:
            attr[a] = getattr(self, a)
        with open(os.path.join(fld, 'Qmodel_attr.pickle'), 'wb') as f:
            pickle.dump(attr, f)
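
For reference, a minimal sketch of the matching load step, assuming a Keras model and the file layout written above; `load_qmodel` is an illustrative name, not part of the original source:

    import os
    import pickle

    from keras.models import model_from_json

    def load_qmodel(fld):
        # Rebuild the model from model.json / weights.hdf5 and restore the
        # attributes pickled by `save` above (a sketch, not the original API).
        with open(os.path.join(fld, 'model.json')) as json_file:
            model = model_from_json(json_file.read())
        model.load_weights(os.path.join(fld, 'weights.hdf5'))
        with open(os.path.join(fld, 'Qmodel_attr.pickle'), 'rb') as f:
            attr = pickle.load(f)
        return model, attr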
Example #2
    def play_one_episode(
        self,
        exploration,
        training=True,
        rand_price=True,
        verbose=False,
    ):
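        """
        Run a single episode: act, step the environment, and accumulate
        rewards; when `training`, store each transition and replay the memory.

        Returns (cum_rewards, actions, states, extra).
        """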
        state, valid_actions = self.env.reset(rand_price=rand_price,
                                              training=training)
        done = False
        env_t = 0
        try:
            env_t = self.env.t
        except AttributeError:
            pass

        cum_rewards = [np.nan] * env_t
        actions = [np.nan] * env_t  # history of previous actions
        states = [None] * env_t  # history of previous states
        prev_cum_rewards = 0.
        extra = {}  # extra data used for charts
        while not done:
            action = self.agent.act(state, exploration, valid_actions)
            next_state, reward, done, valid_actions = self.env.step(
                action,
                verbose=verbose,
            )
            # next_state, reward, done, valid_actions = self.env.step_verbose(action)

            cum_rewards.append(prev_cum_rewards + reward)
            prev_cum_rewards = cum_rewards[-1]
            actions.append(action)
            states.append(next_state)

            if training:
                self.agent.remember(state, action, reward, next_state, done,
                                    valid_actions)
                self.agent.replay()

            state = next_state
            if verbose and not training:
                steps_path = os.path.join(self.fld_save, 'steps')
                makedirs(steps_path)
                save_path = os.path.join(steps_path,
                                         'step_{:03d}'.format(self.env.t))
                show_step_chart(
                    prices=self.env.prices,
                    slots=self.env.slots.transpose(),
                    actions=actions,
                    step=self.env.t,
                    window_state=self.env.window_state,
                    save_path=save_path,
                )
        extra['profit'] = self.env._profit_abs
        return cum_rewards, actions, states, extra
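
A minimal usage sketch of the method above; `simulator` is an illustrative name for an instance of the class that owns `play_one_episode`:

    # Run one purely greedy episode and report the outcome (hypothetical usage).
    cum_rewards, actions, states, extra = simulator.play_one_episode(
        exploration=0.0,  # always exploit the model
        training=False,   # do not touch the replay buffer
        rand_price=True,  # sample a fresh price series
    )
    print('final reward: %.2f, profit: %.2f' % (cum_rewards[-1], extra['profit']))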
Example #3
    def save(self, fld):
        makedirs(fld)

        attr = {
            'batch_size': self.batch_size,
            'discount_factor': self.discount_factor,
            #'memory':self.memory
        }

        with open(os.path.join(fld, 'agent_attr.pickle'), 'wb') as f:
            pickle.dump(attr, f)
        self.model.save(fld)
Example #4
    def train(
        self,
        n_episode,
        *,
        save_per_episode=10,
        exploration_init=1.,
        exploration_decay=0.995,
        exploration_min=0.01,
        verbose=True,
        chart_per_episode=10,
    ):
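        """
        Train for `n_episode` episodes. Exploration decays geometrically from
        `exploration_init` towards `exploration_min`; explored/safe rewards are
        appended to training/record.csv, and charts are saved every
        `chart_per_episode` episodes. The model is saved whenever the safe
        reward beats the best result seen so far.
        """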
        fld_model = os.path.join(self.fld_save, 'model')
        makedirs(fld_model)  # don't overwrite if already exists
        with open(os.path.join(fld_model, 'QModel.txt'), 'w') as f:
            f.write(self.agent.model.qmodel)

        exploration = exploration_init
        fld_save = os.path.join(self.fld_save, 'training')
        makedirs(fld_save)

        # Store statistics, used for visualization
        safe_total_rewards = []  # for all episodes
        explored_total_rewards = []  # for all episodes
        explorations = []  # for all episodes
        ma_explored_total_rewards = []  # updated after each episode
        ma_safe_total_rewards = []  # updated after each episode
        safe_total_actions = []

        path_record = os.path.join(fld_save, 'record.csv')
        episodes_path = os.path.join(fld_save, 'episodes')
        makedirs(episodes_path)
        with open(path_record, 'w') as f:
            f.write('episode,game,exploration,explored_reward,'
                    'safe_reward,MA_explored,MA_safe\n')

        for n in range(n_episode):
            print('{}/{} training...'.format(n, n_episode))
            extra = {}
            exploration = max(exploration_min, exploration * exploration_decay)
            explorations.append(exploration)
            explored_cum_rewards, explored_actions, _, explored_extra = \
                self.play_one_episode(
                    exploration,
                    rand_price=True,  # use new data for each new episode
                    verbose=True,
                )
            extra['profit_explored'] = explored_extra['profit']
            extra['reward_explored'] = explored_cum_rewards[-1]
            explored_total_rewards.append(explored_cum_rewards[-1])

            # Safe values: exploration is completely disabled
            safe_cum_rewards, safe_actions, _, safe_extra = \
                self.play_one_episode(
                    exploration=0,  # exploit existing model
                    training=False,  # do not append to replay buffer
                    rand_price=False,  # reuse previous sampled prices
                )
            extra['profit_safe'] = safe_extra['profit']
            extra['reward_safe'] = safe_cum_rewards[-1]
            safe_total_rewards.append(safe_cum_rewards[-1])
            safe_total_actions.extend(safe_actions)

            # for all episodes
            ma_explored_total_reward = np.median(
                explored_total_rewards[-self.ma_window:])
            ma_explored_total_rewards.append(ma_explored_total_reward)
            # for all episodes
            ma_safe_total_reward = np.median(
                safe_total_rewards[-self.ma_window:])
            ma_safe_total_rewards.append(ma_safe_total_reward)

            ss = [
                str(n),
                self.env.title.replace(',', ';'),
                '%.1f' % (exploration * 100.),  # exploration factor
                '%.1f' % (explored_total_rewards[-1]),  # explored rewards
                '%.1f' % (safe_total_rewards[-1]),  # safe rewards
                '%.1f' % ma_explored_total_reward,  # MA explored rewards
                '%.1f' % ma_safe_total_reward,  # MA safe rewards
            ]

            with open(path_record, 'a') as f:
                f.write(','.join(ss) + '\n')

            last_reward = safe_cum_rewards[-1]
            profit = last_reward
            if verbose:
                header = [
                    '#',
                    'Data used',
                    'Exploration, %',
                    '[E] reward',
                    '[S] reward',
                    'MA [E] reward',
                    'MA [S] reward',
                ]
                explored_rewards = '%.2f' % (explored_cum_rewards[-1])
                safe_rewards = '%.2f' % (safe_cum_rewards[-1])
                if explored_cum_rewards[-1] > 0:
                    explored_rewards = click.style(explored_rewards,
                                                   fg='green')
                if safe_cum_rewards[-1] > 0:
                    safe_rewards = click.style(safe_rewards, fg='green')

                data = [[
                    n,  # current episode
                    self.env.title,  # data label used for episode
                    '%.1f' % (exploration * 100.),
                    explored_rewards,
                    safe_rewards,
                    '%.2f' % ma_explored_total_reward,
                    '%.2f' % ma_safe_total_reward,
                    # '%.2f' % profit,
                ]]
                show_step(data=data, header=header)

            # if n % save_per_episode == 0:
            if last_reward > self._best_result:
                print('{} saving results...'.format(n))
                self.agent.save(fld_model)
                self._best_result = last_reward

            if n % chart_per_episode == 0:
                save_path = os.path.join(episodes_path,
                                         'episode_{:04d}'.format(n))
                show_episode_chart(
                    episode=n,
                    safe_actions=safe_actions,
                    safe_rewards=safe_cum_rewards,
                    explored_rewards=explored_cum_rewards,
                    exploration=exploration,
                    extra=extra,
                    save_path=save_path,
                )
        save_path = os.path.join(episodes_path, 'summary')
        show_episodes_chart(
            n_episodes=n_episode,
            safe_total_rewards=safe_total_rewards,
            ma_safe_total_rewards=ma_safe_total_rewards,
            explored_total_rewards=explored_total_rewards,
            ma_explored_total_rewards=ma_explored_total_rewards,
            explorations=explorations,
            safe_total_actions=safe_total_actions,
            ma_window=self.ma_window,
            save_path=save_path,
        )
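
The defaults above imply a geometric exploration schedule; a standalone sketch (not part of the trainer) of how long it takes to reach the floor:

    import math

    # With exploration_init=1.0, exploration_decay=0.995 and
    # exploration_min=0.01 (the defaults of `train` above), exploration hits
    # its floor after roughly log(0.01) / log(0.995) episodes.
    episodes_to_floor = math.ceil(math.log(0.01 / 1.0) / math.log(0.995))
    print(episodes_to_floor)  # 919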
Example #5
    def test(
        self,
        n_episode,
        *,
        save_per_episode=10,
        subfld='testing',
        verbose=True,
    ):
        """
        Test on `n_episode` episodes, disable exploration, use only trained
        model.
        """
        fld_save = os.path.join(self.fld_save, subfld)
        makedirs(fld_save)
        safe_total_rewards = []
        path_record = os.path.join(fld_save, 'record.csv')

        with open(path_record, 'w') as f:
            f.write('episode,game,safe_reward,MA_safe\n')

        for n in range(n_episode):
            print('{}/{} testing...'.format(n, n_episode))

            # Render per-step charts only for the first test episode
            save_all_episodes = (n == 0)
            safe_cum_rewards, safe_actions, _, extra = self.play_one_episode(
                0,
                training=False,
                rand_price=True,
                verbose=save_all_episodes,
            )

            last_reward = safe_cum_rewards[-1]
            safe_total_rewards.append(last_reward)
            MA_safe_total_rewards = np.median(
                safe_total_rewards[-self.ma_window:])
            ss = [
                str(n),  # episode number
                self.env.title.replace(',', ';'),
                '%.1f' % last_reward,  # PnL: final safe cumulative reward
                '%.1f' % MA_safe_total_rewards,  # MA of safe total rewards
            ]

            with open(path_record, 'a') as f:
                f.write(','.join(ss) + '\n')

            if verbose:
                header = [
                    '# (testing)',
                    'Data used',
                    '[S] reward',
                    'MA [S] reward',
                ]

                safe_reward = '%.2f' % last_reward
                if last_reward > 0:
                    safe_reward = click.style(safe_reward, fg='green')

                data = [[
                    n,  # current episode
                    self.env.title,  # data label used for episode
                    safe_reward,
                    '%.2f' % MA_safe_total_rewards,
                ]]
                print()
                show_step(data=data, header=header)

            if n % save_per_episode == 0:
                pass  # no-op: nothing is re-saved during testing
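
Both `train` and `test` log one CSV row per episode, so results can be inspected offline; a small pandas sketch (the path below is illustrative):

    import pandas as pd

    # Columns match the header written by `test`: episode,game,safe_reward,MA_safe
    record = pd.read_csv('results/testing/record.csv')  # hypothetical path
    print(record[['episode', 'safe_reward', 'MA_safe']].describe())
    print('profitable episodes: %d / %d'
          % ((record['safe_reward'] > 0).sum(), len(record)))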