def save(self, fld):
    makedirs(fld)
    # Architecture and weights are stored separately (Keras JSON + HDF5).
    with open(os.path.join(fld, 'model.json'), 'w') as json_file:
        json_file.write(self.model.to_json())
    self.model.save_weights(os.path.join(fld, 'weights.hdf5'))

    # Pickle the extra attributes listed in `self.attr2save`.
    attr = dict()
    for a in self.attr2save:
        attr[a] = getattr(self, a)
    with open(os.path.join(fld, 'Qmodel_attr.pickle'), 'wb') as f:
        pickle.dump(attr, f)
def play_one_episode(
        self,
        exploration,
        training=True,
        rand_price=True,
        verbose=False,
):
    state, valid_actions = self.env.reset(rand_price=rand_price, training=training)
    done = False

    env_t = 0
    try:
        env_t = self.env.t
    except AttributeError:
        pass

    cum_rewards = [np.nan] * env_t
    actions = [np.nan] * env_t    # history of previous actions
    states = [None] * env_t       # history of previous states
    prev_cum_rewards = 0.
    extra = {}                    # extra data used for charts

    while not done:
        action = self.agent.act(state, exploration, valid_actions)
        next_state, reward, done, valid_actions = self.env.step(
            action,
            verbose=verbose,
        )
        # next_state, reward, done, valid_actions = self.env.step_verbose(action)

        cum_rewards.append(prev_cum_rewards + reward)
        prev_cum_rewards = cum_rewards[-1]
        actions.append(action)
        states.append(next_state)

        if training:
            self.agent.remember(state, action, reward, next_state, done, valid_actions)
            self.agent.replay()

        state = next_state

        if verbose and not training:
            steps_path = os.path.join(self.fld_save, 'steps')
            makedirs(steps_path)
            save_path = os.path.join(steps_path, 'step_{:03d}'.format(self.env.t))
            show_step_chart(
                prices=self.env.prices,
                slots=self.env.slots.transpose(),
                actions=actions,
                step=self.env.t,
                window_state=self.env.window_state,
                save_path=save_path,
            )

    extra['profit'] = self.env._profit_abs
    return cum_rewards, actions, states, extra
def save(self, fld):
    makedirs(fld)
    attr = {
        'batch_size': self.batch_size,
        'discount_factor': self.discount_factor,
        # 'memory': self.memory
    }
    with open(os.path.join(fld, 'agent_attr.pickle'), 'wb') as f:
        pickle.dump(attr, f)
    self.model.save(fld)
def train(
        self,
        n_episode,
        *,
        save_per_episode=10,
        exploration_init=1.,
        exploration_decay=0.995,
        exploration_min=0.01,
        verbose=True,
        chart_per_episode=10,
):
    fld_model = os.path.join(self.fld_save, 'model')
    makedirs(fld_model)    # don't overwrite if already exists
    with open(os.path.join(fld_model, 'QModel.txt'), 'w') as f:
        f.write(self.agent.model.qmodel)

    exploration = exploration_init
    fld_save = os.path.join(self.fld_save, 'training')
    makedirs(fld_save)

    # Store statistics, used for visualization
    safe_total_rewards = []           # for all episodes
    explored_total_rewards = []       # for all episodes
    explorations = []                 # for all episodes
    ma_explored_total_rewards = []    # updated after each episode
    ma_safe_total_rewards = []        # updated after each episode
    safe_total_actions = []

    path_record = os.path.join(fld_save, 'record.csv')
    episodes_path = os.path.join(fld_save, 'episodes')
    makedirs(episodes_path)

    with open(path_record, 'w') as f:
        f.write('episode,game,exploration,explored_reward,'
                'safe_reward,MA_explored,MA_safe\n')

    for n in range(n_episode):
        print('{}/{} training...'.format(n, n_episode))
        extra = {}

        exploration = max(exploration_min, exploration * exploration_decay)
        explorations.append(exploration)

        explored_cum_rewards, explored_actions, _, explored_extra = \
            self.play_one_episode(
                exploration,
                rand_price=True,    # use new data for each new episode
                verbose=True,
            )
        extra['profit_explored'] = explored_extra['profit']
        extra['reward_explored'] = explored_cum_rewards[-1]
        explored_total_rewards.append(explored_cum_rewards[-1])

        # Safe values: exploration is completely disabled
        safe_cum_rewards, safe_actions, _, safe_extra = \
            self.play_one_episode(
                exploration=0,      # exploit the existing model
                training=False,     # do not append to the replay buffer
                rand_price=False,   # reuse the previously sampled prices
            )
        extra['profit_safe'] = safe_extra['profit']
        extra['reward_safe'] = safe_cum_rewards[-1]
        safe_total_rewards.append(safe_cum_rewards[-1])
        safe_total_actions.extend(safe_actions)

        # Moving median over all episodes so far
        ma_explored_total_reward = np.median(
            explored_total_rewards[-self.ma_window:])
        ma_explored_total_rewards.append(ma_explored_total_reward)

        ma_safe_total_reward = np.median(
            safe_total_rewards[-self.ma_window:])
        ma_safe_total_rewards.append(ma_safe_total_reward)

        ss = [
            str(n),
            self.env.title.replace(',', ';'),
            '%.1f' % (exploration * 100.),           # exploration factor
            '%.1f' % (explored_total_rewards[-1]),   # explored rewards
            '%.1f' % (safe_total_rewards[-1]),       # safe rewards
            '%.1f' % ma_explored_total_reward,       # MA explored rewards
            '%.1f' % ma_safe_total_reward,           # MA safe rewards
        ]
        with open(path_record, 'a') as f:
            f.write(','.join(ss) + '\n')

        last_reward = safe_cum_rewards[-1]
        profit = last_reward

        if verbose:
            header = [
                '#',
                'Data used',
                'Exploration, %',
                '[E] reward',
                '[S] reward',
                'MA [E] reward',
                'MA [S] reward',
            ]
            explored_rewards = '%.2f' % (explored_cum_rewards[-1])
            safe_rewards = '%.2f' % (safe_cum_rewards[-1])
            if explored_cum_rewards[-1] > 0:
                explored_rewards = click.style(explored_rewards, fg='green')
            if safe_cum_rewards[-1] > 0:
                safe_rewards = click.style(safe_rewards, fg='green')

            data = [[
                n,                  # current episode
                self.env.title,     # data label used for the episode
                '%.1f' % (exploration * 100.),
                explored_rewards,
                safe_rewards,
                '%.2f' % ma_explored_total_reward,
                '%.2f' % ma_safe_total_reward,
                # '%.2f' % profit,
            ]]
            show_step(data=data, header=header)

        # if n % save_per_episode == 0:
        if last_reward > self._best_result:
            print('{} saving results...'.format(n))
            self.agent.save(fld_model)
            self._best_result = last_reward

        if n % chart_per_episode == 0:
            save_path = os.path.join(episodes_path, 'episode_{:04d}'.format(n))
            show_episode_chart(
                episode=n,
                safe_actions=safe_actions,
                safe_rewards=safe_cum_rewards,
                explored_rewards=explored_cum_rewards,
                exploration=exploration,
                extra=extra,
                save_path=save_path,
            )

    # Summary chart over the whole training run
    save_path = os.path.join(episodes_path, 'summary')
    show_episodes_chart(
        n_episodes=n_episode,
        safe_total_rewards=safe_total_rewards,
        ma_safe_total_rewards=ma_safe_total_rewards,
        explored_total_rewards=explored_total_rewards,
        ma_explored_total_rewards=ma_explored_total_rewards,
        explorations=explorations,
        safe_total_actions=safe_total_actions,
        ma_window=self.ma_window,
        save_path=save_path,
    )
def test(
        self,
        n_episode,
        *,
        save_per_episode=10,
        subfld='testing',
        verbose=True,
):
    """
    Test on `n_episode` episodes with exploration disabled, using only the trained model.
    """
    fld_save = os.path.join(self.fld_save, subfld)
    makedirs(fld_save)

    safe_total_rewards = []
    path_record = os.path.join(fld_save, 'record.csv')

    with open(path_record, 'w') as f:
        f.write('episode,game,safe_reward,MA_safe\n')

    for n in range(n_episode):
        print('{}/{} testing...'.format(n, n_episode))

        save_all_episodes = False
        if n == 0:
            save_all_episodes = True

        safe_cum_rewards, safe_actions, _, extra = self.play_one_episode(
            0,
            training=False,
            rand_price=True,
            verbose=save_all_episodes,
        )
        last_reward = safe_cum_rewards[-1]
        safe_total_rewards.append(last_reward)
        # Moving median over the total rewards of the episodes seen so far
        MA_safe_total_rewards = np.median(
            safe_total_rewards[-self.ma_window:])

        ss = [
            str(n),                            # number of episode
            self.env.title.replace(',', ';'),
            '%.1f' % (safe_cum_rewards[-1]),   # pnl, safe cumulative rewards
            '%.1f' % MA_safe_total_rewards,    # moving average of safe total rewards
        ]
        with open(path_record, 'a') as f:
            f.write(','.join(ss) + '\n')

        if verbose:
            header = [
                '# (testing)',
                'Data used',
                '[S] reward',
                'MA [S] reward',
            ]
            safe_reward = '%.2f' % last_reward
            if last_reward > 0:
                safe_reward = click.style(safe_reward, fg='green')
            data = [[
                n,                 # current episode
                self.env.title,    # data label used for the episode
                safe_reward,
                '%.2f' % MA_safe_total_rewards,
            ]]
            print()
            show_step(data=data, header=header)

        if n % save_per_episode == 0:
            pass
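# Hypothetical usage sketch (an assumption for illustration, not part of the
# original source): the excerpt only shows that `play_one_episode`, `train` and
# `test` live on an object exposing `self.env`, `self.agent`, `self.fld_save`,
# `self.ma_window` and `self._best_result`; the class and constructor names
# below are invented placeholders.
#
#     sandbox = Sandbox(agent, env, fld_save='results/run_001')   # assumed constructor
#     sandbox.train(200, exploration_init=1.0, chart_per_episode=20)
#     sandbox.test(50, subfld='in_sample_testing')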