def sample(self):
    if self.count <= self.history_length:
        print_and_log_message(REPLAY_MEMORY_INSUFFICIENT, self.logger)
    else:
        indexes = []
        while len(indexes) < self.batch_size:
            # find a random index
            while True:
                # sample one index (ignore states wrapping over)
                index = random.randint(self.history_length, self.count - 1)
                # if it wraps over the current pointer, then get a new one
                if index >= self.current and index - self.history_length < self.current:
                    continue
                # if it wraps over an episode end, then get a new one
                # NB! the poststate (last screen) can be a terminal state!
                if self.terminals[(index - self.history_length):index].any():
                    continue
                # otherwise use this index
                break

            # NB! having index first is fastest in C-order matrices
            self.prestates[len(indexes), ...] = self.getState(index - 1)
            self.poststates[len(indexes), ...] = self.getState(index)
            indexes.append(index)

        actions = self.actions[indexes]
        rewards = self.rewards[indexes]
        terminals = self.terminals[indexes]

        return self.prestates, actions, rewards, self.poststates, terminals
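# --- Hedged sketch (standalone, not part of the class): why sample() resamples
# some indexes. The helper below and its toy values are hypothetical; it only
# restates the two rejection tests from sample() so they can be checked in
# isolation. A window [index - history_length, index) that straddles the write
# pointer `current` would mix old and freshly overwritten frames, and one that
# crosses a terminal flag would span two episodes.
def _window_is_valid(index, current, history_length, terminals):
    if index >= current and index - history_length < current:
        return False  # straddles the write pointer
    if any(terminals[index - history_length:index]):
        return False  # crosses an episode boundary
    return True

terminals = [False] * 8
assert not _window_is_valid(5, current=4, history_length=3, terminals=terminals)
assert _window_is_valid(7, current=4, history_length=3, terminals=terminals)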
def preprocess(self):
    data = pd.read_csv(self.dataset_path)
    message = 'Columns found in the dataset {}'.format(data.columns)
    print_and_log_message(message, self.logger)

    data = data.dropna()
    start_time_stamp = data['Timestamp'].iloc[0]
    timestamps = data['Timestamp'].apply(
        lambda x: (x - start_time_stamp) / 60)
    # for contiguous per-minute prices, elapsed minutes minus the row offset
    # is constant, so it can serve as a block id
    timestamps = timestamps - np.arange(timestamps.shape[0])
    data.insert(0, 'blocks', timestamps)
    blocks = data.groupby('blocks')
    message = 'Number of blocks of continuous prices found are {}'.format(
        len(blocks))
    print_and_log_message(message, self.logger)

    self._data_blocks = []
    distinct_episodes = 0
    for name, indices in blocks.indices.items():
        # the length of a block must exceed history_length + horizon by 1;
        # the extra data point is needed to normalize each price in the
        # block by the previous time stamp
        if len(indices) > (self.history_length + self.horizon + 1):
            self._data_blocks.append(blocks.get_group(name))
            # one extra data point is consumed per episode as well, so a
            # block of length L yields L - (history_length + horizon + 1) + 1
            # distinct episodes
            distinct_episodes = distinct_episodes + (
                len(indices) - (self.history_length + self.horizon))
    data = None  # release the raw frame

    message_list = [
        'Number of usable blocks obtained from the dataset are {}'.format(
            len(self._data_blocks))
    ]
    message_list.append(
        'Number of distinct episodes for the current configuration are {}'
        .format(distinct_episodes))
    print_and_log_message_list(message_list, self.logger)
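# --- Hedged sketch (standalone): the block-id trick used by preprocess(), on
# toy data. The frame below is hypothetical. For per-minute timestamps, elapsed
# minutes minus the row offset is constant within every contiguous run, so
# grouping by that value recovers the runs of continuous prices.
import numpy as np
import pandas as pd

toy = pd.DataFrame({'Timestamp': [0, 60, 120, 300, 360]})  # gap after 120
elapsed = (toy['Timestamp'] - toy['Timestamp'].iloc[0]) / 60
block_id = elapsed - np.arange(len(toy))
print(block_id.tolist())            # [0.0, 0.0, 0.0, 2.0, 2.0]
print(len(toy.groupby(block_id)))   # 2 blocks of continuous prices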
def save(self):
    message = "Saving replay memory to {}".format(self._model_dir)
    print_and_log_message(message, self.logger)
    for name, array in zip(
            [ACTIONS, REWARDS, SCREENS, TERMINALS, PRESTATES, POSTSTATES],
            [self.actions, self.rewards, self.screens, self.terminals,
             self.prestates, self.poststates]):
        save_npy(array, join(self._model_dir, name), self.logger)
    message = "Replay memory successfully saved to {}".format(self._model_dir)
    print_and_log_message(message, self.logger)
def getState(self, index):
    if self.count == 0:
        print_and_log_message(REPLAY_MEMORY_ZERO, self.logger)
    else:
        index = index % self.count
        if index >= self.history_length - 1:
            # enough preceding frames: take a contiguous slice
            return self.screens[(index - (self.history_length - 1)):(index + 1), ...]
        else:
            # otherwise wrap around the circular buffer
            indexes = [(index - i) % self.count
                       for i in reversed(range(self.history_length))]
            return self.screens[indexes, ...]
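# --- Hedged sketch (standalone): the wrap-around branch of getState() with toy
# numbers. With count=5 frames stored and history_length=4, the state ending at
# index 1 must reach back past the start of the buffer, so the indexes wrap
# modulo count instead of forming a contiguous slice.
count, history_length, index = 5, 4, 1
indexes = [(index - i) % count for i in reversed(range(history_length))]
print(indexes)  # [3, 4, 0, 1]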
def add(self, screen, reward, action, terminal):
    if screen.shape != (self.num_channels,):
        print_and_log_message(INVALID_TIMESTEP, self.logger)
    else:
        self.actions[self.current] = action
        self.rewards[self.current] = reward
        self.screens[self.current, ...] = screen
        self.terminals[self.current] = terminal
        self.count = max(self.count, self.current + 1)
        self.current = (self.current + 1) % self.memory_size
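# --- Hedged sketch (standalone): the ring-buffer bookkeeping in add(), with a
# toy memory_size. `count` saturates at the capacity while `current` keeps
# wrapping, so new entries silently overwrite the oldest ones.
memory_size, count, current = 3, 0, 0
for _ in range(5):
    count = max(count, current + 1)
    current = (current + 1) % memory_size
print(count, current)  # 3 2 -> buffer full, write pointer has wrapped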
def load_model(self):
    message = "Loading checkpoint from {}".format(self.checkpoint_dir)
    print_and_log_message(message, self.logger)
    ckpt = tf.train.get_checkpoint_state(self.checkpoint_dir)
    if ckpt and ckpt.model_checkpoint_path:
        ckpt_name = os.path.basename(ckpt.model_checkpoint_path)
        fname = os.path.join(self.checkpoint_dir, ckpt_name)
        self.saver.restore(self.sess, fname)
        message = "Checkpoint successfully loaded from {}".format(fname)
        print_and_log_message(message, self.logger)
        return True
    else:
        message = "Checkpoint could not be loaded from {}".format(
            self.checkpoint_dir)
        print_and_log_message(message, self.logger)
        return False
def set_history(self, history):
    if history.shape != self._history.shape:
        print_and_log_message(INVALID_HISTORY, self.logger)
    else:
        self._history = history
def add(self, screen):
    if screen.shape != (self.num_channels,):
        print_and_log_message(INVALID_TIMESTEP, self.logger)
    else:
        # shift the window one step and append the newest screen
        self._history[:-1] = self._history[1:]
        self._history[-1] = screen
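# --- Hedged sketch (standalone): the sliding-window update in add(), on a toy
# history of shape (history_length=3, num_channels=1).
import numpy as np

history = np.array([[1.], [2.], [3.]])
history[:-1] = history[1:]    # shift: drop the oldest row
history[-1] = np.array([4.])  # append the newest screen
print(history.ravel().tolist())  # [2.0, 3.0, 4.0]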
def save_model(self, step=None):
    message = "Saving checkpoint to {}".format(self.checkpoint_dir)
    print_and_log_message(message, self.logger)
    self.saver.save(self.sess, self.checkpoint_dir, global_step=step)
def load_npy(path, logger):
    obj = np.load(path)
    message = " [*] loaded from {}".format(path)
    print_and_log_message(message, logger)
    return obj
def save_npy(obj, path, logger):
    np.save(path, obj)
    message = " [*] saved at {}".format(path)
    print_and_log_message(message, logger)
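# --- Hedged sketch (standalone): round-tripping an array through
# save_npy/load_npy. The logger is a plain logging.Logger assumed here for
# illustration, and the path is hypothetical. Note that np.save appends '.npy'
# when the path lacks it, so save/load paths should carry the extension to
# round-trip cleanly.
import logging
import numpy as np

logger = logging.getLogger(__name__)
arr = np.arange(6).reshape(2, 3)
save_npy(arr, '/tmp/demo.npy', logger)
restored = load_npy('/tmp/demo.npy', logger)
assert (arr == restored).all()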
def train(self):
    start_step = self.step_op.eval()

    num_episodes, self.update_count, ep_reward = 0, 0, 0.
    total_reward, self.total_loss, self.total_q = 0., 0., 0.
    max_avg_ep_reward = 0
    ep_rewards, actions = [], []

    self.env.new_random_episode(self.history)

    for self.step in tqdm(
            range(start_step, self.max_step), ncols=70, initial=start_step):
        # reset the statistics once learning starts
        if self.step == self.learn_start:
            num_episodes, self.update_count, ep_reward = 0, 0, 0.
            total_reward, self.total_loss, self.total_q = 0., 0., 0.
            ep_rewards, actions = [], []

        # 1. predict
        action = self.predict(self.history.get())
        # 2. act
        screen, reward, terminal = self.env.act(action)
        # 3. observe
        self.observe(screen, reward, action, terminal)

        if terminal:
            self.env.new_random_episode(self.history)
            num_episodes += 1
            ep_rewards.append(ep_reward)
            ep_reward = 0.
        else:
            ep_reward += reward

        actions.append(action)
        total_reward += reward

        if self.step >= self.learn_start:
            if self.step % self.test_step == self.test_step - 1:
                avg_reward = total_reward / self.test_step
                avg_loss = self.total_loss / self.update_count
                avg_q = self.total_q / self.update_count

                try:
                    max_ep_reward = np.max(ep_rewards)
                    min_ep_reward = np.min(ep_rewards)
                    avg_ep_reward = np.mean(ep_rewards)
                except ValueError:
                    # no episode finished during this test interval
                    max_ep_reward, min_ep_reward, avg_ep_reward = 0, 0, 0

                message = 'avg_r: %.4f, avg_l: %.6f, avg_q: %3.6f, ' \
                    'avg_ep_r: %.4f, max_ep_r: %.4f, min_ep_r: %.4f, # episodes: %d' \
                    % (avg_reward, avg_loss, avg_q, avg_ep_reward,
                       max_ep_reward, min_ep_reward, num_episodes)
                print_and_log_message(message, self.logger)

                # checkpoint whenever the average episode reward is within
                # 90% of the best average seen so far
                if max_avg_ep_reward * 0.9 <= avg_ep_reward:
                    self.step_assign_op.eval({self.step_input: self.step + 1})
                    self.save_model(self.step + 1)
                    max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)

                if self.step > 180:
                    self.inject_summary({
                        'average.reward': avg_reward,
                        'average.loss': avg_loss,
                        'average.q': avg_q,
                        'episode.max reward': max_ep_reward,
                        'episode.min reward': min_ep_reward,
                        'episode.avg reward': avg_ep_reward,
                        'episode.num of game': num_episodes,
                        'episode.rewards': ep_rewards,
                        'episode.actions': actions,
                        'training.learning_rate': self.learning_rate_op.eval(
                            {self.learning_rate_step: self.step}),
                    }, self.step)

                num_episodes = 0
                total_reward = 0.
                self.total_loss = 0.
                self.total_q = 0.
                self.update_count = 0
                ep_reward = 0.
                ep_rewards = []
                actions = []
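# --- Hedged sketch (standalone): the checkpoint rule in train(). A checkpoint
# is saved whenever the current average episode reward reaches at least 90% of
# the best average seen so far, after which the best is updated; the reward
# sequence below is made up.
max_avg_ep_reward = 0.
for avg_ep_reward in [1.0, 0.95, 0.5, 1.2]:
    if max_avg_ep_reward * 0.9 <= avg_ep_reward:
        # train() would call save_model() here
        max_avg_ep_reward = max(max_avg_ep_reward, avg_ep_reward)
print(max_avg_ep_reward)  # 1.2 (the 0.5 interval did not trigger a save)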