import numpy as np

# Module-level dependencies assumed to be defined elsewhere in this file:
# `tag_pattern` (compiled regex matching the tag prefix of a line),
# `filter_words` (word filter) and `word_index` (word -> int mapping).


def get_encoded_text(line: str, article_length: int = 1000, silent: bool = True):
    '''get encoded text of an article

    :param line: one line of article
    :param article_length: pad or truncate the encoding to this length;
        0 or negative disables padding
    :param silent: if False, print every word missing from word_index
    :return: list of word indexes
    '''
    mt = tag_pattern.search(line)
    assert mt
    line = line[mt.end():].strip()

    # get words
    words = filter_words(line.split(' '))

    # encoding: map each known word to its index, skip unknown words
    encoded_list = []
    for word in words:
        index = word_index.get(word)
        if index is None:
            if not silent:
                print('word %s is unknown' % word)
        else:
            encoded_list.append(index)

    # padding
    if article_length > 0:
        padding = article_length - len(encoded_list)
        if padding < 0:
            # article too long: keep a random article_length-word window
            # (+1 so the last valid window can also be chosen)
            cut = np.random.randint(0, len(encoded_list) - article_length + 1)
            encoded_list = encoded_list[cut:cut + article_length]
        else:
            # pad zeros up to article_length
            encoded_list.extend([0] * padding)

    return encoded_list
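
# Usage sketch for get_encoded_text. The `line` below is hypothetical; a real
# line must carry a tag prefix that tag_pattern matches, followed by
# space-separated words:
#
#   encoded = get_encoded_text(line, article_length=500)
#   assert len(encoded) == 500   # short articles are zero-padded,
#                                # long ones get a random 500-word window
#   raw = get_encoded_text(line, article_length=0)   # 0 disables padding entirely
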
def generator_from_file_debug(raw_path: str, batch_size: int = 10, shuffle: bool = True):
    '''generate data from file

    :param raw_path: news path
    :param batch_size: number of articles per yielded batch
    :param shuffle: shuffle the lines once after loading
    :return: yields (X, Y, articles);
        X shape like (batch_size, article_length), i.e. (batch_size, 1000)
        with the get_encoded_text defaults, Y shape like (batch_size, 8),
        articles is a list of batch_size str
    '''
    with open(raw_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    if shuffle:
        np.random.shuffle(lines)

    cur_idx = 0
    while True:
        X, Y, articles = [], [], []
        for _ in range(batch_size):
            line = lines[cur_idx]
            cur_idx += 1
            if cur_idx == len(lines):
                cur_idx = 0  # start over
            # get_tag is the tag extractor defined elsewhere in this module
            # (cf. get_tag_one_hot / get_tag_norm below)
            Y.append(get_tag(line))
            X.append(get_encoded_text(line))
            # re-join the segmented words to recover the readable article text
            mt = tag_pattern.search(line)
            articles.append(''.join(line[mt.end():].strip().split(' ')))
        yield np.array(X), np.array(Y), articles
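
# Consumption sketch, assuming a hypothetical dataset file 'news_train.txt' in
# the expected one-article-per-line format:
#
#   gen = generator_from_file_debug('news_train.txt', batch_size=32)
#   X, Y, articles = next(gen)   # X: (32, 1000) int indexes, Y: (32, 8) tags
#   print(articles[0])           # readable article text, handy for debugging
#
# Note the trailing `articles` element is for inspection only; strip the stream
# down to (X, Y) pairs before feeding it to a training API that expects two-tuples.
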
def get_tag_one_hot(line: str):
    '''
    :param line: one line of article
    :return: tag (one hot), ndarray
    '''
    mt = tag_pattern.search(line)
    assert mt
    tag = list(map(int, mt.groups()))
    # 1 at the position of the largest count, 0 elsewhere
    y = np.zeros_like(tag)
    y[np.argmax(tag)] = 1
    return y
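
# Worked example: if tag_pattern extracts the counts (1, 0, 2, 0, 0, 0, 0, 0),
# np.argmax picks index 2 and get_tag_one_hot returns
# array([0, 0, 1, 0, 0, 0, 0, 0]). Ties go to the lowest index, since np.argmax
# returns the first occurrence of the maximum.
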
def get_tag_norm(line: str):
    '''
    :param line: one line of article
    :return: tag (normalized), ndarray
    '''
    mt = tag_pattern.search(line)
    assert mt
    tag = list(map(int, mt.groups()))
    # normalize the counts into a distribution
    # (assumes at least one count is nonzero)
    sum_up = sum(tag)
    return np.array([x / sum_up for x in tag])
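
# Worked example: the same counts (1, 0, 2, 0, 0, 0, 0, 0) sum to 3, so
# get_tag_norm returns array([1/3, 0, 2/3, 0, 0, 0, 0, 0]): a soft label
# distribution instead of a hard one-hot class.
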
def get_article_max_len(file: str):
    '''find the length (in words) of the longest article in a dataset

    :param file: filename of dataset
    :return: int, max_len
    '''
    max_len = 0
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            mt = tag_pattern.search(line)
            assert mt
            words = line[mt.end():].strip().split(' ')
            max_len = max(max_len, len(words))
    return max_len
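
# Usage sketch (hypothetical path): use the corpus statistic to pick a padding
# length for get_encoded_text instead of the hard-coded default, e.g.
#
#   max_len = get_article_max_len('news_train.txt')
#   encoded = get_encoded_text(line, article_length=max_len)
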