def _load_docs(self):
    """Read the document metadata file and build the ``self.docs`` lookup.

    Every line of ``self.config.doc_meta_input`` is split on tabs. Column 1
    is used as the integer document id, column 4 is passed to the title
    parser and column 5 to the body parser; the first element of each
    parser result is stored in a ``self.News`` record.

    Side effects:
        self.docs: dict mapping document id to ``self.News(title, body)``.
        self.doc_count: largest document id plus one.

    Id 0 is always (re)bound to an all-zero record built with
    ``np.zeros_like`` from the record that has the largest id, so a real
    document with id 0 would be replaced (presumably id 0 is reserved as a
    padding entry -- confirm against the data).

    Raises:
        ValueError: if the metadata file contains no lines (``max`` of an
            empty dict) or an id column is not an integer.
    """
    logging.info("[+] loading docs metadata")

    parse_title = document.DocumentParser(
        document.parse_document(),
        document.pad_document(1, self.config.title_shape),
    )
    parse_body = document.DocumentParser(
        document.parse_document(),
        document.pad_document(1, self.config.body_shape),
    )

    with utils.open(self.config.doc_meta_input) as meta_file:
        rows = [raw.strip('\n').split('\t') for raw in meta_file]

    catalog = {}
    for fields in rows:
        doc_id = int(fields[1])
        title = parse_title(fields[4])[0]
        body = parse_body(fields[5])[0]
        catalog[doc_id] = self.News(title, body)
    self.docs = catalog

    highest_id = max(catalog)
    self.doc_count = highest_id + 1

    # The record with the largest id serves as the template for the zero entry.
    template = catalog[highest_id]
    catalog[0] = self.News(
        np.zeros_like(template.title),
        np.zeros_like(template.body),
    )

    logging.info("[-] loaded docs metadata")
def _load_docs(self):
    """Load document metadata and prepare vertical-classification arrays.

    Every line of ``self.config.doc_meta_input`` is split on tabs. Columns
    2 and 3 are stored as the first two fields of a ``self.News`` record
    (read back below as ``vertical`` and ``sub_vertical``), column 4 goes
    through the title parser and column 5 through the body parser.

    Side effects:
        self.docs: list of ``self.News`` records, in file order.
        self.verticals / self.sub_verticals: distinct values, in the
            iteration order of a ``set`` (not sorted).
        self.data_verticals: ``keras.utils.to_categorical`` of each
            document's position in ``self.verticals``.
        self.data_titles: ``np.stack`` of all parsed titles.
        self.train_index / self.valid_index: a random permutation of the
            document positions split at ``len(self.docs) // 10``.
    """
    logging.info("[+] loading docs metadata")
    title_parser = document.DocumentParser(
        document.parse_document(),
        document.pad_document(1, self.config.title_shape))
    body_parser = document.DocumentParser(
        document.parse_document(),
        document.pad_document(1, self.config.body_shape))

    with open(self.config.doc_meta_input) as file:
        lines = [line.strip('\n').split('\t') for line in file]

    self.docs = [
        self.News(
            line[2],
            line[3],
            title_parser(line[4])[0],
            body_parser(line[5])[0],
        )
        for line in lines
    ]

    self.verticals = list(set(news.vertical for news in self.docs))
    self.sub_verticals = list(set(news.sub_vertical for news in self.docs))

    # Map each vertical to its position once instead of calling list.index()
    # per document; entries of self.verticals are unique, so the indices are
    # exactly the ones list.index() would return.
    vertical_position = {
        vertical: position
        for position, vertical in enumerate(self.verticals)
    }
    self.data_verticals = keras.utils.to_categorical(
        np.array([vertical_position[news.vertical] for news in self.docs]))
    self.data_titles = np.stack([news.title for news in self.docs])

    data = np.arange(len(self.docs))
    np.random.shuffle(data)
    # NOTE(review): the first tenth becomes train_index and the remaining
    # nine tenths valid_index -- confirm this proportion is intended.
    split_at = len(self.docs) // 10
    self.train_index = data[:split_at]
    self.valid_index = data[split_at:]
    logging.info("[-] loaded docs metadata")
def _load_docs(self):
    """Load vertical index tables and document metadata into ``self``.

    ``self.config.vertical2idx_input`` is indexed at 0 and 1 to obtain two
    tab-separated ``name<TAB>index`` files (vertical and sub-vertical).
    Each line of ``self.config.doc_meta_input`` is split on tabs: column 1
    is the integer document id, columns 2 and 3 are looked up in the two
    tables, column 4 is the title text and column 5 the body text.

    Side effects:
        self.vert_cnt / self.subvert_cnt: one plus the number of lines read
            from the respective index file.
        self.docs: dict mapping document id to
            ``self.News(title, body, doc, [vertical_idx], [subvertical_idx])``
            where ``doc`` is the parse of title and body joined by a space.
        self.doc_count: largest document id plus one.

    Id 0 is always (re)bound to a record of zero arrays (built with
    ``np.zeros_like`` from the record with the largest id) and ``[0]``
    index lists.

    Raises:
        KeyError: if a document names a vertical or sub-vertical that is
            missing from its index table.
        ValueError: if an index-file line does not have exactly two fields,
            or the metadata file contains no lines.
    """
    logging.info("[+] loading docs metadata")

    parse_title = document.DocumentParser(
        document.parse_document(),
        document.pad_document(1, self.config.title_shape),
    )
    parse_body = document.DocumentParser(
        document.parse_document(),
        document.pad_document(1, self.config.body_shape),
    )
    parse_full = document.DocumentParser(
        document.parse_document(),
        document.pad_document(
            1, self.config.title_shape + self.config.body_shape),
    )

    def read_index_table(path):
        # Returns (name -> index string, 1 + number of lines read).
        table = {}
        size = 1
        with utils.open(path) as handle:
            for raw in handle:
                name, index = raw.strip('\n').split('\t')
                table[name] = index
                size += 1
        return table, size

    index_paths = self.config.vertical2idx_input
    vertical_table, vertical_total = read_index_table(index_paths[0])
    subvertical_table, subvertical_total = read_index_table(index_paths[1])
    self.vert_cnt = vertical_total
    self.subvert_cnt = subvertical_total

    with utils.open(self.config.doc_meta_input) as meta_file:
        rows = [raw.strip('\n').split('\t') for raw in meta_file]

    catalog = {}
    for fields in rows:
        doc_id = int(fields[1])
        title = parse_title(fields[4])[0]
        body = parse_body(fields[5])[0]
        full_text = parse_full(fields[4] + ' ' + fields[5])[0]
        vertical = [int(vertical_table[fields[2]])]
        subvertical = [int(subvertical_table[fields[3]])]
        catalog[doc_id] = self.News(
            title, body, full_text, vertical, subvertical)
    self.docs = catalog

    highest_id = max(catalog)
    self.doc_count = highest_id + 1

    # The record with the largest id serves as the template for the zero entry.
    template = catalog[highest_id]
    catalog[0] = self.News(
        np.zeros_like(template.title),
        np.zeros_like(template.body),
        np.zeros_like(template.doc),
        [0],
        [0],
    )

    logging.info("[-] loaded docs metadata")
def get_doc_parser(self):
    """Build a document parser sized to the model's ``doc_encoder`` input.

    The pad length is the last entry of ``input_shape`` on the first layer
    inside the ``doc_encoder`` layer of ``self.model``.

    Returns:
        A ``document.DocumentParser`` combining ``document.parse_document()``
        with ``document.pad_document(1, <pad length>)``.
    """
    first_layer = self.model.get_layer('doc_encoder').layers[0]
    pad_length = first_layer.input_shape[-1]
    return document.DocumentParser(
        document.parse_document(),
        document.pad_document(1, pad_length),
    )
def _load_docs(self):
    """Load parsed document titles and a normalised frequency vector.

    Every line of ``self.config.doc_meta_input`` is split on tabs (the
    line is not stripped first). Column 1 is the integer document id,
    column 2 an integer count and column 4 the text handed to the parser.

    Side effects:
        self.docs: dict mapping document id to the first element of the
            parser result for column 4.
        self.doc_count: largest document id plus one.
        self.doc_freq: array of length ``self.doc_count`` holding each
            id's count raised to the power 0.75 and divided by the total,
            so the entries sum to one; ids absent from the file stay zero.

    Raises:
        ValueError: if the metadata file contains no lines or an id/count
            column is not an integer.
    """
    logging.info("[+] loading docs metadata")

    parse_title = document.DocumentParser(
        document.parse_document(),
        document.pad_document(1, self.config.title_shape),
    )

    with utils.open(self.config.doc_meta_input) as meta_file:
        rows = [raw.split('\t') for raw in meta_file]

    parsed = {}
    for fields in rows:
        doc_id = int(fields[1])
        parsed[doc_id] = parse_title(fields[4])[0]
    self.docs = parsed
    self.doc_count = max(parsed) + 1

    self.doc_freq = np.zeros(self.doc_count)
    for fields in rows:
        doc_id = int(fields[1])
        self.doc_freq[doc_id] = int(fields[2])

    # Dampen the raw counts before normalising (presumably a smoothed
    # sampling distribution -- confirm with the consumer of doc_freq).
    self.doc_freq = self.doc_freq ** 0.75
    total = np.sum(self.doc_freq)
    self.doc_freq = self.doc_freq / total

    logging.info("[-] loaded {} docs".format(self.doc_count))