def preProcess():
    print('PreProcess Reuters Corpus')
    start_time = time.time()
    docs = 0
    bad = 0
    tokenizer = Tokenizer()
    if not os.path.isdir(Paths.base):
        os.makedirs(Paths.base)
    with open(Paths.text_index, 'w') as fileid_out:
        with codecs.open(Paths.texts_clean, 'w', 'utf-8-sig') as out:
            # `test` is only written by the commented-out train/test split below
            with codecs.open(Paths.reuter_test, 'w', 'utf-8-sig') as test:
                for f in reuters.fileids():
                    contents = reuters.open(f).read()
                    try:
                        tokens = tokenizer.tokenize(contents)
                        docs += 1
                        if docs % 1000 == 0:
                            print("Normalised %d documents" % docs)
                        out.write(' '.join(tokens) + "\n")
                        # if f.startswith("train"):
                        #
                        # else:
                        #     test.write(' '.join(tokens) + "\n")
                        fileid_out.write(f + "\n")
                    except UnicodeDecodeError:
                        bad += 1
    print("Normalised %d documents" % docs)
    print("Skipped %d bad documents" % bad)
    print('Finished building train file ' + Paths.texts_clean)
    end_time = time.time()
    print('(Time to preprocess Reuters Corpus: %s)' % (end_time - start_time))
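
# Usage sketch, not part of the original file: assumes the NLTK Reuters corpus
# is available and that `Tokenizer` and `Paths` are this project's own helpers
# imported at the top of the module.
if __name__ == '__main__':
    import nltk
    nltk.download('reuters', quiet=True)  # no-op if the corpus is already present
    preProcess()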
def append(h_in, h_out, split):
    tkn = Tokenizer(args.tagger)
    data = h_in[split]  # one of: train, dev, test
    cur_size = len(data['product'])
    # read the four category-level id arrays into memory
    bcateid = data['bcateid'][()]
    mcateid = data['mcateid'][()]
    scateid = data['scateid'][()]
    dcateid = data['dcateid'][()]

    def get_label(i, vocab_type="bmsd"):
        b = bcateid[i]
        m = mcateid[i]
        s = scateid[i]
        d = dcateid[i]
        if split == 'train':
            if vocab_type == "bmsd":
                y = bmsd_vocab['%s>%s>%s>%s' % (b, m, s, d)]
            else:
                # a bare `raise` here would fail with no active exception
                raise ValueError('unsupported vocab_type: %s' % vocab_type)
            return y
        else:
            return -1  # dev/test labels are unknown

    h_out['img_feat'] = data['img_feat'][:]
    h_out['pid'] = data['pid'][:]
    h_out['label'] = [get_label(i, vocab_type="bmsd") for i in range(cur_size)]
    for col in columns:
        result = []
        for i in range(cur_size):
            txt = normalize(data[col][i], col_type=col)
            words = tkn.tokenize(txt)
            # stores the byte-string form of the token list
            result.append(np.string_(words))
        h_out[col] = np.array(result, dtype="S1000")
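
# Usage sketch, not part of the original file: `columns`, `bmsd_vocab`,
# `normalize`, and `args` are assumed to be module-level names defined
# elsewhere in this script, and the chunk filenames are illustrative.
if __name__ == '__main__':
    import h5py
    with h5py.File('train.chunk.01', 'r') as h_in, \
            h5py.File('train.h5', 'w') as h_out:
        append(h_in, h_out, 'train')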
class DataLoader(object):
    """Load the images and labels from the database and process into batches

    Attributes:
        data_base_dir (str): Folder with the processed images
        label_path (str): File with latex math formulas
        max_aspect_ratio (int): Maximum aspect ratio (width/height) for images
        max_encoder_l_h (int): Maximum size for the images' height
        max_encoder_l_w (int): Maximum size for the images' width
        max_decoder_l (int): Maximum number of tokens for the latex formula
    """

    def __init__(self, data_base_dir, label_path, max_aspect_ratio,
                 max_encoder_l_h, max_encoder_l_w, max_decoder_l,
                 max_vocab_size, initial_id2voc, initial_voc2id):
        # folder with processed images
        self.data_base_dir = data_base_dir
        # .lst file with formulas
        self.label_path = label_path
        self.max_width = 10000
        self.max_aspect_ratio = max_aspect_ratio
        self.max_encoder_l_h = max_encoder_l_h
        self.max_encoder_l_w = max_encoder_l_w
        self.max_decoder_l = max_decoder_l
        self.min_aspect_ratio = 0.5
        self.vocab_size = max_vocab_size
        self.tokenizer = Tokenizer(initial_id2voc, initial_voc2id)
        # buffer to save groups of batches with same width and height
        self.buffer = defaultdict(lambda: defaultdict(list))

    def process_batch(self, buf, img_width, img_height):
        """Return a batch of images with labels and take it out of the buffer

        Args:
            buf (:obj:`dict`): buffer of images grouped by width and height
            img_width (int): width of the images in the batch
            img_height (int): height of the images in the batch
        """
        # store images and targets in tensors
        batch_size = len(buf[img_width][img_height])
        images = torch.Tensor(batch_size, 1, img_height, img_width)
        img_paths = []
        max_target_length = max(
            len(buf_element[1]) for buf_element in buf[img_width][img_height])
        for k in range(batch_size):
            img_paths.append(buf[img_width][img_height][k][2])
            images[k] = torch.from_numpy(buf[img_width][img_height][k][0])
        # targets hold the decoder input; targets_eval the expected output,
        # i.e. the same sequence shifted by one token
        targets = torch.zeros(batch_size, max_target_length - 1)
        targets_eval = torch.zeros(batch_size, max_target_length - 1)
        num_nonzero = 0
        for m in range(len(buf[img_width][img_height])):
            num_nonzero = (num_nonzero
                           + len(buf[img_width][img_height][m][1]) - 2)
            for j in range(len(buf[img_width][img_height][m][1]) - 1):
                targets[m][j] = buf[img_width][img_height][m][1][j]
                targets_eval[m][j] = buf[img_width][img_height][m][1][j + 1]
        # restart buffer
        buf[img_width][img_height] = []
        return images, targets, targets_eval, num_nonzero, img_paths

    def create_data_generator(self, batch_size, directory_path):
        """Create a generator that will yield the images and labels

        Args:
            batch_size (int): size of the batch to generate
            directory_path (str): path of the file containing filenames
                of the images and formulas
        """
        image_list = read_formulas_directory(directory_path)

        for i in range(len(image_list)):
            # Get the image path and read the image
            img_path = image_list[i][0]
            img = imageio.imread("../data/images_processed/" + img_path)
            # Convert color image to grayscale
            # (the shape of the image object changes from (h,w,3) to (h,w))
            rgb2gray_weights = [0.299, 0.587, 0.114]
            img = np.average(img, weights=rgb2gray_weights, axis=2)
            # Get the formula and tokenize it
            label_str = image_list[i][1]
            label_list = self.tokenizer.tokenize(self.label_path, label_str)
            origH = img.shape[0]
            origW = img.shape[1]

            # if the list of tokens is too long, truncate it
            if len(label_list) > self.max_decoder_l:
                label_list = label_list[:self.max_decoder_l]

            bounds_check = (len(label_list), math.floor(origH / 8.0),
                            math.floor(origW / 8.0))
            bounds_tuple = (self.max_decoder_l, self.max_encoder_l_h,
                            self.max_encoder_l_w)
            # compare element-wise: every dimension must be within bounds
            # (a plain `<=` on the tuples would compare lexicographically)
            if all(c <= b for c, b in zip(bounds_check, bounds_tuple)):
                # get the aspect ratio and clamp it between the
                # min and max aspect ratios defined
                aspect_ratio = origW / origH
                aspect_ratio = min(aspect_ratio, self.max_aspect_ratio)
                aspect_ratio = max(aspect_ratio, self.min_aspect_ratio)
                imgW = origW
                imgH = origH
                self.buffer[imgW][imgH].append([img, label_list, img_path])
                # when a buffer bucket reaches batch_size,
                # return images and targets as tensors
                if len(self.buffer[imgW][imgH]) == batch_size:
                    images, targets, targets_eval, num_nonzero, img_paths = (
                        self.process_batch(self.buffer, imgW, imgH))
                    yield images, targets, targets_eval, num_nonzero, img_paths
            # when we have gone through all the lines,
            # return the incomplete batches stored in the buffer
            if i == len(image_list) - 1:
                for imgW in self.buffer:
                    for imgH in self.buffer[imgW]:
                        if len(self.buffer[imgW][imgH]) > 0:
                            images, targets, targets_eval, num_nonzero, img_paths = (
                                self.process_batch(self.buffer, imgW, imgH))
                            yield images, targets, targets_eval, num_nonzero, img_paths
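
# Usage sketch, not part of the original file: the bound values, paths, and
# empty vocabularies are illustrative, and `read_formulas_directory` is
# assumed to be defined elsewhere in this module. Bucketing images by
# (width, height) lets each batch be stacked into one tensor without padding.
if __name__ == '__main__':
    loader = DataLoader(data_base_dir='../data/images_processed/',
                        label_path='../data/im2latex_formulas.lst',
                        max_aspect_ratio=8, max_encoder_l_h=20,
                        max_encoder_l_w=50, max_decoder_l=150,
                        max_vocab_size=500,
                        initial_id2voc={}, initial_voc2id={})
    gen = loader.create_data_generator(
        batch_size=16, directory_path='../data/im2latex_train.lst')
    images, targets, targets_eval, num_nonzero, img_paths = next(gen)
    print(images.shape, targets.shape)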
class ExsBuilder:
    """ExsBuilder produces a list of examples given a document set"""

    def __init__(self, bert_model='bert-base-uncased', file_emb='',
                 vocab_size=150000, min_src_nsents=1, max_src_nsents=50,
                 min_src_ntokens_per_sent=3, max_src_ntokens_per_sent=100):
        logger.info('=== Initializing an example builder'.ljust(80, '='))
        self.min_src_nsents = min_src_nsents
        self.max_src_nsents = max_src_nsents
        self.min_src_ntokens_per_sent = min_src_ntokens_per_sent
        self.max_src_ntokens_per_sent = max_src_ntokens_per_sent
        logger.debug(f'Loading BERT pre-trained model [{bert_model}]')
        self.tokB = BertTokenizer.from_pretrained(bert_model)
        self.tokC = None
        if file_emb != '':
            logger.debug('Loading the WBMET dictionary for custom tokenizer')
            self.tokC = Tokenizer(vocab_size=vocab_size)
            self.tokC.from_pretrained(file_emb)
        self.doc_lbl_freq = [0, 0]  # document-level [irrel, rel]
        self.ext_lbl_freq = [0, 0]  # token-level [irrel, rel]

    @staticmethod
    def tokenize(data, src_keys=['title', 'body'], tgt_key='text'):
        """Use Stanford CoreNLP tokenizer to tokenize all the documents."""
        REMAP = {"-LRB-": "(", "-RRB-": ")", "-LCB-": "{", "-RCB-": "}",
                 "-LSB-": "[", "-RSB-": "]", "``": '"', "''": '"'}
        with CoreNLPClient(annotators=['tokenize', 'ssplit'],
                           threads=CPU_CNT) as client:
            for did, d in tqdm(data.items()):
                text = ''
                for k in src_keys:
                    text += d[k] + ' '
                ann = client.annotate(text.strip())
                tokens = []  # list of tokenized sentences
                for sent in ann.sentence:
                    tokens.append([
                        REMAP[t.word] if t.word in REMAP else t.word.lower()
                        for t in sent.token
                    ])
                d[tgt_key] = tokens

    def encode(self, exs):
        """Convert sequences into indices and create data entries for
        model inputs"""
        rtn = []
        logger.info('Encoding examples...')
        for qid, did, rel, doc, flds, mesh, keywords in tqdm(exs):
            entry = {'qid': qid, 'did': did,
                     'src': [], 'src_sent_lens': [],
                     'tgtB': [], 'tgtB_sent_lens': [],
                     'tgtC': [], 'tgtC_sent_lens': []}
            # src
            for s in doc:  # CoreNLP tokenized sequences (list of sentences)
                if len(s) <= self.min_src_ntokens_per_sent:
                    continue
                src_str = ' '.join(s[:self.max_src_ntokens_per_sent])
                entry['src'] += self.tokB.convert_tokens_to_ids(
                    self.tokB.tokenize(src_str))
                entry['src_sent_lens'].append(len(entry['src']))
            if len(entry['src']) == 0:
                continue
            # tgt - fields
            tgt_tokens = set()  # used in identifying token-level labels
            for seq in flds:  # flds (disease, gene, demo)
                # BERT
                ids = self.tokB.convert_tokens_to_ids(self.tokB.tokenize(seq))
                tgt_tokens.update(ids)
                entry['tgtB'] += ids
                entry['tgtB_sent_lens'].append(len(entry['tgtB']))
                # BMET
                ids = self.tokC.convert_tokens_to_ids(self.tokC.tokenize(seq))
                ids = list(filter(lambda x: x > 1, ids))  # remove UNKs
                entry['tgtC'] += ids
                entry['tgtC_sent_lens'].append(len(entry['tgtC']))
            # tgt - mesh
            mesh = [f'εmesh_{t}' for t in mesh[0].lower().split()]
            ids = self.tokC.convert_tokens_to_ids(mesh)
            ids = list(filter(lambda x: x > 1, ids))  # remove UNKs
            entry['tgtC'] += ids
            entry['tgtC_sent_lens'].append(len(entry['tgtC']))
            # tgt - keywords
            seq = ' '.join(keywords)
            ids = self.tokC.convert_tokens_to_ids(self.tokC.tokenize(seq))
            ids = list(filter(lambda x: x > 1, ids))  # remove UNKs
            tgt_tokens.update(ids)
            entry['tgtC'] += ids
            entry['tgtC_sent_lens'].append(len(entry['tgtC']))
            entry['token_labels'] = \
                [1 if t in tgt_tokens else 0 for t in entry['src']]
            sum_ = sum(entry['token_labels'])
            self.ext_lbl_freq[0] += len(entry['token_labels']) - sum_
            self.ext_lbl_freq[1] += sum_
            entry['doc_label'] = 0 if rel == 0 else 1
            rtn.append(entry)
        return rtn

    def build_trec_exs(self, topics, docs):
        """For each topic and doc pair, encode them, and construct the
        example list"""
        exs = list()
        # Tokenize documents using the Stanford CoreNLP tokenizer
        logger.debug('Tokenizing %s documents using Stanford CoreNLP '
                     'Tokenizer...', len(docs))
        self.tokenize(docs)
        # Add positive examples
        for qid in topics:
            for did, rel in topics[qid]['docs']:
                if did not in docs or \
                        len(docs[did]['text']) < self.min_src_nsents:
                    continue
                d = docs[did]
                # Complete keywords: doc_keywords > doc_mesh > q_mesh
                keywords = d['keywords'] if len(d['keywords']) > 0 \
                    else d['mesh_names']
                if len(keywords) == 0 and rel > 0:
                    keywords = [topics[qid]['mesh'][1]]
                exs.append((qid, did, rel, d['text'][:self.max_src_nsents],
                            topics[qid]['fields'], topics[qid]['mesh'],
                            keywords))
                self.doc_lbl_freq[int(rel > 0)] += 1
        # Add negative examples
        neg_docs_ids = [did for did, d in docs.items() if not d['pos']]
        qids = random.choices(list(topics.keys()), k=len(neg_docs_ids))
        for i, did in enumerate(neg_docs_ids):
            # use the randomly sampled query id; indexing `topics` with the
            # stale `qid` left over from the loop above was a bug
            exs.append((qids[i], did, 0,
                        docs[did]['text'][:self.max_src_nsents],
                        topics[qids[i]]['fields'], topics[qids[i]]['mesh'],
                        []))
            self.doc_lbl_freq[0] += 1
        random.shuffle(exs)
        rtn = self.encode(exs)
        return rtn

    # todo. The following function will be changed
    def build(self, examples, docs):
        """Building examples is done in two modes: one for data preparation
        and the other for prediction.

        In data preparation,
        - `exs` are queries in TREC ref datasets
        - `docs` consists of pos and neg documents prepared by
          `read_pubmed_docs`

        In prediction,
        - `exs` only contains one query with no labels
        - `docs` are the retrieved documents from Solr search results
        """
        # Tokenize documents and build examples with doc_labels
        exs = []
        # Title and Text are multivalued ('text_general' in Solr)
        results = docs
        docs = {}
        for r in results:
            title = ' '.join(r['ArticleTitle'] if 'ArticleTitle' in r else [])
            body = ' '.join(r['AbstractText'] if 'AbstractText' in r else [])
            docs[r['id']] = (title + ' ' + body).strip()
        logger.debug(f'Tokenizing {len(docs)} retrieved docs...')
        # note: tokenize() mutates its argument in place and returns None,
        # and the example tuples below do not match encode()'s expected
        # shape; both issues are covered by the todo above
        pos_docs = self.tokenize(docs)
        # Build examples (with dummy label -1)
        qid = list(examples.keys())[0]  # There's only one anyway
        logger.info(f'Preparing examples for {qid}...')
        for did, text in pos_docs.items():
            if len(pos_docs[did]) < self.min_src_nsents:
                continue
            exs.append((qid, did, -1, pos_docs[did][:self.max_src_nsents],
                        examples[qid]['topics']))
        data = self.encode(exs)
        return data
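
# Usage sketch, not part of the original file: `topics` and `docs` follow the
# shapes consumed above with illustrative values only; a running Stanford
# CoreNLP server and the pre-trained embedding file are assumed available.
if __name__ == '__main__':
    builder = ExsBuilder(bert_model='bert-base-uncased',
                         file_emb='wbmet.vec', vocab_size=150000)
    topics = {'q1': {'docs': [('d1', 1)], 'fields': ['melanoma', 'BRAF'],
                     'mesh': ['d008545', 'Melanoma']}}
    docs = {'d1': {'title': 'A case report', 'body': 'BRAF V600E mutation.',
                   'keywords': [], 'mesh_names': ['melanoma'], 'pos': True}}
    examples = builder.build_trec_exs(topics, docs)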
class TrecTopics(TextDirectoryCorpus):

    def __init__(self, topics_path, min_depth=0, max_depth=None,
                 metadata=True, lemmatization=True, use_stop=True,
                 pattern=None, exclude_pattern=None, **kwargs):
        super(TrecTopics, self).__init__(topics_path, dictionary={},
                                         metadata=metadata,
                                         min_depth=min_depth,
                                         max_depth=max_depth,
                                         pattern=pattern,
                                         exclude_pattern=exclude_pattern,
                                         lines_are_documents=True, **kwargs)
        self.topics = {}
        self.topics_vecs = None
        self.topic_row_maps = {}
        self.oov = {}
        self.tokenizer = Tokenizer(minimum_len=TOKEN_MIN_LEN,
                                   maximum_len=TOKEN_MAX_LEN,
                                   lowercase=True,
                                   output_lemma=lemmatization,
                                   use_stopwords=use_stop,
                                   extra_stopwords=EXTRA_STOPWORDS)

    def get_texts(self):
        inside_top = False
        inside_desc = False
        inside_narr = False
        topic_no = None
        title = ""
        desc = ""
        narr = ""
        for line in self.getstream():
            if line.startswith("<top>"):
                inside_top = True
                continue
            if inside_desc:
                if line.startswith("<"):
                    inside_desc = False
                else:
                    # desc += line + linesep
                    desc += line + " "
                    continue
            if inside_narr:
                if line.startswith("<"):
                    inside_narr = False
                else:
                    # narr += line + linesep
                    narr += line + " "
                    continue
            if inside_top:
                if line.startswith("<num>"):
                    topic_no = line[line.find("Number:", len("<num>"))
                                    + len("Number:"):].strip()
                elif line.startswith("<title>"):
                    title = line[len("<title>"):].strip().replace("Topic:", "")
                elif line.startswith("<desc>"):
                    inside_desc = True
                elif line.startswith("<narr>"):
                    inside_narr = True
                elif line.startswith("</top>"):
                    inside_top = False
                    yield (int(topic_no),
                           self.tokenizer.tokenize(title),
                           self.tokenizer.tokenize(desc),
                           self.tokenizer.tokenize(narr))
                    title = ""
                    desc = ""
                    narr = ""

    def init(self):
        for topic_no, title, desc, narr in self.get_texts():
            self.topics[topic_no] = {"title": title, "desc": desc,
                                     "narr": narr}

    def get_title(self, topic_no):
        return self.topics[topic_no]["title"]

    def get_desc(self, topic_no):
        return self.topics[topic_no]["desc"]

    def get_narr(self, topic_no):
        return self.topics[topic_no]["narr"]

    def total_topics(self):
        return len(self.topics)

    def indexedize(self, vocab_dict, topic_no, term_list):
        index_list = []
        for term in term_list:
            if term in vocab_dict:
                index_list.append(vocab_dict[term].index)
            else:
                # track out-of-vocabulary terms per topic
                self.oov.setdefault(topic_no, set()).add(term)
        return index_list

    def vectorize(self, vocab_dict, include_title=True, include_desc=False,
                  include_narr=False, norm='l2'):
        # at least one of the three fields must be included (the original
        # `... or include_narr is True` bound `is` more tightly than `or`)
        assert include_title or include_desc or include_narr
        vector_length = len(vocab_dict)
        # build a CSR matrix incrementally: one row per topic
        topic_row = [0]
        index_col = []
        freq_val = []
        for topic_no, title, desc, narr in self.get_texts():
            self.topic_row_maps.setdefault(topic_no, len(self.topic_row_maps))
            indexed_word_list = []
            if include_title:
                indexed_word_list.extend(
                    self.indexedize(vocab_dict, topic_no, title))
            if include_desc:
                indexed_word_list.extend(
                    self.indexedize(vocab_dict, topic_no, desc))
            if include_narr:
                indexed_word_list.extend(
                    self.indexedize(vocab_dict, topic_no, narr))
            for index in indexed_word_list:
                index_col.append(index)
                freq_val.append(1)
            topic_row.append(len(index_col))
        self.topics_vecs = normalize(
            csr_matrix((freq_val, index_col, topic_row), dtype=int,
                       shape=(len(topic_row) - 1, vector_length)).toarray(),
            norm=norm, axis=1)
        # self.topics_vecs = csr_matrix(
        #     (freq_val, index_col, topic_row), dtype=int,
        #     shape=(len(topic_row) - 1, vector_length)).toarray()

    def get_topic_vector(self, topic_no):
        assert 51 <= topic_no <= 450  # TREC ad hoc topics 51-450
        return self.topics_vecs[self.topic_row_maps[topic_no]]
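
# Usage sketch, not part of the original file: the topics path is
# illustrative, and `vocab_dict` is assumed to map terms to objects with an
# `.index` attribute (e.g. gensim's pre-4.0 KeyedVectors.vocab).
if __name__ == '__main__':
    topics = TrecTopics('data/trec/topics/')
    topics.init()
    print(topics.total_topics(), topics.get_title(51))
    # vectorize against a word-embedding vocabulary, then fetch topic 51:
    # topics.vectorize(vocab_dict=w2v.vocab, include_title=True)
    # print(topics.get_topic_vector(51))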