def get_sentence_pairs(self):

    for pairs in prun_unordered(self.get_sentence_lookup(), initializer=SentenceLookup.initialize,
                                initargs=(self._ned_sql_file, self._sentence_subset),
                                processes=self._pairing_processes):
        if WikipediaDataset.quit:
            break

        if pairs is None:
            self._lookup_sem.release()
            continue

        for idx, row in pairs.iterrows():
            if WikipediaDataset.quit:
                break

            yield row.id_a, row.id_b, \
                json.loads(row.sen_a), json.loads(row.sen_b), \
                row.pos_a, row.pos_b, \
                row.end_a, row.end_b, \
                row.label

        self._lookup_sem.release()

        del pairs
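# Illustrative sketch (not part of the pipeline): one way a caller might consume
# get_sentence_pairs(). The helper name is hypothetical, `dataset` is assumed to be
# an already constructed WikipediaDataset, and the field meanings in the comments
# are inferred from the yield statement above.
def _sketch_consume_sentence_pairs(dataset):
    for id_a, id_b, sen_a, sen_b, pos_a, pos_b, end_a, end_b, label in dataset.get_sentence_pairs():
        # sen_a/sen_b are the JSON-decoded sentences, pos_*/end_* delimit the
        # entity mention within each sentence, and label is the pairing target.
        print(id_a, id_b, label)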
def get_sentence_lookup(self):

    for entity_title, ranking in \
            prun_unordered(self.get_random_lookup(), initializer=LookUpBySurface.initialize,
                           initargs=(self._entities_file, self._embeddings, self._n_trees,
                                     self._distance_measure, self._entity_index_path,
                                     self._search_k, self._max_dist),
                           processes=self._lookup_processes):
        if WikipediaDataset.quit:
            break

        ranking['rank'] = ranking.index

        # Split the ranking into correct hits (good) and wrong candidates (bad).
        good = ranking.loc[ranking.guessed_title == entity_title].copy()
        bad = ranking.loc[ranking.guessed_title != entity_title].copy()

        if len(good) == 0:
            # There aren't any hits ... skip.
            logger.debug('0')
            self._lookup_sem.release()
            continue

        # We want to have at least bad_count bad examples but also at most max_bad_count examples.
        nbad = max(self._bad_count, min(self._max_bad_count, good['rank'].min()))
        bad = bad.iloc[0:nbad]

        yield SentenceLookup(good, bad)

        del good
        del bad
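# Illustrative sketch (hypothetical helper, not used above): the cap applied to the
# number of negative examples in get_sentence_lookup(). At least `bad_count`
# negatives are kept, at most `max_bad_count`, and otherwise as many as the rank of
# the best correct hit.
def _sketch_num_negatives(bad_count, max_bad_count, best_correct_rank):
    return max(bad_count, min(max_bad_count, best_correct_rank))

# For example, _sketch_num_negatives(10, 50, 3) == 10, while
# _sketch_num_negatives(10, 50, 200) == 50.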
def get_features(self):

    for features in prun_unordered(self.get_feature_tasks(), initializer=ConvertSamples2Features.initialize,
                                   initargs=(self._tokenizer, self._max_seq_length), processes=10):
        self._convert_sem.release()

        if features is None:
            continue

        yield features

        del features
def get_sentence_lookup(self):

    for job_id, sentences, (entity_id, candidates) in \
            prun_unordered(self.get_lookup(), initializer=LookUpByEmbeddings.initialize,
                           initargs=(self._entities_file, self._entity_types, self._n_trees,
                                     self._distance_measure, self._entity_index_path,
                                     self._ned_sql_file),
                           processes=self._lookup_processes):

        if len(candidates) == 0:
            # No candidates found: enqueue the entity without a candidate.
            self._queue_sentences.add_to_job(job_id, (sentences, entity_id, None))
        else:
            # Enqueue one task per candidate.
            for idx in range(0, len(candidates)):
                self._queue_sentences.add_to_job(
                    job_id, (sentences, entity_id, candidates.iloc[[idx]]))

        # Marker task with no entity and no candidate.
        self._queue_sentences.add_to_job(job_id, (sentences, None, None))

        while True:
            job_id, task_info, iter_quit = self._queue_sentences.get_next_task()

            if iter_quit:
                return

            if task_info is None:
                break

            sentences, entity_id, candidates, params = task_info

            if self._verbose:
                print("get_sentence_lookup: {}:{}".format(job_id, entity_id))

            yield SentenceLookupWrapper(job_id, entity_id, sentences=sentences, candidates=candidates,
                                        max_pairs=self._max_pairs, **params)
def get_lookup(self):

    for job_id, entity_id, ent_type, sentences, (_, embedded, embedding_config) in \
            prun_unordered(self.get_embed(), initializer=EmbedTask.initialize,
                           initargs=(self._embeddings,), processes=self._embed_processes):

        self._queue_lookup.add_to_job(job_id, (entity_id, ent_type, sentences))

        while True:
            job_id, task_info, iter_quit = self._queue_lookup.get_next_task()

            if iter_quit:
                return

            if task_info is None:
                break

            entity_id, ent_type, sentences, params = task_info

            if self._verbose:
                print("get_lookup: {}:{}({})".format(job_id, entity_id, params))

            # return all the candidates - filtering is done below
            yield LookUpByEmbeddingWrapper(
                job_id, entity_id, sentences, page_title=entity_id, entity_embeddings=embedded,
                embedding_config=embedding_config, entity_title=entity_id, entity_type=ent_type,
                split_parts=self._split_parts, **params)
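# Note on the producer/consumer chain above (inferred from these generators):
# get_embed() feeds get_lookup(), whose LookUpByEmbeddingWrapper results in turn feed
# get_sentence_lookup(). Both queue-based stages call get_next_task(), which returns
# (job_id, task_info, iter_quit): iter_quit of True ends the generator, while a
# task_info of None breaks back to the outer loop to pull more input.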