Example #1
    def get_sentence_pairs(self):

        for pairs in prun_unordered(self.get_sentence_lookup(), initializer=SentenceLookup.initialize,
                                    initargs=(self._ned_sql_file, self._sentence_subset),
                                    processes=self._pairing_processes):

            if WikipediaDataset.quit:
                break

            if pairs is None:
                self._lookup_sem.release()
                continue

            for idx, row in pairs.iterrows():

                if WikipediaDataset.quit:
                    break

                yield (row.id_a, row.id_b,
                       json.loads(row.sen_a), json.loads(row.sen_b),
                       row.pos_a, row.pos_b,
                       row.end_a, row.end_b,
                       row.label)

            self._lookup_sem.release()
            del pairs
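
The worker results here are pandas DataFrames of sentence pairs whose sen_a/sen_b columns hold JSON-encoded token lists; the inner loop flattens each row into a plain tuple. A toy illustration of just that row-unpacking step (the DataFrame contents below are invented):

    import json

    import pandas as pd

    pairs = pd.DataFrame({
        'id_a': [1], 'id_b': [2],
        'sen_a': [json.dumps(['Hello', 'world'])],
        'sen_b': [json.dumps(['Bonjour', 'monde'])],
        'pos_a': [0], 'pos_b': [0],
        'end_a': [1], 'end_b': [1],
        'label': [1],
    })

    for idx, row in pairs.iterrows():
        print(row.id_a, row.id_b,
              json.loads(row.sen_a), json.loads(row.sen_b),
              row.pos_a, row.pos_b, row.end_a, row.end_b, row.label)
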
Example #2
    def get_sentence_lookup(self):

        for entity_title, ranking in \
                prun_unordered(self.get_random_lookup(), initializer=LookUpBySurface.initialize,
                               initargs=(self._entities_file, self._embeddings, self._n_trees, self._distance_measure,
                                         self._entity_index_path, self._search_k, self._max_dist),
                               processes=self._lookup_processes):

            if WikipediaDataset.quit:
                break

            ranking['rank'] = ranking.index
            good = ranking.loc[ranking.guessed_title == entity_title].copy()
            bad = ranking.loc[ranking.guessed_title != entity_title].copy()

            if len(good) == 0:  # There aren't any hits ... skip.
                logger.debug('0')
                self._lookup_sem.release()
                continue

            # Keep at least self._bad_count but at most self._max_bad_count bad
            # examples; within those bounds, keep as many as the rank of the first hit.
            nbad = max(self._bad_count, min(self._max_bad_count, good['rank'].min()))

            bad = bad.iloc[0:nbad]

            yield SentenceLookup(good, bad)

            del good
            del bad
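
The split into good (correct guesses) and bad (wrong guesses) and the nbad clamp are easy to check in isolation: good['rank'].min() is the rank of the first correct hit, and the number of negatives kept is that rank clamped to [self._bad_count, self._max_bad_count]. A toy run of the same logic (all values invented):

    import pandas as pd

    ranking = pd.DataFrame({'guessed_title': ['A', 'B', 'Paris', 'C', 'D']})
    ranking['rank'] = ranking.index

    entity_title = 'Paris'
    good = ranking.loc[ranking.guessed_title == entity_title].copy()
    bad = ranking.loc[ranking.guessed_title != entity_title].copy()

    bad_count, max_bad_count = 1, 3
    nbad = max(bad_count, min(max_bad_count, good['rank'].min()))
    print(nbad)              # -> 2: rank of the first correct hit
    print(bad.iloc[0:nbad])  # the two top-ranked wrong guesses
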
Example #3
    def get_features(self):

        for features in prun_unordered(self.get_feature_tasks(), initializer=ConvertSamples2Features.initialize,
                                       initargs=(self._tokenizer, self._max_seq_length), processes=10):

            self._convert_sem.release()

            if features is None:
                continue

            yield features

            del features
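
The self._convert_sem.release() call is the consumer half of a back-pressure scheme: the task generator presumably acquires the semaphore before submitting work, so only a bounded number of feature batches are in flight at once. A self-contained sketch of that pattern (all names below are illustrative, not from the source):

    import threading
    from multiprocessing import Pool

    MAX_IN_FLIGHT = 4
    sem = threading.Semaphore(MAX_IN_FLIGHT)

    def make_tasks():
        # Producer half: block while MAX_IN_FLIGHT results are unprocessed.
        for i in range(20):
            sem.acquire()
            yield i

    def work(x):
        return x * x

    if __name__ == '__main__':
        with Pool(processes=2) as pool:
            for result in pool.imap_unordered(work, make_tasks()):
                print(result)
                sem.release()  # Consumer half: free one slot per handled result.
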
Example #4
    def get_sentence_lookup(self):

        for job_id, sentences, (entity_id, candidates) in \
                prun_unordered(self.get_lookup(), initializer=LookUpByEmbeddings.initialize,
                               initargs=(self._entities_file, self._entity_types, self._n_trees, self._distance_measure,
                                         self._entity_index_path, self._ned_sql_file),
                               processes=self._lookup_processes):

            if len(candidates) == 0:
                self._queue_sentences.add_to_job(job_id,
                                                 (sentences, entity_id, None))
            else:
                for idx in range(len(candidates)):
                    self._queue_sentences.add_to_job(
                        job_id, (sentences, entity_id, candidates.iloc[[idx]]))

            self._queue_sentences.add_to_job(job_id, (sentences, None, None))

            while True:
                job_id, task_info, iter_quit = \
                    self._queue_sentences.get_next_task()

                if iter_quit:
                    return

                if task_info is None:
                    break

                sentences, entity_id, candidates, params = task_info

                if self._verbose:
                    print("get_sentence_lookup: {}:{}".format(
                        job_id, entity_id))

                yield SentenceLookupWrapper(job_id,
                                            entity_id,
                                            sentences=sentences,
                                            candidates=candidates,
                                            max_pairs=self._max_pairs,
                                            **params)
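
The surrounding code follows a small job-queue protocol: payloads are attached to a job via add_to_job (including a final (sentences, None, None) end-of-job marker), and get_next_task() returns (job_id, task_info, iter_quit), where task_info is None once nothing is ready and iter_quit requests a full stop. Note that the task_info handed back carries an extra params element, so the real queue evidently augments each payload. A minimal FIFO stand-in for that protocol (an assumption; the actual queue class is not shown here):

    from collections import deque

    class JobQueue:
        """FIFO stand-in for the job-queue protocol sketched above."""

        def __init__(self):
            self._tasks = deque()
            self.quit = False

        def add_to_job(self, job_id, payload):
            self._tasks.append((job_id, payload))

        def get_next_task(self):
            if self.quit:
                return None, None, True    # iter_quit: caller should return
            if not self._tasks:
                return None, None, False   # nothing ready: caller breaks out
            job_id, payload = self._tasks.popleft()
            return job_id, payload, False
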
Example #5
    def get_lookup(self):

        for job_id, entity_id, ent_type, sentences, (_, embedded, embedding_config) in \
                prun_unordered(self.get_embed(), initializer=EmbedTask.initialize, initargs=(self._embeddings,),
                               processes=self._embed_processes):

            self._queue_lookup.add_to_job(job_id,
                                          (entity_id, ent_type, sentences))

            while True:
                job_id, task_info, iter_quit = \
                    self._queue_lookup.get_next_task()

                if iter_quit:
                    return

                if task_info is None:
                    break

                entity_id, ent_type, sentences, params = task_info
                if self._verbose:
                    print("get_lookup: {}:{}({})".format(
                        job_id, entity_id, params))

                # yield all the candidates - filtering happens downstream
                yield LookUpByEmbeddingWrapper(
                    job_id,
                    entity_id,
                    sentences,
                    page_title=entity_id,
                    entity_embeddings=embedded,
                    embedding_config=embedding_config,
                    entity_title=entity_id,
                    entity_type=ent_type,
                    split_parts=self._split_parts,
                    **params)
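
Common to all five examples is prun_unordered(tasks, initializer=..., initargs=..., processes=...). Judging from the call sites, it behaves like multiprocessing.Pool.imap_unordered over callable task objects, with per-worker state set up by the class-level initialize methods. A minimal sketch under that assumption (not the project's actual implementation):

    from multiprocessing import Pool

    def _run_task(task):
        # Assumption: each task object is callable and returns its result.
        return task()

    def prun_unordered(tasks, initializer=None, initargs=(), processes=None):
        """Yield task results as workers finish them, in arbitrary order."""
        with Pool(processes=processes, initializer=initializer,
                  initargs=initargs) as pool:
            for result in pool.imap_unordered(_run_task, tasks):
                yield result
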