Example No. 1
def load_token2id(folder: str) -> pc.Token2Id:
    """Loads the stored dictionary and rebuilds the token-to-id mapping."""
    dictionary: pd.DataFrame = smart_load(
        jj(folder, 'dictionary.zip'), feather_pipe=pu.set_index, columns='token_id'
    )
    token2id: pc.Token2Id = pc.Token2Id(
        data={t: i for (t, i) in zip(dictionary.token, dictionary.index)}
    )
    return token2id
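
A minimal usage sketch, assuming the aliases seen above (`pc` for the corpus module, `jj` for a path-join helper) and a folder that actually contains the expected `dictionary.zip`; the folder path and token below are hypothetical:

token2id = load_token2id('./data/my_model')  # hypothetical folder holding 'dictionary.zip'

# The surrounding snippets treat Token2Id as a token -> id mapping:
print(token2id['information'])  # hypothetical token; raises KeyError if absent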
Example No. 2
    def __post_init__(self):
        # Derive token2id and document_index from the corpus when possible.
        if isinstance(self.corpus, (pc.VectorizedCorpus, pc.TokenizedCorpus)):
            if not isinstance(self.token2id, pc.Token2Id):
                self.token2id = pc.Token2Id(data=self.corpus.token2id)
            self.document_index = self.corpus.document_index

        # Wrap a plain dict in a Token2Id.
        if isinstance(self.token2id, dict):
            self.token2id = pc.Token2Id(data=self.token2id)

        self.update_token_counts()

        # Caller-supplied vectorizer args override the defaults.
        self.vectorizer_args = {**DEFAULT_VECTORIZE_PARAMS, **(self.vectorizer_args or {})}
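
The last assignment relies on Python's dict-unpacking order: entries unpacked later win, so caller-supplied `vectorizer_args` override `DEFAULT_VECTORIZE_PARAMS` while the defaults fill the gaps. A self-contained sketch of that merge rule (the parameter names are made up, not penelope's actual defaults):

DEFAULTS = {'lowercase': True, 'min_df': 1}  # stand-in defaults
overrides = {'min_df': 5}

merged = {**DEFAULTS, **(overrides or {})}
assert merged == {'lowercase': True, 'min_df': 5}  # override wins, default fills the gap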
Example No. 3
def create_train_corpus() -> tm.TrainingCorpus:
    corpus: TranströmerCorpus = TranströmerCorpus()
    sparse_corpus, vocabulary = convert.TranslateCorpus().translate(corpus, id2token=None)
    tc: tm.TrainingCorpus = tm.TrainingCorpus(
        corpus=sparse_corpus,
        document_index=corpus.document_index,
        token2id=pc.Token2Id(vocabulary.token2id),
    )
    return tc
Example No. 4
def test_id2token2token2id():
    assert pc.id2token2token2id({1: 'a', 2: 'b'}) == {'a': 1, 'b': 2}
    assert pc.id2token2token2id(pc.Token2Id({1: 'a', 2: 'b'})) == {'a': 1, 'b': 2}
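
The behaviour under test is a plain mapping inversion. A minimal pure-Python equivalent (a sketch, not penelope's actual implementation) looks like this:

def invert_id2token(id2token: dict) -> dict:
    # Invert an id -> token mapping into token -> id (assumes unique values).
    return {token: token_id for token_id, token in id2token.items()}

assert invert_id2token({1: 'a', 2: 'b'}) == {'a': 1, 'b': 2}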
Example No. 5
    def load(folder: str) -> TrainingCorpus:
        """Loads a training corpus from a pickled file.

        Loads from the vectorized corpus dump if one exists.
        """
        if pc.VectorizedCorpus.dump_exists(tag='train', folder=folder):
            corpus: pc.VectorizedCorpus = pc.VectorizedCorpus.load(tag='train', folder=folder)
            return TrainingCorpus(
                corpus=corpus,
                document_index=corpus.document_index,
                token2id=pc.Token2Id(data=corpus.token2id),
                corpus_options=utility.read_json(jj(folder, CORPUS_OPTIONS_FILENAME), default={}),
                vectorizer_args=utility.read_json(jj(folder, VECTORIZER_ARGS_FILENAME), default={}),
            )

        return None
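
Because `load` returns `None` when no dump tagged 'train' exists, callers must guard against a missing corpus. A small sketch with a hypothetical folder path:

training_corpus = TrainingCorpus.load('./models/topic_model')  # hypothetical path

if training_corpus is None:
    # No VectorizedCorpus dump tagged 'train' was found in the folder.
    raise FileNotFoundError("no stored training corpus in './models/topic_model'")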
Example No. 6
    def token2id(self) -> pc.Token2Id:
        return pc.Token2Id(
            data={t: i for t, i in zip(self.vocabulary.token, self.vocabulary.token_id)}
        )
Example No. 7
    def instream_to_corpus(self, id2token: Mapping[int, str] | None) -> tm.TrainingCorpus:

        content_type: ContentType = self.resolved_prior_out_content_type()

        if self.train_corpus_folder:

            if tm.TrainingCorpus.exists(self.train_corpus_folder):
                logger.info(
                    f"using existing corpus in folder {self.train_corpus_folder} for target mode {self.target_mode}"
                )
                corpus: tm.TrainingCorpus = tm.TrainingCorpus.load(self.train_corpus_folder)
                return corpus

            tags: List[str] = pc.VectorizedCorpus.find_tags(self.train_corpus_folder)

            if len(tags) == 0:
                raise ValueError(f"no train or predict input corpus found in {self.train_corpus_folder}")

            if len(tags) > 1:
                raise ValueError(f"multiple corpora found in folder {self.train_corpus_folder}")

            logger.info(
                f"using corpus tagged {tags[0]} in folder {self.train_corpus_folder} for target mode {self.target_mode}"
            )
            vectorized_corpus: pc.VectorizedCorpus = pc.VectorizedCorpus.load(
                folder=self.train_corpus_folder, tag=tags[0]
            )
            corpus: tm.TrainingCorpus = tm.TrainingCorpus(corpus=vectorized_corpus)
            return corpus

        if content_type == ContentType.VECTORIZED_CORPUS:

            logger.info("creating sparse corpus out of input stream...")

            payload: DocumentPayload = next(self.prior.outstream())
            vectorized_corpus: pc.VectorizedCorpus = payload.content
            vectorize_opts: pc.VectorizeOpts = payload.recall('vectorize_opts')

            if id2token is not None:
                # We must consolidate the vocabularies.
                logger.info("translating vocabulary to training model's vocabulary...")

                vectorized_corpus.translate_to_vocab(id2token, inplace=True)

            corpus: tm.TrainingCorpus = tm.TrainingCorpus(
                corpus=vectorized_corpus,
                corpus_options={},
                vectorizer_args={} if vectorize_opts is None else vectorize_opts.props,
            )
            logger.info("training corpus created!")

            return corpus

        if content_type == ContentType.TOKENS:
            token2id: pc.Token2Id = (
                pc.Token2Id(pc.id2token2token2id(id2token)) if id2token is not None else self.pipeline.payload.token2id
            )
            corpus: tm.TrainingCorpus = tm.TrainingCorpus(
                corpus=self.prior.filename_content_stream(),
                document_index=self.document_index,
                token2id=token2id,
                corpus_options={},
            )
            return corpus

        raise ValueError("unable to resolve input corpus")
Example No. 8
    def token2id(self) -> pc.Token2Id:
        return pc.Token2Id(data=self.term2id)