Example #1
    def test_model(self):
        size = 10

        # Current directory
        dir_path = os.path.dirname(os.path.realpath(__file__))

        corpus = LineNewsCorpus(input=os.path.join(dir_path, "..", "data",
                                                   "genuine"),
                                language="en")

        temp_corpus_file = tempfile.NamedTemporaryFile(delete=False)

        # Serialize pre-processed corpus to temp file
        LineCorpus.serialize(temp_corpus_file, corpus, corpus.dictionary)

        loaded_corpus = LineCorpus(temp_corpus_file.name)

        model = Doc2vec(loaded_corpus, size, min_count=1)
        it = iter(loaded_corpus)
        vector = model[next(it)]

        # Remove temp file
        os.remove(temp_corpus_file.name)

        assert isinstance(vector, np.ndarray)
        assert size == len(vector)
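If an assertion above fails, the manual os.remove call is skipped and the temp file leaks. A minimal sketch of the same round trip with try/finally cleanup, using only the classes and arguments shown above (the method name test_model_cleanup is illustrative):

    def test_model_cleanup(self):
        size = 10
        dir_path = os.path.dirname(os.path.realpath(__file__))
        corpus = LineNewsCorpus(input=os.path.join(dir_path, "..", "data",
                                                   "genuine"),
                                language="en")
        temp_corpus_file = tempfile.NamedTemporaryFile(delete=False)
        try:
            LineCorpus.serialize(temp_corpus_file, corpus, corpus.dictionary)
            loaded_corpus = LineCorpus(temp_corpus_file.name)
            model = Doc2vec(loaded_corpus, size, min_count=1)
            vector = model[next(iter(loaded_corpus))]
            assert isinstance(vector, np.ndarray)
            assert size == len(vector)
        finally:
            # Cleanup runs even when an assertion above fails
            os.remove(temp_corpus_file.name)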
Example #2
    def test_serialize_with_metadata(self):
        file = tempfile.mktemp()

        LineCorpus.serialize(file,
                             self.CORPUS_WITH_META,
                             self.dictionary,
                             metadata=True)
        corpus = LineCorpus(file)

        assert np.array_equal(self.CORPUS_TEXTS, list(corpus))
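Example #8 below suggests that metadata=True also writes a pickled side file next to the corpus (suffix ".metadata.cpickle"). A sketch of reading that side file back, assuming gensim's unpickle helper is the counterpart the project uses:

        from gensim.utils import unpickle

        file = tempfile.mktemp()
        LineCorpus.serialize(file,
                             self.CORPUS_WITH_META,
                             self.dictionary,
                             metadata=True)
        # Side-file suffix taken from Example #8, not guessed
        metadata = unpickle(file + ".metadata.cpickle")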
Example #3
    def _get_group_corpus(self, group_name, group_path):
        """Return Line corpus for a group."""
        temp_corpus_file = os.path.join(self.temp_directory,
                                        group_name + '.line')
        # Check if we have already pre-processed the corpus
        if not os.path.exists(temp_corpus_file):
            corpus = LineNewsCorpus(input=group_path,
                                    metadata=True,
                                    language=self.language,
                                    dictionary=self.dictionary)

            # Serialize pre-processed corpus to temp file
            LineCorpus.serialize(temp_corpus_file,
                                 corpus,
                                 self.dictionary,
                                 metadata=True)

        # Load corpus from temp file
        return MetaLineCorpusWrapper(temp_corpus_file)
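A hypothetical caller for this helper, assuming one subdirectory per group under some groups_root path (the directory layout and the loop are illustrative, not taken from the project):

        groups_root = "/path/to/groups"  # hypothetical: one subfolder per group
        for group_name in sorted(os.listdir(groups_root)):
            group_path = os.path.join(groups_root, group_name)
            if not os.path.isdir(group_path):
                continue
            # First call pre-processes and serializes; repeat calls reuse the .line file
            corpus = self._get_group_corpus(group_name, group_path)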
Example #4
    def test_serialize_load(self):
        # Current directory
        dir_path = os.path.dirname(os.path.realpath(__file__))

        corpus = LineNewsCorpus(input=os.path.join(dir_path, "..", "data",
                                                   "genuine"),
                                language="en")

        temp_corpus_file = tempfile.NamedTemporaryFile(delete=False)

        # Serialize pre-processed corpus to temp file
        LineCorpus.serialize(temp_corpus_file, corpus, corpus.dictionary)

        loaded_corpus = LineCorpus(temp_corpus_file.name)
        docs = list(loaded_corpus)

        # Remove temp file
        os.remove(temp_corpus_file.name)

        np.testing.assert_array_equal(
            [['human', 'human', 'steal', 'job'],
             ['human', 'human', 'steal', 'dog', 'cat']], docs)
Example #5
def train(model_type: Model, dimension: int):
    if not os.path.exists(dictionary_file):
        logging.warning("Missing dictionary file '%s'" % dictionary_file)
        return

    if not os.path.exists(bow_corpus_file):
        logging.warning("Missing BoW corpus file '%s'" % bow_corpus_file)
        return

    if not os.path.exists(low_corpus_file):
        logging.warning("Missing LoW corpus file '%s'" % low_corpus_file)
        return

    dictionary = Dictionary.load(dictionary_file)
    training_corpus = MmCorpus(bow_corpus_file) if model_type in [
        Model.LSA, Model.LDA
    ] else LineCorpus(low_corpus_file)

    logging.info("Training model %s with dimension %d..." %
                 (model_type, dimension))
    if model_type == Model.LSA:
        model_file = os.path.join(dir_path, 'model_%d.lsa' % dimension)

        model = Lsa(dictionary, corpus=training_corpus, size=dimension)
    elif model_type == Model.LDA:
        model_file = os.path.join(dir_path, 'model_%d.lda' % dimension)

        model = Lda(dictionary, corpus=training_corpus, size=dimension)
    elif model_type == Model.doc2vec:
        model_file = os.path.join(dir_path, 'model_%d.d2v' % dimension)

        model = Doc2vec(corpus=training_corpus, size=dimension)
    else:
        logging.error("Unknown model type '%s'" % model_type)
        return

    logging.info("Saving trained model %s with dimension %d..." %
                 (model_type, dimension))
    model.save(model_file)
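A minimal driver for train(), assuming only the Model members already referenced above (the dimensions are illustrative):

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    # train() logs a warning and returns early if any input file is missing
    for dimension in (100, 300):
        for model_type in (Model.LSA, Model.LDA, Model.doc2vec):
            train(model_type, dimension)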
Example #6
    def test_serialize(self):
        file = tempfile.mktemp()
        LineCorpus.serialize(file, self.CORPUS, self.dictionary)
        corpus = LineCorpus(file)

        assert np.array_equal(self.CORPUS_TEXTS, list(corpus))
Example #7
            model = Lsa(dictionary,
                        size=size,
                        lsa_filename=model_file,
                        tfidf_filename=tfidf_file)
        elif model_type == Model.LDA:
            model_file = os.path.join(training_dir, 'model_%d.lda' % size)

            model = Lda(dictionary, size=size, lda_filename=model_file)
        else:
            logging.error("Unknown model type '%s'" % model_type)
            sys.exit(1)
    else:
        # Load LoW corpus and dictionary from files
        dictionary = Dictionary.load(dictionary_file)
        training_corpus = LineCorpus(training_low_corpus_file)

        model_file = os.path.join(training_dir, 'model_%d.d2v' % size)

        model = Doc2vec(size=size, d2v_filename=model_file)

    # Load test corpora
    sep_t = None

    if model_type == Model.doc2vec:
        # Heldout LoW corpora
        corpora = FolderAggregatedLineNewsCorpora(heldout_dir,
                                                  temp_dir,
                                                  dictionary,
                                                  language=language)
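The snippet is cut off here. One guess at how the aggregated heldout corpora might be consumed, assuming the aggregator yields (group_name, corpus) pairs; this iteration protocol is inferred from the class name only and is not confirmed by the source:

        # ASSUMPTION: the (group_name, corpus) iteration protocol is guessed
        for group_name, group_corpus in corpora:
            # model[doc] returns an np.ndarray, as demonstrated in Example #1
            vectors = [model[doc] for doc in group_corpus]
            logging.info("Group '%s': %d heldout vectors" %
                         (group_name, len(vectors)))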
Example #8
    def __init__(self, filename):
        self.corpus = LineCorpus(filename)
        self.metadata = unpickle(filename + ".metadata.cpickle")
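A hypothetical use of this wrapper on a corpus serialized with metadata=True (the file name is illustrative):

wrapper = MetaLineCorpusWrapper("some_group.line")  # hypothetical file name
docs = list(wrapper.corpus)  # token lists, as in the earlier examples
meta = wrapper.metadata      # contents of the unpickled side file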
                                    dictionary=dictionary,
                                    language=args.lang)

    dictionary = training_corpus.dictionary
    if args.filter:
        logging.info("Filtering dictionary...")
        # https://onlinelibrary.wiley.com/doi/epdf/10.1111/j.1756-8765.2010.01108.x
        dictionary.filter_extremes(no_below=10, no_above=0.5, keep_n=2000000)
        dictionary.compactify()

    # Serialize pre-processed BoW corpus and dictionary to files
    logging.info("Saving dictionary to '%s'" % dictionary_file)
    dictionary.save(dictionary_file)
    logging.info("Dictionary size: %d MB" %
                 (os.path.getsize(dictionary_file) >> 20))

    logging.info("Saving BoW corpus to '%s'" % bow_corpus_file)
    MmCorpus.serialize(bow_corpus_file, training_corpus, id2word=dictionary)
    logging.info("BoW corpus size: %d MB" %
                 (os.path.getsize(bow_corpus_file) >> 20))

    # Create LoW corpus
    logging.info("Creating LoW corpus...")
    training_corpus = LineNewsCorpus(input=args.dirs,
                                     dictionary=dictionary,
                                     language=args.lang)
    logging.info("Saving LoW corpus to '%s'" % low_corpus_file)
    LineCorpus.serialize(low_corpus_file, training_corpus, dictionary)
    logging.info("LoW corpus size: %d MB" %
                 (os.path.getsize(low_corpus_file) >> 20))
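Once this script has run, the train() function from Example #5 consumes its outputs; the corresponding loads in isolation, using the same file-name variables:

dictionary = Dictionary.load(dictionary_file)
bow_corpus = MmCorpus(bow_corpus_file)    # BoW stream, used by LSA/LDA
low_corpus = LineCorpus(low_corpus_file)  # token-list stream, used by doc2vec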