Example #1
def get_corpus():
    text_path = datapath('ldavowpalwabbit.txt')
    dict_path = datapath('ldavowpalwabbit.dict.txt')
    dictionary = Dictionary.load_from_text(dict_path)
    with open(text_path) as fhandle:
        corpus = [dictionary.doc2bow(line.strip().split()) for line in fhandle]
    return corpus, dictionary
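Note: as a hedged follow-on sketch (not part of the original example), the corpus and dictionary returned above can be fed straight into gensim's LdaModel; the topic count is an arbitrary assumption.

from gensim.models import LdaModel

def train_lda_on_test_corpus(num_topics=10):
    # Reuses get_corpus() from the example above.
    corpus, dictionary = get_corpus()
    # Train a small LDA model on the bag-of-words corpus.
    return LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)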
Example #2
 def __init__(self, config, trained_model_path, id2word_path):
     self.model_path = trained_model_path
     self.id2word_path = id2word_path
     self.model = LdaModel.load(self.model_path)
     self.id2word = Dictionary.load_from_text(self.id2word_path)
     self.num_topics = config.num_topics
     assert self.model.num_topics == self.num_topics
Example #3
def apply_tfidf(dictionary_path, mm_corpus_path):
    dictionary = Dictionary.load_from_text(dictionary_path)
    mm = MmCorpus(mm_corpus_path)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    MmCorpus.serialize('/home/andre/Develop/corpora/lsamodel_tfidf.mm',
                       tfidf[mm],
                       progress_cnt=10000)
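Note: a hedged follow-up sketch (the path is reused from the example above and assumed to exist) showing how the serialized tf-idf corpus can be streamed back with MmCorpus.

from gensim.corpora import MmCorpus

tfidf_corpus = MmCorpus('/home/andre/Develop/corpora/lsamodel_tfidf.mm')
for doc in tfidf_corpus:
    # each doc is a list of (term_id, tf-idf weight) tuples
    print(doc[:5])
    break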
Example #4
    def __init__(self, model_prefix=None, num_best=None):
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix +
                                                    '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix +
                                           '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        self.similarity_index = Similarity.load(model_prefix +
                                                '_similarity.index',
                                                mmap='r')

        #logger.info("ESA: Preloading reverse indexes...")
        #self.similarity_index.preload_reverse_index()

        logger.info("ESA: Finished loading model files.")
Example #5
def get_wiki_dictionary():
    '''
    Return a dict mapping each token to its smoothed document frequency ratio
    (document frequency / number of documents) in the Wikipedia corpus.
    '''

    # Unpack Wiki dump
    # wiki = WikiCorpus('wikicorpus/enwiki-20201120-pages-articles-multistream1.xml-p1p41242.bz2', lemmatize=False)
    # MmCorpus.serialize("wikicorpus/wiki-corpus.mm", wiki)

    # create documents to save wiki articles
    # documents = list()
    # for i, text in enumerate(wiki.get_texts()):
    #     documents.append(text)

    # Dictionary of document frequencies
    dct = Dictionary.load_from_text("wikicorpus/wiki_dictionary")

    # the wiki corpus contains 21126 documents
    wiki_document_size = 21126

    # return dictionary
    df_dictionary = dict()

    # for each word, p(word) = document frequency / N, where N is the number of documents in the corpus
    id2token = {v: k for k, v in dct.token2id.items()}
    for token_id, document_frequency in dct.dfs.items():
        # add-one (Laplace) smoothing to handle tokens with zero occurrences in the wiki corpus
        df_dictionary[id2token[token_id]] = (document_frequency +
                                             1) / wiki_document_size

    return df_dictionary
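Note: a hedged usage sketch (the helper name and query token are hypothetical, and the wiki dictionary file from the example is assumed to be available) showing how the returned frequencies give a per-token probability, with the same add-one fallback for tokens missing from the wiki dictionary.

def token_probability(token, df_dictionary, wiki_document_size=21126):
    # Tokens unseen in the wiki dictionary fall back to the smoothed minimum.
    return df_dictionary.get(token, 1 / wiki_document_size)

wiki_df = get_wiki_dictionary()
print(token_probability("computer", wiki_df))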
Example #6
 def __init__(self,
              MODEL_PATH,
              DICT_LOCATION=DEFAULT_DICTIONARY_FILE_LOCATION):
     self.__model = LdaMulticore.load(MODEL_PATH)
     self.__id2word_dictionary = Dictionary.load_from_text(DICT_LOCATION)
     print(self.__model)
     print(self.__id2word_dictionary)
Example #7
def get_corpus():
    text_path = datapath('ldavowpalwabbit.txt')
    dict_path = datapath('ldavowpalwabbit.dict.txt')
    dictionary = Dictionary.load_from_text(dict_path)
    with open(text_path) as fhandle:
        corpus = [dictionary.doc2bow(line.strip().split()) for line in fhandle]
    return corpus, dictionary
Example #8
def main(argv=None):
    if argv is None:
        argv = sys.argv

    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000, )
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')
    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #9
    def __init__(self, dict_path, model_path):
        """Load an LSA space from a file.

        :dict_path: path to the dictionary file.
        :model_path: path to the model file.
        """
        self._dictionary = Dictionary.load_from_text(dict_path)
        self._lsi_model = LsiModel.load(model_path)
Example #10
    def __init__(self, dict_path, model_path):
        """Load an LSA space from a file.

        :dict_path: path to the dictionary file.
        :model_path: path to the model file.
        """
        self._dictionary = Dictionary.load_from_text(dict_path)
        self._lsi_model = LsiModel.load(model_path)
Example #11
def get_corpus(data, save_path_dict='extracted_data/lda_dictionary'):
    if isfile(save_path_dict):
        dictionary = Dictionary.load_from_text(save_path_dict)
        corpus = [dictionary.doc2bow(doc) for doc in data]
        return corpus
    else:
        print("Didn't find a dictionary.")
        import sys
        sys.exit(1)
Example #12
    def __init__(self, dict_file=None, model_file=None):
        if dict_file:
            self.dictionary = Dictionary.load_from_text(dict_file)
        else:
            self.dictionary = Dictionary()

        if model_file:
            self.model = joblib.load(model_file)
        else:
            self.model = None
Example #13
def main():
    global args
    taskname = args.taskname
    no_below = args.no_below
    no_above = args.no_above
    n_topic = args.n_topic
    n_cpu = cpu_count() - 2 if cpu_count() > 2 else 2
    use_tfidf = args.use_tfidf
    dist = args.dist
    model_path = args.model_path
    model_name = args.model_name
    save_dir = args.save_dir
    test_path = args.test_path

    device = torch.device('cuda')

    cwd = os.getcwd()
    tmpDir = os.path.join(cwd, 'data', taskname)
    if os.path.exists(os.path.join(tmpDir, 'corpus.mm')):
        dictionary = Dictionary.load_from_text(os.path.join(
            tmpDir, 'dict.txt'))
    else:
        raise Exception("Build corpus first")

    testSet = TestData(dictionary=dictionary,
                       txtPath=test_path,
                       no_below=no_below,
                       no_above=no_above,
                       use_tfidf=use_tfidf)
    voc_size = testSet.vocabsize

    Model = globals()[model_name]
    model = Model(bow_dim=voc_size,
                  n_topic=n_topic,
                  device=device,
                  dist=dist,
                  taskname=taskname)
    model.load_model(model_path)

    topics = model.show_topic_words(dictionary=dictionary)
    for i in range(len(topics)):
        print(i, str(topics[i]))

    infer_topics = []
    for doc in tqdm(testSet):
        if doc is None:
            infer_topics.append(None)
        else:
            infer_topics.append(
                int(
                    np.argmax(
                        model.inference(doc_tokenized=doc,
                                        dictionary=dictionary))))
    with open(save_dir + "/inference_result.txt", "w") as f:
        json.dump(infer_topics, f)
Example #14
File: Linker.py Project: PPPI/a-m
 def __load_from_disk(self, path):
     """
     Function that is used internally to load and set-up the class state
     :param path: Location from where the class internal state should be loaded
     :return: None, side-effect on the class on which this is called
     """
     # Read config,
     with open(os.path.join(path, 'config.json')) as f:
         params = jsonpickle.decode(f.read())
     self.net_size_in_days = params['net_size_in_days']
     self.min_tok_len = params['min_tok_len']
     self.undersample_multiplicity = params['undersample_multiplicity']
     self.prediction_threshold = params['prediction_threshold']
     self.use_sim_cs = params['use_sim_cs']
     self.use_sim_j = params['use_sim_j']
     self.use_sim_d = params['use_sim_d']
     self.use_social = params['use_social']
     self.use_temporal = params['use_temporal']
     self.use_file = params['use_file']
     self.use_pr_only = params['use_pr_only']
     self.use_issue_only = params['use_issue_only']
     self.predictions_between_updates = params[
         'predictions_between_updates']
     name = params['name']
     try:
         with open(os.path.join(path, name, 'repository_data.json')) as f:
             self.repository_obj = jsonpickle.decode(f.read())
         with open(os.path.join(path, name, 'truth_data.json')) as f:
             self.truth = jsonpickle.decode(f.read())
     except FileNotFoundError:
         pass
     try:
         with open(os.path.join(path, name, 'fingerprint_data.json')) as f:
             self.fingerprint = jsonpickle.decode(f.read())
     except FileNotFoundError:
         pass
     try:
         self.dictionary = Dictionary.load_from_text(
             os.path.join(path, 'tfidf', 'term2id.txt'))
         self.model = TfidfModel.load(
             os.path.join(path, 'tfidf', 'model.tfidf'))
         with open(os.path.join(path, name, 'stopwords_data.json')) as f:
             self.stopwords = jsonpickle.decode(f.read())
     except FileNotFoundError:
         pass
     try:
         self.clf = pickle.load(
             open(os.path.join(path, 'clf_model', 'model.p'), 'rb'))
     except FileNotFoundError:
         pass
     try:
         self.feature_generator = pickle.load(
             open(os.path.join(path, 'feature_generator', 'gen.p'), 'rb'))
     except FileNotFoundError:
         pass
Example #15
def makeDictionary(docList, dictFile="", add=False):
    '''
    Build a dictionary from docList; if dictFile exists and add is True,
    load that dictionary first and extend it with docList.
    '''
    if os.path.isfile(dictFile) and add:
        dictionary = Dictionary.load_from_text(dictFile)
        dictionary.add_documents(docList)
    else:
        dictionary = Dictionary(docList)
    # dictionary.save_as_text(dictFile)
    return dictionary
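Note: a hedged usage sketch of makeDictionary() above; the document lists and cache file name are made up for illustration.

docs = [["hello", "world"], ["hello", "gensim"]]
dictionary = makeDictionary(docs)                 # build a fresh dictionary
dictionary.save_as_text("cache_dictionary.txt")   # persist it as text
more_docs = [["another", "document"]]
dictionary = makeDictionary(more_docs,
                            dictFile="cache_dictionary.txt",
                            add=True)             # extend the saved dictionary
print(dictionary.token2id)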
Example #16
    def test_saveAsText_and_loadFromText(self):
        """ `Dictionary` can be saved as textfile and loaded again from textfile. """
        tmpf = get_tmpfile('dict_test.txt')
        d = Dictionary(self.texts)
        d.save_as_text(tmpf)
        # does the file exist?
        self.assertTrue(os.path.exists(tmpf))

        d_loaded = Dictionary.load_from_text(get_tmpfile('dict_test.txt'))
        self.assertNotEqual(d_loaded, None)
        self.assertEqual(d_loaded.token2id, d.token2id)
Example #17
    def test_saveAsText_and_loadFromText(self):
        """`Dictionary` can be saved as textfile and loaded again from textfile. """
        tmpf = get_tmpfile('dict_test.txt')
        for sort_by_word in [True, False]:
            d = Dictionary(self.texts)
            d.save_as_text(tmpf, sort_by_word=sort_by_word)
            self.assertTrue(os.path.exists(tmpf))

            d_loaded = Dictionary.load_from_text(tmpf)
            self.assertNotEqual(d_loaded, None)
            self.assertEqual(d_loaded.token2id, d.token2id)
Example #18
    def test_saveAsText_and_loadFromText(self):
        """ `Dictionary` can be saved as textfile and loaded again from textfile. """
        tmpf = get_tmpfile('dict_test.txt')
        d = Dictionary(self.texts)
        d.save_as_text(tmpf)
        # does the file exist?
        self.assertTrue(os.path.exists(tmpf))

        d_loaded = Dictionary.load_from_text(get_tmpfile('dict_test.txt'))
        self.assertNotEqual(d_loaded, None)
        self.assertEqual(d_loaded.token2id, d.token2id)
Example #19
    def test_saveAsText_and_loadFromText(self):
        """`Dictionary` can be saved as textfile and loaded again from textfile. """
        tmpf = get_tmpfile('dict_test.txt')
        for sort_by_word in [True, False]:
            d = Dictionary(self.texts)
            d.save_as_text(tmpf, sort_by_word=sort_by_word)
            self.assertTrue(os.path.exists(tmpf))

            d_loaded = Dictionary.load_from_text(tmpf)
            self.assertNotEqual(d_loaded, None)
            self.assertEqual(d_loaded.token2id, d.token2id)
Example #20
 def get_dict(self,path):
     path = path + '.dict'   
     if not os.path.exists(path):
         self.texts = self.get_texts()
         dct = Dictionary(self.texts)
         dct.save_as_text(path)
     else:
         dct = Dictionary.load_from_text(path)
         for path in self.inputs:
             self.id_to_path.append(os.path.basename(path))
     return dct
Example #21
 def display_data(self):
     lda = LdaMulticore.load(self.lda_model_filepath)
     trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
     trigram_dictionary = Dictionary.load_from_text(self.trigram_dictionary_filepath)
     LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                               trigram_dictionary)
     with open(self.LDAvis_data_filepath, 'w') as f:
         f.write(str(LDAvis_prepared))
         # json.dump(LDAvis_prepared.to_json(), f)
     pyLDAvis.display(LDAvis_prepared)
Example #22
    def test_loadFromText(self):
        """`Dictionary` can be loaded from textfile."""
        tmpf = get_tmpfile('load_dict_test.txt')
        num_docs_serialization = to_utf8("2\n1\tprvé\t1\n2\tslovo\t2\n")
        with open(tmpf, "wb") as file:
            file.write(num_docs_serialization)

        d = Dictionary.load_from_text(tmpf)
        self.assertEqual(d.token2id[u"prvé"], 1)
        self.assertEqual(d.token2id[u"slovo"], 2)
        self.assertEqual(d.dfs[1], 1)
        self.assertEqual(d.dfs[2], 2)
        self.assertEqual(d.num_docs, 2)
Example #23
    def test_loadFromText(self):
        """`Dictionary` can be loaded from textfile."""
        tmpf = get_tmpfile('load_dict_test.txt')
        num_docs_serialization = to_utf8("2\n1\tprvé\t1\n2\tslovo\t2\n")
        with open(tmpf, "wb") as file:
            file.write(num_docs_serialization)

        d = Dictionary.load_from_text(tmpf)
        self.assertEqual(d.token2id[u"prvé"], 1)
        self.assertEqual(d.token2id[u"slovo"], 2)
        self.assertEqual(d.dfs[1], 1)
        self.assertEqual(d.dfs[2], 2)
        self.assertEqual(d.num_docs, 2)
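Note: for reference, a hedged round-trip sketch of the text format that load_from_text parses in the tests above: an optional num_docs line, then tab-separated id, token, document-frequency rows.

from gensim.corpora import Dictionary
from gensim.test.utils import get_tmpfile

tmpf = get_tmpfile('format_demo.txt')
with open(tmpf, 'w', encoding='utf-8') as fh:
    fh.write("2\n1\tprvé\t1\n2\tslovo\t2\n")   # num_docs, then id<TAB>token<TAB>df
d = Dictionary.load_from_text(tmpf)
print(d.num_docs, d.token2id, d.dfs)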
Example #24
    def get_dictionary(self):
        tmp_fname = self.path + "lda.dictionary"

        if os.path.exists(tmp_fname):
            return Dictionary.load_from_text(tmp_fname)

        else:
            print("Creating dictionary.")
            docs_by_id = read_ap.get_processed_docs()
            docs = [doc for doc_id, doc in docs_by_id.items()]
            dictionary = Dictionary(docs)
            dictionary.save_as_text(tmp_fname)
            return dictionary
Example #25
 def load_corpus(self, corpus_name):
     ''' This is where we load the corpus files. This needs to be
     moved to a more general class initialization. (FIXME Freija)
     '''
     corpusfile = corpus_name + '.mm'
     corpusdict = corpus_name + '_wordids.txt'
     lsimodel = corpus_name + '.lsi_model'
     lsiindex = corpus_name + '-lsi.index'
     self.corpus_name = corpus_name
     self.corpus_mm = MmCorpus(corpusfile)
     self.corpus_dict = Dictionary.load_from_text(corpusdict)
     self.model = LsiModel.load(lsimodel)
     self.index = similarities.MatrixSimilarity.load(lsiindex)
Example #26
 def load_corpus(self, corpus_name):
     ''' This is where we load the corpus files. This needs to be
     moved to a more general class initialization. (FIXME Freija)
     '''
     corpusfile = corpus_name + '.mm'
     corpusdict = corpus_name + '_wordids.txt'
     lsimodel = corpus_name + '.lsi_model'
     lsiindex = corpus_name + '-lsi.index'
     self.corpus_name = corpus_name
     self.corpus_mm = MmCorpus(corpusfile)
     self.corpus_dict = Dictionary.load_from_text(corpusdict)
     self.model = LsiModel.load(lsimodel)
     self.index = similarities.MatrixSimilarity.load(lsiindex)
Example #27
def load_corpus_and_dict(corpus_path, id2word_path):
    print("[BLOCK] Loading  corpus and dictionary files from %s and %s" %
          (data_path, id2word_path))
    sys.stdout.flush()
    dictionary = Dictionary.load_from_text(id2word_path)

    print("[BLOCK] Loading corpus iterator")
    sys.stdout.flush()
    #mm = gensim.corpora.MmCorpus(corpus_path)
    corpus = MmCorpus(
        bz2.BZ2File(corpus_path)
    )  # use this if you compressed the TFIDF output (recommended)

    return corpus, dictionary
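Note: a hedged companion sketch of the compression step the comment above recommends; the file names are assumptions, and the plain .mm file is assumed to have been written by MmCorpus.serialize beforehand.

import bz2
import shutil

with open('wiki_en_tfidf.mm', 'rb') as src, bz2.open('wiki_en_tfidf.mm.bz2', 'wb') as dst:
    shutil.copyfileobj(src, dst)   # bzip2-compress the serialized tf-idf corpus
# The compressed file can then be passed to load_corpus_and_dict() above.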
Example #28
    def get_dictionary(self):
        tmp_fname = self.path + self.model_type + "_dictionary"

        if os.path.exists(tmp_fname):
            return Dictionary.load_from_text(tmp_fname)

        else:
            print("Creating dictionary.")
            docs_by_id = read_ap.get_processed_docs()
            docs = [doc for doc_id, doc in docs_by_id.items()]
            dictionary = Dictionary(docs)
            dictionary.filter_extremes(no_below=20, no_above=0.5)
            dictionary.save_as_text(tmp_fname)
            return dictionary
Example #29
def main():
    global dictionary
    try:
        dictionary = Dictionary.load_from_text(
            "persist/reuters_dictionary.txt")
        #dictionary = Dictionary.load_from_text("persist/wiki_stem-False_keep-100000_nobelow-20_noabove-0.1_wordids.txt.bz2")

    except:
        dictionary = Dictionary(ReutersCorpus())
        dictionary.filter_extremes()
        dictionary.save_as_text("persist/reuters_dictionary.txt")

    models = train_models()

    if settings["models"]["bow"]:
        bowmodel = BOWmodel()
        bowmodel.__out_size = len(dictionary)
        models["bow"] = bowmodel

    if settings["models"]["noise"]:
        noisemodel = NoiseModel(1000)
        noisemodel.__out_size = 1000
        models["noise"] = noisemodel

    num_train_samples = 21578 - settings["held_out_docs"]
    test_samples = []

    class generate_train_samples(object):
        first_iteration = True

        def __iter__(self):
            count = 0
            for document in stream_reuters_documents():
                sample = document["content"], "acq" in document[
                    "topics"]  # todo: maybe try "usa" or "earn"
                if count > num_train_samples:
                    if self.first_iteration:
                        test_samples.append(sample)
                else:
                    yield sample
                count += 1
            self.first_iteration = False

    classifiers = train_classifiers(models, generate_train_samples())

    classifications = run_evaluation(classifiers, models, test_samples)
    #output_results(classifications)

    return classifications
Example #30
    def __init__(self, model_prefix='wiki_en'):
        logger = logging.getLogger("LDA")
        self.model_prefix = model_prefix
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        self.fname = 'lda_model.p'

        logger.info("LDA: Loading word dictionary...")
        self.dict = Dictionary.load_from_text(model_prefix + '_wordids.txt')

        logger.info("LDA: Loading pretrained model...")
        self.model = pickle.load(open(self.fname, 'rb'))

        logger.info("LDA: Finished loading model files.")
Example #31
	def __init__(self, model_prefix = 'wiki_en'):
		logger = logging.getLogger("LDA")
		self.model_prefix = model_prefix
		if self.model_prefix is None:
			raise ValueError("model_prefix must be specified")

		self.fname = 'lda_model.p'

		logger.info("LDA: Loading word dictionary...")
		self.dict = Dictionary.load_from_text(model_prefix + '_wordids.txt')

		logger.info("LDA: Loading pretrained model...")
		self.model = pickle.load(open(self.fname, 'rb'))

		logger.info("LDA: Finished loading model files.")
Example #32
def main():
    global dictionary
    try:
        dictionary = Dictionary.load_from_text("persist/reuters_dictionary.txt")
        #dictionary = Dictionary.load_from_text("persist/wiki_stem-False_keep-100000_nobelow-20_noabove-0.1_wordids.txt.bz2")

    except:
        dictionary = Dictionary(ReutersCorpus())
        dictionary.filter_extremes()
        dictionary.save_as_text("persist/reuters_dictionary.txt")

    models = train_models()

    if settings["models"]["bow"]:
        bowmodel = BOWmodel()
        bowmodel.__out_size = len(dictionary)
        models["bow"] = bowmodel

    if settings["models"]["noise"]:
        noisemodel = NoiseModel(1000)
        noisemodel.__out_size = 1000
        models["noise"] = noisemodel

    num_train_samples = 21578 - settings["held_out_docs"]
    test_samples = []


    class generate_train_samples(object):
        first_iteration = True

        def __iter__(self):
            count = 0
            for document in stream_reuters_documents():
                sample = document["content"], "acq" in document["topics"]  # todo: maybe try "usa" or "earn"
                if count > num_train_samples:
                    if self.first_iteration:
                        test_samples.append(sample)
                else:
                    yield sample
                count += 1
            self.first_iteration = False

    classifiers = train_classifiers(models, generate_train_samples())

    classifications = run_evaluation(classifiers, models, test_samples)
    #output_results(classifications)

    return classifications
Example #33
    def test_loadFromText_legacy(self):
        """
        `Dictionary` can be loaded from textfile in legacy format.
        Legacy format does not have num_docs on the first line.
        """
        tmpf = get_tmpfile('load_dict_test_legacy.txt')
        no_num_docs_serialization = to_utf8("1\tprvé\t1\n2\tslovo\t2\n")
        with open(tmpf, "wb") as file:
            file.write(no_num_docs_serialization)

        d = Dictionary.load_from_text(tmpf)
        self.assertEqual(d.token2id[u"prvé"], 1)
        self.assertEqual(d.token2id[u"slovo"], 2)
        self.assertEqual(d.dfs[1], 1)
        self.assertEqual(d.dfs[2], 2)
        self.assertEqual(d.num_docs, 0)
Example #34
    def test_loadFromText_legacy(self):
        """
        `Dictionary` can be loaded from textfile in legacy format.
        Legacy format does not have num_docs on the first line.
        """
        tmpf = get_tmpfile('load_dict_test_legacy.txt')
        no_num_docs_serialization = to_utf8("1\tprvé\t1\n2\tslovo\t2\n")
        with open(tmpf, "wb") as file:
            file.write(no_num_docs_serialization)

        d = Dictionary.load_from_text(tmpf)
        self.assertEqual(d.token2id[u"prvé"], 1)
        self.assertEqual(d.token2id[u"slovo"], 2)
        self.assertEqual(d.dfs[1], 1)
        self.assertEqual(d.dfs[2], 2)
        self.assertEqual(d.num_docs, 0)
Example #35
def crawl_new_article(request):
    news_list = get_current_news_article()

    # decide whether the fetched news articles should be recommended
    download_blob('word/all_id2word.txt', '/tmp/all_id2word.txt')
    dct = Dictionary.load_from_text("/tmp/all_id2word.txt")

    download_blob('model_2.pickle', '/tmp/model_2.pickle')
    with open('/tmp/model_2.pickle', mode='rb') as f:
        classifier = pickle.load(f)

    bow_docs = make_bow(dct)
    result = predict(news_list, dct, classifier, bow_docs)

    upsert_new_articles(result)

    return {"status": "ok"}
Example #36
 def loadDictionary(self, type='offline'):
     '''
     Load the dictionary from cache; if it does not exist yet, build it and save it.
     '''
     startTime = datetime.now()
     filePath = self.cachePath + '%s_dictionary_%s.txt' % (self.name, type)
     if os.path.isfile(filePath):
         dictionary = Dictionary.load_from_text(filePath)
     else:
         if type == 'offline':
             docList = self.getDocList('train')
             dictionary = makeDictionary(docList)
         elif type == 'all':
             docList = []
             if os.path.isfile(self.cachePath +
                               '%s_dictionary_online.txt' % self.name):
                 logging.warning('dictionary continue')
                 docList.extend(self.getDocList('testA'))
                 docList.extend(self.getDocList('testB'))
                 dictionary = makeDictionary(
                     docList,
                     dictFile=self.cachePath +
                     '%s_dictionary_online.txt' % self.name,
                     add=True)
             else:
                 for dfName in self.dfFile.keys():
                     docList.extend(self.getDocList(dfName))
                 dictionary = makeDictionary(docList)
         elif type == 'online' and os.path.isfile(
                 self.cachePath + '%s_dictionary_offline.txt' % self.name):
             docList = self.getDocList('valid')
             dictionary = makeDictionary(
                 docList,
                 dictFile=self.cachePath +
                 '%s_dictionary_offline.txt' % self.name,
                 add=True)
         else:
             docList = self.getDocList('train')
             docList.extend(self.getDocList('valid'))
             dictionary = makeDictionary(docList)
         dictionary.save_as_text(filePath)
         logging.warning('make dictionary time: %s' %
                         (datetime.now() - startTime))
     self.dictionary[type] = dictionary
     return dictionary
Example #37
def get_dictionary_corpus(data,
                          save_path_dict='extracted_data/lda_dictionary',
                          save_path_bcorp='extracted_data/lda_bow_corpus'):
    if isfile(save_path_dict):
        dictionary = Dictionary.load_from_text(save_path_dict)
        corpus = gensim.corpora.MmCorpus(save_path_bcorp)
    else:
        dictionary = gensim.corpora.Dictionary(data)
        dictionary.filter_extremes(no_above=0.5, keep_n=100000)
        corpus = [dictionary.doc2bow(doc) for doc in data]
        dictionary.save_as_text(save_path_dict)
        gensim.corpora.MmCorpus.serialize(save_path_bcorp, corpus)

    #    bow_doc_2 = bow_corpus[2]
    #    for i in range(len(bow_doc_2)):
    #        print("Word {} (\"{}\") appears {} time.".format(bow_doc_2[i][0], dictionary[bow_doc_2[i][0]], bow_doc_2[i][1]))

    return corpus, dictionary
Example #38
def documentFrequencies():
    dictionary = Dictionary.load_from_text(
        'C:/Users/Admin/Anaconda2/envs/py27/corpora/wiki2017_wordids.txt.bz2')
    print(max(dictionary.token2id.values()))
    #... get the id corresponding to token "hello"
    tokenid = (dictionary.token2id["hello"])
    print(tokenid)
    #... get the document frequencies in the full corpus for which "hello" appeared
    print(dictionary.dfs[dictionary.token2id["hello"]])
    #... compute the total number of features in this corpus
    print(len(dictionary))

    #... CONSTRUCT THE Document Frequency OUTPUT FILE
    dforig = dictionary.dfs
    dfdict = {}
    for key, val in dforig.items():
        dfdict[str(dictionary[key])] = val

    fieldnames = ["term", "df"]
    with open("document_frequencies.tsv", "w+", encoding="utf-8") as handle:
        writer = csv.writer(handle, delimiter="\t")
        #writer.writerows(dfdict)
        for key, val in dfdict.items():
            writer.writerow([key, val])
        handle.close()

    #... load in the bag-of-words matrix market file for comparison
    mm_name = "C:/Users/Admin/Anaconda2/envs/py27/corpora/wiki2017_bow.mm"
    wikimodel = MmCorpus(mm_name)
    #... the matrix market should have the same number of features as len(dictionary)
    print(wikimodel)

    #... checked and verified that "hello" appears in the same number of documents as computed earlier
    if False:
        counter = 0
        featcount = 0
        for doc in wikimodel:
            res = [x for x in doc if x[0] == tokenid]
            if len(res) > 0:
                #print (counter, ":",res)
                featcount += 1
                #break
        print(featcount)
Example #39
def get_lda_topics(transcript_utterances, trained_lda_model_filepath,
                   trained_lda_wordids_filepath):
    '''
    Parameters
    transcript_utterances: list of lists of strings (words); each row is a plaintext utterance in the transcript.
    trained_lda_model_filepath: string, path to the trained LDA model ('/p/spoclab/models/LDA/lda_model_wiki').
    trained_lda_wordids_filepath: string, path to the word IDs of the trained LDA model ('/p/spoclab/models/LDA/lda_wordids.txt.bz2').

    Returns:
    topic_probabilities: list of floats, probability of each of the k topics.
    kurtosis: float, kurtosis of all topic probabilities.
    skewness: float, skewness of all topic probabilities.
    entropy: float, entropy of all topic probabilities.
    '''

    # Get files
    trained_lda_model = return_file(trained_lda_model_filepath)
    trained_lda_wordids = return_file(trained_lda_wordids_filepath)

    # Load LDA model
    lda_model = ldamodel.LdaModel.load(trained_lda_model)

    # Load wordids as a dictionary
    id2word = Dictionary.load_from_text(trained_lda_wordids)

    # Convert transcript of tokens into a BoW document
    document_bow = []
    for transcript_utterance in transcript_utterances:
        document_bow += id2word.doc2bow(transcript_utterance)

    # Get document topics
    doc_topics = lda_model.get_document_topics(document_bow,
                                               minimum_probability=0)
    topic_probabilities = [doc_topic[1] for doc_topic in doc_topics]

    skewness = stats.skew(topic_probabilities)
    kurtosis = stats.kurtosis(topic_probabilities)

    # Entropy: SUM(-plog2p)
    entropy = np.sum([-(p * np.log2(p)) for p in topic_probabilities])

    return topic_probabilities, kurtosis, skewness, entropy
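Note: a hedged call sketch for the function above; the transcript is made up, and the model paths are the ones mentioned in the docstring and assumed to exist locally.

transcript = [["the", "cat", "sat", "on", "the", "mat"],
              ["it", "was", "a", "sunny", "day"]]
probs, kurt, skew, ent = get_lda_topics(
    transcript,
    '/p/spoclab/models/LDA/lda_model_wiki',
    '/p/spoclab/models/LDA/lda_wordids.txt.bz2')
print(len(probs), kurt, skew, ent)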
Example #40
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute(
        'SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20,
                                    no_above=0.5,
                                    keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example #41
def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # load back the id->word mapping directly from file
    fin = path.join(datadir, "reuters21578.dict.txt")
    vocabulary = Dictionary.load_from_text(fin)

    # load the corpus
    fin = path.join(datadir, "reuters21578.mm")
    mm = MmCorpus(fin)

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=vocabulary, normalize=True)

    # save the TfidfModel instance to file
    fout = path.join(datadir, "reuters21578.tfidf.model")
    tfidf.save(fout)

    # save TF-IDF vectors in matrix market format
    fout = path.join(datadir, "reuters21578.tfidf.mm")
    MmCorpus.serialize(fout, tfidf[mm], progress_cnt=10000)
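Note: a hedged follow-on sketch (paths reused from the example above; the sample sentence is arbitrary) showing how the saved model transforms a new bag-of-words document.

import os
from os import path
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

datadir = path.abspath(path.join(os.getcwd(), "data"))
vocabulary = Dictionary.load_from_text(path.join(datadir, "reuters21578.dict.txt"))
tfidf = TfidfModel.load(path.join(datadir, "reuters21578.tfidf.model"))
bow = vocabulary.doc2bow("oil prices rose sharply".split())
print(tfidf[bow])   # list of (term_id, tf-idf weight) pairs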
Example #42
def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # load the LDA model
    fin = path.join(datadir, "reuters21578.lda.model.bz2")
    lda = LdaModel.load(fin)

    # load the corpus
    fin = path.join(datadir, "reuters21578.mm.bz2")
    mm = MmCorpus(fin)


    # load the vocabulary
    fin = path.join(datadir, "reuters21578.dict.txt")
    vocabulary = Dictionary.load_from_text(fin)

    data = pyLDAvis.gensim.prepare(lda, mm, vocabulary)
    pyLDAvis.show(data,
                  ip=socket.gethostname().lower(),
                  local=True,
                  open_browser=True,
                  http_server=None)
Example #43
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example #44
    def __init__(self, model_prefix = None, num_best = None):
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        self.similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')

        #logger.info("ESA: Preloading reverse indexes...")
        #self.similarity_index.preload_reverse_index()

        logger.info("ESA: Finished loading model files.")
Example #45
def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating speech serialized corpus')
    # Create the speech corpus, it is inside the rawfile as a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)
    # We also need to make sure that the article ids are saved in the correct
    # format so that the gensimple engine can understand it, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    counter = 0
    for key, value in speech_dict.items():
        texts.append([token for token in value['text']])
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
        counter += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')
    # # Now run LSI on TDIDF
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #46
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        mywiki = myWikiCorpus(inp, lemmatize=lemmatize)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
        MmCorpus.serialize(outp + '_bowm.mm', mywiki, progress_cnt=10000) # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example #47
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)

    # check and process input arguments
    if len(sys.argv) < 3:
        print(inspect.cleandoc(__doc__) % locals())
        sys.exit(1)
    input_file, output_prefix = sys.argv[1:3]

    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("Loading word dictionary...")
    dictionary = Dictionary.load_from_text(output_prefix + '_wordids.txt.bz2')
    logger.debug(dictionary)

    logger.info("Loading document name map...")
    article_dict = utils.unpickle(output_prefix + '_bow.mm.metadata.cpickle')

    logger.info("Loading tf-idf model...")
    tfidf = TfidfModel.load(output_prefix + '.tfidf_model')

    logger.info("Loading similarity index...")
    similarity_index = Similarity.load(output_prefix + '_similarity.index', mmap='r')
    similarity_index.use_reverse_index = True
    similarity_index.preload_reverse_index()

    logger.info("Finished loading model files.")
Example #48
import os
import matplotlib.pyplot as plt
import multiprocessing as mp
import numpy as np
import scipy.stats as stats
import time

from gensim import matutils
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaMulticore
from os import path

datadir = path.abspath(path.join(os.getcwd(), "data"))

fin = path.join(datadir, "reuters21578.dict.txt")
vocabulary = Dictionary.load_from_text(fin)

fin = path.join(datadir, "reuters21578.mm.bz2")
mm = MmCorpus(fin)

def sym_kl(p, q):
    return np.sum([stats.entropy(p, q), stats.entropy(q, p)])
    
def arun(corpus, dictionary, min_topics=10, max_topics=100, step=10):
    l = np.array([sum(cnt for _, cnt in doc) for doc in corpus])
    
    kl = []
    for n in range(min_topics, max_topics+step, step):
        print("starting multicore LDA for num_topics={}".format(n))
        st = time.clock()
        lda = LdaMulticore(corpus=corpus,
Example #49
def build_model(dictionary_path, mm_corpus_path):
    dictionary = Dictionary.load_from_text(dictionary_path)
    # Use the tf-idf corpus here, not the original one.
    mm = MmCorpus(mm_corpus_path)
    lsi = lsimodel.LsiModel(corpus=mm, id2word=dictionary, num_topics=400)
    lsi.save('/home/andre/Develop/corpora/lsamodel_lsi.model')
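Note: a hedged follow-up sketch (the path is reused from build_model above and assumed to exist) for loading the saved LSI model and inspecting its top topics.

from gensim.models import LsiModel

lsi = LsiModel.load('/home/andre/Develop/corpora/lsamodel_lsi.model')
for topic_id, topic in lsi.print_topics(5):
    print(topic_id, topic)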
Example #50
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)

    # check and process input arguments
    if len(sys.argv) < 2:
        print(inspect.cleandoc(__doc__) % locals())
        sys.exit(1)
    model_prefix = sys.argv[1]

    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("Loading word dictionary...")
    dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')
    logger.debug(dictionary)

    logger.info("Loading document name map...")
    article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

    logger.info("Loading tf-idf model...")
    tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

    logger.info("Loading similarity index...")
    similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')
    similarity_index.use_reverse_index = True

    logger.info("Finished loading model files.")

    mismatches = 0
Example #51
    else:  ## not online
        # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        wiki = WikiCorpus(
            args.input, lemmatize=args.lemmatize, 
            max_articles=args.max_articles,
            expect_streamitems=args.expect_streamitems,                          
            file_name_pattern=args.file_name_pattern,
        ) 
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(args.output + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
        wiki.dictionary.save_as_text(args.output + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(args.output + '_wordids.txt.bz2')
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(args.output + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(args.output + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %r" % args.__dict__)
Example #52
def apply_tfidf(dictionary_path, mm_corpus_path):
    dictionary = Dictionary.load_from_text(dictionary_path)
    mm = MmCorpus(mm_corpus_path)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    MmCorpus.serialize('/home/andre/Develop/corpora/lsamodel_tfidf.mm',
                       tfidf[mm], progress_cnt=10000)
Example #53
def build_corpus(dictionary_path):
    dictionary = Dictionary.load_from_text(dictionary_path)
    corpus = CorpusIterator(dir_list=dir_list, bow=True, dictionary=dictionary)
    MmCorpus.serialize(
        '/home/andre/Develop/corpora/lsamodel_bow.mm',
        corpus, progress_cnt=10000)
Example #54
 def load_dict():
     return Dictionary.load_from_text('./dict.txt')
Example #55
 def __init__(self, lda_file, dic_file):
     self.lda_model = LdaModel.load(lda_file)
     self.dictionary = Dictionary.load_from_text(dic_file)
Example #56
        with codecs.open(self.path_, 'r', 'utf-8') as in_f:
            for line in in_f:
                doc = [word for word in line.strip().split()
                       if len(word) > 0 and word in tokens]
                doc = vocab.doc2bow(doc)
                if len(doc) > 0:
                    yield doc


if __name__ == "__main__":
    logging.basicConfig(
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        level=logging.INFO
    )
    vocab = Dictionary.load_from_text('./vocab.txt')
    corpus = UnlabeledCorpus('./rumor_train.csv', vocab)
    valid_corpus = UnlabeledCorpus('./rumor_valid.csv', vocab)
    valid_sentences = [doc for doc in valid_corpus][5000:]

    # varing number of topics
    # result = {}
    # for num_topics in [2, 4, 8, 16, 32, 64]:
    #     best_value = -100
    #     for i in range(5):
    #         model = LdaModel(corpus=corpus, id2word=vocab, num_topics=num_topics)
    #         likelihood = model.log_perplexity(valid_sentences)
    #         best_value = max(best_value, likelihood)
    #     result[num_topics]= best_value
    #
    # for num_topics, likelihood in result.iteritems():
Example #57
from utils import generate_timestamp

logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO
)
timestamp = generate_timestamp()

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dictionary", help="path to wiki_en_wordids.txt")
parser.add_argument("-c", "--corpus", help="path to wiki_en_tfidf.mm")
parser.add_argument("-m", "--model", help="path to model output")
args = parser.parse_args()

# load id->word mapping (the dictionary)
id2word = Dictionary.load_from_text(bz2.BZ2File(args.dictionary))

# load corpus iterator
mm = MmCorpus(args.corpus)

print(mm)
# MmCorpus(3933461 documents, 100000 features, 612118814 non-zero entries)

# extract num_topics LSI topics; use the default one-pass algorithm
num_topics = 400
model = LsiModel(corpus=mm, id2word=id2word, num_topics=num_topics)

# print the most contributing words (both positively and negatively) for each of the first ten topics
model.print_topics(10)

model.save("%s/%s.model" % (args.model, timestamp))
Example #58
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2', use_bzip2=True)
        wiki.save(outp + '_corpus.pkl.bz2', use_bzip2=True)
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2', use_bzip2=True)
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2', use_bzip2=True)
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example #59
 def __init__(self):
     self.model = LdaModel.load(settings.lda_model_name)
     self.dictionary = Dictionary.load_from_text(settings.wordids_txt)
Example #60
    print(globals()['__doc__'] % locals())
    sys.exit(1)
inp, outp = sys.argv[1:3]
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE

# build dictionary. only keep the most frequent words (out of total ~8.2m
# unique tokens) takes about 9h on a macbook pro, for 3.5m articles (june 2011)
wiki = WikiCorpus(inp, keep_words=keep_words)
# save dictionary and bag-of-words (term-document frequency matrix)
# another ~9h
wiki.dictionary.save_as_text(outp + '_wordids.txt')
MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
del wiki

# initialize corpus reader and word->id mapping
id2token = Dictionary.load_from_text(outp + '_wordids.txt')
mm = MmCorpus(outp + '_bow.mm')

# build tfidf,
# ~30min
tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

# save tfidf vectors in matrix market format
# ~2h; result file is 15GB! bzip2'ed down to 4.5GB
MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

logger.info("finished running %s" % program)