Example #1
def get_corpus():
    text_path = datapath('ldavowpalwabbit.txt')
    dict_path = datapath('ldavowpalwabbit.dict.txt')
    dictionary = Dictionary.load_from_text(dict_path)
    with open(text_path) as fhandle:
        corpus = [dictionary.doc2bow(line.strip().split()) for line in fhandle]
    return corpus, dictionary
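Note: as a hedged follow-on sketch (not part of the original example), the corpus and dictionary returned above can be fed straight into gensim's LdaModel; the topic count is an arbitrary assumption.

from gensim.models import LdaModel

def train_lda_on_test_corpus(num_topics=10):
    # Reuses get_corpus() from the example above.
    corpus, dictionary = get_corpus()
    # Train a small LDA model on the bag-of-words corpus.
    return LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)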
Example #2
 def __init__(self, config, trained_model_path, id2word_path):
     self.model_path = trained_model_path
     self.id2word_path = id2word_path
     self.model = LdaModel.load(self.model_path)
     self.id2word = Dictionary.load_from_text(self.id2word_path)
     self.num_topics = config.num_topics
     assert self.model.num_topics == self.num_topics
Example #3
def apply_tfidf(dictionary_path, mm_corpus_path):
    dictionary = Dictionary.load_from_text(dictionary_path)
    mm = MmCorpus(mm_corpus_path)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    MmCorpus.serialize('/home/andre/Develop/corpora/lsamodel_tfidf.mm',
                       tfidf[mm],
                       progress_cnt=10000)
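Note: a hedged follow-up sketch (the path is reused from the example above and assumed to exist) showing how the serialized tf-idf corpus can be streamed back with MmCorpus.

from gensim.corpora import MmCorpus

tfidf_corpus = MmCorpus('/home/andre/Develop/corpora/lsamodel_tfidf.mm')
for doc in tfidf_corpus:
    # each doc is a list of (term_id, tf-idf weight) tuples
    print(doc[:5])
    break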
Example #4
    def __init__(self, model_prefix=None, num_best=None):
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix +
                                                    '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix +
                                           '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        self.similarity_index = Similarity.load(model_prefix +
                                                '_similarity.index',
                                                mmap='r')

        #logger.info("ESA: Preloading reverse indexes...")
        #self.similarity_index.preload_reverse_index()

        logger.info("ESA: Finished loading model files.")
Example #5
def get_wiki_dictionary():
    '''
    Return a dict mapping each token to its smoothed document frequency ratio
    (document frequency / number of documents) in the Wikipedia corpus.
    '''

    # Unpack Wiki dump
    # wiki = WikiCorpus('wikicorpus/enwiki-20201120-pages-articles-multistream1.xml-p1p41242.bz2', lemmatize=False)
    # MmCorpus.serialize("wikicorpus/wiki-corpus.mm", wiki)

    # create documents to save wiki articles
    # documents = list()
    # for i, text in enumerate(wiki.get_texts()):
    #     documents.append(text)

    # Dictionary of document frequencies
    dct = Dictionary.load_from_text("wikicorpus/wiki_dictionary")

    # the wiki corpus contains 21126 documents
    wiki_document_size = 21126

    # return dictionary
    df_dictionary = dict()

    # for each word, p(word) = document frequency / N, where N is the number of documents in the corpus
    id2token = {v: k for k, v in dct.token2id.items()}
    for token_id, document_frequency in dct.dfs.items():
        # add-one (Laplace) smoothing to handle tokens with zero occurrences in the wiki corpus
        df_dictionary[id2token[token_id]] = (document_frequency +
                                             1) / wiki_document_size

    return df_dictionary
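Note: a hedged usage sketch (the helper name and query token are hypothetical, and the wiki dictionary file from the example is assumed to be available) showing how the returned frequencies give a per-token probability, with the same add-one fallback for tokens missing from the wiki dictionary.

def token_probability(token, df_dictionary, wiki_document_size=21126):
    # Tokens unseen in the wiki dictionary fall back to the smoothed minimum.
    return df_dictionary.get(token, 1 / wiki_document_size)

wiki_df = get_wiki_dictionary()
print(token_probability("computer", wiki_df))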
Example #6
 def __init__(self,
              MODEL_PATH,
              DICT_LOCATION=DEFAULT_DICTIONARY_FILE_LOCATION):
     self.__model = LdaMulticore.load(MODEL_PATH)
     self.__id2word_dictionary = Dictionary.load_from_text(DICT_LOCATION)
     print(self.__model)
     print(self.__id2word_dictionary)
Example #7
def get_corpus():
    text_path = datapath('ldavowpalwabbit.txt')
    dict_path = datapath('ldavowpalwabbit.dict.txt')
    dictionary = Dictionary.load_from_text(dict_path)
    with open(text_path) as fhandle:
        corpus = [dictionary.doc2bow(line.strip().split()) for line in fhandle]
    return corpus, dictionary
Example #8
def main(argv=None):
    if argv is None:
        argv = sys.argv

    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000, )
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')
    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #9
    def __init__(self, dict_path, model_path):
        """Load an LSA space from a file.

        :dict_path: path to the dictionary file.
        :model_path: path to the model file.
        """
        self._dictionary = Dictionary.load_from_text(dict_path)
        self._lsi_model = LsiModel.load(model_path)
Example #10
    def __init__(self, dict_path, model_path):
        """Load an LSA space from a file.

        :dict_path: path to the dictionary file.
        :model_path: path to the model file.
        """
        self._dictionary = Dictionary.load_from_text(dict_path)
        self._lsi_model = LsiModel.load(model_path)
Example #11
def get_corpus(data, save_path_dict='extracted_data/lda_dictionary'):
    if isfile(save_path_dict):
        dictionary = Dictionary.load_from_text(save_path_dict)
        corpus = [dictionary.doc2bow(doc) for doc in data]
        return corpus
    else:
        print("Didn't find a dictionary.")
        import sys
        sys.exit(1)
Example #12
    def __init__(self, dict_file=None, model_file=None):
        if dict_file:
            self.dictionary = Dictionary.load_from_text(dict_file)
        else:
            self.dictionary = Dictionary()

        if model_file:
            self.model = joblib.load(model_file)
        else:
            self.model = None
Example #13
def main():
    global args
    taskname = args.taskname
    no_below = args.no_below
    no_above = args.no_above
    n_topic = args.n_topic
    n_cpu = cpu_count() - 2 if cpu_count() > 2 else 2
    use_tfidf = args.use_tfidf
    dist = args.dist
    model_path = args.model_path
    model_name = args.model_name
    save_dir = args.save_dir
    test_path = args.test_path

    device = torch.device('cuda')

    cwd = os.getcwd()
    tmpDir = os.path.join(cwd, 'data', taskname)
    if os.path.exists(os.path.join(tmpDir, 'corpus.mm')):
        dictionary = Dictionary.load_from_text(os.path.join(
            tmpDir, 'dict.txt'))
    else:
        raise Exception("Build corpus first")

    testSet = TestData(dictionary=dictionary,
                       txtPath=test_path,
                       no_below=no_below,
                       no_above=no_above,
                       use_tfidf=use_tfidf)
    voc_size = testSet.vocabsize

    Model = globals()[model_name]
    model = Model(bow_dim=voc_size,
                  n_topic=n_topic,
                  device=device,
                  dist=dist,
                  taskname=taskname)
    model.load_model(model_path)

    topics = model.show_topic_words(dictionary=dictionary)
    for i in range(len(topics)):
        print(i, str(topics[i]))

    infer_topics = []
    for doc in tqdm(testSet):
        if doc is None:
            infer_topics.append(None)
        else:
            infer_topics.append(
                int(
                    np.argmax(
                        model.inference(doc_tokenized=doc,
                                        dictionary=dictionary))))
    with open(save_dir + "/inference_result.txt", "w") as f:
        json.dump(infer_topics, f)
Example #14
File: Linker.py Project: PPPI/a-m
 def __load_from_disk(self, path):
     """
     Function that is used internally to load and set-up the class state
     :param path: Location from where the class internal state should be loaded
     :return: None, side-effect on the class on which this is called
     """
     # Read config,
     with open(os.path.join(path, 'config.json')) as f:
         params = jsonpickle.decode(f.read())
     self.net_size_in_days = params['net_size_in_days']
     self.min_tok_len = params['min_tok_len']
     self.undersample_multiplicity = params['undersample_multiplicity']
     self.prediction_threshold = params['prediction_threshold']
     self.use_sim_cs = params['use_sim_cs']
     self.use_sim_j = params['use_sim_j']
     self.use_sim_d = params['use_sim_d']
     self.use_social = params['use_social']
     self.use_temporal = params['use_temporal']
     self.use_file = params['use_file']
     self.use_pr_only = params['use_pr_only']
     self.use_issue_only = params['use_issue_only']
     self.predictions_between_updates = params[
         'predictions_between_updates']
     name = params['name']
     try:
         with open(os.path.join(path, name, 'repository_data.json')) as f:
             self.repository_obj = jsonpickle.decode(f.read())
         with open(os.path.join(path, name, 'truth_data.json')) as f:
             self.truth = jsonpickle.decode(f.read())
     except FileNotFoundError:
         pass
     try:
         with open(os.path.join(path, name, 'fingerprint_data.json')) as f:
             self.fingerprint = jsonpickle.decode(f.read())
     except FileNotFoundError:
         pass
     try:
         self.dictionary = Dictionary.load_from_text(
             os.path.join(path, 'tfidf', 'term2id.txt'))
         self.model = TfidfModel.load(
             os.path.join(path, 'tfidf', 'model.tfidf'))
         with open(os.path.join(path, name, 'stopwords_data.json')) as f:
             self.stopwords = jsonpickle.decode(f.read())
     except FileNotFoundError:
         pass
     try:
         self.clf = pickle.load(
             open(os.path.join(path, 'clf_model', 'model.p'), 'rb'))
     except FileNotFoundError:
         pass
     try:
         self.feature_generator = pickle.load(
             open(os.path.join(path, 'feature_generator', 'gen.p'), 'rb'))
     except FileNotFoundError:
         pass
Example #15
def makeDictionary(docList, dictFile="", add=False):
    '''
    Build a dictionary from docList; if dictFile exists and add is True,
    load that dictionary first and extend it with docList.
    '''
    if os.path.isfile(dictFile) and add:
        dictionary = Dictionary.load_from_text(dictFile)
        dictionary.add_documents(docList)
    else:
        dictionary = Dictionary(docList)
    # dictionary.save_as_text(dictFile)
    return dictionary
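Note: a hedged usage sketch of makeDictionary() above; the document lists and cache file name are made up for illustration.

docs = [["hello", "world"], ["hello", "gensim"]]
dictionary = makeDictionary(docs)                 # build a fresh dictionary
dictionary.save_as_text("cache_dictionary.txt")   # persist it as text
more_docs = [["another", "document"]]
dictionary = makeDictionary(more_docs,
                            dictFile="cache_dictionary.txt",
                            add=True)             # extend the saved dictionary
print(dictionary.token2id)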
Example #16
    def test_saveAsText_and_loadFromText(self):
        """ `Dictionary` can be saved as textfile and loaded again from textfile. """
        tmpf = get_tmpfile('dict_test.txt')
        d = Dictionary(self.texts)
        d.save_as_text(tmpf)
        # does the file exist?
        self.assertTrue(os.path.exists(tmpf))

        d_loaded = Dictionary.load_from_text(get_tmpfile('dict_test.txt'))
        self.assertNotEqual(d_loaded, None)
        self.assertEqual(d_loaded.token2id, d.token2id)
Example #17
    def test_saveAsText_and_loadFromText(self):
        """`Dictionary` can be saved as textfile and loaded again from textfile. """
        tmpf = get_tmpfile('dict_test.txt')
        for sort_by_word in [True, False]:
            d = Dictionary(self.texts)
            d.save_as_text(tmpf, sort_by_word=sort_by_word)
            self.assertTrue(os.path.exists(tmpf))

            d_loaded = Dictionary.load_from_text(tmpf)
            self.assertNotEqual(d_loaded, None)
            self.assertEqual(d_loaded.token2id, d.token2id)
Example #18
    def test_saveAsText_and_loadFromText(self):
        """ `Dictionary` can be saved as textfile and loaded again from textfile. """
        tmpf = get_tmpfile('dict_test.txt')
        d = Dictionary(self.texts)
        d.save_as_text(tmpf)
        # does the file exist?
        self.assertTrue(os.path.exists(tmpf))

        d_loaded = Dictionary.load_from_text(get_tmpfile('dict_test.txt'))
        self.assertNotEqual(d_loaded, None)
        self.assertEqual(d_loaded.token2id, d.token2id)
Example #19
    def test_saveAsText_and_loadFromText(self):
        """`Dictionary` can be saved as textfile and loaded again from textfile. """
        tmpf = get_tmpfile('dict_test.txt')
        for sort_by_word in [True, False]:
            d = Dictionary(self.texts)
            d.save_as_text(tmpf, sort_by_word=sort_by_word)
            self.assertTrue(os.path.exists(tmpf))

            d_loaded = Dictionary.load_from_text(tmpf)
            self.assertNotEqual(d_loaded, None)
            self.assertEqual(d_loaded.token2id, d.token2id)
Example #20
 def get_dict(self,path):
     path = path + '.dict'   
     if not os.path.exists(path):
         self.texts = self.get_texts()
         dct = Dictionary(self.texts)
         dct.save_as_text(path)
     else:
         dct = Dictionary.load_from_text(path)
         for path in self.inputs:
             self.id_to_path.append(os.path.basename(path))
     return dct
Example #21
 def display_data(self):
     lda = LdaMulticore.load(self.lda_model_filepath)
     trigram_bow_corpus = MmCorpus(self.trigram_bow_filepath)
     trigram_dictionary = Dictionary.load_from_text(self.trigram_dictionary_filepath)
     LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                               trigram_dictionary)
     with open(self.LDAvis_data_filepath, 'w') as f:
         f.write(str(LDAvis_prepared))
         # json.dump(LDAvis_prepared.to_json(), f)
     pyLDAvis.display(LDAvis_prepared)
Example #22
    def test_loadFromText(self):
        """`Dictionary` can be loaded from textfile."""
        tmpf = get_tmpfile('load_dict_test.txt')
        num_docs_serialization = to_utf8("2\n1\tprvé\t1\n2\tslovo\t2\n")
        with open(tmpf, "wb") as file:
            file.write(num_docs_serialization)

        d = Dictionary.load_from_text(tmpf)
        self.assertEqual(d.token2id[u"prvé"], 1)
        self.assertEqual(d.token2id[u"slovo"], 2)
        self.assertEqual(d.dfs[1], 1)
        self.assertEqual(d.dfs[2], 2)
        self.assertEqual(d.num_docs, 2)
Example #23
    def test_loadFromText(self):
        """`Dictionary` can be loaded from textfile."""
        tmpf = get_tmpfile('load_dict_test.txt')
        num_docs_serialization = to_utf8("2\n1\tprvé\t1\n2\tslovo\t2\n")
        with open(tmpf, "wb") as file:
            file.write(num_docs_serialization)

        d = Dictionary.load_from_text(tmpf)
        self.assertEqual(d.token2id[u"prvé"], 1)
        self.assertEqual(d.token2id[u"slovo"], 2)
        self.assertEqual(d.dfs[1], 1)
        self.assertEqual(d.dfs[2], 2)
        self.assertEqual(d.num_docs, 2)
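Note: for reference, a hedged round-trip sketch of the text format that load_from_text parses in the tests above: an optional num_docs line, then tab-separated id, token, document-frequency rows.

from gensim.corpora import Dictionary
from gensim.test.utils import get_tmpfile

tmpf = get_tmpfile('format_demo.txt')
with open(tmpf, 'w', encoding='utf-8') as fh:
    fh.write("2\n1\tprvé\t1\n2\tslovo\t2\n")   # num_docs, then id<TAB>token<TAB>df
d = Dictionary.load_from_text(tmpf)
print(d.num_docs, d.token2id, d.dfs)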
Example #24
    def get_dictionary(self):
        tmp_fname = self.path + "lda.dictionary"

        if os.path.exists(tmp_fname):
            return Dictionary.load_from_text(tmp_fname)

        else:
            print("Creating dictionary.")
            docs_by_id = read_ap.get_processed_docs()
            docs = [doc for doc_id, doc in docs_by_id.items()]
            dictionary = Dictionary(docs)
            dictionary.save_as_text(tmp_fname)
            return dictionary
Example #25
 def load_corpus(self, corpus_name):
     ''' This is where we load the corpus files. This needs to be
     moved to a more general class initialization. (FIXME Freija)
     '''
     corpusfile = corpus_name + '.mm'
     corpusdict = corpus_name + '_wordids.txt'
     lsimodel = corpus_name + '.lsi_model'
     lsiindex = corpus_name + '-lsi.index'
     self.corpus_name = corpus_name
     self.corpus_mm = MmCorpus(corpusfile)
     self.corpus_dict = Dictionary.load_from_text(corpusdict)
     self.model = LsiModel.load(lsimodel)
     self.index = similarities.MatrixSimilarity.load(lsiindex)
Example #26
 def load_corpus(self, corpus_name):
     ''' This is where we load the corpus files. This needs to be
     moved to a more general class initialization. (FIXME Freija)
     '''
     corpusfile = corpus_name + '.mm'
     corpusdict = corpus_name + '_wordids.txt'
     lsimodel = corpus_name + '.lsi_model'
     lsiindex = corpus_name + '-lsi.index'
     self.corpus_name = corpus_name
     self.corpus_mm = MmCorpus(corpusfile)
     self.corpus_dict = Dictionary.load_from_text(corpusdict)
     self.model = LsiModel.load(lsimodel)
     self.index = similarities.MatrixSimilarity.load(lsiindex)
Example #27
def load_corpus_and_dict(corpus_path, id2word_path):
    print("[BLOCK] Loading  corpus and dictionary files from %s and %s" %
          (data_path, id2word_path))
    sys.stdout.flush()
    dictionary = Dictionary.load_from_text(id2word_path)

    print("[BLOCK] Loading corpus iterator")
    sys.stdout.flush()
    #mm = gensim.corpora.MmCorpus(corpus_path)
    corpus = MmCorpus(
        bz2.BZ2File(corpus_path)
    )  # use this if you compressed the TFIDF output (recommended)

    return corpus, dictionary
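Note: a hedged companion sketch of the compression step the comment above recommends; the file names are assumptions, and the plain .mm file is assumed to have been written by MmCorpus.serialize beforehand.

import bz2
import shutil

with open('wiki_en_tfidf.mm', 'rb') as src, bz2.open('wiki_en_tfidf.mm.bz2', 'wb') as dst:
    shutil.copyfileobj(src, dst)   # bzip2-compress the serialized tf-idf corpus
# The compressed file can then be passed to load_corpus_and_dict() above.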
Example #28
    def get_dictionary(self):
        tmp_fname = self.path + self.model_type + "_dictionary"

        if os.path.exists(tmp_fname):
            return Dictionary.load_from_text(tmp_fname)

        else:
            print("Creating dictionary.")
            docs_by_id = read_ap.get_processed_docs()
            docs = [doc for doc_id, doc in docs_by_id.items()]
            dictionary = Dictionary(docs)
            dictionary.filter_extremes(no_below=20, no_above=0.5)
            dictionary.save_as_text(tmp_fname)
            return dictionary
Example #29
def main():
    global dictionary
    try:
        dictionary = Dictionary.load_from_text(
            "persist/reuters_dictionary.txt")
        #dictionary = Dictionary.load_from_text("persist/wiki_stem-False_keep-100000_nobelow-20_noabove-0.1_wordids.txt.bz2")

    except:
        dictionary = Dictionary(ReutersCorpus())
        dictionary.filter_extremes()
        dictionary.save_as_text("persist/reuters_dictionary.txt")

    models = train_models()

    if settings["models"]["bow"]:
        bowmodel = BOWmodel()
        bowmodel.__out_size = len(dictionary)
        models["bow"] = bowmodel

    if settings["models"]["noise"]:
        noisemodel = NoiseModel(1000)
        noisemodel.__out_size = 1000
        models["noise"] = noisemodel

    num_train_samples = 21578 - settings["held_out_docs"]
    test_samples = []

    class generate_train_samples(object):
        first_iteration = True

        def __iter__(self):
            count = 0
            for document in stream_reuters_documents():
                sample = document["content"], "acq" in document[
                    "topics"]  # todo: maybe try "usa" or "earn"
                if count > num_train_samples:
                    if self.first_iteration:
                        test_samples.append(sample)
                else:
                    yield sample
                count += 1
            self.first_iteration = False

    classifiers = train_classifiers(models, generate_train_samples())

    classifications = run_evaluation(classifiers, models, test_samples)
    #output_results(classifications)

    return classifications
Example #30
    def __init__(self, model_prefix='wiki_en'):
        logger = logging.getLogger("LDA")
        self.model_prefix = model_prefix
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        self.fname = 'lda_model.p'

        logger.info("LDA: Loading word dictionary...")
        self.dict = Dictionary.load_from_text(model_prefix + '_wordids.txt')

        logger.info("LDA: Loading pretrained model...")
        self.model = pickle.load(open(self.fname, 'rb'))

        logger.info("LDA: Finished loading model files.")
Example #31
	def __init__(self, model_prefix = 'wiki_en'):
		logger = logging.getLogger("LDA")
		self.model_prefix = model_prefix
		if self.model_prefix is None:
			raise ValueError("model_prefix must be specified")

		self.fname = 'lda_model.p'

		logger.info("LDA: Loading word dictionary...")
		self.dict = Dictionary.load_from_text(model_prefix + '_wordids.txt')

		logger.info("LDA: Loading pretrained model...")
		self.model = pickle.load(open(self.fname, 'rb'))

		logger.info("LDA: Finished loading model files.")
Example #32
def main():
    global dictionary
    try:
        dictionary = Dictionary.load_from_text("persist/reuters_dictionary.txt")
        #dictionary = Dictionary.load_from_text("persist/wiki_stem-False_keep-100000_nobelow-20_noabove-0.1_wordids.txt.bz2")

    except:
        dictionary = Dictionary(ReutersCorpus())
        dictionary.filter_extremes()
        dictionary.save_as_text("persist/reuters_dictionary.txt")

    models = train_models()

    if settings["models"]["bow"]:
        bowmodel = BOWmodel()
        bowmodel.__out_size = len(dictionary)
        models["bow"] = bowmodel

    if settings["models"]["noise"]:
        noisemodel = NoiseModel(1000)
        noisemodel.__out_size = 1000
        models["noise"] = noisemodel

    num_train_samples = 21578 - settings["held_out_docs"]
    test_samples = []


    class generate_train_samples(object):
        first_iteration = True

        def __iter__(self):
            count = 0
            for document in stream_reuters_documents():
                sample = document["content"], "acq" in document["topics"]  # todo: maybe try "usa" or "earn"
                if count > num_train_samples:
                    if self.first_iteration:
                        test_samples.append(sample)
                else:
                    yield sample
                count += 1
            self.first_iteration = False

    classifiers = train_classifiers(models, generate_train_samples())

    classifications = run_evaluation(classifiers, models, test_samples)
    #output_results(classifications)

    return classifications
Example #33
    def test_loadFromText_legacy(self):
        """
        `Dictionary` can be loaded from textfile in legacy format.
        Legacy format does not have num_docs on the first line.
        """
        tmpf = get_tmpfile('load_dict_test_legacy.txt')
        no_num_docs_serialization = to_utf8("1\tprvé\t1\n2\tslovo\t2\n")
        with open(tmpf, "wb") as file:
            file.write(no_num_docs_serialization)

        d = Dictionary.load_from_text(tmpf)
        self.assertEqual(d.token2id[u"prvé"], 1)
        self.assertEqual(d.token2id[u"slovo"], 2)
        self.assertEqual(d.dfs[1], 1)
        self.assertEqual(d.dfs[2], 2)
        self.assertEqual(d.num_docs, 0)
Example #34
    def test_loadFromText_legacy(self):
        """
        `Dictionary` can be loaded from textfile in legacy format.
        Legacy format does not have num_docs on the first line.
        """
        tmpf = get_tmpfile('load_dict_test_legacy.txt')
        no_num_docs_serialization = to_utf8("1\tprvé\t1\n2\tslovo\t2\n")
        with open(tmpf, "wb") as file:
            file.write(no_num_docs_serialization)

        d = Dictionary.load_from_text(tmpf)
        self.assertEqual(d.token2id[u"prvé"], 1)
        self.assertEqual(d.token2id[u"slovo"], 2)
        self.assertEqual(d.dfs[1], 1)
        self.assertEqual(d.dfs[2], 2)
        self.assertEqual(d.num_docs, 0)
Example #35
def crawl_new_article(request):
    news_list = get_current_news_article()

    # decide whether the fetched news articles should be recommended
    download_blob('word/all_id2word.txt', '/tmp/all_id2word.txt')
    dct = Dictionary.load_from_text("/tmp/all_id2word.txt")

    download_blob('model_2.pickle', '/tmp/model_2.pickle')
    with open('/tmp/model_2.pickle', mode='rb') as f:
        classifier = pickle.load(f)

    bow_docs = make_bow(dct)
    result = predict(news_list, dct, classifier, bow_docs)

    upsert_new_articles(result)

    return {"status": "ok"}
Example #36
 def loadDictionary(self, type='offline'):
     '''
     Load the dictionary from cache; if it does not exist yet, build it and save it.
     '''
     startTime = datetime.now()
     filePath = self.cachePath + '%s_dictionary_%s.txt' % (self.name, type)
     if os.path.isfile(filePath):
         dictionary = Dictionary.load_from_text(filePath)
     else:
         if type == 'offline':
             docList = self.getDocList('train')
             dictionary = makeDictionary(docList)
         elif type == 'all':
             docList = []
             if os.path.isfile(self.cachePath +
                               '%s_dictionary_online.txt' % self.name):
                 logging.warning('dictionary continue')
                 docList.extend(self.getDocList('testA'))
                 docList.extend(self.getDocList('testB'))
                 dictionary = makeDictionary(
                     docList,
                     dictFile=self.cachePath +
                     '%s_dictionary_online.txt' % self.name,
                     add=True)
             else:
                 for dfName in self.dfFile.keys():
                     docList.extend(self.getDocList(dfName))
                 dictionary = makeDictionary(docList)
         elif type == 'online' and os.path.isfile(
                 self.cachePath + '%s_dictionary_offline.txt' % self.name):
             docList = self.getDocList('valid')
             dictionary = makeDictionary(
                 docList,
                 dictFile=self.cachePath +
                 '%s_dictionary_offline.txt' % self.name,
                 add=True)
         else:
             docList = self.getDocList('train')
             docList.extend(self.getDocList('valid'))
             dictionary = makeDictionary(docList)
         dictionary.save_as_text(filePath)
         logging.warning('make dictionary time: %s' %
                         (datetime.now() - startTime))
     self.dictionary[type] = dictionary
     return dictionary
Example #37
def get_dictionary_corpus(data,
                          save_path_dict='extracted_data/lda_dictionary',
                          save_path_bcorp='extracted_data/lda_bow_corpus'):
    if isfile(save_path_dict):
        dictionary = Dictionary.load_from_text(save_path_dict)
        corpus = gensim.corpora.MmCorpus(save_path_bcorp)
    else:
        dictionary = gensim.corpora.Dictionary(data)
        dictionary.filter_extremes(no_above=0.5, keep_n=100000)
        corpus = [dictionary.doc2bow(doc) for doc in data]
        dictionary.save_as_text(save_path_dict)
        gensim.corpora.MmCorpus.serialize(save_path_bcorp, corpus)

    #    bow_doc_2 = bow_corpus[2]
    #    for i in range(len(bow_doc_2)):
    #        print("Word {} (\"{}\") appears {} time.".format(bow_doc_2[i][0], dictionary[bow_doc_2[i][0]], bow_doc_2[i][1]))

    return corpus, dictionary
Example #38
def documentFrequencies():
    dictionary = Dictionary.load_from_text(
        'C:/Users/Admin/Anaconda2/envs/py27/corpora/wiki2017_wordids.txt.bz2')
    print(max(dictionary.token2id.values()))
    #... get the id corresponding to token "hello"
    tokenid = (dictionary.token2id["hello"])
    print(tokenid)
    #... get the document frequencies in the full corpus for which "hello" appeared
    print(dictionary.dfs[dictionary.token2id["hello"]])
    #... compute the total number of features in this corpus
    print(len(dictionary))

    #... CONSTRUCT THE Document Frequency OUTPUT FILE
    dforig = dictionary.dfs
    dfdict = {}
    for key, val in dforig.items():
        dfdict[str(dictionary[key])] = val

    fieldnames = ["term", "df"]
    with open("document_frequencies.tsv", "w+", encoding="utf-8") as handle:
        writer = csv.writer(handle, delimiter="\t")
        #writer.writerows(dfdict)
        for key, val in dfdict.items():
            writer.writerow([key, val])
        handle.close()

    #... load in the bag-of-words matrix market file for comparison
    mm_name = "C:/Users/Admin/Anaconda2/envs/py27/corpora/wiki2017_bow.mm"
    wikimodel = MmCorpus(mm_name)
    #... the matrix market should have the same number of features as len(dictionary)
    print(wikimodel)

    #... checked and verified that "hello" appears in the same number of documents as computed earlier
    if False:
        counter = 0
        featcount = 0
        for doc in wikimodel:
            res = [x for x in doc if x[0] == tokenid]
            if len(res) > 0:
                #print (counter, ":",res)
                featcount += 1
                #break
        print(featcount)
Example #39
def get_lda_topics(transcript_utterances, trained_lda_model_filepath,
                   trained_lda_wordids_filepath):
    '''
    Parameters
    transcript_utterances: list of lists of strings (words); each row is a plaintext utterance in the transcript.
    trained_lda_model_filepath: string, path to the trained LDA model ('/p/spoclab/models/LDA/lda_model_wiki').
    trained_lda_wordids_filepath: string, path to the word IDs of the trained LDA model ('/p/spoclab/models/LDA/lda_wordids.txt.bz2').

    Returns:
    topic_probabilities: list of floats, probability of each of the k topics.
    kurtosis: float, kurtosis of all topic probabilities.
    skewness: float, skewness of all topic probabilities.
    entropy: float, entropy of all topic probabilities.
    '''

    # Get files
    trained_lda_model = return_file(trained_lda_model_filepath)
    trained_lda_wordids = return_file(trained_lda_wordids_filepath)

    # Load LDA model
    lda_model = ldamodel.LdaModel.load(trained_lda_model)

    # Load wordids as a dictionary
    id2word = Dictionary.load_from_text(trained_lda_wordids)

    # Convert transcript of tokens into a BoW document
    document_bow = []
    for transcript_utterance in transcript_utterances:
        document_bow += id2word.doc2bow(transcript_utterance)

    # Get document topics
    doc_topics = lda_model.get_document_topics(document_bow,
                                               minimum_probability=0)
    topic_probabilities = [doc_topic[1] for doc_topic in doc_topics]

    skewness = stats.skew(topic_probabilities)
    kurtosis = stats.kurtosis(topic_probabilities)

    # Entropy: SUM(-plog2p)
    entropy = np.sum([-(p * np.log2(p)) for p in topic_probabilities])

    return topic_probabilities, kurtosis, skewness, entropy
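Note: a hedged call sketch for the function above; the transcript is made up, and the model paths are the ones mentioned in the docstring and assumed to exist locally.

transcript = [["the", "cat", "sat", "on", "the", "mat"],
              ["it", "was", "a", "sunny", "day"]]
probs, kurt, skew, ent = get_lda_topics(
    transcript,
    '/p/spoclab/models/LDA/lda_model_wiki',
    '/p/spoclab/models/LDA/lda_wordids.txt.bz2')
print(len(probs), kurt, skew, ent)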
Example #40
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute(
        'SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20,
                                    no_above=0.5,
                                    keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example #41
def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # load back the id->word mapping directly from file
    fin = path.join(datadir, "reuters21578.dict.txt")
    vocabulary = Dictionary.load_from_text(fin)

    # load the corpus
    fin = path.join(datadir, "reuters21578.mm")
    mm = MmCorpus(fin)

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=vocabulary, normalize=True)

    # save the TfidfModel instance to file
    fout = path.join(datadir, "reuters21578.tfidf.model")
    tfidf.save(fout)

    # save TF-IDF vectors in matrix market format
    fout = path.join(datadir, "reuters21578.tfidf.mm")
    MmCorpus.serialize(fout, tfidf[mm], progress_cnt=10000)
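Note: a hedged follow-on sketch (paths reused from the example above; the sample sentence is arbitrary) showing how the saved model transforms a new bag-of-words document.

import os
from os import path
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

datadir = path.abspath(path.join(os.getcwd(), "data"))
vocabulary = Dictionary.load_from_text(path.join(datadir, "reuters21578.dict.txt"))
tfidf = TfidfModel.load(path.join(datadir, "reuters21578.tfidf.model"))
bow = vocabulary.doc2bow("oil prices rose sharply".split())
print(tfidf[bow])   # list of (term_id, tf-idf weight) pairs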
Example #42
def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # load the LDA model
    fin = path.join(datadir, "reuters21578.lda.model.bz2")
    lda = LdaModel.load(fin)

    # load the corpus
    fin = path.join(datadir, "reuters21578.mm.bz2")
    mm = MmCorpus(fin)


    # load the vocabulary
    fin = path.join(datadir, "reuters21578.dict.txt")
    vocabulary = Dictionary.load_from_text(fin)

    data = pyLDAvis.gensim.prepare(lda, mm, vocabulary)
    pyLDAvis.show(data,
                  ip=socket.gethostname().lower(),
                  local=True,
                  open_browser=True,
                  http_server=None)
Example #43
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example #44
    def __init__(self, model_prefix = None, num_best = None):
        self.model_prefix = model_prefix
        self.num_best = num_best
        if self.model_prefix is None:
            raise ValueError("model_prefix must be specified")

        logger.info("ESA: Loading word dictionary...")
        self.dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')

        logger.info("ESA: Loading document name map...")
        self.article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

        logger.info("ESA: Loading TF-IDF model...")
        self.tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

        logger.info("ESA: Loading similarity index...")
        self.similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')

        #logger.info("ESA: Preloading reverse indexes...")
        #self.similarity_index.preload_reverse_index()

        logger.info("ESA: Finished loading model files.")
Example #45
def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating speech serialized corpus')
    # Create the speech corpus, it is inside the rawfile as a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)
    # We also need to make sure that the article ids are saved in the correct
    # format so that the gensimple engine can understand it, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    counter = 0
    for key, value in speech_dict.items():
        texts.append([token for token in value['text']])
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
        counter += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')
    # # Now run LSI on TDIDF
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #46
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        mywiki = myWikiCorpus(inp, lemmatize=lemmatize)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
        MmCorpus.serialize(outp + '_bowm.mm', mywiki, progress_cnt=10000) # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example #47
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)

    # check and process input arguments
    if len(sys.argv) < 3:
        print(inspect.cleandoc(__doc__) % locals())
        sys.exit(1)
    input_file, output_prefix = sys.argv[1:3]

    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("Loading word dictionary...")
    dictionary = Dictionary.load_from_text(output_prefix + '_wordids.txt.bz2')
    logger.debug(dictionary)

    logger.info("Loading document name map...")
    article_dict = utils.unpickle(output_prefix + '_bow.mm.metadata.cpickle')

    logger.info("Loading tf-idf model...")
    tfidf = TfidfModel.load(output_prefix + '.tfidf_model')

    logger.info("Loading similarity index...")
    similarity_index = Similarity.load(output_prefix + '_similarity.index', mmap='r')
    similarity_index.use_reverse_index = True
    similarity_index.preload_reverse_index()

    logger.info("Finished loading model files.")
Example #48
import os
import matplotlib.pyplot as plt
import multiprocessing as mp
import numpy as np
import scipy.stats as stats
import time

from gensim import matutils
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import LdaMulticore
from os import path

datadir = path.abspath(path.join(os.getcwd(), "data"))

fin = path.join(datadir, "reuters21578.dict.txt")
vocabulary = Dictionary.load_from_text(fin)

fin = path.join(datadir, "reuters21578.mm.bz2")
mm = MmCorpus(fin)

def sym_kl(p, q):
    return np.sum([stats.entropy(p, q), stats.entropy(q, p)])
    
def arun(corpus, dictionary, min_topics=10, max_topics=100, step=10):
    l = np.array([sum(cnt for _, cnt in doc) for doc in corpus])
    
    kl = []
    for n in range(min_topics, max_topics+step, step):
        print("starting multicore LDA for num_topics={}".format(n))
        st = time.clock()
        lda = LdaMulticore(corpus=corpus,
Example #49
def build_model(dictionary_path, mm_corpus_path):
    dictionary = Dictionary.load_from_text(dictionary_path)
    # Use the tf-idf corpus here, not the original one.
    mm = MmCorpus(mm_corpus_path)
    lsi = lsimodel.LsiModel(corpus=mm, id2word=dictionary, num_topics=400)
    lsi.save('/home/andre/Develop/corpora/lsamodel_lsi.model')
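Note: a hedged follow-up sketch (the path is reused from build_model above and assumed to exist) for loading the saved LSI model and inspecting its top topics.

from gensim.models import LsiModel

lsi = LsiModel.load('/home/andre/Develop/corpora/lsamodel_lsi.model')
for topic_id, topic in lsi.print_topics(5):
    print(topic_id, topic)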
Example #50
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)

    # check and process input arguments
    if len(sys.argv) < 2:
        print(inspect.cleandoc(__doc__) % locals())
        sys.exit(1)
    model_prefix = sys.argv[1]

    logger.info("running %s" % ' '.join(sys.argv))

    logger.info("Loading word dictionary...")
    dictionary = Dictionary.load_from_text(model_prefix + '_wordids.txt.bz2')
    logger.debug(dictionary)

    logger.info("Loading document name map...")
    article_dict = utils.unpickle(model_prefix + '_bow.mm.metadata.cpickle')

    logger.info("Loading tf-idf model...")
    tfidf = TfidfModel.load(model_prefix + '.tfidf_model')

    logger.info("Loading similarity index...")
    similarity_index = Similarity.load(model_prefix + '_similarity.index', mmap='r')
    similarity_index.use_reverse_index = True

    logger.info("Finished loading model files.")

    mismatches = 0
Example #51
    else:  ## not online
        # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        wiki = WikiCorpus(
            args.input, lemmatize=args.lemmatize, 
            max_articles=args.max_articles,
            expect_streamitems=args.expect_streamitems,                          
            file_name_pattern=args.file_name_pattern,
        ) 
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(args.output + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
        wiki.dictionary.save_as_text(args.output + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(args.output + '_wordids.txt.bz2')
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(args.output + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(args.output + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %r" % args.__dict__)
Example #52
def apply_tfidf(dictionary_path, mm_corpus_path):
    dictionary = Dictionary.load_from_text(dictionary_path)
    mm = MmCorpus(mm_corpus_path)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    MmCorpus.serialize('/home/andre/Develop/corpora/lsamodel_tfidf.mm',
                       tfidf[mm], progress_cnt=10000)
Example #53
def build_corpus(dictionary_path):
    dictionary = Dictionary.load_from_text(dictionary_path)
    corpus = CorpusIterator(dir_list=dir_list, bow=True, dictionary=dictionary)
    MmCorpus.serialize(
        '/home/andre/Develop/corpora/lsamodel_bow.mm',
        corpus, progress_cnt=10000)
Example #54
 def load_dict():
     return Dictionary.load_from_text('./dict.txt')
Example #55
 def __init__(self, lda_file, dic_file):
     self.lda_model = LdaModel.load(lda_file)
     self.dictionary = Dictionary.load_from_text(dic_file)
Example #56
        with codecs.open(self.path_, 'r', 'utf-8') as in_f:
            for line in in_f:
                doc = [word for word in line.strip().split()
                       if len(word) > 0 and word in tokens]
                doc = vocab.doc2bow(doc)
                if len(doc) > 0:
                    yield doc


if __name__ == "__main__":
    logging.basicConfig(
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        level=logging.INFO
    )
    vocab = Dictionary.load_from_text('./vocab.txt')
    corpus = UnlabeledCorpus('./rumor_train.csv', vocab)
    valid_corpus = UnlabeledCorpus('./rumor_valid.csv', vocab)
    valid_sentences = [doc for doc in valid_corpus][5000:]

    # varing number of topics
    # result = {}
    # for num_topics in [2, 4, 8, 16, 32, 64]:
    #     best_value = -100
    #     for i in range(5):
    #         model = LdaModel(corpus=corpus, id2word=vocab, num_topics=num_topics)
    #         likelihood = model.log_perplexity(valid_sentences)
    #         best_value = max(best_value, likelihood)
    #     result[num_topics]= best_value
    #
    # for num_topics, likelihood in result.iteritems():
Example #57
from utils import generate_timestamp

logging.basicConfig(
        format='%(asctime)s : %(levelname)s : %(message)s',
        level=logging.INFO
)
timestamp = generate_timestamp()

parser = argparse.ArgumentParser()
parser.add_argument("-d", "--dictionary", help="path to wiki_en_wordids.txt")
parser.add_argument("-c", "--corpus", help="path to wiki_en_tfidf.mm")
parser.add_argument("-m", "--model", help="path to model output")
args = parser.parse_args()

# load id->word mapping (the dictionary)
id2word = Dictionary.load_from_text(bz2.BZ2File(args.dictionary))

# load corpus iterator
mm = MmCorpus(args.corpus)

print(mm)
# MmCorpus(3933461 documents, 100000 features, 612118814 non-zero entries)

# extract num_topics LSI topics; use the default one-pass algorithm
num_topics = 400
model = LsiModel(corpus=mm, id2word=id2word, num_topics=num_topics)

# print the most contributing words (both positively and negatively) for each of the first ten topics
model.print_topics(10)

model.save("%s/%s.model" % (args.model, timestamp))
Example #58
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2', use_bzip2=True)
        wiki.save(outp + '_corpus.pkl.bz2', use_bzip2=True)
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2', use_bzip2=True)
        # load back the id->word mapping directly from file
        # this seems to save more memory, compared to keeping the wiki.dictionary object from above
        dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2', use_bzip2=True)
    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)

    # save tfidf vectors in matrix market format
    # ~4h; result file is 15GB! bzip2'ed down to 4.5GB
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example #59
 def __init__(self):
     self.model = LdaModel.load(settings.lda_model_name)
     self.dictionary = Dictionary.load_from_text(settings.wordids_txt)
Example #60
    print(globals()['__doc__'] % locals())
    sys.exit(1)
inp, outp = sys.argv[1:3]
if len(sys.argv) > 3:
    keep_words = int(sys.argv[3])
else:
    keep_words = DEFAULT_DICT_SIZE

# build dictionary. only keep the most frequent words (out of total ~8.2m
# unique tokens) takes about 9h on a macbook pro, for 3.5m articles (june 2011)
wiki = WikiCorpus(inp, keep_words=keep_words)
# save dictionary and bag-of-words (term-document frequency matrix)
# another ~9h
wiki.dictionary.save_as_text(outp + '_wordids.txt')
MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
del wiki

# initialize corpus reader and word->id mapping
id2token = Dictionary.load_from_text(outp + '_wordids.txt')
mm = MmCorpus(outp + '_bow.mm')

# build tfidf,
# ~30min
tfidf = TfidfModel(mm, id2word=id2token, normalize=True)

# save tfidf vectors in matrix market format
# ~2h; result file is 15GB! bzip2'ed down to 4.5GB
MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

logger.info("finished running %s" % program)