Example #1
def train_tfidf(inpath=config.path_train_cut):
    train_df = pd.read_csv(inpath,
                           sep="\t",
                           header=None,
                           names=["id", "s1", "s2", "label"],
                           encoding="utf-8")
    tfidf_txt = train_df["s1"].tolist() + train_df["s2"].tolist()
    texts = [tokenize(text) for text in tfidf_txt]

    # remove words that appear only once
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    documents = [[token for token in text if frequency[token] > 1]
                 for text in texts]

    dictionary = Dictionary(documents)
    dictionary.save_as_text("./model/words.dic")

    # dictionary = Dictionary.load_from_text("./model/words.dic")

    class MyCorpus(object):
        def __iter__(self):
            for doc in documents:
                yield dictionary.doc2bow(doc)

    corpus = MyCorpus()
    MmCorpus.serialize("./model/corpus.mm", corpus)
    # corpus = MmCorpus("./model/corpus.mm")
    tfidf = TfidfModel(corpus)
    tfidf.save("./model/tf_idf.model")
Example #2
File: data.py Project: lliutianc/plda-imdb
def load_20newsgroup(root=DATA_DIR, download=False):
    data_folder = os.path.join(root, "datasets/20newsgroup")
    processed_folder = os.path.join(data_folder, "processed")
    if download:
        if not os.path.exists(data_folder):
            os.makedirs(data_folder)
        pull_from_url(data_folder, dataset="20newsgroup")
        docs = []
        groups = []
        subfolders = os.listdir(
            os.path.join(data_folder, NEWSGROUP_DECOMPRESS_FOLDER))
        for subf in subfolders:
            work_dir = os.path.join(data_folder, NEWSGROUP_DECOMPRESS_FOLDER,
                                    subf)
            filenames = os.listdir(work_dir)
            for file in filenames:
                with open(os.path.join(work_dir, file), "r") as f:
                    docs.append(word_tokenize(f.read().strip()))
                    groups.append(subf)
        docs = [[LEMMATIZER.lemmatize(token) for token in doc] for doc in docs]
        if not os.path.exists(processed_folder):
            os.mkdir(processed_folder)
        logging.info("Building dictionary and BOW corpus for the dataset...")
        corpus, dictionary = build_corpus(docs)
        MmCorpus.serialize(os.path.join(processed_folder, "corpus.mm"), corpus)
        dictionary.save(os.path.join(processed_folder, "dictionary"))
        return corpus, dictionary
    else:
        try:
            return MmCorpus(os.path.join(processed_folder, "corpus.mm")), \
                   Dictionary.load(os.path.join(processed_folder, "dictionary"))
        except FileNotFoundError:
            logging.warning(
                "The dataset does not exist, please set download to True!")
            return None, None
Example #3
    def __convertToCorpus(self, documents):
        """
        Steps to make the documents compatible to gensim
        Changelog
        - 15/3 KS First commit
        :param documents:
        :return:
        """
        #Preprocessing the text
        dp = DataPreprocessing()
        text = dp.getBagOfWords(documentDF=documents,
                                return_type='document_tokens')

        #Create a Gensim text corpus based on documents
        print("Creating a text dictionary")
        self.dictionary = Dictionary(line.lower().split()
                                     for line in documents)
        print(self.dictionary)
        print("Saving text dictionary to file")
        self.dictionary.save('../data.prune/producttext.dict')

        #Create a Gensim document corpus based on text corpus and each document
        print("Creating a Gensim document corpus")
        self.corpus = [self.dictionary.doc2bow(line) for line in text]

        print("Saving corpus to file")
        MmCorpus.serialize('../data.prune/productcorpus.mm', self.corpus)
        self.corpus = MmCorpus('../data.prune/productcorpus.mm')
        print(self.corpus)
Example #4
def create_topic_model(application_id: str) -> None:
    """Creates the topic model from all completely fetched accounts"""
    logging.info('Starting to create topic model for application id: %s' %
                 application_id)
    with engine.begin() as connection:
        logging.info('Requesting complete accounts')
        accounts = list(
            models.account.select_multiple_complete(application_id,
                                                    SOURCES['TWITTER'],
                                                    connection))

        logging.info('Loading documents')
        documents = load_documents(accounts, connection)

        topic_model_path = get_topic_model_path(application_id)
        create_folder_if_not_exists(topic_model_path)

        logging.info('Creating dictionary')
        dictionary = create_dictionary(documents)
        dictionary.save(os.path.join(topic_model_path, 'dictionary'))

        logging.info('Creating corpus')
        MmCorpus.serialize(os.path.join(topic_model_path, 'corpus.mm'),
                           MyCorpus(dictionary, documents))
        corpus = MmCorpus(os.path.join(topic_model_path, 'corpus.mm'))

        logging.info('Creating LDA Model')
        lda_model = create_lda_model(corpus, dictionary)
        lda_model.save(os.path.join(topic_model_path, 'ldamodel'))
        topics_words = get_topic_words(lda_model, NUM_TOPIC_WORDS)
        models.topic_model.insert_one(application_id, SOURCES['TWITTER'],
                                      topics_words, connection)
Example #5
    def extend_corpus(self, corpus):
        """
        Add new documents in `corpus` to `self.corpus`. If serialization is used,
        then the entire corpus (`self.corpus`) is re-serialized and the new documents
        are added in the process. If serialization is not used, the corpus, as a list
        of documents, is simply extended.

        """
        if self.serialized:
            # Re-serialize the entire corpus while appending the new documents.
            if isinstance(corpus, MmCorpus):
                # Check that we are not attempting to overwrite the serialized corpus.
                assert self.corpus.input != corpus.input, 'Input corpus cannot have the same file path as the model corpus (serialization_path).'
            corpus_chain = chain(
                self.corpus,
                corpus)  # A generator with the old and new documents.
            copyfile(
                self.serialization_path, self.serialization_path + '.tmp'
            )  # Make a temporary copy of the file where the corpus is serialized.
            self.corpus.input = self.serialization_path + '.tmp'  # Point the old corpus at this temporary file.
            MmCorpus.serialize(
                self.serialization_path, corpus_chain
            )  # Re-serialize the old corpus, and extend it with the new corpus.
            self.corpus = MmCorpus(
                self.serialization_path
            )  # Store the new serialized corpus object in self.corpus.
            remove(self.serialization_path +
                   '.tmp')  # Remove the temporary file again.
        else:
            # self.corpus and corpus are just lists, just extend the list.
            # First check that corpus is actually a list.
            assert isinstance(
                corpus, list
            ), "If serialized == False, all input corpora must be lists."
            self.corpus.extend(corpus)
Example #6
def create_LDA_dict():
    #ONE TIME USE, to create and save LDA model
    trigram_dictionary_filepath = '../Dataset/trigram_dict_all.dict'
    trigram_reviews = LineSentence(
        '../Dataset/trigram_transformed_reviews_all.txt')
    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()
    trigram_dictionary.save(trigram_dictionary_filepath)
    print('LDA dict saved.')
    trigram_bow_filepath = '../Models/trigram_bow_corpus_all.mm'
    MmCorpus.serialize(
        trigram_bow_filepath,
        trigram_bow_generator(
            '../Dataset/trigram_transformed_reviews_all.txt'))
    trigram_bow_corpus = MmCorpus(trigram_bow_filepath)
    lda_model_filepath = '../Models/lda_model_all'  #lda_model_all_30, lda_model_10topic
    # created LDA model with 10, 30, 50 topics, found 30 has best result
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda = LdaMulticore(
            trigram_bow_corpus,
            num_topics=30,  #10, 30, 50
            id2word=trigram_dictionary,
            workers=8)
    lda.save(lda_model_filepath)
    print('LDA model saved.')
Example #7
def apply_tfidf(dictionary_path, mm_corpus_path):
    dictionary = Dictionary.load_from_text(dictionary_path)
    mm = MmCorpus(mm_corpus_path)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    MmCorpus.serialize('/home/andre/Develop/corpora/lsamodel_tfidf.mm',
                       tfidf[mm],
                       progress_cnt=10000)
Example #8
def prepare_corpus(tweets_file, corpus_file, dictionary_file, author_topic):
    stop_words = set(stopwords.words('english'))
    stop_words.add(u'rt')

    print('Loading tweets from ' + tweets_file)
    tweets = pd.read_pickle(tweets_file)

    if author_topic:
        tweets = tweets.groupby('user').agg({'text': 'sum'})

    print('%d tweets loaded' % len(tweets.index))

    dictionary = Dictionary(tweets['text'])
    stopword_ids = map(dictionary.token2id.get, stop_words)
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None)
    dictionary.compactify()

    corpus = [dictionary.doc2bow(doc) for doc in tweets['text']]

    # print(corpus)
    print("Writing corpus to " + corpus_file)
    MmCorpus.serialize(corpus_file, corpus)
    # print(dictionary)
    print("Writing dictionary to " + dictionary_file)

    dictionary.save(dictionary_file)
Example #9
    def test_apply(self):

        transformed_vtcorp = self.transformer._apply(self.vtcorp)

        self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary'))

        transformed_names = self.loader.layout.required_text_corpus_names(self.transformation_label)
        text_data_name = os.path.join(self.data_root,
                                      self.loader.layout.corpus_dir,
                                      transformed_names[0])
        text_obj_name = os.path.join(self.data_root,
                                      self.loader.layout.corpus_dir,
                                      transformed_names[2])

        MmCorpus.serialize(text_data_name, transformed_vtcorp)
        transformed_vtcorp.save(text_obj_name)

        self.assertTrue(self.loader.has_text_corpora(self.transformation_label))

        self.temporary_files.extend([ os.path.join(self.data_root,
                                                   self.loader.layout.corpus_dir,
                                                   transformed_name)
                                      for transformed_name in transformed_names])

        transformed_vtcorp = TransformedCorpus.load(text_obj_name)

        self.assertIsInstance(transformed_vtcorp, TransformedCorpus)
        self.assertIsInstance(transformed_vtcorp.corpus, VTextCorpus)
        self.assertTrue(hasattr(transformed_vtcorp.corpus, 'dictionary'))

        print 'Transformed corpus dictionary size: %i' % len(transformed_vtcorp.corpus.dictionary)
        self.assertEqual(self.k, len(transformed_vtcorp.obj.orig2transformed))
Example #10
File: atmodel.py Project: abs51295/gensim
    def extend_corpus(self, corpus):
        """
        Add new documents in `corpus` to `self.corpus`. If serialization is used,
        then the entire corpus (`self.corpus`) is re-serialized and the new documents
        are added in the process. If serialization is not used, the corpus, as a list
        of documents, is simply extended.

        """
        if self.serialized:
            # Re-serialize the entire corpus while appending the new documents.
            if isinstance(corpus, MmCorpus):
                # Check that we are not attempting to overwrite the serialized corpus.
                assert self.corpus.input != corpus.input, \
                    'Input corpus cannot have the same file path as the model corpus (serialization_path).'
            corpus_chain = chain(self.corpus, corpus)  # A generator with the old and new documents.
            # Make a temporary copy of the file where the corpus is serialized.
            copyfile(self.serialization_path, self.serialization_path + '.tmp')
            self.corpus.input = self.serialization_path + '.tmp'  # Point the old corpus at this temporary file.
            # Re-serialize the old corpus, and extend it with the new corpus.
            MmCorpus.serialize(self.serialization_path, corpus_chain)
            self.corpus = MmCorpus(self.serialization_path)  # Store the new serialized corpus object in self.corpus.
            remove(self.serialization_path + '.tmp')  # Remove the temporary file again.
        else:
            # self.corpus and corpus are just lists, just extend the list.
            # First check that corpus is actually a list.
            assert isinstance(corpus, list), "If serialized == False, all input corpora must be lists."
            self.corpus.extend(corpus)
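The docstring above describes the key trick: chain the already-serialized documents with the new ones and re-serialize through a temporary copy. A stripped-down, hypothetical sketch of that pattern outside the model class:

# Hypothetical standalone version of the append-by-re-serialization pattern above.
from itertools import chain
from shutil import copyfile
from os import remove
from gensim.corpora import MmCorpus

def append_to_serialized(serialization_path, old_corpus, new_docs_bow):
    # Copy the current file so the old documents can still be read while the original path is overwritten.
    copyfile(serialization_path, serialization_path + '.tmp')
    old_corpus.input = serialization_path + '.tmp'
    MmCorpus.serialize(serialization_path, chain(old_corpus, new_docs_bow))
    remove(serialization_path + '.tmp')
    return MmCorpus(serialization_path)  # new_docs_bow must already be in bag-of-words format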
Example #11
def main(argv=None):
    if argv is None:
        argv = sys.argv

    print('Creating simple wiki serialized corpus')
    # Download the raw file if we do not have it already
    if not os.path.isfile(WIKIFILE):
        # Get the file
        wget.download(WIKIURL)
    wiki = WikiCorpus(WIKIFILE, lemmatize=False)
    i = 0
    article_dict = {}
    for text in wiki.get_texts(meta=True):
        url_string = 'https://simple.wikipedia.org/wiki/?curid={}'
        article_dict[i] = (url_string.format(text[0]), text[1])
        i += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.1,
                                    keep_n=DEFAULT_DICT_SIZE)
    MmCorpus.serialize(MMFILE, wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(DICTFILE)
    print('Simple wiki serialized corpus created')
    # Now run LSI
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #12
def get_corpus(docs):
    print("Building corpus ...")
    tfidf_model = None

    # load corpus from disk 
    if ARGS.load_corpus: 
        corpus = MmCorpus(ARGS.path_corpus)

    else:
        corpus = [dictionary.doc2bow(doc) for doc in docs]

        # serialize corpus to disk to prevent memory problems if corpus gets too large
        MmCorpus.serialize(ARGS.save_dir + '/corpora/corpus_bow.mm', corpus)  
        corpus = MmCorpus(ARGS.save_dir + '/corpora/corpus_bow.mm')

        if ARGS.corpus_type == "TFIDF": 
            tfidf_model = TfidfModel(corpus)

            tfidf_model.save(ARGS.save_dir + "/models/tfidf_model.mm")
            corpus = tfidf_model[corpus]

            # serialize corpus to disk to prevent memory problems if corpus gets too large
            MmCorpus.serialize(ARGS.save_dir + '/corpora/corpus_tfidf.mm', corpus)  
            corpus = MmCorpus(ARGS.save_dir + '/corpora/corpus_tfidf.mm')
    return corpus, tfidf_model
Example #13
 def save(self, dictionary_file="corpus.dict", corpus_file="corpus.mm", sup_file=None):
     if dictionary_file:
         Dictionary.save(self.dictionary, dictionary_file)
     if corpus_file:
         MmCorpus.serialize(corpus_file, self)
     if sup_file and type(self.docs) is PaperCorpus:
         self.docs.save(sup_file)
Example #14
def lda(clean_docs, model_name, topics):
    # turn all data into a dictionary mapping of normalized words and their integer ids
    from gensim import corpora
    dictionary = corpora.Dictionary(clean_docs)

    # convert each document, called text, into bag-of-words representation (list of (token_id, token_count) tuples)
    # in other words, it counts how often each word occurs in each doc of the text and saves that in the corpus
    corpus = []
    for doc in clean_docs:
        corpus.append(dictionary.doc2bow(doc))

    # serialize version: save dictionary and corpus for future use
    from gensim.corpora import MmCorpus
    MmCorpus.serialize('corpus_' + model_name + '.mm', corpus)
    dictionary.save('dictionary_' + model_name + '.gensim')

    # Train LDA model
    from gensim.models import LdaModel
    num_topics = topics  # find this number of topics in the data
    passes = 15

    ldamodel = LdaModel(corpus,
                        num_topics=num_topics,
                        id2word=dictionary,
                        passes=passes)
    ldamodel.save('model_' + model_name + '.gensim')
    topics = ldamodel.print_topics(num_words=5)

    for topic in topics:
        print(topic)
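A small hypothetical companion to the function above, reloading the persisted dictionary and model to get a topic distribution for an unseen document:

# Hypothetical follow-up: reload the saved artifacts and infer topics for a new document.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

def infer_topics(model_name, doc_tokens):
    dictionary = Dictionary.load('dictionary_' + model_name + '.gensim')
    ldamodel = LdaModel.load('model_' + model_name + '.gensim')
    bow = dictionary.doc2bow(doc_tokens)
    return ldamodel.get_document_topics(bow)  # list of (topic_id, probability) tuples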
Example #15
 def _create_corpus(self,
                    data_path,
                    stopwords_path,
                    corpus_path,
                    data_ready_path,
                    save=True):
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                         level=logging.INFO)
     with open(stopwords_path, 'r', encoding='utf8') as f:
         stopwords = f.read().split()
     with open(data_path, 'r', encoding='utf8') as f:
         data = f.readlines()
     texts = []
     doc = []
     for row in map(lambda r: json.loads(r), data):
         tmp = [word for word in tokenize(row) if word not in stopwords]
         texts.append(tmp)
         doc.append(" ".join(tmp))
     # Create Dictionary
     without_stopwords = data_path + 'processed'
     with open(without_stopwords, 'w', encoding='utf8') as f:
         for raw in doc:
             f.write(raw)
             f.write('\n')
     corpus = gensim.corpora.textcorpus.TextCorpus(without_stopwords)
     if save:
         tmp_file = get_tmpfile(corpus_path)
         MmCorpus.serialize(tmp_file, corpus)
         with open(data_ready_path, 'wb') as f:
             pkl.dump(texts, f)
     return corpus, texts
Example #16
def guidedLDA_Model(topics, cores=11):
    """
    Topics represents desired LDA topics,
    cores should be physical cores minus one.
    Both should be integers.
    """

    # load finished dictionary from disk
    trigram_dictionary = Dictionary.load('./models2/trigram_dict_all.dict')

    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize('./models2/trigram_bow_corpus.nm',
                        trigram_bow_generator('./models2/trigram_transformed_reviews.txt'))

    # load finished bag-of-words corpus from disk
    trigram_bow_corpus = MmCorpus('./models2/trigram_bow_corpus.nm')


    # Pass the bag-of-words matrix and Dictionary from previous steps to LdaMulticore as inputs,
    # along with the number of topics the model should learn

    # workers => sets the parallelism, and should be
    # set to your number of physical cores minus one
    lda = LdaMulticore(trigram_bow_corpus,
                       num_topics=topics,
                       id2word=trigram_dictionary,
                       workers=cores)

    lda.save('./models2/lda_model')

    # load the finished LDA model from disk
    #lda = LdaMulticore.load('./models/lda_model_neg')

    return trigram_bow_corpus, lda
Example #17
def pylda_visualize(csv_chemin, ecriture_chemin, tfidf_visualization = False, num_topic=3, filter_by_cluster=None):
    ''' gets the clustering result from csv_chemin and then writes the LDA visualisation as an html file into ecriture_chemin
        csv_chemin points to a dataframe with two columns: one corresponding to the cluster, the other containing the text
         num_topic is the number of topics we want to extract from the texts
         filter_by_cluster is the cluster index, if we want to extract topics from one cluster only
    '''
    #df = pd.read_csv('df_brown.csv')
    clustering_result_df = pd.read_csv(csv_chemin)
    if filter_by_cluster:
        clustering_result_df = clustering_result_df[clustering_result_df['pred_cluster'] == filter_by_cluster]
    text = clustering_result_df['text'].values
    #text = ' '.join(text)

    docs = pd.DataFrame(list(map(load_doc, enumerate(list(clustering_result_df['text'].apply(clean))))))
    docs.head()

    dictionary, corpus = prep_corpus(docs['tokens'])
    #dictionary : keys = word_id ; value = word
    #corpus[i] = list of tuples (word_id, count) where count is the number of occurrences of the word in the text corpus[i]

    if tfidf_visualization:
        # Instead of representing each text as tuples (word_idx, term_frequency), we represent them as (word_idx, word_tfidf_weight)
        model = TfidfModel(corpus)
        new_corpus = []
        for i in range(len(corpus)):
            element = corpus[i]
            new_element = []
            # tf-idf weights for this document, keyed by word id
            weights = dict(model[element])
            for j in range(len(element)):
                #word = dictionary[pair[0]]
                pair = element[j]
                #dict_idx = pair[0]
                word_tfidf_weight = weights.get(pair[0], 0.0)
                new_element.append((pair[0], word_tfidf_weight))
            new_corpus.append(new_element)

        MmCorpus.serialize(ecriture_chemin + '.mm', corpus)
        dictionary.save(ecriture_chemin + '.dict')

        lda = models.ldamodel.LdaModel(corpus=new_corpus, id2word=dictionary, num_topics=15, passes=10)

        lda.save(ecriture_chemin + '.model')


        vis_data = gensimvis.prepare(lda, new_corpus, dictionary)
        pyLDAvis.display(vis_data)
        pyLDAvis.save_html(vis_data, ecriture_chemin + '.html')

    else:
        MmCorpus.serialize(ecriture_chemin + '.mm', corpus)
        dictionary.save(ecriture_chemin + '.dict')

        lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topic, passes=10)

        lda.save(ecriture_chemin + '.model')

        vis_data = gensimvis.prepare(lda, corpus, dictionary)
        pyLDAvis.display(vis_data)
        pyLDAvis.save_html(vis_data, ecriture_chemin + '.html')
Example #18
def createcorpus(bg_corpus,output_dictionary,output_serialize):
	# Generating a training/background corpus from your own source of documents
	#saving dictionary and corpus in Matrix method form
	print("Creating corpus and dictionary")
	background_corpus = TextCorpus(input=bg_corpus)
	background_corpus.dictionary.save(output_dictionary)
	MmCorpus.serialize(output_serialize,background_corpus)  
	return background_corpus,background_corpus.dictionary
Example #19
 def _create_bow_representation(self):
     """Create bag-of-words representation of collection, and save it 
        in Matrix Matrix format to disk."""
     
     print('Create bag-of-words matrix representation.')
     self.bow_corpus = [self.dictionary.doc2bow(article) 
                        for article in self.articles]
     MmCorpus.serialize(self.bowmm_filepath, self.bow_corpus)
Example #20
 def _getCorpus():
     try:
         wc = MmCorpus(corpusPath)
     except FileNotFoundError:
         wc = WikiCorpus(getInputPath(), tokenizer_func=tokenize)
         wc.dictionary.save(dictPath)
         MmCorpus.serialize(corpusPath, wc)
     return wc
Example #21
def model_all():
    dictionary, corpus = prep_corpus(df_text['text_tokens'])
    MmCorpus.serialize('wiki_articles.mm', corpus)
    dictionary.save('wiki_articles_new.dict')
    lda = models.ldamodel.LdaModel(corpus=corpus,
                                   id2word=dictionary,
                                   num_topics=15,
                                   passes=50)
    return lda, dictionary, corpus
Example #22
def main():
    # a command-line interface for running Gensim operations
    # can create a corpus from a directory of texts or from a Wikipedia dump
    # options to lemmatize words, build a model and/or produce pyLDAvis graph output
    parser = argparse.ArgumentParser(description='Create a corpus from a collection of tweets and/or build an LDA model')
    subparsers = parser.add_subparsers(dest='mode')
    
    text_corpus_parser = subparsers.add_parser('text', help='Build corpus from directory of text files')
    text_corpus_parser.add_argument('-d', '--docs_loc', required=True, action='store', dest='docs_loc', help='Directory where tweet documents stored')
    text_corpus_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location and name to save corpus')
    text_corpus_parser.add_argument('-m', '--lemma', action='store_true', dest='lemma', help='Use this option to lemmatize words')

    wiki_corpus_parser = subparsers.add_parser('wiki', help='Build corpus from compressed Wikipedia articles')
    wiki_corpus_parser.add_argument('-w', '--wiki_loc', required=True, action='store', dest='wiki_loc', help='Location of compressed Wikipedia dump')
    wiki_corpus_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location and name to save corpus')
    wiki_corpus_parser.add_argument('-m', '--lemma', action='store_true', dest='lemma', help='Use this option to lemmatize words')

    lda_model_parser = subparsers.add_parser('lda', help='Create LDA model from saved corpus')
    lda_model_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location of corpus')
    lda_model_parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc', help='Location of dictionary')
    lda_model_parser.add_argument('-n', '--num_topics', required=True, action='store', dest='num_topics', help='Number of topics to assign to LDA model')
    lda_model_parser.add_argument('-p', '--num_pass', required=True, action='store', dest='num_pass', help='Number of passes through corpus when training the LDA model')
    lda_model_parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc', help='Location and name to save LDA model')

    lda_vis_parser = subparsers.add_parser('ldavis', help='Create visualization of LDA model')
    lda_vis_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location of corpus')
    lda_vis_parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc', help='Location of dictionary')
    lda_vis_parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc', help='Location of LDA model')

    argcomplete.autocomplete(parser)
    args = parser.parse_args()

    if args.mode == 'text':
        doc_corpus = DocCorpus(args.docs_loc, args.lemma)

        doc_corpus.dictionary.filter_extremes(no_below=1, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)

        MmCorpus.serialize(args.corp_loc + '.mm', doc_corpus)
        doc_corpus.dictionary.save(args.corp_loc + '.dict')

    if args.mode == 'wiki':
        if args.lemma:
            wiki_corpus = WikiCorpus(args.wiki_loc, lemmatize=True, tokenizer_func=wiki_tokenizer, article_min_tokens=100, token_min_len=3, token_max_len=15)
        else:
            wiki_corpus = WikiCorpus(args.wiki_loc, lemmatize=False, tokenizer_func=wiki_tokenizer, article_min_tokens=100, token_min_len=3, token_max_len=15)

        wiki_corpus.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)

        MmCorpus.serialize(args.corp_loc + '.mm', wiki_corpus)
        wiki_corpus.dictionary.save(args.corp_loc + '.dict')

    if args.mode == 'lda':
        build_LDA_model(args.corp_loc, args.dict_loc, args.num_topics, args.num_pass, args.lda_loc)

    if args.mode == 'ldavis':
        build_pyLDAvis_output(args.corp_loc, args.dict_loc, args.lda_loc)
Example #23
File: lda.py Project: huaiwen/GraBTax
def generate_matrix_market(
    dictionary,
    save=False,
    file=os.path.join(config.map("Storage")['storage_dir'] + 'corpus.mm')):
    corpus = iter_docs(dictionary)

    if save:
        MmCorpus.serialize(file, corpus)

    return corpus
Example #24
def load_experts():
    """
    load expert data and save to file
    """
    expert_corpus = ExpertCorpus()
    MmCorpus.serialize(corpus=expert_corpus, fname='expert_corpus_new_test.mm')
    """
    save expert-to-document map to pickle
    """
    pickle.dump(expert2doc, open('expert2doc_new_test.p', 'wb'))
Example #25
def create_bow(trigram_reviews_filepath, trigram_bow_filepath,
               trigram_dictionary):
    """
    generate bag-of-words representations for
    # all reviews and save them as a matrix
    """

    MmCorpus.serialize(
        trigram_bow_filepath,
        trigram_bow_generator(trigram_reviews_filepath, trigram_dictionary))
Example #26
    def _create_files(self):
        wikidata = WikiData(self.config.dataset_dir, self.tokenizer)
        doc_stream = (tokens for tokens in wikidata)
        dictionary_wiki = Dictionary(doc_stream)
        dictionary_wiki.filter_extremes(no_below=20, no_above=0.1)

        dictionary_wiki.save(self.wiki_dict_file)

        wiki_corpus = WikiCorpus(wikidata, dictionary_wiki)
        MmCorpus.serialize(self.mm_corpus_file, wiki_corpus)
Example #27
    def create_bow_corpus(self, persist=True):
        self.log('Making Bag-of-Words corpus...')
        self.bow_corpus = list(
            map(self.dict.doc2bow, self.preprocessed_article_texts))
        sleep(2)

        if persist:
            self.log('Saving Bag-of-Words corpus to disk for future use...')
            MmCorpus.serialize(self.bow_corpus_file, self.bow_corpus)
            sleep(2)
Example #28
def _serialize_corpus_(fpath,
                       dic,
                       outfpath=fpathroot + fpathappend + '_serialized.mm',
                       returncorp=True):
    """
    create serialized corpus
    """
    MmCorpus.serialize(outfpath, _bow_generator_(fpath, dic))
    if returncorp == True:
        return MmCorpus(outfpath)
Example #29
 def set_corpus(self, language_processed_data: list, corpus_file_path: str):
     logging.info("---- Creating corpus from processed data")
     corpus = [
         self.dictionary.doc2bow(list_of_words_of_doc)
         for list_of_words_of_doc in language_processed_data
     ]
     MmCorpus.serialize(corpus_file_path, corpus)
     self.corpus = corpus
     logging.info("---- Corpus is created")
     return
Example #30
 def _create_tfidf_matrix(self):
     """Create TF-IDF matrix and save it in Matrix Matrix format to 
        disk"""
     
     print('Create TF-IDF matrix of collection.')
     tfidf = TfidfModel(self.bow_corpus, 
                        id2word=self.dictionary, 
                        normalize=True)
     MmCorpus.serialize(self.tfidf_filepath, 
                        tfidf[self.bow_corpus])
     print('Number of documents:', tfidf.num_docs)
Example #31
def load_experts():
    """
    load expert data and save to file
    """
    expert_corpus = ExpertCorpus()
    MmCorpus.serialize(corpus=expert_corpus, fname='expert_corpus_new_test.mm')

    """
    save expert-to-document map to pickle
    """
    pickle.dump(expert2doc, open('expert2doc_new_test.p', 'wb'))
Example #32
 def persist_corpus(self, corpus: Corpus, key: str = "corpus") -> Corpus:
     """Takes a transient corpus generator and persists it on disk. Only necessary when using a corpus more than once."""
     with Message(f"Storing {key} corpus"):
         f = get_tmpfile(f"irsel_{key}")
         # By serializing a corpus to disk, we can read it multiple times (which is impossible with a generator)
         # without having to load it into RAM as a whole at any time.
         MmCorpus.serialize(f, corpus)
         corpus = MmCorpus(
             f)  # this instance can be consumed as often as we want
     printq(corpus)
     return corpus
Example #33
def create_corpus(input_file, corpus_filepath, dictionary, run_or_load_flag):
    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    if run_or_load_flag:
        MmCorpus.serialize(corpus_filepath,
                           trigram_bow_generator(input_file, dictionary))
        corpus = MmCorpus(corpus_filepath)
    else:
        corpus = MmCorpus(corpus_filepath)

    return corpus
Example #34
    def _train_lda(vectorizer,
                   corpora_path,
                   id2word_path,
                   model_dir,
                   model_fname=model_fname,
                   num_topics=10):
        """训练和保存基于tfidf的lda模型

        基于{corpora_path}文件保存的语料和{id2word_path}保存的gensim字典来训练lda_tfidf模型,

        保存该模型到{model_dir}文件夹下

        Args:
            vectorizer(str) :- 向量化方法, choices=["bow", "tfidf"]
            corpora_path(path) :- 保存语料的.txt文件
            id2word_path(path) :- 保存gensim字典的文件
            model_dir(path) :- 保存gensim LDA模型的文件夹
            model_fname(path) :- 模型文件名
            num_topics(int) :- lda的超参,主题数
        """
        try:
            assert vectorizer in ["bow", "tfidf"]
        except AssertionError:
            raise AssertionError("vectorizer must be bow or tfidf")

        if not os.path.isdir(model_dir):
            raise OSError(model_dir, "doesn't exist")

        corpora = []
        with open(corpora_path, 'r', encoding="utf8") as fp:
            lines = fp.readlines()
            for line in lines:
                corpora.append(line.strip())
        id2word = gensim.corpora.Dictionary.load(id2word_path)
        corpus = [id2word.doc2bow(corpus.split(" ")) for corpus in corpora]

        # for tfidf we also need to compute the idf weights
        if vectorizer == "tfidf":
            MmCorpus.serialize(corpus_tfidf_mm, corpus)
            corpus = MmCorpus(corpus_tfidf_mm)

        model = gensim.models.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)

        model_path = os.path.join(model_dir, vectorizer)
        make_dir(model_path)
        model_path = os.path.join(model_path, model_fname)
        if not os.path.isfile(model_path):
            model.save(model_path)
            print('model saved')
        else:
            print(f"{model_path} already exists")
        return model
Example #35
    def _create_files1(self):
        dir = "/media/rohola/data/dialog_systems/alexa_prize_topical_chat_dataset/reading_sets/"
        wikidata = TopicalDataset(dir, self.tokenizer)
        doc_stream = (tokens for tokens in wikidata)

        id2word_wiki = Dictionary(doc_stream)
        id2word_wiki.filter_extremes(no_below=20, no_above=0.2)

        id2word_wiki.save(self.wiki_dict_file)

        wiki_corpus = WikiCorpus(wikidata, id2word_wiki)
        MmCorpus.serialize(self.mm_corpus_file, wiki_corpus)
Example #36
def main():
    dataset, version, nbfiles, pos_tags, tfidf, args = parse_args()

    corpus_type = "tfidf" if tfidf else "bow"

    logger = init_logging(name=f'MM_{dataset}_{corpus_type}',
                          basic=False,
                          to_stdout=True,
                          to_file=True)
    logg = logger.info if logger else print
    log_args(logger, args)

    texts, stats, nbfiles = make_texts(dataset, nbfiles, pos_tags, logg=logg)
    gc.collect()

    file_name = f'{dataset}{nbfiles if nbfiles else ""}_{version}'
    directory = join(LDA_PATH, version)
    if not exists(directory):
        makedirs(directory)

    # --- saving texts ---
    file_path = join(directory, f'{file_name}_texts.json')
    logg(f'Saving {file_path}')
    with open(file_path, 'w') as fp:
        json.dump(texts, fp, ensure_ascii=False)

    # --- saving stats ---
    file_path = join(directory, f'{file_name}_stats.json')
    logg(f'Saving {file_path}')
    with open(file_path, 'w') as fp:
        json.dump(stats, fp)

    # generate and save the dataset as bow or tfidf corpus in Matrix Market format,
    # including dictionary, texts (json) and some stats about corpus size (json)
    corpus, dictionary = texts2corpus(texts,
                                      tfidf=tfidf,
                                      filter_below=5,
                                      filter_above=0.5,
                                      logg=logg)

    file_name += f'_{corpus_type}'
    directory = join(directory, corpus_type)

    # --- saving corpus ---
    file_path = join(directory, f'{file_name}.mm')
    logg(f'Saving {file_path}')
    MmCorpus.serialize(file_path, corpus)

    # --- saving dictionary ---
    file_path = join(directory, f'{file_name}.dict')
    logg(f'Saving {file_path}')
    dictionary.save(file_path)
Example #37
 def discover(self, textcolname, num_topics, passes):
     self.num_topics = num_topics
     self.passes = passes
     self._logger.info("cleanzing data for '%s'", textcolname)
     self.__cleanze(textcolname)
     self._logger.info("creating corpus and dictionary for '%s'", textcolname)
     self.dictionary, self.corpus = self.__corpus()
     self._logger.info("applying lda model '%s'", textcolname)
     self.lda = self.__model()
     self._logger.info("saving models for '%s'", textcolname)
     MmCorpus.serialize(PConstant.CORPUS_DIR_PATH.value + textcolname +'_corpus.mm', self.corpus)
     self.dictionary.save( PConstant.DICTIONARY_DIR_PATH.value + textcolname + '_dictionary.dict')
     self.lda.save( PConstant.LDA_DIR_PATH.value + textcolname + '_lda.model')
Example #38
    def init_empty_corpus(self):
        """
        Initialize an empty corpus. If the corpora are to be treated as lists, simply
        initialize an empty list. If serialization is used, initialize an empty corpus
        of the class `gensim.corpora.MmCorpus`.

        """
        if self.serialized:
            # Initialize the corpus as a serialized empty list.
            # This corpus will be extended in self.update.
            MmCorpus.serialize(self.serialization_path, [])  # Serialize empty corpus.
            self.corpus = MmCorpus(self.serialization_path)  # Store serialized corpus object in self.corpus.
        else:
            # All input corpora are assumed to just be lists.
            self.corpus = []
Example #39
def pretrain():
    """pre train the text corpus and build the dictionary"""
    gutenberg_corpus = TextCorpus(text_corpus_file)
    gutenberg_corpus.dictionary.save(dict_file)
    gutenberg_corpus.dictionary.save_as_text(dic_txt_file)
    mm = MmCorpus.serialize(mm_corpus_file, gutenberg_corpus)
    print mm;
Example #40
def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # Read in the corpus from within the archive file
    fin = path.join(datadir, "reuters21578.tar.gz")
    rc = ReutersCorpus(fin)

    # filter out some of the more common words,
    # and some of the less-common ones as well
    rc.dictionary.filter_extremes(no_below=20, no_above=0.1)
    rc.dictionary.compactify()

    # Serialize the Reuters 21578 corpus
    fout = path.join(datadir, "reuters21578.mm")
    MmCorpus.serialize(fout, rc)

    # Save the dictionary to file as text
    fout = path.join(datadir, "reuters21578.dict.txt")
    rc.dictionary.save_as_text(fout)
Example #41
File: task2.py Project: ypandit/exercises
def get_topics_lda(tokens, n_topics=10):
    """
    Using the `gensim` package for LDA.
    LDA is a little better than LSA as it provides a reasonable mixture of topics (Wikipedia).
    `gensim` is a package for topic modeling only, so for a particular topic modeling task
    it is a lighter option to install and run. It can also be run distributed and updated over an existing model.

    :param tokens: Preprocessed tokens for faster dictionary building
    :param n_topics: Number of topics to decompose data to
    :return: list() of topics
    """
    dict_file = 'resources/deals.dict'
    if not os.path.isfile(dict_file):
        print "Dictionary file does not exist. Creating one"
        dictionary = Dictionary(tokens)
        freq1 = [id for id, freq in dictionary.dfs.iteritems() if freq == 1]
        dictionary.filter_tokens(freq1)
        dictionary.compactify()
        dictionary.save(dict_file)
    dictionary = Dictionary.load(dict_file)
    # print dictionary

    corpus_file = 'resources/deals.mm'
    if not os.path.isfile(corpus_file):
        print "Corpus file does not exist. Creating one"
        corpus = [dictionary.doc2bow(token) for token in tokens]
        MmCorpus.serialize(corpus_file, corpus)
    mm = MmCorpus(corpus_file)
    # print mm
    # tfidf = TfidfModel(mm)
    # corpus_tfidf = tfidf[mm]

    lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=n_topics, update_every=1, chunksize=1000,
                   passes=1)
    topics = []
    for i in range(0, n_topics):
        words = lda.print_topic(i).split('+')
        topic = []
        for word in words:
            score, w = word.split('*')
            topic.append((w, score))
        topics.append(topic)
    return topics
Example #42
def main():
    datadir = path.abspath(path.join(os.getcwd(), "data"))

    # load back the id->word mapping directly from file
    fin = path.join(datadir, "reuters21578.dict.txt")
    vocabulary = Dictionary.load_from_text(fin)

    # load the corpus
    fin = path.join(datadir, "reuters21578.mm")
    mm = MmCorpus(fin)

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=vocabulary, normalize=True)

    # save the TfidfModel instance to file
    fout = path.join(datadir, "reuters21578.tfidf.model")
    tfidf.save(fout)

    # save TF-IDF vectors in matrix market format
    fout = path.join(datadir, "reuters21578.tfidf.mm")
    MmCorpus.serialize(fout, tfidf[mm], progress_cnt=10000)
Example #43
def main(argv=None):
    if argv is None:
        argv = sys.argv
    print('Creating speech serialized corpus')
    # Create the speech corpus, it is inside the rawfile as a json format:
    # "id0": {"text": [" "], "url": "http://www.americanrhetoric.com/"}
    with open(RAWFILE, 'r') as f:
        speech_dict = json.load(f)
    with open(RAWIDS, 'r') as f:
        id_dict = json.load(f)
    # We also need to make sure that the article ids are saved in the correct
    # format so that the gensimple engine can understand it, like this:
    # "int": ["url", "title"],
    texts = []
    article_dict = {}
    counter = 0
    for key, value in speech_dict.items():
        texts.append([token for token in value['text']])
        article_dict[str(counter)] = [value['url'], id_dict[key]['title']]
        counter += 1
    with open(ARTICLEDICT, 'w') as f:
        json.dump(article_dict, f)
    dictionary = Dictionary(texts)
    dictionary.save_as_text(DICTFILE)
    corpus = [dictionary.doc2bow(text) for text in texts]
    MmCorpus.serialize(MMFILE, corpus)
    print('Speech serialized corpus created')
    # Now run LSI on TF-IDF
    dictionary = Dictionary.load_from_text(DICTFILE)
    mm = MmCorpus(MMFILE)
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(TDIFMODEL)
    MmCorpus.serialize(TDIFFILE, tfidf[mm], progress_cnt=10000)
    mm_tdif = MmCorpus(TDIFFILE)
    lsi = LsiModel(mm_tdif, id2word=dictionary, num_topics=300)
    index = similarities.MatrixSimilarity(lsi[mm_tdif])
    index.save(SIMMATRIX)
    lsi.save(LSIMODEL)
    print("LSI model and index created")
Example #44
def main():
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp = OUT_PREFIX
    keep_words = DEFAULT_DICT_SIZE

    # the doc index
    dbc = get_cursor()
    dbc.execute('SELECT id, title FROM wiki_pages WHERE is_artist=1 ORDER BY id')
    docindex = [(pageid, title) for pageid, title in dbc]
    pickle.dump(docindex, open(outp + '_docindex.p', 'wb'))

    lemmatize = True  # 'lemma' in program

    wiki = WikiCorpus(pages_gen, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=20, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf, ~50min
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
    # another long task
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)

    logger.info("finished running %s" % program)
Example #45
def main(args):

    logging.info('Initializing loaders with root %s, name %s' % (
        args.root, args.name))

    dloader = MultimodalDatasetLoader(args.root, args.name)

    icorp = dloader.load_image_corpus(args.img_label)

    transformer = NormalizationTransform()

    normalized_icorp = transformer._apply(icorp)

    corpus_names = dloader.layout.required_img_corpus_names(args.transformation_label)
    corpus_full_path = os.path.join(args.root, corpus_names[0])

    logging.info('Serializing to file %s' % corpus_full_path)

    MmCorpus.serialize(corpus_full_path, normalized_icorp)

    logging.info('Re-saving original corpus object with infix %s' % args.transformation_label)

    dloader.save_image_corpus(normalized_icorp.corpus, args.transformation_label)
Example #46
    def _build_model(self, all_documents, remove_once=False):
        '''
        Builds the lsa model

        Returns:
            dictionary, corpus
        '''
        doc_hash = hash_obj(all_documents)
        corp_cache_path = CACHE_DIR + '/' + doc_hash +\
            '_corp_' + str(int(remove_once))
        dic_cache_path = CACHE_DIR + '/' + doc_hash +\
            '_dic_' + str(int(remove_once))
        lsi_cache_path = CACHE_DIR + '/' + doc_hash +\
            '_lsi_' + str(int(remove_once))
        if os.path.exists(corp_cache_path) \
                and os.path.exists(dic_cache_path)\
                and os.path.exists(lsi_cache_path):
            lsi = models.LsiModel.load(lsi_cache_path)
            corp = MmCorpus(corp_cache_path)
            dic = Dictionary.load(dic_cache_path)
        else:
            texts = [self.tokenize(doc) for doc in all_documents]
            all_tokens = sum(texts, [])
            if remove_once:
                tokens_once = set(word for word in set(all_tokens)
                                  if all_tokens.count(word) == 1)
                texts = [[word for word in text if word not in tokens_once]
                         for text in texts]
            dic = Dictionary(texts)
            corp = [dic.doc2bow(text) for text in texts]

            MmCorpus.serialize(corp_cache_path, corp)
            dic.save(dic_cache_path)
            lsi = models.LsiModel(
                corp, id2word=dic, num_topics=20)
            lsi.save(lsi_cache_path)
        return dic, corp, lsi
Example #47
File: demo.py Project: pielstroem/Topics
def upload_file():
    """
    Upload csv files and create:
        * ~/out/corpus.dict
        * ~/out/corpus.lda
        * ~/out/corpus.lda.state
        * ~/out/corpus.mm
        * ~/out/corpus.mm.index
        * ~/out/corpus_doclabels.txt
        * ~/out/corpus_topics.txt
        * ~/mycorpus.txt

    As well as (for example):
        * ~/swcorp/Doyle_AStudyinScarlet.txt
        * ~/swcorp/Lovecraft_AttheMountainofMadness.txt
        * etc.
    """

    # INPUT
    # columns to read from csv file
    columns = ['ParagraphId', 'TokenId', 'Lemma', 'CPOS', 'NamedEntity']

    # parts-of-speech to include into the model
    pos_tags = ['ADJ', 'NN', 'V']

    # stopwords
    regex = re.compile('\w+')
    stopwords = request.files['stoplist']
    stopwords = str(stopwords.readlines())
    stopwords = regex.findall(stopwords)
    stopwords.extend(("'", "'d", "'s")) # temporary solution
    print(stopwords)

    # document size (in words)
    doc_size = 1000

    # uses the pipeline's ParagraphId to split text into documents,
    # overrides doc_size - 1: on, 0: off
    doc_split = 0

    # no. of topics to be generated
    no_of_topics = 30

    # no. of lda iterations - usually, the more the better, but
    # increases computing time
    no_of_passes = 1

    # perplexity estimation every n chunks -
    # the smaller the better, but increases computing time
    eval = 1

    # documents to process at once
    chunk = 100

    # "symmetric", "asymmetric", "auto", or array
    # (default: a symmetric 1.0/num_topics prior) affects sparsity of
    # the document-topic (theta) distribution
    alpha = "symmetric"

    # custom alpha may increase topic coherence, but may also produce
    # more topics with zero probability alpha = np.array([ 0.02, 0.02,
    # 0.02, 0.03, 0.03, 0.03, 0.04, 0.04, 0.04, 0.05, 0.05, 0.04, 0.04,
    # 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02])

    # can be a number (int/float), an array, or None
    # affects topic-word (lambda) distribution - not necessarily
    # beneficial to topic coherence
    eta = None

    # PREPROCESSING
    files = request.files.getlist('files')
    docs = []
    doc_labels = []

    print("\n reading files ...\n")

    for file in files:
        file_label = secure_filename(file.filename).split('.')[0]

        df = pd.read_csv(file, sep="\t", quoting=csv.QUOTE_NONE)
        df = df[columns]
        df = df.groupby('CPOS')

        doc = pd.DataFrame()
        for p in pos_tags:  # collect only the specified parts-of-speech
            doc = doc.append(df.get_group(p))
            # construct documents
            if doc_split:  # size according to paragraph id
                doc = doc.groupby('ParagraphId')
                for para_id, para in doc:
                    docs.append(para['Lemma'].values.astype(str))
                    doc_labels.append(
                        ''.join([file_label, " #", str(para_id)]))
            else:  # size according to doc_size
                doc = doc.sort_values(by='TokenId')
                i = 1
                while(doc_size < doc.shape[0]):
                    docs.append(
                        doc[:doc_size]['Lemma'].values.astype(str))
                    doc_labels.append(
                        ''.join([file_label, " #", str(i)]))
                    doc = doc.drop(doc.index[:doc_size])
                    i += 1
                docs.append(doc['Lemma'].values.astype(str))
                doc_labels.append(''.join([file_label, " #", str(i)]))

            if not os.path.exists(os.path.join(os.getcwd(), "swcorp")):
                os.makedirs(os.path.join(os.getcwd(), "swcorp"))

            swpath = os.path.join('swcorp', "".join(file_label))

            with open(swpath + ".txt", 'w', encoding="utf-8") as text:
                text.write(" ".join(
                    word for word in doc['Lemma'].values.astype(str)
                    if word not in stopwords))

    print("\n normalizing and vectorizing ...\n")

    # texts = [
    #   [word for word in doc if word not in stopwords] for doc in docs]

    print("\n stopwords removed ...\n")

    print("\n writing mastercorpus ...\n")

    mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt')

    with open(mastercorpus, 'w', encoding="utf-8") as data:
        folder = glob.glob("swcorp/*")
        for filename in folder:
            with open(filename, 'r', encoding="utf-8") as text:
                textline = [re.sub(
                    r'\\n\\r', '', document) for document in ' '.join(
                        text.read().split())]
                if filename != folder[-1]:
                    data.write("".join(textline) + "\n")
                else:
                    data.write("".join(textline))

    # MAIN PART
    mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt')

    dictionary = corpora.Dictionary(
        line.lower().split() for line in open(
            mastercorpus, encoding="utf-8"))

    class MyCorpus(object):
        def __iter__(self):
            for line in open('mycorpus.txt'):
                # assume there's one document per line, tokens
                # separated by whitespace
                yield dictionary.doc2bow(line.lower().split())

    # corpus = buildCorpus(mastercorpus, dictionary)

    corpus = MyCorpus()

    # corpus = glob.glob("swcorpus/*")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)): os.makedirs(os.path.join
    # (os.path.join(os.getcwd(), 'out'), foldername))

    MmCorpus.serialize(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus.mm'])), corpus)
    mm = MmCorpus('out/corpus.mm')

    print(mm)

    # doc_labels = glob.glob("corpus/*")

    print("fitting the model ...\n")

    model = LdaModel(
        corpus=mm, id2word=dictionary, num_topics=no_of_topics,
        passes=no_of_passes, eval_every=eval, chunksize=chunk,
        alpha=alpha, eta=eta)

    # model = LdaMulticore(corpus=corpus, id2word=dictionary,
    # num_topics=no_of_topics, passes=no_of_passes,
    # eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

    print(model, "\n")

    topics = model.show_topics(num_topics=no_of_topics)

    for item, i in zip(topics, enumerate(topics)):
        print("topic #"+str(i[0])+": "+str(item)+"\n")

    print("saving ...\n")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)):
    # os.makedirs(os.path.join(os.path.join(os.getcwd(), 'out'),
    # foldername))

    with open(
        os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
            ["corpus_doclabels.txt"])), "w", encoding="utf-8") as f:
            for item in doc_labels:
                f.write(item + "\n")

    with open(
        os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
            ["corpus_topics.txt"])), "w", encoding="utf-8") as f:
        for item, i in zip(topics, enumerate(topics)):
            f.write(
                "".join(["topic #", str(i[0]), ": ", str(item), "\n"]))

    dictionary.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus', 'dict'])))
    # MmCorpus.serialize(
    # os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
    # [foldername, 'mm'])), corpus)
    model.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus', 'lda'])))

    print("\n ta-daaaa ...\n")
    
    # VISUALIZATION
    no_of_topics = model.num_topics
    no_of_docs = len(doc_labels)
    doc_topic = np.zeros((no_of_docs, no_of_topics))
    
    for doc, i in zip(corpus, range(no_of_docs)):
        # topic_dist is a list of tuples (topic_id, topic_prob)
        topic_dist = model.__getitem__(doc)
        for topic in topic_dist:
            doc_topic[i][topic[0]] = topic[1]
    
    # get plot labels
    topic_labels = []
    for i in range(no_of_topics):
        # show_topic() returns tuples (word_prob, word)
        topic_terms = [x[0] for x in model.show_topic(i, topn=3)]
        topic_labels.append(" ".join(topic_terms))
        
    # cf. https://de.dariah.eu/tatom/topic_model_visualization.html

    if no_of_docs > 20 or no_of_topics > 20:
        plt.figure(figsize=(20, 20)) # if many items, enlarge figure
    plt.pcolor(doc_topic, norm=None, cmap='Reds')
    plt.yticks(np.arange(doc_topic.shape[0])+1.0, doc_labels)
    plt.xticks(
        np.arange(doc_topic.shape[1])+0.5, topic_labels, rotation='90')
    plt.gca().invert_yaxis()
    plt.colorbar(cmap='Reds')
    plt.tight_layout()
    plt.savefig("./static/corpus_heatmap.svg")
    return render_template('success.html')
Example #48

logging.basicConfig(stream=sys.stdout, level=logging.INFO)


### Generating a training/background corpus
from gensim.corpora import TextCorpus, MmCorpus, Dictionary

# Provide a filename or a file-like object as input and TextCorpus will be automatically initialized with a
# dictionary in `self.dictionary` and will support the `iter` corpus method. For other kinds of corpora, you only
# need to override `get_texts` and provide your own implementation.
background_corpus = TextCorpus(input=YOUR_CORPUS)

# Important -- save the dictionary generated by the corpus, or future operations
# will not be able to map results back to original words.
background_corpus.dictionary.save("my_dict.dict")

# Uses numpy to persist the corpus in Matrix Market format. The file can be several GBs.
MmCorpus.serialize("background_corpus.mm", background_corpus)


### Generating a large training/background corpus using Wikipedia
from gensim.corpora import WikiCorpus, wikicorpus

articles = "enwiki-latest-pages-articles.xml.bz2"  # available from http://en.wikipedia.org/wiki/Wikipedia:Database_download

wiki_corpus = WikiCorpus(articles)  # This will take many hours! Output is Wikipedia as a bag-of-words (BOW) sparse matrix.
wiki_corpus.dictionary.save("wiki_dict.dict")

MmCorpus.serialize("wiki_corpus.mm", wiki_corpus)  #  File will be several GBs.


### Working with persisted corpus and dictionary
bow_corpus = MmCorpus("wiki_corpus.mm")  # Revive a corpus
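# The matching dictionary can be revived the same way (sketch; "wiki_dict.dict" was saved above)
dictionary = Dictionary.load("wiki_dict.dict")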
Example #49
0
from gensim.corpora import WikiCorpus, MmCorpus

def make_corpus(path):
    wiki = WikiCorpus(path)
    MmCorpus.serialize('/mnt/ebs/wikidata/wiki_jp_vocab.mm', wiki)
Example #50
0
File: lda2.py Project: pielstroem/Topics

class MyCorpus(object):
    def __iter__(self):
        for line in open('mycorpus.txt'):
            # assume there's one document per line, tokens separated by whitespace
            yield dictionary.doc2bow(line.lower().split())

corpus = MyCorpus()

# create output folder
if not os.path.exists("out"):
    os.makedirs("out")

corpusPath = os.path.join(os.path.join(os.getcwd(), "out"), '.'.join([foldername, 'mm']))

MmCorpus.serialize(corpusPath, corpus)

mm = MmCorpus(corpusPath)

doc_labels = makeDocLabels(path)

log.info('fitting the model ...')

# fitting the model
model = LdaModel(corpus=mm, id2word=dictionary, num_topics=no_of_topics, passes=no_of_passes,
                 eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

log.info('generated topics...')

# print topics
topics = model.show_topics(num_topics=no_of_topics)
Example #51
0
File: GenSim.py Project: rsteckel/EDA
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
dictionary.filter_tokens(once_ids)


dictionary.save(os.environ['NM_HOME'] + '/Data/product_text.dict')


corpus = [dictionary.doc2bow(text) for text in texts]
#corpus = TextCorpus(input=texts)

# Important -- save the dictionary generated by the corpus, or future operations will not be able to map results
# back to original words.
#corpus.dictionary.save("/Users/rsteckel/Workspace/NM/product_text.dict")
#dictionary = corpus.dictionary

MmCorpus.serialize(os.environ['NM_HOME']+"/Data/product_corpus.mm", corpus)  

documents.close()


#-------------LDA-------------
lda = LdaModel(corpus, num_topics=10, id2word=dictionary)

#lda.show_topics()
for i in np.arange(10):
    print lda.print_topic(i), '\n'



#--------------LSI----------------
tfidf = models.TfidfModel(corpus)
Example #52
0
    # remove stopwords
    stopwords = nltk_stopwords().union(additional_stopwords)
    stopword_ids = [dictionary.token2id[word] for word in stopwords if word in dictionary.token2id]
    dictionary.filter_tokens(stopword_ids)
    dictionary.compactify()
    # get ids for short words, len(word) <= 3
    shortword_ids = [tokenid for tokenid, word in dictionary.iteritems() if len(word.split('/')[0]) <= 3]
    dictionary.filter_tokens(shortword_ids)
    dictionary.compactify()
    # remove words that appear only once
    once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq == 1]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    # filter extreme values
    dictionary.filter_extremes(no_below=no_below, no_above=no_above, keep_n=None)
    dictionary.compactify()

    print('Building corpus...')
    corpus = [dictionary.doc2bow(doc) for doc in docs]

    return dictionary, corpus

dictionary, corpus = prep_corpus(docs['tokens'])

MmCorpus.serialize('data/model/newsgroups.mm', corpus)
dictionary.save('data/model/newsgroups.dict')

lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=50, passes=10)
lda.save('data/model/newsgroups_50.model')
Example #53
0
    wiki = WikiCorpus(inp, lemmatize=lemmatize)
    # only keep the most frequent words
    wiki.dictionary.filter_extremes(no_below=min_threshold, no_above=max_threshold, keep_n=keep_words)

    # Remove stop words (additional removal of common words used in spoken language)
    stop_ids = []
    with open(stop_words_file, 'r') as infile:
        for line in infile:
            try:
                stop_ids.append(wiki.dictionary.token2id[line.lower().strip()])
            except KeyError:
                continue
    wiki.dictionary.filter_tokens(bad_ids=stop_ids)

    # save dictionary and bag-of-words (term-document frequency matrix)
    MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000)
    wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
    # load back the id->word mapping directly from file
    # this seems to save more memory, compared to keeping the wiki.dictionary object from above
    dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2')

    del wiki

    # initialize corpus reader and word->id mapping
    mm = MmCorpus(outp + '_bow.mm')

    # build tfidf
    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)
    tfidf.save(outp + '.tfidf_model')

    # save tfidf vectors in matrix market format
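    # A sketch of the truncated step (not part of the original snippet): the tf-idf-transformed
    # corpus would presumably be serialized under the same output prefix, e.g.
    MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000)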
Example #54
0
argc=len(sys.argv)
if(argc>1):
    year=int(sys.argv[1])

if(argc>2):
    topicnum=int(sys.argv[2])

print year, topicnum

if not os.path.exists('%d/CVPRpapers%d.mm' % (year, year)):
    with open('%d/allpapers%d.txt' % (year, year)) as fp:
        d = fp.readlines()
        docs = [i.split(" ") for i in d]

        dictionary, corpus = prep_corpus(docs)

        MmCorpus.serialize('%d/CVPRpapers%d.mm' % (year, year),
                           corpus)
        dictionary.save('%d/CVPRpapers%d.dict' % (year, year))
else:
    # load the previously serialized corpus and dictionary so the LDA step below has data
    # (assumes gensim's MmCorpus and Dictionary are imported at the top of the script)
    corpus = MmCorpus('%d/CVPRpapers%d.mm' % (year, year))
    dictionary = Dictionary.load('%d/CVPRpapers%d.dict' % (year, year))


t0=time.clock()
lda = models.ldamodel.LdaModel(corpus=corpus, 
                               id2word=dictionary,
                               num_topics=topicnum,
                               passes=10)
print time.clock()-t0
                                      
lda.save('%d/CVPRpapers%d_%d.model'%(year,year,topicnum))

Example #55
0
def saveCorpus(corpus, corpusfile):
    from gensim.corpora import MmCorpus
    MmCorpus.serialize(corpusfile, corpus)
Example #56
0
if(argc>2):
    topicnum=int(sys.argv[2])

if(argc>3):
    conference=sys.argv[3]

relpath= conference+str(year)
rname= relpath+'/papers'
print conference, year, topicnum

if not os.path.exists(rname + '.mm'):
    with open(relpath + '/allpapers.txt') as fp:
        d = fp.readlines()
        docs = [i.split(" ") for i in d]

        dictionary, corpus = prep_corpus(docs)

        MmCorpus.serialize(rname + '.mm',
                           corpus)
        dictionary.save(rname + '.dict')
else:
    # load the previously serialized corpus and dictionary so the LDA step below has data
    corpus = MmCorpus(rname + '.mm')
    dictionary = Dictionary.load(rname + '.dict')

t0=time.clock()
lda = models.ldamodel.LdaModel(corpus=corpus, 
                               id2word=dictionary,
                               num_topics=topicnum,
                               passes=10)
print time.clock()-t0
                                      
lda.save(relpath+'/papers_%d.model'%(topicnum))

Example #57
0
print 'Building dictionary of terms ...'
dictionary = corpora.Dictionary(texts)
print '%d word types' % len(dictionary)

print 'Filtering infrequent and frequent terms ...'
dictionary.filter_extremes(no_below=5, no_above=0.5)
print '%d word types, after filtering' % len(dictionary)

print 'Saving dictionary (%s)...' % DICT
dictionary.save(DICT)

print 'Building bag-of-words corpus ...'
bow_corpus = [ dictionary.doc2bow(t) for t in texts ]

print 'Serializing corpus (%s) ...' % BOW
MmCorpus.serialize(BOW, bow_corpus)

size = len(bow_corpus) * 4 / 5
training = bow_corpus[:size]
testing = bow_corpus[size:]

print 'Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5, iterations = 1000)

print 'Saving LDA model (%s) ...' % NSFLDA
lda.save(NSFLDA)

print 'Random subset of topics:'
print '\n'.join(lda.print_topics())

print 'Computing perplexity on %d held-out documents ...' % len(testing)
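# A sketch of the truncated step (not in the original): log_perplexity() returns a per-word
# likelihood bound, and perplexity is conventionally 2 ** (-bound).
bound = lda.log_perplexity(testing)
print 'Per-word bound: %.3f, perplexity: %.2f' % (bound, 2 ** (-bound))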
Example #58
0
File: app.py Project: farokojil/segme
    processed = []
    tokenizer = RegexpTokenizer(r'\w+')
    stopWords = stopwords.words('spanish')
    stemmer = SnowballStemmer('spanish')
    for userQueries in usersQueries:
        tokenized = tokenizer.tokenize(userQueries[0])
        cleaned = [token for token in tokenized if token not in stopWords]
        stemmed = [stemmer.stem(i) for i in cleaned]
        processed.append(stemmed)

finally:
    srcFile.close()

dictionary = Dictionary(processed)
corpus = [dictionary.doc2bow(userQueries) for userQueries in processed]
MmCorpus.serialize('data/corpus_all.mat', corpus)
# corpus = MmCorpus('data/corpus_10k')

def trainLda(corpus, dictionary):
    ldaModel = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=40)
    ldaModel.save('data/model/ldamodel_all.mod')
    ldaUsers = ldaModel[corpus]
    # persist the per-user topic vectors; calling .save() on the transformed corpus
    # would only pickle the thin wrapper object, not the vectors themselves
    MmCorpus.serialize('data/ldausers_all.mat', ldaUsers)

def trainHdp(corpus, dictionary):
    hdpModel = hdpmodel.HdpModel(corpus, id2word=dictionary)
    hdpModel.save('data/model/hdpmodel_all.mod')
    hdpUsers = hdpModel[corpus]
    MmCorpus.serialize('data/hdpusers_all.mat', hdpUsers)

trainLda(corpus, dictionary)
Example #59
0
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    if len(sys.argv) > 3:
        keep_words = int(sys.argv[3])
    else:
        keep_words = DEFAULT_DICT_SIZE
    online = 'online' in program
    lemmatize = 'lemma' in program
    debug = 'nodebug' not in program

    if online:
        dictionary = HashDictionary(id_range=keep_words, debug=debug)
        dictionary.allow_update = True # start collecting document frequencies
        wiki = WikiCorpus(inp, lemmatize=lemmatize, dictionary=dictionary)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # ~4h on my macbook pro without lemmatization, 3.1m articles (august 2012)
        # with HashDictionary, the token->id mapping is only fully instantiated now, after `serialize`
        dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        dictionary.save_as_text(outp + '_wordids.txt.bz2')
        wiki.save(outp + '_corpus.pkl.bz2')
        dictionary.allow_update = False
    else:
        wiki = WikiCorpus(inp, lemmatize=lemmatize) # takes about 9h on a macbook pro, for 3.5m articles (june 2011)
        mywiki = myWikiCorpus(inp, lemmatize=lemmatize)
        # only keep the most frequent words (out of total ~8.2m unique tokens)
        wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE)
        # save dictionary and bag-of-words (term-document frequency matrix)
        MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h
        MmCorpus.serialize(outp + '_bowm.mm', mywiki, progress_cnt=10000) # another ~9h
        wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2')
        # load back the id->word mapping directly from file
Example #60
0
def setup():
    documents = []
    import glob
    import os
    directoryNames = list(set(glob.glob(os.path.join("Data", "*"))).difference(set(glob.glob(os.path.join("Data","*.*")))))
    numberOfDocuments = 0

    for folder in directoryNames:
        for fileNameDir in os.walk(folder):
            for fileName in fileNameDir[2]:
                if fileName[-4:] != ".txt":
                    continue
                nameFileDocument = "{0}{1}{2}".format(fileNameDir[0], os.sep, fileName)
                with open(nameFileDocument, 'r') as doc:
                    doc_text = doc.read().replace('\n', '')
                import re
                processed_doc_text = re.sub('[^a-zA-Z0-9\n]', ' ', doc_text)
                documents.append(processed_doc_text)
                numberOfDocuments += 1
                break

    print(numberOfDocuments)

    # remove common words and tokenize

    #from gensim.utils import lemmatize
    #lemmatized_docs = [lemmatize(document) for document in documents]

    from stop_words import get_stop_words
    stop_words = get_stop_words('english')
    # split each document into lowercase tokens; iterating the raw string would yield characters
    texts = [[word for word in document.lower().split() if word not in stop_words]
             for document in documents]

    # remove words that appear only once
    from collections import defaultdict
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]

    from gensim import corpora
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=1000000)
    dictionary.save('files/pmc-data.dict') # store the dictionary, for future reference

    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('files/pmc-data.mm', corpus) # store to disk, for later use


    from gensim.corpora import MmCorpus

    mm = MmCorpus('files/pmc-data.mm')

    from gensim.models import TfidfModel

    tfidf = TfidfModel(mm, id2word=dictionary, normalize=True)

    MmCorpus.serialize('files/pmc-data-tfidf.mm', tfidf[mm], progress_cnt=10000)