def store_contents(data_path, save_path, datasource, processOnlyFilesinOriginalQrels, num_workers=None):
    """Preprocess a corpus of documents and store its dictionary, bag-of-words
    corpus, TF-IDF model and sparse TF-IDF matrix on disk.

    Args:
        data_path: Root path to directory (or directory of directories) of
            files containing the documents, one file per document.
        save_path: Path prefix for the output artifacts.
        datasource: Name of the TREC data source; used to look up qrels,
            topic ranges and per-datasource file names.
        processOnlyFilesinOriginalQrels: If True, only index documents listed
            in the original qrels of `datasource`.
        num_workers: Number of parallel processes to use when reading docs.
    """
    if os.path.isfile(save_path):
        raise RuntimeError('%s already exists! Not overwriting.' % save_path)
    print(save_path)
    print(data_path)

    docIds = []  # list of TREC DocIDs
    docIdToDocIndex = {}  # key is DocID, value is docIndex
    docIndex = 0
    workers = ProcessPool(num_workers)
    files = []
    if processOnlyFilesinOriginalQrels:
        topicData = TRECTopics(datasource, start_topic[datasource], end_topic[datasource])
        qrelDocList = topicData.qrelDocIdLister(qrelAddress[datasource], save_path,
                                                topic_original_qrels_doc_list_file_name)
        for docId in qrelDocList:
            fileid = docId + '.txt'
            files.append(os.path.join(data_path, fileid))
        #files = [f for f in iter_files(data_path) if os.path.splitext(os.path.basename(f))[0] in qrelDocList]
        print("Number of unique documents in the qrels", len(files))
    else:
        files = [f for f in iter_files(data_path)]

    dictionary = Dictionary()
    count = 0
    with tqdm(total=len(files)) as pbar:
        for pairs in workers.imap_unordered(get_contents, files):
            count += len(pairs)
            # pairs[0][0] --> docId, pairs[0][1] --> document content
            dictionary.add_documents([pairs[0][1].split()])
            docIdToDocIndex[pairs[0][0]] = docIndex
            docIds.append(pairs[0][0])
            docIndex = docIndex + 1
            pbar.update()
    print("Number of documents:", docIndex, len(docIds), len(docIdToDocIndex))
    total_documents = len(docIds)

    metadata = {}
    metadata['docIdToDocIndex'] = docIdToDocIndex
    metadata['docIndexToDocId'] = docIds
    # protocol 2 for version compatibility
    pickle.dump(metadata, open(save_path + meta_data_file_name[datasource], 'wb'), protocol=2)

    # keep only words that exist within at least 20 articles and
    # keep only the `dictionary_features_number` most frequent tokens
    dictionary.filter_extremes(no_below=20, keep_n=dictionary_features_number)
    dictionary.compactify()
    dictionary.save_as_text(save_path + dictionary_name)
    dictionary = Dictionary.load_from_text(save_path + dictionary_name)

    start_time = time.time()
    corpus_bow_stream = stream_corpus(data_path, dictionary, files)
    MmCorpus.serialize(save_path + corpus_bow_file_name, corpus_bow_stream, progress_cnt=10000)
    corpus_bow = MmCorpus(save_path + corpus_bow_file_name)

    model_tfidf = TfidfModel(corpus_bow, id2word=dictionary, normalize=True)
    model_tfidf.save(save_path + corpus_tfidf_model_file_name)
    corpus_tfidf = model_tfidf[corpus_bow]  # apply model
    MmCorpus.serialize(save_path + corpus_tfidf_file_name, corpus_tfidf, progress_cnt=1000)
    # Load the tf-idf corpus back from disk.
    corpus_tfidf = MmCorpus(save_path + corpus_tfidf_file_name)
    #n_items = len(dictionary)
    #print(corpus_tfidf)

    # CSR matrix construction phase
    indptr = [0]
    indices = []
    data = []
    # processing took 9:26s
    with tqdm(total=total_documents) as pbar:
        for doc in corpus_tfidf:
            for (index, values) in doc:
                indices.append(index)
                data.append(values)
            indptr.append(len(indices))
            pbar.update()

    start = time.time()
    sparse_matrix = sp.csr_matrix((data, indices, indptr), dtype=float)
    # saving took 01:21s
    sp.save_npz(save_path + csr_matrix_file_name[datasource], sparse_matrix)
    print("Finished in:", (time.time() - start))
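# A minimal sketch (not part of the original pipeline) of reading the artifacts
# written by store_contents() back from disk. It assumes the same module-level
# constants (meta_data_file_name, csr_matrix_file_name) used above; the example
# document id is purely illustrative.
import pickle
import scipy.sparse as sp


def load_tfidf_artifacts(save_path, datasource):
    # docId <-> docIndex mapping pickled with protocol=2 in store_contents()
    with open(save_path + meta_data_file_name[datasource], 'rb') as f:
        metadata = pickle.load(f)
    # documents-by-terms TF-IDF matrix saved with sp.save_npz in store_contents()
    tfidf_matrix = sp.load_npz(save_path + csr_matrix_file_name[datasource])
    return metadata['docIdToDocIndex'], tfidf_matrix


# usage: look up the TF-IDF row of one TREC document (illustrative id)
# docIdToDocIndex, tfidf_matrix = load_tfidf_artifacts(save_path, datasource)
# row = tfidf_matrix[docIdToDocIndex['FBIS3-10082']]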
# In[511]:

dictionary = Dictionary(tokens_lst)
print("Dictionary: ", dictionary)

corpus = []
for i in tokens_lst:
    corpus.append(dictionary.doc2bow(i))
print("Corpus:", corpus)

#file_path_corpus = "/Users/manukarreddy/Desktop/kiruba_python/mkbhd"
lda = LdaModel.load("/Users/manukarreddy/Desktop/BigDataProject/mkbhdtfidf_modelfinal_lda")
#mkbhd_file_path_corpus = "/Users/manukarreddy/Desktop/kiruba_python/mkbhd"
mkbhd_corpus = MmCorpus("/Users/manukarreddy/Desktop/BigDataProject/mkbhdmkbhd.mm")


# In[512]:

import random
import numpy


def random_floats(low, high, size):
    return [random.uniform(low, high) for _ in range(size)]


scores_ = random_floats(0.0001, 0.2, 57)
#scores_
from gensim.corpora import Dictionary, MmCorpus
from gensim.models import TfidfModel
from multiprocessing import Array, Process, Pool, Queue, Manager
import multiprocessing as mp
import numpy as np
import gensim
import tqdm
import sys
import io

tf_idf_model = TfidfModel.load('/mnt/disk/wikipedia/wikipedia.tfidf_model')
dct = Dictionary.load_from_text('/mnt/disk/wikipedia/wikipedia_wordids.txt.bz2')
corpus = MmCorpus('/mnt/disk/wikipedia/wikipedia_bow.mm')


def load_vectors(fname):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in tqdm.tqdm(fin):
        tokens = line.rstrip().split(' ')
        # materialize the vector; in Python 3 a bare map() is an iterator and
        # would be exhausted after the first use
        data[tokens[0]] = list(map(float, tokens[1:]))
    return data


w2v_fasttext = load_vectors('wiki-news-300d-1M.vec')
# gammaln(n.sum(self._lambda, 1))) # # return(score) if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s') logging.root.setLevel(level=logging.INFO) logging.info("running %s" % ' '.join(sys.argv)) import os.path program = os.path.basename(sys.argv[0]) # The number of documents to analyze each iteration vocab = WikiCorpus.loadDictionary( '/Users/kofola/gensim/results/wiki10_en_wordids.txt') corpus = MmCorpus('/Users/kofola/gensim/results/wiki10_en_bow.mm') sumcnts = sum(sum(cnt for _, cnt in doc) for doc in corpus) logger.info("running LDA on %i documents, %i total tokens" % (len(corpus), sumcnts)) batchsize = 100000 D = 100000 # total number of docs K = 100 # number of topics iterations = int(sys.argv[1]) # Initialize the algorithm with alpha=1/K, eta=1/K, tau_0=1024, kappa=0.7 olda = OnlineLDA(vocab.values(), K, D, 1. / K, 1. / K, 1., kappa=0.0) # Run until we've seen D documents. (Feel free to interrupt *much* # sooner than this.) for iteration in range(0, iterations): # maybe select only a subset of corpus here (to simulate their "stochastic" approach)
wiki.save(outp + '_corpus.pkl.bz2') dictionary.allow_update = False else: wiki = WikiCorpus( inp, lemmatize=lemmatize ) # takes about 9h on a macbook pro, for 3.5m articles (june 2011) # only keep the most frequent words (out of total ~8.2m unique tokens) wiki.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=DEFAULT_DICT_SIZE) # save dictionary and bag-of-words (term-document frequency matrix) MmCorpus.serialize(outp + '_bow.mm', wiki, progress_cnt=10000) # another ~9h wiki.dictionary.save_as_text(outp + '_wordids.txt.bz2') # load back the id->word mapping directly from file # this seems to save more memory, compared to keeping the wiki.dictionary object from above dictionary = Dictionary.load_from_text(outp + '_wordids.txt.bz2') del wiki # initialize corpus reader and word->id mapping mm = MmCorpus(outp + '_bow.mm') # build tfidf, ~50min tfidf = TfidfModel(mm, id2word=dictionary, normalize=True) # save tfidf vectors in matrix market format # ~4h; result file is 15GB! bzip2'ed down to 4.5GB MmCorpus.serialize(outp + '_tfidf.mm', tfidf[mm], progress_cnt=10000) logger.info("finished running %s" % program)
words in both sentences.)
'''
# Please modify all the paths to your resources
print('Modify the paths of your corpus in the config.ini file')
input()
config = ConfigParser()
config.read('config.ini')
# TODO: Generalize this step, using corpus_path as your actual corpus.
# The config file must allow Wikipedia, Gutenberg, ...
corpus_path = config['WIKI']['en'][1:-1]
dictionary = Dictionary.load_from_text(os.path.relpath(corpus_path + '_wordids.txt.bz2'))
bow_corpus = MmCorpus(os.path.relpath(corpus_path + '_bow.mm'))
try:
    tfidf = TfidfModel.load(corpus_path + 'wiki-tfidf.model')
except FileNotFoundError:
    tfidf = TfidfModel(bow_corpus, id2word=dictionary)
    tfidf.save(corpus_path + 'wiki-tfidf.model')

# testing sentences
sentence1 = 'pilar pescado en la tarde es fatal'
sentence2 = 'machacar pescado al atardecer es terrible'

# Transforming sentences
sent1 = sentence1.split()
background_corpus.dictionary.save( "my_dict.dict") MmCorpus.serialize("background_corpus.mm", background_corpus) from gensim.corpora import WikiCorpus, wikicorpus articles = "enwiki-latest-pages-articles.xml.bz2" wiki_corpus = WikiCorpus(articles) wiki_corpus.dictionary.save("wiki_dict.dict") MmCorpus.serialize("wiki_corpus.mm", wiki_corpus) bow_corpus = MmCorpus("wiki_corpus.mm") dictionary = Dictionary.load("wiki_dict.dict") from gensim.models import LsiModel, LogEntropyModel logent_transformation = LogEntropyModel(wiki_corpus, id2word=dictionary) tokenize_func = wikicorpus.tokenize document = "Some text to be transformed." bow_document = dictionary.doc2bow(tokenize_func( document)) logent_document = logent_transformation[[ bow_document]]
# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
# %matplotlib inline

from gensim.corpora import Dictionary, MmCorpus

trigram_dictionary = Dictionary.load('./models2/trigram_dict_all.dict')
trigram_bow_corpus = MmCorpus('./models2/trigram_bow_corpus.mm')

# Document-term matrix
import numpy as np
from scipy.sparse import csr_matrix

rows = []
cols = []
data = []
Nrow = 1000000  # len(trigram_bow_corpus)
Ncol = len(trigram_dictionary)
for i in range(0, Nrow):
    line = trigram_bow_corpus[i]
    for indx, freq in line:
        rows.append(i)
        cols.append(indx)
        data.append(freq)

dtm = csr_matrix((data, (rows, cols)), shape=(Nrow, Ncol), dtype=int)
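# Alternative sketch: gensim's corpus2csc helper builds the same document-term
# matrix without the manual loop above. It returns a (num_terms x num_docs) CSC
# matrix over the whole corpus (float weights, every document rather than the
# first Nrow), so it is transposed to the documents-by-terms layout used above.
from gensim.matutils import corpus2csc

dtm_alt = corpus2csc(trigram_bow_corpus, num_terms=len(trigram_dictionary)).T.tocsr()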
print(" ============ RELATED TITLES =======================") related_articles = get_related_articles(doc, 10) for article in related_articles: print(article) argparser = argparse.ArgumentParser() argparser.add_argument('--fileName') args = argparser.parse_args() titles, texts, documents, urls = load_stuff(args.fileName) dictionary = corpora.Dictionary.load( basename(args.fileName) + '.dict') # store the dictionary, for future reference corpus = MmCorpus(basename(args.fileName) + '.mm') lsi = models.LsiModel.load(basename(args.fileName) + '.lsi') index = similarities.MatrixSimilarity.load( basename(args.fileName) + '.index') # transform corpus to LSI space and index it do_print_related( "TO FIX ITS TOXIC AD PROBLEM, FACEBOOK MUST BREAK ITSELF\nIT IS A sure sign that Facebook’s algorithms have run amok when they allow anyone to target ads to people with an expressed interest in burning Jews. Likewise, when Russians can sow chaos in American elections by purchasing thousands of phony Facebook ads without Facebook realizing it, the automated systems selling those ads may need some oversight." ) do_print_related( "ABOUT A WEEK ago, Stanford University researchers (posted online)[https://osf.io/zn79k/] a study on the latest dystopian AI: They'd made a machine learning algorithm that essentially works as gaydar. After training the algorithm with tens of thousands of photographs from a dating site, the algorithm could, for example, guess if a white man in a photograph was gay with 81 percent accuracy. The researchers’ motives? They wanted to protect gay people. " ) do_print_related(
#!/usr/bin/env python from gensim.models import LdaModel from gensim.corpora import MmCorpus, Dictionary import sys, os import pyLDAvis.gensim if len(sys.argv) < 2: print("usage: {0} [path to model.lda]\n".format(sys.argv[0])) sys.exit(1) path, file = os.path.split(sys.argv[1]) corpusname = file.split(".")[0] dictionary = Dictionary.load(path + "/" + corpusname + ".dict") corpus = MmCorpus(path + "/" + corpusname + ".mm") model = LdaModel.load(sys.argv[1]) ############## # cf. https://pyldavis.readthedocs.org/en/latest/modules/API.html vis = pyLDAvis.gensim.prepare(model, corpus, dictionary) pyLDAvis.save_html(vis, path + "/" + corpusname + "_interactive.html") pyLDAvis.show(vis)
) # Uses numpy to persist wiki corpus in Matrix Market format. File will be several GBs. ### Generating a large training/background corpus using Wikipedia from gensim.corpora import WikiCorpus, wikicorpus articles = "enwiki-latest-pages-articles.xml.bz2" # available from http://en.wikipedia.org/wiki/Wikipedia:Database_download wiki_corpus = WikiCorpus( articles ) # This will take many hours! Output is Wikipedia in bucket-of-words (BOW) sparse matrix. wiki_corpus.dictionary.save("wiki_dict.dict") MmCorpus.serialize("wiki_corpus.mm", wiki_corpus) # File will be several GBs. ### Working with persisted corpus and dictionary bow_corpus = MmCorpus("wiki_corpus.mm") # Revive a corpus dictionary = Dictionary.load("wiki_dict.dict") # Load a dictionary ### Transformations among vector spaces from gensim.models import LsiModel, LogEntropyModel logent_transformation = LogEntropyModel( wiki_corpus, id2word=dictionary ) # Log Entropy weights frequencies of all document features in the corpus tokenize_func = wikicorpus.tokenize # The tokenizer used to create the Wikipedia corpus document = "Some text to be transformed." bow_document = dictionary.doc2bow( tokenize_func(document) ) # First, tokenize document using the same tokenization as was used on the background corpus, and then convert it to BOW representation using the dictionary created when generating the background corpus.
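# A short sketch of the follow-up steps implied above (LsiModel is imported but
# not used yet): weight the background corpus with the log-entropy model, fit
# LSI on top of it, and fold the new document into the LSI space. The number of
# topics here is illustrative.
logent_corpus = logent_transformation[bow_corpus]      # stream of log-entropy weighted vectors
lsi_transformation = LsiModel(logent_corpus, id2word=dictionary, num_topics=400)

logent_document = logent_transformation[bow_document]  # weight the new document
lsi_document = lsi_transformation[logent_document]     # project it into the LSI space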
""" trigram_reviews_filepath = os.path.join( 'results', 'trigram_transformed_reviews_all.txt') trigram_dictionary_filepath = os.path.join('trigram_dict_all.dict') learn_vocab_corpus(trigram_reviews_filepath, trigram_dictionary_filepath) # load the finished dictionary from disk trigram_dictionary = Dictionary.load(trigram_dictionary_filepath) trigram_bow_filepath = os.path.join('trigram_bow_corpus_all.mm') create_bow(trigram_reviews_filepath, trigram_bow_filepath, trigram_dictionary) # load the finished bag-of-words corpus from disk trigram_bow_corpus = MmCorpus(trigram_bow_filepath) """ find topics """ lda_model_filepath = os.path.join('lda_model_all') create_topics(lda_model_filepath, trigram_bow_corpus, trigram_dictionary) # load the finished LDA model from disk lda = LdaMulticore.load(lda_model_filepath) explore_topic(lda, topic_number=0)
cleared_docs = [[token for token in document if token in cleared_tokens] for document in cleared_docs] ## Save dictionary in serialized form dictionary = Dictionary(cleared_docs) dictionary.save('./dictionaries/python_tags.dict') corpus = [dictionary.doc2bow(document) for document in cleared_docs] MmCorpus.serialize('./dictionaries/python_tags.mm', corpus) ######################################## ## Load Data ######################################## if (os.path.exists("./dictionaries/python_tags.dict")): dictionary = Dictionary.load('./dictionaries/python_tags.dict') corpus = MmCorpus('./dictionaries/python_tags.mm') print("Used dictionary generated") else: print("Please run the preprocessing to generate a dictionary file") ######################################## ## Create Model ######################################## print(corpus) tfidf = TfidfModel(corpus) corpus_tfidf = tfidf[corpus] ######################################## ## Applying LSI ######################################## lsi = LsiModel(corpus_tfidf,
def LDA_Analysis(): #http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb if 0 == 1: with open('data/review_text_all.txt','w') as myfile: myfile.write("") ''' loop through db and write jobs descriptions ''' with open('data/review_text_all.txt','a') as myfile: with Job() as db: a=0 max_ = int(db.getNoJobs()[0][0]) while (a < max_): #print(a) sample_review = db.readJobDetailClean(a)[0][1] if (sample_review != 'Json Error'): myfile.write(str(sample_review)+'\n') a += 1 #unigram_sentences_filepath = os.path.join(intermediate_directory, 'unigram_sentences_all.txt') if 0 == 1: with codecs.open('data/unigram_sentences_all.txt', 'w', encoding='utf_8') as f: for sentence in lemmatized_sentence_corpus('data/review_text_all.txt'): f.write(sentence + '\n') unigram_sentences = LineSentence('data/unigram_sentences_all.txt') ''' for unigram_sentence in it.islice(unigram_sentences, 230, 240): print(u' '.join(unigram_sentence)) print(u'') ''' #bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all') if 0 == 1: bigram_model = Phrases('data/unigram_sentences_all.txt') bigram_model.save('data/bigram_model_all') # load the finished model from disk bigram_model = Phrases.load('data/bigram_model_all') #bigram_sentences_filepath = os.path.join(intermediate_directory, 'bigram_sentences_all.txt') if 0 == 1: with codecs.open('data/bigram_sentences_all.txt', 'w', encoding='utf_8') as f: for unigram_sentence in unigram_sentences: bigram_sentence = u' '.join(bigram_model[unigram_sentence]) f.write(bigram_sentence + '\n') bigram_sentences = LineSentence('data/bigram_sentences_all.txt') ''' for bigram_sentence in it.islice(bigram_sentences, 230, 240): print(u' '.join(bigram_sentence)) print(u'') ''' #trigram_model_filepath = os.path.join(intermediate_directory, 'trigram_model_all') if 0 == 1: trigram_model = Phrases(bigram_sentences) trigram_model.save('data/trigram_model_all') # load the finished model from disk trigram_model = Phrases.load('data/trigram_model_all') #trigram_sentences_filepath = os.path.join(intermediate_directory, 'trigram_sentences_all.txt') if 0 == 1: with codecs.open('data/trigram_sentences_all.txt', 'w', encoding='utf_8') as f: for bigram_sentence in bigram_sentences: trigram_sentence = u' '.join(trigram_model[bigram_sentence]) f.write(trigram_sentence + '\n') trigram_sentences = LineSentence('data/trigram_sentences_all.txt') ''' for trigram_sentence in it.islice(trigram_sentences, 230, 240): print(u' '.join(trigram_sentence)) print(u'') ''' #trigram_reviews_filepath = os.path.join(intermediate_directory, 'trigram_transformed_reviews_all.txt') if 0 == 1: import csv ''' Variant A: Use Stopwords 1) download StopWords.csv from MySQL table: KeyWords. 
2) Remove all relevant words by hand ;) ''' with open('data/StopWords.csv', newline='') as csvfile: stopwords_ = csv.reader(csvfile, delimiter=' ', quotechar='|') for words_ in stopwords_: #print(words_[0]) STOP_WORDS.add(words_[0]) #print(STOP_WORDS) ''' Varaint B: Use Dictionary ''' with open('data/Dictionary.csv', 'r', newline='') as csvfile: file_ = csv.reader(csvfile, delimiter=',', quotechar='"') dictionary_ = [] for row in file_: dictionary_.append(row[0]) #with open('file.csv', 'r') as f: #reader = csv.reader(f) #your_list = list(reader) with codecs.open('data/trigram_transformed_reviews_all.txt', 'w', encoding='utf_8') as f: for parsed_review in nlp.pipe(line_review('data/review_text_all.txt'), batch_size=10000, n_threads=4): # lemmatize the text, removing punctuation and whitespace unigram_review = [token.lemma_ for token in parsed_review if not punct_space(token)] # apply the first-order and second-order phrase models bigram_review = bigram_model[unigram_review] trigram_review = trigram_model[bigram_review] # remove any remaining stopwords ''' Variant A: ''' #trigram_review = [term for term in trigram_review # if term not in STOP_WORDS]#spacy.en.STOPWORDS] !!!!! CHECK THIS !!!!! module 'spacy' has no attribute 'en' ''' Variant B: ''' trigram_review = [term for term in trigram_review if term in dictionary_]# # write the transformed review as a line in the new file trigram_review = u' '.join(trigram_review) f.write(trigram_review + '\n') ''' print(u'Original:' + u'\n') for review in it.islice(line_review('review_text_all.txt'), 11, 12): print(review) print(u'----' + u'\n') print(u'Transformed:' + u'\n') with codecs.open('trigram_transformed_reviews_all.txt', encoding='utf_8') as f: for review in it.islice(f, 11, 12): print(review) ''' #trigram_dictionary_filepath = os.path.join(intermediate_directory, 'trigram_dict_all.dict') if 0 == 1: trigram_reviews = LineSentence('data/trigram_transformed_reviews_all.txt') # learn the dictionary by iterating over all of the reviews trigram_dictionary = Dictionary(trigram_reviews) # filter tokens that are very rare or too common from # the dictionary (filter_extremes) and reassign integer ids (compactify) trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)#,keep_n=100000)#,) trigram_dictionary.compactify() trigram_dictionary.save('data/trigram_dict_all.dict') # load the finished dictionary from disk trigram_dictionary = Dictionary.load('data/trigram_dict_all.dict') #trigram_bow_filepath = os.path.join(intermediate_directory, 'trigram_bow_corpus_all.mm') if 0 == 1: # generate bag-of-words representations for # all reviews and save them as a matrix MmCorpus.serialize('data/trigram_bow_corpus_all.mm', trigram_bow_generator(trigram_dictionary,'data/trigram_transformed_reviews_all.txt')) # load the finished bag-of-words corpus from disk trigram_bow_corpus = MmCorpus('data/trigram_bow_corpus_all.mm') #lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all') if 0 == 1: with warnings.catch_warnings(): warnings.simplefilter('ignore') # workers => sets the parallelism, and should be # set to your number of physical cores minus one lda = LdaMulticore(trigram_bow_corpus, num_topics=15, id2word=trigram_dictionary, workers=1) lda.save('data/lda_model_all') # load the finished LDA model from disk lda = LdaMulticore.load('data/lda_model_all') #explore_topic(lda, topic_number=1) topic_names = {0:u'Risk Management Bank', 1:u'Big Data Report', 2:u'Automotive SAP', 3:u'Microsoft Java Scrum', 4:u'Medical Consultant', 5:u'Java Engineer', 
6:u'Computer Vision Developer', 7:u'Data Analyst', 8:u'BI SAP BW', 9:u'IOT Reporting R', 10:u'Global Project Presentation', 11:u'Cloud Engineer IOT', 12:u'Industry 4.0', 13:u'Risk Consulting', 14:u'Machine Learning Data Science'} #topic_names_filepath = os.path.join(intermediate_directory, 'topic_names.pkl') with open('data/topic_names.pkl', 'wb') as f: pickle.dump(topic_names, f) #load sameple_review from database #sample_review = get_sample_review(10) #lda_description(bigram_model, trigram_model, trigram_dictionary, lda, topic_names, sample_review) #LDAvis_data_filepath = os.path.join(intermediate_directory, 'ldavis_prepared') if 0 == 1: #term_ix = np.sort(topic_info.index.unique().values) LDAvis_prepared = pyLDAvis.gensim_.prepare(lda, trigram_bow_corpus, trigram_dictionary) with open('data/ldavis_prepared', 'wb') as f: pickle.dump(LDAvis_prepared, f) ''' export LDA file ''' # load the pre-prepared pyLDAvis data from disk with open('data/ldavis_prepared', 'rb') as f: LDAvis_prepared = pickle.load(f) with open('data/DSJobs_LDA.html', 'w') as f: pyLDAvis.save_html(LDAvis_prepared, f)
help="specify LDA model.") args.add_argument("-s", "--save_to_file", type=str, help="speficy file which the HTML will be saved to.") args.add_argument("-t", "--use_tfidf", action="store_true", help="use TF-IDF corpus.") args.add_argument( "--method", type=str, default="pcoa", help="specify a method for MDS by one from 'pcoa', 'mmds', or 'tsne'.") return args.parse_args() if __name__ == "__main__": args = parse_arg() model = LdaModel.load(args.model[0]) corpus = MmCorpus(args.corpus[0]) if args.use_tfidf: tfidf = TfidfModel(corpus) corpus = tfidf[corpus] dictionary = Dictionary.load_from_text(args.dictionary[0]) vis = pyLDAvis.gensim.prepare(model, corpus, dictionary, mds=args.method) if args.save_to_file is not None: pyLDAvis.save_html(vis, args.save_to_file) else: pyLDAvis.show(vis)
# approximately 8 hours on an 8GB machine with a dual core processor
wiki_corpus = corpora.wikicorpus.WikiCorpus(wiki_file)
print('Finished making the wikicorpus, saving BOW corpus\n')
corpora.mmcorpus.MmCorpus.serialize('../data/wiki_en_vocab200k', wiki_corpus)
print('Done saving BOW Corpus\n')

# Save the dictionary, you will need it to convert future documents into
# BOW format
#wiki.dictionary.save("../data/wiki_dict.dict")
#print('Saved dictionary')

print('Creating LogEntropy TF-IDF and regular TF-IDF matrices and models')
BOW_corpus = MmCorpus('../data/wiki_en_vocab200k')  # Resurrect BOW corpus

#log_entropy = LogEntropyModel(BOW_corpus)
#log_entropy.save('../models/logEntropy.model')  # already provided
log_entropy = LogEntropyModel.load('../models/logEntropy.model')
corpora.mmcorpus.MmCorpus.serialize('../data/log_entropy_matrix',
                                    log_entropy[BOW_corpus])
print('Saved LogEntropy TF-IDF matrix')

#tfidf = TfidfModel(BOW_corpus)
#tfidf.save('../models/tfidf.model')  # already provided
tfidf = TfidfModel.load('../models/tfidf.model')
# write the regular TF-IDF matrix to its own file so the LogEntropy matrix
# saved above is not overwritten
corpora.mmcorpus.MmCorpus.serialize('../data/tfidf_matrix', tfidf[BOW_corpus])
def SAVE_CORPUS_MM_FORMAT():
    ### save corpus into Matrix Market (mm) format
    MmCorpus.serialize('bow_corpus.mm', bow_corpus)
    mm = MmCorpus('bow_corpus.mm')
    print(mm[1])  # retrieve doc 1
if __name__ == '__main__': # training_file_path = 'E:/2017_Deep_learning/text similarity' training_file_path = './keywords/sentidata' # Lsi model dictionary = Dictionary() corpus = sohu_corpus(fname=os.path.join(training_file_path, 'neg_1.txt'), dic=dictionary) # save dictionary # dictionary.save(os.path.join(training_file_path, '07_11_dictionary.dict')) MmCorpus.serialize(os.path.join(training_file_path, '07_11_corpus_12.mm'), corpus) # dictionary = Dictionary.load(os.path.join(training_file_path, '07_11_dictionary.dict')) corpus_tfidf_mm = MmCorpus( os.path.join(training_file_path, '07_11_corpus_12.mm')) training_src_data = sogou_corpus_file( os.path.join(training_file_path, 'neg_1.txt')) training_src = [] for each_file in training_src_data: training_src.append(each_file) # convert counts to tfidf tfidf = TfidfModel(corpus=corpus_tfidf_mm) index = MatrixSimilarity(tfidf[corpus_tfidf_mm]) sims = index[tfidf[dictionary.doc2bow(['阳台', '打死'])]] print('doc2bow:') print(dictionary.doc2bow(['阳台']))
# import argparse from gensim.corpora import Dictionary, MmCorpus from gensim.models.nmf import Nmf from gensim.models import TfidfModel from codebase.utils import TweetRawCorpusStream from codebase.topic_utilities import export_dtm if __name__ == "__main__": corpora_path = "./corpora/" model_path = "./models/" num_topics = 50 model_suffix = "-{}topics".format(num_topics) modelTag = "Seventh-and-EighthWeek-Tweets-Rolling" nmf = Nmf.load("{}{}{}.model".format(model_path, modelTag, model_suffix)) fileTag_list = ["Fifth-and-SixthWeek-Tweets-Rolling"] for fileTag in fileTag_list: tfidf_corpus = MmCorpus('{}{}-tf-idf.mm'.format(corpora_path, fileTag)) export_dtm(nmf=nmf, corpus=tfidf_corpus,\ out_path="{}{}{}-dtm.csv".format(model_path, fileTag, model_suffix),\ stop_at=None)
from gensim.models.ldamodel import LdaModel logger = logging.getLogger(__name__) if __name__ == '__main__': ''' What Visualize LDA topic model using pyLDAvis Documentation https://pyldavis.readthedocs.io/en/latest/ Source https://github.com/bmabey/pyLDAvis Article https://nlp.stanford.edu/events/illvi2014/papers/sievert-illvi2014.pdf ''' data_folder = '/tmp/Data' models_names = [ ] # store.get_model_names(data_folder) logger.info(models_names) OPTS = { 'R': 100, 'mds': 'tsne', 'sort_topics': False, 'plot_opts': { 'xlab': 'PC1', 'ylab': 'PC2' } } for basename in models_names: target_folder = os.path.join(data_folder, basename) corpus_filename = os.path.join(target_folder, 'corpus.mm') dictionary_filename = os.path.join(target_folder, 'corpus.dict.gz') model_filename = os.path.join(target_folder, 'gensim_model_{}.gensim.gz'.format(basename)) lda = LdaModel.load(model_filename) dictionary = Dictionary.load(dictionary_filename) corpus = MmCorpus(corpus_filename) convert_to_pyLDAvis(data_folder, basename, **OPTS)
input, output = sys.argv[1:3] if len(sys.argv) > 3: keep_words = int(sys.argv[3]) else: keep_words = DEFAULT_DICT_SIZE # build dictionary. only keep 200k most frequent words (out of total ~7m unique tokens) # takes about 8h on a macbook pro wiki = WikiCorpus(input, keep_words=keep_words) # save dictionary and bag-of-words # another ~8h wiki.saveAsText(output) del wiki # initialize corpus reader and word->id mapping from gensim.corpora import MmCorpus id2token = WikiCorpus.loadDictionary(output + '_wordids.txt') mm = MmCorpus(output + '_bow.mm') # build tfidf # ~20min from gensim.models import TfidfModel tfidf = TfidfModel(mm, id2word=id2token, normalize=True) # save tfidf vectors in matrix market format # ~1.5h; result file is 14GB! bzip2'ed down to 4.5GB MmCorpus.saveCorpus(output + '_tfidf.mm', tfidf[mm], progressCnt=10000) logging.info("finished running %s" % program)
def loadCorpus(corpusPath):
    return MmCorpus(corpusPath)
def main():
    parser = argparse.ArgumentParser(
        description='maps a given document-author-contribution file to a weighted bipartite network of document and author nodes')
    parser.add_argument('--contribs',
                        type=argparse.FileType('r'),
                        help='path to input contribution MatrixMarket file (.mm/.mm.bz2)',
                        required=True)
    parser.add_argument('--bipart-graph',
                        type=argparse.FileType('w'),
                        help='path to output graph (.graph/.graph.bz2) file',
                        required=True)
    parser.add_argument('--top-n-contribs',
                        type=int,
                        help='keep at most N highest contribs per author',
                        required=True)

    args = parser.parse_args()
    input_contribs_path = args.contribs.name
    output_bipart_graph_path = args.bipart_graph.name
    top_n_contribs = args.top_n_contribs
    logger.info('running with:\n{}'.format(
        pformat({
            'input_contribs_path': input_contribs_path,
            'output_bipart_graph_path': output_bipart_graph_path,
            'top_n_contribs': top_n_contribs
        })))

    # load the stored contributions
    contribs = MmCorpus(input_contribs_path)
    num_docs = contribs.num_docs
    num_authors = contribs.num_terms
    logger.info('processing contributions of {} documents, {} authors'.format(
        num_docs, num_authors))

    # build the bipartite affiliation network: documents & authors are nodes,
    # document-author contributions yield the corresponding weighted edges
    bipart_graph = nx.Graph()
    doc_nodes = tuple('d' + str(n) for n in range(0, num_docs))
    bipart_graph.add_nodes_from(doc_nodes, bipartite=0)
    auth_nodes = tuple('a' + str(n) for n in range(0, num_authors))
    bipart_graph.add_nodes_from(auth_nodes, bipartite=1)
    bipart_graph.add_weighted_edges_from(get_edges_from_contribs(contribs),
                                         weight='weight')
    log_nwx(bipart_graph)
    logger.info('bipartite? {}'.format(bipartite.is_bipartite(bipart_graph)))
    simplify_graph_nwx(bipart_graph)
    logger.info(
        'actual numbers after simplifying: {} docs, {} authors, {} edges'.
        format(*get_bipartite_node_counts(bipart_graph), len(bipart_graph.edges)))

    # report the highest node degree among the authors
    max_degree_author = max(bipart_graph.degree(auth_nodes),
                            key=lambda node_deg: node_deg[1])
    logger.info('author {} having max degree of {}'.format(*max_degree_author))

    # refresh the variables
    doc_nodes, auth_nodes = get_bipartite_nodes(bipart_graph)

    # prune each author's incident edges down to the K edges with the largest weights
    logger.info('pruning to top {} edges per author'.format(top_n_contribs))
    for auth_node in auth_nodes:
        logger.debug('author {}'.format(auth_node))
        auth_edges = bipart_graph[auth_node]
        auth_edges = tuple((neighbor, weight['weight'])
                           for neighbor, weight in auth_edges.items())
        logger.debug('incident edges \n{}'.format(pformat(auth_edges)))
        num_remove = len(auth_edges) - top_n_contribs
        author_min_edges = nsmallest(num_remove, auth_edges, key=lambda edge: edge[1])
        logger.debug('removing edges \n{}'.format(pformat(author_min_edges)))
        bipart_graph.remove_edges_from(
            (auth_node, neighbor) for neighbor, weight in author_min_edges)

    # keep_max_edges = 10000
    # logger.info('pruning to {} highest edges'.format(keep_max_edges))
    # num_edges_to_remove = len(bipart_graph.edges) - keep_max_edges
    # min_edges = nsmallest(num_edges_to_remove, bipart_graph.edges(data='weight'), key=lambda edge: edge[2])
    # bipart_graph.remove_edges_from(min_edges)
    # log_nwx(bipart_graph)

    # report the highest node degree among the authors
    max_degree_author = max(bipart_graph.degree(auth_nodes),
                            key=lambda node_deg: node_deg[1])
    logger.info('author {} having max degree of {}'.format(*max_degree_author))

    # remove isolated nodes
    simplify_graph_nwx(bipart_graph)
    log_nwx(bipart_graph)
    logger.info('new number of documents {}, authors {}'.format(
        *get_bipartite_node_counts(bipart_graph)))

    # save the affiliation network
    logger.info('writing graph to {}'.format(output_bipart_graph_path))
    nx.write_gpickle(bipart_graph, output_bipart_graph_path)
def lda(): """ LDA model https://radimrehurek.com/gensim/models/ldamodel.html num_topics is the number of requested latent topics to be extracted from the training corpus. id2word is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size, as well as for debugging and topic printing. alpha and eta are hyperparameters that affect sparsity of the document-topic (theta) and topic-word (lambda) distributions. Both default to a symmetric 1.0/num_topics prior. alpha can be set to an explicit array = prior of your choice. It also support special values of ‘asymmetric’ and ‘auto’: the former uses a fixed normalized asymmetric 1.0/topicno prior, the latter learns an asymmetric prior directly from your data. eta can be a scalar for a symmetric prior over topic/word distributions, or a vector of shape num_words, which can be used to impose (user defined) asymmetric priors over the word distribution. It also supports the special value ‘auto’, which learns an asymmetric prior over words directly from your data. eta can also be a matrix of shape num_topics x num_words, which can be used to impose asymmetric priors over the word distribution on a per-topic basis (can not be learned from data). Calculate and log perplexity estimate from the latest mini-batch every eval_every model updates (setting this to 1 slows down training ~2x; default is 10 for better performance). Set to None to disable perplexity estimation. decay and offset parameters are the same as Kappa and Tau_0 in Hoffman et al, respectively. minimum_probability controls filtering the topics returned for a document (bow). random_state can be a np.random.RandomState object or the seed for one callbacks a list of metric callbacks to log/visualize evaluation metrics of topic model during training The model can be updated (trained) with new documents via >>> lda.update(other_corpus) You can then infer topic distributions on new, unseen documents, with >>> doc_lda = lda[doc_bow] """ # load word-id dictionary id2word = Dictionary.load('foobar.txtdic') # load matrix market format bow vectors # mm = MmCorpus('bow.mm') # load Tfidf Model in matrix market format mm = MmCorpus('tfidf_JD.mm') # train LDA model lda = LdaModel( corpus=mm, id2word=id2word, num_topics=21, distributed=False, chunksize=2000, passes=3, update_every=1, alpha='symmetric', decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001, minimum_probability=0.01, random_state=None, ns_conf=None, minimum_phi_value=0.01, per_word_topics=False, callbacks=None) # save LDA model lda.save('lda.model')
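# A minimal sketch of the inference step described in the docstring above:
# reload the saved dictionary and model, then query the topic distribution of
# a new, unseen document (the example text is illustrative).
from gensim.corpora import Dictionary
from gensim.models import LdaModel

id2word = Dictionary.load('foobar.txtdic')
lda = LdaModel.load('lda.model')
doc_bow = id2word.doc2bow("machine learning engineer with python experience".split())
doc_lda = lda[doc_bow]  # list of (topic_id, probability) pairs
print(lda.get_document_topics(doc_bow, minimum_probability=0.05))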
        resultDir=gensim_build.RESULT_DIR,
        acceptLangs=[language])

    try:
        dml = DmlCorpus.load(config.resultFile('.pkl'))
    except IOError as e:
        raise IOError(
            "no word-count corpus found at %s; you must first generate it through gensim_build.py"
            % config.resultFile('.pkl'))
    logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt'))
    id2word = DmlCorpus.loadDictionary(config.resultFile('wordids.txt'))
    logging.info("loaded %i word ids" % len(id2word))

    if method == 'tfidf':
        corpus = MmCorpus(config.resultFile('bow.mm'))
        model = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
        model.save(config.resultFile('tfidfmodel.pkl'))
    elif method == 'lda':
        corpus = MmCorpus(config.resultFile('bow.mm'))
        model = ldamodel.LdaModel(corpus, id2word=id2word, numTopics=DIM_LDA)
        model.save(config.resultFile('ldamodel%i.pkl' % DIM_LDA))
    elif method == 'lsi' or method == 'lsa':
        # first, transform word counts to tf-idf weights
        corpus = MmCorpus(config.resultFile('bow.mm'))
        tfidf = tfidfmodel.TfidfModel(corpus, id2word=id2word, normalize=True)
        # then find the transformation from tf-idf to latent space
        model = lsimodel.LsiModel(tfidf.apply(corpus), id2word=id2word, numTopics=DIM_LSI)
        model.save(config.resultFile('lsimodel%i.pkl' % DIM_LSI))
import warnings import pickle import time from gensim.corpora import Dictionary, MmCorpus from gensim.models.ldamulticore import LdaMulticore from gensim.models import Phrases from gensim.models.word2vec import LineSentence trigram_dict_file = 'trigram_dict.dict' trigram_dictionary = Dictionary.load(trigram_dict_file) trigram_threads_bow_file = 'trigram_threads_bow_corpus.mm' trigram_users_bow_file = 'trigram_users_bow_corpus.mm' trigram_threads_bow_corpus = MmCorpus(trigram_threads_bow_file) trigram_users_bow_corpus = MmCorpus(trigram_users_bow_file) lda_threads_model_file = "lda_threads_model" lda_users_model_file = "lda_users_model" lda_threads = LdaMulticore.load(lda_threads_model_file) lda_users = LdaMulticore.load(lda_users_model_file) LDAvis_threads_file = 'ldavis_threads_prep' LDAvis_users_file = 'ldavis_users_prep' t0 = time.time() LDAvis_threads_prep = pyLDAvis.gensim.prepare(lda_threads, trigram_threads_bow_corpus, trigram_dictionary) t1 = time.time()
language = sys.argv[1] method = sys.argv[2].strip().lower() logging.info("loading corpus mappings") config = dmlcorpus.DmlConfig('gensim_%s' % language, resultDir=gensim_build.RESULT_DIR, acceptLangs=[language]) logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt')) id2word = dmlcorpus.DmlCorpus.loadDictionary( config.resultFile('wordids.txt')) logging.info("loaded %i word ids" % len(id2word)) corpus = dmlcorpus.DmlCorpus.load(config.resultFile('.pkl')) input = MmCorpus(config.resultFile('corpus_%s.mm' % method)) assert len(input) == len( corpus ), "corpus size mismatch (%i vs %i): run ./gensim_genmodel again" % ( len(input), len(corpus)) # initialize structure for similarity queries if method == 'lsi' or method == 'rp': # for these methods, use dense vectors index = MatrixSimilarity(input, numBest=MAX_SIMILAR + 1, numFeatures=input.numTerms) else: index = SparseMatrixSimilarity(input, numBest=MAX_SIMILAR + 1) index.normalize = False # do not normalize query vectors during similarity queries (the index is already built normalized, so it would be a no-op) generateSimilar(
logging.info("loading corpus mappings") try: dml = DmlCorpus.load(config.resultFile('.pkl')) except IOError, e: raise IOError( "no word-count corpus found at %s; you must first generate it through gensim_build.py" ) config = dml.config logging.info("loading word id mapping from %s" % config.resultFile('wordids.txt')) id2word = DmlCorpus.loadDictionary(config.resultFile('wordids.txt')) logging.info("loaded %i word ids" % len(id2word)) input = MmCorpus(bow.mm) if method == 'tfidf': model = tfidfmodel.TfidfModel.load(modelfname('tfidf')) elif method == 'lsi': tfidf = tfidfmodel.TfidfModel.load(modelfname('tfidf')) input = tfidf[input] # transform to tfidf model = lsimodel.LsiModel.load(modelfname('lsi')) elif method == 'lda': model = ldamodel.LdaModel.load(modelfname('lda')) else: raise ValueError('unknown method: %s' % repr(method)) topics = model[ input] # documents from 'input' will be represented via 'model' sims = SparseMatrixSimilarity(
def __init__(self, filename):
    self.corpus = MmCorpus(filename)
    self.metadata = unpickle(filename + ".metadata.cpickle")
============================================================================= bi-grammed tokenized article: {} """.format(docs[1], docs_tokens[1], docs_phrased[1])) # %% get corpus & dictionary to use for further nlp analysis # get dictionary and write it to a file ws_dictionary = Dictionary(docs_tokens) ws_dictionary.save('.data/ws_dictionary.dict') # get corpus and write it to a file ws_corpus = [ws_dictionary.doc2bow(doc) for doc in docs_tokens] out_f = ('.data/ws_corpus.mm') MmCorpus.serialize(out_f, ws_corpus) mm = MmCorpus(out_f) # `mm` document stream now has random access # send tokenized test to MongoDB # --+ open monog pipeline # ----+ params mongo_host = "10.16.142.91" mongo_db = "digitalTechs" mongo_user = "******" mongo_pass = "******" # ----+ server server = SSHTunnelForwarder(mongo_host, ssh_username=mongo_user, ssh_password=mongo_pass, remote_bind_address=('127.0.0.1', 27017)) # ----+ start server server.start()