import os
import pickle
import shutil

from gensim import corpora
from gensim.models import LsiModel


def createLsiModelforCorpus(corpusfile, dictfile, numtop):
    print("\nLoading dictionary...")
    dictionary = corpora.Dictionary.load_from_text(dictfile)
    print(dictionary)

    print("\nLoading corpus...")
    corpus = corpora.MmCorpus(corpusfile)
    print(corpus)

    print("\nPerforming Latent Semantic Indexing...")
    lsi = LsiModel(corpus=corpus, num_topics=numtop, id2word=dictionary, distributed=False)
    # Stochastic (aka truncated) SVD is an alternative, but it threw runtime
    # memory errors (e.g. segmentation faults) here:
    # lsi = stochastic_svd(corpus, rank=100, num_terms=args.ntopics)

    corpustopics = lsi.show_topics(num_words=10, log=True, formatted=False)

    # Write the model and its topics into a fresh lsi_output folder.
    rootdir = os.getcwd()
    folderpath = os.path.join(rootdir, 'lsi_output')
    if os.path.exists(folderpath):
        shutil.rmtree(folderpath)
    os.makedirs(folderpath)
    os.chdir(folderpath)

    basename = corpusfile.replace('.mm', '')
    lsi.save(basename + '_lsi.model')
    with open(basename + '_lsi_topics.pkl', 'wb') as output:
        pickle.dump(corpustopics, output)
    os.chdir(rootdir)

    return corpustopics, lsi
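# Hedged usage sketch for the helper above. The filenames and topic count are
# placeholders, assuming a dictionary saved with Dictionary.save_as_text and a
# corpus serialized with MmCorpus.serialize:
topics, model = createLsiModelforCorpus('corpus.mm', 'corpus_wordids.txt', 100)
for topic in topics[:5]:
    print(topic)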
from collections import Counter
from datetime import datetime

from gensim.corpora import Dictionary
from gensim.models import LsiModel
from nltk.corpus import stopwords as nltk_stopwords

# Feature, Topic, month_to_number, prettify_topic and
# path_to_directory_of_this_file come from the surrounding Django project.


def run():
    try:
        print("starting to build LSI Model")
        start = datetime.now()

        documents = Feature.objects.exclude(text=None).values_list("text", flat=True)
        number_of_documents = len(documents)
        print("number_of_documents:", number_of_documents)

        # Build the stopword list: month names, NLTK English stopwords, plus a
        # project-local stopwords.txt (one word per line, '#' for comments).
        stopwords = []
        stopwords += [month.lower() for month in month_to_number.keys()]
        stopwords += nltk_stopwords.words('english')
        print("stopwords:", len(stopwords))
        with open(path_to_directory_of_this_file + "/stopwords.txt", encoding="utf-8") as f:
            stopwords.extend([word for word in f.read().split("\n") if word and not word.startswith("#")])
        stopwords = set(stopwords)

        # Lower-case, replace punctuation-like separators with spaces, and keep
        # non-stopword tokens longer than 3 characters.
        texts = [
            [word for word in document.lower().replace("#", " ").replace("_", " ")
                .replace("(", " ").replace(")", " ").replace("/", " ")
                .replace(":", " ").replace(".", " ").split()
             if word not in stopwords and len(word) > 3]
            for document in documents
        ]

        # Drop tokens that appear only once in the whole collection.
        counter = Counter()
        for text in texts:
            counter.update(text)
        texts = [[token for token in text if counter[token] > 1] for text in texts]

        dictionary = Dictionary(texts)
        print("dictionary:", dictionary)
        dictionary.save(path_to_directory_of_this_file + "/dictionary")

        corpus = [dictionary.doc2bow(text) for text in texts]
        print("corpus:", type(corpus))

        print("generating lsi model")
        lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10)
        print("saving LSI model")
        lsi.save(path_to_directory_of_this_file + "/model")

        Topic.objects.all().delete()
        topics = []
        for topic in lsi.show_topics():
            topics.append(Topic(id=topic[0], name=prettify_topic(topic[1])))
        Topic.objects.bulk_create(topics)
    except Exception as e:
        print(e)
from collections import Counter
from datetime import datetime

from gensim.corpora import Dictionary
from gensim.models import LsiModel

# Feature, Place, Topic, tokenize, prettify_topic and
# path_to_directory_of_this_file come from the surrounding Django project.


def run():
    try:
        print("starting to build LSI Model")
        start = datetime.now()

        documents = Feature.objects.exclude(text=None).values_list("text", flat=True)
        number_of_documents = len(documents)
        print("number_of_documents:", number_of_documents)

        texts = [tokenize(document) for document in documents]

        # Drop tokens that appear only once in the whole collection.
        counter = Counter()
        for text in texts:
            counter.update(text)
        texts = [[token for token in text if counter[token] > 1] for text in texts]
        print("texts:", len(texts), texts[:5])

        dictionary = Dictionary(texts)
        #print("dictionary:", dictionary)
        dictionary.save(path_to_directory_of_this_file + "/dictionary")

        corpus = [dictionary.doc2bow(text) for text in texts]
        print("corpus:", type(corpus))

        print("generating lsi model")
        lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10)
        print("saving LSI model")
        lsi.save(path_to_directory_of_this_file + "/model")

        # nullify all topics on features and places
        Feature.objects.exclude(topic=None).update(topic=None)
        Place.objects.exclude(topic=None).update(topic=None)
        Topic.objects.all().delete()
        print("deleted all topics")

        topics = []
        for topic in lsi.show_topics():
            topics.append(Topic(id=topic[0], name=prettify_topic(topic[1])))
        Topic.objects.bulk_create(topics)
        print("bulk created all topics")

        """
        # re-create topics for all features in the database
        for feature in Feature.objects.exclude(text=None).exclude(text=""):
            words = tokenize(feature.text)
            if words:
                probabilities = lsi[dictionary.doc2bow(words)]
                if probabilities:
                    topic_id = sorted(probabilities, key=lambda tup: -1 * tup[1])[0][0]
                    if topic_id:
                        feature.topic_id = topic_id
                        feature.save()

        # assign a topic to each place based on the most popular topic found in its features
        for place_id in Place.objects.exclude(featureplace=None).values_list("id", flat=True):
            counter = Counter(Feature.objects.filter(featureplace__place_id=place_id).values_list("topic_id"))
            print("counter:", counter)
        """
    except Exception as e:
        print(e)
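# Sketch under assumptions (not part of the original snippet): reload the
# artifacts saved above and assign a topic to one new piece of text, reusing
# the same tokenize() helper. The sample text is illustrative only.
from gensim.corpora import Dictionary
from gensim.models import LsiModel

dictionary = Dictionary.load(path_to_directory_of_this_file + "/dictionary")
lsi = LsiModel.load(path_to_directory_of_this_file + "/model")
bow = dictionary.doc2bow(tokenize("some new feature text"))
scores = lsi[bow]
if scores:
    best_topic_id = max(scores, key=lambda pair: abs(pair[1]))[0]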
import numpy as np

# X, feature_names, filmsbygenre and Genre come from earlier in the script.

k = 40  # desired number of topics

### SVD DECOMPOSITION (LSA) ##
### USING GENSIM #############
ans = input("Start Latent Semantic Analysis with Gensim? ")
if ans != "y":
    exit()

from gensim.models.lsimodel import LsiModel
from gensim.matutils import Sparse2Corpus, corpus2dense

co = Sparse2Corpus(X, documents_columns=False)
lsi = LsiModel(corpus=co, num_topics=k)
list_topics = lsi.show_topics(formatted=False)
topics = [[(value, feature_names[int(key)]) for (value, key) in li]
          for li in list_topics]
print(topics)

# For each genre, average the LSI vectors of its films; genres with no films
# get a zero vector.
genreMat = []
for genre in Genre.objects.all():
    index = filmsbygenre[genre.name]
    if index != []:
        obj = lsi[Sparse2Corpus(X[index, :], documents_columns=False)]
        E = corpus2dense(obj, k).transpose()
        genreMat.append(np.hstack([[genre.name], np.mean(E, axis=0)]))
    else:
        genreMat.append(np.hstack([[genre.name], np.zeros(k)]))
genreMat = np.vstack(genreMat)
print(genreMat)
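# Optional follow-up sketch (assumption, not in the original): persist the
# genre-by-topic matrix for later inspection; the filename is a placeholder.
np.savetxt("genre_topic_matrix.csv", genreMat, fmt="%s", delimiter=",")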
# lsi = stochastic_svd(corpus, rank=100, num_terms=args.ntopics)

# if len(args.query) != 1:
#     print(corpus[args.query])
queryresult = lsi[corpus[args.query]]
sortedqueryresult = sorted(list(queryresult), key=lambda query: abs(query[1]), reverse=True)
# screenqueryresult = sorted(list(queryresult), key=itemgetter(1))
# screenoutput = lsi.print_topics(num_topics=10, num_words=1)
# output = lsi.print_topics(num_topics=10, num_words=10)
# print("\nResult:")
# pp.pprint(screenoutput)
# lsi.save('lsi_result.txt')

corpustopics = lsi.show_topics(num_words=args.ntopics, log=True, formatted=False)
screencorpustopics = lsi.show_topics(num_words=args.nwords, log=False, formatted=True)

print("\nCorpus top five topics: (full set of topics printed to file)")
pp.pprint(screencorpustopics[:5])
# Use this to show all the words within e.g. the first topic, not just the top 10 words:
# pp.pprint(corpustopics[:1])

print("\nSimilarity of document number {0} in corpus with corpus topics:".format(args.query))
pp.pprint(sortedqueryresult)
# pp.pprint(sortedqueryresult[:10])

rootdir = os.getcwd()
folderpath = os.path.join(rootdir, 'lsi_output')
if os.path.exists(folderpath):
    shutil.rmtree(folderpath)
kalimat = kalimat.lower()

# remove quotation marks, periods, commas, colons, newlines and hyphens
kalimat = kalimat.replace('"', '')
kalimat = re.sub("[.,:\n-]", "", kalimat)

# split the sentence into words
cleanteks.append(kalimat.split())

# show the preprocessing result
print('cleanteks :\n', cleanteks)

# build a dictionary from the collection
from gensim import corpora
dictionary = corpora.Dictionary(cleanteks)

# inspect the dictionary
print('\ndictionary :\n', dictionary)

# convert the texts into a document-term matrix (bag-of-words)
dtm = [dictionary.doc2bow(text) for text in cleanteks]

# inspect the document-term matrix
print('\ndocument-term matrix :\n', dtm)

# run topic extraction with the LSI model (num_topics = 5)
from gensim.models.lsimodel import LsiModel
lsimodel = LsiModel(dtm, num_topics=5, id2word=dictionary)

# inspect the resulting topics
print('\ntopics :\n', lsimodel.show_topics(num_words=3))
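# Hedged follow-up sketch (not in the original): fold a new, unseen sentence
# into the LSI space trained above; the sample text is illustrative only.
new_bow = dictionary.doc2bow("a new example sentence".lower().split())
print('\nnew document in topic space:\n', lsimodel[new_bow])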
        # (fragment: presumably the tail of MyCorpus.__iter__, which yields
        # one bag-of-words per sentence window)
        cursor.execute(query, (sent_id - 1, sent_id))
        sent_list = cursor.fetchall()
        sentence = ''
        for sent in sent_list:
            sentence += ' '
            sentence += sent[0]
        yield dictionary.doc2bow(cleanSent(sentence).lower().split())

corpus = MyCorpus()
print(dictionary)

lsi = LsiModel(corpus, num_topics=50, id2word=dictionary)
print(lsi[doc_tfidf])       # project some document into LSI space
lsi.add_documents(corpus2)  # update LSI on additional documents
print(lsi[doc_tfidf])
print(lsi.show_topics(num_topics=-1, num_words=10, log=False, formatted=True))
print(lsi.projection.u)

# finding embeddings of valid formulae
V = gensim.matutils.corpus2dense(lsi[corpus], len(lsi.projection.s)).T / lsi.projection.s

import numpy as np
V = np.asarray(V)
np.save("lsi_embeddings.npy", V)  # filename is a placeholder, not from the original
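# Sketch under assumptions (not in the original): with V holding one dense
# LSI vector per document, pairwise cosine similarities follow directly
# from plain numpy.
norms = np.linalg.norm(V, axis=1, keepdims=True)
V_unit = V / np.where(norms == 0.0, 1.0, norms)  # guard against zero vectors
sims = V_unit @ V_unit.T  # sims[i, j] = cosine similarity of documents i and j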