Example #1
def createLsiModelforCorpus(corpusfile, dictfile, numtop):
    print "\nLoading dictionary..."
    dict = corpora.Dictionary.load_from_text(dictfile)
    print(dict)
    print "\nLoading corpus..."
    corpus = corpora.MmCorpus(corpusfile)
    print(corpus)
    print "\nPerforming Latent Semantic Indexing..."
    lsi = LsiModel(corpus=corpus, num_topics=numtop, id2word=dict, distributed=False)
    ## This is the fancy stochastic (aka truncated) SVD, however it throws runtime memory errors for me (e.g. segmentation fault)
    #lsi = stochastic_svd(corpus,rank=100,num_terms=args.ntopics)
    corpustopics=lsi.show_topics(num_words=10, log=True, formatted=False)

    rootdir=os.getcwd()
    foldername='lsi_output'
    folderpath=os.path.join(rootdir,foldername)
    if os.path.exists(folderpath):
        shutil.rmtree(folderpath)
    os.makedirs(folderpath)
    os.chdir(folderpath)
    lsimodelfile=(str(args.corpus).replace('.mm',''))+'_lsi.model'
    lsi.save(lsimodelfile)
    filename1= (str(args.corpus).replace('.mm',''))+'_lsi_topics.pkl'
    with open(filename1,'wb') as output:
        pickle.dump(corpustopics, output)
    os.chdir(rootdir)

    return corpustopics, lsi
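
For reference, the artifacts written out by this function can be loaded back with the standard gensim and pickle counterparts of save/dump. A minimal sketch, assuming the same lsi_output layout as above (the helper name is hypothetical):

import os
import pickle
from gensim.models.lsimodel import LsiModel

def loadLsiModelForCorpus(corpusfile):  # hypothetical helper, mirrors the paths used above
    folderpath = os.path.join(os.getcwd(), 'lsi_output')
    basename = str(corpusfile).replace('.mm', '')
    # reload the saved LSI model and the pickled topic list
    lsi = LsiModel.load(os.path.join(folderpath, basename + '_lsi.model'))
    with open(os.path.join(folderpath, basename + '_lsi_topics.pkl'), 'rb') as f:
        corpustopics = pickle.load(f)
    return corpustopics, lsi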
Example #2
def run():
  try:
    print "starting to build LSI Model"

    start = datetime.now()
    documents = Feature.objects.exclude(text=None).values_list("text", flat=True)
    number_of_documents = len(documents)
    print "number_of_documents:", number_of_documents

    stopwords = []
    stopwords += [month.lower() for month in month_to_number.keys()]
    stopwords += nltk_stopwords.words('english')
    print "stopwords:", len(stopwords)
    with open(path_to_directory_of_this_file + "/stopwords.txt") as f:
        stopwords.extend([word for word in f.read().decode("utf-8").split("\n") if word and not word.startswith("#")])
    stopwords = set(stopwords)

    texts = [
        [word
         for word in document.lower()
             .replace("#", " ").replace("_", " ").replace("(", " ").replace(")", " ")
             .replace("/", " ").replace(":", " ").replace(".", " ")
             .split()
         if word not in stopwords and len(word) > 3]
        for document in documents
    ]

    counter = Counter()
    for text in texts:
        counter.update(text)

    texts = [[token for token in text if counter[token] > 1] for text in texts]

    dictionary = Dictionary(texts)
    print "dictionary:", dictionary
    dictionary.save(path_to_directory_of_this_file + "/dictionary")

    corpus = [dictionary.doc2bow(text) for text in texts]
    print "corpus:", type(corpus)

    print "generating lsi model"
    
    lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10)
    print "saving LSI model"
    lsi.save(path_to_directory_of_this_file + "/model")

    Topic.objects.all().delete()
    topics = []
    for topic in lsi.show_topics():
        topics.append(Topic(id=topic[0], name=prettify_topic(topic[1])))

    Topic.objects.bulk_create(topics)

  except Exception as e:
    print(e)
Example #3
def run():
  try:
    print("starting to build LSI Model")

    start = datetime.now()
    documents = Feature.objects.exclude(text=None).values_list("text", flat=True)
    number_of_documents = len(documents)
    print("number_of_documents:", number_of_documents)

    texts = [tokenize(document) for document in documents]

    counter = Counter()
    for text in texts:
        counter.update(text)

    texts = [[token for token in text if counter[token] > 1] for text in texts]

    print("texts:", len(texts), texts[:5])

    dictionary = Dictionary(texts)
    #print "dictionary:", dictionary
    dictionary.save(path_to_directory_of_this_file + "/dictionary")

    corpus = [dictionary.doc2bow(text) for text in texts]
    print("corpus:", type(corpus))

    print("generating lsi model")
    
    lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=10)
    print("saving LSI model")
    lsi.save(path_to_directory_of_this_file + "/model")


    # nullifying all topics on features and places
    Feature.objects.exclude(topic=None).update(topic=None)
    Place.objects.exclude(topic=None).update(topic=None)

    Topic.objects.all().delete()
    print("deleted all topics")
    topics = []
    for topic in lsi.show_topics():
        topics.append(Topic(id=topic[0], name=prettify_topic(topic[1])))

    Topic.objects.bulk_create(topics)
    print("bulk created all topics")


    """
    # re-create topics for all features in database
    for feature in Feature.objects.exclude(text=None).exclude(text=""):
        words = tokenize(feature.text)
        if words:
            probabilities = lsi[dictionary.doc2bow(words)]
            if probabilities:
                topic_id = sorted(probabilities, key=lambda tup: -1*tup[1])[0][0]
                if topic_id:
                    feature.topic_id = topic_id
                    feature.save()

    # assign as topic to each place based on most popular topic found in features
    for place_id in Place.objects.exclude(featureplace=None).values_list("id", flat=True):
        counter = Counter(Feature.objects.filter(featureplace__place_id=place_id).values_list("topic_id"))
        print "counter:", counter
    """


  except Exception as e:
    print(e)
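
The commented-out block above sketches how topics could be re-assigned to features and places. A standalone version of that document-to-topic step, assuming the tokenize helper from this snippet and the dictionary/model files saved by run() (the function name here is hypothetical):

from gensim.corpora import Dictionary
from gensim.models.lsimodel import LsiModel

def dominant_topic(text, base_path):  # hypothetical helper
    # reload the artifacts saved by run()
    dictionary = Dictionary.load(base_path + "/dictionary")
    lsi = LsiModel.load(base_path + "/model")
    bow = dictionary.doc2bow(tokenize(text))  # tokenize as used above
    weights = lsi[bow]  # list of (topic_id, weight) pairs
    if not weights:
        return None
    # pick the topic with the largest absolute weight
    return max(weights, key=lambda pair: abs(pair[1]))[0]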
Example #4
k = 40  # desired number of topics


### SVD DECOMPOSITION (LSA) ##
### USING GENSIM #############
ans = input("Start Latent Semantic Analysis with Gensim? ")
if ans != "y":
    exit()

from gensim.models.lsimodel import LsiModel
from gensim.matutils import Sparse2Corpus, corpus2dense

co = Sparse2Corpus(X, documents_columns = False)

lsi = LsiModel(corpus=co, num_topics=k)
list_topics = lsi.show_topics(formatted=False)
topics = [[(value, feature_names[int(key)]) for (value, key) in li] for li in list_topics]
print(topics)

genreMat = []

for genre in Genre.objects.all():
    index = filmsbygenre[genre.name]
    if index != []:
        obj = lsi[Sparse2Corpus(X[index, :], documents_columns = False)]
        E = corpus2dense(obj, k).transpose()
        genreMat.append( np.hstack([ [genre.name] , np.mean(E, axis = 0)]) )
    else:
        genreMat.append( np.hstack([ [genre.name] , np.zeros(k) ] ))
genreMat = np.vstack(genreMat)
print(genreMat)
Example #5
k = 40  # desired number of topics

### SVD DECOMPOSITION (LSA) ##
### USING GENSIM #############
ans = input("Start Latent Semantic Analysis with Gensim? ")
if ans != "y":
    exit()

from gensim.models.lsimodel import LsiModel
from gensim.matutils import Sparse2Corpus, corpus2dense

co = Sparse2Corpus(X, documents_columns=False)

lsi = LsiModel(corpus=co, num_topics=k)
list_topics = lsi.show_topics(formatted=False)
topics = [
    [(value, feature_names[int(key)]) for (value, key) in li]
    for li in list_topics
]
print(topics)

genreMat = []

for genre in Genre.objects.all():
    index = filmsbygenre[genre.name]
    if index != []:
        obj = lsi[Sparse2Corpus(X[index, :], documents_columns=False)]
        E = corpus2dense(obj, k).transpose()
        genreMat.append(np.hstack([[genre.name], np.mean(E, axis=0)]))
    else:
        genreMat.append(np.hstack([[genre.name], np.zeros(k)]))
Example #6
#lsi = stochastic_svd(corpus,rank=100,num_terms=args.ntopics)

#if len(args.query)!=1:
#print corpus[args.query]
queryresult = lsi[corpus[args.query]]
sortedqueryresult = sorted(list(queryresult), key=lambda query: abs(query[1]), reverse=True)
#screenqueryresult = sorted(list(queryresult), key=itemgetter(1))

#screenoutput = lsi.print_topics(num_topics=10, num_words=1)
#output = lsi.print_topics(num_topics=10, num_words=10)
#print "\nResult:"
#pp.pprint(screenoutput)
#lsi.save('lsi_result.txt')


corpustopics=lsi.show_topics(num_words=args.ntopics, log=True, formatted=False)
screencorpustopics=lsi.show_topics(num_words=args.nwords, log=False, formatted=True)

print "\nCorpus top five topics: (full set of topics printed to file)"
pp.pprint(screencorpustopics[:5])
# Use this to show all the words within e.g. the first topic, not just the top 10 words
#pp.pprint(corpustopics[:1])
print "\nSimilarity of document number {0} in corpus with corpus topics:".format(args.query)
pp.pprint(sortedqueryresult)
#pp.pprint(sortedqueryresult[:10])

rootdir=os.getcwd()
foldername='lsi_output'
folderpath=os.path.join(rootdir,foldername)
if os.path.exists(folderpath):
    shutil.rmtree(folderpath)
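
As a complementary sketch (not part of the snippet above), the same LSI space can also be used to find documents similar to the queried one, via gensim's MatrixSimilarity index; corpus, lsi, args and pp are reused from the snippet:

from gensim import similarities

# build a dense cosine-similarity index over all documents in LSI space
index = similarities.MatrixSimilarity(lsi[corpus], num_features=lsi.num_topics)
sims = index[lsi[corpus[args.query]]]  # similarity of the query document to every document
top_matches = sorted(enumerate(sims), key=lambda pair: -pair[1])[:5]
print("\nDocuments most similar to document {0}:".format(args.query))
pp.pprint(top_matches)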
Example #7
    kalimat = kalimat.lower()
    # remove quotation marks and punctuation (.,:\n-)
    kalimat = kalimat.replace('"', '')
    kalimat = re.sub("[.,:\n-]", "", kalimat)
    # split the sentence into words
    cleanteks.append(kalimat.split())

# show the preprocessing result
print('cleanteks :\n', cleanteks)

# build a dictionary from the collection of documents
from gensim import corpora

dictionary = corpora.Dictionary(cleanteks)

# inspect the dictionary
print('\ndictionary :\n', dictionary)

# convert the texts into a document-term matrix
dtm = [dictionary.doc2bow(text) for text in cleanteks]

# inspect the document-term matrix
print('\ndocument-term matrix :\n', dtm)

# run topic extraction with an LSI model (num_topics=5)
from gensim.models.lsimodel import LsiModel
lsimodel = LsiModel(dtm, num_topics=5, id2word=dictionary)

# show the resulting topics
print('\ntopics :\n', lsimodel.show_topics(num_words=3))
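
To score a new sentence against the topics found above, the same preprocessing and doc2bow step can be reused; a minimal sketch, where the input sentence is a made-up example:

import re

# hypothetical new sentence, cleaned the same way as the training texts
new_sentence = "contoh kalimat baru"
new_sentence = re.sub("[.,:\n-]", "", new_sentence.lower().replace('"', ''))
new_bow = dictionary.doc2bow(new_sentence.split())

# project the new sentence into the LSI topic space
print('\ntopic weights for the new sentence:\n', lsimodel[new_bow])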
Example #8
        cursor.execute(query, (sent_id - 1, sent_id))
        sent_list = cursor.fetchall()
        sentence = ''
        for sent in sent_list:
            sentence += ' '
            sentence += sent[0]
        yield dictionary.doc2bow(cleanSent(sentence).lower().split())

corpus=MyCorpus()
print(dictionary)

lsi = LsiModel(corpus, num_topics=50,id2word=dictionary)
print(lsi[doc_tfidf]) # project some document into LSI space
lsi.add_documents(corpus2) # update LSI on additional documents
print(lsi[doc_tfidf])

lsi.show_topics(num_topics=-1, num_words=10, log=False, formatted=True)
print(lsi.projection.u)


# finding embeddings of valid formulae 


V = gensim.matutils.corpus2dense(lsi[corpus], len(lsi.projection.s)).T / lsi.projection.s

import numpy as np
np.asarray
np.save
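
The two trailing numpy calls above have no effect as written; presumably the intent was to persist the embedding matrix V. A hedged completion (the file name is an assumption):

# hypothetical file name; saves the per-document LSI embeddings computed above
np.save("lsi_doc_embeddings.npy", np.asarray(V))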