Example #1
texts = [[token for token in text if frequency[token] > 1] for text in texts]

dictionary = corpora.Dictionary(texts)
#print(dictionary.token2id)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=400)
corpus_lsi = lsi[corpus_tfidf]
#lsi.print_topics(20)
#print(corpus_lsi[0], max(corpus_lsi[0], key=lambda x:x[1]))
#print(corpus_lsi[1], max(corpus_lsi[0], key=lambda x:x[1]))

index = similarities.MatrixSimilarity(corpus_lsi)
#vec_lsi = corpus_lsi[0]
#sims = index[vec_lsi]
#sims = sorted(enumerate(sims), key=lambda item: -item[1])
#print(list(enumerate(sims))[0:9])
#print(titles[0])
#for i in range(5):
#    print(titles[sims[i][0]])

# For now, just print out the first few papers in the DB and the top 3 recommended papers for each
for i in range(6):
    print("Title:", titles[i])
    print(
        "Abstract:",
        abstracts[i],
    )
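    # Sketch based on the commented-out query code above (assumption: this is the intended
    # "top 3 recommended" step): rank all papers against paper i in LSI space and show the
    # top 3 matches, skipping the paper itself at rank 0.
    paper_sims = sorted(enumerate(index[corpus_lsi[i]]), key=lambda item: -item[1])
    for doc_id, score in paper_sims[1:4]:
        print("  Recommended:", titles[doc_id], "(similarity %.3f)" % score)
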
# 10. Compute distance between texts
# The results of the tf-idf algorithm now return stemmed tokens which are specific to each book.
# We can, for example, see that topics such as selection, breeding or domestication are defining "On the Origin of Species" (and yes, in this book,
# Charles Darwin talks quite a lot about pigeons too). Now that we have a model associating tokens to how specific they are to each book,
# we can measure how related two books are to each other.
# To this purpose, we will use a measure of similarity called cosine similarity
# and we will visualize the results as a distance matrix, i.e., a matrix showing all pairwise distances between Darwin's books.



# Load the library allowing similarity computations
from gensim import similarities

# Compute the similarity matrix (pairwise distance between all texts)
sims = similarities.MatrixSimilarity(model[bows])

# Transform the resulting list into a dataframe
sim_df = pd.DataFrame(list(sims))

# Add the titles of the books as columns and index of the dataframe
sim_df.columns = titles
sim_df.index = titles

# Print the resulting matrix
sim_df
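
# A minimal visualization sketch (assumption: seaborn is available), rendering the
# pairwise similarity matrix described above as a heatmap.
import seaborn as sns
sns.heatmap(sim_df)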




# 11. The book most similar to "On the Origin of Species"
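# A minimal sketch for this step (assumption: the exact title string in `titles`
# is "On the Origin of Species"): drop the book itself and take the highest-scoring title.
most_similar = sim_df["On the Origin of Species"].drop("On the Origin of Species").idxmax()
print(most_similar)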
Example #3
#
lsi_model.save(os.getcwd() + '/tmp/model.lsi')  # same for tfidf, lda, ...
lsi_model = models.LsiModel.load(os.getcwd() + '/tmp/model.lsi')
# # #  *********************************************************
# #
# # ## LDA model **************************************************
# lda_model = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=2)
# corpus_lda = lda_model[corpus_tfidf]
# # Show2dCorpora(corpus_lsi)
# print("===========corpus_ldacorpus_lda")
# nodes = list(corpus_lda)
# pprint(list(corpus_lda))
# #
# # # In addition, there are other models such as Random Projections and Hierarchical Dirichlet Process

corpus_simi_matrix = similarities.MatrixSimilarity(corpus_lsi)
# Compute the similarity between a new text and the existing corpus
# The query text to be processed
target_courses = ['环境还行,但是感觉不是很好吃,排队的人太多了']
target_text = tp.seg_fil_rew(target_courses)
print(target_text)
test_bow = dictionary.doc2bow(target_text[0])  # convert to term counts (bag-of-words)
test_tfidf = tfidf_model[test_bow]
test_lsi = lsi_model[test_tfidf]
test_simi = corpus_simi_matrix[test_lsi]
print(list(enumerate(test_simi)))
# Sort, for easier reading of the output
sort_sims = sorted(enumerate(test_simi), key=lambda item: -item[1])

# Inspect the results
print(sort_sims[0:10])  # show the 10 most similar texts; the first one is the query baseline itself
    FILENAME = 'panda_corpus.txt'

    panda_g = corpus(FILENAME)

    si = SenSimi(panda_g)

    panda_raw = si.reconstructdata()
    print(type(panda_raw))

    bowlist = si.bowcorpus(panda_raw)
    print(bowlist[1])

    panda_tfidfmodel = si.tfidfmodel(bowlist)
    panda_tfidf = panda_tfidfmodel[bowlist]
    # FIXME: cannot build the index from the full corpus; it exceeds the numpy.array limit
    # FIXME: if only part of the corpus is used, the sentences being compared are not covered by the index matrix's feature vectors (basis)
    print('using lsi model...')
    panda_lsi = LsiModel(corpus=panda_tfidf,
                         id2word=si.word_dict,
                         num_topics=300)
    index = similarities.MatrixSimilarity(panda_lsi[panda_tfidf])
    good = ['可爱', '萌', '喜欢', '国宝', '神奇']
    good_bow = si.word_dict.doc2bow(good)
    good_tfidf = panda_tfidfmodel[good_bow]
    good_lsi = panda_lsi[good_tfidf]
    simi = index[good_lsi]
    simi_list = list(simi)
    print(max(simi_list))
    where = simi_list.index(max(simi_list))
    print(panda_raw[where])
Example #5
logging.root.level = logging.INFO

dictionary = corpora.Dictionary.load('D:/workspace/scrap_sg/dictionary.dict')
corpus = corpora.MmCorpus('D:/workspace/scrap_sg/corpus.mm')
lda = models.ldamodel.LdaModel.load('D:/workspace/scrap_sg/lda.model')


def print_text(filename):
    with open(filename, "r") as input:
        raw = input.read()
        print raw


lda.print_topics(50)

index = similarities.MatrixSimilarity(lda[corpus])
index.save("D:/workspace/scrap_sg/simIndex.index")

doc_lda = lda[corpus]


# inspect one doc
def inspect_corpus(index, doc_lda, file_lst):
    pprint(doc_lda[index])
    print file_lst[index]
    topics = [topic for topic, weight in doc_lda[index]]
    for i in range(0, lda.num_topics):
        if i in topics:
            print "TOPIC " + str(i) + ":" + str(lda.print_topic(i))
            print "\n"
    print_text(file_lst[index])
stoplist = set('for a of the and to in'.split())
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]
# remove words that appear only once
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

from pprint import pprint  # pretty-printer
#pprint(texts)

dictionary = corpora.Dictionary(texts)
#print dictionary.token2id

new_doc = "Human computer interaction"
new_vec = dictionary.doc2bow(new_doc.lower().split())

#print new_vec
corpus = [dictionary.doc2bow(text) for text in texts]
#print corpus

tfidf = models.TfidfModel(corpus)
index = similarities.MatrixSimilarity(tfidf[corpus])
sims = index[tfidf[new_vec]]
i = sorted(list(enumerate(sims)), key=lambda x: -x[1])[1][0]
print documents[i]
Example #7
def LSA_Kmeans(clusters, textoTreinamento, nomeUsuarios, textoComparacao=None):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    ##########################################################################################
    #  PRE-PROCESSING OF THE INPUT TEXT TO BUILD THE DICTIONARY OF SEMANTIC RELATIONS        #
    ##########################################################################################

    #USES THE removeA AND removePontuacao FUNCTIONS TO CLEAN textoTreinamento
    textoTrein = [removeA(removePontuacao(i)) for i in textoTreinamento]
    #print textoTrein

    textoComp = [removeA(removePontuacao(i)) for i in textoComparacao]

    #LOAD THE NLTK STOPWORD LIST
    stop = stopwords.words('portuguese')
    #STRIP THE ACCENTS FROM THE STOPWORD LIST
    stoplist = [(removeA(s)) for s in stop]
    #     print stoplist

    #REMOVE STOPWORDS AND WORDS WITH 3 OR FEWER CHARACTERS
    textoTrein = [[word for word in document.lower().split() if word not in stoplist and len(word) > 3] \
             for document in textoTrein]
    #     print sw_textoTrein

    textoComp = [[word for word in document.lower().split() if word not in stoplist and len(word) > 3] \
             for document in textoComp]
    #     print textoComp
    ##############################################################################################
    #     START OF THE LSA PIPELINE - BUILDING THE TERM/FREQUENCY DICTIONARY                     #
    ##############################################################################################

    #DEFINE frequencia AS A VARIABLE OF TYPE DICTIONARY OF INTEGERS
    frequencia = defaultdict(int)

    #COUNT HOW MANY TIMES EACH TERM OCCURS ACROSS ALL DOCUMENTS IN THE COLLECTION
    for t in textoTrein:
        for token in t:
            frequencia[token] += 1
#     pprint(frequencia)

#WORDS WITH FREQUENCY 1 ARE NOT IMPORTANT, SINCE THEY HAVE NO CO-OCCURRENCE RELATIONS
#Remove every word that appeared only once in the count
    textoTrein = [[token for token in palavra if frequencia[token] > 1]\
             for palavra in textoTrein]
    #     pprint(textoTrein)

    ##########################################################################################
    # Dictionary encapsulates the mapping between normalized words and their integer ids.    #
    # The main function is `doc2bow`, which converts a collection of words to its            #
    # bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.             #
    ##########################################################################################
    dicionario = corpora.Dictionary(textoTrein)
    #     print dicionario

    # Collect the IDs of the words that appear only once in the texts
    once_ids = [
        tokenId for tokenId, docfreq in dicionario.dfs.iteritems()
        if docfreq == 1
    ]
    #     print once_ids

    #remove every word with frequency = 1
    dicionario.filter_tokens(once_ids)

    #compact the dictionary, reassigning ids to fill the gaps left by the removed tokens
    dicionario.compactify()

    #     print dicionario.token2id # token -> tokenId
    #     print dicionario.dfs # document frequencies: tokenId -> in how many documents this token appeared

    # Store textoTrein in corpus_textoTrein in "bag-of-words" format
    # The main function is `doc2bow`, which converts a collection of words to its
    # bag-of-words representation: a list of (word_id, word_frequency) 2-tuples.
    corpus_textoTrein = [dicionario.doc2bow(texto) for texto in textoTrein]
    #     pprint(corpus_textoTrein)

    corpus_textoComp = [dicionario.doc2bow(textoC) for textoC in textoComp]
    #     pprint(corpus_textoComp)
    ##########################################################################################
    # TRANSFORMATION MODEL - BAG-OF-WORDS TO TF-IDF                                          #
    ##########################################################################################

    # TRANSFORM corpus_textoTrein (bag-of-words)
    # INTO tfidf_TextoTrein (term frequency x inverse document frequency)
    tfidf_TextoTrein = models.TfidfModel(corpus=corpus_textoTrein)
    #     print tfidf_TextoTrein

    #USE posIni TO BUILD THE COMPARISON MATRIX WITH THE DICTIONARY DATA
    corpus_tfidf_TextoTrein = tfidf_TextoTrein[corpus_textoComp]
    #     print list(corpus_tfidf_TextoTrein)

    #TRANSFORM THE TF-IDF MATRIX
    modelo_lsa = models.LsiModel(corpus_tfidf_TextoTrein,
                                 id2word=dicionario,
                                 num_topics=len(dicionario))

    query = []

    for q in textoComparacao:
        vec_bow = dicionario.doc2bow(q.lower().split())
        vec_lsi = modelo_lsa[
            vec_bow]  # convert the comparison query into LSI space
        query.append(vec_lsi)
#     print "query"
#     pprint(query)

#TRANSFORM corpus_textoComp into LSA space and index it
    indexComp = similarities.MatrixSimilarity(modelo_lsa[corpus_textoComp])
    #     print "indexComp"
    #     pprint(list(indexComp))

    # To obtain similarities of our query document against the indexed documents:
    # perform a similarity query against the corpus
    sims = indexComp[query]

    #     pprint(sims)

    ##########################################################################################
    # COMBINE WITH K-MEANS TO PERFORM CLUSTERING                                             #
    ##########################################################################################

    ##Ideal value, found after experiments = 100000
    km_model = KMeans(n_clusters=clusters, n_init=100000)

    km_model.fit_transform(sims)

    clustering = collections.defaultdict(list)

    for idx, label in enumerate(km_model.labels_):
        clustering[label].append(idx)

### console printouts for inspecting the results
#     print "clustering _LSA_KMEANS"
#     pprint(clustering)

#     print len(clustering)

#     for i in range(len(clustering)):
#         for j in clustering[i]:
#             print "grupo", i
#             print j, nomeUsuarios[j]
#             print textoComparacao[j]

    return clustering
Example #8
f.close()
#If the corpus is very large, garbage collection can be forced to reclaim memory
#gc.collect()
#Build the dictionary
dictionary = corpora.Dictionary(train_set)
#Filter out extremely low-frequency noise words
dictionary.filter_extremes(no_below=1, no_above=1, keep_n=None)
#Save the dictionary and the corpus; the corpus is converted to bag-of-words form for later use
dictionary.save(output + "all.dic")
corpus = [dictionary.doc2bow(text) for text in train_set]
saveObject(output + "all.cps", corpus)
#Save the original document info
saveObject(output + "all.info", docinfos)

#Build the TF-IDF model
#Train the TF-IDF model on the original corpus
tfidfModel = models.TfidfModel(corpus)
#Generate TF-IDF vectors with the TF-IDF model
tfidfVectors = tfidfModel[corpus]
#Save the tfidfModel
tfidfModel.save(output + "allTFIDF.mdl")
indexTfidf = similarities.MatrixSimilarity(tfidfVectors)
indexTfidf.save(output + "allTFIDF.idx")

#LDA model
lda = models.LdaModel(tfidfVectors, id2word=dictionary, num_topics=50)
lda.save(output + "allLDA50Topic.mdl")
corpus_lda = lda[tfidfVectors]
indexLDA = similarities.MatrixSimilarity(corpus_lda)
indexLDA.save(output + "allLDA50Topic.idx")
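
# A minimal query sketch (assumption: new_doc_tokens stands in for an already-tokenized
# document): score it against the TF-IDF and LDA indexes built above.
new_doc_tokens = ["token1", "token2"]  # hypothetical input
new_bow = dictionary.doc2bow(new_doc_tokens)
new_tfidf = tfidfModel[new_bow]
print(sorted(enumerate(indexTfidf[new_tfidf]), key=lambda item: -item[1])[:10])
print(sorted(enumerate(indexLDA[lda[new_tfidf]]), key=lambda item: -item[1])[:10])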
Example #9
File: test.py Project: zj2089/cowry
tfidf = models.TfidfModel(corpus_tfidf)

vec = [(0, 1), (4, 1)]
# print(tfidf[vec])

print(corpus_tfidf[0])
print(corpus_tfidf[-1])

from sim import print_similaries

num_topic = 250
lda = models.LdaModel(corpus_tfidf,
                      id2word=dictionary,
                      num_topics=num_topic,
                      update_every=0,
                      passes=20)
lda.print_topics(num_topic)

tfidf_index = similarities.SparseMatrixSimilarity(tfidf[corpus_tfidf],
                                                  num_features=len(tfidf.dfs))
lda_index = similarities.MatrixSimilarity(lda[corpus_tfidf])

print_similaries(tfidf, corpus_tfidf, zip_data, tfidf_index)
print_similaries(lda, corpus_tfidf, zip_data, lda_index)

# num_topic = 10
# lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=num_topic)
# corpus_lsi = lsi[corpus_tfidf]
# lsi.print_topics(num_topic)
Example #10
from gensim import corpora
dct = corpora.Dictionary(contexts)
low_freq_ids = [id_ for id_, freq in dct.dfs.items() if freq < 3]
high_freq_ids = [id_ for id_, freq in dct.dfs.items() if freq > 10000]
freq_ids = low_freq_ids + high_freq_ids
dct.filter_tokens(freq_ids)
dct.compactify()
corpus = [dct.doc2bow(s) for s in contexts]

from gensim import models
tfidf_model = models.TfidfModel(corpus)
corpus_mm = tfidf_model[corpus]

from gensim import similarities
index = similarities.MatrixSimilarity(corpus_mm, num_features=len(dct))


def text2vec(text):
    bow = dct.doc2bow(text)
    return tfidf_model[bow]


input_text = '花呗透支了为什么不可以继续用了'
my_text = list(jieba.cut(input_text))
vec = text2vec(my_text)
sims = index[vec]
sim_sort = sorted(list(enumerate(sims)),
                  key=lambda item: item[1],
                  reverse=True)
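
# A minimal follow-up sketch (assumption): show the best-matching context and its score.
best_idx, best_score = sim_sort[0]
print(best_score, ''.join(contexts[best_idx]))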
Example #11
 def computeSimilarityMatrix(self, corpus, numFeatures, num_best=7):
     self.similarityMatrix = similarities.MatrixSimilarity(
         self.model[corpus], num_features=numFeatures, num_best=num_best)
Example #12
else:
    print("Please run first tutorial to generate data set")

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lsi = lsi[corpus_tfidf]

lsi.save('lyrics.lsi')
lsi = models.LsiModel.load('lyrics.lsi')

##################

dictionary = corpora.Dictionary.load('lyrics.dict')
corpus = corpora.MmCorpus('lyrics.mm')

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)

doc = "望著 滿天星斗 的 塗鴉 好像 看見 自己 童年 的 模樣 總是 說 著 淘氣 浪漫 的 願望 夢想 能夠 飛往 燦爛 的 天堂 而 那天 真的 心願 正 溫柔 地 對 我 說 當你 陷入 絕望 中請 記得 我 用 美麗 的 幻想 讓 真心 永遠 純真 而 不變 當你 寂寞 的 時候 請 想念 我 用 單純 的 信仰 給 自己 溫暖 的 回答 閉上 雙眼 靜靜地 徜徉 彷彿 穿越時空 回到 了 過往 以為 銀河 就 在 不遠 的 前方 星星 月亮 都 在 我 面前 玩耍 而 那 微小 的 喜悅 正 溫柔 地 對 我 說 當你 陷入 絕望 中請 記得 我 用 美麗 的 幻想 讓 真心 永遠 純真 而 不變 當你 寂寞 的 時候 請 想念 我 用 單純 的 信仰 給 自己 溫暖 的 回答 ( 和 童 年時 無邪 的 希望 ) 親愛 的 我 親愛 的 我 願 你 永遠 像 我 一樣 帶著 勇氣 和 倔強 歲月 改變 你 的 模樣 無法 改變 你 的 去向"
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lsi[vec_bow]

index = similarities.MatrixSimilarity(lsi[corpus], num_features=500)
index.save('lyrics.index')

sims = index[vec_lsi]

sims = sorted(enumerate(sims), key=lambda item: -item[1])
print(sims[:10])
Example #13
 def get_similarity(self,lda, query_vector):
     index = similarities.MatrixSimilarity(lda[self.corpus])
     sims = index[query_vector]
     return sims
Example #14
    word for word in document if '/w' not in word and '/y' not in word
    and '/u' not in word and '/c' not in word and '/k' not in word
] for document in documents]

times = defaultdict(int)
for page in ptexts:
    for word in page:
        times[word] += 1
ptexts = [[word for word in text if times[word] > 1] for text in ptexts]

dictionary = corpora.Dictionary(ptexts)
corpus = [dictionary.doc2bow(text) for text in ptexts]

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
result = similarities.MatrixSimilarity(corpus_tfidf)

data = pd.DataFrame(result[corpus_tfidf],
                    index=date_indexs,
                    columns=date_indexs)
data.to_csv("text_result.csv")

output = open("text_result_100.csv", "w")

for i in range(0, 100):
    tmp = sorted(enumerate(result[corpus_tfidf[i]]),
                 key=lambda x: x[1],
                 reverse=True)
    result100 = []
    for j, m in tmp:
        result100.append([date_indexs[j], m])
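    # A minimal sketch (the CSV layout is an assumption, since the original snippet never
    # writes to the file it opened): persist document i's ranked neighbours on one line.
    output.write(",".join("%s:%s" % (d, m) for d, m in result100) + "\n")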
Example #15
def main():
    logging.basicConfig(format='[%(levelname)s] : %(message)s',
                        level=logging.INFO)

    if (os.path.exists("output/0814.dict")):
        dictionary = corpora.Dictionary.load("output/0814.dict")
        corpus = corpora.MmCorpus("output/0814.mm")
        logging.info("Load model success")
    else:
        logging.info("Please run the train2.py to create dict & data flow")

    # Create tf-idf model
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    # Transfer to LSI model
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=40)
    corpus_lsi = lsi[corpus_tfidf]  # LSI (latent semantic indexing)
    lsi.save('output/0814.lsi')
    corpora.MmCorpus.serialize('output/0814_lsi.mm', corpus_lsi)
    """
    print("LSI topics:")
    results = lsi.print_topics(5)
    for result in results:
        print(result)
    """

    # test_data = ''
    # with open('input/test.txt', 'r', encoding='utf-8') as f:
    #     for line in f:
    #         words = jieba.cut(line)
    #         test_data += ' '.join(words)
    #
    # print(test_data.split())

    test_data = []
    init_stopword()
    test_data = getTestData('input/test.txt')
    test_data_seg = getSingleSegment(test_data)

    vec_bow = dictionary.doc2bow(test_data_seg)
    vec_lsi = lsi[vec_bow]

    print("\nArticle:\n%s" % test_data)

    # Create index
    index = similarities.MatrixSimilarity(lsi[corpus])
    index.save("output/0814.index")

    # Similarity
    sims = index[vec_lsi]
    sims = sorted(enumerate(sims), key=lambda item: -item[1])

    print("result:")
    print(sims[:5])

    # Print results
    articles = getArticle()
    for idx in sims[:3]:
        print("\nSimilar Article:\n", articles[idx[0]])
        print("\nSimilarity:", idx[1])
Example #16
def visual():
    ''' Get the data, build a similarity matrix depending on the chosen algorithm,
        and feed it to a template for visualization.
    '''
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    ALGORITHM = request.form['algorithm']
    LEVEL = request.form['level']
    DOC_COUNT = int(request.form['num-of-docs'])

    DOCUMENTS = []
    for i in range(DOC_COUNT):
        DOCUMENTS.append(request.form['document' + str(i + 1)])

    raw_sentences = []

    if LEVEL == "sentence":
        for each in DOCUMENTS:  # raw sentences will be each document split into sentences
            raw_sentences += tokenizer.tokenize(each.decode('utf8').strip())
    else:
        raw_sentences = DOCUMENTS  # raw sentences will be the whole documents themselves
    matrix = []
    if ALGORITHM == "TF-IDF":
        # Need to write functions for each. Wrote for TF-IDF.
        tfidf = TfidfVectorizer().fit_transform(raw_sentences)
        matrix = (tfidf * tfidf.T).A

    # For each algorithm the idea is to form a matrix of similarities.
    #---------
    #Algo 2:Latent Semantic Indexing

    if ALGORITHM == "LSI":
        #added by sneha git:coder477 .
        texts = []
        matrix = np.zeros(shape=(len(raw_sentences), len(raw_sentences)))
        for each in raw_sentences:
            texts.append(document_to_wordlist(each))

        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        lsii = models.LsiModel(corpus)

        matrix = np.zeros(shape=(len(raw_sentences), len(raw_sentences)))

        for i in range(len(raw_sentences)):
            vec = corpus[i]
            doc = raw_sentences[i]

            vec_bow = dictionary.doc2bow(doc.lower().split())
            vec_lsi = lsii[vec_bow]  # convert the query to LSI space

            index = similarities.MatrixSimilarity(lsii[corpus])
            sims = index[
                vec_lsi]  # perform a similarity query against the corpus
            cosine = list(enumerate(sims))
            for j in range(len(raw_sentences)):
                matrix[i][j] = cosine[j][1]

    #---------
    #Algo 3
    if ALGORITHM == "WORDNET":
        print("here---------------------------")
        matrix = []
        for each in range(len(raw_sentences)):
            li = []
            for each1 in range(len(raw_sentences)):
                li.append(0)
            matrix.append(li)
        for i in range(0, len(raw_sentences)):
            for j in range(0, len(raw_sentences)):
                input1 = raw_sentences[i].encode('ascii', 'ignore')
                input2 = raw_sentences[j].encode('ascii', 'ignore')

                input1_nps = nps(input1)
                input2_nps = nps(input2)
                common_nps = common1(input1_nps, input2_nps)
                lsv_input1 = lsv(common_nps, input1_nps)
                lsv_input2 = lsv(common_nps, input2_nps)
                matrix[i][j] = cosine_similarity(lsv_input1, lsv_input2)
    #---------
    #Algo 4
    #Got pretrained vectors from GIT.
    #added by sneha git:coder477 .
    if ALGORITHM == "WORD2VEC":
        word_vector = load_word2vec('static\\vectors')
        matrix = []
        for each in range(len(raw_sentences)):
            li = []
            for each1 in range(len(raw_sentences)):
                li.append(0)
            matrix.append(li)
        for i in range(0, len(raw_sentences)):
            for j in range(0, len(raw_sentences)):
                sen1 = raw_sentences[i]
                sen2 = raw_sentences[j]
                sen1_words = document_to_wordlist(sen1)
                sen2_words = document_to_wordlist(sen2)
                sen1_vectors = []
                for each in sen1_words:
                    if each in word_vector:
                        sen1_vectors.append(word_vector[each])
                sen1_vector = np.array(sen1_vectors).sum(axis=0)
                sen2_vectors = []
                for each in sen2_words:
                    if each in word_vector:
                        sen2_vectors.append(word_vector[each])
                sen2_vector = np.array(sen2_vectors).sum(axis=0)
                matrix[i][j] = cosine_similarity(sen1_vector,
                                                 sen2_vector)[0][0]

    #---------
    #Forming nodes and links for graph.
    #code might as well be same for all algos.
    #Refine note : Think of creating private funcs and moving code.
    force = {}
    force["nodes"] = []
    force["links"] = []
    for each in raw_sentences:
        temp = {}
        temp["name"] = each
        temp["length"] = len(document_to_wordlist(each))
        force["nodes"].append(temp)
    for ((i, _), (j, _)) in itertools.combinations(enumerate(raw_sentences),
                                                   2):
        temp = {}
        temp["source"] = i
        temp["target"] = j
        temp["value"] = matrix[i][j]
        force["links"].append(temp)
    graph = json.dumps(force)
    wordlist = []
    for each in raw_sentences:
        wordlist += document_to_wordlist(each)
    c = Counter(wordlist)
    wordcloud = []
    for each in c:
        temp = {}
        temp["text"] = each
        temp["size"] = c[each] * 20
        wordcloud.append(temp)
    wordcloud = json.dumps(wordcloud)
    return render_template('visual.html',
                           graph=graph,
                           sentences=raw_sentences,
                           wordcloud=wordcloud)
Example #17
def LDA():
    train = []  # training data
    fp = codecs.open(r'F:\github\WBFL\uploadpath\output\output.txt',
                     'r',
                     encoding='utf-8')
    for line in fp:
        line = line.split()
        train.append([w for w in line])

    dictionary = corpora.Dictionary(train)  # build the dictionary
    corpus = [dictionary.doc2bow(text) for text in train]  # sparse bag-of-words vector for each text
    tfidf = models.TfidfModel(corpus)  # compute tf-idf statistics
    corpus_tfidf = tfidf[corpus]

    # Feed the tf-idf vectors into an LSI model; num_topics is the number of topics to generate, i.e. the number of column vectors from the SVD; id2word is the corpus dictionary
    lsi = models.LsiModel(corpus_tfidf, num_topics=50, id2word=dictionary)
    topic_result = [a for a in lsi[corpus_tfidf]]  # the input indexed through lsi is the tf-idf vectors
    print(lsi)  # print the LSI Model topic_result
    # print(lsi.print_topics(num_topics=50, num_words=5))   # print the topics with 5 related keywords each; the coefficient in front of a keyword is a weight, not a probability

    similarity = similarities.MatrixSimilarity(
        lsi[corpus_tfidf])  # compute document-to-document similarity from the LSI space
    # print(list(similarity))

    #  alpha and eta are the α and β in the LDA equations; minimum_probability discards topics whose probability is below the given value (e.g. 0.001).
    lda = models.LdaModel(corpus_tfidf,
                          num_topics=50,
                          id2word=dictionary,
                          alpha='auto',
                          eta='auto',
                          minimum_probability=0.001)

    # for doc_topic in lda.get_document_topics(corpus_tfidf):  # yields the topic distribution of each document
    #     print(doc_topic)

    with open(r'F:\github\WBFL\uploadpath\output\wordlistOutput.txt',
              'w',
              encoding='utf-8') as f1:
        for topic_id in range(50):
            print('Topic', topic_id)
            # print(lda.get_topic_terms(topicid=topic_id))  # word distribution of this LDA topic; shows 10 words by default
            print(lda.show_topic(topicid=topic_id))
            word_list = lda.show_topic(topicid=topic_id)
            for i in range(10):
                f1.write(word_list[i][0] + '\n')

    a = np.array(list(similarity))
    result_index = a > 0.99000000
    # print(result_index)

    inputs = open(r'F:\github\WBFL\uploadpath\input\input.txt',
                  'r',
                  encoding='utf-8')
    text_list = inputs.readlines()
    # print(text_list)
    count = len(text_list)
    for line in range(count):
        nowline = count - 1 - line
        num = 0
        for item in result_index[nowline][nowline:count - 1]:
            if item and num != 0:
                text_list[nowline] = text_list[nowline + num].replace(
                    '\n', '\\n') + text_list[nowline]
                text_list[nowline + num] = ''
            num += 1
    # print(text_list)

    with open(r'F:\github\WBFL\uploadpath\output\finaloutput.txt',
              'w',
              encoding='utf-8') as f:
        for text in text_list:
            f.write(text)
Example #18
for choice in (models.LsiModel, models.LdaModel):
    model = choice(corpus_tfidf, id2word=dictionary, num_topics=2)
    corpus_mod = model[
        corpus_tfidf]  # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
    fname = '/tmp/%s' % repr(choice)
    model.save(fname)  # same for tfidf, lda, ...
    model = choice.load(fname)

    # topic
    model.print_topics(2)
    for doc in corpus_mod:  # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
        print doc

    # test example
    doc = "Human computer interaction"
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_mod = model[vec_bow]  # convert the query to MODEL space
    print vec_mod

    # Initializing query structures
    index = similarities.MatrixSimilarity(
        model[corpus])  # transform corpus to MODEL space and index it
    index.save('/tmp/deerwester.index')
    index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')

    # Performing queries
    sims = index[vec_mod]  # perform a similarity query against the corpus
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    print sims  # print sorted (document number, similarity score) 2-tuples
Example #19
content.close()
# textdict is the dictionary
# textcorpus is the collection of individual documents, i.e. the corpus
textdict = corpora.Dictionary(corpus)  # maps words to ids, e.g. {"data mining": 0, "basketball": 1}

# Use the dictionary to convert the words in the corpus to numeric ids
# The result looks like [[(0,1),(1,4),(2,2)],[(1,3),(2,2)],[(3,1),(4,1),(5,1)]]
# In each tuple the first element is the word id and the second is its count
textcorpus = [textdict.doc2bow(i) for i in corpus]
# model = models.ldamodel.LdaModel(
# 	textcorpus,num_topics=3,id2word = textdict)
# topics = [model[c] for c in textcorpus]
# print topics
# for i in range(3):
# 	print model.print_topic(i)

# tf-idf model
tfidf = models.TfidfModel(textcorpus)
corpus_tfidf = tfidf[textcorpus]

# LSA model
lsi = models.LsiModel(corpus_tfidf, id2word=textdict, num_topics=2)
corpus_lsi = lsi[textcorpus]
# print lsi.print_topics(2)
print corpus_lsi
index = similarities.MatrixSimilarity(lsi[textcorpus])
sims = index[corpus_lsi]
print list(enumerate(sims))
sims = sorted(enumerate(sims), key=lambda item: -item[1])
print sims
Example #20
def computeSimilarity_lsm(X, query):
    index = similarities.MatrixSimilarity(X)
    sims = index[query]
    scoreList = list(enumerate(sims))
    rankList = [scoreList[i][1] for i in range(len(scoreList))]
    return rankList
Example #21
def main():
    board = 'Japan_Travel'
    conn = MongoClient('localhost', 27017)
    db = conn['bdhackthon']
    collection = db[board]
    d_start = datetime.datetime(2016, 1, 1, 0)
    d_end = datetime.datetime(2016, 3, 1, 0)

    t_start = time.time()
    # build corpus
    if os.path.exists('corpus_data.json'):
        corpus_data = load_json('corpus_data.json')
    else:
        corpus_data = {}
    corpus = []
    articles = collection.find(
        {
            "$or": [{
                "article_title": {
                    "$regex": "\[[遊食]記\].*(東京)+.*"
                },
                "date": {
                    "$gt": d_start,
                    "$lt": d_end
                }
            }, {
                "article_title": {
                    "$regex": "\[住宿\].*(東京)+.*"
                },
                "date": {
                    "$gt": d_start,
                    "$lt": d_end
                }
            }]
        },
        no_cursor_timeout=True).batch_size(20)
    print('Total:', articles.count())
    index_aid = {}  # map index of corpus to article_id
    i = 0
    tmp_data = {}
    for article in articles:
        #if i==80:
        #    break
        tmp_data[article['article_id']] = (article['article_title'],
                                           article['content'])
        index_aid[str(i)] = article['article_id']
        print(i)
        #print(article, article['article_title'])
        print(article['article_title'])
        #print(article['content'])
        #print(article)
        if article['article_id'] in corpus_data.keys():
            corpus.append(corpus_data[article['article_id']]['feature'])
            corpus_data[article['article_id']]['index'] = i
            i = i + 1
            continue
        else:
            doc = []
            doc += splitWord(article['article_title'])
            doc += splitWord(article['content'])
            corpus_data[article['article_id']] = {
                'feature': doc,
                'topic': [],
                'index': i
            }
            corpus.append(doc)
            i = i + 1
        #input()
    t_end = time.time()
    write_json(corpus_data, 'corpus_data.json')
    print('time elapsed for building corpus: %f minutes' %
          ((t_end - t_start) / 60.0))

    dictionary = corpora.Dictionary(corpus)
    stoplist = [
        line.lower().split()[0] for line in open('stop_words.txt', 'r')
    ]
    # remove stop words and words that appear only once
    stop_ids = [
        dictionary.token2id[stopword] for stopword in stoplist
        if stopword in dictionary.token2id
    ]
    once_ids = [
        tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1
    ]
    #once_ids = []
    dictionary.filter_tokens(
        stop_ids +
        once_ids)  # remove stop words and words that appear only once
    dictionary.compactify(
    )  # remove gaps in id sequence after words that were removed
    #print(dictionary)
    #print(dictionary.dfs)
    #pprint(dictionary.token2id)
    dictionary.save('train.dict')  # store the dictionary, for future reference

    corpus_bow = [dictionary.doc2bow(doc) for doc in corpus]
    corpora.MmCorpus.serialize('train.mm',
                               corpus_bow)  # store to disk, for later use

    tfidf = models.TfidfModel(corpus_bow)  # initialize (train) a model
    tfidf.save('train.tfidf')
    corpus_tfidf = tfidf[corpus_bow]

    lda = models.ldamodel.LdaModel(corpus=corpus_tfidf,
                                   id2word=dictionary,
                                   alpha='auto',
                                   num_topics=50)
    #print(lda.print_topics(50))
    lda.save('train.lda')
    corpus_lda = lda[corpus_tfidf]
    index = similarities.MatrixSimilarity(
        corpus_lda)  # transform corpus to LDA space and index it
    index.save('train.index')

    topic = {}
    for i in range(len(corpus_lda)):
        #print(corpus_lda[i])
        #print(corpus[i])
        key = max(corpus_lda[i], key=lambda x: abs(x[1]))[0]
        if key in topic.keys():
            topic[key].append(i)
        else:
            topic[key] = [i]
        #input()

    vec_topic = {}
    print('%d topics identified. Classify them:' % len(topic))

    old_corpus_data = load_json('old_model/corpus_data.json')
    for k, v in topic.items():
        print('Group %s (%d):' % (k, len(v)))
        for c_index in v:
            a_id = index_aid[str(c_index)]
            #if a_id in corpus_data.keys():
            if a_id in old_corpus_data.keys():
                #print(corpus_data[a_id]['topic'])
                if not old_corpus_data[a_id]['topic']:
                    #print(corpus_data[a_id]['feature'])
                    print(tmp_data[a_id])
                    line = input('Enter topics, separate by space: ')
                    corpus_data[a_id]['topic'] = line.split(' ')
                else:
                    corpus_data[a_id]['topic'] = old_corpus_data[a_id]['topic']
            else:
                raise ValueError('Empty article_id')
        write_json(corpus_data, 'corpus_data_labeled.json')
Example #22
def generateModel(fileName='RawData20160307.json'):
    RawData = open(fileName, 'r')
    stopWords = getStopWords()
    print "preprocessing the RawData...",
    texts = [
        simpleTokenize(
            json.loads(line)['AppName'] + ' ' +
            json.loads(line)['Description'], stopWords) for line in RawData
    ]
    RawData.close()
    print "preprocessing the RawData Done!"

    print "generating a dictionary...",
    dictionary = corpora.Dictionary(texts)
    once_ids = [
        tokenid for tokenid, docfreq in dictionary.dfs.iteritems()
        if docfreq == 1
    ]
    dictionary.filter_tokens(once_ids)
    dictionary.compactify()
    print "generating a dictionary Done!"
    # create a dir called modelfile
    print dictionary
    datapath = os.path.join(os.getcwd(), 'data')
    if not os.path.isdir(datapath):
        os.mkdir(datapath)
    dicFilePath = os.path.join(datapath, 'appdesc.dict')
    if os.path.isfile(dicFilePath):
        os.remove(dicFilePath)
    dictionary.save(dicFilePath)

    print "generating a Corpus...",
    corpus = [dictionary.doc2bow(text) for text in texts]
    mmFlePath = os.path.join(datapath, 'appdesc.mm')
    if os.path.isfile(mmFlePath):
        os.remove(mmFlePath)
    corpora.MmCorpus.serialize(mmFlePath, corpus, progress_cnt=10000)
    print "Done!"

    # Creating a transformation,train the TF-IDF model
    print "training the tfidf model...",
    tfidf = models.TfidfModel(corpus)  # step 1 -- initialize a model
    tfidfFilePath = os.path.join(datapath, 'model.tfidf_model')
    if os.path.isfile(tfidfFilePath):
        os.remove(tfidfFilePath)
    tfidf.save(tfidfFilePath)
    corpus_tfidf = tfidf[corpus]
    print "Done!"

    print "Mapping from tfidf to lsi...",
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
    corpus_lsi = lsi[corpus_tfidf]
    lsiFilePath = os.path.join(datapath, 'model.lsi')
    if os.path.isfile(lsiFilePath):
        os.remove(lsiFilePath)
    lsi.save(lsiFilePath)
    print "Done!"

    # transform corpus to LSI space and index it
    print "Generating the index...",
    index = similarities.MatrixSimilarity(corpus_lsi)
    indexFilePath = os.path.join(datapath, 'appdesc.index')
    if os.path.isfile(indexFilePath):
        os.remove(indexFilePath)
    index.save(indexFilePath)
    print "done!"
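

# A minimal query sketch (assumption: it mirrors the preprocessing above only loosely,
# tokenizing the query with a plain lower().split() instead of simpleTokenize):
# load the artifacts saved by generateModel() and rank the indexed app descriptions.
def querySimilarApps(query, datapath=os.path.join(os.getcwd(), 'data')):
    dictionary = corpora.Dictionary.load(os.path.join(datapath, 'appdesc.dict'))
    tfidf = models.TfidfModel.load(os.path.join(datapath, 'model.tfidf_model'))
    lsi = models.LsiModel.load(os.path.join(datapath, 'model.lsi'))
    index = similarities.MatrixSimilarity.load(os.path.join(datapath, 'appdesc.index'))
    vec_lsi = lsi[tfidf[dictionary.doc2bow(query.lower().split())]]
    return sorted(enumerate(index[vec_lsi]), key=lambda item: -item[1])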
Example #23
    words = ' '.join(jieba.cut(line)).split(' ')
    texts.append(words)

frequency = defaultdict(int)  # build a frequency dictionary
for text in texts:
    for word in text:
        frequency[word] += 1
texts = [[word for word in text if frequency[word] > 1] for text in texts]

dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

index = similarities.MatrixSimilarity(corpus_tfidf)


def get_similar(token):
    words = []
    token = ' '.join(jieba.cut(token)).split(' ')
    for word in token:
        words.append(word.lower())
    print(words)
    new_vec = dictionary.doc2bow(words)
    new_vec_tfidf = tfidf[new_vec]  # convert the query document into its tf-idf representation
    sims = index[new_vec_tfidf]
    sims_list = sims.tolist()
    if max(sims_list) < 0.5:
        return 'NO DATA'
    else:
Example #24
                res = get_cosine(A, B)
                if (res > 0.95):
                    UrlList.remove(UrlList[i])
            except IndexError:
                pass


#######################################
######### APPLICATION TESTS ###########
#######################################

fileList = getFileList()
deleteSimilarPage(fileList)

#To determine the page closest to a given query (the most semantically relevant page for that query)

from gensim import corpora, similarities
# corpus is your text, tokenized
dictionary = corpora.Dictionary(corpus)
# transform the corpus into vectors
# Bag-of-words (BOW) represents each document as a vector of (word id, count) pairs
vectors_corpus = [dictionary.doc2bow(text) for text in corpus]
# Build your similarity matrix
matrix = similarities.MatrixSimilarity(vectors_corpus)
# Query is your search query
query = "Does it work"
vector_query = dictionary.doc2bow(query.lower().split())
similarity = matrix[vector_query]
# Now we see which document is closer to the search query
print(list(enumerate(similarity)))
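
# A minimal follow-up sketch (assumption): pick the document closest to the search query.
best_doc, best_score = max(enumerate(similarity), key=lambda item: item[1])
print(best_doc, best_score)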
Example #25
    dictionary = corpora.Dictionary(texts)
    V = len(dictionary)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpus_tfidf = models.TfidfModel(corpus)[corpus]

    print('TF-IDF:')
    for c in corpus_tfidf:
        print(c)

    print('\nLSI Model:')
    lsi = models.LsiModel(corpus_tfidf, num_topics=2, id2word=dictionary)
    topic_result = [a for a in lsi[corpus_tfidf]]
    pprint(topic_result)
    print('LSI Topics:')
    pprint(lsi.print_topics(num_topics=2, num_words=5))
    similarity = similarities.MatrixSimilarity(
        lsi[corpus_tfidf])  # similarities.Similarity()
    print('Similarity:')
    pprint(list(similarity))

    print('\nLDA Model:')
    num_topics = 2
    lda = models.LdaModel(corpus_tfidf,
                          num_topics=num_topics,
                          id2word=dictionary,
                          alpha='auto',
                          eta='auto',
                          minimum_probability=0.001)
    doc_topic = [doc_t for doc_t in lda[corpus_tfidf]]
    print('Document-Topic:\n')
    pprint(doc_topic)
    for doc_topic in lda.get_document_topics(corpus_tfidf):
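        # Assumed loop body (the original snippet is truncated here): print each document's topic distribution.
        print(doc_topic)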
Example #26
corpus_tfidf = corpora.MmCorpus('../Save/scopus_corpus.mm')
#index = similarities.MatrixSimilarity.load('../Save/scopus_research.index')
''' Similarities between pairs of documents '''

similarities_between_pairs = True
if similarities_between_pairs == True:
    query = "The challenge of a purposeful design addressed in this article is to align offshore energy systems not only with technical and economic values like efficiency and profitability but also with moral and social values more generally We elaborate a theoretical framework that allows us to make a systematic inventory of embedded values of offshore energy systems and relate them to their societal acceptability By characterizing both objects and subjects of acceptability we shed light on ways to identify areas of value conflicts that must be addressed in purposeful design We suggest the capabilities approach as a normative theory to deal with the arising value conflicts"
    split_lower_query = query.lower().split()
    stopped_query = [f for f in split_lower_query if not f in en_stop]
    stemmed_query = [p_stemmer.stem(h) for h in stopped_query]

    vec_bow = dictionary.doc2bow(stemmed_query)
    vec_lda = lda[vec_bow]

    index = similarities.MatrixSimilarity(
        lda[corpus_tfidf]
    )  # only possible if the total memory required fits in RAM; otherwise use similarities.Similarity. Can also add: ,num_features=len(dictionary)
    index.save('../Save/scopus_research.index')

    sims = index[vec_lda]

    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    pprint(sims)
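
    # Alternative sketch (assumption): for an index too large for RAM, the disk-backed
    # similarities.Similarity class shards the index on disk under the given path prefix.
    index_on_disk = similarities.Similarity('../Save/scopus_shards', lda[corpus_tfidf],
                                            num_features=len(dictionary))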
''' Get topics '''

num_topics = 100
num_words = 5

#pprint(lda.get_topic_terms(0)) # terms for one topic
#pprint(lda.show_topics(num_topics, num_words))
#pprint(lda.print_topics())
Example #27
    print("---for doc in corpus_lsi")
    for doc in corpus_lsi:  # both the bow->tfidf and tfidf->lsi transformations are executed here, on the fly
        print(doc)

    lsi.save('/tmp/model.lsi')  # same for tfidf, lda, ...
    lsi = models.LsiModel.load('/tmp/model.lsi')

    dictionary = corpora.Dictionary.load('/tmp/deerwester.dict')
    corpus = corpora.MmCorpus('/tmp/deerwester.mm')

    print("---corpus")
    print(corpus)
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
    doc = "Human computer interaction"
    vec_bow = dictionary.doc2bow(doc.lower().split())
    vec_lsi = lsi[vec_bow]  # convert the query to LSI space
    print("---vec_lsi")
    print(vec_lsi)
    index = similarities.MatrixSimilarity(lsi[corpus])  # transform the corpus to LSI space and index it
    index.save('/tmp/deerwester.index')
    index = similarities.MatrixSimilarity.load('/tmp/deerwester.index')
    sims = index[vec_lsi]  # perform a similarity query against the corpus

    print("list(enumerate(sims))"
          )  # print (document_number, document_similarity) 2-tuples
    print(list(enumerate(
        sims)))  # print (document_number, document_similarity) 2-tuples
    sims = sorted(enumerate(sims), key=lambda item: -item[1])
    print("---sims")
    print(sims)
Example #28
def main(transcript_dir,testfile,xls_pathname):
    reload(sys)
    sys.setdefaultencoding('utf8')

    print ('\ntranscript_dir:', transcript_dir)

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.WARNING)

    documents, doc_index= create_doc_index(transcript_dir)

    # Remove common words and tokenize
    stoplist = set('for a of the and to in'.split())

    texts = [[word for word in document.lower().split() if word not in stoplist]
             for document in documents]

    # Remove words that appear only once

    frequency = defaultdict(int)

    for text in texts:
        for token in text:
            frequency[token] += 1

    texts = [[token for token in text if frequency[token] > 1]
             for text in texts]

    print("\nCreate and save dictionary:")
    dictionary = corpora.Dictionary(texts)
    dictionary.save('./tmp/TopicClassifier.dict') # store the dictionary, for future reference

    #print(dictionary)
    #print(dictionary.token2id)

    print ('\nOpen testfile:', testfile)

    tf = open(testfile, 'r')
    new_doc=(str.decode(tf.read(), "UTF-8", "ignore"))
    tf.close()

    print ("\nQuery document:\n")
    print(new_doc)

    print ("\nVector representation of query document:\n")
    new_vec = dictionary.doc2bow(new_doc.lower().split())
    print(new_vec)

    print("\nCreate and save corpus:")
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('./tmp/TopicClassifier.mm', corpus)

    print("\nCorpus:")
    print(corpus)

    print("\nBuild LSI Model:")
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)

    # Transform testfile document (question) to bag of words
    vec_bow = new_vec
    vec_lsi = lsi[vec_bow] # convert the query to LSI space
    print(vec_lsi)

    # Transform corpus to LSI space and index it
    index = similarities.MatrixSimilarity(lsi[corpus])
    #index = similarities.MatrixSimilarity.load(save_index)

    print("Creating sims...")
    sims = index[vec_lsi] # perform a similarity query against the corpus
    print(list(enumerate(sims))) # print (document_number, document_similarity) 2-tuples

    print("Sorting sims...")
    sims_sorted = sorted(enumerate(sims), key=lambda item: -item[1])
    print(sims_sorted) # print sorted (document number, similarity score) 2-tuples

    print("Doc Index...")
    print(doc_index)

    print("Combined Results...")
    #[ print ('Seq: {} CaseID: {} File: {} Similarity: {}'.format(item[0][0], item[0][1], item[0][2], item[1])) for item in zip(doc_index,sims)]

    print ("Len(sims): {}  Len(Index): {}".format(len(sims), len(doc_index)))

    combined=[]
    for item in zip(doc_index, sims):
        combined.append((item[0][1], item[0][2], item[1]))

    combined_sorted=sorted(combined, key=lambda x: -x[-1])

    for item in combined_sorted:
        print(item)

    tw=TagWorksheet(xls_pathname)

    s = []

    #print ("tw.get.tags(825):", tw.get_tags(825))


    for item in combined_sorted[0:4]:
        tl=tw.get_tags(int(item[0]))
        print(item,'\t',tl)
        s.append(set(tl))

    print("\ntag sets:",s)

    y = set.intersection(*s)

    print ("\ntag set intersection:",y)
Example #29
np.array(corpus).shape

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)


p=[]
for i in range(0,len(documents)):
    doc1 = documents[i]
    vec_bow2 = dictionary.doc2bow(doc1.lower().split())
    vec_lsi2 = lsi[vec_bow2] # convert the query to LSI space
    p.append(vec_lsi2)
    
p
    
index = similarities.MatrixSimilarity(lsi[corpus]) # transform corpus to LSI space and index it

index.save('/tmp/deerwester4.index')
index = similarities.MatrixSimilarity.load('/tmp/deerwester4.index')

#################

import gensim
import numpy as np
import matplotlib.colors as colors
import matplotlib.cm as cmx
import matplotlib as mpl

matrix1 = gensim.matutils.corpus2dense(p, num_terms=4)
matrix3=matrix1.T
matrix3
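
# A minimal plotting sketch (assumption): scatter the documents over the first two
# LSI dimensions stored in matrix3.
import matplotlib.pyplot as plt
plt.scatter(matrix3[:, 0], matrix3[:, 1])
plt.show()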
Example #30
def NMF(request):
    query = ""
    query_response = None
    file_list = None
    file_list_dictionary = None
    search_result_dictionary = None
    documents = []
    for counter in range(1033):
        temp = open("IR/" + str(counter + 1) + ".txt", 'r')
        documents.append(temp.read())
        temp.close()
    stop_words = stopwords.words('english')
    texts = [[
        word for word in document.lower().split() if word not in stop_words
    ] for document in documents]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('/tmp/ir.mm', corpus)
    nmfmodel = nmf.Nmf(corpus,
                       num_topics=43,
                       id2word=dictionary,
                       normalize=True)
    if request.method == "POST":
        form = SearchForm(request.POST)
        if form.is_valid():
            query_response = list()
            user_query = form.save()
            user_query.save()
            query = user_query.query
            doc = user_query.query
            index = similarities.MatrixSimilarity(nmfmodel[corpus])
            vec_bow = dictionary.doc2bow(doc.split())
            vec_nmf = nmfmodel[vec_bow]
            sims = index[vec_nmf]
            sims = sorted(enumerate(sims, 1), key=lambda item: -item[1])
            file_list = list()
            for element in sims[0:5]:
                file_list.append(element[0])
            temp = None
            for text in file_list:
                temp = open("IR/" + str(text) + ".txt", 'r')
                query_response.append(temp.read())
                temp.close()
            #print(query_response)
            file_list_dictionary = dict()
            file_list_dictionary = {
                i: file_list[i - 1]
                for i in range(1,
                               len(file_list) + 1)
            }
            search_result_dictionary = {
                i: query_response[i - 1]
                for i in range(1,
                               len(query_response) + 1)
            }
    else:
        form = SearchForm()
    return render(
        request, "nmf.html", {
            'form': form,
            'query': query,
            'answer': file_list,
            'search_results': query_response,
            'file_dictionary': file_list_dictionary,
            'search_result_dictionary': search_result_dictionary
        })