Example #1
import math

from gensim import corpora, models, similarities


def recommend(docs_path):
    """Build a BOW/TF-IDF index over citation contexts read from docs_path and
    evaluate recommendation quality (top-1/5/10 hits, MAP@5, NDCG@5).
    """

    with open(docs_path) as f:
        lines = f.readlines()

    test = []
    train_aids = []
    train_texts = []
    tmp_bag = []
    tmp_bag_current_aid = lines[0].split(',')[0]
    texts = []  # for dictionary generation
    adjacent_cit_map = {}
    for idx, line in enumerate(lines):
        aid, adjacent, in_doc, text = line.split(',')
        # create adjacent map for later use in eval
        if aid not in adjacent_cit_map:
            adjacent_cit_map[aid] = []
        if len(adjacent) > 2:
            adj_cits = adjacent[1:-1].split('|')
            for adj_cit in adj_cits:
                if adj_cit not in adjacent_cit_map[aid]:
                    adjacent_cit_map[aid].append(adj_cit)
        # fill texts
        text = text.replace('[]', '')
        texts.append(text.split())
        if aid != tmp_bag_current_aid or idx == len(lines) - 1:
            # tmp_bag now contains all lines sharing ID tmp_bag_current_aid
            num_contexts = len(tmp_bag)
            sub_bags_dict = {}
            for item in tmp_bag:
                item_in_doc = item[0]
                item_text = item[1]
                if item_in_doc not in sub_bags_dict:
                    sub_bags_dict[item_in_doc] = []
                sub_bags_dict[item_in_doc].append(item_text)
            order = sorted(sub_bags_dict,
                           key=lambda k: len(sub_bags_dict[k]),
                           reverse=True)
            # ↑ keys of sub_bags_dict, ordered from largest bag to smallest
            min_num_train = math.floor(num_contexts * 0.8)
            train_texts_comb = []
            test_texts = []
            # TODO: how to do k-fold cross val with this?
            for jdx, sub_bag_key in enumerate(order):
                sb_texts = sub_bags_dict[sub_bag_key]
                if len(train_texts_comb
                       ) > min_num_train or jdx == len(order) - 1:
                    test_texts.extend(sb_texts)
                else:
                    train_texts_comb.extend(sb_texts)
            l_tr = len(train_texts_comb)
            l_te = len(test_texts)
            l_tr_perc = (l_tr / (l_tr + l_te)) * 100
            l_te_perc = (l_te / (l_tr + l_te)) * 100
            test.extend([(tmp_bag_current_aid, txt) for txt in test_texts])
            # because we use BOW we can just combine train docs here
            train_text_combined = ' '.join(txt for txt in train_texts_comb)
            train_aids.append(tmp_bag_current_aid)
            train_texts.append(train_text_combined.split())
            # reset bag
            tmp_bag = []
            tmp_bag_current_aid = aid
        tmp_bag.append([in_doc, text])
    # average number of adjacent docs
    # adj_sum = 0
    # for k, v in adjacent_cit_map.items():
    #     adj_sum += len(v)
    # print(adj_sum/len(adjacent_cit_map))
    dictionary = corpora.Dictionary(texts)
    # dictionary.save('1712_test.dict')
    num_unique_tokens = len(dictionary.keys())
    # print(dictionary.token2id)
    corpus = [dictionary.doc2bow(text) for text in train_texts]
    # corpora.MmCorpus.serialize('1712_test_corpus.mm', corpus)
    # print(corpus)
    tfidf = models.TfidfModel(corpus)

    num_cur = 0
    num_top = 0
    num_top_5 = 0
    num_top_10 = 0
    ndcg_sum_5 = 0
    map_sum_5 = 0
    print('test set size: {}\n- - - - - - - -'.format(len(test)))
    index = similarities.SparseMatrixSimilarity(tfidf[corpus],
                                                num_features=num_unique_tokens)
    for tpl in test:
        test_aid = tpl[0]
        test_text = tpl[1].split()
        test_bow = dictionary.doc2bow(test_text)
        sims = index[tfidf[test_bow]]
        sims_list = list(enumerate(sims))
        sims_list.sort(key=lambda tup: tup[1], reverse=True)
        # print('correct: {}'.format(test_aid))
        # print('- - - - - - - -')
        # for idx, sim in enumerate(sims_list[:11]):
        #     pre = '{} '.format(idx)
        #     if train_aids[sim[0]] == test_aid:
        #         pre += '✔ '
        #     else:
        #         pre += '  '
        #     print('{}{}: {}'.format(pre, sim[1], train_aids[sim[0]]))
        rank = len(sims_list)
        for idx, sim in enumerate(sims_list):
            if train_aids[sim[0]] == test_aid:
                rank = idx + 1
                break
            if idx >= 10:
                break
        dcg = 0
        idcg = 0
        num_rel = 1 + len(adjacent_cit_map[test_aid])
        for i in range(5):
            placement = i + 1
            sim = sims_list[i]
            result_aid = train_aids[sim[0]]
            if result_aid == test_aid:
                relevance = 1
            elif result_aid in adjacent_cit_map[test_aid]:
                relevance = .5
            else:
                relevance = 0
            denom = math.log2(placement + 1)
            dcg_numer = math.pow(2, relevance) - 1
            dcg += dcg_numer / denom
            if placement == 1:
                ideal_rel = 1
            elif placement <= num_rel:
                ideal_rel = .5
            else:
                ideal_rel = 0
            idcg_numer = math.pow(2, ideal_rel) - 1
            idcg += idcg_numer / denom
        ndcg = dcg / idcg
        if rank == 1:
            num_top += 1
        if rank <= 5:
            num_top_5 += 1
            map_sum_5 += 1 / rank
            ndcg_sum_5 += ndcg
        if rank <= 10:
            num_top_10 += 1
        num_cur += 1
        print('- - - - - {}/{} - - - - -'.format(num_cur, len(test)))
        print('#1: {}'.format(num_top))
        print('in top 5: {}'.format(num_top_5))
        print('in top 10: {}'.format(num_top_10))
        print('ndcg@5: {}'.format(ndcg_sum_5 / num_cur))
        print('map@5: {}'.format(map_sum_5 / num_cur))
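A minimal usage sketch for recommend: each input row is a comma-separated record of cited-article id, pipe-delimited adjacent citation ids in square brackets, citing-document id, and the citation context text (the file name below is a made-up placeholder, not from the original).

# Hypothetical input row: "A123,[A7|A9],D42,context text of the citation"
# (column order follows `aid, adjacent, in_doc, text = line.split(',')` above)
recommend('citation_contexts.csv')  # placeholder file name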
def trainDictionary(alltokens, productid, outpath):
    dictionary = corpora.Dictionary(alltokens)
    dictionary.save(os.path.join(outpath, "dictionary.dict"))
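As a brief follow-up, the dictionary written by trainDictionary can be reloaded later with corpora.Dictionary.load; the path below mirrors the save call, and outpath is a placeholder.

import os
from gensim import corpora

outpath = "./models"  # placeholder: same directory passed to trainDictionary
dictionary = corpora.Dictionary.load(os.path.join(outpath, "dictionary.dict"))
print(dictionary)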
Example #3
from gensim import corpora, models, similarities
import codecs
import json
import pyLDAvis
import pyLDAvis.gensim
with codecs.open("../input/hafez_Train3cls_cls3.txt", "r", 'UTF-8') as myfile:
    documents=myfile.readlines()

with codecs.open("../../stop-words_persian_1_fa.txt","r", 'UTF-8') as myfile:
	stoplist=myfile.read()
#textha = [[word for word in document.lower().split() if word not in stoplist]
          #for document in matns]
texts = [[word for word in document.lower().split() if word not in stoplist]
          for document in documents]
 # remove words that appear only once
all_tokens = sum(texts, [])
tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
#textha = [[word for word in text if word not in tokens_once]
 #         for text in textha]
texts = [[word for word in text if word not in tokens_once]
          for text in texts]
#loghatname = corpora.Dictionary(textha)
dictionary = corpora.Dictionary(texts)
#maincorpus = [loghatname.doc2bow(text) for text in textha]
corpus = [dictionary.doc2bow(text) for text in texts]
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=20,passes=10)

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda,corpus,dictionary)
vis
import csv
import re

from gensim import corpora
from gensim.models import ldamulticore

# read in the tab-separated id/body file
with open('./SomethingsNotRightTextOnly.txt') as f:
    reader = csv.reader(f, delimiter='\t')
    id_body = list(reader)

# separate out post
posts = [row[1] for row in id_body]


# build topic model base on posts

## make dictionary
# stop_words = set(stopwords.words('english'))
stop_words = load_stop_words()
words = [[word for word in re.split('\W+', post.lower()) if (word not in stop_words and word != "")] for post in posts]
dictionary = corpora.Dictionary(words)

## build corpus
corpus = [dictionary.doc2bow(doc) for doc in words]

## initialize lda model
lda = ldamulticore.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=40
)

## print topics
topics = lda.print_topics(num_words=10, num_topics=-1)
# for topic in topics:
#     print(topic)
Example #5
import os
import tempfile
import datetime

from flask import Flask
from six import iteritems
from gensim import corpora

app = Flask(__name__)
app.config["DEBUG"] = True

TEMP_FOLDER = tempfile.gettempdir()
print('Folder "{}" will be used to save temporary dictionary and corpus.'.
      format(TEMP_FOLDER))

# collect statistics about all tokens
dictionary = corpora.Dictionary(
    line.lower().split('|')[0].split()
    for line in open('huangke/jieba_brand_segged.txt'))

stoplist = []
# remove stop words and words that appear only once
stop_ids = [
    dictionary.token2id[stopword] for stopword in stoplist
    if stopword in dictionary.token2id
]
once_ids = [
    tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1
]

# remove stop words and words that appear only once
dictionary.filter_tokens(stop_ids + once_ids)
print(dictionary)
# freq_words = ['comment']
# for i in freq_words :
#     stopset.append(i)

text_corpus = []
for doc in fo:
    temp_doc = tokenize(doc.strip())
    current_doc = []
    for word in range(len(temp_doc)):

        if (temp_doc[word][0] not in stopset) and (
                temp_doc[word][1] == 'NN' or temp_doc[word][1] == 'NNS'
                or temp_doc[word][1] == 'NNP' or temp_doc[word][1] == 'NNPS'):
            current_doc.append(temp_doc[word][0])

    text_corpus.append(current_doc)

dictionary = corpora.Dictionary(text_corpus)
# print dictionary
#dictionary.save('myDict.dict')
# print dictionary.token2id
corpus = [dictionary.doc2bow(text) for text in text_corpus]
# print corpus

ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                           num_topics=2,
                                           id2word=dictionary,
                                           passes=100)
for topics in ldamodel.print_topics(num_topics=2, num_words=10):
    print(topics, "\n")
data_words = list(sentence_to_words(tweets_week))

# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

#lemmatization
data_lemmatized = lemmatization(data_words_bigrams)

# Create Dictionary
id2word = corpora.Dictionary(data_words_bigrams)

# id2word.filter_n_most_frequent(int(len(id2word)*0.005))

texts = data_lemmatized.copy()

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

#Build LDA Model

# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                            id2word=id2word,
#                                            num_topics=8,
#                                            random_state=100,
#                                            update_every=1,
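The commented-out model call above is cut off in the listing; this is a minimal sketch of how such an LdaModel is typically finished from the corpus and id2word built here (the chunksize and passes values are assumptions, not the original settings).

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=8,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,  # assumed value
                                            passes=10)      # assumed value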
Example #8
infolder = '../../' + DataSet + 'Data/'
outfolder = '../../' + DataSet + 'Submissions/'
savedfolder = DataSet + "Saved/"

testPostsFile = infolder + DataSet + "testPosts.json"
trainPostsFile = infolder + DataSet + "trainPosts.json"

# First, we make a dictionary of words used in the titles
with Files([open(trainPostsFile), open(testPostsFile)]) as myFiles:
    try:
        dictionary = corpora.dictionary.Dictionary.load(savedfolder +
                                                        "dictionary.saved")

    except:
        dictionary = corpora.Dictionary(doc for doc in myFiles)
        stop_ids = [
            dictionary.token2id[stopword] for stopword in stop_words
            if stopword in dictionary.token2id
        ]
        #infreq_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq < 50]
        #dictionary.filter_tokens(stop_ids + infreq_ids) # remove stop words and words that appear infrequently
        dictionary.filter_tokens(stop_ids)
        dictionary.compactify(
        )  # remove gaps in id sequence after words that were removed

        dictionary.save(savedfolder + "dictionary.saved")

    try:
        tfidf = models.tfidfmodel.TfidfModel.load(savedfolder + "tfidf.saved")
    except:
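Example #8 is truncated at the second except branch; a plausible completion, mirroring the dictionary branch above, rebuilds and caches the TF-IDF model. This is a sketch under that assumption.

        # Assumed completion: build the TF-IDF model from the title corpus and cache it
        bow_corpus = [dictionary.doc2bow(doc) for doc in myFiles]  # assumes myFiles can be iterated again
        tfidf = models.tfidfmodel.TfidfModel(bow_corpus)
        tfidf.save(savedfolder + "tfidf.saved")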
Example #9
import jieba, os
from gensim import corpora, models, similarities

train_set = []

walk = os.walk('C:\\Users\\Sun Yutian\\Desktop\\test')
for root, dirs, files in walk:
    for name in files:
        f = open(os.path.join(root, name), 'r')
        raw = f.read()
        word_list = list(jieba.cut(raw, cut_all=False, HMM=True))
        train_set.append(word_list)

dic = corpora.Dictionary(train_set)
corpus = [dic.doc2bow(text) for text in train_set]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=10)
corpus_lda = lda[corpus_tfidf]

f = open('dat', 'r')
raw = f.read()
word_list = list(jieba.cut(raw, cut_all=False, HMM=True))
vec_bow = dic.doc2bow(word_list)
vec_lda = lda[vec_bow]

index = similarities.MatrixSimilarity(lda[corpus])
sims = index[vec_lda]
print(list(enumerate(sims)))
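To make the raw output easier to read, the scores can be sorted so the closest training documents come first (a small follow-up sketch).

# Sort (document index, similarity) pairs by descending similarity and show the top 3
top_matches = sorted(enumerate(sims), key=lambda item: -item[1])[:3]
print(top_matches)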
Example #10
    while node:
        if node.feature.split(",")[0] == "名詞":  # "名詞" = noun
            keywords.append(node.surface)
        node = node.next
    golo.append(keywords)
    return keywords

if __name__ == "__main__":
    for i in line_list:
       keywords = extractKeyword(i)

#stop_word_list = ["ため","これ","それ","的","(",")","0","1","2","3","4","5","6","7","8","9","1","2","3","4","5","6","7","8","9","日","私","たち","こと","自分","自身","さん"]
#golo = [[word for word in keywords if word not in stop_word_list] for keywords in golo]

# Build the feature-word dictionary
dictionary = corpora.Dictionary(golo)

# Remove low-frequency words and words that appear in more than 20% of documents
dictionary.filter_extremes(no_below=2, no_above = 0.2)

## Build the corpus: the set of feature vectors for the whole text collection.
corpus = [dictionary.doc2bow(keywords) for keywords in golo]
corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus) # store to disk, for later use
#print(corpus)

# Instantiate and train the LDA model; the number of topics (user segments) is set here
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=5)

# Output the topics
for topic in lda.show_topics(-1, 8):
    print(topic)
Example #11
def prepare_for_modeling(data_path, model_type="LDA-KeyWords",
                         params={"TEXT_prepared_df": pd.DataFrame({}),
                                 "save_LDA_dictionary_path": "./output/lda_keywords/dictionary.pickle",
                                 "words_column": "all_key_words"
                                 },
                         verbose=1):
  if model_type == "LDA-KeyWords":
    """
    params={"TEXT_prepared_df": pd.DataFrame({}),
             "save_LDA_dictionary_path": "./output/lda_keywords/dictionary.pickle",
             "words_column": "all_key_words"
            }
    """

    if len(params['TEXT_prepared_df']) > 0:
      # load data for LDA
      df_data = params['TEXT_prepared_df']
      if verbose == 2:
        print("loaded data shape:", df_data.shape)
    else:
      if verbose == 2:
        print("No data is provided")
      return False

    words_column = params['words_column']
    df_data[words_column] = df_data[words_column].apply(lambda x: [w.replace(' ', '_') for w in x
                                                                   if len(w) > 1
                                                                   ])
    # get all unique key_words
    tmp_list = df_data[words_column].tolist()
    set_of_words = set([w for sublist in tmp_list for w in sublist])

    if verbose == 2:
      print('\nNumber of unique key-words for topic modeling dictionary:',
            len(set_of_words))

    # delete empty lists of words
    df_data = df_data[df_data[words_column].apply(len) > 0]

    # create a vocabulary for the LDA model
    dictionary = corpora.Dictionary(df_data[words_column])

    # save dictionary
    with open(params["save_LDA_dictionary_path"], 'wb') as f:
      # Pickle the LDA dictionary using the highest protocol available.
      pickle.dump(dictionary, f, pickle.HIGHEST_PROTOCOL)
    if verbose == 2:
      print("LDA dictionary file is saved to:",
            params["save_LDA_dictionary_path"])

      print('\nNumber of texts processed: ', dictionary.num_docs)
      print('Number of extracted key-words: ', len(dictionary.token2id))
      print('\nEach text is represented by a list of', len(dictionary.token2id),
            " tuples: \n\t\t(key-word's index in bag-of-words dictionary, key-word's term frequency)")

    # count the number of occurrences of each distinct token in each document
    df_data['doc2bow'] = df_data['all_key_words'].apply(
        lambda x: dictionary.doc2bow(x))

  if model_type == "LDA":
    """
    params={"TEXT_prepared_df": pd.DataFrame({}),
                             "save_LDA_dictionary_path": "./output/lda/dictionary.pickle",
                             "text_column": "text"
                             }
    """
    if len(params['TEXT_prepared_df']) > 0:
      # load data for LDA
      df_data = params['TEXT_prepared_df']
      print("loaded data shape:", df_data.shape)
    elif len(data_path) > 0:
      print("Preparing data for LDA...")
      df_data = pd.read_csv(data_path)
      df_data['list_of_lemmas'] = df_data[params['text_column']].apply(
          lambda text: get_list_of_lemmas(text))
      print("Data for LDA shape:", df_data.shape)
    else:
      return False

    # get all unique lemmas
    tmp_list = df_data['list_of_lemmas'].apply(set).apply(list).tolist()
    list_of_words = [w for sublist in tmp_list for w in sublist]

    # count words' document frequencies in the corpus
    w_freq_counter = collections.Counter(list_of_words)
    s_w_freq = pd.Series(w_freq_counter)
    if verbose == 2:
      print('\nTotal number of unique Lemmas: ', len(s_w_freq))
      print("\nDistribution of lemmas' document counts: ")
      print(pd.DataFrame(s_w_freq.describe(percentiles=[
            0.55, 0.65, 0.75, 0.85, 0.95, 0.97, 0.99])).T)

    # select upper and lower boundary for lemmas' count
    up_pct = s_w_freq.quantile(0.99)
    low_pct = 3  # s_w_freq.quantile(0.50)
    if verbose == 2:
      print("\nDeleting too frequent and too rare words...")
      print('Lemma count upper bound:', up_pct)
      print('Lemma count lower bound:', low_pct)

    # select Lemmas
    selected_words = set(s_w_freq[(s_w_freq > low_pct)
                                  & (s_w_freq <= up_pct)].index)
    if verbose == 2:
      print('\nList of words for topic modeling dictionary is reduced from',
            len(s_w_freq), 'to', len(selected_words))

    # select words in each article if they belong to chosen list of words
    df_data['selected_words'] = df_data['list_of_lemmas'].apply(lambda x:
                                                                [l for l in x if l in selected_words])
    # delete empty lists of words
    df_data = df_data[df_data['selected_words'].apply(len) > 0]

    # create a vocabulary for the LDA model
    dictionary = corpora.Dictionary(df_data['selected_words'])

    # save dictionary
    with open(params["save_LDA_dictionary_path"], 'wb') as f:
      # Pickle the LDA dictionary using the highest protocol available.
      pickle.dump(dictionary, f, pickle.HIGHEST_PROTOCOL)
    if verbose == 2:
      print("LDA dictionary file is saved to:",
            params["save_LDA_dictionary_path"])

      print('\nNumber of texts processed: ', dictionary.num_docs)
      print('Number of extracted lemmas: ', len(dictionary.token2id))
      print('\nEach text is represented by a list of', len(dictionary.token2id),
            " tuples: \n\t\t(lemma's index in bag-of-words dictionary, lemma's term frequency)")

    # count the number of occurrences of each distinct token in each document
    df_data['doc2bow'] = df_data['selected_words'].apply(
        lambda x: dictionary.doc2bow(x))

  return df_data
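A hedged usage sketch for prepare_for_modeling, assuming a prepared DataFrame with an all_key_words column already exists; the sample rows and save path are placeholders, not from the original.

df_prepared = pd.DataFrame({"all_key_words": [["topic", "model"],
                                              ["key", "word", "extraction"]]})
df_bow = prepare_for_modeling(
    data_path="",  # unused by the keyword variant
    model_type="LDA-KeyWords",
    params={"TEXT_prepared_df": df_prepared,
            "save_LDA_dictionary_path": "./lda_keywords_dictionary.pickle",  # any writable path
            "words_column": "all_key_words"},
    verbose=2)
print(df_bow[["all_key_words", "doc2bow"]])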
Example #12
def return_topic_figures(n_topics=5):
    """Creates plotly visualizations generated from topic model

    Args:
        n_topics = number of topics to generate from articles, default 5

    Returns:
        figures (list): list containing the plotly visualizations

    """

    ### import data ###

    data = return_keywords()
    data_for_topics = data["abstract_kw"].apply(
        lambda x: list(ast.literal_eval(x).keys()))

    ### Build topic model ###

    # parameters
    n_topics = n_topics

    # Create Dictionary
    id2word = corpora.Dictionary(data_for_topics)

    # Create Corpus: Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in data_for_topics]

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=n_topics,
        random_state=100,
        update_every=1,
        chunksize=10,
        passes=10,
        alpha="symmetric",
        iterations=100,
        per_word_topics=True,
    )

    topics = lda_model.show_topics(formatted=False)
    data_flat = [w for w_list in data_for_topics for w in w_list]
    counter = Counter(data_flat)
    out = []
    for i, topic in topics:
        for word, weight in topic:
            out.append([word, i, weight, counter[word]])
    df = pd.DataFrame(out,
                      columns=["word", "topic_id", "importance", "word_count"])

    specs = np.full((ceil(n_topics / 2), 2), {"secondary_y": True})
    topic_bar_charts = make_subplots(
        rows=ceil(n_topics / 2),
        cols=2,
        specs=specs.tolist(),
        horizontal_spacing=0.1,
        vertical_spacing=0.15,
    )
    row, col = (0, 0)
    for topic in range(n_topics):
        if (topic % 2) != 0:
            col = 2
        else:
            col = 1
            row += 1
        color = px.colors.qualitative.Vivid[topic]
        topic_bar_charts.add_trace(
            go.Bar(
                x=df.loc[df.topic_id == topic, "word"],
                y=df.loc[df.topic_id == topic, "word_count"],
                width=0.5,
                opacity=0.3,
                marker_color=color,
                name=("Topic " + str(topic) + " word count"),
            ),
            secondary_y=False,
            row=row,
            col=col,
        )
        topic_bar_charts.add_trace(
            go.Bar(
                x=df.loc[df.topic_id == topic, "word"],
                y=df.loc[df.topic_id == topic, "importance"],
                width=0.2,
                marker_color=color,
                name=("Topic " + str(topic) + " weight"),
            ),
            secondary_y=True,
            row=row,
            col=col,
        )
        topic_bar_charts.update_layout(barmode="overlay")

    topic_bar_charts.update_layout(height=800,
                                   width=1000,
                                   margin=dict(l=50, r=50, t=50, b=100))

    # append all charts
    figures = [dict(data=topic_bar_charts)]

    return figures
Example #13
def LDA_post(infile, outfile, topic = 14):
    docs = []
    # f = open(infile, 'r')
    # line = f.readline()
    # while line:
    #   docs.append(line.lower().split('\t')[1])
    #   line = f.readline()
    # f.close()

    with open(infile, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter = ',', quotechar = '"')
        header = next(spamreader)
        for row in spamreader:
            docs.append(row[1])

    texts = []
    widgets = [FormatLabel('Processed: %(value)d records (in: %(elapsed)s)')]
    pbar = ProgressBar(widgets = widgets)
    for doc in pbar((doc for doc in docs)):
        texts.append([word for word in wordProcBase.tokenize_tweet(doc) if word not in stopwords.words('english')])
        # doc = wordProcBase.tokenize5(doc.decode('utf-8'))
        # texts.append([word for word in doc if word not in stopwords.words('english')])
    pbar.finish()

    pprint.pprint(texts)
    return

    # create a Gensim dictionary form the texts
    dictionary = corpora.Dictionary(texts)

    # remove extrems
    dictionary.filter_extremes(no_below = 1, no_above = 0.85)

    # convert the dictionary to a bag of words corpus for reference
    corpus = [dictionary.doc2bow(text) for text in texts]

    print ('Applying LDA...')
    lda = models.LdaModel(corpus, num_topics = topic, id2word = dictionary, update_every = 1, chunksize = 10000, passes = 100, minimum_probability = 0.001)

    topics = lda.show_topics(num_topics = topic, num_words = 5)

    # pprint.pprint(lda.print_topics(num_topics = topic)) 

    # pprint.pprint(topics)

    print('Writing results into file...')
    # Write the results to a file
    with open(outfile, 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')

        top_prob = lda.get_document_topics(corpus) #a list of (topic_id, topic_probability) 2-tuples
        index = 1
        for prob in top_prob:
            string = [0 for i in range(topic)]
            prob = sorted(prob, key = operator.itemgetter(0), reverse = False)
            for i, p in prob:
                string[i] = p
            spamwriter.writerow(string)
            index += 1

    return

    '''
    # reading unseen data
    '''
    print ('Reading unseen data...')
    unseen = _MAIN_DIR_ + "/Data/VA_Proc/emtion_tweets/survey/google_survey_data.csv"
    docs = []
    with open(unseen, 'r') as csvfile:
        spamreader = csv.reader(csvfile, delimiter = ',', quotechar = '"')
        for row in spamreader:
            docs.append(row[1])
    texts = []
    for doc in docs:
        texts.append([word for word in wordProcBase.tokenize3(doc) if word not in stopwords.words('english')])

    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below = 1, no_above = 0.85)
    corpus = [dictionary.doc2bow(text) for text in texts]

    with open(outfile, 'w') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter = ',', quotechar = '"')

        top_prob = lda.get_document_topics(corpus)
        index = 1
        for prob in top_prob:
            string = [index]
            for i in range(0, len(prob)):
                string.append(prob[i][1])
            spamwriter.writerow(string)
            index += 1
def English(documents):
    # Log
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    # Reference text file processing
    texts = [[word for word in document.lower().split()]
             for document in documents[1:]]
    print(texts)

    # Statistically restricted word frequency
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts1 = [[token for token in text if frequency[token] > 1]
              for text in texts]
    print(texts1)

    # Build a corpus
    dictionary = corpora.Dictionary(texts1)
    print(dictionary.token2id)

    # Doc2bow the dictionary to get a new corpus
    corpus = [dictionary.doc2bow(text) for text in texts1]

    # Building a TF-IDF model
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    for doc in corpus_tfidf:
        print(doc)

    # Document frequencies (dfs) and inverse document frequencies (idfs)
    print(tfidf.dfs)
    print(tfidf.idfs)

    # Training the Lsi model
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=2)
    lsi.print_topics(2)

    # Map documents into two-dimensional topic space with Lsi model
    corpus_lsi = lsi[corpus_tfidf]
    for doc in corpus_lsi:
        print(doc)

    # Calculate sparse matrix similarity
    index = similarities.MatrixSimilarity(lsi[corpus])

    # Object text file processing
    query = documents[0]
    print(query)

    # doc2bow builds a bag of words model, turning the file into a sparse vector
    query_bow = dictionary.doc2bow(query.lower().split())
    print(query_bow)

    # Map documents into 2D topic space with Lsi model
    query_lsi = lsi[query_bow]
    print(query_lsi)

    # Calculate cosine similarity
    sims = index[query_lsi]
    sims = list(sims)
    return sims
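A small usage sketch for English: documents[0] is treated as the query and the remaining entries as the reference corpus, so the returned list holds one cosine similarity per reference document. The sample strings are made up, and the gensim/logging imports used inside the function are assumed to be in scope.

sample_docs = [
    "shipment of gold damaged in a fire",          # query document
    "delivery of silver arrived in a silver truck",
    "shipment of gold arrived in a truck",
]
print(English(sample_docs))  # one similarity score per reference document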
Example #15
    def buildTokenDict(self):
        '''
        assign an id to each word in self.segResponses
        '''
        self.tokenDictionary = corpora.Dictionary(self.segResponses)
        logging.info("Finished building the bag-of-words dictionary: " + str(self.tokenDictionary))
Example #16
args = parser.parse_args()

n_topics = args.n_topics
n_docs = 0
input_file = args.input
#input_file='/medargsia/iarroyof/Volumen de 384 GB/data/GUs_textform_noPeriods.txt'
#input_file='lsa_example.csv'
#input_file='wiki_sample/wiki_75_AA.txt.cln'
#input_file='wiki_sample/wiki_77_AA.txt'

# A little stopwords list
stoplist = set('for a of the and to in _ [ ]'.split())
# Do not load the text corpus into memory, but stream it!
file_stream = corpus_streamer(input_file, strings=True)
dictionary = corpora.Dictionary(line.lower().split() for line in file_stream)
# remove stop words and words that appear only once
stop_ids = [dictionary.token2id[stopword] for stopword in stoplist
            if stopword in dictionary.token2id]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs)
            if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids)
# remove gaps in id sequence after words that were removed
dictionary.compactify()
# Store the dictionary
dictionary.save('lsa_mini.dict')
# Reading sentences from file into a list of strings.
# Use instead streaming objects:
# Load stored word-id map (dictionary)
stream_it = corpus_streamer(input_file, dictionary=dictionary)
#for vector in stream_it:  # load one vector into memory at a time
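Example #16 stops right before the streamed corpus is consumed; this is a minimal continuation sketch, assuming the goal is an LSI model with the requested number of topics. The serialized corpus path and the gensim models import are assumptions.

from gensim import models

# Serialize the streamed bag-of-words corpus to disk, then train LSI on it
corpora.MmCorpus.serialize('lsa_mini.mm', stream_it)  # assumed output path
mm_corpus = corpora.MmCorpus('lsa_mini.mm')
lsi = models.LsiModel(mm_corpus, id2word=dictionary, num_topics=n_topics)
for topic in lsi.print_topics(num_topics=n_topics, num_words=10):
    print(topic)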
Example #17
category = df['category']

dates = df['dates']

heads = df['heads']

cats = df.category.unique()

unique_dates = df.dates.unique()

text_words = np.load('text_words.npy')
text_words_nostops = np.load('text_words_nostops.npy')
data_lemmatized = np.load('data_lemmatized.npy')

id2word = corpora.Dictionary(data_lemmatized)

print('0.5')

corpus = np.load('corpus.npy')

range_per_date = {}  #[[0,0] for i in range(len(unique_dates))]

for d in unique_dates:
    range_per_date[d] = [0, 0]

print(1)

for d in unique_dates:
    found = 0
    for i in range(len(dates)):
Example #18
## Remove stopwords
swords = open(r'C:\Users\Z\Desktop\NI\한국어불용어100.txt', encoding='UTF8').read()
stop_words = re.findall('[가-힣]+', swords)

tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split())  # tokenize
tokenized_doc = tokenized_doc.apply(
    lambda x: [item for item in x if item not in stop_words])
# Remove the stopwords.

#####################################
#LDA

tokenized_doc[:5]

from gensim import corpora
dictionary = corpora.Dictionary(tokenized_doc)
corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
print(corpus[1])  # print the second news article from the result; the first document's index is 0

import gensim
NUM_TOPICS = 5  # number of topics, k=5
ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                           num_topics=NUM_TOPICS,
                                           id2word=dictionary,
                                           passes=15)
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

for i, topic_list in enumerate(ldamodel[corpus]):
    if i == 5:
Example #19
# Handling Japanese text and choosing which words to keep is easier to do in KH Coder.
# So (a little clumsily) we build a gensim-friendly Dictionary model from the document-term matrix.
documents = []
words = df.columns.values[1:]  # exclude the leftmost ID column
for row in df.values:
    word_counts = row[1:]
    document_bow = []  # bag of words for each post
    for word_index, count in enumerate(word_counts):
        for i in range(count):
            document_bow.append(words[word_index])
    if len(document_bow) > 0:
        documents.append(document_bow)

# Build a dictionary that maps each word to an id
dictionary = corpora.Dictionary(documents)

# Turn documents into a corpus
corpus = list(map(dictionary.doc2bow, documents))

# Build the TF-IDF model.
test_model = models.TfidfModel(corpus)

# Apply the model to the corpus
corpus_tfidf = test_model[corpus]

start, stop, step = 2, 30, 1
plot_graph(documents, start, stop, step, dictionary, corpus_tfidf)

number_of_topics = 7
words = 10
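The listing cuts off after these two settings; presumably they feed a topic model trained on the TF-IDF corpus built above, along the lines of this sketch (the choice of LdaModel here is an assumption).

lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=number_of_topics)
for topic in lda.print_topics(num_topics=number_of_topics, num_words=words):
    print(topic)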
Example #20
    def __init__(self, corpus_dir):
        self.corpus_dir = corpus_dir
        # Given a list of tokens, return a gensim dictionary of unique tokens.
        # Only include tokens that appear in at least 5 documents and in less than 50% of the corpus.
        self.dictionary = corpora.Dictionary(iter_documents(corpus_dir))
        self.dictionary.filter_extremes(no_below=5, no_above=0.5)  # remove stopwords etc.
Example #21
        ]
        tokens_stemmed = [self.stemmer.stem(x) for x in tokens_stopwords]

        return tokens_stemmed


if __name__ == '__main__':

    input_file = 'data_topic_modeling.txt'
    data = load_data(input_file)

    preprocessor = Preprocessor()

    processed_tokens = [preprocessor.process(x) for x in data]

    dict_tokens = corpora.Dictionary(processed_tokens)
    corpus = [dict_tokens.doc2bow(text) for text in processed_tokens]

    num_topic = 2
    num_words = 4

    ldamodel = models.ldamodel.LdaModel(corpus,
                                        num_topics=num_topic,
                                        id2word=dict_tokens,
                                        passes=25)

    print('Most contributing words to the topics:')
    for item in ldamodel.print_topics(num_topics=num_topic,
                                      num_words=num_words):
        print('Topic', item[0], '==>', item[1])
    def get_docs_corpus(self):
        dictionary = corpora.Dictionary(self.docs_words)
        for doc_words in self.docs_words:
            yield dictionary.doc2bow(doc_words)
Example #23
    def process(self, process_file, cluster_ResFileName):
        try:
            # Step 1: get the titles and segment the words
            flag, lines = self.load_processfile(process_file)
            if not flag:
                logging.error("load error")
                return False, "load error"
            # The segmentation result has a different format from the other methods
            title_list, sen_seg_list = self.seg_words(lines)

            # # Convert the words in the texts into a term-frequency matrix;
            # # element a[i][j] is the frequency of word j in document i (an alternative approach)
            # vectorizer = CountVectorizer()
            # x = vectorizer.fit_transform(sen_seg_list)
            # weight = x.toarray()
            #
            # model = lda.LDA(n_topics=5, n_iter=100, random_state=1)
            # model.fit(np.asarray(weight))  # model.fit_transform(X) is also available
            # topic_word = model.topic_word_  # model.components_ also works
            # print(topic_word)
            # # Document-topic distribution
            # doc_topic = model.doc_topic_
            # print(doc_topic)
            # # numpy.savetxt('100.csv', doc_topic, delimiter=',')  # save the resulting document-topic distribution

            # Step 2: extract features with an LDA model
            # Build the dictionary
            dictionary = corpora.Dictionary(sen_seg_list)
            # Using the dictionary, convert each document's words into a sparse vector
            # and collect the vectors into a list, forming the sparse corpus
            corpus = [dictionary.doc2bow(words) for words in sen_seg_list]

            lda = models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=15)

            lda.save('zhwiki_lda.model')
            lda = models.ldamodel.LdaModel.load('zhwiki_lda.model')
            # Print all topics, showing the top words of each topic
            for topic in lda.print_topics(num_words=500):
                print(topic)

            # Topic matrix
            ldainfer = lda.inference(corpus)[0]

            # Topic inference
            print(lda.inference(corpus))
            np.savetxt('100tag.csv',
                       ldainfer,
                       delimiter=',',
                       fmt='%s')  # save the resulting document-topic distribution

            k = self.evaluate_km(ldainfer)
            # Step 3: KMeans; for large datasets use the Mini-Batch KMeans algorithm
            km = KMeans(n_clusters=k)
            km.fit(ldainfer)
            print(Counter(km.labels_))  # print how many samples fall in each cluster
            # print(km.cluster_centers_)   # cluster centers

            # Save the cluster that each sample belongs to
            clusterRes = codecs.open(cluster_ResFileName,
                                     'w',
                                     encoding='UTF-8')
            count = 1
            while count <= len(km.labels_):
                clusterRes.write(
                    str(title_list[count - 1]) + '\t' +
                    str(km.labels_[count - 1]))
                clusterRes.write('\r\n')
                count = count + 1
            clusterRes.close()

        except:
            logging.error(traceback.format_exc())
            return False, "process fail"
    def get_docs_LSI_model(self):
        LSI_model = models.LsiModel(corpus=self.get_docs_corpus(),
                                    id2word=corpora.Dictionary(self.docs_words),
                                    num_topics=2)
        return LSI_model
lemma = WordNetLemmatizer()
# Function to lemmatize and remove the stopwords
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = "".join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

# Creating a list of documents from the complaints column
list_of_docs = df["message"].tolist()

# Implementing the function for all the complaints of list_of_docs
doc_clean = [clean(doc).split() for doc in list_of_docs]

# Code starts here
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(text) for text in doc_clean]
lsimodel = LsiModel(corpus=doc_term_matrix, num_topics=5, id2word=dictionary)

pprint(lsimodel.print_topics())



# --------------
from gensim.models import LdaModel
from gensim.models import CoherenceModel

# doc_term_matrix - Word matrix created in the last task
# dictionary - Dictionary created in the last task

# Function to calculate coherence values
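The coherence helper itself is cut off in the listing; this is a minimal sketch of such a function using the imported CoherenceModel (the function name and parameters are assumptions, not the original code).

def compute_coherence_values(dictionary, doc_term_matrix, doc_clean,
                             start=2, stop=10, step=1):
    """Train an LdaModel for several topic counts and return their c_v coherence scores."""
    coherence_values = []
    for num_topics in range(start, stop, step):
        model = LdaModel(corpus=doc_term_matrix, num_topics=num_topics, id2word=dictionary)
        cm = CoherenceModel(model=model, texts=doc_clean,
                            dictionary=dictionary, coherence='c_v')
        coherence_values.append(cm.get_coherence())
    return coherence_values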
Example #26
def main():
    with open('corpus_data/preprocessedf_corpus.json') as corpus_file:
        corpus = json.loads(corpus_file.read())

    corpus_2 = defaultdict(str)
    for artist, songlist in corpus.items():
        for song in songlist:
            lyrics = song['lyrics'].strip('\\')
            corpus_2[artist] += lyrics
    with open('corpus_data/artist_features.json') as features_file:
        features = json.loads(features_file.read())

    finalcorpus = []

    for artist, lyrics in corpus_2.items():
        d = {}
        d['artist'] = artist
        d['lyrics'] = lyrics
        d['pos'] = features[artist]['pos_counts']
        finalcorpus.append(d)

    df = pd.DataFrame(finalcorpus)

    # nltk.download('wordnet')
    from nltk.corpus import wordnet as wn

    def get_lemma(word):
        lemma = wn.morphy(word)
        if lemma is None:
            return word
        else:
            return lemma

    """TOPIC MODELING HOPEFULLY"""
    import re
    from nltk import word_tokenize
    from nltk.corpus import stopwords
    STOPWORDS = stopwords.words('english')
    PROFANITY = set()
    with open('corpus_data/rapsw.txt') as infile:
        infile = infile.read()
        infile = infile.split()
        for el in infile:
            PROFANITY.add(el)

    def clean_text(text, ar):
        tokenized_text = word_tokenize(text.lower())
        tokenized_text = [token for token in tokenized_text if len(token) > 5]
        cleaned_text = [
            t for t in tokenized_text
            if re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)
        ]
        if ar == 'sw':
            cleaned_text = [t for t in cleaned_text if t not in STOPWORDS]
        if ar == 'lm':
            cleaned_text = [get_lemma(token) for token in cleaned_text]
        if ar == 'rw':
            cleaned_text = [
                token for token in cleaned_text if token not in PROFANITY
            ]
        return cleaned_text

    for index, row in df.iterrows():
        row['lyrics'] = clean_text(row['lyrics'], sys.argv[1])
    from gensim import models, corpora
    from gensim.corpora.dictionary import Dictionary
    from gensim.test.utils import common_corpus, common_texts, get_tmpfile

    all_lyrics = []
    all_artists = []
    for index, row in df.iterrows():
        all_lyrics.append(row['lyrics'])
        all_artists.append(row['artist'])

    #common_dictionary = Dictionary(common_texts)
    #common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    #lda_model = models.LdaModel(common_corpus, num_topics=10)
    dictionary = corpora.Dictionary(all_lyrics)
    corpus = [dictionary.doc2bow(text) for text in all_lyrics]

    NUM_TOPICS = 25
    lda_model = models.LdaModel(corpus=corpus,
                                num_topics=25,
                                id2word=dictionary,
                                passes=20)

    topics = lda_model.print_topics(num_words=4)
    print('LDA Topics')
    for topic in topics:
        print(topic)

    lsi_model = models.LsiModel(corpus=corpus,
                                num_topics=NUM_TOPICS,
                                id2word=dictionary)
    topics = lsi_model.print_topics(num_words=4)
    print('LSI TOPICS')
    for topic in topics:
        print(topic)

    from gensim import similarities

    text = ""
    with open(sys.argv[2]) as inf:
        inf = inf.read()
        text = inf

    bow = dictionary.doc2bow(clean_text(text, sys.argv[1]))
    lda_index = similarities.MatrixSimilarity(lda_model[corpus])
    lsi_index = similarities.MatrixSimilarity(lsi_model[corpus])
    # Let's perform some queries
    similarities = lda_index[lda_model[bow]]
    # Sort the similarities
    similarities = sorted(enumerate(similarities), key=lambda item: -item[1])

    similaritiesLSI = lsi_index[lsi_model[bow]]

    similaritiesLSI = sorted(enumerate(similaritiesLSI),
                             key=lambda item: -item[1])

    # Top most similar documents:
    #print(similarities[:10])
    # [(104, 0.87591344), (178, 0.86124849), (31, 0.8604598), (77, 0.84932965), (85, 0.84843522), (135, 0.84421808), (215, 0.84184396), (353, 0.84038532), (254, 0.83498049), (13, 0.82832891)]

    # Let's see what's the most similar document
    document_id, similarity = similarities[0]
    document_id2, similarityLSI = similaritiesLSI[0]

    # print(all_lyrics[document_id][:1000])
    print("LDA : TOP 5 Similar ARTISTS")
    for el in similarities[:5]:
        print(all_artists[el[0]])

    print('')
    print('LSI : Top 5 Similar Artists')
    for el in similaritiesLSI[:5]:
        print(all_artists[el[0]])
# Count word frequencies
frequency = defaultdict(
    int)  # Create defaultdict class with each word and their frequency
for text in texts:  # Iterate through each text document from our texts corpus
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1]
                    for text in texts]
pprint.pprint(processed_corpus)

# Saved processed corpus into our corpora.Dictionary object this is our most important object
# it contains the tokens as well as their frequencies
dictionary = corpora.Dictionary(processed_corpus)
print(dictionary)

# The token2id attribute returns dictionary of our tokens and their ids
pprint.pprint(dictionary.token2id)

# The dfs attribute returns how many documents each token appears in
dictionary.dfs

# Save output dictionary into text file for later use
dictionary.save_as_text("dict_text.txt")

## Comparison of new document with corpus

# We can convert our entire original corpus to a list of vectors:
bow_corpus = [dictionary.doc2bow(text) for text in processed_corpus]
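With bow_corpus in hand, comparing a new document against the corpus comes down to converting it with the same dictionary and scoring it, e.g. via a TF-IDF similarity index (a short sketch; the query string is made up).

from gensim import models, similarities

tfidf = models.TfidfModel(bow_corpus)
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary))
new_doc = "human computer interaction"  # example query, not from the original corpus
new_bow = dictionary.doc2bow(new_doc.lower().split())
print(list(enumerate(index[tfidf[new_bow]])))  # similarity of the query to each document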
Example #28
def LDA(trainSet, testSet, topics=10, times=200):
    # topics: # of topics in the result
    # times: # of passes during training
    #tokenizer = RegexpTokenizer(r'\w+')

    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    #p_stemmer = PorterStemmer()

    # create sample documents
    #r1 = "1, 2, 3"
    #r2 = "2, 3, 4"
    #r3 = "1, 3, 5"
    #r4 = "2, 4, 5"
    #r5 = "1, 5, 6"

    # compile sample documents into a list
    #r_set = [r1, r2, r3, r4, r5]

    #print r_set

    # list for tokenized documents in loop
    texts = []
    freq = []

    #gradient.csv
    with open('gradient.csv') as myfile:
        csv_reader = csv.reader(myfile)
        for row in csv_reader:
            x = ",".join(row)
            dishList = x.split(',')
            dishList = stem_words(dishList)
            newList = []
            for dish in dishList:
                dish = re.sub('[^A-Za-z]', '', str(dish))
                #print dish
                #tokens = tokenizer.tokenize(raw)
                # remove stop words from tokens
                #stopped_tokens = [i for i in raw if not i in en_stop]
                # stem tokens
                #stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
                # add tokens to list
                if not dish in en_stop:
                    dish = dish.lower()
                    newList.append(dish)
                    freq.append(dish)
            #print newList
            texts.append(newList)

    print "Filter High Freq Words"
    top = Counter(freq).most_common(1000)
    #print top
    topList = []
    for i in top:
        topList.append(i[0])
    topList.remove('allrecip')
    topList.remove('recip')
    topList.remove('martha')
    #topList.remove('re')
    topList.remove('stewart')
    topList.remove('myrecip')
    topList.remove('recipe')
    topList.remove('recipes')
    topList.remove('street')
    topList.remove('epicuri')
    topList.remove('edamam')

    final = []
    for i in texts:
        partFinal = []
        for j in i:
            if j in topList:
                partFinal.append(j)
        final.append(partFinal)

    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(final)

    #print dictionary
    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in final]
    #print corpus
    print "Train Model"
    # generate LDA model
    flag = False
    while not flag:
        ldamodel = gensim.models.ldamodel.LdaModel(corpus,
                                                   num_topics=topics,
                                                   id2word=dictionary,
                                                   passes=times)
        ldamodel.save('lda.model')
        flag = True
        japanFlag = False
        for entry in ldamodel.print_topics():
            print(entry)
            dishes = str(entry[1]).split("+")
            weight = []
            name = []
            for dish in dishes:
                dish = str(dish).split("*")
                #print dish[0]
                weight.append(float(dish[0]))
                name.append(re.sub('[^A-Za-z]', '', str(dish[1])))
            if max(weight) <= 0.002:
                flag = False
                break
            if "thai" in name and "chines" in name:
                flag = False
                break
            if "thai" in name and "indian" in name:
                flag = False
                break
            if "japanes" in name:
                japanFlag = True
        if not japanFlag:
            flag = False
    #print ldamodel.print_topics()
    '''
def summarizer(name,file):
    filename = (file[:-4] if file.endswith('.pdf') else file) + '.txt'  # rstrip('.pdf') would strip characters, not the suffix
    f=open(filename,'r')
    abstract=0
    text = ""
    stemmer = PorterStemmer()

    # for abstract extraction
    for sentence in f.readlines():
        sentence=sentence.rstrip('\n')
        if abstract and len(sentence) >0:
            text+=sentence+"\n"
        elif stemmer.stem(sentence.lower()) == stemmer.stem(name) and abstract==0:
            abstract=1
        if len(sentence)>0 and sentence == sentence.upper() and abstract==1 and \
            stemmer.stem(sentence.lower())!=stemmer.stem(name):
            break
    content = sent_tokenize(text)
    content.pop()

    scores = []
    avg_score = 0.0
    for sent in content:
        score=0.0
        for word in sent.split():
            if word.lower() in tf_values:
                score+=tf_values[word.lower()]
        scores.append(score)
        #print("for",sent,"score :",score)
        avg_score+=score
    #print(tf_values["Citation".lower()])
    try:
        avg_score/=len(content)
    except:
        print("file can't used ")
        return
    #print(avg_score)
    summary=""
    for i,j in enumerate(scores):
        if j>=avg_score:
            summary+=content[i]
    final_summary="By TF-IDF\n"
    final_summary+=name.upper()+" "+"Summary: \n"+" "+summary+"\n"
    final_summary+="\tOriginal "+name+" : "+str(len(text))+" chars\n"
    final_summary+="\tReduced Size : "+str(len(summary))+" chars\n"
    final_summary+="\tcompression ratio : "+str((len(summary))/(len(text)))+"\n\n"
    output.write(final_summary)

    # code for LDA 

    data = []
    lda_stem = PorterStemmer()
    for sent in content:
        words = []
        for word in word_tokenize(sent):
            if word not in get_stop_words('en') and word not in stopwords.words("english") and word != '.':
                words.append(lda_stem.stem(word))
        data.append(words)
    dictionary = corpora.Dictionary(data)
    corpus = [dictionary.doc2bow(sent) for sent in data]
    
    # LDA model intialization 

    topic_used = 3 # no of topic to summarize

    ldamodel = models.ldamodel.LdaModel(corpus, num_topics=topic_used, id2word = dictionary, passes=100)
    
    prob_sum_for_each_topic = [0.0 for i in range(topic_used)]
    data_desc = ldamodel.get_document_topics(corpus)
    print(data_desc)
    for data in data_desc:
        print(data)
        for topic_data in data:
            prob_sum_for_each_topic[topic_data[0]] += topic_data[1]
            print(topic_data[1])
    
    important_topic = prob_sum_for_each_topic.index(max(prob_sum_for_each_topic))
    threshold = prob_sum_for_each_topic[important_topic]/len(data_desc)

    lda_summary=""

    for index,word_data in enumerate(data_desc):
        if word_data[important_topic][1] > threshold:
            lda_summary+=content[index]

    """for index,sent in enumerate(content):
        print(data_desc[important_topic][index][1])
        if data_desc[important_topic][index][1] > threshold:
            lda_summary+= content[index]"""

    final_summary_lda="By LDA\n"
    final_summary_lda+=name.upper()+" "+"Summary: \n"+" "+lda_summary+"\n"
    final_summary_lda+="\tOriginal "+name+" : "+str(len(text))+" chars\n"
    final_summary_lda+="\tReduced Size : "+str(len(lda_summary))+" chars\n"
    final_summary_lda+="\tcompression ratio : "+str((len(lda_summary))/(len(text)))+"\n\n"
    output.write(final_summary_lda)

    f.close()
Example #30

import os

from gensim import corpora, models, similarities
from nltk.tokenize import sent_tokenize, word_tokenize

file_docs = []
filenames = []
document_for_test = "the_man_in_the_brown_suit"
for file in os.listdir("data"):
    if file.endswith(".txt"):
        if file in ["stopwords.txt", f"{document_for_test}.txt"]:
            # because we want to test which book is sense_and_sensibility similar to
            continue
        print(file)
        filenames.append(file)
        file_docs.append(open("data/" + file, "r", encoding="UTF-8").read())
# print(file_docs)
gen_words_list = [[w.lower() for w in word_tokenize(text)] for text in file_docs]
dictionary = corpora.Dictionary(gen_words_list)
corpus = [dictionary.doc2bow(words) for words in gen_words_list]
tf_idf = models.TfidfModel(corpus)
sims = similarities.Similarity("data", tf_idf[corpus], num_features=len(dictionary))
file2_sentence_list = []
with open(f"data/{document_for_test}.txt", "r", encoding="UTF-8") as f:
    tokens = sent_tokenize(f.read())
    for line in tokens:
        file2_sentence_list.append(line)
# Build one query from the whole held-out document (a per-sentence loop would
# keep only the final sentence's bag of words)
query_doc = [w.lower() for w in word_tokenize(" ".join(file2_sentence_list))]
query_doc_bow = dictionary.doc2bow(query_doc)
query_doc_tf_idf = tf_idf[query_doc_bow]
# print("Comparing Result:", sims[query_doc_tf_idf])
similarity_list = list(sims[query_doc_tf_idf])
most_similar_index = similarity_list.index(max(similarity_list))
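A one-line follow-up maps the winning index back to its file name, which makes the result readable.

print(f"Most similar to {document_for_test}: {filenames[most_similar_index]} "
      f"(score {similarity_list[most_similar_index]:.3f})")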