Example #1
File: LDA_2.py  Project: ZUCCBBQ/Ocean
 def select_lda(self, journal, year, num_topics):
     self.build_corpus(journal, year)
     # output
     lda = LdaModel(corpus=self.corpus_a,
                    id2word=self.dictionary,
                    num_topics=num_topics,
                    passes=2,
                    update_every=0,
                    alpha='auto',
                    iterations=500)
     output_path = self.abspath + '/data/lda_topic/' + journal + '/'
     output_filename = year + '.txt'
     with open(output_path + output_filename,
               'w',
               newline='',
               encoding='UTF-8') as f:
         for i in range(0, num_topics):
             input_str = lda.show_topic(i, topn=30)[0][0] + ':' + str(
                 lda.show_topic(i, topn=30)[0][1])
             for j in range(1, len(lda.show_topic(i, topn=30))):
                 word = lda.show_topic(i, topn=30)[j][0] + ':' + str(
                     lda.show_topic(i, topn=30)[j][1])
                 input_str = input_str + ',' + word
             f.write(input_str + '\n')
     self.select_over_msg.emit(journal, year, num_topics)
Example #2
def Lda_topic_model(docs, dictionary, nb_topics, true_labels):
    lda = LdaModel(docs, num_topics=nb_topics, id2word=dictionary, passes=10)

    top_words = [[word[::-1] for word, _ in lda.show_topic(topic_id, topn=50)]
                 for topic_id in range(lda.num_topics)]
    top_betas = [[beta for _, beta in lda.show_topic(topic_id, topn=50)]
                 for topic_id in range(lda.num_topics)]
    nb_words = 12
    f, ax = plt.subplots(3, 2, figsize=(20, 15))
    for i in range(nb_topics):
        # ax = plt.subplot(gs[i])
        m, n = np.unravel_index(i, shape=(3, 2))
        ax[m, n].barh(range(nb_words),
                      top_betas[i][:nb_words],
                      align='center',
                      color='green',
                      ecolor='black')
        ax[m, n].invert_yaxis()
        ax[m, n].set_yticks(range(nb_words))
        ax[m, n].set_yticklabels(top_words[i][:nb_words])
        ax[m, n].set_title("Topic " + str(i))
    plt.show()
    # get distribution of docs on topics.
    dist_on_topics = lda.get_document_topics(docs)
    topic_predict = []
    for d in dist_on_topics:
        p = 0
        win_topic = 0
        print(d)
        for i, t in enumerate(d):
            if t[1] > p:
                p = t[1]
                win_topic = t[0]
        print(win_topic)
        topic_predict.append(win_topic)
    mat = confusion_matrix(true_labels, topic_predict)
    print(mat)
    cluster_to_class = {}
    for i in range(nb_topics):
        cluster_to_class[i] = np.argmax(mat[:, i])
    custom_labels = [cluster_to_class[c] for c in topic_predict]
    print("accuracy:", accuracy_score(true_labels, custom_labels))
    print("f1_score micro: ",
          f1_score(true_labels, custom_labels, average='micro'))
    print("f1_score: macro",
          f1_score(true_labels, custom_labels, average='macro'))
    print("NMI", NMI(true_labels, custom_labels))
Example #3
File: vsm.py  Project: dmuiruri/nlp
def exercise4(filename):
    """
    Topic Modelling
    """
    articles = []
    stopWords = set(stopwords.words('english'))

    stopWords = stopWords | {
        '</H1>', 'The', 'In', 'For', 'was', 'be', 'will', '<H1>'
    }
    text = open(filename, 'r').read().split()
    index_start = list(np.where(np.array(text) == "<DOC")[0])
    for i in range(len(index_start) - 1):
        start_art = index_start[i] + 2
        end_art = index_start[i + 1]
        article = text[start_art:end_art]
        article = [word for word in article if word not in stopWords]
        articles.append(article)
    common_dictionary = corpora.Dictionary(articles)
    common_corpus = [common_dictionary.doc2bow(a)
                     for a in articles]  # each doc to BOW
    n_topics = 2
    lda = LdaModel(common_corpus,
                   id2word=common_dictionary,
                   num_topics=n_topics,
                   passes=200)
    for k in range(n_topics):
        top_words = lda.show_topic(k, topn=5)
        print("Top words in topic {}: {}\n".format(k + 1, top_words))
Example #4
class CorpusLdaModelWrapper:
    def __init__(self, corpus, dictionary, doc_labels, preprocessing_pipeline, numtopics):
        self.corpus = corpus
        self.dictionary = dictionary
        self.doc_labels = doc_labels
        self.pipeline = preprocessing_pipeline
        self.numtopics = numtopics
        self.trained = False

    def train(self):
        # training
        self.model = LdaModel(self.corpus, id2word=self.dictionary, num_topics=self.numtopics)
        self.index = MatrixSimilarity(self.model[self.corpus])

        # flag
        self.trained = True

    def convertTextToReducedVector(self, text):
        if not self.trained:
            raise exceptions.ModelNotTrainedException()
        tokens = word_tokenize(prep.preprocess_text(text, self.pipeline))
        tokens = filter(lambda token: token in self.dictionary.token2id, tokens)
        bow = self.dictionary.doc2bow(tokens)
        return self.model[bow]

    def queryDoc(self, text):
        reducedVec = self.convertTextToReducedVector(text)
        sims = self.index[reducedVec]
        simtuples = zip(range(len(sims)), sims) if self.doc_labels is None else zip(self.doc_labels, sims)
        simtuples = sorted(simtuples, key=lambda item: item[1], reverse=True)
        return simtuples

    def show_topic(self, id):
        return self.model.show_topic(id)
Example #5
    def get(self, s, e):
        # Load our data without any preprocessing
        dataObject = getDatas()
        data = dataObject.get()
        dataEpisode = dataObject.getDataEpisode(data, s, e)

        # Preprocess the words of the episode
        tokenEpisode = []
        tokenEpisode.append(
            [token for token in self.preprocessEpisode(dataEpisode)])
        dictionnaryEpisode = Dictionary(tokenEpisode)

        # creating our model corpus
        model_corpus = []
        for episode in tokenEpisode:
            model_corpus.append(dictionnaryEpisode.doc2bow(episode))

        # Create our list of topics with the LDA model
        topicsList = []
        string = "Voici les sujets recurrents pour l'episode " + e + " de la saison " + s
        topicsList.append(string)
        lda_model = LdaModel(
            corpus=model_corpus, id2word=dictionnaryEpisode, num_topics=3
        )  # We choose to get only the 3 most significant topics
        for topic_id, topic_keywords in lda_model.show_topics(formatted=False):
            string = "=== Pour le sujet au mot cle principal '" + str(
                lda_model.show_topic(topic_id, topn=1)[0]
                [0]) + "', les mots clefs representatifs sont ==="
            topicsList.append(string)
            # Browse the keywords of each topic
            for keyword in topic_keywords:
                string = "-> " + str(keyword[0]) + " (" + str(keyword[1]) + ")"
                topicsList.append(string)
        # Return our list of topics
        return topicsList
Example #6
 def __theme_re_weight(self, tokens):
     dictionary = Dictionary(tokens)
     corpus = [dictionary.doc2bow(text) for text in tokens]
     lda = LdaModel(corpus=corpus,
                    id2word=dictionary,
                    num_topics=2,
                    passes=20)
     topic = []
     topic.append(lda.show_topic(topicid=0, topn=8))
     topic.append(lda.show_topic(topicid=1, topn=8))
     return topic
Example #7
def format_term_search_results(model: LdaModel, search_results: dict):
    temp_list = []

    for key, value in search_results.items():
        sorted_value = sorted(value, key=lambda x: x[1], reverse=True)
        for i in sorted_value:
            topic_id, topic_prob = i
            wp = model.show_topic(topic_id)
            topic_keywords = ", ".join([word for word, prop in wp])
            temp_list.append([key, topic_id, topic_prob, topic_keywords])

    return pd.DataFrame(
        temp_list,
        columns=['Search_Term', 'Topic_ID', 'Topic_Prob', 'Topic_Keywords'])
Example #8
    def train_lda(self, cache_path):
        print(cache_path)
        trainBatchIter = BatchIterBert(self.trainDataIter,
                                       filling_last_batch=False,
                                       postProcessor=batchPostProcessor,
                                       batch_size=1)
        bow_list = []
        for item in trainBatchIter:
            bow = item[1].squeeze().detach().numpy().tolist()
            bow_list.append(self.bow_2_gensim(bow))
        print(len(bow_list))
        #print(self.dictProcess.common_dictionary.id2token)
        lda = LdaModel(np.array(bow_list),
                       num_topics=50,
                       passes=200,
                       chunksize=len(bow_list),
                       id2word=self.dictProcess.common_dictionary)
        #print(lda.show_topic(1, topn=10))
        output_topic_line = ''
        for topic_id in range(50):
            current_topic_list = []
            current_topic = lda.show_topic(topic_id, topn=10)
            for topic_tuple in current_topic:
                current_topic_list.append(topic_tuple[0])
            output_topic_line += ' '.join(current_topic_list) + '\n'
            #print(current_topic_list)

        topic_file = os.path.join(cache_path, 'ldatopic.txt')
        with open(topic_file, 'w') as fo:
            fo.write(output_topic_line)

        testBatchIter = BatchIterBert(self.testDataIter,
                                      filling_last_batch=False,
                                      postProcessor=batchPostProcessor,
                                      batch_size=1)

        test_bow_list = []
        word_count = 0
        for item in testBatchIter:
            bow = item[1].squeeze().detach().numpy().tolist()
            word_count += sum(bow)
            test_bow_list.append(self.bow_2_gensim(bow))

        print(word_count)
        ppl = lda.log_perplexity(test_bow_list, len(test_bow_list))
        print(ppl)
        bound = lda.bound(test_bow_list)
        print(bound / word_count)
        print(np.exp2(-bound / word_count))
Example #9
def topicModeling(corpus, dictionary, texts):

    ldamodel = LdaModel(corpus=corpus,
                        num_topics=3,
                        id2word=dictionary,
                        passes=5)

    x = ldamodel.show_topics()  #show generated topics

    #----------------------------------------------------------
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series(
                    [int(topic_num),
                     round(prop_topic, 4), topic_keywords]),
                                                       ignore_index=True)
            else:
                break
    sent_topics_df.columns = [
        'Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'
    ]

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)

    #-------Generate Visualization------------------------------

    pyLDAvis.enable_notebook()

    topicModel = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)

    pyLDAvis.save_html(
        topicModel,
        '/Users/[email protected]/Documents/projects/PEM/elon.html')

    pyLDAvis.show(topicModel)

    return x, sent_topics_df
Example #10
def ldavis(importfile, num_topic, outputfile):
    review_df = pd.read_csv(importfile, encoding='cp949')
    review_df['review_txt'] = review_df['review_txt'].str.replace("\n", "")
    review_df['review_txt'] = review_df['review_txt'].str.replace(r'[0-9]', "")
    review_df['review_txt'] = review_df['review_txt'].str.replace(r'(\.)', "")
    review_df['review_txt'] = review_df['review_txt'].str.replace(
        r"[ㄱ-ㅎㅏ-ㅣ]+", "")
    review_df['review_txt'] = review_df['review_txt'].str.replace(
        r"[-=.#/★^&*)~?(:$}]", "")
    review_df['review_txt'] = review_df['review_txt'].str.replace(r"[잼]", "재미")
    review_df['review_txt'] = review_df['review_txt'].str.replace("겜", "게임")
    review_df['review_txt'] = review_df['review_txt'].str.replace("게임", "")
    review_df['review_txt'] = review_df['review_txt'].str.replace("너무", "")
    review_df['review_txt'] = review_df['review_txt'].str.replace("진짜", "")
    review_df['review_txt'] = review_df['review_txt'].str.replace("정말", "")
    review_df['review_txt'] = review_df['review_txt'].str.replace(
        r"[" + str(importfile[:3]) + "]+", "")

    # For each review, keep only nouns longer than one character; result is a list of lists of strings
    okt = Okt()
    texts = []
    for i in range(review_df.shape[0]):
        review_noun = [
            noun_ for noun_ in okt.nouns(review_df.iloc[i, 1])
            if len(noun_) > 1
        ]
        texts.append(review_noun)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    NUM_TOPICS = int(num_topic)  # This is an assumption.

    ldamodel = LdaModel(corpus,
                        num_topics=NUM_TOPICS,
                        id2word=dictionary,
                        passes=20)  # This might take some time.
    word_dict = {}

    for i in range(NUM_TOPICS):
        words = ldamodel.show_topic(i, topn=20)
        word_dict['Topic # ' + '{:02d}'.format(i + 1)] = [i[0] for i in words]
    topic_df = pd.DataFrame(word_dict)
    topic_df.to_csv(outputfile + ".csv", index=False)

    prepared_data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    print("LDA topic modeling ...")
    pyLDAvis.save_html(prepared_data, outputfile + ".html")
Example #11
 def topicsLDA(self, num_topics=10, num_iterations=10000, num_words=10):
     # LdaModel(corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001)
     try:
         lda = LdaModel(corpus=self.corpus, num_topics=num_topics, id2word=self.id2word, iterations=num_iterations)
         result = {}
         tpd = lda[self.corpus] # topic probability distribution
         for topics in tpd:
             for elem in topics:
                 if result.get(elem[0], -1) == -1:
                     words = lda.show_topic(elem[0], topn=num_words)
                     result[elem[0]] = {'weight': elem[1], 'words': words}
                 else:
                     result[elem[0]]['weight'] += elem[1]
         return result
     except Exception as e:
         print(e)
         return None
Example #12
File: topic.py  Project: dickrd/cla_tool
class TopicModel(object):

    def __init__(self, documents, cut=True, num_topics=10, min_length=1):
        from cla.util.util import CutDocument
        from gensim.corpora import Dictionary
        from gensim.models import LdaModel

        self.document = CutDocument(documents, cut, cleanup=True, min_length=min_length)
        self.dictionary = Dictionary(self.document)
        self.model = LdaModel(BowCorpus(self.document, self.dictionary),
                              id2word=self.dictionary,
                              num_topics=num_topics)

    def topic_words(self, topic_id, limit=10):
        return self.model.show_topic(topicid=topic_id, topn=limit)

    def identify_topic(self, words):
        return self.model.get_document_topics(self.dictionary.doc2bow(words))
Example #13
def topic_modelling(files=['114.txt', '100.txt', '465.txt', '059.txt']):
    """
    perform topic modelling for a given list of files
    """
    ntopics = 2
    articles = []
    stop_words = set(stopwords.words('english')) | {'Mr', 'The', '-', 'said'}
    for f in files:
        fp = path.join(data_dir, f)
        with open(fp) as f:
            text = f.read().split()  # word_tokenize(
        articles.append([word for word in text if word not in stop_words])

    dictionary = corpora.Dictionary(articles)
    corpus = [dictionary.doc2bow(a) for a in articles]  # doc to BOW
    lda = LdaModel(corpus, id2word=dictionary, num_topics=ntopics, passes=500)
    for i in range(ntopics):
        topwords = lda.show_topic(i, topn=5)
        print("Top words in topic {}: {}\n".format(i + 1, topwords))
Example #14
 def topicsLDA(self, num_topics=10, num_iterations=10000, num_words=10):
     # LdaModel(corpus=None, num_topics=100, id2word=None, distributed=False, chunksize=2000, passes=1, update_every=1, alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50, gamma_threshold=0.001)
     try:
         lda = LdaModel(corpus=self.corpus,
                        num_topics=num_topics,
                        id2word=self.id2word,
                        iterations=num_iterations)
         result = {}
         tpd = lda[self.corpus]  # topic probability distribution
         for topics in tpd:
             for elem in topics:
                 if result.get(elem[0], -1) == -1:
                     words = lda.show_topic(elem[0], topn=num_words)
                     result[elem[0]] = {'weight': elem[1], 'words': words}
                 else:
                     result[elem[0]]['weight'] += elem[1]
         return result
     except Exception as e:
         print(e)
         return None
Example #15
def predict_and_format_topics(ldamodel: LdaModel,
                              corpus,
                              texts,
                              doc_id: list = None,
                              n_topics=5):
    """Predict top n topics of corpus and format results in a pandas DataFrame
    DataFrame has the following columns:
    'Document_No' and 'Topic_Id', 'Topic_Prob' and 'Topic Keywords' for each of the n topics

    TODO: Refactor code to optimize prediction speed
    """
    df = pd.DataFrame()

    # Get main topic in each document
    for row in ldamodel[corpus]:
        row = sorted(row, key=lambda x: (x[1]), reverse=True)

        # Get the top n topic and topic probability for each document
        temp_list = []
        for topic_num, prob_topic in row[:n_topics]:
            wp = ldamodel.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp])
            temp_list = temp_list + \
                [int(topic_num), round(prob_topic, 4), topic_keywords]
        df = df.append(pd.Series(temp_list), ignore_index=True)

    # Add original text to the end of the output
    # Code commented out for backward compatibility.
    # Uncomment below line to enable concat of original text
    # df = pd.concat([df, pd.Series(texts)], axis=1)
    if doc_id:
        df.insert(0, 'Document_No', doc_id)
    else:
        df.reset_index(inplace=True)

    df.columns = ['Document_No'] + np.array(
        [(f'Dominant_Topic_{i+1}', f'Topic_Prob_{i+1}', 'Topic Keywords')
         for i in range(n_topics)]).flatten().tolist()

    return df
Example #16
    for seg in seg_list:
        seg = ''.join(seg.split())
        if len(seg) > 1 and seg not in skiplist and seg not in stopwords:
            result.append(seg)
    train.append(result)

print('Starting gensim module')
dictionary = Dictionary(train)
corpus = [dictionary.doc2bow(text) for text in train]
tfidf = TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lda_model = LdaModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=50)
corpus_lda = lda_model[corpus_tfidf]
topic_list = []
for i in range(lda_model.num_topics):
    topic_list.append(lda_model.show_topic(i))
word_index = {}
for i in range(len(pagelist)):
    theme, dist = sorted(corpus_lda[i], key=lambda x:x[1], reverse=True)[0]
    #print(lda_model.print_topic(theme))
    weight = pages[pagelist[i]]*1e5
    for topic_word, likelihood in topic_list[theme]:
        if topic_word in word_stats.keys():
            word_stats[topic_word] += likelihood * weight
            word_index[topic_word].append((dist * likelihood, pagelist[i]))
        else:
            word_stats[topic_word] = likelihood * weight
            word_index[topic_word] = [(dist * likelihood, pagelist[i])]

cloud = WordCloud(
    font_path = 'simhei.ttf',
Example #17
File: demo.py  Project: pielstroem/Topics
def upload_file():
    """
    Upload csv files and create:
        * ~/out/corpus.dict
        * ~/out/corpus.lda
        * ~/out/corpus.lda.state
        * ~/out/corpus.mm
        * ~/out/corpus.mm.index
        * ~/out/corpus_doclabels.txt
        * ~/out/corpus_topics.txt
        * ~/mycorpus.txt

    As well as (for example):
        * ~/swcorp/Doyle_AStudyinScarlet.txt
        * ~/swcorp/Lovecraft_AttheMountainofMadness.txt
        * etc.
    """

    # INPUT
    # columns to read from csv file
    columns = ['ParagraphId', 'TokenId', 'Lemma', 'CPOS', 'NamedEntity']

    # parts-of-speech to include into the model
    pos_tags = ['ADJ', 'NN', 'V']

    # stopwords
    regex = re.compile(r'\w+')
    stopwords = request.files['stoplist']
    stopwords = str(stopwords.readlines())
    stopwords = regex.findall(stopwords)
    stopwords.extend(("'", "'d", "'s")) # temporary solution
    print(stopwords)

    # document size (in words)
    doc_size = 1000

    # uses the pipeline's ParagraphId to split text into documents,
    # overrides doc_size - 1: on, 0: off
    doc_split = 0

    # no. of topics to be generated
    no_of_topics = 30

    # no. of lda iterations - usually, the more the better, but
    # increases computing time
    no_of_passes = 1

    # perplexity estimation every n chunks -
    # the smaller the better, but increases computing time
    eval = 1

    # documents to process at once
    chunk = 100

    # "symmetric", "asymmetric", "auto", or array
    # (default: a symmetric 1.0/num_topics prior) affects sparsity of
    # the document-topic (theta) distribution
    alpha = "symmetric"

    # custom alpha may increase topic coherence, but may also produce
    # more topics with zero probability alpha = np.array([ 0.02, 0.02,
    # 0.02, 0.03, 0.03, 0.03, 0.04, 0.04, 0.04, 0.05, 0.05, 0.04, 0.04,
    # 0.04, 0.03, 0.03, 0.03, 0.02, 0.02, 0.02])

    # can be a number (int/float), an array, or None
    # affects topic-word (lambda) distribution - not necessarily
    # beneficial to topic coherence
    eta = None

    # PREPROCESSING
    files = request.files.getlist('files')
    docs = []
    doc_labels = []

    print("\n reading files ...\n")

    for file in files:
        file_label = secure_filename(file.filename).split('.')[0]

        df = pd.read_csv(file, sep="\t", quoting=csv.QUOTE_NONE)
        df = df[columns]
        df = df.groupby('CPOS')

        doc = pd.DataFrame()
        for p in pos_tags:  # collect only the specified parts-of-speech
            doc = doc.append(df.get_group(p))
            # construct documents
            if doc_split:  # size according to paragraph id
                doc = doc.groupby('ParagraphId')
                for para_id, para in doc:
                    docs.append(para['Lemma'].values.astype(str))
                    doc_labels.append(
                        ''.join([file_label, " #", str(para_id)]))
            else:  # size according to doc_size
                doc = doc.sort_values(by='TokenId')
                i = 1
                while(doc_size < doc.shape[0]):
                    docs.append(
                        doc[:doc_size]['Lemma'].values.astype(str))
                    doc_labels.append(
                        ''.join([file_label, " #", str(i)]))
                    doc = doc.drop(doc.index[:doc_size])
                    i += 1
                docs.append(doc['Lemma'].values.astype(str))
                doc_labels.append(''.join([file_label, " #", str(i)]))

            if not os.path.exists(os.path.join(os.getcwd(), "swcorp")):
                os.makedirs(os.path.join(os.getcwd(), "swcorp"))

            swpath = os.path.join('swcorp', "".join(file_label))

            with open(swpath + ".txt", 'w', encoding="utf-8") as text:
                text.write(" ".join(
                    word for word in doc['Lemma'].values.astype(str)
                    if word not in stopwords))

    print("\n normalizing and vectorizing ...\n")

    # texts = [
    #   [word for word in doc if word not in stopwords] for doc in docs]

    print("\n stopwords removed ...\n")

    print("\n writing mastercorpus ...\n")

    mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt')

    with open(mastercorpus, 'w', encoding="utf-8") as data:
        folder = glob.glob("swcorp/*")
        for text in folder:
            with open(text, 'r', encoding="utf-8") as text:
                textline = [re.sub(
                    r'\\n\\r', '', document) for document in ' '.join(
                        text.read().split())]
                if text != folder[-1]:
                    data.write("".join(textline) + "\n")
                else:
                    data.write("".join(textline))

    # MAIN PART
    mastercorpus = os.path.join(os.getcwd(), 'mycorpus.txt')

    dictionary = corpora.Dictionary(
        line.lower().split() for line in open(
            mastercorpus, encoding="utf-8"))

    class MyCorpus(object):
        def __iter__(self):
            for line in open('mycorpus.txt'):
                # assume there's one document per line, tokens
                # separated by whitespace
                yield dictionary.doc2bow(line.lower().split())

    # corpus = buildCorpus(mastercorpus, dictionary)

    corpus = MyCorpus()

    # corpus = glob.glob("swcorpus/*")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)): os.makedirs(os.path.join
    # (os.path.join(os.getcwd(), 'out'), foldername))

    MmCorpus.serialize(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus.mm'])), corpus)
    mm = MmCorpus('out/corpus.mm')

    print(mm)

    # doc_labels = glob.glob("corpus/*")

    print("fitting the model ...\n")

    model = LdaModel(
        corpus=mm, id2word=dictionary, num_topics=no_of_topics,
        passes=no_of_passes, eval_every=eval, chunksize=chunk,
        alpha=alpha, eta=eta)

    # model = LdaMulticore(corpus=corpus, id2word=dictionary,
    # num_topics=no_of_topics, passes=no_of_passes,
    # eval_every=eval, chunksize=chunk, alpha=alpha, eta=eta)

    print(model, "\n")

    topics = model.show_topics(num_topics=no_of_topics)

    for item, i in zip(topics, enumerate(topics)):
        print("topic #"+str(i[0])+": "+str(item)+"\n")

    print("saving ...\n")

    if not os.path.exists("out"):
        os.makedirs("out")
    # if not os.path.exists(os.path.join(os.path.join(os.getcwd(),
    # 'out'), foldername)):
    # os.makedirs(os.path.join(os.path.join(os.getcwd(), 'out'),
    # foldername))

    with open(
        os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
            ["corpus_doclabels.txt"])), "w", encoding="utf-8") as f:
            for item in doc_labels:
                f.write(item + "\n")

    with open(
        os.path.join(os.path.join(os.getcwd(), "out"), ''.join(
            ["corpus_topics.txt"])), "w", encoding="utf-8") as f:
        for item, i in zip(topics, enumerate(topics)):
            f.write(
                "".join(["topic #", str(i[0]), ": ", str(item), "\n"]))

    dictionary.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus', 'dict'])))
    # MmCorpus.serialize(
    # os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
    # [foldername, 'mm'])), corpus)
    model.save(
        os.path.join(os.path.join(os.getcwd(), "out"), '.'.join(
            ['corpus', 'lda'])))

    print("\n ta-daaaa ...\n")
    
    # VISUALIZATION
    no_of_topics = model.num_topics
    no_of_docs = len(doc_labels)
    doc_topic = np.zeros((no_of_docs, no_of_topics))
    
    for doc, i in zip(corpus, range(no_of_docs)):
        # topic_dist is a list of tuples (topic_id, topic_prob)
        topic_dist = model.__getitem__(doc)
        for topic in topic_dist:
            doc_topic[i][topic[0]] = topic[1]
    
    # get plot labels
    topic_labels = []
    for i in range(no_of_topics):
        # show_topic() returns (word, word_probability) tuples in current gensim
        topic_terms = [x[0] for x in model.show_topic(i, topn=3)]
        topic_labels.append(" ".join(topic_terms))
        
    # cf. https://de.dariah.eu/tatom/topic_model_visualization.html

    if no_of_docs > 20 or no_of_topics > 20:
        plt.figure(figsize=(20, 20)) # if many items, enlarge figure
    plt.pcolor(doc_topic, norm=None, cmap='Reds')
    plt.yticks(np.arange(doc_topic.shape[0])+1.0, doc_labels)
    plt.xticks(
        np.arange(doc_topic.shape[1])+0.5, topic_labels, rotation='90')
    plt.gca().invert_yaxis()
    plt.colorbar(cmap='Reds')
    plt.tight_layout()
    plt.savefig("./static/corpus_heatmap.svg")
    return render_template('success.html')
Example #18
def main():
    # How to create a dictionary from a list of sentences?
    documents = read_course_descriptions()

    # Tokenize(split) the sentences into words
    texts = [[text for text in doc.split()] for doc in documents]

    # Create dictionary
    dictionary = corpora.Dictionary(texts)

    # Get information about the dictionary
    print(dictionary)

    # print(dictionary.token2id)

    # Tokenize the docs
    stopword_nltk = stopwords.words('english')
    tokenized_list = [preprocess(doc, stopword_nltk) for doc in documents]

    # Create the Corpus
    mydict = corpora.Dictionary()
    mycorpus = [
        mydict.doc2bow(doc, allow_update=True) for doc in tokenized_list
    ]
    # pprint(mycorpus)
    # > [[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)], [(4, 4)]]

    word_counts = [[(mydict[id], count) for id, count in line]
                   for line in mycorpus]
    pprint(word_counts)

    # Create the TF-IDF model
    tfidf = models.TfidfModel(mycorpus, smartirs='ntc')

    # Show the TF-IDF weights
    #for doc in tfidf[mycorpus]:
    #    print([[mydict[id], np.around(freq, decimals=2)] for id, freq in doc])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s')
    logging.root.setLevel(level=logging.INFO)

    lda_model = LdaModel(corpus=tfidf[mycorpus],
                         id2word=mydict,
                         random_state=100,
                         num_topics=30,
                         passes=100,
                         chunksize=1000,
                         alpha='asymmetric',
                         decay=0.5,
                         offset=64,
                         eta=None,
                         eval_every=0,
                         iterations=1000,
                         gamma_threshold=0.001,
                         per_word_topics=True)

    # save the model
    lda_model.save('lda_model.model')

    # See the topics
    #lda_model.print_topics(-1)

    lda_model.show_topic(0)
Example #19
class TopicModel:
    def __init__(self, topicCollection, string):
        if string.lower() == "nmf":
            self.model = "NMF"
            print("Topic Extraction Model: sklearn.NMF")
        else:
            self.model = "LDA"
            print("Topic Extraction Model: gensim.LDAModel")
        self.stemmer = PorterStemmer()

    #Train the LDA model on the current discussion
    def train(self, sentences):
        if self.model == "NMF":
            self.sentenceData = []
            for sentence in sentences:
                self.sentenceData.append(preprocess(sentence, self.stemmer))
            self.tfidf_vectorizer = TfidfVectorizer(
                max_features=1500,
                ngram_range=(1, 2),
                preprocessor=' '.join,
                stop_words='english'
            )
            tfidf = self.tfidf_vectorizer.fit_transform(self.sentenceData)
            self.nmf = NMF(n_components=2, solver="mu")
            self.W = self.nmf.fit_transform(tfidf)
            self.H = self.nmf.components_
        else:
            sentenceData = []
            for sentence in sentences:
                sentenceData.append(preprocess(sentence, self.stemmer))
            self.dictionary = Dictionary(sentenceData)
            bow_corpus = [self.dictionary.doc2bow(doc) for doc in sentenceData]
            self.lda_model = LdaModel(bow_corpus, num_topics=2, id2word=self.dictionary, passes=10)

    #Classify a given sentence to one of the topics found in training
    def classify(self, sentence):
        if self.model == "NMF":
            index = self.sentenceData.index(preprocess(sentence, self.stemmer))
            topic = self.W.argmax(axis=1)[index]
            return "Topic " + str(topic)
        else:
            bow_vector = self.dictionary.doc2bow(preprocess(sentence, self.stemmer))
            return "Topic " + str(sorted(self.lda_model[bow_vector], key=lambda tup: -1*tup[1])[0][0])

    #Shows the terms of a given topic
    def showTerms(self, topic):
        if self.model == "NMF":
            terms = ""
            top_features = []
            tfidf_feature_names = self.tfidf_vectorizer.get_feature_names()
            for topic_idx, topicID in enumerate(self.H):
                if topic_idx == int(topic.split(' ')[-1]):
                    top_features_ind = topicID.argsort()[:-20 - 1:-1]
                    top_features = [tfidf_feature_names[i] for i in top_features_ind]
                    weights = topicID[top_features_ind]
            for term in top_features:
                terms += term + ", "
            print(topic.split(' ')[-1] + " " + terms)
            return terms
        else:
            terms = ""
            topic = int(topic.split(" ")[-1])
            for term in self.lda_model.show_topic(topic):
                terms += term[0] + ", "
            print(str(topic) + " " + terms)
            return terms

    #Gets the probability or the coefficient of the given term in the topic
    def getCoeff(self, topic, term):
        if self.model == "NMF":
            weights = []
            top_features = []
            tfidf_feature_names = self.tfidf_vectorizer.get_feature_names()
            for topic_idx, topicID in enumerate(self.H):
                if topic_idx == topic:
                    top_features_ind = topicID.argsort()[:-20 - 1:-1]
                    top_features = [tfidf_feature_names[i] for i in top_features_ind]
                    weights = topicID[top_features_ind]
            for coeff, terms in zip(weights, top_features):
                if terms == term:
                    return coeff
        else:
            topic = int(topic.split(" ")[-1])
            for terms in self.lda_model.show_topic(topic):
                if terms[0] == term:
                    return terms[1]

    #Shows all the topics found in training
    def showTopics(self):
        if self.model == "NMF":
            ret = []
            for topic_idx, topicID in enumerate(self.H):
                ret.append("Topic " + str(topic_idx))
            return ret
        else:
            topics = self.lda_model.print_topics()
            ret = []
            for topic in topics:
                ret.append("Topic " + str(topic[0]))
            return ret

    #Returns a flag to check what model is deployed at the moment
    def getModel(self):
        return self.model
Example #20
                        except:
                            continue
                writer.writerow(new_sentence)
                new_sentences.append(new_sentence)

    # Build a dictionary mapping words to word IDs
    dictionary = Dictionary(new_sentences)
    # Convert to the BoW format that LdaModel can read
    corpus = [dictionary.doc2bow(text) for text in new_sentences]

    # Train the model with the specified number of topics
    lda = LdaModel(corpus=corpus,
                   id2word=dictionary,
                   num_topics=9,
                   minimum_probability=0.001,
                   passes=20,
                   update_every=0,
                   chunksize=10000)

    with open('output_topics.csv', 'w', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        for i in range(9):
            writer.writerow([f'------------topic{i}------------'])
            print("\n")
            print("=" * 80)
            print("TOPIC {0}\n".format(i))
            topic = lda.show_topic(i, topn=20)
            for t in topic:
                print("{0:20s}{1}".format(t[0], t[1]))
                writer.writerow([t[0], t[1]])
Example #21
# WordCloud
# Download a Japanese font and place it under ./work
fig, axs = plt.subplots(ncols=2,
                        nrows=math.ceil(lda_model.num_topics / 2),
                        figsize=(16, 20))
axs = axs.flatten()


def color_func(word, font_size, position, orientation, random_state,
               font_path):
    return 'darkturquoise'


for i, t in enumerate(range(lda_model.num_topics)):

    x = dict(lda_model.show_topic(t, 30))
    im = WordCloud(background_color='black',
                   color_func=color_func,
                   max_words=4000,
                   width=300,
                   height=300,
                   random_state=0,
                   font_path='./work/ipaexg.ttf').generate_from_frequencies(x)
    axs[i].imshow(im.recolor(colormap='Paired_r', random_state=244),
                  alpha=0.98)
    axs[i].axis('off')
    axs[i].set_title('Topic ' + str(t))

# vis
plt.tight_layout()
plt.show()
Example #22
bow_corpus = [ dictionary.doc2bow(t) for t in texts ]

print('Serializing corpus (%s) ...' % BOW)
MmCorpus.serialize(BOW, bow_corpus)

size = len(bow_corpus) * 9 // 10
training = bow_corpus[:size]
testing = bow_corpus[size:]
t0 = time()
print('Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training)))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5)
print("done in %0.3fs." % (time() - t0))
print('Saving LDA model (%s) ...' % NSFLDA)
lda.save(NSFLDA)

print('Random subset of topics:')
print('\n'.join(lda.print_topics()))

print('Computing perplexity on %d held-out documents ...' % len(testing))
perplexity = 2 ** -(lda.log_perplexity(testing))
print('Perplexity: %.2f' % perplexity)

for i in range(0, Num_Topics):
 temp = lda.show_topic(i, 10)
 terms = []
 for term in temp:
     terms.append(term[1])
 print("Top 10 terms for topic #" + str(i) + ": " + ", ".join(terms))


Example #23
print('Building bag-of-words corpus ...')
bow_corpus = [dictionary.doc2bow(t) for t in texts]

print('Serializing corpus (%s) ...' % BOW)
MmCorpus.serialize(BOW, bow_corpus)

size = len(bow_corpus) * 9 // 10
training = bow_corpus[:size]
testing = bow_corpus[size:]
t0 = time()
print('Training LDA w/ %d topics on first %d texts ...' % (Num_Topics, len(training)))
lda = LdaModel(training, id2word=dictionary, num_topics=Num_Topics, passes=5)
print("done in %0.3fs." % (time() - t0))
print('Saving LDA model (%s) ...' % NSFLDA)
lda.save(NSFLDA)

print('Random subset of topics:')
print('\n'.join(lda.print_topics()))

print('Computing perplexity on %d held-out documents ...' % len(testing))
perplexity = 2**-(lda.log_perplexity(testing))
print('Perplexity: %.2f' % perplexity)

for i in range(0, Num_Topics):
    temp = lda.show_topic(i, 10)
    terms = []
    for term in temp:
        terms.append(term[1])
    print("Top 10 terms for topic #" + str(i) + ": " + ", ".join(terms))
Example #24
largest = pairwise.max()
for ti in range(len(topics)):
    pairwise[ti, ti] = largest + 1


def closest_to(doc_id):
    return pairwise[doc_id].argmin()


counts = np.zeros(100)
for doc_top in topics:
    for ti, _ in doc_top:
        counts[ti] += 1

words = lda_model.show_topic(counts.argmax(), 64)
print(words)

#
# plot
#

# for ti in xrange(84):
#     words = lda_model.show_topic(ti, 64)
#     tf = sum(f for f, w in words)
#     print('\n'.join('{}:{}'.format(w, int(1000. * f / tf)) for f, w in words))
#     print()
#     print()
#     print()

# thetas = [lda_model[c] for c in corpus_lda]
Example #25
def get_topics(raw_text, ngram=1, vocab_binary=True, nwords=30, ntopics=1):

    # Enable logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    print('Number of documents: ' + str(len(raw_text)))
    print('\nTokenizing documents..')

    tokens = []
    for doc in raw_text:

        try:
            doc = doc.encode("utf-8")
            token = word_tokenize(str(doc))

            clean_token = [
                i.lower() for i in token if i.strip() not in stop_words
                and i[:-2].strip() not in stop_words and i[:-1].strip() not in
                stop_words and i.strip().isalpha() and len(i.strip()) > 1
            ]

            ngram_tokens = []

            ngram_tokens.extend([x[0] for x in ngrams(clean_token, 1)])

            if ngram > 1:
                for i in range(2, ngram + 1):
                    ngram_tokens.extend(
                        [' '.join(x) for x in ngrams(clean_token, i)])

            if vocab_binary:
                tokens.append(set(ngram_tokens))
            else:
                tokens.append(ngram_tokens)

        except Exception as e:
            print(e)

    # turn our tokenized documents into an id <-> term dictionary
    dictionary = corpora.Dictionary(tokens)

    # dictionary.filter_extremes(no_above=1.0, keep_n=None)

    len(dictionary)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in tokens]

    lm = LdaModel(corpus=corpus,
                  id2word=dictionary,
                  passes=1,
                  num_topics=ntopics)

    # get topic 0 words (Topic: Default) for foam-tree viz
    words = (lm.show_topic(0, nwords))

    output = []

    weights = [x[1] for x in words]
    scaled_weights = scale_range(weights, 5, 60)
    scaled_weights = numpy.nan_to_num(scaled_weights, 5)

    for i, word in enumerate(words):
        output.append({'label': word[0], 'weight': int(scaled_weights[i])})

    return output
Example #26
class WordEmbeddingRewarder():

    def __init__(self):
        self.word_embedding = KeyedVectors.load_word2vec_format(W2V_PATH)

    def __call__(self, docs, summaries):
        tfs = []
        df = OrderedDict()
        weights = []
        entities = []
        for doc in docs:
            tf = OrderedDict()
            token_found = set()
            doc_token = []
            for sent in doc[1]:
                sent = sent2tokens_wostop(sent, set(stopwords.words(LANGUAGE)), LANGUAGE)
                for token in sent:
                    if token in tf:
                        tf[token] += 1
                    else:
                        tf[token] = 1
                    if token not in token_found:
                        token_found.add(token)
                        if token in df:
                            df[token] += 1
                        else:
                            df[token] = 1
                    embedding = np.zeros(300)
                    try:
                        embedding += self.word_embedding[token]
                    except KeyError:
                        pass
                embedding /= len(embedding)
                weights.append(embedding)
                entities.append(str(len(entities)))
            tfs.append(tf)

        id2word = {i:word for i, word in enumerate(df.keys())}
        word2id = {id2word[id]:id for id in id2word.keys()}
        corpora = [[(word2id[token], tf[token]) for token in tf.keys()] for tf in tfs]
        self.doc_entities = []
        for i, tf in enumerate(tfs):
            divisor = sum([tf[token]/df[token] for token in tf.keys()])
            embedding = []
            for token in tf.keys():
                try:
                    embedding.append(self.word_embedding[token]*tf[token]/df[token])
                except KeyError:
                    pass
            embedding = np.sum(np.array(embedding), 0)/(len(embedding)*divisor)
            weights.append(embedding)
            entities.append('d'+str(i))
            self.doc_entities.append('d'+str(i))

        self.lda = LdaModel(corpus=corpora, num_topics=10, id2word=id2word, passes=10)

        self.topic_entities = []
        for i in range(10):
            topic_words = self.lda.show_topic(i, topn=30)
            embedding = []
            divisor = sum([w_p_pair[1]for w_p_pair in topic_words])
            for w_p_pair in topic_words:
                try:
                    embedding.append(self.word_embedding[w_p_pair[0]]*w_p_pair[1]/divisor)
                except KeyError:
                    pass
            embedding = np.sum(np.array(embedding), 0)/len(embedding)
            weights.append(embedding)
            entities.append('t'+str(i))
            self.topic_entities.append('t'+str(i))
        self.sent_embedding = WordEmbeddingsKeyedVectors(300)
        self.sent_embedding.add(entities, np.array(weights), replace=True)

        return self.distributional_semantic_similarity(summaries), self.topic_relevance(summaries), self.coherence(summaries)

    def distributional_semantic_similarity(self, summaries):
        results = []
        for summ in summaries:
            sent_entities = list(map(str, summ))
            wmd = self.sent_embedding.wmdistance(sent_entities , self.doc_entities)
            results.append(wmd)
        return results

    def topic_relevance(self, summaries):
        results = []
        for summ in summaries:
            sent_entities = list(map(str, summ))
            wmd = self.sent_embedding.wmdistance(sent_entities , self.topic_entities)
            results.append(wmd)
        return results

    def coherence(self, summaries):
        results = []
        for summ in summaries:
            sim = []
            for i in range(len(summ)-1):
                s = cosine(self.sent_embedding[str(summ[i])], self.sent_embedding[str(summ[i+1])])
                if not np.isnan(s):
                    sim.append(s)
            sim = np.array(sim)
            results.append([np.mean(sim), np.std(sim)])
        return results
Example #27
    with open(os.path.join(path, 'data.tsv'), encoding='utf8') as f:
        reader = csv.reader(f, delimiter="\t")
        for line in reader:
            labels = line[0].split(', ')
            multi_hot_labels.append(labels)
            c = line[1:]
            c = clean_data(c)
            context.extend(c)
    #convert to multi-hot encoding
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(multi_hot_labels)
    label_list = list(mlb.classes_)

    token_context = [word_tokenize(x) for x in context]
    token_list = []
    for x in token_context:
        temp = [i for i in x if not i in stop_words]
        token_list.append(temp)
    token_context = [clean_data(x) for x in token_list]
    del token_list
    common_dictionary = Dictionary(token_context)
    common_corpus = [common_dictionary.doc2bow(text) for text in token_context]
    # Train the model on the corpus.
    lda = LdaModel(common_corpus,
                   id2word=common_dictionary,
                   alpha='auto',
                   num_topics=3,
                   passes=5)
    print(lda.show_topic(2, 20))
Example #28
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in documents]

from collections import defaultdict

frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] >= 1]
         for text in texts]

from pprint import pprint  # pretty-printer

dictionary = corpora.Dictionary(texts)
# dictionary.save('/tmp/deerwester.dict') # store the dictionary, for future reference
# print(dictionary)
corpus = [dictionary.doc2bow(text) for text in texts]
# corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)

lda = LdaModel(corpus, num_topics=2)

# on a new document:
new_doc = "pretty obvious that when i write my tellall memoir someday there will be four to six"
new_vec = dictionary.doc2bow(new_doc.lower().split())

print(lda.print_topic(0))
print(lda.show_topic(1))
print(lda.get_document_topics(new_vec))
Example #29
import pickle
from gensim.models import CoherenceModel
ldamodel = pickle.load(
    open(
        "\\Users\\hamed\\Desktop\\ECE 143 Project Data Files\\ldamodel_100_QAT.pkl",
        "rb"))
ldamulticore = pickle.load(
    open(
        "\\Users\\hamed\\Desktop\\ECE 143 Project Data Files\\ldamulticore_100_QAT.pkl",
        "rb"))

# In[ ]:

# Get top  significant terms and their probabilities for each topic using LDA multicore
topics_ldam = [[(term, round(wt, 3))
                for term, wt in ldamodel.show_topic(n, topn=20)]
               for n in range(0, ldamodel.num_topics)]

# In[ ]:

# 5 most probable words for each topic for LDA
topics_df = pd.DataFrame(
    [[term for term, wt in topic] for topic in topics_ldamulticore],
    columns=['Term' + str(i) for i in range(1, 21)],
    index=['Topic ' + str(t) for t in range(1, ldamulticore.num_topics + 1)]).T
topics_df.head()

# In[ ]:

# 5 most probable words for each topic for LDA
topics_df = pd.DataFrame(
Example #30
class LdaModelHelper:

    status_scheduled = 'scheduled'
    status_computing = 'computing'
    status_completed = 'completed'
    status_error = 'killed'

    default_use_lemmer = True
    default_min_df = 2
    default_max_df = 0.8

    def __init__(self,
                 training_number_of_topics_to_extract,
                 language,
                 training_use_lemmer=True,
                 training_min_df=2,
                 training_max_df=0.8,
                 chunksize=2000,
                 passes=2):
        """

        :rtype: LdaModelHelper
        :param training_use_lemmer:
        :param training_min_df: int or float, min document frequency / document proportion (if float < 1)
        to consider a term in the model
        :param training_max_df: int or float, max document frequency / document proportion (if float < 1)
        to consider a term in the model
        """

        self.language = language

        self.analysis_use_lemmer = LdaModelHelper.default_use_lemmer
        self.analysis_min_df = LdaModelHelper.default_min_df
        self.analysis_max_df = LdaModelHelper.default_max_df

        self.analysis_corpus = None
        self.analysis_features_names = None
        self.analysis_documents = None

        self.training_number_of_topics_to_extract = training_number_of_topics_to_extract
        self.training_use_lemmer = training_use_lemmer
        self.training_min_df = training_min_df
        self.training_max_df = training_max_df
        self.chunksize = chunksize
        self.passes = passes

        self.training_corpus = None
        self.training_features_names = None
        self.analysis_documents = None
        self.training_documents = None

        self.lda_model = None
        self.model_computation_time = None

        self.topic_labels = None
        self.topic_assignment = None

    def set_analysis_parameters(self,
                                analysis_use_lemmer=True,
                                analysis_min_df=2,
                                analysis_max_df=0.8):

        self.analysis_use_lemmer = analysis_use_lemmer
        self.analysis_min_df = analysis_min_df
        self.analysis_max_df = analysis_max_df

        # reset related fields
        self.topic_assignment = None
        self.topic_labels = None
        self.analysis_corpus = None
        self.analysis_features_names = None
        self.analysis_documents = None

    def generate_model_filename(self):
        return "_".join([
            str(time.time()),
            str(self.training_number_of_topics_to_extract),
            str(self.training_min_df),
            str(self.training_max_df),
            str(self.training_use_lemmer)
        ]).replace('.', '')

    def set_lda_model(self, lda_model):
        self.lda_model = lda_model

    #####################
    # Model computation
    #####################

    def compute_lda_model(self, texts):
        """
        Compute the lda model
        :return:
        """
        if self.training_corpus is None:
            self.compute_corpus(texts, parameters='training')

        if self.training_corpus is None or len(self.training_corpus) == 0:
            raise Exception(
                'The training corpus is empty. Tune model computation parameters.'
            )

        start = time.time()

        if self.passes == 2:
            passes = 10 if (len(self.training_corpus) /
                            self.chunksize) < 10 else 2
        else:
            passes = self.passes

        id2word = {k: v for k, v in enumerate(self.training_features_names)}

        self.lda_model = LdaModel(
            self.training_corpus,
            id2word=id2word,
            num_topics=self.training_number_of_topics_to_extract,
            eval_every=1,
            passes=passes,
            chunksize=self.chunksize)
        end = time.time()

        self.model_computation_time = end - start

    def save_model_to_file(self, file_path):
        """

        :type file_path: str
        :param file_path: the path of the models file
        :return:
        """
        if self.lda_model is None:
            logging.error('The model has not been computed yet.')
            return False
        else:
            self.lda_model.save(file_path)

    def load_model_from_file(self, input_filepath):
        """

        :param input_filepath: path of the saved model file
        :return:
        """
        self.lda_model = LdaModel.load(input_filepath)

    def compute_corpus(self, texts, parameters='training'):
        """
        Compute the corpus in gensim format considering the specified set of parameters 'training' or 'analysis'.
        :param parameters:
        :param texts:
        :return:
        """
        if parameters == 'training':
            tf_matrix, tf_matrix_features_names, tf_matrix_docs_ids = self.compute_tf_matrix(
                texts, parameters)

            if tf_matrix_features_names is None or len(
                    tf_matrix_features_names) == 0:
                return []

            self.training_corpus = matutils.Sparse2Corpus(
                tf_matrix, documents_columns=False)
            self.training_features_names = tf_matrix_features_names
            self.training_documents = tf_matrix_docs_ids
            return self.training_corpus
        elif parameters == 'analysis':
            if self.lda_model is None:
                logging.error('The model has not been computed yet.')
                return None
            else:
                # Note: words not included in the model are ignored
                tf_matrix, tf_matrix_features_names, tf_matrix_docs_ids = self.compute_tf_matrix(
                    texts, parameters)

                if len(tf_matrix_features_names) == 0:
                    return []

                corpus = [None] * tf_matrix.shape[0]

                if len(tf_matrix_features_names) != 0:
                    word2id = {
                        self.lda_model.id2word[id]: id
                        for id in self.lda_model.id2word.keys()
                    }

                    for i in range(tf_matrix.shape[0]):
                        doc = tf_matrix.getrow(i)
                        _, cols = doc.nonzero()

                        corpus[i] = [None] * len(cols)
                        count = 0
                        for col in cols:
                            if tf_matrix_features_names[col] in word2id.keys():
                                corpus[i][count] = (int(
                                    word2id[tf_matrix_features_names[col]]),
                                                    int(tf_matrix[i, col]))
                                count += 1

                        corpus[i] = corpus[i][:count]

                self.analysis_corpus = corpus
                self.analysis_features_names = tf_matrix_features_names
                self.analysis_documents = tf_matrix_docs_ids

                return self.analysis_corpus
        else:
            logging.error(
                "Value not allowed for argument parameters. Allowed values are 'training' or 'analysis'."
            )
            return None

    def compute_corpus_single_query(self, text):
        """
        Compute the corpus in gensim format for a single query (this implies using special parameters for preprocessing)
        :param text:
        :return:
        """

        if self.lda_model is None:
            logging.error('The model has not been computed or loaded yet.')
            return None, None
        else:
            # Note: words not included in the model are ignored
            stopwords_list = lda_utils.get_stopwords(self.language)
            tf_matrix, tf_matrix_features_names = lda_utils.compute_tf(
                [text], stopwords_list, self.language, True, 1, 1.0)

            if len(tf_matrix_features_names) == 0:
                return [], tf_matrix_features_names

            corpus = [None] * tf_matrix.shape[0]

            if len(tf_matrix_features_names) != 0:
                word2id = {
                    self.lda_model.id2word[id]: id
                    for id in self.lda_model.id2word.keys()
                }

                for i in range(tf_matrix.shape[0]):
                    doc = tf_matrix.getrow(i)
                    _, cols = doc.nonzero()

                    corpus[i] = [None] * len(cols)
                    count = 0
                    for col in cols:
                        if tf_matrix_features_names[col] in word2id.keys():
                            corpus[i][count] = (int(
                                word2id[tf_matrix_features_names[col]]),
                                                int(tf_matrix[i, col]))
                            count += 1

                    corpus[i] = corpus[i][:count]

            return corpus, tf_matrix_features_names

    def compute_tf_matrix(self, texts, parameters='training'):
        """
        Compute the tf matrix using the specified set of parameters ('training' or 'analysis').
        If texts is not specified the system tries to retrieve data directly from the associated db.
        :param parameters: 'training' or 'analysis'
        :param texts: list of strings representing texts to transform.
        :return:
        """

        tf_matrix_docs_id = None
        if parameters == 'training' or parameters == 'analysis':

            stopwords_list = lda_utils.get_stopwords(self.language)

            if parameters == 'training':
                use_lemmer = self.training_use_lemmer
                min_df = self.training_min_df
                max_df = self.training_max_df
            else:
                use_lemmer = self.analysis_use_lemmer
                min_df = self.analysis_min_df
                max_df = self.analysis_max_df

            tf_matrix, tf_matrix_features_names = lda_utils.compute_tf(
                texts, stopwords_list, self.language, use_lemmer, min_df,
                max_df)
        else:
            logging.error(
                "Value not allowed for argument parameters. Allowed values are 'training' or 'analysis'."
            )
            return None, None, None

        return tf_matrix, tf_matrix_features_names, tf_matrix_docs_id

    def compute_topic_assignment(self, texts):
        """
        Computes the topics assignment for each document w.r.t the specified topic_model

        Example of output = [[(25, 0.1174058544855012), (49, 0.82926081218116554)],
                            [(6, 0.29928250617927882), (49, 0.59405082715405444)]]

        :param texts:
        :return:
        """
        corpus = self.compute_corpus(texts, parameters='analysis')

        if corpus is None or len(corpus) == 0:
            raise Exception(
                'The corpus is empty. Tune analysis parameters and check stopwords.'
            )

        computed_assignment = self.lda_model[corpus]
        if texts is not None:
            # the corpus was built with the analysis parameters, so cache the assignment
            self.topic_assignment = computed_assignment

        return computed_assignment

    def compute_topic_assignment_for_query(self, text):
        corpus, _ = self.compute_corpus_single_query(text)

        if corpus is None or len(corpus) == 0:
            raise Exception(
                'The corpus is empty. Tune analysis parameters and check stopwords.'
            )

        computed_assignment = self.lda_model[corpus]

        return computed_assignment

    #######################
    # Print functions
    #######################

    def print_topic_assignment(self, topic_assignment):
        """
        Print a topic assignment in a human readable format
        :param topic_assignment:
        :return:
        """
        print('\tTopic importance\tTopic description')
        for i, doc in enumerate(topic_assignment):
            print('Document {0}'.format(i))
            for a in doc:
                print()
                string_topic = a[0] if self.lda_model is None \
                    else self.lda_model.print_topic(a[0])
                print('\t{1:.2f}\t\t{0}'.format(string_topic, a[1]))

    def print_all_topics(self,
                         num_topics=10,
                         num_words=20,
                         try_to_disambiguate=False,
                         min_word_probability_for_disambiguation=0.010):
        """
        Print topics from a given LdaModel
        """
        print('Print {0} topics'.format(num_topics))
        print('------------')
        for t in self.lda_model.show_topics(num_topics=num_topics,
                                            num_words=num_words,
                                            formatted=False):
            if try_to_disambiguate:
                possible_labels = self.__class__.label_topic_by_probability(
                    self.lda_model.show_topic(t[0]),
                    min_word_probability=min_word_probability_for_disambiguation
                )[:2]
                print('{0}:\t{1}\n'.format(t[0], possible_labels))
                print('{0}\n'.format(t[1]))
            else:
                print('{0}:\t{1}\n'.format(t[0], t[1]))

    def get_topic_description(self, topic_id, num_words=20):
        """
        Print topics from a given LdaModel
        """
        if self.lda_model is None:
            logging.error('The model has not been computed yet.')
        else:
            return self.lda_model.show_topic(topic_id, num_words)

    #######################
    # Labeling functions
    #######################

    def compute_topic_labels(self,
                             labeling_mode='mixed',
                             min_word_probability=0.01,
                             max_number_of_words_per_query=6,
                             n_words_to_label=3):
        """
        The labeling is performed querying wikipedia with a set of representative words for the topic.
        The words are chosen with the parameter
        labeling_mode:
        - 'based_on_probability': considers all words with a weight (probability) greater than min_word_probability
        - 'based_on_top_words': considers the n_words_to_label most probable words of the topic
        - 'mixed': tries 'based_on_probability' first and falls back to 'based_on_top_words' if it yields no results
        """

        if self.lda_model is None:
            logging.error('No LDA model loaded.')
            return

        n_labels_to_save = 3
        self.topic_labels = {}

        # label topics
        for t in self.lda_model.show_topics(
                num_topics=self.training_number_of_topics_to_extract,
                num_words=40,
                formatted=False):
            topic_id = t[0]

            possible_labels = []
            if labeling_mode == 'mixed' or labeling_mode == 'based_on_probability':
                possible_labels = self.__class__.label_topic_by_probability(
                    self.lda_model.show_topic(topic_id),
                    min_word_probability=min_word_probability,
                    max_words=max_number_of_words_per_query)[:n_labels_to_save]

            if len(possible_labels) == 0:
                # try to disambiguate by n_words
                possible_labels = self.__class__.label_topic_by_number_of_words(
                    self.lda_model.show_topic(topic_id),
                    n_words=n_words_to_label)[:n_labels_to_save]

            for i in range(len(possible_labels), n_labels_to_save):
                # fill empty labels
                possible_labels.append('')

            self.topic_labels[topic_id] = possible_labels
            time.sleep(0.5)

    def get_topic_labels(self):
        if self.topic_labels is None:
            self.compute_topic_labels()

        return self.topic_labels

    def get_all_topics(self):
        """
        Return a dictionary where keys are topic ids (integers) and values are words distributions.
        Words distribution should be a dictionary where keys are words and values are words weights within the topic
        :rtype: dict
        :return:
        """

        topics = {}

        for t in self.lda_model.show_topics(
                num_topics=self.training_number_of_topics_to_extract,
                num_words=config.max_number_of_words_per_topic,
                formatted=False):
            topic_id = t[0]
            topic_distr = self.get_word_frequencies(
                self.lda_model.show_topic(
                    topic_id, config.max_number_of_words_per_topic))

            topics[topic_id] = topic_distr

        return topics

    def _get_words_distribution(self, topic_id):
        """
        Return a dictionary where keys are words and values are word weights within the topic

        :param topic_id: the topic index
        :rtype: dict
        :return:
        """
        topic_description = self.lda_model.show_topic(
            topic_id, config.max_number_of_words_per_topic)
        return self.__class__.get_word_frequencies(topic_description)

    @classmethod
    def delete_model_files(cls, folder_path, files_prefix):
        """
        Delete all files related to a model that have the specified file prefix
        :param folder_path:
        :param files_prefix:
        :rtype:
        :return: 200 if all files have been removed, 404 if the files do not exist
        """
        if os.path.exists(os.path.join(folder_path, files_prefix)):
            files_to_remove = [
                files_prefix,
                files_prefix + ".state",
                files_prefix + ".expElogbeta.npy",
                files_prefix + ".id2word",
            ]

            for f in files_to_remove:
                os.remove(os.path.join(folder_path, f))

            return 200
        else:
            logging.error('Model files do not exist.')
            return 404

    #######################
    # Topic labeling
    #######################

    @classmethod
    def label_topic_by_probability(cls,
                                   topic_description,
                                   min_word_probability=0.010,
                                   max_words=6):
        """
        Try to disambiguate a topic considering all words with a weight greater than min_word_probability
        :param max_words:
        :param topic_description: is a list of pairs  (word, word_probability)
        :param min_word_probability: is the minimum probability for words
        :return: list of strings, possible wikipedia pages
        """
        words = [w for w, p in topic_description if p >= min_word_probability]
        words = words[:max_words]

        if len(words) == 0:
            # if no words are over the threshold return empty
            res = []
        else:
            res = wikipedia.search(' '.join(words))

        return res

    @classmethod
    def label_topic_by_number_of_words(cls, topic_description, n_words=5):
        """
        Try to disambiguate a topic considering top k words in its description
        :param n_words:
        :param topic_description: is a list of pairs  (word, word_probability)
        :return: list of strings, possible wikipedia pages
        """
        words = [t[0] for i, t in enumerate(topic_description) if i < n_words]

        if len(words) == 0:
            # fall back to the single most probable word
            words = [topic_description[0][0]]

        res = wikipedia.search(' '.join(words))
        return res

    @classmethod
    def get_word_frequencies(cls, topic_description):
        """
        Given a topic description, returns the corresponding dictionary with words as keys
        and their weights as values.
        :param topic_description: list of pairs (word, word_weight)
        :return:
        """
        frequencies = {w: f for w, f in topic_description}
        return frequencies
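
The class above builds its gensim corpus from a scikit-learn term-frequency matrix and labels topics by querying Wikipedia with their most probable words. Below is a minimal, standalone sketch of that same pipeline, not the author's code: the toy documents and variable names are illustrative, and it assumes gensim, scikit-learn >= 1.0 (for get_feature_names_out) and the wikipedia package are installed, with network access for the search call.

from gensim import matutils
from gensim.models import LdaModel
from sklearn.feature_extraction.text import CountVectorizer
import wikipedia

docs = ["the cat sat on the mat",
        "dogs and cats are friendly pets",
        "stock markets fell sharply today"]
vectorizer = CountVectorizer(stop_words='english')
tf_matrix = vectorizer.fit_transform(docs)                  # documents x terms sparse matrix

# rows are documents, so documents_columns=False, mirroring compute_corpus above
corpus = matutils.Sparse2Corpus(tf_matrix, documents_columns=False)
id2word = dict(enumerate(vectorizer.get_feature_names_out()))

lda_model = LdaModel(corpus, id2word=id2word, num_topics=2, passes=10)

# label topic 0 as in label_topic_by_probability: keep words above a weight threshold
words = [w for w, p in lda_model.show_topic(0, topn=6) if p >= 0.01]
print(wikipedia.search(' '.join(words))[:3])                # candidate labels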
Example #31
obj = lda.get_topics()         # [num_topics, vocabulary_size] term-topic matrix
a = lda.inference(corpus)      # (gamma, sstats) variational parameters for the training documents
print(doc_distribution[:853])  # doc_distribution is built earlier in the original script (not shown)
# training corpus document by topic matrix
doc_topic_dist_corpus = np.array([[tup[1] for tup in lst]
                                  for lst in lda[corpus]])
save_obj(lda, 'LDA_MODEL_APPLICATION')
#%%
lda = load_obj('LDA_MODEL_APPLICATION')
fig, axes = plt.subplots(2, 3, figsize=(20, 10), sharex=True, sharey=True)

for i, ax in enumerate(axes.flatten()):
    fig.add_subplot(ax)
    plt.imshow(
        WordCloud(background_color="white").fit_words(
            dict(lda.show_topic(i, 200))))
    plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
    plt.gca().axis('off')

plt.subplots_adjust(wspace=0, hspace=0)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()


#%%
# finding dominant topics in the corpus for each document
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()
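
The definition of format_topics_sentences is cut off in the listing above; the following is a minimal sketch of the dominant-topic loop such a function typically performs, assuming lda, corpus and data are defined as in the snippet and pandas is imported as pd.

def dominant_topic_table(ldamodel, corpus, texts):
    # For each document keep the single most probable topic.
    rows = []
    for bow, text in zip(corpus, texts):
        doc_topics = sorted(ldamodel.get_document_topics(bow),
                            key=lambda t: t[1], reverse=True)
        topic_id, prob = doc_topics[0] if doc_topics else (None, 0.0)
        rows.append({'dominant_topic': topic_id, 'probability': prob, 'text': text})
    return pd.DataFrame(rows)

# e.g. dominant_topic_table(lda, corpus, data).head()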
Example #32
files = ["deprecated_libraries", "explicit_mention"]

for filename in files:
    # read in the data
    filepath = "data/filtered/tokenized/"+filename+"_lda.pkl" 
    df = read_pickle(filepath)

    num_topics = 20
    chunksize = 300
    dictionary = Dictionary(df['tokenized'])
    corpus = [dictionary.doc2bow(doc) for doc in df['tokenized']]
    # low alpha means each document is only represented by a small number of topics, and vice versa
    # low eta means each topic is only represented by a small number of words, and vice versa
    model = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary,
                    alpha=1e-2, eta=0.5e-2, chunksize=chunksize, passes=5)

    # save the model
    model_name = "models/"+filename+"_ldamodel"
    model.save(model_name)

    print("\n====\n")
    print(filename.upper())

    for topic_id in range(model.num_topics):
        topic = model.show_topic(topic_id, 10)
        topic_words = [w for w, _ in topic]

        print('{}: {}'.format(topic_id, ' '.join(topic_words)))
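
Assuming the model files written by the loop above exist, reloading one of them and scoring a new document might look like this (the tokens here are illustrative only):

from gensim.models import LdaModel

model = LdaModel.load("models/deprecated_libraries_ldamodel")
# the Dictionary used for training is stored alongside the model as id2word
bow = model.id2word.doc2bow(["deprecated", "library", "import"])
print(model.get_document_topics(bow, minimum_probability=0.05))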

Example #33
class LDA(object):
    def __init__(self,
                 source=None,
                 corpus=None,
                 client=None,
                 corpus_specs=None):
        self.corpus = corpus
        # if sum([t is not None for t in [source, corpus, client]]) != 1:
        #     raise NameError(
        #         'You need to provide one and only one of those (source, corpus, client)')
        if source:
            client = get_instance(**source)
        if client:
            corpus_specs = corpus_specs or {}
            self.corpus = client.get_corpus(**corpus_specs)

        self.native_model = None
        self.dictionary = None

    def train(self, num_topics, alpha="auto", passes=10, eta="auto", **kargs):
        """Train Model"""
        kargs = kargs or {}
        prev_mode = self.corpus.mode
        self.corpus.mode = "bow"
        self.native_model = LdaModel(self.corpus,
                                     id2word=self.corpus.get_dictionary(),
                                     num_topics=num_topics,
                                     alpha=alpha,
                                     passes=passes,
                                     eta=eta,
                                     **kargs)
        self.corpus.mode = prev_mode

    @classmethod
    def __display(cls, data):
        template_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "TopicSheet.html.jinja2")
        with open(template_file) as file_:
            template = Template(file_.read())
        html = template.render(data)
        display(HTML(html))

    def save(self, model_path):
        self.native_model.save(model_path)

    def load(self, model_path):
        self.native_model = LdaModel.load(model_path)
        self.dictionary = self.native_model.id2word
        if self.corpus:
            self.corpus.dictionary = self.native_model.id2word

    def get_topics_matrix(self, topn=10):
        data = []
        for i in range(self.native_model.num_topics):
            data.append((i, self.native_model.show_topic(i, topn=topn)))
        return data

    def get_dictionary(self):
        if self.dictionary:
            return self.dictionary
        if self.corpus:
            return self.corpus.get_dictionary()
        if self.native_model.id2word:
            return self.native_model.id2word
        return None

    def predict(self, text):
        text = self.corpus.doc2bow(text)
        result = sorted(self.native_model[text],
                        key=lambda d: d[1],
                        reverse=True)
        return result

    def get_topic_words(self, topic, topn=10):
        return self.native_model.show_topic(topic, topn=topn)

    def compute_uniqueness_score(self, matrix, sort=False):
        num_topic = self.native_model.num_topics
        temp_dic = {}
        for topic in matrix:
            for word in topic[1]:
                w = word[0]
                count = temp_dic.get(w, 0)
                temp_dic[w] = count + 1

        for topic_idx in range(len(matrix)):
            topic = matrix[topic_idx]

            words_topic = topic[1]
            for word_idx in range(len(words_topic)):
                word = matrix[topic_idx][1][word_idx]
                words_topic[word_idx] = (*word,
                                         1 - temp_dic[word[0]] / num_topic)
            if sort:
                words_topic = sorted(words_topic,
                                     key=lambda w: w[1] * w[2],
                                     reverse=True)
            topic = (topic[0], words_topic)
            matrix[topic_idx] = topic

        return matrix

    def display_topics(self, topn=20, sort_by_uniqueness=False):
        matrix = self.get_topics_matrix(topn=topn)
        matrix = self.compute_uniqueness_score(matrix, sort=sort_by_uniqueness)
        self.__display({"topics": matrix})
        return matrix

    def display_topic(self, topic_id, topn=20):
        matrix = self.get_topics_matrix(topn=topn)
        scored = self.compute_uniqueness_score(matrix, sort=True)
        # keep only the requested topic
        topic = [t for t in scored if t[0] == topic_id]
        self.__display({"topics": topic})
Example #34
# NOTE: the opening of this snippet is truncated in the source; the call below is
# reconstructed from the remaining keyword arguments, and the name of the training
# corpus variable is assumed.
lda_model = LdaModel(
    corpus,
    id2word=dictionary,
    num_topics=num_topics,
    iterations=5,
    passes=10,
    alpha='auto'
)

# EXPORT DATA TO FILES

word_dict = {}
today = date.today()
today_path = '../data/topic_today_EN.csv'
hist_path = '../data/topic_history_EN.csv'

word_dict['date'] = today
for i in range(num_topics):
    words = lda_model.show_topic(i, topn=10)
    # one column per topic, holding its top-10 words
    word_dict['Topic #{:02d}'.format(i)] = [w for w, _ in words]

topic_today = pd.DataFrame(word_dict)
topic_today.to_csv(today_path, index=False)

if os.path.isfile(hist_path):
    topic_hist = pd.read_csv(hist_path)
    topic_hist = pd.concat([topic_hist, topic_today])
    topic_hist.to_csv(hist_path, index=False)
else:
    topic_today.to_csv(hist_path, index=False)
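
To inspect the accumulated history file afterwards (assuming it already exists at hist_path):

hist = pd.read_csv(hist_path)
print(hist['date'].nunique(), "daily snapshots stored")
print(hist.groupby('date').size())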


Example #35
#   topic distribution for the given document bow, as a list of (topic_id, topic_probability) 2-tuples.
test = dct.doc2bow("I love Kitten".lower().strip().split())
print(lda.get_document_topics(test))
print(lda[test])

# Parameters: (word_id, minimum_probability=None)
# Topics associated with the given word.
# Each topic is represented as a tuple of (topic_id, term_probability).
print(lda.get_term_topics(0))

# ----- Print the composition of a given topic -----
# Parameters: (topicid, topn=10)
# Output: list, format: [(word_id, probability), ...].
print(lda.get_topic_terms(0))
# Parameters: (topicno, topn=10)
# Output: list, format: [(word, probability), ...].
print(lda.show_topic(0))
# Parameters: (topicno, topn=10)
# Output: string, format: '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + ...'.
print(lda.print_topic(0))

# ----- Print the composition of all topics -----
# Default parameters: (num_topics=10, num_words=10, log=False, formatted=True)
# Output: list of (topic_id, string) pairs, format: [(0, '-0.340 * "category" + 0.298 * "$M$" + 0.183 * "algebra" + ...'), ...]
print(lda.show_topics())
# [num_topics, vocabulary_size] array of floats (self.dtype)
# which represents the term topic matrix learned during inference.
print(lda.get_topics())

# ----- save and load model -----
lda.save(fname="lda_model")
lda = LdaModel.load("lda_model")  # load is a classmethod and returns the loaded model
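
get_document_topics can also return per-word assignments; a small extension of the queries above, reusing the same test document (not part of the original snippet):

doc_topics, word_topics, phi_values = lda.get_document_topics(test, per_word_topics=True)
print(doc_topics)   # [(topic_id, probability), ...]
print(word_topics)  # [(word_id, [topic_id, ...]), ...] most relevant topics per word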