Example #1
def refine_metadata(metadata_file_name, new_metadata_file_name, graph_file_name,
                    lda_preamble=True, null_inference=True):
    with open(graph_file_name, "rb") as graph_file:
        G = pkl.load(graph_file)

    with open(metadata_file_name, "r") as metadata_file, open(new_metadata_file_name, "w") as new_metadata_file:
        metadata = json.load(metadata_file)["files"]
        for i in range(0, len(metadata)):
            item = metadata[i]
            full_path = os.path.join(item["system"]["path"], item["system"]["file"])
            print "refining metadata for {}".format(full_path)

            if "lda" not in item["system"]["extractors"]:
                topics = topic_mixture(item, metadata, G)
                item["topics"] = topics

                max_topic = max(topics, key=lambda t: t[1])[0]
                topic_words = [str(w[0]) for w in lda_model.show_topic(max_topic)]
                item["tags"] = topic_words

            if "columnar" in item["system"]["extractors"] and null_inference:
                nulls = inferred_nulls(item)
                if not all([null == 0 for null in nulls]):
                    with open(full_path, "r") as file_handle:
                        new_columns = extract_columnar_metadata(file_handle, pass_fail=False, lda_preamble=lda_preamble,
                                                                null_inference=True)["columns"]
                    metadata[i]["columns"] = new_columns

        json.dump(metadata, new_metadata_file)
Example #2
def train_lda(n_topics, id2word_dictionary=None, documents=None, corpus=None):
    """
    Training method for LDA. documents is a list of lists of words/tokens
    documents is used to construct a dictionary and a corpus from which the
    topics for LDA are inferred
    """
    # Construct dictionary of words if it's not passed
    if not id2word_dictionary:
        id2word_dictionary = corpora.Dictionary(documents)

    word2idx_dictionary = dict([(w, idx) for (idx, w) in id2word_dictionary.items()])

    # Construct corpus for model
    if documents and not corpus:
        corpus = [id2word_dictionary.doc2bow(document) for document in documents]

    # Cluster the documents into topics using LDA. number of topics is given
    # by n_topics
    lda_model = LdaModel(corpus=corpus,
                         id2word=id2word_dictionary,
                         num_topics=n_topics,
                         update_every=1,
                         chunksize=10000,
                         passes=1)

    """
    Default value for topn (number of top words to show by probability) is 10.
    A high enough value should return the words covering most or all of the
    probability mass
    """
    topics = [lda_model.show_topic(idx, topn=50000)
              for idx in range(0, n_topics)]

    return lda_model, id2word_dictionary, word2idx_dictionary, topics
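
# A minimal usage sketch for train_lda (not part of the original example); the
# toy documents are illustrative, and the gensim imports train_lda relies on
# (corpora, LdaModel) are assumed to be in scope.
toy_documents = [["cat", "dog", "mouse"],
                 ["dog", "bone", "park"],
                 ["mouse", "cheese", "trap"]]
model, id2word, word2idx, topics = train_lda(n_topics=2, documents=toy_documents)
for topic_idx, topic in enumerate(topics):
    print(topic_idx, topic[:5])  # each topic is a list of (word, probability) pairs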
Example #3
File: lda.py Project: laraolmos/nlp-lmdl
class LMDL_LDA():
    def __init__(self):
        self.lmdl = LMDL_Corpus()
        self.texts = self.lmdl.get_corpus_texts_words()
        self.dictionary = Dictionary(self.texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in self.texts]
        self.lda = LdaModel(self.corpus,
                            num_topics=LDA_NUM_TOPICS,
                            id2word=self.dictionary)

    def print_topics(self):
        return self.lda.print_topics(LDA_NUM_TOPICS)

    def get_document_topics(self, document_name):
        document_tokens = self.lmdl.token_list_processed(document_name)
        topics = self.lda.get_document_topics(
            self.dictionary.doc2bow(document_tokens),
            minimum_probability=None,
            minimum_phi_value=None,
            per_word_topics=False)
        show_topics_list = []
        for topic in topics:
            lda_topic = self.lda.show_topic(topic[0], topn=10)
            show_topics_list.append(lda_topic)
        return show_topics_list

    def top_topics(self):
        return self.lda.top_topics(corpus=self.corpus,
                                   texts=self.texts,
                                   dictionary=self.dictionary,
                                   window_size=None,
                                   coherence='u_mass',
                                   topn=20,
                                   processes=-1)
Example #4
class topicExtract_lda:
    
    def __init__(self,docs,nTopic = 20):
        """
            extract topic using LDA
            input:
                list of list of words, each list is a token of a doc
        """
        for i,idoc in enumerate(docs):
            if isinstance(idoc, str):
                docs[i] = word_tokenize(idoc)
        self.wordDict = Dictionary(docs)
        self.corpus_docs = [self.wordDict.doc2bow(doc) for doc in docs]
        corpus_csc = corpus2csc(self.corpus_docs)
        #tfidf_model = models.TfidfModel(self.corpus_docs)
        #tfidf_corpus = tfidf_model[self.corpus_docs]
        self.nmf = NMF(n_components = nTopic, random_state = 42)
        self.W = self.nmf.fit_transform(corpus_csc)
        
        self.topics = {'Topic '+ str(i):' '.join(list(self.get_topic_words(i)[1].values())) for i in range(nTopic)}
        self.lda2 = LdaModel(corpus=self.corpus_docs, id2word=self.wordDict, num_topics=nTopic, update_every=1, chunksize=1000, passes=4, random_state = 24)
        #self.lda2.show_topics(num_topics=-1, num_words=4)
        
            
    
    def get_topic_words(self, component_number):
        """
            NMF topics with a gensim corpus represented by component vectors
        """
        sorted_idx = np.argsort(self.W[:,component_number])[::-1][:5]
        component_words = {self.W[:, component_number][number]:self.wordDict[number] for number in sorted_idx[:5]}
        return sorted_idx, component_words
    
    def get_doc_components(self, doc_number):
        sorted_idx = np.argsort(self.nmf.components_[:,doc_number])[::-1][0:3]
        result = {number: self.nmf.components_[:,doc_number][number] for number in sorted_idx}
        return result
    
    def get_document_details(self, doc_number):
        results = []
        for item, val in self.get_doc_components(doc_number).items():
            print("document is composed of topic %d with weight %.4f" % (item, val))
            result = self.get_topic_words(item)[1]
            results.append(result)
        return results
    
    def show_lda(self, doc_num, threshold = 0.05, nWord = 5):
        topic_list = []
        for topic, weight in self.lda2[self.corpus_docs[doc_num]]:
            if weight > threshold:
                topic_list.append({(topic, weight):self.lda2.show_topic(topic, topn = nWord)})
        return topic_list    

    def showTopics(self, nWord= 4):
        output = self.lda2.show_topics(num_topics=-1, num_words = nWord)
        for i in output:
            print(i)
Example #5
    def allot_topics(self, topic_num: int,
                     review_dict_list: list) -> OrderedDict:
        corpus, dictionary = self._make_property(review_dict_list)

        lda = LdaModel(corpus=corpus, num_topics=topic_num, id2word=dictionary)

        word_dict = OrderedDict()
        for topic_id in range(topic_num):
            word_dict[topic_id] = lda.show_topic(topic_id)

        return word_dict
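
# A standalone sketch of the same allot_topics pattern (not part of the original
# example): it builds the dictionary and corpus directly from tokenized reviews,
# since the _make_property helper used above is not shown.
from collections import OrderedDict
from gensim import corpora
from gensim.models.ldamodel import LdaModel

reviews = [["great", "battery", "life"],
           ["screen", "too", "dim"],
           ["battery", "drains", "fast"]]
dictionary = corpora.Dictionary(reviews)
corpus = [dictionary.doc2bow(review) for review in reviews]

lda = LdaModel(corpus=corpus, num_topics=2, id2word=dictionary)
word_dict = OrderedDict((topic_id, lda.show_topic(topic_id)) for topic_id in range(2))
print(word_dict)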
Example #6
def ldamodel(doc_clean,n_topics,n_words,description,tfidfmodel=False,unseen_docs=None):
    doc_clean = [min_char(doc).split() for doc in doc_clean]

    dictionary = corpora.Dictionary(doc_clean)
    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    corpus = [dictionary.doc2bow(doc) for doc in doc_clean]
    compute_coherence_values(dictionary=dictionary, corpus=corpus, texts=doc_clean, start=2, limit=40, step=6)
    if tfidfmodel:
       tfidf = TfidfModel(corpus,id2word=dictionary,smartirs='ntc')
       corpus = tfidf[corpus]

    ldamodel = LdaModel(corpus, num_topics=n_topics, id2word=dictionary, random_state=1, passes=50, per_word_topics=True)
    print("#LDA topics")
    for i in range(0, n_topics):
        temp = ldamodel.show_topic(i, n_words)
        terms = []
        for term in temp:
            terms.append(term)
        print("Topic #" + str(i) + ": ", ", ".join([t + '*' + str(i) for t, i in terms]))
    print('Bound: ',ldamodel.bound(corpus))
    # Compute Perplexity
    print('Perplexity: ',ldamodel.log_perplexity(corpus))
    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=ldamodel, texts=doc_clean, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)
    if unseen_docs:
        corpus_new = [dictionary.doc2bow(doc) for doc in unseen_docs]
        for i, unseen_doc in enumerate(corpus_new):
            topic = None
            score = 0
            inference_doc = ldamodel[unseen_doc]
            print(unseen_docs[i])
            for index,tmpScore in inference_doc[0]:
                if tmpScore > score:
                    score = tmpScore
                    topic = ldamodel.print_topic(index, 5)
            print ("Score: {}\t Topic: {}".format(score, topic))
        print("Log perplexity for new corpus is", ldamodel.log_perplexity(corpus_new))

    print_result(ldamodel, doc_clean, corpus, n_topics, description)
    pickle.dump(corpus, open(description+'.pkl', 'wb'))
    dictionary.save(description+'dictionary.gensim')
    ldamodel.save(description+'_ldamodel.gensim')
Example #7
def getPageID2TopicDist(docs, n_topics):
    """
	:docs=[(docID,[word1,word2,..]),...]
	:param n_topics: Number of topics to generate from the data
	:return: [docid]=[0.9,0.1] # sequential topic probability
	"""
    # prepare the corpus in Gensim format
    texts = [doc[1] for doc in docs]

    dictionary = corpora.Dictionary(texts)

    corpus = [dictionary.doc2bow(text) for text in texts]
    #print corpus;
    # apply LDA on the corpus
    lda = LdaModel(corpus, id2word=dictionary, num_topics=n_topics, passes=4)

    # print top terms from each topic
    for i in range(lda.num_topics):
        topic = lda.print_topic(i, topn=8)
        print([tup[0] for tup in lda.show_topic(topicid=i, topn=8)])

    # get the topic distribution in each document
    pageID2TopicDist = dict()
    doc_topic = []
    for index, doc in enumerate(docs):

        doc_bow = dictionary.doc2bow(doc[1])
        topic_distri = lda[doc_bow]
        dlist = [0.0] * n_topics
        for tup in topic_distri:
            dlist[tup[0]] = tup[1]

        #print topic_distri;
        #top_topic = sorted(topic_distri, key=lambda x: x[1], reverse=True)[0][0]
        #doc_topic.append(top_topic)

        pageid = doc[0]
        pageID2TopicDist[pageid] = dlist

    # topic of each document
    #print doc_topic;
    return pageID2TopicDist
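
# A hedged usage sketch (not part of the original example): two tiny illustrative
# documents in the (docID, [tokens]) format described in the docstring.
sample_docs = [("page1", ["cat", "dog", "mouse", "dog"]),
               ("page2", ["star", "galaxy", "orbit", "star"])]
print(getPageID2TopicDist(sample_docs, n_topics=2))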
Example #8
def topic_choosing(number):
    lda_temp = LdaModel(bow_corpus, num_topics=number, id2word=word_dictionary)

    temp_dic = {}
    for topic_number in range(len(lda_temp.print_topics())):
        temp_dic.setdefault(topic_number, {})
        for term, frequency in lda_temp.show_topic(topic_number, topn=25):
            temp_dic[topic_number].setdefault(term)
            temp_dic[topic_number][term] = frequency  # record the frequency of the top-25 words in each topic
    X = pd.DataFrame(temp_dic).fillna(0)

    pairs = [record for record in combinations(X.columns, 2)]
    distance = []
    for pair in pairs:
        x1 = np.asarray(X[pair[0]]).reshape(1, -1)
        x2 = np.asarray(X[pair[1]]).reshape(1, -1)
        distance.append(cosine_distances(x1, x2))  # cosine distance between the two topics' word-frequency vectors
    average_distance = np.average(distance)  # average distance across all topic pairs

    return average_distance
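
# A hedged usage sketch (not part of the original example): scan a range of
# candidate topic counts with topic_choosing and keep the count whose topics
# are, on average, farthest apart. Assumes bow_corpus and word_dictionary exist
# as in the example above.
candidate_numbers = range(2, 11)
scores = {n: topic_choosing(n) for n in candidate_numbers}
best_n = max(scores, key=scores.get)
print(scores)
print("chosen number of topics:", best_n)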
Example #9
def extract_topics(model: LdaModel) -> list:
    """
    From the words in a topic deduce a topic name that covers the content of the words in that topic.
    This is done by selecting candidates from the wikipedia API and compare these terms according to the
    following paper: http://www.aclweb.org/anthology/P11-1154
    Args:
        model: The trained Lda model

    Returns: List of topic labels ordered by topic cluster

    """
    topic_list = []
    for x in range(model.num_topics):
        words = model.show_topic(x, topn=10)
        titles = get_wikipedia_titles([w[0] for w in words])  # TODO alter?
        possible_labels = chunk(titles)
        print(possible_labels)
        labels_content = retrieve_content(possible_labels)
        best_topic = rate_labels(possible_labels, labels_content, words)
        print(best_topic, words)
        print('-' * 80)
        topic_list.append(best_topic)
    return topic_list
Example #10
def extract_topic(file_handle, pass_fail=False):
    """Create free-text metadata JSON from file indicating topic
    and some human-readable indication of its content.

        :param file_handle: (file) open file handle to read from
        :param pass_fail: (bool) whether to exit after ascertaining file class
        :returns: (dict) metadata dictionary"""

    tokenizer = RegexpTokenizer(r'[a-zA-Z]{3,}')
    tag_remover = re.compile('<.+>')

    doc = re.sub(tag_remover, '', file_handle.read())
    doc = tokenizer.tokenize(doc)

    # if the doc is an empty list, it clearly can't be topic modeled
    if not doc:
        raise ExtractionFailed
    elif pass_fail:
        raise ExtractionPassed

    doc_bow = dictionary.doc2bow(doc)

    topics = lda_model[doc_bow]
    # normalize topic probabilities to sum to 1, as the returned list usually falls just short of 1
    sum_topics = sum([topic[1] for topic in topics])
    topics = [[topic_num, prob / sum_topics] for [topic_num, prob] in topics]
    # if no words are common to the training corpus, topics will be an empty list
    if not topics:
        raise ExtractionFailed
    max_topic = max(topics, key=lambda t: t[1])[0]
    topic_words = [str(w[0]) for w in lda_model.show_topic(max_topic)]

    metadata = {"topics": topics, "tags": topic_words}

    return metadata
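
# extract_topic relies on module-level `dictionary` and `lda_model` objects that
# are not shown above. A minimal, hypothetical setup sketch (names and documents
# are illustrative only):
from gensim import corpora
from gensim.models.ldamodel import LdaModel

training_docs = [["protein", "cell", "dna"],
                 ["galaxy", "star", "orbit"]]
dictionary = corpora.Dictionary(training_docs)
lda_model = LdaModel([dictionary.doc2bow(d) for d in training_docs],
                     id2word=dictionary, num_topics=2)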
Example #11
#for topic in ldamodel.print_topics(num_topics=4, num_words=30):
#    print (topic[0]+1, " ", topic[1],"\n")

#ldamodel.save('topic_comments_lda.model')

#MAIN PLOT
viz = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
pyLDAvis.save_html(viz, '650000_20t.html')

#ldamodel.save('TM_lda_1000_4t.model')

#FIGURES
fig = plt.figure(figsize=(15, 30))
for i in range(20):
    df = pd.DataFrame(ldamodel.show_topic(i),
                      columns=['term', 'prob']).set_index('term')
    #     df=df.sort_values('prob')

    plt.subplot(10, 2, i + 1)
    plt.title('topic ' + str(i + 1))
    sns.barplot(x='prob',
                y=df.index,
                data=df,
                label='Cities',
                palette='GnBu_d')
    plt.xlabel('probability')

#plt.show()
fig.savefig('650000_20t.png')
Example #12
n_topics = 2
lda = LdaModel(corpus_tfidf,
               id2word=dictionary,
               iterations=50,
               num_topics=n_topics)
#lda.save('/Users/roysourish/Desktop/SENG 607/corpus_apps/batphone_0.01.model')

#plt.plot(lda,n_topics)
#plt.show()

#run_model('data.json', field='abstract', model='lda_online', r_ldavis=True, output_file=True)

#names.to_csv(r'/Users/roysourish/Desktop/SENG 607/np.txt', header=None, index=None, sep=' ', mode='a')
## word lists
for i in range(0, n_topics):
    temp = lda.show_topic(i, 5)
    terms = []
    for term in temp:
        terms.append(term)
    print("Top 5 terms for topic #" + str(i) + ": " +
          ", ".join(str(i[0]) for i in terms))

#lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

doc = "maven-release-plugin], release, prepare, zxing-2.2, rollback, changes, c++, german, inspection, remove, issue, pdf417, add, test, remove, update, c++, port, issue, fix,issue, fix, add, use, remove"

vec_bow = dictionary.doc2bow(doc.lower().split())
vec_lsi = lda[vec_bow]  # convert the query to LDA topic space
print(vec_lsi)

index = similarities.MatrixSimilarity(lda[corpus])
Example #13
def LDA_pd(data=data_path,
           list_keys=keywords,
           num_topics=num_topics,
           iterations=iterations,
           alpha=alpha,
           eta=eta,
           embeddings=embeddings,
           top=topn,
           output_path=output,
           use_keywords=use_keywords):

    output = open(output_path + '.output', 'w')
    output.write("Generating {} topics from {} initial keywords \n".format(
        num_topics, len(keywords)))
    output.write(
        "LDA model parameters:\n(1) alpha {}\n(2) eta {}\n(3) running {} iterations. \n"
        .format(alpha, eta, iterations))

    if use_keywords:  # if false, LDA is performed on all data (NOT Partial Data LDA)
        data_words = list(word_lists(data, list_keys))
        output.write("Standard set of keywords includes:\n" +
                     ', '.join(i for i in list_keys))
        if embeddings:
            display_log("Loading word embeddings")
            model = load_model(model_path)
            most_similar = grab_most_similar(list_keys, model=model, top=topn)
            list_keys = add_similar(list_keys, most_similar)
            output.write("Supplemented keyword list includes:\n" +
                         ', '.join(i for i in list_keys))
            output.write('\n')
            output.write(
                "Top {} most similar words added from word emeddings (if found) \n"
                .format(topn))
    else:
        data_words = list(word_lists_no_keywords(data_path))

    display_log("Created data word list of size {}".format(str(
        len(data_words))))

    # generate bigrams
    if bigrams:
        data_words = make_bigrams(data_words)
        display_log("Created bigrams word list")
        output.write("Topic integrates bigrams.\n\n")

    # create dictionary
    id2word = corpora.Dictionary(data_words)
    display_log("Created dictionary")

    # TDF
    corpus = [id2word.doc2bow(text) for text in data_words]
    display_log("Created corpus")

    #LDA model
    lda_model = LdaModel(corpus=corpus,
                         id2word=id2word,
                         num_topics=num_topics,
                         random_state=100,
                         update_every=1,
                         chunksize=60,
                         passes=25,
                         alpha=alpha,
                         eta=eta,
                         iterations=iterations)
    display_log("Created LDA model")
    #pprint(lda_model.print_topics())

    topic_header = ["Topic " + str(i + 1) for i in range(num_topics)]
    topic_array = np.array(
        [lda_model.show_topic(i) for i in range(num_topics)]).T
    output.write("Topics\n-----------------------\n")
    output.write(
        tabulate(topic_array[0], headers=topic_header, tablefmt='github'))
    output.write("\n\n")
    output.write("Similarity Scores\n-----------------------\n")
    output.write(
        tabulate(topic_array[1], headers=topic_header, tablefmt='github'))
    output.write("\n\n")

    display_log("printed table into output file " + output_path)

    df_all = pd.DataFrame()
    topics_transposed = topic_array.T
    for i in range(num_topics):
        new = pd.DataFrame(topics_transposed[i],
                           columns=['Topic ' + str(i), 'score'])
        df_all = pd.concat([df_all, new], axis=1)
    df_all.to_csv(output_csv, index=False, encoding='utf-16')
    display_log("Exported topics and scores into csv file " + output_csv +
                '.csv')

    #coherence for LDA-PDs
    if coherence:
        coherence_model_lda = CoherenceModel(model=lda_model,
                                             texts=data_words,
                                             dictionary=id2word,
                                             coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()

        output.write(
            "Coherence and Preplexity Scores\n-----------------------\n")
        output.write(
            "LDA-PD Model with {} keywords: \n Perplexity: {} \n Coherence: {}"
            .format(len(keywords), lda_model.log_perplexity(corpus),
                    coherence_lda))
        display_log("Coherence and Perplexity calculated, see " + output_path +
                    '.output')

    display_log("Log saved in " + output_path + '.log')
    display_log("Output saved in " + output_path + '.output')
    display_log("Topics saved in " + output_path + '.csv')

    return lda_model
Example #14
# print keywords in n topics
sorted(model_lda.print_topics(), key=lambda x: x[1])


# In[87]:


# print keywords in n topics
sorted(model_lda.print_topics(), key=lambda x: x[0])


# In[88]:


# show_topic() returns n most important/relevant words, and their weights, that comprise given topic
pprint(model_lda.show_topic(1, topn=10))


# In[89]:


pprint(model_lda.show_topics(num_topics=5, num_words=10))


# ### Evaluate - model #1

# In[91]:


# calculate perplexity metrics
perplexity = model_lda.log_perplexity(corpus_train)
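

# A hedged complement to perplexity (not part of the original notebook): topic
# coherence via gensim's CoherenceModel. `texts_train` (the tokenized training
# documents) and `id2word` (the dictionary behind corpus_train) are assumed,
# illustrative names.
from gensim.models import CoherenceModel

coherence_model = CoherenceModel(model=model_lda, texts=texts_train,
                                 dictionary=id2word, coherence='c_v')
print(coherence_model.get_coherence())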
Example #15
class LdaPersistent(object):
	"""
	- save LDA model
	- rank words within topics
	- write ranked words to db
	'object' must be of type CreateCorpus
	"""
	def __init__(self, corpus, num_topics):
		if type(corpus) is not CreateCorpus:
			raise TypeError("Initialization object must be of class CreateCorpus")
		self.corpus = corpus
		self.num_topics = num_topics
		print "Generating LDA Model..."
		self.model = LdaModel(
			self.corpus,
			num_topics=self.num_topics,
			id2word=self.corpus.dictionary)
	
	def saveLda(self, model_file_path, dictionary_file_path):
		"""
		save model object for loading later
		"""
		print "Pickling model object..."
		if model_file_path:
			self.model.save(model_file_path)
		else:
			self.model.save('./lda-model')
		if dictionary_file_path:
			self.corpus.dictionary.save(dictionary_file_path)
		else:
			self.corpus.dictionary.save('./lda-dictionary')

	def _dbConnect(self, connect_file, database):
		"""
		connect to specified MongoDb database
		requires json document of form:
			{"connect-string": "mongodb://<connect-string-here>"}
		"""
		with open(connect_file, 'rb') as cnf:
			connection_string = json.load(cnf)['connect-string']

		try:
			connection = pymongo.MongoClient(connection_string)
		except ValueError:
			print "Connection to remote MongoDB client failed"
		
		try:
			self.db = getattr(connection, database)
		except AttributeError:
			print "Specified database not found in MongoDB Client"
		
		# try:
		# 	self.db_collection = getattr(db, collection)
		# except AttributeError:
		# 	print "Specified collection not found in database"

	def _topicCollectionsGen(self):
		"""
		generator function
		warning - this iteratively updates a database connection attribute
		"""
		topn = len(self.corpus.dictionary)
		for topic in range(self.num_topics):
			collection_name = 'topic_' + str(topic)
			self._db_collection = getattr(self.db, collection_name)
			for word, score in self.model.show_topic(topicid=topic, topn=topn):  # gensim returns (word, probability) pairs
				yield word, score

	def dbInsert(self, connect_file, database):
		"""
		Insert the word and word-score for each topic into MongoDB collection
		Each topic has its own collection that scores each word in the Dictionary
		"""
		print "Loading words and word-ranks into MongoDB..."
		self._dbConnect(connect_file, database)
		for word, score in self._topicCollectionsGen():
			doc = {"word": word, "score": score}
			self._db_collection.insert(doc)
			self._db_collection.create_index("word")
		
		#self.db_collection

	def _wordCollectionsGen(self):
		self._db_collection = getattr(self.db, "word_topic_mappings")
		for word in self.corpus.dictionary.values():
			doc = {}
			doc["word"] = word
			for topic in range(self.num_topics):
				collection_name = 'topic_' + str(topic)
				doc[collection_name] = {}
				current_collection = getattr(self.db, collection_name)
				subdoc = {}
				for insert in current_collection.find({"word" : word}):
					subdoc["score"] = insert["score"]
				doc[collection_name] = subdoc
			yield doc

	def dbETL(self):
		print "Loading data into new collection indexed by word..."
		for doc in self._wordCollectionsGen():
			self._db_collection.insert(doc)
		self._db_collection.create_index("word")
Example #16
class LDA_parser():
    """
    This class implements a wrapper pipeline for text preprocessing and LDA parsing of an input corpus 
    in the form ['str','str','str', ... ]. 
    """
    def __init__(self,
                 corpus='',
                 language='english',
                 preprocessor_type="spacy",
                 tags=["DET", "PUNCT", "NUM", "SYM", "SPACE"],
                 custom_filter=[],
                 lemmatize=False,
                 stem=False,
                 min_len=2,
                 num_topics=10,
                 passes=100):
        """ 
        Parses the input text into a suitable format, then performs all LDA extraction tasks. 
        It expects the input corpus to be a list of texts. If the input is a long string, it will attempt
        to create documents by splitting it into sentences.
        @ params: 
            @ corpus: Input corpus in str or ['str','str','str', ... ] format, where each entry
                      is a document of type str. Alternatively, a str format input (not recommended).
            @ preprocessor_type: Use the nltk-based or spaCy-based preprocessor 
            @ language: language to use in the preprocessor 
            @ tags: if spaCy is selected, will filter words with input POS tags 
            @ custom_filter: filter words in this input list in the preprocessing step 
            @ lemmatize: use lemmatization in the preprocessing 
            @ stem: use stemming in the preprocessing  
            @ num_topics: maximum number of topics in the LDA algorithm 
            @ passes: number of training epochs in the LDA 
        """

        print("Initializing model...\n")
        if preprocessor_type == "nltk":
            print("NLTK preprocessor selected.")
            self.preprocessor = nltk_preprocessor(language=language)
        if preprocessor_type == "spacy":
            print("spaCy preprocessor selected.")
            self.preprocessor = spacy_preprocessor(language=language)

        self.language = language  # input language
        self.raw_corpus = ""  # simply stores the input if in str type
        self.clean_corpus = [
        ]  # [doc, doc, ..., doc] = [[sent, sent, ...], ... ,[sent, sent, ...]]
        self.dictionary = None  # holds a corpora.Dictionary representation of corpus
        self.doc2bow_corpus = None  # contains doc2bow vector representations of each document in the corpus
        self.lda_model = None  # LDA model trained on the input corpus
        self.topic_mixtures = [
        ]  # contains str representations of mixtures of words with their probabilities
        self.topics = {
        }  # Contains a dictionary of topics with words and respective mix probabilities once "extract topics" is called.
        self.topic_words = {
        }  # As above, but only contains the respective words of the topic

        # check for raw str corpus format
        if isinstance(corpus, str):
            print(
                "***WARNING***\nRaw input (str) received. Text will be sentence-tokenized and parsed accordingly."
            )
            print("Make sure this is intended. \n")
            self.raw_corpus = str(corpus)  # transform input to string
            self.fit(corpus,
                     raw=True,
                     language=language,
                     num_topics=num_topics,
                     passes=passes,
                     min_len=min_len)  # fit corpus as raw

        elif corpus == '':
            print("***WARNING***\nNull Corpus")
        # assume input corpus is in the right format
        else:
            self.fit(corpus,
                     language=language,
                     num_topics=num_topics,
                     passes=passes,
                     min_len=min_len)

    def fit(self,
            corpus,
            raw=False,
            language='english',
            stem=False,
            lemmatize=False,
            num_topics=10,
            passes=100,
            min_len=2,
            echo_corpus=False):
        """ 
        Assumes input corpus is in the right format. 
        @args: 
            @ corpus = input corpus  
            @ language = input language  
            @ stem/lemmatize = if true, stem or lemmatize input corpus
            @ num_topics = number of topics to choose in the algorithm 
            @ passes = number of epochs of the LDA 
            @ min_len = minimum length of words to consider when preprocessing words
        """

        if echo_corpus:
            print("CORPUS: {}".format(corpus))

        t0 = time.time()

        print("Fitting LDA topic modelling...")
        self.raw_corpus = corpus  # input corpus as is
        self.language = language  # in case initial language changed

        if raw:
            print("Preprocessing corpus...(raw)")
            self.clean_corpus = self.preprocessor.preprocess_str_corpus(
                corpus, stem=stem, lemmatize=lemmatize, min_len=min_len)
        else:
            print("Preprocessing corpus...")
            self.clean_corpus = self.preprocessor.preprocess_texts(
                self.raw_corpus, min_len=2)  # preprocess text list

        print("Creating corpora dictionary...")
        self.dictionary = corpora.Dictionary(
            self.clean_corpus)  # create corpora.Dictionary mapping
        print("Translating doc2bow corpus...")
        self.doc2bow_corpus = [
            self.dictionary.doc2bow(text) for text in self.clean_corpus
        ]  # doc2bow corpus representation
        print("Running LDA...")
        self.lda_model = LdaModel(self.doc2bow_corpus,
                                  num_topics=num_topics,
                                  id2word=self.dictionary,
                                  passes=passes)
        self.topic_mixtures = self.lda_model.show_topics(
            num_topics=-1,
            num_words=10)  # string representation of topics mixtures

        t1 = time.time()
        print("\nDone in {:.3f} seconds.".format(t1 - t0))

    def print_topics(self, words_per_topic=5):
        """
        Displays the topics in string format
        """
        topics = self.lda_model.print_topics(num_words=words_per_topic)
        for topic in topics:
            print(topic)

    def extract_topics(self, max_words_per_topic=50, threshold=0.005):
        """
        Returns all topics as a dictionary of tuples, where the key is the topic 
        number, and the value is a list of tuples of words_per_topic many words with 
        probability at least as high as threshold, where the second value is the density 
        for the topic. 
        @params: 
            @ max_words_per_topic: Maximum topic mixture component words to consider. 
            @ threshold: select words whose density is at least this value
        """
        topics = {}  # to store the topics
        indexes = [tup[0] for tup in self.topic_mixtures]  # topic indexes

        # assign the topics mixtures
        for i in indexes:
            topics[i] = [
                tup
                for tup in self.lda_model.show_topic(i,
                                                     topn=max_words_per_topic)
                if tup[1] >= threshold
            ]  # extract most probable words for topic i

        self.topics = topics  # update attribute

        return topics

    def extract_topic_words(self, max_words_per_topic=50, threshold=0.005):
        """
        Returns all topics as a dictionary of tuples, where the key is the topic 
        number, and the value is a list of words_per_topic many words with 
        probability at least as high as threshold. 
        """
        topics = {}  # to store the topics
        indexes = [tup[0] for tup in self.topic_mixtures]  # topic indexes

        # assign the topics mixtures
        for i in indexes:
            topics[i] = [
                tup[0]
                for tup in self.lda_model.show_topic(i,
                                                     topn=max_words_per_topic)
                if tup[1] >= threshold
            ]  # extract most probable words for topic i

        self.topic_words = topics  # update attribute

        return topics

    def parse_new(self,
                  new_text,
                  top_n_topics=100,
                  top_n_w=30,
                  max_words_per_topic=50,
                  threshold=0.005,
                  verbose=True):
        """
        Parses a new text by obtaining the most likely topics for the new input, 
        as well as the respective words. This function should be used only after 
        the LDA parser has been fitted. 
        @params: 
            @ new_text: new input text 
            @ top_n_topics: top n topics with largest densities p(topic)
            @ top_n_w: top n words with largest densities p(word) = p(word|topic)*p(topic)
            @ verbose: display information
            @ max_words_per_topic: maximum words per topic
            @ threshold: only consider words with density greater than threshold
        @returns: 
            @ max_topic: most likely topic for the document 
            @ doc_max_topic_words: words associated with the most likely topic 
            @ doc_topics: all topics related to the document 
            @ doc_topic_words: all words from all topics associated with the document 
        """

        self.extract_topic_words(
            max_words_per_topic,
            threshold)  # extract topics to ensure they are there

        new_text_clean = self.preprocessor.preprocess_sentence(
            new_text)  # preprocess input text
        new_doc_bow = self.dictionary.doc2bow(
            new_text_clean)  # convert to doc2bow

        doc_topics = self.lda_model.get_document_topics(
            new_doc_bow)  # obtain topics for input document
        topic_idx = [tup[0] for tup in doc_topics]  # topic indices

        doc_topic_words = [
            word for idx in topic_idx for word in self.topic_words[idx]
        ]  # extract all words from every topic
        top_n_topics = nlargest(top_n_topics,
                                list(doc_topics),
                                key=lambda x: x[1])  # extract top n topics

        top_n_words = list(
            set([
                word for idx in [tup[0] for tup in top_n_topics]
                for word in self.topic_words[idx]
            ]))  # extract the words for the top topics

        # Currently, we have access to the top n topics and their actual probabilities.
        # We want to collect all the words for those topics, and multiply them with their probabilities

        words_with_probs = [
        ]  # will store words with their actual probabilities:

        for topic_tup in doc_topics:
            topic_idx = topic_tup[0]  # obtain topic index
            topic_prob = topic_tup[1]  # obtain topic probability p(topic)
            for word_tup in self.lda_model.show_topic(topic_idx, topn=10):
                word_probability = word_tup[
                    1] * topic_prob  # p(w) = p(w|topic)p(topic)
                words_with_probs.append(
                    (word_tup[0], word_probability))  # (word, p(w))

        # obtain the n most likely words according to their individual probabilities
        n_most_likely_words = [
            tup[0] for tup in nlargest(
                top_n_w, list(words_with_probs), key=lambda x: x[1])
        ]

        if verbose:
            print("\nLOGS: \n")
            print("*** Most likely topic: ***\n", top_n_topics)
            print("*** Words for most likely topic: ***\n", top_n_words)
            print("*** All topics: ***\n", doc_topics)
            print("*** All topics words: ***\n", doc_topic_words)

        return n_most_likely_words, top_n_topics, top_n_words, doc_topics, doc_topic_words

    def pickle_save(self, savename="full_LDA_parser.pkl"):
        """ 
        Saves the full model object in pkl format
        """
        pickle.dump(self, open(savename, 'wb'))

    def save_model(self, name="LDA_model"):
        """ 
        Saves the LDA model, doc2bow_corpus and dictionary.
        These parameters can be used to instantiate a gensim 
        model, so there is no load in this class. 
        """
        dictionary_name = name + "_dictionary.gensim"
        corpus_name = name + "_doc2bow_corpus.pkl"
        model_name = name + ".gensim"

        pickle.dump(self.doc2bow_corpus, open(corpus_name,
                                              'wb'))  # save the doc2bow_corpus
        self.dictionary.save(dictionary_name)  # save corpus dictionary mapping
        self.lda_model.save(model_name)  # save the full model
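
# A hedged usage sketch for LDA_parser (not part of the original example). The
# two documents are illustrative, and the nltk_preprocessor/spacy_preprocessor
# helpers the class relies on are assumed to be available.
sample_corpus = ["the cat sat on the mat and purred",
                 "dogs chase cats around the park"]
parser = LDA_parser(sample_corpus, language='english', preprocessor_type='spacy',
                    num_topics=2, passes=10)
parser.print_topics(words_per_topic=5)
print(parser.extract_topic_words(max_words_per_topic=10, threshold=0.001))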
Example #17
plt.ylabel('Number of topics')

fim.savefig('50t_Topic_Popularity_on_Articles.png')
#plt.show()

#FIGURE SHOWING THE MOST POPULAR TOPICS:
import gensim
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")

fip = plt.figure(figsize=(15, 30))
for i in range(10):
    df = pd.DataFrame(ldamodel.show_topic((q.iloc[i]['topics_id'].astype(int)),
                                          topn=12),
                      columns=['term', 'prob']).set_index('term')
    df = df.loc[df['prob'] > 0.005]
    df
    plt.subplot(5, 2, i + 1)
    plt.title('topic ' + str((q.iloc[i]['topics_id'].astype(int))))
    sns.barplot(x='prob',
                y=df.index,
                data=df,
                label='Cities',
                palette='Reds_d')
    plt.xlabel('probability')
fip.savefig('The_Most_Popular_Topics_on_Articles.png')
#plt.show()

p = result.groupby(['author', 'topics_id']).size().reset_index(
Example #18
class Lda(Analysis):
    def __init__(self):
        self.times_fitted = 0
        self.corpus = None
        self.vocabulary = None
        self.lda = None
        self.nb_docs_trained = 0
        self.selected_clusters = set()

    def fit(self,
            documents,
            add_prediction='',
            field='text',
            nb_topics=20,
            normalizing='stem',
            language=DEFAULTLANGUAGE,
            **kwargs):
        """
        This method trains the Lda model by fitting its parameters to the extracted textual data from the given documents\
        (dictionaries) and selected field key. It infers n number of topics/clusters equal to the given parameter.\
        Input documents can be optionally mutated by adding to them the trained model "prediction" value.\n

        `alpha` and `eta` are hyperparameters that affect sparsity of the document-topic (theta) and topic-word (lambda)\
         distributions respectively. 'alpha' parameter is learned as an asymmetric prior directly from your data and 'eta'\
         defaults to a symmetric 1.0/nb_topics prior.\n

        `decay` and `offset` parameters are the same as Kappa and Tau_0 in Hoffman et al, respectively.\n\n

        :param documents: the documents (dictionaries) to train on
        :type documents: iterable
        :param add_prediction: this switch signals the mutation of the train set documents by adding a key, value pair,\
            per document. The value holds the document's topic distribution predicted by the trained model
        :param field: the requested dictionary/document key pointing to the data. If 'all' is given then returns the\
            concatenation of all the dictionary values with '\\\\n'
        :type field: str
        :param nb_topics: the number of clusters/topics to assume when performing topic modeling. Controls granularity
        :type nb_topics: int
        :param normalizing: if 'lemmatize' then performs WordNet lemmatization with the default POS noun ('n') NOTE: only supported for English
                        if 'stem' perform stemming with the porter stemmer
                        else uses the input words as they are.
        :param language: language of the documents to be classified, important for preprocessing
        :type language: str

        :References:
        * https://radimrehurek.com/gensim/models/ldamodel.html : gensim.models.ldamodel
        * https://www.di.ens.fr/~fbach/mdhnips2010.pdf : Hoffman et al
        """
        self.vocabulary, self.corpus = create_corpus(documents,
                                                     field=field,
                                                     normalizing=normalizing,
                                                     language=language)
        print('Training Lda model ...')
        self.lda = LdaModel(
            corpus=self.corpus, num_topics=nb_topics, alpha='auto'
        )  # alpha can be also set to 'symmetric' or to an explicit array
        self.nb_docs_trained = len(self.corpus)
        #lda = gensim.models.ldamodel.LdaModel(corpus=mm, id2word=id2word, num_topics=100, update_every=0, passes=20)

    def predict(self, documents, add_prediction='', field='text'):
        docs_lda = []
        for doc in documents:
            docs_lda.append(self.lda[get_bow(extract_data(doc, field=field),
                                             self.corpus)])
            if add_prediction != '':
                doc[add_prediction] = str(docs_lda[-1])

    def update(self, documents, field='text'):
        pass
        # corp = CorpusCreator.create_corpus(documents, field=field, normalizing=self.corpus.normalizer)
        # print('Updating model ...')
        # self.lda.update((get_bow(text_data, corp) for text_data in get_data_generator(documents, field=field)))

    def interpretation(self, prec=3):
        ordered_selected_clusters = [
            _id for _id in range(self.lda.num_topics)
            if _id in self.selected_clusters
        ]
        body, max_len = self.get_rows(
            [self.lda.show_topic(i) for i in ordered_selected_clusters],
            prob_precision=prec)
        header = ' - '.join('{}'.format(idd) + ' ' *
                            (3 + prec + max_len - len(str(idd)))
                            for idd in ordered_selected_clusters) + '\n'
        return header + body

    def get_rows(self, top, prob_precision=3):
        max_token_len = max(
            len(self.vocabulary[int(top[j][i][0])]) for j in range(len(top))
            for i in range(len(top[0])))
        b = ''
        for i in range(len(top[0])):
            b += ' | '.join('{} '.format(
                str(self.vocabulary[int(top[j][i][0])]) + ' ' *
                (max_token_len - len(self.vocabulary[int(top[j][i][0])]))) +
                            "{1:.{0}f}".format(prob_precision, top[j][i][1])
                            for j in range(len(top))) + '\n'
        return b, max_token_len

    def select_topics(self, topic_ids):
        """Use this method to indicate which topics/clusters you are interested in for "selecting" (i.e. interpreting, visualizing) by providing your desired numerical ids. Note that it only adds to the set of currently "selected" topics the new ids provided. To clear the set before selecting use "clear_selected_topics" first.\n
        :param topic_ids: the numerical ids of the topics/clusters to add to the "selected" set
        :type topic_ids: list
        """
        self.selected_clusters.update(topic_ids)

    def deselect_topics(self, topic_ids):
        """Use this method to indicate which topics/clusters you are NOT interested in "selecting" (i.e. for interpreting, visualizing..) by providing your desired numerical ids to exclude from the selected set. It removes the found ids from the set.\n
        :param topic_ids: the numerical ids of the topics/clusters to remove from the "selected" set
        :type topic_ids: list
        """
        self.selected_clusters.difference_update(topic_ids)

    def select_all_topics(self):
        """Use this method in case you are want to "select" (i.e. for interpreting, visualizing..) all topics/clusters infered by the model. It adds all numerical topic/cluster ids to the "selected" set."""
        self.selected_clusters.update(range(self.lda.num_topics))

    def clear_selected_topics(self):
        """Use this method to clear the selection of topics/clusters. It removes all nnumerical ids from the "selected" set"""
        self.selected_clusters.clear()
Example #19
corpus = [words.doc2bow(doc) for doc in doc_list]

# Create LDA model
lda = LdaModel(corpus=corpus,
               id2word=words,
               num_topics=3,
               random_state=2,
               update_every=1,
               passes=20,
               alpha='auto',
               per_word_topics=True)

# Pickle that too

# with open('pickles/movie_lda_1.pickle', 'wb') as f:
    # pickle.dump(lda, f)

with open('pickles/movie_lda_1.pickle', 'rb') as f:
    lda = pickle.load(f)

pprint(lda.print_topics(num_words=10))

# Generate wordclouds
for t in range(lda.num_topics):
    wc = WordCloud(width=800, height=400)
    wc.fit_words(dict(lda.show_topic(t, 200)))
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
#for topic in ldamodel.print_topics(num_topics=4, num_words=30):
#    print (topic[0]+1, " ", topic[1],"\n")

#ldamodel.save('topic_comments_lda.model')

#MAIN PLOT
viz = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
pyLDAvis.save_html(viz, '50t.html')

#ldamodel.save('TM_lda_1000_4t.model')


#FIGURES
fig = plt.figure(figsize=(30,60))
for i in range(50):
    df=pd.DataFrame(ldamodel.show_topic(i, topn = 15), columns=['term','prob']).set_index('term')
#     df=df.sort_values('prob')
    df = df.loc[df['prob'] >0.005]
    df
    
    plt.subplot(10,5,i+1)
    plt.title('topic '+str(i+1))
    sns.barplot(x='prob', y=df.index, data=df, label='Cities', palette='GnBu_d')
    plt.xlabel('probability')
    
    
#plt.show()
fig.savefig('50t.png')


# for i in tweets[list(tweets.keys())[2]].split("|||"):
#     words =[word for word in nltk.word_tokenize(i) if word not in STOPWORDS and word.isalnum() and len(word)>=2]
#     words_list.append(words)

num_topics = 3
dictionary = corpora.Dictionary(words_list)
corpus = [dictionary.doc2bow(words) for words in words_list]
lda = LdaModel(corpus, id2word=dictionary, num_topics=num_topics)

###output1: topics and corresponding words
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(lda.print_topics(num_words=10))

###output2: 2 ways of showing one topic and corresponding words
lda.print_topic(topicno=0)
lda.show_topic(1)

### ouput3: show topic of one user (even new user)
sorted(lda.get_document_topics(corpus[100],
                               minimum_probability=0,
                               per_word_topics=False),
       key=lambda x: x[1],
       reverse=True)

### output4: visualize LDA
lda_display = pyLDAvis.gensim.prepare(lda,
                                      corpus,
                                      dictionary,
                                      R=15,
                                      sort_topics=False)
pyLDAvis.display(lda_display)
print (word_list)
print ('********************************************************************************************************')
print (tweet_clean_fin)
print (len(tweet_clean_fin))
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
dictionary = Dictionary(tweet_clean_fin)
print("\n --- dictionary \n",dictionary)
bow_vectors = [dictionary.doc2bow(text) for text in tweet_clean_fin]

goodLdaModel=LdaModel(corpus=bow_vectors,id2word=dictionary,iterations=50,num_topics=6)
print('\n --- goodLdaModel: all topics in result ordered by significance \n')
all_good_topics=goodLdaModel.print_topics(-1)
print(all_good_topics)
print("\n---goodLdaModel.print_topics(num_topics=6,num_words=16) \n")
print(goodLdaModel.print_topics(num_topics=6,num_words=16))
%%time
import warnings
import pandas as  pd
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
fiz=plt.figure(figsize=(30,60))
for i in range(6):
    df=pd.DataFrame(goodLdaModel.show_topic(i,16),columns=['term','prob']).set_index('term')
    plt.subplot(6,3,i+1)
    plt.title('topic'+str(i+1))
    sns.barplot(x='prob',y=df.index,data=df,label='Cities',palette='Reds_d')
    plt.xlabel('probability')
plt.show()
Example #23
class LDAWDF:
    mysql: mysql.MySQL
    ldamodel: LdaModel
    dictionary = None
    corpus = None

    def __init__(self, mysql):
        self.mysql = mysql
        self.dataFolder = './data/'
        self.saveFile = 'lda_model'
        self.saveFileDict = 'lda_model_dict'

    def trainFromStart(self):
        with self.mysql as db:
            content = db.getContentsText()
        documents = []
        for item in content:
            documents.append(item['content'].split())

        self.dictionary = corpora.Dictionary(documents)

        self.dictionary.filter_extremes(no_below=5, no_above=0.5)

        doc_term_matrix = [self.dictionary.doc2bow(doc) for doc in documents]

        self.corpus = doc_term_matrix

        # Running and Training LDA model on the document term matrix.
        print("Starting to train LDA Model...")
        self.ldamodel = LdaModel(
            doc_term_matrix,
            num_topics=200,
            id2word=self.dictionary,
            passes=100)

    def printTest(self):
        print(self.ldamodel.print_topics(num_topics=10, num_words=5))

    def save(self):
        self.ldamodel.save(self.dataFolder + self.saveFile)
        self.dictionary.save(self.dataFolder + self.saveFileDict)

    def canLoad(self):
        my_file = Path(self.dataFolder + self.saveFile)
        my_file_dict = Path(self.dataFolder + self.saveFileDict)
        return my_file.is_file() and my_file_dict.is_file()

    def update(self, corpus):
        self.ldamodel.update(corpus)

    def load(self, subfolder=None):
        if subfolder:
            sf = subfolder + '/'
        else:
            sf = ''
        self.ldamodel = LdaModel.load(self.dataFolder + sf + self.saveFile)
        self.dictionary = gensim.corpora.Dictionary.load(self.dataFolder + sf + self.saveFileDict)

    def fillDb(self):
        topics = {}
        result = []
        result2 = []
        nbTopics = self.ldamodel.get_topics().shape[0]
        # "Old"
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 3)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            words = []
            for topicTerm in topicTerms:
                words.append(self.dictionary.get(topicTerm[0]))
            topics[topicId] = ' '.join(words)
        with self.mysql as db:
            contentsText = db.getContentsText()
            for element in contentsText:
                bow = self.dictionary.doc2bow(element['content'].split())
                docTopics = self.ldamodel.get_document_topics(bow, minimum_probability=0.05)
                if len(docTopics) > 0:
                    docTopics.sort(key=lambda x: x[1], reverse=True)
                    result.append((element['url'], topics[docTopics[0][0]]))
                    for docTopic in docTopics:
                        result2.append((element['url'], docTopic[0], str(docTopic[1])))
            db.emptyUrlsTopic()
            db.emptyCurrentUrlsTopic()
            db.emptyCurrentUserTags()
            db.setCurrentUrlsTopic(result2)
            db.setPrecalcTopics()
        # "New"
        terms = []
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 5)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            for topicTerm in topicTerms:
                terms.append((topicId, self.dictionary.get(topicTerm[0]), str(topicTerm[1])))
        with self.mysql as db:
            db.emptyLdaTopics()
            db.setLdaTopics(terms)


    def get_terms_topics(self, keywords):
        bow = self.dictionary.doc2bow(keywords[:30])
        topics = {}
        keywordsResult = {}
        for word in bow:
            wordTopics = self.ldamodel.get_term_topics(word[0], 0.05)
            keywordsResult[word[0]] = {'word': self.dictionary.get(word[0]), 'topics': wordTopics}
            for wordTopic in wordTopics:
                wordTopicId = wordTopic[0]
                if wordTopicId not in topics:
                    topics[wordTopicId] = self.ldamodel.show_topic(wordTopicId)
        return {'topics': topics, 'keywords': keywordsResult}
Example #24
        # Create word cloud in grey scale at 300 dpi for publication


        def grey_color_func(word,
                            font_size,
                            position,
                            orientation,
                            random_state=None,
                            **kwargs):
            return "hsl(0, 0%%, %d%%)" % np.random.randint(60, 95)

        for t in range(lda_model.num_topics):
            plt.figure()
            wc = WordCloud(background_color='black',
                           color_func=grey_color_func).fit_words(
                               dict(lda_model.show_topic(t, 200)))
            plt.imshow(wc)
            plt.axis("off")
            plt.title("Topic #" + str(t))
            # plt.show()
            plt.savefig(os.path.join('./results/wordcloud_' + str(num_topics) +
                                     '_' + str(t) + '.png'),
                        format='png',
                        dpi=300)
    else:
        pass

if inference:
    # Infer topic distribution in the data to find examples
    # lda_model = last model from above
    # unpickle another model that you prefer