# Inspect per-document topic assignments (requires a model trained with per_word_topics=True).
for c in lda_model[corpus[5:8]]:
    print("Document Topics      : ", c[0])       # [(topic, percent contribution)]
    print("Word id, Topics      : ", c[1][:3])   # [(word id, [topics])]
    print("Phi Values (word id) : ", c[2][:2])   # [(word id, [(topic, phi value)])]
    print("Word, Topics         : ", [(dct[wd], topic) for wd, topic in c[1][:2]])   # [(word, [topics])]
    print("Phi Values (word)    : ", [(dct[wd], topic) for wd, topic in c[2][:2]])   # [(word, [(topic, phi value)])]
    print("------------------------------------------------------\n")

# Build feature vectors: per-document topic proportions plus the review length.
train_vecs = []
for i in range(len(train)):
    top_topics = lda_model.get_document_topics(corpus[i], minimum_probability=0.0)
    topic_vec = [top_topics[t][1] for t in range(num_topics)]
    # topic_vec.extend([train.iloc[i].real_counts])  # counts of reviews for the restaurant
    topic_vec.extend([len(train.iloc[i].comment)])   # review length
    train_vecs.append(topic_vec)
print(train_vecs)

X = np.array(train_vecs)
y = np.array(train['label'].values)
print(y)
print(len(X), len(y))

test_vecs = []
for i in range(len(test)):
    idx = i + len(train)  # test documents follow the training documents in `corpus`
    top_topics = lda_model.get_document_topics(corpus[idx], minimum_probability=0.0)
    topic_vec = [top_topics[t][1] for t in range(num_topics)]
    topic_vec.extend([len(test.iloc[i].comment)])    # review length, mirroring the train loop
    test_vecs.append(topic_vec)
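# A minimal downstream-classifier sketch: the topic-proportion vectors built above
# (plus the review-length feature) can be fed to an ordinary scikit-learn model.
# `X_test` / `y_test` are hypothetical here and would be assembled from `test_vecs`
# and `test['label']` the same way `X` / `y` were built above.
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
clf = LogisticRegression(max_iter=1000)
clf.fit(scaler.fit_transform(X), y)
# print(clf.score(scaler.transform(X_test), y_test))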
# Keyword weights per topic; assumes `x = lda_model.show_topics(..., formatted=False)`
# was computed above.
keywordWeights = []
for tp in x:
    words = []
    weights = []
    for pair in tp[1]:
        words.append(pair[0])
        weights.append(int(pair[1] * 10000))
    keywordWeights.append(weights)

# Top topics per paragraph
df = pd.DataFrame()
df['referenceId'] = referenceIds
df['paragraph'] = raw_paragraphs

topicNumbers = []
for c in range(len(corpus)):
    topTopics = []
    topTopicProbabilities = []
    for topicNumber in lda_model.get_document_topics(corpus[c]):
        topTopics.append(topicNumber[0])
        topTopicProbabilities.append(topicNumber[1])
    topTopicsSorted = [
        t for _, t in sorted(zip(topTopicProbabilities, topTopics), reverse=True)
    ]
    topicNumbers.append(topTopicsSorted)
df['topTopics'] = topicNumbers

# Most probable topic per paragraph
topTopics = []
for index, row in df.iterrows():
    if row['topTopics']:
        topTopics.append(row['topTopics'][0])
    else:
        topTopics.append(-1)  # no topic assigned to this paragraph
df['topic'] = topTopics
class Lda():

    def __init__(self):
        self.logger = Logger.logger
        self.storage_path = Config.lda_storage_path

        # Filenames
        self.gensim_dictionary = 'dictionary.gensim'
        self.gensim_model = 'model.gensim'
        self.corpus_pickle = 'corpus.pkl'

    def persist_lda(self):
        """
        Persist corpus, dictionary and LDA model locally.
        :return:
        """
        if not os.path.exists(self.storage_path):
            os.makedirs(self.storage_path)
        self.logger.info("Persisting corpus, dictionary and LDA model to file.")
        pickle.dump(
            self.bow_corpus,
            open(os.path.join(self.storage_path, self.corpus_pickle), 'wb'))
        self.dictionary.save(
            os.path.join(self.storage_path, self.gensim_dictionary))
        self.ldamodel.save(os.path.join(self.storage_path, self.gensim_model))

    def load_lda(self):
        """
        Load corpus, dictionary and LDA model from local storage.
        :return:
        """
        self.logger.info("Loading corpus, dictionary and LDA model from file.")
        self.dictionary = corpora.Dictionary.load(
            os.path.join(self.storage_path, self.gensim_dictionary))
        self.bow_corpus = pickle.load(
            open(os.path.join(self.storage_path, self.corpus_pickle), "rb"))
        path = os.path.join(self.storage_path, self.gensim_model)
        self.ldamodel = LdaModel.load(path)

    def show_topics(self):
        topics = self.ldamodel.print_topics(num_words=5)
        for topic in topics:
            print(topic)

    def train_lda(self, texts, num_topics=5, n=None):
        self.logger.info("Creating corpus, dictionary and LDA model.")
        self.dictionary = corpora.Dictionary(texts)
        self.bow_corpus = [self.dictionary.doc2bow(text) for text in texts]
        tfidf = models.TfidfModel(self.bow_corpus)
        corpus_tfidf = tfidf[self.bow_corpus]
        self.ldamodel = LdaMulticore(
            corpus=corpus_tfidf,
            num_topics=num_topics,
            id2word=self.dictionary,
            passes=10,
            workers=3,
        )

    def classify(self, text):
        """
        Return a vector of probabilities describing which topics the given text belongs to.
        :param text:
        :return:
        """
        self.logger.info("Classifying the given text.")
        new_doc_bow = self.dictionary.doc2bow(text)
        return self.ldamodel.get_document_topics(new_doc_bow)

    def export_html(self):
        self.logger.info("Exporting LDA visualization to an HTML file.")
        lda_display = pyLDAvis.gensim.prepare(self.ldamodel,
                                              self.bow_corpus,
                                              self.dictionary,
                                              sort_topics=True)
        pyLDAvis.save_html(lda_display, self.storage_path + "/index.html")

    def visualize(self):
        """
        Visualize the LDA model using pyLDAvis.
        :return:
        """
        lda_display = pyLDAvis.gensim.prepare(self.ldamodel,
                                              self.bow_corpus,
                                              self.dictionary,
                                              sort_topics=True)
        pyLDAvis.show(lda_display)
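# Hedged usage sketch for the Lda wrapper above. It assumes `texts` is a list of
# already-tokenized documents and that `Config.lda_storage_path` points to a
# writable directory; the tiny corpus below is purely illustrative.
texts = [
    ["topic", "modeling", "with", "gensim"],
    ["restaurant", "review", "classification"],
]

lda = Lda()
lda.train_lda(texts, num_topics=2)
lda.show_topics()
lda.persist_lda()

# Later, e.g. in another process:
lda_loaded = Lda()
lda_loaded.load_lda()
print(lda_loaded.classify(["gensim", "topic"]))  # -> [(topic_id, probability), ...]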
def run(self, args): # mlflow logs experiment_name = "dev-LessonsClustering" if args.environment == "production": experiment_name = "LessonsClustering" elif args.environment == "staging": experiment_name = "staging-LessonsClustering" mlflow.set_experiment(experiment_name) client = mlflow.tracking.MlflowClient() with mlflow.start_run(): log_param("environment", args.environment) log_param("mode", args.mode) log_param("update_related_lessons", args.update_related_lessons) # Get lessons data from database df = ef.getLessons(self.credentials) # Pre Processing lessonsData = df[df['isLesson'] == True] lessonsData = lessonsData[lessonsData['summary'] == lessonsData['summary']] raw_paragraphs = lessonsData['paragraph'] urls = lessonsData['urlToFile'] raw_sentences = raw_paragraphs ids = lessonsData['_id'] sentences = [line.split(' ') for line in raw_sentences] stop_words = stopwords.words('english') stop_words.extend( ['from', 'subject', 're', 'edu', 'use', 'äô', 'äù', 'äì']) words_to_remove = ['iii', 'project'] def remove_stopwords(texts): return [[ word for word in simple_preprocess(str(doc)) if word not in stop_words ] for doc in texts] def remove_words(texts): return [[ word for word in simple_preprocess(str(doc)) if word not in words_to_remove ] for doc in texts] def remove_word_length_2(texts): allSentences = [] for doc in texts: newWords = [] for word in doc: if len(word) > 2: newWords.append(word) allSentences.append(newWords) return allSentences def replace_adb_special_characters(texts): return [[ word.replace('‚Äôs', "'s ").replace('O‚ÄôSmach', "0").replace( 'äù', "").replace('äô', "").replace('äì', "") for word in doc ] for doc in texts] def get_wordnet_pos(word): tag = nltk.pos_tag([word])[0][1][0].upper() tag_dict = { "J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV } return tag_dict.get(tag, wordnet.NOUN) sentences = replace_adb_special_characters(sentences) data_words_nostops = remove_stopwords(sentences) lemmatizer = WordNetLemmatizer() lemmatized_output = [] for paragraph in data_words_nostops: lemmatized_output.append([ lemmatizer.lemmatize(word, get_wordnet_pos(word)) for word in paragraph ]) sentences = remove_words(lemmatized_output) sentences_no_length_2 = remove_word_length_2(sentences) sentences = sentences_no_length_2 id2word = corpora.Dictionary(sentences) texts = sentences corpus = [id2word.doc2bow(text) for text in texts] def compute_coherence_values(corpus, dictionary, k, a, b): lda_model = LdaMulticore(corpus=corpus, id2word=id2word, num_topics=k, random_state=100, chunksize=100, passes=10, alpha=a, eta=b, per_word_topics=True) coherence_model_lda = CoherenceModel(model=lda_model, texts=sentences, dictionary=id2word, coherence='c_v') return coherence_model_lda.get_coherence() # Fine Tuning if args.mode == "fine_tuning": grid = {} grid['Validation_Set'] = {} # Topics range min_topics = 2 max_topics = args.max_number_of_topics step_size = 1 topics_range = range(min_topics, max_topics + 1, step_size) # Alpha parameter alpha = list(np.arange(0.01, 1, 0.3)) # alpha.append('symmetric') # alpha.append('asymmetric') # Beta parameter beta = list(np.arange(0.01, 1, 0.3)) # beta.append('symmetric') # Validation sets # num_of_docs = len(corpus) corpus_sets = [ # ClippedCorpus(corpus, int(num_of_docs*0.25)), # ClippedCorpus(corpus, int(num_of_docs*0.5)), # ClippedCorpus(corpus, int(num_of_docs*0.75)), corpus ] # corpus_title = [ # '25% Corpus' # '50% Corpus', # '75% Corpus' # '100% Corpus' # ] model_results = { # 'Validation_Set': [], 'Number Of Topics': 
[], 'Alpha': [], 'Beta': [], 'Coherence': [] } model_results_2 = { 'Number Of Topics': [], 'Average Coherence': [] } maxCoherence = 0 maxCoherenceK = 2 maxCoherenceA = 0.01 maxCoherenceB = 0.01 for i in range(len(corpus_sets)): for k in topics_range: for a in alpha: for b in beta: cv = compute_coherence_values( corpus=corpus_sets[i], dictionary=id2word, k=k, a=a, b=b) if cv > maxCoherence: maxCoherence = cv maxCoherenceK = k maxCoherenceA = a maxCoherenceB = b # model_results['Validation_Set'].append(corpus_title[i]) model_results['Number Of Topics'].append(k) model_results['Alpha'].append(a) model_results['Beta'].append(b) model_results['Coherence'].append(cv) customStep = int( str(k) + "{:.2f}".format(a).replace(".", "") + "{:.2f}".format(b).replace(".", "")) log_metric("coherence", cv, step=customStep) model_results_2['Number Of Topics'].append(k) model_results_2['Average Coherence'].append(cv) log_metric("average_coherence", cv, step=k) log_metric("max_coherence", maxCoherence) log_metric("number_of_topics_of_max_coherence", maxCoherenceK) log_metric("alpha_of_max_coherence", maxCoherenceA) log_metric("beta_of_max_coherence", maxCoherenceB) pd.DataFrame(model_results).to_csv(defaults.DATA_PATH + "fine-tuning.csv", index=False) pd.DataFrame(model_results_2).to_csv(defaults.DATA_PATH + "fine-tuning-2.csv", index=False) log_artifact(defaults.DATA_PATH + "fine-tuning.csv", "data/") log_artifact(defaults.DATA_PATH + "fine-tuning-2.csv", "data/") # Train LDA model elif args.mode == "train": log_metric("number_of_topics", args.number_of_topics) log_metric("alpha", args.alpha) log_metric("beta", args.beta) lda_model = LdaMulticore(corpus=corpus, id2word=id2word, num_topics=args.number_of_topics, random_state=200, chunksize=100, passes=10, alpha=args.alpha, eta=args.beta, per_word_topics=True) cv = compute_coherence_values(corpus=corpus, dictionary=id2word, k=args.number_of_topics, a=args.alpha, b=args.beta) log_metric("coherence", cv) lda_model.save(defaults.MODEL_PATH + "lda.model") log_artifact(defaults.MODEL_PATH + "lda.model", "models/") # Predict LDA model elif args.mode == "predict": log_param("run_id_model", args.run_id_model) number_of_topics = int(args.number_of_topics) if not args.run_id_model == "": data = client.get_run(args.run_id_model).data number_of_topics = int(data.params['number_of_topics']) alpha = float(data.params['alpha']) beta = float(data.params['beta']) log_metric("number_of_topics", number_of_topics) log_metric("alpha", alpha) log_metric("beta", beta) cv = compute_coherence_values(corpus=corpus, dictionary=id2word, k=number_of_topics, a=alpha, b=beta) log_metric("coherence", cv) # Download and load the LDA model modelFilePath = defaults.MODEL_PATH + "lda.model" af.downloadLDAModel(args, modelFilePath) lda_model = LdaModel.load(modelFilePath) # lda_model.save(defaults.MODEL_PATH + "lda.model") # log_artifact(defaults.MODEL_PATH + "lda.model", "models/") # Keyword weights x = lda_model.show_topics(num_topics=number_of_topics, num_words=50, formatted=False) keywordWeights = [] topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x] for tp in x: words = [] weights = [] for pair in tp[1]: words.append(pair[0]) weights.append(int(pair[1] * 10000)) keywordWeights.append(weights) # Top topics per paragraph topicNumbers = [] for c in range(len(corpus)): maxProbability = 0 indexOfMax = 0 topTopics = [] topTopicProbabilities = [] lda_model.get_document_topics(corpus[c]) for topicNumber in lda_model.get_document_topics( corpus[c]): topTopics.append(topicNumber[0]) 
topTopicProbabilities.append(topicNumber[1]) topTopicsSorted = [ x for _, x in sorted(zip(topTopicProbabilities, topTopics), reverse=True) ] topicNumbers.append(topTopicsSorted) lessonsData['newTopTopics'] = topicNumbers lessonsData['topTopics'] = topicNumbers # Most probable topic per paragraph topTopics = [] for index, row in lessonsData.iterrows(): if (row['topTopics']): topTopics.append(row['topTopics'][0]) else: topTopics.append(-1) lessonsData['topic'] = topTopics # Frequencies of topic keywords and number of PCRs per topic topics = pd.DataFrame() topicKeywords = [] allKeywords = [] topicIds = [] for topic, words in topics_words: allKeywords.append(words) topicIds.append(topic) topics['key'] = topicIds topics['keywords'] = allKeywords topics['oldFrequencies'] = [[0] * len(keywords) for keywords in allKeywords] topics['numberOfLessons'] = 0 topics['PCRs'] = [[] for i in range(len(topics))] topics['numberOfPCRs'] = 0 for sentenceTopicNumbers, sentenceURL in zip( topicNumbers, urls): for topicNumber in sentenceTopicNumbers: topics.at[topicNumber, 'numberOfLessons'] = topics.at[ topicNumber, 'numberOfLessons'] + 1 topics.at[topicNumber, 'PCRs'].append(sentenceURL) for index, row in topics.iterrows(): topics.at[index, 'numberOfPCRs'] = len( set(topics.at[index, 'PCRs'])) topics = topics.drop(columns=['PCRs']) # Frequencies of words per sentence per topic topics['oldFrequencies'] = [[0] * len(keywords) for keywords in allKeywords] for index, row in topics.iterrows(): topicNumber = topics.at[index, 'key'] topicKeywords = topics.at[index, 'keywords'] topicKeywordsFrequencies = topics.at[index, 'oldFrequencies'] for sentence, sentenceTopicNumbers in zip( sentences, topicNumbers): for sentenceTopicNumber in sentenceTopicNumbers: if topicNumber == sentenceTopicNumber: for word in sentence: if word in topicKeywords: indexOfWord = topicKeywords.index(word) topicKeywordsFrequencies[ indexOfWord] = topicKeywordsFrequencies[ indexOfWord] + 1 topics.at[index, 'oldFrequencies'] = topicKeywordsFrequencies topics['frequencies'] = keywordWeights # Top word per topic topicTopWords = [] for index, row in topics.iterrows(): topicTopWords.append(row['keywords'][0]) topics['topWord'] = topicTopWords # Adjacent topics # pyLDAvis.enable_notebook() vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary=lda_model.id2word) topics['x'] = 1.0 topics['y'] = 1.0 for topic, x in zip(list(vis.topic_coordinates.index), list(vis.topic_coordinates.x)): topics.at[topic, 'x'] = float(x) for topic, y in zip(list(vis.topic_coordinates.index), list(vis.topic_coordinates.y)): topics.at[topic, 'y'] = float(y) import math def calculateDistance(x1, y1, x2, y2): dist = math.sqrt((x2 - x1)**2 + (y2 - y1)**2) return dist distanceMatrix = [] allDistances = [] c1 = 0 topicsX = topics['x'].tolist() topicsY = topics['y'].tolist() for tx1, ty1 in zip(topicsX, topicsY): distances = [] for tx2, ty2 in zip(topicsX, topicsY): distance = calculateDistance(tx1, ty1, tx2, ty2) if not distance: distance = 999 else: allDistances.append(distance) distances.append(distance) distanceMatrix.append(distances) c1 = c1 + 1 percentile20 = np.percentile(allDistances, 20) numberOfAdjacent = 0 numberOfNodes = len(distanceMatrix) allAdjacentTopics = [] for distances in distanceMatrix: adjacentTopics = [] for index, distance in zip(range(len(distances)), distances): if distance <= percentile20: adjacentTopics.append(index) allAdjacentTopics.append(adjacentTopics) numberOfAdjacent = numberOfAdjacent + len(adjacentTopics) numberOfAdjacent = numberOfAdjacent 
/ 2 pairs = [] for index, adjacentTopicList in zip( range(len(allAdjacentTopics)), allAdjacentTopics): for adjacentTopic in adjacentTopicList: pairs.append(sorted([index, adjacentTopic])) pairs.sort() dedupedPairs = list(pairs for pairs, _ in itertools.groupby(pairs)) topWordPairs = [] for pair in dedupedPairs: topWordPairs.append( [topicTopWords[pair[0]], topicTopWords[pair[1]]]) topics['adjacentTopics'] = allAdjacentTopics # Save topics data ef.deleteIndex(self.credentials, "topics") ef.saveTopics(self.credentials, topics) # Lesson strength maxLessonStrength = topics['numberOfPCRs'].sum() lessonStrengths = [] for index, row in lessonsData.iterrows(): topicNumbers = row['topTopics'] lessonStrength = 0 for topicNumber in topicNumbers: lessonStrength = lessonStrength + topics.at[ topicNumber, 'numberOfPCRs'] lessonStrengths.append(lessonStrength / maxLessonStrength) lessonsData['lessonStrength'] = lessonStrengths # Save lessons data ef.updateSentences(self.credentials, lessonsData) mf.backupIndex(self.credentials, "sentences") mf.backupIndex(self.credentials, "topics") # Update related lessons # Get TFIDF model if args.update_related_lessons == "True": tfidf = TfidfModel(corpus, smartirs='ntc') tfidf_corpus = [] for doc in corpus: tfidf_corpus.append(tfidf[doc]) tfidf_mat = matutils.corpus2dense(tfidf_corpus, num_terms=len(id2word.token2id)) tfidf_mat_transpose = tfidf_mat.transpose() tfidfDF = pd.DataFrame( data=tfidf_mat_transpose[0:, 0:], index=[i for i in range(tfidf_mat_transpose.shape[0])], columns=[ '' + str(i) for i in range(tfidf_mat_transpose.shape[1]) ]) tfidfDF['id'] = ids.tolist() # Save related lessons cf.updateRelatedLessons(self.credentials, tfidfDF)
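# Standalone sketch of the coherence-driven grid search performed by the
# fine_tuning branch above, without the mlflow logging. It assumes `corpus`,
# `id2word` and the tokenized `sentences` from the pre-processing step exist;
# the parameter ranges mirror the ones used above.
import numpy as np
from gensim.models import LdaMulticore, CoherenceModel

best = {"coherence": -np.inf, "k": None, "alpha": None, "beta": None}
for k in range(2, 6):
    for a in np.arange(0.01, 1, 0.3):
        for b in np.arange(0.01, 1, 0.3):
            model = LdaMulticore(corpus=corpus, id2word=id2word, num_topics=k,
                                 random_state=100, chunksize=100, passes=10,
                                 alpha=float(a), eta=float(b))
            cv = CoherenceModel(model=model, texts=sentences, dictionary=id2word,
                                coherence="c_v").get_coherence()
            if cv > best["coherence"]:
                best = {"coherence": cv, "k": k, "alpha": float(a), "beta": float(b)}
print(best)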
class AmazonReviewDataset(): def __init__(self, dirpath, word2vector_path, max_vocab_size=10000, num_topics=50, num_topic_iterations=2000, num_topic_passes=10, reproc=False): self.dirpath = dirpath self.word2vector_path = word2vector_path self.max_vocab_size = max_vocab_size self.num_topics = num_topics self.num_topic_iterations = num_topic_iterations self.num_topic_passes = num_topic_passes self.reproc = reproc self.domains = os.listdir(self.dirpath) if not os.path.exists("./preproc_data"): os.makedirs("./preproc_data") print("Initialize the Pre-processed data") elif self.reproc: os.rmdir("./preproc_data") os.makedirs("./preproc_data") print("Re-construct Pre-processed data") else: print("Re-use History Pre-processed data") "domain2data" ################################################## def load_domain2data(self): def file_parser(domain, split): def line_parser(line): features, review = line.split(' ')[:-1], [] for feature in features: ngram, count = feature.split(':') for _ in range(int(count)): review.append(ngram) return review file_path = os.path.join(self.dirpath, domain, '{}.review'.format(split)) with open(file_path, "r") as f: reviews = [line_parser(line) for line in f] return reviews if os.path.exists("./preproc_data/domain2data.pkl"): with open("./preproc_data/domain2data.pkl", "rb") as filer: self.domain2data = pickle.load(filer) else: self.domain2data = { domain: { "labeled": [], "label": [], "unlabeled": None } for domain in self.domains } for domain in self.domains: for split in ['positive', 'negative', 'unlabeled']: reviews = file_parser(domain, split) if split == 'unlabeled': self.domain2data[domain]['unlabeled'] = reviews else: self.domain2data[domain]['labeled'] += reviews self.domain2data[domain]['label'] += [ 1 if split == "positive" else 0 ] * len(reviews) self.domain2data[domain]["label"] = np.array( self.domain2data[domain]["label"]) with open("./preproc_data/domain2data.pkl", "wb") as filew: pickle.dump(self.domain2data, filew) print("Load domain2data has done.") ################################################## def load_global_vocab(self): if os.path.exists("./preproc_data/vocab.txt"): self.word2id = {} with open("./preproc_data/vocab.txt", 'r') as f: for i, line in enumerate(f): if i >= self.max_vocab_size: break word, idx = line.split('\t') self.word2id[word] = int(idx.strip()) self.vocab_size = len(self.word2id) self.id2word = { index: word for word, index in self.word2id.items() } else: texts = [] if not hasattr(self, "domain2data"): self.load_domain2data() for domain in self.domain2data: texts.extend(self.domain2data[domain]["labeled"]) texts.extend(self.domain2data[domain]["unlabeled"]) word_counts = Counter(itertools.chain(*texts)) most_common = word_counts.most_common(n=self.max_vocab_size) self.word2id = { word: index for index, (word, _) in enumerate(most_common) } self.id2word = { index: word for word, index in self.word2id.items() } with open("./preproc_data/vocab.txt", 'w') as f: for word, index in sorted(self.word2id.items(), key=lambda d: d[1]): f.write('%s\t%d\n' % (word, index)) self.vocab_size = len(self.word2id) print("Load vocab has done.") "word2vector" ################################################## def load_word2vector(self): if os.path.exists("./preproc_data/word2vector.pkl"): with open("./preproc_data/word2vector.pkl", "rb") as filer: self.word2vector = pickle.load(filer) else: self.word2vector = {} if not hasattr(self, "word2id"): self.load_global_vocab() with open(self.word2vector_path, 'r') as f: for i, line in enumerate(f): if i 
== 0: continue word = line.split(' ')[0] if word not in self.word2id: continue line = ' '.join(line.split(' ')[1:]).strip() vector = np.fromstring(line, dtype=float, sep=' ') self.word2vector[word] = vector with open("./preproc_data/word2vector.pkl", "wb") as filer: pickle.dump(self.word2vector, filer) print("Load word2vector has done.") "topic_model" ################################################## def load_topic_model(self): if not hasattr(self, "word2id"): self.load_globel_vocab() self.vectorizer = CountVectorizer(vocabulary=self.word2id, tokenizer=lambda x: x, preprocessor=lambda x: x) file_path = "./preproc_data/topic_model.pkl" if os.path.exists(file_path): self.topic_model = LdaModel.load(file_path) else: texts = [] if not hasattr(self, "domain2data"): self.load_domain2data() for domain in self.domain2data: texts.extend(self.domain2data[domain]["labeled"]) texts.extend(self.domain2data[domain]["unlabeled"]) corpus = self.vectorizer.fit_transform(texts) corpus = Sparse2Corpus(corpus, documents_columns=False) self.topic_model = LdaMulticore( corpus=corpus, num_topics=self.num_topics, id2word=self.id2word, iterations=self.num_topic_iterations, passes=self.num_topic_passes) self.topic_model.save(file_path) "tfidf" ################################################## def load_domain2tfidf(self): if os.path.exists("./preproc_data/domain2tfidf.pkl"): with open("./preproc_data/domain2tfidf.pkl", "rb") as filer: self.domain2tfidf = pickle.load(filer) else: if not hasattr(self, "domain2data"): self.load_domain2data() if not hasattr(self, "word2id"): self.load_global_vocab() self.domain2tfidf = { domain: { "labeled": [], "label": [], "unlabeled": [] } for domain in self.domains } for domain in self.domain2data: vectorizer = TfidfVectorizer(vocabulary=self.word2id, tokenizer=lambda x: x, preprocessor=lambda x: x) vectorizer.fit(self.domain2data[domain]["labeled"] + self.domain2data[domain]["unlabeled"]) for key in self.domain2tfidf[domain]: self.domain2tfidf[domain][key] = self.domain2data[domain][key] if key == "label" \ else vectorizer.transform(self.domain2data[domain][key]) with open("./preproc_data/domain2tfidf.pkl", "wb") as filew: pickle.dump(self.domain2tfidf, filew) print("Load domain2tfidf has done.") "texts" ################################################## def get_texts(self, domains, unlabeled=True): texts = [] if not hasattr(self, "domain2data"): self.load_domain2data() for domain in domains: texts.extend(self.domain2data[domain]["labeled"]) if unlabeled: texts.extend(self.domain2data[domain]["unlabeled"]) return texts "distribution" ################################################## def get_texts_term_distribution(self, texts): if not hasattr(self, "word2id"): self.load_global_vocab() term_distribution = np.zeros(len(self.word2id)) for text in texts: for word in text: if word in self.word2id: term_distribution[self.word2id[word]] += 1 term_distribution /= np.sum(term_distribution) if np.isnan(np.sum(term_distribution)): term_distribution = np.zeros(self.vocab_size) return term_distribution "topic" ################################################## def get_texts_topic_distribution(self, texts): if not hasattr(self, "vectorizer"): self.load_topic_model() vectorized_corpus = self.vectorizer.transform(texts) gensim_corpus = Sparse2Corpus(vectorized_corpus, documents_columns=False) topic_representations = [] for doc in gensim_corpus: topic_representations.append([ topic_prob for (_, topic_prob) in self.topic_model.get_document_topics( doc, minimum_probability=0.) 
]) return np.array(topic_representations) "word2vec" ################################################## def get_texts_word2vec_distribution(self, texts): if not hasattr(self, "word2vector"): self.load_word2vector() word_embeds, t = [], 10e-5 texts_term_distribution_weights = self.get_texts_term_distribution( texts) for text in texts: word_count, doc_vector = 0, np.zeros( len(list(self.word2vector.values())[0])) for word in text: if word not in self.word2vector: continue doc_vector += np.sqrt(t / (texts_term_distribution_weights[ self.word2id[word]])) * self.word2vector[word] word_count += 1 doc_vector = doc_vector if word_count == 0 else doc_vector / word_count word_embeds.append(doc_vector) return np.array(word_embeds) "model feature" ################################################## def get_model_feature(self, domains): if not hasattr(self, "domain2tfidf"): self.load_domain2tfidf() X, Y, D = [], [], [] for domain in domains: X.extend(self.domain2tfidf[domain]["labeled"]) Y.extend(self.domain2tfidf[domain]["label"]) D.extend([domain] * self.domain2tfidf[domain]["labeled"].shape[0]) X = scipy.sparse.vstack(X).toarray() Y = np.asarray(Y) return X, Y, D "metric feature" ################################################### def get_metric_feature(self, target_domain, metric_dict): metric_names = [(metric_type, metric_name) for metric_type in metric_dict for metric_name in metric_dict[metric_type]] feature = [] if "term" in metric_dict: term_feature = self.get_term_feature(target_domain, metric_names) feature.append(term_feature) if "topic" in metric_dict: topic_feature = self.get_topic_feature(target_domain, metric_names) feature.append(topic_feature) if "word2vec" in metric_dict: word2vec_feature = self.get_word2vec_feature( target_domain, metric_names) feature.append(word2vec_feature) if "diversity" in metric_dict: diversity_feature = self.get_diversity_feature( target_domain, metric_names) feature.append(diversity_feature) feature = np.concatenate(feature, axis=1) return feature def get_term_feature(self, target_domain, metric_names): filepath = "./preproc_data/term_feature_{}.pkl".format(target_domain) if os.path.exists(filepath): with open(filepath, "rb") as filer: term_feature = pickle.load(filer) else: source_texts = self.get_texts( [domain for domain in self.domains if domain != target_domain], unlabeled=False) target_texts = self.get_texts([target_domain], unlabeled=False) texts_distribution = [ self.get_texts_term_distribution([text]) for text in source_texts ] domain_distribution = self.get_texts_term_distribution( target_texts) rvalues = [] for text_distribution in texts_distribution: values = [] for metric_name in metric_names: metric_type, metric_func = metric_name if metric_type != "term": continue if metric_func in [ 'jensen_shannon', 'renyi', 'cosine', 'euclidean', 'variational', 'bhattacharyya' ]: values.append( getattr(Metric, metric_func)(text_distribution, domain_distribution)) rvalues.append(values) term_feature = np.asarray(rvalues) with open(filepath, "wb") as filew: pickle.dump(term_feature, filew) return term_feature def get_topic_feature(self, target_domain, metric_names): filepath = "./preproc_data/topic_feature_{}.pkl".format(target_domain) if os.path.exists(filepath): with open(filepath, "rb") as filer: topic_feature = pickle.load(filer) else: source_texts = self.get_texts( [domain for domain in self.domains if domain != target_domain], unlabeled=False) target_texts = self.get_texts([target_domain], unlabeled=False) texts_distribution = 
self.get_texts_topic_distribution( source_texts) domain_distribution = np.mean( self.get_texts_topic_distribution(target_texts), axis=0) rvalues = [] for text_distribution in texts_distribution: values = [] for metric_name in metric_names: metric_type, metric_func = metric_name if metric_type != "topic": continue if metric_func in [ 'jensen_shannon', 'renyi', 'cosine', 'euclidean', 'variational', 'bhattacharyya' ]: values.append( getattr(Metric, metric_func)(text_distribution, domain_distribution)) rvalues.append(values) topic_feature = np.asarray(rvalues) with open(filepath, "wb") as filew: pickle.dump(topic_feature, filew) return topic_feature def get_word2vec_feature(self, target_domain, metric_names): filepath = "./preproc_data/word2vec_feature_{}.pkl".format( target_domain) if os.path.exists(filepath): with open(filepath, "rb") as filer: word2vec_feature = pickle.load(filer) else: source_texts = self.get_texts( [domain for domain in self.domains if domain != target_domain], unlabeled=False) target_texts = self.get_texts([target_domain], unlabeled=False) texts_distribution = self.get_texts_word2vec_distribution( source_texts) domain_distribution = np.mean( self.get_texts_word2vec_distribution(target_texts), axis=0) rvalues = [] for text_distribution in texts_distribution: values = [] for metric_name in metric_names: metric_type, metric_func = metric_name if metric_type != "word2vec": continue if metric_func in ['cosine', 'euclidean', 'variational']: values.append( getattr(Metric, metric_func)(text_distribution, domain_distribution)) rvalues.append(values) word2vec_feature = np.asarray(rvalues) with open(filepath, "wb") as filew: pickle.dump(word2vec_feature, filew) return word2vec_feature def get_diversity_feature(self, target_domain, metric_names): filepath = "./preproc_data/diversity_feature_{}.pkl".format( target_domain) if os.path.exists(filepath): with open(filepath, "rb") as filer: diversity_feature = pickle.load(filer) else: if not hasattr(self, "word2vector"): self.load_word2vector() source_texts = self.get_texts( [domain for domain in self.domains if domain != target_domain], unlabeled=False) term_distribution = self.get_texts_term_distribution(source_texts) rvalues = [] for source_text in source_texts: p_words, p_word_vector_pairs = [], [] for word in set(source_text): if word in self.word2id: p_words.append(term_distribution[self.word2id[word]]) if word in self.word2vector: p_word_vector_pairs.append( (term_distribution[self.word2id[word]], self.word2vector[word])) else: p_words.append(0.0) values = [] for metric_name in metric_names: metric_type, metric_func = metric_name if metric_type != "diversity": continue if metric_func in ['num_word_types', 'type_token_ratio']: values.append( getattr(Metric, metric_func)(source_text)) elif metric_func in [ 'entropy', 'simpsons_index', 'renyi_entropy' ]: values.append(getattr(Metric, metric_func)(p_words)) elif metric_func in ['quadratic_entropy']: values.append( getattr(Metric, metric_func)(p_word_vector_pairs)) else: raise AttributeError() rvalues.append(values) diversity_feature = np.asarray(rvalues) with open(filepath, "wb") as filew: pickle.dump(diversity_feature, filew) return diversity_feature
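# The feature extractors above delegate distance computations to a `Metric`
# helper that is not shown in this snippet. The sketch below illustrates what
# two of the named measures (jensen_shannon, cosine) could look like; it is an
# assumption, not the original implementation.
import numpy as np
from scipy.spatial.distance import cosine as _cosine_distance

class Metric:

    @staticmethod
    def jensen_shannon(p, q):
        """Jensen-Shannon divergence between two discrete distributions."""
        p, q = np.asarray(p, dtype=float), np.asarray(q, dtype=float)
        m = 0.5 * (p + q)

        def _kl(a, b):
            mask = a > 0
            return np.sum(a[mask] * np.log(a[mask] / b[mask]))

        return 0.5 * _kl(p, m) + 0.5 * _kl(q, m)

    @staticmethod
    def cosine(p, q):
        """Cosine distance between two vectors."""
        return _cosine_distance(p, q)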
class LDAMWBase: def __init__(self, mtype='multiple', resource=None, lda_work_folder=None, lda_model_filename=None, lda_dict_filename=None, lda_topic_word_count=0, lda_topics_count=0, resource_language=None, data_type=None): # # todo Deutsch Lemmatizer / Stemmer !!! # self.p_stemmer = PorterStemmer() self.wn_lemmatizer = WordNetLemmatizer() if resource is not None: # resource_lang == 'en' as default resource_lang = 'en' # hope that resource is correct and exists if data_type == 'db': resource_lang = Resources.select(Resources.lang).where( Resources.resource == resource).get() resource_lang = resource_lang.__data__['lang'].lower() elif data_type == 'csv': if resource_language is None: raise Exception( "Resource language must be defined for csv data type.") else: resource_lang = resource_language else: pass self.stop_words = get_stop_words(resource_lang) self.resource_identifier_name = resource def _create_model_deps(model_name, twordscount, tcount, mini=False, mini_path=None): if not mini: mp = DEFAULT_PROJECT_PATH + 'topics_extractor/lda_data' + '/' + model_name else: mp = DEFAULT_PROJECT_PATH + 'topics_extractor/lda_data' + '/' + mini_path mn = 'lda_model' + '_' + model_name md = 'dictionary' + '_' + model_name ltwordscount = twordscount ltcount = tcount _short_model_report = "{}{}: {} \n{}{}: {}\n{}{}: {}\n{}{}: {}\n{}{}: {}\n{}".format( INFO_FLAG, colored("Model path", 'red', None, ['bold']), mp, INFO_FLAG, colored("Model name", 'red', None, ['bold']), mn, INFO_FLAG, colored("Model dictionary", 'red', None, ['bold']), md, INFO_FLAG, colored("Topic words count", 'red', None, ['bold']), ltwordscount, INFO_FLAG, colored("Topics count", 'red', None, ['bold']), ltcount, "-" * 88) if model_name != 'mini': print(_short_model_report) return mp, mn, md, ltwordscount, ltcount if mtype == 'multiple': if resource is not None: mpath, mname, mdict, lda_topic_word_count, lda_topics_count = _create_model_deps( self.resource_identifier_name, LDA_TOPIC_WORD_COUNT, LDA_TOPICS_COUNT) else: raise Exception( "{}Resource must be defined. Exiting... 
\n".format( EXCEPTION_FLAG)) elif mtype == 'single_ltc': mpath, mname, mdict, lda_topic_word_count, lda_topics_count = _create_model_deps( "mini", MINI_LDA_TOPIC_WORD_COUNT, MINI_LDA_TOPICS_COUNT, mini=True, mini_path=self.resource_identifier_name + "/mini") if lda_work_folder is None: self.lda_work_folder = mpath else: self.lda_work_folder = lda_work_folder if not os.path.exists(self.lda_work_folder): os.mkdir(self.lda_work_folder) if lda_model_filename is None: self.lda_model_filename = os.path.join(self.lda_work_folder, mname) else: self.lda_model_filename = os.path.join(self.lda_work_folder, lda_model_filename) if lda_dict_filename is None: self.lda_dict_filename = os.path.join(self.lda_work_folder, mdict) else: self.lda_dict_filename = os.path.join(self.lda_work_folder, lda_dict_filename) self.lda_topics_count = lda_topics_count self.lda_topic_word_count = lda_topic_word_count self.dictionary = None self.lda_model = None self.lda_topics = [] @staticmethod def load_csv_data(csv_file): df = pd.read_csv(csv_file) train_documents = df['content'].values return train_documents @staticmethod def load_single_ltc(ltc_data): train_documents = re.split( r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', ltc_data) return train_documents @staticmethod def load_db_data(resource=None): # if resource is None: # art_content_stream = Articles.select() # else: art_content_stream = Articles.select().where( Articles.resource == resource) train_documents = (acs.content for acs in art_content_stream if acs.content is not None) return train_documents def save_model(self, as_name=None, save_on_disk=True, save_topics_into_db=False): if save_on_disk: print(" \t-> Model was saved as [ {} ]".format(as_name)) if as_name is not None: self.lda_model.save(as_name) else: self.save_model(self.lda_model_filename) if save_topics_into_db: truncate_topics_tables(resource=self.resource_identifier_name) print(" \t-> Topics will be saved in database for [ {} ]".format( self.resource_identifier_name)) model_numbers_topics = self._get_topics() try: for topic_info in model_numbers_topics: tnum = topic_info[0] tresourceid = topic_info[1] tname = topic_info[2] _topic = { 'ident_number': tnum, 'value': tname, 'created_at': dt.datetime.today().date() } t = Topics.create(**_topic) t_id = t.__data__['topic'] _topic_resource = { 'resource': tresourceid, 'topic': t_id, 'created_at': dt.datetime.today().date() } tr = TopicsResources.create(**_topic_resource) print("{}[ {} ]".format(SUCCESS_FLAG, self.resource_identifier_name)) except Exception as e: print("{}{}".format(EXCEPTION_FLAG, e)) print("{}Failure: [ {} ]".format( ERROR_FLAG, self.resource_identifier_name)) def train_model(self, data_type, resource, single_ltc_data=None, data_file_path=None, train_corpus=None, train_dictionary=None, save_model_as=None, chunksize=LDA_CHUNKSIZE, passes=LDA_PASSES): if train_corpus is not None: corpus = train_corpus elif data_type == 'db': corpus = self._make_corpus(data_type=data_type, resource=resource) elif data_type == 'single_ltc' and single_ltc_data is not None: corpus = self._make_corpus(data_type=data_type, ltc=single_ltc_data, resource=resource) elif data_type == 'csv' and data_file_path is not None: corpus = self._make_corpus(data_type=data_type, data_file_path=data_file_path, resource=resource) else: raise Exception("{}Corpus is None".format(EXCEPTION_FLAG)) if train_dictionary is not None: dictionary = train_dictionary else: dictionary = self.dictionary """ id2word parameter need to get words in topics instead of their indexes in dict """ 
_tcount = self.lda_topics_count # self.lda_model = LdaModel(corpus=corpus, num_topics=_tcount, id2word=dictionary, passes=passes, chunksize=chunksize) self.lda_model = LdaMulticore(corpus=corpus, num_topics=_tcount, id2word=dictionary, passes=passes, chunksize=chunksize) if save_model_as is not None and not single_ltc_data: self.save_model(save_model_as, save_on_disk=True, save_topics_into_db=False) elif single_ltc_data: self.save_model(self.lda_model_filename, save_on_disk=True, save_topics_into_db=False) elif data_type == 'csv': self.save_model(self.lda_model_filename, save_on_disk=True, save_topics_into_db=False) else: self.save_model(self.lda_model_filename, save_on_disk=True, save_topics_into_db=True) print("{}Trained".format(SUCCESS_FLAG)) def load_model(self, model_file_path=None, dict_file_path=None): """ load model and dictionary from file (need to save them in train function) uses to update model on another corpus """ if model_file_path is not None and os.path.exists(model_file_path): self.lda_model = LdaMulticore.load(model_file_path) # self.lda_model = LdaModel.load(model_file_path) self.dictionary = Dictionary.load(dict_file_path) print(" \t-> Loaded: [ {} ]".format(model_file_path)) elif model_file_path is None and os.path.exists( self.lda_model_filename): self.lda_model = LdaMulticore.load(self.lda_model_filename) # self.lda_model = LdaModel.load(self.lda_model_filename) self.dictionary = Dictionary.load(self.lda_dict_filename) print(" \t-> Loaded: [ {} ]".format(self.lda_model_filename)) else: print( "{}Filepath you gave is incorrect. \n Give another one and retry." "\n Exiting...".format(ERROR_FLAG)) exit() for i in range(self.lda_model.num_topics): terms_id = self.lda_model.get_topic_terms( i, self.lda_topic_word_count) terms = [self.dictionary.get(x[0]) for x in terms_id] self.lda_topics.append(' '.join(terms)) def update_model(self, ondata_file_path=None, resource=None, data_type='db'): if ondata_file_path is not None and data_type == 'csv': corpus = self._make_corpus(data_file_path=ondata_file_path, data_type=data_type, resource=resource) elif data_type == 'db': corpus = self._make_corpus(data_file_path=None, data_type=data_type, resource=resource) else: raise Exception("{}Corpus is None".format(EXCEPTION_FLAG)) self.lda_model.update(corpus) def process_record(self, text, data_type): """ data_type - db / csv / single_ltc """ if data_type == 'single_ltc': try: self.load_model() except Exception as e: print("{}{}".format(EXCEPTION_FLAG, e)) pass elif self.lda_model is None: try: self.load_model() except Exception as e: print("{}{}".format(EXCEPTION_FLAG, e)) pass if data_type == 'db': if self.lda_model is None: return dict() doc = self._prepare_single_document(text) if doc is not None: topics = self._get_document_topics(doc) top_topic = topics[0] return [('topic', self.lda_topics[top_topic])] return [('topic', "")] elif data_type == 'csv': doc = self._prepare_single_document(text) topics_in_count_by_ids = self._get_document_topics(doc) current_doc_topic_id, current_doc_other_topics = topics_in_count_by_ids[ 0], topics_in_count_by_ids[1:] result_topic_word_descr = re.sub( '[^A-Za-z]+', ' ', self._get_topic_by_id(current_doc_topic_id)) return [('topic', result_topic_word_descr), ('other_topics', current_doc_other_topics)] elif data_type == 'single_ltc': doc = self._prepare_single_document(text) topics_in_count_by_ids = self._get_document_topics(doc) if topics_in_count_by_ids is not None: current_doc_topic_id, current_doc_other_topics = topics_in_count_by_ids[ 0], 
topics_in_count_by_ids[1:] result_topic_word_descr = re.sub( '[^A-Za-z]+', ' ', self._get_topic_by_id(current_doc_topic_id)) return result_topic_word_descr, current_doc_other_topics else: return "", [] def _get_metric_fields(self): if self.lda_model is None: return [] else: return ['topic'] def _get_document_topics(self, doc, count=5): if doc is not None: bow = self.dictionary.doc2bow(doc) topics = self.lda_model.get_document_topics( bow, minimum_probability=0.0) topics_in_count = list( ident_number for (ident_number, prob) in sorted( topics, key=itemgetter(1), reverse=True)[:count]) return topics_in_count def _get_document_topic(self, doc_topics): topic_id_probs = {} for t_prob in doc_topics: topic_id_probs[t_prob[0]] = t_prob[1] doc_topic_id = sorted(topic_id_probs, key=topic_id_probs.get, reverse=True)[0] doc_topic_prob = topic_id_probs[doc_topic_id] return [doc_topic_id, doc_topic_prob] def _prepare_single_document(self, sd): if sd is None or type(sd) == np.float: return None try: sd = sd.lower() sd = nltk.tokenize.word_tokenize(sd) sd = (word for word in sd if word.isalpha() and len(word) > 2) stopped_sd = (word for word in sd if word not in self.stop_words) lemmatized_doc = [ self.wn_lemmatizer.lemmatize(word) for word in stopped_sd ] return lemmatized_doc except AttributeError as e: print("{}{}".format(EXCEPTION_FLAG, e)) return None def _make_bow(self, text): if text is not None: d = self._prepare_single_document(text) return self.dictionary.doc2bow(d) def _make_corpus(self, data_type, resource, data_file_path=None, save_train_dict=True, save_dict_as=None, ltc=None): """ data type can be csv or db # or new - single_ltc """ if data_type == 'db': documents = self.load_db_data(resource=resource) elif data_type == 'csv' and data_file_path is not None: documents = self.load_csv_data(data_file_path) elif data_type == 'single_ltc' and ltc is not None: ltc_text = " ".join(e if type(e) is str else "" for e in ltc) documents = self.load_single_ltc(ltc_text) else: documents = None print("{}documents is None. Exiting ... 
\n".format(ERROR_FLAG)) exit() with Pool() as pool: processed_docs = pool.imap(self._prepare_single_document, documents) pool.close() pool.join() processed_docs = (i for i in processed_docs if i is not None) self.dictionary = Dictionary(processed_docs) if save_train_dict and save_dict_as is None: self.dictionary.save(self.lda_dict_filename) else: self.dictionary.save(save_dict_as) corpus = [ self.dictionary.doc2bow(proc_doc) for proc_doc in processed_docs ] return corpus def _get_topic_by_id(self, topic_id): if self.lda_topic_word_count is not None: return self.lda_model.print_topic(topic_id, self.lda_topic_word_count) else: return self.lda_model.print_topic(topic_id, 6) def _get_topics(self, default_view=False, for_db=True): """ 2-tuples (probability * word) of most probable words in topics num_topics=-1 <--- to print all topics """ def _get_words(probabilities_words_string): _pre_topic_with_digits_trash = " ".join( re.findall(ALL_CHARS, probabilities_words_string)) probaply_clean_topic = re.sub(r'\b\d+(?:\.\d+)?\s+', "", _pre_topic_with_digits_trash) return probaply_clean_topic # " ".join(re.findall('[a-zA-Z]+', probabilities_words_string)) if default_view: return self.lda_model.print_topics(num_topics=-1) if for_db: resource_id = Resources.select().where( Resources.resource == self.resource_identifier_name).first() resource_id = resource_id.__data__['resource'] return [(elem[0], resource_id, _get_words(elem[1])) for elem in self.lda_model.print_topics( num_topics=self.lda_topics_count, num_words=self.lda_topic_word_count)] return [(elem[0], _get_words(elem[1])) for elem in self.lda_model.print_topics( num_topics=self.lda_topics_count, num_words=self.lda_topic_word_count)]