def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        du.getLogger().debug("Topic %d:" % (topic_idx))
        du.getLogger().debug(" ".join([
            feature_names[i]
            for i in topic.argsort()[:-no_top_words - 1:-1]
        ]))
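# A minimal usage sketch for display_topics, assuming any fitted sklearn
# decomposition model that exposes components_ plus a parallel sequence of
# feature names. The toy corpus below is hypothetical, not project data;
# get_feature_names_out requires sklearn >= 1.0.
def _demo_display_topics():
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.feature_extraction.text import CountVectorizer
    docs = ["apples and oranges", "oranges and pears", "cats and dogs"]
    vec = CountVectorizer()
    counts = vec.fit_transform(docs)
    lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(counts)
    display_topics(lda, vec.get_feature_names_out(), no_top_words=3)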
def subsetByCategory(X, y, targetIn):
    # Return the rows of X whose label in y equals targetIn.
    du.getLogger().debug("Original Set rows: " + str(X.shape[0]))
    rows = list()
    for n, target in enumerate(y):
        if target == targetIn:
            rows.append(n)
    array = np.asarray(rows)
    out = X[array, :]
    du.getLogger().debug("\n\nReturning SubSet rows: " + str(out.shape[0]))
    dataloader.infoX(out)
    return (out)
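# A minimal usage sketch for subsetByCategory, assuming X is a scipy CSR
# matrix and y is a parallel sequence of integer labels, and that this
# module's du/dataloader imports are configured. The demo matrix and labels
# are hypothetical.
def _demo_subsetByCategory():
    import numpy as np
    from scipy.sparse import csr_matrix
    X_demo = csr_matrix(np.array([[1, 0], [0, 2], [3, 0]]))
    y_demo = [0, 1, 1]
    # Returns rows 1 and 2 of X_demo (the rows labelled 1).
    return subsetByCategory(X_demo, y_demo, 1)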
def saveListAsExcel(rows, outputDirectory, fileName, outputColumnNames, targetName=""):
    outputDataframe = pd.DataFrame(rows, columns=outputColumnNames)
    # Here we convert our dataframe to Excel and then save it. Build the path
    # once so the log message matches the file actually written
    # (du.timeStamped() would otherwise be called twice and could differ).
    outputPath = outputDirectory + "/" + du.timeStamped() + targetName + "_" + fileName + '.xlsx'
    writer = ExcelWriter(outputPath)
    outputDataframe.to_excel(writer, '_Sheet1')
    du.getLogger().debug("Saving to " + outputPath)
    writer.save()
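# A minimal usage sketch for saveListAsExcel, assuming du.timeStamped()
# returns a filename-safe timestamp string. The rows, output directory, and
# file name are hypothetical; note this writes a real .xlsx file.
def _demo_saveListAsExcel():
    rows_demo = [["demoTarget", 1, 0.42, "term_a"],
                 ["demoTarget", 2, 0.17, "term_b"]]
    saveListAsExcel(rows_demo, ".", "demo_report",
                    ['Target', 'Cluster', 'Weight', 'Term'],
                    targetName="demoTarget")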
def infoX(X):
    # Log diagnostic details of a scipy sparse matrix X.
    logger = datautils.getLogger()
    logger.info("X.format " + str(X.format))
    logger.info("X.dtype " + str(X.dtype))
    logger.info("len(X.indices) " + str(len(X.indices)))
    logger.info("X.ndim " + str(X.ndim))
    logger.info("X.shape " + str(X.shape))
    logger.info("X[:, 0].shape " + str(X[:, 0].shape))
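# A minimal usage sketch for infoX on a tiny CSR matrix; the demo data is
# hypothetical and assumes this module's datautils logger is configured.
def _demo_infoX():
    import numpy as np
    from scipy.sparse import csr_matrix
    infoX(csr_matrix(np.array([[1, 0], [0, 2]])))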
def convertToGensimCorporaAndDictionary(X, columnMap):
    dct = gensim.corpora.Dictionary()
    datautils.getLogger().info("\n convertToGensimCorporaAndDictionary \n\n")
    # COO entries converted from a CSR/CSC matrix arrive in row-major order,
    # so each run of entries sharing a row index forms one document.
    cx = scipy.sparse.coo_matrix(X)
    corpora = []
    doc = []
    currentRow = 0
    for i, j, v in zip(cx.row, cx.col, cx.data):
        while i > currentRow:
            # Close the current document, emitting empty documents for
            # all-zero rows so documents stay aligned with matrix rows.
            corpora.append(doc)
            doc = []
            currentRow += 1
        # Repeat the term once per occurrence counted in the matrix.
        for x in range(0, int(v)):
            doc.append(datautils.getTermByIdx(columnMap, j + 1))
    corpora.append(doc)  # don't drop the final document
    dct.add_documents(corpora)
    common_corpus = [dct.doc2bow(text) for text in corpora]
    return (common_corpus, dct)
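# A minimal usage sketch for convertToGensimCorporaAndDictionary on a tiny
# count matrix. columnMap is passed through untouched because its concrete
# structure is defined by datautils.getTermByIdx elsewhere in this project;
# the 3x2 count matrix is hypothetical.
def _demo_convertToGensim(columnMap):
    import numpy as np
    from scipy.sparse import csr_matrix
    # Rows are documents, columns are terms, values are term counts.
    X_demo = csr_matrix(np.array([[2, 0], [0, 1], [1, 1]]))
    return convertToGensimCorporaAndDictionary(X_demo, columnMap)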
def kmeansBySubset(X, y, columnMap, targetMap, outputDirectory, dataFile,
                   clusterCount=20, maxIterations=300, init="k-means++",
                   n_init=10, precompute_distances='auto', algorithm='auto',
                   verbose=1, n_jobs=1, thresholdForReporting=0.05):
    for target in np.unique(y):
        targetName = du.getTargetByIdx(targetMap, target)
        du.getLogger().debug("\n\nSubset for " + str(targetName))
        subset = dataprocessing.subsetByCategory(X, y, target)
        dataloader.infoX(subset)
        kmeansplots.sparsityPlot(subset, targetName)
        kmeansForSubset = doKmeans(subset, clusterCount, maxIterations, init,
                                   n_init, precompute_distances, algorithm,
                                   verbose, n_jobs)
        # Report on the fitted model (not the raw subset), labelling each row
        # with the target name so the spreadsheet identifies the subset.
        reportTuple = filterAndReportResults(kmeansForSubset, columnMap,
                                             targetName, thresholdForReporting)
        dataprocessing.saveListAsExcel(reportTuple[0], outputDirectory,
                                       dataFile, reportTuple[1], targetName)
        kmeansplots.plotClusterCentroids(subset, kmeansForSubset, targetName)
def filterAndReportResultsLDA(model, cmap, n_top_words=10):
    listOfWordsByTopic = []
    for topic, comp in enumerate(model.components_):
        du.getLogger().debug("topic " + str(topic))
        du.getLogger().debug("comp " + str(comp))
        # Indices of the n_top_words highest-weighted terms for this topic.
        word_idx = np.argsort(comp)[::-1][:n_top_words]
        du.getLogger().debug(str(topic) + " word_idx " + str(word_idx))
        for i in word_idx:
            listOfWordsByTopic.append(
                [topic, du.getTermByIdx(cmap, (i + 1)), comp[i]])
    for i, (topic, term, value) in enumerate(listOfWordsByTopic):
        du.getLogger().debug("topic " + str(topic) + " term " + str(term)
                             + " value " + str(value))
    outputColumnNames = ["topic", "term", "lda_weight"]
    return ([listOfWordsByTopic, outputColumnNames])
def filterAndReportResults(kmeans, columnMap, target, thresholdForReporting=0.001):
    du.getLogger().debug(kmeans.cluster_centers_)
    outputColumnNames = ['Target', 'Cluster', 'Weight', 'Term']
    # This is the list of analysis results. We will add to it and then save
    # it as an Excel spreadsheet.
    listOfRows = []
    du.getLogger().debug(" K Means Parameter Values:")
    du.getLogger().debug(" inertia: " + str(kmeans.inertia_))
    du.getLogger().debug(" init: " + str(kmeans.init))
    du.getLogger().debug(" labels_: " + str(kmeans.labels_))
    du.getLogger().debug(" max_iter: " + str(kmeans.max_iter))
    du.getLogger().debug(" params: " + str(kmeans.get_params()))
    du.getLogger().debug(" tol: " + str(kmeans.tol))
    du.getLogger().debug(" n_init: " + str(kmeans.n_init))
    clusterIdx = 0
    for row in kmeans.cluster_centers_:
        clusterIdx += 1
        du.getLogger().debug("\n------\nCluster %d\n" % clusterIdx)
        colIdx = 1
        for weight in row:
            if weight > thresholdForReporting:
                # Print the term and its weighting to the console.
                du.getLogger().debug(str(weight) + " col: "
                                     + str(du.getTermByIdx(columnMap, colIdx)))
                # Record the term and its weighting. The caller converts this
                # list to a dataframe and saves it as Excel. In other words,
                # this is how the results get saved.
                listOfRows.append([target, clusterIdx, weight,
                                   du.getTermByIdx(columnMap, colIdx)])
            colIdx += 1
    return ([listOfRows, outputColumnNames])
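# A minimal sketch of the centroid-thresholding idea behind
# filterAndReportResults: fit a small KMeans and keep only centroid weights
# above a cutoff. The toy data and threshold are hypothetical, and columnMap
# handling is omitted because its structure lives in du.getTermByIdx.
def _demo_centroid_threshold():
    import numpy as np
    from sklearn.cluster import KMeans
    X_demo = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0], [0.1, 0.9]])
    km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X_demo)
    for clusterIdx, row in enumerate(km.cluster_centers_, start=1):
        for colIdx, weight in enumerate(row, start=1):
            if weight > 0.05:
                print(clusterIdx, colIdx, weight)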
def print_topics(model, feature_names, n_top_words=10):
    for idx, topic in enumerate(model.components_):
        du.getLogger().debug("Topic %d:" % (idx))
        du.getLogger().debug([(feature_names[i], round(topic[i], 2))
                              for i in topic.argsort()[:-n_top_words - 1:-1]])
def reportResults1(best_lda_model, X, y, columnMap, n_top_words=10):
    lda_output = best_lda_model.transform(X)
    topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
    docnames = ["Doc" + str(i) for i in range(len(y))]
    du.getLogger().debug(topicnames)
    du.getLogger().debug(docnames)
    df_document_topic = pd.DataFrame(np.round(lda_output, 2),
                                     columns=topicnames, index=docnames)
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic
    # Show the document*topic table with a column called dominant_topic.
    du.getLogger().debug(df_document_topic.head(15))
    df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
    df_topic_distribution.columns = ['Topic Num', 'Num Documents']
    du.getLogger().debug(df_topic_distribution)
    df_topic_keywords = pd.DataFrame(best_lda_model.components_)
    df_topic_keywords.columns = columnMap['Term']
    df_topic_keywords.index = topicnames
    du.getLogger().debug(df_topic_keywords.head())
    # Show topics with words, but without weights, via the module-level
    # display_topics defined above.
    du.getLogger().debug("LDA Model:")
    display_topics(best_lda_model, columnMap['Term'], 20)
    du.getLogger().debug("=" * 40)
    # Show topics with words and weights in tuples.
    # https://nlpforhackers.io/topic-modeling/
    du.getLogger().debug("LDA Model:")
    print_topics(best_lda_model, columnMap['Term'], n_top_words)
    du.getLogger().debug("=" * 40)
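# A minimal sketch of the document*topic table built by reportResults1:
# round the LDA transform, then take the per-row argmax as dominant_topic.
# The toy corpus below is hypothetical.
def _demo_document_topic_table():
    import numpy as np
    import pandas as pd
    from sklearn.decomposition import LatentDirichletAllocation
    from sklearn.feature_extraction.text import CountVectorizer
    docs = ["apples and oranges", "oranges and pears", "cats and dogs"]
    counts = CountVectorizer().fit_transform(docs)
    lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(counts)
    df = pd.DataFrame(np.round(lda.transform(counts), 2),
                      columns=["Topic0", "Topic1"],
                      index=["Doc0", "Doc1", "Doc2"])
    df["dominant_topic"] = np.argmax(df.values, axis=1)
    return df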