def plotMostFrequentWords(base):
    vectorizer = CountVectorizer(lowercase=False)
    docs = vectorizer.fit_transform(base['Tweet'])
    features = vectorizer.get_feature_names()
    visualizer = FreqDistVisualizer(features=features)
    visualizer.fit(docs)
    visualizer.poof()
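# Hedged usage sketch (not part of the original snippet): one way to call
# plotMostFrequentWords above on a toy DataFrame. The imports and the toy
# 'Tweet' data are assumptions; sklearn/yellowbrick versions that still expose
# get_feature_names() and poof() are assumed throughout.
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer

toy = pd.DataFrame({'Tweet': ["great game tonight", "great weather", "bad game"]})
plotMostFrequentWords(toy)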
def plot_most_frequent_tokens(self, df, column_name):
    count_vectorizer = CountVectorizer()
    tf_original = count_vectorizer.fit_transform(df[column_name])
    tf_feature_names = count_vectorizer.get_feature_names()
    visualizer = FreqDistVisualizer(features=tf_feature_names, orient='v')
    visualizer.fit(tf_original)
    visualizer.show()
def tf(self, df, no_features=1000):  # no_features was undefined in the original; exposed here as a parameter
    tf_vectorizer = CountVectorizer(min_df=0.01, max_df=0.85,
                                    max_features=no_features,
                                    ngram_range=(2, 3))
    dtm_tf = tf_vectorizer.fit_transform(df['descriptions'])
    print("dtm:", dtm_tf.shape)
    df = pd.DataFrame(dtm_tf.toarray(), columns=tf_vectorizer.get_feature_names())
    print(df.head())

    # Show top tokens: calculate column sums from the DTM
    sum_words = dtm_tf.sum(axis=0)
    words_freq = [(word, sum_words[0, idx])
                  for word, idx in tf_vectorizer.vocabulary_.items()]
    # Sort by frequency, descending
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    # Display the top few
    print(words_freq[:20])

    # Visualize frequency of the top 25 tokens
    plt.figure(figsize=(5, 8))
    visualizer = FreqDistVisualizer(features=tf_vectorizer.get_feature_names(), n=25)
    visualizer.fit(dtm_tf)
    visualizer.poof()
    return (df, sum_words, words_freq)
def countvect_test_simple(X_train, X_test, y_train, y_test, token_izer):
    if token_izer == '1':
        countvect = CountVectorizer(tokenizer=tokenizer_preproccessor)
    elif token_izer == '2':
        countvect = CountVectorizer(tokenizer=tokenizer_preproccessor_imdb)
    else:
        countvect = CountVectorizer()
    # Fit the CountVectorizer
    countvect.fit(X_train)
    # Transform into sparse document-term matrices
    X_train_dtm = countvect.transform(X_train)
    X_test_dtm = countvect.transform(X_test)
    # Build a Multinomial Naive Bayes model
    nb = MultinomialNB()
    # Evaluate with 10-fold cross-validation on the training DTM
    cvec_score = cross_val_score(nb, X_train_dtm, y_train, cv=10)
    feature_names = countvect.get_feature_names()
    print("Number of features: {}".format(len(feature_names)))
    print("Accuracy of CountVectorizer with NB: {}".format(cvec_score.mean()))
    visualizer = FreqDistVisualizer(features=feature_names, orient='h')
    visualizer.fit(X_train_dtm)
    visualizer.poof()
    return cvec_score.mean()
def freqdist():
    corpus = load_hobbies()
    vecs = CountVectorizer()
    docs = vecs.fit_transform(corpus.data)
    oz = FreqDistVisualizer(features=vecs.get_feature_names(), ax=newfig())
    oz.fit(docs)
    savefig(oz, "freqdist")
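# Hedged sketch (assumption): newfig and savefig above are gallery-style
# helpers that this snippet does not define. A minimal stand-in compatible
# with the calls made here could look like the following; the figure size and
# output path are illustrative choices, not from the original source.
import matplotlib.pyplot as plt

def newfig():
    # Return a fresh axes for the visualizer to draw on
    _, ax = plt.subplots(figsize=(9, 6))
    return ax

def savefig(viz, name):
    # Finalize the visualizer and write the plot to disk
    viz.show(outpath="images/{}.png".format(name))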
def freqdist(ax, stopwords=None):
    from sklearn.feature_extraction.text import CountVectorizer
    from yellowbrick.text import FreqDistVisualizer

    X, y = load_data("hobbies", text=True)
    freq = CountVectorizer(input='filename', stop_words=stopwords)
    X = freq.fit_transform(X)

    title = "Frequency Distribution of Top 50 Tokens in a Corpus"
    if stopwords:
        title += " (Without Stopwords)"

    visualizer = FreqDistVisualizer(ax=ax)
    visualizer.title = title
    visualizer.fit(X, freq.get_feature_names())
    return visualizer
def bagOfWords(featureTrain, stopWords=False, countWords=False, plot=False):
    if not stopWords:
        count_vect = CountVectorizer()
    else:
        count_vect = CountVectorizer(stop_words='english')
    X_train_counts = count_vect.fit_transform(featureTrain.headline)
    if countWords:
        features = count_vect.get_feature_names()
        visualizer = FreqDistVisualizer(features=features, n=20, orient='v')
        visualizer.fit(X_train_counts)
        words = countTopWords(X_train_counts, count_vect, 20)
        if stopWords:
            visualizer.show(outpath="SWRemovedYB")
            visualizer.show()
            plotBagOfWords("Stop Words Removed", words, 20, stopWords)
        else:
            visualizer.show(outpath="SWIncludedYB")
            visualizer.show()
            plotBagOfWords("Stop Words Included", words, 20, stopWords)
    return count_vect, X_train_counts
def tfidf_test_simple(X_train, X_test, y_train, y_test, token_izer):
    if token_izer == '1':
        tfvect = TfidfVectorizer(tokenizer=tokenizer_preproccessor)
    elif token_izer == '2':
        tfvect = TfidfVectorizer(tokenizer=tokenizer_preproccessor_imdb)
    else:
        tfvect = TfidfVectorizer()
    tfidf_train = tfvect.fit_transform(X_train)
    tfidf_test = tfvect.transform(X_test)
    nb = MultinomialNB()
    # Evaluate the model with 4-fold cross-validation on the training matrix
    cvec_score = cross_val_score(nb, tfidf_train, y_train, cv=4)
    feature_names = tfvect.get_feature_names()
    print("Number of features: {}".format(len(feature_names)))
    print("Accuracy of TFIDF: {}".format(cvec_score.mean()))
    # orient must be 'h' or 'v'; the original passed the invalid value '10'
    visualizer = FreqDistVisualizer(features=feature_names, orient='h')
    visualizer.fit(tfidf_train)
    visualizer.poof()
    return cvec_score.mean()
sentiment_occurences = df_grouped_sentiments.sentiment_occurences.median()
ax = sentiment_occurences.plot(kind='bar')
plt.title('Sentimental Word Occurences')
plt.show()

sentiment_score = abs(df_grouped_sentiments.sentiment_score.median())
ax = sentiment_score.plot(kind='bar')
plt.title('Sentiment Score')
plt.show()

# Term frequency charts
neg_features = neg_count_vect.get_feature_names()
visualizer = FreqDistVisualizer(features=neg_features)
visualizer.fit(neg_bag_words)
visualizer.poof()

pos_features = pos_count_vect.get_feature_names()
visualizer = FreqDistVisualizer(features=pos_features)
visualizer.fit(pos_bag_words)
visualizer.poof()

adv_bi_features = adv_bi_count_vect.get_feature_names()
visualizer = FreqDistVisualizer(features=adv_bi_features)
visualizer.fit(adv_bi_bag_words)
visualizer.poof()

tfidf_features = tfidf_vect.get_feature_names()
visualizer = FreqDistVisualizer(features=tfidf_features)
visualizer.fit(tfidf_bag_words)
visualizer.poof()  # added to render the final chart, matching the pattern above
def visualised(a, b):
    visualizer = FreqDistVisualizer(features=b.get_feature_names(), n=10)
    visualizer.fit(a)
    visualizer.poof()
t = time.time()
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=500000)
vectorizer.fit(X_train)
print("Vectoriser fitted")
print("No. of feature_words: ", len(vectorizer.get_feature_names()))
print('Time Taken: ', round(time.time() - t), 'seconds')

X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)
print("Data transformed")
print()

set_palette('sns_pastel')
visualizer = FreqDistVisualizer(features=vectorizer.get_feature_names())
visualizer.fit(X_train)
visualizer.show()


# evaluate model
def model_evaluate(model):
    # predict values for the test data-set
    y_pred = model.predict(X_test)
    # print the evaluation metrics for the dataset
    print(classification_report(y_test, y_pred))
    # compute and plot the confusion matrix
    cf_matrix = confusion_matrix(y_test, y_pred)
    categories = ['Negative', 'Positive']
    group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
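    # Hedged completion (assumption): the snippet breaks off here. A common
    # way to finish this confusion-matrix plot uses a seaborn heatmap with the
    # group_names/categories defined above; numpy and seaborn are assumed
    # imports, and the label layout is illustrative.
    import numpy as np
    import seaborn as sns
    group_percentages = ['{0:.2%}'.format(v)
                         for v in cf_matrix.flatten() / np.sum(cf_matrix)]
    labels = ['{}\n{}'.format(n, p)
              for n, p in zip(group_names, group_percentages)]
    labels = np.asarray(labels).reshape(2, 2)
    sns.heatmap(cf_matrix, annot=labels, fmt='',
                xticklabels=categories, yticklabels=categories)
    plt.xlabel("Predicted values")
    plt.ylabel("Actual values")
    plt.show()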
# Display top 50
words_freq[:2000]
word_lst = words_freq[:50]

# Convert the word frequency list above to a dataframe
text = pd.DataFrame(list(word_lst), columns=["Word", "Frequency"])
print(text)

# Export the word frequency dataframe above to a csv file
text.to_csv('text2.csv', index=False)

# Visualize the word frequency in a bar chart
plt.figure(figsize=(5, 8))
visualizer = FreqDistVisualizer(features=tf_vectorizer.get_feature_names(), n=25)
visualizer.fit(dtm_tf)
visualizer.poof()

# Create the Word Cloud
# Start with one review: generate one string of words based on their frequency
# per the pandas dataframe 'text' above. Each word in this string is separated
# by a space.
text_cloud = " "
q = 0
for wd in text['Word']:
    text_unit = (wd + " ") * text['Frequency'][q]
    text_cloud = text_cloud + text_unit
    q += 1


# Check whether the 'text_cloud' string contains the correct word frequencies
def count(word, array):
def doVisualizer(featNames, vector, numTerms=10):
    visualizer = FreqDistVisualizer(features=featNames, n=numTerms)
    visualizer.fit(vector)
    visualizer.poof()
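# Hedged usage sketch (not from the original source): doVisualizer expects the
# vectorizer's feature names and a fitted document-term matrix. The imports
# and the toy corpus below are illustrative assumptions.
from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer

corpus = ["the cat sat", "the dog sat", "the cat ran"]
vec = CountVectorizer()
dtm = vec.fit_transform(corpus)
doVisualizer(vec.get_feature_names(), dtm, numTerms=5)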
# Split data into training, test and validation set (60:20:20)
X = dfClean['text']
y = dataset['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,
                                                    random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  test_size=0.20,
                                                  random_state=42)

# Draw the frequency plot and cut off the long tail
cv = CountVectorizer(ngram_range=(1, 2), max_df=0.95, min_df=0.05,
                     stop_words='english')
X_train = list(X_train)
X_traincv = cv.fit_transform(X_train)
print(X_traincv.toarray())

# The original wrapped the block below in `for i in X_traincv:`, which refits
# the same visualizer once per row; a single pass is sufficient.
features = cv.get_feature_names()
visualizer = FreqDistVisualizer(features=features)
visualizer.fit(X_traincv)
visualizer.poof()
"""
out = []
for i in range(len(X)):
    out.append('_label_ ' + str(y[i]) + ' ' + X[i].replace("\n", ""))
out = pd.DataFrame(out)
#out.to_csv('/home/nanokoper/Pulpit/ISA/out.csv',sep='\t',index=False,header=False)
# Split data into training, test and validation set (60:20:20)
"""
popTerm = []
pt = []
cloud = {}
freq = []       # assumed initialization; appended to below but not defined in this snippet
term_list = []  # assumed initialization; appended to below but not defined in this snippet
for i, v in enumerate(total_tf):
    if v > 35:
        freq.append(v)
        popTerm.append(i)
        term_list.append(terms[i])
        cloud[terms[i]] = v
        pt.append(tf_model.transpose().toarray()[i])
print(freq)
print(term_list)

# Word cloud
visualizer = FreqDistVisualizer(features=terms, orient='v')
visualizer.fit(tf_model)
visualizer.show()

wordcloud = WordCloud(normalize_plurals=False).generate_from_frequencies(cloud)
plt.imshow(wordcloud)
plt.axis("off")
plt.savefig('word_cloud.png', dpi=200)
plt.show()

km = KMeans(n_clusters=2)
km = km.fit(tf_model.transpose())
print(tf_model.shape)
clusters = km.labels_.tolist()
color = ['#d95f02' if x == 0 else '#7570b3' for x in clusters]
figure, ax = plt.subplots(figsize=(20, 15))
    # remove numbers
    df[new_text_field_name] = df[new_text_field_name].apply(
        lambda elem: re.sub(r"\d+", "", elem))
    return df


data_clean = clean_text(train_data, 'text', 'text')

# Remove stop words
data_clean['text'] = data_clean['text'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

vectorizer = CountVectorizer()
docs = vectorizer.fit_transform(data_clean['text'])
features = vectorizer.get_feature_names()
visualizer = FreqDistVisualizer(features=features, orient='v')
visualizer.fit(docs)
visualizer.show()

disaster_tweets = data_clean[data_clean['target'] == 1]
vectorizer = CountVectorizer()
docs = vectorizer.fit_transform(disaster_tweets['text'])
features_disaster = vectorizer.get_feature_names()
visualizer_disaster = FreqDistVisualizer(features=features_disaster, orient='v')
visualizer_disaster.fit(docs)
visualizer_disaster.show()

### Logistic regression
X_train, X_test, y_train, y_test = train_test_split(data_clean['text'],
                                                    data_clean['target'],
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)
print(x_train.shape)
print(x_test.shape)
print('\n')
print("________________________________ Text preparation ___________________")

# Convert the NLP text into vectors using CountVectorizer
print("_____________________ CountVectorizer ___________________________")
con_vec = CountVectorizer(stop_words=stopwords.words('english'))
x_train_count = con_vec.fit_transform(x_train)
#print(x_train_count)

# Token frequency distribution
feature = con_vec.get_feature_names()
visualizer = FreqDistVisualizer(features=feature, orient='v')
visualizer.fit(x_train_count)
visualizer.show()

# Compute tf-idf weights from the raw counts with TfidfTransformer
print("--------------------- TfidfTransformer ---------------------")
tfidftransformer = TfidfTransformer()
x_train_tfidf = tfidftransformer.fit_transform(x_train_count)
print(x_train_tfidf.shape)

# Transform text into a meaningful numeric representation with TfidfVectorizer
print("------------------------- TfidfVectorizer ---------------------------")
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(x_train)
print(x_train_tfidf)
print('\n')
def dataQualityFn(df, dataTypeDictionary, requiredColumnList, outputDirectory):
    directory = outputDirectory + 'DataQuality/'
    import os
    if not os.path.exists(directory):
        os.makedirs(directory)
    outputDirectory = directory
    logging.basicConfig(filename=outputDirectory + 'appDataQuality.log',
                        filemode='w',
                        format='PROCESS INFO-%(message)s',
                        level=logging.CRITICAL)

    # Function to print & log process information
    def printAndLogInfo(customMessage, exceptionMessage=None):
        print(customMessage)
        logging.critical(customMessage)
        if exceptionMessage:
            print(str(exceptionMessage))
            logging.critical(exceptionMessage)

    # Function to print & log errors
    def printAndLogError(customMessage, exceptionMessage=None):
        print('ERROR!!! ' + customMessage)
        logging.critical(customMessage)
        if exceptionMessage:
            print(str(exceptionMessage))
            logging.critical(exceptionMessage)
        time.sleep(10)
        sys.exit()

    agentAssignedColumn = requiredColumnList['agentAssignedColumn']
    commentTextColumn = requiredColumnList['commentTextColumn']

    duplicatesCount = {}
    for col in df.columns.to_list():
        duplicatesCount[col] = [
            ((df.duplicated(col).sum() / len(df)) * 100),
            100 - ((df.duplicated(col).sum() / len(df)) * 100)
        ]

    nullCounter = {}
    for col in df.columns.to_list():
        count = 0
        for cell in df[str(col)]:
            if cell == '?' or cell == "":  # or len(str(cell))==1
                count = count + 1
        nullCounter[col] = [
            float(count / len(df)) * 100,
            100 - float(count / len(df)) * 100
        ]

    def dataQualityCheck(checkName, columnName):
        if checkName == "Null Values":
            # create data
            names = 'Null Values', 'Non Null Values',
            size = np.array(nullCounter[columnName])
            printAndLogInfo("Null Values Data Quality Check for " + str(columnName))

            def absolute_value(val):
                a = np.round(val / 100. * size.sum(), 0)
                return a

            # Create a circle for the center of the plot
            my_circle = plt.Circle((0, 0), 0.7, color='white')
            # Custom colors --> colors will cycle
            plt.pie(size, labels=names, colors=['red', 'green'],
                    autopct=absolute_value)
            plt.title(checkName + " Pie of " + columnName, fontdict=None, loc='center')
            p = plt.gcf()
            p.gca().add_artist(my_circle)
            plt.savefig(outputDirectory + checkName + columnName + '.png',
                        bbox_inches='tight')
            plt.close()
        elif checkName == "Duplicates":
            # create data
            names = 'Duplicate Values', 'Unique Values',
            size = np.array(duplicatesCount[columnName])
            printAndLogInfo("Duplicate Value Data Quality check for " + str(columnName))

            def absolute_value(val):
                a = np.round(val / 100. * size.sum(), 0)
                return a

            # Create a circle for the center of the plot
            my_circle = plt.Circle((0, 0), 0.7, color='white')
            # Custom colors --> colors will cycle
            plt.pie(size, labels=names, colors=['red', 'green'],
                    autopct=absolute_value)
            plt.title(checkName + " Pie of " + columnName, fontdict=None, loc='center')
            p = plt.gcf()
            p.gca().add_artist(my_circle)
            plt.savefig(outputDirectory + checkName + columnName + '.png',
                        bbox_inches='tight')
            plt.close()
        elif checkName == "Details":
            printAndLogInfo("Details of the Column: \n ")
            printAndLogInfo("Original Datatype should be " +
                            dataTypeDictionary[columnName] + "\n")
            printAndLogInfo("Datatype in the data is " +
                            str(df[str(columnName)].dtypes) + "\n")
        elif checkName == "Range":
            if str(df[str(columnName)].dtypes) == 'int64' or str(
                    df[str(columnName)].dtypes) == 'datetime64[ns]':
                printAndLogInfo("Maximum Value is " + str(df[str(columnName)].max()) + " \n ")
                printAndLogInfo("Minimum Value is " + str(df[str(columnName)].min()))
            else:
                printAndLogInfo(
                    "Since the Datatype of column " + str(columnName) +
                    " is not numeric in the given data, Range cannot be calculated.")

    def dQexecute(columnName):
        printAndLogInfo("\n Name of the Column " + str(columnName) + "\n \n")
        dataQualityCheck("Details", columnName)
        dataQualityCheck("Null Values", columnName)
        dataQualityCheck("Duplicates", columnName)
        dataQualityCheck("Range", columnName)
        printAndLogInfo("*****************")

    for col in df.columns.to_list():
        dQexecute(col)

    # Agent Assigned Topic Distribution Analysis
    uniqueTopics = list(df[agentAssignedColumn].unique())
    fig = plt.figure(figsize=(12, 5))
    df.groupby(agentAssignedColumn)[commentTextColumn].count().plot.bar(ylim=0)
    plt.savefig(outputDirectory + 'labelDistribution.png', bbox_inches='tight')
    plt.close()

    df['totalwords'] = df[commentTextColumn].str.split().str.len()

    def reasonCodeLevelWordCount(reasonCode, parameter):
        dfReasonCodeSubset = df[df[agentAssignedColumn] == reasonCode]
        if parameter == 'mean':
            return float(dfReasonCodeSubset.describe()['totalwords'][1])
        elif parameter == 'median':
            return float(dfReasonCodeSubset.describe()['totalwords'][5])

    # Mean Word Count
    reasonCodeDict = {}
    for topic in uniqueTopics:
        reasonCodeDict[str(topic)] = float(reasonCodeLevelWordCount(topic, 'mean'))
    plt.figure(figsize=(20, 20))
    plt.title("Mean Word Frequency for each Topic", fontdict=None, loc='center')
    plt.bar(reasonCodeDict.keys(), reasonCodeDict.values(), width=0.1, color='g')
    plt.savefig(outputDirectory + 'meanBarGraph.png', bbox_inches='tight')
    plt.close()
    printAndLogInfo("\n\n ******************** \n\n ")

    # Median Word Count (Optional)
    reasonCodeDict = {}
    for topic in uniqueTopics:
        reasonCodeDict[str(topic)] = float(reasonCodeLevelWordCount(topic, 'median'))
    plt.figure(figsize=(20, 20))
    plt.title("Median Word Frequency for each Topic", fontdict=None, loc='center')
    plt.bar(reasonCodeDict.keys(), reasonCodeDict.values(), width=0.1, color='g')
    plt.savefig(outputDirectory + 'medianBarGraph.png', bbox_inches='tight')
    plt.close()

    # Visualize Token (vocabulary) Frequency Distribution Before Text Preprocessing
    vectorizer = CountVectorizer()
    docs = vectorizer.fit_transform(df[commentTextColumn])
    features = vectorizer.get_feature_names()
    plt.figure(figsize=(12, 8))
    plt.title("FrequencyDistribution of words before Preprocessing",
              fontdict=None, loc='center')
    visualizer = FreqDistVisualizer(features=features)
    visualizer.fit(docs)
    for label in visualizer.ax.texts:
        label.set_size(20)
    # visualizer.poof()
    plt.savefig(outputDirectory + 'FrequencyDistributionBeforePreprocessing.png',
                bbox_inches='tight')
    plt.close()

    # Preparation of Data Quality Report
    from fpdf import FPDF
    pdf = FPDF()
    # imagelist is the list with all image filenames
    filelist = os.listdir(outputDirectory)
    for fichier in filelist[:]:  # filelist[:] makes a copy of filelist.
        if not fichier.endswith(".png"):
            filelist.remove(fichier)
    imagelist = filelist
    for image in imagelist:
        pdf.add_page()
        pdf.image(outputDirectory + image, 40, 20, 100, 80)
    pdf.output(outputDirectory + "DataQualityReport.pdf", "F")
    printAndLogInfo("Detailed Report on Data Quality is saved in the location: " +
                    outputDirectory)
    return df
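# Hedged usage sketch (not from the original source): dataQualityFn expects a
# DataFrame, a column-to-dtype mapping, a dict naming the key columns, and an
# output directory. All column names and values below are illustrative
# assumptions, and the module-level imports the function relies on (logging,
# numpy as np, matplotlib.pyplot as plt, CountVectorizer, FreqDistVisualizer)
# are assumed to be in place.
import pandas as pd

sample = pd.DataFrame({
    'ticket_id': [1, 2, 3],
    'agent_topic': ['billing', 'billing', 'shipping'],
    'comment': ['customer asked about invoice', 'refund requested', 'package lost'],
})
dataTypeDictionary = {'ticket_id': 'int64', 'agent_topic': 'object',
                      'comment': 'object'}
requiredColumnList = {'agentAssignedColumn': 'agent_topic',
                      'commentTextColumn': 'comment'}
df_checked = dataQualityFn(sample, dataTypeDictionary, requiredColumnList,
                           'output/')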
        if (len(valor) > 1):
            #print(len(valor))
            colecao.append(valor)
    infile.close()
    return colecao


def SaveColecao(colecao, file):
    with open(file, encoding='utf-8', mode="w+") as file:
        #writer = csv.writer(file, delimiter="")
        for i in colecao:
            #linha = (filename, paginas, ano, titulo, label, pA, pB)
            #print(linha)
            file.writelines(i + '\n')
            #writer.writerow(i)
            #doc.clear()
        file.close()


#SaveColecao(all_documents, 'usiel.txt')
all_documents = LoadArquivo('usiel.txt')

from yellowbrick.text import FreqDistVisualizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
docs = vectorizer.fit_transform(all_documents)
features = vectorizer.get_feature_names()
visualizer = FreqDistVisualizer(features=features)
visualizer.fit(docs)
visualizer.poof()
def textPreprocessingFn(df, requiredColumnList, outputDirectory):
    directory = outputDirectory + 'TextPreprocessing/'
    if not os.path.exists(directory):
        os.makedirs(directory)
    outputDirectory = directory
    logging.basicConfig(filename=outputDirectory + 'appTextPreprocessing.log',
                        filemode='w',
                        format='PROCESS INFO-%(message)s',
                        level=logging.CRITICAL)

    # Function to print & log process information
    def printAndLogInfo(customMessage, exceptionMessage=None):
        print(customMessage)
        logging.critical(customMessage)
        if exceptionMessage:
            print(str(exceptionMessage))
            logging.critical(exceptionMessage)

    # Function to print & log errors
    def printAndLogError(customMessage, exceptionMessage=None):
        print('ERROR!!! ' + customMessage)
        logging.critical(customMessage)
        if exceptionMessage:
            print(str(exceptionMessage))
            logging.critical(exceptionMessage)
        time.sleep(10)
        sys.exit()

    commentTextColumn = requiredColumnList['commentTextColumn']
    primaryKeyColumn = requiredColumnList['primaryKeyColumn']
    agentAssignedColumn = requiredColumnList['agentAssignedColumn']

    abbrevationDictionary = {
        'Cus': 'customer',
        'cus': 'customer',
        'Xferred': 'transferred',
        'xferred': 'transferred'
    }

    # Function to standardize text
    def objectStandardization(input_text):
        words = str(input_text).split()
        new_words = []
        for word in words:
            word = re.sub(r'[^A-Za-z0-9\s]+', ' ', word)  # remove special characters
            if word.lower() in abbrevationDictionary:
                word = abbrevationDictionary[word.lower()]
            new_words.append(word)
        new_text = " ".join(new_words)
        return new_text

    df[commentTextColumn] = df[commentTextColumn].apply(objectStandardization)

    # Function to extract names of persons, organizations, locations, products etc. from the dataset
    def entityCollector(df):
        listOfNames = []
        for index, row in df.iterrows():
            doc = nlp(row[str(commentTextColumn)])
            fil = [(i.label_.lower(), i) for i in doc.ents
                   if i.label_.lower() in ["person", "gpe", "product"]
                   ]  # Extracts Person Names, Organization Names, Locations, Product names
            if fil:
                listOfNames.append(fil)
            else:
                continue
        flat_list = [item for sublist in listOfNames for item in sublist]
        entityDict = {}
        for a, b in list(set(flat_list)):
            entityDict.setdefault(a, []).append(b)
        return entityDict

    entityDict = entityCollector(df)
    printAndLogInfo("\n Types of entities present in the data are: " +
                    ", ".join(list(entityDict.keys())) + " \n")
    for entity in list(entityDict.keys()):
        entityDict[entity] = [str(i) for i in entityDict[entity]]

    ignoreWords = []
    for key in entityDict.keys():
        ignoreWords.append(entityDict[key])
    ignoreWords = [item for sublist in ignoreWords for item in sublist]
    printAndLogInfo("Number of words in Custom Stopword list = " + str(len(ignoreWords)))

    def languageDistribution(df):
        nlp = spacy.load("en")
        nlp.add_pipe(LanguageDetector(), name="language_detector", last=True)
        df['language'] = ''
        language = []
        for index, row in df.iterrows():
            text = row[str(commentTextColumn)]
            doc = nlp(text)
            language.append(str(doc._.language['language']))
        df['language'] = language
        return df

    df = languageDistribution(df)
    langDict = df.groupby('language')[str(primaryKeyColumn)].nunique().to_dict()
    otherLanguagesList = list(langDict.keys()).remove('en')  # note: list.remove returns None
    printAndLogInfo("Some sample other language texts: \n")
    for lang in list(langDict.keys()):
        printAndLogInfo(str(df[df['language'] == str(lang)].values.tolist()[0]))
    # Dropping only the rows with Spanish text
    df = df.drop(df[df['language'] == 'es'].index)

    # Function to extract alphanumeric words
    def alphanumericExtractor(input_text):
        words = str(input_text).split()
        alphanumericWordlist = []
        for word in words:
            word = re.sub(r'[^A-Za-z0-9\s]+', '', word.lower())  # remove special characters
            word = re.sub(r'[^\x00-\x7F]+', ' ', word)  # remove non-ascii
            if not word.isdigit() and any(ch.isdigit() for ch in word):
                alphanumericWordlist.append(word)
            else:
                continue
        return alphanumericWordlist

    # Function to get the frequency of alphanumeric words in the data
    def alphanumericFrequency(df, commentTextColumnName):
        alphanumericWordsList = []
        for index, row in df.iterrows():
            if alphanumericExtractor(row[str(commentTextColumnName)]):
                alphanumericWordsList.append(
                    alphanumericExtractor(row[str(commentTextColumnName)]))
            else:
                continue
        flat_list = [item for sublist in alphanumericWordsList for item in sublist]
        counts = Counter(flat_list)
        countsdict = dict(counts)
        return countsdict

    # Final list of alphanumeric words
    alphanumericWordFreqDict = alphanumericFrequency(df, commentTextColumn)

    # Plot the distribution of the top alphanumeric words
    totalWordcount = len(alphanumericWordFreqDict)
    if totalWordcount < 10:
        topWordCount = totalWordcount
    else:
        topWordCount = 10
    alphanumericWordFreqDictTop = dict(
        sorted(alphanumericWordFreqDict.items(),
               key=operator.itemgetter(1),
               reverse=True)[:int(topWordCount)])
    printAndLogInfo(alphanumericWordFreqDictTop)
    plt.figure(figsize=(20, 20))
    plt.title('Frequency of AlphaNumeric Words in the Dataset',
              fontdict=None, loc='center')
    plt.bar(alphanumericWordFreqDictTop.keys(),
            alphanumericWordFreqDictTop.values(),
            width=0.1, color='b')
    plt.savefig(outputDirectory + 'topAlphanumericWords.png', bbox_inches='tight')

    # Updating custom stopword list with alphanumeric words
    ignoreWords = ignoreWords + list(alphanumericWordFreqDict.keys())

    def clean_text(newDesc):
        newDesc = re.sub(r'[^A-Za-z\s]+', '', newDesc)  # remove special characters
        newDesc = re.sub(r'[^\x00-\x7F]+', '', newDesc)  # remove non-ascii
        newDesc = ' '.join([w for w in newDesc.split() if len(w) > 1])
        newDesc = newDesc.split()
        cleanDesc = [str(w) for w in newDesc
                     if w not in ignoreWords]  # remove entity names, alphanumeric words
        return ' '.join(cleanDesc)

    df[commentTextColumn] = df[commentTextColumn].apply(clean_text)

    def textAutocorrect(df, columnName):
        df[str(columnName)] = df[str(columnName)].apply(
            lambda txt: ''.join(TextBlob(txt).correct()))
        return True

    textAutocorrect(df, commentTextColumn)

    stops = nlp.Defaults.stop_words
    default_stopwords = stopwords.words('english')
    customStopWords = {'PRON', 'pron'}
    stops.update(set(default_stopwords))
    stops.update(set(customStopWords))

    def normalize(comment, lowercase, remove_stopwords):
        if lowercase:
            comment = comment.lower()
        comment = nlp(comment)
        lemmatized = list()
        for word in comment:
            lemma = word.lemma_.strip()
            if lemma:
                if not remove_stopwords or (remove_stopwords and lemma not in stops):
                    lemmatized.append(lemma)
        normalizedSentence = " ".join(lemmatized)
        normalizedSentence = re.sub(r'[^A-Za-z\s]+', '',
                                    normalizedSentence)  # remove special characters
        normalizedSentence = normalizedSentence.split()
        cleanDesc = [str(w) for w in normalizedSentence if w not in stops]  # remove PRON
        return " ".join(cleanDesc)

    df[commentTextColumn] = df[commentTextColumn].apply(normalize,
                                                        lowercase=True,
                                                        remove_stopwords=True)

    # Removing null comments
    def removeNullValueCommentText(df, columnName):
        initialLength = len(df)
        df = df[pd.notnull(df[columnName])]
        finalLength = len(df)
        printAndLogInfo("\n Number of rows with Null Value in the column '" +
                        str(columnName) + "' are: " +
                        str(initialLength - finalLength))
        return df

    df = removeNullValueCommentText(df, commentTextColumn)

    # Removing duplicate comments, keeping the first one
    def removeDuplicateComments(df, columnName, agentAssignedColumn):
        initialDf = df.copy()
        initialLength = len(initialDf)
        finalDf = df.drop_duplicates(subset=[columnName], keep='first')
        finalLength = len(finalDf)
        printAndLogInfo("\n Number of rows with duplicate comments in the column '" +
                        str(columnName) + "' are: " +
                        str(initialLength - finalLength))
        printAndLogInfo("\n The Level 3 Reason Codes for the dropped rows are given below: \n")
        droppedDF = initialDf[~initialDf.apply(tuple, 1).isin(finalDf.apply(tuple, 1))]
        printAndLogInfo(droppedDF[agentAssignedColumn].value_counts())
        return finalDf, droppedDF

    df, droppedDF = removeDuplicateComments(df, commentTextColumn, agentAssignedColumn)

    # Removing comments with just one word (like #CALL?)
    def removingShortComments(df, columnName, agentAssignedColumn, numberOfWords=1):
        initialDf = df.copy()
        initialLength = len(initialDf)
        finalDf = df[~(df[str(columnName)].str.split().str.len() <
                       (int(numberOfWords) + 1))]
        finalLength = len(finalDf)
        printAndLogInfo("\n Number of rows with short comments in the column '" +
                        str(columnName) + "' are: " +
                        str(initialLength - finalLength))
        printAndLogInfo("\n The Level 3 Reason Codes for the dropped rows are given below: \n")
        droppedDF = initialDf[~initialDf.apply(tuple, 1).isin(finalDf.apply(tuple, 1))]
        printAndLogInfo(droppedDF[agentAssignedColumn].value_counts())
        return finalDf, droppedDF

    df, droppedDF = removingShortComments(df, commentTextColumn, agentAssignedColumn)

    # Visualize token (vocabulary) frequency distribution after preprocessing
    vectorizer = CountVectorizer()
    docs = vectorizer.fit_transform(df[commentTextColumn])
    features = vectorizer.get_feature_names()
    plt.figure(figsize=(12, 8))
    plt.title("FrequencyDistribution of words after Preprocessing",
              fontdict=None, loc='center')
    visualizer = FreqDistVisualizer(features=features)
    visualizer.fit(docs)
    for label in visualizer.ax.texts:
        label.set_size(20)
    # visualizer.poof()
    plt.savefig(outputDirectory + 'FrequencyDistributionAfterPreprocessing.png',
                bbox_inches='tight')
    plt.close()

    def wordFrequency(reasonCode):
        return (df[df[agentAssignedColumn] == str(reasonCode)][commentTextColumn]
                .str.split(expand=True).stack().value_counts())

    def wordFrequencyListPlot(reasonCode, plot=False):
        wordFreqDict = df[df[agentAssignedColumn] == str(reasonCode)][
            commentTextColumn].str.split(expand=True).stack().value_counts().to_dict()
        wordFreqDictMostCommon = dict(
            collections.Counter(wordFreqDict).most_common(10))  # Considering only top 10 words
        printAndLogInfo(list(wordFreqDictMostCommon.keys()))
        if plot == True:
            plt.title(str(reasonCode), fontdict=None, loc='center')
            plt.bar(wordFreqDictMostCommon.keys(),
                    wordFreqDictMostCommon.values(),
                    width=0.1, color='b')
            plt.figure(figsize=(10, 10))
            plt.savefig(outputDirectory + 'wordFrequencyFor' + reasonCode + '.png',
                        bbox_inches='tight')
            plt.close()
        return list(wordFreqDictMostCommon.keys())

    uniqueTopics = list(df[agentAssignedColumn].unique())
    for reasoncode in uniqueTopics:
        printAndLogInfo(reasoncode)
        wordFrequencyListPlot(reasoncode, plot=True)

    def wordCloudGenerator(df, reasonCode, save=False):
        dfReasonCodeSubset = df[df[agentAssignedColumn] == reasonCode]
        wordcloud = WordCloud(max_words=50,
                              background_color='white',
                              max_font_size=50,
                              width=100,
                              height=100).generate(
                                  ' '.join(dfReasonCodeSubset[commentTextColumn]))
        plt.imshow(wordcloud)
        plt.title(str(reasonCode), fontdict=None, loc='center')
        plt.figure(figsize=(50, 50))
        plt.axis("off")
        plt.close()
        if save:
            plt.savefig(outputDirectory + 'wordCloud' + str(reasonCode) + '.png',
                        bbox_inches='tight')

    for topic in uniqueTopics:
        wordCloudGenerator(df, topic)  # pass save=True if you want to save the Word Clouds

    from fpdf import FPDF
    pdf = FPDF()
    # imagelist is the list with all image filenames
    filelist = os.listdir(outputDirectory)
    for fichier in filelist[:]:  # filelist[:] makes a copy of filelist.
        if not fichier.endswith(".png"):
            filelist.remove(fichier)
    imagelist = filelist
    for image in imagelist:
        pdf.add_page()
        pdf.image(outputDirectory + image, 40, 20, 100, 80)
    pdf.output(outputDirectory + "TextPreprocessing.pdf", "F")
    printAndLogInfo("Detailed Report on Text Preprocessing is saved in the location: " +
                    outputDirectory)
    return df
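# Hedged usage sketch (not from the original source): textPreprocessingFn
# expects a DataFrame such as the one returned by dataQualityFn above and a
# requiredColumnList mapping that also names a primary-key column. The names
# below are illustrative assumptions, and the module-level dependencies the
# function relies on (os, re, logging, spacy's nlp, TextBlob, Counter,
# collections, operator, stopwords, WordCloud, plt, CountVectorizer,
# FreqDistVisualizer) are assumed to be imported and configured.
requiredColumnList = {'primaryKeyColumn': 'ticket_id',
                      'agentAssignedColumn': 'agent_topic',
                      'commentTextColumn': 'comment'}
df_preprocessed = textPreprocessingFn(df_checked, requiredColumnList, 'output/')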