def simple_parse(text):
    '''
    Lower-cases and tokenizes the input text and filters the tokens down to
    'keywords'.

    Parameters
    ----------
    text : str
        Text to be tokenized.

    Returns
    ----------
    text : str
        The lower-cased input text.
    tokens : list
        Every token produced by word_tokenize() for the lower-cased text.
    keywords : list
        The tokens that are not a stop word, a punctuation mark or a special
        character.
    '''
    text = text.lower()

    # word_tokenize() breaks the text into individual word tokens.
    tokens = word_tokenize(text)

    # Punctuation and special characters that must never count as keywords.
    punctuations = [
        '(', ')', ';', ':', '[', ']', ',', '.', '-', '\"', '\'', '{', '}',
        ' - '
    ]
    special_char = ['#', '<', '>', '*', '+', ' - ', '~', '^', '"']

    # Merge everything to exclude into one set so each token needs a single
    # constant-time lookup instead of three linear list scans.
    # Assumes stop_word_list() returns an iterable of strings.
    excluded = set(stop_word_list())
    excluded.update(punctuations)
    excluded.update(special_char)

    keywords = [word for word in tokens if word not in excluded]
    return text, tokens, keywords
def simple_parse(text):
    '''
    Tokenizes the input text and returns the lower-cased text, a list of
    tokens and a list of 'keywords'. Keywords are tokens which are not
    punctuation, stop words or special characters.

    Parameters
    ----------
    text : str
        Text to be tokenized. It is lower-cased before tokenization.

    Returns
    ----------
    text : str
        The lower-cased version of the input text.
    tokens : list
        A list of the tokens from the lower-cased text.
    keywords : list
        A list of the tokens which are not a stop word, punctuation or
        special character.
    '''
    text = text.lower()

    # word_tokenize() breaks the text into individual word tokens.
    tokens = word_tokenize(text)

    # Punctuation and special characters we wish to clean out.
    punctuations = [
        '(', ')', ';', ':', '[', ']', ',', '.', '-', '\"', '\'', '{', '}',
        ' - '
    ]
    special_char = ['#', '<', '>', '*', '+', ' - ', '~', '^', '"']

    # A single set replaces three per-token list scans with one O(1) lookup.
    # Assumes stop_word_list() returns an iterable of strings.
    excluded = set(stop_word_list()) | set(punctuations) | set(special_char)

    # Keep only the tokens that are not stop words, punctuation or special
    # characters.
    keywords = [word for word in tokens if word not in excluded]
    return text, tokens, keywords
def extract(filename):
    '''
    Extracts the text of a PDF file and tokenizes it.

    The text is first read page by page with PyPDF2. When that yields an
    empty string (PyPDF2 cannot read scanned files) the file is run through
    textract's tesseract OCR method instead.

    Parameters
    ----------
    filename : str
        Path of the PDF file to read.

    Returns
    ----------
    text : str
        All the text extracted from the file.
    tokens : list
        The tokens produced by word_tokenize() for the extracted text.
    keywords : list
        The tokens that are neither a stop word nor one of the punctuation
        marks listed in this function.
    '''
    stop_words = stop_word_list()

    # The reader pulls pages from the open file object, so every page has to
    # be extracted before the file is closed. The with-block guarantees the
    # handle is released even if extraction raises.
    with open(filename, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        page_texts = [
            pdf_reader.getPage(page_index).extractText()
            for page_index in range(pdf_reader.numPages)
        ]
    text = "".join(page_texts)

    # An empty result means PyPDF2 found no text layer, so fall back to OCR
    # for scanned/image based PDF files.
    if text == "":
        text = textract.process(filename, method='tesseract', language='eng')
        # word_tokenize() needs str; decode in case the OCR result is bytes.
        if isinstance(text, bytes):
            text = text.decode('utf-8')

    # word_tokenize() breaks the text into individual word tokens.
    tokens = word_tokenize(text)

    punctuations = ['(', ')', ';', ':', '[', ']', ',', '.', '-']

    # One set lookup per token instead of two list scans.
    # Assumes stop_word_list() returns an iterable of strings.
    excluded = set(stop_words)
    excluded.update(punctuations)
    keywords = [word for word in tokens if word not in excluded]
    return text, tokens, keywords
def build_word_cloud(token_list, n):
    '''
    Renders a word cloud from a list of tokens and saves it to
    'static/mycloud'.

    Parameters
    ----------
    token_list : list
        Tokens (strings) to build the word cloud from. The list is not
        modified.
    n : int
        Maximum number of words to display in the word cloud.

    Returns
    ----------
    None. The figure is written to disk with plt.savefig().
    '''
    stop_words = stop_word_list()

    # Build a filtered copy. Removing items from a list while iterating over
    # it skips the element after each removal and mutates the caller's list.
    stop_word_set = set(stop_words)
    words = [word for word in token_list if word not in stop_word_set]

    # Join the tokens into plain text; str(list) would feed the list's
    # brackets, commas and quote characters to the word cloud tokenizer.
    wordcloud = WordCloud(
        width=1440,
        height=1080,
        background_color='white',
        stopwords=stop_words,
        max_words=n,
    ).generate(" ".join(words))

    fig = plt.figure(figsize=(20, 15))
    try:
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.margins(x=0, y=0)
        plt.savefig('static/mycloud', bbox_inches='tight')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly.
        plt.close(fig)
def build_word_cloud(text, n):
    '''
    Plots the word cloud for a given text and returns the plot in html
    format so that it can be embedded in a html file.

    Parameters
    ----------
    text : str
        The text used to generate the word cloud. It is passed through
        str() before being handed to WordCloud.
    n : int
        Maximum number of words to display in the word cloud.

    Returns
    ----------
    html : str
        The output of mpld3.fig_to_html() for the word cloud figure. This
        can simply be added to a html template.
    '''
    stop_words = stop_word_list()
    wordcloud = WordCloud(
        width=1440,
        height=1080,
        background_color='white',
        stopwords=stop_words,
        max_words=n,
    ).generate(str(text))

    fig = plt.figure(figsize=(13, 9))
    try:
        plt.imshow(wordcloud)
        plt.axis('off')
        plt.margins(x=0, y=0)
        html = mpld3.fig_to_html(fig, no_extras=True, template_type='general')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly, so
        # without this each call would leak one figure.
        plt.close(fig)
    return html
import re
import nltk
from gensim.models import word2vec
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib as mpl
from wordcloud import WordCloud, STOPWORDS
from stopwords import stop_word_list
from pdf_extractor import extract
import spacy

# Extract the text of the test PDF and strip the stop words from its tokens.
stop_words = stop_word_list()
text, tokens, keywords = extract('uploads/mytest.pdf')

# Build a new list rather than calling tokens.remove() inside a loop over
# tokens: removing while iterating skips the element after every removal, so
# consecutive stop words were left in the text.
tokens = [word for word in tokens if word not in stop_words]
cleantext = " ".join(tokens)

nlp = spacy.load('en_core_web_sm')  # make sure to use larger model!
def lda_tsne(total_text,
             file_names,
             n_topics=None,
             n_iter=200,
             n_top_words=None,
             threshold=0.3):
    '''
    Fits an LDA topic model on the input text, reduces the document-topic
    matrix to two dimensions with t-SNE and builds a bokeh scatter plot of
    the result.

    Progress and intermediate results are printed to stdout, and
    output_file("TSNE_OUTPUT.html") is called before the plot is turned into
    components.

    Parameters
    ----------
    total_text : list
        Passed to CountVectorizer.fit_transform(); presumably one string per
        document (confirm against the caller).
    file_names : list
        Shown in the hover tool. Its length also drives the default number
        of topics and the dot size.
    n_topics : int, optional
        Number of LDA topics. Defaults to
        int(round((len(file_names) / 2) ** 0.5)).
    n_iter : int
        Number of iterations passed to lda.LDA.
    n_top_words : int, optional
        Number of words used to summarise each topic. Defaults to 5.
    threshold : float
        Only interpolated into the plot title; no thresholding is applied.

    Returns
    ----------
    script, div :
        The result of bokeh's components() for the plot.
    ldavis_html :
        The value returned by pyladvis_run(lda_model, cvz, cvectorizer).
    '''
    n_data = len(file_names)
    if n_topics is None:
        n_topics = int(round(((len(file_names)) / 2)**0.5))
    if n_top_words is None:
        n_top_words = 5

    t0 = time.time()
    stopwords = stop_word_list()
    cvectorizer = CountVectorizer(min_df=1,
                                  stop_words=stopwords,
                                  lowercase=True,
                                  ngram_range=(1, 3),
                                  max_df=30)
    cvz = cvectorizer.fit_transform(total_text)

    lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
    X_topics = lda_model.fit_transform(cvz)
    ldavis_html = pyladvis_run(lda_model, cvz, cvectorizer)
    print("<<<<<<<<<<LDAVIS OK>>>>>>>")
    print(X_topics)
    t1 = time.time()
    print('\n')
    print('LDA training done; took {} mins'.format((t1 - t0) / 60.))
    print('\n')

    print('topics: ' + str(X_topics))
    num_example = len(X_topics)
    print("num_example: " + str(num_example))

    # t-SNE: document-topic probabilities -> 2D coordinates
    tsne_model = TSNE(n_components=2,
                      verbose=1,
                      random_state=0,
                      angle=.50,
                      init='pca')
    tsne_lda = tsne_model.fit_transform(X_topics[:num_example])
    print("TSNE_LDA")
    print(type(tsne_lda))
    print(tsne_lda)
    print(tsne_lda.shape)

    tsne_lda_df = pd.DataFrame(tsne_lda)
    tsne_lda_df = tsne_lda_df.fillna('')
    # NOTE(review): rows containing NaN are dropped from the array only, not
    # from the DataFrame that is plotted - confirm this is intended.
    tsne_lda = tsne_lda[~np.isnan(tsne_lda).any(axis=1)]

    # most probable topic for each document
    _lda_keys = list(X_topics.argmax(axis=1))
    print('lda_keys: ')
    print(_lda_keys)

    # topics and their top words
    topic_word = lda_model.topic_word_
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(cvectorizer, 'get_feature_names_out'):
        vocab = np.asarray(cvectorizer.get_feature_names_out())
    else:
        vocab = np.asarray(cvectorizer.get_feature_names())
    topic_summaries = [
        ' '.join(vocab[np.argsort(topic_dist)][:-(n_top_words + 1):-1])
        for topic_dist in topic_word
    ]

    # one randomly chosen colour per topic
    colormap = np.array([
        "#" + "%06x" % random.randint(0, 0xFFFFFF) for _ in range(n_topics)
    ])
    print("#########################################################")
    print("COLORMAP")
    print(colormap[_lda_keys][:num_example])
    print("#########################################################")
    print("LDA KEYS")
    print(_lda_keys[:num_example])
    print("#########################################################")

    raw_topic_summaries = [topic_summaries[key] for key in _lda_keys]

    # plot
    title = (" t-SNE visualization of LDA model trained on {} files, "
             "{} topics, thresholding at {} topic probability, {} iterations ({} data "
             "points and top {} words)").format(X_topics.shape[0], n_topics,
                                                threshold, n_iter,
                                                num_example, n_top_words)

    plot_lda = bp.figure(
        plot_width=1200,
        plot_height=800,
        title=title,
        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
        x_axis_type=None,
        y_axis_type=None,
        min_border=1)

    # the more dots there are, the smaller they are drawn
    if n_data < 30:
        dot_size = 20
    elif n_data < 50:
        dot_size = 15
    elif n_data < 150:
        dot_size = 11
    else:
        dot_size = 5

    source = bp.ColumnDataSource(
        data=dict(x=tsne_lda_df.iloc[:, 0],
                  y=tsne_lda_df.iloc[:, 1],
                  color=colormap[_lda_keys][:num_example],
                  file_names=file_names,
                  raw_topic_summaries=raw_topic_summaries))

    plot_lda.scatter(x='x', y='y', color='color', source=source, size=dot_size)
    plot_lda.outline_line_width = 7
    plot_lda.outline_line_alpha = 0.3
    plot_lda.outline_line_color = "#353A40"

    # The topic words are drawn at the coordinate of the first document
    # assigned to each topic; topics without a document stay NaN.
    topic_coord = np.full((X_topics.shape[1], 2), np.nan)
    for doc_index, topic_num in enumerate(_lda_keys):
        if np.isnan(topic_coord[topic_num]).any():
            topic_coord[topic_num] = tsne_lda[doc_index]

    # plot the topic words
    for i in range(X_topics.shape[1]):
        plot_lda.text(topic_coord[i, 0], topic_coord[i, 1],
                      [topic_summaries[i]])

    # hover tools
    hover = plot_lda.select(dict(type=HoverTool))
    hover.tooltips = [("file name", "@file_names"),
                      ("topic summary", '@raw_topic_summaries')]

    t2 = time.time()
    print('\n>>> whole process done; took {} mins\n'.format((t2 - t0) / 60.))

    output_file("TSNE_OUTPUT.html", title="TTSNE OUTPUT")
    script, div = components(plot_lda)
    return script, div, ldavis_html
def lda_tsne(total_text, file_names, n_topics=None, n_top_words=None):
    '''
    Handles the process of applying Latent Dirichlet Allocation (LDA) to the
    input text and the dimensionality reduction of the result from LDA using
    t-SNE. The LDA algorithm returns a document-topic probability matrix
    which describes the probabilities of the topics in a document. The
    result of t-SNE are x,y coordinates that can be plotted on a scatter
    plot to visualise the clusters.

    The bokeh library is used for the visualisation; its hover tool shows
    the file name and topic summary of each dot. The fitted model, the
    document-term matrix, the vectorizer, the per-document topic summaries
    and the per-document topic keys are pickled into the 'pickles'
    directory, with session['myid'] in the file names.

    Parameters
    ----------
    total_text : list
        A list of strings where each element is all the text of a document
        in one string.
    file_names : list
        A list of strings where each element is a file name of the files
        that were uploaded.
    n_topics : int, optional
        Number of topics passed to lda.LDA. When None it is estimated as
        ((number of documents)/2)^0.5 and stored in
        session['number_topics'].
    n_top_words : int, optional
        Number of words used to describe each topic in the hover tool and
        the plot. When None, 5 is used and stored in
        session['number_topwords'].

    Returns
    ----------
    html : str
        The html of the bokeh plot as produced by file_html(plot_lda, CDN);
        this can be directly embedded in a web page.
    '''
    # the flask session id keeps the pickled objects of different sessions
    # apart on the filing system.
    myid = session['myid']
    n_data = len(file_names)

    # if the number of topics is not specified (like when a user first
    # launches the clustering), a rule of thumb estimates the number of
    # topics in the corpus: ((number of documents)/2)^0.5.
    if n_topics is None:
        n_topics = int(round(((len(file_names)) / 2)**0.5))
        session['number_topics'] = str(n_topics)

    # if the number of top words is not specified, use 5 words per topic
    if n_top_words is None:
        n_top_words = 5
        session['number_topwords'] = str(n_top_words)

    # the timing is for testing, to see how long each stage takes.
    t0 = time.time()
    stopwords = stop_word_list()

    # The count vectorizer converts the input text into a document-term
    # matrix of n-gram counts. With ngram_range=(1, 3) unigrams, bigrams and
    # trigrams are counted.
    cvectorizer = CountVectorizer(min_df=1,
                                  stop_words=stopwords,
                                  lowercase=True,
                                  ngram_range=(1, 3))
    cvz = cvectorizer.fit_transform(total_text)
    t1 = time.time()
    print("Time for count vectorizer (document term matrix): " + str(t1 - t0))

    t2 = time.time()
    # lda model with 500 iterations, fitted to the document-term matrix
    lda_model = lda.LDA(n_topics=n_topics, n_iter=500)
    X_topics = lda_model.fit_transform(cvz)
    t3 = time.time()
    print("Time for LDA: " + str(t3 - t2))

    if not os.path.exists('pickles'):
        os.makedirs('pickles')

    # paths to which the pickled objects will be saved
    lda_model_path = "pickles/lda_model_" + str(myid) + '.p'
    document_term_matrix_path = "pickles/document_term_matrix_" + \
        str(myid) + '.p'
    cvectorizer_path = "pickles/cvectorizer_" + str(myid) + '.p'

    # each file is opened in a with-block so the handle is flushed and
    # closed as soon as the object has been written.
    for obj, path in ((lda_model, lda_model_path),
                      (cvz, document_term_matrix_path),
                      (cvectorizer, cvectorizer_path)):
        with open(path, "wb") as pickle_file:
            pickle.dump(obj, pickle_file)

    # the number of rows in the document-topic matrix
    num_example = len(X_topics)

    t4 = time.time()
    # n_components is the number of output dimensions of t-SNE
    tsne_model = TSNE(n_components=2,
                      verbose=1,
                      random_state=0,
                      angle=.2,
                      init='pca')
    tsne_lda = tsne_model.fit_transform(X_topics[:num_example])
    t5 = time.time()
    print("Time for TSNE: " + str(t5 - t4))

    # NOTE(review): fillna('') runs before the NaN/inf row filter below, so
    # NaN cells have already become '' when the filter runs - confirm this
    # ordering is intended.
    tsne_lda_df = pd.DataFrame(tsne_lda)
    print(tsne_lda_df.describe())
    tsne_lda_df = tsne_lda_df.fillna('')
    tsne_lda = tsne_lda[~np.isnan(tsne_lda).any(axis=1)]
    # axis is passed by keyword: the positional form any(1) is rejected by
    # pandas 2.
    tsne_lda_df = tsne_lda_df[~tsne_lda_df.isin(
        [np.nan, np.inf, -np.inf]).any(axis=1)]
    print(tsne_lda_df.describe())

    # the most probable topic for each document
    _lda_keys = list(X_topics.argmax(axis=1))

    # the most probable words of each topic represent that topic
    topic_word = lda_model.components_
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(cvectorizer, 'get_feature_names_out'):
        vocab = np.asarray(cvectorizer.get_feature_names_out())
    else:
        vocab = np.asarray(cvectorizer.get_feature_names())
    topic_summaries = [
        ' '.join(vocab[np.argsort(topic_dist)][:-(n_top_words + 1):-1])
        for topic_dist in topic_word
    ]

    # a colourmap with one randomly chosen colour per topic
    colormap = np.array([
        "#" + "%06x" % random.randint(0, 0xFFFFFF) for _ in range(n_topics)
    ])

    raw_topic_summaries = [topic_summaries[key] for key in _lda_keys]

    t6 = time.time()
    title = (" t-SNE visualization of LDA model trained on {} files, "
             "{} topics, {} data "
             "points and top {} words").format(X_topics.shape[0], n_topics,
                                               num_example, n_top_words)

    # the bokeh figure that will hold the scatter plot
    plot_lda = bp.figure(
        plot_width=1200,
        plot_height=700,
        title=title,
        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
        x_axis_type=None,
        y_axis_type=None,
        min_border=1)

    # the more dots there are, the smaller they should be
    if n_data < 30:
        dot_size = 20
    elif n_data < 50:
        dot_size = 15
    elif n_data < 150:
        dot_size = 11
    else:
        dot_size = 5

    # file_names and raw_topic_summaries feed the plot's hover tool.
    source = bp.ColumnDataSource(
        data=dict(x=tsne_lda_df.iloc[:, 0],
                  y=tsne_lda_df.iloc[:, 1],
                  color=colormap[_lda_keys][:num_example],
                  file_names=file_names,
                  raw_topic_summaries=raw_topic_summaries))

    plot_lda.scatter(x='x', y='y', color='color', source=source, size=dot_size)
    plot_lda.outline_line_width = 7
    plot_lda.outline_line_alpha = 0.3
    plot_lda.outline_line_color = "#353A40"

    # The topic words are drawn at the coordinate of the first document
    # assigned to each topic; topics without a document stay NaN.
    topic_coord = np.full((X_topics.shape[1], 2), np.nan)
    for doc_index, topic_num in enumerate(_lda_keys):
        if np.isnan(topic_coord[topic_num]).any():
            topic_coord[topic_num] = tsne_lda[doc_index]

    # plots the top words
    for i in range(X_topics.shape[1]):
        plot_lda.text(topic_coord[i, 0], topic_coord[i, 1],
                      [topic_summaries[i]])

    # the hover tool displays the file name and topic summary of a dot when
    # the cursor hovers over it.
    hover = plot_lda.select(dict(type=HoverTool))
    hover.tooltips = [("file name", "@file_names"),
                      ("topic summary", '@raw_topic_summaries')]
    t7 = time.time()
    print("Time for Bokeh plotting: " + str(t7 - t6))
    print('\n>>> whole process done; took {} mins\n'.format((t7 - t0) / 60.))

    # the html code of the visualisation used in the html template
    html = file_html(plot_lda, CDN)

    # pickles and saves the objects for later use
    raw_topic_summaries_path = "pickles/raw_topic_summaries" + str(myid) + '.p'
    lda_keys_path = "pickles/lda_keys_path" + str(myid) + '.p'
    with open(raw_topic_summaries_path, "wb") as pickle_file:
        pickle.dump(raw_topic_summaries, pickle_file)
    with open(lda_keys_path, "wb") as pickle_file:
        pickle.dump(_lda_keys, pickle_file)

    return html
def lda_tsne(total_text, file_names, n_topics=None, n_top_words=None):
    '''
    Fits an LDA topic model on the input text, reduces the document-topic
    matrix to two dimensions with t-SNE and returns the html of a bokeh
    scatter plot of the result.

    The fitted model, the document-term matrix, the vectorizer, the
    per-document topic summaries and the per-document topic keys are pickled
    into the 'pickles' directory, with session['myid'] in the file names.
    Timings and the topic keys are printed to stdout.

    Parameters
    ----------
    total_text : list
        Passed to CountVectorizer.fit_transform(); presumably one string per
        document (confirm against the caller).
    file_names : list
        Shown in the hover tool. Its length also drives the default number
        of topics and the dot size.
    n_topics : int, optional
        Number of topics passed to lda.LDA. When None it is set to
        int(round((len(file_names) / 2) ** 0.5)) and stored in
        session['number_topics'].
    n_top_words : int, optional
        Number of words used to summarise each topic. When None, 5 is used
        and stored in session['number_topwords'].

    Returns
    ----------
    html : str
        The result of file_html(plot_lda, CDN).
    '''
    myid = session['myid']
    n_data = len(file_names)

    if n_topics is None:
        n_topics = int(round(((len(file_names)) / 2)**0.5))
        session['number_topics'] = str(n_topics)
    if n_top_words is None:
        n_top_words = 5
        session['number_topwords'] = str(n_top_words)

    t0 = time.time()
    stopwords = stop_word_list()
    cvectorizer = CountVectorizer(min_df=1,
                                  stop_words=stopwords,
                                  lowercase=True,
                                  ngram_range=(1, 3))
    cvz = cvectorizer.fit_transform(total_text)
    t1 = time.time()
    print("Time for count vectorizer (document term matrix): " + str(t1 - t0))

    t2 = time.time()
    # 500 is the number of LDA iterations
    lda_model = lda.LDA(n_topics=n_topics, n_iter=500)
    X_topics = lda_model.fit_transform(cvz)
    t3 = time.time()
    print("Time for LDA: " + str(t3 - t2))

    if not os.path.exists('pickles'):
        os.makedirs('pickles')

    lda_model_path = "pickles/lda_model_" + str(myid) + '.p'
    document_term_matrix_path = "pickles/document_term_matrix_" + str(
        myid) + '.p'
    cvectorizer_path = "pickles/cvectorizer_" + str(myid) + '.p'

    # each file is opened in a with-block so the handle is flushed and
    # closed as soon as the object has been written.
    for obj, path in ((lda_model, lda_model_path),
                      (cvz, document_term_matrix_path),
                      (cvectorizer, cvectorizer_path)):
        with open(path, "wb") as pickle_file:
            pickle.dump(obj, pickle_file)

    num_example = len(X_topics)

    t4 = time.time()
    # t-SNE: document-topic probabilities -> 2D coordinates
    tsne_model = TSNE(n_components=2,
                      verbose=1,
                      random_state=0,
                      angle=.2,
                      init='pca')
    tsne_lda = tsne_model.fit_transform(X_topics[:num_example])
    t5 = time.time()
    print("Time for TSNE: " + str(t5 - t4))

    # NOTE(review): fillna('') runs before the NaN/inf row filter below, so
    # NaN cells have already become '' when the filter runs - confirm this
    # ordering is intended.
    tsne_lda_df = pd.DataFrame(tsne_lda)
    print(tsne_lda_df.describe())
    tsne_lda_df = tsne_lda_df.fillna('')
    tsne_lda = tsne_lda[~np.isnan(tsne_lda).any(axis=1)]
    # axis is passed by keyword: the positional form any(1) is rejected by
    # pandas 2.
    tsne_lda_df = tsne_lda_df[~tsne_lda_df.isin([np.nan, np.inf, -np.inf]).
                              any(axis=1)]
    print(tsne_lda_df.describe())

    # most probable topic for each document
    _lda_keys = list(X_topics.argmax(axis=1))
    print("LDA")
    print(_lda_keys)

    # topics and their top words
    topic_word = lda_model.components_
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); support both.
    if hasattr(cvectorizer, 'get_feature_names_out'):
        vocab = np.asarray(cvectorizer.get_feature_names_out())
    else:
        vocab = np.asarray(cvectorizer.get_feature_names())
    topic_summaries = [
        ' '.join(vocab[np.argsort(topic_dist)][:-(n_top_words + 1):-1])
        for topic_dist in topic_word
    ]

    # one randomly chosen colour per topic
    colormap = np.array([
        "#" + "%06x" % random.randint(0, 0xFFFFFF) for _ in range(n_topics)
    ])

    raw_topic_summaries = [topic_summaries[key] for key in _lda_keys]

    # plot
    t6 = time.time()
    title = (" t-SNE visualization of LDA model trained on {} files, "
             "{} topics, {} data "
             "points and top {} words").format(X_topics.shape[0], n_topics,
                                               num_example, n_top_words)

    plot_lda = bp.figure(
        plot_width=1200,
        plot_height=800,
        title=title,
        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
        x_axis_type=None,
        y_axis_type=None,
        min_border=1)

    # the more dots there are, the smaller they are drawn
    if n_data < 30:
        dot_size = 20
    elif n_data < 50:
        dot_size = 15
    elif n_data < 150:
        dot_size = 11
    else:
        dot_size = 5

    source = bp.ColumnDataSource(
        data=dict(x=tsne_lda_df.iloc[:, 0],
                  y=tsne_lda_df.iloc[:, 1],
                  color=colormap[_lda_keys][:num_example],
                  file_names=file_names,
                  raw_topic_summaries=raw_topic_summaries))

    plot_lda.scatter(x='x', y='y', color='color', source=source, size=dot_size)
    plot_lda.outline_line_width = 7
    plot_lda.outline_line_alpha = 0.3
    plot_lda.outline_line_color = "#353A40"

    # The topic words are drawn at the coordinate of the first document
    # assigned to each topic; topics without a document stay NaN.
    topic_coord = np.full((X_topics.shape[1], 2), np.nan)
    for doc_index, topic_num in enumerate(_lda_keys):
        if np.isnan(topic_coord[topic_num]).any():
            topic_coord[topic_num] = tsne_lda[doc_index]

    # plot the topic words
    for i in range(X_topics.shape[1]):
        plot_lda.text(topic_coord[i, 0], topic_coord[i, 1],
                      [topic_summaries[i]])

    # hover tools
    hover = plot_lda.select(dict(type=HoverTool))
    hover.tooltips = [("file name", "@file_names"),
                      ("topic summary", '@raw_topic_summaries')]
    t7 = time.time()
    print("Time for Bokeh plotting: " + str(t7 - t6))
    print('\n>>> whole process done; took {} mins\n'.format((t7 - t0) / 60.))

    html = file_html(plot_lda, CDN)

    raw_topic_summaries_path = "pickles/raw_topic_summaries" + str(myid) + '.p'
    lda_keys_path = "pickles/lda_keys_path" + str(myid) + '.p'
    with open(raw_topic_summaries_path, "wb") as pickle_file:
        pickle.dump(raw_topic_summaries, pickle_file)
    with open(lda_keys_path, "wb") as pickle_file:
        pickle.dump(_lda_keys, pickle_file)

    return html