def submit():
    '''
    Route function that handles the text-field submission from the homepage.

    On a POST request the submitted ``text`` form field is read. If it is
    missing or contains only whitespace, a message is flashed and the client
    is redirected back to the request URL. Otherwise the text is passed to
    ``simple_parse``; the resulting keywords and the wordcloud html are
    pickled under ``pickles/`` (file names carry the session's ``myid``) and
    ``analysis_options.html`` is rendered with the word-frequency graph, the
    wordcloud and the stop-words form.

    Returns:
        A redirect response when no text was entered, otherwise the rendered
        ``analysis_options.html`` template.
    '''
    myid = session['myid']

    # NOTE(review): non-POST requests fall through and return None, exactly
    # as before; presumably the route is registered for POST only - confirm.
    if request.method != 'POST':
        return None

    # An empty textarea is still submitted as an empty string, so checking
    # for the key alone is not enough to know whether text was entered.
    text = request.form.get('text', '')
    if not text.strip():
        print("no text entered")
        flash('No text was entered')
        return redirect(request.url)

    # processes the text so it is tokenized and linguistic noise is removed.
    text, _, keywords = simple_parse(text)

    keywords_path = "pickles/keywords_" + str(myid) + '.p'
    # 'with' guarantees the pickle file is flushed and closed.
    with open(keywords_path, "wb") as keywords_file:
        pickle.dump(keywords, keywords_file)

    # gets the pygal word-frequency object
    graph_data = frequency_dist(keywords, 26,
                                ('Word frequency for input text'))

    # gets the wordcloud plot in html form
    wordcloud_html = build_word_cloud(text, 2000)
    wordcloud_html_path = "pickles/wordcloud_html_" + str(myid) + '.p'
    with open(wordcloud_html_path, "wb") as wordcloud_file:
        pickle.dump(wordcloud_html, wordcloud_file)

    session['title'] = 'NLP analysis'
    stop_words_form = StopWordsForm()
    session['save'] = False

    return render_template('analysis_options.html',
                           title='NLP analysis',
                           graph_data=graph_data,
                           wordcloud_html=wordcloud_html,
                           stop_words_form=stop_words_form)
def display_history_single():
    '''
    Route function that displays the analysis of a previously stored single
    file (per the original description, one loaded from Azure blob storage).

    The file location and its display name are read from the query string
    (``save_path`` and ``file_name_short_with_extension``). The file is
    processed with ``extract``; the keywords and the wordcloud html are
    pickled under ``pickles/`` (file names carry the session's ``myid``) and
    ``analysis_options.html`` is rendered with the word-frequency graph, the
    wordcloud and the stop-words form. This mirrors how an uploaded single
    file is handled.

    Returns:
        The rendered ``analysis_options.html`` template.
    '''
    myid = session['myid']

    # NOTE(review): save_path is taken directly from the query string and
    # handed to extract(); confirm that callers cannot point it at arbitrary
    # files on disk.
    save_path = request.args['save_path']
    file_name_short_with_extension = request.args[
        'file_name_short_with_extension']

    text, _, keywords = extract(save_path)

    keywords_path = "pickles/keywords_" + str(myid) + '.p'
    # 'with' guarantees the pickle file is flushed and closed.
    with open(keywords_path, "wb") as keywords_file:
        pickle.dump(keywords, keywords_file)

    graph_data = frequency_dist(keywords, 26,
                                ('Word frequency for file with filename: ' +
                                 file_name_short_with_extension))

    wordcloud_html = build_word_cloud(text, 2000)
    wordcloud_html_path = "pickles/wordcloud_html_" + str(myid) + '.p'
    with open(wordcloud_html_path, "wb") as wordcloud_file:
        pickle.dump(wordcloud_html, wordcloud_file)

    # The file is already stored, so the 'save' button is hidden.
    session['save'] = False
    stop_words_form = StopWordsForm()

    return render_template('analysis_options.html',
                           title='Single file NLP analysis',
                           graph_data=graph_data,
                           stop_words_form=stop_words_form,
                           wordcloud_html=wordcloud_html)
def upload_file():
    '''
    Route function that governs the home page of the web app.

    For any request other than POST the function simply renders the
    ``home.html`` template with the upload form and the text-input form.

    For a POST request from the file upload form, the uploaded file is
    validated (``allowed_file``), its name is sanitised and it is stored in
    the ``uploads`` folder under a name that carries the user's identifier.

    If the file is a 'single file' (pdf, docx, txt) it is processed with
    ``extract`` (extractor.py). The processed text is used to build a
    wordcloud and a word-frequency graph, which are rendered with
    ``analysis_options.html``.

    If the file is a 'compressed folder' (rar, zip) it is decompressed and the
    text is extracted with ``handle_compressed_file`` (compressed_main.py).
    The processed text is used to build the scatter graph of the clusters
    (``lda_tsne``, lda_tsne_model2.py) and the pyLDAvis visualisation
    (``pyldavis_run``, mypyldavis.py), rendered with ``bulk_analysis.html``.
    If fewer than 4 files are inside the compressed folder a message is
    flashed and the user is redirected to the home page, because the LDA
    model does not work with so little data.

    Returns:
        A rendered template (``home.html``, ``analysis_options.html`` or
        ``bulk_analysis.html``) or a redirect response when the upload is
        rejected.
    '''
    # This flask session object is used to determine whether the 'save'
    # button will be visible or not.
    session['save'] = True

    # If the user is not logged in, create a unique identifier for this
    # specific user. If the user is already logged in, the identifier has
    # already been created. current_user.is_anonymous comes from the
    # Flask-Login Flask extension.
    if current_user.is_anonymous:
        an_id = str(uuid.uuid4())
        myid = an_id[:8] + an_id[24:]
        session['myid'] = myid
    else:
        myid = session['myid']

    if request.method != 'POST':
        uploadForm = UploadFileForm()
        inputTextForm = inputText()
        return render_template('home.html',
                               title='Welcome',
                               form=uploadForm,
                               textform=inputTextForm)

    # check if the post request has the file part
    if 'document' not in request.files:
        print("file not in request.files")
        flash('No file part')
        return redirect(request.url)

    uploaded = request.files['document']

    # if the user does not select a file, the browser also submits an empty
    # part without a filename
    if uploaded.filename == '':
        flash('No selected file')
        return redirect(request.url)

    if not (uploaded and allowed_file(uploaded.filename)):
        flash('not an allowed file format')
        return redirect(url_for('upload_file'))

    filename = secure_filename(uploaded.filename)

    # secure_filename can strip the whole stem (e.g. '.pdf' or a non-ASCII
    # name becomes 'pdf'), leaving no dot to split on. Reject such names
    # instead of failing with an IndexError.
    stem, dot, file_extension = filename.rpartition('.')
    if not dot:
        flash('Invalid file name')
        return redirect(url_for('upload_file'))

    file_extension = file_extension.lower()
    # The short name is capped at 28 characters.
    file_name_no_extension = stem.lower()[:28]
    filename = file_name_no_extension + '.' + file_extension

    # making sure that the stored file name contains only alpha-numeric and
    # ('-', '_') characters. Any other character is deleted.
    regex = re.compile('[^a-zA-Z0-9-_]')
    file_name_no_extension = regex.sub('', file_name_no_extension)

    file_name_uuid_no_extension = str(file_name_no_extension) + '_' + str(myid)
    file_name_uuid = file_name_uuid_no_extension + '.' + file_extension

    upload_path = os.path.join('uploads', file_name_uuid)
    uploaded.save(upload_path)
    session['single_file_path'] = upload_path

    # checks if the uploaded file is a compressed format. This is pivotal in
    # the program execution.
    if file_extension in compressed_extensions:
        return _process_compressed_upload(myid, upload_path, filename)

    # >>>>>>>>>>> This section of the code handles a single file
    # (docx/pdf/txt)

    # stores the file name with and without the file extension and unique
    # identifier in flask session objects.
    session['single_file_name_short_no_extension'] = file_name_no_extension
    session[
        'single_file_name_uuid_long_no_extension'] = file_name_uuid_no_extension
    session['single_file_name_short_with_extension'] = filename
    session['single_file_name_long_with_extension'] = file_name_uuid

    # calls the extract method from extractor.py that extracts and processes
    # the text from the document.
    text, _, keywords = extract(upload_path)
    _save_pickle(keywords, "pickles/keywords_" + str(myid) + '.p')

    # gets the pygal word-frequency distribution graph
    graph_data = frequency_dist(
        keywords, 26, ('Word frequency for file with filename: ' + filename))

    # gets the wordcloud plot html
    wordcloud_html = build_word_cloud(text, 2000)
    _save_pickle(wordcloud_html,
                 "pickles/wordcloud_html_" + str(myid) + '.p')

    # gets the flask form that is used to input new stopwords.
    stop_words_form = StopWordsForm()
    session['title'] = 'Single file NLP analysis'

    return render_template('analysis_options.html',
                           title='Single file NLP analysis',
                           graph_data=graph_data,
                           wordcloud_html=wordcloud_html,
                           stop_words_form=stop_words_form)


def _save_pickle(obj, path):
    '''Serialise obj to path with pickle, closing the file afterwards.'''
    with open(path, "wb") as pickle_file:
        pickle.dump(obj, pickle_file)


def _process_compressed_upload(myid, upload_path, filename):
    '''
    Run the clustering analysis for an uploaded compressed file (zip/rar).

    Args:
        myid: the unique identifier stored in session['myid'].
        upload_path: location of the stored archive inside 'uploads'.
        filename: the shortened, user-facing archive name with extension.

    Returns:
        The rendered ``bulk_analysis.html`` template, or a redirect to the
        home page when the archive holds fewer than 4 files.
    '''
    # creates the unique paths that are going to be used to store the
    # serialised python objects (using the pickle module).
    total_text_path = 'pickles/total_text_' + str(myid) + '.p'
    file_names_path = 'pickles/file_names_' + str(myid) + '.p'
    lda_model_path = "pickles/lda_model_" + str(myid) + '.p'
    lda_html_path = "pickles/lda_html_" + str(myid) + '.p'
    document_term_matrix_path = "pickles/document_term_matrix_" + \
        str(myid) + '.p'
    cvectorizer_path = "pickles/cvectorizer_" + str(myid) + '.p'
    pyldavis_html_path = "pickles/pyldavis_html_" + str(myid) + '.p'

    # stores the various path variables in the flask session so that they can
    # be used later.
    session['total_text_path'] = total_text_path
    session['file_names_path'] = file_names_path
    session['vectorizer_path'] = cvectorizer_path
    session['document_term_matrix_path'] = document_term_matrix_path
    session['lda_model_path'] = lda_model_path
    session['lda_html_path'] = lda_html_path
    session['pyldavis_html_path'] = pyldavis_html_path

    # handle_compressed_file decompresses the compressed file, extracts the
    # text from each document within and parses/tokenizes/stems/removes stop
    # words. Only two of its four results are needed here:
    # total_text is a list of strings where each element represents the text
    # from one document; file_names is a list of the file names from the
    # compressed folder. The stemmed and tokenized vocabularies are unused.
    total_text, _, _, file_names = handle_compressed_file(
        upload_path, filename)

    # once the compressed folder has been decompressed and processed there is
    # no need to keep it on the filing system so it is removed.
    os.remove(upload_path)

    # LDA does not work with less than 4 files
    if len(file_names) < 4:
        flash('At least 4 files in the compressed folder are required')
        return redirect(url_for('upload_file'))

    # calls the lda_tsne method from lda_tsne_model2.py which performs text
    # vectorization, LDA clustering and then tSNE before converting the plot
    # into html format.
    lda_html = lda_tsne(total_text, file_names)

    # Flask form for inputting a new number of topics parameter.
    topic_number_form = inputTopicNumber()

    # calls the pyldavis_run method from mypyldavis.py which produces the
    # pyLDAvis visualization and converts it to html format.
    pyldavis_html = pyldavis_run(lda_model_path, document_term_matrix_path,
                                 cvectorizer_path)

    # stores serialized versions of total_text, file_names, pyldavis_html and
    # lda_html which will later be used.
    _save_pickle(total_text, total_text_path)
    _save_pickle(file_names, file_names_path)
    _save_pickle(pyldavis_html, pyldavis_html_path)
    _save_pickle(lda_html, lda_html_path)

    # Flask session object that dictates whether the 'Download' button is
    # visible.
    session['download'] = True

    return render_template('bulk_analysis.html',
                           title='Clustering analysis',
                           lda_html=lda_html,
                           number_form=topic_number_form,
                           pyldavis_html=pyldavis_html)