def OpenOutputFiles(window, openOutputFiles, filesToOpen):
    """Open every output file produced by a run, after warning the user.

    Parameters:
        window          -- Tkinter parent window for the alerts.
        openOutputFiles -- when falsy, nothing is opened.
        filesToOpen     -- list of file paths; may be None or empty.

    .kml files are routed to open_kmlFile(); all other files to openFile().
    Paths that are not existing files are silently skipped.
    """
    # Nothing to do for a missing or empty list (merges the original
    # separate None and len()==0 checks).
    if not filesToOpen:
        return
    singularPlural = 'file' if len(filesToOpen) == 1 else 'files'
    if openOutputFiles:
        # Now the approach is to open all files at the end, so the old extra
        # check "and runningAll==False" is redundant.
        # Display a reminder about csv files with weird characters, most
        # likely due to non utf-8 apostrophes and quotes; this reminder does
        # not have a specific config, so the generic '*' config is used.
        reminders_util.checkReminder(
            '*', ['csv files'],
            'If csv output files open displaying weird characters in a Windows OS (e.g., a€), most likely the cause is due to non utf-8 compliant input text. Apostrophes and quotes are the typical culprits, but also other punctuation characters.\n\nPlease, run the tool to check documents for utf-8 compliance and, if necessary, run the tool for automatic apostrophe and quote conversion from non utf-8 to utf-8.\n\nTo learn more on utf-8 compliance, read the TIPS on utf-8 compliance.',
            True)
        # Kept for its effect of refreshing the reminders configuration;
        # the returned list is not used here.
        reminders_util.getReminder_list('*')
        timed_alert(
            window, 2000, 'Warning',
            'Opening ' + str(len(filesToOpen)) + ' output ' + singularPlural +
            '... Please wait...', False)
        for path in filesToOpen:
            if os.path.isfile(path):
                if path.endswith('.kml'):
                    open_kmlFile(window, path)
                else:
                    openFile(window, path)
def activate_fileName_wellFormedness(*args):
    """Toggle widget availability when the 'filename checker' option changes.

    With the checker ON a one-time reminder is shown, the date-embedding
    checkbox becomes available and the sibling analyses are disabled; with
    the checker OFF the siblings are re-enabled.  NER is disabled and its
    variable cleared in either case.
    """
    checker_on = check_filename_var.get()
    if checker_on:
        reminders_util.checkReminder(config_filename, ["Filename checker"],
                                     '', True)
    # The date-embedding option is only meaningful while the checker is on.
    fileName_embeds_date_checkbox.configure(
        state="normal" if checker_on else "disabled")
    # NER is unavailable in both states; clear its variable as well.
    NER_checkbox.configure(state="disabled")
    NER_var.set(0)
    similarityIndex_Intruder_menu.configure(state="disabled")
    # Sibling analyses are the mirror image of the checker state.
    sibling_state = "disabled" if checker_on else "normal"
    for sibling in (character_home_checkbox,
                    missing_character_checkbox,
                    Levenshtein_checkbox,
                    character_checkbox,
                    intruder_checkbox,
                    plagiarist_checkbox):
        sibling.configure(state=sibling_state)
def checkUSSSUpdate():
    """Remind the user when the locally stored US Social Security gender
    data may be stale.

    Only runs when the dictionary annotator or the plot option is selected.
    If the recorded download year is two or more years behind the current
    year, a reminder pointing to the SSA baby-names download page is shown.
    """
    # Idiom fix: compare Tkinter boolean variables by truthiness instead of
    # the original `== True` comparisons.
    if annotator_dictionary_var.get() or plot_var.get():
        currentYear = datetime.now().year
        yearDiff = currentYear - last_SS_year_var.get()
        if yearDiff >= 2:
            reminders_util.checkReminder(
                config_filename,
                ['Time to download new US SS data'],
                'It has been more than two years since the US Social Security gender data have been downloaded to your machine.\n\nCheck on the US Social Security website whether more current data are available at US Social Security website\n\nhttps://www.ssa.gov/oact/babynames/limits.html',
                True)
def display_warning(*args):
    """Show the 'Open Google Earth GUI' reminder when the second GIS
    package checkbox is ticked; do nothing otherwise."""
    if not GIS_package2_var.get():
        return
    reminders_util.checkReminder(
        config_filename,
        ['Open Google Earth GUI'],
        'You should tick the Open GUI checkbox ONLY if you wish to open the GUI.\n\nThe Google Earth Pro GUI will provide a number of options to personalize a Google Earth Pro map. Press Run after selecting the Open GUI option.',
        True)
    # Refresh the routine options once the reminder has been handled.
    reminders_util.getReminder_list(config_filename)
def activate_filenameEmbedsDate(*args):
    """Toggle the GUI widgets tied to the 'Plagiarist' option.

    Plagiarist OFF: re-enable the sibling analyses (filename checker,
    character / Levenshtein / intruder tools), clear and disable the
    filename-embeds-date option.  Plagiarist ON: show a reminder, enable
    the Plagiarist similarity menu and disable the sibling analyses.
    """
    if not plagiarist_var.get():
        # Fix: the original issued two identical consecutive
        # configure(state="disabled") calls on this menu; one suffices.
        similarityIndex_Plagiarist_menu.configure(state="disabled")
        check_filename_checkbox.configure(state='normal')
        character_checkbox.configure(state="normal")
        character_home_checkbox.configure(state="normal")
        missing_character_checkbox.configure(state="normal")
        Levenshtein_checkbox.configure(state='normal')
        intruder_checkbox.configure(state="normal")
        fileName_embeds_date_checkbox.configure(state="disabled")
        fileName_embeds_date.set(0)
    else:
        reminders_util.checkReminder(config_filename, ["Plagiarist"], '',
                                     True)
        similarityIndex_Plagiarist_menu.configure(state="normal")
        check_filename_checkbox.configure(state='disabled')
        character_checkbox.configure(state="disabled")
        character_home_checkbox.configure(state="disabled")
        Levenshtein_checkbox.configure(state='disabled')
        missing_character_checkbox.configure(state="disabled")
        intruder_checkbox.configure(state="disabled")
        fileName_embeds_date_checkbox.configure(state="normal")
def create_js(output_filename, locations, api_key, geocoder, latLongList):
    """Build the JavaScript LatLng literals for a Google Maps heatmap and
    hand them to create_google_heatmap(), then show the API-key reminder.

    output_filename -- destination passed through to create_google_heatmap.
    locations       -- location names to geocode, or (see NOTE below)
                       already-geocoded [lat, long] pairs.
    api_key         -- Google Maps API key.
    geocoder        -- geocoder handle used by nominatim_geocode.
    latLongList     -- pre-computed [lat, long] pairs; when empty/None the
                       locations are geocoded via Nominatim instead.
    """
    gmaps_list = []
    if not latLongList:
        # No coordinates supplied: geocode each location through Nominatim.
        latLongList = []
        for l in locations:
            returned_loc = GIS_geocode_util.nominatim_geocode(geocoder, l)
            latLongList.append([returned_loc.latitude, returned_loc.longitude])
    else:
        # NOTE(review): the supplied latLongList is discarded here and
        # replaced by `locations`; presumably in this branch `locations`
        # already holds [lat, long] pairs — confirm against the callers.
        latLongList = locations
    for item in latLongList:
        # Each pair becomes a "new google.maps.LatLng(lat, long)," literal
        # for the heatmap's JavaScript array.
        gmaps_str = ''.join([
            "new google.maps.LatLng(",
            str(item[0]), ", ",
            str(item[1]), "),"
        ])
        gmaps_list.append(gmaps_str)
    # gmaps_list geocoded values
    create_google_heatmap(output_filename, gmaps_list, api_key)
    config_filename = 'GIS-config.txt'
    reminders_util.checkReminder(
        config_filename, ['Google Maps API'],
        'If the heatmap produced by Google Maps is displayed correctly for a split second and then displays "Oops! Something went wrong" you probably:\n 1. pasted incorrectly into the API key widget the Google API key;\n 2. you may not have entered billing information when applying for an API key; billing information is required although it is VERY unlikely you will be charged since you are not producing maps on a massive scale;\n 3. you may not have enabled the Maps JavaScript API (and if you use Google for geocoding, you also need to enable the Geocoding API.\n\nPlease, check the API key, your billing information, and tthe API enabled and try again.',
        True)
def check_requirements(*args):
    """Validate the IO selections against the ticked analysis options.

    Warns via message boxes / reminders when:
      - sentiment analysis or corpus statistics is selected without an
        input directory, or with a directory containing no txt files;
      - sentiment analysis is selected (Stanford CoreNLP + Java needed);
      - a data-reduction option is selected without a csv input file, or
        with a csv file holding fewer than 50 documents.
    """
    inputDir = GUI_util.input_main_dir_path.get()
    inputFilename = GUI_util.inputFilename.get()
    sentimentAnalysis = sentiment_analysis_var.get()
    # Fix: the original condition read
    #   inputDir=='' and sentimentAnalysis == True or corpus_analysis_var.get() == True
    # Because `and` binds tighter than `or`, it fired whenever corpus
    # statistics was ticked even with a valid input directory selected.
    # Parentheses restore the intended "no directory AND (either option)".
    if inputDir == '' and (sentimentAnalysis or corpus_analysis_var.get()):
        mb.showwarning(
            title='Input folder error',
            message='The \'Sentiment analysis\' and \'Compute & visualize corpus statistcs\' options require in input a set of txt files for which to compute sentiment scores and/or corpus statistics.\n\nPlease, use the IO widget \'Select INPUT files directory\' to select the appropriate directory and try again.')
        return
    # Same precedence fix for the "directory selected" counterpart.
    if inputDir != '' and (sentiment_analysis_var.get()
                           or corpus_analysis_var.get()):
        nSAscoreFiles = IO_files_util.GetNumberOfDocumentsInDirectory(
            inputDir, 'txt')
        if nSAscoreFiles == 0:
            mb.showwarning(
                title="Directory error",
                message="Sentiment Analysis and Corpus Statistics algorithms require in input a set of txt files for which to compute sentiment scores and/or create statistics. The selected input directory\n\n" + inputDir + "\n\ndoes not contain any txt files.\n\nPlease, select a different directory (or untick the checkboxes 'Sentiment Analysis' and/or 'Compute & visualize corpus statistics') and try again.")
            return
    if sentiment_analysis_var.get():
        # Stanford CoreNLP and Java are external requirements of this option.
        title_options = [
            'Stanford CoreNLP Sentiment Analysis system requirements'
        ]
        message = 'The Stanford CoreNLP Sentiment Analysis tool requires two components.\n\n1. A copy of the FREEWARE Stanford CoreNLP suite installed on your machine. You can download the FREEWARE Stanford CoreNLP at https://stanfordnlp.github.io/CoreNLP/download.html.\n\n2. CoreNLP, in turn, requires to have the FREEWARE Java installed. You can download and install the FREEWARE JAVA at https://www.java.com/en/download/'
        reminders_util.checkReminder(config_filename, title_options, message,
                                     True)
        return
    if inputFilename == '' and sentiment_analysis_var.get() == False and corpus_analysis_var.get() == False and (
            hierarchical_clustering_var.get() or SVD_var.get()
            or NMF_var.get()):
        mb.showwarning(
            title="Data warning: Data reduction algorithms",
            message="Data reduction algorithms (Hierarchical Clustering, Singular Value Decomposition, Non-Negative Matrix Factorization) require in input a csv file of sentiment analysis scores.\n\nPlease, use the IO widget \'Select INPUT csv file\' and try again.")
    if inputFilename != '' and sentiment_analysis_var.get() == False and corpus_analysis_var.get() == False and (
            hierarchical_clustering_var.get() or SVD_var.get()
            or NMF_var.get()):
        nSAscoreFiles = IO_csv_util.GetNumberOfDocumentsInCSVfile(
            inputFilename, 'Shape of Stories')
        if nSAscoreFiles is None:
            return
        # nSAscoreFiles = IO_files_util.GetNumberOfDocumentsInDirectory(inputDir, 'csv')
        if nSAscoreFiles < 50:
            mb.showwarning(
                title="Data warning: Data reduction algorithms",
                message="Data reduction algorithms (Hierarchical Clustering, Singular Value Decomposition, Non-Negative Matrix Factorization) require in input a csv file of sentiment analysis scores for a large number of documents (at least 50). The selected input file\n\n" + inputFilename + "\n\ncontains only " + str(
                    nSAscoreFiles) + " files. TOO FEW!\n\nYou REALLY should select a different csv file and try again.")
def display_reminder(*args):
    """Show the reminder matching the selected sentiment analysis
    algorithm (Stanford CoreNLP, VADER, SentiWordNet); no-op otherwise."""
    if SA_algorithm_var.get() == 'Stanford CoreNLP':
        message = 'The Stanford CoreNLP Sentiment Analysis tool requires two components.\n\n1. A copy of the FREEWARE Stanford CoreNLP suite installed on your machine. You can download the FREEWARE Stanford CoreNLP at https://stanfordnlp.github.io/CoreNLP/download.html.\n\n2. CoreNLP, in turn, requires to have the FREEWARE Java installed. You can download and install the FREEWARE JAVA at https://www.java.com/en/download/'
        title_option = [SA_algorithm_var.get()]
        reminders_util.checkReminder(config_filename, title_option, message,
                                     True)
    elif SA_algorithm_var.get() == 'VADER':
        message = 'VADER heavily relies on a number of NLTK libraries. If VADER fails to run, make sure that in command line you run\n\npython -m nltk.downloader all'
        title_option = [SA_algorithm_var.get()]
        reminders_util.checkReminder(config_filename, title_option, message,
                                     True)
        # Clarity fix: the original `mean_var.get() or median_var.get() == True`
        # compared only the median operand to True; both flags are meant to
        # be tested as booleans (same result, clearer intent).
        if mean_var.get() or median_var.get():
            message = 'VADER cannot compute sentence mean and median values because VADER computes a single compound value for the entire sentence.\n\nUse the hedonometer to compute separate values and word list of words found.'
            title_option = ['VADER Mean/Median']
            reminders_util.checkReminder(config_filename, title_option,
                                         message, True)
    elif SA_algorithm_var.get() == 'SentiWordNet':
        message = 'SentiWordNet does not compute sentence mean and median values nor does it display a list of the individual words found.'
        title_option = ['SentiWordNet']
        reminders_util.checkReminder(config_filename, title_option, message,
                                     True)
    else:
        return
# --- Help buttons, README message, bottom bar and main loop of the GIS GUI ---
# NOTE(review): `window`, `help_button_x_coordinate`, `basic_y_coordinate`
# and `y_step` used by the first two calls are not defined at this level in
# the visible source; they look like parameters of a help_buttons(...)
# definition whose header may have been lost in this chunk — confirm
# against the full file.
GUI_IO_util.place_help_button(
    window, help_button_x_coordinate, basic_y_coordinate + y_step * 8, "Help",
    "Please, tick the checkbox if you wish to MAP a list of geococed locations.\n\nUsing the dropdown menu, select the GIS (Geographic Information System) package you wish to use to produce maps.\n\nGoogle Maps requires an API key that you obtain from registering.\n\nWhen selecting Google Maps, the API key field will become available.\n\nYou will need to get the API key from the Google console and entering it there. REMEMBER! When applying for an API key you will need to enter billing information; billing information is required although it is VERY unlikely you will be charged since you are not producing maps on a massive scale.\n https://developers.google.com/maps/documentation/embed/get-api-key.\n\nAfter entering the Google API key, click OK to save it and the key will be read in automatically next time around.\n\nTick the Open GUI checkbox ONLY if you wish to open the Google Earth Pro GUI for more options. Do not tick the checkbox if you wish to run the pipeline automatically from text to maps."
    + GUI_IO_util.msg_Esc)
GUI_IO_util.place_help_button(window, help_button_x_coordinate,
                              basic_y_coordinate + y_step * 9, "Help",
                              GUI_IO_util.msg_openOutputFiles)
help_buttons(window, GUI_IO_util.get_help_button_x_coordinate(),
             GUI_IO_util.get_basic_y_coordinate(), GUI_IO_util.get_y_step())

# change the value of the readMe_message
readMe_message = "This Python 3 script allows users to go from text to map in three steps:\n\n1. EXTRACT locations from a text file using Stanford CoreNLP NER extractor (NER values: CITY, STATE_OR_PROVINCE, COUNTRY);\n2. GEOCODE locations, previously extracted, using Nominatim or Google (an API is needed for Google);\n3. MAP locations, previously geocoded, using a selected GIS package (e.g., Google Earth Pro; Google Maps to produce heat maps; Google Maps requires an API key).\n\nOptions are preset and\or disabled depending upon the input type (directory or file; txt or csv file; csv CoNLL file or list of locations to be geocoded or already geocoded).\n\nAll three steps can be selected and carried out in sequence in a pipeline, going automatically from text to map."
readMe_command = lambda: GUI_IO_util.readme_button(
    window, GUI_IO_util.get_help_button_x_coordinate(),
    GUI_IO_util.get_basic_y_coordinate(), "Help", readMe_message)
# Draw the standard bottom bar (README, TIPS, IO options).
GUI_util.GUI_bottom(config_input_output_options, y_multiplier_integer,
                    readMe_command, TIPS_lookup, TIPS_options)

# routine_options = reminders_util.getReminder_list(config_filename)
# One-time reminder explaining how the GUI presets its options by input type.
result = reminders_util.checkReminder(
    config_filename, ['GIS GUI options'],
    'The options available on the GUI have been automatically set for you depending upon the type of input file selected: txt or csv.\n\nWith a TXT file, NER extraction via Stanford CoreNLP must be first performed.\n\nWith a CSV file, the script checks whether the file is a CoNLL table, a geocoded file containing latitude and longitude values, or a file containing a list of locations that need to be geocoded.'
)
if result != None:
    routine_options = reminders_util.getReminder_list(config_filename)

# Hand control to the Tkinter event loop.
GUI_util.window.mainloop()
def run(inputFilename, inputDir, outputDir, openOutputFiles,
        createExcelCharts, memory_var, manual_Coref, parser, parser_menu_var,
        dateInclude, sep, date_field_position, dateFormat,
        compute_sentence_var, CoNLL_table_analyzer_var, CoreNLP_annotators_var,
        CoreNLP_annotators_menu_var):
    """Dispatch the Stanford CoreNLP tools selected in the GUI.

    Checks internet and Java availability, then runs in order: coreference
    resolution, the parser (PCFG or neural), the CoNLL table analyzer, and
    the individual CoreNLP annotators chosen in the menu (POS, DepRel, NER,
    normalized dates, quotes, gender, sentiment, OpenIE).  Output files are
    accumulated in filesToOpen and opened at the end when requested.
    """
    # check internet connection
    filesToOpen = []
    if not IO_internet_util.check_internet_availability_warning(
            "Stanford CoreNLP"):
        return
    errorFound, error_code, system_output = IO_libraries_util.check_java_installation(
        'Stanford CoreNLP')
    if errorFound:
        return
    if parser == 0 and CoNLL_table_analyzer_var == 0 and CoreNLP_annotators_var == 0:
        # NOTE(review): execution continues after this warning (no return);
        # with no options selected nothing below runs anyway, but confirm
        # the missing return is intentional.
        mb.showinfo(
            "Warning",
            "No options have been selected.\n\nPlease, select an option and try again."
        )
    # Coreference resolution ------------------------------------------------
    if CoreNLP_annotators_var == True and 'Coreference PRONOMINAL resolution' in CoreNLP_annotators_menu_var:
        if IO_libraries_util.inputProgramFileCheck(
                "Stanford_CoreNLP_coReference_util.py") == False:
            return
        if "Neural" in CoreNLP_annotators_menu_var:
            CoRef_Option = 'Neural Network'
        # NOTE(review): CoRef_Option is only assigned when "Neural" appears
        # in the menu selection; any other coreference choice would raise
        # NameError on the call below — confirm against the menu values.
        file_open, error_indicator = Stanford_CoreNLP_coreference_util.run(
            inputFilename, inputDir, outputDir, openOutputFiles,
            createExcelCharts, memory_var, CoRef_Option, manual_Coref)
        if error_indicator == 0:
            IO_user_interface_util.timed_alert(
                GUI_util.window, 4000,
                'Stanford CoreNLP Co-Reference Resolution',
                'Finished running Stanford CoreNLP Co-Reference Resolution using the '
                + CoRef_Option + ' approach at', True)
        else:
            mb.showinfo(
                "Coreference Resolution Error",
                "Since Stanford CoreNLP Co-Reference Resolution throws error, "
                +
                "and you either didn't choose manual Co-Reference Resolution or manual Co-Referenece Resolution fails as well, the process ends now."
            )
        filesToOpen = filesToOpen + file_open
    outputCoNLLfilePath = ''
    # parser ---------------------------------------------------------------------------------------------------------------------------
    if parser:
        # Parser ------------------------------
        if parser_menu_var == 'Probabilistic Context Free Grammar (PCFG)' or parser_menu_var == 'Neural Network':
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return
            if parser_menu_var == 'Probabilistic Context Free Grammar (PCFG)':
                tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                    inputFilename, inputDir, outputDir, openOutputFiles,
                    createExcelCharts, 'parser (pcfg)', False, memory_var,
                    extract_date_from_filename_var=dateInclude,
                    date_format=dateFormat,
                    date_separator_var=sep,
                    date_position_var=date_field_position)
            else:
                # Parser (Neural Network) ------------------------------
                tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                    inputFilename, inputDir, outputDir, openOutputFiles,
                    createExcelCharts, 'parser (nn)', False, memory_var,
                    extract_date_from_filename_var=dateInclude,
                    date_format=dateFormat,
                    date_separator_var=sep,
                    date_position_var=date_field_position)
            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)
            if compute_sentence_var:
                # Derive the per-sentence table from the first parser output.
                tempOutputFile = IO_CoNLL_util.compute_sentence_table(
                    tempOutputFiles[0], outputDir)
                filesToOpen.append(tempOutputFile)
    if CoNLL_table_analyzer_var and len(filesToOpen) > 0:
        if IO_libraries_util.inputProgramFileCheck(
                'CoNLL_table_analyzer_main.py') == False:
            return
        # open the analyzer having saved the new parser output in config so
        # that it opens the right input file
        if parser:
            config_filename_temp = 'conll-table-analyzer-config.txt'
            # NOTE(review): outputCoNLLfilePath is still '' at this point —
            # it is never assigned from the parser output above; confirm
            # whether the analyzer config should receive the new CoNLL file.
            config_array = [
                'EMPTY LINE', outputCoNLLfilePath, 'EMPTY LINE', 'EMPTY LINE',
                'EMPTY LINE', outputDir
            ]
            config_util.saveConfig(GUI_util.window, config_filename_temp,
                                   config_array, True)
        reminders_util.checkReminder(
            config_filename, ['CoNLL table analyzer'],
            "The Stanford CoreNLP GUI will now open the 'CoNLL table analyzer' where you can:\n\n 1. search the words contained in the CoNLL table (the one just created or a different one) by their syntactical properties and the type of relations to other words;\n 2. compute frequency distributions of various types of linguistic objects: clauses, nouns, verbs, function words ('junk/stop' words).",
            True)
        call("python CoNLL_table_analyzer_main.py", shell=True)
    if CoreNLP_annotators_var and CoreNLP_annotators_menu_var != '':
        # POS annotator ---------------------------------------------------------------------------------------------------------------------------
        if 'POS annotator' in CoreNLP_annotators_menu_var or CoreNLP_annotators_menu_var == '*':
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return
            tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, 'All POS', False, memory_var,
                extract_date_from_filename_var=dateInclude,
                date_format=dateFormat,
                date_separator_var=sep,
                date_position_var=date_field_position)
            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)
        # DepRel annotator ---------------------------------------------------------------------------------------------------------------------------
        if 'DepRel annotator' in CoreNLP_annotators_menu_var or CoreNLP_annotators_menu_var == '*':
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return
            tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, 'DepRel', False, memory_var,
                extract_date_from_filename_var=dateInclude,
                date_format=dateFormat,
                date_separator_var=sep,
                date_position_var=date_field_position)
            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)
        # NER annotator ---------------------------------------------------------------------------------------------------------------------------
        if 'NER (GUI)' in CoreNLP_annotators_menu_var:
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_NER_main.py') == False:
                return
            call("python Stanford_CoreNLP_NER_main.py", shell=True)
        # NER normalized date annotator ---------------------------------------------------------------------------------------------------------------------------
        if 'Normalized' in CoreNLP_annotators_menu_var or '**' in CoreNLP_annotators_menu_var:
            # date_extractor
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return
            tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, 'normalized-date', False, memory_var,
                extract_date_from_filename_var=dateInclude,
                date_format=dateFormat,
                date_separator_var=sep,
                date_position_var=date_field_position)
            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)
        # quote annotator ---------------------------------------------------------------------------------------------------------------------------
        if 'Quote' in CoreNLP_annotators_menu_var or '**' in CoreNLP_annotators_menu_var:
            # if quote_extractor:
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return
            tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, 'quote', False, memory_var,
                extract_date_from_filename_var=dateInclude,
                date_format=dateFormat,
                date_separator_var=sep,
                date_position_var=date_field_position)
            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)
        # gender annotator ---------------------------------------------------------------------------------------------------------------------------
        if 'Gender' in CoreNLP_annotators_menu_var or '**' in CoreNLP_annotators_menu_var:
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return
            tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, 'gender', False, memory_var,
                extract_date_from_filename_var=dateInclude,
                date_format=dateFormat,
                date_separator_var=sep,
                date_position_var=date_field_position)
            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)
        # Sentiment analysis annotator ---------------------------------------------------------------------------------------------------------------------------
        if 'Sentiment analysis' in CoreNLP_annotators_menu_var:
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return
            tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, 'sentiment', False, memory_var,
                extract_date_from_filename_var=dateInclude,
                date_format=dateFormat,
                date_separator_var=sep,
                date_position_var=date_field_position)
            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)
        # OpenIE SVO extractor ---------------------------------------------------------------------------------------------------------------------------
        if 'OpenIE' in CoreNLP_annotators_menu_var:
            if IO_libraries_util.inputProgramFileCheck(
                    'Stanford_CoreNLP_annotator_util.py') == False:
                return
            IO_user_interface_util.script_under_development(
                'Stanford CoreNLP OpenIE')
            tempOutputFiles = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, 'openIE', False, memory_var,
                extract_date_from_filename_var=dateInclude,
                date_format=dateFormat,
                date_separator_var=sep,
                date_position_var=date_field_position)
            if len(tempOutputFiles) > 0:
                filesToOpen.extend(tempOutputFiles)
    if openOutputFiles:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
def run(inputFilename, inputDir, outputDir, openOutputFiles,
        createExcelCharts, utf8_var, ASCII_var, corpus_statistics_var,
        corpus_options_menu_var, topics_var, topics_Mallet_var,
        topics_Gensim_var, open_GUI_var, what_else_var, what_else_menu_var,
        memory_var):
    """Run the selected 'what is in your corpus' analyses.

    In order: utf-8 compliance check, character conversion, corpus
    statistics, topic modeling (Gensim and/or Mallet), then the CoreNLP
    'what else' annotators (nouns/verbs via WordNet, NER, gender, quotes,
    dates, locations).  Output files are accumulated in filesToOpen and
    opened at the end when requested.
    """
    filesToOpen = []
    inputFilename = ''  # only corpus in dir used
    # Fix: the original tested `what_else_var == False` twice; the duplicate
    # clause has been removed.
    # NOTE(review): corpus_options_menu_var is a menu string, so comparing it
    # to False can never be True and this warning may never fire — confirm
    # the intended test (perhaps == '').
    if (corpus_statistics_var == False and corpus_options_menu_var == False
            and topics_Mallet_var == False and topics_Gensim_var == False
            and what_else_var == False):
        mb.showwarning(
            title='No options selected',
            message=
            'No options have been selected.\n\nPlease, select an option and try again.'
        )
        return
    if utf8_var == True:
        IO_user_interface_util.timed_alert(
            GUI_util.window, 7000, 'Analysis start',
            'Started running utf8 compliance test at', True)
        file_utf8_compliance_util.check_utf8_compliance(
            GUI_util.window, inputFilename, inputDir, outputDir,
            openOutputFiles)
    if ASCII_var == True:
        IO_user_interface_util.timed_alert(
            GUI_util.window, 7000, 'Analysis start',
            'Started running characters conversion at', True)
        file_cleaner_util.convert_quotes(GUI_util.window, inputFilename,
                                         inputDir)
    if corpus_statistics_var == True:
        if IO_libraries_util.inputProgramFileCheck(
                'statistics_txt_util.py') == False:
            return
        lemmatize = False
        stopwords = False
        # Fix: the original conditions read "if '*' or 'stopwords' in ..."
        # and "if '*' or 'Lemmatize' in ..."; a non-empty string literal is
        # always truthy, so both flags were unconditionally set to True
        # regardless of the menu selection.  The membership test is now
        # applied to each option.
        if '*' in corpus_options_menu_var or 'stopwords' in corpus_options_menu_var:
            stopwords = True
        if '*' in corpus_options_menu_var or 'Lemmatize' in corpus_options_menu_var:
            lemmatize = True
        if '*' in corpus_options_menu_var or 'stopwords' in corpus_options_menu_var or 'Lemmatize' in corpus_options_menu_var:
            output = statistics_txt_util.compute_corpus_statistics(
                window, '', inputDir, outputDir, False, createExcelCharts,
                stopwords, lemmatize)
            if output != None:
                filesToOpen.extend(output)
        if '*' in corpus_options_menu_var or 'lines' in corpus_options_menu_var:
            output = statistics_txt_util.read_line(window, '', inputDir,
                                                   outputDir, False,
                                                   createExcelCharts)
            if output != None:
                filesToOpen.extend(output)
    if topics_var == True:
        if topics_Gensim_var == True:
            if IO_libraries_util.inputProgramFileCheck(
                    'topic_modeling_gensim_main.py') == False:
                return
            routine_options = reminders_util.getReminder_list(config_filename)
            reminders_util.checkReminder(
                config_filename, ['What is in your corpus - Gensim'],
                'The Gensim topic modeling routine run from here is a reduced version of the script, meant to provide a quick overview of the topics in your corpus.\n\nFor a more in-depth analysis of topics, use the topic modeling scripts for Gensim and Mallet.',
                True)
            routine_options = reminders_util.getReminder_list(config_filename)
            if open_GUI_var == True:
                call("python topic_modeling_gensim_main.py", shell=True)
            else:
                # run with all default values; do not run Mallet
                output = topic_modeling_gensim_util.run_Gensim(
                    GUI_util.window,
                    inputDir,
                    outputDir,
                    num_topics=20,
                    remove_stopwords_var=1,
                    lemmatize=1,
                    nounsOnly=0,
                    run_Mallet=False,
                    openOutputFiles=openOutputFiles,
                    createExcelCharts=createExcelCharts)
                if output != None:
                    filesToOpen.extend(output)
        if topics_Mallet_var == True:
            # def run(inputDir, outputDir, openOutputFiles, createExcelCharts, OptimizeInterval, numTopics):
            if open_GUI_var == True:
                call("python topic_modeling_mallet_main.py", shell=True)
            else:
                # running with default values
                output = topic_modeling_mallet_util.run(
                    inputDir,
                    outputDir,
                    openOutputFiles=openOutputFiles,
                    createExcelCharts=createExcelCharts,
                    OptimizeInterval=True,
                    numTopics=20)
                if output != None:
                    filesToOpen.extend(output)
    # Derive the individual 'what else' flags from the menu selection.
    nouns_var = False
    verbs_var = False
    dialogues_var = False
    people_organizations_var = False
    gender_var = False
    times_var = False
    locations_var = False
    nature_var = False  # NOTE(review): set below but never used afterwards.
    if what_else_var and what_else_menu_var == '*':
        nouns_var = True
        verbs_var = True
    if 'noun' in what_else_menu_var.lower():
        nouns_var = True
    if 'verb' in what_else_menu_var.lower():
        verbs_var = True
    if 'dialogue' in what_else_menu_var.lower():
        dialogues_var = True
    if 'people' in what_else_menu_var.lower():
        people_organizations_var = True
    if 'male' in what_else_menu_var.lower():
        # matches both 'male' and 'female' in the menu label
        gender_var = True
    if 'time' in what_else_menu_var.lower():
        times_var = True
    if 'location' in what_else_menu_var.lower():
        locations_var = True
    if 'nature' in what_else_menu_var.lower():
        nature_var = True
    if (what_else_var and what_else_menu_var == '*'
        ) or nouns_var == True or verbs_var == True or people_organizations_var == True or gender_var == True or dialogues_var == True or times_var == True or locations_var == True:
        if IO_libraries_util.inputProgramFileCheck(
                'Stanford_CoreNLP_annotator_util.py') == False:
            return
    if nouns_var or verbs_var:
        if nouns_var or verbs_var or what_else_menu_var == '*':
            WordNetDir = IO_libraries_util.get_external_software_dir(
                'whats_in_your_corpus', 'WordNet')
            if WordNetDir == None:
                return
            annotator = ['POS']
            files = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
                inputFilename, inputDir, outputDir, openOutputFiles,
                createExcelCharts, annotator, False, memory_var)
            if len(files) > 0:
                noun_verb = ''
                if verbs_var == True:
                    inputFilename = files[0]
                    # Verbs but... double check
                    if "verbs" in inputFilename.lower():
                        noun_verb = 'VERB'
                    else:
                        return
                    output = WordNet_util.ancestor_GoingUP(
                        WordNetDir, inputFilename, outputDir, noun_verb,
                        openOutputFiles, createExcelCharts)
                    if output != None:
                        filesToOpen.extend(output)
                if nouns_var == True:
                    inputFilename = files[1]
                    # Nouns but... double check
                    if "nouns" in inputFilename.lower():
                        noun_verb = 'NOUN'
                    else:
                        return
                    output = WordNet_util.ancestor_GoingUP(
                        WordNetDir, inputFilename, outputDir, noun_verb,
                        openOutputFiles, createExcelCharts)
                    if output != None:
                        filesToOpen.extend(output)
        else:
            # NOTE(review): this branch is unreachable as written (the inner
            # condition is implied by the outer one); in the mangled source
            # the else may have originally paired with a WordNet-availability
            # check so that a missing WordNet skips nouns/verbs but continues
            # with the other annotators — confirm against the original file.
            if (what_else_var and what_else_menu_var == '*'):
                IO_user_interface_util.timed_alert(
                    GUI_util.window, 4000, 'Missing WordNet',
                    'The analysis of \'what else is in your corpus\' will skip the nouns and verbs classification requiring WordNet and will continue with all other CoreNLP annotators'
                )
    if what_else_var and what_else_menu_var == '*':
        # Run all remaining annotators in one pass.
        annotator_list = ['NER', 'gender', 'quote', 'normalized-date']
        NER_list = [
            'PERSON', 'ORGANIZATION', 'CITY', 'STATE_OR_PROVINCE', 'COUNTRY'
        ]
        output = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
            inputFilename, inputDir, outputDir, openOutputFiles,
            createExcelCharts, annotator_list, False, memory_var,
            NERs=NER_list)
        if output != None:
            filesToOpen.extend(output)
    if people_organizations_var == True:
        annotator = 'NER'
        NER_list = ['PERSON', 'ORGANIZATION']
        output = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
            inputFilename, inputDir, outputDir, openOutputFiles,
            createExcelCharts, annotator, False, memory_var, NERs=NER_list)
        if output != None:
            filesToOpen.extend(output)
    if gender_var == True:
        annotator = 'gender'
        output = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
            inputFilename, inputDir, outputDir, openOutputFiles,
            createExcelCharts, annotator, False, memory_var)
        if output != None:
            filesToOpen.extend(output)
    if dialogues_var == True:
        annotator = 'quote'
        output = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
            inputFilename, inputDir, outputDir, openOutputFiles,
            createExcelCharts, annotator, False, memory_var)
        if output != None:
            filesToOpen.extend(output)
    if times_var == True:
        annotator = 'normalized-date'
        output = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
            inputFilename, inputDir, outputDir, openOutputFiles,
            createExcelCharts, annotator, False, memory_var)
        if output != None:
            filesToOpen.extend(output)
    if locations_var == True:
        annotator = 'NER'
        NER_list = ['CITY', 'STATE_OR_PROVINCE', 'COUNTRY']
        output = Stanford_CoreNLP_annotator_util.CoreNLP_annotate(
            inputFilename, inputDir, outputDir, openOutputFiles,
            createExcelCharts, annotator, False, memory_var, NERs=NER_list)
        if output != None:
            filesToOpen.extend(output)
    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
def language_detection(window, inputFilename, inputDir, outputDir, openOutputFiles,
                       createExcelCharts):
    """Detect the language of each txt document with three different algorithms.

    Runs LANGDETECT (detect_langs), SPACY (spacy-langdetect pipe), and LANGID
    on every txt file (a single inputFilename or every txt file in inputDir)
    and writes one csv row per document with the language and probability
    reported by each algorithm, plus a hyperlinked document reference.

    Fixes vs. previous version:
      - the spaCy pipeline and the LANGID identifier are loaded ONCE instead
        of once per document (loading 'en_core_web_sm' per file dominated
        the runtime);
      - a single csv row-writer is created instead of one per row;
      - the input file handle is closed via a with-block;
      - the output csv was appended to filesToOpen twice (opened twice);
      - redundant csvfile.close() inside the with-block removed;
      - broad bare excepts narrowed to `except Exception`.
    """
    fileID = 0
    filesToOpen = []
    outputFilenameCSV = IO_files_util.generate_output_file_name(
        inputFilename, inputDir, outputDir, '.csv', 'lang_detect')
    filesToOpen.append(outputFilenameCSV)
    files = IO_files_util.getFileList(inputFilename, inputDir, '.txt')
    if len(files) == 0:
        return
    # openCSVOutputFile returns truthy when the output csv cannot be written
    # (e.g., already open in Excel) — abort in that case
    if IO_csv_util.openCSVOutputFile(outputFilenameCSV):
        return
    fieldnames = [
        'LANGDETECT', 'Language', 'Probability',
        'SPACY', 'Language', 'Probability',
        'LANGID', 'Language', 'Probability',
        'Document ID', 'Document'
    ]
    config_filename = 'file-spell-checker-config.txt'
    reminders_util.checkReminder(
        config_filename, ['Language detection'],
        'Language detection algorithms are very slow. The NLP Suite runs three different types of algorithms: LANGDETECT, SPACY, and LANGID.\n\nPlease, arm yourself with patience, depending upon the number and size of documents processed.',
        True)
    IO_user_interface_util.timed_alert(
        GUI_util.window, 2000, 'Analysis start',
        'Started running language detection algorithms at', True,
        'You can follow the algorithms in command line.')
    # Load the expensive models ONCE, outside the per-document loop.
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    lang_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    with open(outputFilenameCSV, 'w', encoding='utf-8', errors='ignore',
              newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        # data rows are plain lists, so they go through a plain csv.writer;
        # created once here rather than once per row
        row_writer = csv.writer(csvfile)
        docErrors_empty = 0
        docErrors_unknown = 0
        filenameSV = ''  # last document already counted as an error
        for filename in files:
            fileID = fileID + 1
            head, tail = os.path.split(filename)
            print("Processing file " + str(fileID) + "/" + str(len(files)) + ' ' + tail)
            with open(filename, 'r', encoding='utf-8', errors='ignore') as docfile:
                text = docfile.read()
            if len(text) == 0:
                print(" The file is empty. It will be discarded from processing.")
                docErrors_empty = docErrors_empty + 1
                continue
            # 1. LANGDETECT
            try:
                value = detect_langs(text)
            except Exception:
                # do not count the same document twice in this and the other
                # algorithms that follow
                filenameSV = filename
                docErrors_unknown = docErrors_unknown + 1
                print(" Unknown file read error.")
                continue
            # detect_langs returns e.g. [cs:0.714, pl:0.142, sk:0.142];
            # keep the top candidate and split 'lang:probability'
            value = str(value[0]).split(':')
            language = value[0]
            probability = value[1]
            print(' LANGDETECT', language, probability)
            currentLine = ['LANGDETECT', language, probability]
            # 2. SPACY
            try:
                doc = nlp(text)
            except Exception:
                if filename != filenameSV:
                    # do not count the same document twice
                    docErrors_unknown = docErrors_unknown + 1
                filenameSV = filename
                print(" Unknown file read error.")
                continue
            value = doc._.language  # e.g. {'language': 'en', 'score': 0.9999978}
            language = value['language']
            probability = value['score']
            print(' SPACY', language, probability)
            currentLine.extend(['SPACY', language, probability])
            # 3. LANGID
            try:
                value = lang_identifier.classify(text)
            except Exception:
                if filename != filenameSV:
                    docErrors_unknown = docErrors_unknown + 1
                filenameSV = filename
                print(" Unknown file read error.")
                continue
            language = value[0]
            probability = value[1]
            print(' LANGID', language, probability)  # e.g. ('en', 0.999999999999998)
            print()
            currentLine.extend(['LANGID', language, probability])
            currentLine.extend(
                [fileID, IO_csv_util.dressFilenameForCSVHyperlink(filename)])
            row_writer.writerows([currentLine])
            filenameSV = filename
    # Build the summary message; the warning dialog is shown only when at
    # least one document was empty or unreadable.
    msg = ''
    if docErrors_empty == 0 and docErrors_unknown == 0:
        msg = str(fileID) + ' documents successfully processed for language detection.'
    else:
        if docErrors_empty > 0:
            msg = str(fileID) + ' documents processed for language detection.\n ' + str(
                docErrors_empty) + ' document(s) found empty.'
        if docErrors_unknown > 0:
            if msg != '':
                msg = msg + '\n ' + str(
                    docErrors_unknown) + ' document(s) read with unknown errors.'
            else:
                msg = str(fileID) + ' documents processed for language detection.\n ' + \
                      str(docErrors_unknown) + ' document(s) read with unknown errors.'
        mb.showwarning(
            title='File read errors',
            message=msg + '\n\nFaulty files are listed in command line/terminal. Please, search for \'File read error\' and inspect each file carefully.')
    # NOTE: outputFilenameCSV was already appended to filesToOpen at the top;
    # the previous second append made the file open twice.
    if createExcelCharts:
        columns_to_be_plotted = [[1, 1], [4, 4], [7, 7]]
        chart_title = 'Frequency of Languages Detected by 3 Algorithms'
        hover_label = ['LANGDETECT', 'SPACY', 'LANGID']
        inputFilename = outputFilenameCSV
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted, inputFilename, outputDir,
            outputFileLabel='_bar_chart', chart_type_list=["bar"],
            chart_title=chart_title, column_xAxis_label_var='Language',
            hover_info_column_list=hover_label, count_var=1)
        if Excel_outputFilename != '':
            filesToOpen.append(Excel_outputFilename)
    if openOutputFiles:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
def run(inputDir, outputDir, openOutputFiles, createExcelCharts, n_grams_var,
        n_grams_menu_var, n_grams_list, n_grams_viewer_var, CoOcc_Viewer_var,
        search_words, date_options, temporal_aggregation, date_format,
        date_separator_var, date_position_var, viewer_list):
    """Compute word/character n-grams and/or run the Java N-Grams /
    Co-Occurrences Viewer over a directory of txt files.

    Three stages, each gated by its checkbox variable:
      1. date validation — when date_options is set, every txt filename in
         inputDir must embed a parseable date at date_position_var;
      2. n-gram computation via statistics_txt_util;
      3. the external Java viewer (NGrams_CoOccurrences_Viewer.jar) run via
         subprocess, followed by Excel chart generation from its csv output.

    NOTE(review): config_filename is read but never assigned in this
    function — presumably a module-level global; verify before refactoring.
    """
    # print(date_options, temporal_aggregation, date_format, date_separator_var, date_position_var)
    filesToOpen = []
    total_file_number = 0
    error_file_number = 0
    error_filenames = []
    error_flag = False
    if n_grams_var == False and n_grams_viewer_var == False and CoOcc_Viewer_var == False:
        mb.showwarning(
            title='Warning',
            message=
            'There are no options selected.\n\nPlease, select one of the available options and try again.'
        )
        return
    # Stage 1: validate that every txt filename embeds a parseable date
    if date_options:
        # convert the GUI-style date format (yyyy/mm/dd) to strptime codes
        new_date_format = date_format.replace('yyyy', '%Y').replace(
            'mm', '%m').replace('dd', '%d')
        for folder, subs, files in os.walk(inputDir):
            for filename in files:
                if not filename.endswith('.txt'):
                    continue
                filename = filename.replace('.txt', '')
                total_file_number = total_file_number + 1
                try:
                    date_text = ''
                    date_text = filename.split(date_separator_var)[
                        date_position_var - 1]
                except:
                    # if a file in the folder has no date it will break the code
                    pass
                try:
                    datetime.datetime.strptime(date_text, new_date_format)
                except ValueError:
                    error_file_number = error_file_number + 1
                    error_filenames.append(
                        IO_csv_util.dressFilenameForCSVHyperlink(
                            os.path.join(folder, filename + '.txt')))
                    error_flag = True
        # any bad date aborts the whole run after exporting the error list
        if error_flag:
            df = pd.DataFrame(error_filenames, columns=[
                'File with date not in position ' + str(date_position_var)
            ])
            error_output = IO_files_util.generate_output_file_name(
                '', inputDir, outputDir, '.csv', 'Date_position_errors_file')
            df.to_csv(error_output, index=False)
            mb.showwarning(
                title='Warning',
                message='There are ' + str(error_file_number) +
                ' files out of ' + str(total_file_number) +
                ' processed in the selected input directory with errors in either the date format or the date position. \n\nThe selected date format is '
                + str(date_format) + ' and the selected date position is ' +
                str(date_position_var) +
                '.\n\nClick OK to open a csv file with a list of files with erroneous dates. Check carefully, both date format and date position. Any erroneous file will need to be fixed or removed from the input directory before processing. You may also simply need to select a different date format and/or date position.'
            )
            filesToOpen.append(error_output)
            if openOutputFiles == True:
                IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                              filesToOpen)
            return
    # COMPUTE Ngrams ______________________________________________________________________________
    # Stage 2: word or character n-grams, driven by the n_grams_list options
    if n_grams_var:
        n_grams_word_var = False
        n_grams_character_var = False
        normalize = False
        n_grams_size = 4  # default number of n_grams
        excludePunctuation = False
        bySentenceIndex_word_var = False
        bySentenceIndex_character_var = False
        if n_grams_menu_var == "Word":
            n_grams_word_var = True
        else:
            n_grams_character_var = True
            bySentenceIndex_character_var = False
        if 'Hapax' in str(n_grams_list):
            n_grams_size = 1
        if 'punctuation' in str(n_grams_list):
            excludePunctuation = True
        if 'sentence index' in str(n_grams_list):
            if n_grams_menu_var == "Word":
                bySentenceIndex_word_var = True
            else:
                bySentenceIndex_character_var = True
        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'N-Grams start',
            'Started running ' + n_grams_menu_var + ' n-grams at', True,
            'You can follow the script in command line.')
        if n_grams_word_var or n_grams_character_var or bySentenceIndex_word_var or bySentenceIndex_character_var:
            inputFilename = ''  # for now we only process a whole directory
            if IO_libraries_util.inputProgramFileCheck(
                    'statistics_txt_util.py') == False:
                return
            # third-to-last positional arg: 1 = word n-grams, 0 = character n-grams
            if n_grams_word_var or bySentenceIndex_word_var:
                statistics_txt_util.compute_character_word_ngrams(
                    GUI_util.window, inputFilename, inputDir, outputDir,
                    n_grams_size, normalize, excludePunctuation, 1,
                    openOutputFiles, createExcelCharts,
                    bySentenceIndex_word_var)
            if n_grams_character_var or bySentenceIndex_character_var:
                statistics_txt_util.compute_character_word_ngrams(
                    GUI_util.window, inputFilename, inputDir, outputDir,
                    n_grams_size, normalize, excludePunctuation, 0,
                    openOutputFiles, createExcelCharts,
                    bySentenceIndex_character_var)
        IO_user_interface_util.timed_alert(
            GUI_util.window, 3000, 'N-Grams end',
            'Finished running ' + n_grams_menu_var + ' n-grams at', True)
    # VIEWER ____________________________________________________________________________________________
    # Stage 3: the external Java viewer; a long series of precondition checks
    if (n_grams_viewer_var == False and CoOcc_Viewer_var == False):
        return
    if (n_grams_viewer_var == True or CoOcc_Viewer_var == True) and (createExcelCharts == False):
        mb.showwarning(
            title='Warning',
            message=
            'The checkbox to compute Excel charts is unticked. Since the VIEWER produces Excel charts as output, the script will abort.\n\nPlease, tick the checkbox to produce Excel charts and try again.'
        )
        return
    txtCounter = len(glob.glob1(inputDir, "*.txt"))
    if txtCounter == 0:
        mb.showwarning(
            title='Warning',
            message=
            'There are no files with txt extension in the selected directory.\n\nPlease, select a different directory and try again.'
        )
        return
    if txtCounter == 1:
        mb.showwarning(
            title='Warning',
            message=
            'There is only one file with txt extension in the selected directory. The script requires at least two files.\n\nPlease, select a different directory and try again.'
        )
        return
    if (n_grams_viewer_var or CoOcc_Viewer_var):
        if IO_libraries_util.inputProgramFileCheck(
                'NGrams_CoOccurrences_Viewer.jar') == False:
            return
        errorFound, error_code, system_output = IO_libraries_util.check_java_installation(
            'Ngram/CoOccurrence Viewer')
        if errorFound:
            return
    # search terms must be blank-separated, not comma-separated
    if ',' in search_words:
        mb.showwarning(
            title='Warning',
            message=
            'Values entered in the search bar should not be comma-separated, but blank-separated (e.g., woman man, and not woman, man).\n\nPlease, check your search bar values and try again.'
        )
        return
    if search_words != '' and n_grams_viewer_var == False and CoOcc_Viewer_var == False:
        mb.showwarning(
            title='Warning',
            message="You have entered the string '" + search_words +
            "' in the Search widget but you have not selected which Viewer you wish to use, Ngram or Co-Occurrence.\n\nPlease, select an option and try again."
        )
        return
    if search_words == '' and (n_grams_viewer_var == True or CoOcc_Viewer_var == True):
        mb.showwarning(
            title='Warning',
            message=
            "You have selected to run a Viewer but you have not entered any search strings in the Search widget.\n\nPlease, enter search values and try again."
        )
        return
    # decode the multi-select viewer options into individual flags
    normalize = False
    scaleData = False
    useLemma = False
    fullInfo = False
    if 'Normalize' in str(viewer_list):
        normalize = True
    if 'Scale' in str(viewer_list):
        scaleData = True
    if 'Lemmatize' in str(viewer_list):
        useLemma = True
    if 'full information' in str(viewer_list):
        fullInfo = True
    # assemble the Java command line incrementally
    cmd = [
        'java', '-jar', 'NGrams_CoOccurrences_Viewer.jar', '-inputFolder',
        inputDir, '-outputFolder', outputDir
    ]
    if (n_grams_viewer_var == 1 or CoOcc_Viewer_var == 1) and len(search_words) == 0:
        mb.showwarning(
            title='Warning',
            message=
            'No search words have been entered for either N-Grams or words co-occurrences.\n\nPlease, enter the search words and try again.'
        )
        return
    if n_grams_viewer_var == 1 and len(search_words) > 0:
        # the N-Grams viewer requires date metadata embedded in filenames
        if date_options == 0:
            mb.showwarning(
                title='Warning',
                message=
                'No Date options selected. The N-Grams routine requires date metadata (i.e., date information embedded in the document filenames, e.g., The New York Times_12-18-1899).\n\nPlease, tick the Date options checkbox, enter the appropariate date options and try again.'
            )
            return
        ngram_list = processSearchWords(search_words)
        ngram_list = ['-checkNGrams'] + ngram_list
        cmd.extend(ngram_list)
        if date_options == 1:
            cmd.extend([
                '-AggregateBy', temporal_aggregation, '-dateFormat',
                date_format, '-datePos', str(date_position_var),
                '-itemsDelim', date_separator_var
            ])
    if CoOcc_Viewer_var == 1 and len(search_words) > 0:
        co_occurrences_list = processSearchWords(search_words)
        co_occurrences_list = ["-checkCoOccurrences"] + co_occurrences_list
        cmd.extend(co_occurrences_list)
    if normalize == 1 and n_grams_viewer_var == 1 and len(search_words) > 0:
        cmd.append('-normalize')  # only available for Ngrams
    if scaleData == 1:
        cmd.append('-scaledata')
    if useLemma == 1:
        cmd.append('-lemma')
    if fullInfo == 1:
        cmd.append('-fullInfo')
    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'N-Grams Word Co-Occurrences start',
        'Started running N-Grams Word Co-Occurrences Viewer at', True,
        'Please, be patient. Depending upon the number of documents processed this may take a few minutes.\n\nYou can follow the script in command line.'
    )
    reminders_util.checkReminder(
        config_filename, ['subprocess.call(cmd) error'],
        'subprocess.call(cmd) error\n\nIf the VIEWER you are running exits with an error code about a file not found, most likely your selected INPUT & OUTPUT directory options are too long for Windows to handle.\n\nYou may need to move your input and output folders so as to have a shorter path (e.g., desktop).',
        True)
    print(cmd)
    try:
        subprocess.run(cmd, shell=True)
    except:
        mb.showwarning(
            title='Warning',
            message=
            "The Java viewer script exited with errors. Please, check your command line for a possible error 'Java' is not recognized as an internal or external command. If that's the case, please install Java JDK. Please, check the TIPS on Java download and installation and try again."
        )
        return
    if n_grams_viewer_var == 1 and len(search_words) > 0:
        # this is the output filename generated by the Java script
        n_grams_outputFile = os.path.join(outputDir, 'Searched_N-Grams.csv')
        if IO_files_util.checkFile(n_grams_outputFile, '.csv', True) == False:
            mb.showwarning(
                title='Warning',
                message=
                "The Java viewer script did not produce an N-grams output file.\n\nPlease, check your command line for possible Java errors and try again."
            )
            return
    if CoOcc_Viewer_var == 1 and len(search_words) > 0:
        # this is the output filename generated by the Java script
        co_occurrences_outputFile = os.path.join(outputDir,
                                                 'Searched_CoOccurrences.csv')
        if IO_files_util.checkFile(co_occurrences_outputFile, '.csv', True) == False:
            mb.showwarning(
                title='Warning',
                message=
                "The Java viewer script did not produce a Co-occurrences output file.\n\nPlease, check your command line for possible Java errors and try again."
            )
            return
    # plot co-occurrences
    if createExcelCharts == True and CoOcc_Viewer_var == 1 and len(
            search_words) > 0:
        xlsxFilename = co_occurrences_outputFile
        filesToOpen.append(co_occurrences_outputFile)
        chartTitle = 'Co-Occurrences Viewer'
        if date_options == 0:
            xAxis = 'Document'
        else:
            xAxis = temporal_aggregation
        hover_label = ['More information']
        # pie chart by document; line chart over the temporal aggregation
        if xAxis == 'Document':
            columns_to_be_plotted = [[1, 1]]
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted, xlsxFilename, outputDir,
                'Co-Occ_viewer', chart_type_list=["pie"],
                chart_title=chartTitle, column_xAxis_label_var=xAxis,
                hover_info_column_list=hover_label, count_var=1)
        else:
            columns_to_be_plotted = [[0, 1]]
            Excel_outputFilename = Excel_util.run_all(
                columns_to_be_plotted, xlsxFilename, outputDir,
                'Co-Occ_viewer', chart_type_list=["line"],
                chart_title=chartTitle, column_xAxis_label_var=xAxis,
                hover_info_column_list=hover_label)
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)
    # plot Ngrams
    if createExcelCharts == True and n_grams_viewer_var == 1 and len(
            search_words) > 0:
        xlsxFilename = n_grams_outputFile
        filesToOpen.append(n_grams_outputFile)
        xAxis = temporal_aggregation
        chartTitle = 'N-Grams Viewer'
        columns_to_be_plotted = []
        for i in range(len(ngram_list) - 1):
            # it will iterate through i = 0, 1, 2, …., n-1
            columns_to_be_plotted.append([0, i + 1])
        hover_label = [
            'Total Word Count of This Group', 'Total Word Count of This Group',
            'Total Word Count of This Group'
        ]
        Excel_outputFilename = Excel_util.run_all(
            columns_to_be_plotted, xlsxFilename, outputDir, 'n-grams_viewer',
            chart_type_list=["line"], chart_title=chartTitle,
            column_xAxis_label_var=xAxis, hover_info_column_list=hover_label)
        if Excel_outputFilename != "":
            filesToOpen.append(Excel_outputFilename)
    # with both Ngrams and co-occurrences
    # NOTE(review): `CoOcc_Viewer_var == 1` is tested twice in this condition —
    # harmless but redundant; likely a copy/paste slip.
    if n_grams_viewer_var == 1 and CoOcc_Viewer_var == 1 and CoOcc_Viewer_var == 1 and len(
            search_words) > 0:
        n_grams_co_occurrences_outputFile = os.path.join(
            outputDir, 'N-Grams_CoOccurrences_Statistics.csv')
        filesToOpen.append(n_grams_co_occurrences_outputFile)
        chartTitle = ''
    IO_user_interface_util.timed_alert(
        GUI_util.window, 3000, 'N-Grams Word Co-Occurrences end',
        'Finished running N-Grams Word Co-Occurrences Viewer at', True)
    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles,
                                      filesToOpen)
def GUI_bottom(config_input_output_options, y_multiplier_integer, readMe_command,
               TIPS_lookup, TIPS_options):
    """Lay out the bottom row of every NLP Suite GUI.

    Places the open-csv / Excel-chart checkboxes (when the GUI has IO
    widgets), the Read Me button, the videos / TIPS / reminders dropdowns,
    the RUN and QUIT buttons, then restores saved IO paths from the
    config file and enables/disables RUN accordingly.

    Relies on module-level globals: window, config_filename, IO_options,
    run_button, the IO StringVars and select_* buttons, noLicenceError.

    :type TIPS_options: object
    """
    # No bottom lines (README, TIPS, RUN, QUIT) displayed when opening the license agreement GUI
    if config_filename == 'license-config.txt':
        return
    # IO_options=[]
    reminder_options = []
    video_options = []
    missingIO = ""
    # for those GUIs (e.g., style analysis) that simply
    # display options for opening more specialized GUIs
    # do NOT display the next two sets of widgets
    # since there is no output to display
    if config_input_output_options != [0, 0, 0, 0, 0, 0]:
        # open out csv files widget defined above since it is used earlier
        open_csv_output_label = tk.Checkbutton(
            window, variable=open_csv_output_checkbox, onvalue=1, offvalue=0,
            command=lambda: trace_checkbox(
                open_csv_output_label, open_csv_output_checkbox,
                "Automatically open output csv file(s)",
                "Do NOT automatically open output csv file(s)"))
        open_csv_output_label.configure(
            text="Automatically open output csv file(s)")
        open_csv_output_label.place(
            x=GUI_IO_util.get_labels_x_coordinate(),
            y=GUI_IO_util.get_basic_y_coordinate() +
            GUI_IO_util.get_y_step() * y_multiplier_integer)
        open_csv_output_checkbox.set(1)  # default: open csv outputs
        # creat Excel chart files widget defined above since it is used earlier
        create_Excel_chart_output_label = tk.Checkbutton(
            window, variable=create_Excel_chart_output_checkbox, onvalue=1,
            offvalue=0,
            command=lambda: trace_checkbox(
                create_Excel_chart_output_label,
                create_Excel_chart_output_checkbox,
                "Automatically compute and open Excel charts",
                "Do NOT automatically compute and open Excel charts"))
        create_Excel_chart_output_label.configure(
            text="Automatically compute and open Excel chart(s)")
        create_Excel_chart_output_label.place(
            x=GUI_IO_util.get_labels_x_coordinate() + 380,
            y=GUI_IO_util.get_basic_y_coordinate() +
            GUI_IO_util.get_y_step() * y_multiplier_integer)
        create_Excel_chart_output_checkbox.set(1)  # default: build charts
        y_multiplier_integer = y_multiplier_integer + 1
    readme_button = tk.Button(window, text='Read Me', command=readMe_command,
                              width=10, height=2)
    readme_button.place(x=GUI_IO_util.read_button_x_coordinate,
                        y=GUI_IO_util.get_basic_y_coordinate() +
                        GUI_IO_util.get_y_step() * y_multiplier_integer)
    # NOTE(review): video_options is hard-coded here, so the len()==0 branch
    # below is currently unreachable — kept for when real video lists return.
    video_options = ['No videos available']
    videos_dropdown_field.set('Watch videos')
    if len(video_options) == 0:
        videos_menu_lb = tk.OptionMenu(window, videos_dropdown_field,
                                       video_options)
    else:
        if video_options[0] == "No videos available":
            videos_menu_lb = tk.OptionMenu(window, videos_dropdown_field,
                                           "No videos available")
        else:
            videos_menu_lb = tk.OptionMenu(window, videos_dropdown_field,
                                           *video_options)
            videos_menu_lb.configure(foreground="red")
    videos_menu_lb.place(x=GUI_IO_util.watch_videos_x_coordinate,
                         y=GUI_IO_util.get_basic_y_coordinate() +
                         GUI_IO_util.get_y_step() * y_multiplier_integer)
    tips_dropdown_field.set('Open TIPS files')
    if len(TIPS_lookup) == 1:
        if TIPS_options == "No TIPS available":
            tips_menu_lb = tk.OptionMenu(window, tips_dropdown_field,
                                         TIPS_options)
        else:
            tips_menu_lb = tk.OptionMenu(window, tips_dropdown_field,
                                         TIPS_options)
            tips_menu_lb.configure(foreground="red")
    else:
        tips_menu_lb = tk.OptionMenu(window, tips_dropdown_field,
                                     *TIPS_options)
        tips_menu_lb.configure(foreground="red")
    tips_menu_lb.place(x=GUI_IO_util.open_TIPS_x_coordinate,
                       y=GUI_IO_util.get_basic_y_coordinate() +
                       GUI_IO_util.get_y_step() * y_multiplier_integer)
    TIPS_util.trace_open_tips(tips_dropdown_field, tips_menu_lb, TIPS_lookup)
    # strip the '-config.txt' suffix to get the routine name
    routine = config_filename[:-len('-config.txt')]
    # get the list of titles available for a given GUI
    reminder_options = reminders_util.getReminder_list(config_filename, True)
    # None returned for a faulty reminders.csv
    reminders_error = False
    if reminder_options == None:
        reminders_error = True
        reminder_options = ["No Reminders available"]
    # reminders content for specific GUIs are set in the csv file reminders
    # called from any GUI
    reminders_dropdown_field.set('Open reminders')
    reminders_menu_lb = tk.OptionMenu(window, reminders_dropdown_field,
                                      "No Reminders available")
    if len(reminder_options) == 0:
        reminder_options = ["No Reminders available"]
    if len(reminder_options) == 0 or len(reminder_options) == 1:
        if reminder_options == ["No Reminders available"]:
            reminders_menu_lb = tk.OptionMenu(window, reminders_dropdown_field,
                                              *reminder_options)
        else:
            reminders_menu_lb = tk.OptionMenu(window, reminders_dropdown_field,
                                              *reminder_options)
            reminders_menu_lb.configure(foreground="red")
    else:
        reminders_menu_lb = tk.OptionMenu(window, reminders_dropdown_field,
                                          *reminder_options)
        reminders_menu_lb.configure(foreground="red")
    reminders_menu_lb.place(x=GUI_IO_util.open_reminders_x_coordinate,
                            y=GUI_IO_util.get_basic_y_coordinate() +
                            GUI_IO_util.get_y_step() * y_multiplier_integer)

    def trace_reminders_dropdown(*args):
        # selecting a reminder title re-arms (resets) that reminder
        if len(reminder_options) > 0:
            reminders_util.resetReminder(config_filename,
                                         reminders_dropdown_field.get())

    reminders_dropdown_field.trace('w', trace_reminders_dropdown)
    # get_help_button_x_coordinate()+700
    run_button.place(x=GUI_IO_util.run_button_x_coordinate,
                     y=GUI_IO_util.get_basic_y_coordinate() +
                     GUI_IO_util.get_y_step() * y_multiplier_integer)

    def _close_window():
        # persist current IO selections to the config file, then exit
        configArray = \
            config_util.setup_IO_configArray(window, config_input_output_options,
                                             select_softwareDir_button, softwareDir,
                                             select_input_file_button, inputFilename,
                                             select_input_main_dir_button, input_main_dir_path,
                                             select_input_secondary_dir_button, input_secondary_dir_path,
                                             select_output_file_button, outputFilename,
                                             select_output_dir_button, output_dir_path)[0]
        GUI_IO_util.exit_window(window, config_filename, configArray)

    # quit_button = tk.Button(window, text='QUIT', width=10,height=2, command=lambda: GUI_IO_util.exit_window(window,config_filename,configArray))
    quit_button = tk.Button(window, text='QUIT', width=10, height=2,
                            command=lambda: _close_window())
    # get_help_button_x_coordinate()+820
    quit_button.place(x=GUI_IO_util.quit_button_x_coordinate,
                      y=GUI_IO_util.get_basic_y_coordinate() +
                      GUI_IO_util.get_y_step() * y_multiplier_integer)
    # Any message should be displayed after the whole GUI has been displayed
    if noLicenceError == True:
        mb.showwarning(
            title='Fatal error',
            message=
            "The licence agreement file 'LICENSE-NLP-1.0.txt' could not be found in the 'lib' subdirectory of your main NLP Suite directory\n"
            + GUI_IO_util.NLPPath +
            "\n\nPlease, make sure to copy this file in the 'lib' subdirectory.\n\nThe NLP Suite will now exit."
        )
        sys.exit()
    # restore saved IO paths ("EMPTY LINE" means no saved value)
    if IO_options[0] == "EMPTY LINE":  # INPUT software directory
        softwareDir.set('')
    else:
        softwareDir.set(
            config_util.checkConfigDirExists(config_filename, IO_options[0],
                                             'INPUT'))
    if IO_options[1] == "EMPTY LINE":  # INPUT filename
        inputFilename.set('')
    else:
        inputFilename.set(
            config_util.checkConfigFileExists(config_filename, IO_options[1],
                                              'INPUT'))
    if IO_options[2] == "EMPTY LINE":  # INPUT main directory
        input_main_dir_path.set('')
    else:
        input_main_dir_path.set(
            config_util.checkConfigDirExists(config_filename, IO_options[2],
                                             'INPUT'))
    if IO_options[3] == "EMPTY LINE":  # INPUT secondary directory
        input_secondary_dir_path.set('')
    else:
        input_secondary_dir_path.set(
            config_util.checkConfigDirExists(config_filename, IO_options[3],
                                             'INPUT'))
    if IO_options[4] == "EMPTY LINE":  # OUTPUT file name
        outputFilename.set('')
    else:
        outputFilename.set(
            config_util.checkConfigFileExists(config_filename, IO_options[4],
                                              'OUTPUT'))
    if IO_options[5] == "EMPTY LINE":  # OUTPUT directory
        output_dir_path.set('')
    else:
        output_dir_path.set(
            config_util.checkConfigDirExists(config_filename, IO_options[5],
                                             'OUTPUT'))
    # set the state (enabled/disabled) of the RUN button
    # depending upon IO widgets; no IO info, RUN disabled
    configArray, missingIO = config_util.setup_IO_configArray(
        window, config_input_output_options, select_softwareDir_button,
        softwareDir, select_input_file_button, inputFilename,
        select_input_main_dir_button, input_main_dir_path,
        select_input_secondary_dir_button, input_secondary_dir_path,
        select_output_file_button, outputFilename, select_output_dir_button,
        output_dir_path)
    run_button_state = GUI_IO_util.check_missingIO(window, missingIO,
                                                   config_filename)
    run_button.configure(state=run_button_state)
    if ('GUI front end' not in reminder_options) and (configArray == [
            'EMPTY LINE', 'EMPTY LINE', 'EMPTY LINE', 'EMPTY LINE',
            'EMPTY LINE', 'EMPTY LINE'
    ]):
        # reminders_util.No_IO_reminder(config_filename)
        reminder_options = ['GUI front end']
        message = 'The current GUI is a convenient front end that displays all the options available for the GUI.\n\nNo Input/Output options are displayed in this GUI since any selected option, when RUN, will open a specialized GUI with its own Input/Output requirements.'
        # recompute the options since a new line has been added
    else:
        message = ''
    # this will now display the error message
    if reminders_error == True:
        reminders_util.checkReminder(config_filename, reminder_options,
                                     message)
    # closing via the window manager goes through the same save-and-exit path
    window.protocol("WM_DELETE_WINDOW", _close_window)
def display_reminder(*args):
    """Pop the one-time warning about slow best-topic estimation.

    Trace callback: fires when the 'best topic estimation' checkbox changes;
    does nothing unless the checkbox has just been ticked.
    """
    if not best_topic_estimation_var.get():
        return
    slow_warning = ('The function that estimates the best topics is VERY slow '
                    'and may take an hour or longer. You can follow its '
                    'progress in command line.')
    reminders_util.checkReminder(config_filename,
                                 ['Best topic estimation'],
                                 slow_warning,
                                 True)
def run(inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts, sentimentAnalysis,
        sentimentAnalysisMethod, memory_var, corpus_analysis, hierarchical_clustering, SVD, NMF,
        best_topic_estimation):
    """Driver for the 'Shape of Stories' pipeline.

    Validates the IO selections, optionally computes corpus statistics and
    sentence-level sentiment scores (Stanford CoreNLP), vectorizes the scores
    into fixed-size story arcs, and runs the selected data-reduction /
    clustering algorithms (Hierarchical Clustering, SVD, NMF, best-k
    estimation), collecting every produced file in filesToOpen.

    inputFilename: csv file of precomputed sentiment scores ('' if a
        directory is used instead).
    inputDir: directory of txt documents (or csv score files) to process.
    outputDir: base output directory; a "Shape of Stories" subdirectory is
        created (and the GUI output path is redirected) under it.
    openOutputFiles / createExcelCharts: post-run behavior flags.
    sentimentAnalysis .. best_topic_estimation: checkbox option flags.
    memory_var: memory (GB) to allocate to the CoreNLP Java process.

    Returns None; communicates with the user exclusively through message
    boxes and opened output files.
    """
    # check all IO options ---------------------------------------------------------------------------
    # at least one processing option must be ticked
    if sentimentAnalysis==False and corpus_analysis==False and hierarchical_clustering==False and SVD==False and NMF==False and best_topic_estimation==False:
        mb.showwarning(title='Option selection error',
                       message='No options have been selected.\n\nPlease, select an option and try again.')
        return
    # check if "Shape of Stories" default output directory exists
    sosDir = os.path.join(outputDir, "Shape of Stories")
    if not os.path.exists(sosDir):
        os.mkdir(sosDir)
    tail = ''
    # redirect all output into a corpus-specific subdirectory of "Shape of Stories"
    if inputFilename!='':
        sentiment_scores_input = inputFilename  # INPUT
        head, tail = os.path.split(sentiment_scores_input)
        outputDir = os.path.join(sosDir, os.path.basename(head))
    elif inputDir!='':
        sentiment_scores_input = inputDir  # INPUT
        head, tail = os.path.split(sentiment_scores_input)
        outputDir = os.path.join(sosDir, tail)
    # check that the specific default directory exists under "Shape of Stories"
    if not os.path.exists(outputDir):
        os.mkdir(outputDir)
    # keep the GUI's output-path widget in sync and tell the user it changed
    if GUI_util.output_dir_path.get()!=outputDir:
        # outputDir = head
        GUI_util.output_dir_path.set(outputDir)
        title_options = ['Output directory']
        message = 'The output directory was changed to:\n\n'+str(outputDir)
        reminders_util.checkReminder(config_filename, title_options, message, True)
    # sentiment analysis / corpus statistics need a DIRECTORY of txt files, not a single csv
    if inputDir=='' and inputFilename!='':
        if sentimentAnalysis == True:
            mb.showwarning(title='Input folder error',
                           message='The selected option requires in input a set of txt files for which to compute sentiment scores.\n\nPlease, use the IO widget \'Select INPUT files directory\' to select the appropriate directory and try again.')
            return
        if corpus_analysis == True:
            mb.showwarning(title='Input folder error',
                           message='The selected option requires in input a set of txt files for which to compute corpus statistics.\n\nPlease, use the IO widget \'Select INPUT files directory\' to select the appropriate directory and try again.')
            return
    if inputFilename!='':
        # get headers so as to check that it is a sentiment score file
        str1=' '
        str2=str1.join(IO_csv_util.get_csvfile_headers(inputFilename))
        if not('Document' in str2 and 'Sentence' in str2 and 'Sentiment' in str2):
            mb.showwarning(title='Input file error',
                           message='The selected file is not a file of sentiment scores.\n\nPlease, use the IO widget \'Select INPUT csv file\' to select the appropriate csv file containing sentiment scores and try again.')
            return
        computeSAScores = False
        nSAscoreFiles = IO_csv_util.GetNumberOfDocumentsInCSVfile(inputFilename,'Shape of Stories')
        if nSAscoreFiles == None:
            return
        # clustering on too few documents is statistically meaningless; warn but allow override
        if nSAscoreFiles < 50:
            answer = mb.askyesno("Data warning: Data reduction algorithms",
                                 message="Data reduction algorithms (Hierarchical Clustering, Singular Value Decomposition, Non-Negative Matrix Factorization) require in input a csv file of sentiment analysis scores for a large number of documents (at least 50). The selected input file\n\n" + inputFilename + "\n\ncontains only " + str(
                                     nSAscoreFiles) + " files. TOO FEW!\n\nYou REALLY should select a different csv file and try again.\n\nAre you sure you want to continue?")
            if answer == False:
                return
    else:  # inputDir
        if sentimentAnalysis == True or corpus_analysis == True:
            nSAscoreFiles=IO_files_util.GetNumberOfDocumentsInDirectory(inputDir, 'txt')
            if nSAscoreFiles == 0:
                mb.showwarning(title="Directory error",
                               message="Sentiment Analysis and Corpus Statistics algorithms require in input a LARGE set of txt files for which to compute sentiment scores and/or comppute corpus statistics. The selected input directory\n\n" + inputDir + "\n\ndoes not contain any txt files.\n\nPlease, select a different directory (or untick the checkboxes 'Sentiment Analysis' and/or 'Compute & visualize corpus statistics') and try again.")
                return
            if nSAscoreFiles < 50 and sentimentAnalysis == True:
                # NOTE(review): the user's answer here is never checked; presumably an
                #   'if answer == False: return' was intended — confirm against VCS history
                answer = mb.askyesno("Directory error",
                                     message="Data reduction algorithms (Hierarchical Clustering, Singular Value Decomposition, Non-Negative Matrix Factorization) require in input a LARGE set of txt files. The selected input directory\n\n" + inputDir + "\n\ncontains only " + str(
                                         nSAscoreFiles) + " txt files from which to compute sentiment scores. TOO FEW!\n\nYou REALLY should select a different directory (or untick the checkboxes 'Sentiment Analysis') and try again.\n\nAre you sure you want to continue?")
        # data-reduction without fresh sentiment analysis needs already-computed csv score files
        if not(sentimentAnalysis) and (hierarchical_clustering or SVD or NMF or best_topic_estimation):
            nSAscoreFiles = IO_files_util.GetNumberOfDocumentsInDirectory(inputDir, 'csv')
            if nSAscoreFiles==0:
                mb.showwarning(title="Directory error",
                               message="Data reduction algorithms (Hierarchical Clustering, Singular Value Decomposition, Non-Negative Matrix Factorization) require in input a LARGE set of csv files. The selected input directory\n\n" + inputDir + "\n\ndoes not contain any csv files.\n\nPlease, select a different directory (or untick the checkboxes 'Hierarchical Clustering' 'Singular Value Decomposition' 'Non-Negative Matrix Factorization' and try again.")
                return
            elif nSAscoreFiles < 50 and sentimentAnalysis == True:
                # NOTE(review): this branch looks unreachable — sentimentAnalysis is
                #   necessarily False inside 'if not(sentimentAnalysis)'; verify intent
                answer = mb.askyesno("Data reduction algorithms",
                                     message="Data reduction algorithms (Hierarchical Clustering, Singular Value Decomposition, Non-Negative Matrix Factorization) require in input a LARGE set of txt files. The selected input directory\n\n" + inputDir + "\n\ncontains only " + str(
                                         nSAscoreFiles) + " txt files from which to compute sentiment scores. TOO FEW!\n\nYou REALLY should select a different directory (or untick the checkboxes 'Sentiment Analysis') and try again.\n\nAre you sure you want to continue?")
                if answer == False:
                    return
    # NOTE: a large block of commented-out legacy code (checking/recreating a cached
    #   "sentiment_analysis_scores_<corpus>" directory and asking whether to recompute)
    #   was removed here; the per-method recompute prompt below supersedes it.
    # RUN SCRIPTS ---------------------------------------------------------------------------
    filesToOpen = []
    # utf.check_utf8_compliance(GUI_util.window, "", inputDir, outputDir, openOutputFiles)
    IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis start',
                                       'Started running Shape of Stories at', True)
    # check corpus statistics
    if corpus_analysis:
        statistics_txt_util.compute_corpus_statistics(GUI_util.window, inputDir, inputDir, outputDir,
                                                      openOutputFiles, True)
    # step 1: run sentiment analysis
    if sentimentAnalysis == 1:
        # run appropriate sentiment analysis method as indicated by sentimentAnalysisMethod
        if sentimentAnalysisMethod == "Stanford CoreNLP Neural Network":
            title_options = ['Stanford CoreNLP Neural Network']
            message = 'The Stanford CoreNLP Neural Network approach to Sentiment analysis, like all neural network algorithms, is VERY slow. On a few hundred stories it may take hours to run.\n\nAlso, neural network algorithms are memory hogs. MAKE SURE TO ALLOCATE AS MUCH MEMORY AS YOU CAN AFFORD ON YOUR MACHINE.'
            reminders_util.checkReminder(config_filename, title_options, message, True)
            # TODO any changes in the way the CoreNLP_annotator generates output filenames will need to be edited here
            outputFilename = 'NLP_CoreNLP_sentiment_Dir_'+tail + '.csv'
            # scores already on disk: ask before the (very slow) recompute
            if os.path.isfile(os.path.join(outputDir,outputFilename)):
                computeSAScores=mb.askyesno("Sentiment Analysis",
                                            "You have selected to run sentiment analysis on your corpus. But there already exists a csv file of sentiment scores for this corpus saved in the default output directory:\n\n"+outputFilename+"\n\nAre you sure you want to recompute the scores?")
                if not computeSAScores:
                    return
            tempOutputfile=Stanford_CoreNLP_annotator_util.CoreNLP_annotate('', inputDir, outputDir,
                                                                            openOutputFiles, createExcelCharts,
                                                                            'sentiment', False, memory_var)
            if tempOutputfile==None:
                return
            # downstream vectorization reads the freshly produced score file
            sentiment_scores_input=tempOutputfile[0]
        else:
            mb.showwarning(title="Sentiment Analysis Method not available",
                           message=sentimentAnalysisMethod + " is not currently available. The only available option is the \'Stanford CoreNLP neural network\' method. Sorry!")
            return
    if hierarchical_clustering or SVD or NMF or best_topic_estimation:
        # step 2: vectorize
        # the sentiment_scores_input can either be a single merged csv file or a directory with multiple SA scores files
        vectz = vec.Vectorizer(sentiment_scores_input)
        # pop up window
        # window size
        val = GUI_IO_util.slider_widget(GUI_util.window,
                                        "Please, select the value for window size. Window size is the number of sentences " +
                                        "that will be averaged to obtain one point of the story arc. The recommend value is " +
                                        str(vectz.window_size) + ".", 1, vectz.min_doc_len - 1, vectz.window_size)
        vectz.window_size = val
        # sentiment_vector_size
        val = GUI_IO_util.slider_widget(GUI_util.window,
                                        "Please, select the value for sentiment vector size. Sentiment vector size is the number of values " +
                                        "that each document will be represented with. The recommend value is " +
                                        str(vectz.ideal_sent_v_size) + ".", 1, vectz.min_doc_len, vectz.ideal_sent_v_size)
        vectz.sentiment_vector_size = val
        sentiment_vectors, file_list, scoresFile_list = vectz.vectorize()  # ANGEl
        rec_n_clusters = vectz.compute_suggested_n_clusters(sentiment_vectors)
        if rec_n_clusters==None:
            return
        # visualize a Principal Component Analysis (PCA) scatter plot of sentiment scores
        PCAFilename=viz.visualize_sentiment_arcs(sentiment_vectors, outputDir)
        filesToOpen.append(PCAFilename)
        # number of clusters
        val = GUI_IO_util.slider_widget(GUI_util.window,
                                        "Please, select the value for number of clusters (modes). The recommend value is " + str(
                                            rec_n_clusters) + ".", 1, vectz.sentiment_vector_size, rec_n_clusters)
        rec_n_clusters = val
        # hierarchical clustering
        if hierarchical_clustering:
            hier = cl.Clustering(rec_n_clusters)
            DendogramFilename, grouped_vectors, clusters_indices, vectors = hier.cluster(sentiment_vectors, outputDir)
            filesToOpen.append(DendogramFilename)
            sentiment_vectors = vectors
            clusters_file = cl.processCluster(clusters_indices, scoresFile_list, file_list, sentiment_vectors,
                                              rec_n_clusters,
                                              os.path.join(outputDir, "Hierarchical Clustering Documents.csv"),
                                              inputDir)
            vis = viz.Visualizer(outputDir)
            vis.visualize_clusters(grouped_vectors, "Hierarchical Clustering (HC)", "HC", clusters_file)
            # one full-size plot plus one subplot per cluster
            for i in range(rec_n_clusters):
                filesToOpen.append(os.path.join(outputDir, "HC_Cluster_" + str(i + 1) + ".png"))
                filesToOpen.append(os.path.join(outputDir, "HC_Cluster_" + str(i + 1) + "_subplot.png"))
            filesToOpen.append(os.path.join(outputDir, "Hierarchical Clustering Documents.csv"))
        # svd
        if SVD:
            svd = cl.SVDClustering(rec_n_clusters)
            # SVD yields separate positive and negative mode clusterings
            pos_vector_clusters, pos_clusters_indices, pos_modes, neg_vector_clusters, neg_clusters_indices, neg_modes = \
                svd.cluster(sentiment_vectors)
            clusters_file = cl.processCluster(pos_clusters_indices, scoresFile_list, file_list, sentiment_vectors,
                                              rec_n_clusters,
                                              os.path.join(outputDir, "SVD Positive Documents.csv"), inputDir)
            vis = viz.Visualizer(outputDir)
            vis.visualize_clusters(pos_vector_clusters, "Singular Value Decomposition Positive (SVD Positive)",
                                   "SVDPositive", clusters_file, modes=pos_modes)
            clusters_file = cl.processCluster(neg_clusters_indices, scoresFile_list, file_list, sentiment_vectors,
                                              rec_n_clusters,
                                              os.path.join(outputDir, "SVD Negative Documents.csv"), inputDir)
            vis = viz.Visualizer(outputDir)
            vis.visualize_clusters(neg_vector_clusters, "Singular Value Decomposition Negative (SVD Negative)",
                                   "SVDNegative", clusters_file, modes=neg_modes)
            for i in range(rec_n_clusters):
                filesToOpen.append(os.path.join(outputDir, "SVD_Positive_Cluster_" + str(i + 1) + ".png"))
            for i in range(rec_n_clusters):
                filesToOpen.append(os.path.join(outputDir, "SVD_Negative_Cluster_" + str(i + 1) + ".png"))
            filesToOpen.append(os.path.join(outputDir, "SVD Positive Documents.csv"))
            filesToOpen.append(os.path.join(outputDir, "SVD Negative Documents.csv"))
        # NMF
        if NMF:
            nmf = cl.NMFClustering(rec_n_clusters)
            grouped_vectors, clusters_indices, vectors = nmf.cluster(sentiment_vectors)
            sentiment_vectors = vectors
            clusters_file = cl.processCluster(clusters_indices, scoresFile_list, file_list, sentiment_vectors,
                                              rec_n_clusters,
                                              os.path.join(outputDir, "NMF Documents.csv"), inputDir)
            vis = viz.Visualizer(outputDir)
            vis.visualize_clusters(grouped_vectors, "Non-negative Matrix Factorization (NMF)", "NMF", clusters_file)
            for i in range(rec_n_clusters):
                filesToOpen.append(os.path.join(outputDir, "NMF_Cluster_" + str(i + 1) + ".png"))
                filesToOpen.append(os.path.join(outputDir, "NMF_Cluster_" + str(i + 1) + "_subplot.png"))
            filesToOpen.append(os.path.join(outputDir, "NMF Documents.csv"))
        # best topic estimate
        if best_topic_estimation:
            IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis start',
                                               'Started running estimate_best_k at', True,
                                               'You can follow the progress bar in command line.')
            filesToOpen = cl.estimate_best_k(sentiment_vectors, outputDir, filesToOpen)
            IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis end',
                                               'Finished running estimate_best_k at', True)
    IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis end',
                                       'Finished running Shape of Stories at', True)
    if openOutputFiles == True:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
def language_detection(window, inputFilename, inputDir, outputDir, openOutputFiles, createExcelCharts):
    """Detect the language of every txt document with three algorithms.

    Runs LANGDETECT (langdetect), SPACY (spacy-langdetect pipe), and LANGID
    (langid.py) on each document and writes one csv row per document with the
    language code and probability produced by each algorithm, plus a hyperlink
    to the document. Languages are reported as ISO 639-1 two-letter codes.

    window: parent tkinter window (alerts actually use GUI_util.window).
    inputFilename / inputDir: single txt file or directory of txt files.
    outputDir: where the lang_detect csv (and optional Excel chart) is written.
    openOutputFiles: when truthy, open all produced files at the end.
    createExcelCharts: when truthy, also build a bar chart of detected languages.

    Returns None; results are communicated through the output csv, message
    boxes, and the command line.
    """
    IO_user_interface_util.timed_alert(GUI_util.window, 1000, 'Analysis start',
                                       'Started running Language Detection at', True)
    filesToOpen = []
    outputFilenameCSV = IO_files_util.generate_output_file_name(inputFilename, inputDir, outputDir,
                                                                '.csv', 'lang_detect')
    filesToOpen.append(outputFilenameCSV)
    files = IO_files_util.getFileList(inputFilename, inputDir, '.txt')
    if len(files) == 0:
        return
    # openCSVOutputFile returns truthy when the file is locked/open elsewhere
    if IO_csv_util.openCSVOutputFile(outputFilenameCSV):
        return
    # header: three (algorithm, language, probability) triples + document info
    fieldnames = ['LANGDETECT', 'Language', 'Probability',
                  'SPACY', 'Language', 'Probability',
                  'LANGID', 'Language', 'Probability',
                  'Document ID', 'Document']
    config_filename = 'file-spell-checker-config.txt'
    reminders_util.checkReminder(config_filename, ['Language detection'],
                                 'Language detection algorithms are very slow. The NLP Suite runs three different types of algorithms: LANGDETECT, SPACY, and LANGID.\n\nPlease, arm yourself with patience, depennding upon the number and size of documents processed.',
                                 True)
    IO_user_interface_util.timed_alert(GUI_util.window, 2000, 'Analysis start',
                                       'Started running language detection algorithms at', True,
                                       'You can follow the algorithms in command line.')
    # Hoist the (expensive) model constructions out of the per-file loop:
    # loading the spaCy model and the langid model once instead of once per
    # document is a large, behavior-neutral speedup.
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)
    lang_identifier = LanguageIdentifier.from_modelstring(model, norm_probs=True)
    fileID = 0
    with open(outputFilenameCSV, 'w', encoding='utf-8', errors='ignore', newline='') as csvfile:
        header_writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        header_writer.writeheader()
        row_writer = csv.writer(csvfile)  # plain writer for the data rows
        docErrors_empty = 0
        docErrors_unknown = 0
        filenameSV = ''  # last file already counted as an error (avoid double counting)
        n_files = len(files)
        for filename in files:
            fileID = fileID + 1
            head, tail = os.path.split(filename)
            print("Processing file " + str(fileID) + "/" + str(n_files) + ' ' + tail)
            # read with a context manager so the handle is always released
            with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
                text = f.read()
            if len(text) == 0:
                print(" The file is empty. It will be discarded from processing.")
                docErrors_empty = docErrors_empty + 1
                continue
            # --- LANGDETECT -------------------------------------------------
            try:
                value = detect_langs(text)
            except:
                filenameSV = filename  # do not count the same document twice in the algorithms that follow
                docErrors_unknown = docErrors_unknown + 1
                print(" Unknown file read error.")
                continue
            # detect_langs returns e.g. [en:0.999...]; keep the top candidate
            value = str(value[0]).split(':')
            language = value[0]
            probability = value[1]
            print(' LANGDETECT', language, probability)
            currentLine = ['LANGDETECT', language, probability]
            # --- SPACY ------------------------------------------------------
            try:
                doc = nlp(text)
            except:
                if filename != filenameSV:
                    docErrors_unknown = docErrors_unknown + 1
                    filenameSV = filename
                print(" Unknown file read error.")
                continue
            value = doc._.language  # e.g. {'language': 'en', 'score': 0.999...}
            language = value['language']
            probability = value['score']
            currentLine.extend(['SPACY', language, probability])
            # --- LANGID -----------------------------------------------------
            try:
                value = lang_identifier.classify(text)
            except:
                if filename != filenameSV:
                    docErrors_unknown = docErrors_unknown + 1
                    filenameSV = filename
                print(" Unknown file read error.")
                continue
            language = value[0]
            probability = value[1]
            print(' LANGID', language, probability)
            print()
            currentLine.extend(['LANGID', language, probability])
            currentLine.extend([fileID, IO_csv_util.dressFilenameForCSVHyperlink(filename)])
            row_writer.writerow(currentLine)
            filenameSV = filename
    # summarize processing errors, if any
    msg = ''
    if docErrors_empty == 0 and docErrors_unknown == 0:
        msg = str(fileID) + ' documents successfully processed for language detection.'
    else:
        if docErrors_empty > 0:
            msg = str(fileID) + ' documents processed for language detection.\n ' + str(docErrors_empty) + ' document(s) found empty.'
        if docErrors_unknown > 0:
            if msg != '':
                msg = msg + '\n ' + str(docErrors_unknown) + ' document(s) read with unknown errors.'
            else:
                msg = str(fileID) + ' documents processed for language detection.\n ' + \
                      str(docErrors_unknown) + ' document(s) read with unknown errors.'
        mb.showwarning(title='File read errors',
                       message=msg + '\n\nFaulty files are listed in command line/terminal. Please, search for \'File read error\' and inspect each file carefully.')
    # (bug fix: outputFilenameCSV was previously appended to filesToOpen a second
    # time here, causing the same csv to be opened twice)
    IO_user_interface_util.timed_alert(GUI_util.window, 1000, 'Analysis end',
                                       'Finished running Language Detection at', True,
                                       'Languages detected are exported via the ISO 639 two-letter code. ISO 639 is a standardized nomenclature used to classify languages. Check the ISO list at https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes.')
    print('Languages detected are exported via the ISO 639 two-letter code. ISO 639 is a standardized nomenclature used to classify languages. Check the ISO list at https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes.')
    if createExcelCharts:
        # plot the frequency of the Language columns produced by each algorithm
        columns_to_be_plotted = [[1, 1], [4, 4], [7, 7]]
        chart_title = 'Frequency of Languages Detected by 3 Algorithms'
        hover_label = ['LANGDETECT', 'SPACY', 'LANGID']
        inputFilename = outputFilenameCSV
        Excel_outputFilename = Excel_util.run_all(columns_to_be_plotted, inputFilename, outputDir,
                                                  outputFileLabel='_bar_chart',
                                                  chart_type_list=["bar"],
                                                  chart_title=chart_title,
                                                  column_xAxis_label_var='Language',
                                                  hover_info_column_list=hover_label,
                                                  count_var=1)
        if Excel_outputFilename != '':
            filesToOpen.append(Excel_outputFilename)
    if openOutputFiles:
        IO_files_util.OpenOutputFiles(GUI_util.window, openOutputFiles, filesToOpen)
"Help", "Please, using the dropdown menu, select one of the many options available for analyzing your corpus and/or a single document.\n\nTHE TOOLS IN THIS CATEGORY, APPLY TO EITHER MULTIPLE DOCUMENTS (THE 'CORPUS') OR TO A SINGLE DOCUMENT.\n\nIn INPUT the tools expect either multiple documents stored in a directory (the 'corpus') or a single document." + GUI_IO_util.msg_Esc) GUI_IO_util.place_help_button( window, help_button_x_coordinate, basic_y_coordinate + y_step * 9, "Help", "Please, using the dropdown menu, select one of the many options available for analyzing your corpus/document by sentence index.\n\nTHE TOOLS IN THIS CATEGORY, APPLY TO EITHER MULTIPLE DOCUMENTS (THE 'CORPUS') OR TO A SINGLE DOCUMENT; BUT THEY ALSO PROVIDE SENTENCE-BASED INFORMATION FOR MORE IN-GRAINED ANALYSES.\n\nIn INPUT the tools expect either multiple documents stored in a directory (the 'corpus') or a single document." + GUI_IO_util.msg_Esc) help_buttons(window, GUI_IO_util.get_help_button_x_coordinate(), GUI_IO_util.get_basic_y_coordinate(), GUI_IO_util.get_y_step()) # change the value of the readMe_message readMe_message = "This Python 3 script is the front end for a wide collection of Java and Python Natural Language Processing (NLP) tools.\n\nThe set of tools are divided into GENERAL TOOLS (data and file handling, pre-processing, statistical, visualization) and LINGUISTIC ANALYSIS TOOLS.\n\nLINGUISTIC ANALYSIS TOOLS are divided into tools that expect in input CORPUS DATA (i.e., multiple documents stored in a directory), CORPUS and/or SINGLE DOCUMENT, and SENTENCE.\n\nWhile some linguistic tools are specific for one of these three categories (e.g., topic modeling cannot be performed on a single document), MANY TOOLS OVERLAP. As a result, you may find the same tool under BOTH corpus and corpus/document. 
SENTENCE TOOLS still require either a corpus or a single document in input; but they also provide in output sentence-level information for more in-grained linguistic analyses.\n\nAll tools are open source freeware software released under the GNU LGPLv2.1 license (http://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html).\n\nYou can cite the NLP Suite as:\n\nR. Franzosi. 2020. NLP Suite: A set of tools of Natural Language Processing (NLP) & Data Visualization." readMe_command = lambda: GUI_IO_util.readme_button( window, GUI_IO_util.get_help_button_x_coordinate(), GUI_IO_util.get_basic_y_coordinate(), "Help", readMe_message) GUI_util.GUI_bottom(config_input_output_options, y_multiplier_integer, readMe_command, TIPS_lookup, TIPS_options) if platform == "darwin": title_options = ['tkinter MacOS bug'] message = 'MacOS bug in tkinter (https://www.python.org/download/mac/tcltk/).\n\nPython\'s integrated development environment, IDLE, and the tkinter GUI toolkit it uses, depend on the Tk GUI toolkit which is not part of Python itself. For best results, it is important that the proper release of Tcl/Tk is installed on your machine. For recent Python installers for macOS downloadable from this website, here is a summary of current recommendations followed by more detailed information.' reminders_util.checkReminder(config_filename, title_options, message, True) # check for software installation IO_libraries_util.get_external_software_dir('NLP_menu', '') GUI_util.window.mainloop()