def loop_topic_data(fileID=None, model_number=None,
                    dir_path=parser.get('final_stage', 'loop_topic_data_dir_path'),
                    dest_dir=parser.get('final_stage', 'loop_topic_data_dest_dir')):
    global score_data
    if not os.path.exists(dest_dir):
        os.mkdir(dest_dir)
    # r = root, d = directories, f = files
    for r, d, f in os.walk(dir_path, topdown=True):
        if r.split(os.path.sep)[-1] == 'topic_class':
            continue
        if (model_number is not None and r.split(os.path.sep)[-1] == model_number) or (model_number is None):
            current_model = r.split(os.path.sep)[-1]
            output_dir_path = os.path.join(dest_dir, r.split(os.path.sep)[-1])
            if not os.path.exists(output_dir_path):
                os.mkdir(output_dir_path)
            print(r)
            if current_model == 'textData':
                continue
            for file in f:
                file_id = file.split('_')[0]
                if (fileID is not None and file.split('_')[0] == str(fileID)) or (fileID is None):
                    file_path_dir = os.path.join(output_dir_path, file_id)
                    if not os.path.exists(file_path_dir):
                        os.mkdir(file_path_dir)
                    print(os.path.join(r, file))
                    score_data = sr.load_data(file[:-10] + '.txt', current_model)
                    process_topic_file(os.path.join(r, file), file_path_dir)
                    create_json_repr(file_id, file_path_dir, r.split(os.path.sep)[-1])
def loop_discourse_results_one_file(
        topic_number, fileId, mode='LDA',
        discourse_output_dir=parser.get(
            'LDA_process', 'loop_discourse_results_one_file_discourse_output_dir')):
    results_dict = dict()  # maps each processed file to the list of topics detected in it
    for discourse_results in os.listdir(discourse_output_dir):
        # If the current entry is a directory => skip it
        if os.path.isdir(os.path.join(discourse_output_dir, discourse_results)):
            continue
        # If the current file is not the fileId we are looking for => skip it
        if discourse_results.split('_')[0] != fileId:
            continue
        # Files marked FAILED_PARSE or FAILED_SEG cannot be processed => skip them
        if 'strip_output' in discourse_results and 'FAILED_PARSE' not in discourse_results and 'FAILED_SEG' not in discourse_results:
            results_dict[discourse_results[:-17]] = list()
            text_tuples = list()
            classify_discourse_tree(discourse_results,
                                    results_dict[discourse_results[:-17]],
                                    discourse_output_dir, topic_number,
                                    text_tuples, mode=mode)
    if mode == 'LDA':
        write_stats_to_file(results_dict, topic_number + 1)
    elif mode == 'HLDA':
        write_stats_to_file_hlda(results_dict)
def loop_discourse_results(
        topic_number, mode='LDA',
        discourse_output_dir=parser.get(
            'LDA_process', 'loop_discourse_results_one_file_discourse_output_dir')):
    results_dict = dict()
    counter = 0
    for discourse_results in os.listdir(discourse_output_dir):
        if os.path.isdir(os.path.join(discourse_output_dir, discourse_results)):
            continue
        if 'strip_output' in discourse_results and 'FAILED_PARSE' not in discourse_results and 'FAILED_SEG' not in discourse_results:
            results_dict[discourse_results[:-17]] = list()
            text_tuples = list()
            classify_discourse_tree(discourse_results,
                                    results_dict[discourse_results[:-17]],
                                    discourse_output_dir, topic_number,
                                    text_tuples, mode=mode)
    if mode == 'LDA':
        write_stats_to_file(results_dict, topic_number + 1)
    elif mode == 'HLDA':
        write_stats_to_file_hlda(results_dict)
def run_dataSet():
    DATA_PATH = parser.get('summarizerWS', 'DATA_PATH')
    queue = [file[:-4] for file in os.listdir(DATA_PATH)]
    for file in queue:
        print(f'{bcolors.OKGREEN}Processing: {file}{bcolors.ENDC}')
        create_summary_NSxTI(fileId=int(file), modelNumber=10)
def create_summary(file_id, des_dir=parser.get('summarizerWS', 'des_dir')):
    file_data, topic_score = tr.score_fileID(file_id)
    count_words(file_data)
    new_summary = build_summary(file_data, topic_score)
    with open(os.path.join(des_dir, str(file_id) + '.txt'), 'w') as output_file:
        output_file.writelines(new_summary)
def createXmlDocument_v2(text, fileName,
                         path=parser.get('xmlTree', 'createXmlDocument_v2_path')):
    sections_type = init_section_structure()
    if not os.path.isdir('output'):
        os.mkdir('output')
    if not os.path.isdir(path):  # Checks if the directory exists
        os.mkdir(path)
    section = None
    data = ET.Element('data')
    for sent in text:
        if sent.startswith('<H-'):
            section_exists = False
            sent_strip, section_id = strip_tags(sent, 'H')
            for elem in data:
                if len(elem.attrib) != 0 and elem.attrib['name'] == sections_type[section_id]:
                    section_exists = True
                    # Point section at the existing element so following
                    # sentences are added to it
                    section = elem
                    break
            if section_exists:
                continue
            section = ET.SubElement(data, 'section')
            section.set('name', sections_type[section_id])
            section.text = sent_strip
        if sent.startswith('<S>'):
            sent = strip_tags(sent, 'S')
            if section is None:
                sentence = ET.SubElement(data, 'S')
            else:
                sentence = ET.SubElement(section, 'S')
            sentence.text = sent
        if sent.startswith('<OneItem>'):
            sent = strip_tags(sent, 'OneItem')
            if section is None:
                sentence = ET.SubElement(data, 'OneItem')
            else:
                sentence = ET.SubElement(section, 'OneItem')
            sentence.text = sent
    with open(path + fileName + ".xml", "wb") as xmlWriter:
        newData = ET.tostring(data)
        # Strip control characters that are illegal in XML
        newData = re.sub(
            b'[\x00-\x08|\x0B|\x0C|\x0E-\x1F|\x7F-\x84|\x86-\x9F]', b'', newData)
        xmlWriter.write(newData)
    return path + fileName + ".xml"
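# Hedged, self-contained sketch of the ElementTree-building pattern used in
# createXmlDocument_v2 (the section name and sentence below are made up, and
# strip_tags / init_section_structure are deliberately not involved):
def build_demo_xml():
    demo_root = ET.Element('data')                      # root element, as above
    demo_section = ET.SubElement(demo_root, 'section')  # one named section
    demo_section.set('name', 'Overview')
    demo_sentence = ET.SubElement(demo_section, 'S')    # sentence attached to that section
    demo_sentence.text = 'Hypertension is a chronic condition.'
    return ET.tostring(demo_root)  # b'<data><section name="Overview"><S>...</S></section></data>'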
def reset_project():
    with open('reset_project_paths.txt', 'r') as rst_path:
        paths = rst_path.readlines()
    paths = ['rm ' + line[:-1] for line in paths]
    choice1 = input(
        '[WARNING!] This will delete all the data for the project!\nAre you sure? [to continue type yes]\n')
    if choice1.lower() == 'yes':
        choice2 = input('Verify again! type yes!\n')
        if choice2.lower() == 'yes!':
            for path in paths:
                print(f'Deleting data: {path}')
                subprocess.run(path, shell=True)
            discourse_output = parser.get(
                'LDA_process',
                'loop_discourse_results_one_file_discourse_output_dir')  # from project_paths.ini
            discourse_output1 = discourse_output + '/*'
            discourse_output2 = discourse_output + '/Nucleus/*'
            print('Deleting data: ' + discourse_output1)
            subprocess.run('rm ' + discourse_output1, shell=True)
            print('Deleting data: ' + discourse_output2)
            subprocess.run('rm ' + discourse_output2, shell=True)
            discourse_input = parser.get('main_pipeline', 'discourseInput')
            discourse_input1 = discourse_input + '*'
            discourse_input2 = ' '.join(
                ['rm', '-r', discourse_input1.replace('xml/', 'xmlParse/')])
            print('Deleting data: ' + discourse_input1)
            subprocess.run('rm ' + discourse_input1, shell=True)
            print('Deleting data: ' + discourse_input2)
            subprocess.run(discourse_input2, shell=True)
        else:
            print('Aborted')
    else:
        print('Aborted')
def read_xml_file(filename_path, filename,
                  target_dir=parser.get('xmlTree', 'read_xml_file_target_dir')):
    stats_counter = {
        'name': filename,
        'words': 0,
        'sentences': 0,
        'sections': 0,
        'wordsPerSection': 0
    }
    if not os.path.isdir(target_dir):
        os.mkdir(target_dir)
    tree = ET.parse(filename_path)
    root = tree.getroot()
    with open(os.path.join(target_dir, filename), mode='w') as txtXml:
        for elem in root:
            if elem.tag == 'S':
                txtXml.write(elem.text + '\n')
                print(elem.text)
                # Stat
                stats_counter['sentences'] += 1
                stats_counter['words'] += len(elem.text.strip().split())
            elif elem.tag == 'section':
                txtXml.write(elem.attrib['name'] + '\n')
                print(elem.attrib['name'])
                # Stat
                stats_counter['sections'] += 1
                stats_counter['words'] += len(elem.attrib['name'].strip().split())
            elif elem.tag == 'OneItem':
                txtXml.write(elem.text + '\n')
                print(elem.text)
                stats_counter['words'] += len(elem.text.strip().split())
            for subelem in elem:
                txtXml.write(subelem.text + '\n')
                print(subelem.text)
                stats_counter['words'] += len(subelem.text.strip().split())
                stats_counter['wordsPerSection'] += len(subelem.text.strip().split())
    # Write to the stat file
    with open('data_set_statistics.csv', mode='a') as statFile:
        fieldNames = list(stats_counter.keys())
        writer = csv.DictWriter(statFile, fieldnames=fieldNames)
        writer.writerow(stats_counter)
def check_files(proc_list,
                discourse_output_path=parser.get('cluster_eval', 'discourse_output_path')):
    """
    Makes sure that there are files in the output dir that correspond to proc_list.
    If files are missing, removes their names from proc_list.
    """
    # Make a list of all files in the output dir
    output_list = list()
    for file in os.listdir(discourse_output_path):
        # Compare against the same '.txt'-suffixed form that gets stored
        if file.split('_')[0] + '.txt' not in output_list:
            output_list.append(file.split('_')[0] + '.txt')
    # Check the difference
    diff = list(set(proc_list) - set(output_list))
    # Remove from proc_list the file names that have not been processed
    for filename in diff:
        proc_list.pop(proc_list.index(filename))
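# Minimal, hypothetical illustration of the filtering check_files() performs,
# using made-up file names instead of a real output directory:
def _check_files_demo():
    proc = ['1.txt', '2.txt', '3.txt']   # files we expected to be processed
    produced = ['1.txt', '3.txt']        # ids actually found in the output dir
    for missing in set(proc) - set(produced):
        proc.remove(missing)
    return proc  # ['1.txt', '3.txt']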
def import_regex(path=parser.get('util_functions', 'import_regex_path')):
    """
    Imports a list of regex expressions from a file.
    :param path: path to the regex file
    :return: list of regex expressions
    """
    if not os.path.isfile(path):
        raise FileNotFoundError('File does not exist; supply the full path with extension')
    else:
        regex_arguments = list()
        with open(path, mode='r') as regex_file:
            csv_reader = csv.reader(regex_file, delimiter=',')
            for row in csv_reader:
                regex_arguments.append(row[2])
        regex_arguments.pop(0)  # drop the header row
        return regex_arguments
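# Hedged sketch of the CSV layout import_regex() assumes: the pattern sits in the
# third column (row[2]) and the first row is a header that gets dropped. The sample
# content below is invented for illustration.
def _import_regex_demo():
    import io
    sample_csv = io.StringIO(
        'id,description,regex\n'
        '1,dosage,\\d+\\s?mg\n'
        '2,percentage,\\d+%\n')
    patterns = [row[2] for row in csv.reader(sample_csv)]
    patterns.pop(0)  # drop the header row, as import_regex() does
    return patterns  # ['\\d+\\s?mg', '\\d+%']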
def show_case(file_id, model_number, topic_words=None):
    from show_case.show_case_functions import run_show_case
    show_case_url = run_show_case(
        file_id, model_number,
        original_text_dir=parser.get('main_pipeline', 'original_text_dir'),
        xml_processed_dir=parser.get('main_pipeline', 'xml_processed_dir'),
        xml_parse_dir=parser.get('main_pipeline', 'xml_parse_dir'),
        topic_class_dir=parser.get('main_pipeline', 'topic_class_dir'),
        trees_dir=parser.get('main_pipeline', 'trees_dir'),
        final_stage_dir=parser.get('main_pipeline', 'final_stage_dir'),
        topic_data=convert_topics_for_show_case(model_number))
    return show_case_url
def third_stage(file_id, models=None):
    from LDA_process import loop_models_one_file
    if isinstance(models, list):  # It means CLI mode
        models[1]["file_id"] = str(file_id)
        loop_models_one_file(models[0], **models[1])  # 0 - models, 1 - topic paths
    else:
        loop_models_one_file(
            models,
            file_id=str(file_id),
            topic_4_model=parser.get('main_pipeline', 'topic_4_model'),
            topic_4_data_dir=parser.get('main_pipeline', 'topic_4_data_dir'),
            topic_6_model=parser.get('main_pipeline', 'topic_6_model'),
            topic_6_data_dir=parser.get('main_pipeline', 'topic_6_data_dir'),
            topic_10_model=parser.get('main_pipeline', 'topic_10_model'),
            topic_10_data_dir=parser.get('main_pipeline', 'topic_10_data_dir'),
            hdp_model=parser.get('main_pipeline', 'hdp_model'))
def classify_discourse_tree(filename, results_dict_list, dis_dir, topicNumber,
                            text_tuples, mode='LDA',
                            script_path=parser.get(
                                'LDA_process', 'classify_discourse_tree_script_path')):
    dir_path = os.path.join('output/topic_class', str(topicNumber))
    if not os.path.exists(dir_path):
        os.mkdir(dir_path)
    print(bcolors.OKGREEN + 'Classifying topics for file: ' + filename + bcolors.ENDC)
    file_name = filename[:-4]
    file_path = os.path.join(dis_dir, filename)
    parsing_output = subprocess.check_output(['python2', script_path, file_path])
    parsed_json = json.loads(parsing_output)
    sent_counter = 0
    sent_counter = recursive_read_text(parsed_json, results_dict_list,
                                       sent_counter, text_tuples, mode=mode)
    # with open(os.path.join('output/topic_class', file_name + '_topic.txt'), mode='w') as output:
    with open(os.path.join(dir_path, file_name + '_topic.txt'), mode='w') as output:
        json.dump(parsed_json, output)
    # Write file data to a pickle item
    pickle_path = os.path.join(dir_path, 'textData')
    if not os.path.exists(pickle_path):
        os.mkdir(pickle_path)
    with open(os.path.join(pickle_path, filename + '.pickle'), mode='wb') as pickle_file:
        pickle.dump(text_tuples, pickle_file)
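# Hedged, self-contained sketch of the subprocess + JSON round trip used above,
# with a throwaway inline script standing in for the external python2 classifier:
def _subprocess_json_demo():
    import sys
    out = subprocess.check_output(
        [sys.executable, '-c',
         "import json; print(json.dumps({'text': 'EDU', 'topic': 3}))"])
    return json.loads(out)  # {'text': 'EDU', 'topic': 3}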
def write_text_to_file(fileId, newSummary):
    OUTPUT_SUMMARY_PATH = Path(parser.get('summarizerWS', 'OUTPUT_SUMMARY_PATH'))
    with open(OUTPUT_SUMMARY_PATH / (str(fileId) + '_NuVec_NI_TI.txt'), mode='w') as outFile:
        outFile.writelines(newSummary)
def cluster_eval(topic_number, log_path=None):
    """# Get list of all processed files
    if log_path is None:
        print(f'{bcolors.FAIL} Please provide log path {bcolors.ENDC}')
        exit(1)
    # proc_files = processed_list('logs/D29_04_20M17_37.txt')
    proc_files = processed_list(log_path)
    print(proc_files)
    check_files(proc_files)"""
    # Process all the files that finished discourse parsing
    # third stage & fourth stage
    # Can be uncommented if running files separately - Stages 1+2 and then 3+4
    """
    for file_id in proc_files:
        f_id = int(file_id[:-4])  # remove .txt and convert to int value
        if config_file is None:
            # run stages
            mp.third_stage(f_id, {'4': True, '6': False, '10': False, 'hdp': False})
            mp.fourth_stage(f_id)
            mp.show_case(f_id, 4)
        else:
            argument_list = [
                config_file['third_stage']['models'],
                config_file['third_stage']['models_path']
            ]
            mp.third_stage(f_id, argument_list)  # Working on all of the models
            # Fourth stage
            # mp.fourth_stage(f_id)
            for key in config_file['third_stage']['models'].keys():
                if config_file['third_stage']['models'][key] is True:
                    mp.fourth_stage(f_id, key)
                    mp.show_case(f_id, int(key))
        input('Finished processing - press space+enter')
    """
    nucleus_path = os.path.join(parser.get('cluster_eval', 'evaluation_list'),
                                os.path.join(str(topic_number), 'nucleus'))
    proc_files = evaluation_list(nucleus_path)
    OTHER_SYSTEM = '/home/tzvi/PycharmProjects/HSdataprocessLinux/summarizerWS/summaries'
    create_clustering_corpus(parser.get('cluster_eval', 'original_data'),
                             parser.get('cluster_eval', 'cluster_folder'),
                             file_name='corpus.csv')
    cluster_folder_path = parser.get('cluster_eval', 'cluster_folder')
    print('Finished processing the data\nStarting to cluster....')
    start_time = time.time()
    # Calculate clustering for the <!-original-!> files
    print(f'{bcolors.WARNING}Clustering original files{bcolors.ENDC}')
    truth, original_M = run_kmeans(
        os.path.join(cluster_folder_path, 'corpus.csv'),
        os.path.join(cluster_folder_path, 'output_plots'),
        proc_files, 'clustering_report.txt')
    run_time = time.time() - start_time
    print(f'{bcolors.OKBLUE} --- {run_time} seconds --- {bcolors.ENDC}')
    print(f'{bcolors.WARNING}Creating new corpus{bcolors.ENDC}')
    # Create a new clustering corpus from the processed files
    create_clustering_corpus(OTHER_SYSTEM, cluster_folder_path,
                             file_name='after_corpus.csv', customSystemTag='')
    start_time = time.time()
    print(f'{bcolors.WARNING}Clustering for processed files{bcolors.ENDC}')
    # Calculate clustering for the <!-processed-!> files
    predict, predict_M = run_kmeans(
        os.path.join(cluster_folder_path, 'after_corpus.csv'),
        os.path.join(cluster_folder_path, 'after_output_plots'),
        proc_files, 'after_clustering_report.txt')
    run_time = time.time() - start_time
    print(f'{bcolors.OKBLUE} --- {run_time} seconds --- {bcolors.ENDC}')
    report = create_confusion_matrix(truth, predict)
    with open('after_clustering_report.txt', 'a') as report_file:
        for key in report.keys():
            report_file.write('\n' + key + '\n' + str(report[key]))
    report_df = pd.DataFrame([original_M, predict_M, report],
                             index=['Texts', 'Summaries', 'report'])
    report_df.to_excel("eval_report.xlsx")
        sg.Text(
            '- Identify sections in the text\n- Create xml file\n- Create new text file based on xml file created',
            font=('Helvetica', 11))
    ],
    [
        sg.Text('Input file path', size=(15, 1), justification='center'),
        sg.Input(disabled=True, size=(40, 1), key='filePathBrowse'),
        sg.FileBrowse()
    ],
    [
        sg.Text('Discourse input\nfolder path', size=(15, 2), justification='center'),
        sg.Input(disabled=True, size=(40, 1),
                 default_text=parser.get('main_pipeline', 'discourseInput'),
                 key='discoursePathBrowse'),
        sg.FileBrowse()
    ],
    [
        sg.Button('Process', key='process_button'),
        sg.Button('Next Stage', key='nextStage_button', visible=False)
    ]
]
# endregion

# region ----- Second Stage layout ------------------
second_stage_layout = [
    [
        sg.Text('Second Stage\n--- Discourse Parsing ---', size=(30, 2),
def decide_topic_2(self, tree_height,
                   mode=parser.get('nucleus_weight_approach', 'mode')):
    """
    Decides the unit's overall topic number.
    Mode:
        NS - Nucleus Importance maximum approach
        V  - vector approach
    """
    def mul_vector(val, vector):
        return [(item[0], val * item[1]) for item in vector]

    unit_topics = dict()
    # Weights
    W_HEIGHT = 0.5
    W_POSITION = 0.3
    W_NS = 0.2
    if len(self.leaf_nodes):
        for leaf_node in self.leaf_nodes.values():
            # F1 - height
            F1 = (tree_height + 1 - leaf_node.tree_depth) / (tree_height + 1)
            # F2 - position
            F2 = 1 if leaf_node.position.right else 0
            # F3 - nucleus / satellite
            F3 = 1 if leaf_node.node_class == 'N' else 0
            # NS score
            NS = W_HEIGHT * F1 + W_POSITION * F2 + W_NS * F3
            leaf_node.ni_score = NS
            if mode == 'V':
                self.unit_topic_vector = mul_vector(
                    NS,
                    self.unit_topic_vector if len(self.unit_topic_vector) else leaf_node.topic_vector)
                unit_vector_topic_class = max(self.unit_topic_vector, key=lambda t: t[1])
            if leaf_node.topic_class not in unit_topics:
                unit_topics[leaf_node.topic_class] = NS
            else:
                unit_topics[leaf_node.topic_class] += NS
            # region Write data to pickle file
            # NOTE: file_name is assumed to be defined outside this method (it is not a parameter here)
            if not os.path.exists(file_name):
                with open(file_name, mode='wb') as dataFile:
                    cur_tup = [
                        (leaf_node.node_number_L, leaf_node.node_number_R,
                         leaf_node.text, leaf_node.topic_class,
                         leaf_node.ni_score, leaf_node.node_class,
                         leaf_node.topic_vector)
                    ]
                    pickle.dump(cur_tup, dataFile)
            else:
                # Read the pickle file, append data and write again
                with open(file_name, mode='rb') as dataFile:
                    cur_tup = pickle.load(dataFile)
                with open(file_name, mode='wb') as dataFile:
                    new_tup = (leaf_node.node_number_L, leaf_node.node_number_R,
                               leaf_node.text, leaf_node.topic_class,
                               leaf_node.ni_score, leaf_node.node_class,
                               leaf_node.topic_vector)
                    if new_tup not in cur_tup:
                        cur_tup.append(new_tup)
                    pickle.dump(cur_tup, dataFile)
            # endregion
        N_max = max(unit_topics.values())
        topic_max = find_max(unit_topics, N_max)
        if mode == 'V':
            # Mode V
            self.unit_topic = unit_vector_topic_class[0]
            self.unit_topic_score = unit_vector_topic_class[1]
        else:
            # Mode NS
            self.unit_topic = topic_max
            self.unit_topic_score = N_max
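# Hedged worked example of the NS score above (illustrative numbers only): a nucleus
# leaf (F3 = 1) on the right side of its parent (F2 = 1) at depth 2 in a tree of
# height 4 gives F1 = (4 + 1 - 2) / (4 + 1) = 0.6, so
# NS = 0.5 * 0.6 + 0.3 * 1 + 0.2 * 1 = 0.8.
def ns_score_example(tree_height=4, tree_depth=2, is_right=True, is_nucleus=True,
                     w_height=0.5, w_position=0.3, w_ns=0.2):
    f1 = (tree_height + 1 - tree_depth) / (tree_height + 1)
    f2 = 1 if is_right else 0
    f3 = 1 if is_nucleus else 0
    return w_height * f1 + w_position * f2 + w_ns * f3  # 0.8 for the default values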
def first_stage(inputFile_path,
                discourseInput=parser.get('main_pipeline', 'discourseInput')):
    import preprocess
    preprocess.pre_process_single_file(inputFile_path, discourseInput)
def second_stage(xml_result_path,
                 discourse_script_path=parser.get('main_pipeline', 'discourse_script_path')):
    import os
    os.system(' '.join(['python2', discourse_script_path, xml_result_path]))
def create_model_dict(model):
    model_dict = dict()
    model_data = model.print_topics()
    print(model_data)
    for t in model_data:
        model_dict[t[0]] = ' | '.join([
            item.strip()[6:] for item in t[1].replace('"', '').split('+')[0:4]
        ])
    return model_dict


# Load models
topic_labels = dict()
topic4 = gensim.models.ldamodel.LdaModel.load(parser.get('final_stage', 'load_4topic'))
topic4 = create_model_dict(topic4)
topic_labels['4'] = topic4
topic6 = gensim.models.ldamodel.LdaModel.load(parser.get('final_stage', 'load_6topic'))
topic6 = create_model_dict(topic6)
topic_labels['6'] = topic6
topic10 = gensim.models.ldamodel.LdaModel.load(parser.get('final_stage', 'load_10topic'))
topic10 = create_model_dict(topic10)
topic_labels['10'] = topic10

pp = pprint.PrettyPrinter()

# region <------------------------------ Base tree struct functions --------------------------------------------->
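# Hedged sketch of what create_model_dict() extracts: gensim's print_topics() yields
# (topic_id, 'weight*"word" + ...') pairs, and the slice [6:] keeps the word after a
# fixed-width '0.0XX*' prefix. The topic string below is invented for illustration.
def _create_model_dict_demo():
    sample_topics = [(0, '0.034*"blood" + 0.021*"pressure" + 0.018*"heart" + 0.010*"risk" + 0.009*"dose"')]
    return {
        t_id: ' | '.join(item.strip()[6:]
                         for item in words.replace('"', '').split('+')[0:4])
        for t_id, words in sample_topics
    }  # {0: 'blood | pressure | heart | risk'}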
import os
from pathlib import Path

import LDA_classifier as classifier
from print_colors import bcolors
from project_config import parser


# Change working directory
def change_working_dir():
    work_str = Path(os.getcwd())
    os.chdir(work_str.parent)


# change_working_dir()

# Directory that contains all the CSV files for each file with topic/text data
# OUTPUT_FINAL_STAGE_PATH = 'output/final_stage'
OUTPUT_FINAL_STAGE_PATH = parser.get('summarizerWS', 'OUTPUT_FINAL_STAGE_PATH')

# region Model loading
MODEL_PATH = parser.get('summarizerWS', 'MODEL_PATH')
DATA_DIR = parser.get('summarizerWS', 'DATA_DIR')
# MODEL_PATH = '/home/tzvi/PycharmProjects/HSdataprocessLinux/gensim_models/10topics/lda_model_trained_10topics.model'
# DATA_DIR = '/home/tzvi/PycharmProjects/HSdataprocessLinux/gensim_files/10Topic'


# Working with the 10 topic model
def load_LDA_model(topic_model, topic_data_dir):
    classifier.load_Model_local(topic_model)
    classifier.load_data_local(topic_data_dir)
    classifier.print_model_topics()
    print(f'{bcolors.HEADER} \t [!!]\tLDA model loaded!\t[!!] {bcolors.ENDC}')