# combine corpus
share_txt_all_annotated = share_txt_train_annotated.copy()
share_txt_all_annotated.extend(share_txt_test_annotated)

all_sense_inventory = merge_inventories(train_sense_inventory, test_sense_inventory)
all_sense_inventory_invalid = merge_inventories(
    train_sense_inventory_invalid, test_sense_inventory_invalid)

# save sense inventories to json
json_writer(train_sense_inventory, share_processed_path + "/train_sense_inventory.json")
json_writer(test_sense_inventory, share_processed_path + "/test_sense_inventory.json")
json_writer(all_sense_inventory, share_processed_path + "/all_sense_inventory.json")
json_writer(all_sense_inventory_invalid, share_processed_path + "/all_sense_inventory_invalid.json")

# Initialize processor and tokenizer
processor = TextProcessor([
    white_space_remover,
    sub_deid_patterns_mimic])

tokenizer = CoreNLPTokenizer()

token_filter = TextTokenFilter()
filter_processor = TextProcessor([
    token_filter,
    repeat_non_word_remover,
    recover_upper_cui])

# pre-processing
share_txt = processor.process_texts(share_txt_all_annotated, n_jobs=30)
# tokenizing
share_txt_tokenized = tokenizer.process_texts(share_txt, n_jobs=30)
# Filter trivial tokens and remove repeat non-words
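# Illustrative sketch only (not part of the original pipeline): merge_inventories
# is assumed here to union two nested dicts of abbreviation -> long form -> value,
# keeping entries already present in the first inventory when both define the
# same long form. The name below is hypothetical; the real implementation lives
# in the dataset-specific module.
def _merge_inventories_sketch(inventory_a, inventory_b):
    """Hypothetical reference implementation of a sense-inventory merge."""
    merged = {abbr: dict(senses) for abbr, senses in inventory_a.items()}
    for abbr, senses in inventory_b.items():
        merged_senses = merged.setdefault(abbr, {})
        for long_form, value in senses.items():
            merged_senses.setdefault(long_form, value)
    return merged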
        if long_form in lf2cui_only_have_cui:
            UMN_sense_cui_inventory[abbr][long_form] = lf2cui_only_have_cui[long_form]
        else:
            UMN_sense_cui_inventory[abbr][long_form] = None

json_writer(UMN_sense_cui_inventory, umn_processed_path + "/UMN_sense_cui_inventory.json")

#############################
# Process UMN documents
#############################

umn_txt_marked = add_abbr_marker_umn(umn_txt)

# Initialize processor and tokenizer
processor = TextProcessor([white_space_remover, sub_deid_patterns_umn])
tokenizer = CoreNLPTokenizer()
token_filter = TextTokenFilter()
filter_processor = TextProcessor(
    [token_filter, repeat_non_word_remover, recover_upper_cui])

# pre-processing
umn_txt = processor.process_texts(umn_txt_marked, n_jobs=30)
# tokenizing
umn_txt_tokenized = tokenizer.process_texts(umn_txt, n_jobs=30)
# add real annotations
umn_txt_annotated = add_annotation_umn(UMN_sense_cui_inventory, umn_txt_tokenized)
# Filter trivial tokens and remove repeat non-words
umn_txt_filtered = filter_processor.process_texts(umn_txt_annotated, n_jobs=30)
abbr_list = find_abbrs(msh_path + '/12859_2010_4593_MOESM1_ESM.CSV')

# Read original sense inventory (only one-word abbrs)
MSH_sense_inventory_one_word, MSH_sense_inventory = sense_inventory_msh(
    msh_path + "/benchmark_mesh.txt", abbr_list)

# save sense inventories to json
json_writer(MSH_sense_inventory_one_word, msh_processed_path + "/MSH_sense_inventory_one_word.json")
json_writer(MSH_sense_inventory, msh_processed_path + "/MSH_sense_inventory.json")

#############################
# Process MSH documents (only one-word abbrs)
#############################

msh_txt_annotated = add_annotation_msh(MSH_sense_inventory_one_word, msh_path)

# Initialize processor and tokenizer
processor = TextProcessor([
    white_space_remover])

tokenizer = CoreNLPTokenizer()

token_filter = TextTokenFilter()
filter_processor = TextProcessor([
    token_filter,
    repeat_non_word_remover,
    recover_upper_cui])

# pre-processing
msh_txt = processor.process_texts(msh_txt_annotated, n_jobs=10)
# tokenizing
msh_txt_tokenized = tokenizer.process_texts(msh_txt, n_jobs=10)
# Filter trivial tokens and remove repeat non-words
msh_txt_filtered = filter_processor.process_texts(msh_txt_tokenized, n_jobs=10)

# Write to file
txt_writer(msh_txt_filtered, msh_processed_path + "/msh_processed.txt")
######################################
# Read texts from dataset
######################################

# File paths
data_path = "/home/luoz3/wsd_data"
upmc_all_path = data_path + "/upmc/batch1_4"
upmc_all_processed_path = upmc_all_path + "/processed"
os.makedirs(upmc_all_processed_path, exist_ok=True)

#############################
# Process DataSet documents (only one word abbrs)
#############################

# Initialize processor and tokenizer
token_filter = TextTokenFilter()
processor = TextProcessor([
    white_space_remover,
    token_filter,
    repeat_non_word_remover,
])

upmc_all_txt = txt_reader(data_path + "/upmc_batch1_4/upmc_no_mark_new.txt")
# pre-processing
upmc_all_txt = processor.process_texts(upmc_all_txt, n_jobs=30)
# Write to file
txt_writer(upmc_all_txt, upmc_all_processed_path + "/train_no_mark.txt")

print()
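# Unlike the other corpora, this pipeline skips CoreNLP tokenization and applies
# whitespace cleanup, token filtering, and repeat removal in a single processor.
# Quick illustrative check on an in-memory sample (placeholder sentences, not
# part of the original corpus; assumes process_texts accepts any list of strings):
sample_docs = [
    "pt  admitted with sob , r/o mi mi mi",
    "cxr showed  no acute process",
]
print(processor.process_texts(sample_docs, n_jobs=1))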
class AbbrDisambiguation:

    def __init__(self, train_processed_path, abbr_inventory_path, use_pretrain=False, use_softmax=False):
        """ Initialize environment & model. """
        # Initialize processor and tokenizer
        self.pre_processor = TextProcessor(
            [white_space_remover_upmc, sub_deid_patterns_upmc])
        self.tokenizer = CoreNLPTokenizer()
        self.post_processor = TextProcessor(
            [AbbrDetector(abbr_inventory_path)])
        self.filter_processor = TextProcessor(
            [TextTokenFilter(), repeat_non_word_remover])

        # Load model
        train_path = train_processed_path + '/fasttext'
        if use_pretrain:
            model_path = train_path + '/model/pre_train'
        else:
            model_path = train_path + '/model'
        if use_softmax:
            model_file = model_path + '/all_softmax.bin'
        else:
            model_file = model_path + '/all.bin'
        self.model = load_model(model_file)

    def process_single_text(self, text, save_json_path=None):
        """ Process one text. """
        #############################
        # Process document
        #############################

        # pre-processing
        text = self.pre_processor.process_single_text(text)
        # tokenizing
        text_tokenized = self.tokenizer.process_single_text(text)
        # detect abbrs
        text_detected = self.post_processor.process_single_text(text_tokenized)
        # Filter trivial tokens and remove repeat non-words
        text_filtered = self.filter_processor.process_single_text(text_detected)

        #############################
        # Build index
        #############################

        result_collector = AbbrInstanceCollectorUPMC([text_detected])
        abbr_index_result, document_no_mark_result = result_collector.generate_inverted_index()
        result_global_idx_mapper = global_instance_idx_mapper(abbr_index_result)

        pred_collector = AbbrInstanceCollectorUPMC([text_filtered])
        abbr_index_pred, document_no_mark_pred = pred_collector.generate_inverted_index()
        abbr_instances_pred = instance_generator(abbr_index_pred, Doc(document_no_mark_pred))

        #############################
        # Do classification
        #############################

        wsd_results = fasttext_classifier(
            self.model, abbr_index_pred, abbr_instances_pred, result_global_idx_mapper)

        return save_result_to_json(wsd_results, document_no_mark_result, save_json_path)

    def process_texts(self, text_list, save_json_path=None, n_jobs=8):
        """ Process list of texts. """
        #############################
        # Process document
        #############################

        # pre-processing
        text = self.pre_processor.process_texts(text_list, n_jobs=n_jobs)
        # tokenizing
        text_tokenized = self.tokenizer.process_texts(text, n_jobs=n_jobs)
        # detect abbrs
        text_detected = self.post_processor.process_texts(text_tokenized, n_jobs=n_jobs)
        # Filter trivial tokens and remove repeat non-words
        text_filtered = self.filter_processor.process_texts(text_detected, n_jobs=n_jobs)

        #############################
        # Build index
        #############################
        print("Building index...")
        result_collector = AbbrInstanceCollectorUPMC(text_detected)
        abbr_index_result, document_no_mark_result = result_collector.generate_inverted_index()
        result_global_idx_mapper = global_instance_idx_mapper(abbr_index_result)

        pred_collector = AbbrInstanceCollectorUPMC(text_filtered)
        abbr_index_pred, document_no_mark_pred = pred_collector.generate_inverted_index()
        abbr_instances_pred = instance_generator(abbr_index_pred, Doc(document_no_mark_pred))

        #############################
        # Do classification
        #############################
        print("Predicting...")
        wsd_results = fasttext_classifier(
            self.model, abbr_index_pred, abbr_instances_pred, result_global_idx_mapper)

        return save_result_to_json(wsd_results, document_no_mark_result, save_json_path)
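# Example usage (illustrative only): the paths and the input sentence below are
# placeholders, and the trained fastText model plus the abbreviation inventory
# must already exist on disk.
if __name__ == '__main__':
    disambiguator = AbbrDisambiguation(
        train_processed_path='/path/to/upmc/processed',
        abbr_inventory_path='/path/to/abbr_inventory',
        use_pretrain=False,
        use_softmax=False)
    # Returns the WSD result and optionally writes it to the given JSON path
    wsd_result = disambiguator.process_single_text(
        "Pt c/o CP, hx of MI, on ASA.",
        save_json_path='/path/to/output/wsd_result.json')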
if not os.path.exists(PATH_FOLDER_PROCESSED):
    os.makedirs(PATH_FOLDER_PROCESSED)

PATH_PROCESSED_INVENTORY_PKL = BASE_FOLDER + 'sense_inventory/final_cleaned_sense_inventory.cased.processed.pkl'

# Get pickle generated from mimic_inventory.py
inventory = pickle_reader(PATH_PROCESSED_INVENTORY_PKL)
inventory_rmapper = inventory['longform-abbr_cui']

######################################
# Processing
######################################

# Initialize processor and tokenizer
processor = TextProcessor([
    white_space_remover,
    sub_deid_patterns_mimic])

tokenizer = CoreNLPTokenizer()

token_filter = TextTokenFilter()
filter_processor = TextProcessor([
    token_filter])

remove_repeat_processor = TextProcessor([repeat_non_word_remover])

for i in range(42):
    # read file
    filename = 'processed_text_chunk_%s.json' % i
    print("-" * 50)
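    # Illustrative sketch of a per-chunk body (not the original code): assuming each
    # chunk file holds a JSON list of note strings and that `json` and `txt_writer`
    # are available in this module, the processors defined above would typically be
    # applied in sequence and the result written back out, e.g.:
    #
    #   with open(filename) as f:          # directory prefix omitted here
    #       notes = json.load(f)
    #   notes = processor.process_texts(notes, n_jobs=30)
    #   notes_tokenized = tokenizer.process_texts(notes, n_jobs=30)
    #   notes_filtered = filter_processor.process_texts(notes_tokenized, n_jobs=30)
    #   txt_writer(notes_filtered, PATH_FOLDER_PROCESSED + filename.replace('.json', '.txt'))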
# os.makedirs(dataset_processed_path, exist_ok=True)

# # fix annotation error
# with open(dataset_path + "/training_data.txt") as input, open(dataset_path + "/training_data_fixed.txt", "w") as output:
#     for line in input:
#         new_line = " ".join([replace(token) for token in line.rstrip("\n").split(" ")])
#         output.write(new_line + "\n")

#############################
# Process DataSet documents (only one-word abbrs)
#############################

# dataset_txt_annotated = txt_reader(dataset_path + "/training_data_fixed.txt")

# Initialize processor and tokenizer
processor = TextProcessor([
    white_space_remover])

tokenizer = CoreNLPTokenizer()

token_filter = TextTokenFilter()
filter_processor = TextProcessor([
    token_filter,
    repeat_non_word_remover,
    recover_upper_cui])

all_processor = TextProcessor([
    white_space_remover,
    token_filter,
    repeat_non_word_remover,
    recover_upper_cui])