all_sense_inventory = merge_inventories(train_sense_inventory, test_sense_inventory)
all_sense_inventory_invalid = merge_inventories(train_sense_inventory_invalid, test_sense_inventory_invalid)

# save sense inventory to json
json_writer(train_sense_inventory, share_processed_path + "/train_sense_inventory.json")
json_writer(test_sense_inventory, share_processed_path + "/test_sense_inventory.json")
json_writer(all_sense_inventory, share_processed_path + "/all_sense_inventory.json")
json_writer(all_sense_inventory_invalid, share_processed_path + "/all_sense_inventory_invalid.json")

# Initialize processor and tokenizer
processor = TextProcessor([
    white_space_remover,
    sub_deid_patterns_mimic])

toknizer = CoreNLPTokenizer()

token_filter = TextTokenFilter()
filter_processor = TextProcessor([
    token_filter,
    repeat_non_word_remover,
    recover_upper_cui])

# pre-processing
share_txt = processor.process_texts(share_txt_all_annotated, n_jobs=30)
# tokenizing
share_txt_tokenized = toknizer.process_texts(share_txt, n_jobs=30)
# Filter trivial tokens and Remove repeat non-words
share_txt_filtered = filter_processor.process_texts(share_txt_tokenized, n_jobs=30)
# Write to file
txt_writer(share_txt_filtered, share_processed_path + "/share_all_processed.txt")
        UMN_sense_cui_inventory[abbr][long_form] = None

json_writer(UMN_sense_cui_inventory, umn_processed_path + "/UMN_sense_cui_inventory.json")

#############################
# Process UMN documents
#############################

umn_txt_marked = add_abbr_marker_umn(umn_txt)

# Initialize processor and tokenizer
processor = TextProcessor([
    white_space_remover,
    sub_deid_patterns_umn])

toknizer = CoreNLPTokenizer()

token_filter = TextTokenFilter()
filter_processor = TextProcessor([
    token_filter,
    repeat_non_word_remover,
    recover_upper_cui])

# pre-processing
umn_txt = processor.process_texts(umn_txt_marked, n_jobs=30)
# tokenizing
umn_txt_tokenized = toknizer.process_texts(umn_txt, n_jobs=30)
# add real annotations
umn_txt_annotated = add_annotation_umn(UMN_sense_cui_inventory, umn_txt_tokenized)
# Filter trivial tokens and Remove repeat non-words
umn_txt_filtered = filter_processor.process_texts(umn_txt_annotated, n_jobs=30)
# Write to file
txt_writer(umn_txt_filtered, umn_processed_path + "/umn_processed.txt")
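# For reference, the inventory written above maps each abbreviation to its long forms,
# with a CUI as the value where one was resolved and None (null in the JSON) otherwise,
# as implied by the assignment `UMN_sense_cui_inventory[abbr][long_form] = None` above.
# The abbreviation, long forms, and CUI below are illustrative placeholders only, not
# entries from the actual UMN inventory:
#
# {
#     "RA": {
#         "rheumatoid arthritis": "C0003873",
#         "right atrium": null
#     }
# }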
# Read original sense inventory (only one word abbrs)
MSH_sense_inventory_one_word, MSH_sense_inventory = sense_inventory_msh(msh_path + "/benchmark_mesh.txt", abbr_list)

# save sense inventory to json
json_writer(MSH_sense_inventory_one_word, msh_processed_path + "/MSH_sense_inventory_one_word.json")
json_writer(MSH_sense_inventory, msh_processed_path + "/MSH_sense_inventory.json")

#############################
# Process MSH documents (only one word abbrs)
#############################

msh_txt_annotated = add_annotation_msh(MSH_sense_inventory_one_word, msh_path)

# Initialize processor and tokenizer
processor = TextProcessor([white_space_remover])

toknizer = CoreNLPTokenizer()

token_filter = TextTokenFilter()
filter_processor = TextProcessor([
    token_filter,
    repeat_non_word_remover,
    recover_upper_cui])

# pre-processing
msh_txt = processor.process_texts(msh_txt_annotated, n_jobs=10)
# tokenizing
msh_txt_tokenized = toknizer.process_texts(msh_txt, n_jobs=10)
# Filter trivial tokens and Remove repeat non-words
msh_txt_filtered = filter_processor.process_texts(msh_txt_tokenized, n_jobs=10)
# Write to file
txt_writer(msh_txt_filtered, msh_processed_path + "/msh_processed.txt")
class AbbrDisambiguation:

    def __init__(self, train_processed_path, abbr_inventory_path, use_pretrain=False, use_softmax=False):
        """Initialize environment & model."""
        # Initialize processors and tokenizer
        self.pre_processor = TextProcessor([
            white_space_remover_upmc,
            sub_deid_patterns_upmc])
        self.tokenizer = CoreNLPTokenizer()
        self.post_processor = TextProcessor([AbbrDetector(abbr_inventory_path)])
        self.filter_processor = TextProcessor([
            TextTokenFilter(),
            repeat_non_word_remover])

        # Load model
        train_path = train_processed_path + '/fasttext'
        if use_pretrain:
            model_path = train_path + '/model/pre_train'
        else:
            model_path = train_path + '/model'
        if use_softmax:
            model_file = model_path + '/all_softmax.bin'
        else:
            model_file = model_path + '/all.bin'
        self.model = load_model(model_file)

    def process_single_text(self, text, save_json_path=None):
        """Process one text."""
        #############################
        # Process document
        #############################

        # pre-processing
        text = self.pre_processor.process_single_text(text)
        # tokenizing
        text_tokenized = self.tokenizer.process_single_text(text)
        # detect abbrs
        text_detected = self.post_processor.process_single_text(text_tokenized)
        # Filter trivial tokens and Remove repeat non-words
        text_filtered = self.filter_processor.process_single_text(text_detected)

        #############################
        # Build index
        #############################
        result_collector = AbbrInstanceCollectorUPMC([text_detected])
        abbr_index_result, document_no_mark_result = result_collector.generate_inverted_index()
        result_global_idx_mapper = global_instance_idx_mapper(abbr_index_result)

        pred_collector = AbbrInstanceCollectorUPMC([text_filtered])
        abbr_index_pred, document_no_mark_pred = pred_collector.generate_inverted_index()
        abbr_instances_pred = instance_generator(abbr_index_pred, Doc(document_no_mark_pred))

        #############################
        # Do classification
        #############################
        wsd_results = fasttext_classifier(self.model, abbr_index_pred, abbr_instances_pred, result_global_idx_mapper)
        return save_result_to_json(wsd_results, document_no_mark_result, save_json_path)

    def process_texts(self, text_list, save_json_path=None, n_jobs=8):
        """Process a list of texts."""
        #############################
        # Process documents
        #############################

        # pre-processing
        text = self.pre_processor.process_texts(text_list, n_jobs=n_jobs)
        # tokenizing
        text_tokenized = self.tokenizer.process_texts(text, n_jobs=n_jobs)
        # detect abbrs
        text_detected = self.post_processor.process_texts(text_tokenized, n_jobs=n_jobs)
        # Filter trivial tokens and Remove repeat non-words
        text_filtered = self.filter_processor.process_texts(text_detected, n_jobs=n_jobs)

        #############################
        # Build index
        #############################
        print("Building index...")
        result_collector = AbbrInstanceCollectorUPMC(text_detected)
        abbr_index_result, document_no_mark_result = result_collector.generate_inverted_index()
        result_global_idx_mapper = global_instance_idx_mapper(abbr_index_result)

        pred_collector = AbbrInstanceCollectorUPMC(text_filtered)
        abbr_index_pred, document_no_mark_pred = pred_collector.generate_inverted_index()
        abbr_instances_pred = instance_generator(abbr_index_pred, Doc(document_no_mark_pred))

        #############################
        # Do classification
        #############################
        print("Predicting...")
        wsd_results = fasttext_classifier(self.model, abbr_index_pred, abbr_instances_pred, result_global_idx_mapper)
        return save_result_to_json(wsd_results, document_no_mark_result, save_json_path)
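# A minimal usage sketch for the class above. The file paths and example sentences are
# placeholders (assumptions, not part of the original code); they only illustrate the
# intended call pattern: construct the disambiguator once, then feed it raw text.
if __name__ == '__main__':
    disambiguator = AbbrDisambiguation(
        train_processed_path='/data/upmc/processed',     # hypothetical path
        abbr_inventory_path='/data/abbr_inventory.pkl',  # hypothetical path
        use_pretrain=True,
        use_softmax=False)

    # Single document: runs the full pipeline and optionally writes results to JSON.
    result = disambiguator.process_single_text(
        "Pt c/o SOB and CP, hx of RA.",
        save_json_path='/tmp/single_result.json')

    # Batch of documents, processed with 8 worker processes by default.
    results = disambiguator.process_texts(
        ["Pt c/o SOB and CP.", "Hx of MS, on tx."],
        save_json_path='/tmp/batch_results.json')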
    # read file
    filename = 'processed_text_chunk_%s.json' % i
    print("-" * 50)
    print("Start File for %s" % filename)
    mimic_txt = []
    mimic_present_senses = []
    if not os.path.exists(PATH_FOLDER + filename):
        continue
    for line in open(PATH_FOLDER + filename, "r"):
        obj = json.loads(line)
        text = obj['TEXT']
        present_senses = obj['present_senses']
        mimic_txt.append(text)
        mimic_present_senses.append(present_senses)

    # pre-processing
    mimic_txt = processor.process_texts(mimic_txt, n_jobs=30)
    # tokenizing
    mimic_txt_tokenized = toknizer.process_texts(mimic_txt, n_jobs=40)
    # Filter trivial tokens
    mimic_txt_filtered = filter_processor.process_texts(mimic_txt_tokenized, n_jobs=40)
    # Replace long forms with abbrs (operates on the filtered, tokenized text)
    mimic_txt_processed = longform_replacer(mimic_txt_filtered, mimic_present_senses, inventory_rmapper, n_jobs=16)
    # Remove repeat non-words
    mimic_txt_processed = remove_repeat_processor.process_texts(mimic_txt_processed, n_jobs=40)

    # Save to file
    txt_writer(mimic_txt_processed, PATH_FOLDER_PROCESSED + '%s.txt' % filename[:-5])
# save sense inventory to json
json_writer(sense_inventory, dataset_processed_path + "/dataset_sense_inventory.json")

#############################
# Process DataSet documents (only one word abbrs)
#############################

dataset_txt_annotated = add_annotation_dataset(sense_inventory, dataset_path)

# Initialize processor and tokenizer
processor = TextProcessor([
    white_space_remover,
    sub_deid_patterns_dataset])

toknizer = CoreNLPTokenizer()

token_filter = TextTokenFilter()
filter_processor = TextProcessor([
    token_filter,
    repeat_non_word_remover,
    recover_upper_cui])

# pre-processing
dataset_txt = processor.process_texts(dataset_txt_annotated, n_jobs=30)
# tokenizing
dataset_txt_tokenized = toknizer.process_texts(dataset_txt, n_jobs=30)
# Filter trivial tokens and Remove repeat non-words
dataset_txt_filtered = filter_processor.process_texts(dataset_txt_tokenized, n_jobs=30)
# Write to file
txt_writer(dataset_txt_filtered, dataset_processed_path + "/dataset_processed.txt")
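# The same pre-process -> tokenize -> filter -> write pipeline recurs for the ShARe, UMN,
# MSH, and generic dataset scripts above. A helper along these lines (a sketch, not part
# of the original code; the function name and signature are assumptions) could remove the
# duplication while reusing only components that already appear in these scripts:

def run_standard_pipeline(txt_annotated, output_path, pre_processors, n_jobs=30):
    """Pre-process, tokenize, and filter annotated documents, then write them to disk."""
    processor = TextProcessor(pre_processors)
    tokenizer = CoreNLPTokenizer()
    filter_processor = TextProcessor(
        [TextTokenFilter(), repeat_non_word_remover, recover_upper_cui])

    txt = processor.process_texts(txt_annotated, n_jobs=n_jobs)
    txt_tokenized = tokenizer.process_texts(txt, n_jobs=n_jobs)
    txt_filtered = filter_processor.process_texts(txt_tokenized, n_jobs=n_jobs)
    txt_writer(txt_filtered, output_path)
    return txt_filtered

# For example, the dataset block above could then be expressed as:
# run_standard_pipeline(dataset_txt_annotated,
#                       dataset_processed_path + "/dataset_processed.txt",
#                       [white_space_remover, sub_deid_patterns_dataset],
#                       n_jobs=30)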