def __init__(self, train_processed_path, abbr_inventory_path, use_pretrain=False, use_softmax=False):
    """
    Initialize environment & model.
    """
    # Initialize processor and tokenizer
    self.pre_processor = TextProcessor([
        white_space_remover_upmc,
        sub_deid_patterns_upmc])
    self.tokenizer = CoreNLPTokenizer()
    self.post_processor = TextProcessor([AbbrDetector(abbr_inventory_path)])
    self.filter_processor = TextProcessor([
        TextTokenFilter(),
        repeat_non_word_remover])

    # Load model
    train_path = train_processed_path + '/fasttext'
    if use_pretrain:
        model_path = train_path + '/model/pre_train'
    else:
        model_path = train_path + '/model'
    if use_softmax:
        model_file = model_path + '/all_softmax.bin'
    else:
        model_file = model_path + '/all.bin'
    self.model = load_model(model_file)
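# Usage sketch (not in the original source): how this initializer might be
# invoked. The wrapping class name "Environment" and the paths below are
# hypothetical placeholders; only the directory layout
# (<train_processed_path>/fasttext/model[/pre_train]/all[_softmax].bin)
# is taken from the code above.
#
#     env = Environment(
#         train_processed_path="/data/upmc/processed",
#         abbr_inventory_path="/data/abbr_inventory.pkl",
#         use_pretrain=False,
#         use_softmax=True)
#     # env.model is the fastText model loaded from
#     # /data/upmc/processed/fasttext/model/all_softmax.bin, and the
#     # processors are ready for abbreviation detection and filtering.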
all_sense_inventory = merge_inventories(train_sense_inventory, test_sense_inventory)
all_sense_inventory_invalid = merge_inventories(train_sense_inventory_invalid, test_sense_inventory_invalid)

# Save sense inventories to JSON
json_writer(train_sense_inventory, share_processed_path + "/train_sense_inventory.json")
json_writer(test_sense_inventory, share_processed_path + "/test_sense_inventory.json")
json_writer(all_sense_inventory, share_processed_path + "/all_sense_inventory.json")
json_writer(all_sense_inventory_invalid, share_processed_path + "/all_sense_inventory_invalid.json")

# Initialize processor and tokenizer
processor = TextProcessor([
    white_space_remover,
    sub_deid_patterns_mimic])
tokenizer = CoreNLPTokenizer()
token_filter = TextTokenFilter()
filter_processor = TextProcessor([
    token_filter,
    repeat_non_word_remover,
    recover_upper_cui])

# Pre-processing
share_txt = processor.process_texts(share_txt_all_annotated, n_jobs=30)
# Tokenizing
share_txt_tokenized = tokenizer.process_texts(share_txt, n_jobs=30)
# Filter trivial tokens and remove repeated non-words
share_txt_filtered = filter_processor.process_texts(share_txt_tokenized, n_jobs=30)
# Write to file
txt_writer(share_txt_filtered, share_processed_path + "/share_all_processed.txt")
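# Sketch (an assumption, not part of this script): the filtered corpus written
# above is plain tokenized text of the kind a fastText model, such as the one
# loaded by the initializer earlier, could be trained on. A minimal,
# hypothetical training step with the official fasttext package might look like:
#
#     import fasttext
#     ft_model = fasttext.train_unsupervised(
#         input=share_processed_path + "/share_all_processed.txt",
#         model="skipgram",
#         dim=100)
#     ft_model.save_model(share_processed_path + "/fasttext_skipgram.bin")
#
# The output filename here is illustrative only; the actual training setup and
# model locations are not shown in this snippet.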