Example #1
def __init__(self,
             train_processed_path,
             abbr_inventory_path,
             use_pretrain=False,
             use_softmax=False):
    """
    Initialize environment & model.
    """
    # Initialize processor and tokenizer
    self.pre_processor = TextProcessor(
        [white_space_remover_upmc, sub_deid_patterns_upmc])
    self.tokenizer = CoreNLPTokenizer()
    self.post_processor = TextProcessor(
        [AbbrDetector(abbr_inventory_path)])
    self.filter_processor = TextProcessor(
        [TextTokenFilter(), repeat_non_word_remover])
    # Load model
    train_path = train_processed_path + '/fasttext'
    if use_pretrain:
        model_path = train_path + '/model/pre_train'
    else:
        model_path = train_path + '/model'
    if use_softmax:
        model_file = model_path + '/all_softmax.bin'
    else:
        model_file = model_path + '/all.bin'
    self.model = load_model(model_file)
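The two flags above select one of four fastText model files. A small helper that reproduces the same path logic with os.path.join (the helper name is ours, not part of the project) could look like this:

import os

def resolve_model_file(train_processed_path, use_pretrain=False, use_softmax=False):
    # Mirrors the branching in __init__: pre-trained models live in a
    # 'pre_train' subdirectory, and the softmax variant has its own file name.
    model_dir = os.path.join(train_processed_path, 'fasttext', 'model')
    if use_pretrain:
        model_dir = os.path.join(model_dir, 'pre_train')
    file_name = 'all_softmax.bin' if use_softmax else 'all.bin'
    return os.path.join(model_dir, file_name)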
Example #2
    # combine corpus
    share_txt_all_annotated = share_txt_train_annotated.copy()
    share_txt_all_annotated.extend(share_txt_test_annotated)
    all_sense_inventory = merge_inventories(train_sense_inventory, test_sense_inventory)
    all_sense_inventory_invalid = merge_inventories(train_sense_inventory_invalid, test_sense_inventory_invalid)

    # save sense inventory to json
    json_writer(train_sense_inventory, share_processed_path + "/train_sense_inventory.json")
    json_writer(test_sense_inventory, share_processed_path + "/test_sense_inventory.json")
    json_writer(all_sense_inventory, share_processed_path + "/all_sense_inventory.json")
    json_writer(all_sense_inventory_invalid, share_processed_path + "/all_sense_inventory_invalid.json")

    # Initialize processor and tokenizer
    processor = TextProcessor([
        white_space_remover,
        sub_deid_patterns_mimic])

    toknizer = CoreNLPTokenizer()

    token_filter = TextTokenFilter()
    filter_processor = TextProcessor([
        token_filter,
        repeat_non_word_remover,
        recover_upper_cui])

    # pre-processing
    share_txt = processor.process_texts(share_txt_all_annotated, n_jobs=30)
    # tokenizing
    share_txt_tokenized = toknizer.process_texts(share_txt, n_jobs=30)
    # Filter trivial tokens and Remove repeat non-words
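The example is cut off after the filtering comment. Following the pattern of the other examples in this listing, the remaining steps would presumably look like the sketch below (the variable name and output file name are assumptions):

    share_txt_filtered = filter_processor.process_texts(share_txt_tokenized, n_jobs=30)
    # Write to file (output name is assumed)
    txt_writer(share_txt_filtered, share_processed_path + "/share_all_processed.txt")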
Example #3
            if long_form in lf2cui_only_have_cui:
                UMN_sense_cui_inventory[abbr][
                    long_form] = lf2cui_only_have_cui[long_form]
            else:
                UMN_sense_cui_inventory[abbr][long_form] = None
    json_writer(UMN_sense_cui_inventory,
                umn_processed_path + "/UMN_sense_cui_inventory.json")

    #############################
    # Process UMN documents
    #############################

    umn_txt_marked = add_abbr_marker_umn(umn_txt)

    # Initialize processor and tokenizer
    processor = TextProcessor([white_space_remover, sub_deid_patterns_umn])

    toknizer = CoreNLPTokenizer()
    token_filter = TextTokenFilter()
    filter_processor = TextProcessor(
        [token_filter, repeat_non_word_remover, recover_upper_cui])

    # pre-processing
    umn_txt = processor.process_texts(umn_txt_marked, n_jobs=30)
    # tokenizing
    umn_txt_tokenized = toknizer.process_texts(umn_txt, n_jobs=30)
    # add real annotations
    umn_txt_annotated = add_annotation_umn(UMN_sense_cui_inventory,
                                           umn_txt_tokenized)
    # Filter trivial tokens and Remove repeat non-words
    umn_txt_filtered = filter_processor.process_texts(umn_txt_annotated, n_jobs=30)
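The example is truncated here; a plausible final step, mirroring Example #4, would write the filtered text out (the output file name is an assumption):

    # Write to file (sketch; output name is assumed)
    txt_writer(umn_txt_filtered, umn_processed_path + "/umn_processed.txt")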
Example #4
    abbr_list = find_abbrs(msh_path + '/12859_2010_4593_MOESM1_ESM.CSV')

    # Read original sense inventory (only one word abbrs)
    MSH_sense_inventory_one_word, MSH_sense_inventory = sense_inventory_msh(msh_path+"/benchmark_mesh.txt", abbr_list)

    # save sense inventory to json
    json_writer(MSH_sense_inventory_one_word, msh_processed_path + "/MSH_sense_inventory_one_word.json")
    json_writer(MSH_sense_inventory, msh_processed_path + "/MSH_sense_inventory.json")

    #############################
    # Process MSH documents (only one word abbrs)
    #############################
    msh_txt_annotated = add_annotation_msh(MSH_sense_inventory_one_word, msh_path)

    # Initialize processor and tokenizer
    processor = TextProcessor([
        white_space_remover])
    toknizer = CoreNLPTokenizer()
    token_filter = TextTokenFilter()
    filter_processor = TextProcessor([
        token_filter,
        repeat_non_word_remover,
        recover_upper_cui])

    # pre-processing
    msh_txt = processor.process_texts(msh_txt_annotated, n_jobs=10)
    # tokenizing
    msh_txt_tokenized = toknizer.process_texts(msh_txt, n_jobs=10)
    # Filter trivial tokens and Remove repeat non-words
    msh_txt_filtered = filter_processor.process_texts(msh_txt_tokenized, n_jobs=10)
    # Write to file
    txt_writer(msh_txt_filtered, msh_processed_path+"/msh_processed.txt")
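Examples #2, #3 and #4 repeat the same four-stage pattern: pre-process, tokenize, filter, write. A generic helper capturing that pattern, built only from the components shown above, might look like this (a sketch, not part of the project):

def run_pipeline(texts, out_path, n_jobs=10):
    # Pre-process -> tokenize -> filter -> write, mirroring the MSH example.
    processor = TextProcessor([white_space_remover])
    toknizer = CoreNLPTokenizer()
    filter_processor = TextProcessor(
        [TextTokenFilter(), repeat_non_word_remover, recover_upper_cui])

    texts = processor.process_texts(texts, n_jobs=n_jobs)
    texts = toknizer.process_texts(texts, n_jobs=n_jobs)
    texts = filter_processor.process_texts(texts, n_jobs=n_jobs)
    txt_writer(texts, out_path)
    return texts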
Example #5
    ######################################
    # Read texts from dataset
    ######################################

    # File paths
    data_path = "/home/luoz3/wsd_data"
    upmc_all_path = data_path + "/upmc/batch1_4"
    upmc_all_processed_path = upmc_all_path + "/processed"
    os.makedirs(upmc_all_processed_path, exist_ok=True)

    #############################
    # Process DataSet documents (only one word abbrs)
    #############################

    # Initialize processor and tokenizer
    token_filter = TextTokenFilter()
    processor = TextProcessor([
        white_space_remover,
        token_filter,
        repeat_non_word_remover,
    ])

    upmc_all_txt = txt_reader(data_path + "/upmc_batch1_4/upmc_no_mark_new.txt")
    # pre-processing
    upmc_all_txt = processor.process_texts(upmc_all_txt, n_jobs=30)
    # Write to file
    txt_writer(upmc_all_txt, upmc_all_processed_path+"/train_no_mark.txt")

    print()
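txt_reader and txt_writer are used throughout these examples but never shown. A minimal line-based implementation consistent with that usage might be the following (an assumption about their behavior, not the project's actual code):

def txt_reader(path, encoding="utf-8"):
    # Read a corpus file as a list of documents, one per line.
    with open(path, "r", encoding=encoding) as f:
        return [line.rstrip("\n") for line in f]


def txt_writer(texts, path, encoding="utf-8"):
    # Write a list of documents back out, one per line.
    with open(path, "w", encoding=encoding) as f:
        for text in texts:
            f.write(text + "\n")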
Example #6
class AbbrDisambiguation:
    def __init__(self,
                 train_processed_path,
                 abbr_inventory_path,
                 use_pretrain=False,
                 use_softmax=False):
        """
        Initialize environment & model.
        """
        # Initialize processor and tokenizer
        self.pre_processor = TextProcessor(
            [white_space_remover_upmc, sub_deid_patterns_upmc])
        self.tokenizer = CoreNLPTokenizer()
        self.post_processor = TextProcessor(
            [AbbrDetector(abbr_inventory_path)])
        self.filter_processor = TextProcessor(
            [TextTokenFilter(), repeat_non_word_remover])
        # Load model
        train_path = train_processed_path + '/fasttext'
        if use_pretrain:
            model_path = train_path + '/model/pre_train'
        else:
            model_path = train_path + '/model'
        if use_softmax:
            model_file = model_path + '/all_softmax.bin'
        else:
            model_file = model_path + '/all.bin'
        self.model = load_model(model_file)

    def process_single_text(self, text, save_json_path=None):
        """
        Process one text.
        """
        #############################
        # Process document
        #############################

        # pre-processing
        text = self.pre_processor.process_single_text(text)
        # tokenizing
        text_tokenized = self.tokenizer.process_single_text(text)
        # detect abbrs
        text_detected = self.post_processor.process_single_text(text_tokenized)
        # Filter trivial tokens and Remove repeat non-words
        text_filtered = self.filter_processor.process_single_text(
            text_detected)

        #############################
        # Build index
        #############################

        result_collector = AbbrInstanceCollectorUPMC([text_detected])
        abbr_index_result, document_no_mark_result = result_collector.generate_inverted_index()
        result_global_idx_mapper = global_instance_idx_mapper(
            abbr_index_result)

        pred_collector = AbbrInstanceCollectorUPMC([text_filtered])
        abbr_index_pred, document_no_mark_pred = pred_collector.generate_inverted_index()
        abbr_instances_pred = instance_generator(abbr_index_pred,
                                                 Doc(document_no_mark_pred))

        #############################
        # Do classification
        #############################

        wsd_results = fasttext_classifier(self.model, abbr_index_pred,
                                          abbr_instances_pred,
                                          result_global_idx_mapper)
        return save_result_to_json(wsd_results, document_no_mark_result,
                                   save_json_path)

    def process_texts(self, text_list, save_json_path=None, n_jobs=8):
        """
        Process list of texts.
        """
        #############################
        # Process document
        #############################

        # pre-processing
        text = self.pre_processor.process_texts(text_list, n_jobs=n_jobs)
        # tokenizing
        text_tokenized = self.tokenizer.process_texts(text, n_jobs=n_jobs)
        # detect abbrs
        text_detected = self.post_processor.process_texts(text_tokenized,
                                                          n_jobs=n_jobs)
        # Filter trivial tokens and Remove repeat non-words
        text_filtered = self.filter_processor.process_texts(text_detected,
                                                            n_jobs=n_jobs)

        #############################
        # Build index
        #############################
        print("Building index...")
        result_collector = AbbrInstanceCollectorUPMC(text_detected)
        abbr_index_result, document_no_mark_result = result_collector.generate_inverted_index()
        result_global_idx_mapper = global_instance_idx_mapper(
            abbr_index_result)

        pred_collector = AbbrInstanceCollectorUPMC(text_filtered)
        abbr_index_pred, document_no_mark_pred = pred_collector.generate_inverted_index()
        abbr_instances_pred = instance_generator(abbr_index_pred,
                                                 Doc(document_no_mark_pred))

        #############################
        # Do classification
        #############################
        print("Predicting...")
        wsd_results = fasttext_classifier(self.model, abbr_index_pred,
                                          abbr_instances_pred,
                                          result_global_idx_mapper)
        return save_result_to_json(wsd_results, document_no_mark_result,
                                   save_json_path)
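A minimal usage sketch for the class above; the paths and the input sentence are placeholders, and the actual directory layout depends on how the fastText models were trained:

if __name__ == "__main__":
    # Placeholder paths for illustration only.
    wsd = AbbrDisambiguation(
        train_processed_path="/path/to/train_processed",
        abbr_inventory_path="/path/to/abbr_inventory.pkl",
        use_pretrain=True,
        use_softmax=False)
    results = wsd.process_single_text(
        "Patient presented with SOB and CP .",
        save_json_path="wsd_result.json")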
Example #7
    if not os.path.exists(PATH_FOLDER_PROCESSED):
        os.makedirs(PATH_FOLDER_PROCESSED)

    PATH_PROCESSED_INVENTORY_PKL = BASE_FOLDER + 'sense_inventory/final_cleaned_sense_inventory.cased.processed.pkl'

    # Get pickle generated from mimic_inventory.py
    inventory = pickle_reader(PATH_PROCESSED_INVENTORY_PKL)
    inventory_rmapper = inventory['longform-abbr_cui']

    ######################################
    # Processing
    ######################################

    # Initialize processor and tokenizer
    processor = TextProcessor([
        white_space_remover,
        sub_deid_patterns_mimic])

    toknizer = CoreNLPTokenizer()

    token_filter = TextTokenFilter()
    filter_processor = TextProcessor([
        token_filter])

    remove_repeat_processor = TextProcessor([repeat_non_word_remover])

    for i in range(42):

        # read file
        filename = 'processed_text_chunk_%s.json' % i
        print("-"*50)
Example #8
    # os.makedirs(dataset_processed_path, exist_ok=True)

    # # fix annotation error
    # with open(dataset_path + "/training_data.txt") as input, open(dataset_path + "/training_data_fixed.txt", "w") as output:
    #     for line in input:
    #         new_line = " ".join([replace(token) for token in line.rstrip("\n").split(" ")])
    #         output.write(new_line + "\n")

    #############################
    # Process DataSet documents (only one word abbrs)
    #############################

    # dataset_txt_annotated = txt_reader(dataset_path + "/training_data_fixed.txt")

    # Initialize processor and tokenizer
    processor = TextProcessor([
        white_space_remover])

    toknizer = CoreNLPTokenizer()

    token_filter = TextTokenFilter()
    filter_processor = TextProcessor([
        token_filter,
        repeat_non_word_remover,
        recover_upper_cui])

    all_processor = TextProcessor([
        white_space_remover,
        token_filter,
        repeat_non_word_remover,
        recover_upper_cui])
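The annotation fix at the top of this example is commented out and relies on a replace(token) helper that is not shown. An uncommented sketch with a placeholder helper might look like this (the fix table is left empty because the actual corrections are not given here):

    ANNOTATION_FIXES = {}  # known-bad token -> corrected token (to be filled in)

    def replace(token):
        # Placeholder for the helper referenced in the commented-out block.
        return ANNOTATION_FIXES.get(token, token)

    with open(dataset_path + "/training_data.txt") as input_file, \
            open(dataset_path + "/training_data_fixed.txt", "w") as output_file:
        for line in input_file:
            new_line = " ".join(replace(token) for token in line.rstrip("\n").split(" "))
            output_file.write(new_line + "\n")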