Example #1
File: share.py  Project: Astroneko404/wsd
    all_sense_inventory = merge_inventories(train_sense_inventory, test_sense_inventory)
    all_sense_inventory_invalid = merge_inventories(train_sense_inventory_invalid, test_sense_inventory_invalid)

    # save sense inventory to json
    json_writer(train_sense_inventory, share_processed_path + "/train_sense_inventory.json")
    json_writer(test_sense_inventory, share_processed_path + "/test_sense_inventory.json")
    json_writer(all_sense_inventory, share_processed_path + "/all_sense_inventory.json")
    json_writer(all_sense_inventory_invalid, share_processed_path + "/all_sense_inventory_invalid.json")

    # Initialize processor and tokenizer
    processor = TextProcessor([
        white_space_remover,
        sub_deid_patterns_mimic])

    toknizer = CoreNLPTokenizer()

    token_filter = TextTokenFilter()
    filter_processor = TextProcessor([
        token_filter,
        repeat_non_word_remover,
        recover_upper_cui])

    # pre-processing
    share_txt = processor.process_texts(share_txt_all_annotated, n_jobs=30)
    # tokenizing
    share_txt_tokenized = toknizer.process_texts(share_txt, n_jobs=30)
    # Filter trivial tokens and Remove repeat non-words
    share_txt_filtered = filter_processor.process_texts(share_txt_tokenized, n_jobs=30)
    # Write to file
    txt_writer(share_txt_filtered, share_processed_path+"/share_all_processed.txt")
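
Note: Example #1 (and the snippets below) relies on project helpers such as TextProcessor, CoreNLPTokenizer, json_writer and txt_writer whose definitions are not shown on this page. As a rough, hypothetical illustration of the interface the snippet assumes, the sketch below shows a processor that takes a list of str -> str functions and applies them to a batch of texts, optionally in parallel; it is a stand-in, not the implementation from Astroneko404/wsd.

# Hypothetical stand-in for the TextProcessor interface used above;
# the real class in Astroneko404/wsd may differ.
from multiprocessing import Pool
from typing import Callable, List

class SimpleTextProcessor:
    """Apply a list of str -> str functions to each text, in order."""

    def __init__(self, steps: List[Callable[[str], str]]):
        self.steps = steps

    def process_single_text(self, text: str) -> str:
        for step in self.steps:
            text = step(text)
        return text

    def process_texts(self, texts: List[str], n_jobs: int = 1) -> List[str]:
        # Fall back to a simple loop when parallelism is not requested.
        if n_jobs <= 1:
            return [self.process_single_text(t) for t in texts]
        # Parallel path requires the steps to be picklable
        # (e.g. module-level functions rather than lambdas).
        with Pool(n_jobs) as pool:
            return pool.map(self.process_single_text, texts)

# Example usage: collapse runs of whitespace, then lowercase.
if __name__ == "__main__":
    demo = SimpleTextProcessor([lambda t: " ".join(t.split()), str.lower])
    print(demo.process_texts(["Hello   WORLD", "A  B\tC"]))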
Example #2
File: umn.py  Project: Astroneko404/wsd
                UMN_sense_cui_inventory[abbr][long_form] = None
    json_writer(UMN_sense_cui_inventory,
                umn_processed_path + "/UMN_sense_cui_inventory.json")

    #############################
    # Process UMN documents
    #############################

    umn_txt_marked = add_abbr_marker_umn(umn_txt)

    # Initialize processor and tokenizer
    processor = TextProcessor([white_space_remover, sub_deid_patterns_umn])

    toknizer = CoreNLPTokenizer()
    token_filter = TextTokenFilter()
    filter_processor = TextProcessor(
        [token_filter, repeat_non_word_remover, recover_upper_cui])

    # pre-processing
    umn_txt = processor.process_texts(umn_txt_marked, n_jobs=30)
    # tokenizing
    umn_txt_tokenized = toknizer.process_texts(umn_txt, n_jobs=30)
    # add real annotations
    umn_txt_annotated = add_annotation_umn(UMN_sense_cui_inventory,
                                           umn_txt_tokenized)
    # Filter trivial tokens and Remove repeat non-words
    umn_txt_filtered = filter_processor.process_texts(umn_txt_annotated,
                                                      n_jobs=30)
    # Write to file
    txt_writer(umn_txt_filtered, umn_processed_path + "/umn_processed.txt")
Example #3
File: msh.py  Project: Astroneko404/wsd
    # Read original sense inventory (only one word abbrs)
    MSH_sense_inventory_one_word, MSH_sense_inventory = sense_inventory_msh(msh_path+"/benchmark_mesh.txt", abbr_list)

    # save sense inventory to json
    json_writer(MSH_sense_inventory_one_word, msh_processed_path + "/MSH_sense_inventory_one_word.json")
    json_writer(MSH_sense_inventory, msh_processed_path + "/MSH_sense_inventory.json")

    #############################
    # Process MSH documents (only one word abbrs)
    #############################
    msh_txt_annotated = add_annotation_msh(MSH_sense_inventory_one_word, msh_path)

    # Initialize processor and tokenizer
    processor = TextProcessor([
        white_space_remover])
    toknizer = CoreNLPTokenizer()
    token_filter = TextTokenFilter()
    filter_processor = TextProcessor([
        token_filter,
        repeat_non_word_remover,
        recover_upper_cui])

    # pre-processing
    msh_txt = processor.process_texts(msh_txt_annotated, n_jobs=10)
    # tokenizing
    msh_txt_tokenized = toknizer.process_texts(msh_txt, n_jobs=10)
    # Filter trivial tokens and Remove repeat non-words
    msh_txt_filtered = filter_processor.process_texts(msh_txt_tokenized, n_jobs=10)
    # Write to file
    txt_writer(msh_txt_filtered, msh_processed_path+"/msh_processed.txt")
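
The json_writer and txt_writer calls in Examples #1–#3 suggest two small I/O helpers: one that dumps a sense inventory (a JSON-serializable dict) to disk, and one that writes the processed documents to a text file, one document per line. A minimal sketch under that assumption follows; it is a hypothetical reconstruction, not the project's own code.

# Hypothetical helpers matching the json_writer / txt_writer call sites above.
import json
from typing import Iterable

def json_writer(obj, path: str) -> None:
    """Dump a JSON-serializable object, e.g. a sense inventory, to disk."""
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def txt_writer(texts: Iterable[str], path: str) -> None:
    """Write processed documents to a plain-text file, one document per line."""
    with open(path, "w", encoding="utf-8") as f:
        for text in texts:
            f.write(text.rstrip("\n") + "\n")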
Example #4
class AbbrDisambiguation:
    def __init__(self,
                 train_processed_path,
                 abbr_inventory_path,
                 use_pretrain=False,
                 use_softmax=False):
        """
        Initialize environment & model.
        """
        # Initialize processor and tokenizer
        self.pre_processor = TextProcessor(
            [white_space_remover_upmc, sub_deid_patterns_upmc])
        self.tokenizer = CoreNLPTokenizer()
        self.post_processor = TextProcessor(
            [AbbrDetector(abbr_inventory_path)])
        self.filter_processor = TextProcessor(
            [TextTokenFilter(), repeat_non_word_remover])
        # Load model
        train_path = train_processed_path + '/fasttext'
        if use_pretrain:
            model_path = train_path + '/model/pre_train'
        else:
            model_path = train_path + '/model'
        if use_softmax:
            model_file = model_path + '/all_softmax.bin'
        else:
            model_file = model_path + '/all.bin'
        self.model = load_model(model_file)

    def process_single_text(self, text, save_json_path=None):
        """
        Process one text.
        """
        #############################
        # Process document
        #############################

        # pre-processing
        text = self.pre_processor.process_single_text(text)
        # tokenizing
        text_tokenized = self.tokenizer.process_single_text(text)
        # detect abbrs
        text_detected = self.post_processor.process_single_text(text_tokenized)
        # Filter trivial tokens and Remove repeat non-words
        text_filtered = self.filter_processor.process_single_text(
            text_detected)

        #############################
        # Build index
        #############################

        result_collector = AbbrInstanceCollectorUPMC([text_detected])
        abbr_index_result, document_no_mark_result = (
            result_collector.generate_inverted_index())
        result_global_idx_mapper = global_instance_idx_mapper(
            abbr_index_result)

        pred_collector = AbbrInstanceCollectorUPMC([text_filtered])
        abbr_index_pred, document_no_mark_pred = (
            pred_collector.generate_inverted_index())
        abbr_instances_pred = instance_generator(abbr_index_pred,
                                                 Doc(document_no_mark_pred))

        #############################
        # Do classification
        #############################

        wsd_results = fasttext_classifier(self.model, abbr_index_pred,
                                          abbr_instances_pred,
                                          result_global_idx_mapper)
        return save_result_to_json(wsd_results, document_no_mark_result,
                                   save_json_path)

    def process_texts(self, text_list, save_json_path=None, n_jobs=8):
        """
        Process list of texts.
        """
        #############################
        # Process document
        #############################

        # pre-processing
        text = self.pre_processor.process_texts(text_list, n_jobs=n_jobs)
        # tokenizing
        text_tokenized = self.tokenizer.process_texts(text, n_jobs=n_jobs)
        # detect abbrs
        text_detected = self.post_processor.process_texts(text_tokenized,
                                                          n_jobs=n_jobs)
        # Filter trivial tokens and Remove repeat non-words
        text_filtered = self.filter_processor.process_texts(text_detected,
                                                            n_jobs=n_jobs)

        #############################
        # Build index
        #############################
        print("Building index...")
        result_collector = AbbrInstanceCollectorUPMC(text_detected)
        abbr_index_result, document_no_mark_result = (
            result_collector.generate_inverted_index())
        result_global_idx_mapper = global_instance_idx_mapper(
            abbr_index_result)

        pred_collector = AbbrInstanceCollectorUPMC(text_filtered)
        abbr_index_pred, document_no_mark_pred = (
            pred_collector.generate_inverted_index())
        abbr_instances_pred = instance_generator(abbr_index_pred,
                                                 Doc(document_no_mark_pred))

        #############################
        # Do classification
        #############################
        print("Predicting...")
        wsd_results = fasttext_classifier(self.model, abbr_index_pred,
                                          abbr_instances_pred,
                                          result_global_idx_mapper)
        return save_result_to_json(wsd_results, document_no_mark_result,
                                   save_json_path)
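
Based only on the constructor and methods defined in Example #4, a call site might look like the sketch below. The paths and the sample note text are placeholders for illustration, not files or data from the project.

# Hypothetical usage of the AbbrDisambiguation class above.
# All paths are placeholders; point them at your own processed data.
disambiguator = AbbrDisambiguation(
    train_processed_path="/data/upmc/processed",
    abbr_inventory_path="/data/upmc/abbr_inventory.pkl",
    use_pretrain=True,
    use_softmax=False)

# Disambiguate a single clinical note.
single_result = disambiguator.process_single_text(
    "Pt c/o SOB and CP, hx of CHF.",
    save_json_path="/tmp/single_note_wsd.json")

# Disambiguate a batch of notes in parallel.
batch_results = disambiguator.process_texts(
    ["Pt c/o SOB and CP, hx of CHF.", "CT abd/pelvis w/o contrast."],
    save_json_path="/tmp/batch_wsd.json",
    n_jobs=8)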
Example #5
        # read file
        filename = 'processed_text_chunk_%s.json' % i
        print("-"*50)
        print("Start File for %s" % filename)
        mimic_txt = []
        mimic_present_senses = []

        if not os.path.exists(PATH_FOLDER+filename):
            continue

        for line in open(PATH_FOLDER+filename, "r"):
            obj = json.loads(line)
            text = obj['TEXT']
            present_senses = obj['present_senses']
            mimic_txt.append(text)
            mimic_present_senses.append(present_senses)

        # pre-processing
        mimic_txt = processor.process_texts(mimic_txt, n_jobs=30)
        # tokenizing
        mimic_txt_tokenized = toknizer.process_texts(mimic_txt, n_jobs=40)
        # Filter trivial tokens
        mimic_txt_filtered = filter_processor.process_texts(mimic_txt_tokenized, n_jobs=40)
        # Replace long forms with their abbreviations
        mimic_txt_processed = longform_replacer(mimic_txt_filtered, mimic_present_senses, inventory_rmapper, n_jobs=16)
        # Remove repeat non-words
        mimic_txt_processed = remove_repeat_processor.process_texts(mimic_txt_processed, n_jobs=40)
        # Save to file
        txt_writer(mimic_txt_processed, PATH_FOLDER_PROCESSED+'%s.txt' % filename[:-5])
Example #6
    # save sense inventory to json
    json_writer(sense_inventory,
                dataset_processed_path + "/dataset_sense_inventory.json")

    #############################
    # Process DataSet documents (only one word abbrs)
    #############################

    dataset_txt_annotated = add_annotation_dataset(sense_inventory,
                                                   dataset_path)

    # Initialize processor and tokenizer
    processor = TextProcessor([white_space_remover, sub_deid_patterns_dataset])

    toknizer = CoreNLPTokenizer()

    token_filter = TextTokenFilter()
    filter_processor = TextProcessor(
        [token_filter, repeat_non_word_remover, recover_upper_cui])

    # pre-processing
    dataset_txt = processor.process_texts(dataset_txt_annotated, n_jobs=30)
    # tokenizing
    dataset_txt_tokenized = toknizer.process_texts(dataset_txt, n_jobs=30)
    # Filter trivial tokens and Remove repeat non-words
    dataset_txt_filtered = filter_processor.process_texts(
        dataset_txt_tokenized, n_jobs=30)
    # Write to file
    txt_writer(dataset_txt_filtered,
               dataset_processed_path + "/dataset_processed.txt")