Example #1
def predict_fasttext_classifier(train_processed_path,
                                test_processed_path,
                                use_pretrain=False,
                                use_softmax=False):
    train_path = train_processed_path + '/fasttext'
    if use_pretrain:
        model_path = train_path + '/model/pre_train'
    else:
        model_path = train_path + '/model'
    # Load abbr index
    train_abbr_idx_mapper = pickle_reader(train_path + '/abbr_idx_mapper.pkl')
    train_abbr2idx = train_abbr_idx_mapper['abbr2idx']
    test_abbr_idx_mapper = pickle_reader(test_processed_path +
                                         '/fasttext/abbr_idx_mapper.pkl')
    test_abbr_index = AbbrIndex(test_processed_path + '/abbr_index_data.pkl')

    # Load model
    if use_softmax:
        model_file = model_path + '/all_softmax.bin'
    else:
        model_file = model_path + '/all.bin'
    model = load_model(model_file)
    label_set = set(map(lambda x: x.lstrip("__label__"), model.get_labels()))

    instance_collection = []
    # Iterate over test instances and collect predictions
    for abbr, test_abbr_idx in tqdm(test_abbr_idx_mapper['abbr2idx'].items()):
        # if abbr not in train_abbr2idx:
        #     for doc_id, pos_list in test_abbr_index[abbr].items():
        #         for global_instance_idx, pos, label in pos_list:
        #             instance_collection.append(InstancePred(index=global_instance_idx, abbr=abbr, sense_pred=None))
        # else:
        eval_abbr_instance_list = txt_reader(test_processed_path +
                                             '/fasttext/dataset/%d.txt' %
                                             test_abbr_idx)
        abbr_instance_idx = 0
        for doc_id, pos_list in test_abbr_index[abbr].items():
            for global_instance_idx, pos, label in pos_list:
                if label not in label_set:
                    instance_collection.append(
                        InstancePred(index=global_instance_idx,
                                     abbr=abbr,
                                     sense_pred=None))
                else:
                    # get instance
                    tokens = eval_abbr_instance_list[abbr_instance_idx].split()
                    label_in_txt = tokens[0].lstrip("__label__")
                    assert label == label_in_txt
                    context = " ".join(tokens[1:])
                    instance_collection.append(
                        InstancePred(index=global_instance_idx,
                                     abbr=abbr,
                                     sense_pred=model.predict(context)[0]
                                     [0].lstrip("__label__")))
                abbr_instance_idx += 1
    # sort collection list based on global instance idx
    instance_collection = sorted(instance_collection, key=lambda x: x.index)
    return instance_collection
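
A minimal usage sketch (the directory paths below are hypothetical; only the signature and the InstancePred fields come from the example above):

# Hypothetical preprocessed train/test directories.
train_dir = "/data/wsd/train/processed"
test_dir = "/data/wsd/test/processed"
preds = predict_fasttext_classifier(train_dir, test_dir,
                                    use_pretrain=False, use_softmax=True)
# sense_pred is None whenever the gold label was unseen at training time.
print(preds[0].index, preds[0].abbr, preds[0].sense_pred)
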
Example #2
def generate_whole_dataset(processed_path, shuffle=False):
    abbr_idx_mapper = pickle_reader(processed_path +
                                    '/fasttext/abbr_idx_mapper.pkl')
    with open(processed_path + '/fasttext/dataset/all.txt', 'w') as f:
        total_dataset = []
        for abbr, abbr_idx in tqdm(abbr_idx_mapper['abbr2idx'].items()):
            total_dataset.extend(
                txt_reader(processed_path +
                           '/fasttext/dataset/%d.txt' % abbr_idx))
        if shuffle:
            random.shuffle(total_dataset)
        f.write("\n".join(total_dataset))
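
The merged all.txt is in fastText supervised format (a __label__-prefixed sense followed by the context tokens), so a model like the all.bin loaded in Example #1 could be trained roughly as follows (a sketch; the hyperparameters are placeholders, not values taken from this project):

import fasttext

# Illustrative training call on the merged dataset; tune epoch/lr/wordNgrams as needed.
model = fasttext.train_supervised(
    input=processed_path + '/fasttext/dataset/all.txt',
    epoch=25,
    lr=0.1,
    wordNgrams=2)
model.save_model(processed_path + '/fasttext/model/all.bin')
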
Example #3
File: umn.py  Project: Astroneko404/wsd
def load_umn(umn_file_path, remove_umn_senses=True):
    instance_list = []
    umn_txt = []
    # The UMN file is "|"-delimited and latin-1 encoded
    umn_file_original = txt_reader(umn_file_path, encoding="latin-1")

    for line in umn_file_original:
        items = line.split("|")
        # fields used: items[0]=abbr, items[1]=sense, items[3]=start, items[6]=text
        abbr, sense, start = items[0], items[1], items[3]
        # optionally skip instances whose sense is filtered out by is_umn_senses()
        if remove_umn_senses and is_umn_senses(sense):
            continue
        instance_list.append((abbr, sense, start))
        umn_txt.append(items[6])
    return instance_list, umn_txt
Example #4
File: msh.py  Project: Astroneko404/wsd
def sense_inventory_msh(benchmark_mesh_file_path, abbr_list):
    inventory_file = txt_reader(benchmark_mesh_file_path)

    sense_inventory = {}
    sense_inventory_one_word = {}
    for line in inventory_file:
        items = line.split("\t")
        abbr = items[0]
        cuis = items[1:]
        if abbr in abbr_list:
            sense_inventory[abbr] = cuis
            if " " not in abbr:
                sense_inventory_one_word[abbr] = cuis
    return sense_inventory_one_word, sense_inventory
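
A hedged call sketch (the file path and abbreviation list are placeholders; as the parsing above shows, the benchmark file is tab-delimited with the abbreviation first and its candidate CUIs after):

# Hypothetical inputs; abbr_list would normally come from the benchmark itself.
one_word_inventory, full_inventory = sense_inventory_msh(
    "/data/msh/benchmark_mesh.txt", abbr_list={"AB", "BSA"})
print(full_inventory.get("AB"))  # candidate CUIs for "AB", or None if absent
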
Example #5
def process_annotated_data(txt_preprocessed_path, upmc_processed_path, train_ratio=0.8, n_jobs=30):
    os.makedirs(upmc_processed_path, exist_ok=True)
    upmc_txt_annotated = txt_reader(txt_preprocessed_path)
    # pre-processing
    upmc_txt = all_processor.process_texts(upmc_txt_annotated, n_jobs=n_jobs)
    # train/test split (train_ratio fraction goes to training)
    random.shuffle(upmc_txt)
    num_instances = len(upmc_txt)
    train_idx = set(random.sample(range(num_instances), int(train_ratio*num_instances)))
    upmc_train_txt = []
    upmc_test_txt = []
    for idx, txt in enumerate(tqdm.tqdm(upmc_txt)):
        if idx in train_idx:
            upmc_train_txt.append(txt)
        else:
            upmc_test_txt.append(txt)
    # Write to file
    txt_writer(upmc_train_txt, upmc_processed_path+"/upmc_train.txt")
    txt_writer(upmc_test_txt, upmc_processed_path+"/upmc_test.txt")
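
A usage sketch (paths are placeholders; all_processor, txt_reader and txt_writer are the project helpers assumed above):

# Hypothetical paths; the annotated input is expected to be one document per line.
process_annotated_data("/data/upmc/upmc_annotated.txt",
                       "/data/upmc/processed",
                       train_ratio=0.8, n_jobs=8)
# Writes upmc_train.txt and upmc_test.txt under /data/upmc/processed.
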
Example #6
def generate_test_content(test_processed_path, train_processed_path):
    # Load word2vec vectors
    model = gensim.models.Word2Vec.load(train_processed_path + '/train.model')

    # Load abbr index
    abbr_index = AbbrIndex(test_processed_path + '/abbr_index_data.pkl')
    test_docs = Doc(txt_reader(test_processed_path + "/test_no_mark.txt"))

    # Build index for abbrs (for saving pickle files)
    abbr_idx_mapper = build_index_of_abbrs(abbr_index)
    pickle_writer(abbr_idx_mapper, test_processed_path + '/abbr_idx_mapper.pkl')

    # Save all content vectors to pickle files
    content_dir = test_processed_path + '/content_vectors/'
    os.makedirs(content_dir, exist_ok=True)

    print("Saving content vectors...")
    print(len(abbr_index))

    for abbr in tqdm.tqdm(abbr_index):
        abbr_job(abbr, abbr_index, abbr_idx_mapper, test_docs, model, content_dir)
Example #7
def generate_test_data(test_processed_path, window_size=5):
    # Load abbr index
    abbr_index = AbbrIndex(test_processed_path + '/abbr_index_data.pkl')
    test_docs = Doc(txt_reader(test_processed_path + "/test_no_mark.txt"))

    data_processed_path = test_processed_path + '/fasttext'
    os.makedirs(data_processed_path, exist_ok=True)

    # Build index for abbrs (for saving pickle files)
    abbr_idx_mapper = build_index_of_abbrs(abbr_index)
    pickle_writer(abbr_idx_mapper,
                  data_processed_path + '/abbr_idx_mapper.pkl')

    content_dir = data_processed_path + '/dataset/'
    os.makedirs(content_dir, exist_ok=True)

    print("Building dataset for fastText...")
    print(len(abbr_index))

    for abbr in tqdm(abbr_index):
        abbr_job(abbr, abbr_index, abbr_idx_mapper, test_docs, content_dir,
                 window_size)
Example #8
def __init__(self, dataset_file_path):
    self.corpus = txt_reader(dataset_file_path)
Example #9
def predict_fasttext_classifier_multi_model(train_processed_path,
                                            test_processed_path,
                                            use_pretrain=False):
    train_path = train_processed_path + '/fasttext'
    if use_pretrain:
        model_path = train_path + '/model/pre_train'
    else:
        model_path = train_path + '/model'
    # Load abbr index
    test_abbr_idx_mapper = pickle_reader(test_processed_path +
                                         '/fasttext/abbr_idx_mapper.pkl')
    test_abbr_index = AbbrIndex(test_processed_path + '/abbr_index_data.pkl')
    train_abbr_idx_mapper = pickle_reader(train_processed_path +
                                          '/fasttext/abbr_idx_mapper.pkl')
    train_abbr2idx = train_abbr_idx_mapper['abbr2idx']
    train_abbr_label_set = pickle_reader(train_processed_path +
                                         '/fasttext/abbr_label_set.pkl')

    instance_collection = []
    # Iterate over test instances and collect predictions
    for abbr, test_abbr_idx in tqdm(test_abbr_idx_mapper['abbr2idx'].items()):
        if abbr not in train_abbr_label_set:
            for doc_id, pos_list in test_abbr_index[abbr].items():
                for global_instance_idx, pos, label in pos_list:
                    instance_collection.append(
                        InstancePred(index=global_instance_idx,
                                     abbr=abbr,
                                     sense_pred=None))
        else:
            train_label_set = train_abbr_label_set[abbr]
            eval_abbr_instance_list = txt_reader(test_processed_path +
                                                 '/fasttext/dataset/%d.txt' %
                                                 test_abbr_idx)

            abbr_instance_idx = 0
            context_list, global_idx_list = [], []
            for doc_id, pos_list in test_abbr_index[abbr].items():
                for global_instance_idx, pos, label in pos_list:
                    # skip if the true label never appears in training (no prediction possible)
                    if label not in train_label_set:
                        instance_collection.append(
                            InstancePred(index=global_instance_idx,
                                         abbr=abbr,
                                         sense_pred=None))
                    # if the abbreviation has only one CUI in training, predict it directly
                    elif len(train_label_set) == 1:
                        instance_collection.append(
                            InstancePred(index=global_instance_idx,
                                         abbr=abbr,
                                         sense_pred=label))
                    # otherwise defer to the abbreviation-specific fastText model
                    else:
                        # get instance
                        tokens = eval_abbr_instance_list[
                            abbr_instance_idx].split()
                        label_in_txt = tokens[0].lstrip("__label__")
                        assert label == label_in_txt
                        context = " ".join(tokens[1:])
                        context_list.append(context)
                        global_idx_list.append(global_instance_idx)
                    abbr_instance_idx += 1
            # predict
            if len(context_list) > 0:
                # Load model
                model_file = model_path + '/%d.bin' % train_abbr2idx[abbr]
                model = load_model(model_file)
                predict_list = model.predict(context_list)[0]
                for idx, predict in zip(global_idx_list, predict_list):
                    instance_collection.append(
                        InstancePred(
                            index=idx,
                            abbr=abbr,
                            sense_pred=predict[0].lstrip("__label__")))

    # sort collection list based on global instance idx
    instance_collection = sorted(instance_collection, key=lambda x: x.index)
    return instance_collection
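
Like Example #1, the result is a list of InstancePred tuples sorted by global index; a hedged call sketch (paths are placeholders):

# Hypothetical preprocessed directories; one fastText model per abbreviation
# is loaded from <model_path>/<abbr_idx>.bin only when a prediction is needed.
preds = predict_fasttext_classifier_multi_model("/data/wsd/train/processed",
                                                "/data/wsd/test/processed",
                                                use_pretrain=True)
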
Example #10
    ######################################
    # Read texts from dataset
    ######################################

    # File paths
    data_path = "/home/luoz3/wsd_data"
    upmc_all_path = data_path + "/upmc/batch1_4"
    upmc_all_processed_path = upmc_all_path + "/processed"
    os.makedirs(upmc_all_processed_path, exist_ok=True)

    #############################
    # Process dataset documents (only one-word abbrs)
    #############################

    # Initialize processor and tokenizer
    token_filter = TextTokenFilter()
    processor = TextProcessor([
        white_space_remover,
        token_filter,
        repeat_non_word_remover,
    ])

    upmc_all_txt = txt_reader(data_path + "/upmc_batch1_4/upmc_no_mark_new.txt")
    # pre-processing
    upmc_all_txt = processor.process_texts(upmc_all_txt, n_jobs=30)
    # Write to file
    txt_writer(upmc_all_txt, upmc_all_processed_path+"/train_no_mark.txt")

    print()