Example #1
def write_discharge_summaries(MIMIC_3_DIR, output_dir):
    notes_file = '%s/NOTEEVENTS.csv' % (MIMIC_3_DIR)

    makedir_and_clear(output_dir)

    logging.info("processing notes file")

    count_note = 0

    with open(notes_file, 'r') as csvfile:

        notereader = csv.reader(csvfile)
        #header
        next(notereader)

        for line in notereader: # each line is saved as a file
            subj = int(line[1])
            category = line[6]
            if category == "Discharge summary":
                file_name = line[1]+'_'+line[2]+".txt"
                file_path = os.path.join(output_dir, file_name)
                with open(file_path, 'w') as outfile:
                    logging.info("writing to %s" % (file_path))
                    outfile.write(line[10])
                count_note += 1

    logging.info("totally write {} notes".format(count_note))
    return
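
Every example on this page calls makedir_and_clear(...) before writing its output. The helper itself is not shown; a minimal sketch of what such a helper typically does (delete the directory if it exists, then recreate it empty) might look like the following; this is an assumption, not the project's actual implementation:

import os
import shutil

def makedir_and_clear(dir_path):
    # Remove the directory tree if it already exists, then recreate it empty,
    # so callers always start from a clean output directory.
    if os.path.exists(dir_path):
        shutil.rmtree(dir_path)
    os.makedirs(dir_path)
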
Example #2
def apply_metamap_to(input_dir, output_dir):
    makedir_and_clear(output_dir)

    for input_file_name in listdir(input_dir):
        input_file_path = join(input_dir, input_file_name)
        if input_file_name.rfind('.') == -1:
            output_file_name = input_file_name + ".field.txt"
        else:
            output_file_name = input_file_name[0:input_file_name.rfind('.')] + ".field.txt"
        output_file_path = join(output_dir, output_file_name)
        os.system(
            '/Users/feili/tools/metamap/public_mm/bin/metamap -y -I -N --blanklines 0 -R SNOMEDCT_US,MDR -J acab,anab,comd,cgab,dsyn,emod,fndg,inpo,mobd,neop,patf,sosy {} {}'
            .format(input_file_path, output_file_path))
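
The MetaMap invocation above assembles a shell command with str.format and runs it through os.system. A hedged alternative sketch using subprocess.run, which passes the arguments as a list and avoids shell-quoting issues; the binary path and flags are copied from the example and are assumptions about the local MetaMap installation:

import subprocess

def run_metamap(input_file_path, output_file_path):
    # Same MetaMap options as in the example, passed as an argument list.
    subprocess.run([
        '/Users/feili/tools/metamap/public_mm/bin/metamap',
        '-y', '-I', '-N', '--blanklines', '0',
        '-R', 'SNOMEDCT_US,MDR',
        '-J', 'acab,anab,comd,cgab,dsyn,emod,fndg,inpo,mobd,neop,patf,sosy',
        input_file_path, output_file_path,
    ], check=True)
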
Example #3
def prepare_instance(opt):

    tokenizer = BertTokenizer.from_pretrained(opt.bert_dir,
                                              do_lower_case=opt.do_lower_case)
    vocab_list = list(tokenizer.vocab.keys())
    with DocumentDatabase(reduce_memory=False) as docs:
        with open(opt.merged_file, 'r') as f:
            doc = []
            for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
                line = line.strip()
                if line == "":
                    docs.add_document(doc)
                    doc = []
                else:
                    sentence = json.loads(line)
                    # tokens = tokenizer.tokenize(sentence['text'])
                    # doc.append(tokens)
                    doc.append(sentence)

        makedir_and_clear(opt.instance_dir)
        for epoch in trange(opt.iter, desc="Epoch"):
            epoch_filename = os.path.join(opt.instance_dir,
                                          f"epoch_{epoch}.json")
            num_instances = 0
            with open(epoch_filename, 'w') as epoch_file:
                for doc_idx in trange(len(docs), desc="Document"):
                    doc_instances = create_instances_from_document(
                        docs,
                        doc_idx,
                        max_seq_length=opt.max_seq_length,
                        short_seq_prob=opt.short_seq_prob,
                        masked_lm_prob=opt.masked_lm_prob,
                        max_predictions_per_seq=opt.max_predictions_per_seq,
                        vocab_list=vocab_list,
                        tokenizer=tokenizer)
                    doc_instances = [
                        json.dumps(instance) for instance in doc_instances
                    ]
                    for instance in doc_instances:
                        epoch_file.write(instance + '\n')
                        num_instances += 1
            metrics_file = os.path.join(opt.instance_dir,
                                        f"epoch_{epoch}_metrics.json")
            with open(metrics_file, 'w') as mf:
                metrics = {
                    "num_training_examples": num_instances,
                    "max_seq_len": opt.max_seq_length
                }
                mf.write(json.dumps(metrics))
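
prepare_instance writes each epoch's pre-generated instances as one JSON object per line (epoch_{N}.json) plus an epoch_{N}_metrics.json recording the example count, which is the layout that pretrain() in Example #9 reads back. A minimal reading sketch, assuming that layout and a hypothetical instance directory:

import json
import os

instance_dir = "instances"  # hypothetical; stands in for opt.instance_dir
with open(os.path.join(instance_dir, "epoch_0.json")) as f:
    instances = [json.loads(line) for line in f]
with open(os.path.join(instance_dir, "epoch_0_metrics.json")) as f:
    metrics = json.load(f)
print(metrics["num_training_examples"], len(instances))
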
Example #4
def apply_metamap_to(input_dir, output_dir, metamap_dir, metamap_process):
    makedir_and_clear(output_dir)

    logging.info('reading text files ...')
    input_file_names = []
    for input_file_name in os.listdir(input_dir):
        input_file_names.append(input_file_name)

    total_file_num = len(input_file_names)
    logging.info('{} files in total'.format(total_file_num))

    file_index = []
    file_num_per_process = total_file_num // metamap_process
    for group_idx in range(metamap_process):
        begin = group_idx * file_num_per_process
        end = (group_idx + 1) * file_num_per_process
        idx = begin
        tmp = []
        while idx < end:
            tmp.append(input_file_names[idx])
            idx += 1
        file_index.append(tmp)

    if end < total_file_num:
        file_index[-1].extend(input_file_names[end:])

    processes = []
    for group in file_index:
        p = multiprocessing.Process(target=worker,
                                    args=(group, input_dir, output_dir,
                                          metamap_dir))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()
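
apply_metamap_to splits the input files into metamap_process groups and hands each group to a separate process running worker(), which is not shown here. A hypothetical sketch of such a worker, reusing the output naming convention from Example #2 (the MetaMap command line is an assumption about the local installation):

import os
from os.path import join

def worker(file_names, input_dir, output_dir, metamap_dir):
    # Run MetaMap on the group of files assigned to this process.
    for name in file_names:
        stem = name if name.rfind('.') == -1 else name[:name.rfind('.')]
        input_file_path = join(input_dir, name)
        output_file_path = join(output_dir, stem + ".field.txt")
        os.system('{}/bin/metamap -y -I -N {} {}'.format(
            metamap_dir, input_file_path, output_file_path))
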
Example #5
    logging.info(opt)

    if opt.random_seed != 0:
        random.seed(opt.random_seed)
        np.random.seed(opt.random_seed)
        torch.manual_seed(opt.random_seed)
        torch.cuda.manual_seed_all(opt.random_seed)

    d = data.Data(opt)



    logging.info(d.config)

    makedir_and_clear(opt.output)

    logging.info("load data ...")
    train_data = data.loadData(opt.train_file, True, opt.types, opt.type_filter)
    dev_data = data.loadData(opt.dev_file, True, opt.types, opt.type_filter)
    if opt.test_file:
        test_data = data.loadData(opt.test_file, False, opt.types, opt.type_filter)
    else:
        test_data = None

    logging.info("load dict ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(d.config['norm_dict'])
    logging.info("dict concept number {}".format(len(UMLS_dict)))

    train(train_data, dev_data, test_data, d, UMLS_dict, UMLS_dict_reverse, opt, None, False)
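
umls.load_umls_MRCONSO is used here (and in Examples #9 through #11) to build a concept dictionary and its reverse mapping from the UMLS MRCONSO file. A hedged sketch of what such a loader might do, assuming the standard pipe-delimited MRCONSO.RRF layout (CUI in column 0, concept string in column 14); this is not the project's actual implementation:

def load_mrconso(path):
    # Map CUI -> set of concept names, and lowercased name -> set of CUIs.
    cui_to_names, name_to_cuis = {}, {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            fields = line.rstrip('\n').split('|')
            cui, name = fields[0], fields[14]
            cui_to_names.setdefault(cui, set()).add(name)
            name_to_cuis.setdefault(name.lower(), set()).add(cui)
    return cui_to_names, name_to_cuis
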
Example #6
    logger = logging.getLogger()
    if opt.verbose:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    logging.info(opt)

    if opt.random_seed != 0:
        random.seed(opt.random_seed)
        np.random.seed(opt.random_seed)
        torch.manual_seed(opt.random_seed)
        torch.cuda.manual_seed_all(opt.random_seed)

    if opt.whattodo == 'train':
        makedir_and_clear(opt.save)

        logging.info('loading training data...')
        train_datasets = []
        for train_jsonl_file in opt.train:
            train_dataset = load_data(train_jsonl_file, 'train')
            train_datasets.append(train_dataset)

        logging.info('loading dev indomain data...')
        dev_indomain_datasets = []
        for dev_jsonl_file in opt.dev_indomain:
            dev_dataset = load_data(dev_jsonl_file, 'dev')
            dev_indomain_datasets.append(dev_dataset)

        logging.info('loading dev outdomain data...')
        dev_outdomain_datasets = []
Example #7
def test(data, opt):

    corpus_dir = opt.test_file

    if opt.nlp_tool == "nltk":
        nlp_tool = nltk.data.load('tokenizers/punkt/english.pickle')
    else:
        raise RuntimeError("invalid nlp tool")

    corpus_files = [f for f in os.listdir(corpus_dir) if f.find('.xml') != -1]

    model = SeqModel(data, opt)
    if opt.test_in_cpu:
        model.load_state_dict(
            torch.load(os.path.join(opt.output, 'model.pkl'), map_location='cpu'))
    else:
        model.load_state_dict(torch.load(os.path.join(opt.output, 'model.pkl')))

    meddra_dict = load_meddra_dict(data)

    # initialize norm models
    if opt.norm_rule and opt.norm_vsm and opt.norm_neural: # ensemble
        logging.info("use ensemble normer")
        multi_sieve.init(opt, None, data, meddra_dict, None, True)
        if opt.ensemble == 'learn':
            if opt.test_in_cpu:
                ensemble_model = torch.load(os.path.join(opt.output, 'ensemble.pkl'), map_location='cpu')
            else:
                ensemble_model = torch.load(os.path.join(opt.output, 'ensemble.pkl'))
            ensemble_model.eval()
        else:
            if opt.test_in_cpu:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'), map_location='cpu')
                neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'), map_location='cpu')
            else:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
                neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))

            vsm_model.eval()
            neural_model.eval()

    elif opt.norm_rule:
        logging.info("use rule-based normer")
        multi_sieve.init(opt, None, data, meddra_dict)

    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        if opt.test_in_cpu:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'), map_location='cpu')
        else:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
        vsm_model.eval()

    elif opt.norm_neural:
        logging.info("use neural-based normer")
        if opt.test_in_cpu:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'), map_location='cpu')
        else:
            neural_model = torch.load(os.path.join(opt.output, 'norm_neural.pkl'))
        neural_model.eval()
    else:
        logging.info("no normalization is performed.")


    makedir_and_clear(opt.predict)

    ct_success = 0
    ct_error = 0

    for fileName in corpus_files:
        try:
            start = time.time()
            document, annotation_file = processOneFile_fda(fileName, corpus_dir, nlp_tool, False, opt.types, opt.type_filter, True, False)
            pred_entities = []

            for section in document:

                data.test_texts = []
                data.test_Ids = []
                read_instance_from_one_document(section, data.word_alphabet, data.char_alphabet, data.label_alphabet,
                                                data.test_texts, data.test_Ids, data)

                _, _, _, _, _, pred_results, _ = evaluate(data, opt, model, 'test', False, opt.nbest)

                entities = translateResultsintoEntities(section.sentences, pred_results)

                # remove the entity in the ignore_region and fill section_id
                section_id = section.name[section.name.rfind('_')+1: ]
                entities = remove_entity_in_the_ignore_region(annotation_file.ignore_regions, entities, section_id)


                if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
                    if opt.ensemble == 'learn':
                        ensemble_model.process_one_doc(section, entities, meddra_dict, None, True)
                    else:
                        pred_entities1 = copy.deepcopy(entities)
                        pred_entities2 = copy.deepcopy(entities)
                        pred_entities3 = copy.deepcopy(entities)
                        multi_sieve.runMultiPassSieve(section, pred_entities1, meddra_dict, True)
                        vsm_model.process_one_doc(section, pred_entities2, meddra_dict, None, True)
                        neural_model.process_one_doc(section, pred_entities3, meddra_dict, None, True)

                        # merge pred_entities1, pred_entities2, pred_entities3 into entities
                        ensemble.merge_result(pred_entities1, pred_entities2, pred_entities3, entities, meddra_dict, True, vsm_model.dict_alphabet, data)

                elif opt.norm_rule:
                    multi_sieve.runMultiPassSieve(section, entities, meddra_dict, True)
                elif opt.norm_vsm:
                    vsm_model.process_one_doc(section, entities, meddra_dict, None, True)
                elif opt.norm_neural:
                    neural_model.process_one_doc(section, entities, meddra_dict, None, True)


                for entity in entities:
                    if len(entity.norm_ids) != 0:  # if a mention can't be normalized, don't output it
                        pred_entities.append(entity)


            dump_results(fileName, pred_entities, opt, annotation_file)

            end = time.time()
            logging.info("process %s complete with %.2fs" % (fileName, end - start))

            ct_success += 1
        except Exception as e:
            logging.error("process file {} error: {}".format(fileName, e))
            ct_error += 1

    if opt.norm_rule:
        multi_sieve.finalize(True)

    logging.info("test finished, total {}, error {}".format(ct_success + ct_error, ct_error))
Example #8
def main():

    logger = logging.getLogger()
    if opt.verbose:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    logging.info(opt)

    if opt.random_seed != 0:
        random.seed(opt.random_seed)
        np.random.seed(opt.random_seed)
        torch.manual_seed(opt.random_seed)
        torch.cuda.manual_seed_all(opt.random_seed)

    d = data.Data(opt)
    logging.info(d.config)

    makedir_and_clear(opt.output)

    documents = load_data_fda(opt.train_file, opt.types, opt.type_filter, True,
                              True)

    logging.info("use {} fold cross validataion".format(opt.cross_validation))
    fold_num = opt.cross_validation
    total_doc_num = len(documents)
    dev_doc_num = total_doc_num // fold_num

    macro_p = 0.0
    macro_r = 0.0
    macro_f = 0.0

    meddra_dict = load_meddra_dict(d)

    for fold_idx in range(fold_num):

        fold_start = fold_idx * dev_doc_num
        fold_end = fold_idx * dev_doc_num + dev_doc_num
        if fold_end > total_doc_num:
            fold_end = total_doc_num
        if fold_idx == fold_num - 1 and fold_end < total_doc_num:
            fold_end = total_doc_num

        train_data = []
        train_data.extend(documents[:fold_start])
        train_data.extend(documents[fold_end:])
        dev_data = documents[fold_start:fold_end]

        logging.info("begin fold {}".format(fold_idx))
        logging.info("doc start {}, doc end {}".format(fold_start, fold_end))

        p, r, f = train(train_data, dev_data, None, d, meddra_dict, None, opt,
                        fold_idx, True)

        macro_p += p
        macro_r += r
        macro_f += f

    logging.info("the macro averaged p r f are %.4f, %.4f, %.4f" %
                 (macro_p * 1.0 / fold_num, macro_r * 1.0 / fold_num,
                  macro_f * 1.0 / fold_num))
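
The fold boundaries above are computed with integer division, so the last fold absorbs any remainder when the document count is not divisible by the fold number. A quick self-contained check of that arithmetic (25 documents and 10 folds are made-up numbers):

total_doc_num, fold_num = 25, 10
dev_doc_num = total_doc_num // fold_num            # 2 documents per dev fold
for fold_idx in range(fold_num):
    fold_start = fold_idx * dev_doc_num
    fold_end = min(fold_start + dev_doc_num, total_doc_num)
    if fold_idx == fold_num - 1 and fold_end < total_doc_num:
        fold_end = total_doc_num                   # last fold covers documents 18..24
    print(fold_idx, fold_start, fold_end)
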
Example #9
File: pretrain.py  Project: pj0616/EhrERNIE
def pretrain(opt):

    samples_per_epoch = []
    pregenerated_data = Path(opt.instance_dir)
    for i in range(opt.iter):

        epoch_file = pregenerated_data / f"epoch_{i}.json"
        metrics_file = pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({opt.iter})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = opt.iter

    if opt.gpu >= 0 and torch.cuda.is_available():
        if opt.multi_gpu:
            device = torch.device("cuda")
            n_gpu = torch.cuda.device_count()
        else:
            device = torch.device('cuda', opt.gpu)
            n_gpu = 1
    else:
        device = torch.device("cpu")
        n_gpu = 0

    logging.info("device: {} n_gpu: {}".format(device, n_gpu))

    if opt.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(opt.gradient_accumulation_steps))

    opt.batch_size = opt.batch_size // opt.gradient_accumulation_steps

    makedir_and_clear(opt.save)

    tokenizer = BertTokenizer.from_pretrained(opt.bert_dir,
                                              do_lower_case=opt.do_lower_case)

    total_train_examples = 0
    for i in range(opt.iter):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples / opt.batch_size /
                                       opt.gradient_accumulation_steps)

    logging.info("load dict ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(opt.norm_dict)
    logging.info("dict concept number {}".format(len(UMLS_dict)))
    dict_alphabet = Alphabet('dict')
    init_dict_alphabet(dict_alphabet, UMLS_dict)
    dict_alphabet.close()

    # Prepare model
    model, _ = BertForPreTraining.from_pretrained(
        opt.bert_dir, num_norm_labels=get_dict_size(dict_alphabet))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=opt.lr,
                         warmup=opt.warmup_proportion,
                         t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", opt.batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(opt.iter):
        epoch_dataset = PregeneratedDataset(epoch=epoch,
                                            training_path=pregenerated_data,
                                            tokenizer=tokenizer,
                                            num_data_epochs=num_data_epochs,
                                            dict_alphabet=dict_alphabet)
        train_sampler = RandomSampler(epoch_dataset)

        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=opt.batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        epoch_start = time.time()
        sum_loss = 0
        sum_original_loss = 0
        num_iter = len(train_dataloader)

        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next, input_ids_ent, input_mask_ent, norm_label_ids = batch
                loss, original_loss = model(input_ids, segment_ids, input_mask,
                                            lm_label_ids, input_ids_ent,
                                            input_mask_ent, is_next,
                                            norm_label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                    original_loss = original_loss.mean()
                if opt.gradient_accumulation_steps > 1:
                    loss = loss / opt.gradient_accumulation_steps
                    original_loss = original_loss / opt.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * opt.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")

                if (step + 1) % opt.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                sum_loss += loss.item()
                sum_original_loss += original_loss.item()

        epoch_finish = time.time()
        logging.info(
            "epoch: %s training finished. Time: %.2fs. loss: %.4f, original_loss %.4f"
            % (epoch, epoch_finish - epoch_start, sum_loss / num_iter,
               sum_original_loss / num_iter))

        # Save a trained model
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model_to_save = model.module if hasattr(
            model, 'module') else model  # only save the model itself
        output_model_file = os.path.join(
            opt.save, "pytorch_model_{}.bin".format(str(epoch + 1)))
        torch.save(model_to_save.state_dict(), str(output_model_file))
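
pretrain divides batch_size by gradient_accumulation_steps before building the DataLoader and only steps the optimizer every gradient_accumulation_steps mini-batches, so the effective batch size stays at the original value while each loss is scaled down to keep the accumulated gradient comparable. A minimal, self-contained illustration of the same pattern (toy model and data, not the original training code):

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
accumulation_steps = 4

for step in range(16):
    x, y = torch.randn(8, 4), torch.randn(8, 1)    # stand-in mini-batch
    loss = torch.nn.functional.mse_loss(model(x), y) / accumulation_steps
    loss.backward()                                 # gradients accumulate across steps
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
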
Example #10
File: test.py  Project: pj0616/norm-1-30
def test(data, opt):
    # corpus_dir = join(opt.test_file, 'corpus')
    # corpus_dir = join(opt.test_file, 'txt')
    corpus_dir = opt.test_file

    if opt.nlp_tool == "spacy":
        nlp_tool = spacy.load('en')
    elif opt.nlp_tool == "nltk":
        nlp_tool = nltk.data.load('tokenizers/punkt/english.pickle')
    elif opt.nlp_tool == "stanford":
        nlp_tool = StanfordCoreNLP('http://localhost:{0}'.format(9000))
    else:
        raise RuntimeError("invalid nlp tool")

    corpus_files = [
        f for f in listdir(corpus_dir) if isfile(join(corpus_dir, f))
    ]

    model = SeqModel(data, opt)
    if opt.test_in_cpu:
        model.load_state_dict(
            torch.load(os.path.join(opt.output, 'model.pkl'),
                       map_location='cpu'))
    else:
        model.load_state_dict(torch.load(os.path.join(opt.output,
                                                      'model.pkl')))

    dictionary, dictionary_reverse = umls.load_umls_MRCONSO(
        data.config['norm_dict'])
    isMeddra_dict = False

    # initialize norm models
    if opt.norm_rule and opt.norm_vsm and opt.norm_neural:  # ensemble
        logging.info("use ensemble normer")
        multi_sieve.init(opt, None, data, dictionary, dictionary_reverse,
                         False)
        if opt.ensemble == 'learn':
            if opt.test_in_cpu:
                ensemble_model = torch.load(os.path.join(
                    opt.output, 'ensemble.pkl'),
                                            map_location='cpu')
            else:
                ensemble_model = torch.load(
                    os.path.join(opt.output, 'ensemble.pkl'))
            ensemble_model.eval()
        else:
            if opt.test_in_cpu:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'),
                                       map_location='cpu')
                neural_model = torch.load(os.path.join(opt.output,
                                                       'norm_neural.pkl'),
                                          map_location='cpu')
            else:
                vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
                neural_model = torch.load(
                    os.path.join(opt.output, 'norm_neural.pkl'))

            vsm_model.eval()
            neural_model.eval()

    elif opt.norm_rule:
        logging.info("use rule-based normer")
        multi_sieve.init(opt, None, data, dictionary, dictionary_reverse,
                         False)

    elif opt.norm_vsm:
        logging.info("use vsm-based normer")
        if opt.test_in_cpu:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'),
                                   map_location='cpu')
        else:
            vsm_model = torch.load(os.path.join(opt.output, 'vsm.pkl'))
        vsm_model.eval()

    elif opt.norm_neural:
        logging.info("use neural-based normer")
        if opt.test_in_cpu:
            neural_model = torch.load(os.path.join(opt.output,
                                                   'norm_neural.pkl'),
                                      map_location='cpu')
        else:
            neural_model = torch.load(
                os.path.join(opt.output, 'norm_neural.pkl'))
        neural_model.eval()
    else:
        logging.info("no normalization is performed.")

    makedir_and_clear(opt.predict)

    ct_success = 0
    ct_error = 0

    for fileName in corpus_files:
        try:
            start = time.time()
            document, _, _, _ = processOneFile(fileName, None, corpus_dir,
                                               nlp_tool, False, opt.types,
                                               opt.type_filter)

            data.test_texts = []
            data.test_Ids = []
            read_instance_from_one_document(document, data.word_alphabet,
                                            data.char_alphabet,
                                            data.label_alphabet,
                                            data.test_texts, data.test_Ids,
                                            data)

            _, _, _, _, _, pred_results, _ = evaluate(data, opt, model, 'test',
                                                      False, opt.nbest)

            entities = translateResultsintoEntities(document.sentences,
                                                    pred_results)

            if opt.norm_rule and opt.norm_vsm and opt.norm_neural:
                if opt.ensemble == 'learn':
                    ensemble_model.process_one_doc(document, entities,
                                                   dictionary,
                                                   dictionary_reverse,
                                                   isMeddra_dict)
                else:
                    pred_entities1 = copy.deepcopy(entities)
                    pred_entities2 = copy.deepcopy(entities)
                    pred_entities3 = copy.deepcopy(entities)
                    multi_sieve.runMultiPassSieve(document, pred_entities1,
                                                  dictionary, isMeddra_dict)
                    vsm_model.process_one_doc(document, pred_entities2,
                                              dictionary, dictionary_reverse,
                                              isMeddra_dict)
                    neural_model.process_one_doc(document, pred_entities3,
                                                 dictionary,
                                                 dictionary_reverse,
                                                 isMeddra_dict)

                    # merge pred_entities1, pred_entities2, pred_entities3 into entities
                    ensemble.merge_result(pred_entities1, pred_entities2,
                                          pred_entities3, entities, dictionary,
                                          isMeddra_dict,
                                          vsm_model.dict_alphabet, data)

            elif opt.norm_rule:
                multi_sieve.runMultiPassSieve(document, entities, dictionary,
                                              isMeddra_dict)
            elif opt.norm_vsm:
                vsm_model.process_one_doc(document, entities, dictionary,
                                          dictionary_reverse, isMeddra_dict)
            elif opt.norm_neural:
                neural_model.process_one_doc(document, entities, dictionary,
                                             dictionary_reverse, isMeddra_dict)

            dump_results(fileName, entities, opt)

            end = time.time()
            logging.info("process %s complete with %.2fs" %
                         (fileName, end - start))

            ct_success += 1
        except Exception as e:
            logging.error("process file {} error: {}".format(fileName, e))
            ct_error += 1

    logging.info("test finished, total {}, error {}".format(
        ct_success + ct_error, ct_error))
Example #11
    logging.info(opt)

    if opt.random_seed != 0:
        random.seed(opt.random_seed)
        np.random.seed(opt.random_seed)
        torch.manual_seed(opt.random_seed)
        torch.cuda.manual_seed_all(opt.random_seed)

    if opt.whattodo == 'train':

        config = data.read_config(opt.config)

        logging.info(config)

        makedir_and_clear(opt.save)

        logging.info("load data ...")
        train_data = data.loadData(opt.train_file, True, opt.types,
                                   opt.type_filter)
        dev_data = data.loadData(opt.dev_file, True, opt.types,
                                 opt.type_filter)
        if opt.test_file:
            test_data = data.loadData(opt.test_file, False, opt.types,
                                      opt.type_filter)
        else:
            test_data = None

        logging.info("load dict ...")
        UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(
            config['norm_dict'])