Example #1
def generate_dict_instances(dictionary, dict_alphabet, word_alphabet, isMeddra_dict):
    Xs = []
    Ys = []

    if isMeddra_dict:
        for concept_id, concept_name in dictionary.items():

            Y = norm_utils.get_dict_index(dict_alphabet, concept_id)
            # skip concepts whose id is outside the dict alphabet
            if not (0 <= Y < norm_utils.get_dict_size(dict_alphabet)):
                continue
            Ys.append(Y)

            tokens = my_tokenize(concept_name)
            word_ids = []
            for token in tokens:
                if token in stop_word:
                    continue
                token = norm_utils.word_preprocess(token)
                word_id = word_alphabet.get_index(token)
                word_ids.append(word_id)

            Xs.append(word_ids)
    else:
        for concept_id, concept in dictionary.items():
            Y = norm_utils.get_dict_index(dict_alphabet, concept_id)
            # skip concepts whose id is outside the dict alphabet
            if not (0 <= Y < norm_utils.get_dict_size(dict_alphabet)):
                continue

            # for concept_name in concept.names:
            #
            #     tokens = my_tokenize(concept_name)
            #     word_ids = []
            #     for token in tokens:
            #         token = norm_utils.word_preprocess(token)
            #         word_id = word_alphabet.get_index(token)
            #         word_ids.append(word_id)
            #
            #     Ys.append(Y)
            #     Xs.append(word_ids)


            tokens = my_tokenize(concept.names[0])
            word_ids = []
            for token in tokens:
                if token in stop_word:
                    continue
                token = norm_utils.word_preprocess(token)
                word_id = word_alphabet.get_index(token)
                word_ids.append(word_id)

            Ys.append(Y)
            Xs.append(word_ids)


    return Xs, Ys
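
Every example on this page leans on the same small set of norm_utils helpers (get_dict_index, get_dict_size, get_dict_name, word_preprocess). Their real implementations are not reproduced here, so the following is only a minimal sketch of the behaviour the callers appear to assume; SimpleDictAlphabet and its fields are hypothetical stand-ins, not the project's Alphabet class.

import re

class SimpleDictAlphabet:
    """Hypothetical stand-in for the project's dict_alphabet."""
    def __init__(self, concept_ids):
        self.index2id = list(concept_ids)
        self.id2index = {cid: i for i, cid in enumerate(self.index2id)}

def get_dict_size(dict_alphabet):
    # number of normalization labels known to the alphabet
    return len(dict_alphabet.index2id)

def get_dict_index(dict_alphabet, concept_id):
    # ids not covered by the alphabet map to -1, which the callers skip
    return dict_alphabet.id2index.get(concept_id, -1)

def get_dict_name(dict_alphabet, index):
    # inverse of get_dict_index: label index back to concept id
    return dict_alphabet.index2id[index]

def word_preprocess(token):
    # lowercase and collapse digits so word_alphabet lookups stay stable
    return re.sub(r'\d', '0', token.lower())
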
Example #2
def generate_instances(entities, word_alphabet, dict_alphabet):
    Xs = []
    Ys = []
    dict_size = norm_utils.get_dict_size(dict_alphabet)

    for entity in entities:
        if len(entity.norm_ids) > 0:
            Y = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
            # skip entities whose id is outside the dict alphabet
            if not (0 <= Y < dict_size):
                continue
        else:
            Y = 0

        # mention
        tokens = my_tokenize(entity.name)
        mention = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            mention.append(word_id)

        Xs.append(mention)
        Ys.append(Y)

    return Xs, Ys
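
generate_instances returns ragged Python lists, so they still have to be padded into tensors before training. A minimal collate sketch, assuming PyTorch and padding index 0 (the project's own batching code is not shown on this page):

import torch

def collate_word_instances(Xs, Ys, pad_id=0):
    # pad variable-length word-id lists to the batch maximum
    max_len = max(len(x) for x in Xs)
    batch = torch.full((len(Xs), max_len), pad_id, dtype=torch.long)
    lengths = torch.zeros(len(Xs), dtype=torch.long)
    for i, x in enumerate(Xs):
        batch[i, :len(x)] = torch.tensor(x, dtype=torch.long)
        lengths[i] = len(x)
    labels = torch.tensor(Ys, dtype=torch.long)
    return batch, lengths, labels
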
Example #3
File: vsm.py Project: foxlf823/norm
def generate_instances_ehr(entities, word_alphabet, dict_alphabet, dictionary_reverse):
    Xs = []
    Ys = []
    dict_size = norm_utils.get_dict_size(dict_alphabet)

    for entity in entities:

        if len(entity.norm_ids) > 0:
            if entity.norm_ids[0] in dictionary_reverse:
                cui_list = dictionary_reverse[entity.norm_ids[0]]
                Y = norm_utils.get_dict_index(dict_alphabet, cui_list[0])  # use the first id to generate instance
                if not (0 <= Y < dict_size):
                    raise RuntimeError("entity {}, {}, cui not in dict_alphabet".format(entity.id, entity.name))
            else:
                logging.debug("entity {}, {}, can't map to umls, ignored".format(entity.id, entity.name))
                continue
        else:
            Y = 0

        # mention
        tokens = my_tokenize(entity.name)
        mention = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            mention.append(word_id)

        Xs.append(mention)
        Ys.append(Y)


    return Xs, Ys
Example #4
    def __init__(self, word_alphabet, word_embedding, embedding_dim, dict_alphabet, poses):
        super(Ensemble, self).__init__()

        self.word_alphabet = word_alphabet
        self.embedding_dim = embedding_dim
        self.word_embedding = word_embedding
        self.dict_alphabet = dict_alphabet
        self.gpu = opt.gpu
        self.poses = poses
        self.dict_size = norm_utils.get_dict_size(dict_alphabet)

        self.vsm_linear = nn.Linear(self.embedding_dim, self.embedding_dim, bias=False)
        self.vsm_linear.weight.data.copy_(torch.eye(self.embedding_dim))

        self.neural_linear = nn.Linear(self.embedding_dim, self.dict_size, bias=False)

        # self.hidden_size = 2500
        # self.dropout = nn.Dropout(opt.dropout)
        # self.hidden = nn.Linear(3*self.dict_size, self.hidden_size)
        # self.relu = nn.ReLU()
        #
        # self.output = nn.Linear(self.hidden_size, self.dict_size)

        self.output = nn.Linear(3*self.dict_size, self.dict_size)

        self.criterion = nn.CrossEntropyLoss()

        if opt.gpu >= 0 and torch.cuda.is_available():
            self.word_embedding = self.word_embedding.cuda(self.gpu)
            self.vsm_linear = self.vsm_linear.cuda(self.gpu)
            self.neural_linear = self.neural_linear.cuda(self.gpu)

            # self.hidden = self.hidden.cuda(self.gpu)
            self.output = self.output.cuda(self.gpu)
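
The only layer above whose input size is not self-explanatory is self.output: it maps a 3 * dict_size vector down to dict_size, which suggests the forward pass concatenates three per-concept score vectors (presumably rule, VSM and neural; the forward method itself is not shown here). A tiny shape check under that assumption:

import torch
import torch.nn as nn

batch_size, dict_size = 4, 10
output = nn.Linear(3 * dict_size, dict_size)
# three hypothetical dict_size-wide score vectors, concatenated as the ensemble input
scores = torch.cat([torch.randn(batch_size, dict_size) for _ in range(3)], dim=1)
print(output(scores).shape)  # torch.Size([4, 10])
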
Example #5
def generate_instances(entities, word_alphabet, dict_alphabet):
    Xs = []
    Ys = []

    for entity in entities:
        if len(entity.norm_ids) > 0:
            # use the first id to generate the instance
            Y = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
            # for tac, the id can be none or an oov ID
            if 0 <= Y < norm_utils.get_dict_size(dict_alphabet):
                Ys.append(Y)
            else:
                continue
        else:
            Ys.append(0)

        tokens = my_tokenize(entity.name)
        word_ids = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            word_ids.append(word_id)

        Xs.append(word_ids)

    return Xs, Ys
Example #6
    def __init__(self, config, target):
        super(BertForSequenceClassification, self).__init__(config)
        self.num_labels = get_dict_size(target)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)
        self.apply(self.init_bert_weights)
        self.dict_alphabet = target
        self.criterion = nn.CrossEntropyLoss()
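
The __init__ above only wires up the layers; the snippet does not include a forward pass. Under the old pytorch_pretrained_bert convention, where BertModel returns the encoded layers plus a pooled [CLS] vector, it would plausibly look like the sketch below. Treat the exact call signature as an assumption, not the project's actual code.

    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        # pooled_output is the [CLS] representation produced by BertModel
        _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
                                     output_all_encoded_layers=False)
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        if labels is not None:
            # training: cross-entropy against the dict_alphabet indices
            return self.criterion(logits.view(-1, self.num_labels), labels.view(-1))
        return logits
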
Example #7
def generate_instances_ehr(entities, dict_alphabet, dictionary_reverse):
    Xs = []
    Ys = []

    for entity in entities:
        if len(entity.norm_ids) > 0:
            if entity.norm_ids[0] in dictionary_reverse:
                cui_list = dictionary_reverse[entity.norm_ids[0]]
                # use the first id to generate the instance
                Y = get_dict_index(dict_alphabet, cui_list[0])
                if not (0 <= Y < get_dict_size(dict_alphabet)):
                    raise RuntimeError(
                        "entity {}, {}, cui not in dict_alphabet".format(
                            entity.id, entity.name))
            else:
                logging.debug(
                    "entity {}, {}, can't map to umls, ignored".format(
                        entity.id, entity.name))
                continue
        else:
            Y = 0

        X = {}

        tokens = []
        for token in tokenizer.tokenize(entity.name):
            if token in stop_word:
                continue
            token = word_preprocess(token)
            tokens.append(token)

        tokens.insert(0, '[CLS]')
        tokens.append('[SEP]')
        word_ids = tokenizer.convert_tokens_to_ids(tokens)

        if len(word_ids) == 0:
            continue

        X['token'] = word_ids
        X['segment'] = [0] * len(word_ids)
        X['mask'] = [1] * len(word_ids)

        # append the mention and its label together so Xs and Ys stay aligned
        Xs.append(X)
        Ys.append(Y)

    return Xs, Ys
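
Each X produced above is a dict of unpadded id lists ('token', 'segment', 'mask'). Before they can be fed to a BERT model they need to be padded to a common length; a minimal sketch assuming PyTorch (the project's real batching code is not shown):

import torch

def collate_bert_instances(Xs, Ys, pad_id=0):
    max_len = max(len(x['token']) for x in Xs)

    def pad(key):
        return torch.tensor(
            [x[key] + [pad_id] * (max_len - len(x[key])) for x in Xs],
            dtype=torch.long)

    input_ids = pad('token')      # wordpiece ids, zero-padded
    segment_ids = pad('segment')  # all zeros for single-mention inputs
    input_mask = pad('mask')      # 1 for real tokens, 0 for padding
    labels = torch.tensor(Ys, dtype=torch.long)
    return input_ids, segment_ids, input_mask, labels
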
Example #8
    def __init__(self, word_alphabet, word_embedding, embedding_dim, dict_alphabet):
        super(NeuralNormer, self).__init__()
        self.word_alphabet = word_alphabet
        self.embedding_dim = embedding_dim
        self.word_embedding = word_embedding
        self.dict_alphabet = dict_alphabet
        self.gpu = opt.gpu
        self.word_drop = nn.Dropout(opt.dropout)
        self.attn = DotAttentionLayer(self.embedding_dim)
        self.linear = nn.Linear(self.embedding_dim, norm_utils.get_dict_size(self.dict_alphabet), bias=False)
        self.criterion = nn.CrossEntropyLoss()

        if opt.gpu >= 0 and torch.cuda.is_available():
            self.word_embedding = self.word_embedding.cuda(self.gpu)
            self.attn = self.attn.cuda(self.gpu)
            self.linear = self.linear.cuda(self.gpu)
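
NeuralNormer (and VsmNormer below) pool the mention's word embeddings through a DotAttentionLayer whose code is not reproduced on this page. A common way to implement that kind of length-masked attention pooling is sketched here; the real layer in foxlf823/norm may differ in its details.

import torch
import torch.nn as nn
import torch.nn.functional as F

class DotAttentionLayerSketch(nn.Module):
    """Hypothetical length-masked attention pooling over (batch, seq, dim) inputs."""
    def __init__(self, hidden_size):
        super(DotAttentionLayerSketch, self).__init__()
        self.scorer = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, inputs, lengths):
        # inputs: (batch, seq, dim); lengths: (batch,)
        scores = self.scorer(inputs).squeeze(-1)  # (batch, seq)
        mask = torch.arange(inputs.size(1), device=inputs.device)[None, :] >= lengths[:, None]
        scores = scores.masked_fill(mask, float('-inf'))
        alphas = F.softmax(scores, dim=-1)  # attention weights per token
        return torch.bmm(alphas.unsqueeze(1), inputs).squeeze(1)  # (batch, dim)
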
Example #9
File: vsm.py Project: foxlf823/norm
    def __init__(self, word_alphabet, word_embedding, embedding_dim, dict_alphabet, poses, poses_lengths):
        super(VsmNormer, self).__init__()
        self.word_alphabet = word_alphabet
        self.embedding_dim = embedding_dim
        self.word_embedding = word_embedding
        self.dict_alphabet = dict_alphabet
        self.gpu = opt.gpu
        self.poses = poses
        self.dict_size = norm_utils.get_dict_size(dict_alphabet)
        self.margin = 1
        self.poses_lengths = poses_lengths
        self.word_drop = nn.Dropout(opt.dropout)
        self.attn = DotAttentionLayer(self.embedding_dim)
        self.linear = nn.Linear(self.embedding_dim, self.embedding_dim, bias=False)
        self.linear.weight.data.copy_(torch.eye(self.embedding_dim))

        if opt.gpu >= 0 and torch.cuda.is_available():
            self.word_embedding = self.word_embedding.cuda(self.gpu)
            self.attn = self.attn.cuda(self.gpu)
            self.linear = self.linear.cuda(self.gpu)
Example #10
def init_vector_for_dict(word_alphabet, dict_alphabet, dictionary,
                         isMeddra_dict):

    # pos
    poses = []
    poses_lengths = []
    dict_size = norm_utils.get_dict_size(dict_alphabet)
    max_len = 0
    for i in range(dict_size):

        # pos
        if isMeddra_dict:
            concept_name = dictionary[norm_utils.get_dict_name(
                dict_alphabet, i)]
            tokens = my_tokenize(concept_name)
        else:
            concept = dictionary[norm_utils.get_dict_name(dict_alphabet, i)]
            tokens = my_tokenize(concept.names[0])
        pos = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            pos.append(word_id)

        if len(pos) > max_len:
            max_len = len(pos)

        poses.append(pos)
        poses_lengths.append(len(pos))

    poses = pad_sequence(poses, max_len)
    poses_lengths = torch.LongTensor(poses_lengths)

    if opt.gpu >= 0 and torch.cuda.is_available():
        poses = poses.cuda(opt.gpu)
        poses_lengths = poses_lengths.cuda(opt.gpu)

    return poses, poses_lengths
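
init_vector_for_dict calls a project-level pad_sequence(poses, max_len) whose definition is not shown. Judging from how the result is used (moved to the GPU alongside a LongTensor of lengths), it presumably pads each id list to max_len and stacks them into a LongTensor; a minimal sketch under that assumption:

import torch

def pad_sequence(sequences, max_len, pad_id=0):
    # pad each list of word ids with pad_id up to max_len and stack into a LongTensor
    padded = torch.full((len(sequences), max_len), pad_id, dtype=torch.long)
    for i, seq in enumerate(sequences):
        padded[i, :len(seq)] = torch.tensor(seq, dtype=torch.long)
    return padded
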
Example #11
File: pretrain.py Project: pj0616/EhrERNIE
def pretrain(opt):

    samples_per_epoch = []
    pregenerated_data = Path(opt.instance_dir)
    for i in range(opt.iter):

        epoch_file = pregenerated_data / f"epoch_{i}.json"
        metrics_file = pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(
                f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({opt.iter})."
            )
            print(
                "This script will loop over the available data, but training diversity may be negatively impacted."
            )
            num_data_epochs = i
            break
    else:
        num_data_epochs = opt.iter

    if opt.gpu >= 0 and torch.cuda.is_available():
        if opt.multi_gpu:
            device = torch.device("cuda")
            n_gpu = torch.cuda.device_count()
        else:
            device = torch.device('cuda', opt.gpu)
            n_gpu = 1
    else:
        device = torch.device("cpu")
        n_gpu = 0

    logging.info("device: {} n_gpu: {}".format(device, n_gpu))

    if opt.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(opt.gradient_accumulation_steps))

    opt.batch_size = opt.batch_size // opt.gradient_accumulation_steps

    makedir_and_clear(opt.save)

    tokenizer = BertTokenizer.from_pretrained(opt.bert_dir,
                                              do_lower_case=opt.do_lower_case)

    total_train_examples = 0
    for i in range(opt.iter):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples / opt.batch_size /
                                       opt.gradient_accumulation_steps)

    logging.info("load dict ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(opt.norm_dict)
    logging.info("dict concept number {}".format(len(UMLS_dict)))
    dict_alphabet = Alphabet('dict')
    init_dict_alphabet(dict_alphabet, UMLS_dict)
    dict_alphabet.close()

    # Prepare model
    model, _ = BertForPreTraining.from_pretrained(
        opt.bert_dir, num_norm_labels=get_dict_size(dict_alphabet))
    model.to(device)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
    ]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=opt.lr,
                         warmup=opt.warmup_proportion,
                         t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", opt.batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(opt.iter):
        epoch_dataset = PregeneratedDataset(epoch=epoch,
                                            training_path=pregenerated_data,
                                            tokenizer=tokenizer,
                                            num_data_epochs=num_data_epochs,
                                            dict_alphabet=dict_alphabet)
        train_sampler = RandomSampler(epoch_dataset)

        train_dataloader = DataLoader(epoch_dataset,
                                      sampler=train_sampler,
                                      batch_size=opt.batch_size)
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        epoch_start = time.time()
        sum_loss = 0
        sum_original_loss = 0
        num_iter = len(train_dataloader)

        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next, input_ids_ent, input_mask_ent, norm_label_ids = batch
                loss, original_loss = model(input_ids, segment_ids, input_mask,
                                            lm_label_ids, input_ids_ent,
                                            input_mask_ent, is_next,
                                            norm_label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                    original_loss = original_loss.mean()
                if opt.gradient_accumulation_steps > 1:
                    loss = loss / opt.gradient_accumulation_steps
                    original_loss = original_loss / opt.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * opt.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")

                if (step + 1) % opt.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                sum_loss += loss.item()
                sum_original_loss += original_loss.item()

        epoch_finish = time.time()
        logging.info(
            "epoch: %s training finished. Time: %.2fs. loss: %.4f, original_loss %.4f"
            % (epoch, epoch_finish - epoch_start, sum_loss / num_iter,
               sum_original_loss / num_iter))

        # Save a trained model
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        # Only save the model itself
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = os.path.join(
            opt.save, "pytorch_model_{}.bin".format(str(epoch + 1)))
        torch.save(model_to_save.state_dict(), str(output_model_file))
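
pretrain reads everything from an opt namespace. Based only on the fields referenced in the function above, a call could be wired up roughly as follows; the option names come from the snippet, but every value here is illustrative.

from argparse import Namespace

opt = Namespace(
    instance_dir='pregenerated_data',   # directory holding epoch_{i}.json / epoch_{i}_metrics.json
    iter=3,                             # number of training epochs
    gpu=0, multi_gpu=False,
    gradient_accumulation_steps=1,
    batch_size=32,
    save='saved_model',
    bert_dir='bert-base-cased', do_lower_case=False,
    norm_dict='MRCONSO.RRF',            # UMLS concept file loaded by umls.load_umls_MRCONSO
    lr=3e-5, warmup_proportion=0.1,
)
pretrain(opt)
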
Example #12
def generate_instances(document, word_alphabet, dict_alphabet, dictionary,
                       dictionary_reverse, isMeddra_dict):
    Xs = []
    Ys = []

    # copy entities from gold entities
    pred_entities = []
    for gold in document.entities:
        pred = Entity()
        pred.id = gold.id
        pred.type = gold.type
        pred.spans = gold.spans
        pred.section = gold.section
        pred.name = gold.name
        pred_entities.append(pred)

    multi_sieve.runMultiPassSieve(document, pred_entities, dictionary,
                                  isMeddra_dict)

    for idx, entity in enumerate(document.entities):

        if isMeddra_dict:
            if len(entity.norm_ids) > 0:
                Y = norm_utils.get_dict_index(dict_alphabet,
                                              entity.norm_ids[0])
                if Y >= 0 and Y < norm_utils.get_dict_size(dict_alphabet):
                    Ys.append(Y)
                else:
                    continue
            else:
                Ys.append(0)
        else:
            if len(entity.norm_ids) > 0:
                if entity.norm_ids[0] in dictionary_reverse:
                    cui_list = dictionary_reverse[entity.norm_ids[0]]
                    Y = norm_utils.get_dict_index(
                        dict_alphabet,
                        cui_list[0])  # use the first id to generate instance
                    if Y >= 0 and Y < norm_utils.get_dict_size(dict_alphabet):
                        Ys.append(Y)
                    else:
                        raise RuntimeError(
                            "entity {}, {}, cui not in dict_alphabet".format(
                                entity.id, entity.name))
                else:
                    logging.info(
                        "entity {}, {}, can't map to umls, ignored".format(
                            entity.id, entity.name))
                    continue
            else:
                Ys.append(0)

        X = dict()

        tokens = my_tokenize(entity.name)
        word_ids = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            word_ids.append(word_id)
        X['word'] = word_ids

        # one-hot feature over the dict from the rule-based sieve prediction (all zeros if no rule fired)
        X['rule'] = [0] * norm_utils.get_dict_size(dict_alphabet)
        if pred_entities[idx].rule_id is not None:
            X['rule'][norm_utils.get_dict_index(dict_alphabet, pred_entities[idx].rule_id)] = 1

        Xs.append(X)

    return Xs, Ys