def generate_dict_instances(dictionary, dict_alphabet, word_alphabet, isMeddra_dict):
    Xs = []
    Ys = []

    if isMeddra_dict:
        for concept_id, concept_name in dictionary.items():
            Y = norm_utils.get_dict_index(dict_alphabet, concept_id)
            if Y >= 0 and Y < norm_utils.get_dict_size(dict_alphabet):
                Ys.append(Y)
            else:
                continue

            tokens = my_tokenize(concept_name)
            word_ids = []
            for token in tokens:
                if token in stop_word:
                    continue
                token = norm_utils.word_preprocess(token)
                word_id = word_alphabet.get_index(token)
                word_ids.append(word_id)

            Xs.append(word_ids)
    else:
        for concept_id, concept in dictionary.items():
            Y = norm_utils.get_dict_index(dict_alphabet, concept_id)
            if Y < 0 or Y >= norm_utils.get_dict_size(dict_alphabet):
                continue

            # for concept_name in concept.names:
            #
            #     tokens = my_tokenize(concept_name)
            #     word_ids = []
            #     for token in tokens:
            #         token = norm_utils.word_preprocess(token)
            #         word_id = word_alphabet.get_index(token)
            #         word_ids.append(word_id)
            #
            #     Ys.append(Y)
            #     Xs.append(word_ids)

            tokens = my_tokenize(concept.names[0])
            word_ids = []
            for token in tokens:
                if token in stop_word:
                    continue
                token = norm_utils.word_preprocess(token)
                word_id = word_alphabet.get_index(token)
                word_ids.append(word_id)

            Ys.append(Y)
            Xs.append(word_ids)

    return Xs, Ys

def generate_instances(entities, word_alphabet, dict_alphabet):
    Xs = []
    Ys = []
    dict_size = norm_utils.get_dict_size(dict_alphabet)

    for entity in entities:
        if len(entity.norm_ids) > 0:
            Y = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
            if Y < 0 or Y >= dict_size:
                continue
        else:
            Y = 0

        # mention
        tokens = my_tokenize(entity.name)
        mention = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            mention.append(word_id)

        Xs.append(mention)
        Ys.append(Y)

    return Xs, Ys

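# A minimal sketch (not part of the original code) of how the (Xs, Ys) pairs returned
# above could be padded and turned into tensors for training. The helper name
# `batch_instances` and the `pad_id` default are illustrative assumptions.
def batch_instances(Xs, Ys, pad_id=0):
    max_len = max(len(x) for x in Xs)
    padded = [x + [pad_id] * (max_len - len(x)) for x in Xs]   # pad every mention to max_len
    lengths = torch.LongTensor([len(x) for x in Xs])           # true lengths, e.g. for masking
    return torch.LongTensor(padded), lengths, torch.LongTensor(Ys)
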
def generate_instances_ehr(entities, word_alphabet, dict_alphabet, dictionary_reverse):
    Xs = []
    Ys = []
    dict_size = norm_utils.get_dict_size(dict_alphabet)

    for entity in entities:
        if len(entity.norm_ids) > 0:
            if entity.norm_ids[0] in dictionary_reverse:
                cui_list = dictionary_reverse[entity.norm_ids[0]]
                Y = norm_utils.get_dict_index(dict_alphabet, cui_list[0])  # use the first id to generate instance
                if Y < 0 or Y >= dict_size:
                    raise RuntimeError("entity {}, {}, cui not in dict_alphabet".format(entity.id, entity.name))
            else:
                logging.debug("entity {}, {}, can't map to umls, ignored".format(entity.id, entity.name))
                continue
        else:
            Y = 0

        # mention
        tokens = my_tokenize(entity.name)
        mention = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            mention.append(word_id)

        Xs.append(mention)
        Ys.append(Y)

    return Xs, Ys

def __init__(self, word_alphabet, word_embedding, embedding_dim, dict_alphabet, poses):
    super(Ensemble, self).__init__()
    self.word_alphabet = word_alphabet
    self.embedding_dim = embedding_dim
    self.word_embedding = word_embedding
    self.dict_alphabet = dict_alphabet
    self.gpu = opt.gpu
    self.poses = poses
    self.dict_size = norm_utils.get_dict_size(dict_alphabet)

    self.vsm_linear = nn.Linear(self.embedding_dim, self.embedding_dim, bias=False)
    self.vsm_linear.weight.data.copy_(torch.eye(self.embedding_dim))

    self.neural_linear = nn.Linear(self.embedding_dim, self.dict_size, bias=False)

    # self.hidden_size = 2500
    # self.dropout = nn.Dropout(opt.dropout)
    # self.hidden = nn.Linear(3*self.dict_size, self.hidden_size)
    # self.relu = nn.ReLU()
    #
    # self.output = nn.Linear(self.hidden_size, self.dict_size)
    self.output = nn.Linear(3 * self.dict_size, self.dict_size)

    self.criterion = nn.CrossEntropyLoss()

    if opt.gpu >= 0 and torch.cuda.is_available():
        self.word_embedding = self.word_embedding.cuda(self.gpu)
        self.vsm_linear = self.vsm_linear.cuda(self.gpu)
        self.neural_linear = self.neural_linear.cuda(self.gpu)
        # self.hidden = self.hidden.cuda(self.gpu)
        self.output = self.output.cuda(self.gpu)

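# A minimal sketch (not the original forward) of how the three dict_size-dimensional
# score vectors the Ensemble works with -- VSM similarity scores, neural classifier
# scores, and the one-hot rule feature built in generate_instances further below --
# could be concatenated and fed to self.output; the function name and arguments are
# illustrative assumptions, not part of the original model.
def ensemble_combine_sketch(output_layer, vsm_scores, neural_scores, rule_onehot):
    # Each argument has shape (batch, dict_size); the result has shape (batch, dict_size).
    merged = torch.cat([vsm_scores, neural_scores, rule_onehot], dim=1)  # (batch, 3*dict_size)
    return output_layer(merged)
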
def generate_instances(entities, word_alphabet, dict_alphabet):
    Xs = []
    Ys = []

    for entity in entities:
        if len(entity.norm_ids) > 0:
            Y = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])  # use the first id to generate instance
            if Y >= 0 and Y < norm_utils.get_dict_size(dict_alphabet):  # for tac, can be none or oov ID
                Ys.append(Y)
            else:
                continue
        else:
            Ys.append(0)

        tokens = my_tokenize(entity.name)
        word_ids = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            word_ids.append(word_id)

        Xs.append(word_ids)

    return Xs, Ys

def __init__(self, config, target):
    super(BertForSequenceClassification, self).__init__(config)
    # The label set is the concept dictionary: one class per normalization target.
    self.num_labels = get_dict_size(target)
    self.bert = BertModel(config)
    self.dropout = nn.Dropout(config.hidden_dropout_prob)
    self.classifier = nn.Linear(config.hidden_size, self.num_labels)
    self.apply(self.init_bert_weights)

    self.dict_alphabet = target
    self.criterion = nn.CrossEntropyLoss()

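# A minimal sketch, assuming the pytorch_pretrained_bert BertModel interface, of the
# forward pass such a classification head typically performs (dropout on the pooled
# [CLS] vector, then a linear layer over the dictionary-sized label space); the
# original forward method is not shown in this listing.
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
    _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
                                 output_all_encoded_layers=False)
    pooled_output = self.dropout(pooled_output)
    logits = self.classifier(pooled_output)
    if labels is not None:
        return self.criterion(logits.view(-1, self.num_labels), labels.view(-1))
    return logits
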
def generate_instances_ehr(entities, dict_alphabet, dictionary_reverse):
    Xs = []
    Ys = []

    for entity in entities:
        if len(entity.norm_ids) > 0:
            if entity.norm_ids[0] in dictionary_reverse:
                cui_list = dictionary_reverse[entity.norm_ids[0]]
                Y = get_dict_index(dict_alphabet, cui_list[0])  # use the first id to generate instance
                if Y >= 0 and Y < get_dict_size(dict_alphabet):
                    Ys.append(Y)
                else:
                    raise RuntimeError("entity {}, {}, cui not in dict_alphabet".format(entity.id, entity.name))
            else:
                logging.debug("entity {}, {}, can't map to umls, ignored".format(entity.id, entity.name))
                continue
        else:
            Ys.append(0)

        X = {}

        tokens = []
        for token in tokenizer.tokenize(entity.name):
            if token in stop_word:
                continue
            token = word_preprocess(token)
            tokens.append(token)

        tokens.insert(0, '[CLS]')
        tokens.append('[SEP]')
        word_ids = tokenizer.convert_tokens_to_ids(tokens)
        if len(word_ids) == 0:  # effectively unreachable since [CLS] and [SEP] are always added
            continue

        X['token'] = word_ids
        X['segment'] = [0] * len(word_ids)
        X['mask'] = [1] * len(word_ids)

        Xs.append(X)

    return Xs, Ys

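# A minimal sketch (the helper name is hypothetical) of how the X dicts built above
# could be padded into equal-length BERT input tensors; padding with id 0 matches the
# [PAD] token in the standard BERT vocabularies.
def batch_bert_instances(Xs, Ys):
    max_len = max(len(x['token']) for x in Xs)
    tokens, segments, masks = [], [], []
    for x in Xs:
        pad = max_len - len(x['token'])
        tokens.append(x['token'] + [0] * pad)
        segments.append(x['segment'] + [0] * pad)
        masks.append(x['mask'] + [0] * pad)
    return (torch.LongTensor(tokens), torch.LongTensor(segments),
            torch.LongTensor(masks), torch.LongTensor(Ys))
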
def __init__(self, word_alphabet, word_embedding, embedding_dim, dict_alphabet):
    super(NeuralNormer, self).__init__()
    self.word_alphabet = word_alphabet
    self.embedding_dim = embedding_dim
    self.word_embedding = word_embedding
    self.dict_alphabet = dict_alphabet
    self.gpu = opt.gpu

    self.word_drop = nn.Dropout(opt.dropout)
    self.attn = DotAttentionLayer(self.embedding_dim)
    self.linear = nn.Linear(self.embedding_dim, norm_utils.get_dict_size(self.dict_alphabet), bias=False)
    self.criterion = nn.CrossEntropyLoss()

    if opt.gpu >= 0 and torch.cuda.is_available():
        self.word_embedding = self.word_embedding.cuda(self.gpu)
        self.attn = self.attn.cuda(self.gpu)
        self.linear = self.linear.cuda(self.gpu)

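# DotAttentionLayer is used above but not defined in this listing. The sketch below is
# a plausible implementation of dot-product attention pooling over a mention's word
# embeddings (one learned query vector scores each token); the project's actual class
# may differ in its exact interface.
class DotAttentionLayer(nn.Module):
    def __init__(self, hidden_size):
        super(DotAttentionLayer, self).__init__()
        self.W = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, inputs, lengths):
        # inputs: (batch, seq_len, hidden_size); lengths: (batch,)
        scores = self.W(inputs).squeeze(-1)                                  # (batch, seq_len)
        mask = torch.arange(inputs.size(1), device=inputs.device)[None, :] >= lengths[:, None]
        scores = scores.masked_fill(mask, float('-inf'))                     # ignore padding positions
        alphas = torch.softmax(scores, dim=-1)                               # attention weights
        return torch.bmm(alphas.unsqueeze(1), inputs).squeeze(1)             # (batch, hidden_size)
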
def __init__(self, word_alphabet, word_embedding, embedding_dim, dict_alphabet, poses, poses_lengths):
    super(VsmNormer, self).__init__()
    self.word_alphabet = word_alphabet
    self.embedding_dim = embedding_dim
    self.word_embedding = word_embedding
    self.dict_alphabet = dict_alphabet
    self.gpu = opt.gpu
    self.poses = poses
    self.dict_size = norm_utils.get_dict_size(dict_alphabet)
    self.margin = 1
    self.poses_lengths = poses_lengths

    self.word_drop = nn.Dropout(opt.dropout)
    self.attn = DotAttentionLayer(self.embedding_dim)
    self.linear = nn.Linear(self.embedding_dim, self.embedding_dim, bias=False)
    self.linear.weight.data.copy_(torch.eye(self.embedding_dim))

    if opt.gpu >= 0 and torch.cuda.is_available():
        self.word_embedding = self.word_embedding.cuda(self.gpu)
        self.attn = self.attn.cuda(self.gpu)
        self.linear = self.linear.cuda(self.gpu)

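# A minimal sketch (an assumption, not the original code) of how a VSM normalizer like
# the one above typically scores a mention: project the attention-pooled mention
# embedding with the identity-initialized linear layer, then compare it with every
# pooled dictionary-name embedding by cosine similarity; the best-scoring concept is
# the prediction. All names below are illustrative.
def vsm_score(mention_emb, dict_embs, linear):
    # mention_emb: (batch, dim); dict_embs: (dict_size, dim) -> scores: (batch, dict_size)
    projected = linear(mention_emb)
    return torch.cosine_similarity(projected.unsqueeze(1), dict_embs.unsqueeze(0), dim=2)
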
def init_vector_for_dict(word_alphabet, dict_alphabet, dictionary, isMeddra_dict):
    # pos: word-id sequences for every dictionary concept name, later padded into a tensor
    poses = []
    poses_lengths = []
    dict_size = norm_utils.get_dict_size(dict_alphabet)
    max_len = 0

    for i in range(dict_size):
        # pos
        if isMeddra_dict:
            concept_name = dictionary[norm_utils.get_dict_name(dict_alphabet, i)]
            tokens = my_tokenize(concept_name)
        else:
            concept = dictionary[norm_utils.get_dict_name(dict_alphabet, i)]
            tokens = my_tokenize(concept.names[0])

        pos = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            pos.append(word_id)

        if len(pos) > max_len:
            max_len = len(pos)
        poses.append(pos)
        poses_lengths.append(len(pos))

    poses = pad_sequence(poses, max_len)
    poses_lengths = torch.LongTensor(poses_lengths)

    if opt.gpu >= 0 and torch.cuda.is_available():
        poses = poses.cuda(opt.gpu)
        poses_lengths = poses_lengths.cuda(opt.gpu)

    return poses, poses_lengths

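# `pad_sequence` above is a project helper rather than torch.nn.utils.rnn.pad_sequence:
# judging by its call site it takes a list of word-id lists plus the maximum length and
# returns a LongTensor (it is moved to the GPU with .cuda() right after). A minimal
# sketch under that assumption:
def pad_sequence_sketch(sequences, max_len, pad_id=0):
    padded = [seq + [pad_id] * (max_len - len(seq)) for seq in sequences]
    return torch.LongTensor(padded)
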
def pretrain(opt):
    samples_per_epoch = []

    pregenerated_data = Path(opt.instance_dir)

    for i in range(opt.iter):
        epoch_file = pregenerated_data / f"epoch_{i}.json"
        metrics_file = pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({opt.iter}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        num_data_epochs = opt.iter

    if opt.gpu >= 0 and torch.cuda.is_available():
        if opt.multi_gpu:
            device = torch.device("cuda")
            n_gpu = torch.cuda.device_count()
        else:
            device = torch.device('cuda', opt.gpu)
            n_gpu = 1
    else:
        device = torch.device("cpu")
        n_gpu = 0

    logging.info("device: {} n_gpu: {}".format(device, n_gpu))

    if opt.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            opt.gradient_accumulation_steps))

    opt.batch_size = opt.batch_size // opt.gradient_accumulation_steps

    makedir_and_clear(opt.save)

    tokenizer = BertTokenizer.from_pretrained(opt.bert_dir, do_lower_case=opt.do_lower_case)

    total_train_examples = 0
    for i in range(opt.iter):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(total_train_examples / opt.batch_size / opt.gradient_accumulation_steps)

    logging.info("load dict ...")
    UMLS_dict, UMLS_dict_reverse = umls.load_umls_MRCONSO(opt.norm_dict)
    logging.info("dict concept number {}".format(len(UMLS_dict)))
    dict_alphabet = Alphabet('dict')
    init_dict_alphabet(dict_alphabet, UMLS_dict)
    dict_alphabet.close()

    # Prepare model
    model, _ = BertForPreTraining.from_pretrained(opt.bert_dir, num_norm_labels=get_dict_size(dict_alphabet))
    model.to(device)

    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=opt.lr,
                         warmup=opt.warmup_proportion,
                         t_total=num_train_optimization_steps)

    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", opt.batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    model.train()
    for epoch in range(opt.iter):
        epoch_dataset = PregeneratedDataset(epoch=epoch,
                                            training_path=pregenerated_data,
                                            tokenizer=tokenizer,
                                            num_data_epochs=num_data_epochs,
                                            dict_alphabet=dict_alphabet)
        train_sampler = RandomSampler(epoch_dataset)
        train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=opt.batch_size)

        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        epoch_start = time.time()
        sum_loss = 0
        sum_original_loss = 0
        num_iter = len(train_dataloader)

        with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
            for step, batch in enumerate(train_dataloader):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, lm_label_ids, is_next, \
                    input_ids_ent, input_mask_ent, norm_label_ids = batch

                loss, original_loss = model(input_ids, segment_ids, input_mask, lm_label_ids,
                                            input_ids_ent, input_mask_ent, is_next, norm_label_ids)

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                    original_loss = original_loss.mean()

                if opt.gradient_accumulation_steps > 1:
                    loss = loss / opt.gradient_accumulation_steps
                    original_loss = original_loss / opt.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                pbar.update(1)
                mean_loss = tr_loss * opt.gradient_accumulation_steps / nb_tr_steps
                pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")

                if (step + 1) % opt.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                sum_loss += loss.item()
                sum_original_loss += original_loss.item()

        epoch_finish = time.time()
        logging.info("epoch: %s training finished. Time: %.2fs. loss: %.4f, original_loss %.4f" %
                     (epoch, epoch_finish - epoch_start, sum_loss / num_iter, sum_original_loss / num_iter))

        # Save a trained model
        logging.info("** ** * Saving fine-tuned model ** ** * ")
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(opt.save, "pytorch_model_{}.bin".format(str(epoch + 1)))
        torch.save(model_to_save.state_dict(), str(output_model_file))

def generate_instances(document, word_alphabet, dict_alphabet, dictionary, dictionary_reverse, isMeddra_dict):
    Xs = []
    Ys = []

    # copy entities from gold entities
    pred_entities = []
    for gold in document.entities:
        pred = Entity()
        pred.id = gold.id
        pred.type = gold.type
        pred.spans = gold.spans
        pred.section = gold.section
        pred.name = gold.name
        pred_entities.append(pred)

    multi_sieve.runMultiPassSieve(document, pred_entities, dictionary, isMeddra_dict)

    for idx, entity in enumerate(document.entities):
        if isMeddra_dict:
            if len(entity.norm_ids) > 0:
                Y = norm_utils.get_dict_index(dict_alphabet, entity.norm_ids[0])
                if Y >= 0 and Y < norm_utils.get_dict_size(dict_alphabet):
                    Ys.append(Y)
                else:
                    continue
            else:
                Ys.append(0)
        else:
            if len(entity.norm_ids) > 0:
                if entity.norm_ids[0] in dictionary_reverse:
                    cui_list = dictionary_reverse[entity.norm_ids[0]]
                    Y = norm_utils.get_dict_index(dict_alphabet, cui_list[0])  # use the first id to generate instance
                    if Y >= 0 and Y < norm_utils.get_dict_size(dict_alphabet):
                        Ys.append(Y)
                    else:
                        raise RuntimeError("entity {}, {}, cui not in dict_alphabet".format(entity.id, entity.name))
                else:
                    logging.info("entity {}, {}, can't map to umls, ignored".format(entity.id, entity.name))
                    continue
            else:
                Ys.append(0)

        X = dict()

        tokens = my_tokenize(entity.name)
        word_ids = []
        for token in tokens:
            token = norm_utils.word_preprocess(token)
            word_id = word_alphabet.get_index(token)
            word_ids.append(word_id)
        X['word'] = word_ids

        if pred_entities[idx].rule_id is None:
            X['rule'] = [0] * norm_utils.get_dict_size(dict_alphabet)
        else:
            X['rule'] = [0] * norm_utils.get_dict_size(dict_alphabet)
            X['rule'][norm_utils.get_dict_index(dict_alphabet, pred_entities[idx].rule_id)] = 1

        Xs.append(X)

    return Xs, Ys