def test_full_tokenizer(self):
    vocab_tokens = [
        "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed",
        "wa", "un", "runn", "##ing", ",", "low", "lowest",
    ]
    with TemporaryDirectory() as tmpdirname:
        vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

        input_text = "UNwant\u00E9d,running"
        output_text = "unwanted, running"
        create_and_check_tokenizer_commons(self, input_text, output_text, BertTokenizer, tmpdirname)

        tokenizer = BertTokenizer(vocab_file)

        tokens = tokenizer.tokenize("UNwant\u00E9d,running")
        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
def test_full_tokenizer(self):
    tokenizer = BertTokenizer(self.vocab_file)

    tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
    self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
    self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
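# NOTE (sketch): the expected ids in both tests follow directly from the vocab
# file written above -- BertTokenizer assigns each token the index of its line,
# so "un" -> 7, "##want" -> 4, "##ed" -> 5, "," -> 10, "runn" -> 8, "##ing" -> 9.
# `load_vocab` below is a stand-in that mirrors that behavior for illustration;
# it is not the library's own implementation.
from collections import OrderedDict

def load_vocab(vocab_file):
    """Map each token to its line index, mirroring how BertTokenizer builds its vocab."""
    vocab = OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as f:
        for index, line in enumerate(f):
            vocab[line.rstrip("\n")] = index
    return vocab

# With the 13-token vocab above this yields {"[UNK]": 0, ..., "un": 7, "runn": 8,
# "##ing": 9, ",": 10, ...}, so ["un", "##want", "##ed", ",", "runn", "##ing"]
# converts to [7, 4, 5, 10, 8, 9] -- exactly what the assertions check.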
def main():
    # **************************** basic setup ***********************
    logger = init_logger(log_name=config['model']['arch'],
                         log_dir=config['output']['log_dir'])
    logger.info(f"seed is {config['train']['seed']}")
    device = ('cuda:%d' % config['train']['n_gpu'][0]
              if len(config['train']['n_gpu']) else 'cpu')
    seed_everything(seed=config['train']['seed'], device=device)
    logger.info('starting to load data from disk')
    id2label = {value: key for key, value in config['label2id'].items()}

    # **************************** data preparation ***********************
    DT = DataTransformer(logger=logger, seed=config['train']['seed'])
    # read the test set
    targets, sentences = DT.read_data(
        raw_data_path=config['data']['test_file_path'],
        preprocessor=EnglishPreProcessor(),
        is_train=False)
    tokenizer = BertTokenizer(
        vocab_file=config['pretrained']['bert']['vocab_path'],
        do_lower_case=config['train']['do_lower_case'])
    # test dataset
    test_dataset = CreateDataset(data=list(zip(sentences, targets)),
                                 tokenizer=tokenizer,
                                 max_seq_len=config['train']['max_seq_len'],
                                 seed=config['train']['seed'],
                                 example_type='test')
    # test data loader
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=config['train']['batch_size'],
                             num_workers=config['train']['num_workers'],
                             shuffle=False,
                             drop_last=False,
                             pin_memory=False)

    # **************************** model ***********************
    logger.info("initializing model")
    model = BertFine.from_pretrained(
        config['pretrained']['bert']['bert_model_dir'],
        cache_dir=config['output']['cache_dir'],
        num_classes=len(id2label))

    # **************************** prediction ***********************
    logger.info('model predicting....')
    predicter = Predicter(
        model=model,
        logger=logger,
        n_gpu=config['train']['n_gpu'],
        model_path=config['output']['checkpoint_dir'] / f"best_{config['model']['arch']}_model.pth",
    )
    # run prediction
    result = predicter.predict(data=test_loader)
    print(result)

    # free GPU memory
    if len(config['train']['n_gpu']) > 0:
        torch.cuda.empty_cache()
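# NOTE (sketch): both the prediction entry point above and the training one
# below read everything from a module-level `config` mapping. The keys here are
# copied from the lookups in those functions; every value is a hypothetical
# placeholder, not taken from this repo.
from pathlib import Path

config = {
    'model': {'arch': 'bert_fine'},
    'label2id': {'label_a': 0, 'label_b': 1},
    'train': {
        'seed': 42,
        'n_gpu': [0],              # empty list -> run on CPU
        'do_lower_case': True,
        'max_seq_len': 128,
        'batch_size': 32,
        'num_workers': 2,
    },
    'data': {'test_file_path': 'data/test.csv'},
    'pretrained': {'bert': {'vocab_path': 'bert/vocab.txt',
                            'bert_model_dir': 'bert/'}},
    'output': {'log_dir': 'logs/',
               'cache_dir': 'cache/',
               'checkpoint_dir': Path('checkpoints/')},  # must be a Path: it is joined with `/` in main()
}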
def main():
    # **************************** basic setup ***********************
    logger = init_logger(log_name=config['model']['arch'],
                         log_dir=config['output']['log_dir'])
    logger.info(f"seed is {config['train']['seed']}")
    device = (f"cuda:{config['train']['n_gpu'][0]}"
              if len(config['train']['n_gpu']) else 'cpu')
    seed_everything(seed=config['train']['seed'], device=device)
    logger.info('starting to load data from disk')
    id2label = {value: key for key, value in config['label2id'].items()}

    # **************************** data preparation ***********************
    DT = DataTransformer(logger=logger, seed=config['train']['seed'])
    # read the raw dataset and split it
    targets, sentences = DT.read_data(raw_data_path=config['data']['raw_data_path'],
                                      preprocessor=EnglishPreProcessor(),
                                      is_train=True)
    train, valid = DT.train_val_split(X=sentences, y=targets, save=True,
                                      shuffle=True, stratify=False,
                                      valid_size=config['train']['valid_size'],
                                      train_path=config['data']['train_file_path'],
                                      valid_path=config['data']['valid_file_path'])
    tokenizer = BertTokenizer(vocab_file=config['pretrained']['bert']['vocab_path'],
                              do_lower_case=config['train']['do_lower_case'])
    # train
    train_dataset = CreateDataset(data=train,
                                  tokenizer=tokenizer,
                                  max_seq_len=config['train']['max_seq_len'],
                                  seed=config['train']['seed'],
                                  example_type='train')
    # valid
    valid_dataset = CreateDataset(data=valid,
                                  tokenizer=tokenizer,
                                  max_seq_len=config['train']['max_seq_len'],
                                  seed=config['train']['seed'],
                                  example_type='valid')
    # training data loader
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=config['train']['batch_size'],
                              num_workers=config['train']['num_workers'],
                              shuffle=True,
                              drop_last=False,
                              pin_memory=False)
    # validation data loader
    valid_loader = DataLoader(dataset=valid_dataset,
                              batch_size=config['train']['batch_size'],
                              num_workers=config['train']['num_workers'],
                              shuffle=False,
                              drop_last=False,
                              pin_memory=False)

    # **************************** model ***********************
    logger.info("initializing model")
    model = BertFine.from_pretrained(config['pretrained']['bert']['bert_model_dir'],
                                     cache_dir=config['output']['cache_dir'],
                                     num_classes=len(id2label))

    # ************************** optimizer *************************
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    num_train_steps = int(len(train_dataset.examples)
                          / config['train']['batch_size']
                          / config['train']['gradient_accumulation_steps']
                          * config['train']['epochs'])
    # t_total: total number of training steps for the learning rate schedule
    # warmup: portion of t_total spent ramping the learning rate up
    optimizer = AdamW(optimizer_grouped_parameters, lr=config['train']['learning_rate'])
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=config['train']['warmup_steps'],
                                     t_total=num_train_steps)

    # **************************** callbacks ***********************
    logger.info("initializing callbacks")
    # model checkpointing
    model_checkpoint = ModelCheckpoint(checkpoint_dir=config['output']['checkpoint_dir'],
                                       mode=config['callbacks']['mode'],
                                       monitor=config['callbacks']['monitor'],
                                       save_best_only=config['callbacks']['save_best_only'],
                                       arch=config['model']['arch'],
                                       logger=logger)
    # training-process monitoring
    train_monitor = TrainingMonitor(file_dir=config['output']['figure_dir'],
                                    arch=config['model']['arch'])
    # learning-rate schedule
    lr_scheduler = BertLR(optimizer=optimizer,
                          learning_rate=config['train']['learning_rate'],
                          t_total=num_train_steps,
                          warmup=config['train']['warmup_steps'])

    # **************************** training model ***********************
    logger.info('training model....')
    train_configs = {
        'model': model,
        'logger': logger,
        'optimizer': optimizer,
        'scheduler': scheduler,
        'resume': config['train']['resume'],
        'epochs': config['train']['epochs'],
        'n_gpu': config['train']['n_gpu'],
        'gradient_accumulation_steps': config['train']['gradient_accumulation_steps'],
        'epoch_metrics': [F1Score(average='micro', task_type='binary'),
                          MultiLabelReport(id2label=id2label)],
        'batch_metrics': [AccuracyThresh(thresh=0.5)],
        'criterion': BCEWithLogLoss(),
        'model_checkpoint': model_checkpoint,
        'training_monitor': train_monitor,
        'lr_scheduler': lr_scheduler,
        'early_stopping': None,
        'verbose': 1
    }
    trainer = Trainer(train_configs=train_configs)
    # fit the model
    trainer.train(train_data=train_loader, valid_data=valid_loader)

    # free GPU memory
    if len(config['train']['n_gpu']) > 0:
        torch.cuda.empty_cache()
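# NOTE (sketch): the training script wires up two schedule objects (`scheduler`
# and the `BertLR` callback); both implement the same warmup-then-linear-decay
# shape. The standalone multiplier below matches the warmup_steps/t_total
# semantics of WarmupLinearSchedule as I understand them -- an illustration,
# not the library's code.
def warmup_linear_multiplier(step, warmup_steps, t_total):
    """LR multiplier: ramp linearly from 0 to 1 over warmup_steps,
    then decay linearly back to 0 at t_total."""
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))

# e.g. with warmup_steps=100, t_total=1000:
# step 50 -> 0.5, step 100 -> 1.0, step 550 -> 0.5, step 1000 -> 0.0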
PUBTATOR_FILE = '/mnt/nfs/scratch1/rangell/BLINK/tmp/corpus_pubtator.txt'
PRED_PUBTATOR_FILE = '/mnt/nfs/scratch1/rangell/BLINK/tmp/pred_corpus_pubtator.txt'
PRED_MATCHES_FILE = '/mnt/nfs/scratch1/rangell/BLINK/tmp/matches_pred_corpus_pubtator.tsv'
TEST_PMIDS_FILE = '/mnt/nfs/scratch1/rangell/BLINK/tmp/corpus_pubtator_pmids_test.txt'

DATA_DIR = '/mnt/nfs/scratch1/rangell/BLINK/data/'
DATASET = 'medmentions'
OUTPUT_DIR = '/mnt/nfs/scratch1/rangell/BLINK/data/{}/taggerOne'.format(DATASET)


if __name__ == '__main__':
    # get tokenizer
    tokenizer = BertTokenizer(
        '../lerac/coref_entity_linking/models/biobert_v1.1_pubmed/vocab.txt',
        do_lower_case=False)

    # get all test pmids
    with open(TEST_PMIDS_FILE, 'r') as f:
        test_pmids = set(map(lambda x: x.strip(), f.readlines()))

    # get all of the documents
    raw_docs = defaultdict(str)
    gold_mention_labels = {}
    with open(PUBTATOR_FILE, 'r') as f:
        for line in f:
            line_split = line.split('|')
            if len(line_split) == 3:
                if line_split[0] not in test_pmids:
                    continue
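# NOTE (sketch): the `len(line_split) == 3` branch above matches PubTator's
# pipe-delimited text lines. For reference, the PubTator format interleaves
# pipe-delimited title/abstract lines with tab-delimited mention lines, with a
# blank line between documents, e.g.:
#   25763772|t|DCTN4 as a modifier of chronic Pseudomonas aeruginosa infection
#   25763772|a|Pseudomonas aeruginosa (Pa) infection in cystic fibrosis ...
#   25763772<TAB>0<TAB>5<TAB>DCTN4<TAB>T116,T123<TAB>C4308010
# The standalone classifier below is my own illustration of that layout; field
# names are my labels, not identifiers from this repo.
def parse_pubtator_line(line):
    """Classify one PubTator line as a text line, a mention line, or a separator."""
    line = line.rstrip('\n')
    parts = line.split('|', 2)           # maxsplit=2 so '|' inside the text survives
    if len(parts) == 3 and parts[1] in ('t', 'a'):
        pmid, field, text = parts        # title ('t') or abstract ('a') line
        return ('text', pmid, field, text)
    cols = line.split('\t')
    if len(cols) == 6:                   # pmid, start, end, mention, semantic type(s), concept id
        return ('mention', cols[0], int(cols[1]), int(cols[2]), cols[3], cols[4], cols[5])
    return None                          # blank separator between documents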