Example #1
    def test_full_tokenizer(self):
        vocab_tokens = [
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "want",
            "##want",
            "##ed",
            "wa",
            "un",
            "runn",
            "##ing",
            ",",
            "low",
            "lowest",
        ]
        with TemporaryDirectory() as tmpdirname:
            vocab_file = os.path.join(tmpdirname,
                                      VOCAB_FILES_NAMES["vocab_file"])
            with open(vocab_file, "w", encoding="utf-8") as vocab_writer:
                vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

            input_text = "UNwant\u00E9d,running"
            output_text = "unwanted, running"

            create_and_check_tokenizer_commons(self, input_text, output_text,
                                               BertTokenizer, tmpdirname)

            tokenizer = BertTokenizer(vocab_file)

            tokens = tokenizer.tokenize("UNwant\u00E9d,running")
            self.assertListEqual(
                tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
            self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                                 [7, 4, 5, 10, 8, 9])
Example #2
    def test_full_tokenizer(self):
        tokenizer = BertTokenizer(self.vocab_file)

        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
        self.assertListEqual(tokens,
                             ["un", "##want", "##ed", ",", "runn", "##ing"])
        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
                             [7, 4, 5, 10, 8, 9])
Example #3
def main():
    # **************************** basic setup ***********************
    logger = init_logger(log_name=config['model']['arch'],
                         log_dir=config['output']['log_dir'])
    logger.info(f"seed is {config['train']['seed']}")
    device = 'cuda:%d' % config['train']['n_gpu'][0] if len(
        config['train']['n_gpu']) else 'cpu'
    seed_everything(seed=config['train']['seed'], device=device)
    logger.info('starting load data from disk')
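    # invert label2id so predicted indices can be mapped back to label names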
    id2label = {value: key for key, value in config['label2id'].items()}
    # **************************** data generation ***********************
    DT = DataTransformer(logger=logger, seed=config['train']['seed'])

    # read the test set from disk
    targets, sentences = DT.read_data(
        raw_data_path=config['data']['test_file_path'],
        preprocessor=EnglishPreProcessor(),
        is_train=False)
    tokenizer = BertTokenizer(
        vocab_file=config['pretrained']['bert']['vocab_path'],
        do_lower_case=config['train']['do_lower_case'])
    # test dataset
    test_dataset = CreateDataset(data=list(zip(sentences, targets)),
                                 tokenizer=tokenizer,
                                 max_seq_len=config['train']['max_seq_len'],
                                 seed=config['train']['seed'],
                                 example_type='test')
    # test DataLoader
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=config['train']['batch_size'],
                             num_workers=config['train']['num_workers'],
                             shuffle=False,
                             drop_last=False,
                             pin_memory=False)

    # **************************** model ***********************
    logger.info("initializing model")
    model = BertFine.from_pretrained(
        config['pretrained']['bert']['bert_model_dir'],
        cache_dir=config['output']['cache_dir'],
        num_classes=len(id2label))
    # **************************** prediction ***********************
    logger.info('model predicting....')
    predicter = Predicter(
        model=model,
        logger=logger,
        n_gpu=config['train']['n_gpu'],
        model_path=config['output']['checkpoint_dir'] /
        f"best_{config['model']['arch']}_model.pth",
    )
    # run prediction on the test set
    result = predicter.predict(data=test_loader)
    print(result)

    # free GPU memory
    if len(config['train']['n_gpu']) > 0:
        torch.cuda.empty_cache()
Example #4
def main():
    # **************************** basic setup ***********************
    logger = init_logger(log_name=config['model']['arch'], log_dir=config['output']['log_dir'])
    logger.info(f"seed is {config['train']['seed']}")
    device = f"cuda: {config['train']['n_gpu'][0] if len(config['train']['n_gpu']) else 'cpu'}"
    seed_everything(seed=config['train']['seed'],device=device)
    logger.info('starting load data from disk')
    id2label = {value: key for key, value in config['label2id'].items()}

    # **************************** data generation ***********************
    DT = DataTransformer(logger=logger, seed=config['train']['seed'])
    # read the dataset and split it into train / valid
    targets, sentences = DT.read_data(raw_data_path=config['data']['raw_data_path'],
                                      preprocessor=EnglishPreProcessor(),
                                      is_train=True)

    train, valid = DT.train_val_split(X=sentences, y=targets, save=True, shuffle=True, stratify=False,
                                      valid_size=config['train']['valid_size'],
                                      train_path=config['data']['train_file_path'],
                                      valid_path=config['data']['valid_file_path'])

    tokenizer = BertTokenizer(vocab_file=config['pretrained']['bert']['vocab_path'],
                              do_lower_case=config['train']['do_lower_case'])

    # train dataset
    train_dataset = CreateDataset(data=train,
                                  tokenizer=tokenizer,
                                  max_seq_len=config['train']['max_seq_len'],
                                  seed=config['train']['seed'],
                                  example_type='train')
    # valid dataset
    valid_dataset = CreateDataset(data=valid,
                                  tokenizer=tokenizer,
                                  max_seq_len=config['train']['max_seq_len'],
                                  seed=config['train']['seed'],
                                  example_type='valid')
    # training DataLoader
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=config['train']['batch_size'],
                              num_workers=config['train']['num_workers'],
                              shuffle=True,
                              drop_last=False,
                              pin_memory=False)
    # validation DataLoader
    valid_loader = DataLoader(dataset=valid_dataset,
                              batch_size=config['train']['batch_size'],
                              num_workers=config['train']['num_workers'],
                              shuffle=False,
                              drop_last=False,
                              pin_memory=False)

    # **************************** model ***********************
    logger.info("initializing model")
    model = BertFine.from_pretrained(config['pretrained']['bert']['bert_model_dir'],
                                     cache_dir=config['output']['cache_dir'],
                                     num_classes=len(id2label))

    # ************************** optimizer *************************
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
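    # parameters whose names contain any of these substrings are exempt from weight decay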
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    num_train_steps = int(
        len(train_dataset.examples) / config['train']['batch_size'] / config['train']['gradient_accumulation_steps'] * config['train']['epochs'])
    # t_total: total number of training steps for the learning rate schedule
    # warmup: portion of t_total for the warmup
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr = config['train']['learning_rate'])

    scheduler = WarmupLinearSchedule(optimizer, 
                                     warmup_steps=config['train']['warmup_steps'], 
                                     t_total=num_train_steps)

    # **************************** callbacks ***********************
    logger.info("initializing callbacks")
    # model checkpointing
    model_checkpoint = ModelCheckpoint(checkpoint_dir=config['output']['checkpoint_dir'],
                                       mode=config['callbacks']['mode'],
                                       monitor=config['callbacks']['monitor'],
                                       save_best_only=config['callbacks']['save_best_only'],
                                       arch=config['model']['arch'],
                                       logger=logger)
    # monitor the training process
    train_monitor = TrainingMonitor(file_dir=config['output']['figure_dir'],
                                    arch=config['model']['arch'])
    # learning-rate schedule
    lr_scheduler = BertLR(optimizer=optimizer,
                          learning_rate=config['train']['learning_rate'],
                          t_total=num_train_steps,
                          warmup=config['train']['warmup_steps'])

    # **************************** training model ***********************
    logger.info('training model....')

    train_configs = {
        'model': model,
        'logger': logger,
        'optimizer': optimizer,
        'scheduler': scheduler,
        'resume': config['train']['resume'],
        'epochs': config['train']['epochs'],
        'n_gpu': config['train']['n_gpu'],
        'gradient_accumulation_steps': config['train']['gradient_accumulation_steps'],
        'epoch_metrics': [F1Score(average='micro', task_type='binary'), MultiLabelReport(id2label=id2label)],
        'batch_metrics': [AccuracyThresh(thresh=0.5)],
        'criterion': BCEWithLogLoss(),
        'model_checkpoint': model_checkpoint,
        'training_monitor': train_monitor,
        'lr_scheduler': lr_scheduler,
        'early_stopping': None,
        'verbose': 1
    }

    trainer = Trainer(train_configs=train_configs)
    # fit the model
    trainer.train(train_data=train_loader, valid_data=valid_loader)
    # free GPU memory
    if len(config['train']['n_gpu']) > 0:
        torch.cuda.empty_cache()
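The optimizer section above follows the standard BERT fine-tuning recipe: parameters are split into a weight-decay group and a no-decay group (biases and LayerNorm), and the learning rate follows a linear warmup-then-decay schedule. A minimal self-contained sketch of the same pattern with current APIs is shown below; get_linear_schedule_with_warmup is the transformers replacement for the older WarmupLinearSchedule, and the toy model and hyper-parameters are placeholders, not values from this project.

import torch
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

class ToyModel(torch.nn.Module):
    # stand-in for BertFine, just to provide named parameters
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)
        self.LayerNorm = torch.nn.LayerNorm(8)

model = ToyModel()
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = AdamW(grouped_parameters, lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=100,     # placeholder warmup length
                                            num_training_steps=1000)  # placeholder total steps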
Example #5
PUBTATOR_FILE = '/mnt/nfs/scratch1/rangell/BLINK/tmp/corpus_pubtator.txt'
PRED_PUBTATOR_FILE = '/mnt/nfs/scratch1/rangell/BLINK/tmp/pred_corpus_pubtator.txt'
PRED_MATCHES_FILE = '/mnt/nfs/scratch1/rangell/BLINK/tmp/matches_pred_corpus_pubtator.tsv'
TEST_PMIDS_FILE = '/mnt/nfs/scratch1/rangell/BLINK/tmp/corpus_pubtator_pmids_test.txt'
DATA_DIR = '/mnt/nfs/scratch1/rangell/BLINK/data/'
DATASET = 'medmentions'

OUTPUT_DIR = '/mnt/nfs/scratch1/rangell/BLINK/data/{}/taggerOne'.format(
    DATASET)

if __name__ == '__main__':

    # get tokenizer
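    # the BioBERT v1.1 vocab is cased, hence do_lower_case=False below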
    tokenizer = BertTokenizer(
        '../lerac/coref_entity_linking/models/biobert_v1.1_pubmed/vocab.txt',
        do_lower_case=False)

    # get all test pmids
    with open(TEST_PMIDS_FILE, 'r') as f:
        test_pmids = set(map(lambda x: x.strip(), f.readlines()))

    # get all of the documents
    raw_docs = defaultdict(str)
    gold_mention_labels = {}
    with open(PUBTATOR_FILE, 'r') as f:
        for line in f:
            line_split = line.split('|')
            if len(line_split) == 3:
                if line_split[0] not in test_pmids:
                    continue