Example #1
def fit_one_cycle(learn: Learner,
                  cyc_len: int,
                  max_lr: Union[Floats, slice] = defaults.lr,
                  moms: Tuple[float, float] = (0.95, 0.85),
                  div_factor: float = 25.,
                  pct_start: float = 0.3,
                  final_div: float = None,
                  wd: float = None,
                  callbacks: Optional[CallbackList] = None,
                  tot_epochs: int = None,
                  start_epoch: int = None,
                  teachers: Optional[list] = None) -> None:
    "Fit a model following the 1cycle policy."
    max_lr = learn.lr_range(max_lr)
    callbacks = listify(callbacks)
    callbacks.append(
        OneCycleScheduler(learn,
                          max_lr,
                          moms=moms,
                          div_factor=div_factor,
                          pct_start=pct_start,
                          final_div=final_div,
                          tot_epochs=tot_epochs,
                          start_epoch=start_epoch))
    learn.fit(cyc_len,
              max_lr,
              wd=wd,
              callbacks=callbacks,
              teachers=teachers)
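
For context, when max_lr is passed as a slice, learn.lr_range expands it into one learning rate per layer group by geometric interpolation. The standalone sketch below reproduces that spacing with plain numpy; the three-group count and the endpoint values are illustrative, not taken from the example above.

import numpy as np

# Hedged sketch of fastai v1's even_mults-style spacing: slice(start, stop)
# becomes a geometric ladder of learning rates, one per layer group.
def spread_lrs(start: float, stop: float, n_groups: int) -> np.ndarray:
    step = (stop / start) ** (1 / (n_groups - 1))
    return np.array([start * step ** i for i in range(n_groups)])

print(spread_lrs(1e-5, 1e-3, 3))  # -> approximately [1e-05, 1e-04, 1e-03]
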
Example #2
def run_ner(
        lang: str = 'eng',
        log_dir: str = 'logs',
        task: str = NER,
        batch_size: int = 1,
        epochs: int = 1,
        dataset: str = 'data/conll-2003/',
        loss: str = 'cross',
        max_seq_len: int = 128,
        do_lower_case: bool = False,
        warmup_proportion: float = 0.1,
        rand_seed: int = None,
        ds_size: int = None,
        data_bunch_path: str = 'data/conll-2003/db',
        tuned_learner: str = None,
        do_train: bool = False,
        do_eval: bool = False,
        save: bool = False,
        nameX: str = 'ner',
        mask: tuple = ('s', 's'),
):
    name = "_".join(
        map(str, [
            nameX, task, lang, mask[0], mask[1], loss, batch_size, max_seq_len,
            do_train, do_eval
        ]))
    log_dir = Path(log_dir)
    log_dir.mkdir(parents=True, exist_ok=True)
    init_logger(log_dir, name)

    if rand_seed:
        random.seed(rand_seed)
        np.random.seed(rand_seed)
        torch.manual_seed(rand_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(rand_seed)

    trainset = dataset + lang + '/train.txt'
    devset = dataset + lang + '/dev.txt'
    testset = dataset + lang + '/test.txt'

    bert_model = 'bert-base-cased' if lang == 'eng' else 'bert-base-multilingual-cased'
    print(f'Lang: {lang}\nModel: {bert_model}\nRun: {name}')
    model = BertForTokenClassification.from_pretrained(bert_model,
                                                       num_labels=len(VOCAB),
                                                       cache_dir='bertm')
    if tuned_learner:
        print('Loading pretrained learner: ', tuned_learner)
        model.bert.load_state_dict(torch.load(tuned_learner))

    model = torch.nn.DataParallel(model)
    model_lr_group = bert_layer_list(model)
    layers = len(model_lr_group)
    kwargs = {'max_seq_len': max_seq_len, 'ds_size': ds_size, 'mask': mask}

    train_dl = DataLoader(dataset=NerDataset(trainset,
                                             bert_model,
                                             train=True,
                                             **kwargs),
                          batch_size=batch_size,
                          shuffle=True,
                          collate_fn=partial(pad, train=True))

    dev_dl = DataLoader(dataset=NerDataset(devset, bert_model, **kwargs),
                        batch_size=batch_size,
                        shuffle=False,
                        collate_fn=pad)

    test_dl = DataLoader(dataset=NerDataset(testset, bert_model, **kwargs),
                         batch_size=batch_size,
                         shuffle=False,
                         collate_fn=pad)

    data = DataBunch(train_dl=train_dl,
                     valid_dl=dev_dl,
                     test_dl=test_dl,
                     collate_fn=pad,
                     path=Path(data_bunch_path))

    train_opt_steps = int(len(train_dl.dataset) / batch_size) * epochs
    optim = BertAdam(model.parameters(),
                     lr=0.01,
                     warmup=warmup_proportion,
                     t_total=train_opt_steps)

    loss_fun = ner_loss_func if loss == 'cross' else partial(ner_loss_func,
                                                             zero=True)
    metrics = [Conll_F1()]

    learn = Learner(
        data,
        model,
        BertAdam,
        loss_func=loss_fun,
        metrics=metrics,
        true_wd=False,
        layer_groups=model_lr_group,
        path='learn' + nameX,
    )

    learn.opt = OptimWrapper(optim)

    # lr decay multiplier between layer groups
    lrm = 1.6

    # select set of starting lrs
    lrs_eng = [0.01, 5e-4, 3e-4, 3e-4, 1e-5]
    lrs_deu = [0.01, 5e-4, 5e-4, 3e-4, 2e-5]

    startlr = lrs_eng if lang == 'eng' else lrs_deu
    results = [['epoch', 'lr', 'f1', 'val_loss', 'train_loss', 'train_losses']]
    if do_train:
        # gradual unfreezing: train the head first, then progressively thaw
        # deeper layer groups with a fresh discriminative lr range each stage
        learn.freeze()
        learn.fit_one_cycle(1, startlr[0], moms=(0.8, 0.7))
        learn.freeze_to(-3)
        lrs = learn.lr_range(slice(startlr[1] / (lrm**15), startlr[1]))
        learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))
        learn.freeze_to(-6)
        lrs = learn.lr_range(slice(startlr[2] / (lrm**15), startlr[2]))
        learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))
        learn.freeze_to(-12)
        lrs = learn.lr_range(slice(startlr[3] / (lrm**15), startlr[3]))
        learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))
        learn.unfreeze()
        lrs = learn.lr_range(slice(startlr[4] / (lrm**15), startlr[4]))
        learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))

    if do_eval:
        res = learn.validate(test_dl, metrics=metrics)
        met_res = [f'{m.__name__}: {r}' for m, r in zip(metrics, res[1:])]
        print(f'Validation on TEST SET:\nloss {res[0]}, {met_res}')
        results.append(['val', '-', res[1], res[0], '-', '-'])

    with open(log_dir / (name + '.csv'), 'a') as resultFile:
        wr = csv.writer(resultFile)
        wr.writerows(results)
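
As a quick sanity check on the scheduling arithmetic above, the snippet below walks through how train_opt_steps is derived and how the warmup proportion translates into a number of warmup steps. The dataset size is illustrative only (roughly the sentence count of CoNLL-2003 English train); the formula itself mirrors the example.

# Hedged arithmetic sketch: total optimizer updates and warmup window for BertAdam.
dataset_size = 14_041        # illustrative sentence count, not read from disk
batch_size = 32
epochs = 3
warmup_proportion = 0.1

train_opt_steps = int(dataset_size / batch_size) * epochs   # 438 * 3 = 1314
warmup_steps = int(train_opt_steps * warmup_proportion)     # 131
print(train_opt_steps, warmup_steps)
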
Example #3
def main():
    parser = ArgumentParser()
    parser.add_argument('--pregenerated_data', type=Path, required=True)
    parser.add_argument('--output_dir', type=Path, required=True)
    parser.add_argument("--bert_model", type=str, required=True,
                        choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
                                 "bert-base-multilingual-cased", "bert-base-chinese"])
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Store training data as on-disc memmaps to massively reduce memory usage")

    parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument('--seed',
                        type=int,
                        default=None,
                        help="random seed for initialization")
    args = parser.parse_args()

    assert args.pregenerated_data.is_dir(), \
        "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"

    samples_per_epoch = []
    for i in range(args.epochs):
        epoch_file = args.pregenerated_data / f"epoch_{i}.json"
        metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
        if epoch_file.is_file() and metrics_file.is_file():
            metrics = json.loads(metrics_file.read_text())
            samples_per_epoch.append(metrics['num_training_examples'])
        else:
            if i == 0:
                exit("No training data was found!")
            print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
            print("This script will loop over the available data, but training diversity may be negatively impacted.")
            num_data_epochs = i
            break
    else:
        num_data_epochs = args.epochs
    print(samples_per_epoch)

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    if args.seed:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(args.seed)

    if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
        logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
    args.output_dir.mkdir(parents=True, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    total_train_examples = 0
    for i in range(args.epochs):
        # The modulo takes into account the fact that we may loop over limited epochs of data
        total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]

    num_train_optimization_steps = int(
        total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)

    # Prepare model
    model = BertForPreTraining.from_pretrained(args.bert_model)
    model = torch.nn.DataParallel(model)

    # Prepare optimizer: the BertAdam class itself is passed to Learner as opt_func
    optimizer = BertAdam

    train_dataloader = DataLoader(
        PregeneratedData(args.pregenerated_data, tokenizer, args.epochs, args.train_batch_size),
        batch_size=args.train_batch_size,
    )

    # the pregenerated train loader doubles as the validation loader here
    data = DataBunch(train_dataloader, train_dataloader)
    global_step = 0
    logging.info("***** Running training *****")
    logging.info(f"  Num examples = {total_train_examples}")
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)
    def loss(x, y):
        # the model forward already returns the pre-training loss, so the fastai
        # loss_func just averages it (e.g. across DataParallel replicas)
        return x.mean()

    learn = Learner(data, model, optimizer,
                    loss_func=loss,
                    true_wd=False,
                    path='learn',
                    layer_groups=bert_layer_list(model),
    )

    lr = args.learning_rate
    layers = len(bert_layer_list(model))
    lrs = learn.lr_range(slice(lr/(2.6**4), lr))
    for epoch in range(args.epochs):
        learn.fit_one_cycle(1, lrs, wd=0.01)
        # save model at half way point
        if epoch == args.epochs//2:
            savem = learn.model.module.bert if hasattr(learn.model, 'module') else learn.model.bert
            output_model_file = args.output_dir / (f"pytorch_fastai_model_{args.bert_model}_{epoch}.bin")
            torch.save(savem.state_dict(), str(output_model_file))
            print(f'Saved bert to {output_model_file}')

    savem = learn.model.module.bert if hasattr(learn.model, 'module') else learn.model.bert
    output_model_file = args.output_dir / (f"pytorch_fastai_model_{args.bert_model}_{args.epochs}.bin")
    torch.save(savem.state_dict(), str(output_model_file))
    print(f'Saved bert to {output_model_file}')
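
The batch-size and step bookkeeping near the top of main() can be easy to misread, so here is a standalone sketch with hypothetical numbers: the requested batch is divided by the accumulation factor before the DataLoader is built, and the optimizer-step count is then derived from that reduced per-forward batch.

# Hedged sketch of the gradient-accumulation bookkeeping (hypothetical numbers).
total_train_examples = 100_000
requested_batch_size = 16
gradient_accumulation_steps = 4

per_forward_batch = requested_batch_size // gradient_accumulation_steps   # 4
num_train_optimization_steps = int(
    total_train_examples / per_forward_batch / gradient_accumulation_steps)
print(per_forward_batch, num_train_optimization_steps)   # 4 6250
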
Example #4
def run_ner(
        lang: str = 'eng',
        log_dir: str = 'logs',
        task: str = NER,
        batch_size: int = 1,
        lr: float = 5e-5,
        epochs: int = 1,
        dataset: str = 'data/conll-2003/',
        loss: str = 'cross',
        max_seq_len: int = 128,
        do_lower_case: bool = False,
        warmup_proportion: float = 0.1,
        grad_acc_steps: int = 1,
        rand_seed: int = None,
        fp16: bool = False,
        loss_scale: float = None,
        ds_size: int = None,
        data_bunch_path: str = 'data/conll-2003/db',
        bertAdam: bool = False,
        freez: bool = False,
        one_cycle: bool = False,
        discr: bool = False,
        lrm: float = 2.6,
        div: int = None,
        tuned_learner: str = None,
        do_train: bool = False,
        do_eval: bool = False,
        save: bool = False,
        name: str = 'ner',
        mask: tuple = ('s', 's'),
):
    name = "_".join(
        map(str, [
            name, task, lang, mask[0], mask[1], loss, batch_size, lr,
            max_seq_len, do_train, do_eval
        ]))

    log_dir = Path(log_dir)
    log_dir.mkdir(parents=True, exist_ok=True)
    init_logger(log_dir, name)

    if rand_seed:
        random.seed(rand_seed)
        np.random.seed(rand_seed)
        torch.manual_seed(rand_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(rand_seed)

    trainset = dataset + lang + '/train.txt'
    devset = dataset + lang + '/dev.txt'
    testset = dataset + lang + '/test.txt'

    bert_model = 'bert-base-cased' if lang == 'eng' else 'bert-base-multilingual-cased'
    print(f'Lang: {lang}\nModel: {bert_model}\nRun: {name}')
    model = BertForTokenClassification.from_pretrained(bert_model,
                                                       num_labels=len(VOCAB),
                                                       cache_dir='bertm')

    model = torch.nn.DataParallel(model)
    model_lr_group = bert_layer_list(model)
    layers = len(model_lr_group)
    kwargs = {'max_seq_len': max_seq_len, 'ds_size': ds_size, 'mask': mask}

    train_dl = DataLoader(dataset=NerDataset(trainset,
                                             bert_model,
                                             train=True,
                                             **kwargs),
                          batch_size=batch_size,
                          shuffle=True,
                          collate_fn=partial(pad, train=True))

    dev_dl = DataLoader(dataset=NerDataset(devset, bert_model, **kwargs),
                        batch_size=batch_size,
                        shuffle=False,
                        collate_fn=pad)

    test_dl = DataLoader(dataset=NerDataset(testset, bert_model, **kwargs),
                         batch_size=batch_size,
                         shuffle=False,
                         collate_fn=pad)

    data = DataBunch(train_dl=train_dl,
                     valid_dl=dev_dl,
                     test_dl=test_dl,
                     collate_fn=pad,
                     path=Path(data_bunch_path))

    loss_fun = ner_loss_func if loss == 'cross' else partial(ner_loss_func,
                                                             zero=True)
    metrics = [Conll_F1()]

    learn = Learner(
        data,
        model,
        BertAdam,
        loss_func=loss_fun,
        metrics=metrics,
        true_wd=False,
        layer_groups=None if not freez else model_lr_group,
        path='learn',
    )

    # initialise bert adam optimiser
    train_opt_steps = int(len(train_dl.dataset) / batch_size) * epochs
    optim = BertAdam(model.parameters(),
                     lr=lr,
                     warmup=warmup_proportion,
                     t_total=train_opt_steps)

    if bertAdam: learn.opt = OptimWrapper(optim)
    else: print("No Bert Adam")

    # load fine-tuned learner
    if tuned_learner:
        print('Loading pretrained learner: ', tuned_learner)
        learn.load(tuned_learner)

    # Uncomment to graph learning rate plot
    # learn.lr_find()
    # learn.recorder.plot(skip_end=15)

    # set lr (discriminative learning rates)
    if div: layers = div
    lrs = lr if not discr else learn.lr_range(slice(lr / lrm**(layers), lr))

    results = [['epoch', 'lr', 'f1', 'val_loss', 'train_loss', 'train_losses']]

    if do_train:
        for epoch in range(epochs):
            if freez:
                # gradual unfreezing: thaw a growing tail of layer groups each
                # epoch (assumes epochs > 1, otherwise this divides by zero)
                lay = (layers // (epochs - 1)) * epoch * -1
                if lay == 0:
                    print('Freeze')
                    learn.freeze()
                elif lay == layers:
                    print('unfreeze')
                    learn.unfreeze()
                else:
                    print('freeze2')
                    learn.freeze_to(lay)
                print('Freezing layers ', lay, ' of ', layers)

            # Fit Learner - eg train model
            if one_cycle: learn.fit_one_cycle(1, lrs, moms=(0.8, 0.7))
            else: learn.fit(1, lrs)

            results.append([
                epoch,
                lrs,
                learn.recorder.metrics[0][0],
                learn.recorder.val_losses[0],
                np.array(learn.recorder.losses).mean(),
                learn.recorder.losses,
            ])

            if save:
                m_path = learn.save(f"{lang}_{epoch}_model", return_path=True)
                print(f'Saved model to {m_path}')
    if save: learn.export(f'{lang}.pkl')

    if do_eval:
        res = learn.validate(test_dl, metrics=metrics)
        met_res = [f'{m.__name__}: {r}' for m, r in zip(metrics, res[1:])]
        print(f'Validation on TEST SET:\nloss {res[0]}, {met_res}')
        results.append(['val', '-', res[1], res[0], '-', '-'])

    with open(log_dir / (name + '.csv'), 'a') as resultFile:
        wr = csv.writer(resultFile)
        wr.writerows(results)
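
To make the freez schedule above easier to follow, this standalone sketch enumerates what lay evaluates to each epoch and which fastai call the branch would pick. The group count of 12 and the four epochs are illustrative; note that with fastai v1 semantics freeze_to(-12) on 12 layer groups leaves nothing frozen, so the final stage trains the whole model even though the lay == layers branch never fires.

# Hedged sketch of the gradual-unfreezing schedule (illustrative counts).
layers = 12   # e.g. number of layer groups from bert_layer_list
epochs = 4    # requires epochs > 1, or the schedule divides by zero

for epoch in range(epochs):
    lay = (layers // (epochs - 1)) * epoch * -1
    if lay == 0:
        plan = 'learn.freeze()'
    elif lay == layers:
        plan = 'learn.unfreeze()'          # never reached: lay is always <= 0
    else:
        plan = f'learn.freeze_to({lay})'
    print(epoch, lay, plan)
# epoch 0: lay =   0 -> learn.freeze()
# epoch 1: lay =  -4 -> learn.freeze_to(-4)
# epoch 2: lay =  -8 -> learn.freeze_to(-8)
# epoch 3: lay = -12 -> learn.freeze_to(-12)  (nothing frozen with 12 groups)
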