Example No. 1
    n_ctx = train_dataset.n_ctx
    
    batch_size = args.batch_size_per_gpu
    n_gpus = torch.cuda.device_count()
    
    if n_gpus > 1:  # https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html
      batch_size *= n_gpus

    n_updates_total = (train_size // batch_size) * args.n_epoch


    model_stepwise = StepwiseClassifierModel(args, n_classifier=args.n_classes, vocab_count=args.vocab_count, extra_block=args.extra_block)

    model_opt = OpenAIAdam(model_stepwise.parameters(),
                           lr=args.lr, schedule=args.lr_schedule, 
                           warmup=args.lr_warmup, t_total=n_updates_total,
                           b1=args.b1, b2=args.b2, e=args.e,
                           l2=args.l2, vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)
                           
    epoch_start, epoch_max, loss_best = -1, args.n_epoch, None

    if args.checkpoint is None:
      load_openai_pretrained_model(
        model_stepwise.transformer, 
        n_special=args.tokens_special,  n_ctx=n_ctx,   # n_ctx adjusts embedding size to include positional
        path=pretrained_model_path+'/',
        path_names=os.path.join('.', 'orig', 'pytorch-openai-transformer-lm')+'/',
      )

    model_stepwise.to(device)
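For reference, a minimal standalone sketch of the arithmetic used above: under nn.DataParallel one optimizer step consumes batch_size_per_gpu * n_gpus samples, so the value passed to OpenAIAdam as t_total is updates-per-epoch times the number of epochs. This is an illustration only, reusing the example's names.

import torch

def count_updates(train_size, batch_size_per_gpu, n_epoch):
    # Effective batch size grows with the number of visible GPUs (1 on CPU).
    n_gpus = max(torch.cuda.device_count(), 1)
    batch_size = batch_size_per_gpu * n_gpus
    updates_per_epoch = train_size // batch_size   # last partial batch dropped
    return updates_per_epoch * n_epoch             # suitable as t_total

# e.g. count_updates(100_000, 8, 3) == 9375 on a 4-GPU machine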
Example No. 2
# make tokenizer
tokenizer = make_tokenizer(args)

# make model
device = torch.device(args.gpu_ids)
model = make_model(args, device)
model = model.to(device)

# make optimizer
optimizer = OpenAIAdam(model.parameters(),
                       lr=args.lr,
                       schedule='warmup_linear',
                       warmup=0.002,
                       t_total=args.steps,
                       b1=0.9,
                       b2=0.999,
                       e=1e-08,
                       l2=0.01,
                       vector_l2=True,
                       max_grad_norm=args.clip)

# criterion = torch.nn.CrossEntropyLoss()

step = 0
bar = tqdm.tqdm(total=args.steps)
bar.update(0)
best_acc = 0
recoder = Recoder_multi(args)

best_loss = float('inf')
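The snippet above stops before the training loop. A hypothetical continuation, sketch only: train_loader, the labels, and the loss function are assumptions; step, bar, model, device, and optimizer come from the example, and OpenAIAdam applies its warmup schedule and gradient clipping inside step().

import torch

criterion = torch.nn.CrossEntropyLoss()

while step < args.steps:
    for inputs, labels in train_loader:          # assumed DataLoader
        inputs, labels = inputs.to(device), labels.to(device)
        loss = criterion(model(inputs), labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()                         # lr schedule advances here
        step += 1
        bar.update(1)
        if step >= args.steps:
            break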
Example No. 3
    n_train = len(trY)
    n_valid = len(vaY)
    n_batch_train = args.n_batch * max(n_gpu, 1)
    n_updates_total = (n_train // n_batch_train) * args.n_iter

    dh_model = DoubleHeadModel(args, clf_token, ('classification', 3), vocab,
                               n_ctx)

    criterion = nn.CrossEntropyLoss(reduction='none')
    model_opt = OpenAIAdam(dh_model.parameters(),
                           lr=args.lr,
                           schedule=args.lr_schedule,
                           warmup=args.lr_warmup,
                           t_total=n_updates_total,
                           b1=args.b1,
                           b2=args.b2,
                           e=args.e,
                           l2=args.l2,
                           vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)
    compute_loss_fct = MultipleChoiceLossCompute(criterion, criterion,
                                                 args.lm_coef, model_opt)
    openAIModel = OpenAIModel()
    openAIModel.load_openai_pretrained_model(dh_model.transformer,
                                             n_ctx=n_ctx,
                                             n_special=n_special)

    dh_model.to(device)
    dh_model = nn.DataParallel(dh_model)
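MultipleChoiceLossCompute combines the classification loss with the auxiliary language-modeling loss weighted by args.lm_coef before stepping the optimizer. A simplified conceptual sketch of that combination (not the helper's actual source):

def combined_loss_step(clf_losses, lm_losses, lm_coef, opt):
    # clf_losses / lm_losses: per-example losses from the two criteria above.
    loss = clf_losses.sum()
    if lm_coef > 0:
        loss = loss + lm_coef * lm_losses.sum()  # auxiliary LM objective
    loss.backward()
    opt.step()              # OpenAIAdam clips and schedules internally
    opt.zero_grad()
    return loss.item()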
Example No. 4
    n_batch_train = n_batch*n_gpu
    n_updates_total = (n_train//n_batch_train)*n_iter

    print("n_vocab", n_vocab)
    print("n_ctx", n_ctx)
    print("vocab", vocab)
    print("n_train", n_train, "n_updates_total", n_updates_total)

    # declare the model and lmhead
    model = Model(args, vocab, n_ctx)
    lm_head = LMHead(model, args)

    # declare loss function and the optimizer
    criterion = nn.CrossEntropyLoss(reduction='none') # TODO check loss functions
    model_opt = OpenAIAdam(model.parameters(), lr=lr, schedule=lr_schedule,
                            warmup=lr_warmup, t_total=n_updates_total, b1=b1,
                            b2=b2, e=e, l2=l2, vector_l2=vector_l2,
                            max_grad_norm=max_grad_norm)
    compute_loss_fct = LossCompute(criterion, lm_coef, model_opt)

    # this part will be changed for multigpu support
    model.to(device)
    lm_head.to(device)

    n_updates = 0
    n_epochs = 0

    make_path(os.path.join(save_dir, desc, 'temp.txt'))
    # repeat for n_iter epochs
    while n_epochs < n_iter:
        iters = 0
        # split to train and valid
Example No. 5
    trYt = trY
    best_score = 0

    n_batch_train = args.n_batch * max(n_gpu, 1)
    n_updates_total = (n_train_lm // n_batch_train) * args.n_iter_lm

    print(n_updates_total)

    criterion = nn.CrossEntropyLoss(reduction='none')
    model_opt = OpenAIAdam(dh_model.parameters(),
                           lr=6.25e-5,
                           schedule=args.lr_schedule,
                           warmup=.002,
                           t_total=n_updates_total,
                           b1=args.b1,
                           b2=args.b2,
                           e=args.e,
                           l2=args.l2,
                           vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)

    compute_loss_fct = ClassificationLossCompute(criterion, criterion,
                                                 args.lm_coef, model_opt)

    for i in range(args.n_iter_lm):
        print("running lm fine-tuning epoch: ", i)
        run_epoch_lm()
        n_epochs += 1
        log_lm(save_dir, desc)
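Here lr=6.25e-5 and warmup=.002 pair with a 'warmup_linear' style schedule, which ramps the learning rate up over the first fraction of t_total steps and then decays it roughly linearly toward zero. An approximate standalone sketch of that multiplier, for illustration only (the exact formula lives inside OpenAIAdam):

def warmup_linear_lr(step, t_total, base_lr=6.25e-5, warmup=0.002):
    progress = step / t_total                  # fraction of training completed
    if progress < warmup:
        scale = progress / warmup              # linear ramp-up
    else:
        scale = max(1.0 - progress, 0.0)       # roughly linear decay to zero
    return base_lr * scale

# e.g. warmup_linear_lr(10, 10_000) is still ramping;
# warmup_linear_lr(9_000, 10_000) has decayed to ~10% of base_lr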
Example No. 6
                          target_type)

    if config['opt'] == 'adam':
        model_opt = Adam(dh_model.parameters(),
                         lr=config['lr'],
                         betas=(config['b1'], config['b2']),
                         eps=config['eps'])
    elif config['opt'] == 'openai_adam':
        n_updates_total = (train_dataloader.dataset.instances.shape[0] //
                           config['batch_size']) * config['n_iter']
        model_opt = OpenAIAdam(dh_model.parameters(),
                               lr=config['lr'],
                               schedule=config['lr_schedule'],
                               warmup=config['lr_warmup'],
                               t_total=n_updates_total,
                               b1=config['b1'],
                               b2=config['b2'],
                               e=config['eps'],
                               l2=config['l2'],
                               vector_l2=config['vector_l2'],
                               max_grad_norm=config['max_grad_norm'])
    elif config['opt'] == 'sgd':
        model_opt = SGD(dh_model.parameters(), lr=config['lr'])
    else:
        raise NotImplementedError()

    dh_model.to(device)

    task_file_name = os.path.basename(args.task_path)
    task_name = os.path.join(
        os.path.splitext(task_file_name)[0], '{}tr__{}val__{}te'.format(
Example No. 7
def main(args,pretrain_setting,finetune_setting):
    # Build Model and push model to GPU
    device = torch.device('cuda')
    if args.do_pretrain:
        model = make_model()
    else:
        model = make_model(finetune_setting.load_model_pth)
    
    model = model.to(device)

    #Build dataset

    if args.do_pretrain:
        pretrain_dataset = make_pretrain_dataset(pretrain_setting,
                                                pretrain_setting.saved_data_pth, 
                                                pretrain_setting.raw_data_pth,
                                                pretrain_setting.processed_data_pth)
    if args.do_finetune:
        finetune_dataset, train_data, test_data = make_finetune_dataset(saved_data_pth = finetune_setting.saved_data_pth,
                                            raw_data_pth = finetune_setting.raw_data_pth, 
                                            processed_data_pth = finetune_setting.processed_data_pth)

    if args.do_pretrain:
        num_train_optimization_steps = len(pretrain_dataset["train"]) * pretrain_setting.epoch_num // pretrain_setting.batch_size // pretrain_setting.num_accumulation
        optimizer = OpenAIAdam(model.parameters(),
                                lr=1e-5,
                                schedule='warmup_linear',
                                warmup=0.002,
                                t_total=num_train_optimization_steps,
                                b1=0.9,
                                b2=0.999,
                                e=1e-08,
                                l2=0.01,
                                vector_l2=True,
                                max_grad_norm=1)
        pretrain.train(model,
                       dataset = pretrain_dataset,
                       optimizer = optimizer,
                       log_path = pretrain_setting.log_pth,
                       best_model_pth = pretrain_setting.best_model_pth,
                       batch_size=pretrain_setting.batch_size,
                       num_accumulation=pretrain_setting.num_accumulation,
                       epoch_num=pretrain_setting.epoch_num)

    if args.do_finetune:
        if args.do_pretrain:
          model = make_model(finetune_setting.load_model_pth)
        num_train_optimization_steps = len(finetune_dataset["train"]) * finetune_setting.epoch_num // finetune_setting.batch_size // finetune_setting.num_accumulation
        optimizer = OpenAIAdam(model.parameters(),
                                lr=1e-5,
                                schedule='warmup_linear',
                                warmup=0.002,
                                t_total=num_train_optimization_steps,
                                b1=0.9,
                                b2=0.999,
                                e=1e-08,
                                l2=0.01,
                                vector_l2=True,
                                max_grad_norm=1)
        finetune.train(model,
                       dataset = finetune_dataset,
                       test_data = test_data,
                       train_data = train_data,
                       optimizer = optimizer,
                       log_path = finetune_setting.log_pth,
                       gen_path = finetune_setting.gen_pth,
                       best_model_pth = finetune_setting.best_model_pth,
                       batch_size=finetune_setting.batch_size,
                       num_accumulation=finetune_setting.num_accumulation,
                       epoch_num=finetune_setting.epoch_num)
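num_train_optimization_steps divides by num_accumulation because gradients are accumulated over several mini-batches before each OpenAIAdam step, so t_total counts optimizer steps rather than mini-batches. A hypothetical sketch of that pattern (train_loader and a model that returns its loss are assumptions):

optimizer.zero_grad()
for i, batch in enumerate(train_loader):        # assumed DataLoader
    loss = model(**batch)                       # assumed to return a scalar loss
    (loss / num_accumulation).backward()        # scale so accumulated grads average
    if (i + 1) % num_accumulation == 0:
        optimizer.step()                        # one scheduled OpenAIAdam update
        optimizer.zero_grad()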
Example No. 8
def run_epoch2(train, test):

    train = LM_Dataset(train, batch_size=16)
    test = LM_Dataset(test, batch_size=16)

    opt = OpenAIAdam(dh_model.parameters(),
                     lr=6.25e-5,
                     schedule='warmup_linear',
                     warmup=0.002,
                     t_total=train.n_batches * 3,
                     b1=.9,
                     b2=.999,
                     e=1e-8,
                     l2=0.01,
                     vector_l2=True,
                     max_grad_norm=1)

    #opt = torch.optim.Adam(lr=6.25e-5,params=dh_model.parameters())
    opt = Adam16(lr=6.25e-5, params=dh_model.parameters())
    #opt = torch.optim.SGD(lr=6.25e-5,params=dh_model.parameters())

    opt = FP16_Optimizer(opt, static_loss_scale=1, dynamic_loss_scale=False)

    criterion = nn.CrossEntropyLoss(reduction='none')

    L = LangModelLoss(criterion, opt=opt)

    avg_loss_train, avg_loss_test = 0, 0
    acc_train, acc_test = 0, 0

    for i in tqdm(range(train.n_batches)):

        data = train.next()
        data, mask = transform_data(data)
        data = torch.from_numpy(data).long()
        mask = torch.from_numpy(mask)

        opt.zero_grad()

        if GPU:
            data = data.cuda()
            mask = mask.cuda().half()

        lm_logits, clf_logits = dh_model(data)

        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=False)
        print(loss)
        avg_loss_train += loss

    print('Training Loss: ', avg_loss_train / train.n_batches)

    for i in tqdm(range(test.n_batches)):

        data = test.next()
        data, mask = transform_data(data)
        data = torch.from_numpy(data).long()
        mask = torch.from_numpy(mask)

        opt.zero_grad()

        if GPU:
            data = data.cuda()
            mask = mask.cuda().half()

        lm_logits, clf_logits = dh_model(data)
        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=True)

        avg_loss_test += loss

    print('Test Loss: ', avg_loss_test / test.n_batches)
Example No. 9
def run_epoch(train_loader, test_loader):

    opt = OpenAIAdam(dh_model.parameters(),
                     lr=6.25e-5,
                     schedule='warmup_linear',
                     warmup=0.002,
                     t_total=len(train_loader) * 3,
                     b1=.9,
                     b2=.999,
                     e=1e-8,
                     l2=0.01,
                     vector_l2=True,
                     max_grad_norm=1)

    opt = torch.optim.Adam(lr=6.25e-5, params=dh_model.parameters())

    print(half)

    if half:

        opt = Adam16(lr=6.25e-5, params=dh_model.parameters())

    criterion = nn.CrossEntropyLoss(reduction='none')

    L = LangModelLoss(criterion, opt=opt)

    avg_loss_train, avg_loss_test = 0, 0
    acc_train, acc_test = 0, 0

    for (data, mask), target in tqdm(train_loader):
        opt.zero_grad()

        if GPU:
            data = data.cuda()
            target = target.cuda()
            mask = mask.cuda()  #.half()

        if half:
            mask = mask.half()

        lm_logits, clf_logits = dh_model(data)

        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=False)
        print(loss)
        avg_loss_train += loss

    print('Training Loss: ', avg_loss_train / len(train_loader))

    for (data, mask), target in tqdm(test_loader):

        opt.zero_grad()

        if GPU:
            data = data.cuda()
            target = target.cuda()
            mask = mask.cuda()  #.half()

        if half:
            mask = mask.half()

        lm_logits, clf_logits = dh_model(data)
        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=True)

        avg_loss_test += loss

    print('Test Loss: ', avg_loss_test / len(test_loader))
Example No. 10
    test_set = TestDataset(data_dir, args.dataset, params.num_class)
    #sampler = WeightedSampler(data_dir, args.dataset) # Use weighted sampler instead of random sampler
    train_loader = DataLoader(train_set, batch_size=params.batch_size, sampler=RandomSampler(train_set), num_workers=4)
    valid_loader = DataLoader(valid_set, batch_size=params.predict_batch, sampler=RandomSampler(valid_set), num_workers=4)
    test_loader = DataLoader(test_set, batch_size=params.predict_batch, sampler=RandomSampler(test_set), num_workers=4)
    logger.info('Loading complete.')

    n_updates_total = (len(train_set) // params.batch_size) * params.num_epochs

    optimizer_D = optim.RMSprop(discriminator.parameters(), lr = params.lr_d)
    optimizer_G = OpenAIAdam(model.parameters(),
                           lr=params.lr,
                           schedule=params.lr_schedule,
                           warmup=params.lr_warmup,
                           t_total=n_updates_total,
                           b1=0.9,
                           b2=0.999,
                           e=1e-8,
                           l2=0.01,
                           vector_l2=True,
                           max_grad_norm=1)

    adversarial_loss = torch.nn.BCELoss()
    # Train the model
    logger.info('Starting training for {} epoch(s)'.format(params.num_epochs))
    train_and_evaluate(model,
                       discriminator,
                       train_loader,
                       valid_loader,
                       test_loader,
                       optimizer_G,
Example No. 11
    n_train = len(trY)
    n_valid = len(vaY)
    n_batch_train = args.n_batch * max(n_gpu, 1)
    n_updates_total = (n_train // n_batch_train) * args.n_iter

    dh_model = DoubleHeadModel(args, clf_token, 'multiple_choice', vocab,
                               n_ctx)

    criterion = nn.CrossEntropyLoss(reduction='none')
    model_opt = OpenAIAdam(
        params=dh_model.parameters(),
        lr=args.lr,  # 6.25e-5
        schedule=args.lr_schedule,  # warmup_linear
        warmup=args.lr_warmup,  # 0.002
        t_total=n_updates_total,  # 748
        b1=args.b1,  # 0.9
        b2=args.b2,  # 0.999
        e=args.e,  # 1e-8
        l2=args.l2,  # 0.01
        vector_l2=args.vector_l2,
        max_grad_norm=args.max_grad_norm  # 1
    )
    compute_loss_fct = MultipleChoiceLossCompute(criterion, criterion,
                                                 args.lm_coef, model_opt)
    load_openai_pretrained_model(dh_model.transformer,
                                 n_ctx=n_ctx,
                                 n_special=n_special)

    dh_model.to(device)
    dh_model = nn.DataParallel(dh_model)
def main(args):
    init(args)

    # Constants
    n_ctx = args.n_ctx
    save_dir = os.path.join(args.output_dir, args.experiment_name, "checkpoints")
    desc = args.desc
    data_dir = args.data_dir
    log_dir = os.path.join(args.output_dir, args.experiment_name, "logs")
    train_log_interval = args.train_log_interval
    val_log_interval = args.val_log_interval
    beam = args.beam
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    logger = Logger(log_dir)

    text_encoder = TextEncoder(args.encoder_path, args.vocab_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3

    print("Loading dataset...")
    train_loader = get_loader(os.path.join(data_dir, "train_encoded.jsonl"), args.n_batch, encoder, num_workers=3, shuffle=True)
    val_loader = get_loader(os.path.join(data_dir, "val_encoded.jsonl"), n_gpu, encoder, num_workers=0, shuffle=False, max_size=args.num_val_examples)
    print("Train length: {}, Validation length: {}".format(len(train_loader), len(val_loader)))

    vocab = n_vocab + n_special + n_ctx
    n_updates_total = (len(train_loader) // args.accum_iter) * (args.num_epochs_dat + args.num_epochs_ft)

    dh_model = LMModel(args, vocab=vocab, n_ctx=n_ctx, doc_embed=args.doc_model)

    criterion = nn.CrossEntropyLoss(reduction="none")
    model_opt = OpenAIAdam(dh_model.parameters(),
                           lr=args.lr,
                           schedule=args.lr_schedule,
                           warmup=args.lr_warmup,
                           t_total=n_updates_total,
                           b1=args.b1,
                           b2=args.b2,
                           e=args.e,
                           l2=args.l2,
                           vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)

    lm_loss = LMLoss(criterion)
    summary_loss = SummaryLoss(criterion)

    print("Loading Model")
    if args.use_pretrain:
        load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special, path="./model/", path_names="./")
    start_iter, running_loss = load_checkpoint(args.checkpoint, dh_model, model_opt, vocab, n_ctx)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)
    lm_loss = DataParallelCriterion(lm_loss)
    summary_loss = DataParallelCriterion(summary_loss)

    for i in range(args.num_epochs_dat):
        start_iter, running_loss = run_epoch(
            start_iter, running_loss, dh_model, lm_loss, model_opt,
            train_loader, val_loader, train_log_interval, val_log_interval,
            device, beam, gen_len, k, decoding_strategy, accum_iter,
            "DAT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_dat),
            save_dir, logger, text_encoder,
            show_progress=args.show_progress, summary_loss=summary_loss)
    for i in range(args.num_epochs_ft):
        start_iter, running_loss = run_epoch(
            start_iter, running_loss, dh_model, summary_loss, model_opt,
            train_loader, val_loader, train_log_interval, val_log_interval,
            device, beam, gen_len, k, decoding_strategy, accum_iter,
            "FT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_ft),
            save_dir, logger, text_encoder,
            show_progress=args.show_progress)
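The vocab = n_vocab + n_special + n_ctx arithmetic above reflects how this model family folds position indices into one embedding table: BPE tokens first, then the added special tokens, then one row per context position. A small illustrative layout (the sizes are typical values, assumed here for illustration):

n_vocab, n_special, n_ctx = 40478, 3, 512      # assumed typical sizes
start_token  = n_vocab + 0                     # '_start_'
delim_token  = n_vocab + 1                     # '_delimiter_'
clf_token    = n_vocab + 2                     # '_classify_'
first_pos_id = n_vocab + n_special             # index of position 0
total_rows   = n_vocab + n_special + n_ctx     # rows in the embedding matrix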