n_ctx = train_dataset.n_ctx
batch_size = args.batch_size_per_gpu

n_gpus = torch.cuda.device_count()
if n_gpus > 1:
    # https://pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html
    batch_size *= n_gpus

n_updates_total = (train_size // batch_size) * args.n_epoch

model_stepwise = StepwiseClassifierModel(args, n_classifier=args.n_classes,
                                         vocab_count=args.vocab_count,
                                         extra_block=args.extra_block)

model_opt = OpenAIAdam(model_stepwise.parameters(),
                       lr=args.lr, schedule=args.lr_schedule,
                       warmup=args.lr_warmup, t_total=n_updates_total,
                       b1=args.b1, b2=args.b2, e=args.e,
                       l2=args.l2, vector_l2=args.vector_l2,
                       max_grad_norm=args.max_grad_norm)

epoch_start, epoch_max, loss_best = -1, args.n_epoch, None

if args.checkpoint is None:
    load_openai_pretrained_model(
        model_stepwise.transformer,
        n_special=args.tokens_special,
        n_ctx=n_ctx,  # n_ctx adjusts embedding size to include positional embeddings
        path=pretrained_model_path + '/',
        path_names=os.path.join('.', 'orig', 'pytorch-openai-transformer-lm') + '/',
    )

model_stepwise.to(device)
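# A minimal, self-contained sketch of the step-count arithmetic above. The
# concrete sizes (train_size, batch_size_per_gpu, n_gpus, n_epoch) are
# hypothetical and only illustrate how t_total is derived: OpenAIAdam anneals
# the learning rate against t_total, so it must equal the number of optimizer
# steps, with the effective batch size already scaled for DataParallel.
train_size = 100_000        # assumed number of training examples
batch_size_per_gpu = 8      # assumed per-GPU batch size
n_gpus = 2                  # DataParallel splits each batch across GPUs
n_epoch = 3

effective_batch = batch_size_per_gpu * (n_gpus if n_gpus > 1 else 1)
steps_per_epoch = train_size // effective_batch            # 6250
n_updates_total = steps_per_epoch * n_epoch                # 18750, passed as t_total
print(effective_batch, steps_per_epoch, n_updates_total)   # 16 6250 18750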
# make tokenizer
tokenizer = make_tokenizer(args)

# make model
device = torch.device(args.gpu_ids)
model = make_model(args, device)
model = model.to(device)

# make optimizer
optimizer = OpenAIAdam(model.parameters(),
                       lr=args.lr,
                       schedule='warmup_linear',
                       warmup=0.002,
                       t_total=args.steps,
                       b1=0.9, b2=0.999, e=1e-08,
                       l2=0.01, vector_l2=True,
                       max_grad_norm=args.clip)

# criterion = torch.nn.CrossEntropyLoss()
step = 0
bar = tqdm.tqdm(total=args.steps)
bar.update(0)
best_acc = 0
recoder = Recoder_multi(args)
best_loss = float('inf')
n_train = len(trY)
n_valid = len(vaY)
n_batch_train = args.n_batch * max(n_gpu, 1)
n_updates_total = (n_train // n_batch_train) * args.n_iter

dh_model = DoubleHeadModel(args, clf_token, ('classification', 3), vocab, n_ctx)

criterion = nn.CrossEntropyLoss(reduction='none')
model_opt = OpenAIAdam(dh_model.parameters(),
                       lr=args.lr,
                       schedule=args.lr_schedule,
                       warmup=args.lr_warmup,
                       t_total=n_updates_total,
                       b1=args.b1, b2=args.b2, e=args.e,
                       l2=args.l2, vector_l2=args.vector_l2,
                       max_grad_norm=args.max_grad_norm)
compute_loss_fct = MultipleChoiceLossCompute(criterion, criterion, args.lm_coef, model_opt)

openAIModel = OpenAIModel()
openAIModel.load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special)

dh_model.to(device)
dh_model = nn.DataParallel(dh_model)
n_batch_train = n_batch * n_gpu
n_updates_total = (n_train // n_batch_train) * n_iter
print("n_vocab", n_vocab)
print("n_ctx", n_ctx)
print("vocab", vocab)
print("n_train", n_train, "n_updates_total", n_updates_total)

# declare the model and lm head
model = Model(args, vocab, n_ctx)
lm_head = LMHead(model, args)

# declare loss function and the optimizer
criterion = nn.CrossEntropyLoss(reduction='none')  # TODO check loss functions
model_opt = OpenAIAdam(model.parameters(),
                       lr=lr, schedule=lr_schedule, warmup=lr_warmup,
                       t_total=n_updates_total,
                       b1=b1, b2=b2, e=e,
                       l2=l2, vector_l2=vector_l2,
                       max_grad_norm=max_grad_norm)
compute_loss_fct = LossCompute(criterion, lm_coef, model_opt)

# this part will be changed for multi-GPU support
model.to(device)
lm_head.to(device)

n_updates = 0
n_epochs = 0
make_path(os.path.join(save_dir, desc, 'temp.txt'))

# repeat for n_iter epochs
while n_epochs < n_iter:
    iters = 0
    # split to train and valid
trYt = trY
best_score = 0
n_batch_train = args.n_batch * max(n_gpu, 1)
n_updates_total = (n_train_lm // n_batch_train) * args.n_iter_lm
print(n_updates_total)

criterion = nn.CrossEntropyLoss(reduction='none')
model_opt = OpenAIAdam(dh_model.parameters(),
                       lr=6.25e-5,
                       schedule=args.lr_schedule,
                       warmup=.002,
                       t_total=n_updates_total,
                       b1=args.b1, b2=args.b2, e=args.e,
                       l2=args.l2, vector_l2=args.vector_l2,
                       max_grad_norm=args.max_grad_norm)
compute_loss_fct = ClassificationLossCompute(criterion, criterion, args.lm_coef, model_opt)

for i in range(args.n_iter_lm):
    print("running lm fine-tuning epoch: ", i)
    run_epoch_lm()
    n_epochs += 1
    log_lm(save_dir, desc)
                           target_type)

if config['opt'] == 'adam':
    model_opt = Adam(dh_model.parameters(),
                     lr=config['lr'],
                     betas=(config['b1'], config['b2']),
                     eps=config['eps'])
elif config['opt'] == 'openai_adam':
    n_updates_total = (train_dataloader.dataset.instances.shape[0] // config['batch_size']) * config['n_iter']
    model_opt = OpenAIAdam(dh_model.parameters(),
                           lr=config['lr'],
                           schedule=config['lr_schedule'],
                           warmup=config['lr_warmup'],
                           t_total=n_updates_total,
                           b1=config['b1'], b2=config['b2'], e=config['eps'],
                           l2=config['l2'], vector_l2=config['vector_l2'],
                           max_grad_norm=config['max_grad_norm'])
elif config['opt'] == 'sgd':
    model_opt = SGD(dh_model.parameters(), lr=config['lr'])
else:
    raise NotImplementedError()

dh_model.to(device)

task_file_name = os.path.basename(args.task_path)
task_name = os.path.join(
    os.path.splitext(task_file_name)[0],
    '{}tr__{}val__{}te'.format(
def main(args, pretrain_setting, finetune_setting):
    # Build model and push it to the GPU
    device = torch.device('cuda')
    if args.do_pretrain:
        model = make_model()
    else:
        model = make_model(finetune_setting.load_model_pth)
    model = model.to(device)

    # Build datasets
    if args.do_pretrain:
        pretrain_dataset = make_pretrain_dataset(pretrain_setting,
                                                 pretrain_setting.saved_data_pth,
                                                 pretrain_setting.raw_data_pth,
                                                 pretrain_setting.processed_data_pth)
    if args.do_finetune:
        finetune_dataset, train_data, test_data = make_finetune_dataset(
            saved_data_pth=finetune_setting.saved_data_pth,
            raw_data_pth=finetune_setting.raw_data_pth,
            processed_data_pth=finetune_setting.processed_data_pth)

    if args.do_pretrain:
        num_train_optimization_steps = (len(pretrain_dataset["train"]) * pretrain_setting.epoch_num
                                        // pretrain_setting.batch_size
                                        // pretrain_setting.num_accumulation)
        optimizer = OpenAIAdam(model.parameters(),
                               lr=1e-5,
                               schedule='warmup_linear',
                               warmup=0.002,
                               t_total=num_train_optimization_steps,
                               b1=0.9, b2=0.999, e=1e-08,
                               l2=0.01, vector_l2=True,
                               max_grad_norm=1)
        pretrain.train(model,
                       dataset=pretrain_dataset,
                       optimizer=optimizer,
                       log_path=pretrain_setting.log_pth,
                       best_model_pth=pretrain_setting.best_model_pth,
                       batch_size=pretrain_setting.batch_size,
                       num_accumulation=pretrain_setting.num_accumulation,
                       epoch_num=pretrain_setting.epoch_num)

    if args.do_finetune:
        if args.do_pretrain:
            model = make_model(finetune_setting.load_model_pth)
        num_train_optimization_steps = (len(finetune_dataset["train"]) * finetune_setting.epoch_num
                                        // finetune_setting.batch_size
                                        // finetune_setting.num_accumulation)
        optimizer = OpenAIAdam(model.parameters(),
                               lr=1e-5,
                               schedule='warmup_linear',
                               warmup=0.002,
                               t_total=num_train_optimization_steps,
                               b1=0.9, b2=0.999, e=1e-08,
                               l2=0.01, vector_l2=True,
                               max_grad_norm=1)
        finetune.train(model,
                       dataset=finetune_dataset,
                       test_data=test_data,
                       train_data=train_data,
                       optimizer=optimizer,
                       log_path=finetune_setting.log_pth,
                       gen_path=finetune_setting.gen_pth,
                       best_model_pth=finetune_setting.best_model_pth,
                       batch_size=finetune_setting.batch_size,
                       num_accumulation=finetune_setting.num_accumulation,
                       epoch_num=finetune_setting.epoch_num)
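# Rough sketch of why num_train_optimization_steps above divides by both the
# batch size and num_accumulation: the schedule inside OpenAIAdam advances once
# per optimizer.step(), not once per batch, and with gradient accumulation the
# optimizer only steps every num_accumulation batches. All names below
# (batches, model, optimizer, num_accumulation) are illustrative, not taken
# from the source's pretrain/finetune loops.
optimizer_steps = 0
for i, batch in enumerate(batches):
    loss = model(batch)
    (loss / num_accumulation).backward()   # average the gradient over the window
    if (i + 1) % num_accumulation == 0:
        optimizer.step()                   # the warmup_linear schedule ticks here
        optimizer.zero_grad()
        optimizer_steps += 1
# t_total should therefore be len(batches) // num_accumulation * epoch_num,
# i.e. the number of optimizer steps over the whole run, matching the
# computation above.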
def run_epoch2(train, test):
    train = LM_Dataset(train, batch_size=16)
    test = LM_Dataset(test, batch_size=16)

    opt = OpenAIAdam(dh_model.parameters(),
                     lr=6.25e-5,
                     schedule='warmup_linear',
                     warmup=0.002,
                     t_total=train.n_batches * 3,
                     b1=.9, b2=.999, e=1e-8,
                     l2=0.01, vector_l2=True,
                     max_grad_norm=1)
    # opt = torch.optim.Adam(lr=6.25e-5, params=dh_model.parameters())
    opt = Adam16(lr=6.25e-5, params=dh_model.parameters())
    # opt = torch.optim.SGD(lr=6.25e-5, params=dh_model.parameters())
    opt = FP16_Optimizer(opt, static_loss_scale=1, dynamic_loss_scale=False)

    criterion = nn.CrossEntropyLoss(reduction='none')
    L = LangModelLoss(criterion, opt=opt)

    avg_loss_train, avg_loss_test = 0, 0
    acc_train, acc_test = 0, 0

    for i in tqdm(range(train.n_batches)):
        data = train.next()
        data, mask = transform_data(data)
        data = torch.from_numpy(data).long()
        mask = torch.from_numpy(mask)
        opt.zero_grad()
        if GPU:
            data = data.cuda()
            mask = mask.cuda().half()
        lm_logits, clf_logits = dh_model(data)
        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=False)
        print(loss)
        avg_loss_train += loss
    print('Training Loss: ', avg_loss_train / train.n_batches)

    for i in tqdm(range(test.n_batches)):
        data = test.next()
        data, mask = transform_data(data)
        data = torch.from_numpy(data).long()
        mask = torch.from_numpy(mask)
        opt.zero_grad()
        if GPU:
            data = data.cuda()
            mask = mask.cuda().half()
        lm_logits, clf_logits = dh_model(data)
        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=True)
        avg_loss_test += loss
    print('Test Loss: ', avg_loss_test / test.n_batches)
def run_epoch(train_loader, test_loader):
    opt = OpenAIAdam(dh_model.parameters(),
                     lr=6.25e-5,
                     schedule='warmup_linear',
                     warmup=0.002,
                     t_total=len(train_loader) * 3,
                     b1=.9, b2=.999, e=1e-8,
                     l2=0.01, vector_l2=True,
                     max_grad_norm=1)
    opt = torch.optim.Adam(lr=6.25e-5, params=dh_model.parameters())
    print(half)
    if half:
        opt = Adam16(lr=6.25e-5, params=dh_model.parameters())

    criterion = nn.CrossEntropyLoss(reduction='none')
    L = LangModelLoss(criterion, opt=opt)

    avg_loss_train, avg_loss_test = 0, 0
    acc_train, acc_test = 0, 0

    for (data, mask), target in tqdm(train_loader):
        opt.zero_grad()
        if GPU:
            data = data.cuda()
            target = target.cuda()
            mask = mask.cuda()  # .half()
        if half:
            mask = mask.half()
        lm_logits, clf_logits = dh_model(data)
        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=False)
        print(loss)
        avg_loss_train += loss
    print('Training Loss: ', avg_loss_train / len(train_loader))

    for (data, mask), target in tqdm(test_loader):
        opt.zero_grad()
        if GPU:
            data = data.cuda()
            target = target.cuda()
            mask = mask.cuda()  # .half()
        if half:
            mask = mask.half()
        lm_logits, clf_logits = dh_model(data)
        loss = L(data, mask, lm_logits=lm_logits, only_return_losses=True)
        avg_loss_test += loss
    print('Test Loss: ', avg_loss_test / len(test_loader))
test_set = TestDataset(data_dir, args.dataset, params.num_class)
# sampler = WeightedSampler(data_dir, args.dataset)  # Use weighted sampler instead of random sampler

train_loader = DataLoader(train_set, batch_size=params.batch_size,
                          sampler=RandomSampler(train_set), num_workers=4)
valid_loader = DataLoader(valid_set, batch_size=params.predict_batch,
                          sampler=RandomSampler(valid_set), num_workers=4)
test_loader = DataLoader(test_set, batch_size=params.predict_batch,
                         sampler=RandomSampler(test_set), num_workers=4)
logger.info('Loading complete.')

n_updates_total = (len(train_set) // params.batch_size) * params.num_epochs

optimizer_D = optim.RMSprop(discriminator.parameters(), lr=params.lr_d)
optimizer_G = OpenAIAdam(model.parameters(),
                         lr=params.lr,
                         schedule=params.lr_schedule,
                         warmup=params.lr_warmup,
                         t_total=n_updates_total,
                         b1=0.9, b2=0.999, e=1e-8,
                         l2=0.01, vector_l2=True,
                         max_grad_norm=1)
adversarial_loss = torch.nn.BCELoss()

# Train the model
logger.info('Starting training for {} epoch(s)'.format(params.num_epochs))
train_and_evaluate(model, discriminator, train_loader, valid_loader, test_loader, optimizer_G,
n_train = len(trY)
n_valid = len(vaY)
n_batch_train = args.n_batch * max(n_gpu, 1)
n_updates_total = (n_train // n_batch_train) * args.n_iter

dh_model = DoubleHeadModel(args, clf_token, 'multiple_choice', vocab, n_ctx)

criterion = nn.CrossEntropyLoss(reduction='none')
model_opt = OpenAIAdam(
    params=dh_model.parameters(),
    lr=args.lr,                        # 6.25e-5
    schedule=args.lr_schedule,         # warmup_linear
    warmup=args.lr_warmup,             # 0.002
    t_total=n_updates_total,           # 748
    b1=args.b1,                        # 0.9
    b2=args.b2,                        # 0.999
    e=args.e,                          # 1e-8
    l2=args.l2,                        # 0.01
    vector_l2=args.vector_l2,
    max_grad_norm=args.max_grad_norm   # 1
)
compute_loss_fct = MultipleChoiceLossCompute(criterion, criterion, args.lm_coef, model_opt)

load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special)

dh_model.to(device)
dh_model = nn.DataParallel(dh_model)
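# For intuition about the annotated defaults above (lr=6.25e-5, warmup=0.002,
# t_total=748): 'warmup_linear' ramps the learning rate up over the first
# `warmup` fraction of t_total steps, then decays it linearly to zero. This is
# an approximate, standalone sketch of that shape, not the exact OpenAIAdam
# implementation.
def warmup_linear_multiplier(step, t_total, warmup=0.002):
    x = step / t_total                # training progress in [0, 1]
    if x < warmup:
        return x / warmup             # linear warmup
    return max(0.0, 1.0 - x)          # linear decay to zero

lr = 6.25e-5
t_total = 748
for step in (0, 1, 2, 100, 374, 747):
    print(step, lr * warmup_linear_multiplier(step, t_total))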
def main(args):
    init(args)

    # Constants
    n_ctx = args.n_ctx
    save_dir = os.path.join(args.output_dir, args.experiment_name, "checkpoints")
    desc = args.desc
    data_dir = args.data_dir
    log_dir = os.path.join(args.output_dir, args.experiment_name, "logs")
    train_log_interval = args.train_log_interval
    val_log_interval = args.val_log_interval
    beam = args.beam
    gen_len = args.gen_len
    k = args.k
    decoding_strategy = args.decoding_strategy
    accum_iter = args.accum_iter

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    print("device", device, "n_gpu", n_gpu)
    logger = Logger(log_dir)

    text_encoder = TextEncoder(args.encoder_path, args.vocab_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3

    print("Loading dataset...")
    train_loader = get_loader(os.path.join(data_dir, "train_encoded.jsonl"), args.n_batch,
                              encoder, num_workers=3, shuffle=True)
    val_loader = get_loader(os.path.join(data_dir, "val_encoded.jsonl"), n_gpu,
                            encoder, num_workers=0, shuffle=False, max_size=args.num_val_examples)
    print("Train length: {}, Validation length: {}".format(len(train_loader), len(val_loader)))

    vocab = n_vocab + n_special + n_ctx
    n_updates_total = (len(train_loader) // args.accum_iter) * (args.num_epochs_dat + args.num_epochs_ft)

    dh_model = LMModel(args, vocab=vocab, n_ctx=n_ctx, doc_embed=args.doc_model)

    criterion = nn.CrossEntropyLoss(reduction="none")
    model_opt = OpenAIAdam(dh_model.parameters(),
                           lr=args.lr,
                           schedule=args.lr_schedule,
                           warmup=args.lr_warmup,
                           t_total=n_updates_total,
                           b1=args.b1, b2=args.b2, e=args.e,
                           l2=args.l2, vector_l2=args.vector_l2,
                           max_grad_norm=args.max_grad_norm)
    lm_loss = LMLoss(criterion)
    summary_loss = SummaryLoss(criterion)

    print("Loading Model")
    if args.use_pretrain:
        load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special,
                                     path="./model/", path_names="./")
    start_iter, running_loss = load_checkpoint(args.checkpoint, dh_model, model_opt, vocab, n_ctx)

    dh_model.to(device)
    dh_model = DataParallelModel(dh_model)
    lm_loss = DataParallelCriterion(lm_loss)
    summary_loss = DataParallelCriterion(summary_loss)

    for i in range(args.num_epochs_dat):
        start_iter, running_loss = run_epoch(start_iter, running_loss, dh_model, lm_loss, model_opt,
                                             train_loader, val_loader, train_log_interval, val_log_interval,
                                             device, beam, gen_len, k, decoding_strategy, accum_iter,
                                             "DAT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_dat),
                                             save_dir, logger, text_encoder,
                                             show_progress=args.show_progress, summary_loss=summary_loss)
    for i in range(args.num_epochs_ft):
        start_iter, running_loss = run_epoch(start_iter, running_loss, dh_model, summary_loss, model_opt,
                                             train_loader, val_loader, train_log_interval, val_log_interval,
                                             device, beam, gen_len, k, decoding_strategy, accum_iter,
                                             "FT Training Epoch [{}/{}]".format(i + 1, args.num_epochs_ft),
                                             save_dir, logger, text_encoder,
                                             show_progress=args.show_progress)