# --- Model, loss and optimizer setup --------------------------------------
# Double-head model configured with a ('classification', 3) task head.
# NOTE(review): the 3 is presumably the number of target classes -- confirm
# against DoubleHeadModel.
dh_model = DoubleHeadModel(
    args, clf_token, ('classification', 3), vocab, n_ctx
)

# reduction='none' keeps one loss value per element instead of averaging.
criterion = nn.CrossEntropyLoss(reduction='none')

# Every optimizer hyper-parameter comes from the parsed command-line args,
# except the schedule length, which is the precomputed n_updates_total.
optimizer_settings = dict(
    lr=args.lr,
    schedule=args.lr_schedule,
    warmup=args.lr_warmup,
    t_total=n_updates_total,
    b1=args.b1,
    b2=args.b2,
    e=args.e,
    l2=args.l2,
    vector_l2=args.vector_l2,
    max_grad_norm=args.max_grad_norm,
)
model_opt = OpenAIAdam(dh_model.parameters(), **optimizer_settings)

# The same criterion object is supplied for both criterion arguments.
compute_loss_fct = MultipleChoiceLossCompute(
    criterion, criterion, args.lm_coef, model_opt
)

# Load the pretrained weights into the transformer sub-module *before* the
# model is wrapped, while `dh_model.transformer` is still directly reachable.
openAIModel = OpenAIModel()
openAIModel.load_openai_pretrained_model(
    dh_model.transformer, n_ctx=n_ctx, n_special=n_special
)

dh_model.to(device)
dh_model = nn.DataParallel(dh_model)

# Progress counters, both starting from zero.
n_updates = n_epochs = 0

if dataset != 'stsb':
    trYt = trY

if submit:
    # Write an initial snapshot of the (wrapped) model's state dict.
    path = os.path.join(save_dir, desc, 'state_of_module')
    torch.save(dh_model.state_dict(), make_path(path))
# --- Data preparation -------------------------------------------------------
# Encode the three test inputs; transform_roc returns the (teX, teM) pair.
teX, teM = transform_roc(teX1, teX2, teX3)

n_train = len(trY)
n_valid = len(vaY)
# 8 examples per GPU; max(..., 1) keeps the batch size non-zero when n_gpu is 0.
n_batch_train = 8 * max(n_gpu, 1)
# Single source of truth for the epoch count: it sizes both the total update
# count and the training loop below, so the two can no longer drift apart.
n_iter = 3
n_updates_total = (n_train // n_batch_train) * n_iter

# --- Model, loss and optimizer setup ----------------------------------------
dh_model = Model(clf_token, 'multiple_choice', vocab, n_ctx)

# reduction='none' is the supported spelling of the deprecated reduce=False:
# one loss value per element, no averaging.
criterion = nn.CrossEntropyLoss(reduction='none')

# Fixes: `torch.optim.adam` is a module (calling it raises TypeError) -- the
# optimizer class is `torch.optim.Adam`; and `parameters` must be *called* so
# the optimizer receives an iterable of tensors, not a bound method.
model_opt = torch.optim.Adam(dh_model.parameters(), lr=6.25e-5)

# The same criterion object is supplied for both criterion arguments.
# NOTE(review): 0.5 is presumably the language-model loss coefficient --
# confirm against MultipleChoiceLossCompute.
compute_loss_fct = MultipleChoiceLossCompute(criterion, criterion, 0.5, model_opt)

dh_model.to(device)
dh_model = nn.DataParallel(dh_model)

# --- Training ---------------------------------------------------------------
n_updates = 0
n_epochs = 0
if dataset != 'stsb':
    trYt = trY
if submit:
    # Snapshot the initial weights before any training epoch has run.
    path = os.path.join(save_dir, desc, 'best_params')
    torch.save(dh_model.state_dict(), make_path(path))
best_score = 0
for i in range(n_iter):
    print("running epoch", i)
    run_epoch()