def run_training():
    """Train one model per NFL week and write a JSON training summary.

    For each week returned by ``weeks_to_try()``: read the week's training
    CSV, grid-search a model (10-fold), persist the best estimator to a
    per-week pickle, and collect [week, best score, best params] rows.
    All rows are finally dumped to ``output\\nfl\\html\\trainingdata.json``.
    """
    summary_rows = []
    for week in weeks_to_try():
        file_training = f"output\\nfl\\train\\{week}.csv"
        file_model = f"models\\nfl\\{week}_model.pkl"
        # Drop any stale model so the save below always writes fresh.
        if os.path.isfile(file_model):
            os.remove(file_model)
        _, X, y = common.read_data_from_file(file_training, "home_win",
                                             get_column_names_for_removal())
        grid = train.train_model(X, y, 10)
        best_model = grid.best_estimator_
        train.save_model(best_model, file_model)
        summary_rows.append([week, grid.best_score_, str(grid.best_params_)])
    # BUG FIX: the original bound this dict to the name `dict`, shadowing
    # the builtin; use a descriptive local name instead.
    summary = {"data": summary_rows}
    with open("output\\nfl\\html\\trainingdata.json", 'w') as summary_file:
        json.dump(summary, summary_file)
def main():
    """
    Entry point for training

    Load dataset according to args and train model
    """
    args = Argparser().args
    torch.backends.cudnn.benchmark = True

    # Dataset and loader setup.
    data_path = f'./{args.input_dir}/{args.data_dir}/'
    dataset = ShapeNetDataset(data_path)
    on_cuda = args.device.upper() == 'CUDA'
    worker_count = torch.cuda.device_count() * 4 if on_cuda else 4
    data_loader = DataLoader(dataset=dataset,
                             batch_size=args.batch_size,
                             num_workers=worker_count,
                             shuffle=True,
                             drop_last=True)

    # Checkpoint paths for the discriminator and generator.
    d_path = f'./{args.models_path}/{args.obj}_d.tar'
    g_path = f'./{args.models_path}/{args.obj}_g.tar'
    d_model, g_model, d_optim, g_optim = initialize_model(args, d_path, g_path)

    # Always save model if something goes wrong, disconnects or what not
    try:
        if args.unpac:
            gan, two = '', ''
        else:
            gan, two = 'Pac', '2'
        print(
            f'Training {gan}{args.gan_type.upper()}{two} on {args.device.upper()}'
        )
        training_loop(data_loader, d_model, g_model, d_optim, g_optim, args)
    finally:
        save_model(args.models_path, d_path, g_path, d_model, g_model,
                   d_optim, g_optim, args)
def train(feature_set_name: str, model_name: str, queries_file: str,
          judgments_file: str, index_name: str, features_file: str,
          model_output: str, protected_feature_name="1", gamma=1,
          number_of_iterations=3000, learning_rate=0.001, lambdaa=0.001,
          init_var=0.01, standardize=False, log=None):
    """
    Train and upload model with specified parameters
    """
    # Long timeout: feature collection against Elasticsearch can be slow.
    es_client = elastic_connection(timeout=1000)

    # Step 1: collect judged training features into features_file.
    collect_train_data(es_client, queries_file, judgments_file,
                       feature_set_name, index_name, features_file)

    # Step 2: fit the ranking model on the collected features.
    train_model(features_file, model_output, protected_feature_name, gamma,
                number_of_iterations, learning_rate, lambdaa, init_var,
                standardize, log)

    # Step 3: upload the fitted model under the given feature set.
    save_model(model_name, feature_set_name, model_output)
def merge_and_save_model(worker_results, args):
    # save the model for future evaluation
    merged_counter, model_type = get_model(args)
    # Accumulate per-worker counters and union their vocabularies.
    merged_vocab = set()
    for partial_counter, partial_vocab in worker_results:
        merged_counter += partial_counter
        merged_vocab.update(partial_vocab)
    save_model(merged_counter, model_type, len(merged_vocab), args.output, args)
def run_training(training_csv_path: str, model_name: str,
                 feature_columns: List[str], model_output_path: str,
                 summary_file: str, model, param_grid):
    """Grid-search `model` on the training CSV, persist the best estimator,
    and append its name/score/params as a row to the JSON summary."""
    _, X, y = common.read_data_from_file(training_csv_path, "home_win",
                                         feature_columns)
    grid = train.train_model(X, y, 10, model, param_grid)
    best_estimator = grid.best_estimator_
    train.save_model(best_estimator, model_output_path)
    summary_row = [model_name, f"{grid.best_score_:.4f}", str(grid.best_params_)]
    add_to_json_summary(summary_file, summary_row)
import pandas as pd
from preparing_df import apply_preparing
from cleaning import apply_cleaning
from typo import apply_typo_ratio
from train import split_train_test_data, fit_model, save_model

# Load the two source corpora and build a single labelled dataset.
true = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')
data = apply_preparing(true, fake)

# Preprocessing steps; presumably mutate `data` in place since their
# return values are discarded -- TODO confirm against their definitions.
apply_cleaning(data)
apply_typo_ratio(data)

# Split, fit, persist, and report hold-out accuracy.
x_train, x_test, y_train, y_test = split_train_test_data(data)
pipe = fit_model(x_train, y_train)
save_model(pipe, 'model_test')
print(pipe.score(x_test, y_test))
checkpoint = torch.load(args.model_ori) net.load_state_dict(checkpoint['net_state_dict']) for name, param in net.named_parameters(): print(name) print(param.size()) print('initialization (structured sketching)...') parameters_w, parameters_b, parameters_w_bin = initialize( net, train_loader, loss_func, args.structure, args.subc, args.max_bit) optimizer_b = torch.optim.Adam(parameters_b, weight_decay=args.wd) optimizer_w = ALQ_optimizer(parameters_w, weight_decay=args.wd) val_accuracy = validate(net, val_loader, loss_func) best_acc = val_accuracy[0] test(net, test_loader, loss_func) save_model(args.model, net, optimizer_w, optimizer_b, parameters_w_bin) M_p = (args.pr / args.top_k) / (args.epoch_prune * math.ceil( num_training_sample / args.batch_size)) for r in range(args.R): print('outer iteration: ', r) optimizer_b.param_groups[0]['lr'] = args.lr optimizer_w.param_groups[0]['lr'] = args.lr print('optimizing basis...') for q_epoch in range(args.epoch_basis): optimizer_b.param_groups[0]['lr'] *= args.ld_basis optimizer_w.param_groups[0]['lr'] *= args.ld_basis train_basis(net, train_loader, loss_func, optimizer_w,
from train import save_model
from utils import clean_df
import pandas as pd

# NOTE(review): print statements below are Python 2 syntax; this script
# targets Python 2.
# NOTE(review): `numpy`, `linear_model` (sklearn) and `train_test_split`
# are used but not imported in this snippet -- presumably provided by
# code not shown here; verify.

# Stream the training CSV one row at a time.
df = pd.read_csv('csv/train', header=0, chunksize=1)
# Accumulated hold-out set: 13 feature columns, one label column.
global_X_test = numpy.ndarray((0, 13))
global_y_test = numpy.ndarray((0, ))
clf = linear_model.SGDClassifier()
counter = 0
for chunk in df:
    chunk = clean_df(chunk)
    train_data = chunk.values
    # Column 0 is the label, the rest are features; 30% of each chunk is
    # held out and appended to the global test set.
    X_train, X_test, y_train, y_test = train_test_split(train_data[0::, 1::],
                                                        train_data[0::, 0],
                                                        test_size=0.3,
                                                        random_state=0)
    global_X_test = numpy.concatenate((global_X_test, X_test))
    global_y_test = numpy.concatenate((global_y_test, y_test))
    # Incremental fit; classes must be declared up front for partial_fit.
    clf.partial_fit(X_train, y_train, classes=[0, 1])
    counter += 1
    # Periodically report progress and checkpoint the model.
    if counter % 100 == 0:
        print 'Counter ', counter
        print 'Score', clf.score(global_X_test, global_y_test)
        save_model(clf)
print clf.score(global_X_test, global_y_test)
( kd_loss_teacher, kd_loss_student, kd_loss_backdoor, ) = train.dynamic.train_epoch(models, dataset_train, optimizers,) logger.info("teacher kd loss: {}".format(kd_loss_teacher.numpy())) logger.info("student kd loss: {}".format(kd_loss_student.numpy())) logger.info("backdoor kd loss: {}".format(kd_loss_backdoor.numpy())) eval(models, dataset_test) train.save_models(model_dir, cur_time, models) else: train.load_model(model_dir, "2020-08-03-2247", models["teacher"], "teacher") train.load_model(model_dir, "2020-08-03-2247", models["backdoor"], "backdoor") models["student"] = nets.resnet_v1.get_model(depth=8) optimizers = train.utils.get_opts() logger.debug("Starting static distillation") for epoch_index in range(settings.NUM_EPOCHS): logger.info("static epoch: %d" % (epoch_index + 1)) kd_loss_student = train.static.train_epoch(models, dataset_train, optimizers,) logger.info("student static kd loss: {}".format(kd_loss_student.numpy())) eval(models, dataset_test) train.save_model( model_dir, cur_time, models["student"], "student_static_resnet_v1_8" )
def run_gan(get_data_m, get_generator, get_discriminator, batch_size,
            image_size, iter_n=1000000, init_rate=0.0001,
            logs_dir='logs_tmp/', save_dir='save_tmp/', need_load=False):
    """Build and train a paired-image GAN graph, checkpointing periodically.

    Uses the pre-1.0 TensorFlow API (tf.histogram_summary,
    tf.initialize_all_variables, tf.concat(dim, values), ...).

    get_data_m: () -> (images1, images2), one batch of paired images.
    get_generator / get_discriminator: graph builders; discriminator
    variables are expected under the 'discriminator/' scope and generator
    variables under 'generator/'.
    """
    create_dir(logs_dir)
    create_dir(save_dir)
    save_file_path = os.path.join(save_dir, 'model.ckpt')
    images1 = tf.placeholder(tf.float32, [batch_size] + image_size,
                             name='images1')  # todo: Change to variable shapes
    images2 = tf.placeholder(tf.float32, [batch_size] + image_size,
                             name='images2')
    # Build nets
    G = get_generator(images1)
    D_true = get_discriminator(images1, images2)  # Checks true image pairs
    D_false = get_discriminator(images2, images1, reuse=True)  # Checks false image pairs
    D_gen = get_discriminator(images1, G, reuse=True)  # Checks generated image pairs
    # Partition trainable variables by name scope for the two optimizers.
    d_vars = []
    g_vars = []
    for var in tf.trainable_variables():
        name = var.op.name
        if name.startswith('discriminator/'):
            d_vars.append(var)
        if name.startswith('generator/'):
            g_vars.append(var)
    # C = build_net_m(images1, G, is_training=False)  # Trained classifier
    dt_sum = tf.histogram_summary("dt", D_true)
    df_sum = tf.histogram_summary("df", D_false)
    dg_sum = tf.histogram_summary("dg", D_gen)
    g_sum = tf.image_summary("g", G)
    g_sum_2 = tf.image_summary("left_g", tf.concat(2, [images1, G]))
    # Build losses
    dt_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(D_true, tf.ones_like(D_true)))
    df_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(D_false, tf.zeros_like(D_false)))
    dg_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(D_gen, tf.zeros_like(D_gen)))
    g_loss_discr = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(D_gen, tf.ones_like(D_gen)))
    # g_loss_net = train.build_loss(C, tf.constant(1, dtype=tf.int64, shape=[BATCH_SIZE])) * 50.0
    # Reconstruction regularizers, normalized by batch size.
    g_loss_reg = tf.div(slim.losses.l1_loss(G - images1, weight=2e-6, scope='l1_loss'), batch_size)
    g_loss_l2im2 = tf.div(slim.losses.l2_loss(G - images2, weight=2e-6, scope='l2im2_loss'), batch_size)
    dt_loss_sum = tf.scalar_summary("dt_loss", dt_loss)
    df_loss_sum = tf.scalar_summary("df_loss", df_loss)
    dg_loss_sum = tf.scalar_summary("dg_loss", dg_loss)
    g_loss_discr_sum = tf.scalar_summary("g_loss_discr", g_loss_discr)
    # g_loss_net_sum = tf.scalar_summary("g_loss_net", g_loss_net)
    g_loss_reg_sum = tf.scalar_summary("g_loss_reg", g_loss_reg)
    g_loss_l2im2_sum = tf.scalar_summary("g_loss_l2im2", g_loss_l2im2)
    # NOTE(review): several loss combinations were tried; the active
    # configuration trains G purely on the L2-to-images2 term and D on
    # true + generated pairs only.
    # d_loss = dt_loss * 3.0 + df_loss * 3.0 + dg_loss * 2.0
    d_loss = dt_loss + dg_loss
    # g_loss = g_loss_discr + g_loss_reg + g_loss_l2im2
    # g_loss = g_loss_discr + g_loss_l2im2
    # g_loss = g_loss_reg
    g_loss = g_loss_l2im2
    d_loss_sum = tf.scalar_summary("d_loss", d_loss)
    g_loss_sum = tf.scalar_summary("g_loss", g_loss)
    # Build optimizers
    g_opt = tf.train.AdamOptimizer(init_rate, name='train_G')
    g_grads = g_opt.compute_gradients(g_loss, var_list=g_vars)
    d_opt = tf.train.AdamOptimizer(init_rate, name='train_D')
    d_grads = d_opt.compute_gradients(d_loss, var_list=d_vars)
    d_apply_grad = d_opt.apply_gradients(d_grads)
    g_apply_grad = g_opt.apply_gradients(g_grads)
    # Histogram summaries for all variables and their gradients.
    for var in tf.trainable_variables():
        tf.histogram_summary(var.op.name, var)
    for grad, var in d_grads:
        if grad is not None:
            tf.histogram_summary(var.op.name + '/gradients', grad)
    for grad, var in g_grads:
        if grad is not None:
            tf.histogram_summary(var.op.name + '/gradients', grad)
    # Persistent step counter; each eval of `step` increments it by one.
    step = slim.variables.variable('step_ref', shape=[],
                                   initializer=tf.constant_initializer(0),
                                   dtype=tf.int64, trainable=False)
    step = tf.assign(step, tf.add(step, 1), name='global_step')
    merged_summaries = tf.merge_all_summaries()
    sess = tf.get_default_session()
    # old_variables = []
    # for var in tf.get_collection(slim.variables.VARIABLES_TO_RESTORE):
    #     if (var.op.name.startswith('discriminator') or
    #             var.op.name.startswith('generator') or
    #             var.op.name == 'step_ref'):
    #         pass
    #     else:
    #         old_variables.append(var)
    saver = tf.train.Saver(tf.get_collection(slim.variables.VARIABLES_TO_RESTORE))
    tmp_saver = tf.train.Saver(g_vars)
    # old_saver = tf.train.Saver(old_variables)
    writer = tf.train.SummaryWriter(logs_dir, sess.graph, flush_secs=30)
    sess.run(tf.initialize_all_variables())
    tf.train.start_queue_runners()
    if need_load:
        train.load_model(saver, sess, save_file_path)
    else:
        pass
        # train.load_model(old_saver, sess, 'save_kingstreet/model.ckpt')  # todo: remove kingstreet
    my_print("Starting...\n")
    for i in range(0, iter_n):
        im1, im2 = get_data_m()
        feed = {images1: im1, images2: im2}
        st = step.eval()
        g_loss_val = 0.0
        # Two generator updates per iteration.
        for j in range(2):
            _, g_loss_val = sess.run([g_apply_grad, g_loss], feed)
        # NOTE(review): `and False` makes this branch dead -- the
        # discriminator is never updated in the current configuration.
        if g_loss_val < 5 and False:
            d_apply_grad.run(feed)
        # Summaries every 10 steps; checkpoint every 100 steps.
        if st % 10 == 0:
            summary_str = merged_summaries.eval(feed)
            my_print('Current step: %i\n' % st)
            writer.add_summary(summary_str, st)
        if st % 100 == 0:
            train.save_model(saver, sess, save_file_path)
def pseudo_labeling(num_epochs, model, data_loader, val_loader,
                    unlabeled_loader, device, val_every, file_name):
    """Semi-supervised training loop driven by pseudo labels.

    For every unlabeled batch the current model produces hard pseudo labels
    (argmax over class logits) which are immediately used as training
    targets, weighted by ``alpha_weight(step)``.  Every 50 unlabeled
    batches one full pass over the labeled ``data_loader`` is run.  The
    best model by validation mIoU is saved via ``save_model``.

    Args:
        num_epochs: passes over the unlabeled loader.
        model: segmentation network; assumes 12-class logits — TODO confirm.
        data_loader: labeled loader yielding (images, masks, _).
        val_loader: validation loader, used every ``val_every`` epochs.
        unlabeled_loader: loader yielding (imgs, image_infos), no labels.
        device: torch device.
        val_every: validation frequency in epochs.
        file_name: name passed to ``save_model`` for the best checkpoint.
    """
    # Instead of using the current epoch we use a "step" variable to
    # calculate alpha_weight.  This helps the model converge faster.
    from torch.optim.swa_utils import AveragedModel, SWALR
    from segmentation_models_pytorch.losses import SoftCrossEntropyLoss, JaccardLoss
    from adamp import AdamP

    criterion = [
        SoftCrossEntropyLoss(smooth_factor=0.1),
        JaccardLoss('multiclass', classes=12)
    ]
    optimizer = AdamP(params=model.parameters(), lr=0.0001, weight_decay=1e-6)
    swa_scheduler = SWALR(optimizer, swa_lr=0.0001)
    swa_model = AveragedModel(model)
    optimizer = Lookahead(optimizer, la_alpha=0.5)

    step = 100  # starting value for the alpha_weight schedule
    best_mIoU = 0
    model.train()
    print('Start Pseudo-Labeling..')
    for epoch in range(num_epochs):
        hist = np.zeros((12, 12))
        for batch_idx, (imgs, image_infos) in enumerate(unlabeled_loader):
            # ---- generate pseudo labels for the unlabeled batch ----------
            model.eval()
            outs = model(torch.stack(imgs).to(device))
            oms = torch.argmax(outs.squeeze(), dim=1).detach().cpu().numpy()
            oms = torch.Tensor(oms)
            oms = oms.long()
            oms = oms.to(device)

            # ---- train on the pseudo labels ------------------------------
            model.train()
            imgs = torch.stack(imgs)
            imgs = imgs.to(device)
            output = model(imgs)
            loss = 0
            for each in criterion:
                loss += each(output, oms)
            # Pseudo-label loss is scaled by the alpha schedule.
            unlabeled_loss = alpha_weight(step) * loss

            optimizer.zero_grad()
            unlabeled_loss.backward()
            optimizer.step()

            output = torch.argmax(output.squeeze(), dim=1).detach().cpu().numpy()
            hist = add_hist(hist, oms.detach().cpu().numpy(), output, n_class=12)
            if (batch_idx + 1) % 25 == 0:
                acc, acc_cls, mIoU, fwavacc = label_accuracy_score(hist)
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, mIoU:{:.4f}'.
                      format(epoch + 1, num_epochs, batch_idx + 1,
                             len(unlabeled_loader), unlabeled_loss.item(), mIoU))

            # Every 50 unlabeled batches, train one epoch on labeled data.
            if batch_idx % 50 == 0:
                # BUG FIX: the inner loop previously reused `batch_idx`,
                # shadowing the outer loop counter; use a distinct name.
                for labeled_idx, (images, masks, _) in enumerate(data_loader):
                    labeled_loss = 0
                    images = torch.stack(images)  # (batch, channel, height, width)
                    masks = torch.stack(masks).long()
                    # Move tensors to the compute device.
                    images, masks = images.to(device), masks.to(device)
                    output = model(images)
                    for each in criterion:
                        labeled_loss += each(output, masks)
                    optimizer.zero_grad()
                    labeled_loss.backward()
                    optimizer.step()
                # Now we increment step by 1
                step += 1

        if (epoch + 1) % val_every == 0:
            avrg_loss, val_mIoU = validation(epoch + 1, model, val_loader,
                                             criterion, device)
            if val_mIoU > best_mIoU:
                print('Best performance at epoch: {}'.format(epoch + 1))
                # BUG FIX: the original printed the undefined name
                # `saved_dir` (NameError at runtime); report the actual
                # checkpoint name instead.
                print('Save model as', file_name)
                best_mIoU = val_mIoU
                save_model(model, file_name=file_name)
            model.train()

        # Start SWA averaging after the first few epochs.
        if epoch > 3:
            swa_model.update_parameters(model)
            swa_scheduler.step()
#--Memory cleanup prior to running the memory intensive classifiers--# dfTrn,dfTest,dfAll = utils.data_garbage_collection(dfTrn,dfTest,dfAll) #--use a benchmark instead of a classifier--# benchmark_preds = train.cross_validate_using_benchmark('3.5', dfTrn, dfTrn[0].merge(dfTrn[1],how='inner',on='business_id').as_matrix(),dfTrn[0].ix[:,['rev_stars']].as_matrix(),folds=3,SEED=42,test_size=.15) benchmark_preds = train.cross_validate_using_benchmark('global_mean', dfTrn, dfTrn[0].merge(dfTrn[1],how='inner',on='business_id').as_matrix(),dfTrn[0].ix[:,['rev_stars']].as_matrix(),folds=3,SEED=42,test_size=.15) benchmark_preds = train.cross_validate_using_benchmark('business_mean', dfTrn, dfTrn[0].merge(dfTrn[1],how='inner',on='business_id').as_matrix(),dfTrn[0].ix[:,['rev_stars']].as_matrix(),folds=3,SEED=42,test_size=.15) benchmark_preds = train.cross_validate_using_benchmark('usr_mean', dfTrn, dfTrn[0].merge(dfTrn[2],how='inner',on='user_id').as_matrix(),dfTrn[0].merge(dfTrn[2],how='inner',on='user_id').ix[:,['rev_stars']].as_matrix(),folds=3,SEED=22,test_size=.15) #--predict using a benchmark--# train.save_predictions_benchmark(dfTest_Benchmark_BusMean,'bus_mean',submission_no) train.save_predictions_benchmark(dfTest_Benchmark_UsrMean,'usr_mean',submission_no) train.save_predictions_benchmark(dfTest_Benchmark_BusUsrMean,'bus_usr_mean',submission_no) #--Save model to joblib file--# train.save_model(clf,clf_name) #--Save a dataframe to CSV--# filename = 'Data/'+datetime.now().strftime("%d-%m-%y_%H%M")+'--FinalDataset--OldUserTest'+'.csv' #del dfTest_Master['business_id'];del dfTest_Master['user_id']; #dfTest_Master.ix[:,['RecommendationId','calc_user_avg_stars','calc_user_rev_count']].to_csv(filename, index=False) dfTest_Old[2].to_csv(filename, index=False) #--Save predictions to CSV--# filename = 'Data/'+datetime.now().strftime("%d-%m-%y_%H%M")+'--Pred_ChkBus&Open_LinReg'+'.csv' dfTest_Master['predictions_LinReg'] = [x[0] for x in dfTest_Master.predictions_LinReg] 
dfTest_Master.ix[:,['RecommendationId','predictions_LinReg']].to_csv(filename, index=False) #--Load model from joblib file--# clf = train.load_model('Models/07-07-13_1247--SGD_001_1000.joblib.pk1')
# NOTE(review): Python 2 print statements below; this snippet targets
# Python 2.  `pd`, `numpy`, `linear_model`, `train_test_split`,
# `clean_df` and `save_model` are used but not imported here --
# presumably imported by code not shown; verify.

# Stream the training CSV one row at a time.
df = pd.read_csv('csv/train', header=0, chunksize=1)
# Accumulated hold-out set: 13 feature columns, one label column.
global_X_test = numpy.ndarray((0,13))
global_y_test = numpy.ndarray((0,))
clf = linear_model.SGDClassifier()
counter = 0
for chunk in df:
    chunk = clean_df(chunk)
    train_data = chunk.values
    # Column 0 is the label, the rest are features; 30% of each chunk is
    # held out and appended to the global test set.
    X_train, X_test, y_train, y_test = train_test_split(train_data[0::, 1::],
                                                        train_data[0::, 0],
                                                        test_size=0.3,
                                                        random_state=0)
    global_X_test = numpy.concatenate((global_X_test, X_test))
    global_y_test = numpy.concatenate((global_y_test, y_test))
    # Incremental fit; classes must be declared up front for partial_fit.
    clf.partial_fit(X_train, y_train, classes=[0, 1])
    counter += 1
    # Periodically report progress and checkpoint the model.
    if counter % 100 == 0 :
        print 'Counter ', counter
        print 'Score', clf.score(global_X_test, global_y_test)
        save_model(clf)
print clf.score(global_X_test, global_y_test)
dfTrn[0].merge(dfTrn[2], how='inner', on='user_id').ix[:, ['rev_stars']].as_matrix(), folds=3, SEED=22, test_size=.15) #--predict using a benchmark--# train.save_predictions_benchmark(dfTest_Benchmark_BusMean, 'bus_mean', submission_no) train.save_predictions_benchmark(dfTest_Benchmark_UsrMean, 'usr_mean', submission_no) train.save_predictions_benchmark(dfTest_Benchmark_BusUsrMean, 'bus_usr_mean', submission_no) #--Save model to joblib file--# train.save_model(clf, clf_name) #--Save a dataframe to CSV--# filename = 'Data/' + datetime.now().strftime( "%d-%m-%y_%H%M") + '--FinalDataset--OldUserTest' + '.csv' #del dfTest_Master['business_id'];del dfTest_Master['user_id']; #dfTest_Master.ix[:,['RecommendationId','calc_user_avg_stars','calc_user_rev_count']].to_csv(filename, index=False) dfTest_Old[2].to_csv(filename, index=False) #--Save predictions to CSV--# filename = 'Data/' + datetime.now().strftime( "%d-%m-%y_%H%M") + '--Pred_ChkBus&Open_LinReg' + '.csv' dfTest_Master['predictions_LinReg'] = [ x[0] for x in dfTest_Master.predictions_LinReg ] dfTest_Master.ix[:, ['RecommendationId', 'predictions_LinReg']].to_csv(
def train(opt):
    """Driver for knowledge distillation and structured pruning.

    Depending on flags: ``opt.do_distill`` distills a loaded teacher into a
    freshly prepared student via :func:`distill`; ``opt.do_prune`` restores
    the best student checkpoint, evaluates it, prunes it with
    ``prune_rewire`` and saves the pruned model/tokenizer.
    """
    if torch.cuda.is_available():
        logger.info("%s", torch.cuda.get_device_name(0))

    # set etc
    torch.autograd.set_detect_anomaly(True)

    # prepare teacher config
    teacher_config = load_config(opt, config_path=opt.teacher_config)
    teacher_config['opt'] = opt
    logger.info("[teacher config] :\n%s", teacher_config)

    # prepare student config
    student_config = load_config(opt, config_path=opt.config)
    student_config['opt'] = opt
    logger.info("[student config] :\n%s", student_config)

    # set path
    set_path(teacher_config)

    # prepare train, valid dataset
    train_loader, valid_loader = prepare_datasets(teacher_config)

    # prepare labeled dataset for meta pseudo labels
    mpl_loader = None
    if opt.mpl_data_path:
        mpl_loader, _ = prepare_datasets(teacher_config, train_path=opt.mpl_data_path)

    # -------------------------------------------------------------------------------------------------------
    # distillation
    # -------------------------------------------------------------------------------------------------------
    if opt.do_distill:
        # prepare and load teacher model
        teacher_model = prepare_model(teacher_config, bert_model_name_or_path=opt.teacher_bert_model_name_or_path)
        teacher_checkpoint = load_checkpoint(opt.teacher_model_path, device=opt.device)
        teacher_model.load_state_dict(teacher_checkpoint)
        teacher_model = teacher_model.to(opt.device)
        logger.info("[prepare teacher model and loading done]")

        # prepare student model
        student_model = prepare_model(student_config, bert_model_name_or_path=opt.bert_model_name_or_path)
        logger.info("[prepare student model done]")

        best_eval_metric=None
        global_step, tr_loss, best_eval_metric = distill(teacher_config, teacher_model,
                                                         student_config, student_model,
                                                         train_loader, valid_loader,
                                                         best_eval_metric=best_eval_metric,
                                                         mpl_loader=mpl_loader)
        logger.info(f"[distillation done] global steps: {global_step}, total loss: {tr_loss}, best metric: {best_eval_metric}")
    # -------------------------------------------------------------------------------------------------------

    # -------------------------------------------------------------------------------------------------------
    # structured pruning
    # -------------------------------------------------------------------------------------------------------
    if opt.do_prune:
        # restore model from '--save_path', '--bert_output_dir'
        model = prepare_model(student_config, bert_model_name_or_path=opt.bert_output_dir)
        checkpoint = load_checkpoint(opt.save_path, device=opt.device)
        model.load_state_dict(checkpoint)
        model = model.to(opt.device)
        logger.info("[Restore best student model] : {}, {}".format(opt.bert_output_dir, opt.save_path))

        # Baseline metrics before pruning, for comparison in the logs.
        eval_loss = eval_acc = 0
        eval_loss, eval_acc = evaluate(model, student_config, valid_loader)
        logs = {}
        logs['eval_loss'] = eval_loss
        logs['eval_acc'] = eval_acc
        logger.info("[before pruning] :")
        logger.info(json.dumps({**logs}))

        prune_rewire(student_config, model, valid_loader, use_tqdm=True)

        # save pruned model to '--save_path_pruned', '--bert_output_dir_pruned'
        save_model(student_config, model, save_path=opt.save_path_pruned)
        model.bert_tokenizer.save_pretrained(opt.bert_output_dir_pruned)
        model.bert_model.save_pretrained(opt.bert_output_dir_pruned)
        logger.info("[Pruned model saved] : {}, {}".format(opt.save_path_pruned, opt.bert_output_dir_pruned))
def distill(
        teacher_config, teacher_model, student_config, student_model,
        train_loader, eval_loader, best_eval_metric=None, mpl_loader=None):
    """Distill a teacher model into a student, optionally with meta pseudo labels.

    Per batch: (1) logits distillation via summed MSE; (2) optional
    embedding/last-hidden-state distillation (cosine or MSE) weighted by
    ``args.state_loss_ratio``; (3) optional attention distillation weighted
    by ``args.att_loss_ratio``.  When ``mpl_loader`` is given and warm-up
    has passed, the teacher is additionally updated from the student's
    performance on labeled data.  The best student (by eval accuracy) is
    saved to ``args.save_path`` / ``args.bert_output_dir``.

    Returns:
        (global_step, mean training loss, best_eval_metric)
    """
    args = teacher_config['opt']
    teacher_layer_num = teacher_model.bert_model.config.num_hidden_layers
    student_layer_num = student_model.bert_model.config.num_hidden_layers

    # create teacher optimizer with larger L2 norm
    teacher_optimizer, _, _, _ = prepare_osws(teacher_config, teacher_model, train_loader,
                                              lr=args.mpl_learning_rate, weight_decay=args.mpl_weight_decay)
    # create student optimizer, scheduler, summary writer
    student_optimizer, student_scheduler, writer, _ = prepare_osws(student_config, student_model, train_loader,
                                                                   lr=args.lr, weight_decay=args.weight_decay)

    # prepare loss functions
    def soft_cross_entropy(predicts, targets):
        # Cross entropy between softened teacher targets and student logits.
        likelihood = F.log_softmax(predicts, dim=-1)
        targets_prob = F.softmax(targets, dim=-1)
        return (- targets_prob * likelihood).sum(dim=-1).mean()
    loss_mse_sum = MSELoss(reduction='sum').to(args.device)
    loss_mse = MSELoss().to(args.device)
    loss_cs = CosineSimilarity(dim=2).to(args.device)
    loss_cs_att = CosineSimilarity(dim=3).to(args.device)

    logger.info("***** Running distillation training *****")
    logger.info(" Num Batchs = %d", len(train_loader))
    logger.info(" Num Epochs = %d", args.epoch)
    logger.info(" batch size = %d", args.batch_size)
    logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    tr_loss, logging_loss = 0.0, 0.0
    tr_att_loss = 0.
    tr_rep_loss = 0.
    tr_cls_loss = 0.
    teacher_model.zero_grad()
    student_model.zero_grad()
    epoch_iterator = range(epochs_trained, int(args.epoch))
    # for reproductibility
    set_seed(args)
    for epoch_n in epoch_iterator:
        # Per-epoch loss accumulators.
        tr_att_loss = 0.
        tr_rep_loss = 0.
        tr_cls_loss = 0.
        train_iterator = tqdm(train_loader, desc=f"Epoch {epoch_n}")
        for step, (x, y) in enumerate(train_iterator):
            x = to_device(x, args.device)
            y = to_device(y, args.device)

            # -------------------------------------------------------------------------------------------------------
            # teacher -> student, teaching with teacher_model.eval(), student_model.train()
            # -------------------------------------------------------------------------------------------------------
            att_loss = 0.
            rep_loss = 0.
            cls_loss = 0.
            # teacher model output
            teacher_model.eval()
            with torch.no_grad():
                output_teacher, teacher_bert_outputs = teacher_model(x, return_bert_outputs=True)
            # student model output
            student_model.train()
            output_student, student_bert_outputs = student_model(x, return_bert_outputs=True)

            # Knowledge Distillation loss
            # 1) logits distillation
            '''
            kd_loss = soft_cross_entropy(output_student, output_teacher)
            '''
            kd_loss = loss_mse_sum(output_student, output_teacher)
            loss = kd_loss
            tr_cls_loss += loss.item()

            # 2) embedding and last hidden state distillation
            if args.state_loss_ratio > 0.0:
                teacher_reps = teacher_bert_outputs.hidden_states
                student_reps = student_bert_outputs.hidden_states
                # Only the embedding layer (index 0) and the final layer are matched.
                new_teacher_reps = [teacher_reps[0], teacher_reps[teacher_layer_num]]
                new_student_reps = [student_reps[0], student_reps[student_layer_num]]
                for student_rep, teacher_rep in zip(new_student_reps, new_teacher_reps):
                    # cosine similarity loss
                    if args.state_distill_cs:
                        tmp_loss = 1.0 - loss_cs(student_rep, teacher_rep).mean()
                    # MSE loss
                    else:
                        tmp_loss = loss_mse(student_rep, teacher_rep)
                    rep_loss += tmp_loss
                loss += args.state_loss_ratio * rep_loss
                tr_rep_loss += rep_loss.item()

            # 3) Attentions distillation
            if args.att_loss_ratio > 0.0:
                teacher_atts = teacher_bert_outputs.attentions
                student_atts = student_bert_outputs.attentions
                assert teacher_layer_num == len(teacher_atts)
                assert student_layer_num == len(student_atts)
                assert teacher_layer_num % student_layer_num == 0
                # Map each student layer to the last teacher layer of its block.
                layers_per_block = int(teacher_layer_num / student_layer_num)
                new_teacher_atts = [teacher_atts[i * layers_per_block + layers_per_block - 1]
                                    for i in range(student_layer_num)]
                for student_att, teacher_att in zip(student_atts, new_teacher_atts):
                    # Zero out large-negative attention mask values before comparing.
                    student_att = torch.where(student_att <= -1e2,
                                              torch.zeros_like(student_att).to(args.device),
                                              student_att)
                    teacher_att = torch.where(teacher_att <= -1e2,
                                              torch.zeros_like(teacher_att).to(args.device),
                                              teacher_att)
                    tmp_loss = 1.0 - loss_cs_att(student_att, teacher_att).mean()
                    att_loss += tmp_loss
                loss += args.att_loss_ratio * att_loss
                tr_att_loss += att_loss.item()

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            # back propagate through student model
            loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(student_model.parameters(), args.max_grad_norm)
                student_optimizer.step()     # update student model
                student_scheduler.step()     # Update learning rate schedule
                student_model.zero_grad()
                global_step += 1
            # -------------------------------------------------------------------------------------------------------

            # -------------------------------------------------------------------------------------------------------
            # student -> teacher, performance feedback/update with student_model.eval(), teacher_model.train()
            # -------------------------------------------------------------------------------------------------------
            mpl_loss = 0.0
            if mpl_loader and global_step > args.mpl_warmup_steps:
                loss_cross_entropy = torch.nn.CrossEntropyLoss().to(args.device)
                mpl_iterator = iter(mpl_loader)
                try:
                    (x, y) = next(mpl_iterator)  # draw random sample
                except StopIteration as e:
                    mpl_iterator = iter(mpl_loader)
                    (x, y) = next(mpl_iterator)  # draw random sample
                x = to_device(x, args.device)
                y = to_device(y, args.device)
                # teacher model output
                teacher_model.train()
                output_teacher, teacher_bert_outputs = teacher_model(x, return_bert_outputs=True)
                # updated student model output
                student_model.eval()
                output_student, student_bert_outputs = student_model(x, return_bert_outputs=True)
                # the loss is the performance of the student on the labeled data.
                # additionaly, we add the loss of the teacher on the labeled data for avoiding overfitting.
                mpl_loss = loss_cross_entropy(output_student, y) / 2 + loss_cross_entropy(output_teacher, y) / 2
                if args.gradient_accumulation_steps > 1:
                    mpl_loss = mpl_loss / args.gradient_accumulation_steps
                # back propagate through teacher model
                mpl_loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(teacher_model.parameters(), args.max_grad_norm)
                    teacher_optimizer.step()  # update teacher model
                    teacher_model.zero_grad()
                    student_model.zero_grad()  # clear gradient info which was generated during forward computation.
            # -------------------------------------------------------------------------------------------------------

            train_iterator.set_description(f"Epoch {epoch_n} loss: {loss:.3f}, mpl loss: {mpl_loss:.3f}")
            if writer:
                writer.add_scalar('loss', loss, global_step)
                writer.add_scalar('mpl_loss', mpl_loss, global_step)

            # -------------------------------------------------------------------------------------------------------
            # evaluate student, save model
            # -------------------------------------------------------------------------------------------------------
            flag_eval = False
            logs = {}
            if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                flag_eval = True
            if flag_eval:
                if args.log_evaluate_during_training:
                    eval_loss, eval_acc = evaluate(student_model, student_config, eval_loader)
                    logs['eval_loss'] = eval_loss
                    logs['eval_acc'] = eval_acc
                    if writer:
                        writer.add_scalar('eval_loss', eval_loss, global_step)
                        writer.add_scalar('eval_acc', eval_acc, global_step)
                cls_loss = tr_cls_loss / (step + 1)
                att_loss = tr_att_loss / (step + 1)
                rep_loss = tr_rep_loss / (step + 1)
                loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                learning_rate_scalar = student_scheduler.get_last_lr()[0]
                logs["learning_rate"] = learning_rate_scalar
                logs["avg_loss_since_last_log"] = loss_scalar
                logs['cls_loss'] = cls_loss
                logs['att_loss'] = att_loss
                logs['rep_loss'] = rep_loss
                logging_loss = tr_loss
                logging.info(json.dumps({**logs, **{"step": global_step}}))
                if writer:
                    writer.add_scalar('learning_rate', learning_rate_scalar, global_step)
                    writer.add_scalar('avg_loss_since_last_log', loss_scalar, global_step)
                    writer.add_scalar('cls_loss', cls_loss, global_step)
                    writer.add_scalar('att_loss', att_loss, global_step)
                    writer.add_scalar('rep_loss', rep_loss, global_step)

            flag_eval = False
            if step == 0 and epoch_n != 0:
                flag_eval = True  # every epoch
            if args.eval_and_save_steps > 0 and global_step % args.eval_and_save_steps == 0:
                flag_eval = True
            if flag_eval:
                eval_loss, eval_acc = evaluate(student_model, student_config, eval_loader)
                logs['eval_loss'] = eval_loss
                logs['eval_acc'] = eval_acc
                logger.info(json.dumps({**logs, **{"step": global_step}}))
                if writer:
                    writer.add_scalar('eval_loss', eval_loss, global_step)
                    writer.add_scalar('eval_acc', eval_acc, global_step)
                # measured by accuracy
                curr_eval_metric = eval_acc
                if best_eval_metric is None or curr_eval_metric > best_eval_metric:
                    # save model to '--save_path', '--bert_output_dir'
                    save_model(student_config, student_model, save_path=args.save_path)
                    student_model.bert_tokenizer.save_pretrained(args.bert_output_dir)
                    student_model.bert_model.save_pretrained(args.bert_output_dir)
                    best_eval_metric = curr_eval_metric
                    logger.info("[Best student model saved] : {:10.6f}, {}, {}".format(best_eval_metric, args.bert_output_dir, args.save_path))
            # -------------------------------------------------------------------------------------------------------

    return global_step, tr_loss / global_step, best_eval_metric
# try:
#     s = sys.argv[1]
# except IndexError:
#     s = ""
# create_interactions(s)

epochs = 1
# Ten rounds of: regenerate data, reload, retrain, evaluate, persist.
for round_idx in range(10):
    # Generate new data
    create_interactions(str(round_idx))

    # Load the previously saved model and recompile it.
    model = load_model(name="model")
    model.compile(loss='mse', optimizer=RMSprop())

    # Train on this round's file.
    train_filename = "/ssd/train_extra.csv{}".format(round_idx)
    model, losses = train_model(model, epochs, train_filename, nb_epoch=2)

    # Report training error and run the test pass.
    print("MSE", losses[-1])
    test_filename = "/ssd/test_extra.csv{}".format(round_idx)
    m = test_model(model, test_filename)

    # Save model
    save_model(model, name="model")
    # if m > 0.93:
    #     break
# Similar to our train script, but we do this k times
for k, datasets in enumerate(iterate_folds(fold_sets)):
    train_set, val_set, test_set = datasets

    # Fresh model per fold.
    model = BeatNet(downbeats=args.downbeats)
    if cuda_device is not None:
        model.cuda(args.cuda_device)

    output_file = make_fold_output_name(args.output_file, k)
    train_loader, val_loader, test_loader = make_data_loaders(
        (train_set, val_set, test_set), batch_size=args.batch_size)

    train_loop(model,
               train_loader,
               val_loader=val_loader,
               num_epochs=args.num_epochs,
               cuda_device=cuda_device,
               output_file=output_file,
               davies_stopping_condition=args.davies_stopping_condition,
               fold=k)

    # Persist per-fold artifacts when output locations were given.
    if args.output_file is not None:
        save_model(model, output_file)
    if args.dataset_output_file is not None:
        save_dir = make_fold_output_name(args.dataset_output_file, k)
        save_datasets((train_set, val_set, test_set), save_dir)

    test_model(model, test_loader, cuda_device=cuda_device)