示例#1
0
def run_training():
	"""Train one model per NFL week and write a JSON summary.

	For each week returned by ``weeks_to_try()``:
	  * load the week's training CSV (target column ``home_win``),
	  * grid-search a model via ``train.train_model(X, y, 10)``,
	  * persist the best estimator to ``models\\nfl\\<week>_model.pkl``,
	  * collect ``[week, best_score, str(best_params)]`` for the summary.

	The collected rows are written to ``output\\nfl\\html\\trainingdata.json``
	as ``{"data": [...]}``.
	"""
	models = []

	for f in weeks_to_try():

		file_training = f"output\\nfl\\train\\{f}.csv"
		file_model = f"models\\nfl\\{f}_model.pkl"

		# Remove any stale model first so a failed run cannot leave an old
		# pickle masquerading as this run's output.
		if os.path.isfile(file_model):
			os.remove(file_model)

		_, X, y = common.read_data_from_file(file_training, "home_win", get_column_names_for_removal())

		grid = train.train_model(X, y, 10)

		model = grid.best_estimator_

		train.save_model(model, file_model)

		output = [f, grid.best_score_, str(grid.best_params_)]

		models.append(output)

	# FIX: this variable was named `dict`, shadowing the builtin.
	summary = {"data": models}

	with open("output\\nfl\\html\\trainingdata.json", 'w') as summary_file:
		json.dump(summary, summary_file)
示例#2
0
def main():
    """
    Entry point for training

    Load dataset according to args and train model
    """
    args = Argparser().args
    torch.backends.cudnn.benchmark = True

    # Build the dataset and its loader; CUDA runs get 4 workers per GPU,
    # everything else a flat 4.
    dataset = ShapeNetDataset(f'./{args.input_dir}/{args.data_dir}/')
    if args.device.upper() == 'CUDA':
        worker_count = torch.cuda.device_count() * 4
    else:
        worker_count = 4
    data_loader = DataLoader(dataset=dataset,
                             batch_size=args.batch_size,
                             num_workers=worker_count,
                             shuffle=True,
                             drop_last=True)

    # Checkpoint paths for discriminator and generator.
    d_path = f'./{args.models_path}/{args.obj}_d.tar'
    g_path = f'./{args.models_path}/{args.obj}_g.tar'
    d_model, g_model, d_optim, g_optim = initialize_model(args, d_path, g_path)

    # Always save model if something goes wrong, disconnects or what not
    try:
        if args.unpac:
            gan, two = '', ''
        else:
            gan, two = 'Pac', '2'
        print(
            f'Training {gan}{args.gan_type.upper()}{two} on {args.device.upper()}'
        )
        training_loop(data_loader, d_model, g_model, d_optim, g_optim, args)
    finally:
        save_model(args.models_path, d_path, g_path, d_model, g_model, d_optim,
                   g_optim, args)
def train(feature_set_name: str,
          model_name: str,
          queries_file: str,
          judgments_file: str,
          index_name: str,
          features_file: str,
          model_output: str,
          protected_feature_name="1",
          gamma=1,
          number_of_iterations=3000,
          learning_rate=0.001,
          lambdaa=0.001,
          init_var=0.01,
          standardize=False,
          log=None):
    """Collect training data from Elasticsearch, fit a model, and upload it.

    Three sequential stages: pull query/judgment features into
    ``features_file``, train with the given hyperparameters into
    ``model_output``, then register the model under ``model_name``.
    """
    # Large timeout (1000 — presumably seconds, per the client's default
    # unit) because feature collection over many queries can be slow.
    connection = elastic_connection(timeout=1000)
    collect_train_data(connection, queries_file, judgments_file,
                       feature_set_name, index_name, features_file)

    train_model(features_file, model_output, protected_feature_name, gamma,
                number_of_iterations, learning_rate, lambdaa, init_var,
                standardize, log)

    save_model(model_name, feature_set_name, model_output)
示例#4
0
def merge_and_save_model(worker_results, args):
    """Fold per-worker (counter, vocab) pairs together and persist the model."""
    # save the model for future evaluation
    base_counter, model_type = get_model(args)
    vocab_union = set()
    for partial_counter, partial_vocab in worker_results:
        base_counter += partial_counter
        vocab_union.update(partial_vocab)
    save_model(base_counter, model_type, len(vocab_union), args.output, args)
示例#5
0
def run_training(training_csv_path: str, model_name: str,
                 feature_columns: List[str], model_output_path: str,
                 summary_file: str, model, param_grid):
    """Grid-search `model` on the CSV's `home_win` target, save the best
    estimator, and append `[name, score, params]` to the JSON summary."""
    _, X, y = common.read_data_from_file(training_csv_path, "home_win",
                                         feature_columns)

    # 10-fold grid search over param_grid.
    search = train.train_model(X, y, 10, model, param_grid)
    best_estimator = search.best_estimator_
    train.save_model(best_estimator, model_output_path)

    summary_row = [
        model_name,
        f"{search.best_score_:.4f}",
        str(search.best_params_),
    ]
    add_to_json_summary(summary_file, summary_row)
示例#6
0
import pandas as pd
from preparing_df import apply_preparing
from cleaning import apply_cleaning
from typo import apply_typo_ratio
from train import split_train_test_data, fit_model, save_model

# Load the raw article corpora: genuine articles and fabricated ones.
true = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

# Combine both frames into a single labelled dataset.
data = apply_preparing(true, fake)

# Return values are ignored, so these presumably mutate `data` in place —
# TODO confirm against their definitions.
apply_cleaning(data)

apply_typo_ratio(data)

x_train, x_test, y_train, y_test = split_train_test_data(data)

pipe = fit_model(x_train, y_train)

save_model(pipe, 'model_test')

# Report hold-out score of the fitted pipeline.
print(pipe.score(x_test, y_test))
示例#7
0
        checkpoint = torch.load(args.model_ori)
        net.load_state_dict(checkpoint['net_state_dict'])
        for name, param in net.named_parameters():
            print(name)
            print(param.size())

        print('initialization (structured sketching)...')
        parameters_w, parameters_b, parameters_w_bin = initialize(
            net, train_loader, loss_func, args.structure, args.subc,
            args.max_bit)
        optimizer_b = torch.optim.Adam(parameters_b, weight_decay=args.wd)
        optimizer_w = ALQ_optimizer(parameters_w, weight_decay=args.wd)
        val_accuracy = validate(net, val_loader, loss_func)
        best_acc = val_accuracy[0]
        test(net, test_loader, loss_func)
        save_model(args.model, net, optimizer_w, optimizer_b, parameters_w_bin)

        M_p = (args.pr / args.top_k) / (args.epoch_prune * math.ceil(
            num_training_sample / args.batch_size))

        for r in range(args.R):

            print('outer iteration: ', r)
            optimizer_b.param_groups[0]['lr'] = args.lr
            optimizer_w.param_groups[0]['lr'] = args.lr

            print('optimizing basis...')
            for q_epoch in range(args.epoch_basis):
                optimizer_b.param_groups[0]['lr'] *= args.ld_basis
                optimizer_w.param_groups[0]['lr'] *= args.ld_basis
                train_basis(net, train_loader, loss_func, optimizer_w,
示例#8
0
from train import save_model
from utils import clean_df
import pandas as pd

# Stream the training CSV one row at a time (chunksize=1) so the whole
# dataset never has to fit in memory.
# NOTE(review): `numpy`, `linear_model` and `train_test_split` are used
# below but not imported in this snippet — confirm the surrounding module
# provides them.  The `print` statements are Python 2 syntax.
df = pd.read_csv('csv/train', header=0, chunksize=1)

# Accumulated hold-out set, grown chunk by chunk; 13 feature columns.
global_X_test = numpy.ndarray((0, 13))
global_y_test = numpy.ndarray((0, ))
clf = linear_model.SGDClassifier()

counter = 0
for chunk in df:
    chunk = clean_df(chunk)
    train_data = chunk.values
    # Column 0 is the label; the remaining columns are the features.
    X_train, X_test, y_train, y_test = train_test_split(train_data[0::, 1::],
                                                        train_data[0::, 0],
                                                        test_size=0.3,
                                                        random_state=0)

    global_X_test = numpy.concatenate((global_X_test, X_test))
    global_y_test = numpy.concatenate((global_y_test, y_test))
    # Incremental learning: classes must be declared on partial_fit.
    clf.partial_fit(X_train, y_train, classes=[0, 1])
    counter += 1

    # Periodic progress report against the accumulated hold-out set.
    if counter % 100 == 0:
        print 'Counter ', counter
        print 'Score', clf.score(global_X_test, global_y_test)

save_model(clf)
print clf.score(global_X_test, global_y_test)
示例#9
0
            (
                kd_loss_teacher,
                kd_loss_student,
                kd_loss_backdoor,
            ) = train.dynamic.train_epoch(models, dataset_train, optimizers,)

            logger.info("teacher kd loss: {}".format(kd_loss_teacher.numpy()))
            logger.info("student kd loss: {}".format(kd_loss_student.numpy()))
            logger.info("backdoor kd loss: {}".format(kd_loss_backdoor.numpy()))

            eval(models, dataset_test)
        train.save_models(model_dir, cur_time, models)

    else:
        train.load_model(model_dir, "2020-08-03-2247", models["teacher"], "teacher")
        train.load_model(model_dir, "2020-08-03-2247", models["backdoor"], "backdoor")
        models["student"] = nets.resnet_v1.get_model(depth=8)
        optimizers = train.utils.get_opts()
        logger.debug("Starting static distillation")
        for epoch_index in range(settings.NUM_EPOCHS):
            logger.info("static epoch: %d" % (epoch_index + 1))

            kd_loss_student = train.static.train_epoch(models, dataset_train, optimizers,)
            
            logger.info("student static kd loss: {}".format(kd_loss_student.numpy()))

            eval(models, dataset_test)
        train.save_model(
            model_dir, cur_time, models["student"], "student_static_resnet_v1_8"
        )
示例#10
0
def run_gan(get_data_m, get_generator, get_discriminator, batch_size, image_size,
            iter_n=1000000, init_rate=0.0001, logs_dir='logs_tmp/',
            save_dir='save_tmp/', need_load=False):
    """
    Build and train an image-pair GAN using the legacy TF 0.x graph API.

    build_net_m: (t_images1, t_images2, is_trainable) -> t_logits
    get_data_m: () -> (images1, images2)
    get_generator: images1 -> generated image tensor G
    get_discriminator: (imagesA, imagesB[, reuse]) -> discriminator logits

    Writes summaries to `logs_dir` every 10 steps and a checkpoint to
    `save_dir` every 100 steps.  Requires an active default session
    (`tf.get_default_session()` must not be None).
    """

    create_dir(logs_dir)
    create_dir(save_dir)
    save_file_path = os.path.join(save_dir, 'model.ckpt')

    # Fixed-shape placeholders for the two input image streams.
    images1 = tf.placeholder(tf.float32, [batch_size] + image_size, name='images1')  # todo: Change to variable shapes
    images2 = tf.placeholder(tf.float32, [batch_size] + image_size, name='images2')

    # Build nets
    G = get_generator(images1)

    D_true = get_discriminator(images1, images2)  # Checks true image pairs
    D_false = get_discriminator(images2, images1, reuse=True)         # Checks false image pairs
    D_gen = get_discriminator(images1, G, reuse=True)  # Checks generated image pairs

    # Split trainable variables by name-scope prefix so each optimizer
    # updates only its own sub-network.
    d_vars = []
    g_vars = []
    for var in tf.trainable_variables():
        name = var.op.name
        if name.startswith('discriminator/'):
            d_vars.append(var)
        if name.startswith('generator/'):
            g_vars.append(var)

    # C = build_net_m(images1, G, is_training=False)  # Trained classifier

    # Monitoring: histograms of discriminator outputs, images of samples.
    dt_sum = tf.histogram_summary("dt", D_true)
    df_sum = tf.histogram_summary("df", D_false)
    dg_sum = tf.histogram_summary("dg", D_gen)
    g_sum = tf.image_summary("g", G)
    g_sum_2 = tf.image_summary("left_g", tf.concat(2, [images1, G]))

    # Build losses
    dt_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(D_true, tf.ones_like(D_true)))
    df_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(D_false, tf.zeros_like(D_false)))
    dg_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(D_gen, tf.zeros_like(D_gen)))
    g_loss_discr = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(D_gen, tf.ones_like(D_gen)))
    # g_loss_net = train.build_loss(C, tf.constant(1, dtype=tf.int64, shape=[BATCH_SIZE])) * 50.0
    g_loss_reg = tf.div(slim.losses.l1_loss(G - images1, weight=2e-6, scope='l1_loss'), batch_size)
    g_loss_l2im2 = tf.div(slim.losses.l2_loss(G - images2, weight=2e-6, scope='l2im2_loss'), batch_size)

    dt_loss_sum = tf.scalar_summary("dt_loss", dt_loss)
    df_loss_sum = tf.scalar_summary("df_loss", df_loss)
    dg_loss_sum = tf.scalar_summary("dg_loss", dg_loss)
    g_loss_discr_sum = tf.scalar_summary("g_loss_discr", g_loss_discr)
    # g_loss_net_sum = tf.scalar_summary("g_loss_net", g_loss_net)
    g_loss_reg_sum = tf.scalar_summary("g_loss_reg", g_loss_reg)
    g_loss_l2im2_sum = tf.scalar_summary("g_loss_l2im2", g_loss_l2im2)

    # Active loss configuration; commented lines are kept experiments.
    # d_loss = dt_loss * 3.0 + df_loss * 3.0 + dg_loss * 2.0
    d_loss = dt_loss + dg_loss
    # g_loss = g_loss_discr + g_loss_reg + g_loss_l2im2
    # g_loss = g_loss_discr + g_loss_l2im2
    # g_loss = g_loss_reg
    g_loss = g_loss_l2im2

    d_loss_sum = tf.scalar_summary("d_loss", d_loss)
    g_loss_sum = tf.scalar_summary("g_loss", g_loss)

    # Build optimizers
    g_opt = tf.train.AdamOptimizer(init_rate, name='train_G')
    g_grads = g_opt.compute_gradients(g_loss, var_list=g_vars)
    d_opt = tf.train.AdamOptimizer(init_rate, name='train_D')
    d_grads = d_opt.compute_gradients(d_loss, var_list=d_vars)

    d_apply_grad = d_opt.apply_gradients(d_grads)
    g_apply_grad = g_opt.apply_gradients(g_grads)

    # Histogram summaries for every variable and every non-None gradient.
    for var in tf.trainable_variables():
        tf.histogram_summary(var.op.name, var)
    for grad, var in d_grads:
        if grad is not None:
            tf.histogram_summary(var.op.name + '/gradients', grad)
    for grad, var in g_grads:
        if grad is not None:
            tf.histogram_summary(var.op.name + '/gradients', grad)

    # Persisted step counter; `step` is rebound to the assign op, so each
    # `step.eval()` below also increments the counter.
    step = slim.variables.variable('step_ref', shape=[], initializer=tf.constant_initializer(0), dtype=tf.int64,
                                   trainable=False)
    step = tf.assign(step, tf.add(step, 1), name='global_step')

    merged_summaries = tf.merge_all_summaries()

    sess = tf.get_default_session()

    # old_variables = []
    # for var in tf.get_collection(slim.variables.VARIABLES_TO_RESTORE):
    #     if (var.op.name.startswith('discriminator') or
    #             var.op.name.startswith('generator') or
    #             var.op.name == 'step_ref'):
    #         pass
    #     else:
    #         old_variables.append(var)

    saver = tf.train.Saver(tf.get_collection(slim.variables.VARIABLES_TO_RESTORE))
    # tmp_saver = tf.train.Saver(g_vars)
    # old_saver = tf.train.Saver(old_variables)
    writer = tf.train.SummaryWriter(logs_dir, sess.graph, flush_secs=30)

    sess.run(tf.initialize_all_variables())

    tf.train.start_queue_runners()

    if need_load:
        train.load_model(saver, sess, save_file_path)
    else:
        pass
        # train.load_model(old_saver, sess, 'save_kingstreet/model.ckpt') # todo: remove kingstreet

    my_print("Starting...\n")

    for i in range(0, iter_n):
        im1, im2 = get_data_m()
        feed = {
            images1: im1,
            images2: im2
        }
        st = step.eval()
        g_loss_val = 0.0
        # Two generator updates per outer iteration.
        for j in range(2):
            _, g_loss_val = sess.run([g_apply_grad, g_loss], feed)
        # NOTE(review): `and False` makes this branch dead code — the
        # discriminator is never actually updated in this configuration.
        if g_loss_val < 5 and False:
            d_apply_grad.run(feed)
        if st % 10 == 0:
            summary_str = merged_summaries.eval(feed)
            my_print('Current step: %i\n' % st)
            writer.add_summary(summary_str, st)

        if st % 100 == 0:
            train.save_model(saver, sess, save_file_path)
示例#11
0
def pseudo_labeling(num_epochs, model, data_loader, val_loader,
                    unlabeled_loader, device, val_every, file_name):
    """Semi-supervised segmentation training with pseudo-labels.

    For each unlabeled batch: run the model in eval mode to produce pseudo
    labels, then train on them with a loss scaled by ``alpha_weight(step)``.
    Every 50 unlabeled batches, one full pass over the labeled
    ``data_loader`` is run and ``step`` is incremented.  Validates every
    ``val_every`` epochs (saving on mIoU improvement) and accumulates SWA
    weights after epoch 3.
    """
    # Instead of using current epoch we use a "step" variable to calculate alpha_weight
    # This helps the model converge faster
    from torch.optim.swa_utils import AveragedModel, SWALR
    from segmentation_models_pytorch.losses import SoftCrossEntropyLoss, JaccardLoss
    from adamp import AdamP

    # Combined loss: smoothed CE + Jaccard over the 12 classes.
    criterion = [
        SoftCrossEntropyLoss(smooth_factor=0.1),
        JaccardLoss('multiclass', classes=12)
    ]
    optimizer = AdamP(params=model.parameters(), lr=0.0001, weight_decay=1e-6)
    swa_scheduler = SWALR(optimizer, swa_lr=0.0001)
    swa_model = AveragedModel(model)
    optimizer = Lookahead(optimizer, la_alpha=0.5)

    step = 100
    size = 256
    best_mIoU = 0
    model.train()
    print('Start Pseudo-Labeling..')
    for epoch in range(num_epochs):
        hist = np.zeros((12, 12))
        for batch_idx, (imgs, image_infos) in enumerate(unlabeled_loader):

            # Forward pass in eval mode to obtain pseudo labels for the
            # unlabeled batch.
            model.eval()
            outs = model(torch.stack(imgs).to(device))
            oms = torch.argmax(outs.squeeze(), dim=1).detach().cpu().numpy()
            oms = torch.Tensor(oms)
            oms = oms.long()
            oms = oms.to(device)

            # --------------------------------------------- training phase

            model.train()
            # Now calculate the unlabeled loss using the pseudo label
            imgs = torch.stack(imgs)
            imgs = imgs.to(device)

            output = model(imgs)
            loss = 0
            for each in criterion:
                loss += each(output, oms)

            # Ramp the unlabeled loss by the pseudo-label confidence schedule.
            unlabeled_loss = alpha_weight(step) * loss

            # Backpropogate
            optimizer.zero_grad()
            unlabeled_loss.backward()
            optimizer.step()
            output = torch.argmax(output.squeeze(),
                                  dim=1).detach().cpu().numpy()
            hist = add_hist(hist,
                            oms.detach().cpu().numpy(),
                            output,
                            n_class=12)

            if (batch_idx + 1) % 25 == 0:
                acc, acc_cls, mIoU, fwavacc = label_accuracy_score(hist)
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, mIoU:{:.4f}'.
                      format(epoch + 1, num_epochs, batch_idx + 1,
                             len(unlabeled_loader), unlabeled_loss.item(),
                             mIoU))
            # Every 50 unlabeled batches, train one epoch on labeled data.
            if batch_idx % 50 == 0:

                # Normal training procedure.
                # BUG FIX: this loop previously used `batch_idx` as its own
                # loop variable, clobbering the outer loop's counter and
                # corrupting the progress logging and %50 scheduling above.
                # The index was unused, so enumerate() is dropped entirely.
                for images, masks, _ in data_loader:
                    labeled_loss = 0
                    images = torch.stack(images)
                    # (batch, channel, height, width)
                    masks = torch.stack(masks).long()

                    # Move tensors to the target device.
                    images, masks = images.to(device), masks.to(device)

                    output = model(images)

                    for each in criterion:
                        labeled_loss += each(output, masks)

                    optimizer.zero_grad()
                    labeled_loss.backward()
                    optimizer.step()

                # Now we increment step by 1
                step += 1

        if (epoch + 1) % val_every == 0:
            avrg_loss, val_mIoU = validation(epoch + 1, model, val_loader,
                                             criterion, device)
            if val_mIoU > best_mIoU:
                print('Best performance at epoch: {}'.format(epoch + 1))
                # NOTE(review): `saved_dir` is not defined in this function —
                # presumably a module-level global; confirm.
                print('Save model in', saved_dir)
                best_mIoU = val_mIoU
                save_model(model, file_name=file_name)

        model.train()

        # After the warm-up epochs, accumulate SWA weights.
        if epoch > 3:
            swa_model.update_parameters(model)
            swa_scheduler.step()
#--Memory cleanup prior to running the memory intensive classifiers--#
dfTrn,dfTest,dfAll = utils.data_garbage_collection(dfTrn,dfTest,dfAll)

#--use a benchmark instead of a classifier--#
# Each call cross-validates a trivial baseline ('3.5' constant, global /
# business / user mean) against review-star targets; only the last
# assignment to benchmark_preds survives.
# NOTE(review): `.ix` and `.as_matrix()` are long-removed pandas APIs —
# this snippet only runs on very old pandas versions.
benchmark_preds = train.cross_validate_using_benchmark('3.5', dfTrn, dfTrn[0].merge(dfTrn[1],how='inner',on='business_id').as_matrix(),dfTrn[0].ix[:,['rev_stars']].as_matrix(),folds=3,SEED=42,test_size=.15)
benchmark_preds = train.cross_validate_using_benchmark('global_mean', dfTrn, dfTrn[0].merge(dfTrn[1],how='inner',on='business_id').as_matrix(),dfTrn[0].ix[:,['rev_stars']].as_matrix(),folds=3,SEED=42,test_size=.15)
benchmark_preds = train.cross_validate_using_benchmark('business_mean', dfTrn, dfTrn[0].merge(dfTrn[1],how='inner',on='business_id').as_matrix(),dfTrn[0].ix[:,['rev_stars']].as_matrix(),folds=3,SEED=42,test_size=.15)
benchmark_preds = train.cross_validate_using_benchmark('usr_mean', dfTrn, dfTrn[0].merge(dfTrn[2],how='inner',on='user_id').as_matrix(),dfTrn[0].merge(dfTrn[2],how='inner',on='user_id').ix[:,['rev_stars']].as_matrix(),folds=3,SEED=22,test_size=.15)

#--predict using a benchmark--#
train.save_predictions_benchmark(dfTest_Benchmark_BusMean,'bus_mean',submission_no)
train.save_predictions_benchmark(dfTest_Benchmark_UsrMean,'usr_mean',submission_no)
train.save_predictions_benchmark(dfTest_Benchmark_BusUsrMean,'bus_usr_mean',submission_no)

#--Save model to joblib file--#
train.save_model(clf,clf_name)

#--Save a dataframe to CSV--#
# Timestamped output path, e.g. Data/07-07-13_1247--FinalDataset--OldUserTest.csv
filename = 'Data/'+datetime.now().strftime("%d-%m-%y_%H%M")+'--FinalDataset--OldUserTest'+'.csv'
#del dfTest_Master['business_id'];del dfTest_Master['user_id'];
#dfTest_Master.ix[:,['RecommendationId','calc_user_avg_stars','calc_user_rev_count']].to_csv(filename, index=False)
dfTest_Old[2].to_csv(filename, index=False)

#--Save predictions to CSV--#
filename = 'Data/'+datetime.now().strftime("%d-%m-%y_%H%M")+'--Pred_ChkBus&Open_LinReg'+'.csv'
# Unwrap single-element prediction arrays into scalars before export.
dfTest_Master['predictions_LinReg'] = [x[0] for x in dfTest_Master.predictions_LinReg]
dfTest_Master.ix[:,['RecommendationId','predictions_LinReg']].to_csv(filename, index=False)

#--Load model from joblib file--#
clf = train.load_model('Models/07-07-13_1247--SGD_001_1000.joblib.pk1')

# Stream the training CSV one row at a time (chunksize=1) so the whole
# dataset never has to fit in memory.
# NOTE(review): `numpy`, `linear_model`, `train_test_split`, `clean_df`
# and `save_model` are not imported in this snippet — confirm the
# surrounding module provides them.  `print` statements are Python 2.
df = pd.read_csv('csv/train', header=0, chunksize=1)


# Accumulated hold-out set, grown chunk by chunk; 13 feature columns.
global_X_test = numpy.ndarray((0,13))
global_y_test = numpy.ndarray((0,))
clf = linear_model.SGDClassifier()

counter = 0
for chunk in df:
	chunk = clean_df(chunk)
	train_data = chunk.values
	# Column 0 is the label; the remaining columns are the features.
	X_train, X_test, y_train, y_test = train_test_split(train_data[0::, 1::], train_data[0::, 0],
                                                    test_size=0.3, random_state=0)

	global_X_test = numpy.concatenate((global_X_test, X_test))
	global_y_test = numpy.concatenate((global_y_test, y_test))
	# Incremental learning: classes must be declared on partial_fit.
	clf.partial_fit(X_train, y_train, classes=[0, 1])
	counter += 1

	# Periodic progress report against the accumulated hold-out set.
	if counter % 100 == 0 :
		print 'Counter ', counter
		print 'Score', clf.score(global_X_test, global_y_test)

save_model(clf)
print clf.score(global_X_test, global_y_test)



    dfTrn[0].merge(dfTrn[2], how='inner',
                   on='user_id').ix[:, ['rev_stars']].as_matrix(),
    folds=3,
    SEED=22,
    test_size=.15)

#--predict using a benchmark--#
# Persist baseline predictions for each benchmark variant under the
# current submission number.
train.save_predictions_benchmark(dfTest_Benchmark_BusMean, 'bus_mean',
                                 submission_no)
train.save_predictions_benchmark(dfTest_Benchmark_UsrMean, 'usr_mean',
                                 submission_no)
train.save_predictions_benchmark(dfTest_Benchmark_BusUsrMean, 'bus_usr_mean',
                                 submission_no)

#--Save model to joblib file--#
train.save_model(clf, clf_name)

#--Save a dataframe to CSV--#
# Timestamped output path, e.g. Data/07-07-13_1247--FinalDataset--OldUserTest.csv
filename = 'Data/' + datetime.now().strftime(
    "%d-%m-%y_%H%M") + '--FinalDataset--OldUserTest' + '.csv'
#del dfTest_Master['business_id'];del dfTest_Master['user_id'];
#dfTest_Master.ix[:,['RecommendationId','calc_user_avg_stars','calc_user_rev_count']].to_csv(filename, index=False)
dfTest_Old[2].to_csv(filename, index=False)

#--Save predictions to CSV--#
filename = 'Data/' + datetime.now().strftime(
    "%d-%m-%y_%H%M") + '--Pred_ChkBus&Open_LinReg' + '.csv'
# Unwrap single-element prediction arrays into scalars before export.
dfTest_Master['predictions_LinReg'] = [
    x[0] for x in dfTest_Master.predictions_LinReg
]
dfTest_Master.ix[:, ['RecommendationId', 'predictions_LinReg']].to_csv(
示例#15
0
def train(opt):
    """Knowledge-distillation / structured-pruning driver.

    Flow (each stage gated by an ``opt`` flag):
      * load teacher and student configs,
      * build train/valid loaders (plus an optional MPL loader),
      * ``opt.do_distill``: load the teacher checkpoint and distill it into
        a freshly prepared student,
      * ``opt.do_prune``: restore the best student, log baseline metrics,
        prune/rewire it, and save the pruned model and tokenizer.
    """
    if torch.cuda.is_available():
        logger.info("%s", torch.cuda.get_device_name(0))

    # set etc
    torch.autograd.set_detect_anomaly(True)

    # prepare teacher config
    teacher_config = load_config(opt, config_path=opt.teacher_config)
    teacher_config['opt'] = opt
    logger.info("[teacher config] :\n%s", teacher_config)

    # prepare student config
    student_config = load_config(opt, config_path=opt.config)
    student_config['opt'] = opt
    logger.info("[student config] :\n%s", student_config)

    # set path
    set_path(teacher_config)

    # prepare train, valid dataset
    train_loader, valid_loader = prepare_datasets(teacher_config)

    # prepare labeled dataset for meta pseudo labels
    mpl_loader = None
    if opt.mpl_data_path:
        mpl_loader, _ = prepare_datasets(teacher_config, train_path=opt.mpl_data_path)

    # -------------------------------------------------------------------------------------------------------
    # distillation
    # -------------------------------------------------------------------------------------------------------
    if opt.do_distill:
        # prepare and load teacher model from its checkpoint
        teacher_model = prepare_model(teacher_config, bert_model_name_or_path=opt.teacher_bert_model_name_or_path)
        teacher_checkpoint = load_checkpoint(opt.teacher_model_path, device=opt.device)
        teacher_model.load_state_dict(teacher_checkpoint)
        teacher_model = teacher_model.to(opt.device)
        logger.info("[prepare teacher model and loading done]")

        # prepare student model (fresh weights; trained by distill below)
        student_model = prepare_model(student_config, bert_model_name_or_path=opt.bert_model_name_or_path)
        logger.info("[prepare student model done]")

        best_eval_metric=None
        global_step, tr_loss, best_eval_metric = distill(teacher_config,
                teacher_model,
                student_config,
                student_model,
                train_loader,
                valid_loader,
                best_eval_metric=best_eval_metric,
                mpl_loader=mpl_loader)
        logger.info(f"[distillation done] global steps: {global_step}, total loss: {tr_loss}, best metric: {best_eval_metric}")
    # -------------------------------------------------------------------------------------------------------


    # -------------------------------------------------------------------------------------------------------
    # structured pruning
    # -------------------------------------------------------------------------------------------------------
    if opt.do_prune:
        # restore model from '--save_path', '--bert_output_dir'
        model = prepare_model(student_config, bert_model_name_or_path=opt.bert_output_dir)
        checkpoint = load_checkpoint(opt.save_path, device=opt.device)
        model.load_state_dict(checkpoint)
        model = model.to(opt.device)
        logger.info("[Restore best student model] : {}, {}".format(opt.bert_output_dir, opt.save_path))

        # baseline metrics before pruning, for comparison in the logs
        eval_loss = eval_acc = 0
        eval_loss, eval_acc = evaluate(model, student_config, valid_loader)
        logs = {}
        logs['eval_loss'] = eval_loss
        logs['eval_acc'] = eval_acc
        logger.info("[before pruning] :")
        logger.info(json.dumps({**logs}))

        prune_rewire(student_config, model, valid_loader, use_tqdm=True)

        # save pruned model to '--save_path_pruned', '--bert_output_dir_pruned'
        save_model(student_config, model, save_path=opt.save_path_pruned)
        model.bert_tokenizer.save_pretrained(opt.bert_output_dir_pruned)
        model.bert_model.save_pretrained(opt.bert_output_dir_pruned)
        logger.info("[Pruned model saved] : {}, {}".format(opt.save_path_pruned, opt.bert_output_dir_pruned))
示例#16
0
def distill(
        teacher_config,
        teacher_model,
        student_config,
        student_model,
        train_loader,
        eval_loader,
        best_eval_metric=None,
        mpl_loader=None):
    """Distill a teacher model into a smaller student model.

    The student is trained to match the teacher's logits (MSE-sum), and
    optionally its embedding/last hidden states (``args.state_loss_ratio``)
    and attention maps (``args.att_loss_ratio``). When ``mpl_loader`` is
    given, after ``args.mpl_warmup_steps`` optimizer steps the teacher is
    additionally updated Meta-Pseudo-Labels style from the average
    cross-entropy of student and teacher on labeled batches.

    Args:
        teacher_config, student_config: config dicts; ``['opt']`` holds CLI args.
        teacher_model, student_model: models exposing ``.bert_model`` whose
            forward returns (logits, bert_outputs) when ``return_bert_outputs=True``.
        train_loader: loader of (x, y) batches for distillation (y unused here).
        eval_loader: loader used for periodic evaluation / checkpointing.
        best_eval_metric: best accuracy seen so far (for resumed runs), or None.
        mpl_loader: optional labeled loader for the MPL feedback step.

    Returns:
        (global_step, average training loss per optimizer step, best_eval_metric)
    """

    args = teacher_config['opt']

    teacher_layer_num = teacher_model.bert_model.config.num_hidden_layers
    student_layer_num = student_model.bert_model.config.num_hidden_layers

    # create teacher optimizer with larger L2 norm
    teacher_optimizer, _, _, _ = prepare_osws(teacher_config, teacher_model, train_loader, lr=args.mpl_learning_rate, weight_decay=args.mpl_weight_decay)

    # create student optimizer, scheduler, summary writer
    student_optimizer, student_scheduler, writer, _ = prepare_osws(student_config, student_model, train_loader, lr=args.lr, weight_decay=args.weight_decay)

    # prepare loss functions
    def soft_cross_entropy(predicts, targets):
        # Soft-target cross entropy; kept as the (commented) alternative
        # to the MSE logit-distillation loss below.
        likelihood = F.log_softmax(predicts, dim=-1)
        targets_prob = F.softmax(targets, dim=-1)
        return (- targets_prob * likelihood).sum(dim=-1).mean()

    loss_mse_sum = MSELoss(reduction='sum').to(args.device)
    loss_mse = MSELoss().to(args.device)
    loss_cs = CosineSimilarity(dim=2).to(args.device)
    loss_cs_att = CosineSimilarity(dim=3).to(args.device)
    # FIX: build the MPL cross-entropy criterion once instead of
    # re-instantiating it on every MPL step inside the inner loop.
    loss_cross_entropy = torch.nn.CrossEntropyLoss().to(args.device)

    logger.info("***** Running distillation training *****")
    logger.info("  Num Batchs = %d", len(train_loader))
    logger.info("  Num Epochs = %d", args.epoch)
    logger.info("  batch size = %d", args.batch_size)
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    tr_loss, logging_loss = 0.0, 0.0
    tr_att_loss = 0.
    tr_rep_loss = 0.
    tr_cls_loss = 0.
    teacher_model.zero_grad()
    student_model.zero_grad()
    epoch_iterator = range(epochs_trained, int(args.epoch))

    # FIX: keep one persistent iterator over mpl_loader. The original
    # re-created the iterator on every step, which (a) made the
    # StopIteration handler unreachable and (b) respawned DataLoader
    # workers each step instead of cycling through the loader.
    mpl_iterator = iter(mpl_loader) if mpl_loader else None

    # for reproductibility
    set_seed(args)

    for epoch_n in epoch_iterator:
        tr_att_loss = 0.
        tr_rep_loss = 0.
        tr_cls_loss = 0.
        train_iterator = tqdm(train_loader, desc=f"Epoch {epoch_n}")
        for step, (x, y) in enumerate(train_iterator):
            x = to_device(x, args.device)
            y = to_device(y, args.device)

            # -------------------------------------------------------------------------------------------------------
            # teacher -> student, teaching with teacher_model.eval(), student_model.train()
            # -------------------------------------------------------------------------------------------------------
            att_loss = 0.
            rep_loss = 0.
            cls_loss = 0.

            # teacher model output (frozen for this direction)
            teacher_model.eval()
            with torch.no_grad():
                output_teacher, teacher_bert_outputs = teacher_model(x, return_bert_outputs=True)

            # student model output
            student_model.train()
            output_student, student_bert_outputs = student_model(x, return_bert_outputs=True)

            # Knowledge Distillation loss
            # 1) logits distillation
            '''
            kd_loss = soft_cross_entropy(output_student, output_teacher)
            '''
            kd_loss = loss_mse_sum(output_student, output_teacher)

            loss = kd_loss
            tr_cls_loss += loss.item()

            # 2) embedding and last hidden state distillation
            if args.state_loss_ratio > 0.0:
                teacher_reps = teacher_bert_outputs.hidden_states
                student_reps = student_bert_outputs.hidden_states

                # index 0 = embedding output, index layer_num = last hidden state
                new_teacher_reps = [teacher_reps[0], teacher_reps[teacher_layer_num]]
                new_student_reps = [student_reps[0], student_reps[student_layer_num]]
                for student_rep, teacher_rep in zip(new_student_reps, new_teacher_reps):
                    # cosine similarity loss
                    if args.state_distill_cs:
                        tmp_loss = 1.0 - loss_cs(student_rep, teacher_rep).mean()
                    # MSE loss
                    else:
                        tmp_loss = loss_mse(student_rep, teacher_rep)
                    rep_loss += tmp_loss
                loss += args.state_loss_ratio * rep_loss
                tr_rep_loss += rep_loss.item()

            # 3) Attentions distillation
            if args.att_loss_ratio > 0.0:
                teacher_atts = teacher_bert_outputs.attentions
                student_atts = student_bert_outputs.attentions

                assert teacher_layer_num == len(teacher_atts)
                assert student_layer_num == len(student_atts)
                assert teacher_layer_num % student_layer_num == 0
                # map every student layer to the last teacher layer of its block
                layers_per_block = int(teacher_layer_num / student_layer_num)
                new_teacher_atts = [teacher_atts[i * layers_per_block + layers_per_block - 1]
                                    for i in range(student_layer_num)]

                for student_att, teacher_att in zip(student_atts, new_teacher_atts):
                    # zero-out large negative attention logits (masked positions)
                    student_att = torch.where(student_att <= -1e2, torch.zeros_like(student_att).to(args.device),
                                              student_att)
                    teacher_att = torch.where(teacher_att <= -1e2, torch.zeros_like(teacher_att).to(args.device),
                                              teacher_att)
                    tmp_loss = 1.0 - loss_cs_att(student_att, teacher_att).mean()
                    att_loss += tmp_loss

                loss += args.att_loss_ratio * att_loss
                tr_att_loss += att_loss.item()

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            # back propagate through student model
            loss.backward()
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(student_model.parameters(), args.max_grad_norm)
                student_optimizer.step()  # update student model
                student_scheduler.step()  # Update learning rate schedule
                student_model.zero_grad()
                global_step += 1
            # -------------------------------------------------------------------------------------------------------

            # -------------------------------------------------------------------------------------------------------
            # student -> teacher, performance feedback/update with student_model.eval(), teacher_model.train()
            # -------------------------------------------------------------------------------------------------------
            mpl_loss = 0.0
            if mpl_loader and global_step > args.mpl_warmup_steps:
                try:
                    (x, y) = next(mpl_iterator) # draw next labeled sample
                except StopIteration:
                    mpl_iterator = iter(mpl_loader) # loader exhausted -> restart
                    (x, y) = next(mpl_iterator)
                x = to_device(x, args.device)
                y = to_device(y, args.device)

                # teacher model output
                teacher_model.train()
                output_teacher, teacher_bert_outputs = teacher_model(x, return_bert_outputs=True)

                # student model output
                student_model.eval() # updated student model
                output_student, student_bert_outputs = student_model(x, return_bert_outputs=True)

                # the loss is the performance of the student on the labeled data.
                # additionaly, we add the loss of the teacher on the labeled data for avoiding overfitting.
                mpl_loss = loss_cross_entropy(output_student, y) / 2 + loss_cross_entropy(output_teacher, y) / 2
                if args.gradient_accumulation_steps > 1:
                    mpl_loss = mpl_loss / args.gradient_accumulation_steps

                # back propagate through teacher model
                mpl_loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(teacher_model.parameters(), args.max_grad_norm)
                    teacher_optimizer.step() # update teacher model
                    teacher_model.zero_grad()
                    student_model.zero_grad() # clear gradient info which was generated during forward computation.
            # -------------------------------------------------------------------------------------------------------

            train_iterator.set_description(f"Epoch {epoch_n} loss: {loss:.3f}, mpl loss: {mpl_loss:.3f}")
            if writer:
                writer.add_scalar('loss', loss, global_step)
                writer.add_scalar('mpl_loss', mpl_loss, global_step)

            # -------------------------------------------------------------------------------------------------------
            # evaluate student, save model
            # -------------------------------------------------------------------------------------------------------
            flag_eval = False
            logs = {}
            if args.logging_steps > 0 and global_step % args.logging_steps == 0: flag_eval = True
            if flag_eval:
                if args.log_evaluate_during_training:
                    eval_loss, eval_acc = evaluate(student_model, student_config, eval_loader)
                    logs['eval_loss'] = eval_loss
                    logs['eval_acc'] = eval_acc
                    if writer:
                        writer.add_scalar('eval_loss', eval_loss, global_step)
                        writer.add_scalar('eval_acc', eval_acc, global_step)

                # per-batch averages of the accumulated component losses
                cls_loss = tr_cls_loss / (step + 1)
                att_loss = tr_att_loss / (step + 1)
                rep_loss = tr_rep_loss / (step + 1)

                loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                learning_rate_scalar = student_scheduler.get_last_lr()[0]
                logs["learning_rate"] = learning_rate_scalar
                logs["avg_loss_since_last_log"] = loss_scalar
                logs['cls_loss'] = cls_loss
                logs['att_loss'] = att_loss
                logs['rep_loss'] = rep_loss
                logging_loss = tr_loss
                # FIX: use the module logger (was `logging.info`, the lone use
                # of the root logger in this function).
                logger.info(json.dumps({**logs, **{"step": global_step}}))
                if writer:
                    writer.add_scalar('learning_rate', learning_rate_scalar, global_step)
                    writer.add_scalar('avg_loss_since_last_log', loss_scalar, global_step)
                    writer.add_scalar('cls_loss', cls_loss, global_step)
                    writer.add_scalar('att_loss', att_loss, global_step)
                    writer.add_scalar('rep_loss', rep_loss, global_step)

            flag_eval = False
            if step == 0 and epoch_n != 0: flag_eval = True # every epoch
            if args.eval_and_save_steps > 0 and global_step % args.eval_and_save_steps == 0: flag_eval = True
            if flag_eval:
                eval_loss, eval_acc = evaluate(student_model, student_config, eval_loader)
                logs['eval_loss'] = eval_loss
                logs['eval_acc'] = eval_acc
                logger.info(json.dumps({**logs, **{"step": global_step}}))
                if writer:
                    writer.add_scalar('eval_loss', eval_loss, global_step)
                    writer.add_scalar('eval_acc', eval_acc, global_step)
                # measured by accuracy
                curr_eval_metric = eval_acc
                if best_eval_metric is None or curr_eval_metric > best_eval_metric:
                    # save model to '--save_path', '--bert_output_dir'
                    save_model(student_config, student_model, save_path=args.save_path)
                    student_model.bert_tokenizer.save_pretrained(args.bert_output_dir)
                    student_model.bert_model.save_pretrained(args.bert_output_dir)
                    best_eval_metric = curr_eval_metric
                    logger.info("[Best student model saved] : {:10.6f}, {}, {}".format(best_eval_metric, args.bert_output_dir, args.save_path))
            # -------------------------------------------------------------------------------------------------------

    # FIX: guard against ZeroDivisionError when no optimizer step was taken
    # (args.epoch == 0, or fewer batches than gradient_accumulation_steps).
    return global_step, tr_loss / max(global_step, 1), best_eval_metric
示例#17
0
    # try:
    #     s = sys.argv[1]
    # except IndexError:
    #     s = ""
    # create_interactions(s)

    # NOTE(review): `epochs` is passed positionally to train_model below while
    # nb_epoch=2 is also given — confirm against train_model's signature that
    # these are not the same knob.
    epochs = 1

    # Iterative self-training: regenerate data, reload, retrain, evaluate,
    # and persist the model, 10 rounds.
    for i in range(0, 10):
        # Generate new data
        create_interactions(str(i))
        # Load
        model = load_model(name="model")
        model.compile(loss='mse',
           optimizer=RMSprop())
        # Train
        # NOTE(review): the round index lands AFTER the ".csv" extension
        # (e.g. "/ssd/train_extra.csv0") — verify the files are really named
        # this way and the suffix isn't meant to go before the extension.
        train_filename = "/ssd/train_extra.csv{}".format(i)
        model, losses = train_model(model, epochs, train_filename, nb_epoch=2)
        # Test
        print("MSE", losses[-1])
        test_filename = "/ssd/test_extra.csv{}".format(i)
        m = test_model(model, test_filename)
        # Save model
        save_model(model, name="model")
        # if m > 0.93:
        #     break



示例#18
0
    # Similar to our train script, but we do this k times
    # (k-fold cross-validation: each fold set yields its own train/val/test
    # split, a fresh model, and fold-specific output file names).
    for k, datasets in enumerate(iterate_folds(fold_sets)):
        train, val, test = datasets
        model = BeatNet(downbeats=args.downbeats)
        # NOTE(review): the guard checks `cuda_device` but the call uses
        # `args.cuda_device` — confirm these always refer to the same value.
        if cuda_device is not None:
            model.cuda(args.cuda_device)

        output_file = make_fold_output_name(args.output_file, k)

        train_loader, val_loader, test_loader = make_data_loaders(
            (train, val, test), batch_size=args.batch_size)

        train_loop(model,
                   train_loader,
                   val_loader=val_loader,
                   num_epochs=args.num_epochs,
                   cuda_device=cuda_device,
                   output_file=output_file,
                   davies_stopping_condition=args.davies_stopping_condition,
                   fold=k)

        # Persist the trained fold model only when an output path was given.
        if args.output_file is not None:
            save_model(model, output_file)

        # Optionally save the fold's datasets for later reproduction.
        if args.dataset_output_file is not None:
            save_dir = make_fold_output_name(args.dataset_output_file, k)
            save_datasets((train, val, test), save_dir)

        test_model(model, test_loader, cuda_device=cuda_device)