# Exemplo n.º 1
# 0
def multiclass_forecast_full(num_classes_train=TRAIN_LENGTH,
                             num_classes_predict=PREDICT_LENGTH,
                             model=None,
                             categories=False,
                             top_n=1):
    """Train (or load) a logistic-regression forecaster and evaluate it.

    Args:
        num_classes_train: number of classes used when preparing the train split.
        num_classes_predict: number of classes used for the prediction split.
        model: optional path to a pickled model; when given, training is skipped
            and the model is loaded from disk instead.
        categories: if True, map degree labels to coarser categories before
            training/evaluation.
        top_n: top-n setting forwarded to the evaluation helpers.

    Returns:
        (macro_f1, reg): macro-averaged F1 score on the test set and the
        fitted (or loaded) classifier.
    """
    (X_train, X_val, X_test, y_train, y_val,
     y_test), vectorizer = util.prep_dataset_v3(
         num_classes_train=num_classes_train,
         num_classes_predict=num_classes_predict,
         vectorize=True)

    if categories:
        y_train = util.degrees_to_categories(y_train)
        y_val = util.degrees_to_categories(y_val)
        y_test = util.degrees_to_categories(y_test)

    if not model:
        train_score, reg = train_log_reg(X_train, y_train)
        print(train_score)
        # Context managers ensure the pickle files are closed even on error.
        filename = ("log_reg_full_train_categories.pickle"
                    if categories else "log_reg_full_train.pickle")
        with open(filename, 'wb') as f:
            pickle.dump(reg, f)
    else:
        # NOTE: unpickling is only safe for trusted, locally produced files.
        with open(model, 'rb') as f:
            reg = pickle.load(f)

    macro_f1 = util.evaluate_model(X_test,
                                   y_test,
                                   reg,
                                   output_dict=True,
                                   top_n=top_n)['macro avg']['f1-score']
    print(
        util.evaluate_model(X_test,
                            y_test,
                            reg,
                            output_dict=False,
                            top_n=top_n))

    # BUGFIX: forward the caller's arguments instead of the hard-coded
    # PREDICT_LENGTH / categories=False literals, mirroring how
    # run_baseline_classifier forwards `categories`.
    util.evaluate_model_bias(reg,
                             vectorizer,
                             evaluate_model_bias_single_df,
                             num_classes_predict=num_classes_predict,
                             categories=categories,
                             top_n=top_n,
                             test=True)

    return macro_f1, reg
def main(device=torch.device('cuda:0')):
    """Evaluate the stored U-Net checkpoint on the test split and print metrics."""
    # CLI: only the dataset size is configurable.
    cli = arg.ArgumentParser(
        description='We all know what we are doing. Fighting!')
    cli.add_argument("--datasize",
                     "-d",
                     default="small",
                     type=str,
                     help="data size you want to use, small, medium, total")
    options = cli.parse_args()

    # Build train/val/test loaders from the zipped NYU dataset.
    tr_loader, va_loader, te_loader = getTrainingValidationTestingData(
        options.datasize, "data/nyu.zip",
        batch_size=config("unet.batch_size"))

    net = Net()

    # Restore the most recent checkpoint, if one exists.
    print("Loading unet...")
    net, start_epoch, stats = util.restore_checkpoint(
        net, util.config("unet.checkpoint"))

    test_acc, test_loss = util.evaluate_model(net, te_loader, device)
    print(f'Test Accuracy:{test_acc}')
    print(f'Test Loss:{test_loss}')
# Exemplo n.º 3
# 0
def main():
    """Build the configured model, then train / predict / evaluate as requested."""
    constructors = {
        'Baseline': models.Baseline,
        'Muzip': models.Muzip,
        'Muzip2': models.Muzip2,
        'Muzip3': models.Muzip3,
    }
    # Unknown model names fall back to Muzip3, like the original if/elif chain.
    model = constructors.get(config.model, models.Muzip3)()

    if config.train:
        # Can train with existing weights (when config.restart is False the
        # most recent saved weights are used by default).
        weights_path = None
        if not config.restart:
            weights_path = os.path.join(
                config.model_save_dir,
                get_recent_weights_path(config.model_save_dir))
        model.build(weights_path)
        train(model)
        if config.save_model:
            save_model(model,
                       config.model_save_path,
                       config.model_weights_save_path,
                       ext=".h5")

    if config.pred:
        # Prediction only: requires previously trained weights on disk.
        weights_path = config.model_save_dir + '/' + get_recent_weights_path(
            config.model_save_dir)
        model.build(weights_path)
        pred(model)

    if config.evaluate:
        weights_path = config.model_save_dir + '/' + get_recent_weights_path(
            config.model_save_dir)
        model.build(weights_path)
        evaluate_model(model, "test/")
# Exemplo n.º 4
# 0
def run_baseline_classifier(num_classes_train=TRAIN_LENGTH,
                            num_classes_predict=PREDICT_LENGTH,
                            categories=False,
                            top_n=1):
    """Fit a stratified dummy classifier as a baseline and report its metrics.

    Returns:
        (macro_f1, dummy_clf): macro-averaged F1 on the test set and the
        fitted baseline classifier.
    """
    splits, vectorizer = util.prep_dataset_v3(
        num_classes_train=num_classes_train,
        num_classes_predict=num_classes_predict,
        vectorize=True)
    X_train, X_val, X_test, y_train, y_val, y_test = splits

    if categories:
        y_train = util.degrees_to_categories(y_train)
        y_val = util.degrees_to_categories(y_val)
        y_test = util.degrees_to_categories(y_test)

    # "stratified" draws random predictions matching the training label
    # distribution — the weakest sensible baseline.
    baseline = DummyClassifier(strategy='stratified').fit(X_train, y_train)

    report = util.evaluate_model(X_test,
                                 y_test,
                                 baseline,
                                 output_dict=True,
                                 top_n=top_n)
    macro_f1 = report['macro avg']['f1-score']
    print(util.evaluate_model(X_test,
                              y_test,
                              baseline,
                              output_dict=False,
                              top_n=top_n))

    # Bias evaluation reuses the per-dataframe helper from the
    # logistic-regression module.
    util.evaluate_model_bias(
        baseline,
        vectorizer,
        logistic_regression_model.evaluate_model_bias_single_df,
        num_classes_predict=PREDICT_LENGTH,
        categories=categories,
        top_n=top_n,
        test=True)

    return macro_f1, baseline
# Exemplo n.º 5
# 0
def evaluate_model_bias_single_df(model,
                                  df,
                                  vectorizer,
                                  num_classes_predict=0,
                                  categories=False,
                                  top_n=1):
    """Score ``model`` on a single slice ``df`` and return the report dict.

    The slice is tokenized, its course histories are vectorized with the
    shared ``vectorizer``, labels are optionally coarsened to categories,
    and the classification report is returned as a dict.
    """
    features, labels = util.tokenize_df(df, num_classes_predict)
    _, features = util.vectorize_course_history(
        features.loc[:, 'course_history'], vectorizer=vectorizer)

    if categories:
        labels = util.degrees_to_categories(labels)

    return util.evaluate_model(features, labels, model,
                               output_dict=True, top_n=top_n)
def evaluate_model_bias_single_df(model,
                                  df,
                                  args,
                                  num_classes_predict=0,
                                  categories=False,
                                  top_n=1):
    """Score ``model`` on one slice ``df`` using course2vec features.

    ``args`` is a ``(course2vec_model, vec_size)`` pair used to embed each
    student's course history before evaluation.
    """
    course2vec_model, vec_size = args
    tokens, labels = util.tokenize_df(df, num_classes_predict)
    features = featurize_student(tokens['course_history'],
                                 course2vec_model, vec_size)

    if categories:
        labels = util.degrees_to_categories(labels)

    return util.evaluate_model(features, labels, model,
                               output_dict=True, top_n=top_n)
def main(device=torch.device('cuda:0')):
    """Print performance metrics for model at specified epoch."""
    # Build loaders straight from the zipped NYU dataset.
    loaders = getTrainingValidationTestingData(
        "data/nyu.zip", batch_size=util.config("unet.batch_size"))
    tr_loader, va_loader, te_loader = loaders

    net = Net()

    # Restore the latest checkpoint when one exists.
    print("Loading unet...")
    net, start_epoch, stats = util.restore_checkpoint(
        net, util.config("unet.checkpoint"))

    test_acc, test_loss = util.evaluate_model(net, te_loader, device)
    print(f'Test Accuracy:{test_acc}')
    print(f'Test Loss:{test_loss}')
# Exemplo n.º 8
# 0
def train_model(data_set_identifier, train_file, val_file, learning_rate, minibatch_size, name):
    """Train a protein-structure model until a stopping condition is met.

    Loads (or constructs) a model, then loops over training minibatches,
    periodically evaluating on the validation set, writing the best models
    (by validation loss and by train loss) to disk, posting metrics to a
    local graph server unless the UI is hidden, and stopping on a wall-clock
    limit, a maximum update count, or lack of recent improvement.

    Args:
        data_set_identifier: label folded into the experiment id.
        train_file: path of the training set on disk.
        val_file: path of the validation set on disk.
        learning_rate: Adam learning rate for the main model parameters.
        minibatch_size: minibatch size for both dataloaders.
        name: experiment name, folded into the experiment id.

    Returns:
        Filesystem path of the best model (lowest validation loss).
    """
    set_experiment_id(data_set_identifier, learning_rate, minibatch_size, name)

    train_loader = contruct_dataloader_from_disk(train_file, minibatch_size, use_evolutionary=True)
    validation_loader = contruct_dataloader_from_disk(val_file, minibatch_size, use_evolutionary=True)
    validation_dataset_size = validation_loader.dataset.__len__()
    train_dataset_size = train_loader.dataset.__len__()



    # 21 one-hot amino-acid channels; doubled when evolutionary (PSSM)
    # features are concatenated on below.
    embedding_size = 21
    if configs.run_params["use_evolutionary"]:
        embedding_size = 42


    #Load in existing model if given as argument
    if args.model is not None:
        model_path = "output/models/" + args.model + ".model"
        model = load_model_from_disk(model_path, use_gpu)
    else:
    #else construct new model from config file
        model = construct_model(configs.model_params, embedding_size, use_gpu,minibatch_size)

    #optimizer parameters
    betas = tuple(configs.run_params["betas"])
    weight_decay = configs.run_params["weight_decay"]
    angle_lr = configs.run_params["angles_lr"]

    # cnn_angles uses a single learning rate; other architectures give the
    # soft-to-angle head its own learning rate via parameter groups.
    if configs.model_params['architecture'] == 'cnn_angles':
        optimizer = optim.Adam(model.parameters(), betas=betas, lr=learning_rate, weight_decay=weight_decay)
    else:
        optimizer = optim.Adam([
            {'params' : model.model.parameters(), 'lr':learning_rate},
            {'params' : model.soft_to_angle.parameters(), 'lr':angle_lr}], betas=betas, weight_decay=weight_decay)

    #print number of trainable parameters
    print_number_of_parameters(model)
    #For creating a summary table of the model (does not work on ExampleModel!)
    if configs.run_params["print_model_summary"]:
        if configs.model_params["architecture"] != 'rnn':
            summary(model, configs.run_params["max_sequence_length"], 2)
        else:
            write_out("DETAILED MODEL SUMMARY IS NOT SUPPORTED FOR RNN MODELS")

    if use_gpu:
        model = model.cuda()

    # TODO: is soft_to_angle.parameters() included here?

    # Running history, appended to at every evaluation point below.
    sample_num = list()
    train_loss_values = list()
    validation_loss_values = list()
    rmsd_avg_values = list()
    drmsd_avg_values = list()
    break_point_values = list()

    breakpoints = configs.run_params['breakpoints']
    best_model_loss = 1e20
    best_model_train_loss = 1e20
    best_model_minibatch_time = None
    best_model_path = None
    stopping_condition_met = False
    minibatches_proccesed = 0

    loss_atoms = configs.run_params["loss_atoms"]
    start_time = time.time()
    max_time = configs.run_params["max_time"]
    # C anneals linearly from 0 to 1 over the first c_epochs batch updates.
    C_epochs = configs.run_params["c_epochs"] # TODO: Change to parameter
    C_batch_updates = C_epochs

    while not stopping_condition_met:
        optimizer.zero_grad()
        model.zero_grad()
        loss_tracker = np.zeros(0)
        start_time_n_minibatches = time.time()
        for minibatch_id, training_minibatch in enumerate(train_loader, 0):
            minibatches_proccesed += 1
            training_minibatch = list(training_minibatch)
            primary_sequence, tertiary_positions, mask, p_id = training_minibatch[:-1]
            # Update C
            C = 1.0 if minibatches_proccesed >= C_batch_updates else float(minibatches_proccesed) / C_batch_updates

            #One Hot encode amino string and concate PSSM values.
            amino_acids, batch_sizes = one_hot_encode(primary_sequence, 21, use_gpu)

            if configs.run_params["use_evolutionary"]:
                evolutionary = training_minibatch[-1]

                # Pad the packed PSSM sequences so they align with the
                # one-hot encoded amino acids before concatenation.
                evolutionary, batch_sizes = torch.nn.utils.rnn.pad_packed_sequence(torch.nn.utils.rnn.pack_sequence(evolutionary))

                if use_gpu:
                    evolutionary = evolutionary.cuda()

                amino_acids = torch.cat((amino_acids, evolutionary.view(-1, len(batch_sizes) , 21)), 2)

            start_compute_loss = time.time()

            if configs.run_params["only_angular_loss"]:
                #raise NotImplementedError("Only_angular_loss function has not been implemented correctly yet.")
                loss = model.compute_angular_loss((amino_acids, batch_sizes), tertiary_positions, mask)
            else:
                loss = model.compute_loss((amino_acids, batch_sizes), tertiary_positions, mask, C=C, loss_atoms=loss_atoms)

            if C != 1:
                write_out("C:", C)
            write_out("Train loss:", float(loss))
            start_compute_grad = time.time()
            loss.backward()
            loss_tracker = np.append(loss_tracker, float(loss))
            end = time.time()
            write_out("Loss time:", start_compute_grad-start_compute_loss, "Grad time:", end-start_compute_grad)
            optimizer.step()
            optimizer.zero_grad()
            model.zero_grad()

            # for every eval_interval samples, plot performance on the validation set
            if minibatches_proccesed % configs.run_params["eval_interval"] == 0:
                model.eval()
                write_out("Testing model on validation set...")
                train_loss = loss_tracker.mean()
                loss_tracker = np.zeros(0)
                validation_loss, data_total, rmsd_avg, drmsd_avg = evaluate_model(validation_loader,
                     model, use_gpu, loss_atoms, configs.run_params["use_evolutionary"])
                # Unpack the first validation sample for structure dumps.
                # NOTE(review): indices appear to be (prim, pos, angles_pred,
                # pos_pred, mask) — confirm against evaluate_model.
                prim = data_total[0][0]
                pos = data_total[0][1]
                pos_pred = data_total[0][3]
                mask = data_total[0][4]
                pos = apply_mask(pos, mask)
                angles_pred = data_total[0][2]

                angles_pred = apply_mask(angles_pred, mask, size=3)

                pos_pred = apply_mask(pos_pred, mask)
                prim = torch.masked_select(prim, mask)

                if use_gpu:
                    pos = pos.cuda()
                    pos_pred = pos_pred.cuda()

                angles = calculate_dihedral_angels(pos, use_gpu)
                #angles_pred = calculate_dihedral_angels(pos_pred, use_gpu)
                #angles_pred = data_total[0][2] # Use angles output from model - calculate_dihedral_angels(pos_pred, use_gpu)

                # Write true and predicted structures out as PDB files.
                write_to_pdb(get_structure_from_angles(prim, angles), "test")
                write_to_pdb(get_structure_from_angles(prim, angles_pred), "test_pred")
                if validation_loss < best_model_loss:
                    best_model_loss = validation_loss
                    best_model_minibatch_time = minibatches_proccesed
                    best_model_path = write_model_to_disk(model)

                if train_loss < best_model_train_loss:
                    best_model_train_loss = train_loss
                    best_model_train_path = write_model_to_disk(model, model_type="train")

                write_out("Validation loss:", validation_loss, "Train loss:", train_loss)
                write_out("Best model so far (validation loss): ", best_model_loss, "at time", best_model_minibatch_time)
                write_out("Best model stored at " + best_model_path)
                write_out("Best model train stored at " + best_model_train_path)
                write_out("Minibatches processed:",minibatches_proccesed)

                # Estimate remaining wall-clock time from the duration of the
                # last eval_interval's worth of minibatches.
                end_time_n_minibatches = time.time()
                n_minibatches_time_used = end_time_n_minibatches - start_time_n_minibatches
                minibatches_left = configs.run_params["max_updates"] - minibatches_proccesed
                seconds_left = int(n_minibatches_time_used * (minibatches_left/configs.run_params["eval_interval"]))

                m, s = divmod(seconds_left, 60)
                h, m = divmod(m, 60)
                write_out("Estimated time until maximum number of updates:", '{:d}:{:02d}:{:02d}'.format(h, m, s) )
                sample_num.append(minibatches_proccesed)
                train_loss_values.append(train_loss)
                validation_loss_values.append(validation_loss)
                rmsd_avg_values.append(rmsd_avg)
                drmsd_avg_values.append(drmsd_avg)

                # Record drmsd at each configured breakpoint, consuming it.
                if breakpoints and minibatches_proccesed > breakpoints[0]:
                    break_point_values.append(drmsd_avg)
                    breakpoints = breakpoints[1:]

                data = {}
                data["pdb_data_pred"] = open("output/protein_test_pred.pdb","r").read()
                data["pdb_data_true"] = open("output/protein_test.pdb","r").read()
                data["validation_dataset_size"] = validation_dataset_size
                data["sample_num"] = sample_num
                data["train_loss_values"] = train_loss_values
                data["break_point_values"] = break_point_values
                data["validation_loss_values"] = validation_loss_values
                data["phi_actual"] = list([math.degrees(float(v)) for v in angles[1:,1]])
                data["psi_actual"] = list([math.degrees(float(v)) for v in angles[:-1,2]])
                data["phi_predicted"] = list([math.degrees(float(v)) for v in angles_pred[1:,1]])
                data["psi_predicted"] = list([math.degrees(float(v)) for v in angles_pred[:-1,2]])
                data["drmsd_avg"] = drmsd_avg_values
                data["rmsd_avg"] = rmsd_avg_values
                if not configs.run_params["hide_ui"]:
                    res = requests.post('http://localhost:5000/graph', json=data)
                    if res.ok:
                        print(res.json())

                # Save run data
                write_run_to_disk(data)

                #Check if maximum time is reached.
                start_time_n_minibatches = time.time()
                time_used = time.time() - start_time

                time_condition = (max_time is not None and time_used > max_time)
                max_update_condition = minibatches_proccesed >= configs.run_params["max_updates"]
                min_update_condition = (minibatches_proccesed > configs.run_params["min_updates"] and minibatches_proccesed > best_model_minibatch_time * 2)

                model.train()
                #Checking for stop conditions
                if time_condition or max_update_condition or min_update_condition:
                    stopping_condition_met = True
                    break
    write_out("Best validation model found after" , best_model_minibatch_time , "minibatches.")
    write_result_summary(best_model_loss)
    return best_model_path
# Exemplo n.º 9
# 0
def main(device=torch.device('cuda:0')):
    """Train the U-Net depth model and plot train/validation curves."""
    # CLI arguments: only the dataset size is configurable.
    cli = arg.ArgumentParser(
        description='We all know what we are doing. Fighting!')
    cli.add_argument("--datasize",
                     "-d",
                     default="small",
                     type=str,
                     help="data size you want to use, small, medium, total")
    options = cli.parse_args()

    # Data loaders from the zipped NYU dataset.
    tr_loader, va_loader, te_loader = getTrainingValidationTestingData(
        options.datasize, "data/nyu.zip",
        batch_size=config("unet.batch_size"))

    net = Net()

    # Loss function and optimizer.
    criterion = DepthLoss(0.1)
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=util.config("unet.learning_rate"))
    number_of_epoches = 10

    # Attempt to restore the latest checkpoint if one exists.
    print("Loading unet...")
    net, start_epoch, stats = util.restore_checkpoint(
        net, util.config("unet.checkpoint"))

    # Metric histories, seeded with the restored model's performance so the
    # plot starts from the pre-training baseline.
    va_losses, va_accs = [], []
    tr_losses, tr_accs = [], []
    tr_acc, tr_loss = util.evaluate_model(net, tr_loader, device)
    va_acc, va_loss = util.evaluate_model(net, va_loader, device)
    va_accs.append(va_acc)
    va_losses.append(va_loss)
    tr_accs.append(tr_acc)
    tr_losses.append(tr_loss)

    # Train for a fixed number of epochs, checkpointing after each one.
    for epoch in range(start_epoch, number_of_epoches):
        util.train_epoch(tr_loader, net, criterion, optimizer, device)
        tr_acc, tr_loss = util.evaluate_model(net, tr_loader, device)
        va_acc, va_loss = util.evaluate_model(net, va_loader, device)
        va_accs.append(va_acc)
        va_losses.append(va_loss)
        tr_accs.append(tr_acc)
        tr_losses.append(tr_loss)
        util.save_checkpoint(net, epoch + 1, util.config("unet.checkpoint"),
                             stats)

    print("Finished Training")
    util.make_plot(tr_losses, tr_accs, va_losses, va_accs)
# Exemplo n.º 10
# 0
# Author: Tim Sweeney
# ''' (orphaned docstring delimiter from the scraped snippet)

import wandb
import util
import argparse

# Identifiers for this evaluation workload in Weights & Biases.
project = "model_registry_example"
model_use_case_id = "mnist"
job_type = "evaluator"

# First, we launch a run which registers this workload with W&B
run = wandb.init(project=project, job_type=job_type)

# Then we fetch the latest evaluation set.
x_eval, y_eval, dataset = util.download_eval_dataset_from_wb(model_use_case_id)

# Next we fetch the new candidate models for this use case
# (metric key embeds the dataset name, e.g. "<dataset>-ce_loss").
metric = f"{dataset.name}-ce_loss"
candidates = util.get_new_model_candidates_from_wb(project, model_use_case_id,
                                                   metric)

# Evaluate the models and save their metrics to wb.
for model in candidates:
    score = util.evaluate_model(model, x_eval, y_eval)
    util.save_metric_to_model_in_wb(model, metric, score)

# Finally, promote the best model to production.
util.promote_best_model_in_wb(project, model_use_case_id, metric)
# Exemplo n.º 11
# 0
def train_model(data_set_identifier, train_file, val_file, learning_rate,
                minibatch_size):
    """Train an ExampleModel and return the path of the best checkpoint.

    Every ``args.eval_interval`` minibatches the model is scored on the
    validation set, the best checkpoint (lowest validation loss) is written
    to disk, true/predicted structures are dumped as PDBs and loaded into
    PyMOL, and metrics are optionally POSTed to a local live-plot server.
    Training stops once the model has processed more than
    ``args.minimum_updates`` minibatches AND run twice as long as the best
    checkpoint without improving.

    Args:
        data_set_identifier: label used to build the experiment id.
        train_file: path of the training dataset on disk.
        val_file: path of the validation dataset on disk.
        learning_rate: Adam learning rate.
        minibatch_size: minibatch size for both dataloaders.

    Returns:
        Filesystem path of the best (lowest validation loss) model.
    """
    set_experiment_id(data_set_identifier, learning_rate, minibatch_size)

    train_loader = contruct_dataloader_from_disk(train_file, minibatch_size)
    validation_loader = contruct_dataloader_from_disk(val_file, minibatch_size)
    validation_dataset_size = validation_loader.dataset.__len__()

    model = ExampleModel(21, minibatch_size,
                         use_gpu=use_gpu)  # embed size = 21
    # CONSISTENCY FIX: actually move the model to the GPU when requested,
    # mirroring the sibling train_model implementation in this file.
    if use_gpu:
        model = model.cuda()

    # TODO: is soft_to_angle.parameters() included here?
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Running history, appended to at every evaluation point below.
    sample_num = list()
    train_loss_values = list()
    validation_loss_values = list()
    rmsd_avg_values = list()
    drmsd_avg_values = list()

    # NOTE(review): 1.1 looks like a scale-specific sentinel; the sibling
    # implementation uses 1e20 — confirm the expected loss range.
    best_model_loss = 1.1
    best_model_minibatch_time = None
    best_model_path = None
    stopping_condition_met = False
    minibatches_proccesed = 0

    while not stopping_condition_met:
        optimizer.zero_grad()
        model.zero_grad()
        loss_tracker = np.zeros(0)
        for minibatch_id, training_minibatch in enumerate(train_loader, 0):
            minibatches_proccesed += 1
            primary_sequence, tertiary_positions, mask = training_minibatch
            start_compute_loss = time.time()
            loss = model.compute_loss(primary_sequence, tertiary_positions)
            write_out("Train loss:", float(loss))
            start_compute_grad = time.time()
            loss.backward()
            loss_tracker = np.append(loss_tracker, float(loss))
            end = time.time()
            write_out("Loss time:", start_compute_grad - start_compute_loss,
                      "Grad time:", end - start_compute_grad)
            optimizer.step()
            optimizer.zero_grad()
            model.zero_grad()

            # for every eval_interval samples, plot performance on the validation set
            if minibatches_proccesed % args.eval_interval == 0:

                train_loss = loss_tracker.mean()
                loss_tracker = np.zeros(0)
                validation_loss, data_total, rmsd_avg, drmsd_avg = evaluate_model(
                    validation_loader, model)
                prim = data_total[0][0]
                pos = data_total[0][1]
                (aa_list, phi_list, psi_list,
                 omega_list) = calculate_dihedral_angels(prim, pos)
                # Dump true and predicted structures and show them in PyMOL.
                write_to_pdb(
                    get_structure_from_angles(aa_list, phi_list[1:],
                                              psi_list[:-1], omega_list[:-1]),
                    "test")
                cmd.load("output/protein_test.pdb")
                write_to_pdb(data_total[0][3], "test_pred")
                cmd.load("output/protein_test_pred.pdb")
                cmd.forward()
                cmd.orient()
                if validation_loss < best_model_loss:
                    best_model_loss = validation_loss
                    best_model_minibatch_time = minibatches_proccesed
                    best_model_path = write_model_to_disk(model)

                write_out("Validation loss:", validation_loss, "Train loss:",
                          train_loss)
                # BUGFIX: report the best loss seen so far, not the current
                # validation loss (they only coincide on an improvement).
                write_out("Best model so far (label loss): ", best_model_loss,
                          "at time", best_model_minibatch_time)
                write_out("Best model stored at " + best_model_path)
                write_out("Minibatches processed:", minibatches_proccesed)
                sample_num.append(minibatches_proccesed)
                train_loss_values.append(train_loss)
                validation_loss_values.append(validation_loss)
                rmsd_avg_values.append(rmsd_avg)
                drmsd_avg_values.append(drmsd_avg)
                if args.live_plot:
                    data = {}
                    data["validation_dataset_size"] = validation_dataset_size
                    data["sample_num"] = sample_num
                    data["train_loss_values"] = train_loss_values
                    data["validation_loss_values"] = validation_loss_values
                    data["phi_actual"] = list(
                        [math.degrees(float(v)) for v in phi_list[1:]])
                    data["psi_actual"] = list(
                        [math.degrees(float(v)) for v in psi_list[:-1]])
                    data["phi_predicted"] = list([
                        math.degrees(float(v)) for v in data_total[0]
                        [2].detach().transpose(0, 1)[0][1:]
                    ])
                    data["psi_predicted"] = list([
                        math.degrees(float(v)) for v in data_total[0]
                        [2].detach().transpose(0, 1)[1][:-1]
                    ])
                    data["drmsd_avg"] = drmsd_avg_values
                    data["rmsd_avg"] = rmsd_avg_values
                    res = requests.post('http://localhost:5000/graph',
                                        json=data)
                    if res.ok:
                        print(res.json())

                # Stop once past the minimum update count with no recent
                # improvement (best checkpoint older than half the run).
                if minibatches_proccesed > args.minimum_updates and minibatches_proccesed > best_model_minibatch_time * 2:
                    stopping_condition_met = True
                    break
    write_result_summary(best_model_loss)
    return best_model_path
# Exemplo n.º 12
# 0
def main(device=torch.device('cuda:0')):
    """Train CNN and show training plots."""
    # Data loaders from the small zipped NYU depth dataset.
    tr_loader, va_loader, te_loader = getTrainingValidationTestingData(
        "data/nyu_small.zip", batch_size=util.config("unet.batch_size"))

    net = Net()

    # Loss function and optimizer.
    criterion = DepthLoss(0.1)
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr=util.config("unet.learning_rate"))
    number_of_epoches = 10

    # Attempt to restore the latest checkpoint if one exists.
    print("Loading unet...")
    net, start_epoch, stats = util.restore_checkpoint(
        net, util.config("unet.checkpoint"))

    # Metric histories, seeded with the restored model's performance so the
    # plot starts from the pre-training baseline.
    va_losses, va_accs = [], []
    tr_losses, tr_accs = [], []
    tr_acc, tr_loss = util.evaluate_model(net, tr_loader, device)
    va_acc, va_loss = util.evaluate_model(net, va_loader, device)
    va_accs.append(va_acc)
    va_losses.append(va_loss)
    tr_accs.append(tr_acc)
    tr_losses.append(tr_loss)

    # Train for a fixed number of epochs, checkpointing after each one.
    for epoch in range(start_epoch, number_of_epoches):
        util.train_epoch(tr_loader, net, criterion, optimizer)
        tr_acc, tr_loss = util.evaluate_model(net, tr_loader, device)
        va_acc, va_loss = util.evaluate_model(net, va_loader, device)
        va_accs.append(va_acc)
        va_losses.append(va_loss)
        tr_accs.append(tr_acc)
        tr_losses.append(tr_loss)
        util.save_checkpoint(net, epoch + 1, util.config("unet.checkpoint"), stats)

    print("Finished Training")
    util.make_plot(tr_losses, tr_accs, va_losses, va_accs)
# Exemplo n.º 13
# 0
def train_model(data_set_identifier, train_file, val_file, learning_rate,
                minibatch_size):
    """Train an ExampleModel with Adam until the early-stopping rule fires.

    Every ``args.eval_interval`` minibatches the model is scored on the
    validation set, the first validation example's true and predicted
    structures are written to PDB files, and (unless ``args.hide_ui``)
    the running metrics are POSTed to a local graphing server.

    Args:
        data_set_identifier: label used to build the experiment id.
        train_file: path of the serialized training set on disk.
        val_file: path of the serialized validation set on disk.
        learning_rate: Adam learning rate.
        minibatch_size: sequences per minibatch.

    Returns:
        Path of the best (lowest validation loss) saved model.
    """
    set_experiment_id(data_set_identifier, learning_rate, minibatch_size)

    train_loader = contruct_dataloader_from_disk(train_file, minibatch_size)
    validation_loader = contruct_dataloader_from_disk(val_file, minibatch_size)
    # Idiomatic len() instead of calling the dunder directly.
    validation_dataset_size = len(validation_loader.dataset)

    model = ExampleModel(21, minibatch_size,
                         use_gpu=use_gpu)  # embed size = 21
    if use_gpu:
        model = model.cuda()

    # TODO: is soft_to_angle.parameters() included here?
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Running history, appended to at every evaluation point.
    sample_num = []
    train_loss_values = []
    validation_loss_values = []
    rmsd_avg_values = []
    drmsd_avg_values = []

    best_model_loss = 1e20
    best_model_minibatch_time = None
    best_model_path = None
    stopping_condition_met = False
    minibatches_proccesed = 0

    while not stopping_condition_met:
        optimizer.zero_grad()
        model.zero_grad()
        loss_tracker = np.zeros(0)
        for minibatch_id, training_minibatch in enumerate(train_loader, 0):
            minibatches_proccesed += 1
            primary_sequence, tertiary_positions, mask = training_minibatch
            start_compute_loss = time.time()
            # NOTE(review): `mask` is unpacked but never passed to
            # compute_loss — confirm the loss really does not need it.
            loss = model.compute_loss(primary_sequence, tertiary_positions)
            write_out("Train loss:", float(loss))
            start_compute_grad = time.time()
            loss.backward()
            loss_tracker = np.append(loss_tracker, float(loss))
            end = time.time()
            write_out("Loss time:", start_compute_grad - start_compute_loss,
                      "Grad time:", end - start_compute_grad)
            optimizer.step()
            optimizer.zero_grad()
            model.zero_grad()

            # For every eval_interval minibatches, score the validation set.
            if minibatches_proccesed % args.eval_interval == 0:

                write_out("Testing model on validation set...")

                train_loss = loss_tracker.mean()
                loss_tracker = np.zeros(0)
                validation_loss, data_total, rmsd_avg, drmsd_avg = evaluate_model(
                    validation_loader, model)
                # Dump the first validation example's true and predicted
                # structures for visual inspection.
                prim = data_total[0][0]
                pos = data_total[0][1]
                pos_pred = data_total[0][3]
                if use_gpu:
                    pos = pos.cuda()
                    pos_pred = pos_pred.cuda()
                angles = calculate_dihedral_angels(pos, use_gpu)
                angles_pred = calculate_dihedral_angels(pos_pred, use_gpu)
                write_to_pdb(get_structure_from_angles(prim, angles), "test")
                write_to_pdb(get_structure_from_angles(prim, angles_pred),
                             "test_pred")
                if validation_loss < best_model_loss:
                    best_model_loss = validation_loss
                    best_model_minibatch_time = minibatches_proccesed
                    # NOTE(review): none of encoder_net, decoder_net,
                    # encoder_optimizer, decoder_optimizer, tot_eval_acc, e
                    # is defined in this function — this call raises
                    # NameError the first time a best model is found. It
                    # looks copy-pasted from another project; replace with
                    # this project's checkpoint-saving call.
                    best_model_path = saveModel(encoder_net, decoder_net,
                                                encoder_optimizer,
                                                decoder_optimizer, loss.item(),
                                                tot_eval_acc, e)

                write_out("Validation loss:", validation_loss, "Train loss:",
                          train_loss)
                # Bug fix: report the best loss seen so far, not the
                # current validation loss.
                write_out("Best model so far (validation loss): ",
                          best_model_loss, "at time",
                          best_model_minibatch_time)
                write_out("Best model stored at " + best_model_path)
                write_out("Minibatches processed:", minibatches_proccesed)
                sample_num.append(minibatches_proccesed)
                train_loss_values.append(train_loss)
                validation_loss_values.append(validation_loss)
                rmsd_avg_values.append(rmsd_avg)
                drmsd_avg_values.append(drmsd_avg)
                if not args.hide_ui:
                    # Context managers close the PDB files (the originals
                    # leaked both handles).
                    with open("output/protein_test_pred.pdb", "r") as pdb_pred:
                        pdb_data_pred = pdb_pred.read()
                    with open("output/protein_test.pdb", "r") as pdb_true:
                        pdb_data_true = pdb_true.read()
                    data = {}
                    data["pdb_data_pred"] = pdb_data_pred
                    data["pdb_data_true"] = pdb_data_true
                    data["validation_dataset_size"] = validation_dataset_size
                    data["sample_num"] = sample_num
                    data["train_loss_values"] = train_loss_values
                    data["validation_loss_values"] = validation_loss_values
                    # Angles converted to degrees; the slicing mirrors the
                    # original ([1:, 1] vs [:-1, 2]) — presumably because
                    # terminal residues lack phi/psi. TODO confirm.
                    data["phi_actual"] = [
                        math.degrees(float(v)) for v in angles[1:, 1]]
                    data["psi_actual"] = [
                        math.degrees(float(v)) for v in angles[:-1, 2]]
                    data["phi_predicted"] = [
                        math.degrees(float(v)) for v in angles_pred[1:, 1]]
                    data["psi_predicted"] = [
                        math.degrees(float(v)) for v in angles_pred[:-1, 2]]
                    data["drmsd_avg"] = drmsd_avg_values
                    data["rmsd_avg"] = rmsd_avg_values
                    res = requests.post('http://localhost:5000/graph',
                                        json=data)
                    if res.ok:
                        print(res.json())

                # Stop once past the warm-up budget and the model has gone
                # as long without improving as it took to reach the best.
                if minibatches_proccesed > args.minimum_updates and minibatches_proccesed > best_model_minibatch_time * 2:
                    stopping_condition_met = True
                    break
    write_result_summary(best_model_loss)
    return best_model_path
def log_reg_course2vec(training_set=None,
                       vec_size=150,
                       win_size=10,
                       min_count=2,
                       epochs=10,
                       num_classes_val=-1,
                       categories=False,
                       top_n=1):
    """Train and evaluate logistic regression on course2vec features.

    Loads (or trains, when `training_set` is given) a Word2Vec model for
    course histories, featurizes the train/val/test splits with it, fits a
    logistic-regression classifier, and reports test macro-F1 plus a bias
    evaluation.

    Args:
        training_set: corpus to train course2vec on; if falsy, a saved
            model is loaded from `get_course2vec_model_path(...)` instead.
        vec_size: Word2Vec embedding dimensionality.
        win_size: Word2Vec context window size.
        min_count: minimum token frequency for the Word2Vec vocabulary.
        epochs: course2vec training epochs (only used when training).
        num_classes_val: unused here; kept for interface compatibility.
        categories: if True, map degree labels to coarser categories.
        top_n: evaluate top-n accuracy/F1 in `util.evaluate_model`.

    Returns:
        (macro_f1, log_reg_model): the test-set macro-averaged F1 score
        and the fitted logistic-regression model.
    """
    print(
        f"\nRunning course2vec with logreg with vec_size={vec_size}, win_size={win_size}, min_count={min_count}, epochs={epochs}, num_classes_val={num_classes_val}"
    )

    # set up hyperparams, load model
    course2vec_model_path = get_course2vec_model_path(vec_size, win_size,
                                                      min_count)

    if training_set:
        course2vec_model = train_course2vec(training_set,
                                            course2vec_model_path,
                                            vec_size,
                                            win_size,
                                            min_count,
                                            epochs=epochs)
    else:
        course2vec_model = Word2Vec.load(course2vec_model_path)

    # Prep datasets. Bug fix: the original called prep_dataset_v3 twice
    # with identical arguments and took the train split from one call and
    # val/test from the other — duplicated work, and if the split is
    # randomized it risks train/test leakage. One call yields consistent
    # splits.
    X_train, X_val, X_test, y_train, y_val, y_test = util.prep_dataset_v3(
        num_classes_train=TRAIN_LENGTH,
        num_classes_predict=PREDICT_LENGTH,
        augmented=False)

    X_train = featurize_student(X_train['course_history'], course2vec_model,
                                vec_size)
    X_val = featurize_student(X_val['course_history'], course2vec_model,
                              vec_size)
    X_test = featurize_student(X_test['course_history'], course2vec_model,
                               vec_size)

    if categories:
        y_train = util.degrees_to_categories(y_train)
        y_val = util.degrees_to_categories(y_val)
        y_test = util.degrees_to_categories(y_test)

    # train and predict using logistic regression model
    train_score, log_reg_model = train_log_reg(list(X_train), y_train)
    print(f"train_score: {train_score}")
    macro_f1 = util.evaluate_model(X_test,
                                   y_test,
                                   log_reg_model,
                                   output_dict=True,
                                   top_n=top_n)['macro avg']['f1-score']
    print(
        util.evaluate_model(X_test,
                            y_test,
                            log_reg_model,
                            output_dict=False,
                            top_n=top_n))

    util.evaluate_model_bias(log_reg_model, (course2vec_model, vec_size),
                             evaluate_model_bias_single_df,
                             num_classes_predict=PREDICT_LENGTH,
                             categories=categories,
                             top_n=top_n,
                             test=True)

    return macro_f1, log_reg_model