Example #1
def train_model(data_set_identifier, train_file, val_file, learning_rate, minibatch_size, name):
    set_experiment_id(data_set_identifier, learning_rate, minibatch_size, name)

    train_loader = contruct_dataloader_from_disk(train_file, minibatch_size, use_evolutionary=True)
    validation_loader = contruct_dataloader_from_disk(val_file, minibatch_size, use_evolutionary=True)
    validation_dataset_size = validation_loader.dataset.__len__()
    train_dataset_size = train_loader.dataset.__len__()



    embedding_size = 21
    if configs.run_params["use_evolutionary"]:
        embedding_size = 42


    #Load in existing model if given as argument
    if args.model is not None:
        model_path = "output/models/" + args.model + ".model"
        model = load_model_from_disk(model_path, use_gpu)
    else:
        # else construct a new model from the config file
        model = construct_model(configs.model_params, embedding_size, use_gpu, minibatch_size)
    
    #optimizer parameters
    betas = tuple(configs.run_params["betas"])
    weight_decay = configs.run_params["weight_decay"]
    angle_lr = configs.run_params["angles_lr"]

    if configs.model_params['architecture'] == 'cnn_angles':
        optimizer = optim.Adam(model.parameters(), betas=betas, lr=learning_rate, weight_decay=weight_decay)
    else:
        optimizer = optim.Adam([
            {'params' : model.model.parameters(), 'lr':learning_rate},
            {'params' : model.soft_to_angle.parameters(), 'lr':angle_lr}], betas=betas, weight_decay=weight_decay)
    
    #print number of trainable parameters
    print_number_of_parameters(model)
    #For creating a summary table of the model (does not work on ExampleModel!)
    if configs.run_params["print_model_summary"]:
        if configs.model_params["architecture"] != 'rnn':
            summary(model, configs.run_params["max_sequence_length"], 2)
        else:
            write_out("DETAILED MODEL SUMMARY IS NOT SUPPORTED FOR RNN MODELS")
    
    if use_gpu:
        model = model.cuda()

    # TODO: is soft_to_angle.parameters() included here?

    sample_num = list()
    train_loss_values = list()
    validation_loss_values = list()
    rmsd_avg_values = list()
    drmsd_avg_values = list()
    break_point_values = list()

    breakpoints = configs.run_params['breakpoints']
    best_model_loss = 1e20
    best_model_train_loss = 1e20
    best_model_minibatch_time = None
    best_model_path = None
    stopping_condition_met = False
    minibatches_proccesed = 0

    loss_atoms = configs.run_params["loss_atoms"]
    start_time = time.time()
    max_time = configs.run_params["max_time"]
    C_epochs = configs.run_params["c_epochs"] # TODO: Change to parameter
    C_batch_updates = C_epochs

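    # Training loop: run until a stopping condition is met (max time, max updates, or no recent improvement)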
    while not stopping_condition_met:
        optimizer.zero_grad()
        model.zero_grad()
        loss_tracker = np.zeros(0)
        start_time_n_minibatches = time.time()
        for minibatch_id, training_minibatch in enumerate(train_loader, 0):
            minibatches_proccesed += 1
            training_minibatch = list(training_minibatch)
            primary_sequence, tertiary_positions, mask, p_id = training_minibatch[:-1]
            # Update C
            C = 1.0 if minibatches_proccesed >= C_batch_updates else float(minibatches_proccesed) / C_batch_updates

            # One-hot encode the amino acid sequence (PSSM profiles are concatenated below when use_evolutionary is set)
            amino_acids, batch_sizes = one_hot_encode(primary_sequence, 21, use_gpu)

            if configs.run_params["use_evolutionary"]:
                evolutionary = training_minibatch[-1]

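                # Pack, then re-pad the evolutionary (PSSM) profiles into a dense padded tensor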
                evolutionary, batch_sizes = torch.nn.utils.rnn.pad_packed_sequence(torch.nn.utils.rnn.pack_sequence(evolutionary))
                
                if use_gpu:
                    evolutionary = evolutionary.cuda()

                amino_acids = torch.cat((amino_acids, evolutionary.view(-1, len(batch_sizes) , 21)), 2)

            start_compute_loss = time.time()

            if configs.run_params["only_angular_loss"]:
                #raise NotImplementedError("Only_angular_loss function has not been implemented correctly yet.")
                loss = model.compute_angular_loss((amino_acids, batch_sizes), tertiary_positions, mask)
            else:
                loss = model.compute_loss((amino_acids, batch_sizes), tertiary_positions, mask, C=C, loss_atoms=loss_atoms)
            
            if C != 1:
                write_out("C:", C)
            write_out("Train loss:", float(loss))
            start_compute_grad = time.time()
            loss.backward()
            loss_tracker = np.append(loss_tracker, float(loss))
            end = time.time()
            write_out("Loss time:", start_compute_grad-start_compute_loss, "Grad time:", end-start_compute_grad)
            optimizer.step()
            optimizer.zero_grad()
            model.zero_grad()

            # every eval_interval minibatches, evaluate the model on the validation set
            if minibatches_proccesed % configs.run_params["eval_interval"] == 0:
                model.eval()
                write_out("Testing model on validation set...")
                train_loss = loss_tracker.mean()
                loss_tracker = np.zeros(0)
                validation_loss, data_total, rmsd_avg, drmsd_avg = evaluate_model(validation_loader,
                     model, use_gpu, loss_atoms, configs.run_params["use_evolutionary"])
                prim = data_total[0][0]
                pos = data_total[0][1]
                pos_pred = data_total[0][3]
                mask = data_total[0][4]
                pos = apply_mask(pos, mask)
                angles_pred = data_total[0][2]

                angles_pred = apply_mask(angles_pred, mask, size=3)

                pos_pred = apply_mask(pos_pred, mask)
                prim = torch.masked_select(prim, mask)

                if use_gpu:
                    pos = pos.cuda()
                    pos_pred = pos_pred.cuda()

                angles = calculate_dihedral_angels(pos, use_gpu)
                #angles_pred = calculate_dihedral_angels(pos_pred, use_gpu)
                #angles_pred = data_total[0][2] # Use angles output from model - calculate_dihedral_angels(pos_pred, use_gpu)

                write_to_pdb(get_structure_from_angles(prim, angles), "test")
                write_to_pdb(get_structure_from_angles(prim, angles_pred), "test_pred")
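                # Checkpoint the model whenever the validation loss improves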
                if validation_loss < best_model_loss:
                    best_model_loss = validation_loss
                    best_model_minibatch_time = minibatches_proccesed
                    best_model_path = write_model_to_disk(model)

                if train_loss < best_model_train_loss:
                    best_model_train_loss = train_loss
                    best_model_train_path = write_model_to_disk(model, model_type="train")

                write_out("Validation loss:", validation_loss, "Train loss:", train_loss)
                write_out("Best model so far (validation loss): ", best_model_loss, "at time", best_model_minibatch_time)
                write_out("Best model stored at " + best_model_path)
                write_out("Best model train stored at " + best_model_train_path)
                write_out("Minibatches processed:",minibatches_proccesed)

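                # Estimate remaining time until max_updates from the time spent on the last eval_interval minibatches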
                end_time_n_minibatches = time.time()
                n_minibatches_time_used = end_time_n_minibatches - start_time_n_minibatches
                minibatches_left = configs.run_params["max_updates"] - minibatches_proccesed
                seconds_left = int(n_minibatches_time_used * (minibatches_left/configs.run_params["eval_interval"]))
                
                m, s = divmod(seconds_left, 60)
                h, m = divmod(m, 60)
                write_out("Estimated time until maximum number of updates:", '{:d}:{:02d}:{:02d}'.format(h, m, s) )
                sample_num.append(minibatches_proccesed)
                train_loss_values.append(train_loss)
                validation_loss_values.append(validation_loss)
                rmsd_avg_values.append(rmsd_avg)
                drmsd_avg_values.append(drmsd_avg)
                
                if breakpoints and minibatches_proccesed > breakpoints[0]:
                    break_point_values.append(drmsd_avg)
                    breakpoints = breakpoints[1:]

                data = {}
                data["pdb_data_pred"] = open("output/protein_test_pred.pdb","r").read()
                data["pdb_data_true"] = open("output/protein_test.pdb","r").read()
                data["validation_dataset_size"] = validation_dataset_size
                data["sample_num"] = sample_num
                data["train_loss_values"] = train_loss_values
                data["break_point_values"] = break_point_values
                data["validation_loss_values"] = validation_loss_values
                data["phi_actual"] = list([math.degrees(float(v)) for v in angles[1:,1]])
                data["psi_actual"] = list([math.degrees(float(v)) for v in angles[:-1,2]])
                data["phi_predicted"] = list([math.degrees(float(v)) for v in angles_pred[1:,1]])
                data["psi_predicted"] = list([math.degrees(float(v)) for v in angles_pred[:-1,2]])
                data["drmsd_avg"] = drmsd_avg_values
                data["rmsd_avg"] = rmsd_avg_values
                if not configs.run_params["hide_ui"]:
                    res = requests.post('http://localhost:5000/graph', json=data)
                    if res.ok:
                        print(res.json())
                
                # Save run data
                write_run_to_disk(data)

                #Check if maximum time is reached.
                start_time_n_minibatches = time.time()
                time_used = time.time() - start_time

                time_condition = (max_time is not None and time_used > max_time)
                max_update_condition = minibatches_proccesed >= configs.run_params["max_updates"]
                min_update_condition = (minibatches_proccesed > configs.run_params["min_updates"] and minibatches_proccesed > best_model_minibatch_time * 2)

                model.train()
                #Checking for stop conditions
                if time_condition or max_update_condition or min_update_condition:
                    stopping_condition_met = True
                    break
    write_out("Best validation model found after" , best_model_minibatch_time , "minibatches.")
    write_result_summary(best_model_loss)
    return best_model_path
Example #2
def train_model(data_set_identifier, model, train_loader, validation_loader,
                learning_rate, minibatch_size=64, eval_interval=50, hide_ui=False,
                use_gpu=False, minimum_updates=1000,
                optimizer_type='adam', restart=False):
    set_experiment_id(data_set_identifier, learning_rate, minibatch_size)

    validation_dataset_size = validation_loader.dataset.__len__()

    if use_gpu:
        model = model.cuda()

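    # Choose the optimizer from optimizer_type; anything unrecognized falls back to AdamW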
    if optimizer_type == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    elif optimizer_type == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    elif optimizer_type == 'rmsprop':
        optimizer = optim.RMSprop(model.parameters(), lr=learning_rate)
    else:
        optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

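    # Optionally use cosine annealing with warm restarts (the scheduler is stepped once per minibatch below)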
    if restart:
        scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=32)

    sample_num = list()
    train_loss_values = list()
    train_drmsd_values = list()
    validation_loss_values = list()
    validation_angles_loss_values = list()
    best_model_loss = 1e20
    best_model_minibatch_time = None
    best_model_path = None
    best_json_data = None
    stopping_condition_met = False
    minibatches_proccesed = 0

    while not stopping_condition_met:
    # for i in range(2):
        optimizer.zero_grad()
        model.zero_grad()
        loss_tracker = np.zeros(0)
        drmsd_tracker = np.zeros(0)
        for _minibatch_id, training_minibatch in enumerate(train_loader, 0):
            minibatches_proccesed += 1
            start_compute_loss = time.time()
            loss, drmsd_avg = model.compute_loss(training_minibatch)
            write_out("Train loss:", float(loss))
            start_compute_grad = time.time()
            loss.backward()
            loss_tracker = np.append(loss_tracker, float(loss))
            drmsd_tracker = np.append(drmsd_tracker, float(drmsd_avg))
            end = time.time()
            write_out("Loss time:", start_compute_grad - start_compute_loss, "Grad time:",
                      end - start_compute_grad)
            optimizer.step()
            if restart:
                scheduler.step()
            optimizer.zero_grad()
            model.zero_grad()

            # every eval_interval minibatches, evaluate the model on the validation set
            if minibatches_proccesed % eval_interval == 0:

                write_out("Testing model on validation set...")

                train_loss = float(loss_tracker.mean())
                train_drmsd = float(drmsd_tracker.mean())
                loss_tracker = np.zeros(0)
                drmsd_tracker = np.zeros(0)
                validation_loss, json_data, _, validation_angles_loss = model.evaluate_model(validation_loader)

                if validation_loss < best_model_loss:
                    best_model_loss = validation_loss
                    best_model_minibatch_time = minibatches_proccesed
                    best_model_path = write_model_to_disk(model)
                    best_json_data = json_data

                write_out("Validation loss:", validation_loss, "Train loss:", train_loss, "Train drmsd:", train_drmsd)
                write_out("Best model so far (validation loss): ", best_model_loss, "at time",
                          best_model_minibatch_time)
                write_out("Best model stored at " + best_model_path)
                write_out("Minibatches processed:", minibatches_proccesed)
                sample_num.append(minibatches_proccesed)
                train_loss_values.append(train_loss)
                train_drmsd_values.append(train_drmsd)
                validation_loss_values.append(validation_loss)
                validation_angles_loss_values.append(validation_angles_loss)
                json_data["validation_dataset_size"] = validation_dataset_size
                json_data["sample_num"] = sample_num
                json_data["train_loss_values"] = train_loss_values
                json_data["train_drmsd_values"] = train_drmsd_values
                json_data["validation_loss_values"] = validation_loss_values
                json_data['validation_angles_loss_values'] = validation_angles_loss_values

                write_out(json_data)

                if not hide_ui:
                    res = requests.post('http://localhost:5000/graph', json=json_data)
                    if res.ok:
                        print(res.json())

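                # Early stopping: stop once minimum_updates have passed and the validation loss
                # has not improved for another minimum_updates minibatches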
                if minibatches_proccesed > minimum_updates and minibatches_proccesed \
                        >= best_model_minibatch_time + minimum_updates:
                    stopping_condition_met = True
                    break
    write_result_summary(best_model_loss)
    write_result_summary(json.dumps(best_json_data))
    return best_model_path
Example #3
def run_experiment(parser, use_gpu):
    parser.add_argument('--minibatch-size-validation',
                        dest='minibatch_size_validation',
                        type=int,
                        default=8,
                        help='Size of each minibatch during evaluation.')
    parser.add_argument('--hidden-size',
                        dest='hidden_size',
                        type=int,
                        default=64,
                        help='Hidden size.')
    parser.add_argument('--learning-rate',
                        dest='learning_rate',
                        type=float,
                        default=0.0002,
                        help='Learning rate to use during training.')
    parser.add_argument('--cv-partition',
                        dest='cv_partition',
                        type=int,
                        default=0,
                        help='Run a particular cross validation rotation.')
    parser.add_argument('--model-mode',
                        dest='model_mode',
                        type=int,
                        default=2,
                        help='Which model to use.')
    parser.add_argument('--input-data',
                        dest='input_data',
                        type=str,
                        default='data/raw/TMHMM3.train.3line.latest',
                        help='Path of input data file.')
    parser.add_argument('--pre-trained-model-paths',
                        dest='pre_trained_model_paths',
                        type=str,
                        default=None,
                        help='Paths of pre-trained models.')
    parser.add_argument('--profile-path', dest='profile_path',
                        type=str, default="",
                        help='Profiles to use for embedding.')
    args, _unknown = parser.parse_known_args()

    result_matrices = np.zeros((5, 5), dtype=np.int64)

    if args.model_mode == 0:
        model_mode = TMHMM3Mode.LSTM
    elif args.model_mode == 1:
        model_mode = TMHMM3Mode.LSTM_CRF
    elif args.model_mode == 2:
        model_mode = TMHMM3Mode.LSTM_CRF_HMM
    elif args.model_mode == 3:
        model_mode = TMHMM3Mode.LSTM_CRF_MARG
    else:
        print("ERROR: No model defined")

    print("Using model:", model_mode)

    if args.profile_path != "":
        embedding = "PROFILE"
    else:
        embedding = "BLOSUM62"
    use_marg_prob = False
    all_prediction_data = []

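    # Iterate over the five cross-validation partition rotations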
    for cv_partition in [0, 1, 2, 3, 4]:
        # prepare data sets
        train_set, val_set, test_set = load_data_from_disk(filename=args.input_data,
                                                           partition_rotation=cv_partition)

        # topology data set
        train_set_topology = list(filter(lambda x: x[3] == 0 or x[3] == 1, train_set))
        val_set_topology = list(filter(lambda x: x[3] == 0 or x[3] == 1, val_set))
        test_set_topology = list(filter(lambda x: x[3] == 0 or x[3] == 1, test_set))

        if not args.silent:
            print("Loaded ",
                  len(train_set), "training,",
                  len(val_set), "validation and",
                  len(test_set), "test samples")

        print("Processing data...")
        pre_processed_path = "data/preprocessed/preprocessed_data_" + str(
            hashlib.sha256(args.input_data.encode()).hexdigest())[:8] + "_cv" \
                             + str(cv_partition) + ".pickle"
        if not os.path.isfile(pre_processed_path):
            input_data_processed = [TMDataset.from_disk(data_set, use_gpu) for data_set in
                                    [train_set, val_set, test_set,
                                     train_set_topology, val_set_topology,
                                     test_set_topology]]
            pickle.dump(input_data_processed, open(pre_processed_path, "wb"))
        input_data_processed = pickle.load(open(pre_processed_path, "rb"))
        train_preprocessed_set = input_data_processed[0]
        validation_preprocessed_set = input_data_processed[1]
        test_preprocessed_set = input_data_processed[2]
        train_preprocessed_set_topology = input_data_processed[3]
        validation_preprocessed_set_topology = input_data_processed[4]
        _test_preprocessed_set_topology = input_data_processed[5]

        print("Completed preprocessing of data...")

        train_loader = tm_contruct_dataloader_from_disk(train_preprocessed_set,
                                                        args.minibatch_size,
                                                        balance_classes=True)
        validation_loader = tm_contruct_dataloader_from_disk(validation_preprocessed_set,
                                                             args.minibatch_size_validation,
                                                             balance_classes=True)
        test_loader = tm_contruct_dataloader_from_disk(
            test_preprocessed_set if args.evaluate_on_test else validation_preprocessed_set,
            args.minibatch_size_validation)

        train_loader_topology = \
            tm_contruct_dataloader_from_disk(train_preprocessed_set_topology,
                                             args.minibatch_size)
        validation_loader_topology = \
            tm_contruct_dataloader_from_disk(validation_preprocessed_set_topology,
                                             args.minibatch_size_validation)

        type_predictor_model_path = None

        if args.pre_trained_model_paths is None:
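        # Train two models per partition: first the type predictor, then the topology predictor, which reuses it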
            for (experiment_id, train_data, validation_data) in [
                    ("TRAIN_TYPE_CV" + str(cv_partition) + "-" + str(model_mode)
                     + "-HS" + str(args.hidden_size) + "-F" + str(args.input_data.split(".")[-2])
                     + "-P" + str(args.profile_path.split("_")[-1]), train_loader,
                     validation_loader),
                    ("TRAIN_TOPOLOGY_CV" + str(cv_partition) + "-" + str(model_mode)
                     + "-HS" + str(args.hidden_size) + "-F" + str(args.input_data.split(".")[-2])
                     + "-P" + str(args.profile_path.split("_")[-1]),
                     train_loader_topology, validation_loader_topology)]:

                type_predictor = None
                if type_predictor_model_path is not None:
                    type_predictor = load_model_from_disk(type_predictor_model_path,
                                                          force_cpu=False)
                    model = load_model_from_disk(type_predictor_model_path,
                                                 force_cpu=False)
                    model.type_classifier = type_predictor
                    model.type_01loss_values = []
                    model.topology_01loss_values = []
                else:
                    model = TMHMM3(
                        embedding,
                        args.hidden_size,
                        use_gpu,
                        model_mode,
                        use_marg_prob,
                        type_predictor,
                        args.profile_path)

                model_path = train_model(data_set_identifier=experiment_id,
                                         model=model,
                                         train_loader=train_data,
                                         validation_loader=validation_data,
                                         learning_rate=args.learning_rate,
                                         minibatch_size=args.minibatch_size,
                                         eval_interval=args.eval_interval,
                                         hide_ui=args.hide_ui,
                                         use_gpu=use_gpu,
                                         minimum_updates=args.minimum_updates)

                # let the GC collect the model
                del model

                write_out(model_path)

                # if we just trained a type predictor, save it for later
                if "TRAIN_TYPE" in experiment_id:
                    type_predictor_model_path = model_path
        else:
            # use the pre-trained model
            model_path = args.pre_trained_model_paths.split(",")[cv_partition]

        # test model
        write_out("Testing model...")
        model = load_model_from_disk(model_path, force_cpu=False)
        _loss, json_data, prediction_data = model.evaluate_model(test_loader)

        all_prediction_data.append(post_process_prediction_data(prediction_data))
        result_matrix = np.array(json_data['confusion_matrix'])
        result_matrices += result_matrix
        write_out(result_matrix)

    set_experiment_id(
        "TEST-" + str(model_mode) + "-HS" + str(args.hidden_size) + "-F"
        + str(args.input_data.split(".")[-2]),
        args.learning_rate,
        args.minibatch_size)
    write_out(result_matrices)
    write_prediction_data_to_disk("\n".join(all_prediction_data))
Example #4
def run_experiment(parser, use_gpu):
    # parse experiment specific command line arguments
    parser.add_argument('--learning-rate',
                        dest='learning_rate',
                        type=float,
                        default=0.001,
                        help='Learning rate to use during training.')
    parser.add_argument('--embed-size',
                        dest='embed_size',
                        type=int,
                        default=21,
                        help='Embedding size.')
    args, _unknown = parser.parse_known_args()

    all_prediction_data = []
    result_matrices = []
    # pre-process data
    preprocessed_training_file = process_single_raw_data(
        training_file, use_gpu=use_gpu, force_pre_processing_overwrite=False)
    preprocessed_validation_file = process_single_raw_data(
        validation_file, use_gpu=use_gpu, force_pre_processing_overwrite=False)
    preprocessed_test_file = process_single_raw_data(
        test_file, use_gpu=use_gpu, force_pre_processing_overwrite=False)

    # run experiment

    # model = ExampleModel(args.embed_size, args.minibatch_size, use_gpu=use_gpu)  # embed size = 21
    # model = SimpleRCNN(args.embed_size, args.minibatch_size, use_gpu=use_gpu)  # embed size = 21
    model = DeepResRCNN_100(args.embed_size,
                            args.minibatch_size,
                            use_gpu=use_gpu)  # embed size = 21

    train_loader = contruct_dataloader_from_disk(preprocessed_training_file,
                                                 args.minibatch_size)
    validation_loader = contruct_dataloader_from_disk(
        preprocessed_validation_file, args.minibatch_size)

    train_model_path = train_model(data_set_identifier="TRAIN",
                                   model=model,
                                   train_loader=train_loader,
                                   validation_loader=validation_loader,
                                   learning_rate=args.learning_rate,
                                   minibatch_size=args.minibatch_size,
                                   eval_interval=args.eval_interval,
                                   hide_ui=args.hide_ui,
                                   use_gpu=use_gpu,
                                   minimum_updates=args.minimum_updates)

    print(train_model_path)

    # test model
    test_loader = contruct_dataloader_from_disk(preprocessed_test_file,
                                                args.minibatch_size)
    write_out("Testing model...")
    model = load_model_from_disk(train_model_path, force_cpu=False)
    _loss, json_data, _ = model.evaluate_model(test_loader)

    all_prediction_data.append(json_data)
    # all_prediction_data.append(model.post_process_prediction_data(prediction_data))
    result_matrix = np.array(json_data['confusion_matrix'])
    result_matrices += result_matrix
    write_out(result_matrix)

    set_experiment_id(
        "TEST-" + str(args.hidden_size) + "-F" +
        str(args.input_data.split(".")[-2]), args.learning_rate,
        args.minibatch_size)
    write_out(result_matrices)
    write_prediction_data_to_disk("\n".join(all_prediction_data))
Example #5
def train_model(data_set_identifier,
                model,
                train_loader,
                validation_loader,
                learning_rate,
                minibatch_size=64,
                eval_interval=50,
                hide_ui=False,
                use_gpu=False,
                minimum_updates=100):
    set_experiment_id(data_set_identifier, learning_rate, minibatch_size)

    validation_dataset_size = validation_loader.dataset.__len__()

    if use_gpu:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    sample_num = list()
    train_loss_values = list()
    validation_loss_values = list()

    best_model_loss = 1e20
    best_model_minibatch_time = None
    best_model_path = None
    _best_json_data = None
    stopping_condition_met = False
    minibatches_proccesed = 0

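    # Train until at least minimum_updates minibatches have run and the validation loss
    # has not improved for another minimum_updates minibatches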
    while not stopping_condition_met:
        optimizer.zero_grad()
        model.zero_grad()
        loss_tracker = np.zeros(0)
        for _minibatch_id, training_minibatch in enumerate(train_loader, 0):
            minibatches_proccesed += 1
            start_compute_loss = time.time()
            loss = model.compute_loss(training_minibatch,
                                      minibatches_proccesed, minimum_updates)
            write_out("Train loss:", float(loss))
            start_compute_grad = time.time()
            loss.backward()
            loss_tracker = np.append(loss_tracker, float(loss))
            end = time.time()
            write_out("Loss time:", start_compute_grad - start_compute_loss,
                      "Grad time:", end - start_compute_grad)
            optimizer.step()
            optimizer.zero_grad()
            model.zero_grad()

            # every eval_interval minibatches, evaluate the model on the validation set
            if minibatches_proccesed % eval_interval == 0:

                write_out("Testing model on validation set...")

                train_loss = float(loss_tracker.mean())
                loss_tracker = np.zeros(0)
                validation_loss, json_data, _ = model.evaluate_model(
                    validation_loader)

                if validation_loss < best_model_loss:
                    best_model_loss = validation_loss
                    best_model_minibatch_time = minibatches_proccesed
                    best_model_path = write_model_to_disk(model)
                    _best_json_data = json_data

                write_out("Validation loss:", validation_loss, "Train loss:",
                          train_loss)
                write_out("Best model so far (validation loss): ",
                          best_model_loss, "at time",
                          best_model_minibatch_time)
                write_out("Best model stored at " + best_model_path)
                write_out("Minibatches processed:", minibatches_proccesed)
                sample_num.append(minibatches_proccesed)
                train_loss_values.append(train_loss)
                validation_loss_values.append(validation_loss)

                json_data["validation_dataset_size"] = validation_dataset_size
                json_data["sample_num"] = sample_num
                json_data["train_loss_values"] = train_loss_values
                json_data["validation_loss_values"] = validation_loss_values

                if not hide_ui:
                    write_out(
                        "Updating monitoring service:",
                        str(json_data) if len(str(json_data)) < 50 else
                        str(json_data)[:50] + "...")
                    res = requests.post('http://localhost:5000/graph',
                                        json=json_data)
                    if res.ok:
                        write_out("Received response from monitoring service:",
                                  res.json())

                if minibatches_proccesed > minimum_updates and minibatches_proccesed \
                        >= best_model_minibatch_time + minimum_updates:
                    stopping_condition_met = True
                    break
    write_result_summary(best_model_loss)
    # write_result_summary(json.dumps(_best_json_data))
    return best_model_path
Example #6
def train_model(data_set_identifier, train_file, val_file, learning_rate,
                minibatch_size):
    set_experiment_id(data_set_identifier, learning_rate, minibatch_size)

    train_loader = contruct_dataloader_from_disk(train_file, minibatch_size)
    validation_loader = contruct_dataloader_from_disk(val_file, minibatch_size)
    validation_dataset_size = validation_loader.dataset.__len__()

    model = ExampleModel(21, minibatch_size,
                         use_gpu=use_gpu)  # embed size = 21

    # TODO: is soft_to_angle.parameters() included here?
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    sample_num = list()
    train_loss_values = list()
    validation_loss_values = list()
    rmsd_avg_values = list()
    drmsd_avg_values = list()

    best_model_loss = 1.1
    best_model_minibatch_time = None
    best_model_path = None
    stopping_condition_met = False
    minibatches_proccesed = 0

    while not stopping_condition_met:
        optimizer.zero_grad()
        model.zero_grad()
        loss_tracker = np.zeros(0)
        for minibatch_id, training_minibatch in enumerate(train_loader, 0):
            minibatches_proccesed += 1
            primary_sequence, tertiary_positions, mask = training_minibatch
            start_compute_loss = time.time()
            loss = model.compute_loss(primary_sequence, tertiary_positions)
            write_out("Train loss:", float(loss))
            start_compute_grad = time.time()
            loss.backward()
            loss_tracker = np.append(loss_tracker, float(loss))
            end = time.time()
            write_out("Loss time:", start_compute_grad - start_compute_loss,
                      "Grad time:", end - start_compute_grad)
            optimizer.step()
            optimizer.zero_grad()
            model.zero_grad()

            # every eval_interval minibatches, evaluate the model on the validation set
            if minibatches_proccesed % args.eval_interval == 0:

                train_loss = loss_tracker.mean()
                loss_tracker = np.zeros(0)
                validation_loss, data_total, rmsd_avg, drmsd_avg = evaluate_model(
                    validation_loader, model)
                prim = data_total[0][0]
                pos = data_total[0][1]
                (aa_list, phi_list, psi_list,
                 omega_list) = calculate_dihedral_angels(prim, pos)
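                # Export the true and predicted structures as PDB files and load them into PyMOL for visualization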
                write_to_pdb(
                    get_structure_from_angles(aa_list, phi_list[1:],
                                              psi_list[:-1], omega_list[:-1]),
                    "test")
                cmd.load("output/protein_test.pdb")
                write_to_pdb(data_total[0][3], "test_pred")
                cmd.load("output/protein_test_pred.pdb")
                cmd.forward()
                cmd.orient()
                if validation_loss < best_model_loss:
                    best_model_loss = validation_loss
                    best_model_minibatch_time = minibatches_proccesed
                    best_model_path = write_model_to_disk(model)

                write_out("Validation loss:", validation_loss, "Train loss:",
                          train_loss)
                write_out("Best model so far (label loss): ", validation_loss,
                          "at time", best_model_minibatch_time)
                write_out("Best model stored at " + best_model_path)
                write_out("Minibatches processed:", minibatches_proccesed)
                sample_num.append(minibatches_proccesed)
                train_loss_values.append(train_loss)
                validation_loss_values.append(validation_loss)
                rmsd_avg_values.append(rmsd_avg)
                drmsd_avg_values.append(drmsd_avg)
                if args.live_plot:
                    data = {}
                    data["validation_dataset_size"] = validation_dataset_size
                    data["sample_num"] = sample_num
                    data["train_loss_values"] = train_loss_values
                    data["validation_loss_values"] = validation_loss_values
                    data["phi_actual"] = list(
                        [math.degrees(float(v)) for v in phi_list[1:]])
                    data["psi_actual"] = list(
                        [math.degrees(float(v)) for v in psi_list[:-1]])
                    data["phi_predicted"] = list([
                        math.degrees(float(v)) for v in data_total[0]
                        [2].detach().transpose(0, 1)[0][1:]
                    ])
                    data["psi_predicted"] = list([
                        math.degrees(float(v)) for v in data_total[0]
                        [2].detach().transpose(0, 1)[1][:-1]
                    ])
                    data["drmsd_avg"] = drmsd_avg_values
                    data["rmsd_avg"] = rmsd_avg_values
                    res = requests.post('http://localhost:5000/graph',
                                        json=data)
                    if res.ok:
                        print(res.json())

                if minibatches_proccesed > args.minimum_updates and minibatches_proccesed > best_model_minibatch_time * 2:
                    stopping_condition_met = True
                    break
    write_result_summary(best_model_loss)
    return best_model_path
Example #7
def train_model(data_set_identifier, train_file, val_file, learning_rate,
                minibatch_size):
    set_experiment_id(data_set_identifier, learning_rate, minibatch_size)

    train_loader = contruct_dataloader_from_disk(train_file, minibatch_size)
    validation_loader = contruct_dataloader_from_disk(val_file, minibatch_size)
    validation_dataset_size = validation_loader.dataset.__len__()

    model = ExampleModel(21, minibatch_size,
                         use_gpu=use_gpu)  # embed size = 21
    if use_gpu:
        model = model.cuda()

    # TODO: is soft_to_angle.parameters() included here?
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    sample_num = list()
    train_loss_values = list()
    validation_loss_values = list()
    rmsd_avg_values = list()
    drmsd_avg_values = list()

    best_model_loss = 1e20
    best_model_minibatch_time = None
    best_model_path = None
    stopping_condition_met = False
    minibatches_proccesed = 0

    while not stopping_condition_met:
        optimizer.zero_grad()
        model.zero_grad()
        loss_tracker = np.zeros(0)
        for minibatch_id, training_minibatch in enumerate(train_loader, 0):
            minibatches_proccesed += 1
            primary_sequence, tertiary_positions, mask = training_minibatch
            start_compute_loss = time.time()
            loss = model.compute_loss(primary_sequence, tertiary_positions)
            write_out("Train loss:", float(loss))
            start_compute_grad = time.time()
            loss.backward()
            loss_tracker = np.append(loss_tracker, float(loss))
            end = time.time()
            write_out("Loss time:", start_compute_grad - start_compute_loss,
                      "Grad time:", end - start_compute_grad)
            optimizer.step()
            optimizer.zero_grad()
            model.zero_grad()

            # every eval_interval minibatches, evaluate the model on the validation set
            if minibatches_proccesed % args.eval_interval == 0:

                write_out("Testing model on validation set...")

                train_loss = loss_tracker.mean()
                loss_tracker = np.zeros(0)
                validation_loss, data_total, rmsd_avg, drmsd_avg = evaluate_model(
                    validation_loader, model)
                prim = data_total[0][0]
                pos = data_total[0][1]
                pos_pred = data_total[0][3]
                if use_gpu:
                    pos = pos.cuda()
                    pos_pred = pos_pred.cuda()
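                # Compute dihedral angles for the true and predicted positions, then export both structures as PDB files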
                angles = calculate_dihedral_angels(pos, use_gpu)
                angles_pred = calculate_dihedral_angels(pos_pred, use_gpu)
                write_to_pdb(get_structure_from_angles(prim, angles), "test")
                write_to_pdb(get_structure_from_angles(prim, angles_pred),
                             "test_pred")
                if validation_loss < best_model_loss:
                    best_model_loss = validation_loss
                    best_model_minibatch_time = minibatches_proccesed
                    best_model_path = write_model_to_disk(model)

                write_out("Validation loss:", validation_loss, "Train loss:",
                          train_loss)
                write_out("Best model so far (validation loss): ",
                          validation_loss, "at time",
                          best_model_minibatch_time)
                write_out("Best model stored at " + best_model_path)
                write_out("Minibatches processed:", minibatches_proccesed)
                sample_num.append(minibatches_proccesed)
                train_loss_values.append(train_loss)
                validation_loss_values.append(validation_loss)
                rmsd_avg_values.append(rmsd_avg)
                drmsd_avg_values.append(drmsd_avg)
                if not args.hide_ui:
                    data = {}
                    data["pdb_data_pred"] = open(
                        "output/protein_test_pred.pdb", "r").read()
                    data["pdb_data_true"] = open("output/protein_test.pdb",
                                                 "r").read()
                    data["validation_dataset_size"] = validation_dataset_size
                    data["sample_num"] = sample_num
                    data["train_loss_values"] = train_loss_values
                    data["validation_loss_values"] = validation_loss_values
                    data["phi_actual"] = list(
                        [math.degrees(float(v)) for v in angles[1:, 1]])
                    data["psi_actual"] = list(
                        [math.degrees(float(v)) for v in angles[:-1, 2]])
                    data["phi_predicted"] = list(
                        [math.degrees(float(v)) for v in angles_pred[1:, 1]])
                    data["psi_predicted"] = list(
                        [math.degrees(float(v)) for v in angles_pred[:-1, 2]])
                    data["drmsd_avg"] = drmsd_avg_values
                    data["rmsd_avg"] = rmsd_avg_values
                    res = requests.post('http://localhost:5000/graph',
                                        json=data)
                    if res.ok:
                        print(res.json())

                if minibatches_proccesed > args.minimum_updates and minibatches_proccesed > best_model_minibatch_time * 2:
                    stopping_condition_met = True
                    break
    write_result_summary(best_model_loss)
    return best_model_path