示例#1
0
def main():
    """Run the stored best model on the test split and save its predictions.

    The training split is loaded first because it determines
    ``dynamic_args['num_classes']``, which the test labels are one-hot
    encoded with, and (when biological features are used)
    ``dynamic_args['num_bio_feats']``.

    Parameter settings whose id contains ``"deepsol1"`` feed the model the
    padded sequences only; every other setting feeds it the padded sequences
    together with the biological feature matrix.
    """
    data = load_data()
    sequence_only = "deepsol1" in static_args.parameter_setting_id

    # ---- training split: only needed for the derived dimensions ----
    train_split = data['train']
    if sequence_only:
        train_seqs, train_labels = train_split['src'], train_split['tgt']
    else:
        train_seqs = train_split['src']
        train_bio = train_split['src_bio']
        train_labels = train_split['tgt']
    dynamic_args['num_classes'] = len(set(train_labels))

    train_seqs = np.array(
        utils.pad_sequecnes(train_seqs, static_args.maxlen))
    if not sequence_only:
        # The last column of the bio matrix is dropped.
        # NOTE(review): presumably it is not a model feature -- confirm.
        train_bio = np.array(train_bio)[:, :-1]
        dynamic_args['num_bio_feats'] = int(train_bio.shape[1])
    # The encoded training labels are not used afterwards; the call is kept
    # so the function performs the same work as before.
    utils.get_one_hot(train_labels, dynamic_args['num_classes'])

    print('Training data: ', train_seqs.shape)
    if not sequence_only:
        print('Training data Bio: ', train_bio.shape)
        print(train_bio[1])

    # ---- test split ----
    test_split = data['test']
    if sequence_only:
        test_seqs, test_labels = test_split['src'], test_split['tgt']
    else:
        test_seqs = test_split['src']
        test_bio = test_split['src_bio']
        test_labels = test_split['tgt']
    test_seqs = np.array(utils.pad_sequecnes(test_seqs, static_args.maxlen))
    if not sequence_only:
        test_bio = np.array(test_bio)[:, :-1]
    test_targets = utils.get_one_hot(test_labels,
                                     dynamic_args['num_classes'])

    print('Test data: ', test_seqs.shape)
    if not sequence_only:
        print('Test data Bio: ', test_bio.shape)

    # ---- prediction with the previously saved model ----
    best_model = utils.load_model(get_model_path())
    model_inputs = test_seqs if sequence_only else [test_seqs, test_bio]
    pred_test, pred_prob_test = get_classification_prediction(
        best_model, model_inputs, test_targets)

    print('Finished testing')
    # Save on disk
    get_classification_performance_path()
    save_classification_prediction(pred_test, pred_prob_test)
示例#2
0
def main():
    """Train the sequence-only DeepSol model and record its performance.

    Steps: load the train/valid/test splits, pad the sequences to
    ``static_args.maxlen`` and one-hot encode the labels, fit the model with
    the validation split as ``validation_data``, reload the model stored at
    ``get_model_path()`` and write validation/test metrics plus the test
    predictions to disk. Any existing results file is deleted first.
    """
    data = load_data()

    def encode(split):
        """Return (padded sequence array, one-hot labels) for one split."""
        padded = np.array(
            utils.pad_sequecnes(split['src'], static_args.maxlen))
        targets = utils.get_one_hot(split['tgt'],
                                    dynamic_args['num_classes'])
        return padded, targets

    # The number of classes comes from the training labels and must be set
    # before any split is one-hot encoded.
    train_split = data['train']
    dynamic_args['num_classes'] = len(set(train_split['tgt']))

    x_train, y_train = encode(train_split)
    print('Training data: ', x_train.shape)

    x_val, y_val = encode(data['valid'])
    print('Validation data: ', x_val.shape)

    x_test, y_test = encode(data['test'])
    print('Test data: ', x_test.shape)

    network = Models.DeepSol(static_args, dynamic_args).fetch_model_def()
    network.compile(loss='binary_crossentropy',
                    optimizer=get_optimizer(),
                    metrics=['accuracy'])
    print(network.summary())

    # Training
    network.fit(x_train,
                y_train,
                batch_size=dynamic_args['batch_size'],
                epochs=int(static_args.epochs),
                validation_data=(x_val, y_val),
                callbacks=get_callbacks())

    # Evaluate the model reloaded from get_model_path() rather than the
    # in-memory one from the last epoch.
    best_model = utils.load_model(get_model_path())
    val_metrics = get_classification_performance(best_model, x_val, y_val)
    acc_val, _report_val, _cm_val, _pred_val = val_metrics
    test_metrics = get_classification_performance(best_model, x_test, y_test)
    acc_test, score_report_test, cm_test, pred_test = test_metrics

    # Start from a clean results file
    results_path = get_classification_performance_path()
    if os.path.exists(results_path):
        os.remove(results_path)

    # Save on disk
    report_lines = (
        ('Validation Accuracy: ', acc_val),
        ('Test Accuracy: ', acc_test),
        ('Score Report Test: : ', score_report_test),
        ('Confusion Matrix test: ', cm_test),
    )
    for label, value in report_lines:
        save_classification_performance(label, str(value))
    save_classification_prediction(pred_test)
示例#3
0
def main():
    """Train DeepSol on sequence and/or biological features and save results.

    Loads the train/valid/test splits, pads the sequences to
    ``static_args.maxlen``, drops the last column of each biological feature
    matrix, one-hot encodes the labels, then fits the model. Which inputs
    are fed to the model depends on ``dynamic_args['biofeats']`` and
    ``dynamic_args['protein_seq_feats']``: both set -> ``[sequences, bio]``,
    only one set -> that input alone. After training, the model stored at
    ``get_model_path()`` is reloaded and evaluated on the validation and
    test splits; metrics and test predictions are written to disk.

    Raises:
        ValueError: if neither ``dynamic_args['biofeats']`` nor
            ``dynamic_args['protein_seq_feats']`` is set, since there would
            be nothing to train on.
    """
    data = load_data()
    x_train, x_train_bio, y_train = (data['train']['src'],
                                     data['train']['src_bio'],
                                     data['train']['tgt'])
    dynamic_args['num_classes'] = len(set(y_train))

    # Balanced per-class weights from the raw training labels. Keyword
    # arguments are used because newer scikit-learn releases make them
    # keyword-only.
    balanced_weights = class_weight.compute_class_weight(
        class_weight='balanced', classes=np.unique(y_train), y=y_train)
    # Keras expects a {class index: weight} mapping.
    # NOTE(review): assumes utils.get_one_hot places the i-th sorted label
    # at column i (true for labels 0..n-1) -- confirm.
    class_weights = dict(enumerate(balanced_weights))

    x_train = np.array(utils.pad_sequecnes(x_train, static_args.maxlen))
    x_train_bio = np.array(x_train_bio)[:, :-1]
    dynamic_args['num_bio_feats'] = int(x_train_bio.shape[1])

    y_train = utils.get_one_hot(y_train, dynamic_args['num_classes'])

    print('Training data: ', x_train.shape)
    print('Training data Bio: ', x_train_bio.shape)
    print(x_train_bio[1])

    x_val, x_val_bio, y_val = (data['valid']['src'],
                               data['valid']['src_bio'],
                               data['valid']['tgt'])
    x_val = np.array(utils.pad_sequecnes(x_val, static_args.maxlen))
    x_val_bio = np.array(x_val_bio)[:, :-1]
    y_val = utils.get_one_hot(y_val, dynamic_args['num_classes'])

    print('Validation data: ', x_val.shape)
    print('Validation data Bio: ', x_val_bio.shape)

    x_test, x_test_bio, y_test = (data['test']['src'],
                                  data['test']['src_bio'],
                                  data['test']['tgt'])
    x_test = np.array(utils.pad_sequecnes(x_test, static_args.maxlen))
    x_test_bio = np.array(x_test_bio)[:, :-1]
    y_test = utils.get_one_hot(y_test, dynamic_args['num_classes'])

    print('Test data: ', x_test.shape)
    print('Test data Bio: ', x_test_bio.shape)

    model = Models.DeepSol(static_args, dynamic_args).fetch_model_def()

    model.compile(loss='binary_crossentropy',
                  optimizer=get_optimizer(),
                  metrics=['accuracy'])
    # summary() prints by itself; wrapping it in print() only adds "None".
    model.summary()

    # Either use both bio and protein feats or just one of them
    use_bio = dynamic_args['biofeats'] is not None
    use_seq = dynamic_args['protein_seq_feats'] is not None
    if use_bio and use_seq:
        train_inputs = [x_train, x_train_bio]
        val_inputs = [x_val, x_val_bio]
        test_inputs = [x_test, x_test_bio]
    elif use_seq:
        train_inputs, val_inputs, test_inputs = x_train, x_val, x_test
    elif use_bio:
        train_inputs, val_inputs, test_inputs = (x_train_bio, x_val_bio,
                                                 x_test_bio)
    else:
        # Fail before training and before the old results file is removed,
        # instead of hitting a NameError on the metrics further down.
        raise ValueError(
            "At least one of dynamic_args['biofeats'] or "
            "dynamic_args['protein_seq_feats'] must be set")

    # Training
    model.fit(train_inputs,
              y_train,
              batch_size=dynamic_args['batch_size'],
              class_weight=class_weights,
              epochs=int(static_args.epochs),
              validation_data=(val_inputs, y_val),
              callbacks=get_callbacks())

    # Evaluate the model reloaded from get_model_path() rather than the
    # in-memory one from the last epoch.
    best_model = utils.load_model(get_model_path())
    acc_val, _, _, _ = get_classification_performance(best_model,
                                                      val_inputs, y_val)
    [acc_test, score_report_test, cm_test,
     pred_test] = get_classification_performance(best_model, test_inputs,
                                                 y_test)

    results_filename_with_path = get_classification_performance_path()
    if os.path.exists(results_filename_with_path):
        os.remove(results_filename_with_path)

    # save on disk
    save_classification_performance('Validation Accuracy: ', str(acc_val))
    save_classification_performance('Test Accuracy: ', str(acc_test))
    save_classification_performance('Score Report Test: : ',
                                    str(score_report_test))
    save_classification_performance('Confusion Matrix test: ', str(cm_test))
    save_classification_prediction(pred_test)
示例#4
0
def main():
    """Evaluate the saved best model fold by fold on train+valid data.

    The train and validation splits are encoded and concatenated, then cut
    into 10 consecutive (unshuffled) folds with ``KFold``. The model stored
    at ``get_model_path()`` is evaluated on the held-out part of every fold;
    nothing is re-trained here, so the training indices of each fold are
    ignored. Per-fold accuracy, MCC, score report and confusion matrix are
    appended to the results file, followed by the mean accuracy and MCC.

    Only the exact parameter setting id ``"deepsol1"`` uses sequences alone;
    all other settings also pass the biological features to the model.
    """
    data = load_data()
    # To perform 10-fold cross-validation
    kf = KFold(n_splits=10, shuffle=False)

    sequence_only = static_args.parameter_setting_id == "deepsol1"

    # ---- training split ----
    train_split = data['train']
    if sequence_only:
        x_train, y_train = train_split['src'], train_split['tgt']
    else:
        x_train = train_split['src']
        x_train_bio = train_split['src_bio']
        y_train = train_split['tgt']
    dynamic_args['num_classes'] = len(set(y_train))

    x_train = np.array(utils.pad_sequecnes(x_train, static_args.maxlen))
    if not sequence_only:
        # The last column of the bio matrix is dropped.
        # NOTE(review): presumably it is not a model feature -- confirm.
        x_train_bio = np.array(x_train_bio)[:, :-1]
        dynamic_args['num_bio_feats'] = int(x_train_bio.shape[1])
    y_train = utils.get_one_hot(y_train, dynamic_args['num_classes'])

    print('Training data: ', x_train.shape)
    if not sequence_only:
        print('Training data Bio: ', x_train_bio.shape)

    # ---- validation split ----
    valid_split = data['valid']
    if sequence_only:
        x_val, y_val = valid_split['src'], valid_split['tgt']
    else:
        x_val = valid_split['src']
        x_val_bio = valid_split['src_bio']
        y_val = valid_split['tgt']
    x_val = np.array(utils.pad_sequecnes(x_val, static_args.maxlen))
    if not sequence_only:
        x_val_bio = np.array(x_val_bio)[:, :-1]
    y_val = utils.get_one_hot(y_val, dynamic_args['num_classes'])

    print('Valid data: ', x_val.shape)
    if not sequence_only:
        print('Valid data Bio: ', x_val_bio.shape)

    # ---- train + valid stacked along the sample axis ----
    x_full = np.concatenate((x_train, x_val), axis=0)
    if not sequence_only:
        x_full_bio = np.concatenate((x_train_bio, x_val_bio), axis=0)
    y_full = np.concatenate((y_train, y_val), axis=0)

    print('Full Train data: ', x_full.shape)
    if not sequence_only:
        print('Full Train Bio: ', x_full_bio.shape)

    best_model = utils.load_model(get_model_path())
    print('Loaded best model for ', static_args.parameter_setting_id)

    # Get file where results are to be saved; start from a clean file
    results_path = get_classification_performance_path()
    if os.path.exists(results_path):
        os.remove(results_path)

    def average(values):
        """Arithmetic mean of a non-empty list of scores."""
        return (1.0 * sum(values)) / len(values)

    # Keep per-fold scores so they can be averaged afterwards
    fold_accuracies, fold_mccs = [], []
    for fold_no, (_, held_out) in enumerate(kf.split(x_full), start=1):
        print('Starting CV Iteration: ', str(fold_no))
        x_test = x_full[held_out]
        y_test = np.array(y_full[held_out], dtype='int32')
        if sequence_only:
            fold_inputs = x_test
        else:
            fold_inputs = [x_test, x_full_bio[held_out]]
        (acc_test, score_report_test, cm_test, mcc_test, _pred_test,
         _pred_prob_test) = get_classification_performance(
             best_model, fold_inputs, y_test)

        # Save output on disk
        save_classification_performance('Iteration: {}'.format(fold_no), '')
        save_classification_performance('Test Accuracy: ', str(acc_test))
        save_classification_performance('Test MCC: ', str(mcc_test))
        save_classification_performance('Score Report Test: : ',
                                        str(score_report_test))
        save_classification_performance('Confusion Matrix test: ',
                                        str(cm_test))
        fold_accuracies.append(acc_test)
        fold_mccs.append(mcc_test)

    mean_acc = average(fold_accuracies)
    save_classification_performance('Mean CV accuracy: ', str(mean_acc))
    mean_mcc = average(fold_mccs)
    save_classification_performance('Mean CV MCC: ', str(mean_mcc))

    print('Finished cross-validation')