Example #1
# Imports assumed by this snippet; get_training_data and get_scores are
# project-local helpers, and EnsembleClassifier is mlxtend's voting ensemble
# (renamed EnsembleVoteClassifier in later mlxtend releases).
import numpy as np
import pandas as pd
from mlxtend.classifier import EnsembleVoteClassifier as EnsembleClassifier
from sklearn import metrics
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import (BaggingClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
def testEnsemble(
        target_file_loc=r'D:\Dropbox\Protein Cleavage Prediction\data\NeuroPred\V4\best_windows_pos\features10_8.csv',
        outputFileName="MetricsResults"):
    """
    http://nbviewer.ipython.org/github/rasbt/mlxtend/blob/master/docs/examples/sklearn_ensemble_ensembleclassifier.ipynb#Additional-Note-About-the-EnsembleClassifier-Implementation:-Class-Labels-vs.-Probabilities
    http://sebastianraschka.com/Articles/2014_ensemble_classifier.html#EnsembleClassifier---Tuning-Weights

    Blend:
    https://github.com/log0/vertebral/blob/master/stacked_generalization.py
    """

    np.random.seed(123)

    SILLY_NUMBER = 70  # Magic number shared by several estimators; lower it when debugging for speed.

    results = {}  # Maps classifier label -> metrics dict from get_scores().

    # class_weight='auto' and gamma=0.0 were removed from scikit-learn;
    # 'balanced' and 'auto' are the modern equivalents.
    clf1 = LogisticRegressionCV(Cs=22, class_weight='balanced')
    clf2 = RandomForestClassifier(n_estimators=int(SILLY_NUMBER * 1.5),
                                  max_features=SILLY_NUMBER,
                                  bootstrap=False,
                                  class_weight='balanced',
                                  n_jobs=2,
                                  criterion='entropy',
                                  random_state=123)
    clf3 = SVC(C=2.3,
               kernel='rbf',
               gamma='auto',
               cache_size=1000,
               class_weight='balanced',
               probability=True)
    # Alternative C values tried for clf3: 3.798, 5.79
    clf4 = GradientBoostingClassifier(n_estimators=SILLY_NUMBER,
                                      max_depth=12,
                                      min_samples_leaf=2)
    clf5 = BaggingClassifier(KNeighborsClassifier(),
                             max_samples=0.6,
                             max_features=0.4)
    # clf6 = Pipeline([('scale', MinMaxScaler(copy=False)), ('MultinomialNB', MultinomialNB())])
    clf7 = KNeighborsClassifier(n_neighbors=4, weights='distance')
    clf8 = SVC(C=20, class_weight='balanced', probability=True)

    # Wrap the base models so they expose calibrated probabilities for soft voting.
    cclf1 = CalibratedClassifierCV(clf1)
    cclf2 = CalibratedClassifierCV(clf2, cv=4)
    cclf3 = CalibratedClassifierCV(clf3, cv=4)
    cclf8 = CalibratedClassifierCV(clf8, cv=4)
    cclf5, cclf7 = CalibratedClassifierCV(clf5), CalibratedClassifierCV(clf7)
    clfs_calibrated = [cclf1, cclf2, cclf3]


    X, y, KM_pred = get_training_data(target_file_loc,
                                      drop_duplicates=True,
                                      select_features=True,
                                      scale=True,
                                      get_KM=True)

    # eclf = EnsembleClassifier(clfs=[clf1, clf2, clf3, clf4, clf5], voting='soft')  # , weights=y_weights)
    # Five members need five weights; the trailing 1 is an assumed value added
    # to repair the original four-element weights list.
    eclf = EnsembleClassifier(clfs=[clf1, cclf2, cclf3, cclf8, cclf7],
                              voting='soft',
                              weights=[2.5, 1, 2, 1, 1])

    eclf2 = EnsembleClassifier(clfs=[clf1, cclf2, clf3, clf4, clf8],
                               voting='hard')  # , weights=[2, 2, 2.5, 1, 1])


    all_clfs_calibrated = [cclf1,cclf2,cclf3,cclf5,cclf7,cclf8,eclf]
    # eclf_all = EnsembleClassifier(clfs=all_clfs_calibrated, voting='hard')

    classifiers_and_names = zip(
        [
            # clf1, cclf2,
            clf3,
            # cclf5, cclf7, cclf8,
            eclf,
            eclf2,
            # eclf_all
        ],
        [
            # 'Logistic Regression', 'Random Forest',
            'SVM-RBF',
            # 'BaggingClassifier-KNN', 'KNeighbors', 'linearSVC',
            'Ensemble-SoftWeighted',
            'Ensemble-Hard',
            # 'Ensemble-Ensemble-all'
        ])

    # classifiers_and_names = zip([eclf,eclf2],['Ensemble-SoftWeighted','Ensemble-Hard'])

    print("X Shape:",X.shape)
    print('# Positives: %i' % (sum(y)))

    for clf, label in classifiers_and_names:
        print('')
        print(label)
        print('')
        # cross_val_predict returns out-of-fold predictions, which get_scores turns into metrics.
        preds = cross_val_predict(clf, X, y, cv=8, n_jobs=-1)
        results[label] = get_scores(preds, y, label)

    print('Predicted according to Known Motif model: %i' % sum(KM_pred))
    print(metrics.classification_report(y, KM_pred))
    print(metrics.confusion_matrix(y, KM_pred))
    results['KnownMotif'] = get_scores(KM_pred, y, label='KnownMotif')

    res_df = pd.DataFrame(results)
    res_df.to_csv(outputFileName + ".tsv", sep='\t')
    res_df.to_csv(outputFileName + ".csv", sep=';')
Example #2
# Imports assumed by this snippet; get_training_data is a project-local helper.
import numpy as np
from sklearn import metrics
from sklearn.ensemble import (AdaBoostClassifier, BaggingClassifier,
                              ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
def blend(
    target_file_loc=r'E:\Dropbox\Dropbox\Protein Cleavage Prediction\data\NeuroPred\V4\features-11_8_KR.csv'
):
    '''
    https://github.com/log0/vertebral/blob/master/stacked_generalization.py#L162
    '''
    X, Y = get_training_data(target_file_loc,
                             drop_duplicates=True,
                             select_features=True,
                             scale=True)
    # If the labels come back as strings, encode them to integers first:
    # label_encoder = LabelEncoder()
    # label_encoder.fit(Y)
    # Y = label_encoder.transform(Y)

    # The DEV SET will be used for all training and validation purposes
    # The TEST SET will never be used for training, it is the unseen set.
    dev_cutoff = len(Y) * 4 // 5  # integer index so the slices below work under Python 3
    X_dev = X[:dev_cutoff]
    Y_dev = Y[:dev_cutoff]
    X_test = X[dev_cutoff:]
    Y_test = Y[dev_cutoff:]

    n_trees = 30
    n_folds = 4

    # Our level 0 classifiers. class_weight='auto' and gamma=0.0 from the
    # original were removed from scikit-learn; 'balanced' and 'auto' are the
    # modern equivalents.
    clfs = [
        ExtraTreesClassifier(n_estimators=n_trees * 2, criterion='gini'),
        LogisticRegressionCV(Cs=25, class_weight='balanced'),
        RandomForestClassifier(n_estimators=150,
                               max_features=100,
                               bootstrap=False,
                               class_weight='balanced',
                               n_jobs=-2,
                               criterion='entropy',
                               random_state=123),
        SVC(C=3.798,
            kernel='rbf',
            gamma='auto',
            cache_size=1000,
            class_weight='balanced',
            probability=True),
        GradientBoostingClassifier(n_estimators=110,
                                   max_depth=9,
                                   min_samples_leaf=2),
        BaggingClassifier(KNeighborsClassifier(),
                          max_samples=0.6,
                          max_features=0.5),
        # Pipeline([('scale',MinMaxScaler(copy = False)),('MultinomialNB',MultinomialNB())]),
        KNeighborsClassifier(n_neighbors=5, weights='distance')
    ]

    # Ready for cross validation
    skf = list(StratifiedKFold(n_splits=n_folds).split(X_dev, Y_dev))  # modern API; originally StratifiedKFold(Y_dev, n_folds)

    # Pre-allocate the data
    blend_train = np.zeros(
        (X_dev.shape[0],
         len(clfs)))  # Number of training data x Number of classifiers
    blend_test = np.zeros(
        (X_test.shape[0],
         len(clfs)))  # Number of testing data x Number of classifiers

    print('X_test.shape = %s' % (str(X_test.shape)))
    print('blend_train.shape = %s' % (str(blend_train.shape)))
    print('blend_test.shape = %s' % (str(blend_test.shape)))

    # Each classifier is fit once per fold (len(skf) fits), collecting out-of-fold predictions
    for j, clf in enumerate(clfs):
        print('Training classifier [%s]' % (j))
        blend_test_j = np.zeros(
            (X_test.shape[0], len(skf))
        )  # Number of testing data x Number of folds; per-fold predictions are averaged below
        for i, (train_index, cv_index) in enumerate(skf):
            print('Fold [%s]' % (i))

            # This is the training and validation set
            X_train = X_dev[train_index]
            Y_train = Y_dev[train_index]
            X_cv = X_dev[cv_index]
            Y_cv = Y_dev[cv_index]

            clf.fit(X_train, Y_train)

            # These out-of-fold probabilities become the training features for
            # the level 1 (blending) classifier. The original used hard labels:
            # blend_train[cv_index, j] = clf.predict(X_cv)
            # blend_test_j[:, i] = clf.predict(X_test)
            blend_train[cv_index, j] = clf.predict_proba(X_cv)[:, 1]
            blend_test_j[:, i] = clf.predict_proba(X_test)[:, 1]
        # Average the per-fold predictions on the test set
        blend_test[:, j] = blend_test_j.mean(1)

    print('Y_dev.shape = %s' % (str(Y_dev.shape)))

    # Start blending!
    # bclf = LogisticRegressionCV()
    bclf = AdaBoostClassifier(n_estimators=60)
    bclf.fit(blend_train, Y_dev)

    # Predict on the held-out test set
    Y_test_predict = bclf.predict(blend_test)
    score = metrics.accuracy_score(Y_test, Y_test_predict)
    print('Accuracy = %s' % (score))
    score = metrics.f1_score(Y_test, Y_test_predict)
    print('f1 = %s' % (score))
    # Note: this AUC is computed from hard labels; bclf.predict_proba would
    # give a threshold-free estimate.
    score = metrics.roc_auc_score(Y_test, Y_test_predict)
    print('roc_auc = %s' % (score))
    return score
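
Later scikit-learn versions ship this level-0/level-1 pattern as
StackingClassifier. A minimal sketch of the same layout on synthetic data
(the estimators and sizes here are illustrative, not those used above):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = make_classification(n_samples=500, n_features=20, random_state=123)
X_dev, X_test, y_dev, y_test = train_test_split(X, y, test_size=0.2,
                                                random_state=123)

# Level 0 models produce out-of-fold probabilities (cv=4); the final estimator
# trains on them, mirroring blend_train above.
stack = StackingClassifier(
    estimators=[('rf', RandomForestClassifier(n_estimators=60, random_state=123)),
                ('svc', SVC(probability=True, random_state=123))],
    final_estimator=LogisticRegression(),
    cv=4,
    stack_method='predict_proba',
)
stack.fit(X_dev, y_dev)
print('Test accuracy = %s' % stack.score(X_test, y_test))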