def log_fit(X,y,n_iter=5):
	##get X in the correct shape for sklearn function
	if len(X.shape) == 1:
		X = X.reshape(-1,1)
	##init the model class
	lr = gaussian_process.GaussianProcessClassifier()
		#linear_model.LogisticRegression(penalty='l2',fit_intercept=True,
		#solver='liblinear',max_iter=100,n_jobs=1,class_weight='balanced')
		#tree.DecisionTreeClassifier(class_weight='balanced')
		#neural_network.MLPClassifier(solver='lbfgs')
		#svm.LinearSVC(class_weight='balanced')

	accuracy = np.zeros(n_iter)
	for i in range(n_iter):
		##split the data into train and test sets (use a different seed each iteration so the splits vary)
		X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=i)
		##make sure you have both classes of values in your training and test sets
		if np.unique(y_train).size<2 or np.unique(y_test).size<2:
			print("Re-splitting cross val data; only one class type in current set")
			X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.5,random_state=i)
		##now fit the model to the training data
		lr.fit(X_train,y_train)
		##now try to predict the test data
		y_pred = lr.predict(X_test)
		##lastly, compare the accuracy of the prediction
		accuracy[i] = (y_pred==y_test).sum()/float(y_test.size)
	return accuracy.mean()
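
A minimal usage sketch for log_fit (hypothetical; it is not part of the original snippet and assumes the module-level numpy / sklearn imports that log_fit itself relies on):

# toy 1-D feature with a noisy threshold label
rng = np.random.RandomState(0)
X_demo = rng.normal(size=200)
y_demo = (X_demo + 0.5 * rng.normal(size=200) > 0).astype(int)
print("mean held-out accuracy:", log_fit(X_demo, y_demo, n_iter=5))
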
def get_classifiers():
    clfs = {}
        #clfs['bag'] = {
        #    'clf': ensemble.BaggingClassifier(neighbors.KNeighborsClassifier(), max_samples=0.5,
        #                                      max_features=0.5), 'name': "BaggingClassifier"}
        # clfs['mlp'] = {'clf': neural_network.MLPClassifier(hidden_layer_sizes=(100,100,100), alpha=1e-5, solver='lbfgs', max_iter=500), 'name': 'MultilayerPerceptron'}

    clfs['logreg'] = {'clf': linear_model.LogisticRegression(),
                      'params': {'C': [(2**x) for x in np.arange(-5, 15, step=3)]}}
    clfs['sgd'] = {'clf': linear_model.SGDClassifier(),
                   'params': {'loss': ['perceptron'], 'alpha': 10.0 ** np.arange(-6, 1)}}  # grid values must be iterable


    clfs['knc'] = {'clf':neighbors.KNeighborsClassifier(), 'params': {'n_neighbors':np.arange(3, 15)}}
    clfs['rfc'] = {'clf':ensemble.RandomForestClassifier(), 'params':{'n_estimators':np.arange(64, 1024, step=64)}}
    clfs['svc'] = {'clf': svm.SVC(), 'params': {'kernel':['linear', 'sigmoid', 'poly', 'rbf'], 'gamma':np.linspace(0.1,2.0,num=20),'C': np.linspace(0.5,1.5,num=11)}}  # gamma must be > 0
    clfs['abc'] = {'clf': ensemble.AdaBoostClassifier(), 'params': {'n_estimators': np.arange(64, 1024, step=64)}}
    clfs['gbc'] = {'clf': ensemble.GradientBoostingClassifier(), 'params': {'n_estimators': np.arange(64, 1024, step=64)}}

    clfs['gauss_class'] = {'clf': gaussian_process.GaussianProcessClassifier(), 'params': {}}
    clfs['gauss_nb'] = {'clf': naive_bayes.GaussianNB(), 'params': {}}

    #LinearDiscriminantAnalysis(),
    #QuadraticDiscriminantAnalysis()

    return clfs
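
A hedged sketch of how a dictionary in this shape is typically consumed; the original calling code is not shown, so grid_search_all, X_train and y_train below are illustrative names only:

from sklearn.model_selection import GridSearchCV

def grid_search_all(clfs, X_train, y_train, cv=5):
    """Run a grid search for every classifier spec returned by get_classifiers()."""
    best = {}
    for name, spec in clfs.items():
        search = GridSearchCV(spec['clf'], spec['params'], cv=cv, n_jobs=-1)
        search.fit(X_train, y_train)
        best[name] = (search.best_score_, search.best_params_)
    return best
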
Example #3
def get_algorithms():
    MLA_dict = {
        # Ensemble methods
        "ada": ensemble.AdaBoostClassifier(),
        "bc": ensemble.BaggingClassifier(),
        "etc": ensemble.ExtraTreesClassifier(),
        "gbc": ensemble.GradientBoostingClassifier(),
        "rfc": ensemble.RandomForestClassifier(),
        # Gaussian processes
        "gpc": gaussian_process.GaussianProcessClassifier(),
        # Linear models
        "lr": linear_model.LogisticRegressionCV(),
        "pac": linear_model.PassiveAggressiveClassifier(),
        "rcc": linear_model.RidgeClassifierCV(),
        "sgd": linear_model.SGDClassifier(),
        "per": linear_model.Perceptron(),
        # Naive Bayes
        "bnb": naive_bayes.BernoulliNB(),
        "gnb": naive_bayes.GaussianNB(),
        # Nearest neighbour
        "knn": neighbors.KNeighborsClassifier(),
        # SVM
        "svc": svm.SVC(probability=True),
        "nvc": svm.NuSVC(probability=True),
        "lvc": svm.LinearSVC(),
        # Trees
        "dtc": tree.DecisionTreeClassifier(),
        "ets": tree.ExtraTreeClassifier(),
        # Discriminant analysis
        "lda": discriminant_analysis.LinearDiscriminantAnalysis(),
        "qda": discriminant_analysis.QuadraticDiscriminantAnalysis(),
    }
    return MLA_dict
Example #4
def gp_clf_iris():
    # Follow the example from the sklearn docs, and only use the
    # first two features, so we can visualize the predicted
    # probabilities in 2D.
    X = iris_dataset.data[:, :2]
    y = iris_dataset.target
    y_names = iris_dataset.target_names
    print("Feature names: ", iris_dataset.feature_names)
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.15, random_state=RANDOM_SEED)

    # Make the RBF kernel anisotropic for maximum flexibility
    # (length_scale must be a 1-D array with one entry per feature).
    kernel = gp.kernels.RBF(np.ones(X.shape[1])) \
        * gp.kernels.ConstantKernel() \
        + gp.kernels.WhiteKernel()
    clf = gp.GaussianProcessClassifier(kernel, n_restarts_optimizer=0)
    print("Fitting Gaussian Process on input of shape {0}...".format(
        X_train.shape
    ))
    clf.fit(X_train, y_train)
    print("Learned kernel: {0}".format(str(clf.kernel_)))
    print("Fit complete.")
    
    y_pred = clf.predict(X_test)
    print(y_pred)
    
    acc = accuracy_score(y_test, y_pred)
    print("Accuracy: {0:.2f}%".format(acc * 100.0))
    
    # Plot class probabilities in 2D, with the coordinates being the
    # values of the first and second features (f0, f1, i.e., sepal
    # length and sepal width).
    f0_min = X[:, 0].min() - 1
    f0_max = X[:, 0].max() + 1
    f1_min = X[:, 1].min() - 1
    f1_max = X[:, 1].max() + 1
    step = 0.02
    f0, f1 = np.meshgrid(np.arange(f0_min, f0_max, step),
                         np.arange(f1_min, f1_max, step))
    grid_data = np.c_[f0.ravel(), f1.ravel()]
    print(X.shape)
    print(X_train.shape)
    print(grid_data.shape)
    prob_grid = clf.predict_proba(grid_data)
    prob_grid = prob_grid.reshape((f0.shape[0], f0.shape[1], 3))
    print(prob_grid.shape)
    plt.figure(figsize=(6, 6))
    plt.imshow(prob_grid, extent=(f0_min, f0_max, f1_min, f1_max),
               origin='lower')

    plt.scatter(X[y==0, 0], X[y==0, 1], s=30, c='red', edgecolors='black')
    plt.scatter(X[y==1, 0], X[y==1, 1], s=30, c='green', edgecolors='black')
    plt.scatter(X[y==2, 0], X[y==2, 1], s=30, c='blue', edgecolors='black')
    plt.show()
Example #5
def GaussianProcessTrain(featureMatrix, labelMatrix):

    clf = gaussian_process.GaussianProcessClassifier()
    #print (clf.fit(featureMatrix, labelMatrix))

    # cross validation (note: cross_val_score clones clf internally, so the
    # estimator returned below is not itself fitted)
    scores = cross_val_score(clf, featureMatrix, labelMatrix, cv=10)
    print("Accuracy (Cross-V): %0.2f (+/- %0.2f)" %
          (scores.mean(), scores.std() * 2))

    return clf
    def fit_gaussian_process_classifier(self, data_interval = None, verbose = False):
        """fit a Gaussian Process classifier
        implementation from: scikit-learn
        (https://scikit-learn.org/stable/modules/gaussian_process.html)

        Parameters
        ----------
        data_interval : array_int, optional
            Array indices of the data used to train (training on a subset).
            If None, train on the whole data set.
        verbose : bool, optional
            Print statements with more information while training.

        Returns
        -------
        binary_classifier_holder : dict
            Maps each class name to its trained GaussianProcessClassifier
            (one binary classifier per class, N entries for N classes).
        """
        if verbose:
            if data_interval is None:
                print("N points: %i"%(len(self.input_data)))
            else:
                print("N points: %i"%(len(data_interval)))

        start_time = time.time()

        binary_classifier_holder = dict()

        for i, cls_data in enumerate(self.class_data):
            iter_time = time.time()

            kernel = gp.kernels.RBF( [1,1,1], [(1e-3,1e3), (1e-3,1e3), (1e-3, 1e3)] )
            gpc = gp.GaussianProcessClassifier(kernel = kernel)

            # for running with a subset of the data
            if data_interval is None:
                line = gpc.fit( self.input_data, cls_data )
            else:
                di = np.array(data_interval)
                line = gpc.fit( self.input_data[di], cls_data[di] )

            binary_classifier_holder[self.class_names[i]] = line

            time_print = time.time()-start_time

            if verbose:
                if i == 0:
                    len_classes = len(self.class_ids)
                    print( "Time to fit %s classifiers ~ %.3f\n"%(len_classes, time_print*len_classes) )
                print("GaussianProcessClassifier class %s -- current time: %.3f"%(i, time_print) )

        return binary_classifier_holder
Example #7
def predict(pickle_prefix, is_regression):

    kernel = ConstantKernel() + Matern(length_scale=2,
                                       nu=3 / 2) + WhiteKernel(noise_level=1)

    if is_regression:
        pickle_file = pickle_prefix + "probabilities.pickle"
        gp = gaussian_process.GaussianProcessRegressor(kernel=kernel)
    else:
        pickle_file = pickle_prefix + "selects.pickle"
        gp = gaussian_process.GaussianProcessClassifier(kernel=kernel)

    ys_by_image_id = {}

    with open(pickle_file, 'rb') as handle:
        d = pickle.load(handle)
        for image_id, ys in d.items():
            ys_by_image_id[image_id] = ys

    max_ids = 10
    num_test = 20

    mses = []

    for i, (image_id, ys) in enumerate(ys_by_image_id.items()):
        ys_train = ys[:-num_test]
        ys_test = ys[-num_test:]

        if i > max_ids:
            break
        X = np.array(range(len(ys_train))).reshape(-1, 1)
        gp.fit(X, ys_train)
        print("================={}=====================".format(image_id))
        print(
            "Fit image-{} with {} samples with {} average probability".format(
                image_id, len(ys_train), round(np.mean(ys_train), 4)))

        xs_test = np.linspace(len(ys_train),
                              len(ys_train) + num_test - 1,
                              num=num_test).reshape(-1, 1)
        y_pred = gp.predict(xs_test)
        pp.pprint(list(zip([x[0] for x in xs_test], y_pred, ys_test)))
        mse = get_mse(y_pred, ys_test)
        mses.append(mse)
        print("MSE: {0:5f}".format(mse))
    return np.average(mses)
Example #8
def ModelSelection(train_data, features, label):
    MLA = [
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),
        gaussian_process.GaussianProcessClassifier(),
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),
        neighbors.KNeighborsClassifier(),
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),
    ]

    MLA_columns = ['MLA Name', 'MLA Parameters', 'MLA Score']
    MLA_compare = pd.DataFrame(columns=MLA_columns)
    x_train, x_test, y_train, y_test = train_test_split(train_data[features],
                                                        train_data[label],
                                                        test_size=0.2)
    row_index = 0
    MLA_predict = train_data[label]
    for alg in MLA:

        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
        alg.fit(x_train, y_train)
        MLA_predict[MLA_name] = alg.predict(x_test)
        MLA_compare.loc[row_index, 'MLA Score'] = alg.score(x_test, y_test)
        row_index += 1

    MLA_compare.sort_values(by=['MLA Score'], ascending=False, inplace=True)
    return MLA_compare, x_train, x_test, y_train, y_test
def all_classifiers():
    # Model Data
    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(),
        linear_model.Perceptron(),

        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        XGBClassifier()
    ]
    return MLA
Example #10
def make_image_classifier_model_example():
    """
    This an example of how the image classifier will be made.
    We will need to get actual features from Image classification team.
    Creates "Image_Classifier_Model.p" using pickle dump ("wb")
    :return: None
    """

    FEATURE_EXTRACTOR = lambda image: [
        image[:, :, 0].mean(), image[:, :, 1].mean(), image[:, :, 2].mean()
    ]
    redish_images = np.random.normal([100, 100, 160], [20, 20, 20],
                                     (200, 100, 100, 3))
    blueish_images = np.random.normal([160, 100, 100], [20, 20, 20],
                                      (200, 100, 100, 3))
    x = np.vstack(([FEATURE_EXTRACTOR(x) for x in redish_images],
                   [FEATURE_EXTRACTOR(x) for x in blueish_images]))
    y = np.ones(400)
    y[:200] = 0  #red=0 blue=1

    GPC_images_classifier = gp.GaussianProcessClassifier(
        2.0 * gp.kernels.RBF([2.0, 1.0, 2.0]))
    GPC_images_classifier.fit(x, y)
    pickle.dump(GPC_images_classifier, open("Image_Classifier_Model.p", "wb"))

    # N and M (the map grid dimensions) are module-level constants defined
    # elsewhere in the original source
    MAP = np.zeros((N * 100, M * 100, 3))
    cmap = []
    for i in range(0, N):
        i = i + 1 if i < M else N - i
        column = (M - i) * "b" + i * "r"
        cmap.append(column)

    for j, row in enumerate(cmap):
        for i, color in enumerate(row):
            position_x = j * 100
            position_y = i * 100
            color = [100, 100, 160] if color == "r" else [160, 100, 100]
            picture = np.random.normal(color, [20, 20, 20], (100, 100, 3))
            MAP[position_x:position_x + 100,
                position_y:position_y + 100] = picture
    cv2.imwrite("MAP.png", MAP)
def classifier(rand, fts, set, cl_type):
    # Define classifier
    if cl_type == 'rf':
        clf = ensemble.RandomForestClassifier(random_state=rand)
    if cl_type == 'svm':
        clf = svm.SVC(probability=True, random_state=rand)
    if cl_type == 'gauss':
        clf = gaussian_process.GaussianProcessClassifier(random_state=rand)
    # clf = tree.DecisionTreeClassifier(splitter='best', random_state=rand)

    # Fit the model and predict
    clf.fit(train_fts, train_classes)
    predictions = clf.predict(fts)
    probs = clf.predict_proba(fts)

    # Count positive predictions
    positives = 0
    for prediction, file in zip(predictions, set):
        if int(prediction) == dataset[file]:
            positives += 1
    return positives, probs, clf.classes_
Example #12
X = [[181, 80, 44], [177, 70, 43], [160, 60, 31], [154, 54, 37], [166, 65, 40],
     [190, 90, 47], [175, 64, 39], [177, 70, 40], [159, 55, 37], [171, 75, 42],
     [181, 85, 43]]

Y = [
    'male', 'male', 'female', 'female', 'male', 'male', 'female', 'female',
    'female', 'male', 'male'
]

predict_data = [[190, 70, 43], [180, 50, 30]]

clf = discriminant_analysis.QuadraticDiscriminantAnalysis()

clf = clf.fit(X, Y)

print("Quadratic Discriminant Analysis: ")
print(clf.predict(predict_data))

clf = neighbors.KNeighborsClassifier(n_neighbors=5)

clf = clf.fit(X, Y)

print("KNeighbors Classifier: ")
print(clf.predict(predict_data))

clf = gaussian_process.GaussianProcessClassifier(kernel=1.0 * RBF(1.0),
                                                 random_state=0)
clf = clf.fit(X, Y)

print("Gaussian Process Classifier: ")
print(clf.predict(predict_data))
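
Because GaussianProcessClassifier is probabilistic, the predicted class probabilities can be inspected as well (a small optional addition, not in the original script):

print("Gaussian Process class probabilities: ")
print(clf.classes_)
print(clf.predict_proba(predict_data))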
Example #13
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=0)

shuffle_index = np.random.permutation(len(X_train))
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

from sklearn import gaussian_process
clf = gaussian_process.GaussianProcessClassifier(random_state=0)
clf.fit(X_train, y_train)

# Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy')
y_train_pred = cross_val_predict(clf, X_train, y_train, cv=3)
cm = confusion_matrix(y_train, y_train_pred)
print(cm)

from sklearn.metrics import precision_score, recall_score
print("precision score = {0:.4f}".format(precision_score(
    y_train, y_train_pred)))
print("recall score =  {0:.4f}".format(recall_score(y_train, y_train_pred)))
Example #14
from sklearn import tree
from sklearn import neural_network
from sklearn import svm
from sklearn import gaussian_process
from sklearn.metrics import accuracy_score

dt_clf = tree.DecisionTreeClassifier()

# CHALLENGE - create 3 more classifiers...
# 1
mlp_clf = neural_network.MLPClassifier()
# 2
svc_clf = svm.SVC()
# 3
gauss_clf = gaussian_process.GaussianProcessClassifier()

classifiers = {
    'decision_tree': dt_clf,
    'MLP': mlp_clf,
    'SVC': svc_clf,
    'gaussian_process': gauss_clf
}

# [height, weight, shoe_size]
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37], [166, 65, 40],
     [190, 90, 47], [175, 64, 39], [177, 70, 40], [159, 55, 37], [171, 75, 42],
     [181, 85, 43]]

Y = [
    'male', 'male', 'female', 'female', 'male', 'male', 'female', 'female',
    'female', 'male', 'male'
]
Example #15
    data = _data
    labels = _labels
    return _data, _labels, trainingData, trainingLabel

data, labels, trainingData, trainingLabel = splitData(data, labels, 0.5)
#print (trainingData)

numTests = 20
treeResults = []
gaussianResults = []
neuralNetResults = []

for i in range(numTests):
    #Classifiers
    clf_tree = tree.DecisionTreeClassifier()
    clf_gaussian = gaussian_process.GaussianProcessClassifier()
    clf_neuralNet = neural_network.MLPClassifier(hidden_layer_sizes=(100,), alpha=0.0001, learning_rate_init=0.0001)

    #Training
    treeFit = clf_tree.fit(trainingData, trainingLabel)
    gaussianFit = clf_gaussian.fit(trainingData,trainingLabel)
    neuralNetFit = clf_neuralNet.fit(trainingData, trainingLabel)

    #Predictions and accuracy
    prediction_tree = treeFit.predict(data)
    accuracy_tree = accuracy_score(labels, prediction_tree) * 100 #in percentage
    treeResults.append(accuracy_tree)

    prediction_gaussian = gaussianFit.predict(data)
    accuracy_gaussian = accuracy_score(labels, prediction_gaussian) * 100
    gaussianResults.append(accuracy_gaussian)
Example #16
def dict_method_clf():
    dict_method = {}
    # 1st part
    """4KNC"""
    me4 = neighbors.KNeighborsClassifier(n_neighbors=5)
    cv4 = StratifiedKFold(5, shuffle=False)  # random_state is only valid with shuffle=True
    scoring4 = 'balanced_accuracy'

    param_grid4 = [
        {
            'n_neighbors': [3, 4, 5, 6, 7],
            "weights": ['uniform', "distance"],
            "leaf_size": [10, 20, 30],
            'metric': ['euclidean', "manhattan"]
        },
    ]

    dict_method.update({"KNC-set": [me4, cv4, scoring4, param_grid4]})
    """1SVC"""
    me1 = SVC(C=1.0,
              kernel='rbf',
              degree=3,
              gamma='auto_deprecated',
              coef0=0.0,
              shrinking=True,
              probability=False,
              tol=1e-3,
              cache_size=200,
              class_weight='balanced',
              verbose=False,
              max_iter=-1,
              decision_function_shape='ovr',
              random_state=None)
    cv1 = StratifiedKFold(5, shuffle=False)
    scoring1 = 'accuracy'

    # 'ker' is a list of candidate kernels defined elsewhere in the original module
    param_grid1 = [{
        'C': [1.0e8, 1.0e6, 10000, 100, 50, 10, 5, 2.5, 1, 0.5, 0.1, 0.01],
        'kernel': ker
    }]

    dict_method.update({'SVC-set': [me1, cv1, scoring1, param_grid1]})
    """5GPC"""
    me5 = gaussian_process.GaussianProcessClassifier(kernel=kernel)
    cv5 = StratifiedKFold(5, shuffle=False)
    scoring5 = 'balanced_accuracy'
    param_grid5 = [
        {
            "kernel": ker
        },
    ]

    dict_method.update({'GPC-set': [me5, cv5, scoring5, param_grid5]})

    # 2nd part
    '''TreeC'''
    me6 = DecisionTreeClassifier(criterion='gini',
                                 splitter='best',
                                 max_depth=None,
                                 min_samples_split=2,
                                 min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.0,
                                 max_features=None,
                                 random_state=None,
                                 max_leaf_nodes=None,
                                 min_impurity_decrease=0.0,
                                 min_impurity_split=None,
                                 class_weight="balanced",
                                 presort=False)
    cv6 = StratifiedKFold(5, shuffle=False)
    scoring6 = 'accuracy'
    param_grid6 = [{
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'min_samples_split': [2, 3, 4]
    }]
    dict_method.update({'TreeC-em': [me6, cv6, scoring6, param_grid6]})
    '''GBC'''
    me7 = GradientBoostingClassifier(loss='deviance',
                                     learning_rate=0.1,
                                     n_estimators=100,
                                     subsample=1.0,
                                     criterion='friedman_mse',
                                     min_samples_split=2,
                                     min_samples_leaf=1,
                                     min_weight_fraction_leaf=0.,
                                     max_depth=3,
                                     min_impurity_decrease=0.,
                                     min_impurity_split=None,
                                     init=None,
                                     random_state=None,
                                     max_features=None,
                                     verbose=0,
                                     max_leaf_nodes=None,
                                     warm_start=False,
                                     presort='auto')
    cv7 = StratifiedKFold(5, shuffle=False)
    scoring7 = 'balanced_accuracy'
    param_grid7 = [{
        'n_estimators': [50, 100, 200, 500],
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'min_samples_split': [2, 3, 4],
        'learning_rate': [0.1, 0.05]
    }]
    dict_method.update({'GBC-em': [me7, cv7, scoring7, param_grid7]})
    '''RFC'''
    me8 = RandomForestClassifier(n_estimators=100,
                                 criterion="gini",
                                 max_depth=None,
                                 min_samples_split=2,
                                 min_samples_leaf=1,
                                 min_weight_fraction_leaf=0.,
                                 max_features="auto",
                                 max_leaf_nodes=None,
                                 min_impurity_decrease=0.,
                                 min_impurity_split=None,
                                 bootstrap=True,
                                 oob_score=False,
                                 random_state=None,
                                 verbose=0,
                                 warm_start=False,
                                 class_weight="balanced")
    cv8 = StratifiedKFold(5, shuffle=False)
    scoring8 = 'accuracy'
    # RandomForestClassifier has no learning_rate parameter, so it is not searched here
    param_grid8 = [{
        'n_estimators': [50, 100, 200, 500],
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'min_samples_split': [2, 3, 4]
    }]
    dict_method.update({"RFC-em": [me8, cv8, scoring8, param_grid8]})

    "AdaBC"
    dt = DecisionTreeRegressor(criterion="mse",
                               splitter="best",
                               max_features=None,
                               max_depth=5,
                               min_samples_split=4)
    me9 = AdaBoostClassifier(dt,
                             n_estimators=100,
                             learning_rate=1.,
                             algorithm='SAMME.R',
                             random_state=0)
    cv9 = StratifiedKFold(5, shuffle=False)
    scoring9 = 'accuracy'
    param_grid9 = [{
        'n_estimators': [50, 100, 200, 500],
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
        'min_samples_split': [2, 3, 4],
        'learning_rate': [0.1, 0.05]
    }]
    dict_method.update({"AdaBC-em": [me9, cv9, scoring9, param_grid9]})

    # 3rd part

    "Per"
    me14 = Perceptron(penalty="l1",
                      alpha=0.0001,
                      fit_intercept=True,
                      max_iter=None,
                      tol=None,
                      shuffle=True,
                      verbose=0,
                      eta0=1.0,
                      random_state=0,
                      class_weight=None,
                      warm_start=False,
                      n_iter=None)
    cv14 = StratifiedKFold(5, shuffle=False)
    scoring14 = 'accuracy'
    param_grid14 = [
        {
            'alpha': [0.0001, 0.001, 0.01]
        },
    ]
    dict_method.update({"Per-L1": [me14, cv14, scoring14, param_grid14]})
    """LogRL1"""
    me15 = LogisticRegression(penalty='l1',
                              solver='liblinear',
                              dual=False,
                              tol=1e-3,
                              C=1.0,
                              fit_intercept=True,
                              intercept_scaling=1,
                              class_weight='balanced',
                              random_state=0)
    cv15 = StratifiedKFold(5, shuffle=False)
    scoring15 = 'accuracy'
    param_grid15 = [
        {
            'C': [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2],
            'penalty': ["l1", "l2"]
        },
    ]
    dict_method.update({"LogR-L1": [me15, cv15, scoring15, param_grid15]})
    """3SGDCL2"""
    me3 = SGDClassifier(loss='hinge',
                        penalty='l2',
                        alpha=0.0001,
                        l1_ratio=0.15,
                        fit_intercept=True,
                        max_iter=None,
                        tol=None,
                        shuffle=True,
                        verbose=0,
                        epsilon=0.1,
                        random_state=0,
                        learning_rate='optimal',
                        eta0=0.0,
                        power_t=0.5,
                        class_weight="balanced",
                        warm_start=False,
                        average=False,
                        n_iter=None)
    cv3 = StratifiedKFold(5, shuffle=False)
    scoring3 = 'accuracy'

    param_grid3 = [
        {
            'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-05],
            'loss': ['squared_loss', "huber"],
            "penalty": ["l1", "l2"]
        },
    ]

    dict_method.update({"SGDC-set": [me3, cv3, scoring3, param_grid3]})

    return dict_method
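
A hedged sketch of how one entry of this dictionary is typically consumed; the original calling code is not shown, so run_method, x_train and y_train are illustrative names only:

from sklearn.model_selection import GridSearchCV

def run_method(dict_method, key, x_train, y_train):
    """Grid-search one '[estimator, cv, scoring, param_grid]' entry."""
    estimator, cv, scoring, param_grid = dict_method[key]
    search = GridSearchCV(estimator, param_grid, cv=cv, scoring=scoring, n_jobs=-1)
    search.fit(x_train, y_train)
    return search.best_estimator_, search.best_score_
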
Example #17
        for x__ in x_:
            tmp_x_.append([x__])
        tmp_x.append(np.array(tmp_x_))
    return np.array(tmp_x)
if __name__ == '__main__':
    path = './blod.data'
    data = np.loadtxt(path, delimiter=' ',skiprows=1)
    # print(data[0])
    train_data = data[90*20:]
    test_data = data[:90*20]
    # test_data = test_data[90:];
    x_data, y_data = np.split(train_data, (10,), axis=1)
    # x_data = oneToTwo(x_data)
    x_test_data, y_test_data = np.split(test_data, (10,), axis=1)
    # x_test_data = oneToTwo(x_test_data)
    y_data = np.split(y_data, (1,), axis=1)[0]
    y_test_data = np.split(y_test_data, (1,), axis=1)[0]

    # print(x_data)
    # print(y_data.T[0])
    # print(x_test_data)
    # print(y_test_data.T[0])

    model = gaussian_process.GaussianProcessClassifier()

    model.fit(x_data,y_data.T[0])

    diabetes_y_pred = model.predict(x_test_data)
    result = y_test_data.T[0] == diabetes_y_pred
    acc = np.mean(result)
    print('Accuracy: %.2f%%' % (100 * acc))
Example #18
# X = data1_x_bin
# y = Target
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),
    
    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
    
    #Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
Example #19
X_train=X_selected.head(num_train)
X_test=X_selected.tail(num_test)
X_train=preprocessing.scale(X_train)


#Machine Learning Algorithm (MLA) Selection and Initialization
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(learning_rate=0.1,n_estimators=300,random_state=0),
    ensemble.BaggingClassifier(max_samples= 0.25, n_estimators= 300, random_state= 0),
    ensemble.ExtraTreesClassifier(criterion= 'entropy', max_depth= 6, n_estimators= 100, random_state= 0),
    ensemble.GradientBoostingClassifier(learning_rate= 0.05, max_depth= 2, n_estimators= 300, random_state= 0),
    ensemble.RandomForestClassifier(criterion= 'entropy', max_depth= 6, n_estimators= 100, oob_score= True, random_state= 0),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(max_iter_predict= 10, random_state= 0),

    #GLM
    linear_model.LogisticRegressionCV(fit_intercept= True, random_state= 0, solver= 'liblinear'),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    #Naive Bayes
    naive_bayes.BernoulliNB(alpha= 0.1),
    naive_bayes.GaussianNB(),

    #Nearest Neighbor
    neighbors.KNeighborsClassifier(algorithm= 'brute', n_neighbors= 7, weights= 'uniform'),
#models_f1.append(f1)
#models_performances.append(performance)

classifiers = [
    ("KNN", None, KNeighborsClassifier(2)),
    ("Linear SVM", None, SVC(kernel="linear")),
    ("RBF SVM", None, SVC(gamma=2, C=1)),
    ("DT", None, DecisionTreeClassifier(min_samples_split=1024, max_depth=20)),
    ("RF", None,
     RandomForestClassifier(n_estimators=10,
                            min_samples_split=1024,
                            max_depth=20)),
    ("AB", None, AdaBoostClassifier(random_state=13370)),
    #("GP ARD", ["MFCC"], gp.GaussianProcessClassifier(kernel=ard_kernel(sigma=1.2, length_scale=np.array([1]*1)))),
    ("GP-DP", ["MFCC", "All", "CIFE", "CFS"],
     gp.GaussianProcessClassifier(kernel=gp.kernels.DotProduct()))
    # output the confidence level and the predictive variance for the dot product (the only one that we keep in the end)
    # GP beats SVM in our experiment (qualitative advantages)
    # only keep RBF, dot product and matern on the chart
    # add a paragraph 'Processed Data'
    #1) generate the dataset with 526 features
    #2) the predictive variance and predictive mean (best and worst) of some vectors from the dot product.
]
#classify(X_train[:,bitVec], X_dev[:,bitVec])
models_f1, models_performances = getClassifieresPerformances(
    classifiers, models_f1, models_performances)
#models_f1, models_performances = getClassifieresPerformancesByDefinedX(classifiers, 'predict', models_f1, models_performances, newTrainX, y_bin_train, newDevX)
models_f1, models_performances = addRelatedWork(models_f1, models_performances)
models_f1 = sorted(models_f1, key=lambda l: l[1])
models_performances = sorted(models_performances, key=lambda l: l[1])
Example #21
File: my.py  Project: boliqq07/BGP
def dict_method_clf():
    dict_method = {}
    # 1st part
    """1SVC"""
    me1 = SVC(C=1.0, kernel='rbf', degree=3, gamma='auto_deprecated',
              coef0=0.0, shrinking=True, probability=False,
              tol=1e-3, cache_size=200, class_weight='balanced',
              verbose=False, max_iter=-1, decision_function_shape='ovr',
              random_state=None)
    cv1 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring1 = 'accuracy'

    param_grid1 = [{'C': [10, 5, 2.5, 1, 0.5], 'gamma': [0.001, 0.01, 0.0001]}]

    dict_method.update({'SVC-set': [me1, cv1, scoring1, param_grid1]})

    """2LogRL2"""
    me2 = LogisticRegression(penalty='l2', solver='liblinear', dual=False, tol=1e-3, C=1.0, fit_intercept=True,
                             intercept_scaling=1, class_weight='balanced', random_state=0)
    cv2 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring2 = 'accuracy'

    param_grid2 = [{'C': [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2]}, ]

    dict_method.update({"LogRL2-set": [me2, cv2, scoring2, param_grid2]})

    """3SGDCL2"""
    me3 = linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15,
                                     fit_intercept=True, max_iter=None, tol=None, shuffle=True,
                                     verbose=0, epsilon=0.1, random_state=0,
                                     learning_rate='optimal', eta0=0.0, power_t=0.5,
                                     class_weight="balanced", warm_start=False, average=False, n_iter=None)
    cv3 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring3 = 'accuracy'

    param_grid3 = [{'alpha': [0.0001, 0.001, 0.01]}, ]

    dict_method.update({"SGDCL2-set": [me3, cv3, scoring3, param_grid3]})

    """4KNC"""
    me4 = neighbors.KNeighborsClassifier(n_neighbors=5)
    cv4 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring4 = 'balanced_accuracy'

    param_grid4 = [{'n_neighbors': [3, 4, 5]}, ]

    dict_method.update({"KNC-set": [me4, cv4, scoring4, param_grid4]})

    """5GPC"""
    kernel = 1.0 * RBF(1.0)
    me5 = gaussian_process.GaussianProcessClassifier(kernel=kernel)
    cv5 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring5 = 'balanced_accuracy'
    param_grid5 = [{'max_iter_predict': [100, ]}, ]

    dict_method.update({'GPC-set': [me5, cv5, scoring5, param_grid5]})

    # 2nd part
    '''TreeC'''
    me6 = DecisionTreeClassifier(
        criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1,
        min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None,
        min_impurity_decrease=0.0, min_impurity_split=None, class_weight="balanced", presort=False)
    cv6 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring6 = 'accuracy'
    param_grid6 = [{'max_depth': [3, 4, 5, 6]}]
    dict_method.update({'TreeC-em': [me6, cv6, scoring6, param_grid6]})

    '''GBC'''
    me7 = ensemble.GradientBoostingClassifier(
        loss='deviance', learning_rate=0.1, n_estimators=100,
        subsample=1.0, criterion='friedman_mse', min_samples_split=2,
        min_samples_leaf=1, min_weight_fraction_leaf=0.,
        max_depth=3, min_impurity_decrease=0.,
        min_impurity_split=None, init=None,
        random_state=None, max_features=None, verbose=0,
        max_leaf_nodes=None, warm_start=False,
        presort='auto')
    cv7 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring7 = 'balanced_accuracy'
    param_grid7 = [{'max_depth': [3, 4, 5, 6]}]
    dict_method.update({'GBC-em': [me7, cv7, scoring7, param_grid7]})

    '''RFC'''
    me8 = ensemble.RandomForestClassifier(n_estimators=100, criterion="gini", max_depth=None,
                                          min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.,
                                          max_features="auto", max_leaf_nodes=None, min_impurity_decrease=0.,
                                          min_impurity_split=None, bootstrap=True, oob_score=False,
                                          random_state=None, verbose=0, warm_start=False,
                                          class_weight="balanced")
    cv8 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring8 = 'accuracy'
    param_grid8 = [{'max_depth': [3, 4, 5, 6]}]
    dict_method.update({"RFC-em": [me8, cv8, scoring8, param_grid8]})

    "AdaBC"
    me9 = AdaBoostClassifier(n_estimators=100, learning_rate=1., algorithm='SAMME.R',
                             random_state=0)
    cv9 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring9 = 'accuracy'
    param_grid9 = [{'base_estimator': [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=2),
                                       DecisionTreeClassifier(max_depth=3)]}]
    dict_method.update({"AdaBC-em": [me9, cv9, scoring9, param_grid9]})

    # 3rd part

    'SGDCL1'
    me12 = linear_model.SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, l1_ratio=0.15,
                                      fit_intercept=True, max_iter=None, tol=None, shuffle=True,
                                      verbose=0, epsilon=0.1, random_state=0,
                                      learning_rate='optimal', eta0=0.0, power_t=0.5,
                                      class_weight="balanced", warm_start=False, average=False, n_iter=None)
    cv12 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring12 = 'accuracy'
    param_grid12 = [{'alpha': [0.0001, 0.001, 0.01]}, ]
    dict_method.update({"SGDC-L1": [me12, cv12, scoring12, param_grid12]})

    "Per"
    me14 = Perceptron(penalty="l1", alpha=0.0001, fit_intercept=True, max_iter=None, tol=None,
                      shuffle=True, verbose=0, eta0=1.0, random_state=0,
                      class_weight=None, warm_start=False, n_iter=None)
    cv14 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring14 = 'accuracy'
    param_grid14 = [{'alpha': [0.0001, 0.001, 0.01]}, ]
    dict_method.update({"Per-L1": [me14, cv14, scoring14, param_grid14]})

    """LogRL1"""
    me15 = LogisticRegression(penalty='l1', solver='liblinear', dual=False, tol=1e-3, C=1.0, fit_intercept=True,
                              intercept_scaling=1, class_weight='balanced', random_state=0)
    cv15 = StratifiedKFold(5, shuffle=True, random_state=0)
    scoring15 = 'accuracy'
    param_grid15 = [{'C': [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2]}, ]
    dict_method.update({"LogR-L1": [me15, cv15, scoring15, param_grid15]})

    return dict_method
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

# In[ ]:

# initialization algorithms
algorithms = {  # Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    # Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),

    # Generalized Linear Models
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    # Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),

    # Nearest Neighbor
    neighbors.KNeighborsClassifier(),
Example #23
# # Step 5: Model Data

# In[*]

#Machine Learning Algorithm (MLA) Selection and initialization
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(n_estimators=100),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),

    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.PassiveAggressiveClassifier(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),

    #Naive Bayes
    naive_bayes.GaussianNB(),

    #Nearest Neighbor
    neighbors.KNeighborsClassifier(n_neighbors=3),

    #SVM
Example #24
 def learn(self, data_train, target_train):
     self.model = gaussian_process.GaussianProcessClassifier()
     self.model.fit(
         preprocessing.normalize(self.maybe_reshape(data_train), axis=1),
         target_train)
     return
Example #25
 def get_skl_estimator(self, **default_parameters):
     return gaussian_process.GaussianProcessClassifier(**default_parameters)
Example #26
def compare_algorithm(data, target):
    x_train, x_cross, y_train, y_cross = train_test_split(data, target)
    MLA = [
        # Ensemble Methods
        ensemble.AdaBoostClassifier(),
        ensemble.BaggingClassifier(),
        ensemble.ExtraTreesClassifier(),
        ensemble.GradientBoostingClassifier(),
        ensemble.RandomForestClassifier(),

        # Gaussian Processes
        gaussian_process.GaussianProcessClassifier(),

        # GLM
        linear_model.LogisticRegressionCV(),
        linear_model.PassiveAggressiveClassifier(max_iter=1000, tol=0.001),
        linear_model.RidgeClassifierCV(),
        linear_model.SGDClassifier(max_iter=1000, tol=0.001),
        linear_model.Perceptron(max_iter=1000, tol=0.001),

        # Naive Bayes
        naive_bayes.BernoulliNB(),
        naive_bayes.GaussianNB(),

        # Nearest Neighbor
        neighbors.KNeighborsClassifier(),

        # SVM
        svm.SVC(probability=True),
        svm.NuSVC(probability=True),
        svm.LinearSVC(),

        # Trees
        tree.DecisionTreeClassifier(),
        tree.ExtraTreeClassifier(),

        # Discriminant Analysis
        discriminant_analysis.LinearDiscriminantAnalysis(),
        discriminant_analysis.QuadraticDiscriminantAnalysis(),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        xgb.XGBClassifier()
    ]
    MLA_columns = []
    MLA_compare = pd.DataFrame(columns=MLA_columns)

    row_index = 0
    for alg in MLA:
        predicted = alg.fit(x_train, y_train).predict(x_cross)
        fp, tp, th = roc_curve(y_cross, predicted)
        MLA_name = alg.__class__.__name__
        MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
        MLA_compare.loc[row_index, 'MLA Train Accuracy'] = round(
            alg.score(x_train, y_train), 4)
        MLA_compare.loc[row_index, 'MLA Test Accuracy'] = round(
            alg.score(x_cross, y_cross), 4)
        MLA_compare.loc[row_index, 'MLA Precision'] = precision_score(
            y_cross, predicted)
        MLA_compare.loc[row_index,
                        'MLA Recall'] = recall_score(y_cross, predicted)
        MLA_compare.loc[row_index, 'MLA AUC'] = auc(fp, tp)
        row_index = row_index + 1

    MLA_compare.sort_values(by=['MLA Test Accuracy'],
                            ascending=False,
                            inplace=True)
    print(MLA_compare)
Example #27
    #    print("Negative Log Likelihood: %.3f\n" % (mul_gp1.log_marginal_likelihood(theta=None)))

    #    print("Computing 10-fold CV...\n")
    #    tcv = time.time()
    #    cv_gp1 = cross_val_score(mul_gp1, data1[data_features[0:11]], data1["eval"], cv=10)
    #    elapsed = time.time() - tcv
    #    print('CV computation time :: %.3f\n' % (elapsed))
    ##print('CV-prediction error rate :: {}'.format(cv_gp1))
    ##mean cv and the 95% confidence interval of the cv's estimate
    #    print("Accuracy(Mean CV): %0.2f (+/- %0.2f)\n" % (cv_gp1.mean(), cv_gp1.std() * 2))
    #    print('---------------------------------------------')

    # Multiclass as One-vs-One
    t2 = time.time()
    #kernel=1.0 * RBF(length_scale=1.0)
    mul_gp2 = gaussian_process.GaussianProcessClassifier(
        multi_class='one_vs_one').fit(train_x, train_y)
    trainTime = time.time() - t2
    #    print('Multiclass (1-vs-1) computation time :: %.3f\n' % (elapsed))
    trainTestStartTime = time.time()
    print('Multiclass (1-vs-1) Gaussian Process Train Accuracy :: %.3f\n' %
          (metrics.accuracy_score(train_y, mul_gp2.predict(train_x))))
    trainTestTime = time.time() - trainTestStartTime
    testTestStartTime = time.time()
    print('Multiclass (1-vs-1) Gaussian Process Test Accuracy :: %.3f\n' %
          (metrics.accuracy_score(test_y, mul_gp2.predict(test_x))))
    testTestTime = time.time() - testTestStartTime
    print(trainTime)
    print(trainTestTime)
    print(testTestTime)
#    print("Negative Log Likelihood: %.3f\n" % (mul_gp1.log_marginal_likelihood(theta=None)))
# coding=utf-8
"""Guassian Process Classifier applied on the Iris dataset."""

from sklearn import datasets, model_selection, gaussian_process, metrics

if __name__ == "__main__":
    print("Loading data...")
    X, y = datasets.load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y)

    print("Fitting model...")
    gpc = gaussian_process.GaussianProcessClassifier(
        kernel=gaussian_process.kernels.RBF([1.0]))
    gpc.fit(X_train, y_train)

    print("Evaluating model...")
    print(metrics.classification_report(y_test, gpc.predict(X_test)))
    print(metrics.confusion_matrix(y_test, gpc.predict(X_test)))
Example #29
    test_features = pd.read_csv('test_features.csv', index_col=0)

else:
    # Split the training and testing data sets; save this test data set for later use too
    train_labels, test_labels, train_features, test_features = train_test_split(preprocessed_labels, preprocessed_features, test_size=0.2)

    test_features.to_csv("test_features.csv")
    test_labels.to_csv("test_labels.csv")

    # Create one of the following classifiers:
    if classifier_type == "Tree":
        classifier = tree.DecisionTreeClassifier()  # Create Classifier, doesn't even need any of the params changed
    elif classifier_type == "DNN":
        classifier = neural_network.MLPClassifier(verbose=True, max_iter=max_iter, early_stopping=early_stopping, hidden_layer_sizes=(100, 50))
    elif classifier_type == "Gaussian":
        classifier = gaussian_process.GaussianProcessClassifier(kernel=1.0*RBF(1.0))
    elif classifier_type == "Cal_Class_Test":
        classifier = neural_network.MLPClassifier(verbose=True, max_iter=max_iter, hidden_layer_sizes=(100, 50))
        classifier = CalibratedClassifierCV(classifier, cv=5, method="isotonic")
        use_calibrator = False    # already calibrated
    else:   # Default to DNN
        print("Classifier Unselected, Defaulting to DNN")
        print("----------------------------------------")
        classifier = neural_network.MLPClassifier(verbose=True, max_iter=max_iter, hidden_layer_sizes=(100, 50))

    classifier.fit(train_features, train_labels)  # Fit Model

    if use_calibrator:      # Calibrate Classifier to adjust label probabilities
        print('calibrating...')
        classifier = CalibratedClassifierCV(classifier, cv="prefit", method=calibration_type)  # Defaults to sigmoid
        classifier.fit(train_features, train_labels)
def multi_classifier_voting_predication(data1, data1_x_bin, cv_split, Target):
    # why choose one model, when you can pick them all with voting classifier
    # http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
    # removed models w/o attribute 'predict_proba' required for vote classifier and models with a 1.0 correlation to another model
    vote_est = [
        # Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
        ('ada', ensemble.AdaBoostClassifier()),
        ('bc', ensemble.BaggingClassifier()),
        ('etc', ensemble.ExtraTreesClassifier()),
        ('gbc', ensemble.GradientBoostingClassifier()),
        ('rfc', ensemble.RandomForestClassifier()),
        # Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
        ('gpc', gaussian_process.GaussianProcessClassifier()),

        # GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
        ('lr', linear_model.LogisticRegressionCV()),

        # Naive Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
        ('bnb', naive_bayes.BernoulliNB()),
        ('gnb', naive_bayes.GaussianNB()),

        # Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
        ('knn', neighbors.KNeighborsClassifier()),

        # SVM: http://scikit-learn.org/stable/modules/svm.html
        ('svc', svm.SVC(probability=True)),

        # xgboost: http://xgboost.readthedocs.io/en/latest/model.html
        ('xgb', XGBClassifier())
    ]
    # Hard Vote or majority rules
    vote_hard = ensemble.VotingClassifier(estimators=vote_est, voting='hard')
    vote_hard_cv = model_selection.cross_validate(vote_hard,
                                                  data1[data1_x_bin],
                                                  data1[Target],
                                                  cv=cv_split,
                                                  return_train_score=True)
    vote_hard.fit(data1[data1_x_bin], data1[Target])

    print("Hard Voting Training w/bin score mean: {:.2f}".format(
        vote_hard_cv['train_score'].mean() * 100))
    print("Hard Voting Test w/bin score mean: {:.2f}".format(
        vote_hard_cv['test_score'].mean() * 100))
    print("Hard Voting Test w/bin score 3*std: +/- {:.2f}".format(
        vote_hard_cv['test_score'].std() * 100 * 3))
    print('-' * 10)

    # Soft Vote or weighted probabilities
    vote_soft = ensemble.VotingClassifier(estimators=vote_est, voting='soft')
    vote_soft_cv = model_selection.cross_validate(vote_soft,
                                                  data1[data1_x_bin],
                                                  data1[Target],
                                                  cv=cv_split,
                                                  return_train_score=True)
    vote_soft.fit(data1[data1_x_bin], data1[Target])

    print("Soft Voting Training w/bin score mean: {:.2f}".format(
        vote_soft_cv['train_score'].mean() * 100))
    print("Soft Voting Test w/bin score mean: {:.2f}".format(
        vote_soft_cv['test_score'].mean() * 100))
    print("Soft Voting Test w/bin score 3*std: +/- {:.2f}".format(
        vote_soft_cv['test_score'].std() * 100 * 3))
    print('-' * 10)
    return vote_hard, vote_soft