Example #1
File: demo.py  Project: zistvan/caribou
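The excerpt starts at the function definition and omits its imports; below is a minimal assumed set, inferred from the calls in the function (the original project file may differ).

# Assumed imports, inferred from the calls below (not shown in the original excerpt).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB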
def compare_classifiers(file):
    if "banknote" in file:
        used_features = ["Variance", "Skewness", "Curtosis", "Entropy"]
        output_feature = "Class"
        orig = pd.read_csv("banknote_s_orig.csv")
        pert = pd.read_csv("banknote_s_pert.csv")
    elif "diabetes" in file:
        used_features = [
            "Pregnancies", "Glucose", "BloodPressure", "SkinThickness",
            "Insulin", "BMI", "DiabetesPedigreeFunction", "Age"
        ]
        output_feature = "Outcome"
        orig = pd.read_csv("diabetes_orig.csv")
        pert = pd.read_csv("diabetes_pert.csv")
    elif "bank" in file:
        used_features = [
            "Age", "Job", "Marital", "Education", "Default", "Balance",
            "Housing", "Loan", "Contact", "Day", "Month", "Duration",
            "Campaign", "Pdays", "Previous", "Poutcome"
        ]
        output_feature = "Y"
        orig = pd.read_csv("bank_labeled_orig.csv")
        pert = pd.read_csv("bank_labeled_pert.csv")

    sc = StandardScaler()
    orig_scaled = orig.copy()
    pert_scaled = pert.copy()
    orig_scaled[used_features] = sc.fit_transform(orig_scaled[used_features])
    pert_scaled[used_features] = sc.fit_transform(pert_scaled[used_features])

    y_orig = orig_scaled[output_feature]
    x_orig = orig_scaled.drop([output_feature], axis=1)[used_features]
    y_pert = pert_scaled[output_feature]
    x_pert = pert_scaled.drop([output_feature], axis=1)[used_features]

    x_train_orig, x_test_orig, y_train_orig, y_test_orig = train_test_split(
        x_orig, y_orig, test_size=0.33, random_state=42)
    x_train_pert, x_test_pert, y_train_pert, y_test_pert = train_test_split(
        x_pert, y_pert, test_size=0.33, random_state=42)

    models = [
        DecisionTreeClassifier(),
        KNeighborsClassifier(),
        SVC(),
        GaussianProcessClassifier(),
        RandomForestClassifier(),
        MLPClassifier(),
        AdaBoostClassifier(),
        QuadraticDiscriminantAnalysis(),
        GaussianNB()
    ]

    fig = plt.figure()
    for idx, model in enumerate(models):
        model.fit(x_train_orig, y_train_orig)
        y_pred_orig = model.predict(x_test_orig)
        tn_orig, fp_orig, fn_orig, tp_orig = confusion_matrix(
            y_test_orig, y_pred_orig).ravel()
        model.fit(x_train_pert, y_train_pert)
        y_pred_pert = model.predict(x_test_pert)
        tn_pert, fp_pert, fn_pert, tp_pert = confusion_matrix(
            y_test_pert, y_pred_pert).ravel()

        accuracy_orig = (tp_orig + tn_orig) / (tp_orig + tn_orig + fp_orig +
                                               fn_orig) * 100
        accuracy_pert = (tp_pert + tn_pert) / (tp_pert + tn_pert + fp_pert +
                                               fn_pert) * 100
        precision_orig = tp_orig / (tp_orig + fp_orig) * 100
        precision_pert = tp_pert / (tp_pert + fp_pert) * 100
        recall_orig = tp_orig / (tp_orig + fn_orig) * 100
        recall_pert = tp_pert / (tp_pert + fn_pert) * 100

        print(("\n{}:\n Accuracy original  | masked = {:5.2f} | {:5.2f}\n" +
               " Precision original | masked = {:5.2f} | {:5.2f}\n" +
               " Recall original    | masked = {:5.2f} | {:5.2f}").format(
                   type(model).__name__, accuracy_orig, accuracy_pert,
                   precision_orig, precision_pert, recall_orig, recall_pert))

        plot_data = [[accuracy_orig, precision_orig, recall_orig],
                     [accuracy_pert, precision_pert, recall_pert]]
        X = np.arange(3)
        ax = fig.add_subplot(3, 3, idx + 1)
        ax.bar(X + 0.00,
               plot_data[0],
               color='b',
               width=0.25,
               label="Original data")
        ax.bar(X + 0.25,
               plot_data[1],
               color='g',
               width=0.25,
               label="Masked data")
        ax.set_title(type(model).__name__)
        ax.set_xticks(X)
        ax.set_xticklabels(["Accuracy", "Precision", "Recall"])

    plt.tight_layout()
    plt.legend()
    plt.show()
Example #2
import numpy as np

from matplotlib import pyplot as plt

from sklearn.metrics import accuracy_score, log_loss
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# Generate data
train_size = 50
rng = np.random.RandomState(0)
X = rng.uniform(0, 5, 100)[:, np.newaxis]
y = np.array(X[:, 0] > 2.5, dtype=int)

# Specify Gaussian Processes with fixed and optimized hyperparameters
gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                   optimizer=None)
gp_fix.fit(X[:train_size], y[:train_size])

gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp_opt.fit(X[:train_size], y[:train_size])

print("Log Marginal Likelihood (initial): %.3f" %
      gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta))
print("Log Marginal Likelihood (optimized): %.3f" %
      gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta))

print("Accuracy: %.3f (initial) %.3f (optimized)" % (
    accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
    accuracy_score(y[:train_size], gp_opt.predict(X[:train_size])),
))
print("Log-loss: %.3f (initial) %.3f (optimized)" % (
Example #3
    'L2 logistic (Multinomial)':
    LogisticRegression(C=C,
                       penalty='l2',
                       solver='saga',
                       multi_class='multinomial',
                       max_iter=10000),
    'L2 logistic (OvR)':
    LogisticRegression(C=C,
                       penalty='l2',
                       solver='saga',
                       multi_class='ovr',
                       max_iter=10000),
    'Linear SVC':
    SVC(kernel='linear', C=C, probability=True, random_state=0),
    'GPC':
    GaussianProcessClassifier(kernel)
}

n_classifiers = len(classifiers)

plt.figure(figsize=(3 * 2, n_classifiers * 2))
plt.subplots_adjust(bottom=.2, top=.95)

xx = np.linspace(3, 9, 100)
yy = np.linspace(1, 5, 100).T
xx, yy = np.meshgrid(xx, yy)
Xfull = np.c_[xx.ravel(), yy.ravel()]

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X, y)
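The excerpt stops right after fitting each classifier. A hedged sketch of how the loop typically continues in the scikit-learn probability-map example this snippet resembles (the layout and labels below are assumptions):

    # assumed continuation: image per-class probability maps over the mesh
    probas = classifier.predict_proba(Xfull)
    n_classes = probas.shape[1]
    for k in range(n_classes):
        plt.subplot(n_classifiers, n_classes, index * n_classes + k + 1)
        plt.title("Class %d" % k)
        if k == 0:
            plt.ylabel(name)
        plt.imshow(probas[:, k].reshape((100, 100)),
                   extent=(3, 9, 1, 5), origin='lower')
        plt.xticks(())
        plt.yticks(())

plt.show()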
Example #4
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# import some data to play with
iris = datasets.load_iris()
X = iris.data[:, :2]  # we only take the first two features.
y = np.array(iris.target, dtype=int)

h = .02  # step size in the mesh

kernel = 1.0 * RBF([1.0])
gpc_rbf_isotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y)
kernel = 1.0 * RBF([1.0, 1.0])
gpc_rbf_anisotropic = GaussianProcessClassifier(kernel=kernel).fit(X, y)

# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

titles = ["Isotropic RBF", "Anisotropic RBF"]
plt.figure(figsize=(10, 5))
for i, clf in enumerate((gpc_rbf_isotropic, gpc_rbf_anisotropic)):
    # Plot the predicted probabilities. For that, we will assign a color to
    # each point in the mesh [x_min, x_max] x [y_min, y_max].
    plt.subplot(1, 2, i + 1)
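The snippet ends right after selecting the subplot. A hedged continuation in the spirit of scikit-learn's GPC-on-iris example (the colour mapping and axis labels below are assumptions):

    # assumed continuation: paint the mesh with the predicted class probabilities
    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape((xx.shape[0], xx.shape[1], 3))  # one colour channel per iris class
    plt.imshow(Z, extent=(x_min, x_max, y_min, y_max), origin="lower")
    plt.scatter(X[:, 0], X[:, 1], c=np.array(["r", "g", "b"])[y], edgecolors=(0, 0, 0))
    plt.xlabel("Sepal length")
    plt.ylabel("Sepal width")
    plt.title(titles[i])

plt.tight_layout()
plt.show()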
Example #5
### WRITE YOUR CODE HERE
# If you get stuck, uncomment the line above to load a correction in this cell (then you can execute this code).

from sklearn.gaussian_process import GaussianProcessClassifier

spam_GP = GaussianProcessClassifier()
print(spam_GP.fit(Xtrain.toarray(), ytrain))

print("Score:", spam_GP.score(Xtest.toarray(), ytest))
Example #6
File: test_gpc.py  Project: AnAnteup/icp4
def test_predict_consistent(kernel):
    # Check binary predict decision has also predicted probability above 0.5.
    gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
    assert_array_equal(gpc.predict(X), gpc.predict_proba(X)[:, 1] >= 0.5)
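Examples #6, #7 and #19 are test functions that rely on module-level fixtures (X, y and a kernel parameter) that the excerpts do not show; a sketch of what such fixtures typically look like in scikit-learn's test_gpc.py (assumed here):

import numpy as np
from sklearn.gaussian_process.kernels import RBF

# a small 1-D binary problem, similar to the fixtures in scikit-learn's test_gpc.py
rng = np.random.RandomState(0)
X = rng.uniform(0, 5, 30)[:, np.newaxis]
y = np.array(X[:, 0] > 2.5, dtype=int)
kernel = 1.0 * RBF(length_scale=1.0)  # a representative kernel passed into the tests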
Example #7
File: test_gpc.py  Project: AnAnteup/icp4
def test_lml_precomputed(kernel):
    # Test that lml of optimized kernel is stored correctly.
    gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
    assert_almost_equal(gpc.log_marginal_likelihood(gpc.kernel_.theta),
                        gpc.log_marginal_likelihood(), 7)
Example #8
def fun_classify(inputFile, groupsSel, FeatSelect, Nfeats, scaleFeats=1):
    """
    AllStatsMean, AllStatsSTD = fun_classify(inputFile, groupsSel, FeatSelect, Nfeats)
    inputFile: the .csv file containt feature tables
    groups: The selected groups to classify. Full set is ["S","F","Z","N","O"],
    but ["S","F","Z"] are of most interest for the article (ictal, inter-ictal and normal EEG)
    FeatSelect: feature selection method: PCA, RFE, fisher or none
    Nfeats: number of selected features
    Returns:
    AllStatsMean: mean performance values
    AllStatsSTD: standard deviation of performance values  
    """
    #reads input features
    dfFeats = pd.read_csv(inputFile, sep=',', header=0)

    #only selected groups
    dfFeats = dfFeats[dfFeats["Group"].isin(groupsSel)]
    if "decTaime" in dfFeats:
        x = dfFeats.iloc[:, 2:]  #ignores decomposition method execution time
    else:
        x = dfFeats.iloc[:, 1:]
    y = dfFeats.iloc[:, 0].values
    if scaleFeats:  #scale feats?
        x = StandardScaler().fit_transform(x)
    #Feature selection
    if x.shape[1] > Nfeats:
        #RFE
        if FeatSelect == "RFE":
            rfeModel = SVC(kernel="linear",
                           C=0.025,
                           probability=True,
                           gamma='scale')
            rfeSelect = RFE(rfeModel, n_features_to_select=Nfeats)
            rfe_fit = rfeSelect.fit(x, y)
            x = x[:, rfe_fit.support_]

        if FeatSelect == "PCA":
            pca = PCA(n_components=Nfeats)
            x = pca.fit_transform(x)

        if FeatSelect == "fisher":
            fisherScore = fisher_score.fisher_score(x, y)
            idx = fisher_score.feature_ranking(fisherScore)
            x = x[:, idx[:Nfeats]]

    names = ["KNN", "Linear SVM", "RBF SVM", "GPC", "MLP"]

    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025, probability=True, gamma='scale'),
        SVC(probability=True, gamma='scale'),
        GaussianProcessClassifier(1.0 * RBF(1.0)),
        MLPClassifier(alpha=1, max_iter=200)
    ]

    #initialize performance variable
    AllStats = {}
    AllStatsMean = {}
    AllStatsSTD = {}

    for name in names:
        AllStats[name] = {
            "Accuracy": np.zeros([realizations, K_folds]),
            "SensitivityMean": np.zeros([realizations, K_folds]),
            "SpecificityMean": np.zeros([realizations, K_folds]),
            "AUC_Mean": np.zeros([realizations, K_folds]),
            "SensitivityIctal": np.zeros([realizations, K_folds]),
            "SpecificityIctal": np.zeros([realizations, K_folds]),
            "AUC_Ictal": np.zeros([realizations, K_folds]),
            "TTtimes": np.zeros([realizations, K_folds])
        }
        AllStatsMean[name] = {
            "Accuracy": 0.,
            "SensitivityMean": 0.,
            "SpecificityMean": 0,
            "AUC_Mean": 0.,
            "SensitivityIctal": 0.,
            "SpecificityIctal": 0.,
            "AUC_Ictal": 0.,
            "TTtimes": 0.
        }
        AllStatsSTD[name] = {
            "Accuracy": 0.,
            "SensitivityMean": 0.,
            "SpecificityMean": 0,
            "AUC_Mean": 0.,
            "SensitivityIctal": 0.,
            "SpecificityIctal": 0.,
            "AUC_Ictal": 0.,
            "TTtimes": 0.
        }
        #for each realization
    for i in range(realizations):
        skf = StratifiedKFold(n_splits=K_folds,
                              shuffle=True)  #5-fold validation

        for tupTemp, ki in zip(skf.split(x, y), range(K_folds)):
            train_idx, test_idx = tupTemp[0], tupTemp[1]
            X_train, X_test = x[train_idx], x[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            for name, clf in zip(names, classifiers):  #for each classifier
                tic = time.time()  #check training/testing time of each classifier
                #Fit model and predict
                modelFit = clf.fit(X_train, y_train)
                yPredicted = modelFit.predict(X_test)
                probsTest = modelFit.predict_proba(X_test)
                toc = time.time()
                # AUC -  #ictal class as positive
                if len(np.unique(y)) > 2:
                    AUCs = roc_auc_score(
                        LabelBinarizer().fit_transform(y_test),
                        probsTest,
                        average=None)
                else:
                    AUCs = roc_auc_score(y_test, probsTest[:, 1], average=None)
                #Sensitivity and Specificity
                cMatrix = confusion_matrix(y_test, yPredicted)
                FP = cMatrix.sum(axis=0) - np.diag(cMatrix)
                FN = cMatrix.sum(axis=1) - np.diag(cMatrix)
                TP = np.diag(cMatrix)
                TN = cMatrix.sum() - (FP + FN + TP)
                # Sensitivity
                TPR = TP / (TP + FN)
                # Specificity or true negative rate
                TNR = TN / (TN + FP)
                #fill performance variable
                AllStats[name]["Accuracy"][i, ki] = accuracy_score(
                    y_test, yPredicted)
                AllStats[name]["SensitivityMean"][i, ki] = np.mean(TPR)
                AllStats[name]["SpecificityMean"][i, ki] = np.mean(TNR)
                AllStats[name]["SensitivityIctal"][i, ki] = TPR[0]
                AllStats[name]["SpecificityIctal"][i, ki] = TNR[0]
                AllStats[name]["AUC_Mean"][i, ki] = np.mean(AUCs)
                AllStats[name]["TTtimes"][i, ki] = toc - tic
                if len(np.unique(y)) > 2:
                    AllStats[name]["AUC_Ictal"][i, ki] = AUCs[0]
    AllStatsDF = [0] * len(names)
    for idx, name in enumerate(names):
        for istat in AllStats[name].keys():
            AllStats[name][istat] = np.mean(AllStats[name][istat], axis=1)
            AllStatsMean[name][istat] = np.mean(AllStats[name][istat])
            AllStatsSTD[name][istat] = np.std(AllStats[name][istat])
        AllStatsDF[idx] = pd.DataFrame.from_dict(AllStats[name])
        AllStatsDF[idx]["Nmodes"] = Nmodes
        AllStatsDF[idx]["Classifier"] = name

    return pd.DataFrame.from_dict(AllStatsMean), pd.DataFrame.from_dict(
        AllStatsSTD), pd.concat(AllStatsDF)
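fun_classify also reads the module-level globals realizations, K_folds and Nmodes, which the excerpt does not show. A hypothetical call (the file name and global values are assumptions):

realizations, K_folds, Nmodes = 10, 5, 3   # assumed values for the globals used above

stats_mean, stats_std, stats_df = fun_classify(
    "feature_table.csv",        # hypothetical feature file
    groupsSel=["S", "F", "Z"],
    FeatSelect="PCA",
    Nfeats=10)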
Example #9
models = [
#            MultinomialNB(alpha=.01), #very fast ~0.7
            LogisticRegression(multi_class="auto", solver="lbfgs"), # fast ~10m and very good ~0.865 
#            QuadraticDiscriminantAnalysis(), # fast ~5m and very good ~0.81 
#            DecisionTreeClassifier(), # fast enough
#            RandomForestClassifier(), # fast enough
#            GaussianNB(), #very fast, performance around 0.58
         ]

non_models = [
            #xgb.XGBClassifier(objective="multi:softprob"), # too slow probably around 10h and okayish ~ 0.759
            KNeighborsClassifier(100), #works but v slow --> takes an hour + need to consider a hundred NNs because there are 20 classes
            SVC(kernel="linear", C=0.025, probability=True), # too slow to work
            SVC(gamma=2, C=1, probability=True), # too slow to work but v good ~0.83
            GaussianProcessClassifier(1.0 * RBF(1.0)), # too slow to work
            MLPClassifier(alpha=1, max_iter=1000), #fast enough ~80m and really good ~0.81
            AdaBoostClassifier() #prob take 1-2h, also very poor performance 
        ]

non_names = ["Nearest Neighbors", "Linear SVM","RBF SVM","Gaussian Process", "Neural Net","AdaBoost"] # "xgboost",

names = [
#         "Multinomial Naive Bayes",
         "Logistic Regression",
#         "QDA",
#         "Decision Tree",
#         "Random Forest",
#         "Gaussian Naive Bayes"
         ]
def load_data(dataset='20newsgroups', true_ratio = 0.5):
def run_all_classifiers(X_train, X_test, y_train, y_test, print_output_scores_to_csv=False, output_scores_csv_file_suffix='', print_only_table=False):
    """
    The list of all classifiers was generated by running the following commented code.

    Args:
        X_train, X_test, y_train, y_test: The train and test datasets.
        print_output_scores_to_csv: If True, the Precision, Recall, F1-Score and Support for both classes
        are written to a csv file named with the current date and time.
        output_scores_csv_file_suffix: Suffix added to the csv file name just before the .csv extension,
        normally describing the run being performed.

    Returns:
        output_scores_df: The output scores dataset.

    """
    assert isinstance(X_train, pd.core.frame.DataFrame)
    assert isinstance(X_test,  pd.core.frame.DataFrame)
    assert isinstance(y_train, pd.core.frame.Series)
    assert isinstance(y_test,  pd.core.frame.Series)
    assert isinstance(print_output_scores_to_csv, bool)
    assert isinstance(output_scores_csv_file_suffix, object)

    import time

    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    #from sklearn.utils.testing import all_estimators
    #estimators = all_estimators()
    #for name, class_ in estimators:
    #    log_print(name)

    from sklearn.calibration           import CalibratedClassifierCV
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
    from sklearn.ensemble              import AdaBoostClassifier
    from sklearn.ensemble              import BaggingClassifier
    from sklearn.ensemble              import ExtraTreesClassifier
    from sklearn.ensemble              import GradientBoostingClassifier
    from sklearn.ensemble              import RandomForestClassifier
    from sklearn.gaussian_process      import GaussianProcessClassifier
    from sklearn.linear_model          import LogisticRegression
    from sklearn.linear_model          import LogisticRegressionCV
    from sklearn.linear_model          import SGDClassifier

    from sklearn.mixture               import BayesianGaussianMixture
    from sklearn.mixture               import DPGMM
    from sklearn.mixture               import GaussianMixture
    from sklearn.mixture               import GMM
    from sklearn.mixture               import VBGMM
    from sklearn.naive_bayes           import BernoulliNB
    from sklearn.naive_bayes           import GaussianNB
    from sklearn.neighbors             import KNeighborsClassifier
    from sklearn.neural_network        import MLPClassifier
    from sklearn.semi_supervised       import LabelPropagation
    from sklearn.semi_supervised       import LabelSpreading
    from sklearn.svm                   import SVC
    from sklearn.tree                  import DecisionTreeClassifier
    #from xgboost                       import XGBClassifier

    models = []
    models.append(('AdaBoostClassifier',            AdaBoostClassifier()))
    models.append(('BaggingClassifier',             BaggingClassifier()))
    models.append(('BayesianGaussianMixture',       BayesianGaussianMixture()))
    models.append(('BernoulliNB',                   BernoulliNB()))
    models.append(('CalibratedClassifierCV',        CalibratedClassifierCV()))
    models.append(('DPGMM',                         DPGMM()))
    models.append(('DecisionTreeClassifier',        DecisionTreeClassifier(random_state=SEED)))
    models.append(('ExtraTreesClassifier',          ExtraTreesClassifier(random_state=SEED)))
    models.append(('GMM',                           GMM()))
    models.append(('GaussianMixture',               GaussianMixture()))
    models.append(('GaussianNB',                    GaussianNB()))
    models.append(('GaussianProcessClassifier',     GaussianProcessClassifier()))
    models.append(('GradientBoostingClassifier',    GradientBoostingClassifier()))
    models.append(('KNeighborsClassifier',          KNeighborsClassifier()))
    models.append(('LabelPropagation',              LabelPropagation()))
    models.append(('LabelSpreading',                LabelSpreading()))
    models.append(('LinearDiscriminantAnalysis',    LinearDiscriminantAnalysis()))
    models.append(('LogisticRegression',            LogisticRegression()))
    models.append(('LogisticRegressionCV',          LogisticRegressionCV()))
    models.append(('MLPClassifier',                 MLPClassifier()))
    #models.append(('MultinomialNB', MultinomialNB()))
    #models.append(('NuSVC', NuSVC()))
    models.append(('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()))
    models.append(('RandomForestClassifier',        RandomForestClassifier(random_state=SEED)))
    models.append(('SGDClassifier',                 SGDClassifier()))
    models.append(('SVC',                           SVC()))
    models.append(('VBGMM',                         VBGMM()))
    #models.append(('XGBClassifier',                 XGBClassifier()))
    
    output_scores_df = fit_predict_plot(X_train, X_test, y_train, y_test, models, print_only_table)

    if print_output_scores_to_csv:
        output_scores_df.to_csv(time.strftime('output_scores' + str(output_scores_csv_file_suffix) + '.csv'))

    return output_scores_df

def run_all_classifiers(X_train, X_test, y_train, y_test, print_details=True):
    """
    Run all classifiers of sklearn

    Args:
        X_train, X_test, y_train, y_test: The train and test datasets.
        print_details: if True, print details of all models and save a csv table;
                       if False, print only a summary table of the models
    Returns:
        output_scores_dataset: Returns the output scores dataset.

    """
    assert isinstance(X_train, pd.core.frame.DataFrame)
    assert isinstance(X_test, pd.core.frame.DataFrame)
    assert isinstance(y_train, pd.core.frame.Series)
    assert isinstance(y_test, pd.core.frame.Series)
    assert isinstance(print_details, bool)

    log_method_execution_time(log_funcname())

    from sklearn.utils.testing import all_estimators
    import sklearn.metrics
    import time
    from src.util.acq_util import RANDOM_SEED

    # https://stackoverflow.com/questions/42160313/how-to-list-all-classification-regression-clustering-algorithms-in-scikit-learn
    #from xgboost import XGBClassifier
    #models.append(('XGBClassifier', XGBClassifier()))

    models = all_estimators(type_filter='classifier')
    output_scores_dataset = pd.DataFrame(index=['Precision 0', 'Recall 0', 'F1-Score 0', 'Support 0',
                                                'Precision 1', 'Recall 1', 'F1-Score 1', 'Support 1'],
                                         columns=list(zip(*models))[0])

    for name, model in models:
        if print_details is True:
            print('------------------------------------------------------------------------------')
            print(name)
            print('------------------------------------------------------------------------------')

        if (name == 'MultinomialNB' or name == 'NuSVC' or name == 'RadiusNeighborsClassifier' or name == 'GaussianProcessClassifier'):
            continue

        model = model()
        if 'random_state' in model.get_params():
            model.random_state = SEED

        #Fitting the model.
        model.fit(X_train, y_train)

        #Measuring accuracy.
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)

        output_scores_dataset = class_compute_accuracy(y_train, y_train_pred, output_scores_dataset,
                                                       ['Accuracy on the train set', name], print_details)
        output_scores_dataset = class_compute_accuracy(y_test, y_test_pred, output_scores_dataset,
                                                       ['Accuracy on the test set', name], print_details)

        #Plotting confusion matrix.
        output_scores_dataset = class_compute_plot_confusion_matrix(y_test, y_test_pred, output_scores_dataset, name, print_details)

        #Showing classification report.
        if print_details is True:
            print(sklearn.metrics.classification_report(y_test, y_test_pred))

        # Printing scores to output dataset.
        output_scores_dataset = class_compute_recall_precision_f1(y_test, y_test_pred, output_scores_dataset, name)

    # Can use idxmax with axis=1 to find the column with the greatest value on each row.
    output_scores_dataset['Max Value'] = output_scores_dataset.apply(max, axis=1)
    #output_scores_dataset['Max Classifier'] = output_scores_dataset.idxmax(axis=1)

    if print_details is True:
        output_scores_dataset.to_csv('output_scores' + '.csv')

    return output_scores_dataset

def train_test_split_for_classification(dataset, label, test_size, random_state=SEED):
    """
    Selects X and y, considering that y has been renamed to label.
    """
    from sklearn.model_selection import train_test_split

    assert isinstance(dataset, pd.core.frame.DataFrame)
    assert isinstance(test_size, float)
    assert isinstance(random_state, int)

    X = dataset.loc[:, dataset.columns != label]
    y = dataset[label]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)
    log_print('X_train: {}'.format(X_train.shape))
    log_print('y_train: {}'.format(y_train.shape))
    log_print('X_test:  {}'.format(X_test.shape))
    log_print('y_test:  {}'.format(y_test.shape))
    return(X_train, X_test, y_train, y_test)
Example #11
import pika

import analyzer

# warnings.filterwarnings("ignore")

df = pd.read_csv('data.csv', header=0)
y = df['Genre']
X = df[[
    'Duration', 'Tempo', 'Strength', 'Contrast', 'Fore_Diff', 'Fore_Position'
]]

sc = preprocessing.MinMaxScaler(feature_range=(-1, 1), copy=False)
X = sc.fit_transform(X)

classifier = GaussianProcessClassifier(1.0 * RBF(1.0))

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.67,
                                                    random_state=27,
                                                    shuffle=True)
classifier.fit(X_train, y_train)


def detect(data, channel):
    try:
        response = {
            'id': data['id'],
            'status': 'processing',
            'result': {},
Example #12
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

data = pd.read_csv('finance_data.csv',
                   index_col=['Ticker', 'Fiscal Year', 'Fiscal Period'])
print(data.columns)

Y = data.loc[:, 'pos_neg']
X = data.drop(columns=['pos_neg', 'shifted_chg', 'report_date'])
X = scale(X.values)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=.2,
                                                    shuffle=False)
h = .02  # step size in the mesh

kernel = 1.0 * RBF(1.0)
gpc = GaussianProcessClassifier(kernel=kernel)

gpc.fit(X_train, y_train)

Z = gpc.predict(X_test)
acc = accuracy_score(y_test, Z)
print(acc)
print(y_test[0:10])
print(Z[0:10])
Example #13
 def gaussian_process_classifier(self):
     model = OneVsRestClassifier(GaussianProcessClassifier()).fit(self.train_texts_tfidf, self.train_labels)
     self.save_model(model, self.gpc_filename)
     return model
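For context, the same idea in standalone form; the TfidfVectorizer step, the documents and the labels below are assumptions, since the excerpt only shows the method:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier

docs = ["free prize inside", "meeting moved to noon", "win money now"]  # hypothetical texts
labels = [1, 0, 1]                                                      # hypothetical labels
tfidf = TfidfVectorizer().fit_transform(docs)
# GaussianProcessClassifier needs dense input, hence .toarray()
model = OneVsRestClassifier(GaussianProcessClassifier()).fit(tfidf.toarray(), labels)
print(model.predict(tfidf.toarray()))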
Example #14
        kernelfun = multiplier * gpkernels.RationalQuadratic(
            length_scale,
            alpha=alpha,
            length_scale_bounds=(length_scale_lb, length_scale_ub),
            alpha_bounds=(alpha_lb, alpha_ub))
    else:
        print('It should not have reached here!')
        kernelfun = 1.0 * gpkernels.RBF(1.0)

    #RBF, Matern, ConstantKernel, WhiteKernel, RationalQuadratic
    # length_scale=1.0, length_scale_bounds=(1e-05, 100000.0), nu=1.5
    # length_scale=1.0, alpha=1.0, length_scale_bounds=(1e-05, 100000.0), alpha_bounds=(1e-05, 100000.0)

    #device = torch.device("cpu")

    ## TODO: Define a model
    #print('We instantiate the model')
    model = GaussianProcessClassifier(kernelfun)

    ## TODO: Train the model
    #print('We fit the model')

    model.fit(train_x, train_y)

    print('score-training {}'.format(model.score(train_x, train_y)))

    ## --- End of your code  --- ##

    # Save the trained model
    joblib.dump(model, os.path.join(args.model_dir, "model.joblib"))
Example #15
File: data.py  Project: nefermu/datas
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score

classifiers = [
    KNeighborsClassifier(),
    SVC(),
    GaussianProcessClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()
]

#

#[height, weight, shoe_size]
X = [[181, 80, 44], [177, 70, 43], [160, 60, 38], [154, 54, 37], [166, 65, 40],
     [190, 90, 47], [175, 64, 39], [177, 70, 40], [159, 55, 37], [171, 75, 42],
     [181, 85, 43]]

Y = [
Example #16
X_reduced = PCA(n_components=3).fit_transform(X_)
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=Y_.ravel(),
           cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")
ax.set_xlabel("1st eigenvector")
ax.w_xaxis.set_ticklabels([])
ax.set_ylabel("2nd eigenvector")
ax.w_yaxis.set_ticklabels([])
ax.set_zlabel("3rd eigenvector")
ax.w_zaxis.set_ticklabels([])
plt.show()

from sklearn.gaussian_process.kernels import RBF
kernel = 1.0 * RBF(1.0)
gpc = GaussianProcessClassifier(kernel=kernel,
                                multi_class = 'one_vs_one',
                                random_state=0).fit(X_, Y_)

# lets see how good our fit on the train set is
print(gpc.score(X_, Y_))

# create the TF neural net
# some hyperparams
training_epochs = 200

n_neurons_in_h1 = 10
n_neurons_in_h2 = 10
learning_rate = 0.01
dkl_loss_rate = 0.1

n_features = len(X[0])
Example #17
def evaluateIndividualClassifiers(x, y, train_size_pct):
    """
    evaluateIndividualClassifiers
        x : The features of the dataset to be used for predictions
        y : The target class for each row in "x"
        train_size_pct : {float in the range(0.0, 1.0)} the percentage of the dataset that should be used for training
    """
    max_depth_x2 = MAX_DEPTH * 2
    max_iter_x2 = MAX_ITER * 2
    max_iter_x10 = MAX_ITER * 10
    n_neighbors_x2 = N_NEIGHBORS * 2
    n_neighbors_d2 = N_NEIGHBORS // 2

    rf = RandomForestClassifier(max_depth=MAX_DEPTH,
                                criterion='entropy',
                                random_state=SEED)
    rf_x2 = RandomForestClassifier(max_depth=max_depth_x2,
                                   criterion='entropy',
                                   random_state=SEED)
    et = ExtraTreesClassifier(max_depth=MAX_DEPTH,
                              criterion='entropy',
                              random_state=SEED)
    dectree = DecisionTreeClassifier(max_depth=MAX_DEPTH, random_state=SEED)
    knn = KNeighborsClassifier(n_neighbors=N_NEIGHBORS)
    knn_x2 = KNeighborsClassifier(n_neighbors=n_neighbors_x2)
    knn_d2 = KNeighborsClassifier(n_neighbors=n_neighbors_d2)
    mlpnn = MLPClassifier(max_iter=MAX_ITER)
    mlpnnE = MLPClassifier(max_iter=MAX_ITER, early_stopping=True)
    mlpnn_x2 = MLPClassifier(max_iter=max_iter_x2)
    mlpnnE_x2 = MLPClassifier(max_iter=max_iter_x2, early_stopping=True)
    XGB1 = XGBClassifier()
    GNB1 = GaussianNB()
    dumm = DummyClassifier()
    knb = neighbors.KNeighborsClassifier()
    LR1 = LogisticRegression(max_iter=max_iter_x10)
    SVC1 = SVC(max_iter=max_iter_x10)
    ovr1 = SGDClassifier(max_iter=max_iter_x2)
    ada1 = AdaBoostClassifier()
    gpc1 = GaussianProcessClassifier()
    GBclass1 = GradientBoostingClassifier(n_estimators=100,
                                          learning_rate=1.0,
                                          max_depth=1,
                                          random_state=0)
    histgclass = HistGradientBoostingClassifier(max_iter=max_iter_x2)
    bagclass = BaggingClassifier(KNeighborsClassifier(),
                                 max_samples=0.5,
                                 max_features=0.5)
    ridge1 = RidgeClassifier(max_iter=max_iter_x10)
    #Mnb = MultinomialNB()
    SVC2 = NuSVC(max_iter=max_iter_x10)
    linear1 = LinearSVC(max_iter=max_iter_x10)
    classifier_mapping = {
        f'1-RandomForest-{MAX_DEPTH}': rf,
        f'2-RandomForest-{max_depth_x2}': rf_x2,
        f'3-ExtraTrees-{MAX_DEPTH}': et,
        f'4-DecisionTree-{MAX_DEPTH}': dectree,
        f'5-KNeighbors case1-{N_NEIGHBORS}': knn,
        f'5-KNeighbors case2-{n_neighbors_x2}': knn_x2,
        f'5-KNeighbors case3-{n_neighbors_d2}': knn_d2,
        f'6-MLP case1-{MAX_ITER}': mlpnn,
        f'6-MLP  case2-{MAX_ITER}-early': mlpnnE,
        f'6-MLP case3-{max_iter_x2}': mlpnn_x2,
        f'6-MLP case4-{max_iter_x2}-early': mlpnnE_x2,
        f'7-XGB-': XGB1,
        f'8-GNB-': GNB1,
        f'9-dumm-': dumm,
        f'10-knb-': knb,
        f'11-LR1-': LR1,
        f'12-SVC1-': SVC1,
        f'13-ovr-': ovr1,
        f'14-ada-': ada1,
        f'15-gpc': gpc1,
        f'16-GBclass': GBclass1,
        f'17-histgclas': histgclass,
        f'18-bagclas': bagclass,
        f'19-ridge': ridge1,
        f'20-SVC2': SVC2,
        f'21-linear SVC': linear1,
    }

    for model_name, model in classifier_mapping.items():

        train_test_model(model_name, model, x, y, train_size_pct)
    "ElasticNet": linear_model.ElasticNet(random_state=0),
    "Lars": linear_model.Lars(n_nonzero_coefs=1),
    "LassoLars": linear_model.LassoLars(alpha=.1),
    "Omp": linear_model.OrthogonalMatchingPursuit(n_nonzero_coefs=1),
    "BayesianRidge":linear_model.BayesianRidge(),
    "ARDRegression":linear_model.ARDRegression(),
    "LogisitcRegression":linear_model.LogisticRegression(),
    "SGDClassifier":linear_model.SGDClassifier(),
    "Perceptron": linear_model.Perceptron(),
    "PassiveAggressiveClassifier": linear_model.PassiveAggressiveClassifier(),
    "Theil-Sen": linear_model.TheilSenRegressor(random_state=42),
    "RANSAC": linear_model.RANSACRegressor(random_state=42),
    "Huber": linear_model.HuberRegressor(),
    "SVC linear": SVC(kernel="linear", C=0.025),
    "SVC": SVC(gamma=2, C=1, probability=True),
    "GuassianProcess":GaussianProcessClassifier(1.0 * RBF(1.0)),
    "DecisionTree":DecisionTreeClassifier(max_depth=5),
    "RandomForest":RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    "NeutraNet":MLPClassifier(alpha=1),
    "ADABoost":AdaBoostClassifier(),
    "GaussianNB":GaussianNB(),
    "QDA":QuadraticDiscriminantAnalysis()
}

best_model_names = {}
for model_name in classifiers.keys():
    try:
        model = classifiers[model_name]
        scores = cross_val_score(model, data, data_label, cv=5, verbose=1, scoring='accuracy')
        score = scores.mean()
        if score > .8:
Example #19
File: test_gpc.py  Project: AnAnteup/icp4
def test_lml_improving(kernel):
    # Test that hyperparameter-tuning improves log-marginal likelihood.
    gpc = GaussianProcessClassifier(kernel=kernel).fit(X, y)
    assert_greater(gpc.log_marginal_likelihood(gpc.kernel_.theta),
                   gpc.log_marginal_likelihood(kernel.theta))
Example #20
def return_model(mode, **kwargs):

    if mode == 'logistic':
        solver = kwargs.get('solver', 'liblinear')
        n_jobs = kwargs.get('n_jobs', None)
        max_iter = kwargs.get('max_iter', 5000)
        model = LogisticRegression(solver=solver,
                                   n_jobs=n_jobs,
                                   max_iter=max_iter,
                                   random_state=666)
    elif mode == 'Tree':
        model = DecisionTreeClassifier(random_state=666)
    elif mode == 'RandomForest':
        n_estimators = kwargs.get('n_estimators', 50)
        model = RandomForestClassifier(n_estimators=n_estimators,
                                       random_state=666)
    elif mode == 'GB':
        n_estimators = kwargs.get('n_estimators', 50)
        model = GradientBoostingClassifier(n_estimators=n_estimators,
                                           random_state=666)
    elif mode == 'AdaBoost':
        n_estimators = kwargs.get('n_estimators', 50)
        model = AdaBoostClassifier(n_estimators=n_estimators, random_state=666)
    elif mode == 'SVC':
        kernel = kwargs.get('kernel', 'rbf')
        model = SVC(kernel=kernel, random_state=666)
    elif mode == 'LinearSVC':
        model = LinearSVC(loss='hinge', random_state=666)
    elif mode == 'GP':
        model = GaussianProcessClassifier(random_state=666)
    elif mode == 'KNN':
        n_neighbors = kwargs.get('n_neighbors', 5)
        model = KNeighborsClassifier(n_neighbors=n_neighbors)
    elif mode == 'NB':
        model = MultinomialNB()
    elif mode == 'linear':
        model = LinearRegression()  # LinearRegression has no random_state parameter
    elif mode == 'ridge':
        alpha = kwargs.get('alpha', 1.0)
        model = Ridge(alpha=alpha, random_state=666)
    elif 'conv' in mode:
        tf.reset_default_graph()
        address = kwargs.get('address', 'weights/conv')
        hidden_units = kwargs.get('hidden_layer_sizes', [20])
        activation = kwargs.get('activation', 'relu')
        weight_decay = kwargs.get('weight_decay', 1e-4)
        learning_rate = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 1000)
        early_stopping = kwargs.get('early_stopping', 10)
        warm_start = kwargs.get('warm_start', False)
        batch_size = kwargs.get('batch_size', 256)
        kernel_sizes = kwargs.get('kernel_sizes', [5])
        strides = kwargs.get('strides', [5])
        channels = kwargs.get('channels', [1])
        validation_fraction = kwargs.get('validation_fraction', 0.)
        global_averaging = kwargs.get('global_averaging', 0.)
        optimizer = kwargs.get('optimizer', 'sgd')
        if mode == 'conv':
            model = CShapNN(mode='classification',
                            batch_size=batch_size,
                            max_epochs=max_iter,
                            learning_rate=learning_rate,
                            weight_decay=weight_decay,
                            validation_fraction=validation_fraction,
                            early_stopping=early_stopping,
                            optimizer=optimizer,
                            warm_start=warm_start,
                            address=address,
                            hidden_units=hidden_units,
                            strides=strides,
                            global_averaging=global_averaging,
                            kernel_sizes=kernel_sizes,
                            channels=channels,
                            random_seed=666)
        elif mode == 'conv_reg':
            model = CShapNN(mode='regression',
                            batch_size=batch_size,
                            max_epochs=max_iter,
                            learning_rate=learning_rate,
                            weight_decay=weight_decay,
                            validation_fraction=validation_fraction,
                            early_stopping=early_stopping,
                            optimizer=optimizer,
                            warm_start=warm_start,
                            address=address,
                            hidden_units=hidden_units,
                            strides=strides,
                            global_averaging=global_averaging,
                            kernel_sizes=kernel_sizes,
                            channels=channels,
                            random_seed=666)
    elif 'NN' in mode:
        solver = kwargs.get('solver', 'adam')
        hidden_layer_sizes = kwargs.get('hidden_layer_sizes', (20, ))
        if isinstance(hidden_layer_sizes, list):
            hidden_layer_sizes = list(hidden_layer_sizes)
        activation = kwargs.get('activation', 'relu')
        learning_rate_init = kwargs.get('learning_rate', 0.001)
        max_iter = kwargs.get('max_iter', 5000)
        early_stopping = kwargs.get('early_stopping', False)
        warm_start = kwargs.get('warm_start', False)
        if mode == 'NN':
            model = MLPClassifier(solver=solver,
                                  hidden_layer_sizes=hidden_layer_sizes,
                                  activation=activation,
                                  learning_rate_init=learning_rate_init,
                                  warm_start=warm_start,
                                  max_iter=max_iter,
                                  early_stopping=early_stopping)
        if mode == 'NN_reg':
            model = MLPRegressor(solver=solver,
                                 hidden_layer_sizes=hidden_layer_sizes,
                                 activation=activation,
                                 learning_rate_init=learning_rate_init,
                                 warm_start=warm_start,
                                 max_iter=max_iter,
                                 early_stopping=early_stopping)
    else:
        raise ValueError("Invalid mode!")
    return model
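A brief usage sketch for return_model (the mode strings come from the branches above; the values are illustrative):

# hypothetical usage of the factory above
gp_model = return_model('GP')                    # GaussianProcessClassifier(random_state=666)
knn_model = return_model('KNN', n_neighbors=10)  # KNeighborsClassifier with 10 neighbours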
Example #21
# %%
print('label - feature split')
topcols = [
    'pageinstanceid', 'referringpageinstanceid', 'pagesequenceinattribution',
    'pagesequenceinsession'
]
# X = df[topcols]
# df['pageinstanceid'] = df['pageinstanceid'].apply(str)
# df['referringpageinstanceid'] = df['referringpageinstanceid'].apply(str)
# X_2h = pd.get_dummies(df[topcols])
# X_1h = df.drop(columns='iscustomer').values #<-------

clfs_names = [
    (KNeighborsClassifier(4), 'K-NN 4'),
    (GaussianProcessClassifier(1.0 * RBF(1.0)), 'GaussP'),
    (DecisionTreeClassifier(), 'DeciT'),
    (RandomForestClassifier(n_estimators=300), 'RF3'),
    (MLPClassifier(alpha=1), 'Neu-N'),
    (AdaBoostClassifier(), 'AdaBoo'),
    (GaussianNB(), 'NaiveBayes'),
    (QuadraticDiscriminantAnalysis(), 'QDA'),
    (SVC(gamma=2, C=1), 'RBF-SVM'),
    # (SVC(kernel="linear", C=0.025),'L-SVM')
]

# %% X
X.shape
print(10 * '#', 'ORIG', 10 * '#')
print('test - train split')
tt_split = train_test_split(X, y, test_size=.25)
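The cell stops after the split. A hedged continuation that fits and scores each classifier in clfs_names on the held-out part (assumed, not from the original notebook):

X_train, X_test, y_train, y_test = tt_split
for clf, name in clfs_names:
    clf.fit(X_train, y_train)
    print('%-12s test accuracy: %.3f' % (name, clf.score(X_test, y_test)))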
Example #22
from sklearn import ensemble
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

_slow_or_bad_pipelines = {
    'KNeighborsClassifier':
    Pipeline([('clf', KNeighborsClassifier(2))]),
    'GaussianProcessClassifier':
    Pipeline([('clf', GaussianProcessClassifier(1.0 * RBF(1.0)))]),
    'GaussianNB':
    Pipeline([('clf', GaussianNB())]),
    'QuadraticDiscriminantAnalysis':
    Pipeline([('clf', QuadraticDiscriminantAnalysis())]),
    'ExtraTreeClassifier':
    Pipeline([('clf', ExtraTreeClassifier())]),
}

#decomp =preprocessing.MaxAbsScaler()
#decomp = decomposition.PCA(n_components=100)
decomp = decomposition.TruncatedSVD(n_components=100)
#decomp = decomposition.NMF(n_components=250, random_state=1, ) #alpha=.1, l1_ratio=.5
#decomp = decomposition.LatentDirichletAllocation(n_components=400, learning_method='batch')

classify_pre_pipeline = Pipeline([
Example #23
x_test_original = testset[["bone_length", "rotting_flesh", "hair_length", "has_soul"]]
x_test_hair_soul = testset[["bone_length", "rotting_flesh", "hair_length", "has_soul", "hair_soul"]]

#creating a dictionary to hold classifier objects
clfs = {}
#clfs['lr'] = {'clf': linear_model.LogisticRegression(), 'name':'LogisticRegression'}
#clfs['rf'] = {'clf': ensemble.RandomForestClassifier(n_estimators=750, n_jobs=-1), 'name':'RandomForest'}
#clfs['knn'] = {'clf': neighbors.KNeighborsClassifier(n_neighbors=4), 'name':'kNearestNeighbors'}
#clfs['svc'] = {'clf': svm.SVC(kernel='linear'), 'name': 'SupportVectorClassifier'}

#some of the classifiers
clfs['tr'] = {'clf': DecisionTreeClassifier(), 'name':'DecisionTree'}
clfs['nusvc'] = {'clf': NuSVC(gamma='scale'), 'name': 'NuSVC'}
clfs['linearsvc'] = {'clf': LinearSVC(), 'name': 'LinearSVC'}
clfs['SGD'] = {'clf': SGDClassifier(max_iter=1000, tol=1e-3), 'name': 'SGDClassifier'}
clfs['GPC'] = {'clf': GaussianProcessClassifier(), 'name': 'GaussianProcess'}
clfs['nb'] = {'clf': GaussianNB(), 'name':'GaussianNaiveBayes'}
clfs['bag'] = {'clf': BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5), 'name': "BaggingClassifier"}
clfs['gbc'] = {'clf': GradientBoostingClassifier(), 'name': 'GradientBoostingClassifier'}
#clfs['mlp'] = {'clf': neural_network.MLPClassifier(hidden_layer_sizes=(100,100,100), alpha=1e-5, solver='lbfgs', max_iter=500), 'name': 'MultilayerPerceptron'}

#creating parameters for searching
parameters = {'solver': ['lbfgs'], 'max_iter': [1500], 'alpha': 10.0 ** -np.arange(1, 7), 'hidden_layer_sizes':np.arange(5, 12)}
clfs['mlpgrid'] = {'clf': GridSearchCV(MLPClassifier(), parameters,cv=3,iid=True), 'name': 'MLP with GridSearch'}

parameters = {'kernel':['linear', 'sigmoid', 'poly', 'rbf'], 'gamma':np.linspace(0.0,2.0,num=21),'C': np.linspace(0.5,1.5,num=11)}
clfs['svcgrid'] = {'clf': GridSearchCV(SVC(), parameters,cv=3,iid=True), 'name': 'SVC with GridSearch'}

parameters = {'n_estimators':np.arange(64, 1024, step=64)}
clfs['rfgrid'] = {'clf': GridSearchCV(RandomForestClassifier(), parameters,cv=3,iid=True), 'name': 'Random Forest with GridSearch'}
Example #24
plt.title('2-class Logistic Regression\n Probabilistic Decision Boundary')
plt.scatter(X[:,0][y==1],X[:,1][y==1], label="versicolor",color="red",edgecolors=(0, 0, 0))
plt.scatter(X[:,0][y==2],X[:,1][y==2], label="virginica",color="blue",edgecolors=(0, 0, 0))
plt.scatter(X[:,0][y==0],X[:,1][y==0], label="setosa",color="green",edgecolors=(0, 0, 0))
plt.xlabel("Sepal Length")
plt.ylabel("Petal Length")
plt.xlim(4,8)
plt.ylim(0.5,7.5)
plt.legend()
plt.savefig("2-class-logr-prob.pdf")


#gaussian process decision map
xx, yy = np.mgrid[4:8:0.05, 0.5:7.5:0.05]
kernel = 1.0 * RBF([1.0, 1.0])#rbf_anisotropic
m_gpc = GaussianProcessClassifier(kernel=kernel).fit(Xt, yt)
Z = m_gpc.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)[:, 1]
Z = Z.reshape(xx.shape)
image = plt.imshow(Z.T, interpolation='nearest',
                   extent=(4, 8, 0.5, 7.5),
                   aspect='auto', origin='lower', cmap=plt.cm.RdBu)
plt.scatter(X[:,0][y==1],X[:,1][y==1], label="versicolor",color="red",edgecolors=(0, 0, 0))
plt.scatter(X[:,0][y==2],X[:,1][y==2], label="virginica",color="blue",edgecolors=(0, 0, 0))
plt.scatter(X[:,0][y==0],X[:,1][y==0], label="setosa",color="green",edgecolors=(0, 0, 0))
plt.colorbar(image)
plt.title("2-class RBF Gaussian Process Classifier\n Decision Map")
plt.xlabel("Sepal Length")
plt.ylabel("Petal Length")
plt.xlim(4,8)
plt.ylim(0.5,7.5)
plt.legend()
Example #25
def main(argv):
	# parse data
	parsed = parse_args(argv)
	if parsed.output_directory != None:
		parsed.output_directory += '/' if (not parsed.output_directory.endswith('/')) else ''
		if (os.path.exists(parsed.output_directory)):
			shutil.rmtree(parsed.output_directory)
		os.makedirs(parsed.output_directory)
	
	[gene_id, sample_id, expr_tr, label_tr] = parse_data(parsed.input_expr, 1, 2)

	label_unique= np.unique(label_tr)
	label_count = np.array([len(np.where(label_tr == l)[0]) for l in label_unique])

	print "Training set dimension:", expr_tr.shape[0], "samples x", expr_tr.shape[1], "features"
	print "True labels", label_unique, "| Counts", label_count

	time_start = time.clock()
	
	##### Random Forest #####
	if parsed.learning_algorithm.lower() ==	'random_forest':
		from sklearn.ensemble import RandomForestClassifier

		if parsed.cross_valid:
			## sklearn model selection
			from sklearn.model_selection import GridSearchCV
			rf = RandomForestClassifier()
			hyperparams = {'n_estimators': [250, 500, 1000],
							'criterion': ['gini', 'entropy'],
							'class_weight': [None, 'balanced']}
			clf = GridSearchCV(rf, hyperparams, cv=parsed.cross_valid, n_jobs=4)
			clf.fit(expr_tr, label_tr)
			params = parse_cv_result(clf)
		else:
			params = {'n_estimators': 1000,
							'criterion': 'gini',
							'class_weight': None}
			
		## train the model
		clf = RandomForestClassifier(n_estimators=params['n_estimators'], 
										criterion=params['criterion'],
										class_weight=params['class_weight'],
										oob_score=True,
										n_jobs=4, 
										verbose=False)
		clf.fit(expr_tr, label_tr)
		label_pred = clf.predict(expr_tr)
		accuracy_pred = clf.score(expr_tr, label_tr)

		## save the model
		if parsed.output_directory != None:
			joblib.dump(clf, parsed.output_directory + 
				parsed.learning_algorithm.lower() + '_model.pkl')
		
		## sort genes by importance
		num_most_important_gene = 25
		gene_score = clf.feature_importances_
		gene_index = gene_score.argsort()[-num_most_important_gene:][::-1]
		num_most_important_gene = min(num_most_important_gene, len(gene_score))


	##### C-SVM #####
	elif parsed.learning_algorithm.lower() == 'svm':
		from sklearn.svm import SVC

		if parsed.cross_valid:
			## sklearn model selection
			from sklearn.model_selection import GridSearchCV
			svm = SVC()

			from sklearn.model_selection import RandomizedSearchCV
			import scipy.stats as ss
			hyperparams = {'C': ss.expon(scale=10), #randomized parameters
							'kernel':['rbf'],
							# 'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
							'class_weight': [None]}
			clf = RandomizedSearchCV(svm, hyperparams, n_iter=500, cv=parsed.cross_valid, n_jobs=4)
			
			clf.fit(expr_tr, label_tr)
			params = parse_cv_result(clf)
		else:
			params = {'C': 1.2,
						'kernel': 'rbf',
						'class_weight': None}

		## train the model
		clf = SVC(C=params['C'], 
					kernel=params['kernel'], 
					class_weight=params['class_weight'],
					probability=True, 
					verbose=False)
		clf.fit(expr_tr, label_tr)
		label_pred = clf.predict(expr_tr)
		accuracy_pred = clf.score(expr_tr, label_tr)

		## save the model
		if parsed.output_directory != None:
			joblib.dump(clf, parsed.output_directory + 
				parsed.learning_algorithm.lower() + '_model.pkl')


	##### Nu-SVM #####
	elif parsed.learning_algorithm.lower() == 'nu_svm':
		from sklearn.svm import NuSVC

		if parsed.cross_valid:
			## sklearn model selection
			from sklearn.model_selection import GridSearchCV
			svm = NuSVC()

			from sklearn.model_selection import RandomizedSearchCV
			import scipy.stats as ss
			hyperparams = {'nu': ss.expon(scale=10), #randomized parameters
							'kernel':['rbf'],
							# 'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
							'class_weight': [None]}
			clf = RandomizedSearchCV(svm, hyperparams, n_iter=500, cv=parsed.cross_valid, n_jobs=4)
			
			clf.fit(expr_tr, label_tr)
			params = parse_cv_result(clf)
		else:
			params = {'nu': 0.82, 
						'kernel': 'rbf',
						'class_weight': 'balanced'}

		## train the model
		clf = NuSVC(nu=params['nu'], 
					kernel=params['kernel'], 
					class_weight=params['class_weight'],
					probability=True, 
					verbose=False)
		clf.fit(expr_tr, label_tr)
		label_pred = clf.predict(expr_tr)
		accuracy_pred = clf.score(expr_tr, label_tr)

		## save the model
		if parsed.output_directory != None:
			joblib.dump(clf, parsed.output_directory + 
				parsed.learning_algorithm.lower() + '_model.pkl')


	##### SVR #####
	elif parsed.learning_algorithm.lower() == 'svr':
		from sklearn.svm import SVR

		if parsed.cross_valid:
			## sklearn model selection
			svr = SVR()

			from sklearn.model_selection import RandomizedSearchCV
			import scipy.stats as ss
			hyperparams = {'C': ss.expon(scale=10), #randomized parameters
							'kernel':['rbf'], # 'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
							}
			clf = RandomizedSearchCV(svr, hyperparams, n_iter=500, cv=parsed.cross_valid, n_jobs=4)
			
			clf.fit(expr_tr, convert_labels(label_tr))
			params = parse_cv_result(clf)
		else:
			params = {'C': 1.1,
						'kernel': 'rbf'}

		## train the model
		clf = SVR(C=params['C'], 
					kernel=params['kernel'], 
					verbose=False)
		clf.fit(expr_tr, convert_labels(label_tr))
		label_pred = clf.predict(expr_tr)
		accuracy_pred = clf.score(expr_tr, convert_labels(label_tr)) #coefficient of determination R^2 of the prediction

		## save the model
		if parsed.output_directory != None:
			joblib.dump(clf, parsed.output_directory + 
				parsed.learning_algorithm.lower() + '_model.pkl')


	##### Neural Network #####
	elif parsed.learning_algorithm.lower() == 'neural_net':
		from sklearn.linear_model import LogisticRegression
		from sklearn.neural_network import BernoulliRBM
		from sklearn.pipeline import Pipeline

		# train the model
		logistic = LogisticRegression(C=10)
		rbm = BernoulliRBM(n_components=256, learning_rate=.001, n_iter=100, verbose=False)
		clf = Pipeline(steps=[('rmb', rbm), ('logistic', logistic)])
		clf.fit(expr_tr, label_tr)
		if parsed.output_directory != None:
			joblib.dump(clf, parsed.output_directory + 
				parsed.learning_algorithm.lower() + '_model.pkl')


	##### Naive Bayes #####
	elif parsed.learning_algorithm.lower() == 'naive_bayes':
		from sklearn.naive_bayes import GaussianNB	
		clf = GaussianNB()
		clf.fit(expr_tr, label_tr)
		label_pred = clf.predict(expr_tr)
		accuracy_pred = clf.score(expr_tr, label_tr)

		## save the model
		if parsed.output_directory != None:
			joblib.dump(clf, parsed.output_directory + 
				parsed.learning_algorithm.lower() + '_model.pkl')


	##### Gradient Boosting #####		
	elif parsed.learning_algorithm.lower() == 'grad_boosting':
		from sklearn.ensemble import GradientBoostingClassifier

		# ## convert to two class 
		# label_tr = [1 if x=='P' or x=='C' else 0 for x in label_tr]

		if parsed.cross_valid:
			## sklearn model selection
			from sklearn.model_selection import GridSearchCV
			gb = GradientBoostingClassifier()
			hyperparams = {'learning_rate': [.01, .0075, .005, .001, .0005], 
							'max_depth': [3],
							'subsample': [1, .8, .5],
							'n_estimators': [1000]}
			clf = GridSearchCV(gb, hyperparams, cv=parsed.cross_valid, n_jobs=4)
			clf.fit(expr_tr, label_tr)
			params = parse_cv_result(clf)
		else:
			params = {'learning_rate': .0025, 
						'max_depth': 3,
						'subsample': .8,
						'n_estimators': 1000}

		## train the model
		clf = GradientBoostingClassifier(learning_rate=params['learning_rate'], 
											n_estimators=params['n_estimators'], 
											max_depth=params['max_depth'], 
											subsample=params['subsample'], 
											verbose=False)
		clf.fit(expr_tr, label_tr)
		label_pred = clf.predict(expr_tr)
		accuracy_pred = clf.score(expr_tr, label_tr)

		## save the model
		if parsed.output_directory != None:
			joblib.dump(clf, parsed.output_directory + 
				parsed.learning_algorithm.lower() + '_model.pkl')


	##### AdaBoost #####
	elif parsed.learning_algorithm.lower() == "adaboost":
		from sklearn.ensemble import AdaBoostClassifier
		from sklearn.tree import DecisionTreeClassifier

		if parsed.cross_valid:
			## sklearn model selection
			from sklearn.model_selection import GridSearchCV
			ab = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3))
			hyperparams = {'learning_rate': [.01, .0075, .005, .001, .0005], 
							'n_estimators': [1000]}
			clf = GridSearchCV(ab, hyperparams, cv=parsed.cross_valid, n_jobs=4)
			clf.fit(expr_tr, label_tr)
			params = parse_cv_result(clf)
		else:
			params = {'learning_rate': .0025, 
						'n_estimators': 1000}

		## train the model
		clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3), 
									learning_rate=params['learning_rate'],
									n_estimators=params['n_estimators'])
		clf.fit(expr_tr, label_tr)
		label_pred = clf.predict(expr_tr)
		accuracy_pred = clf.score(expr_tr, label_tr)

		## save the model
		if parsed.output_directory is not None:
			joblib.dump(clf, parsed.output_directory + 
				parsed.learning_algorithm.lower() + '_model.pkl')


	##### Gaussian Process #####
	elif parsed.learning_algorithm.lower() == 'gauss_process':
		from sklearn.gaussian_process import GaussianProcessClassifier
		from sklearn.gaussian_process.kernels import RBF

		if parsed.cross_valid:
			## sklearn model selection
			from sklearn.model_selection import GridSearchCV
			gb = GaussianProcessClassifier()
			hyperparams = {}
			clf = GridSearchCV(gb, hyperparams, cv=parsed.cross_valid, n_jobs=4)
			clf.fit(expr_tr, label_tr)
			params = parse_cv_result(clf)
		else:
			params = {}

		## train the model
		clf = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0), 
										optimizer="fmin_l_bfgs_b")
		clf.fit(expr_tr, label_tr)
		label_pred = clf.predict(expr_tr)

		## save the model
		if parsed.output_directory is not None:
			joblib.dump(clf, parsed.output_directory + 
				parsed.learning_algorithm.lower() + '_model.pkl')

	else:
		sys.exit('Improper learning algorithm option given.')


	## print timer messages
	time_end = time.perf_counter()  # time.clock() was removed in Python 3.8; the matching start call (not shown) must use the same timer
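Each branch above persists its fitted estimator with joblib.dump; below is a minimal, self-contained round-trip sketch (random stand-in data and the standalone joblib package, not the script's expr_tr/label_tr):

# Hedged sketch: dump and reload a classifier the same way the branches above do.
import numpy as np
import joblib
from sklearn.naive_bayes import GaussianNB

X_demo = np.random.rand(20, 5)               # stand-in for expr_tr
y_demo = np.random.randint(0, 2, size=20)    # stand-in for label_tr
clf_demo = GaussianNB().fit(X_demo, y_demo)
joblib.dump(clf_demo, 'naive_bayes_model.pkl')   # same naming scheme as above
clf_loaded = joblib.load('naive_bayes_model.pkl')
print('reloaded training accuracy:', clf_loaded.score(X_demo, y_demo))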
Example #26
        log = open(logfilename, 'a')
        log.write('{0},{1},{2},{3},{4},{5},{6},{7},{8},{9}\n'.format(
            packetlevel, neibour, component, i, trainauc, teatauc, traintpr,
            testtpr, trainfpr, testfpr))
    log.close()
    return clf


warnings.filterwarnings("ignore")
pool = Pool(10)

ClassfiedList = {
    "Nearest Neighbors": KNeighborsClassifier(3),
    "SVMLinear": SVC(kernel="linear", C=0.025),
    "SVMrbf": SVC(gamma=2, C=1),
    "Gaussian": GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    "DT": DecisionTreeClassifier(max_depth=5, random_state=0),
    "RF": RandomForestClassifier(max_depth=5, n_estimators=10, random_state=0),
    "GBRT": GradientBoostingClassifier(random_state=0),
    "NeualNet": MLPClassifier(alpha=1, random_state=0),
    "Ada": AdaBoostClassifier(),
    "NB": GaussianNB(),
    "xgb": xgb.XGBClassifier(n_estimators=125, max_depth=3,
                             learning_rate=0.05),
    "QDA": QuadraticDiscriminantAnalysis()
}
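Not part of the original script, but one hedged way to exercise the ClassfiedList dictionary above is to loop over it with cross_val_score on synthetic data (the real feature matrix is not shown in this excerpt):

# Hedged sketch: 5-fold CV for every estimator in ClassfiedList on synthetic data.
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X_demo, y_demo = make_classification(n_samples=300, n_features=20, random_state=0)
for name, est in ClassfiedList.items():
    auc = cross_val_score(est, X_demo, y_demo, cv=5, scoring='roc_auc').mean()
    print('{:18s} mean CV AUC = {:.3f}'.format(name, auc))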

manifoldlist = {
    'LLE':
    manifold.LocallyLinearEmbedding(n_neighbors=30,
                                    n_components=2,
Example #27
import matplotlib.pyplot as plt
import numpy as np
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, DotProduct

xx, yy = np.meshgrid(np.linspace(-3, 3, 50), np.linspace(-3, 3, 50))
rng = np.random.RandomState(0)
X = rng.randn(200, 2)
Y = np.logical_xor(X[:, 0] > 0, X[:, 1] > 0)

# fit the model
plt.figure(figsize=(10, 5))
kernels = [1.0 * RBF(length_scale=1.0), 1.0 * DotProduct(sigma_0=1.0)**2]
for i, kernel in enumerate(kernels):
    clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, Y)

    # plot the decision function for each datapoint on the grid
    Z = clf.predict_proba(np.vstack((xx.ravel(), yy.ravel())).T)[:, 1]
    Z = Z.reshape(xx.shape)

    plt.subplot(1, 2, i + 1)
    image = plt.imshow(Z,
                       interpolation='nearest',
                       extent=(xx.min(), xx.max(), yy.min(), yy.max()),
                       aspect='auto',
                       origin='lower',
                       cmap=plt.cm.PuOr_r)
    contours = plt.contour(xx, yy, Z, levels=[0.5], linewidths=2, linestyles='--')  # p = 0.5 decision boundary
    plt.scatter(X[:, 0], X[:, 1], s=30, c=Y, cmap=plt.cm.Paired)
    plt.xticks(())
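The plotting loop above is truncated here; as a separate, hedged follow-up, the fitted kernels can be inspected with the standard GaussianProcessClassifier attributes kernel_ and log_marginal_likelihood:

# Hedged follow-up (separate from the truncated loop above): report each
# optimized kernel and its log-marginal likelihood.
for kernel in kernels:
    clf = GaussianProcessClassifier(kernel=kernel, warm_start=True).fit(X, Y)
    print('initial kernel:', kernel)
    print('fitted kernel :', clf.kernel_)
    print('log-marginal-likelihood: %.3f'
          % clf.log_marginal_likelihood(clf.kernel_.theta))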
Example #28
df = clean_dataset(df)
# keep the features in x and the target labels in y
x = df.iloc[:, 1:].values
y = df.iloc[:, 0].values
# rename them
data_input = x
data_output = y
# set up k-fold cross-validation (note: cross_val_score below passes cv=10 directly,
# so this KFold object is not actually used)
kf = KFold(n_splits=5, shuffle=True)
#-set parameters for the classifiers    
rf_class = RandomForestClassifier(n_estimators=10)
log_class = LogisticRegression()
svm_class = svm.SVC()
nn_class = KNeighborsClassifier(n_neighbors=3)
svc_class= SVC(kernel="linear", C=0.025)
gausian_class= GaussianProcessClassifier(1.0 * RBF(1.0))
dtc_class = DecisionTreeClassifier(max_depth=5)
mpl_class = MLPClassifier(alpha=1)
abc_class = AdaBoostClassifier()
bnb_class= GaussianNB()


accu = []  # collect the accuracy of each classifier

print("Random Forests: ")
print(cross_val_score(rf_class, data_input, data_output, scoring='accuracy', cv = 10))
accuracy1 = cross_val_score(rf_class, data_input, data_output, scoring='accuracy', cv = 10).mean() * 100
accu.append(accuracy1)
print("Accuracy of Random Forests is: " , accuracy1)

print("\n\nsvm-linear: ")
Example #29
def classifier_example(table, mf, classcol='more_red'):
    """
    code from https://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
    """

    h = .02  # step size in the mesh

    names = [
        "Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
        "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
        "Naive Bayes", "QDA"
    ]

    classifiers = [
        KNeighborsClassifier(3),
        SVC(kernel="linear", C=0.025),
        SVC(gamma=2, C=1),
        GaussianProcessClassifier(1.0 * RBF(1.0)),
        DecisionTreeClassifier(max_depth=5),
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        MLPClassifier(alpha=1, max_iter=1000),
        AdaBoostClassifier(),
        GaussianNB(),
        QuadraticDiscriminantAnalysis()
    ]

    # Perform Aitchison PCA
    ordination = apca(table.T.astype(float))
    X = ordination.samples.values
    y = mf[classcol].values

    #rng = np.random.RandomState(2)
    #X += 2 * rng.uniform(size=X.shape)
    linearly_separable = (X, y)

    datasets = [linearly_separable]

    figure = plt.figure(figsize=(27, 3))
    i = 1
    # iterate over datasets
    for ds_cnt, ds in enumerate(datasets):
        # preprocess dataset, split into training and test part
        X, y = ds
        X = StandardScaler().fit_transform(X)
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=.5, random_state=42)

        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

        # just plot the dataset first
        cm = plt.cm.RdBu
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])
        ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
        if ds_cnt == 0:
            ax.set_title("Input data")
        # Plot the training points
        ax.scatter(X_train[:, 0],
                   X_train[:, 1],
                   c=y_train,
                   cmap=cm_bright,
                   edgecolors='k')
        # Plot the testing points
        ax.scatter(X_test[:, 0],
                   X_test[:, 1],
                   c=y_test,
                   cmap=cm_bright,
                   alpha=0.6,
                   edgecolors='k')
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        i += 1

        # iterate over classifiers
        for name, clf in zip(names, classifiers):
            ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
            clf.fit(X_train, y_train)
            score = clf.score(X_test, y_test)

            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, x_max]x[y_min, y_max].
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
            else:
                Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

            # Plot the training points
            ax.scatter(X_train[:, 0],
                       X_train[:, 1],
                       c=y_train,
                       cmap=cm_bright,
                       edgecolors='k')
            # Plot the testing points
            ax.scatter(X_test[:, 0],
                       X_test[:, 1],
                       c=y_test,
                       cmap=cm_bright,
                       edgecolors='k',
                       alpha=0.6)

            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            ax.set_xticks(())
            ax.set_yticks(())
            if ds_cnt == 0:
                ax.set_title(name, fontsize=18)
            ax.text(xx.max() - .3,
                    yy.min() + .3, ('%.2f' % score).lstrip('0'),
                    size=18,
                    horizontalalignment='right')
            i += 1

    plt.tight_layout()
    return ax
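A possible invocation (hedged: table and mf stand for whatever feature table and sample metadata apca expects in the caller's environment):

# Hedged usage sketch: 'table' and 'mf' are placeholders supplied by the caller;
# classcol must be a binary column of mf.
ax = classifier_example(table, mf, classcol='more_red')
plt.savefig('classifier_comparison.png', dpi=150, bbox_inches='tight')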
Example #30
def main():

    # Checks for correct number of arguments
    if len(sys.argv) != 3:
        print(
            'usage: ./troll_identifier.py [TRAIN DATASET] [TEST/DEV DATASET]')
        sys.exit()

    # set up dataset
    data_train = pd.read_csv(sys.argv[1])
    data_test = pd.read_csv(sys.argv[2])

    print('train:\n{}\n'.format(sys.argv[1]))
    print('test:\n{}\n'.format(sys.argv[2]))

    if 'small' in sys.argv[1]:
        size = 'small'
    elif 'medium' in sys.argv[1]:
        size = 'medium'
    else:
        size = 'large'

    x_train = data_train.drop(
        [data_train.columns[0], data_train.columns[1], data_train.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_train = pd.Series(data_train.iloc[:, -1])
    x_test = data_test.drop(
        [data_test.columns[0], data_test.columns[1], data_test.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    y_test = pd.Series(data_test.iloc[:, -1])

    # type = input('type: [1: supervised, 2: semi-supervised, 3: unsupervised] ')
    # if type == 1:
    parameter = None
    method = int(input('select a method: {}: '.format(methods)))  # cast: input() returns a string
    if method == 1:
        classifier = int(input('select a classifier: {}: '.format(classifiers)))
        if classifier == 1:
            parameter = int(input('criterion: [1: gini, 2: entropy] '))
            if parameter == 1:
                model = DecisionTreeClassifier(criterion='gini')
                parameter = 'gini'
            elif parameter == 2:
                model = DecisionTreeClassifier(criterion='entropy')
                parameter = 'entropy'
            else:
                print('no criterion chosen')
                sys.exit()
        elif classifier == 2:
            model = ExtraTreeClassifier()
        elif classifier == 3:
            model = ExtraTreesClassifier()
        elif classifier == 4:
            parameter = int(input('n: [1: 1, 2: 3, 3: 5] '))
            if parameter == 1:
                model = KNeighborsClassifier(n_neighbors=1)
                parameter = '1'
            elif parameter == 2:
                model = KNeighborsClassifier(n_neighbors=3)
                parameter = '3'
            elif parameter == 3:
                model = KNeighborsClassifier(n_neighbors=5)
                parameter = '5'
            else:
                print('no n chosen')
                sys.exit()
        elif classifier == 5:
            parameter = int(input(
                'version: [1: gaussian, 2: bernoulli, 3: multinomial, 4: complement] '
            ))
            if parameter == 1:
                model = GaussianNB()
                parameter = 'gaussian'
            elif parameter == 2:
                model = BernoulliNB()
                parameter = 'bernoulli'
            elif parameter == 3:
                model = MultinomialNB()
                parameter = 'multinomial'
            elif parameter == 4:
                model = ComplementNB()
                parameter = 'complement'
            else:
                print('no version chosen')
                sys.exit()
        elif classifier == 6:
            model = RadiusNeighborsClassifier(radius=1.0)
        elif classifier == 7:
            model = RandomForestClassifier(n_estimators=50, random_state=1)
        elif classifier == 8:
            model = LinearSVC(multi_class='crammer_singer')  #multi_class='ovr'
        elif classifier == 9:
            model = GradientBoostingClassifier()
        elif classifier == 10:
            model = GaussianProcessClassifier(multi_class='one_vs_one')
        elif classifier == 11:
            model = SGDClassifier()
        elif classifier == 12:
            model = PassiveAggressiveClassifier()
        elif classifier == 13:
            model = NearestCentroid()
        elif classifier == 14:
            model = Perceptron(tol=1e-3, random_state=0)
        elif classifier == 15:
            model = MLPClassifier()
        elif classifier == 16:
            model = AdaBoostClassifier(n_estimators=50)
        elif classifier == 17:
            parameter = int(input(
                'strategy: [1: stratified, 2: most frequent, 3: prior, 4: uniform, 5: constant] '
            ))
            if parameter == 1:
                model = DummyClassifier(strategy='stratified')
                parameter = 'stratified'
            elif parameter == 2:
                model = DummyClassifier(strategy='most_frequent')
                parameter = 'most frequent'
            elif parameter == 3:
                model = DummyClassifier(strategy='prior')
                parameter = 'prior'
            elif parameter == 4:
                model = DummyClassifier(strategy='uniform')
                parameter = 'uniform'
            elif parameter == 5:
                model = DummyClassifier(strategy='constant')
                parameter = 'constant'
            else:
                print('no strategy selected')
                sys.exit()
        else:
            print('no classifier chosen')
            sys.exit()

        import time
        # Starts timer
        start = time.perf_counter()  # time.clock() was removed in Python 3.8

        # train the model using the training sets and check score
        model.fit(x_train, y_train)
        model.score(x_train, y_train)

        # predict output
        predictions = pd.Series(model.predict(x_test))
        report = classification_report(
            y_test,
            predictions,
            target_names=['RightTroll', 'LeftTroll', 'Other'])
        confusion = confusion_matrix(
            y_test, predictions, labels=["RightTroll", "LeftTroll", "Other"])
        if parameter is not None:
            filename = '{},{},{},{}.txt'.format(size, methods[method],
                                                classifiers[classifier],
                                                parameter)
        else:
            filename = '{},{},{}.txt'.format(size, methods[method],
                                             classifiers[classifier])

        # Stop the timer and record the elapsed time
        end = time.perf_counter()
        elapsed = str(end - start)  # avoid shadowing the time module

        with open(filename, 'w') as output:
            output.write('method:\n{}\n\n'.format(methods[method]))
            output.write('classifier:\n{}\n\n'.format(classifiers[classifier]))
            output.write('accuracy:\n{:.2f}%\n\n'.format(
                100 * accuracy_score(y_test, predictions)))
            output.write('report:\n{}\n\n'.format(report))
            output.write('confusion:\n{}\n\n'.format(confusion))
            output.write('time:\n{}s\n\n'.format(elapsed))
            output.write('data:\n{:10}\t{:10}\t{:10}\n'.format(
                'actual', 'predict', 'match?'))
            for i in range(len(predictions)):
                output.write('{:10}\t{:10}\t{:10}\n'.format(
                    y_test[i], predictions[i], y_test[i] == predictions[i]))

        print('\nmethod:\n{}\n'.format(methods[method]))
        print('classifier:\n{}\n'.format(classifiers[classifier]))
        print('accuracy:\n{:.2f}%\n'.format(
            100 * accuracy_score(y_test, predictions)))
        print('report:\n{}\n'.format(report))
        print('confusion:\n{}\n'.format(confusion))
        print('time: {}s\n'.format(elapsed))

    elif method == 2:
        # transform into binary classification problem
        # y_train = y_train.apply(lambda x: 0 if x == 'Other' else 1)
        # y_test = y_test.apply(lambda x: 0 if x == 'Other' else 1)

        # transform string labels into integers
        le = LabelEncoder()
        le.fit(
            y_train
        )  # print(le.transform(['LeftTroll', 'Other', 'Other', 'RightTroll'])), print(le.inverse_transform([0, 1, 2, 1]))
        print(le.classes_)

        y_train = le.transform(y_train)
        y_test = le.transform(y_test)

        regressor = int(input('select a regressor: {}: '.format(regressors)))  # cast: input() returns a string
        if regressor == 1:
            print(method, regressor)
            model = LinearDiscriminantAnalysis()
        elif regressor == 2:
            print(method, regressor)
            model = LogisticRegression(solver='lbfgs',
                                       multi_class='multinomial')  #'newton-cg'
        elif regressor == 3:
            print(method, regressor)
            model = RidgeClassifier()
        elif regressor == 4:
            print(method, regressor)
            model = QuadraticDiscriminantAnalysis()
        elif regressor == 5:
            model = OneVsRestClassifier(LinearRegression())
        elif regressor == 6:
            model = OneVsRestClassifier(DecisionTreeRegressor())
        elif regressor == 7:
            print(method, regressor)
            model = OneVsRestClassifier(Lasso(alpha=0.1))
        elif regressor == 8:
            print(method, regressor)
            model = OneVsRestClassifier(MultiTaskLasso(alpha=0.1))
        elif regressor == 9:
            print(method, regressor)
            model = OneVsRestClassifier(ElasticNet(random_state=0))
        elif regressor == 10:
            print(method, regressor)
            model = OneVsRestClassifier(MultiTaskElasticNet(random_state=0))
        elif regressor == 11:
            print(method, regressor)
            model = OneVsRestClassifier(Lars(n_nonzero_coefs=1))
        elif regressor == 12:
            print(method, regressor)
            model = OneVsRestClassifier(LassoLars(alpha=.1))
        elif regressor == 13:
            print(method, regressor)
            model = OneVsRestClassifier(OrthogonalMatchingPursuit())
        elif regressor == 14:
            print(method, regressor)
            model = OneVsRestClassifier(BayesianRidge())
        elif regressor == 15:
            print(method, regressor)
            model = OneVsRestClassifier(ARDRegression())
        elif regressor == 16:
            print(method, regressor)
            model = OneVsRestClassifier(TheilSenRegressor(random_state=0))
        elif regressor == 17:
            print(method, regressor)
            model = OneVsRestClassifier(HuberRegressor())
        elif regressor == 18:
            print(method, regressor)
            model = OneVsRestClassifier(RANSACRegressor(random_state=0))
        else:
            print('no regressor chosen')
            sys.exit()

        import time
        # Starts timer
        start = time.perf_counter()  # time.clock() was removed in Python 3.8

        # train the model using the training sets and check score
        model.fit(x_train, y_train)
        model.score(x_train, y_train)

        # y_train = le.inverse_transform(y_train)
        # y_test = le.inverse_transform(y_test)
        # print('coefficient:', model.coef_)
        # print('intercept:', model.intercept_)

        # predict output
        predictions = pd.Series(model.predict(x_test))
        if parameter is not None:
            filename = '{},{},{},{}.txt'.format(size, methods[method],
                                                regressors[regressor],
                                                parameter)
        else:
            filename = '{},{},{}.txt'.format(size, methods[method],
                                             regressors[regressor])

        # Stop the timer and record the elapsed time
        end = time.perf_counter()
        elapsed = str(end - start)  # avoid shadowing the time module

        with open(filename, 'w') as output:
            output.write('method:\n{}\n\n'.format(methods[method]))
            output.write('regressor:\n{}\n\n'.format(regressors[regressor]))
            output.write('accuracy:\n{:.2f}%\n\n'.format(
                100 * accuracy_score(y_test, predictions)))
            output.write('time:\n{}s\n\n'.format(elapsed))
            output.write('data:\n{:10}\t{:10}\t{:10}\n'.format(
                'actual', 'predict', 'match?'))
            for i in range(len(predictions)):
                output.write('{:10}\t{:10}\t{:10}\n'.format(
                    y_test[i], predictions[i], y_test[i] == predictions[i]))

        print('\nmethod:\n{}\n'.format(methods[method]))
        print('regressor:\n{}\n'.format(regressors[regressor]))
        print('accuracy:\n{:.2f}%\n'.format(
            100 * accuracy_score(y_test, predictions)))
        print('time: {}s\n'.format(elapsed))

    else:
        print('no method chosen')
        sys.exit()
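The excerpt stops inside main(); when run as a script it would presumably end with the usual entry-point guard (an assumption, not shown above):

# Assumed module-level entry point; not visible in this excerpt.
if __name__ == '__main__':
    main()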