Пример #1
0
# Load the wine dataset and expose the feature matrix X and the class
# labels y used by the LDA / QDA / Naive Bayes comparison below.
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
# Bundled wine data: numeric feature matrix and integer class labels.
wine = datasets.load_wine()
X, y = wine.data, wine.target

The code below shows `scikit-learn` implementations of LDA, QDA, and Naive Bayes using the {doc}`wine </content/appendix/data>` dataset. Note that the Naive Bayes implementation assumes *all* variables follow a Normal distribution, unlike the construction in the previous section.

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

# Instantiate the three generative classifiers, then fit each one on the
# full wine data (X, y defined above).
lda = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()
nb = GaussianNB()

for estimator in (lda, qda, nb):
    estimator.fit(X, y)

Next, let's check that these `scikit-learn` implementations return the same decision boundaries as our constructions in the previous section. The code to create these graphs is written below. 

def graph_boundaries(X, model, model_title, n0 = 1000, n1 = 1000, figsize = (7, 5), label_every = 4):
        
        # Generate X for plotting 
        d0_range = np.linspace(X[:,0].min(), X[:,0].max(), n0)
        d1_range = np.linspace(X[:,1].min(), X[:,1].max(), n1)
        X_plot = np.array(np.meshgrid(d0_range, d1_range)).T.reshape(-1, 2)
        
        # Get class predictions
Пример #2
0
        log_model = lr_model
    if log == 'nb':
        from sklearn.naive_bayes import GaussianNB
        lr_model = GaussianNB()
        log_model = lr_model
    if log == 'knn':
        from sklearn.neighbors import KNeighborsClassifier
        lr_model = KNeighborsClassifier(n_neighbors=35, weights='distance')
        log_model = lr_model
    if log == 'lda':
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        lr_model = LinearDiscriminantAnalysis()
        log_model = lr_model
    if log == 'qda':
        from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
        lr_model = QuadraticDiscriminantAnalysis()
        log_model = lr_model
    # Run CV

    #if True:
    #    fit_model = log_model.fit(X,y)
    #    pred = fit_model.predict_proba(look)[:,1]#.clip(0.001,.999)
    #    print( "  look Gini = ", log_loss(ylook, pred) )
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        
      # Create data for this fold
      y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
      X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
      X_test = test_df.copy()
      print( "\nFold ", i)
      
Пример #3
0
		Y = whole_data['target'].values
		X = whole_data.drop('target',axis=1).values

		X_train, X_test, Y_train, Y_test = train_test_split (X, Y, test_size = 0.33, random_state=1)

		# Declare some variables for the algorithms below.

		list_of_lagorithum = []
		num_of_folds = 10
		results_of_algo = []
		names_of_algo = []

		# now check accuracy with some algo without standardization.
		list_of_lagorithum.append(('RandomForestClassifier ',RandomForestClassifier()))
		list_of_lagorithum.append(('QuadraticDiscriminantAnalysis ',QuadraticDiscriminantAnalysis()))
		list_of_lagorithum.append(('LogisticRegression ',LogisticRegression()))
		list_of_lagorithum.append(('DecisionTree ',DecisionTreeClassifier()))
		list_of_lagorithum.append(('LinearDiscriminant ',LinearDiscriminantAnalysis()))
		list_of_lagorithum.append(('Support Vector Machine ',SVC()))
		list_of_lagorithum.append(('GaussianNB ',GaussianNB()))
		list_of_lagorithum.append(('BernoulliNB ',BernoulliNB()))
		list_of_lagorithum.append(('KNeighborsClassifier ',KNeighborsClassifier()))


		print("\n\n\nAccuracies of algorithm without  standardization \n\n")
		for name, model in list_of_lagorithum:
			kfold = KFold(n_splits=num_of_folds, random_state=13)
			startTime = time.time()
			cv_results = cross_val_score(model, X,Y, cv=kfold, scoring='accuracy')
			endTime = time.time()
Пример #4
0
def QDA():
    """Return a ('QDA', pipeline) pair.

    The pipeline scales features to [0, 1], projects them onto 14
    principal components, then classifies with quadratic discriminant
    analysis.
    """
    steps = [
        ('a_preprocess', MinMaxScaler()),
        ('b_reduce', PCA(iterated_power=7, random_state=86, n_components=14)),
        ('c_classify', QuadraticDiscriminantAnalysis()),
    ]
    return ('QDA', Pipeline(steps))
Пример #5
0
    def handle(self, *args, **options):
        """Train and score a battery of scikit-learn classifiers on BTC_ETH
        price history at several granularities.

        Adapted from the scikit-learn classifier comparison example:
        http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html

        For each granularity a dataset of (sample, up/down label) pairs is
        built from normalized prices, each classifier is fit on a 60/40
        train/test split, and its test score plus overall right/wrong
        prediction stats are printed.  Set ``graph = True`` to also plot
        decision boundaries.
        """
        import numpy as np
        import matplotlib.pyplot as plt
        from matplotlib.colors import ListedColormap
        # NOTE(review): sklearn.cross_validation was removed in scikit-learn
        # 0.20; on modern versions train_test_split must come from
        # sklearn.model_selection instead.
        from sklearn.cross_validation import train_test_split
        from sklearn.preprocessing import StandardScaler
        from sklearn.datasets import make_moons, make_circles, make_classification
        from sklearn.neighbors import KNeighborsClassifier
        from sklearn.svm import SVC
        from sklearn.tree import DecisionTreeClassifier
        from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
        from sklearn.naive_bayes import GaussianNB
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

        h = .02  # step size in the mesh used for decision-boundary plots

        names = [
            "Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
            "Random Forest", "AdaBoost", "Naive Bayes",
            "Linear Discriminant Analysis", "Quadratic Discriminant Analysis"
        ]
        classifiers = [
            KNeighborsClassifier(3),
            SVC(kernel="linear", C=0.025),
            SVC(gamma=2, C=1),
            DecisionTreeClassifier(max_depth=5),
            RandomForestClassifier(max_depth=5,
                                   n_estimators=10,
                                   max_features=1),
            AdaBoostClassifier(),
            GaussianNB(),
            LinearDiscriminantAnalysis(),
            QuadraticDiscriminantAnalysis()
        ]

        # (The synthetic make_classification dataset from the upstream
        # example was built here but never used; removed.)

        from history.tools import normalization, filter_by_mins, create_sample_row
        from history.models import Price

        graph = False
        self.symbol = 'BTC_ETH'
        self.minutes_back = 100
        self.timedelta_back_in_granularity_increments = 0
        datasetinputs = 2  # number of consecutive prices per sample
        # Granularities (minutes per bar) to build datasets for.  The
        # original also assigned [1, 5, 15, 30] first, immediately
        # overwritten, so that dead assignment is dropped.
        gran_options = [30, 60, 120, 240]
        datasets = []
        _names = []
        for gran in gran_options:
            self.granularity = gran

            splice_point = self.minutes_back + self.timedelta_back_in_granularity_increments
            prices = Price.objects.filter(
                symbol=self.symbol).order_by('-created_on')
            prices = filter_by_mins(prices, self.granularity)
            prices = [price.price for price in prices]
            data = normalization(list(prices[0:splice_point]))
            data.reverse()

            # price_datasets[0]: feature rows, price_datasets[1]: labels
            price_datasets = [[], []]
            for i, val in enumerate(data):
                try:
                    # get NN projection
                    sample = create_sample_row(data, i, datasetinputs)
                    last_price = data[i + datasetinputs - 1]
                    next_price = data[i + datasetinputs]
                    change = next_price - last_price
                    pct_change = change / last_price
                    fee_pct = 0.002
                    # The "-1 when inside the fee band" branch is disabled by
                    # the trailing `and False`: labels are 1 (up) / 0 (down).
                    do_buy = -1 if abs(pct_change) < fee_pct and False else (
                        1 if change > 0 else 0)
                    price_datasets[0].append(sample)
                    price_datasets[1].append(do_buy)
                except Exception as e:
                    # Best-effort: rows near the end of `data` run out of
                    # lookahead; skip them.
                    print(e)
            datasets.append(price_datasets)
            _names.append(str(gran))

        if graph:
            figure = plt.figure(figsize=(27, 9))
        i = 1  # running subplot index into the datasets x classifiers grid
        # iterate over datasets
        for _index, ds in enumerate(datasets):
            # preprocess dataset, split into training and test part
            X, y = ds
            X = StandardScaler().fit_transform(X)
            X_train, X_test, y_train, y_test = train_test_split(X,
                                                                y,
                                                                test_size=.4)

            x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
            y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
            xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                                 np.arange(y_min, y_max, h))

            # just plot the dataset first
            if graph:
                cm = plt.cm.RdBu
                cm_bright = ListedColormap(['#FF0000', '#0000FF'])
                ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
                # Plot the training points
                ax.scatter(X_train[:, 0],
                           X_train[:, 1],
                           c=y_train,
                           cmap=cm_bright)
                # and testing points
                ax.scatter(X_test[:, 0],
                           X_test[:, 1],
                           c=y_test,
                           cmap=cm_bright,
                           alpha=0.6)
                ax.set_xlim(xx.min(), xx.max())
                ax.set_ylim(yy.min(), yy.max())
                ax.set_xticks(())
                ax.set_yticks(())
            i += 1

            # iterate over classifiers
            for name, clf in zip(names, classifiers):
                if graph:
                    ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
                clf.fit(X_train, y_train)
                score = clf.score(X_test, y_test)
                # Plot the decision boundary. For that, we will assign a color to each
                # point in the mesh [x_min, x_max]x[y_min, y_max].
                _input = np.c_[xx.ravel(), yy.ravel()]
                if hasattr(clf, "decision_function"):
                    Z = clf.decision_function(_input)
                else:
                    Z = clf.predict_proba(_input)[:, 1]

                print(name, round(score * 100))
                # Put the result into a color plot
                if graph:
                    Z = Z.reshape(xx.shape)
                    ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)

                    # Plot also the training points
                    ax.scatter(X_train[:, 0],
                               X_train[:, 1],
                               c=y_train,
                               cmap=cm_bright)
                    # and testing points
                    ax.scatter(X_test[:, 0],
                               X_test[:, 1],
                               c=y_test,
                               cmap=cm_bright,
                               alpha=0.6)

                    ax.set_xlim(xx.min(), xx.max())
                    ax.set_ylim(yy.min(), yy.max())
                    ax.set_xticks(())
                    ax.set_yticks(())
                    ax.set_title("(" + _names[_index] + ")" + name)
                    text = ('%.2f' % score).lstrip('0')
                    ax.text(xx.max() - .3,
                            yy.min() + .3,
                            text,
                            size=15,
                            horizontalalignment='right')
                    i += 1

                # Right/wrong counts for this classifier across ALL datasets.
                # BUGFIX: the original reused `i` and `ds` as the inner loop
                # variables, clobbering the subplot counter and the current
                # dataset used above (visible breakage when graph=True).
                stats = {'r': 0, 'w': 0}
                for eval_ds in datasets:
                    for row in range(0, len(eval_ds[0])):
                        sample = eval_ds[0][row]
                        actual = eval_ds[1][row]
                        # BUGFIX: predict() expects a 2-D array of samples,
                        # so wrap the single row in a list.
                        prediction = clf.predict([sample])
                        outcome = 'r' if actual == prediction[0] else 'w'
                        stats[outcome] += 1
                print(
                    'stats', name, stats,
                    round((100.0 * stats['r'] / (stats['r'] + stats['w'])), 2))

        if graph:
            figure.subplots_adjust(left=.02, right=.98)
            plt.show()
Пример #6
0
                    "\n  Elapsed time: {}\n-------------------------".format(
                        elapsed_time))
            ref_dict["gnb"] = [gnb_dict_avg, "gnb_dict_avg"]

        ###########################################
        ########            QDA         ###########
        ###########################################
        if algorithm.lower() == "qda":
            start_time = datetime.now()
            print("   Selected Classifier: Quadratic Discriminant Analysis")
            with open(log_file_name, "a") as file:
                file.write(
                    "\n   Selected Classifier: Quadratic Discriminant Analysis"
                )
            from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
            qda = QuadraticDiscriminantAnalysis().fit(X, y)
            qda_dict = perform_prediction(model=qda,
                                          pred_pos=pred_pos,
                                          pred_neg=pred_neg,
                                          print_log=print_log)
            if i == 0:
                qda_dict_avg = qda_dict
            else:
                for k, n in zip(qda_dict_avg.keys(), qda_dict.keys()):
                    qda_dict_avg[k] = float(qda_dict_avg[k]) + float(
                        qda_dict[k])
            if save_models.lower() in ["1", "yes", "y", "yeah", "whatever"]:
                save_model_pickle(qda,
                                  root_dir,
                                  file_name="qda-{}.pickle".format(i))
            elapsed_time = datetime.now() - start_time
Пример #7
0
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


# Define X (all columns except 'state') and y (the 'state' column).
X, y = data.loc[:,data.columns != 'state'].values, data.loc[:,data.columns == 'state'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# ADASYN: oversample the minority class on the training split only.
ada = ADASYN()
# NOTE(review): fit_sample() was renamed fit_resample() in imbalanced-learn
# 0.4 and later removed -- confirm the installed version.
os_X,os_y = ada.fit_sample(X_train, y_train)

# QDA fit on the resampled training data, evaluated on the untouched test set.
# NOTE(review): store_covariances was renamed store_covariance in
# scikit-learn 0.19 and removed in 0.21 -- confirm the installed version.
clf_QDA = QuadraticDiscriminantAnalysis(store_covariances=True)
clf_QDA.fit(os_X, os_y)
y_true, y_pred = y_test, clf_QDA.predict(X_test)

# F1 score, precision, recall, specificity, G score.
# (print statements converted to print() calls, consistent with the rest
# of the file; the printed output is unchanged.)
print("F1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("Recall : %.4g" % metrics.recall_score(y_true, y_pred))
recall = metrics.recall_score(y_true, y_pred)
print("Precision : %.4g" % metrics.precision_score(y_true, y_pred))

# Compute confusion matrix; specificity = TN / (TN + FP).
cnf_matrix = confusion_matrix(y_test,y_pred)
np.set_printoptions(precision=2)
print("Specifity: ", float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1]))
specifity = float(cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[0,1])
# BUGFIX: the G score (G-mean) is the geometric mean of recall and
# specificity, sqrt(recall * specificity) -- not sqrt of their ratio.
print("G score: ", math.sqrt(recall * specifity))
Пример #8
0
# Shuffle the orthopedic-patients data so the positional train/test split
# below is not ordered by class.
raw_data = pd.read_csv("./Biomechanical features of orthopedic patients.csv")
raw_data = raw_data.sample(frac=1).reset_index(drop=True)
inputs = raw_data[[
    'pelvic_incidence', 'pelvic_tilt numeric', 'lumbar_lordosis_angle',
    'sacral_slope', 'pelvic_radius', 'degree_spondylolisthesis'
]]
outputs = raw_data[['class']]

# Map the string labels to 0/1 for the classifier and the error metrics.
linear_Regression_Dictionary = {"Normal": 1, "Abnormal": 0}

output_linr = outputs.replace({"class": linear_Regression_Dictionary})

# BUGFIX: .loc slicing is end-inclusive, so starting the test set at 248
# duplicated row 248 in both splits (train/test leakage).  The test set
# now starts at 249.
inputs_train = inputs.loc[:248]
inputs_test = inputs.loc[249:]
outputs_train = output_linr.loc[:248]
outputs_test = output_linr.loc[249:]

qda_classification = QuadraticDiscriminantAnalysis()

# ravel() turns the single-column frame into the 1-D label vector sklearn
# expects (avoids a DataConversionWarning).
qda_classification.fit(inputs_train, outputs_train.values.ravel())

prediction = qda_classification.predict(inputs_test)


def reverting(x):
    """Threshold a score to a hard 0/1 label (QDA already emits 0/1)."""
    return 1 if (x > 0.5) else 0


finalPrediction = pd.DataFrame(np.array([reverting(xi) for xi in prediction]))

print('MAE:', metrics.mean_absolute_error(outputs_test, finalPrediction))
print('MSE:', metrics.mean_squared_error(outputs_test, finalPrediction))
print('RMSE:',
      np.sqrt(metrics.mean_squared_error(outputs_test, finalPrediction)))
Пример #9
0
def run_predict():
    """Fit several models per stock and collect next-day NN predictions.

    For each ticker in a fixed watchlist, daily lagged percent-change
    features are split at one year ago into train/test sets, every model
    in ``models`` is run through ``run_analysis`` (which fills ``pred``),
    and the latest "NN" prediction is stored per ticker.

    Returns:
        tuple: (list of per-stock data frames, ordered dict of
        ticker -> latest NN prediction).
    """

    master_sdata = []
    today = datetime.datetime.today()

    # Watchlist; values are overwritten with the latest NN prediction below.
    stocks = collections.OrderedDict([('BP', 0), ('SWN', 0), ('GLD', 0),
                                      ('USO', 0), ('^DJI', 0), ('CVX', 0)])

    for s in stocks.keys():
        sdata, today_df = retrieve_data(s,
                                        datetime.datetime(2007, 1, 1),
                                        today,
                                        lags=5)

        # Create training data - can change lag if needed
        lag_train_data = sdata[[
            "Lag1 PercChange", "Lag2 PercChange", "Lag3 PercChange",
            "Lag4 PercChange"
        ]]
        today_train_data = today_df[[
            "Lag1 PercChange", "Lag2 PercChange", "Lag3 PercChange",
            "Lag4 PercChange"
        ]]
        dir_train_data = sdata["Direction"]

        # Append today's row so it gets a prediction too.
        # NOTE(review): DataFrame.append was removed in pandas 2.0;
        # pd.concat is the modern replacement.
        today_train_data1 = lag_train_data.append(today_train_data)

        # Test data start - one year ago
        test_start_date = datetime.datetime.now() - relativedelta(years=1)

        lag_train_set = today_train_data1[
            today_train_data1.index < test_start_date]
        lag_test_set = today_train_data1[
            today_train_data1.index >= test_start_date]
        dir_train_set = dir_train_data[dir_train_data.index < test_start_date]
        dir_test_set = dir_train_data[dir_train_data.index >= test_start_date]

        #scaler = StandardScaler()
        #scaler.fit(lag_train_set)
        #scaler.fit(dir_train_set)
        #lag_train_set = scaler.transform(lag_train_set)
        #dir_train_set = scaler.transform(dir_train_set)
        #lag_test_set = scaler.transform(lag_test_set)

        #print("LAG_TRAIN")
        #print_full(lag_train_set)
        #print("DIR_TRAIN")
        #print_full(dir_train_set)
        #print(dir_train_set['continuous'])

        # Prediction results
        pred = pd.DataFrame(index=lag_test_set.index)
        #print("PREDPRED")
        #print(pred.index)
        pred["Actual"] = dir_test_set

        # Running machine learning analysis with the models
        # NOTE(review): MLPClassifier's 'algorithm' keyword was renamed
        # 'solver' in scikit-learn 0.18 -- this call requires an older
        # version; confirm the pinned dependency.
        models = [("SVC", SVC()),
                  ("LR",
                   LogisticRegression(solver='lbfgs',
                                      multi_class='multinomial')),
                  ("Forest", RandomForestRegressor(n_estimators=1, n_jobs=-1)),
                  ("LDA", LinearDiscriminantAnalysis()),
                  ("QDA", QuadraticDiscriminantAnalysis()),
                  ("NN",
                   MLPClassifier(algorithm='sgd',
                                 alpha=1e-5,
                                 learning_rate='adaptive',
                                 learning_rate_init=0.0001,
                                 hidden_layer_sizes=(5, 8),
                                 random_state=3,
                                 max_iter=400,
                                 activation='relu'))]

        for m in models:
            run_analysis(m[0], m[1], lag_train_set, dir_train_set,
                         lag_test_set, pred)

        # Drop the first row of predictions.
        # NOTE(review): .ix was removed in pandas 1.0; .iloc/.loc are the
        # modern equivalents throughout this function.
        pred = pred.ix[1:]

        #print_full(pred)

        man_date = '2016-4-18'
        #print("Actual for " + s + "  " + str(pred.ix[man_date]["Actual"]))
        #print("Prediction SVM: " + str(pred.ix[man_date]["SVC"]))
        #print("Prediction Linear Regression: " + str(pred.ix[man_date]["LR"]))
        #print("Prediction Linear Discriminant Analysis: " + str(pred.ix[man_date]["LDA"]))
        #print("Prediction Quad Discriminate Analysis: " + str(pred.ix[man_date]["QDA"]))
        #print("Prediction Random Forest: " + str(pred.ix[man_date]["Forest"]))
        #print("Prediction Neural Network: " + str(pred.ix[man_date]["NN"]))

        # Keep the most recent NN prediction for this ticker.
        stocks[s] = pred.ix[-1]["NN"]

        master_sdata.append(sdata)

    return master_sdata, stocks
Пример #10
0
def classifiers_evaluation(df_res, y):
    """Score a battery of sklearn classifiers under several feature scalers.

    For every preprocessor, the features are rescaled, each classifier is
    evaluated over 10 stratified shuffle splits, and the mean ROC-AUC per
    classifier is printed and collected.

    :param df_res: feature matrix (array-like / DataFrame).
    :param y: target labels, indexable by the split index arrays.
    :return: list of [preprocessor_name, results_DataFrame] pairs.
    """

    classifiers = [
        LinearSVC(),
        LinearSVR(),
        KNeighborsClassifier(3),
        SVC(probability=True),
        NuSVC(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        AdaBoostClassifier(),
        GradientBoostingClassifier(),
        BernoulliNB(),
        GaussianNB(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        LogisticRegression(),
        MLPClassifier(max_iter=600),
        SGDClassifier(max_iter=600),
        LogisticRegressionCV(max_iter=600)
    ]

    res = list()

    preprocess = [
        preprocessing.QuantileTransformer(),
        preprocessing.MinMaxScaler(),
        preprocessing.Normalizer(),
        preprocessing.StandardScaler(),
        preprocessing.RobustScaler(),
        preprocessing.MaxAbsScaler()
    ]

    # Shared by the CV splitter and the score averaging below (the
    # original hard-coded "/ 10.0", silently coupled to n_splits=10).
    n_splits = 10

    for processor in preprocess:
        X = processor.fit_transform(df_res)

        log_cols = ["Classifier", "ROC_AUC score"]
        log = pd.DataFrame(columns=log_cols)

        sss = StratifiedShuffleSplit(n_splits=n_splits,
                                     test_size=0.1,
                                     random_state=0)

        acc_dict = {}

        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            for clf in classifiers:
                name = clf.__class__.__name__
                clf.fit(X_train, y_train)
                train_predictions = clf.predict(X_test)
                # NOTE(review): ROC-AUC is computed on hard predictions
                # here; predict_proba / decision_function scores are the
                # more conventional input.
                acc = roc_auc_score(y_test, train_predictions)

                # Accumulate per-classifier scores across the folds.
                acc_dict[name] = acc_dict.get(name, 0.0) + acc

        for clf_name in acc_dict:
            acc_dict[clf_name] = acc_dict[clf_name] / float(n_splits)
            log_entry = pd.DataFrame([[clf_name, acc_dict[clf_name]]],
                                     columns=log_cols)
            # BUGFIX: DataFrame.append() was removed in pandas 2.0;
            # pd.concat is the supported way to accumulate rows.
            log = pd.concat([log, log_entry], ignore_index=True)

        print(processor.__class__.__name__)
        print(log)
        res.append([processor.__class__.__name__, log])

    return res
Пример #11
0
def quadratic_discriminant(x_train, y_train, x_test, y_test):
    """Fit and evaluate a QDA classifier via the shared model harness."""
    model = QuadraticDiscriminantAnalysis()
    return __fit_clf_model('quadratic_discriminant', model, x_train,
                           y_train, x_test, y_test)
Пример #12
0
    def performance_analysis(self):
        """
        Analyze and print to stdout the performances of a big list of classifiers, in order
        to include only the best ones in the final version of RiskInDroid.
        :return: None.
        """

        # Category of permissions for which to calculate the performances.
        _cat = 'declared'

        _k_fold = StratifiedKFold(n_splits=10,
                                  shuffle=True,
                                  random_state=self.seed)

        # The original list of classifiers taken into consideration, before selecting
        # only the best ones for RiskInDroid.
        _all_models = (SVC(kernel='linear',
                           probability=True,
                           random_state=self.seed), GaussianNB(),
                       MultinomialNB(), BernoulliNB(),
                       DecisionTreeClassifier(random_state=self.seed),
                       RandomForestClassifier(random_state=self.seed),
                       AdaBoostClassifier(random_state=self.seed),
                       GradientBoostingClassifier(random_state=self.seed),
                       SGDClassifier(loss='log', random_state=self.seed),
                       LogisticRegression(random_state=self.seed),
                       LogisticRegressionCV(random_state=self.seed),
                       KNeighborsClassifier(), LinearDiscriminantAnalysis(),
                       QuadraticDiscriminantAnalysis(),
                       MLPClassifier(random_state=self.seed))

        # Each training set is a (application set, application targets) pair
        # (see the index comments inside the loop below).
        _training_sets = list(self.get_training_vectors_3_sets())

        for model in _all_models:
            print('\n\n\nAnalysis of ' + model.__class__.__name__ + ':')

            # Goodware and malware scores for the current model.
            _malware_scores = numpy.array([])
            _goodware_scores = numpy.array([])

            # Correctly predicted targets for the current model.
            _ok_targets = numpy.array([])

            # We analyze the 3 training sets for each model.
            for (index, current_set) in enumerate(_training_sets):

                # current_set[0] = application set
                # current_set[1] = application targets

                # Goodware and malware scores for the current set.
                _loc_m_scores = numpy.array([])
                _loc_g_scores = numpy.array([])

                # Correctly predicted targets for the current set.
                _loc_ok_targets = numpy.array([])

                # The analysis is done using 10-cross fold validation.
                for train_index, test_index in _k_fold.split(
                        current_set[0][_cat], current_set[1]):

                    _train_data = numpy.array(current_set[0][_cat])
                    _train_targets = numpy.array(current_set[1])

                    model.fit(_train_data[train_index],
                              _train_targets[train_index])

                    # Correctly predicted targets for the current fold.
                    _fold_ok_targets = 0

                    for loc_index in test_index:

                        # Pair each class label with its predicted probability
                        # for this single held-out sample.
                        proba = list(
                            zip(
                                model.classes_,
                                model.predict_proba([_train_data[loc_index]
                                                     ])[0]))

                        # The malware probability is considered as the risk value.
                        # NOTE(review): class labels are compared against the
                        # bytes literal b'malware' -- confirm the targets are
                        # bytes, not str, in the training vectors.
                        if proba[0][0] == b'malware':
                            _result = proba[0]
                        else:
                            _result = proba[1]

                        # We consider only correct predictions for calculating the mean
                        # and the standard deviation.
                        _true_target = _train_targets[loc_index]

                        # If the current app under test is a malware.
                        # (_result[1] is the predicted malware probability.)
                        if _result[1] >= 0.5:
                            # If the prediction is correct.
                            if _result[0] == _true_target:
                                _fold_ok_targets += 1
                                _loc_m_scores = numpy.append(
                                    _loc_m_scores, _result[1])

                        # If the current app under test is not a malware.
                        else:
                            # If the prediction is correct.
                            if _result[0] != _true_target:
                                _fold_ok_targets += 1
                                _loc_g_scores = numpy.append(
                                    _loc_g_scores, _result[1])

                    # Per-fold accuracy = correct predictions / fold size.
                    _loc_ok_targets = numpy.append(
                        _loc_ok_targets, _fold_ok_targets / len(test_index))

                print('    set_{0}:'.format(index + 1))
                print('        accuracy: {0:.2f}'.format(
                    _loc_ok_targets.mean() * 100))
                print('        malware mean: {0:.2f}'.format(
                    _loc_m_scores.mean() * 100))
                print('        malware std_dev: {0:.2f}'.format(
                    _loc_m_scores.std() * 100))
                print('        goodware mean: {0:.2f}'.format(
                    _loc_g_scores.mean() * 100))
                print('        goodware std_dev: {0:.2f}'.format(
                    _loc_g_scores.std() * 100))

                # Fold-level results feed the per-model aggregates.
                _ok_targets = numpy.append(_ok_targets, _loc_ok_targets)
                _malware_scores = numpy.append(_malware_scores, _loc_m_scores)
                _goodware_scores = numpy.append(_goodware_scores,
                                                _loc_g_scores)

            print('    total:')
            print('        accuracy: {0:.2f}'.format(_ok_targets.mean() * 100))
            print('        malware mean: {0:.2f}'.format(
                _malware_scores.mean() * 100))
            print('        malware std_dev: {0:.2f}'.format(
                _malware_scores.std() * 100))
            print('        goodware mean: {0:.2f}'.format(
                _goodware_scores.mean() * 100))
            print('        goodware std_dev: {0:.2f}'.format(
                _goodware_scores.std() * 100))
# Column names for the 1582 unlabeled feature columns ("1" .. "1582").
features = list(map(str, range(1, 1583)))

# Pull out the feature matrix and the label vector, then standardize the
# features to zero mean / unit variance.
X = df.loc[:, features].values
Y = df.loc[:, 'label'].values
X = StandardScaler().fit_transform(X)

# Reduce dimensionality with PCA, keeping n principal components.
n = 90
pca = PCA(n_components=n)
X = pca.fit_transform(X)

# Hold out 20% of the data for evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

# Train a QDA classifier and report its test accuracy.
classifier = QuadraticDiscriminantAnalysis()
classifier.fit(X_train, Y_train)
score = classifier.score(X_test, Y_test)
print(score)
Пример #14
0
def classify_through_discriminant_analysis(classification_data=None):
    """Run the generic classification pipeline with a QDA classifier.

    Args:
        classification_data: dict of data in whatever layout
            ``general_classifier`` expects. Defaults to an empty dict.

    Returns:
        Whatever ``general_classifier`` returns for the QDA classifier.
    """
    # Bug fix: the original default was a literal ``{}``, which Python
    # creates once and shares across all calls (mutable-default pitfall).
    # A ``None`` sentinel gives every call its own fresh dict.
    if classification_data is None:
        classification_data = {}
    clf = QuadraticDiscriminantAnalysis()
    return general_classifier(classification_data, clf)
Пример #15
0
###############################################################################
#                        3. Create train and test set                         #
###############################################################################
# Hold out 20% for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1000)

###############################################################################
#                               4. Classifiers                                #
###############################################################################
# Registry mapping a human-readable label to an unfitted classifier object.
classifiers = {
    "LDA": LinearDiscriminantAnalysis(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "AdaBoost": AdaBoostClassifier(),
    "Bagging": BaggingClassifier(),
    "Extra Trees Ensemble": ExtraTreesClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Ridge": RidgeClassifier(),
    "SGD": SGDClassifier(),
    "BNB": BernoulliNB(),
    "GNB": GaussianNB(),
    "KNN": KNeighborsClassifier(),
    "MLP": MLPClassifier(),
    "LSVC": LinearSVC(),
    "NuSVC": NuSVC(),
    "SVC": SVC(),
    "DTC": DecisionTreeClassifier(),
}
Пример #16
0
    def learn(self, fname, file_data=None):
        """Train every configured classifier from a CSV of labeled samples.

        Args:
            fname: path to a CSV file on disk (used when *file_data* is
                not given).
            file_data: optional base64-encoded, gzipped CSV payload that
                takes precedence over reading *fname*.

        Side effects:
            Populates ``self.header`` (CSV header row), ``self.naming``
            (location-name <-> numeric-id maps) and ``self.algorithms``
            (classifier name -> trained model). Progress and per-model
            timings go to ``self.logger``.
        """
        if file_data:
            # Payload arrives base64-encoded and gzipped.
            data = base64.b64decode(file_data)
            data = gzip.decompress(data)
            csvfile = StringIO(data.decode('utf-8'))
        else:
            csvfile = open(fname, 'r')

        t = time.time()
        # Load the CSV: row 0 is the header; in every data row, column 0 is
        # the location name and the remaining columns are numeric features.
        self.header = []
        rows = []
        naming_num = 0
        reader = csv.reader(csvfile, delimiter=',')
        for i, row in enumerate(reader):
            self.logger.debug(row)
            if i == 0:
                self.header = row
            else:
                for j, val in enumerate(row):
                    if j == 0:
                        # Map the location name to a stable numeric id so it
                        # can be used as the classification target.
                        if val not in self.naming['from']:
                            self.naming['from'][val] = naming_num
                            self.naming['to'][naming_num] = val
                            naming_num += 1
                        row[j] = self.naming['from'][val]
                        continue
                    if val == '':
                        # Missing readings are treated as 0.
                        row[j] = 0
                        continue
                    try:
                        row[j] = float(val)
                    except ValueError:
                        # Bug fix: narrowed from a bare ``except:`` — only a
                        # float-parse failure is expected here; anything else
                        # should propagate.
                        self.logger.error("problem parsing value " + str(val))
                rows.append(row)
        csvfile.close()

        # First column of each row is the classification target y; the
        # remaining columns form the feature matrix x.
        y = numpy.zeros(len(rows))
        x = numpy.zeros((len(rows), len(rows[0]) - 1))

        # Shuffle the sample order for training.
        # Bug fix: the original iterated the shuffled indices but wrote each
        # row back to its own position (y[i] = rows[i][0]), so the shuffle
        # had no effect. Enumerating gives a genuinely permuted order.
        record_range = list(range(len(rows)))
        shuffle(record_range)
        for dest, src in enumerate(record_range):
            y[dest] = rows[src][0]
            x[dest, :] = numpy.array(rows[src][1:])

        names = [
            "Nearest Neighbors",
            "Linear SVM",
            "RBF SVM",
            # "Gaussian Process",
            "Decision Tree",
            "Random Forest",
            "Neural Net",
            "AdaBoost",
            "Naive Bayes",
            "QDA"
        ]
        classifiers = [
            KNeighborsClassifier(3),
            SVC(kernel="linear", C=0.025, probability=True),
            SVC(gamma=2, C=1, probability=True),
            # GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
            DecisionTreeClassifier(max_depth=5),
            RandomForestClassifier(max_depth=5,
                                   n_estimators=10,
                                   max_features=1),
            MLPClassifier(alpha=1),
            AdaBoostClassifier(),
            GaussianNB(),
            QuadraticDiscriminantAnalysis()
        ]
        self.algorithms = {}
        for name, clf in zip(names, classifiers):
            t2 = time.time()
            self.logger.debug("learning {}".format(name))
            try:
                self.algorithms[name] = self.train(clf, x, y)
                # Bug fix: elapsed time is now ``now - start``; the original
                # computed ``start - now`` and always logged a negative value.
                self.logger.debug("learned {}, {:d} ms".format(
                    name, int(1000 * (time.time() - t2))))
            except Exception as e:
                # Best effort: a classifier that fails to train is logged and
                # skipped so the remaining models still get trained.
                self.logger.error("{} {}".format(name, str(e)))

        # Same sign fix for the total elapsed time.
        self.logger.debug("{:d} ms".format(int(1000 * (time.time() - t))))
Пример #17
0
def analysis_results(options):
    """
    Analyzes the results of the drug-drug comparisons, grouped by the
    number of targets per drug.

    Reads from ``options``: ``workspace`` (directory holding the
    ``profiles``/``comparisons``/``analysis`` subfolders), ``threshold_list``
    (optional file of profile thresholds), ``consider_se`` (include Side
    Effect/ATC features), ``different_atc`` (drop ATC columns) and ``pca``
    (reduce features with PCA before classification).

    Side effects: writes figures and result tables under
    ``<workspace>/analysis`` and prints progress to stdout.
    """

    # Start marker for time measure
    start = time.time()

    print("\n\t\t------------------------------------------------------------------------------------------------------------------------\n")
    print("\t\tStarting Drug Interactions ANAlysis (DIANA), a program created by @OLIVA'S LAB. Analysis of results: Analysis by targets\n")
    print("\t\t------------------------------------------------------------------------------------------------------------------------\n")

    # Get the script path
    main_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    toolbox_dir = os.path.join(main_path, 'diana/toolbox')

    # Check the directory of the profiles, comparisons and analysis
    data_dir = os.path.join(options.workspace, "profiles")
    check_directory(data_dir)

    results_dir = os.path.join(options.workspace, "comparisons")
    check_directory(results_dir)

    analysis_dir = os.path.join(options.workspace, "analysis")
    check_directory(analysis_dir)

    # Get the list of thresholds to create the profiles
    if options.threshold_list and fileExist(options.threshold_list):
        threshold_list = get_values_from_threshold_file(options.threshold_list)
    else:
        threshold_list = [1, 5, 10, 20, 50]

    # Do we consider Side Effects/ATC?
    if options.consider_se:
        consider_se = True
    else:
        consider_se = False

    # Get the names of the columns
    columns = diana_analysis.obtain_columns(threshold_list, ATC_SE=consider_se)



    #-----------------------------------------------------#
    #   PARSE THE RESULTS AND CREATE A PANDAS DATAFRAME   #
    #-----------------------------------------------------#

    # NOTE(review): cPickle and text-mode open() are Python 2 idioms;
    # confirm the intended runtime before porting.
    pair2comb_file = os.path.join(toolbox_dir, 'pair2comb.pcl')
    pair2comb = cPickle.load(open(pair2comb_file))

    # pair2comb maps "DRUG1---DRUG2" -> 1 (combination) / 0 (non-combination)
    ddi = sum(1 for x in pair2comb.values() if x == 1)
    non_ddi = sum(1 for x in pair2comb.values() if x == 0)

    print('NUMBER OF DRUG COMBINATIONS:\t\t{}\n'.format(ddi))
    print('NUMBER OF NON-DRUG COMBINATIONS:\t{}\n'.format(non_ddi))

    output_dataframe = os.path.join(analysis_dir, 'dcdb_comparisons.csv')

    # Parse the per-comparison result tables only once; later runs reuse
    # the cached CSV.
    if not fileExist(output_dataframe):

        # Create a data frame to store the results
        df = pd.DataFrame(columns=columns)


        # Obtain all the results subfolders of the results main folder
        results_dir_list = [f for f in os.listdir(results_dir) if os.path.isdir(os.path.join(results_dir, f))]

        for comparison in results_dir_list:

            # Subfolder names encode the pair as "<id1>---<id2>"
            drug_id1, drug_id2 = comparison.split('---')
            comparison_dir = os.path.join(results_dir, comparison)
            results_table = os.path.join(comparison_dir, 'results_table.tsv')

            # Add the Comb field (if it is drug combination or not)
            drug1 = drug_id1.split('_')[0].upper()
            drug2 = drug_id2.split('_')[0].upper()
            comparison_without_id = '{}---{}'.format(drug1, drug2)
            if comparison_without_id in pair2comb:
                combination_field = pair2comb[comparison_without_id]
            else:
                print('The comparison {} is not in the pair2comb dictionary!\n'.format(comparison_without_id))
                print(pair2comb)
                sys.exit(10)

            if not fileExist(results_table):
                print('The comparison {} has not been executed properly!\n'.format(comparison))
                sys.exit(10)

            results = get_results_from_table(results_table, columns, combination_field)

            df2 = pd.DataFrame([results], columns=columns, index=[comparison])
            # Add the information to the main data frame
            # NOTE(review): DataFrame.append is removed in recent pandas;
            # pd.concat would be the replacement.
            df = df.append(df2)

        # Output the Pandas dataframe in a CSV file
        df.to_csv(output_dataframe)

    else:
        df = pd.read_csv(output_dataframe, index_col=0)



    #---------------------------#
    #   REMOVE MISSING VALUES   #
    #---------------------------#

    # Replace the None values in dcstructure by nan
    if 'None' in df['dcstructure']:
        df = df.replace(to_replace={'dcstructure':{'None':np.nan}})

    # Remove the nan values in dcstructure
    df = df.dropna()

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing missing values:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing missing values:\t{}\n'.format(num_ndc))



    #---------------------------#
    #   IDENTIFY ME-TOO DRUGS   #
    #---------------------------#

    me_too_dir = os.path.join(analysis_dir, 'me_too_drugs')
    create_directory(me_too_dir)
    me_too_drugs_table = os.path.join(me_too_dir, 'me_too_drugs.tsv')
    me_too_drug_combs_table = os.path.join(me_too_dir, 'me_too_drug_combinations.tsv')

    me_too_drug_pairs_file = os.path.join(me_too_dir, 'me_too_drug_pairs.pcl')
    me_too_drug_comb_pairs_file = os.path.join(me_too_dir, 'me_too_drug_comb_pairs.pcl')

    # Compute the me-too sets once and cache them as pickles.
    if not fileExist(me_too_drug_pairs_file) or not fileExist(me_too_drug_comb_pairs_file):

        df_struc = df[['dcstructure']]
        df_struc = df_struc.astype(float)
        me_too_drug_pairs, me_too_drug_comb_pairs = diana_analysis.obtain_me_too_drugs_and_combinations(df_struc, columns, me_too_drugs_table, me_too_drug_combs_table)
        cPickle.dump(me_too_drug_pairs, open(me_too_drug_pairs_file, 'w'))
        cPickle.dump(me_too_drug_comb_pairs, open(me_too_drug_comb_pairs_file, 'w'))

    else:

        me_too_drug_pairs = cPickle.load(open(me_too_drug_pairs_file))
        me_too_drug_comb_pairs = cPickle.load(open(me_too_drug_comb_pairs_file))

    # Process me-too drug combination pairs
    # Count, for each drug pair, how many me-too relationships it appears in.
    me_too_drug_combinations = set()
    drug_pair_to_me_too_times = {}
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        me_too_drug_combinations.add(frozenset([drug_comb1, drug_comb2]))
        drug_pair_to_me_too_times.setdefault(drug_comb1, 0)
        drug_pair_to_me_too_times.setdefault(drug_comb2, 0)
        drug_pair_to_me_too_times[drug_comb1] += 1
        drug_pair_to_me_too_times[drug_comb2] += 1
    # Greedily drop, from each me-too pair, the member involved in more
    # me-too relationships.
    removed_drug_pairs = set()
    for pair in me_too_drug_comb_pairs:
        drug_comb1, drug_comb2 = pair.split('___')
        if drug_comb1 in removed_drug_pairs or drug_comb2 in removed_drug_pairs:
            continue
        if drug_pair_to_me_too_times[drug_comb1] > drug_pair_to_me_too_times[drug_comb2]:
            removed_drug_pairs.add(drug_comb1)
        else:
            removed_drug_pairs.add(drug_comb2)

    # Remove the drug pairs which appear in me-too pairs of drug pairs more times
    df = df.loc[~df.index.isin(list(removed_drug_pairs))]

    # Count the number of drug combinations / non-drug combinations
    dc_data = df[df['combination'] == 1]
    ndc_data = df[df['combination'] == 0]
    num_dc = len(dc_data.index)
    num_ndc = len(ndc_data.index)
    print('Number of drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_dc))
    print('Number of non-drug combinations after removing me-too conflictive drug pairs:\t{}\n'.format(num_ndc))



    #-------------------------------------#
    #   EVALUATE PERFORMANCE BY TARGETS   #
    #-------------------------------------#

    img_dir = os.path.join(analysis_dir, 'figures')
    create_directory(img_dir)
    fig_format = 'png'

    tables_dir = os.path.join(analysis_dir, 'tables')
    create_directory(tables_dir)

    # Number of targets
    # Buckets of "number of targets per drug": 1, 2, 3-6, and 7+ (the last
    # single-element bucket is treated as ">=" below).
    num_targets = [[1],[2],[3,4,5,6],[7]]

    # Names of the methods
    if consider_se:
        if options.different_atc:
            types_analysis = ['dctargets', 'dcguild', 'dcstructure', 'dcse', 'random']
            types_analysis2 = ['dctargets', 'dcguild', 'dcstructure', 'dcse'] # Without random!!
            #types_analysis_labels = ['dcTargets', 'dcGUILD', 'dcStructure', 'dcSE', 'Random']
            types_analysis_labels = [ 'Target', 'PPI','Structure', 'Side Effects', 'Random']
        else:
            types_analysis = ['dctargets', 'dcguild', 'dcstructure', 'dcatc', 'dcse', 'random']
            types_analysis2 = ['dctargets', 'dcguild', 'dcstructure', 'dcatc', 'dcse'] # Without random!!
            #types_analysis_labels = ['dcTargets', 'dcGUILD', 'dcStructure', 'dcATC', 'dcSE', 'Random']
            types_analysis_labels = [ 'Target', 'PPI','Structure', 'ATC', 'Side Effects', 'Random']
    else:
        types_analysis = ['dctargets', 'dcguild', 'dcstructure', 'random']
        types_analysis2 = ['dctargets', 'dcguild', 'dcstructure'] # Without random!!
        types_analysis_labels = ['dcTargets', 'dcGUILD', 'dcStructure', 'Random']
        types_analysis_labels = [ 'Target', 'PPI','Structure', 'Random']


    # Machine learning parameters
    repetitions = 25 # Number of repetititons
    n_fold = 2     # Number of folds
    min_num_dc_group = 10
    greater_or_smaller = 'greater'
    # Classifier actually used for the evaluation (picked from the dict below)
    classifier = 'SVC best 1'
    classifiers = {
        'KNeighbors' : KNeighborsClassifier(3),
        'SVC' : SVC(probability=True),
        'SVC linear' : SVC(kernel="linear", C=0.025),
        'SVC rbf' : SVC(gamma=2, C=1),
        'DecisionTree' : DecisionTreeClassifier(max_depth=5),
        'RandomForest' : RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
        'MLP' : MLPClassifier(alpha=1),
        'AdaBoost' : AdaBoostClassifier(),
        'GaussianNB' : GaussianNB(),
        'QuadraticDiscr.' : QuadraticDiscriminantAnalysis(),
        'SVC best 1' : SVC(kernel="rbf", gamma=0.01, C=100, probability=True),
        'SVC best 2' : SVC(kernel="rbf", gamma=0.1, C=1.0, probability=True)
    }

    if options.pca:
        pca_str = '_withPCA'
    else:
        pca_str = '_withoutPCA'

    # Plot of distributions of AUC
    plot_auc_distribution = os.path.join(img_dir, 'numtargets_auc_distribution_ranges{}.{}'.format(pca_str, fig_format))

    # Plot of accuracy/sensitivity name
    acc_sens_dctargets = os.path.join(img_dir, 'numtargets_accsens_dctargets_ranges{}.{}'.format(pca_str, fig_format))
    acc_sens_dcguild = os.path.join(img_dir, 'numtargets_accsens_dcguild_ranges{}.{}'.format(pca_str, fig_format))
    acc_sens_dcstructure = os.path.join(img_dir, 'numtargets_accsens_dcstructure_ranges{}.{}'.format(pca_str, fig_format))
    acc_sens_dcatc = os.path.join(img_dir, 'numtargets_accsens_dcatc_ranges{}.{}'.format(pca_str, fig_format))
    acc_sens_dcse = os.path.join(img_dir, 'numtargets_accsens_dcse_ranges{}.{}'.format(pca_str, fig_format))

    # Results table
    results_table = os.path.join(tables_dir, 'numtargets_auc_table_ranges{}.txt'.format(pca_str))

    # Accuracy/Sensitivity results table
    prec_rec_table = os.path.join(tables_dir, 'numtargets_accsens_table_ranges{}.txt'.format(pca_str))

    # File with results of Mann Whitney tests
    mannwhitney_file = os.path.join(tables_dir, 'numtargets_mannwhitney_ranges{}.txt'.format(pca_str))

    # Get the targets file
    drugbank_to_targets_file = os.path.join(toolbox_dir, 'drugbank_to_targets.pcl')
    drugbank_to_targets = cPickle.load(open(drugbank_to_targets_file))

    # Get the DIANA IDs file
    diana_id_to_drugbank_file = os.path.join(toolbox_dir, 'diana_id_to_drugbank.pcl')
    diana_id_to_drugbank = cPickle.load(open(diana_id_to_drugbank_file))


    analysis_results = {} # Defining the dictionary that will store the results

    if consider_se:
        dct_columns, dcg_columns, dcs_columns, dcatc_columns, dcse_columns = diana_analysis.obtain_method_to_columns(threshold_list, ATC_SE=consider_se)
    else:
        dct_columns, dcg_columns, dcs_columns = diana_analysis.obtain_method_to_columns(threshold_list, ATC_SE=consider_se)

    for range_tar in num_targets:

        # Select the comparisons whose two drugs both fall in this
        # number-of-targets bucket.
        selected_rows = []

        for index, row in df.iterrows():

            (drug_id1, drug_id2) = index.split('---')
            drug1 = diana_id_to_drugbank[drug_id1].upper()
            drug2 = diana_id_to_drugbank[drug_id2].upper()

            if len(range_tar) == 1:
                # If it is the first of the range
                if range_tar == num_targets[0]:
                    if len(drugbank_to_targets[drug1]) <= range_tar[0] and len(drugbank_to_targets[drug2]) <= range_tar[0]:
                        selected_rows.append(index)
                # If it is the last of the range
                elif range_tar == num_targets[len(num_targets)-1]:
                    if len(drugbank_to_targets[drug1]) >= range_tar[0] and len(drugbank_to_targets[drug2]) >= range_tar[0]:
                        selected_rows.append(index)
                # If it is in the middle of the range
                else:
                    if len(drugbank_to_targets[drug1]) == range_tar[0] and len(drugbank_to_targets[drug2]) == range_tar[0]:
                        selected_rows.append(index)
            else:
                if len(drugbank_to_targets[drug1]) in range_tar and len(drugbank_to_targets[drug2]) in range_tar:
                    selected_rows.append(index)


        # NOTE(review): DataFrame.ix is removed in modern pandas; .loc is
        # the replacement.
        df_tar = df.ix[selected_rows]
        dc_data = df_tar[df_tar['combination'] == 1]
        num_dc = len(dc_data.index)
        print('Num drug combinations: {}'.format(num_dc))

        if consider_se:
            list_methods = [ ['dctargets', dct_columns], ['dcguild', dcg_columns], ['dcstructure', dcs_columns], ['dcatc', dcatc_columns], ['dcse', dcse_columns], ['random', columns] ]
        else:
            list_methods = [ ['dctargets', dct_columns], ['dcguild', dcg_columns], ['dcstructure', dcs_columns], ['random', columns] ]

        for method, columns_method in list_methods:

            print('Evaluating {} targets with method {}\n'.format(range_tar,method))

            #------------------------------------------------------------------#
            #   SELECT RELEVANT FEATURES / REDUCE DIMENSIONALITY OF THE DATA   #
            #------------------------------------------------------------------#

            if options.pca:

                # First PCA pass: count how many components individually
                # explain more than variance_cut_off of the variance.
                variance_cut_off = 0.01
                num_components = 0
                df_method = df_tar[columns_method]
                df_raw = df_method.drop('combination', axis=1)
                raw_columns = copy.copy(columns_method)
                raw_columns.remove('combination')
                pca = PCA(n_components=None)
                pca.fit(df_raw)
                values_trans = pca.transform(df_raw)
                explained_variance = pca.explained_variance_ratio_
                for column, var in sorted(zip(raw_columns, explained_variance), key=lambda x: x[1], reverse=True):
                    #print(column, var)
                    if var > variance_cut_off:
                        num_components += 1

                if num_components < len(raw_columns):

                    print('Number of features:\t{}\n'.format(len(raw_columns)))
                    print('Reduction to {} components\n'.format(num_components))

                    # Second PCA pass with the reduced component count;
                    # re-attach the 'combination' label column afterwards.
                    pca = PCA(n_components=num_components)
                    pca.fit(df_raw)
                    values_trans = pca.transform(df_raw)
                    indexes = df_method.index.values
                    df_trans = pd.DataFrame.from_records(values_trans, index=indexes)
                    df_comb = df_method[['combination']]
                    df_new = pd.concat([df_trans, df_comb], axis=1)
                    df_method = df_new

            else:

                # Manually introduced features
                guild_thresholds = [1, 5]
                rank_scoring = ['spearman', 'dot_product']
                list_scoring = ['jaccard']
                if method == 'Combination' or method == 'random':
                    selected_columns = diana_analysis.obtain_columns_best_features(guild_thresholds, rank_scoring, list_scoring, ATC_SE=consider_se)
                else:
                    selected_columns = diana_analysis.obtain_columns_best_features_for_specific_method(method, guild_thresholds, rank_scoring, list_scoring)

                # Remove ATC columns if different ATC
                if options.different_atc and consider_se:
                    selected_columns = [col for col in selected_columns if col not in dcatc_columns or col == 'combination']

                print('Selected columns: {}\n'.format(', '.join(selected_columns)))
                print('Number of selected features: {}\n'.format(len(selected_columns)-1)) # We take away the combinations column

                # Define the new table with the selected columns
                df_method = df_tar[selected_columns]
                dc_data = df_method[df_method['combination'] == 1]
                ndc_data = df_method[df_method['combination'] == 0]
                num_dc = len(dc_data.index)
                num_ndc = len(ndc_data.index)

            #------------------------------------------------------------------#


            dc_data = df_method[df_method['combination'] == 1]
            ndc_data = df_method[df_method['combination'] == 0]
            num_dc = len(dc_data.index)
            num_ndc = len(ndc_data.index)

            print('Building {} repetition groups of {} (same) DC and {} (different) non-DC'.format(repetitions,num_dc,num_dc))
            ndc_repetitions = diana_analysis.obtain_n_groups_of_k_length(ndc_data, repetitions, num_dc) # Obtain n number of groups containing different non-drug combinations to repeat the analysis n times

            mean_aucs = [] # Here we will store the means of AUCs from the cross-validations
            std_aucs = [] # Here we will store the standard deviations of the AUCs from the cross-validations
            all_aucs = [] # Here we will store ALL the AUCs
            all_probs = [] # Here we store all the probabilities and labels

            num_repetitions=0
            for ndc_data_equal in ndc_repetitions:

                num_repetitions+=1
                num_items_group = int( float(num_dc) / float(n_fold) ) # Calculate the number of items in each group of the cross-validation
                if num_repetitions == 1:
                    print('Building {} fold groups of {} DC and {} non-DC x {} repetitions'.format(n_fold,num_items_group,num_items_group, repetitions))

                dc_groups = diana_analysis.obtain_n_groups_of_k_length(dc_data, n_fold, num_items_group, me_too_drug_combinations) # Defining the drug combination groups in each cross-validation step
                ndc_groups = diana_analysis.obtain_n_groups_of_k_length(ndc_data_equal, n_fold, num_items_group, me_too_drug_combinations) # Defining the non-drug combination groups in each cross-validation step
                merged_groups = [pd.concat([x,y]) for x,y in zip(dc_groups, ndc_groups)]

                if method == 'random':
                    #mean, var, std, list_auc = run_nfold_crossvalidation_random(n_fold, merged_groups, classifiers[classifier])
                    mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_dummy(n_fold, merged_groups, classifiers[classifier])
                else:
                    mean, var, std, list_auc, list_prob = diana_analysis.run_nfold_crossvalidation_scikit_with_prob(n_fold, merged_groups, classifiers[classifier])

                mean_aucs.append(mean)
                std_aucs.append(std)
                all_aucs = all_aucs + list_auc
                all_probs = all_probs + list_prob

            final_mean = np.mean(all_aucs)
            #final_mean = np.mean(mean_aucs)
            std = np.std(all_aucs)
            mean_std = np.mean(std_aucs)
            std_means = np.std(mean_aucs)
            print('FINAL MEAN: {}'.format(final_mean))
            print('STD: {}\n'.format(std))
            #print('MEAN of STD: {}'.format(mean_std))

            # Store the distribution of AUCs in the dictionary
            analysis_results.setdefault(range_tar[0], {})
            analysis_results[range_tar[0]].setdefault(method, {})
            analysis_results[range_tar[0]][method]['all_aucs'] = all_aucs
            analysis_results[range_tar[0]][method]['all_probs'] = all_probs
            analysis_results[range_tar[0]][method]['mean'] = final_mean
            analysis_results[range_tar[0]][method]['std'] = std
            analysis_results[range_tar[0]][method]['num_dc'] = num_dc


    #------------------------------------#
    #   PLOT PRECISION VS. SENSITIVITY   #
    #------------------------------------#

    analysis_results = plot_precision_sensitivity(analysis_results, 'dctargets', num_targets, acc_sens_dctargets)
    analysis_results = plot_precision_sensitivity(analysis_results, 'dcguild', num_targets, acc_sens_dcguild)
    analysis_results = plot_precision_sensitivity(analysis_results, 'dcstructure', num_targets, acc_sens_dcstructure)
    if consider_se:
        analysis_results = plot_precision_sensitivity(analysis_results, 'dcatc', num_targets, acc_sens_dcatc)
        analysis_results = plot_precision_sensitivity(analysis_results, 'dcse', num_targets, acc_sens_dcse)


    #----------------------------------------------------#
    #   PLOT DISTRIBUTION OF AUC PER NUMBER OF TARGETS   #
    #----------------------------------------------------#

    plot_auc_distributions(analysis_results, num_targets, types_analysis, types_analysis_labels, plot_auc_distribution, fig_format=fig_format, consider_se=consider_se)


    #--------------------------------------------------------#
    #   TABLE OF DISTRIBUTION OF AUC PER NUMBER OF TARGETS   #
    #--------------------------------------------------------#

    with open(results_table, 'w') as results_table_fd:

        # Header
        results_table_fd.write(' ')
        for method in types_analysis_labels:
            results_table_fd.write('\t{}\t \t '.format(method))
        results_table_fd.write('\n')

        for num in num_targets:
            results_table_fd.write('{}'.format(num))
            for method in types_analysis:
                mean = analysis_results[num[0]][method]['mean']
                std = analysis_results[num[0]][method]['std']
                num_dc = analysis_results[num[0]][method]['num_dc']
                results_table_fd.write('\t{}\t{}\t{}'.format(mean, std, num_dc))
            results_table_fd.write('\n')


    #----------------------------------------#
    #   TABLE OF PRECISION VS. SENSITIVITY   #
    #----------------------------------------#

    # 'cut_off'/'value' entries are added by plot_precision_sensitivity above.
    with open(prec_rec_table, 'w') as prec_rec_table_fd:

        # Header
        prec_rec_table_fd.write(' ')
        for method in types_analysis2:
            prec_rec_table_fd.write('\t{}\t '.format(method))
        prec_rec_table_fd.write('\n')

        for num in num_targets:
            prec_rec_table_fd.write('{}'.format(num))
            for method in types_analysis2:
                cut_off = analysis_results[num[0]][method]['cut_off']
                value = analysis_results[num[0]][method]['value']
                prec_rec_table_fd.write('\t{}\t{}'.format(cut_off, value))
            prec_rec_table_fd.write('\n')


    #-------------------------------------------------------------------#
    #   TABLE OF COMPARISON OF AUC DISTRIBUTIONS USING MANN WHITNEY U   #
    #-------------------------------------------------------------------#

    with open(mannwhitney_file, 'w') as mannwhitney_fd:

        mann_results = {}

        mannwhitney_fd.write(' \t ')
        for method in types_analysis_labels:
            mannwhitney_fd.write('\t{}'.format(method))
        mannwhitney_fd.write('\n')

        # Perform the comparisons
        for num in num_targets:
            mann_results.setdefault(num[0], {})
            for method1 in types_analysis:
                mann_results[num[0]].setdefault(method1, {})
                for method2 in types_analysis:
                    if method1 == method2:
                        mann_results[num[0]][method1][method2] = '-'
                    else:
                        method1_dist = analysis_results[num[0]][method1]['all_aucs']
                        method2_dist = analysis_results[num[0]][method2]['all_aucs']
                        stat, pval = scipy.stats.mannwhitneyu(method1_dist, method2_dist)
                        mann_results[num[0]][method1][method2] = [stat, pval]

        # Write the table of crossings
        for num in num_targets:
            for method1 in types_analysis:
                mannwhitney_fd.write('{}\t{}'.format(num[0], method1))
                for method2 in types_analysis:
                    if method1 == method2:
                        mannwhitney_fd.write('\t-')
                    else:
                        stat, pval = mann_results[num[0]][method1][method2]
                        mannwhitney_fd.write('\t{}, {:.2e}'.format(stat,pval))
                mannwhitney_fd.write('\n')




    # End marker for time
    end = time.time()
    print('\n  DIANA INFO:\tTIME OF EXECUTION: {:.3f} seconds or {:.3f} minutes.\n'.format(end - start, (end - start) / 60))



    return
Пример #18
0
def classiferCompare(X_train, X_test, Y_train, Y_test):
    """Train a battery of classifiers and compare them on one dataset.

    For each model: fit on the training split, report 5-fold CV accuracy,
    test accuracy, precision, recall, F1 and a classification report, and
    draw its ROC curve in a grid of subplots.

    Args:
        X_train, X_test: feature matrices for the train/test splits.
        Y_train, Y_test: binary label vectors (precision/recall/F1 and the
            ROC curve assume binary classification).
    """
    # Bug fix: ``names`` had 13 entries but ``classifiers`` only 12 (there
    # was a stray "XGBoost" label with no matching model), so zip() shifted
    # every label after "AdaBoost" onto the wrong classifier and silently
    # dropped the last one. The lists are now aligned 1:1.
    names = [
        "KNeighborsClassifier",
        "Linear SVM",
        # "RBF SVM",
        "Decision Tree",
        "Stochastic Gradient Descent",
        "Gaussian Process",
        "LDA",
        "QDA",
        "Random Forest",
        "GaussianNB",
        "AdaBoost",
        "LogisticRegression(L1)",
        "LogisticRegression(L2)"
    ]

    classifiers = [
        KNeighborsClassifier(3),
        LinearSVC(C=1, penalty='l1', loss='squared_hinge', dual=False),
        # SVC(kernel='rbf', C=1000),
        DecisionTreeClassifier(),
        SGDClassifier(loss="perceptron", penalty="l2"),
        GaussianProcessClassifier(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        RandomForestClassifier(n_estimators=200, max_features=15),
        GaussianNB(),
        AdaBoostClassifier(),
        LogisticRegression(penalty='l1'),
        LogisticRegression(penalty='l2')
    ]

    figure = plt.figure(figsize=(27, 9))

    # Iterate over the classifier models.
    print("Start training!")

    plot_number = 1
    for name, clf in zip(names, classifiers):
        clf.fit(X_train, Y_train)
        train_score = cross_val_score(clf, X_train, Y_train, cv=5)
        test_score = clf.score(X_test, Y_test)
        Y_pred = clf.predict(X_test)

        precision = precision_score(Y_test, Y_pred)
        recall = recall_score(Y_test, Y_pred)
        f1 = f1_score(Y_test, Y_pred)

        print("***", name, "***")
        print("Train Score:", train_score.mean())
        print("Test Score:", test_score)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 score", f1)
        print(classification_report(Y_test, Y_pred))

        # Plot ROC curve for this model in its own subplot.
        # Bug fix: integer division — ``len(classifiers) / 2`` is a float in
        # Python 3 and plt.subplot requires integer grid arguments.
        ax = plt.subplot(4, len(classifiers) // 2, plot_number)
        fpr, tpr, thresholds = roc_curve(Y_test, Y_pred)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr,
                 tpr,
                 lw=1,
                 color='darkorange',
                 label='ROC curve (area = %0.2f)' % roc_auc)
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.title(name)
        plt.xlabel('False positive rate', fontsize=12)
        plt.ylabel('True positive rate', fontsize=12)
        plt.legend(loc="lower right")
        plot_number += 1

    plt.subplots_adjust(hspace=0.5)
    plt.show()
Пример #19
0
def main():
    """Orchestrate data analysis.

    Reads the problem description from ``config.json``, loads the train and
    test CSVs, engineers a ``blood_per_donation`` feature, estimates each
    candidate classifier's score with 5-fold cross validation, then trains
    the classifier at index 13 (LDA) on all labeled data and writes the
    test-set predictions via ``write_solution``.
    """
    # Load configuration file which describes the problem
    with open('config.json') as data_file:
        config = json.load(data_file)

    # Load data.  ``.loc``/``.iloc`` replace ``DataFrame.ix``, which was
    # removed in pandas 1.0 and crashes on any modern install.
    train_df = pd.read_csv(config['input']['train'])
    test_df = pd.read_csv(config['input']['test'])
    for factor_column in config['input']['factor_columns']:
        # NOTE(review): assumes factor_columns lists column *labels*; the
        # old .ix indexer also accepted positions — confirm in config.json.
        train_df.loc[:, factor_column] = (
            train_df.loc[:, factor_column].astype('category'))
    # .copy() so the feature engineering below mutates real frames, not
    # views of train_df/test_df (avoids SettingWithCopyWarning).
    train_x = train_df.loc[:, config['input']['feature_columns']].copy()
    test_x = test_df.loc[:, config['input']['feature_columns']].copy()
    test_ids = test_df.iloc[:, 0]
    train_y = train_df.loc[:, config['input']['label_column']]

    # Add new column: ratio of positional column 3 to column 2 (volume per
    # donation), then drop the now-redundant raw volume column.
    train_x['blood_per_donation'] = train_x.iloc[:, 3] / train_x.iloc[:, 2]
    del train_x['Total Volume Donated (c.c.)']
    test_x['blood_per_donation'] = test_x.iloc[:, 3] / test_x.iloc[:, 2]
    del test_x['Total Volume Donated (c.c.)']

    # Simple statistics
    print(train_x.describe(include='all'))
    print(train_y.describe(include='all'))
    print("# Class 1: %i \t\t # class 0: %i" %
          (sum(train_y), len(train_y) - sum(train_y)))

    # It's easier to work with numpy.  ``.to_numpy()`` replaces the removed
    # ``DataFrame.as_matrix()``.
    train_x_orig = train_x.to_numpy()
    train_y_orig = train_y.to_numpy()

    # Shuffle data
    perm = np.random.permutation(len(train_y_orig))
    train_x_orig = train_x_orig[perm]
    train_y_orig = train_y_orig[perm]

    # Candidate classifiers.  NOTE: index 13 (LDA) is hard-referenced below
    # to pick the final model, so keep this order stable.
    classifiers = [
        ('Logistic Regression (C=1)', LogisticRegression(C=1)),
        # Label fixed: the estimator actually uses C=10000, not C=1000.
        ('Logistic Regression (C=10000)', LogisticRegression(C=10000)),
        ('SVM, adj.',
         SVC(probability=True,
             kernel="rbf",
             C=2.8,
             gamma=.0073,
             cache_size=200)),
        ('k nn (k=3)', KNeighborsClassifier(3)),
        ('k nn (k=5)', KNeighborsClassifier(5)),
        ('k nn (k=7)', KNeighborsClassifier(7)),
        ('k nn (k=21)', KNeighborsClassifier(21)),
        ('Decision Tree', DecisionTreeClassifier(max_depth=5)),
        ('Random Forest', RandomForestClassifier(n_estimators=50, n_jobs=10)),
        ('Random Forest 2',
         RandomForestClassifier(max_depth=5,
                                n_estimators=10,
                                max_features=1,
                                n_jobs=10)),
        ('AdaBoost', AdaBoostClassifier()),
        ('Naive Bayes', GaussianNB()),
        ('Gradient Boosting', GradientBoostingClassifier()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('QDA', QuadraticDiscriminantAnalysis())
    ]

    kf = KFold(n_splits=5)
    for i, (clf_name, clf) in enumerate(classifiers):
        print("-" * 80)
        print("Name: %s (%i)" % (clf_name, i))
        score_estimates = []
        for train_ids, val_ids in kf.split(train_x_orig):
            # Split labeled data into training and validation
            train_x = train_x_orig[train_ids]
            train_y = train_y_orig[train_ids]
            val_x = train_x_orig[val_ids]
            val_y = train_y_orig[val_ids]

            # Train classifier
            clf.fit(train_x, train_y)

            # Estimate the score from P(class == 1) on the held-out fold.
            val_pred = clf.predict_proba(val_x)[:, 1]
            score_estimates.append(calculate_score(val_y, val_pred))
            print("Estimated score: %0.4f" % score_estimates[-1])
        print("Average estimated score: %0.4f" %
              np.array(score_estimates).mean())
    print("#" * 80)

    # Train classifier on complete data (index 13 == ('LDA', ...)).
    clf_name, clf = classifiers[13]
    print("Train %s on complete data and generated %s" %
          (clf_name, config['output']))
    clf.fit(train_x_orig, train_y_orig)

    # Predict and write output
    test_predicted = clf.predict_proba(test_x)[:, 1]
    write_solution(test_ids, test_predicted, config['output'])
Пример #20
0
def _append_proba_feature(clf, X, y, X_val, test_data):
    """Fit ``clf`` on (X, y) and append its P(class == 1) as a new column
    to the training, validation and test matrices."""
    clf.fit(X, y)
    X = np.c_[X, clf.predict_proba(X)[:, 1]]
    X_val = np.c_[X_val, clf.predict_proba(X_val)[:, 1]]
    test_data = np.c_[test_data, clf.predict_proba(test_data)[:, 1]]
    return X, X_val, test_data


def preprocess(X,
               y,
               X_val,
               test_data,
               verbose=True,
               scale=True,
               autoencoder=True,
               qda=True,
               knn=False,
               xgb=False):
    """Preprocess the data by adding features and scaling it.

    For each method, we train the model on the training data using the
    corresponding labels, then apply the same transformation to
    validation and test data.

    Args:
        X (numpy ndarray): Training data
        y (numpy ndarray): Training labels
        X_val (numpy ndarray): Validation data
        test_data (numpy ndarray): Test data for submission
        verbose (bool): log level
        scale (bool): scale the data
        autoencoder (bool): use autoencoder feature
        qda (bool): use Quadratic Discriminant Analysis feature
        knn (bool): use k-nearest neighbours feature
        xgb (bool): use XGBoost feature

    Returns:
        The dataset appropriately transformed by the selected methods.
    """
    if autoencoder:
        if verbose:
            print("## Autoencoder")
            print("### Train...", end=" ", flush=True)
        # Single call replaces the duplicated if/else training branches,
        # which only differed in the trainer's own verbosity flag.
        ae = train_autoencoder(X, size=32, epochs=20,
                               verbose=1 if verbose else 0)
        if verbose:
            print("done.")
            print("### Evaluate...", end=" ", flush=True)
        ae.eval()
        # Append the first-layer activations as extra features.
        X_ae = ae.layer1(Variable(torch.Tensor(X))).data
        X = np.c_[X, X_ae]
        X_val_ae = ae.layer1(Variable(torch.Tensor(X_val))).data
        X_val = np.c_[X_val, X_val_ae]
        test_data_ae = ae.layer1(Variable(torch.Tensor(test_data))).data
        test_data = np.c_[test_data, test_data_ae]
        if verbose:
            print("done.")

    if qda:
        if verbose:
            print("## Quadratic Discriminant Analysis...", end=" ", flush=True)
        X, X_val, test_data = _append_proba_feature(
            QuadraticDiscriminantAnalysis(reg_param=0.02), X, y, X_val,
            test_data)
        if verbose:
            print("done.")

    if knn:
        # BUG FIX: these messages previously printed unconditionally,
        # ignoring ``verbose``; now guarded like every other step.
        if verbose:
            print("## K-Nearest Neighbours...", end=" ", flush=True)
        X, X_val, test_data = _append_proba_feature(
            KNeighborsClassifier(n_neighbors=10, p=2, n_jobs=-1), X, y, X_val,
            test_data)
        if verbose:
            print("done.")

    if xgb:
        # BUG FIX: same unconditional printing as the knn branch.
        if verbose:
            print("## XGBoost...", end=" ", flush=True)
        X, X_val, test_data = _append_proba_feature(
            XGBClassifier(max_depth=3,
                          learning_rate=0.1,
                          n_estimators=1000,
                          gamma=10,
                          min_child_weight=10,
                          objective='binary:logistic',
                          n_jobs=4), X, y, X_val, test_data)
        if verbose:
            print("done.")

    if scale:
        if verbose:
            print("## Scaling...", end=" ", flush=True)
        # Fit the scaler on training data only; reuse it for val/test.
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        X_val = scaler.transform(X_val)
        test_data = scaler.transform(test_data)
        if verbose:
            print("done.")

    return X, y, X_val, test_data
Пример #21
0
def main():
    """Download the current Numerai dataset, fit a pool of classifiers on
    half of the features, upload 'legit' and 'mix' predictions, then check
    concordance results for both submission groups.

    Exits with status 0 when every legit submission passes concordance and
    no mix submission does; exits with status 1 otherwise.
    """
    # when running on circleci, set the vars in the project settings
    public_id = os.environ.get('NUMERAPI_PUBLIC_ID', '')
    secret_key = os.environ.get('NUMERAPI_SECRET_KEY', '')

    # test_csv is a module-level path — presumably a scratch directory for
    # intermediate CSVs; created up front if missing.  TODO confirm.
    if not os.path.exists(test_csv):
        os.makedirs(test_csv)

    napi = NumeraiApiWrapper(public_id=public_id, secret_key=secret_key)

    # Download and unzip the dataset only when no cached copy exists.
    if not os.path.exists(DATA_SET_PATH):
        logger.info("Downloading the current dataset...")
        os.makedirs(DATA_SET_PATH)
        napi.download_current_dataset(dest_path=DATA_SET_PATH,
                                      dest_filename=DATA_SET_FILE + '.zip',
                                      unzip=True)

        # Flatten the unzipped layout: move both CSVs up one directory.
        shutil.move(os.path.join(DATA_SET_PATH, DATA_SET_FILE, TRAIN_FILE),
                    os.path.join(DATA_SET_PATH, TRAIN_FILE))
        shutil.move(os.path.join(DATA_SET_PATH, DATA_SET_FILE, TOURN_FILE),
                    os.path.join(DATA_SET_PATH, TOURN_FILE))
    else:
        logger.info("Found old data to use.")

    training_data = pd.read_csv('%s/%s' % (DATA_SET_PATH, TRAIN_FILE),
                                header=0)
    tournament_data = pd.read_csv('%s/%s' % (DATA_SET_PATH, TOURN_FILE),
                                  header=0)

    napi.set_data(tournament_data, training_data)

    # Feature columns are identified by name; only the first half is used.
    features = [f for f in list(training_data) if "feature" in f]
    features = features[:len(features) //
                        2]  # just use half, speed things up a bit
    X, Y = training_data[features], training_data[
        "target_bernie"]  # hardcode to target bernie for now

    x_prediction = tournament_data[features]
    ids = tournament_data["id"]

    # Small, fast classifier pool; random_state fixed where supported so
    # repeated runs produce identical submissions.
    clfs = [
        RandomForestClassifier(n_estimators=15,
                               max_features=1,
                               max_depth=2,
                               n_jobs=1,
                               criterion='entropy',
                               random_state=42),
        XGBClassifier(learning_rate=0.1,
                      subsample=0.4,
                      max_depth=2,
                      n_estimators=20,
                      nthread=1,
                      seed=42),
        DecisionTreeClassifier(max_depth=5, random_state=42),
        MLPClassifier(alpha=1, hidden_layer_sizes=(25, 25), random_state=42),
        GaussianNB(),
        QuadraticDiscriminantAnalysis(tol=1.0e-3),
        # last item can have multiple jobs since it may be the last to be processed so we have an extra core
        LogisticRegression(n_jobs=2,
                           solver='sag',
                           C=1,
                           tol=1e-2,
                           random_state=42,
                           max_iter=50)
    ]

    before = time.time()
    fit_all(clfs, X, Y)
    logger.info('all clfs fit() took %.2fs' % (time.time() - before))

    # Uploads run asynchronously; both helpers return futures awaited below.
    before = time.time()
    uploads_wait_for_legit = predict_and_upload_legit(napi, clfs, x_prediction,
                                                      ids)
    logger.info('all legit clfs predict_proba() took %.2fs' %
                (time.time() - before))

    before = time.time()
    uploads_wait_for_mix = predict_and_upload_mix(napi, clfs, tournament_data,
                                                  x_prediction, ids)
    logger.info('all mix clfs predict_proba() took %.2fs' %
                (time.time() - before))

    legit_submission_ids = list()
    mix_submission_ids = list()

    # Collect the submission ids as the upload futures complete.
    before = time.time()
    for f in futures.as_completed(uploads_wait_for_legit):
        legit_submission_ids.append(f.result())
    logger.info('await legit uploads took %.2fs' % (time.time() - before))

    before = time.time()
    for f in futures.as_completed(uploads_wait_for_mix):
        mix_submission_ids.append(f.result())
    logger.info('await mix uploads took %.2fs' % (time.time() - before))

    # Despite the n_ prefix, n_passed_concordance appears to be the
    # collection of submissions that passed — len() is taken below.
    # Every legit submission is expected to pass concordance.
    n_passed_concordance = get_concordance(napi, legit_submission_ids)
    if len(n_passed_concordance) != len(clfs):
        logger.error('legit passed concordance %s/%s' %
                     (len(n_passed_concordance), len(clfs)))
        sys.exit(1)
    else:
        logger.info('all legit tests passed!')

    # Conversely, no 'mix' submission is expected to pass — any pass is an
    # error (presumably mixes are deliberately non-concordant; confirm).
    n_passed_concordance = get_concordance(napi, mix_submission_ids)
    if len(n_passed_concordance) > 0:
        logger.error('mix passed concordance %s/%s' %
                     (len(n_passed_concordance), len(clfs)))
        sys.exit(1)
    else:
        logger.info('all mix tests passed!')

    sys.exit(0)
Пример #22
0
def create_csv_score_YES_NO(scaler_, abbr_scaler):
    """Repeat a stratified QDA train/test experiment 30 times and save scores.

    On each pass the data is split 70/30 (stratified), a
    ``scaler -> (no reduction) -> QDA`` pipeline is fit on the training
    part, and train accuracy, test accuracy and the weighted one-vs-rest
    ROC AUC are recorded.  Per-run values plus their mean and standard
    deviation are written to ``score_{name}_{abbr_scaler}_YES_NO.csv``.

    Args:
        scaler_: scikit-learn scaler instance used as the first pipeline step.
        abbr_scaler: short scaler tag embedded in the output file name.

    Note:
        Relies on module-level globals: ``public_data``, ``public_labels``,
        ``encoder``, ``name`` and ``folder``.
    """
    tot_train_score = []
    tot_test_score = []
    tot_weighted_ovr = []

    for i in range(1, 31):

        # Stratified 70/30 split; a fresh random split on every pass.
        X_train, X_test, y_train, y_test = train_test_split(
            public_data, public_labels, test_size=0.3, stratify=public_labels)

        # Encode the string labels as integers.
        train_labels_encoded = encoder.fit_transform(y_train)
        test_labels_encoded = encoder.transform(y_test)

        scaler = scaler_
        clf = QuadraticDiscriminantAnalysis()

        # 'red_dim' is an explicit no-op slot kept for symmetry with the
        # other experiment pipelines that do use dimensionality reduction.
        steps = [('scaler', scaler), ('red_dim', None), ('clf', clf)]

        pipeline = Pipeline(steps)

        pipeline.fit(X_train, train_labels_encoded)

        tot_train_score.append(pipeline.score(X_train, train_labels_encoded))
        tot_test_score.append(pipeline.score(X_test, test_labels_encoded))

        y_scores = pipeline.predict_proba(X_test)

        weighted_ovr = roc_auc_score(test_labels_encoded,
                                     y_scores,
                                     average='weighted',
                                     multi_class='ovr')
        tot_weighted_ovr.append(weighted_ovr)

    # Mean and standard deviation over the 30 repetitions.
    mean_train_score = np.mean(tot_train_score)
    mean_test_score = np.mean(tot_test_score)
    mean_weighted_ovr = np.mean(tot_weighted_ovr)

    std_train_score = np.std(tot_train_score)
    std_test_score = np.std(tot_test_score)
    std_weighted_ovr = np.std(tot_weighted_ovr)

    # pandas can convert a list of lists to a dataframe.
    # each list is a row thus after constructing the dataframe
    # transpose is applied to get to the user's desired output.
    df = pd.DataFrame([
        tot_train_score, [mean_train_score], [std_train_score], tot_test_score,
        [mean_test_score], [std_test_score], tot_weighted_ovr,
        [mean_weighted_ovr], [std_weighted_ovr], [scaler]
    ])
    df = df.transpose()

    fieldnames = [
        'train_accuracy', 'train_accuracy_MEAN', 'train_accuracy_STD',
        'test_accuracy', 'test_accuracy_MEAN', 'test_accuracy_STD',
        'roc_auc_score_weighted_ovr', 'roc_auc_score_weighted_ovr_MEAN',
        'roc_auc_score_weighted_ovr_STD', 'SCALER'
    ]

    import os

    # Create the output folder if needed, then write the CSV without the
    # dataframe index and with the column names above as header.
    outname = f'score_{name}_{str(abbr_scaler)}_YES_NO.csv'

    outdir = f'/home/users/ubaldi/TESI_PA/result_score/Public/{folder}/'
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    fullname = os.path.join(outdir, outname)

    df.to_csv(fullname, index=False, header=fieldnames)
    "Nearest Neighbors", "Linear SVC", "RBF SVC", "Gaussian Process",
    "Decision Tree", "Random Forest", "Multilayer Perceptron", "AdaBoost",
    "Naive Bayes", "QDA", "XGBoost", "Logistic Regression"
]

# Classifier instances, index-aligned with the names list above.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    # 'sqrt' is the classifier alias that 'auto' mapped to; 'auto' was
    # removed in scikit-learn 1.3, so spell it explicitly.
    RandomForestClassifier(n_estimators=100, max_features='sqrt'),
    MLPClassifier(alpha=1, max_iter=int(1e8)),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    # BUG FIX: XGBClassifier was listed as the *class* object instead of an
    # instance — any fit()/predict() call on it would fail.
    XGBClassifier(),
    LogisticRegression()
]

# Feature-selection scoring functions from the skfeature package; each entry
# pairs positionally with the abbreviation at the same index in
# selectornames_short below.
selectors = [
    reliefF.reliefF, fisher_score.fisher_score, gini_index.gini_index,
    chi_square.chi_square, JMI.jmi, CIFE.cife, DISR.disr, MIM.mim, CMIM.cmim,
    ICAP.icap, MRMR.mrmr, MIFS.mifs
]

# Short display names, index-aligned with `selectors`.
selectornames_short = [
    "RELF", "FSCR", "GINI", "CHSQ", "JMI", "CIFE", "DISR", "MIM", "CMIM",
    "ICAP", "MRMR", "MIFS"
]

# class boundary list
Пример #24
0
def predefined_ops():
    '''Return a dict of user-defined, non-default operator instances.

    Keys are short mnemonic strings (e.g. 'woe5', 'stdscale', 'frf20')
    mapping to pre-configured cleaning, encoding, resampling, scaling,
    feature-construction and feature-selection pipeline steps.
    '''
    # Missing-value / dtype cleaning variants; na1 and na2 presumably select
    # the categorical and numeric fill strategies — confirm in Cleaner.
    clean = {
        'clean':
        Cleaner(dtype_filter='not_datetime',
                na1='null',
                na2='mean',
                drop_uid=True),
        'cleanNA':
        Cleaner(dtype_filter='not_datetime', na1=None, na2=None),
        'cleanMean':
        Cleaner(dtype_filter='not_datetime', na1='most_frequent', na2='mean'),
        'cleanMn':
        Cleaner(dtype_filter='not_datetime', na1='missing', na2='mean'),
    }
    #
    # Categorical encoders: weight-of-evidence variants (tree-leaf, quantile,
    # fixed-bin, monotonic) plus one-hot and ordinal encodings.
    encode = {
        'woe8': WoeEncoder(max_leaf_nodes=8),
        'woe5': WoeEncoder(max_leaf_nodes=5),
        'woeq8': WoeEncoder(q=8),
        'woeq5': WoeEncoder(q=5),
        'woeb5': WoeEncoder(bins=5),
        'woem': WoeEncoder(mono=True),
        'oht': OhtEncoder(),
        'ordi': OrdiEncoder(),

        # 'bin10': BinEncoder(bins=10, int_bins=True),  # 10 bin edges encoder
        # 'bin5': BinEncoder(bins=5, int_bins=True),  # 5 bin edges encoder
        # 'binm10': BinEncoder(max_leaf_nodes=10,
        #                      int_bins=True),  # 10 bin tree cut edges encoder
        # 'binm5': BinEncoder(max_leaf_nodes=5,
        #                     int_bins=True),  # 5 bin tree cut edges encoder
    }

    # Resamplers (imbalanced-learn style): under-sampling plus outlier
    # rejection wrapped as FunctionSampler steps.
    resample = {
        # over_sampling
        # under sampling controlled methods
        'runder':
        RandomUnderSampler(),
        'nearmiss':
        NearMiss(version=3),
        'pcart':
        InstanceHardnessThreshold(),
        # clean outliers
        'inlierForest':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'IsolationForest',
                            'contamination': 0.1
                        }),
        'inlierLocal':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'LocalOutlierFactor',
                            'contamination': 0.1
                        }),
        'inlierEllip':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'EllipticEnvelope',
                            'contamination': 0.1
                        }),
        'inlierOsvm':
        FunctionSampler(_outlier_rejection,
                        kw_args={
                            'method': 'OneClassSVM',
                            'contamination': 0.1
                        }),
    }

    # Scalers; note 'absmax' and 'maxabs' are both MaxAbsScaler instances
    # (the latter listed under the sparse-data heading).
    scale = {
        'stdscale': StandardScaler(),
        'minmax': MinMaxScaler(),
        'absmax': MaxAbsScaler(),
        'rscale': RobustScaler(quantile_range=(10, 90)),
        'quantile': QuantileTransformer(),  # uniform distribution
        'power': PowerTransformer(),  # Gaussian distribution
        'norm': Normalizer(),  # default L2 norm

        # scale sparse data
        'maxabs': MaxAbsScaler(),
        'stdscalesp': StandardScaler(with_mean=False),
    }
    # feature construction
    feature_c = {
        'pca': PCA(whiten=True),
        'spca': SparsePCA(n_jobs=-1),
        'ipca': IncrementalPCA(whiten=True),
        'kpca': KernelPCA(kernel='rbf', n_jobs=-1),
        'poly': PolynomialFeatures(degree=2),
        # kernel approximation
        'Nys': Nystroem(random_state=0),
        'rbf': RBFSampler(random_state=0),
        'rfembedding': RandomTreesEmbedding(n_estimators=10),
        'LDA': LinearDiscriminantAnalysis(),
        'QDA': QuadraticDiscriminantAnalysis(),
    }
    # select from model
    feature_m = {
        'fwoe':
        SelectFromModel(WoeEncoder(max_leaf_nodes=5)),
        'flog':
        SelectFromModel(LogisticRegression(penalty='l1', solver='saga',
                                           C=1e-2)),
        'fsgd':
        SelectFromModel(SGDClassifier(penalty="l1")),
        'fxgb':
        SelectFromModel(
            XGBClassifier(n_jobs=-1,
                          booster='gbtree',
                          max_depth=2,
                          n_estimators=50), ),
        'frf':
        SelectFromModel(ExtraTreesClassifier(n_estimators=50, max_depth=2)),

        # fixed number of features
        'fxgb20':
        SelectFromModel(XGBClassifier(n_jobs=-1, booster='gbtree'),
                        max_features=20),
        'frf20':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                        max_features=20),
        'frf10':
        SelectFromModel(ExtraTreesClassifier(n_estimators=100, max_depth=5),
                        max_features=10),
        'fRFElog':
        RFE(LogisticRegression(penalty='l1', solver='saga', C=1e-2), step=0.1),
        'fRFExgb':
        RFE(XGBClassifier(n_jobs=-1, booster='gbtree'), step=0.1),
    }
    # Univariate feature selection
    feature_u = {
        'fchi2':
        GenericUnivariateSelect(chi2, 'percentile', 25),
        'fMutualclf':
        GenericUnivariateSelect(mutual_info_classif, 'percentile', 25),
        'fFclf':
        GenericUnivariateSelect(f_classif, 'percentile', 25),
    }

    # Estimators used for feature-importance ranking.
    imp = {
        "impXGB":
        XGBClassifier(n_jobs=-1,
                      booster='gbtree',
                      max_depth=2,
                      n_estimators=50),
        "impRF":
        ExtraTreesClassifier(n_estimators=100, max_depth=2)
    }

    # Merge all groups into one flat lookup table; keys are assumed unique
    # across groups (duplicates would silently overwrite).
    instances = {}
    instances.update(**clean, **encode, **scale, **feature_c, **feature_m,
                     **feature_u, **resample, **imp)
    return instances
Пример #25
0
  plot_CM_and_ROC_curve(clf, x_std, y_train, x_test_std, y_test)

#Ensemble Model
plot_CM_and_ROC_curve(('Ensemble model', eclf), x_std, y_train, x_test_std, y_test)

#A list of classifiers to run K-Fold Cross Validation on
clfrs = []

clfrs.append(('Logistic Regression', LogisticRegression(random_state=42)))
clfrs.append(('Naive Bayes', GaussianNB()))
#classifiers.append(('KNN', KNeighborsClassifier()))#This one takes a very long time to run!
#classifiers.append(('SVM', SVC(random_state=42, probability=True))) #This one takes a very long time to run!
clfrs.append(('Decision Tree', DecisionTreeClassifier(random_state=42)))
clfrs.append(('Random Forest', RandomForestClassifier(random_state=42)))
clfrs.append(('LDA', LinearDiscriminantAnalysis()))
clfrs.append(('QDA', QuadraticDiscriminantAnalysis()))
clfrs.append(('Ensemble Model', eclf))

#Iterate over the list to validate every model
#This step of validating every trained model takes a lot of time to execute. As the dataset it has to validate over is very large
#The runtime of the code is subject to a good GPU unit, which in general laptops is a constraint
#The value of k is set 20
for classifier in clfrs:
    clf = classifier[1]
    # NOTE(review): this fit() is redundant for the CV score below —
    # cross_val_score clones and refits the estimator on each of the 20
    # folds — but it leaves clf trained on the full training set.
    clf.fit(x_train, y_train)
    training_score = cross_val_score(clf, x_train, y_train, cv=20)
    print("Classifiers: ", classifier[0], "has a cross validation score of", round(training_score.mean(), 2) * 100, "% accuracy score")

#A bar plot for all the trained models and their F1 score
# NOTE(review): these values look hard-coded from an earlier run rather than
# computed here — confirm they still match the loop output above.
train_accuracies = [0.71, 0.61, 1.00, 1.00, .72, 0.68, 0.86]
models = ['Logistic Regression', 'Naive Bayes', 'Decision Tree', 'Random Forest', 'LDA', 'QDA', 'Ensemble']
Пример #26
0
    splot.set_yticks(())


def plot_lda_cov(lda, splot):
    """Draw the pooled LDA covariance ellipse for each of the two classes."""
    # LDA estimates a single shared covariance matrix, so both class
    # ellipses reuse lda.covariance_ and differ only in mean and colour.
    for cls, colour in ((0, 'red'), (1, 'blue')):
        plot_ellipse(splot, lda.means_[cls], lda.covariance_, colour)


def plot_qda_cov(qda, splot):
    """Draw each class's own QDA covariance ellipse."""
    # QDA fits a separate covariance matrix per class, hence the per-class
    # index into qda.covariances_.
    for cls, colour in ((0, 'red'), (1, 'blue')):
        plot_ellipse(splot, qda.means_[cls], qda.covariances_[cls], colour)


# Fit LDA and QDA on both synthetic datasets and plot the decision regions
# with their covariance ellipses (figures 2i+1 and 2i+2 per dataset).
for i, (X, y) in enumerate([dataset_fixed_cov(), dataset_cov()]):
    # Linear Discriminant Analysis; store_covariance=True keeps the pooled
    # covariance matrix so plot_lda_cov can draw it.
    lda = LinearDiscriminantAnalysis(solver="svd", store_covariance=True)
    y_pred = lda.fit(X, y).predict(X)
    splot = plot_data(lda, X, y, y_pred, fig_index=2 * i + 1)
    plot_lda_cov(lda, splot)
    plt.axis('tight')

    # Quadratic Discriminant Analysis; per-class covariances are stored.
    qda = QuadraticDiscriminantAnalysis(store_covariance=True)
    y_pred = qda.fit(X, y).predict(X)
    splot = plot_data(qda, X, y, y_pred, fig_index=2 * i + 2)
    plot_qda_cov(qda, splot)
    plt.axis('tight')
# BUG FIX: the implicitly concatenated literals previously rendered as
# "...Quadratic DiscriminantAnalysis" (missing space between the parts).
plt.suptitle('Linear Discriminant Analysis vs Quadratic Discriminant '
             'Analysis')
plt.show()
Пример #27
0
def train_models(model_name, epoch=5, batch_size=100):
    """Train the estimator selected by *model_name* on the order-prediction
    features and periodically checkpoint it with joblib.

    Positive and negative action features are loaded from CSV, cleaned
    (NaN -> column median, +inf -> 100), shuffled, standardized, and split
    90/10 into train/test. The training set is then iterated in minibatches
    for *epoch* passes; every 20 batches the current model is scored and
    saved, and every 50 batches its test-set score is logged.

    NOTE(review): a *fresh* estimator is constructed for every minibatch and
    `.fit` is called on that single batch, so each saved checkpoint reflects
    only the most recent batch (none of these estimators use warm_start) —
    confirm this is the intended behaviour.

    NOTE(review): several constructor arguments used below (`presort`,
    `n_iter`, `store_covariances`, `min_impurity_split`, `normalize`,
    `residual_metric`) were removed in newer scikit-learn releases; this
    code targets an older version — verify against the pinned dependency.
    """
    log.info("current model:{}".format(model_name))
    pos = pd.read_csv(
        "Order_predicts/datasets/results/train/action_pos_features.csv")
    # Clean: NaN -> column median, +inf clamped to 100.
    posfillna = pos.fillna(pos.median()).replace(np.inf, 100)
    neg = pd.read_csv(
        "Order_predicts/datasets/results/train/action_neg_features.csv")
    negfillna = neg.fillna(neg.median()).replace(np.inf, 100)
    data = pd.concat([posfillna, negfillna])
    data = shuffle(data)
    del data['id']
    y = data['label']
    del data['label']
    # Standardize features and persist the scaled matrix for later reuse.
    scaler = preprocessing.StandardScaler().fit(data)
    X = scaler.transform(data)
    pd.DataFrame(X).to_csv("Order_predicts/datasets/results/scale_x.csv",
                           index=None)
    # NOTE(review): `data_scaled` is only used for the shape log below;
    # the train/test split is taken from `X`.
    data_scaled = preprocessing.scale(X)
    log.info("data shape: {}".format(data_scaled.shape))
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.1,
                                                        random_state=0)
    log.info("{}, {}".format(X_train.shape, X_test.shape))

    # `i` counts minibatches across all epochs and drives the periodic
    # scoring/checkpointing below.
    i = 0
    for e in range(epoch):
        for train_x, train_y in minibatches(X_train,
                                            y_train,
                                            batch_size=batch_size,
                                            shuffle=False):
            # Build the estimator named by `model_name`; unknown names fall
            # through to SGDRegressor in the final `else`.
            if model_name == 'svc':
                clf_weights = svm.SVC(C=1.0,
                                      kernel='rbf',
                                      degree=3,
                                      gamma='auto',
                                      coef0=0.0,
                                      shrinking=True,
                                      probability=False,
                                      tol=1e-3,
                                      cache_size=200,
                                      class_weight={1: 10},
                                      verbose=False,
                                      max_iter=-1,
                                      decision_function_shape='ovr',
                                      random_state=0)
            elif model_name == 'svr':
                clf_weights = svm.SVR(kernel='rbf',
                                      degree=3,
                                      gamma='auto',
                                      coef0=0.0,
                                      tol=1e-3,
                                      C=1.0,
                                      epsilon=0.1,
                                      shrinking=True,
                                      cache_size=200,
                                      verbose=False,
                                      max_iter=-1)
            elif model_name == 'lasso':
                clf_weights = Lasso(alpha=1.0,
                                    fit_intercept=True,
                                    normalize=False,
                                    precompute=False,
                                    copy_X=True,
                                    max_iter=1000,
                                    tol=1e-4,
                                    warm_start=False,
                                    positive=False,
                                    random_state=0,
                                    selection='cyclic')
            elif model_name == 'logistic':
                # Class weights skewed 0.1/0.9 to compensate for imbalance.
                clf_weights = LogisticRegression(penalty='l2',
                                                 dual=False,
                                                 tol=1e-4,
                                                 C=1.0,
                                                 fit_intercept=True,
                                                 intercept_scaling=1,
                                                 class_weight={
                                                     0: 0.1,
                                                     1: 0.9
                                                 },
                                                 random_state=0,
                                                 solver='newton-cg',
                                                 max_iter=100,
                                                 multi_class='ovr',
                                                 verbose=0,
                                                 warm_start=False,
                                                 n_jobs=1)
            elif model_name == 'mlpr':
                # learning_rate: {'constant', 'invscaling', 'adaptive'}
                clf_weights = MLPRegressor(hidden_layer_sizes=(100, ),
                                           activation="logistic",
                                           solver='adam',
                                           alpha=0.0001,
                                           batch_size='auto',
                                           learning_rate="constant",
                                           learning_rate_init=0.001,
                                           power_t=0.5,
                                           max_iter=200,
                                           shuffle=True,
                                           random_state=0,
                                           tol=1e-4,
                                           verbose=False,
                                           warm_start=False,
                                           momentum=0.9,
                                           nesterovs_momentum=True,
                                           early_stopping=False,
                                           validation_fraction=0.1,
                                           beta_1=0.9,
                                           beta_2=0.999,
                                           epsilon=1e-8)
            elif model_name == 'rf':
                clf_weights = RandomForestClassifier(
                    n_estimators=20,
                    criterion="entropy",
                    max_depth=None,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    min_weight_fraction_leaf=0.,
                    max_features="auto",
                    max_leaf_nodes=None,
                    min_impurity_decrease=0.,
                    min_impurity_split=None,
                    bootstrap=True,
                    oob_score=False,
                    n_jobs=1,
                    random_state=0,
                    verbose=0,
                    warm_start=False,
                    class_weight={
                        0: 0.1,
                        1: 0.9
                    })
            elif model_name == 'adaboost':
                # AdaBoost over a random-forest base estimator.
                # NOTE(review): `base_estimator1` below is built but never
                # used — presumably an alternative that was left in.
                base_estimator = RandomForestClassifier(
                    n_estimators=20,
                    criterion="entropy",
                    max_depth=None,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    min_weight_fraction_leaf=0.,
                    max_features="auto",
                    max_leaf_nodes=None,
                    min_impurity_decrease=0.,
                    min_impurity_split=None,
                    bootstrap=True,
                    oob_score=False,
                    n_jobs=1,
                    random_state=0,
                    verbose=0,
                    warm_start=False,
                    class_weight={
                        0: 0.1,
                        1: 0.9
                    })
                base_estimator1 = LogisticRegression(penalty='l2',
                                                     dual=False,
                                                     tol=1e-4,
                                                     C=1.0,
                                                     fit_intercept=True,
                                                     intercept_scaling=1,
                                                     class_weight={
                                                         0: 0.1,
                                                         1: 0.9
                                                     },
                                                     random_state=0,
                                                     solver='newton-cg',
                                                     max_iter=100,
                                                     multi_class='ovr',
                                                     verbose=0,
                                                     warm_start=False,
                                                     n_jobs=1)
                clf_weights = AdaBoostClassifier(base_estimator=base_estimator,
                                                 n_estimators=50,
                                                 learning_rate=0.6666,
                                                 algorithm='SAMME.R',
                                                 random_state=0)

            elif model_name == 'gbr':
                clf_weights = GradientBoostingRegressor(
                    loss='ls',
                    learning_rate=0.1,
                    n_estimators=100,
                    subsample=1.0,
                    criterion='friedman_mse',
                    min_samples_split=2,
                    min_samples_leaf=1,
                    min_weight_fraction_leaf=0.,
                    max_depth=3,
                    min_impurity_decrease=0.,
                    min_impurity_split=None,
                    init=None,
                    random_state=0,
                    max_features=None,
                    alpha=0.9,
                    verbose=0,
                    max_leaf_nodes=None,
                    warm_start=False,
                    presort='auto')
            elif model_name == 'qda':
                clf_weights = QuadraticDiscriminantAnalysis(
                    priors=None,
                    reg_param=0.,
                    store_covariance=False,
                    tol=1.0e-4,
                    store_covariances=None)
            elif model_name == 'lda':
                clf_weights = LinearDiscriminantAnalysis(
                    solver='svd',
                    shrinkage=None,
                    priors=None,
                    n_components=None,
                    store_covariance=False,
                    tol=1e-4)
            elif model_name == 'n_n':
                # NOTE(review): NearestNeighbors is unsupervised — it has no
                # predict/score, so the periodic scoring below would fail for
                # this choice; confirm it is ever selected.
                clf_weights = NearestNeighbors(n_neighbors=5,
                                               radius=1.0,
                                               algorithm='auto',
                                               leaf_size=30,
                                               metric='minkowski',
                                               p=2,
                                               metric_params=None,
                                               n_jobs=1)
            elif model_name == 'gnb':
                clf_weights = GaussianNB(priors=None)
            elif model_name == 'bnb':
                clf_weights = BernoulliNB(alpha=1.0,
                                          binarize=.0,
                                          fit_prior=True,
                                          class_prior=None)
            elif model_name == 'dcc':
                clf_weights = DecisionTreeClassifier(
                    criterion="gini",
                    splitter="best",
                    max_depth=None,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    min_weight_fraction_leaf=0.,
                    max_features=None,
                    random_state=0,
                    max_leaf_nodes=None,
                    min_impurity_decrease=0.,
                    min_impurity_split=None,
                    class_weight=None,
                    presort=False)
            elif model_name == 'dcr':
                clf_weights = DecisionTreeRegressor(
                    criterion="mse",
                    splitter="best",
                    max_depth=None,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    min_weight_fraction_leaf=0.,
                    max_features=None,
                    random_state=0,
                    max_leaf_nodes=None,
                    min_impurity_decrease=0.,
                    min_impurity_split=None,
                    presort=False)
            elif model_name == 'RAN':
                # RANSAC robust regression around a plain linear model.
                base_estimator = LinearRegression()
                clf_weights = RANSACRegressor(base_estimator=base_estimator,
                                              min_samples=None,
                                              residual_threshold=None,
                                              is_data_valid=None,
                                              is_model_valid=None,
                                              max_trials=100,
                                              max_skips=np.inf,
                                              stop_n_inliers=np.inf,
                                              stop_score=np.inf,
                                              stop_probability=0.99,
                                              residual_metric=None,
                                              loss='absolute_loss',
                                              random_state=0)
            elif model_name == 'adar':
                clf_weights = AdaBoostRegressor(base_estimator=None,
                                                n_estimators=50,
                                                learning_rate=1.,
                                                loss='linear',
                                                random_state=None)

            else:  # model_name == 'SGDR':
                clf_weights = SGDRegressor(loss="squared_loss",
                                           penalty="l2",
                                           alpha=0.0001,
                                           l1_ratio=0.15,
                                           fit_intercept=True,
                                           max_iter=None,
                                           tol=None,
                                           shuffle=True,
                                           verbose=0,
                                           epsilon=0.1,
                                           random_state=None,
                                           learning_rate="invscaling",
                                           eta0=0.01,
                                           power_t=0.25,
                                           warm_start=False,
                                           average=False,
                                           n_iter=None)

            # build
            # Fit on the current minibatch only (see NOTE in docstring).
            clf_weights.fit(train_x, train_y)
            i += 1

            # Every 20 batches: log test MSE and mean CV score on the batch,
            # then checkpoint the model to disk.
            # NOTE(review): mean_squared_error is also applied to the
            # classifiers' label predictions here — confirm that is intended.
            if i % 20 == 0:
                mse = mean_squared_error(y_test, clf_weights.predict(X_test))
                log.info("均方误差:{}".format(mse))
                avgscores = cross_val_score(clf_weights, train_x,
                                            train_y).mean()
                log.info("{}/{} 训练集得分平均值: {}".format(e, i, avgscores))
                model_path = os.path.join(
                    "Order_predicts/datasets/results/models",
                    '{}'.format(model_name))
                if not os.path.exists(model_path):
                    os.makedirs(model_path)
                joblib.dump(
                    clf_weights,
                    os.path.join(model_path, "{}_{}.model".format(e, i)))
                log.info(" Save ")

            # Every 50 batches: log the held-out score.
            if i % 50 == 0:
                scores = clf_weights.score(X_test, y_test)
                log.info("验证得分: {}".format(scores))
Пример #28
0
        print(confusion_matrix(y_test_folds1, y_pred1))
        i=i+1
        if i == 3:
            break
    break

# =============================================================================
# [[42538   462]
#  [ 2369   255]] # not good enough yet
# =============================================================================




#### QuadraticDiscriminantAnalysis
qda_clf = QuadraticDiscriminantAnalysis()

# Bug fix: `random_state` has no effect on StratifiedKFold unless
# shuffle=True is passed (and scikit-learn >= 0.24 raises a ValueError for
# random_state without shuffle). Shuffling with the fixed seed keeps the
# splits reproducible, which is clearly what the seed was intended for.
skfolds = StratifiedKFold(n_splits = 20, shuffle=True, random_state=77)
skfolds1 = StratifiedKFold(n_splits = 3, shuffle=True, random_state=77)

for train_index, test_index in skfolds.split(x_train, y_train):
    clone_clf = clone(qda_clf)
    x_train_folds = x_train[train_index]
    y_train_folds = y_train[train_index]
    x_test_folds = x_train[test_index]
    y_test_folds = y_train[test_index]
    for train_index1, test_index1 in skfolds1.split(x_test_folds, y_test_folds):
        clone_clf = clone(qda_clf)
        x_train_folds1 = x_test_folds[train_index1]
        y_train_folds1 = y_test_folds[train_index1]
        x_test_folds1 = x_test_folds[test_index1]
Пример #29
0
# NOTE: Python 2 syntax (bare `print` statements) throughout this example.
# Fit the grid search (`clf` is configured above this view) and show the
# best hyper-parameters it found.
clf.fit(X_train, Y_train)
print clf.best_params_
#for score in clf.grid_scores_:
#    print score

# AdaBoost
print
print "AdaBoost"
# Re-train AdaBoost using the best learning rate from the grid search and
# report its hold-out accuracy.
rate = clf.best_params_['learning_rate']
classifier = AdaBoostClassifier(learning_rate=rate)
classifier.fit(X_train, Y_train)
acc = classifier.score(X_test, Y_test)
print "Accuracy:", acc

print
print "Compare models"
# Fit each candidate model with default hyper-parameters and print its
# test accuracy; the model's class name is extracted by slicing
# str(classifier) up to the first '('.
for classifier in [
        DecisionTreeClassifier(),
        LogisticRegression(),
        LinearDiscriminantAnalysis(),
        QuadraticDiscriminantAnalysis(),
        KNeighborsClassifier(),
        AdaBoostClassifier(),
        BaggingClassifier(),
        RandomForestClassifier(),
]:
    print str(classifier)[:str(classifier).find('(')],
    classifier.fit(X_train, Y_train)
    pred = classifier.predict(X_test)
    acc = metrics.accuracy_score(Y_test, pred)
    print "Accuracy:", acc
Пример #30
0
recall_2 = recall(predict_2, t)
recall_3 = recall(predict_3, t)

# Report precision/recall at each of the three decision thresholds.
print('P(C = 1|x) = 0.05, precision: ' + str(precision_1) + ', recall: ' + str(recall_1))
print('P(C = 1|x) = 0.5, precision: ' + str(precision_2) + ', recall: ' + str(recall_2))
print('P(C = 1|x) = 0.6, precision: ' + str(precision_3) + ', recall: ' + str(recall_3))

#4
#a
# Imbalanced sample: 1000 class-0 points vs 500 class-1 points, with a
# strong negative correlation (cov1 = -0.9) in class 1.
X, t = gen_data(mu0=(1, 1), mu1=(2, 2), cov0=0, cov1=-0.9, N0=1000, N1=500)
X_repeat, t_repeat = X, t  ## used in 4e

# Fit QDA on the full sample and report training accuracy.
clf = QuadraticDiscriminantAnalysis()
model = clf.fit(X, t)
accuracy4a = clf.score(X, t)
print('accuracy: ' + str(accuracy4a))

fig_4a = plt.figure()
fig_4a.suptitle('Question 4(a): Decision boundary and contours')

# Color each point by class: 0 -> red, 1 -> blue.
colors = ['red' if label == 0 else 'blue' for label in t]
X, y = X.T
plt.scatter(X, y, c=colors, s=2)
bonnerlib2.dfContour(clf)