Example #1
def analyze_logistic(X,
                     y,
                     model,
                     scale_columns,
                     analyze_params=False,
                     balance_outcomes=False):
    """
    Function for doing analysis of logistic regression. Plots cumulative gain, confusion matrix
    and grid search of optimal learning rate/epochs in SGD with k-fold CV (optional).
    Performs scaling of all continuous features in the data set.

    Inputs:
    - X: design matrix, shape (n, p)
    - y: targets, shape (n,)
    - scale_columns: list of indices of which columns to MinMax scale
    - analyze_params: boolean, option to perform grid search of learning rate and n_epochs in SGD
    - balance_outcomes: boolean, option to balance training data in case of skewed classes
    """

    #split data in train/validate and test
    X_train_val, X_test, y_train_val, y_test = train_test_split(X,
                                                                y,
                                                                test_size=0.1)

    #balance training set such that outcomes are 50/50 in training data
    if balance_outcomes:
        non_default_inds = np.where(y_train_val == 0)[0]
        default_inds = np.where(y_train_val == 1)[0]

        remove_size = len(non_default_inds) - len(default_inds)
        remove_inds = np.random.choice(non_default_inds,
                                       size=remove_size,
                                       replace=False)

        X_train_val = np.delete(X_train_val, remove_inds, axis=0)
        y_train_val = np.delete(y_train_val, remove_inds, axis=0)
    #end if

    #scale continuous features
    minmaxscaler = MinMaxScaler(feature_range=(-1, 1))
    scaler = ColumnTransformer(remainder='passthrough',
                               transformers=[('minmaxscaler', minmaxscaler,
                                              scale_columns)])

    #scale only test data at this point (CV scales training/validation)
    scaler.fit(X_train_val)
    X_test = scaler.transform(X_test)

    if analyze_params:

        #initialize vectors for saving results
        error_scores = pd.DataFrame(
            columns=['log eta', 'n_epochs', 'mse', 'r2', 'accuracy'])
        n_etas = 4
        eta_vals = np.linspace(-1, -4, n_etas)
        n_epoch_vals = np.array([10, 100, 500, 1000])
        n_epochs = len(n_epoch_vals)
        accuracy_scores = np.zeros((n_etas, n_epochs))

        max_accuracy = 0
        best_eta = 0
        best_n_epochs = 0

        #perform grid search of best learning rate
        #and number of epochs with k-fold cross-validation
        i = 0
        for eta in eta_vals:
            model.set_eta(10**eta)

            j = 0
            for epoch in n_epoch_vals:
                model.set_n_epochs(epoch)

                #perform cross validation
                mse, r2, accuracy = CV(X_train_val, y_train_val, model)
                accuracy_scores[i, j] = accuracy

                #DataFrame.append was removed in pandas 2.0; use pd.concat instead
                error_scores = pd.concat([
                    error_scores,
                    pd.DataFrame([{
                        'log eta': eta,
                        'n_epochs': epoch,
                        'mse': mse,
                        'r2': r2,
                        'accuracy': accuracy
                    }])
                ], ignore_index=True)

                #check if current configuration is better
                if accuracy > max_accuracy:
                    max_accuracy = accuracy
                    best_eta = eta
                    best_n_epochs = epoch

                j += 1
                #end for epoch
            i += 1
            #end for eta

        #set optimal model parameters
        model.set_eta(10**best_eta)
        model.set_n_epochs(best_n_epochs)

        #plot heatmap of grid search
        acc_table = pd.pivot_table(error_scores,
                                   values='accuracy',
                                   index=['log eta'],
                                   columns='n_epochs')
        #locate the cell with the best accuracy to highlight on the heatmap
        idx_i, idx_j = np.unravel_index(np.argmax(acc_table.values),
                                        acc_table.values.shape)

        fig = plt.figure()
        ax = sns.heatmap(acc_table,
                         annot=True,
                         fmt='.2g',
                         cbar=True,
                         linewidths=1,
                         linecolor='white',
                         cbar_kws={'label': 'Accuracy'})

        ax.add_patch(
            Rectangle((idx_j, idx_i), 1, 1, fill=False, edgecolor='red', lw=2))
        ax.set_xlabel('Number of epochs')
        ax.set_ylabel(r'log$_{10}$ of Learning rate')

        bottom, top = ax.get_ylim()
        ax.set_ylim(bottom + 0.5, top - 0.5)
        plt.show()
    #end if

    #scale training data
    X_train_val = scaler.transform(X_train_val)

    #pylearn model
    model.fit(X_train_val, y_train_val)
    pred_train = model.predict(X_train_val)
    pred_test = model.predict(X_test)

    #sklearn model
    clf = linear_model.LogisticRegressionCV()
    clf.fit(X_train_val, y_train_val)
    pred_skl = clf.predict(X_test)

    #get accuracy scores
    accuracy_on_test = accuracy_score(y_test, pred_test)
    accuracy_on_train = accuracy_score(y_train_val, pred_train)
    accuracy_skl = accuracy_score(y_test, pred_skl)

    #predict
    pred_train_prob = model.predict(X_train_val, probability=True)
    pred_test_prob = model.predict(X_test, probability=True)

    #get area ratio and plot cumulative gain
    area_ratio_train = cumulative_gain_area_ratio(y_train_val,
                                                  pred_train_prob,
                                                  title='Training results')
    area_ratio_test = cumulative_gain_area_ratio(y_test,
                                                 pred_test_prob,
                                                 title=None)
    plt.show()

    #plot confusion matrix
    ax1 = plot_confusion_matrix(y_test,
                                pred_test,
                                normalize=True,
                                cmap='Blues',
                                title=' ')
    ax2 = plot_confusion_matrix(y_train_val,
                                pred_train,
                                normalize=True,
                                cmap='Blues',
                                title='Training data')

    bottom, top = ax1.get_ylim()
    ax1.set_ylim(bottom + 0.5, top - 0.5)
    ax2.set_ylim(bottom + 0.5, top - 0.5)

    plt.show()

    #print some stats
    print('===accuracy and area ratio stats===')
    print('accuracy on test:', accuracy_on_test)
    print('accuracy on train:', accuracy_on_train)
    print('accuracy skl:', accuracy_skl)
    print('area ratio train:', area_ratio_train)
    print('area ratio test:', area_ratio_test)

    if analyze_params:
        print('===grid search stats===')
        print('max accuracy:', max_accuracy)
        print('eta:', best_eta)
        print('n_epochs:', best_n_epochs)
Example #2
    m = model_info['model']

    m.fit(X_train, y_train)

    p_train = m.predict(X_train)
    p_test = m.predict(X_test)

    model_info['train'] = accuracy_score(y_train, p_train)
    model_info['test'] = accuracy_score(y_test, p_test)

    dump(m, model_info['nome'] + '.joblib')
    del model_info['model']

# performance chart
df = pd.DataFrame(models_info)
#df.plot.bar(x='nome')
#plt.show()

# reload a saved model
model = load('tree.joblib')
new_example = [[0.0, 0.1, 2.4, 2.4]]
p = model.predict(new_example)
print('Prediction', dataset['target_names'][p])

# load kNN and produce the confusion matrix
knn = load('kNN.joblib')
p_test = knn.predict(X_test)
plot_confusion_matrix(y_test, p_test)
plt.show()
Example #3
 def test_array_like(self):
     plot_confusion_matrix([0, 'a'], ['a', 0])
     plot_confusion_matrix([0, 1], [1, 0])
     plot_confusion_matrix(['b', 'a'], ['a', 'b'])
Example #4
def models():
    data = request.get_json()

    if data['model'] == 'svm':
        training = pd.read_csv('training.csv')
        testing = pd.read_csv('testing.csv')

        x_train = training.drop(columns=['target'])
        y_train = training['target']

        x_test = testing.drop(columns=['target'])
        y_test_actual = testing['target']

        classifier = SVC(kernel=data['kernel'])
        classifier.fit(x_train, y_train)

        y_test_obtained = classifier.predict(x_test)

        testing['obtained'] = y_test_obtained
        testing.to_csv('result.csv', index=False)

        plot_confusion_matrix(y_test_actual, y_test_obtained)
        plt.savefig('figure.png')

    if data['model'] == 'knn':
        training = pd.read_csv('training.csv')
        testing = pd.read_csv('testing.csv')

        x_train = training.drop(columns=['target'])
        y_train = training['target']

        x_test = testing.drop(columns=['target'])
        y_test_actual = testing['target']

        classifier = KNeighborsClassifier(n_neighbors=data['k'])
        classifier.fit(x_train, y_train)

        y_test_obtained = classifier.predict(x_test)

        testing['obtained'] = y_test_obtained
        testing.to_csv('result.csv', index=False)

        plot_confusion_matrix(y_test_actual, y_test_obtained)
        plt.savefig('figure.png')

    if data['model'] == 'logistic_regression':
        training = pd.read_csv('training.csv')
        testing = pd.read_csv('testing.csv')

        x_train = training.drop(columns=['target'])
        y_train = training['target']

        x_test = testing.drop(columns=['target'])
        y_test_actual = testing['target']

        classifier = LogisticRegression(random_state=0,
                                        solver="liblinear",
                                        penalty=data['penalty'])
        classifier.fit(x_train, y_train)

        y_test_obtained = classifier.predict(x_test)

        testing['obtained'] = y_test_obtained
        testing.to_csv('result.csv', index=False)

        plot_confusion_matrix(y_test_actual, y_test_obtained)
        plt.savefig('figure.png')

    if data['model'] == 'linear_regression':
        training = pd.read_csv('training.csv')
        testing = pd.read_csv('testing.csv')

        x_train = training.drop(columns=['target'])
        y_train = training['target']

        x_test = testing.drop(columns=['target'])
        y_test_actual = testing['target']

        classifier = LinearRegression()
        classifier.fit(x_train, y_train)

        y_test_obtained = classifier.predict(x_test)

        testing['obtained'] = y_test_obtained
        testing.to_csv('result.csv', index=False)

        plt.scatter(x_train, y_train, color='blue')
        plt.scatter(x_test, y_test_actual, color='green')
        plt.plot(x_test, y_test_obtained, color='red')
        plt.title(
            "MSE: " +
            str(metrics.mean_squared_error(y_test_actual, y_test_obtained)))
        plt.legend(['Training Data', 'Actual Testing Data', 'Fitted Line'])
        plt.savefig('figure.png')

    if data['model'] == 'nn':
        training = pd.read_csv('training.csv')
        testing = pd.read_csv('testing.csv')

        x_train = training.drop(columns=['target'])
        y_train = training['target']

        x_test = testing.drop(columns=['target'])
        y_test_actual = testing['target']

        model = Sequential()
        model.add(
            Dense(data['layers_dims'][0],
                  activation=data['layers_activation'][0],
                  input_shape=(x_train.shape[1], )))
        for i in range(1, len(data['layers_dims'])):
            model.add(
                Dense(data['layers_dims'][i],
                      activation=data['layers_activation'][i]))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mean_squared_error')
        model.fit(x_train, y_train, validation_split=0.2, epochs=30)

        y_test_obtained = model.predict(x_test)

        testing['obtained'] = y_test_obtained
        testing.to_csv('result.csv', index=False)

    return jsonify([]), 200
Example #5

predictions = keras_model.predict_classes(X_test)

for i in range(len(X_test)):
	print('(Prediction %d) => (expected %s)' % (predictions[i], y_test[i]))
print("Accuracy Train: %.2f%%" % (score1[1]*100))
print("Accuracy Test: %.2f%%" % (score[1]*100))

print(history.history.keys())
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

#print(confusion_matrix(y_test, predictions))
plot_confusion_matrix(y_test2, y_pred, cmap=plt.cm.Oranges)
plt.show()
Example #6
 def test_string_classes(self):
     np.random.seed(0)
     clf = LogisticRegression()
     clf.fit(self.X, convert_labels_into_string(self.y))
     preds = clf.predict(self.X)
     plot_confusion_matrix(convert_labels_into_string(self.y), preds)
Example #7
def create_booster_summary(
    booster: Union[lgb.Booster, lgb.sklearn.LGBMModel],
    log_importances: bool = True,
    max_num_features: int = 10,
    list_trees: list = None,
    log_trees_as_dataframe: bool = True,
    log_pickled_booster: bool = True,
    log_trees: bool = False,
    tree_figsize: int = 30,
    log_confusion_matrix: bool = False,
    y_true: np.ndarray = None,
    y_pred: np.ndarray = None,
):
    """Create model summary after training that can be assigned to the run namespace.

    See guide with examples in the `Neptune-LightGBM docs`_.

    You can log multiple types of metadata:
        - pickled model
        - feature importance chart
        - visualized trees
        - trees represented as DataFrame
        - confusion matrix (only for classification problems)

    See the Args section for more info on how to parametrize the behaviour of this function.

    Note:
        You can log the summary to a new run, or to the same run that you used for logging model training.
        The second option can be very useful because you have all the information in a single run.

    Args:
        booster (:obj:`lgb.Booster` or :obj:`lgb.sklearn.LGBMModel`): Trained LightGBM model.
        log_importances (bool): Defaults to True. Log feature importance charts.
        max_num_features (int): Defaults to 10. Max number of top features on the importance charts.
            Works only if ``log_importances`` is set to ``True``.
            If None or <1, all features will be displayed.
            See `lightgbm.plot_importance`_ for details.
        list_trees (list): Defaults to None. Indices of the target tree to visualize.
            Works only if ``log_trees`` is set to ``True``.
            See `lightgbm.plot_tree`_ for details.
        log_trees_as_dataframe (bool): Defaults to True.
            Parse the model and log trees in the easy-to-read pandas DataFrame format.
            Works only for ``lgb.Booster``.
            See `lightgbm.Booster.trees_to_dataframe`_ for details.
        log_pickled_booster (bool): Defaults to True. Log model as pickled file.
        log_trees (bool): Defaults to False. Log visualized trees.
            This requires graphviz to work. Learn about setup in the `Neptune-LightGBM installation`_ docs.
        tree_figsize (int): Defaults to 30. Controls the size of the visualized tree image.
            Increase this if you work with large trees.
            Works only if ``log_trees`` is set to ``True``.
        log_confusion_matrix (bool): Defaults to False. Log confusion matrix.
            If set to True, you need to pass ``y_true`` and ``y_pred``.
        y_true (:obj:`np.ndarray`): Defaults to None. True labels on the test set.
            Needed only if ``log_confusion_matrix`` is set to True.
        y_pred (:obj:`np.ndarray`): Defaults to None. Predictions on the test set.
            Needed only if ``log_confusion_matrix`` is set to True.

    Returns:
        dict: Python dictionary with all metadata that can be assigned to the run namespace.
            ``run["booster_summary"] = create_booster_summary(...)``

    Examples:
        For more examples visit `example scripts`_.

        Full script that does logging during model training and logs booster summary after training::

            import lightgbm as lgb
            import neptune.new as neptune
            import numpy as np
            from neptune.new.integrations.lightgbm import NeptuneCallback, create_booster_summary
            from sklearn.datasets import load_digits
            from sklearn.model_selection import train_test_split

            # Create run
            run = neptune.init(
                project="common/lightgbm-integration",
                api_token="ANONYMOUS",
                name="train-cls",
                tags=["lgbm-integration", "train", "cls"]
            )

            # Create neptune callback
            neptune_callback = NeptuneCallback(run=run)

            # Prepare data
            X, y = load_digits(return_X_y=True)
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
            lgb_train = lgb.Dataset(X_train, y_train)
            lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

            # Define parameters
            params = {
                "boosting_type": "gbdt",
                "objective": "multiclass",
                "num_class": 10,
                "metric": ["multi_logloss", "multi_error"],
                "num_leaves": 21,
                "learning_rate": 0.05,
                "feature_fraction": 0.9,
                "bagging_fraction": 0.8,
                "bagging_freq": 5,
                "max_depth": 12,
            }

            # Train the model and log metadata to the run in Neptune
            gbm = lgb.train(
                params,
                lgb_train,
                num_boost_round=200,
                valid_sets=[lgb_train, lgb_eval],
                valid_names=["training", "validation"],
                callbacks=[neptune_callback],
            )

            y_pred = np.argmax(gbm.predict(X_test), axis=1)

            # Log summary metadata to the same run under the "lgbm_summary" namespace
            run["lgbm_summary"] = create_booster_summary(
                booster=gbm,
                log_trees=True,
                list_trees=[0, 1, 2, 3, 4],
                log_confusion_matrix=True,
                y_pred=y_pred,
                y_true=y_test
            )

    .. _Neptune-LightGBM docs:
        https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightgbm
    .. _lightgbm.plot_importance:
        https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.plot_importance.html#lightgbm-plot-importance
    .. _lightgbm.plot_tree:
        https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.plot_tree.html#lightgbm-plot-tree
    .. _lightgbm.Booster.trees_to_dataframe:
        https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.Booster.html#lightgbm.Booster.trees_to_dataframe
    .. _Neptune-LightGBM installation:
        https://docs.neptune.ai/integrations-and-supported-tools/model-training/lightgbm#install-requirements
    .. _example scripts:
        https://github.com/neptune-ai/examples/tree/main/integrations-and-supported-tools/lightgbm/scripts

    """
    results_dict = {}
    visuals_path = "visualizations/"
    if log_importances:
        split_plot = lgb.plot_importance(booster,
                                         importance_type="split",
                                         title="Feature importance (split)",
                                         max_num_features=max_num_features)
        gain_plot = lgb.plot_importance(booster,
                                        importance_type="gain",
                                        title="Feature importance (gain)",
                                        max_num_features=max_num_features)
        results_dict["{}feature_importances/split".format(visuals_path)] \
            = neptune.types.File.as_image(split_plot.figure)
        results_dict["{}feature_importances/gain".format(visuals_path)] \
            = neptune.types.File.as_image(gain_plot.figure)

    if log_trees:
        try:
            subprocess.call(["dot", "-V"],
                            stdout=subprocess.DEVNULL,
                            stderr=subprocess.DEVNULL)
        except OSError:
            log_trees = False
            message = "Graphviz executables not found, so trees will not be logged. " \
                      "Make sure the Graphviz executables are on your systems' PATH"
            warnings.warn(message)

    if log_trees:
        trees_series = []
        for i in list_trees:
            digraph = lgb.create_tree_digraph(booster,
                                              tree_index=i,
                                              show_info="data_percentage")
            _, ax = plt.subplots(1, 1, figsize=(tree_figsize, tree_figsize))
            s = BytesIO()
            s.write(digraph.pipe(format="png"))
            s.seek(0)
            ax.imshow(image.imread(s))
            ax.axis("off")
            trees_series.append(neptune.types.File.as_image(ax.figure))
        results_dict["{}trees".format(
            visuals_path)] = neptune.types.FileSeries(trees_series)

    if log_trees_as_dataframe:
        if isinstance(booster, lgb.Booster):
            df = booster.trees_to_dataframe()
            html_df = neptune.types.File.as_html(df)
            results_dict["trees_as_dataframe"] = html_df
            if not df.empty and not html_df.content:
                warnings.warn(
                    "'trees_as_dataframe' wasn't logged. Probably generated dataframe was to large."
                )
        else:
            warnings.warn(
                "'trees_as_dataframe' won't be logged."
                " `booster` must be instance of `lightgbm.Booster` class.")

    if log_pickled_booster:
        results_dict["pickled_model"] = neptune.types.File.as_pickle(booster)

    if log_confusion_matrix:
        ax = plot_confusion_matrix(y_true=y_true, y_pred=y_pred)
        results_dict[
            f"{visuals_path}confusion_matrix"] = neptune.types.File.as_image(
                ax.figure)

    return results_dict
Example #8
# Avoid using 'accuracy' (*even if balanced classes*) - https://stats.stackexchange.com/q/312780

# Brier score (`brier_score_loss`)
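# Minimal sketch, assuming y_test and predicted positive-class probabilities y_prob are already available:
from sklearn.metrics import brier_score_loss
brier_score_loss(y_test, y_prob)  # mean squared error of the predicted probabilities; lower is better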

# Confusion Matrix

### sklearn

from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

### skplot

from scikitplot.metrics import plot_confusion_matrix
plot_confusion_matrix(y_test, y_pred, normalize=False)

### Yellowbrick

from yellowbrick.classifier import ConfusionMatrix

conf_matrix = ConfusionMatrix(rf,
                              classes=cancer.target_names,
                              label_encoder={
                                  0: 'benign',
                                  1: 'malignant'
                              })
conf_matrix.fit(X_train, y_train)
conf_matrix.score(X_test, y_test)
conf_matrix.poof()
Example #9
xAxisTitle = "Sig_Prob"
yAxisTitle = "# Events (scaled to 1)"
outPlotName = "Discriminator_Beautified.pdf"
pB.plotBeautifier(histList, labelList, xAxisTitle, yAxisTitle, outPlotName)

# In[27]:

type((sigTrainPredict[:, 0]).astype(float))

# In[12]:

# Confusion Matrix
plt.clf()
plt.figure(figsize=(10, 7))
plot_confusion_matrix(
    np.array(yTest)[:, 0],
    np.array(xTestPredict)[:, 0] > 0.5)
plt.savefig("ConfusionMatrix.pdf")
print("Confusion Matrix printed!!!")

# In[15]:

# For plots based on discriminator cut

sigProb = np.arange(0, 1, 0.01)
xTestPredictSigProb = np.array(xTestPredict)[:, 0]
yTestClass = np.array(yTest)[:, 0]

tpr = []
fpr = []
Example #10
print( df['sport'].unique() )
print( df['altezza'].describe() )

df.plot.scatter(x='altezza', y='peso')
df.plot.bar()


# SCIKIT

dataset = load_wine()

X = dataset['data']
y = dataset['target']

X_train, X_test, y_train, y_test = train_test_split(X, y)

model = LogisticRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

print( 'accuracy', accuracy_score(y_test, predictions) )
plot_confusion_matrix(y_test, predictions)


plt.show()




Example #11
################################################################################
##
##  Evaluate the better model on valid
EvaluateTheBetter = {}
EvaluateTheBetter["Model"] = TheBetterModel
##
## AUC score
EvaluateTheBetter["Probability"] = EvaluateTheBetter["Model"].predict([Valid["Image"], Valid["Variable"]])
EvaluateTheBetter["AUC"]         = roc_auc_score(numpy.array(Valid["Label"]), EvaluateTheBetter["Probability"][:,1])
##
##  plot AUC
AUC = plot_roc(y_true=Valid["Label"], y_probas=EvaluateTheBetter["Probability"], plot_micro=False, plot_macro=False, classes_to_plot=[1]).get_figure()
AUC.savefig(ResultPath + Time + "/AUC.png")
EvaluateTheBetter["Prediction"]   = numpy.argmax(EvaluateTheBetter["Probability"], axis=1)
EvaluateTheBetter["ConfuseTable"] = confusion_matrix(Valid["Label"], EvaluateTheBetter["Prediction"])
ConfusionTable = plot_confusion_matrix(y_true = Valid["Label"], y_pred = EvaluateTheBetter["Prediction"]).get_figure()
ConfusionTable.savefig(ResultPath + Time + "/ConfusionTable.png")
##
##  Threshold table
_, _, Threshold = roc_curve(numpy.array(Valid["Label"]), EvaluateTheBetter["Probability"][:,1])
ThresholdTable = {"Threshold":[], "Accuracy":[], "Sensitivity":[], "Specificity":[], "Precision":[]}
for threshold in Threshold:
    prediction = EvaluateTheBetter["Probability"][:,1] > threshold
    accuracy   = accuracy_score(Valid["Label"], prediction)
    confuse    = confusion_matrix(Valid["Label"], prediction)
    sensitivity = confuse[1,1]/sum(confuse[1,:])
    specificity = confuse[0,0]/sum(confuse[0,:])
    precision   = confuse[1,1]/sum(confuse[:,1])  #TP / (TP + FP): column sum, not row sum
    ThresholdTable["Threshold"].append(threshold)
    ThresholdTable["Accuracy"].append(accuracy)
    ThresholdTable["Sensitivity"].append(sensitivity)
Example #12
 def test_hide_counts(self):
     np.random.seed(0)
     clf = LogisticRegression()
     clf.fit(self.X, self.y)
     preds = clf.predict(self.X)
     plot_confusion_matrix(self.y, preds, hide_counts=True)
Example #13
print('recall(macro): ' +
      str(recall_score(groundTruthList, predictionList, average='macro')))
print('recall(micro): ' +
      str(recall_score(groundTruthList, predictionList, average='micro')))
fs.write('recall(macro): ' +
         str(recall_score(groundTruthList, predictionList, average='macro')) +
         '\n')
fs.write('recall(micro): ' +
         str(recall_score(groundTruthList, predictionList, average='micro')) +
         '\n')

print('f1(macro): ' +
      str(f1_score(groundTruthList, predictionList, average='macro')))
print('f1(micro): ' +
      str(f1_score(groundTruthList, predictionList, average='micro')))
fs.write('f1(macro): ' +
         str(f1_score(groundTruthList, predictionList, average='macro')) +
         '\n')
fs.write('f1(micro): ' +
         str(f1_score(groundTruthList, predictionList, average='micro')) +
         '\n')

print(confusion_matrix(groundTruthList, predictionList))

plot_confusion_matrix(groundTruthList, predictionList)
matrixFileName = dirName + '/confusionMatrix.png'
plt.savefig(matrixFileName)
#plt.show()
fs.close()
Example #14
         density=True,
         color=None,
         histtype='step',
         label='bkg')
plt.xlabel("Sig_Prob")
plt.ylabel("#Events (Area scaled to 1)")
plt.legend()
plt.savefig("Discriminator.pdf")
print("Discriminator plotted!!!")

# Confusion matrix
threshold = 0.5
pred_class = pred_one > threshold
plt.clf()
plt.figure(figsize=(10, 7))
plot_confusion_matrix(labl, pred_class)
plt.savefig("ConfusionMatrix.pdf")
print("Confusion Matrix printed!!!")

# F1 Score
nofvalues = 100
thresh = np.empty(nofvalues)
f1score = np.empty(nofvalues)

plt.clf()
for beta in [0.1, 0.25, 0.5, 1, 1.25, 1.5, 2, 5, 10]:
    for i in range(100):
        thresh[i] = i * 1.0 / 100
        pred_class = pred_one > thresh[i]
        f1score[i] = fbeta_score(labl, pred_class, beta=beta)
Example #15
    def classify(self, X, type: str, classifier: str, test_prop: float,
                 res: bool = None, res_method: str = None):

        if type == 'binary':
            y = self.df['class'].replace(0, 1)
        elif type == 'multi':
            y = self.df['class']
        else:
            raise TypeError("Choose a proper type of classification")

        X_train, X_test, Y_train, Y_test = train_test_split(
            X, y, test_size=test_prop, stratify=y)

        if res == True:
            if res_method == 'down':
                nm = NearMiss()
                X_res, Y_res = nm.fit_resample(X_train, Y_train)
            elif res_method == 'up':
                sm = ADASYN()
                X_res, Y_res = sm.fit_resample(X_train, Y_train)
            else:
                raise TypeError(
                    "Resampling method not provided. Please use 'up' for oversampling or 'down' for undersampling."
                )

        if classifier == 'lr':
            model = LogisticRegression(solver='liblinear',
                                       class_weight='balanced',
                                       C=0.04,
                                       penalty='l2')
        elif classifier == 'svc':
            model = LinearSVC(C=0.004, penalty='l2')
        elif classifier == 'rf':
            n_est = int(
                input("Type in number of trees to estimate from: ").strip())
            model = RandomForestClassifier(n_estimators=n_est,
                                           bootstrap=True,
                                           max_depth=5)
        elif classifier == 'xgb':
            n_est = int(
                input("Type in number of trees to estimate from: ").strip())
            model = XGBClassifier(n_estimators=n_est,
                                  bootstrap=True,
                                  max_depth=5,
                                  reg_lambda=0.4)
        elif classifier == 'ada':
            n_est = int(
                input("Type in number of trees to estimate from: ").strip())
            model = AdaBoostClassifier(n_estimators=n_est, learning_rate=0.005)
        else:
            raise TypeError(
                "Choose a proper classifier. Possible inputs: 'lr', 'svc', 'rf', 'xgb', 'ada' ."
            )

        if res == True:
            model.fit(X_res, Y_res)
        else:
            model.fit(X_train, Y_train)

        Y_pred = model.predict(X_test)

        # Accuracy Percentage
        print(f"Accuracy is {round(accuracy_score(Y_test, Y_pred), 2)*100}%")

        # Classification Report
        print(classification_report(Y_test, Y_pred))

        # Matthew's Correlation Coefficient
        print(
            f"Matthew's Correlation Coefficient is {matthews_corrcoef(Y_test, Y_pred)}"
        )

        # Plots of Confusion Matrix and ROC Curve
        plot_confusion_matrix(Y_test, Y_pred, figsize=(10, 10))

        return model
Example #16
Y = df.iloc[:,2]

#labelencoder_Y = LabelEncoder()
#Y= labelencoder_Y.fit_transform(y)
#Y=onehotencoder=pd.get_dummies(y)
#df = pd.read_csv('open_palm_simi_test1.csv') 
#X_test= df.iloc[0:,:].values 

from sklearn.metrics import confusion_matrix
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.33,random_state=42,shuffle=False)
# Run classifier, using a model that is too regularized (C too low) to see
# the impact on the results
model = SVC(kernel = 'linear', C = 1).fit(X_train,y_train) 
predictions = model.predict(X_test) 
# creating a confusion matrix 
cm = plot_confusion_matrix(y_test, predictions) 
########################################
#file1 = open("result.txt","w+") 
#file1.write(predictions[1]) #this will give predicted guesture as output to the text file
#file1.close()

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 10]
labels = ['fist_palm','fist_dorsal','open_palm','open_dorsal','three_fingers_dorsal','three_fingers_palm']
sk.metrics.plot_confusion_matrix(model, 
                                 X_test, 
                                 y_test, 
                                 normalize="pred",
                                 display_labels = labels
                                )
plt.show()
Example #17
dataset = pd.read_json("./data.json")
#520

x = np.array([])
y = np.array([])

for value in enumerate(dataset["LIGHT_VALUE"]):
    x = np.append(x, int(value[1]))

for value in enumerate(dataset["NEED_LIGHT"]):
    y = np.append(y, int(value[1]))

x = x.reshape(-1, 1)
y = y.reshape(-1, 1)

x_train, x_test, y_train, y_test = train_test_split(x, y)

model = DecisionTreeClassifier()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

print("Accuratezza test")
print(str(accuracy_score(y_test, y_pred) * 100) + "%")

plot_confusion_matrix(y_test, y_pred)
plt.show()

y_pred = model.predict([[400]])
print(bool(y_pred[0]))
Example #18
test_y = y.values[test_ind]

train_X_2 = train_X[:, selector.support_]
test_X_2 = test_X[:, selector.support_]

clsf_1 = RandomForestClassifier(n_estimators=500,
                                class_weight="balanced",
                                n_jobs=-1)
clsf_1.fit(train_X, train_y)
RF_pred = clsf_1.predict(test_X)
pred_probability = clsf_1.predict_proba(test_X)

evaluation_metrics(test_y, RF_pred, pred_probability)

_now = dt.now().strftime("%Y%m%d-%H-%M-%S")
plot_confusion_matrix(test_y, RF_pred)
plt.savefig("{}_result_before_rfe.png".format(_now))
plot_roc(test_y, pred_probability)
plt.savefig("{}_roc_result_before_rfe.png".format(_now))

clsf_2 = RandomForestClassifier(n_estimators=500,
                                class_weight="balanced",
                                n_jobs=-1)
clsf_2.fit(train_X_2, train_y)
RF_pred = clsf_2.predict(test_X_2)
pred_probability = clsf_2.predict_proba(test_X_2)

evaluation_metrics(test_y, RF_pred, pred_probability)

#plot_confusion_matrix(test_y, RF_pred)
#plot_roc(test_y, pred_probability)
Example #19
def evaluate_models(y, preds):
    print('Accuracy: {}'.format(accuracy_score(y, preds)))
    print('ROC-AUC: {}'.format(roc_auc_score(y, preds)))
    print('Recall: {}'.format(recall_score(y, preds)))
    skplt.plot_confusion_matrix(y, preds, figsize=(8, 6))
Example #20
                 round(metrics.mean_absolute_error(y_test, pred), 4))
        st.write('Mean Squared Error (MSE):',
                 round(metrics.mean_squared_error(y_test, pred), 4))
        st.write('Root Mean Squared Error (RMSE):',
                 round(np.sqrt(metrics.mean_squared_error(y_test, pred)), 4))
        st.write('Accuracy of Logistic Regression on training set: ',
                 round(logReg.score(X_train, y_train), 4))
        st.write('Accuracy of Logistic Regression on test set: ',
                 round(logReg.score(X_test, y_test), 4))

        st.subheader("Classification Report")
        st.text(classification_report(y_test, pred))

        try:
            # Confusion matrix
            plot_confusion_matrix(y_test, pred, figsize=(7, 5), cmap="PuBuGn")
            bottom, top = plt.ylim()
            plt.ylim(bottom + 0.5, top - 0.5)
            st.pyplot()
        except:
            st.write("Confusion matrix do not support multioutput.")

    except:
        st.write("Fill all parameters.")

    # plot_calibration_curve(y_test, [pred])
    # st.pyplot()

########################################
# KNN CLASSIFIER
########################################
Example #21
def plot_classification_metrics(y_true,
                                y_pred_proba,
                                classes_to_plot=None,
                                threshold=None,
                                plot_micro=True):
    """ Plot ROC Curve, Precision-Recall Curve and Confusion Matrix

    Parameters
    ----------
    y_true : array-like, shape (n_samples)
        Ground truth labels.

    y_pred_proba : array-like, shape (n_samples, n_classes)
        Prediction probabilities or decision scores for each class
        returned by a classifier.

    classes_to_plot : list-like, optional
        Classes for which the ROC curve should be plotted. e.g. [0, 'cold'].
        If the specified class does not exist, it will be ignored.
        If ``None``, all classes will be plotted.

    threshold : None or float
        If a float is set, it will be used as the decision threshold for
        binary classification.

    plot_micro : bool
        Whether to plot the averaged ROC curve and precision-recall curve
        using the 'micro' averaging method.

    Returns
    -------
    fig : :class:`matplotlib.figure.Figure` object

    axs : Axes object or array of Axes objects.
    """

    fig = plt.figure(dpi=100, figsize=(10.5, 8))
    ax1 = plt.subplot2grid((4, 4), (0, 0), rowspan=2, colspan=2)
    ax2 = plt.subplot2grid((4, 4), (0, 2), rowspan=2, colspan=2)
    ax3 = plt.subplot2grid((4, 4), (2, 0), rowspan=2, colspan=2)

    # region Plot ROC Curve
    plot_roc(y_true,
             y_pred_proba,
             plot_macro=False,
             plot_micro=plot_micro,
             classes_to_plot=classes_to_plot,
             ax=ax1)

    # endregion

    # region plot Precision-Recall Curve
    plot_precision_recall(y_true,
                          y_pred_proba,
                          plot_micro=plot_micro,
                          classes_to_plot=classes_to_plot,
                          ax=ax2)
    ax2.legend(loc='lower right')
    # endregion

    # region Plot Confusion Matrix
    y_pred_idx = np.argmax(y_pred_proba, axis=-1)
    labels = np.sort(np.unique(y_true))
    y_pred = labels[y_pred_idx]
    plot_confusion_matrix(y_true, y_pred, normalize=True, ax=ax3)
    # endregion
    axs = [ax1, ax2, ax3]

    if threshold:
        # region Plot Confusion Matrix
        labels = np.sort(np.unique(y_true))
        assert len(labels) == 2, """Problem is not binary classification
        but decision threshold is set"""
        ax4 = plt.subplot2grid((4, 4), (2, 2), rowspan=2, colspan=2)
        is_positive = y_pred_proba[:, 1] > threshold
        y_pred = labels[is_positive.astype('int')]
        plot_confusion_matrix(y_true, y_pred, normalize=True, ax=ax4)
        ax4.set_title('Confusion Matrix with adjusted '
                      'decision threshold: {:.2}'.format(threshold))

        # update color limit
        im3 = ax3.get_images()[0]
        clim = im3.get_clim()
        im4 = ax4.get_images()[0]
        im4.set_clim(clim)
        axs.append(ax4)

        # endregion
    fig.tight_layout()
    return fig, axs
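
A minimal usage sketch for the helper above (assuming scikit-learn and scikit-plot are installed; the dataset, classifier and variable names here are illustrative, not part of the original code):

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# train a simple binary classifier and plot ROC, precision-recall and confusion matrices
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
fig, axs = plot_classification_metrics(y_test,
                                       clf.predict_proba(X_test),
                                       threshold=0.3)
fig.savefig("classification_metrics.png")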
Example #22
 def test_labels(self):
     np.random.seed(0)
     clf = LogisticRegression()
     clf.fit(self.X, self.y)
     preds = clf.predict(self.X)
     plot_confusion_matrix(self.y, preds, labels=[0, 1, 2])
Example #23
 def plot_image(self):
     plot_confusion_matrix(self.__train_label,
                           self.__val_preds,
                           normalize=True)
     plt.show()
Example #24
from scikitplot.metrics import plot_confusion_matrix

model.freeze()
test_data = dm.test_dataloader()
y_true = np.array([])
y_pred = np.array([])

for i, (x, y) in enumerate(test_data):
    y = y.cpu().detach().numpy()
    y_hat = model.forward(x).argmax(axis=1).cpu().detach().numpy()

    y_true = np.append(y_true, y)
    y_pred = np.append(y_pred, y_hat)

fig, ax = plt.subplots(figsize=(16, 12))
plot_confusion_matrix(y_true, y_pred, ax=ax)
neptune_logger.experiment.log_image('confusion_matrix', fig)

## 2: Log model checkpoints to Neptune

for k in model_checkpoint.best_k_models.keys():
    model_name = 'checkpoints/' + k.split('/')[-1]
    neptune_logger.experiment.log_artifact(k, model_name)

## 3: Log best model checkpoint score to Neptune

neptune_logger.experiment.set_property(
    'best_model_score', model_checkpoint.best_model_score.tolist())

## 4 Log model summary
Example #25
    fp -= tp
    result = round((tp + tn) / float(tp + tn + fn + fp), 4)
    accuracys.append(result)

#print accuracys

fs.write('class accuracys: ' + str(accuracys).strip('[]'))
fs.write('\n')
fs.write('\n')
fs.write(
    toLatexTab(accuracys,
               precision_score(groundTruthList, predictionList, average=None),
               recall_score(groundTruthList, predictionList, average=None),
               f1_score(groundTruthList, predictionList, average=None), 4))
fs.write('\n')
fs.write(
    toLatexTab(accuracys,
               precision_score(groundTruthList, predictionList, average=None),
               recall_score(groundTruthList, predictionList, average=None),
               f1_score(groundTruthList, predictionList, average=None), 2))
plot_confusion_matrix(groundTruthList,
                      predictionList,
                      x_tick_rotation=90,
                      figsize=(14, 13),
                      title=' ',
                      text_fontsize='large',
                      cmap='Reds')
matrixFileName = dirName + '/confusionMatrix.png'
plt.savefig(matrixFileName)
#plt.show()
fs.close()
Example #26
# fit the model
xgb.fit(X_train, y_train)

# make prediction
y_pred = (xgb.predict_proba(X_val)[:, 1] >= 0.21).astype('int') #setting a threshold

!pip install scikit-plot

# evaluation
from sklearn.metrics import f1_score, classification_report
from scikitplot.metrics import plot_confusion_matrix
from scikitplot.classifiers import plot_feature_importances
print('F1 Score: {}'.format(f1_score(y_val, y_pred)))
print(classification_report(y_val, y_pred))
# plot confusion matrix (y_true first, then y_pred)
plot_confusion_matrix(y_val, y_pred)

# plot importance
features = train.drop(columns=['user_id','product_id','reordered'])
plot_feature_importances(xgb, feature_names=features.columns, x_tick_rotation=90, max_num_features=20, figsize=(10,8))

# delete X_train, X_test, y_train, y_test
del [X_train, X_val, y_train, y_val, means]
gc.collect()

"""## Fit entire training set"""

# fit on entire dataset
xgb.fit(train.drop(columns=['user_id','product_id','reordered','user_avg_order_dow','user_avg_order_hour']), train['reordered'])

# make prediction on test set
# Make the predictions
previsoes = model_dt.predict(X_valid)

# Make the predictions and build the report
report = classification_report(y_valid, previsoes)

# Print the report
print(report)

# Check the accuracy in a table-like format
matrix = confusion_matrix(y_valid, previsoes)

# Call the function to visualize the confusion matrix
plot_confusion_matrix(matrix, 
                      target_names = ['0', '1'],
                      normalize    = False,
                      title        = "Confusion Matrix")

"""### Realizando o treinamento com **LOGISTIC REGRESSION**"""

# Train the Logistic Regression model
model_lr = LogisticRegression(random_state=42)
model_lr.fit(X_train, y_train)

# Make the predictions
previsoes = model_lr.predict(X_valid)

# Make the predictions and build the report
report = classification_report(y_valid, previsoes)

# Print the report
import numpy as np

y_test_pred = np.asarray(model.predict(x_test))
y_test_pred_class = np.argmax(y_test_pred, axis=1)

from sklearn.metrics import f1_score

f1 = f1_score(y_test, y_test_pred_class, average='micro')

neptune.log_metric('test_f1', f1)

import matplotlib.pyplot as plt
from scikitplot.metrics import plot_confusion_matrix, plot_roc

fig, ax = plt.subplots(figsize=(16, 12))
plot_confusion_matrix(y_test, y_test_pred_class, ax=ax)
neptune.log_image('diagnostic_charts', fig)

fig, ax = plt.subplots(figsize=(16, 12))
plot_roc(y_test, y_test_pred, ax=ax)
neptune.log_image('diagnostic_charts', fig)

model.save('my_model.h5')
neptune.log_artifact('my_model.h5')

# tests
current_exp = neptune.get_experiment()

correct_logs = [
    'batch_loss', 'batch_accuracy', 'epoch_loss', 'epoch_accuracy',
    'epoch_val_loss', 'epoch_val_accuracy', 'test_f1', 'diagnostic_charts'
def main():
    neptune.init(api_token=os.getenv('NEPTUNE_API_TOKEN'),
                 project_qualified_name=os.getenv('NEPTUNE_PROJECT'))

    train_idx = pd.read_csv(TRAIN_IDX_PATH, nrows=NROWS)
    valid_idx = pd.read_csv(VALID_IDX_PATH, nrows=NROWS)
    features = pd.read_csv(FEATURES_PATH, nrows=NROWS)

    train = pd.merge(train_idx, features, on='SK_ID_CURR')
    valid = pd.merge(valid_idx, features, on='SK_ID_CURR')

    all_params = {
        'num_boost_round': NUM_BOOST_ROUND,
        'early_stopping_rounds': EARLY_STOPPING_ROUNDS,
        **LGBM_PARAMS
    }

    with neptune.create_experiment(name='model training',
                                   params=all_params,
                                   tags=['lgbm'],
                                   upload_source_files=get_filepaths(),
                                   properties={
                                       'features_path':
                                       FEATURES_PATH,
                                       'features_version':
                                       md5_hash(FEATURES_PATH),
                                       'train_split_version':
                                       md5_hash(TRAIN_IDX_PATH),
                                       'valid_split_version':
                                       md5_hash(VALID_IDX_PATH),
                                   }):
        results = train_evaluate(train,
                                 valid,
                                 LGBM_PARAMS,
                                 callbacks=[neptune_monitor()])
        train_score, valid_score = results['train_score'], results[
            'valid_score']
        train_preds, valid_preds = results['train_preds'], results[
            'valid_preds']

        neptune.send_metric('train_auc', train_score)
        neptune.send_metric('valid_auc', valid_score)

        train_pred_path = os.path.join(PREDICTION_DIRPATH, 'train_preds.csv')
        train_preds.to_csv(train_pred_path, index=None)
        neptune.send_artifact(train_pred_path)

        valid_pred_path = os.path.join(PREDICTION_DIRPATH, 'valid_preds.csv')
        valid_preds.to_csv(valid_pred_path, index=None)
        neptune.send_artifact(valid_pred_path)

        model_path = os.path.join(MODEL_DIRPATH, 'model.pkl')
        joblib.dump(results['model'], model_path)
        neptune.set_property('model_path', model_path)
        neptune.set_property('model_version', md5_hash(model_path))
        neptune.send_artifact(model_path)

        if PACKAGE_TO_PROD:
            saved_path = CreditDefaultClassifier.pack(
                model=results['model']).save(PRODUCTION_DIRPATH)
            neptune.set_property('production_model_path', saved_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_confusion_matrix(valid_preds['TARGET'],
                                         valid_preds['preds_pos'] > 0.5,
                                         ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'conf_matrix.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_roc(valid_preds['TARGET'],
                            valid_preds[['preds_neg', 'preds_pos']],
                            ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'roc_auc.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        sk_metrics.plot_precision_recall(
            valid_preds['TARGET'],
            valid_preds[['preds_neg', 'preds_pos']],
            ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'prec_recall.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)

        fig, ax = plt.subplots(figsize=(16, 12))
        plot_prediction_distribution(valid_preds['TARGET'],
                                     valid_preds['preds_pos'],
                                     ax=ax)
        plot_path = os.path.join(REPORTS_DIRPATH, 'preds_dist.png')
        fig.savefig(plot_path)
        neptune.send_image('diagnostics', plot_path)