示例#1
0
import data_preprocessing
import feature_engineering
import eda_monitoring
import modeling
import performance_monitoring

from prefect import Flow, task, context

import pandas as pd

# Pandas display settings for friendlier shell output: cap printed rows at
# 100 and set both the column cap and the display width to None.
pd.options.display.max_rows = 100
pd.options.display.max_columns = None
pd.options.display.width = None

# Value captured once, when this module is executed; presumably a timestamp
# marking the start of the run (depends on what `get_time()` returns).
# NOTE(review): `greenhouse_clock` is not imported in the visible part of this
# file — confirm the import exists, otherwise this line raises NameError.
start_time = greenhouse_clock.get_time()


@task
def sourcing():
    """Prefect task: return whatever ``data_sourcing.get()`` returns."""
    sourced = data_sourcing.get()
    return sourced


@task
def cleansing(df):
    """Prefect task: pass ``df`` through ``data_preprocessing.clean``.

    Parameters
    ----------
    df
        Object forwarded unchanged to ``data_preprocessing.clean``.

    Returns
    -------
    The result of ``data_preprocessing.clean(df)``.
    """
    cleaned = data_preprocessing.clean(df)
    return cleaned


@task
示例#2
0
import numpy as np
from sklearn import metrics
import json

import greenhouse_clock

# Module-level metadata; "timestr" is the timestamp for files
meta = {"timestr": greenhouse_clock.get_time()}


def optimal_threshold(y_true, y_score):
    """Return the ROC threshold with the largest |TPR - FPR| separation.

    Parameters
    ----------
    y_true
        True labels; label ``1`` is treated as the positive class.
    y_score
        Scores for the positive class, passed straight to
        ``sklearn.metrics.roc_curve``.

    Returns
    -------
    The element of the ``roc_curve`` thresholds array at which the absolute
    difference between true-positive rate and false-positive rate is largest.
    If the separation is zero everywhere, the first real (observed-score)
    threshold is returned instead of the artificial leading one.
    """
    # Performance extracted from the "ROC curve"; keep every point so that no
    # candidate threshold is dropped.
    fpr, tpr, thr = metrics.roc_curve(
        y_true=y_true, y_score=y_score, pos_label=1, drop_intermediate=False
    )

    diff = np.abs(tpr - fpr)

    # Numpy index of the maximum separation between TPR and FPR
    diff_idx = np.argmax(diff)

    # roc_curve prepends an artificial (FPR=0, TPR=0) point whose threshold is
    # a sentinel (``inf``, or ``max(y_score) + 1`` in older releases), not an
    # observed score. argmax only lands on it when the separation is zero at
    # every point; in that tie any index is equally good, so prefer the first
    # real threshold to avoid returning the sentinel.
    if diff_idx == 0 and len(thr) > 1 and diff[0] == 0:
        diff_idx = 1

    # Optimum threshold based on the max-difference criterion
    return thr[diff_idx]


def report_performance(
    y_true, y_score, best_hyperparams, path, opt_thr=0.5, suffix="_"
):