Example #1
def initialize_app():
    """
    Initializes our Flask application.
    - creates a Flask app object
    - sets AWS keys for uploading payloads to S3
    - retrieves and sets the application config
    - integrates with Sentry for error reporting
    - sets up a background scheduler to refresh the config every 3,600 seconds
    - loads the trained model and sets it as a global object
    """
    app = Flask(__name__)

    if ENVIRONMENT != 'local':
        sentry_sdk.init(dsn=SENTRY_DSN,
                        integrations=[FlaskIntegration()],
                        traces_sample_rate=1.0)

    config_dict = retrieve_app_config(DB_SCHEMA,
                                      make_mysql_connection(DATABASE_SECRET),
                                      ENVIRONMENT)
    for key, value in config_dict.items():
        app.config[key] = value

    scheduler = BackgroundScheduler()
    scheduler.add_job(func=hit_config_refresh_endpoint,
                      trigger="interval",
                      seconds=3_600)
    scheduler.start()

    global model
    model = joblib.load(MODEL_PATH)

    return app
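A minimal way to boot the service built by this function, assuming it lives in the application's entry-point module (the module layout, host, and port here are illustrative assumptions, not from the source):

if __name__ == '__main__':
    # Hypothetical entry point. The Flask debug reloader would import the
    # module twice and start a second BackgroundScheduler, so it is disabled.
    app = initialize_app()
    app.run(host='0.0.0.0', port=5000, use_reloader=False)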
Example #2
def predict():
    """
    Endpoint to produce model predictions. Input and output payloads are logged to S3 and MySQL.
    """
    try:
        session["endpoint"] = "predict"
        response_start_time = time.time()
        input_data = request.json

        ltv_df = get_client_ltv_table(make_mysql_connection(DATABASE_SECRET))
        client_id = input_data.get("client_id", "000000")
        try:
            client_ltv = ltv_df.loc[
                ltv_df["client_id"] == client_id, "ltv"].iloc[-1]
        except IndexError:
            client_ltv = 0

        input_df = convert_json_to_dataframe(input_data)
        prediction = make_prediction(input_df, model)
        if prediction >= float(app.config.get("proba_cutoff", 0.75)):
            high_risk = "yes"
        else:
            high_risk = "no"

        processing_time = round(time.time() - response_start_time, 3)

        input_data["uid"] = session.get("uid")
        input_data["url"] = request.url
        input_data["endpoint"] = "predict"
        output = dict()
        output["prediction"] = prediction
        output["high_risk"] = high_risk
        output["response_time"] = processing_time
        output["ltv"] = client_ltv
        session["output"] = deepcopy(output)
        session["input"] = input_data

        print(output)
        return output
    except Exception as exception:
        print(exception)
        sentry_sdk.capture_exception(exception)
        output = {
            "error": "app was not able to process request",
            "prediction": 0
        }
        return output
    finally:
        if ENVIRONMENT != "local":
            uid = session.get("uid")
            input_payload = session.get("input")
            output_payload = session.get("output", {})
            output_payload["logging_timestamp"] = str(get_current_timestamp())
            output_payload["logging_epoch"] = time.time()
            log_payload_to_s3(input_payload, output_payload, uid,
                              OUTPUT_LOGS_S3_BUCKET_NAME)
            log_payloads_to_mysql(input_payload, output_payload,
                                  OUTPUT_LOGS_TABLE_NAME, DB_SCHEMA,
                                  DATABASE_SECRET)
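A hedged client-side sketch of hitting this endpoint; the route path, port, and payload fields are assumptions inferred from the handler above:

import requests

# Illustrative request; only client_id is referenced explicitly in the
# handler, so any additional feature fields would go alongside it.
payload = {"client_id": "123456"}
response = requests.post("http://localhost:5000/predict", json=payload)
print(response.json())  # e.g. {"prediction": 0.81, "high_risk": "yes", ...}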
Example #3
def main():
    """
    Loads the csv churn data into a MySQL table.
    """
    db_conn = make_mysql_connection('churn-model-mysql')
    df = pd.read_csv('data/site_churn_data.csv')
    dynamically_create_ddl_and_execute(df, 'churn_model', 'churn_data', db_conn)
    write_dataframe_to_database(df, 'churn_model', 'churn_data', db_conn)
    sleep(2)
    validation_df = pd.read_sql('''select * from churn_model.churn_data;''', db_conn)
    print(validation_df.head())
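dynamically_create_ddl_and_execute is not shown here; a plausible sketch of the dtype-to-DDL mapping such a helper might perform (the function body and type map are assumptions):

import pandas as pd

def build_create_table_ddl(df: pd.DataFrame, schema: str, table: str) -> str:
    # Hypothetical mapping from pandas dtypes to MySQL column types;
    # anything unrecognized falls back to TEXT.
    type_map = {'int64': 'INT', 'float64': 'FLOAT', 'bool': 'TINYINT(1)'}
    columns = ', '.join(
        f'`{col}` {type_map.get(str(dtype), "TEXT")}'
        for col, dtype in df.dtypes.items())
    return f'CREATE TABLE IF NOT EXISTS {schema}.{table} ({columns});'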
Example #4
def config_refresh():
    """
    Endpoint to refresh the config. This invokes the retrieve_app_config function to query the relevant MySQL table with
    configuration values.
    """
    config_dict = retrieve_app_config(DB_SCHEMA,
                                      make_mysql_connection(DATABASE_SECRET),
                                      ENVIRONMENT)
    for key, value in config_dict.items():
        app.config[key] = value
    return "config refresh hit"
Example #5
def get_data_to_explore():
    """
    Tightly-coupled function to retrieve the data we want to explore.
    """
    df = pd.read_sql('''select * from churn_model.churn_data;''',
                     make_mysql_connection('churn-model-mysql'))
    df['churn'] = np.where(df['churn'].str.startswith('y'), 1, 0)
    df.drop(['id', 'meta__inserted_at', 'client_id', 'acquired_date'],
            axis=1,
            inplace=True)
    return df
Example #6
def query_logs_table(db_secret_name, start_timestamp):
    """
    Queries table of API logs.

    :param db_secret_name: name of the Secrets Manager secret with the database credentials
    :param start_timestamp: timestamp from which to begin pulling logs
    :returns: pandas dataframe
    """
    query = f'''
    select JSON_EXTRACT(input_output_payloads, "$.output.prediction") as prediction
    FROM churn_model.model_logs
    where logging_timestamp >= '{start_timestamp}';
    '''
    df = pd.read_sql(query, make_mysql_connection(db_secret_name))
    return df
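Illustrative usage, pulling predictions logged over the last 24 hours (the secret name mirrors the one used elsewhere in these examples):

from datetime import datetime, timedelta

start = (datetime.utcnow() - timedelta(days=1)).strftime('%Y-%m-%d %H:%M:%S')
logs_df = query_logs_table('churn-model-mysql', start)
# JSON_EXTRACT returns strings, so cast before aggregating.
print(logs_df['prediction'].astype(float).describe())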
Example #7
def login():
    """
    Login endpoint for the model user interface.
    """
    if request.method == 'POST':
        form_submission = request.form
        username = str(form_submission['username'])
        password = str(form_submission['password'])
        hashed_password = sha256(password.encode('utf-8')).hexdigest()
        database_password = get_hashed_password_for_username(
            username, make_mysql_connection(DATABASE_SECRET))
        if hashed_password == database_password:
            session['logged_in'] = True
            return redirect(url_for('model_interface'))
        else:
            flash('Credentials are not valid. Please try again.')
    return render_template('login.html')
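get_hashed_password_for_username is not shown; a minimal sketch, assuming a users table keyed on username (the table and column names are hypothetical):

import pandas as pd

def get_hashed_password_for_username(username, db_conn):
    # Hypothetical lookup; returns None when the username is unknown.
    query = 'select password from churn_model.app_users where username = %s;'
    df = pd.read_sql(query, db_conn, params=[username])
    return df['password'].iloc[0] if not df.empty else None

Note that the endpoint compares unsalted SHA-256 digests; a salted scheme such as bcrypt would be more robust, but the sketch mirrors the hashing used above.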
Example #8
def model_interface():
    """
    Model user interface to render predictions in HTML.
    """
    logged_in = session.get('logged_in', False)
    if logged_in:
        if request.method == 'POST':
            form_submission = request.form
            raw_clients = str(form_submission['clients'])
            client_list = raw_clients.split(',')
            client_list = [str(c) for c in client_list]
            model_df = get_training_data(
                make_mysql_connection(DATABASE_SECRET))
            model_df = model_df.loc[model_df['client_id'].isin(client_list)]
            if len(model_df) > 0:
                model_df.reset_index(inplace=True, drop=True)
                predictions_df = pd.DataFrame(
                    model.predict_proba(model_df)[:, 1],
                    columns=['prediction'])
                predictions_df = pd.concat(
                    [model_df[['client_id']], predictions_df], axis=1)
                client_df = pd.DataFrame({
                    'client_id': client_list,
                    'prediction': ['client_id_not_found'] * len(client_list)
                })
                predictions_df = pd.concat([predictions_df, client_df], axis=0)
                predictions_df['client_id'] = predictions_df[
                    'client_id'].astype(str)
                predictions_df['client_id'] = predictions_df[
                    'client_id'].str.strip()
                predictions_df = predictions_df.drop_duplicates(
                    subset=['client_id'], keep='first')
                return render_template('model_interface.html',
                                       predictions=predictions_df.to_html(
                                           header=True, index=False))
            else:
                return render_template(
                    'model_interface.html',
                    predictions='None of the passed Client Ids could be found.'
                )
        else:
            return render_template(
                'model_interface.html',
                predictions='predictions will be rendered here')
    return redirect(url_for('login'))
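The concat-then-drop_duplicates pattern above works because the real predictions come first in the combined frame, so the 'client_id_not_found' placeholders survive only for ids the model data did not contain. A toy illustration:

import pandas as pd

found = pd.DataFrame({'client_id': ['1'], 'prediction': [0.9]})
placeholders = pd.DataFrame({'client_id': ['1', '2'],
                             'prediction': ['client_id_not_found'] * 2})
combined = pd.concat([found, placeholders], axis=0)
# keep='first' retains the real prediction for id 1 and the placeholder for id 2.
print(combined.drop_duplicates(subset=['client_id'], keep='first'))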
Example #9
def main(target, test_set_percentage, model_training_list, cv_strategy,
         cv_scoring, static_param_space, class_cutoff, evaluation_list,
         calibration_bins, drop_col_scorer, drop_col_scorer_string,
         drop_col_scoring_type, drop_col_higher_is_better,
         explanation_sample_n, use_shap_kernel, s3_logging_bucket,
         db_schema_name, log_to_db, db_secret_name):
    """
    Main execution function.

    :param target: name of the target
    :param test_set_percentage: percentage of observations for the test set
    :param model_training_list: list of named tuples containing model configurations; the following tuple elements are
    required: model_name, model, param_space, iterations
    :param cv_strategy: cross validation strategy
    :param cv_scoring: scoring strategy for cross validation
    :param static_param_space: param space valid for every model
    :param class_cutoff: probability percentage above which an observation is classified into the positive class
    :param evaluation_list: list of named tuples containing model evaluation configurations; the following tuple
    elements are required: evaluation_column, scorer_callable, metric_name
    :param calibration_bins: list of calibration bins to show
    :param drop_col_scorer: scikit-learn scoring function for drop col model
    :param drop_col_scorer_string: scoring metric in the form of a string (e.g. 'neg_log_loss') for drop col model
    :param drop_col_scoring_type: either class or probability for drop col model
    :param drop_col_higher_is_better: Boolean of whether or not a higher score is better (e.g. roc auc vs. log loss) for
    drop col model
    :param explanation_sample_n: number of observations to include when performing feature explanation
    :param use_shap_kernel: Boolean of whether or not to use the SHAP kernel explainer
    :param s3_logging_bucket: S3 bucket in which to store the model output
    :param db_schema_name: name of the schema for logging model results
    :param log_to_db: Boolean of whether or not to log results to the database
    :param db_secret_name: Secrets Manager secret with database credentials
    """
    db_conn = make_mysql_connection(db_secret_name)
    x_train, x_test, y_train, y_test = create_training_and_testing_data(
        target, test_set_percentage, db_conn)
    train_and_evaluate_model(
        x_train, x_test, y_train, y_test, model_training_list, cv_strategy,
        cv_scoring, static_param_space, class_cutoff, target, evaluation_list,
        calibration_bins, drop_col_scorer, drop_col_scorer_string,
        drop_col_scoring_type, drop_col_higher_is_better, explanation_sample_n,
        use_shap_kernel, s3_logging_bucket, db_schema_name, db_conn, log_to_db)
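The docstring describes model_training_list as named tuples with model_name, model, param_space, and iterations; a hedged sketch of what one entry might look like (the concrete model and search space are assumptions):

from collections import namedtuple

from sklearn.ensemble import RandomForestClassifier

ModelTraining = namedtuple('ModelTraining',
                           ['model_name', 'model', 'param_space', 'iterations'])

model_training_list = [
    ModelTraining(model_name='random_forest',
                  model=RandomForestClassifier(),
                  param_space={'max_depth': [3, 5, 10],
                               'n_estimators': [100, 250]},
                  iterations=25),
]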
Example #10
def main(model_path, db_secret_name, p_value_cutoff, model_features):
    """
    Determines if concept shift has occurred.

    :param model_path: path to the model
    :param db_secret_name: Secrets Manager secret with DB credentials
    :param p_value_cutoff: p-value for chi-squared calculation
    :param model_features: features used for modeling
    """
    db_conn = make_mysql_connection(db_secret_name)
    model_uid = extract_model_uid_from_path(model_path)
    query_start_time = get_query_start_timestamp(model_uid, db_conn)
    production_df = extract_production_data(query_start_time, model_uid, db_conn)
    original_training_df = recreate_data_used_for_training(model_uid, model_features)

    cat_production_df = production_df.select_dtypes(include='object')
    num_production_df = production_df.select_dtypes(exclude='object')
    cat_training_df = original_training_df.select_dtypes(include='object')
    num_training_df = original_training_df.select_dtypes(exclude='object')

    cat_columns = set(list(cat_production_df) + list(cat_training_df))
    num_columns = set(list(num_production_df) + list(num_training_df))
    main_drift_df = pd.DataFrame()

    for cat_col in cat_columns:
        temp_chi_squared_df = prep_category_for_chi_squared(cat_training_df, cat_production_df, cat_col)
        p_value = calculate_chi_squared_statistic(temp_chi_squared_df['train_count'],
                                                  temp_chi_squared_df['prod_count'])
        temp_drift_df = pd.DataFrame({'feature': [cat_col], 'p_value': [p_value]})
        main_drift_df = pd.concat([main_drift_df, temp_drift_df])

    for num_col in num_columns:
        p_value = calculate_ks_statistic(num_training_df[num_col], num_production_df[num_col])
        temp_drift_df = pd.DataFrame({'feature': [num_col], 'p_value': [p_value]})
        main_drift_df = pd.concat([main_drift_df, temp_drift_df])

    main_drift_df['shift_occurred'] = main_drift_df['p_value'] <= p_value_cutoff
    main_drift_df['p_value_cutoff'] = p_value_cutoff
    db.write_dataframe_to_database(main_drift_df, 'churn_model', 'data_shift', db_conn)
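calculate_chi_squared_statistic and calculate_ks_statistic are not shown; plausible scipy-based sketches, each returning the test's p-value for comparison against p_value_cutoff (the exact implementations are assumptions):

from scipy import stats

def calculate_chi_squared_statistic(train_counts, prod_counts):
    # Chi-squared test of independence on a 2xK contingency table of
    # category counts from the training and production datasets.
    _, p_value, _, _ = stats.chi2_contingency([train_counts, prod_counts])
    return p_value

def calculate_ks_statistic(train_series, prod_series):
    # Two-sample Kolmogorov-Smirnov test on the raw numeric values.
    _, p_value = stats.ks_2samp(train_series.dropna(), prod_series.dropna())
    return p_value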