예제 #1
def ingest(df):
    """
    Run the ingestion step: clean the data, persist it and log the run.

    The dataframe is passed through `initial_cleaning`, pickled to
    `ingestion_pickle_loc`, the execution metadata is written as a CSV
    through `write_csv_from_df`, and a small unit test on the cleaned data
    is executed. The test report and its outcome are stored under
    `tests_dir_loc`.

    Args:
        df (dataframe): dataframe that will go through the initial cleaning
            process.

    Returns:
        dataframe: the object returned by `initial_cleaning`.
    """

    ## Storing time execution metadata
    extract_metadata[extract_metadata_index] = str(datetime.now())

    ## Executing ingestion functions

    #### Receiving extraction
    df = initial_cleaning(df)

    #### Saving result as pickle (context manager closes the handle)
    with open(ingestion_pickle_loc, "wb") as pkl_file:
        pickle.dump(df, pkl_file)

    ## Converting metadata into dataframe and saving locally
    df_meta = pd.DataFrame.from_dict(extract_metadata, orient="index").T
    df_meta.set_index(extract_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, extract_metadata_csv_name)

    ## Running unit test
    class TestExtract(marbles.core.TestCase):
        def test_empty_df(self):
            # `shape` is a tuple; compared with a list it is never equal,
            # so the empty case has to be spelled as a tuple.
            self.assertNotEqual(tuple(df.shape), (0, 0),
                                note="Your dataframe is empty")
            # Indexing a tuple with a tuple raises TypeError, so this only
            # confirms that `shape` behaves like a plain tuple.
            with self.assertRaises(TypeError):
                df.shape[0, 0]

    # Run the suite once, writing the verbose report to disk, and read the
    # outcome from the TestResult instead of scanning the report text.
    suite = unittest.TestLoader().loadTestsFromTestCase(TestExtract)
    with open(tests_dir_loc + 'extract_unittest.txt', 'w') as f:
        result = unittest.TextTestRunner(stream=f, verbosity=2).run(suite)

    if result.wasSuccessful():
        outcome = "PASS"
    else:
        outcome = "FAILED, Your dataframe is empty"

    res_df = pd.DataFrame([[str(datetime.now()), outcome]],
                          columns=['Date', 'Result'])

    res_df.to_csv(tests_dir_loc + 'extract_unittest.csv', index=False)

    ## Success message
    print("\n** Ingestion module successfully executed **\n")

    return df
예제 #2
def models_training(fe_results_dict, mt_results_pickle_loc):
    """
    Run the model training step and persist its results.

    `magic_loop` is called with the module-level `models_dict` and the
    feature engineering results; its outputs are gathered in a dictionary,
    pickled to `mt_results_pickle_loc`, and the execution metadata is
    written as a CSV through `write_csv_from_df`.

    Args:
        fe_results_dict (dict): results of the feature engineering step,
            forwarded unchanged to `magic_loop`.
        mt_results_pickle_loc (string): location where the resulting pickle
            object will be stored.

    Returns:
        dict: keys "trained_models", "training_data", "training_labels",
            "test_data" and "test_labels".
    """

    ## Storing time execution metadata
    mt_metadata[mt_metadata_index] = str(datetime.now())

    ## Implementing magic loop to train various models
    models_mloop, X_train, X_test, y_train, y_test = magic_loop(
        models_dict, fe_results_dict)

    ## Saving models training results

    #### Dictionary with all module results
    mt_results_dict = {
        "trained_models": models_mloop,
        "training_data": X_train,
        "training_labels": y_train,
        "test_data": X_test,
        "test_labels": y_test,
    }

    #### Saving dictionary with results as pickle (handle closed by `with`)
    with open(mt_results_pickle_loc, "wb") as pkl_file:
        pickle.dump(mt_results_dict, pkl_file)

    print("\n** Models training module successfully executed **\n")

    ## Saving relevant module metadata

    #### Number of models trained
    mt_metadata["no_models_trained"] = len(models_mloop)

    #### Types of models trained (iterating `models_dict` yields its keys)
    mt_metadata["type_models_trained"] = " | ".join(models_dict)

    #### Converting metadata into dataframe and saving locally
    df_meta = pd.DataFrame.from_dict(mt_metadata, orient="index").T
    df_meta.set_index(mt_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, mt_metadata_csv_name)

    return mt_results_dict
예제 #3
def model_selection(mt_results_dict, ms_results_pickle_loc):
    """
    Pick the best trained model, score the test data and persist the results.

    `select_best_model` chooses a model from the training results,
    `best_model_predict_test` is called with it and the "test_data" entry,
    and the outcome is pickled to `ms_results_pickle_loc`. The execution
    metadata is written as a CSV through `write_csv_from_df`.

    Args:
        mt_results_dict (dict): results of the models training step; must
            contain the key "test_data".
        ms_results_pickle_loc (string): location where the resulting pickle
            object will be stored.

    Returns:
        dict: keys "best_trained_model", "model_test_predict_labels" and
            "model_test_predict_scores".
    """

    ## Storing time execution metadata
    ms_metadata[ms_metadata_index] = str(datetime.now())

    ## Selecting best trained model from magic_loop
    best_model = select_best_model(mt_results_dict)

    ## Testing best model with test data
    test_predict_labs, test_predict_scores = best_model_predict_test(
        best_model, mt_results_dict["test_data"])

    ## Saving modeling results

    #### Dictionary with all module results
    ms_results_dict = {
        "best_trained_model": best_model,
        "model_test_predict_labels": test_predict_labs,
        "model_test_predict_scores": test_predict_scores,
    }

    #### Saving dictionary with results as pickle (handle closed by `with`)
    with open(ms_results_pickle_loc, "wb") as pkl_file:
        pickle.dump(ms_results_dict, pkl_file)

    print("\n** Model selection module successfully executed **\n")

    ## Saving relevant module metadata

    #### Model selected metadata
    ms_metadata["selected_model"] = str(best_model)

    #### Converting metadata into dataframe and saving locally
    df_meta = pd.DataFrame.from_dict(ms_metadata, orient="index").T
    df_meta.set_index(ms_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, ms_metadata_csv_name)

    return ms_results_dict
예제 #4
def bias_fairness(df_aeq):
    """
    Run the bias and fairness analysis and collect its outputs.

    Chains the helpers `group`, `biasf` and `fairnessf`, stores the
    execution time in the module metadata and writes that metadata as a CSV
    through `write_csv_from_df`.

    Args:
        df_aeq (dataframe): dataframe that will be analyzed by Aequitas
            according to the selected model.

    Returns:
        dict: the group, bias and fairness outputs under the "*_results"
            keys built below. The `absolute_metrics` and `bias` values are
            only used as intermediate inputs and are not returned.
    """

    xtab, conteos_grupo, metricas_absolutas, absolute_metrics = group(df_aeq)

    bdf, disparities, disparities_majority, disparities_min, bias = biasf(
        df_aeq, xtab)

    fairness, gaf, gof = fairnessf(bdf, absolute_metrics, bias)

    ## Storing time execution metadata
    aq_metadata[aq_metadata_index] = str(datetime.now())

    ## Module results
    aeq_results_dict = {
        "xtab_results": xtab,
        "conteos_grupo_results": conteos_grupo,
        "metricas_absolutas_results": metricas_absolutas,
        "bdf_results": bdf,
        "disparities_results": disparities,
        "disparities_majority_results": disparities_majority,
        "disparities_minority_results": disparities_min,
        "fairness_results": fairness,
        "gaf_results": gaf,
        "gof_results": gof,
    }

    ## Saving relevant module metadata

    #### Converting metadata into dataframe and saving locally
    df_meta = pd.DataFrame.from_dict(aq_metadata, orient="index").T
    df_meta.set_index(aq_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, aq_metadata_csv_name)

    print("\n** Aequitas module successfully executed **\n")

    return aeq_results_dict
예제 #5
def feature_engineering(df, fe_results_pickle_loc):
    """
    Run the feature engineering step and persist its results.

    Features are produced by `feature_generation` and filtered by
    `feature_selection`; everything is gathered in a dictionary that is
    pickled to `fe_results_pickle_loc`. The execution metadata is written
    as a CSV through `write_csv_from_df`, and a unit test on the results
    dictionary is run with its report and outcome stored under
    `tests_dir_loc`.

    Args:
        df (dataframe): data to engineer features from.
        fe_results_pickle_loc (string): location where the resulting pickle
            object will be stored.

    Returns:
        dict: keys "df_imp_engineered_features", "data_labels",
            "ohe_reference", "df_cols_features_engineered" and
            "features_engineered".
    """

    ## Storing time execution metadata
    fe_metadata[fe_metadata_index] = str(datetime.now())

    ## Executing feature engineering functions

    #### Metadata: df shape prior fe metadata
    fe_metadata["dim_prior_fe"] = str(df.shape)

    #### Generating features processed by specific pipeline
    df_features_prc, df_labels, ohe_dict, df_features_prc_cols = feature_generation(df)

    #### Using model to evaluate which features are more important based on threshold
    df_imp_features_prc = feature_selection(df, df_features_prc, df_labels, df_features_prc_cols, ohe_dict)

    #### Saving all module's results in dictionary
    fe_results_dict = {
        "df_imp_engineered_features": df_imp_features_prc,
        "data_labels": df_labels,
        "ohe_reference": ohe_dict,
        "df_cols_features_engineered": df_features_prc_cols,
        "features_engineered": df_features_prc,
    }

    #### Saving fe results (handle closed by `with`)
    with open(fe_results_pickle_loc, "wb") as pkl_file:
        pickle.dump(fe_results_dict, pkl_file)

    print("\n** Feature engineering module successfully executed **\n")

    ## Working with module's metadata

    #### Converting metadata into dataframe and saving locally
    df_meta = pd.DataFrame.from_dict(fe_metadata, orient="index").T
    df_meta.set_index(fe_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, fe_metadata_csv_name)

    #### Running unit test
    class TestFeatureEngineering(marbles.core.TestCase):
        def test_feature_engineering(self):
            is_empty = not bool(fe_results_dict)
            self.assertFalse(is_empty, note="Your dictionary is empty")

    # Run the suite once, writing the verbose report to disk, and read the
    # outcome from the TestResult instead of scanning the report text.
    suite = unittest.TestLoader().loadTestsFromTestCase(TestFeatureEngineering)
    with open(tests_dir_loc + 'test_feature_engineering.txt', 'w') as f:
        result = unittest.TextTestRunner(stream=f, verbosity=2).run(suite)

    if result.wasSuccessful():
        outcome = "PASS"
    else:
        outcome = "FAILED, Your dictionary is empty."

    res_df = pd.DataFrame([[str(datetime.now()), outcome]],
                          columns=['Date', 'Result'])

    res_df.to_csv(tests_dir_loc + 'feature_engineering_unittest.csv', index=False)

    return fe_results_dict
예제 #6
    def run(self):
        """
        Check the most recent local ingestion pickle and prepare it for s3.

        Builds the path of the local pickle from `self.ingest_type` and
        `self.path_date`, runs a unit test on the file size (report and
        outcome are stored under `tests_dir_loc`), loads the pickle, writes
        the task metadata as a CSV through `write_csv_from_df`, and
        serializes the dataframe again for storage.
        """

        ###### Name of file inside directories

        path_file = path_file_fn(self.ingest_type)

        ## Location to find most recent local ingestion
        path_full = local_temp_ingestions + self.ingest_type + "/" + self.path_date + path_file

        #### Running unit test
        class TestSaveS3(marbles.core.TestCase):
            def test_size_pkl(self):
                size_pkl = os.path.getsize(path_full) > 0
                self.assertTrue(size_pkl, note="Your pickle's size is OKB")

        # Run the suite once and read the outcome from the TestResult. The
        # failure note contains "OK" ("OKB"), so scanning the report text
        # for "OK" would also log a PASS row for a failing run.
        suite = unittest.TestLoader().loadTestsFromTestCase(TestSaveS3)
        with open(tests_dir_loc + 'saveS3_unittest.txt', 'w') as f:
            result = unittest.TextTestRunner(stream=f, verbosity=2).run(suite)

        if result.wasSuccessful():
            outcome = "PASS"
        else:
            outcome = "FAILED, Your pickle's size is OKB"

        res_df = pd.DataFrame([[str(datetime.now()), outcome]],
                              columns=['Date', 'Result'])

        res_df.to_csv(tests_dir_loc + 'saveS3_unittest.csv', index=False)

        ## Loading most recent ingestion (handle closed by `with`)
        with open(path_full, "rb") as pkl_file:
            ingesta_df = pickle.load(pkl_file)

        ## Obtaining task metadata

        #### Storing time execution metadata
        save_s3_metadata[save_s3_metadata_index] = str(datetime.now())
        #### Bucket where data will be saved
        save_s3_metadata["s3_bucket_name"] = str(self.bucket)
        #### S3 key related to the data
        save_s3_metadata["s3_key_name"] = str(get_key(self.output().path))
        #### Shape of df going into s3
        save_s3_metadata["df_shape"] = str(ingesta_df.shape)

        #### Converting dict to df and writing contents to df
        df_meta = pd.DataFrame.from_dict(save_s3_metadata, orient="index").T
        df_meta.set_index(save_s3_metadata_index, inplace=True)
        write_csv_from_df(df_meta, metadata_dir_loc, save_s3_metadata_csv_name)

        ## Storing object in s3 as pickle
        ingesta_pkl = pickle.dumps(ingesta_df)
        s3 = get_s3_resource()
        # NOTE(review): this excerpt ends here; the upload of `ingesta_pkl`
        # through `s3` is not visible - confirm against the full file.
예제 #7
def predict(sel_model, fe_results, pr_results_pickle_loc):
    """
    Score every entry with the selected model and persist the predictions.

    Builds a dataframe indexed by id with the prediction date, the
    predicted label and the scores of both classes, runs unit tests on the
    inputs and predictions (report and outcome are stored under
    `tests_dir_loc`), pickles the dataframe to `pr_results_pickle_loc` and
    writes the execution metadata as a CSV through `write_csv_from_df`.

    Args:
        sel_model: fitted model exposing `predict` and `predict_proba`.
        fe_results (dict): feature engineering results; the keys
            "data_labels" and "df_imp_engineered_features" are read.
        pr_results_pickle_loc (string): location where the resulting pickle
            object will be stored.

    Returns:
        dataframe: predictions indexed by "ids" with the columns
            "prediction_date", "model_label", "score_label_0" and
            "score_label_1".
    """

    ## Storing time execution metadata
    pr_metadata[pr_metadata_index] = str(datetime.now())

    #### Data IDs
    data_ids = fe_results["data_labels"].index
    data_features = fe_results["df_imp_engineered_features"]

    ## Predicting for every entry and attaching the ids info
    #### Scores are computed once and sliced per class
    pred_labels = sel_model.predict(data_features)
    pred_scores = sel_model.predict_proba(data_features)
    dfp = pd.DataFrame.from_dict({
        "ids": data_ids,
        "prediction_date": [str(datetime.now())[:10]] * len(data_ids),
        "model_label": pred_labels,
        "score_label_0": pred_scores[:, 0],
        "score_label_1": pred_scores[:, 1],
    })
    dfp.set_index("ids", inplace=True)

    #### Running unit test
    class TestPredict(marbles.core.TestCase):
        # Testing for no empty inputs
        def test_inputs_pred(self):
            inputs_ok = [bool(sel_model), bool(fe_results),
                         bool(pr_results_pickle_loc)]
            self.assertEqual(inputs_ok, [True, True, True],
                             note="Your inputs are empty!")

        # Testing to check that not all model labels have the same prediction
        def test_predictions(self):
            two_labels = len(dfp['model_label'].unique()) == 2
            self.assertTrue(two_labels,
                            note="Your predictions have only one value!")

    # Run the suite once, writing the verbose report to disk, and read the
    # outcome from the TestResult instead of scanning the report text.
    suite = unittest.TestLoader().loadTestsFromTestCase(TestPredict)
    with open(tests_dir_loc + 'predict_unittest.txt', 'w') as f:
        result = unittest.TextTestRunner(stream=f, verbosity=2).run(suite)

    if result.wasSuccessful():
        outcome = "PASS"
    else:
        outcome = "FAILED, Your predictions have only one value or empty inputs!"

    res_df = pd.DataFrame([[str(datetime.now()), outcome]],
                          columns=['Date', 'Result'])

    res_df.to_csv(tests_dir_loc + 'predict_unittest.csv', index=False)

    ## Saving results as pickle (handle closed by `with`)
    with open(pr_results_pickle_loc, "wb") as pkl_file:
        pickle.dump(dfp, pkl_file)

    print("\n** Prediction module successfully executed **\n")

    ## Working with module's metadata

    #### Model used to make the predictions
    pr_metadata["predict_model"] = str(sel_model)

    #### Metadata: percentage of positives (1's); 0.0 when none was predicted
    label_shares = dfp["model_label"].value_counts(normalize=True)
    pr_metadata["percentage_positives"] = str(round(label_shares.get(1, 0.0), 2))

    #### Average score for positives (1's)
    pr_metadata["mean_score_positives"] = str(round(dfp["score_label_1"].mean(), 2))

    #### Converting metadata into dataframe and saving locally
    df_meta = pd.DataFrame.from_dict(pr_metadata, orient="index").T
    df_meta.set_index(pr_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, pr_metadata_csv_name)

    return dfp
예제 #8
def transform(df, transformation_pickle_loc):
    """
    Run the transformation step, persist the result and log the run.

    Applies `serious_viols_col`, `create_reference_group` and
    `category_reductions` in that order, saves the result through
    `save_transformation`, writes the execution metadata as a CSV through
    `write_csv_from_df`, and runs a unit test whose report and outcome are
    stored under `tests_dir_loc`.

    Args:
        df (dataframe): data frame coming from the ingestion step.
        transformation_pickle_loc (string): location where the resulting
            pickle object will be stored.

    Returns:
        dataframe: the transformed data frame.
    """

    ## Storing time execution metadata
    transformation_metadata[transformation_metadata_index] = str(
        datetime.now())

    ## Executing transformation functions

    #### List of df's original set of columns
    orig_cols = df.columns

    #### Adding column of serious violations (transformation)
    df = serious_viols_col(df)

    #### Adding column with zip classification
    df = create_reference_group(df)

    #### Reducing the number of categories in data (transformation)
    df = category_reductions(df)

    #### List of df's resulting set of columns
    res_cols = df.columns

    ## Saving results
    save_transformation(df, transformation_pickle_loc)

    ## Working with module's metadata

    #### Storing number of transformations in metadata
    transformation_metadata["trans_count"] = trans_count

    #### String with list of new columns added after transformation (pipe separated)
    #### Sorted so the string does not depend on set iteration order
    transformation_metadata["new_cols"] = " | ".join(
        sorted(set(res_cols) - set(orig_cols)))

    #### Converting metadata into dataframe and saving locally
    df_meta = pd.DataFrame.from_dict(transformation_metadata, orient="index").T
    df_meta.set_index(transformation_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, trans_metadata_csv_name)

    ## Running unit test with marbles
    class TestTransform(marbles.core.TestCase):
        def test_transformation(self):
            # NOTE(review): the head of the original assertion is not
            # visible in this excerpt (only its `note` survived). This
            # stand-in only checks that the result still has columns;
            # restore the intended comparison from the full file.
            self.assertGreater(len(res_cols), 0,
                               note='Oops, DataFrame columns are missing!')

    # Run the suite once, writing the verbose report to disk, and read the
    # outcome from the TestResult instead of scanning the report text.
    suite = unittest.TestLoader().loadTestsFromTestCase(TestTransform)
    with open(tests_dir_loc + 'test_transformation.txt', 'w') as f:
        result = unittest.TextTestRunner(stream=f, verbosity=2).run(suite)

    if result.wasSuccessful():
        outcome = "PASS"
    else:
        outcome = "FAILED,DataFrame columns are missing"

    res_df = pd.DataFrame([[str(datetime.now()), outcome]],
                          columns=['Date', 'Result'])

    res_df.to_csv(tests_dir_loc + 'transform_unittest.csv', index=False)

    ## Printing flag message about execution
    print("\n** Transformation module successfully executed **\n")

    return df