def ingest(df):
    """
    Function designed to execute all ingestion functions.
    args:
        df (dataframe): dataframe that will go through the initial cleaning process
    returns:
        df (dataframe): dataframe resulting from the initial cleaning process
    """

    ## Storing time execution metadata
    extract_metadata[extract_metadata_index] = str(datetime.now())

    ## Executing ingestion functions

    #### Receiving extraction
    df = initial_cleaning(df)

    #### Saving result as pickle
    pickle.dump(df, open(ingestion_pickle_loc, "wb"))

    ## Converting metadata into dataframe and saving locally
    df_meta = pd.DataFrame.from_dict(extract_metadata, orient="index").T
    df_meta.set_index(extract_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, extract_metadata_csv_name)

    ## Running unit test
    class TestExtract(marbles.core.TestCase):
        def test_empty_df(self):
            #### df.shape is a tuple, so it must be compared against a tuple
            self.assertNotEqual(df.shape, (0, 0), note="Your dataframe is empty")

    #### Running the suite once, writing the report to a text file
    suite = unittest.TestLoader().loadTestsFromTestCase(TestExtract)
    with open(tests_dir_loc + 'extract_unittest.txt', 'w') as f:
        unittest.TextTestRunner(stream=f, verbosity=2).run(suite)

    #### Parsing the report into a dated pass/fail log
    res = []
    with open(tests_dir_loc + "extract_unittest.txt") as fp:
        for line in fp.readlines():
            if "FAILED" in line:
                res.append([str(datetime.now()), "FAILED, Your dataframe is empty"])
            if "OK" in line:
                res.append([str(datetime.now()), "PASS"])
    res_df = pd.DataFrame(res, columns=['Date', 'Result'])
    res_df.to_csv(tests_dir_loc + 'extract_unittest.csv', index=False)

    ## Success message
    print("\n** Ingestion module successfully executed **\n")

    return df
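## These module functions rely on shared imports and module-level objects
## (metadata dictionaries, path constants, and helpers such as
## write_csv_from_df). A minimal sketch of that shared header, with names
## inferred from the calls in this file and values that are assumptions:
import os
import pickle
import unittest
from datetime import datetime

import pandas as pd
import marbles.core

#### Assumed path configuration (hypothetical values)
metadata_dir_loc = "results/metadata/"
tests_dir_loc = "results/tests/"
ingestion_pickle_loc = "results/pickles/ingest_df.pkl"

#### Assumed metadata accumulators: one dict and index name per module
extract_metadata_index = "exec_date"
extract_metadata = {}


def write_csv_from_df(df, directory, csv_name):
    """Hypothetical helper: append a one-row metadata dataframe to a CSV."""
    path = directory + csv_name
    df.to_csv(path, mode="a", header=not os.path.exists(path))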
def models_training(fe_results_dict, mt_results_pickle_loc):
    """
    Function designed to execute all modeling functions.
    args:
        fe_results_dict (dictionary): results obtained from the feature engineering module.
        mt_results_pickle_loc (string): location where the resulting pickle object (trained models) will be stored.
    returns:
        mt_results_dict (dictionary): results obtained from the models training module.
    """

    ## Storing time execution metadata
    mt_metadata[mt_metadata_index] = str(datetime.now())

    ## Implementing magic loop to train various models
    models_mloop, X_train, X_test, y_train, y_test = magic_loop(
        models_dict, fe_results_dict)

    ## Saving models training results

    #### Dictionary with all module results
    mt_results_dict = {
        "trained_models": models_mloop,
        "training_data": X_train,
        "training_labels": y_train,
        "test_data": X_test,
        "test_labels": y_test,
    }

    #### Saving dictionary with results as pickle
    pickle.dump(mt_results_dict, open(mt_results_pickle_loc, "wb"))

    print("\n** Models training module successfully executed **\n")

    ## Saving relevant module metadata

    #### Number of models trained
    mt_metadata["no_models_trained"] = len(models_mloop)

    #### Types of models trained (pipe separated)
    mt_metadata["type_models_trained"] = " | ".join(
        [mdl for mdl in models_dict])

    #### Converting metadata into dataframe and saving locally
    df_meta = pd.DataFrame.from_dict(mt_metadata, orient="index").T
    df_meta.set_index(mt_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, mt_metadata_csv_name)

    return mt_results_dict
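## models_dict is defined elsewhere in the project; a hypothetical example of
## the shape magic_loop might consume, assuming sklearn estimators paired
## with hyperparameter grids (the names and grids below are illustrative only):
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

models_dict = {
    "random_forest": {
        "estimator": RandomForestClassifier(),
        "param_grid": {"n_estimators": [100, 300], "max_depth": [5, 10]},
    },
    "decision_tree": {
        "estimator": DecisionTreeClassifier(),
        "param_grid": {"max_depth": [5, 10, None]},
    },
}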
def model_selection(mt_results_dict, ms_results_pickle_loc):
    """
    Function designed to execute all model selection functions.
    args:
        mt_results_dict (dictionary): results obtained from the models training module.
        ms_results_pickle_loc (string): location where the resulting pickle object (best model) will be stored.
    returns:
        ms_results_dict (dictionary): results obtained from the model selection module.
    """

    ## Storing time execution metadata
    ms_metadata[ms_metadata_index] = str(datetime.now())

    ## Selecting best trained model from magic_loop
    best_model = select_best_model(mt_results_dict)

    ## Testing best model with test data
    test_predict_labs, test_predict_scores = best_model_predict_test(
        best_model, mt_results_dict["test_data"])

    ## Saving modeling results

    #### Dictionary with all module results
    ms_results_dict = {
        "best_trained_model": best_model,
        "model_test_predict_labels": test_predict_labs,
        "model_test_predict_scores": test_predict_scores
    }

    #### Saving dictionary with results as pickle
    pickle.dump(ms_results_dict, open(ms_results_pickle_loc, "wb"))

    print("\n** Model selection module successfully executed **\n")

    ## Saving relevant module metadata

    #### Model selected metadata
    ms_metadata["selected_model"] = str(best_model)

    #### Converting metadata into dataframe and saving locally
    df_meta = pd.DataFrame.from_dict(ms_metadata, orient="index").T
    df_meta.set_index(ms_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, ms_metadata_csv_name)

    return ms_results_dict
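## select_best_model and best_model_predict_test are defined elsewhere; a
## minimal sketch of one way the selection could work, assuming (this is an
## assumption) that "trained_models" holds fitted sklearn GridSearchCV objects:
def select_best_model_sketch(mt_results_dict):
    """Hypothetical: return the estimator with the best cross-validation score."""
    best_search = max(mt_results_dict["trained_models"],
                      key=lambda search: search.best_score_)
    return best_search.best_estimator_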
def bias_fairness(df_aeq):
    """
    Function designed to execute all bias and fairness (Aequitas) functions.
    args:
        df_aeq (dataframe): dataframe that will be analyzed by Aequitas according to the selected model.
    returns:
        aeq_results_dict (dictionary): results obtained from the Aequitas module.
    """

    ## Executing Aequitas group, bias and fairness functions
    xtab, conteos_grupo, metricas_absolutas, absolute_metrics = group(df_aeq)
    bdf, disparities, disparities_majority, disparities_min, bias = biasf(
        df_aeq, xtab)
    fairness, gaf, gof = fairnessf(bdf, absolute_metrics, bias)

    ## Storing time execution metadata
    aq_metadata[aq_metadata_index] = str(datetime.now())

    ## Module results
    aeq_results_dict = {
        "xtab_results": xtab,
        "conteos_grupo_results": conteos_grupo,
        "metricas_absolutas_results": metricas_absolutas,
        "bdf_results": bdf,
        "disparities_results": disparities,
        "disparities_majority_results": disparities_majority,
        "disparities_minority_results": disparities_min,
        "fairness_results": fairness,
        "gaf_results": gaf,
        "gof_results": gof,
    }

    ## Saving relevant module metadata

    #### Converting metadata into dataframe and saving locally
    df_meta = pd.DataFrame.from_dict(aq_metadata, orient="index").T
    df_meta.set_index(aq_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, aq_metadata_csv_name)

    print("\n** Aequitas module successfully executed **\n")

    return aeq_results_dict
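## group, biasf and fairnessf are project helpers defined elsewhere; a minimal
## sketch of the Aequitas calls they presumably wrap. The class names come
## from the aequitas package; the reference-group mapping below is a
## hypothetical example, and df_aeq is assumed to carry 'score' and
## 'label_value' columns plus the attribute columns.
from aequitas.group import Group
from aequitas.bias import Bias
from aequitas.fairness import Fairness

def aequitas_sketch(df_aeq):
    """Hypothetical: crosstabs, disparities and fairness metrics in one pass."""
    xtab, _ = Group().get_crosstabs(df_aeq)
    bdf = Bias().get_disparity_predefined_groups(
        xtab,
        original_df=df_aeq,
        ref_groups_dict={"facility_type": "restaurant"},  # hypothetical reference group
        alpha=0.05,
    )
    fdf = Fairness().get_group_value_fairness(bdf)
    return xtab, bdf, fdf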
def feature_engineering(df, fe_results_pickle_loc):
    """
    Function designed to execute all feature engineering functions.
    args:
        df (dataframe): dataframe resulting from the transformation module.
        fe_results_pickle_loc (string): location where the resulting pickle object will be stored.
    returns:
        fe_results_dict (dictionary): results obtained from the feature engineering module.
    """

    ## Storing time execution metadata
    fe_metadata[fe_metadata_index] = str(datetime.now())

    ## Executing feature engineering functions

    #### Metadata: df shape prior to feature engineering
    fe_metadata["dim_prior_fe"] = str(df.shape)

    #### Generating features processed by specific pipeline
    df_features_prc, df_labels, ohe_dict, df_features_prc_cols = feature_generation(df)

    #### Using model to evaluate which features are more important based on threshold
    df_imp_features_prc = feature_selection(df, df_features_prc, df_labels,
                                            df_features_prc_cols, ohe_dict)

    #### Saving all module's results in dictionary
    fe_results_dict = {
        "df_imp_engineered_features": df_imp_features_prc,
        "data_labels": df_labels,
        "ohe_reference": ohe_dict,
        "df_cols_features_engineered": df_features_prc_cols,
        "features_engineered": df_features_prc
    }

    #### Saving fe results as pickle
    pickle.dump(fe_results_dict, open(fe_results_pickle_loc, "wb"))

    print("\n** Feature engineering module successfully executed **\n")

    ## Working with module's metadata

    #### Converting metadata into dataframe and saving locally
    df_meta = pd.DataFrame.from_dict(fe_metadata, orient="index").T
    df_meta.set_index(fe_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, fe_metadata_csv_name)

    #### Running unit test
    class TestFeatureEngineering(marbles.core.TestCase):
        def test_feature_engineering(self):
            res = not bool(fe_results_dict)
            self.assertFalse(res, note="Your dictionary is empty")

    #### Running the suite once, writing the report to a text file
    suite = unittest.TestLoader().loadTestsFromTestCase(TestFeatureEngineering)
    with open(tests_dir_loc + 'test_feature_engineering.txt', 'w') as f:
        unittest.TextTestRunner(stream=f, verbosity=2).run(suite)

    #### Parsing the report into a dated pass/fail log
    res = []
    with open(tests_dir_loc + "test_feature_engineering.txt") as fp:
        for line in fp.readlines():
            if "FAILED" in line:
                res.append([str(datetime.now()), "FAILED, Your dictionary is empty."])
            if "OK" in line:
                res.append([str(datetime.now()), "PASS"])
    res_df = pd.DataFrame(res, columns=['Date', 'Result'])
    res_df.to_csv(tests_dir_loc + 'feature_engineering_unittest.csv', index=False)

    return fe_results_dict
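## Note: predict (below) reads the "data_labels" and
## "df_imp_engineered_features" entries of this dictionary, so those keys
## must stay in sync between the two modules.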
def run(self):

    ###### Name of file inside directories
    path_file = path_file_fn(self.ingest_type)

    ## Location to find most recent local ingestion
    path_full = local_temp_ingestions + self.ingest_type + "/" + self.path_date + path_file

    #### Running unit test
    class TestSaveS3(marbles.core.TestCase):
        def test_size_pkl(self):
            size_pkl = os.path.getsize(path_full) > 0
            self.assertTrue(size_pkl, note="Your pickle's size is 0KB")

    #### Running the suite once, writing the report to a text file
    suite = unittest.TestLoader().loadTestsFromTestCase(TestSaveS3)
    with open(tests_dir_loc + 'saveS3_unittest.txt', 'w') as f:
        unittest.TextTestRunner(stream=f, verbosity=2).run(suite)

    #### Parsing the report into a dated pass/fail log
    res = []
    with open(tests_dir_loc + "saveS3_unittest.txt") as fp:
        for line in fp.readlines():
            if "FAILED" in line:
                res.append([str(datetime.now()), "FAILED, Your pickle's size is 0KB"])
            if "OK" in line:
                res.append([str(datetime.now()), "PASS"])
    res_df = pd.DataFrame(res, columns=['Date', 'Result'])
    res_df.to_csv(tests_dir_loc + 'saveS3_unittest.csv', index=False)

    ## Loading most recent ingestion
    ingesta_df = pickle.load(open(path_full, "rb"))

    ## Obtaining task metadata

    #### Storing time execution metadata
    save_s3_metadata[save_s3_metadata_index] = str(datetime.now())

    #### Bucket where data will be saved
    save_s3_metadata["s3_bucket_name"] = str(self.bucket)

    #### S3 key related to the data
    save_s3_metadata["s3_key_name"] = str(get_key(self.output().path))

    #### Shape of df going into s3
    save_s3_metadata["df_shape"] = str(ingesta_df.shape)

    #### Converting dict to df and writing contents locally
    df_meta = pd.DataFrame.from_dict(save_s3_metadata, orient="index").T
    df_meta.set_index(save_s3_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, save_s3_metadata_csv_name)

    ## Storing object in s3 as pickle
    ingesta_pkl = pickle.dumps(ingesta_df)
    s3 = get_s3_resource()
    s3.put_object(Bucket=self.bucket,
                  Key=get_key(self.output().path),
                  Body=ingesta_pkl)
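## get_s3_resource and get_key are project helpers defined elsewhere; a
## hypothetical boto3-based sketch of what they might look like. Note that
## put_object above is a boto3 *client* method, so the sketch returns a
## client despite the helper's name; the profile name is an assumption.
import boto3

def get_s3_resource_sketch():
    """Hypothetical: build a boto3 S3 client from local credentials."""
    session = boto3.Session(profile_name="default")
    return session.client("s3")

def get_key_sketch(s3_path):
    """Hypothetical: strip the 's3://<bucket>/' prefix to get the object key."""
    return s3_path.split("/", 3)[3]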
def predict(sel_model, fe_results, pr_results_pickle_loc):
    """
    Function designed to execute all prediction functions.
    args:
        sel_model (model object): best trained model obtained from the model selection module.
        fe_results (dictionary): results obtained from the feature engineering module.
        pr_results_pickle_loc (string): location where the resulting pickle object (predictions) will be stored.
    returns:
        dfp (dataframe): dataframe with the model's predicted labels and scores per entry.
    """

    ## Storing time execution metadata
    pr_metadata[pr_metadata_index] = str(datetime.now())

    #### Data IDs and features
    data_ids = fe_results["data_labels"].index
    data_features = fe_results["df_imp_engineered_features"]

    ## Predicting for every entry and attaching the ids info
    dfp = pd.DataFrame.from_dict(
        {
            "ids": data_ids,
            "prediction_date": [str(datetime.now())[:10]] * len(data_ids),
            "model_label": sel_model.predict(data_features),
            "score_label_0": sel_model.predict_proba(data_features)[:, 0],
            "score_label_1": sel_model.predict_proba(data_features)[:, 1],
        }
    )
    dfp.set_index("ids", inplace=True)

    #### Running unit test
    class TestPredict(marbles.core.TestCase):

        #### Testing for no empty inputs
        def test_inputs_pred(self):
            inputs = [bool(sel_model), bool(fe_results), bool(pr_results_pickle_loc)]
            self.assertEqual(inputs, [True, True, True], note="Your inputs are empty!")

        #### Testing that the model predicts both classes, not a single one
        def test_predictions(self):
            res = dfp["model_label"].nunique() == 2
            self.assertTrue(res, note="Your predictions have only one value!")

    #### Running the suite once, writing the report to a text file
    suite = unittest.TestLoader().loadTestsFromTestCase(TestPredict)
    with open(tests_dir_loc + 'predict_unittest.txt', 'w') as f:
        unittest.TextTestRunner(stream=f, verbosity=2).run(suite)

    #### Parsing the report into a dated pass/fail log
    res = []
    with open(tests_dir_loc + "predict_unittest.txt") as fp:
        for line in fp.readlines():
            if "FAILED" in line:
                res.append([str(datetime.now()),
                            "FAILED, Your predictions have only one value or empty inputs!"])
            if "OK" in line:
                res.append([str(datetime.now()), "PASS"])
    res_df = pd.DataFrame(res, columns=['Date', 'Result'])
    res_df.to_csv(tests_dir_loc + 'predict_unittest.csv', index=False)

    ## Saving results as pickle
    pickle.dump(dfp, open(pr_results_pickle_loc, "wb"))

    print("\n** Prediction module successfully executed **\n")

    ## Working with module's metadata

    #### Model used to make the predictions
    pr_metadata["predict_model"] = str(sel_model)

    #### Metadata: percentage of positives (1's)
    pr_metadata["percentage_positives"] = str(
        round(dfp["model_label"].value_counts(normalize=True)[1], 2))

    #### Average score for positives (1's)
    pr_metadata["mean_score_positives"] = str(round(dfp["score_label_1"].mean(), 2))

    #### Converting metadata into dataframe and saving locally
    df_meta = pd.DataFrame.from_dict(pr_metadata, orient="index").T
    df_meta.set_index(pr_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, pr_metadata_csv_name)

    return dfp
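## Hypothetical end-to-end wiring of these modules in their conceptual run
## order; the pickle paths are placeholder examples, and the functions are
## assumed to be importable from their respective project modules.
if __name__ == "__main__":
    df_raw = pickle.load(open("results/pickles/extract_df.pkl", "rb"))
    df_clean = ingest(df_raw)
    df_trans = transform(df_clean, "results/pickles/transform_df.pkl")
    fe_res = feature_engineering(df_trans, "results/pickles/fe_results.pkl")
    mt_res = models_training(fe_res, "results/pickles/mt_results.pkl")
    ms_res = model_selection(mt_res, "results/pickles/ms_results.pkl")
    dfp = predict(ms_res["best_trained_model"], fe_res,
                  "results/pickles/pr_results.pkl")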
def transform(df, transformation_pickle_loc):
    """
    Function designed to execute all transformation functions.
    args:
        df (dataframe): dataframe resulting from the ingestion module.
        transformation_pickle_loc (string): location where the resulting pickle object will be stored.
    returns:
        df (dataframe): dataframe resulting from the transformation functions.
    """

    ## Storing time execution metadata
    transformation_metadata[transformation_metadata_index] = str(datetime.now())

    ## Executing transformation functions

    #### List of df's original set of columns
    orig_cols = df.columns

    #### Adding column of serious violations (transformation)
    df = serious_viols_col(df)

    #### Adding column with zip classification
    df = create_reference_group(df)

    #### Reducing the number of categories in data (transformation)
    df = category_reductions(df)

    #### List of df's resulting set of columns
    res_cols = df.columns

    ## Saving results
    save_transformation(df, transformation_pickle_loc)

    ## Working with module's metadata

    #### Storing number of transformations in metadata (trans_count is expected to be defined at module level)
    transformation_metadata["trans_count"] = trans_count

    #### String with list of new columns added after transformation (pipe separated)
    transformation_metadata["new_cols"] = " | ".join(set(res_cols) - set(orig_cols))

    #### Converting metadata into dataframe and saving locally
    df_meta = pd.DataFrame.from_dict(transformation_metadata, orient="index").T
    df_meta.set_index(transformation_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, trans_metadata_csv_name)

    ## Running unit test with marbles
    class TestTransform(marbles.core.TestCase):
        def test_transformation(self):
            self.assertEqual(df.shape[1], 10,
                             note='Oops, DataFrame columns are missing!')

    #### Running the suite once, writing the report to a text file
    suite = unittest.TestLoader().loadTestsFromTestCase(TestTransform)
    with open(tests_dir_loc + 'test_transformation.txt', 'w') as f:
        unittest.TextTestRunner(stream=f, verbosity=2).run(suite)

    #### Parsing the report into a dated pass/fail log
    res = []
    with open(tests_dir_loc + "test_transformation.txt") as fp:
        for line in fp.readlines():
            if "FAILED" in line:
                res.append([str(datetime.now()), "FAILED, DataFrame columns are missing"])
            if "OK" in line:
                res.append([str(datetime.now()), "PASS"])
    res_df = pd.DataFrame(res, columns=['Date', 'Result'])
    res_df.to_csv(tests_dir_loc + 'transform_unittest.csv', index=False)

    ## Printing flag message about execution
    print("\n** Transformation module successfully executed **\n")

    return df
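## The run-suite / write-report / parse-FAILED-or-OK / write-CSV pattern above
## is copy-pasted across five modules; a sketch of how it could be factored
## into one shared helper (the name and signature are hypothetical):
def log_unittest_results(test_case, txt_name, csv_name, fail_msg):
    """Hypothetical helper: run a marbles test case and log a dated result CSV."""
    suite = unittest.TestLoader().loadTestsFromTestCase(test_case)
    with open(tests_dir_loc + txt_name, 'w') as f:
        unittest.TextTestRunner(stream=f, verbosity=2).run(suite)
    res = []
    with open(tests_dir_loc + txt_name) as fp:
        for line in fp.readlines():
            if "FAILED" in line:
                res.append([str(datetime.now()), "FAILED, " + fail_msg])
            if "OK" in line:
                res.append([str(datetime.now()), "PASS"])
    pd.DataFrame(res, columns=['Date', 'Result']).to_csv(
        tests_dir_loc + csv_name, index=False)

#### e.g. log_unittest_results(TestTransform, 'test_transformation.txt',
####      'transform_unittest.csv', "DataFrame columns are missing")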