def run(self):
    ## Establishing connection with S3
    s3 = get_s3_resource()

    ## Loading latest model
    objects = s3.list_objects_v2(Bucket=self.bucket, Prefix=ms_aws_key)['Contents']
    obj_path = [file["Key"] for file in objects][-1]
    response = s3.get_object(Bucket=self.bucket, Key=obj_path)

    #### Latest model stored in S3
    sel_model = pickle.loads(response["Body"].read())["best_trained_model"]

    ## Loading most recent data that will be fed to the model
    fe_results_s3_pth = 'feature_engineering/feature_engineering_' + today_info + '.pkl'
    fe_results_pkl = s3.get_object(Bucket=self.bucket, Key=fe_results_s3_pth)
    fe_results = pickle.loads(fe_results_pkl['Body'].read())

    ## Executing prediction master function
    dfp = predict(sel_model, fe_results, pr_results_pickle_loc)

    ## Storing local dataframe with results for the API (model exposure)
    dfp.to_csv(api_store_preds_data)

    ## Storing local dataframe with results for model monitoring
    dfp.to_csv(api_monitor_data)

    ## Storing results in s3 as pickle
    pr_results_pickle = pickle.dumps(dfp)
    s3.put_object(Bucket=self.bucket,
                  Key=get_key(self.output().path),
                  Body=pr_results_pickle)
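## The tasks in this pipeline rely on two small helpers, `get_s3_resource` and
## `get_key`, that are not shown here. Below is a minimal sketch of what they
## might look like, assuming boto3 and a local credentials file at
## "conf/local/credentials.yaml" with an "s3" section; the function names are
## suffixed with "_sketch" because this is an illustration, not the project's
## confirmed implementation. Note that the objects returned by
## `get_s3_resource` are used with client-style calls (list_objects_v2,
## get_object, put_object), which matches a boto3 client.
import boto3
import yaml


def get_s3_resource_sketch(credentials_path="conf/local/credentials.yaml"):
    """Hypothetical helper: build an S3 client from a local credentials file."""
    with open(credentials_path) as f:
        creds = yaml.safe_load(f)["s3"]  # assumed credentials layout
    return boto3.client(
        "s3",
        aws_access_key_id=creds["aws_access_key_id"],
        aws_secret_access_key=creds["aws_secret_access_key"],
    )


def get_key_sketch(s3_path):
    """Hypothetical helper: strip the bucket from an s3://bucket/key path."""
    return s3_path.replace("s3://", "").split("/", 1)[1]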
def run(self):
    ## Applies the transformation to the most recent ingestion
    # Add data reading from AWS.

    ## Establishing connection with S3
    s3 = get_s3_resource()

    ## Get extraction path
    path_file = path_file_fn(self.ingest_type)

    ## Define the path where the ingestion is stored in s3
    extract_path_start = "{}/{}/".format('ingestion', self.ingest_type)
    extract_pickle_loc_s3 = extract_path_start + self.path_date + path_file

    ## Reading data from s3
    s3_ingestion = s3.get_object(Bucket=self.bucket, Key=extract_pickle_loc_s3)
    ingestion_pickle_loc_ok = pickle.loads(s3_ingestion['Body'].read())
    ingestion_df = pd.DataFrame(ingestion_pickle_loc_ok)

    ## Transforming the ingestion and storing the result in s3 as pickle
    transformation = pickle.dumps(
        transform(ingestion_df, transformation_pickle_loc))
    s3.put_object(Bucket=self.bucket,
                  Key=get_key(self.output().path),
                  Body=transformation)
def run(self):
    ## Establishing connection with S3
    s3 = get_s3_resource()

    ## Loading the most recent feature engineering results from s3
    fe_pickle_loc_s3 = 'feature_engineering/feature_engineering_' + today_info + '.pkl'
    fe_results_pkl = s3.get_object(Bucket=self.bucket, Key=fe_pickle_loc_s3)
    fe_results_dict = pickle.loads(fe_results_pkl['Body'].read())

    ## Training the models and storing the results in s3 as pickle
    mt_results_dict = models_training(fe_results_dict, mt_results_pickle_loc)
    mt_pkl = pickle.dumps(mt_results_dict)
    s3.put_object(Bucket=self.bucket,
                  Key=get_key(self.output().path),
                  Body=mt_pkl)
def request_data_to_API(ingest_type, bucket_name):
    """
    Save data downloaded with Chicago's API.

    :param ingest_type: type of ingestion ("initial" or "consecutive")
    :param bucket_name: name of the S3 bucket where data will be stored
    :return: downloaded data (ingesta)
    """
    ## Getting s3 resource to store data in s3
    s3 = get_s3_resource()

    ## Read token from credentials file
    token = get_api_token("conf/local/credentials.yaml")

    ## Getting client to download data with the API
    client = get_client(token)

    ## Downloading data and storing it temporarily on the local machine prior to uploading to s3
    if ingest_type == "initial":
        ## Requesting all data from the API
        ingesta = ingesta_inicial(client)
        create_path_ingestion(ingest_type)
    elif ingest_type == "consecutive":
        ## Finding most recent date in consecutive pickles
        pkl_mrd = most_recent_lcl_for_cons()
        print("**** Consecutive data will be downloaded from {} ****".format(pkl_mrd))
        print("********")
        create_path_ingestion(ingest_type)
        ## Building query to download data of interest
        soql_query = "inspection_date >= '{}'".format(pkl_mrd)
        #ingesta = pickle.dumps(ingesta_consecutiva(client, soql_query))
        ingesta = ingesta_consecutiva(client, soql_query)
    else:
        raise NameError('Invalid parameter')

    ## Obtaining and storing ingestion metadata
    return ingesta
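## `get_client`, `ingesta_inicial` and `ingesta_consecutiva` are defined
## elsewhere in the project. A minimal sketch of the consecutive download is
## shown below, assuming the Socrata client from sodapy (Chicago's open data
## portal is a Socrata endpoint). The dataset id "4ijn-s7e5" (food inspections)
## and the row limit are illustrative assumptions, not confirmed by this code.
from sodapy import Socrata


def get_client_sketch(token):
    ## Hypothetical: Socrata client for Chicago's open data portal
    return Socrata("data.cityofchicago.org", token)


def ingesta_consecutiva_sketch(client, soql_query, limit=300000):
    ## Hypothetical: pull only rows matching the SoQL filter built above,
    ## e.g. "inspection_date >= '2021-04-01'"
    return client.get("4ijn-s7e5", where=soql_query, limit=limit)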
def run(self):
    s3 = get_s3_resource()

    transformation_pickle_loc_s3 = 'transformation/transformation_' + today_info + '.pkl'
    feature_engineering_luigi = s3.get_object(Bucket=self.bucket,
                                              Key=transformation_pickle_loc_s3)
    df_pre_fe = pickle.loads(feature_engineering_luigi['Body'].read())

    df_post_fe = feature_engineering(df_pre_fe, fe_results_pickle_loc)

    fe_results_dict = pickle.dumps(df_post_fe)
    s3.put_object(Bucket=self.bucket,
                  Key=get_key(self.output().path),
                  Body=fe_results_dict)
def run(self):
    s3 = get_s3_resource()

    mt_pickle_loc_s3 = 'trained_models/trained_models_' + today_info + '.pkl'
    mt_results_pkl = s3.get_object(Bucket=self.bucket, Key=mt_pickle_loc_s3)
    mt_results_dict = pickle.loads(mt_results_pkl['Body'].read())

    ms_results_dict = model_selection(mt_results_dict, ms_results_pickle_loc)

    ms_results_pkl = pickle.dumps(ms_results_dict)
    s3.put_object(Bucket=self.bucket,
                  Key=get_key(self.output().path),
                  Body=ms_results_pkl)
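## `model_selection` is defined elsewhere; the downstream tasks read the keys
## "best_trained_model" and "model_test_predict_labels" from its pickle. The
## sketch below assumes `mt_results_dict` maps model names to fitted estimators
## plus their test-set scores and features ("models", "test_scores" and
## "test_features" are assumed keys, used only for illustration).
def model_selection_sketch(mt_results_dict, output_pickle_loc):
    models = mt_results_dict["models"]                 # assumed: {name: fitted estimator}
    scores = mt_results_dict["test_scores"]            # assumed: {name: test-set metric}
    test_features = mt_results_dict["test_features"]   # assumed key

    ## Pick the estimator with the best test-set metric
    best_name = max(scores, key=scores.get)
    best_model = models[best_name]

    ms_results_dict = {
        "best_trained_model": best_model,
        "model_test_predict_labels": best_model.predict(test_features),
    }
    pickle.dump(ms_results_dict, open(output_pickle_loc, "wb"))
    return ms_results_dict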
def run(self):
    ## Establishing connection with S3
    s3 = get_s3_resource()

    ## Building aequitas dataframe based on previous models' results

    #### (Models training results) - building initial df with unique IDs and real test labels
    mt_results_s3_pth = 'trained_models/trained_models_' + today_info + '.pkl'
    mt_results_pkl = s3.get_object(Bucket=self.bucket, Key=mt_results_s3_pth)
    mt_results = pickle.loads(mt_results_pkl['Body'].read())
    df_aeq = mt_results["test_labels"].to_frame()

    #### (Model selection results) - adding labels predicted by the best model
    ms_results_s3_pth = 'model_selection/selected_model_' + today_info + '.pkl'
    ms_results_pkl = s3.get_object(Bucket=self.bucket, Key=ms_results_s3_pth)
    ms_results = pickle.loads(ms_results_pkl['Body'].read())
    df_aeq["score"] = ms_results["model_test_predict_labels"]

    #### (Transformation results) - adding zip and reference group
    tr_results_s3_pth = 'transformation/transformation_' + today_info + '.pkl'
    tr_results_pkl = s3.get_object(Bucket=self.bucket, Key=tr_results_s3_pth)
    tr_results = pickle.loads(tr_results_pkl['Body'].read())
    df_aeq = df_aeq.join(tr_results.loc[:, ["zip-income-class"]], how="inner")

    #### Renaming columns for aequitas analysis
    df_aeq.rename(columns={
        "label": "label_value",
        "model_test_predict_labels": "score",
        "zip-income-class": "reference_group"
    }, inplace=True)

    #### Resetting the "inspection-id" index (dropped, not kept as a column)
    df_aeq.reset_index(inplace=True, drop=True)

    print("***********")
    print(df_aeq.columns)
    print("***********")

    df_aeq2 = df_aeq.drop("score", axis=1)

    ## Running unit test
    class TestBiasFairness(marbles.core.TestCase):
        def test_df_aeq(self):
            columns_names = list(df_aeq.columns)
            df_expected_names = ['label_value', 'score', 'reference_group']
            print("")
            print("***********")
            print(list(df_aeq2.columns))
            print(len(columns_names))
            print("shape[1]: ", df_aeq2.shape[1])
            print(len(df_expected_names))
            print("***********")
            self.assertEqual(int(df_aeq.shape[1]),
                             3,
                             note='Oops, columns are missing!')

    stream = StringIO()
    runner = unittest.TextTestRunner(stream=stream)
    result = runner.run(unittest.makeSuite(TestBiasFairness))
    suite = unittest.TestLoader().loadTestsFromTestCase(TestBiasFairness)
    with open(tests_dir_loc + 'test_bias_fairness.txt', 'w') as f:
        unittest.TextTestRunner(stream=f, verbosity=2).run(suite)

    res = []
    with open(tests_dir_loc + "test_bias_fairness.txt") as fp:
        lines = fp.readlines()
        for line in lines:
            if "FAILED" in line:
                res.append([str(datetime.now()), "FAILED, Columns are missing."])
            if "OK" in line:
                res.append([str(datetime.now()), "PASS"])
    res_df = pd.DataFrame(res, columns=['Date', 'Result'])
    res_df.to_csv(tests_dir_loc + 'bias_fairness_unittest.csv', index=False)

    ## Running bias_fairness and saving results
    aeq_results_dict = bias_fairness(df_aeq)
    pickle.dump(aeq_results_dict, open(aq_results_pickle_loc, "wb"))
    aq_pickle = pickle.dumps(aeq_results_dict)
    s3.put_object(Bucket=self.bucket,
                  Key=get_key(self.output().path),
                  Body=aq_pickle)
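## `bias_fairness` is defined elsewhere; given that `df_aeq` carries the
## aequitas-style columns 'score', 'label_value' and one attribute column
## ('reference_group'), a plausible sketch using the aequitas library follows.
## The reference value "high" is a placeholder assumption, and the returned
## dict keys are illustrative only.
from aequitas.group import Group
from aequitas.bias import Bias


def bias_fairness_sketch(df_aeq):
    ## Group metrics (counts, FPR, FNR, etc.) per attribute value
    xtab, _ = Group().get_crosstabs(df_aeq)
    ## Disparities relative to a predefined reference group
    bdf = Bias().get_disparity_predefined_groups(
        xtab,
        original_df=df_aeq,
        ref_groups_dict={"reference_group": "high"},  # assumed reference value
        alpha=0.05,
    )
    return {"group_crosstabs": xtab, "bias_disparities": bdf}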
def run(self):
    ## Name of file inside directories
    path_file = path_file_fn(self.ingest_type)

    ## Location of the most recent local ingestion
    path_full = local_temp_ingestions + self.ingest_type + "/" + self.path_date + path_file

    #### Running unit test
    class TestSaveS3(marbles.core.TestCase):
        def test_size_pkl(self):
            size_pkl = os.path.getsize(path_full) > 0
            self.assertTrue(size_pkl, note="Your pickle's size is 0 KB")

    stream = StringIO()
    runner = unittest.TextTestRunner(stream=stream)
    result = runner.run(unittest.makeSuite(TestSaveS3))
    suite = unittest.TestLoader().loadTestsFromTestCase(TestSaveS3)
    with open(tests_dir_loc + 'saveS3_unittest.txt', 'w') as f:
        unittest.TextTestRunner(stream=f, verbosity=2).run(suite)

    res = []
    with open(tests_dir_loc + "saveS3_unittest.txt") as fp:
        lines = fp.readlines()
        for line in lines:
            if "FAILED" in line:
                res.append([str(datetime.now()), "FAILED, your pickle's size is 0 KB"])
            if "OK" in line:
                res.append([str(datetime.now()), "PASS"])
    res_df = pd.DataFrame(res, columns=['Date', 'Result'])
    res_df.to_csv(tests_dir_loc + 'saveS3_unittest.csv', index=False)

    ## Loading most recent ingestion
    ingesta_df = pickle.load(open(path_full, "rb"))

    ## Obtaining task metadata
    #### Storing time execution metadata
    save_s3_metadata[save_s3_metadata_index] = str(datetime.now())
    #### Bucket where data will be saved
    save_s3_metadata["s3_bucket_name"] = str(self.bucket)
    #### S3 key related to the data
    save_s3_metadata["s3_key_name"] = str(get_key(self.output().path))
    #### Shape of df going into s3
    save_s3_metadata["df_shape"] = str(ingesta_df.shape)

    #### Converting dict to df and writing contents to csv
    df_meta = pd.DataFrame.from_dict(save_s3_metadata, orient="index").T
    df_meta.set_index(save_s3_metadata_index, inplace=True)
    write_csv_from_df(df_meta, metadata_dir_loc, save_s3_metadata_csv_name)

    ## Storing object in s3 as pickle
    ingesta_pkl = pickle.dumps(ingesta_df)
    s3 = get_s3_resource()
    s3.put_object(Bucket=self.bucket,
                  Key=get_key(self.output().path),
                  Body=ingesta_pkl)
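## `write_csv_from_df` is a small project helper used to persist the metadata
## dataframe above. A sketch of an append-style implementation, assuming the
## CSV should keep a header only when the file is first created; this is an
## illustration, not the project's confirmed helper.
import os


def write_csv_from_df_sketch(df, directory, csv_name):
    path = os.path.join(directory, csv_name)
    ## Append rows, writing the header only if the file does not exist yet
    df.to_csv(path, mode="a", header=not os.path.exists(path))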