Example #1
    def run(self):

        ## Establishing connection with S3
        s3 = get_s3_resource()

        ## Loading latest model
        objects = s3.list_objects_v2(Bucket=self.bucket,
                                     Prefix=ms_aws_key)['Contents']
        obj_path = [file["Key"] for file in objects][-1]
        response = s3.get_object(Bucket=self.bucket, Key=obj_path)

        #### Latest model stored in S3
        sel_model = pickle.loads(response["Body"].read())["best_trained_model"]

        ## Loading most recent data that will be fed to the model
        fe_results_s3_pth = 'feature_engineering/feature_engineering_' + today_info + '.pkl'
        fe_results_pkl = s3.get_object(Bucket=self.bucket,
                                       Key=fe_results_s3_pth)
        fe_results = pickle.loads(fe_results_pkl['Body'].read())

        ## Executing prediction master function
        dfp = predict(sel_model, fe_results, pr_results_pickle_loc)

        ## Storing local dataframe with results for the API (model serving)
        dfp.to_csv(api_store_preds_data)

        ## Storing local dataframe with results for model monitoring
        dfp.to_csv(api_monitor_data)

        ## Storing results in s3 as pickle
        pr_results_pickle = pickle.dumps(dfp)
        s3.put_object(Bucket=self.bucket,
                      Key=get_key(self.output().path),
                      Body=pr_results_pickle)
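
Example #1 (and every example below) leans on two helpers that are not shown, get_s3_resource() and get_key(). A minimal sketch of what they might look like, assuming boto3 and s3:// output paths; the real project's implementations may differ (e.g. by reading credentials from conf/local/credentials.yaml):

# Hypothetical sketch of the S3 helpers used throughout these tasks.
import boto3

def get_s3_resource():
    # A boto3 client exposes list_objects_v2 / get_object / put_object exactly
    # as they are called in the run() methods above.
    return boto3.client("s3")

def get_key(s3_path):
    # Turn an output path such as "s3://bucket-name/some/key.pkl" into "some/key.pkl".
    return s3_path.split("//", 1)[-1].split("/", 1)[-1]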
Example #2
    def run(self):

        ## Transforming the most recent ingestion

        # Add reading of data from AWS.

        ## Establishing connection with S3
        s3 = get_s3_resource()

        ## Get extraction path

        path_file = path_file_fn(self.ingest_type)

        ## Define the path where the ingestion will be stored in s3
        extract_path_start = "{}/{}/".format(
            'ingestion',
            self.ingest_type,
        )
        extract_pickle_loc_s3 = extract_path_start + self.path_date + path_file

        ## Reading data from S3

        s3_ingestion = s3.get_object(Bucket=self.bucket,
                                     Key=extract_pickle_loc_s3)

        ingestion_pickle_loc_ok = pickle.loads(s3_ingestion['Body'].read())

        ingestion_df = pd.DataFrame(ingestion_pickle_loc_ok)

        transformation = pickle.dumps(
            transform(ingestion_df, transformation_pickle_loc))

        s3.put_object(Bucket=self.bucket,
                      Key=get_key(self.output().path),
                      Body=transformation)
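
Examples #1 and #2 (and most of the examples below) repeat the same pickle round-trip against S3: get_object plus pickle.loads on the way in, pickle.dumps plus put_object on the way out. A small pair of hypothetical helpers could factor that out; the names here are illustrative, not part of the project:

import pickle

def read_pickle_from_s3(s3, bucket, key):
    # get_object + pickle.loads, as done inline in each task above
    response = s3.get_object(Bucket=bucket, Key=key)
    return pickle.loads(response["Body"].read())

def write_pickle_to_s3(s3, bucket, key, obj):
    # pickle.dumps + put_object, as done inline in each task above
    s3.put_object(Bucket=bucket, Key=key, Body=pickle.dumps(obj))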
Example #3
    def run(self):

        ## Establishing connection with S3
        s3 = get_s3_resource()

        ## Loading most recent feature engineering results from S3
        fe_pickle_loc_s3 = 'feature_engineering/feature_engineering_' + today_info + '.pkl'
        fe_results_pkl = s3.get_object(Bucket=self.bucket, Key=fe_pickle_loc_s3)
        fe_results_dict = pickle.loads(fe_results_pkl['Body'].read())

        ## Executing models training master function
        mt_results_dict = models_training(fe_results_dict, mt_results_pickle_loc)

        ## Storing results in S3 as pickle
        mt_pkl = pickle.dumps(mt_results_dict)
        s3.put_object(Bucket=self.bucket, Key=get_key(self.output().path), Body=mt_pkl)
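
These run() methods clearly belong to Luigi tasks (they reference self.bucket and self.output().path). For context, a minimal sketch of the surrounding task class, assuming luigi.contrib.s3.S3Target outputs and a date suffix like today_info; the class name, parameter set, and key layout are assumptions, not the project's actual definitions:

import luigi
from luigi.contrib.s3 import S3Target
from datetime import datetime

# Assumed date suffix matching the today_info convention used in the keys above
today_info = datetime.now().strftime("%Y_%m_%d")

class ModelsTraining(luigi.Task):
    # Referenced as self.bucket in the run() bodies above
    bucket = luigi.Parameter()

    def output(self):
        # self.output().path is what get_key() parses back into an S3 key
        return S3Target("s3://{}/trained_models/trained_models_{}.pkl".format(
            self.bucket, today_info))

    def run(self):
        # Body as in Example #3: read the upstream pickle, train, write results back to S3
        ...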
Example #4
def request_data_to_API(ingest_type, bucket_name):
    """
    Saving data donwloaded with Chicago's API
    :param ingest_type:
    :param bucket_name:
    :return:
    """

    ## Getting s3 resource to store data in s3.
    s3 = get_s3_resource()

    ## Read token from credentials file
    token = get_api_token("conf/local/credentials.yaml")

    ## Getting client to download data with API
    client = get_client(token)

    ## Downloading data and storing it temporarily on the local machine prior to uploading to S3
    if ingest_type == "initial":

        ## Requesting all data from API
        ingesta = ingesta_inicial(client)

        create_path_ingestion(ingest_type)

    elif ingest_type == "consecutive":

        ## Finding most recent date in consecutive pickles
        pkl_mrd = most_recent_lcl_for_cons()
        print("**** Consecutive data will be downloaded from {} ****".format(
            pkl_mrd))
        print("********")

        create_path_ingestion(ingest_type)

        ## Building query to download data of interest
        soql_query = "inspection_date >= '{}'".format(pkl_mrd)

        #ingesta = pickle.dumps(ingesta_consecutiva(client, soql_query))
        ingesta = ingesta_consecutiva(client, soql_query)

    else:
        raise NameError('Invalid parameter')

    ## Obtaining and storing ingestion metadata

    return ingesta
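
The helpers get_client(), ingesta_inicial() and ingesta_consecutiva() are not shown. Given the SoQL filter on inspection_date, they presumably wrap a Socrata client against Chicago's food-inspections data; a hedged sketch, in which the sodapy client, the dataset id and the row limits are all assumptions:

from sodapy import Socrata

FOOD_INSPECTIONS_DATASET = "4ijn-s7e5"  # assumed Chicago food-inspections dataset id

def get_client(token):
    # Client authenticated against Chicago's open-data portal
    return Socrata("data.cityofchicago.org", token)

def ingesta_inicial(client, limit=300000):
    # Full historical download for the "initial" ingestion
    return client.get(FOOD_INSPECTIONS_DATASET, limit=limit)

def ingesta_consecutiva(client, soql_query, limit=1000):
    # Incremental download filtered by the where-clause built in request_data_to_API()
    return client.get(FOOD_INSPECTIONS_DATASET, where=soql_query, limit=limit)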
Example #5
    def run(self):

        ## Establishing connection with S3
        s3 = get_s3_resource()

        ## Loading most recent transformation results from S3
        transformation_pickle_loc_s3 = 'transformation/transformation_' + today_info + '.pkl'
        feature_engineering_luigi = s3.get_object(
            Bucket=self.bucket, Key=transformation_pickle_loc_s3)
        df_pre_fe = pickle.loads(feature_engineering_luigi['Body'].read())

        ## Executing feature engineering master function
        df_post_fe = feature_engineering(df_pre_fe, fe_results_pickle_loc)

        ## Storing results in S3 as pickle
        fe_results_dict = pickle.dumps(df_post_fe)
        s3.put_object(Bucket=self.bucket,
                      Key=get_key(self.output().path),
                      Body=fe_results_dict)
Example #6
    def run(self):

        ## Establishing connection with S3
        s3 = get_s3_resource()

        ## Loading most recent models training results from S3
        mt_pickle_loc_s3 = 'trained_models/trained_models_' + today_info + '.pkl'
        mt_results_pkl = s3.get_object(Bucket=self.bucket,
                                       Key=mt_pickle_loc_s3)
        mt_results_dict = pickle.loads(mt_results_pkl['Body'].read())

        ## Executing model selection master function
        ms_results_dict = model_selection(mt_results_dict,
                                          ms_results_pickle_loc)

        ## Storing results in S3 as pickle
        ms_results_pkl = pickle.dumps(ms_results_dict)
        s3.put_object(Bucket=self.bucket,
                      Key=get_key(self.output().path),
                      Body=ms_results_pkl)
Example #7
    def run(self):

        ## Establishing connection with S3
        s3 = get_s3_resource()

        ## Building aequitas dataframe from previous models' results

        #### (Models training results) - building initial df with unique IDs and real test labels
        mt_results_s3_pth = 'trained_models/trained_models_' + today_info + '.pkl'
        mt_results_pkl = s3.get_object(Bucket=self.bucket,
                                       Key=mt_results_s3_pth)
        mt_results = pickle.loads(mt_results_pkl['Body'].read())
        df_aeq = mt_results["test_labels"].to_frame()

        #### (Model selection results) - adding labels predicted by best model
        ms_results_s3_pth = 'model_selection/selected_model_' + today_info + '.pkl'
        ms_results_pkl = s3.get_object(Bucket=self.bucket,
                                       Key=ms_results_s3_pth)
        ms_results = pickle.loads(ms_results_pkl['Body'].read())
        df_aeq["score"] = ms_results["model_test_predict_labels"]

        #### (Transformation results) - adding zip and reference group
        tr_results_s3_pth = 'transformation/transformation_' + today_info + '.pkl'
        tr_results_pkl = s3.get_object(Bucket=self.bucket,
                                       Key=tr_results_s3_pth)
        tr_results = pickle.loads(tr_results_pkl['Body'].read())
        df_aeq = df_aeq.join(tr_results.loc[:, ["zip-income-class"]],
                             how="inner")

        #### Renaming columns for aequitas analysis
        df_aeq.rename(columns={
            "label": "label_value",
            "model_test_predict_labels": "score",
            "zip-income-class": "reference_group"
        },
                      inplace=True)

        #### Resetting the index (drop=True discards the "inspection-id" index)
        df_aeq.reset_index(inplace=True, drop=True)

        print("***********")
        print(df_aeq.columns)
        print("***********")

        df_aeq2 = df_aeq.drop("score", axis=1)

        ## Running unit test
        class TestBiasFairness(marbles.core.TestCase):
            def test_df_aeq(self):
                columns_names = list(df_aeq.columns)
                df_expected_names = ['label_value', 'score', 'reference_group']
                print("")
                print("***********")
                print(list(df_aeq2.columns))
                print(len(columns_names))
                print("shape[1]: ", df_aeq2.shape[1])
                print(len(df_expected_names))
                print("***********")
                self.assertEqual(int(df_aeq.shape[1]),
                                 3,
                                 note='Oops, columns are missing!')

        stream = StringIO()
        runner = unittest.TextTestRunner(stream=stream)
        result = runner.run(unittest.makeSuite(TestBiasFairness))

        suite = unittest.TestLoader().loadTestsFromTestCase(TestBiasFairness)

        with open(tests_dir_loc + 'test_bias_fairness.txt', 'w') as f:
            unittest.TextTestRunner(stream=f, verbosity=2).run(suite)

        res = []
        with open(tests_dir_loc + "test_bias_fairness.txt") as fp:
            lines = fp.readlines()
            for line in lines:
                if "FAILED" in line:
                    res.append(
                        [str(datetime.now()), "FAILED, Columns are missing."])
                if "OK" in line:
                    res.append([str(datetime.now()), "PASS"])

        res_df = pd.DataFrame(res, columns=['Date', 'Result'])

        res_df.to_csv(tests_dir_loc + 'bias_fairness_unittest.csv',
                      index=False)

        ## Running bias and fairness analysis and saving results

        aeq_results_dict = bias_fairness(df_aeq)

        pickle.dump(aeq_results_dict, open(aq_results_pickle_loc, "wb"))
        aq_pickle = pickle.dumps(aeq_results_dict)

        s3.put_object(Bucket=self.bucket,
                      Key=get_key(self.output().path),
                      Body=aq_pickle)
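
The bias_fairness() master function is not shown. Since df_aeq carries exactly the label_value, score and reference_group columns that aequitas expects, a plausible sketch is the following; the reference category ("medium") and the metrics kept in the output dictionary are assumptions:

from aequitas.group import Group
from aequitas.bias import Bias

def bias_fairness(df_aeq):
    # aequitas expects attribute columns as strings
    df = df_aeq.copy()
    df["reference_group"] = df["reference_group"].astype(str)

    # Group metrics (FPR, FNR, etc.) per value of reference_group
    group = Group()
    xtab, _ = group.get_crosstabs(df)

    # Disparities relative to an assumed reference category
    bias = Bias()
    bias_df = bias.get_disparity_predefined_groups(
        xtab, original_df=df, ref_groups_dict={"reference_group": "medium"})

    return {"group_metrics": xtab, "bias_metrics": bias_df}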
Example #8
    def run(self):

        ###### Name of file inside directories

        path_file = path_file_fn(self.ingest_type)

        ## Location to find most recent local ingestion
        path_full = local_temp_ingestions + self.ingest_type + "/" + self.path_date + path_file

        #### Running unit test
        class TestSaveS3(marbles.core.TestCase):
            def test_size_pkl(self):
                size_pkl = os.path.getsize(path_full) > 0
                self.assertTrue(size_pkl, note="Your pickle's size is 0 KB")

        stream = StringIO()
        runner = unittest.TextTestRunner(stream=stream)
        result = runner.run(unittest.makeSuite(TestSaveS3))
        suite = unittest.TestLoader().loadTestsFromTestCase(TestSaveS3)

        with open(tests_dir_loc + 'saveS3_unittest.txt', 'w') as f:
            unittest.TextTestRunner(stream=f, verbosity=2).run(suite)

        res = []
        with open(tests_dir_loc + "saveS3_unittest.txt") as fp:
            lines = fp.readlines()
            for line in lines:
                if "FAILED" in line:
                    res.append([
                        str(datetime.now()),
                        "FAILED, Your pickle's size is OKB"
                    ])
                if "OK" in line:
                    res.append([str(datetime.now()), "PASS"])

        res_df = pd.DataFrame(res, columns=['Date', 'Result'])

        res_df.to_csv(tests_dir_loc + 'saveS3_unittest.csv', index=False)

        ## Loading most recent ingestion
        ingesta_df = pickle.load(open(path_full, "rb"))

        ## Obtaining task metadata

        #### Storing time execution metadata
        save_s3_metadata[save_s3_metadata_index] = str(datetime.now())
        #### Bucket where data will be saved
        save_s3_metadata["s3_bucket_name"] = str(self.bucket)
        #### S3 key related to the data
        save_s3_metadata["s3_key_name"] = str(get_key(self.output().path))
        #### Shape of df going into s3
        save_s3_metadata["df_shape"] = str(ingesta_df.shape)

        #### Converting dict to df and writing contents to CSV
        df_meta = pd.DataFrame.from_dict(save_s3_metadata, orient="index").T
        df_meta.set_index(save_s3_metadata_index, inplace=True)
        write_csv_from_df(df_meta, metadata_dir_loc, save_s3_metadata_csv_name)

        ## Storing object in s3 as pickle
        ingesta_pkl = pickle.dumps(ingesta_df)
        s3 = get_s3_resource()
        s3.put_object(Bucket=self.bucket,
                      Key=get_key(self.output().path),
                      Body=ingesta_pkl)
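
Examples #7 and #8 duplicate the same test-logging pattern: run a marbles/unittest TestCase, write the verbose report to a .txt file, then re-read that file to decide whether to log PASS or FAILED in a .csv. A hypothetical helper could consolidate it and use the TestResult directly instead of re-parsing the report; the function and file names are illustrative only:

import unittest
from datetime import datetime
import pandas as pd

def run_and_log_unittest(test_case, txt_path, csv_path, fail_msg):
    suite = unittest.TestLoader().loadTestsFromTestCase(test_case)

    # Full verbose report for humans
    with open(txt_path, "w") as f:
        result = unittest.TextTestRunner(stream=f, verbosity=2).run(suite)

    # One-line PASS/FAIL record for the monitoring CSV
    status = "PASS" if result.wasSuccessful() else "FAILED, " + fail_msg
    res_df = pd.DataFrame([[str(datetime.now()), status]], columns=["Date", "Result"])
    res_df.to_csv(csv_path, index=False)
    return result

With this helper, the test block in Example #8 would reduce to a single call such as run_and_log_unittest(TestSaveS3, tests_dir_loc + "saveS3_unittest.txt", tests_dir_loc + "saveS3_unittest.csv", "Your pickle's size is 0 KB").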