def test_copy_step_pvs_for_step_data_shift_weight(self, database_connection):
    """Check copy_step_pvs_for_step_data moves only the configured shift-weight
    PVs into SAS_PROCESS_VARIABLE and empties the step's pv_table."""
    step_config = {
        'name': 'SHIFT_DATA',
        'pv_table': 'SAS_SHIFT_PV',
        'pv_columns': ["'SHIFT_PORT_GRP_PV'", "'WEEKDAY_END_PV'",
                       "'AM_PM_NIGHT_PV'"],
        'order': 0
    }
    run_id = 'copy-step-pvs-for-step-data'

    # Start from clean tables
    db.delete_from_table('PROCESS_VARIABLE_PY', 'RUN_ID', '=', run_id)
    db.delete_from_table(idm.SAS_PROCESS_VARIABLES_TABLE)

    # Load the pickled fixture and push it into the remote PV table
    fixture = pd.read_pickle(COPY_PV_PATH +
                             'copy_shift_weight_pvs_for_shift_data.pkl')
    db.insert_dataframe_into_table("PROCESS_VARIABLE_PY", fixture,
                                   database_connection)

    # Function under test: copies the PVs into 'SAS_PROCESS_VARIABLE'
    idm.copy_step_pvs_for_step_data(run_id, database_connection, step_config)

    # Round-trip the results through csv to normalise database dtypes
    results = db.get_table_values('SAS_PROCESS_VARIABLE')
    temp_output = COPY_PV_PATH + 'copy_shift_weight_pvs_for_shift_data_results.csv'
    results.to_csv(temp_output, index=False)
    results = pd.read_csv(temp_output)
    os.remove(temp_output)

    # Expected frame: fixture rows for the requested PVs only
    wanted_pvs = [col.replace("'", "") for col in step_config['pv_columns']]
    expected = fixture[fixture['PV_NAME'].isin(wanted_pvs)][['PV_NAME', 'PV_DEF']]

    actual = results[['PROCVAR_NAME', 'PROCVAR_RULE']]

    # Name/rule pairs must match the fixture for the required PVs only
    npt.assert_array_equal(expected, actual)

    # The step's pv_table should have been emptied by the function
    result = db.get_table_values(step_config['pv_table'])
    assert len(result) == 0

    # Leave the tables clean for subsequent tests
    db.delete_from_table(idm.SAS_PROCESS_VARIABLES_TABLE)
    db.delete_from_table('PROCESS_VARIABLE_PY', 'RUN_ID', '=', run_id)
Exemplo n.º 2
0
def airmiles_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 30 April 2018 / 2 October 2018
    Purpose      : Executes the air-miles stage of the IPS run.
    Params       : run_id - identifier of the current run.
    Returns      : NA
    Requirements : NA
    Dependencies : NA
    """
    # Fetch this step's configuration
    config = ServicesConfiguration().get_air_miles()

    # Stage the survey data required by the air-miles step
    idm.populate_survey_data_for_step(run_id, config)

    # Pull the staged survey records out of the database
    df_survey = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)

    # Run the air-miles calculation
    df_output = calculate_airmiles.do_ips_airmiles_calculation(
        df_surveydata=df_survey, var_serial='SERIAL')

    # Persist the calculated output
    db.insert_dataframe_into_table(config["temp_table"], df_output)

    # Merge the results back into the survey data
    idm.update_survey_data_with_step_results(config)

    # Archive the updated survey data for this run
    idm.store_survey_data_with_step_results(run_id, config)
def imbalance_weight_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 30 April 2018 / 2 October 2018
    Purpose      : Executes the imbalance-weight stage of the IPS run.
    Params       : run_id - identifier of the current run.
    Returns      : NA
    """
    # Fetch this step's configuration
    config = ServicesConfiguration().get_imbalance_weight()

    # Stage the survey data required by the imbalance-weight step
    idm.populate_survey_data_for_step(run_id, config)

    # Copy the step's process variables so they can be applied
    idm.copy_step_pvs_for_survey_data(run_id, config)

    # Apply the PVs to the survey data
    process_variables.process(dataset='survey',
                              in_table_name='SAS_SURVEY_SUBSAMPLE',
                              out_table_name='SAS_IMBALANCE_SPV',
                              in_id='serial')

    # Fold the PV output back into the survey data
    idm.update_survey_data_with_step_pv_output(config)

    # Pull the staged survey records out of the database
    df_survey = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)

    # Run the imbalance-weight calculation
    df_output, df_summary = calculate_imb_weight.do_ips_imbweight_calculation(
        df_survey,
        serial="SERIAL",
        shift_weight="SHIFT_WT",
        non_response_weight="NON_RESPONSE_WT",
        min_weight="MINS_WT",
        traffic_weight="TRAFFIC_WT",
        oo_weight="UNSAMP_TRAFFIC_WT",
        imbalance_weight="IMBAL_WT")

    # Persist the calculation output and its summary
    db.insert_dataframe_into_table(config["temp_table"], df_output)
    db.insert_dataframe_into_table(config["sas_ps_table"], df_summary)

    # Merge the results back into the survey data
    idm.update_survey_data_with_step_results(config)

    # Archive the updated survey data for this run
    idm.store_survey_data_with_step_results(run_id, config)

    # Archive the imbalance-weight summary
    idm.store_step_summary(run_id, config)
Exemplo n.º 4
0
def process(in_table_name, out_table_name, in_id, dataset):
    """
    Author       : Thomas Mahoney
    Date         : 27 / 03 / 2018
    Purpose      : Runs the process variables step of the IPS calculation process.
    Parameters   : in_table_name - the table where the data is coming from.
                   out_table_name - the destination table where the modified data will be sent.
                   in_id - the column id used in the output dataset (this is used when the data is merged into the main
                           table later).
                   dataset - an identifier for the dataset currently being processed.
    Returns      : NA
    Requirements : NA
    Dependencies : NA
    """

    # Ensure the input table name is capitalised
    in_table_name = in_table_name.upper()

    # Extract the table's content into a local dataframe
    df_data = db.get_table_values(in_table_name)

    # Normalise missing values (e.g. database NULL/None) to np.nan.
    # NOTE: the np.NaN alias was removed in NumPy 2.0; np.nan is the
    # supported spelling and is identical on older versions.
    df_data.fillna(value=np.nan, inplace=True)

    # Get the process variable statements. Named 'pvs' rather than
    # 'process_variables' to avoid shadowing the module of the same name.
    pvs = get_pvs()

    # Survey data must be processed in serial order
    if dataset == 'survey':
        df_data = df_data.sort_values('SERIAL')

    # Apply the process variables row by row
    df_data = df_data.apply(modify_values, axis=1, args=(pvs, dataset))

    # Build the output column list: the id column plus the PV names for
    # this run, all upper-cased to match the database schema.
    columns = [in_id.upper()] + [pv[0].upper() for pv in pvs]

    # Create a new dataframe from the modified data using the columns specified
    df_out = df_data[columns]

    # Insert the dataframe to the output table
    db.insert_dataframe_into_table(out_table_name, df_out)
def minimums_weight_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 30 April 2018 / 2 October 2018
    Purpose      : Executes the minimums-weight stage of the IPS run.
    Params       : run_id - identifier of the current run.
    Returns      : NA
    """
    # Fetch this step's configuration
    config = ServicesConfiguration().get_minimums_weight()

    # Stage the survey data required by the minimums-weight step
    idm.populate_survey_data_for_step(run_id, config)

    # Copy the step's process variables so they can be applied
    idm.copy_step_pvs_for_survey_data(run_id, config)

    # Apply the PVs to the survey data
    process_variables.process(dataset='survey',
                              in_table_name='SAS_SURVEY_SUBSAMPLE',
                              out_table_name='SAS_MINIMUMS_SPV',
                              in_id='serial')

    # Fold the PV output back into the survey data
    idm.update_survey_data_with_step_pv_output(config)

    # Pull the staged survey records out of the database
    df_survey = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)

    # Run the minimums-weight calculation
    df_output, df_summary = calculate_minimums_weight.do_ips_minweight_calculation(
        df_surveydata=df_survey,
        serial_num='SERIAL',
        shift_weight='SHIFT_WT',
        nr_weight='NON_RESPONSE_WT',
        min_weight='MINS_WT')

    # Persist the calculation output and its summary
    db.insert_dataframe_into_table(config["temp_table"], df_output)
    db.insert_dataframe_into_table(config["sas_ps_table"], df_summary)

    # Merge the results back into the survey data
    idm.update_survey_data_with_step_results(config)

    # Archive the updated survey data for this run
    idm.store_survey_data_with_step_results(run_id, config)

    # Archive the minimums-weight summary
    idm.store_step_summary(run_id, config)
def test_copy_step_pvs_for_survey_data(step_name, pv_columns, spv_table,
                                       database_connection):
    """Check copy_step_pvs_for_survey_data moves the configured PVs and
    empties the step's spv_table.

    This test is parameterised: step_name / pv_columns / spv_table come from
    the pytest.mark.parametrize decorator.
    """
    # Setup step configuration variables
    step_config = {
        'name': step_name,
        'spv_table': spv_table,
        'pv_columns': pv_columns
    }

    run_id = 'TEMPLATE'

    idm.copy_step_pvs_for_survey_data(run_id, database_connection, step_config)

    # Get all values from the sas_process_variables table
    results = db.get_table_values(idm.SAS_PROCESS_VARIABLES_TABLE)

    # Build a stripped COPY of the PV names for the comparisons below.
    # The previous implementation stripped the quotes in place, mutating the
    # very list object supplied by pytest.mark.parametrize — which leaks the
    # modified values into any other test case sharing that list.
    pv_names = [name.replace("'", "") for name in step_config['pv_columns']]

    # Check number of PV records moved matches number passed in through step configuration.
    assert len(results) == len(pv_names)

    # Ensure the pv names in the results data frame match the expected pv names
    for name in results['PROCVAR_NAME']:
        assert name.upper() in pv_names

    # Get the spv_table values and ensure all records have been deleted
    results = db.get_table_values(step_config['spv_table'])
    assert len(results) == 0
def rail_imputation_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 30 April 2018 / 2 October 2018
    Purpose      : Executes the rail-imputation stage of the IPS run.
    Params       : run_id - identifier of the current run.
    Returns      : NA
    """
    # Fetch this step's configuration
    config = ServicesConfiguration().get_rail_imputation()

    # Stage the survey data required by the rail-imputation step
    idm.populate_survey_data_for_step(run_id, config)

    # Copy the step's process variables so they can be applied
    idm.copy_step_pvs_for_survey_data(run_id, config)

    # Apply the PVs to the survey data
    process_variables.process(dataset='survey',
                              in_table_name='SAS_SURVEY_SUBSAMPLE',
                              out_table_name='SAS_RAIL_SPV',
                              in_id='serial')

    # Fold the PV output back into the survey data
    idm.update_survey_data_with_step_pv_output(config)

    # Pull the staged survey records out of the database
    df_survey = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)

    # Run the rail imputation
    df_output = calculate_rail_imputation.do_ips_railex_imp(
        df_survey,
        var_serial='SERIAL',
        var_final_weight='FINAL_WT',
        minimum_count_threshold=30)

    # Persist the imputation output
    db.insert_dataframe_into_table(config["temp_table"], df_output)

    # Merge the results back into the survey data
    idm.update_survey_data_with_step_results(config)

    # Archive the updated survey data for this run
    idm.store_survey_data_with_step_results(run_id, config)
def fares_imputation_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 30 April 2018 / 2 October 2018
    Purpose      : Executes the fares-imputation stage of the IPS run.
    Params       : run_id - identifier of the current run.
    Returns      : NA
    """
    # Fetch this step's configuration
    config = ServicesConfiguration().get_fares_imputation()

    # Stage the survey data required by the fares-imputation step
    idm.populate_survey_data_for_step(run_id, config)

    # Copy the step's process variables so they can be applied
    idm.copy_step_pvs_for_survey_data(run_id, config)

    # Apply the PVs to the survey data
    process_variables.process(dataset='survey',
                              in_table_name='SAS_SURVEY_SUBSAMPLE',
                              out_table_name='SAS_FARES_SPV',
                              in_id='serial')

    # Fold the PV output back into the survey data
    idm.update_survey_data_with_step_pv_output(config)

    # Pull the staged survey records out of the database
    df_survey = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)

    # Run the fares imputation
    df_output = calculate_fares_imputation.do_ips_fares_imputation(
        df_survey,
        var_serial='SERIAL',
        num_levels=9,
        measure='mean')

    # Persist the imputation output
    db.insert_dataframe_into_table(config["temp_table"], df_output)

    # Merge the results back into the survey data
    idm.update_survey_data_with_step_results(config)

    # Archive the updated survey data for this run
    idm.store_survey_data_with_step_results(run_id, config)
Exemplo n.º 9
0
def town_stay_expenditure_imputation_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 30 April 2018 / 2 October 2018
    Purpose      : Executes the town-and-stay-expenditure (TSE) imputation
                   stage of the IPS run.
    Params       : run_id - identifier of the current run.
    Returns      : NA
    """
    # Fetch this step's configuration
    config = ServicesConfiguration().get_town_and_stay_expenditure()

    # Stage the survey data required by the TSE imputation step
    idm.populate_survey_data_for_step(run_id, config)

    # Copy the step's process variables so they can be applied
    idm.copy_step_pvs_for_survey_data(run_id, config)

    # Apply the PVs to the survey data
    process_variables.process(dataset='survey',
                              in_table_name='SAS_SURVEY_SUBSAMPLE',
                              out_table_name='SAS_TOWN_STAY_SPV',
                              in_id='serial')

    # Fold the PV output back into the survey data
    idm.update_survey_data_with_step_pv_output(config)

    # Pull the staged survey records out of the database
    df_survey = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)

    # Run the TSE imputation
    df_output = calculate_town_and_stay_expenditure.do_ips_town_exp_imp(
        df_survey, var_serial="SERIAL", var_final_wt="FINAL_WT")

    # Persist the imputation output
    db.insert_dataframe_into_table(config["temp_table"], df_output)

    # Merge the results back into the survey data
    idm.update_survey_data_with_step_results(config)

    # Archive the updated survey data for this run
    idm.store_survey_data_with_step_results(run_id, config)
def final_weight_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 30 April 2018 / 2 October 2018
    Purpose      : Executes the final-weight stage of the IPS run.
    Params       : run_id - identifier of the current run.
    Returns      : NA
    """
    # Fetch this step's configuration
    config = ServicesConfiguration().get_final_weight()

    # Stage the survey data required by the final-weight step
    # (this step applies no process variables of its own)
    idm.populate_survey_data_for_step(run_id, config)

    # Pull the staged survey records out of the database
    df_survey = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)

    # Combine the component weights into the final weight
    df_output, df_summary = calculate_final_weight.do_ips_final_wt_calculation(
        df_survey,
        serial_num='SERIAL',
        shift_weight='SHIFT_WT',
        non_response_weight='NON_RESPONSE_WT',
        min_weight='MINS_WT',
        traffic_weight='TRAFFIC_WT',
        unsampled_weight='UNSAMP_TRAFFIC_WT',
        imbalance_weight='IMBAL_WT',
        final_weight='FINAL_WT')

    # Persist the calculation output and its summary
    db.insert_dataframe_into_table(config["temp_table"], df_output)
    db.insert_dataframe_into_table(config["sas_ps_table"], df_summary)

    # Merge the results back into the survey data
    idm.update_survey_data_with_step_results(config)

    # Archive the updated survey data for this run
    idm.store_survey_data_with_step_results(run_id, config)

    # Archive the final-weight summary
    idm.store_step_summary(run_id, config)
Exemplo n.º 11
0
def do_ips_ges_weighting(df_surveydata: pd.DataFrame,
                         df_ustotals: pd.DataFrame):
    """Run the GES weighting (via the external R script) and return the
    sorted survey data together with the SERIAL/UNSAMP_TRAFFIC_WT summary."""
    # Clear the auxiliary survey table ahead of the GES run
    db.delete_from_table('survey_unsamp_aux')

    # Reset the in-memory working tables used by the R script
    db.clear_memory_table('poprowvec_unsamp')
    db.clear_memory_table('r_unsampled')

    # GES expects the survey data ordered by serial number
    df_surveydata = df_surveydata.sort_values('SERIAL')

    # Stage the inputs for the R GES weighting macro, then execute it
    r_survey_input(df_surveydata)
    r_population_input(df_surveydata, df_ustotals)
    run_r_ges_script()

    # Collect the script's output, keeping only the weight columns
    df_summarydata = db.get_table_values('r_unsampled')
    df_summarydata = df_summarydata[['SERIAL', 'UNSAMP_TRAFFIC_WT']]

    return df_surveydata, df_summarydata
Exemplo n.º 12
0
def non_response_weight_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 26 April 2018 / 2 October 2018
    Purpose      : Executes the non-response-weight stage of the IPS run.
    Params       : run_id - identifier of the current run.
    Returns      : NA
    """
    # Fetch this step's configuration
    config = ServicesConfiguration().get_non_response()

    # Stage the survey data and the non-response data for this step
    idm.populate_survey_data_for_step(run_id, config)
    idm.populate_step_data(run_id, config)

    # Copy the step's PVs, then apply them to the survey data
    idm.copy_step_pvs_for_survey_data(run_id, config)
    process_variables.process(dataset='survey',
                              in_table_name='SAS_SURVEY_SUBSAMPLE',
                              out_table_name='SAS_NON_RESPONSE_SPV',
                              in_id='serial')

    # Fold the survey PV output back into the survey data
    idm.update_survey_data_with_step_pv_output(config)

    # Copy the step's PVs, then apply them to the non-response data
    idm.copy_step_pvs_for_step_data(run_id, config)
    process_variables.process(dataset='non_response',
                              in_table_name='SAS_NON_RESPONSE_DATA',
                              out_table_name='SAS_NON_RESPONSE_PV',
                              in_id='REC_ID')

    # Fold the non-response PV output back into the step data
    idm.update_step_data_with_step_pv_output(config)

    # Pull the staged inputs out of the database
    df_survey = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
    df_non_response = db.get_table_values(config["data_table"])

    # Run the non-response weight calculation
    df_output, df_summary = calculate_nonresponse_weight.do_ips_nrweight_calculation(
        df_survey,
        df_non_response,
        'NON_RESPONSE_WT',
        'SERIAL')

    # Persist the calculation output and its summary
    db.insert_dataframe_into_table(config["temp_table"], df_output)
    db.insert_dataframe_into_table(config["sas_ps_table"], df_summary)

    # Merge the results back into the survey data
    idm.update_survey_data_with_step_results(config)

    # Archive the updated survey data for this run
    idm.store_survey_data_with_step_results(run_id, config)

    # Archive the non-response-weight summary
    idm.store_step_summary(run_id, config)
Exemplo n.º 13
0
 def read():
     # Return every row of `table` via the db helper.
     # NOTE(review): `table` is a free variable captured from an enclosing
     # scope that is not visible in this fragment — confirm where it is bound.
     return db.get_table_values(table)
    def test_update_step_data_with_step_pv_output(self, database_connection):
        """Check PV output is merged into the step data table and that the
        function cleanses its pv/temp/summary working tables afterwards."""
        step_config = {
            "pv_columns2": ["[SHIFT_PORT_GRP_PV]", "[WEEKDAY_END_PV]",
                            "[AM_PM_NIGHT_PV]"],
            "pv_table": "SAS_SHIFT_PV",
            "data_table": "SAS_SHIFT_DATA",
            "temp_table": "SAS_SHIFT_WT",
            "sas_ps_table": "SAS_PS_SHIFT_DATA"
        }

        # Load the PV fixture
        pv_input = pd.read_csv(UPDATE_STEP_DATA_WITH_STEP_PV_OUTPUT_PATH +
                               'test_shift_pv_data.csv')

        # Align the fixture's REC_ID values with the current table contents
        max_rec_id = self.get_rec_id("MAX", step_config["data_table"],
                                     database_connection)
        pv_input = self.amend_rec_id(pv_input, max_rec_id, ascend=False)

        db.insert_dataframe_into_table(step_config['pv_table'], pv_input,
                                       database_connection)

        # Function under test
        idm.update_step_data_with_step_pv_output(database_connection,
                                                 step_config)

        # Round-trip the results through csv to normalise database dtypes
        results = db.get_table_values(step_config['data_table'])
        temp_output = (UPDATE_STEP_DATA_WITH_STEP_PV_OUTPUT_PATH +
                       'copy_update_step_data_with_step_pv_output.csv')
        results.to_csv(temp_output, index=False)
        results = pd.read_csv(temp_output)

        # Keep only the rows whose REC_ID we inserted above
        inserted_ids = pv_input["REC_ID"]
        matched = results[results['REC_ID'].isin(inserted_ids)]

        # Strip the bracket quoting from the configured PV column names
        pv_cols = [col.replace("[", "").replace("]", "")
                   for col in step_config['pv_columns2']]
        cols_to_keep = ["REC_ID"] + pv_cols

        # Restrict to the relevant columns and reset the index for comparison
        actual = matched[cols_to_keep].reset_index(drop=True)

        # Expected frame: the fixture ordered by REC_ID, index reset
        expected = pv_input.sort_values(by=['REC_ID']).reset_index(drop=True)

        # The merged rows must match the fixture exactly (ignoring dtypes)
        assert_frame_equal(actual,
                           expected,
                           check_names=False,
                           check_like=True,
                           check_dtype=False)

        # The function should have cleansed all of its working tables
        results = db.get_table_values(step_config['pv_table'])
        assert len(results) == 0

        results = db.get_table_values(step_config['temp_table'])
        assert len(results) == 0

        results = db.get_table_values(idm.SAS_PROCESS_VARIABLES_TABLE)
        assert len(results) == 0

        results = db.get_table_values(step_config['sas_ps_table'])
        assert len(results) == 0
    def test_update_survey_data_with_step_pv_output_with_name_minimums_weight(
            self, database_connection):
        """Check MINIMUMS_WEIGHT PV output is merged into SAS_SURVEY_SUBSAMPLE.

        Verifies that (a) only the configured PV columns are updated, (b) the
        remaining columns are untouched, and (c) the function cleanses its
        spv/temp/summary working tables afterwards.
        """
        step_config = {
            'name':
            "MINIMUMS_WEIGHT",
            'spv_table':
            'SAS_MINIMUMS_SPV',
            "pv_columns": [
                "'MINS_FLAG_PV'", "'MINS_PORT_GRP_PV'", "'MINS_CTRY_GRP_PV'",
                "'MINS_NAT_GRP_PV'", "'MINS_CTRY_PORT_GRP_PV'"
            ],
            "temp_table":
            "SAS_MINIMUMS_WT",
            "sas_ps_table":
            "SAS_PS_MINIMUMS",
        }

        # delete the data in the tables so that we start from a clean state
        db.delete_from_table(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
        db.delete_from_table(step_config['spv_table'])

        # read and insert into the database the survey data
        test_survey_data = pd.read_pickle(STEP_PV_OUTPUT_PATH +
                                          'update_survey_data_pvs.pkl')
        db.insert_dataframe_into_table(idm.SAS_SURVEY_SUBSAMPLE_TABLE,
                                       test_survey_data, database_connection)

        # read and insert into the database the pvs
        test_nr_pv_data = pd.read_csv(STEP_PV_OUTPUT_PATH +
                                      'test_mw_pv_data.csv')
        db.insert_dataframe_into_table(step_config['spv_table'],
                                       test_nr_pv_data, database_connection)

        # call the test function
        idm.update_survey_data_with_step_pv_output(database_connection,
                                                   step_config)

        # write the results back to csv, and read the csv back
        # (this solves the data type matching issues)
        results = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
        temp_output = STEP_PV_OUTPUT_PATH + 'update_survey_data_pvs_result_results.csv'
        results.to_csv(temp_output, index=False)
        results = pd.read_csv(temp_output)

        # remove the temporary written file
        os.remove(temp_output)

        # clean test data before actually testing results
        # (this cleanup previously appeared twice; once is sufficient)
        db.delete_from_table(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
        db.delete_from_table(step_config['spv_table'])

        # strip the quote characters from the configured PV names once —
        # the same list is reused for the column-drop check below
        stripped_pv_cols = [
            item.replace("'", "") for item in step_config['pv_columns']
        ]

        # check ONLY updated pv columns are as expected in results,
        # check NaN values are handled correctly
        test_dummy_1 = results[['SERIAL'] + stripped_pv_cols]

        # select rows from the updated data that match the input pv data
        serials = test_nr_pv_data['SERIAL']
        test_dummy_2 = test_dummy_1[test_dummy_1['SERIAL'].isin(serials)]

        # check updated pv columns match the corresponding dummy values
        assert_frame_equal(test_dummy_2,
                           test_nr_pv_data,
                           check_dtype=False,
                           check_like=True)

        # check that the non-pv column values are still the same
        # by dropping the pv columns from both frames
        new_res = results.drop(stripped_pv_cols, axis=1)
        new_test_res = test_survey_data.drop(stripped_pv_cols, axis=1)

        assert_frame_equal(new_res,
                           new_test_res,
                           check_dtype=False,
                           check_like=True)

        # check that the function cleansed its working tables
        results_2 = db.get_table_values(step_config['spv_table'])
        assert len(results_2) == 0

        results = db.get_table_values(idm.SAS_PROCESS_VARIABLES_TABLE)
        assert len(results) == 0

        results = db.get_table_values(step_config["temp_table"])
        assert len(results) == 0

        results = db.get_table_values(step_config["sas_ps_table"])
        assert len(results) == 0
Exemplo n.º 16
0
def test_store_step_summary(database_connection):
    """Check store_step_summary copies SAS_PS_SHIFT_DATA into PS_SHIFT_DATA
    for the given run and cleanses the SAS summary table afterwards."""
    step_config = {
        "ps_table": "PS_SHIFT_DATA",
        "sas_ps_table": "SAS_PS_SHIFT_DATA",
        "ps_columns": [
            "[RUN_ID]", "[SHIFT_PORT_GRP_PV]", "[ARRIVEDEPART]",
            "[WEEKDAY_END_PV]", "[AM_PM_NIGHT_PV]", "[MIGSI]",
            "[POSS_SHIFT_CROSS]", "[SAMP_SHIFT_CROSS]", "[MIN_SH_WT]",
            "[MEAN_SH_WT]", "[MAX_SH_WT]", "[COUNT_RESPS]", "[SUM_SH_WT]"
        ]
    }
    run_id = 'shift-wt-idm-test'
    folder = '/store_step_summary'

    # Seed the SAS summary table with the test fixture
    fixture = pd.read_csv(TEST_DATA_DIR + folder +
                          '/shift_wt_sas_ps_shift_data_test_input.csv')
    db.insert_dataframe_into_table(step_config["sas_ps_table"], fixture,
                                   database_connection)

    # Function under test, then fetch what landed in the ps_table
    idm.store_step_summary(run_id, database_connection, step_config)
    sql = """
    SELECT * FROM {}
    WHERE RUN_ID = '{}'
    """.format(step_config["ps_table"], run_id)
    actual = pd.read_sql(sql, database_connection)
    actual.to_csv(TEST_DATA_DIR + folder + '/shift_wt_actual_results.csv',
                  index=False)

    # Round-trip both frames through csv so the dtypes compare cleanly
    actual = pd.read_csv(TEST_DATA_DIR + folder +
                         '/shift_wt_actual_results.csv',
                         dtype=object)
    expected = pd.read_csv(TEST_DATA_DIR + folder +
                           '/shift_wt_expected_results.csv',
                           dtype=object)

    # Sort both frames identically and renumber their indexes
    sort_cols = ['SHIFT_PORT_GRP_PV', 'ARRIVEDEPART', 'WEEKDAY_END_PV',
                 'AM_PM_NIGHT_PV']
    actual.sort_values(by=sort_cols, inplace=True)
    actual.index = range(0, len(actual))
    expected.sort_values(by=sort_cols, inplace=True)
    expected.index = range(0, len(expected))

    assert_frame_equal(actual, expected, check_dtype=False)

    # The SAS summary table should have been cleansed by the function
    results = db.get_table_values(step_config['sas_ps_table'])
    assert len(results) == 0

    # Remove the rows this test created
    db.delete_from_table(step_config['ps_table'], 'RUN_ID', '=', run_id)
Exemplo n.º 17
0
def test_store_survey_data_with_step_results(step_name, nullify_pvs, ps_table,
                                             prefix, database_connection):
    """
    Parameterised test: the argument values are supplied by the
    pytest.mark.parametrize decorator
    (see https://docs.pytest.org/en/latest/parametrize.html).
    """

    # Configuration handed to the function under test
    step_config = {
        "name": step_name,
        "nullify_pvs": nullify_pvs,
        "ps_table": ps_table
    }
    run_id = 'store_survey_data_test'
    folder = '/store_survey_data_with_step_results'
    steps_with_summary = [
        "SHIFT_WEIGHT", "NON_RESPONSE", "MINIMUMS_WEIGHT", "TRAFFIC_WEIGHT",
        "UNSAMPLED_WEIGHT", "IMBALANCE_WEIGHT", "FINAL_WEIGHT"
    ]
    base = TEST_DATA_DIR + folder + prefix

    # Remove leftovers from previous runs of this test
    db.delete_from_table(idm.SURVEY_SUBSAMPLE_TABLE, 'RUN_ID', '=', run_id)
    db.delete_from_table(step_config['ps_table'], 'RUN_ID', '=', run_id)

    # Seed SURVEY_SUBSAMPLE with records for our run_id
    df_survey_subsample = pd.read_csv(base + 'survey_subsample_test_input.csv',
                                      dtype=object)
    db.insert_dataframe_into_table(idm.SURVEY_SUBSAMPLE_TABLE,
                                   df_survey_subsample,
                                   database_connection,
                                   fast=False)

    # Seed SAS_SURVEY_SUBSAMPLE with records for our run_id
    df_sas_survey = pd.read_csv(base + 'sss_test_input.csv', dtype=object)
    db.insert_dataframe_into_table(idm.SAS_SURVEY_SUBSAMPLE_TABLE,
                                   df_sas_survey,
                                   database_connection,
                                   fast=False)

    # Seed the step's summary (ps) table when this step has one
    if step_name in steps_with_summary:
        df_summary_input = pd.read_csv(base + 'summary_table_test_input.csv',
                                       dtype=object)
        db.insert_dataframe_into_table(step_config['ps_table'],
                                       df_summary_input,
                                       database_connection,
                                       fast=False)

    # Exercise the function under test
    idm.store_survey_data_with_step_results(run_id, database_connection,
                                            step_config)

    # The function should have cleansed the summary table for this run...
    if step_name in steps_with_summary:
        sql = """
            SELECT * FROM {}
            WHERE RUN_ID = '{}'""".format(step_config['ps_table'], run_id)
        cur = database_connection.cursor()
        assert cur.execute(sql).fetchone() is None

    # ...and emptied SAS_SURVEY_SUBSAMPLE entirely
    assert len(db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)) == 0

    # Pull the rows the function wrote for this run
    sql = """
    SELECT * FROM {}
    WHERE RUN_ID = '{}'
    """.format(idm.SURVEY_SUBSAMPLE_TABLE, run_id)
    actual = pd.read_sql(sql, database_connection)
    actual.to_csv(base + 'actual_results.csv', index=False)

    # Round-trip through csv so dtypes line up with the expected file
    actual = pd.read_csv(base + 'actual_results.csv', dtype=object)
    expected = pd.read_csv(base + 'expected_result.csv', dtype=object)

    # Align row order and indexes before comparing
    actual = actual.sort_values(by=["SERIAL"])
    actual.index = range(0, len(actual))

    expected = expected.sort_values(by=["SERIAL"])
    expected.index = range(0, len(expected))

    assert_frame_equal(actual, expected, check_dtype=False)
def traffic_weight_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 30 April 2018 / 2 October 2018
    Purpose      : Orchestrates the traffic weight stage of the IPS process.
    Params       : run_id - the id for the current run.
    Returns      : None
    """

    # Fetch the step configuration for traffic weighting
    config = ServicesConfiguration().get_traffic_weight()

    # Stage the survey data needed by this step
    idm.populate_survey_data_for_step(run_id, config)

    # Stage the traffic (step) data
    idm.populate_step_data(run_id, config)

    # Copy the traffic weight PVs that apply to the survey data
    idm.copy_step_pvs_for_survey_data(run_id, config)

    # Apply those PVs to the survey data
    process_variables.process(dataset='survey',
                              in_table_name='SAS_SURVEY_SUBSAMPLE',
                              out_table_name='SAS_TRAFFIC_SPV',
                              in_id='serial')

    # Fold the PV output back into the survey data
    idm.update_survey_data_with_step_pv_output(config)

    # Copy the traffic weight PVs that apply to the traffic data
    idm.copy_step_pvs_for_step_data(run_id, config)

    # Apply those PVs to the traffic data
    process_variables.process(dataset='traffic',
                              in_table_name='SAS_TRAFFIC_DATA',
                              out_table_name='SAS_TRAFFIC_PV',
                              in_id='REC_ID')

    # Fold the PV output back into the traffic data
    idm.update_step_data_with_step_pv_output(config)

    # Read the staged inputs back out of SQL
    df_survey = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
    df_traffic = db.get_table_values(config["data_table"])

    # Run the weight calculation (delegates the GES weighting to R)
    df_output, df_summary = do_ips_trafweight_calculation_with_R(
        df_survey, df_traffic)

    # Persist the calculation results
    db.insert_dataframe_into_table(config["temp_table"], df_output)
    db.insert_dataframe_into_table(config["sas_ps_table"], df_summary)

    # Merge the results into the survey data, then store data and summary
    idm.update_survey_data_with_step_results(config)
    idm.store_survey_data_with_step_results(run_id, config)
    idm.store_step_summary(run_id, config)
# Exemplo n.º 19 (snippet-scrape artifact; commented out so the module parses, score: 0)
def test_update_survey_data_with_step_results(step_name, temp_table,
                                              results_columns, prefix,
                                              database_connection):
    """
    Parameterised test: the argument values are supplied by the
    pytest.mark.parametrize decorator
    (see https://docs.pytest.org/en/latest/parametrize.html).
    """

    # Configuration handed to the function under test
    step_config = {
        "name": step_name,
        "temp_table": temp_table,
        "results_columns": results_columns
    }

    folder = '/update_survey_data_with_step_results'
    base = TEST_DATA_DIR + folder + prefix

    # Reset and seed SAS_SURVEY_SUBSAMPLE
    db.delete_from_table(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
    df_survey_in = pd.read_csv(base + 'sas_survey_subsample_test_input.csv',
                               dtype=object)
    db.insert_dataframe_into_table(idm.SAS_SURVEY_SUBSAMPLE_TABLE,
                                   df_survey_in,
                                   database_connection,
                                   fast=False)

    # Reset and seed the step's temp table
    db.delete_from_table(step_config["temp_table"])
    df_temp_in = pd.read_csv(base + 'temp_table_test_input.csv', dtype=object)
    db.insert_dataframe_into_table(step_config["temp_table"],
                                   df_temp_in,
                                   database_connection,
                                   fast=False)

    # Exercise the function under test
    idm.update_survey_data_with_step_results(database_connection, step_config)

    # Round-trip the results through csv so dtypes match the expected file
    actual = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
    actual.to_csv(base + 'actual_results.csv', index=False)
    actual = pd.read_csv(base + 'actual_results.csv', dtype=object)
    expected = pd.read_csv(base + 'expected_results.csv', dtype=object)

    # Align row order and indexes before comparing
    actual = actual.sort_values(by=["SERIAL"])
    actual.index = range(0, len(actual))

    expected = expected.sort_values(by=["SERIAL"])
    expected.index = range(0, len(expected))

    assert_frame_equal(actual, expected, check_dtype=False)

    # The function should have cleansed its temp table
    assert len(db.get_table_values(step_config['temp_table'])) == 0
# Exemplo n.º 20 (snippet-scrape artifact; commented out so the module parses, score: 0)
def do_ips_trafweight_calculation_with_R(survey_data, trtotals):
    """
    Calculate IPS traffic weights, delegating the GES weighting to R.

    Stages the survey and population inputs into the SQL tables the R script
    reads, runs the script, reads the weighted output back, rounds the
    weights to 3dp, builds the summary table and writes both result sets to
    SQL before returning them.

    NOTE(review): mutates the caller's ``survey_data`` in place by adding the
    traffic design weight column — confirm callers do not rely on the frame
    being untouched.

    Params  : survey_data - survey records for the processing period
              trtotals    - population (traffic) totals per stratum
    Returns : (ret_out_final, df_summary_merge_sum_traftot) - the rounded
              weight output and the summary dataframe (also written to
              OUTPUT_TABLE_NAME and SUMMARY_TABLE_NAME respectively)
    """
    # clear the auxillary tables
    db.delete_from_table(SURVEY_TRAFFIC_AUX_TABLE)

    # drop aux tables and r created tables
    # cf.drop_table(POP_PROWVEC_TABLE)
    # cf.drop_table(R_TRAFFIC_TABLE)
    db.clear_memory_table(R_TRAFFIC_TABLE)
    db.clear_memory_table(POP_PROWVEC_TABLE)

    # inserts into survey_traffic_aux a.k.a. SURVEY_TRAFFIC_AUX_TABLE
    df_r_ges_input_imported = r_survey_input(survey_data)
    # inserts into POP_PROWVEC_TABLE
    df_mod_pop_totals_import = r_population_input(survey_data, trtotals)

    # R reads the tables populated above and writes its output to R_TRAFFIC_TABLE
    run_r_ges_script()

    # grab the data from the SQL table and return
    output_final_import = db.get_table_values(R_TRAFFIC_TABLE)

    ret_out = output_final_import[[SERIAL, TRAFFIC_WT]]

    # sort
    ret_out_sorted = ret_out.sort_values(SERIAL)
    ret_out_final = ret_out_sorted.reset_index(drop=True)

    # copy out the df without random for generate_ips_tw_summary
    df_ret_out_final_not_rounded = ret_out_final.copy()

    # Round the weights to 3dp
    ret_out_final[TRAFFIC_WT] = ret_out_final[TRAFFIC_WT].apply(
        lambda x: round(x, 3))

    # #################################
    # Generate the summary table
    # #################################

    # perform calculation
    # design weight = product of the shift, non-response and minimums weight
    # columns (per the var_* column-name constants)
    survey_data[TRAFFIC_DESIGN_WEIGHT_COLUMN] = survey_data[
        var_shiftWeight] * survey_data[var_NRWeight] * survey_data[
            var_minWeight]

    # Summarise the population totals over the strata
    df_PopTotals = trtotals.sort_values(STRATA)

    # Re-index the data frame
    df_PopTotals.index = range(df_PopTotals.shape[0])

    df_popTotals = df_PopTotals.groupby(STRATA)[TRAFFIC_TOTAL_COLUMN] \
        .agg([(TRAFFIC_TOTAL_COLUMN, 'sum')]) \
        .reset_index()

    # ensure unrounded df_ret_out_final_not_rounded is supplied
    df_summary_merge_sum_traftot = generate_ips_tw_summary(
        survey_data, df_ret_out_final_not_rounded, var_serialNum, GWeightVar,
        df_popTotals, minCountThresh)

    # update the output SQL tables
    db.insert_dataframe_into_table(OUTPUT_TABLE_NAME, ret_out_final)
    db.insert_dataframe_into_table(SUMMARY_TABLE_NAME,
                                   df_summary_merge_sum_traftot)

    return ret_out_final, df_summary_merge_sum_traftot
# Exemplo n.º 21 (snippet-scrape artifact; commented out so the module parses, score: 0)
def convert_dataframe_to_sql_format(table_name, dataframe):
    """Insert *dataframe* into *table_name* and return the table's contents."""
    db.insert_dataframe_into_table(table_name, dataframe)
    table_contents = db.get_table_values(table_name)
    return table_contents
# Exemplo n.º 22 (snippet-scrape artifact; commented out so the module parses, score: 0)
def r_population_input(df_survey_input, df_tr_totals):
    """
    Author       : David Powell / edits by Nassir Mohammad
    Date         : 07/06/2018
    Purpose      : Creates population data that feeds into the R GES weighting
    Parameters   : df_survey_input - A data frame containing the survey data for
                   processing month
                   df_tr_totals - A data frame containing population information for
                   processing year
    Returns      : A data frame containing the information needed for GES weighting
    Requirements : NA
    Dependencies : NA
    """

    # Sort input values
    sort1 = [SAMP_PORT_GRP_PV, ARRIVEDEPART]
    df_survey_input_sorted = df_survey_input.sort_values(sort1)

    # Cleanse data: drop rows with a null port group or arrive/depart flag
    df_survey_input_sorted = df_survey_input_sorted[
        ~df_survey_input_sorted[SAMP_PORT_GRP_PV].isnull()]
    df_survey_input_sorted = df_survey_input_sorted[
        ~df_survey_input_sorted[ARRIVEDEPART].isnull()]

    # Sort and cleanse the population totals the same way
    df_pop_totals = df_tr_totals.sort_values(sort1)
    df_pop_totals = df_pop_totals[~df_pop_totals[SAMP_PORT_GRP_PV].isnull()]
    df_pop_totals = df_pop_totals[~df_pop_totals[ARRIVEDEPART].isnull()]

    # Order-preserving de-duplication of the survey port groups.
    # (Replaces a side-effect list comprehension whose `x not in unique`
    # membership test was O(n^2).)
    unique = list(dict.fromkeys(df_survey_input_sorted[SAMP_PORT_GRP_PV]))

    # Keep only population totals for port groups present in the survey data
    df_pop_totals_match = df_pop_totals[df_pop_totals[SAMP_PORT_GRP_PV].isin(
        unique)]

    # Create traffic totals per (port group, arrive/depart)
    df_pop_totals_match = df_pop_totals_match.sort_values(
        [ARRIVEDEPART, SAMP_PORT_GRP_PV])
    df_traffic_totals = df_pop_totals_match.groupby(
        [SAMP_PORT_GRP_PV, ARRIVEDEPART]).agg({
            TRAFFIC_TOTAL_COLUMN: 'sum'
        }).reset_index()

    # Create lookup. Group by and aggregate
    lookup_dataframe = df_survey_input_sorted.copy()
    lookup_dataframe["count"] = ""
    lookup_dataframe = lookup_dataframe.groupby(
        [SAMP_PORT_GRP_PV, ARRIVEDEPART]).agg({
            "count": 'count'
        }).reset_index()

    # Assign a 1-based sequential id to each (port group, arrive/depart) cell
    # (was: assign range(len) then add 1 in a second statement)
    lookup_dataframe[T1] = range(1, len(lookup_dataframe) + 1)

    # Create population totals for current survey data - cleanse data and merge.
    # .copy() avoids pandas' chained-assignment (SettingWithCopy) warning on
    # the astype assignment below.
    lookup_dataframe_aux = lookup_dataframe[[
        SAMP_PORT_GRP_PV, ARRIVEDEPART, T1
    ]].copy()
    lookup_dataframe_aux[T1] = lookup_dataframe_aux.T1.astype(np.int64)

    df_mod_totals = pd.merge(df_traffic_totals,
                             lookup_dataframe_aux,
                             on=[SAMP_PORT_GRP_PV, ARRIVEDEPART],
                             how='left')

    # Pivot so each T1 id becomes a T_<id> column holding its traffic total
    df_mod_totals[MODEL_GROUP] = 1
    df_mod_totals = df_mod_totals.drop(
        columns=[ARRIVEDEPART, SAMP_PORT_GRP_PV])
    df_mod_pop_totals = df_mod_totals.pivot_table(index=MODEL_GROUP,
                                                  columns=T1,
                                                  values=TRAFFIC_TOTAL_COLUMN)
    df_mod_pop_totals = df_mod_pop_totals.add_prefix('T_')

    # Put the model group column first
    df_mod_pop_totals[MODEL_GROUP] = 1
    cols = [MODEL_GROUP
            ] + [col for col in df_mod_pop_totals if col != MODEL_GROUP]
    df_mod_pop_totals = df_mod_pop_totals[cols]

    df_mod_pop_totals = df_mod_pop_totals.reset_index(drop=True)

    con = db.get_sql_connection()
    # recreate proc_vec table

    # note the index gets added so needs to be removed when re-read from SQL
    df_mod_pop_totals.to_sql(POP_PROWVEC_TABLE, con, if_exists='replace')

    df_mod_pop_totals_import = db.get_table_values(POP_PROWVEC_TABLE)
    df_mod_pop_totals_import = df_mod_pop_totals_import.drop('index', axis=1)

    return df_mod_pop_totals_import
def shift_weight_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 26 April 2018 / 2 October 2018
    Purpose      : Orchestrates the shift weight stage of the IPS process.
    Params       : run_id - the id for the current run.
    Returns      : None
    """

    # Fetch the step configuration for shift weighting
    config = ServicesConfiguration().get_shift_weight()

    # Stage the survey data needed by this step
    idm.populate_survey_data_for_step(run_id, config)

    # Stage the shift (step) data
    idm.populate_step_data(run_id, config)

    # Copy the shift weight PVs that apply to the survey data
    idm.copy_step_pvs_for_survey_data(run_id, config)

    # Apply those PVs to the survey data
    process_variables.process(dataset='survey',
                              in_table_name='SAS_SURVEY_SUBSAMPLE',
                              out_table_name='SAS_SHIFT_SPV',
                              in_id='serial')

    # Fold the PV output back into the survey data
    idm.update_survey_data_with_step_pv_output(config)

    # Copy the shift weight PVs that apply to the shift data
    idm.copy_step_pvs_for_step_data(run_id, config)

    # Apply those PVs to the shift data
    process_variables.process(dataset='shift',
                              in_table_name='SAS_SHIFT_DATA',
                              out_table_name='SAS_SHIFT_PV',
                              in_id='REC_ID')

    # Fold the PV output back into the shift data
    idm.update_step_data_with_step_pv_output(config)

    # Read the staged inputs back out of SQL
    df_survey = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
    df_shift = db.get_table_values(config["data_table"])

    # Run the shift weight calculation
    df_survey_out, df_summary_out = \
        calculate_shift_weight.do_ips_shift_weight_calculation(df_survey,
                                                               df_shift,
                                                               serial_number='SERIAL',
                                                               shift_weight='SHIFT_WT')

    # Persist the calculation results
    db.insert_dataframe_into_table(config["temp_table"], df_survey_out)
    db.insert_dataframe_into_table(config["sas_ps_table"], df_summary_out)

    # Merge the results into the survey data, then store data and summary
    idm.update_survey_data_with_step_results(config)
    idm.store_survey_data_with_step_results(run_id, config)
    idm.store_step_summary(run_id, config)
def unsampled_weight_step(run_id):
    """
    Author       : Thomas Mahoney / Elinor Thorne
    Date         : 30 April 2018 / 2 October 2018
    Purpose      : Orchestrates the unsampled weight stage of the IPS process.
    Params       : run_id - the id for the current run.
    Returns      : None
    """

    # Fetch the step configuration for unsampled weighting
    config = ServicesConfiguration().get_unsampled_weight()

    # Stage the survey data needed by this step
    idm.populate_survey_data_for_step(run_id, config)

    # Stage the unsampled (step) data
    idm.populate_step_data(run_id, config)

    # Copy the unsampled weight PVs that apply to the survey data
    idm.copy_step_pvs_for_survey_data(run_id, config)

    # Apply those PVs to the survey data
    process_variables.process(dataset='survey',
                              in_table_name='SAS_SURVEY_SUBSAMPLE',
                              out_table_name='SAS_UNSAMPLED_OOH_SPV',
                              in_id='serial')

    # Fold the PV output back into the survey data
    idm.update_survey_data_with_step_pv_output(config)

    # Copy the unsampled weight PVs that apply to the unsampled data
    idm.copy_step_pvs_for_step_data(run_id, config)

    # Apply those PVs to the unsampled data
    process_variables.process(dataset='unsampled',
                              in_table_name='SAS_UNSAMPLED_OOH_DATA',
                              out_table_name='SAS_UNSAMPLED_OOH_PV',
                              in_id='REC_ID')

    # Fold the PV output back into the unsampled data
    idm.update_step_data_with_step_pv_output(config)

    # Read the staged inputs back out of SQL
    df_survey = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)
    df_unsampled = db.get_table_values(config["data_table"])

    # Run the unsampled weight calculation
    df_output, df_summary = calculate_unsampled_weight.do_ips_unsampled_weight_calculation(
        df_surveydata=df_survey,
        serial_num='SERIAL',
        shift_weight='SHIFT_WT',
        nr_weight='NON_RESPONSE_WT',
        min_weight='MINS_WT',
        traffic_weight='TRAFFIC_WT',
        out_of_hours_weight="UNSAMP_TRAFFIC_WT",
        df_ustotals=df_unsampled,
        min_count_threshold=30)

    # Persist the calculation results
    db.insert_dataframe_into_table(config["temp_table"], df_output)
    db.insert_dataframe_into_table(config["sas_ps_table"], df_summary)

    # Merge the results into the survey data, then store data and summary
    idm.update_survey_data_with_step_results(config)
    idm.store_survey_data_with_step_results(run_id, config)
    idm.store_step_summary(run_id, config)
def test_populate_survey_data(name, delete_tables, nullify_pvs,
                              database_connection):
    """
    Parameterised test for idm.populate_survey_data_for_step: the argument
    values are supplied by the pytest.mark.parametrize decorator
    (see https://docs.pytest.org/en/latest/parametrize.html).
    """

    # Single source of truth for the test run id (previously the GUID literal
    # was repeated inline in three places).
    run_id = '9e5c1872-3f8e-4ae5-85dc-c67a602d011e'

    # Delete existing survey data from table where RUN_ID matches our test id
    db.delete_from_table(SURVEY_SUBSAMPLE_TABLE, 'RUN_ID', '=', run_id)

    # Read the test data in from a csv file
    test_data = pd.read_csv(TEST_DATA_DIR +
                            "populate_survey_data/survey_subsample.csv",
                            dtype=object)

    # Insert the test data into survey_subsample table
    db.insert_dataframe_into_table(idm.SURVEY_SUBSAMPLE_TABLE, test_data)

    # Setup step configuration
    step_config = {
        'nullify_pvs': nullify_pvs,
        'name': name,
        'delete_tables': delete_tables
    }

    # Run test function
    idm.populate_survey_data_for_step(run_id=run_id,
                                      conn=database_connection,
                                      step_configuration=step_config)

    # Get test_result from sas_survey_subsample table
    test_result = db.get_table_values(idm.SAS_SURVEY_SUBSAMPLE_TABLE)

    # Write the test results to a csv
    test_result.to_csv(TEST_DATA_DIR + "populate_survey_data/test_result.csv",
                       index=False)

    # Import the expected result (this result varies if the TRAFFIC_WEIGHT or
    # UNSAMPLED_WEIGHT step is being tested)
    if name in ('TRAFFIC_WEIGHT', 'UNSAMPLED_WEIGHT'):
        expected_result = pd.read_csv(
            TEST_DATA_DIR +
            "populate_survey_data/populate_result_traffic_unsampled.csv")
    else:
        expected_result = pd.read_csv(
            TEST_DATA_DIR + "populate_survey_data/populate_result.csv")

    # Import the test result (round-trip through csv normalises dtypes)
    test_result = pd.read_csv(TEST_DATA_DIR +
                              "populate_survey_data/test_result.csv")

    # Sort the values by SERIAL
    expected_result = expected_result.sort_values(by='SERIAL')
    test_result = test_result.sort_values(by='SERIAL')

    # Reset the dataframe's indexes so correct rows are compared
    expected_result.index = range(0, len(expected_result))
    test_result.index = range(0, len(test_result))

    # Check all deleted tables are empty
    for table in step_config['delete_tables']:
        delete_result = db.get_table_values(table)
        assert delete_result.empty

    # Check all nullified columns are NULL
    for column in step_config['nullify_pvs']:
        column_name = column.replace('[', '').replace(']', '')
        result = db.select_data(column_name, idm.SURVEY_SUBSAMPLE_TABLE,
                                'RUN_ID', run_id)
        assert result[column_name].isnull().sum() == len(result)

    # Check results match
    assert_frame_equal(expected_result,
                       test_result,
                       check_dtype=False,
                       check_like=True)
def test_populate_step_data(table_name, data_table, insert_to_populate,
                            step_data, sas_step_data, result_data,
                            database_connection):
    """
    Parameterised test for idm.populate_step_data: the argument values are
    supplied by the pytest.mark.parametrize decorator
    (see https://docs.pytest.org/en/latest/parametrize.html).
    """

    run_id = '9e5c1872-3f8e-4ae5-85dc-c67a602d011e'

    # Sort columns used to align expected/actual rows, per step data table.
    # (Replaces a four-branch elif chain that repeated the same sort logic.)
    sort_columns = {
        'SHIFT_DATA': ['PORTROUTE', 'WEEKDAY'],
        'NON_RESPONSE_DATA': [
            'PORTROUTE', 'WEEKDAY', 'ARRIVEDEPART', 'AM_PM_NIGHT',
            'SAMPINTERVAL', 'MIGTOTAL', 'ORDTOTAL'
        ],
        'UNSAMPLED_OOH_DATA': [
            'PORTROUTE', 'REGION', 'ARRIVEDEPART', 'UNSAMP_TOTAL'
        ],
        'TRAFFIC_DATA': ['PORTROUTE', 'ARRIVEDEPART', 'TRAFFICTOTAL', 'HAUL'],
    }

    # Setup step configuration
    step_config = {
        "table_name": table_name,
        "data_table": data_table,
        "insert_to_populate": insert_to_populate,
    }

    # Clear existing test records from the step data table
    # (was a second hard-coded copy of the run-id literal; use run_id)
    db.delete_from_table(step_config['table_name'], 'RUN_ID', '=', run_id)

    # Get test data from file
    test_data = pd.read_csv(TEST_DATA_DIR + "populate_step_data/" + step_data,
                            dtype=object)

    # Insert test data into table
    db.insert_dataframe_into_table(step_config["table_name"], test_data)

    # Run XML step which deletes old data from sas_survey_subsample and repopulates it with the new data
    idm.populate_step_data(run_id, database_connection, step_config)

    # Get test_result from (sas) external data table
    test_result = db.get_table_values(step_config['data_table'])

    # Write the test results to a csv
    test_result.to_csv(TEST_DATA_DIR + "populate_step_data/" + result_data,
                       index=False)

    # Import both the expected result and test result from the csv files
    expected_result = pd.read_csv(TEST_DATA_DIR + "populate_step_data/" +
                                  sas_step_data)
    test_result = pd.read_csv(TEST_DATA_DIR + "populate_step_data/" +
                              result_data)

    # Nullify the rec_id for comparison (this needs to be done because the expected result contains no rec_id)
    expected_result['REC_ID'] = ''
    test_result['REC_ID'] = ''

    # Sort records to match order (unknown table names are left unsorted, as
    # in the original elif chain)
    sort_by = sort_columns.get(table_name)
    if sort_by is not None:
        expected_result = expected_result.sort_values(by=sort_by)
        test_result = test_result.sort_values(by=sort_by)

    # Reset the dataframe's indexes so correct rows are compared
    expected_result.index = range(0, len(expected_result))
    test_result.index = range(0, len(test_result))

    # Check results match
    assert_frame_equal(expected_result,
                       test_result,
                       check_dtype=False,
                       check_like=True)