Example #1
def check_allocation_ratios(flow_alloc_df_load, activity_set, config):
    """
    Check for issues with the flow allocation ratios
    :param flow_alloc_df_load: df, includes 'FlowAmountRatio' column
    :param activity_set: str, activity set
    :param config: dictionary, method yaml
    :return: logs information regarding allocation ratios and writes
             a summary table to the validation log
    """

    # create column of sector lengths
    flow_alloc_df =\
        flow_alloc_df_load.assign(SectorLength=flow_alloc_df_load['Sector'].str.len())
    # subset df
    flow_alloc_df2 = flow_alloc_df[[
        'FBA_Activity', 'Location', 'SectorLength', 'FlowAmountRatio'
    ]]
    # sum the flow amount ratios by location and sector length
    flow_alloc_df3 = \
        flow_alloc_df2.groupby(['FBA_Activity', 'Location', 'SectorLength'],
                               dropna=False, as_index=False).agg({"FlowAmountRatio": sum})
    # keep only rows of specified sector length
    flow_alloc_df4 = flow_alloc_df3[
        flow_alloc_df3['SectorLength'] == sector_level_key[
            config['target_sector_level']]].reset_index(drop=True)
    # keep rows where the flow amount ratio differs from 1 by more than the tolerance
    tolerance = 0.01
    flow_alloc_df5 = flow_alloc_df4[
        (flow_alloc_df4['FlowAmountRatio'] < 1 - tolerance) |
        (flow_alloc_df4['FlowAmountRatio'] > 1 + tolerance)]

    if len(flow_alloc_df5) > 0:
        vLog.info(
            'There are %s instances at a sector length of %s '
            'where the allocation ratio for a location differs '
            'from 1 by more than %s. See Validation Log',
            len(flow_alloc_df5), config["target_sector_level"], str(tolerance))

    # add to validation log
    log.info(
        'Save the summary table of flow allocation ratios for each sector length for '
        '%s in validation log', activity_set)
    # if the df is empty, log a summary string; otherwise log the table
    if flow_alloc_df5.empty:
        vLogDetailed.info('Flow allocation ratios for %s all round to 1',
                          activity_set)

    else:
        vLogDetailed.info(
            'Flow allocation ratios for %s: '
            '\n {}'.format(flow_alloc_df5.to_string()), activity_set)
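A minimal usage sketch (all values hypothetical), assuming flowsa's sector_level_key maps target sector levels to NAICS string lengths (e.g. 'NAICS_6' -> 6):

import pandas as pd

# toy allocation table for one activity/location pair
demo = pd.DataFrame({
    'FBA_Activity': ['Irrigation', 'Irrigation'],
    'Location': ['06000', '06000'],
    'Sector': ['111140', '111150'],    # 6-digit NAICS
    'FlowAmountRatio': [0.60, 0.38],   # sums to 0.98, outside the 0.01 tolerance
})
config = {'target_sector_level': 'NAICS_6'}
check_allocation_ratios(demo, 'example_activity_set', config)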
Example #2
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowbyActivity or
    FlowBySector parquet stored in flowsa, or a FlowBySector
    formatted dataframe from another package.
    :param k: str, The datasource name
    :param v: dictionary, The datasource parameters
    :return: df of identified parquet
    """
    if v['data_format'] == 'FBA':
        # if yaml specifies a geoscale to load, use parameter to filter dataframe
        if 'source_fba_load_scale' in v:
            geo_level = v['source_fba_load_scale']
        else:
            geo_level = None
        vLog.info("Retrieving flowbyactivity for datasource %s in year %s", k,
                  str(v['year']))
        flows_df = flowsa.getFlowByActivity(datasource=k,
                                            year=v['year'],
                                            flowclass=v['class'],
                                            geographic_level=geo_level)
    elif v['data_format'] == 'FBS':
        vLog.info("Retrieving flowbysector for datasource %s", k)
        flows_df = flowsa.getFlowBySector(k)
    elif v['data_format'] == 'FBS_outside_flowsa':
        vLog.info("Retrieving flowbysector for datasource %s", k)
        flows_df = dynamically_import_fxn(k, v["FBS_datapull_fxn"])(v)
    else:
        # set to None so the return below does not raise an UnboundLocalError
        flows_df = None
        vLog.error(
            "Data format not specified in method file for datasource %s", k)

    return flows_df
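For illustration, the parameter dictionary mirrors a datasource entry in a method yaml; the datasource and method names below are hypothetical:

# FBA branch: geoscale filter applied via 'source_fba_load_scale'
fba_params = {'data_format': 'FBA', 'year': 2015, 'class': ['Water'],
              'source_fba_load_scale': 'state'}
flows = load_source_dataframe('Example_FBA_Source', fba_params)

# FBS branch: only the datasource name and format are needed
flows = load_source_dataframe('Example_FBS_Method', {'data_format': 'FBS'})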
Example #3
def check_if_data_exists_at_geoscale(df, geoscale, activitynames='All'):
    """
    Check if an activity or a sector exists at the specified geoscale
    :param df: flowbyactivity dataframe
    :param activitynames: Either an activity name (ex. 'Domestic') or a sector (ex. '1124')
    :param geoscale: national, state, or county
    :return: str, 'yes' or 'no'
    """

    # if activity names are specified, subset the df to those activities before checking the geoscale
    activity_list = []
    if activitynames != 'All':
        if isinstance(activitynames, str):
            activity_list.append(activitynames)
        else:
            activity_list = activitynames
        # check for specified activity name
        df = df[(df[fba_activity_fields[0]].isin(activity_list)) |
                (df[fba_activity_fields[1]].isin(activity_list))].reset_index(
                    drop=True)
    else:
        activity_list.append('activities')

    # filter by geoscale; the FIPS list depends on the Location System
    fips = create_geoscale_list(df, geoscale)

    df = df[df['Location'].isin(fips)]

    if len(df) == 0:
        vLog.info("No flows found for %s at the %s scale",
                  ', '.join(activity_list), geoscale)
        exists = "No"
    else:
        vLog.info("Flows found for %s at the %s scale",
                  ', '.join(activity_list), geoscale)
        exists = "Yes"

    return exists
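A usage sketch, where fba_df stands in for any loaded flowbyactivity dataframe with FIPS codes in its 'Location' column:

exists = check_if_data_exists_at_geoscale(fba_df, 'county',
                                          activitynames='Domestic')
if exists == 'No':
    # fall back to a less aggregated geoscale (see the next example)
    pass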
Example #4
def check_if_data_exists_at_less_aggregated_geoscale(df, geoscale,
                                                     activityname):
    """
    In the event data does not exist at specified geoscale,
    check if data exists at less aggregated level
    :param df: Either flowbyactivity or flowbysector dataframe
    :param data_to_check: Either an activity name (ex. 'Domestic') or a sector (ex. '1124')
    :param geoscale: national, state, or county
    :param flowbytype: 'fba' for flowbyactivity, 'fbs' for flowbysector
    :return: str, geoscale to use
    """

    if geoscale == 'national':
        # subset to the activity, keeping the activity-level df so the
        # county check below is not run on an empty state-filtered frame
        df_activity = df[(df[fba_activity_fields[0]] == activityname) |
                         (df[fba_activity_fields[1]] == activityname)]
        fips = create_geoscale_list(df_activity, 'state')
        df = df_activity[df_activity['Location'].isin(fips)]
        if len(df) == 0:
            vLog.info("No flows found for %s at the state scale", activityname)
            fips = create_geoscale_list(df_activity, 'county')
            df = df_activity[df_activity['Location'].isin(fips)]
            if len(df) == 0:
                vLog.info("No flows found for %s at the county scale",
                          activityname)
            else:
                vLog.info(
                    "Flow-By-Activity data exists for %s at the county level",
                    activityname)
                new_geoscale_to_use = 'county'
                return new_geoscale_to_use
        else:
            vLog.info("Flow-By-Activity data exists for %s at the state level",
                      activityname)
            new_geoscale_to_use = 'state'
            return new_geoscale_to_use
    if geoscale == 'state':
        df = df[(df[fba_activity_fields[0]] == activityname) |
                (df[fba_activity_fields[1]] == activityname)]
        fips = create_geoscale_list(df, 'county')
        df = df[df['Location'].isin(fips)]
        if len(df) == 0:
            vLog.info("No flows found for %s at the county scale",
                      activityname)
        else:
            vLog.info(
                "Flow-By-Activity data exists for %s at the county level",
                activityname)
            new_geoscale_to_use = 'county'
            return new_geoscale_to_use
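A sketch of the fallback chain; because the function implicitly returns None when no data is found at any smaller scale, callers should guard for that:

new_scale = check_if_data_exists_at_less_aggregated_geoscale(
    fba_df, 'national', 'Domestic')
if new_scale is None:
    vLog.warning("No data found for Domestic at any geoscale")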
Example #5
def compare_fba_geo_subset_and_fbs_output_totals(fba_load, fbs_load,
                                                 activity_set, source_name,
                                                 source_attr, activity_attr,
                                                 method):
    """
    Function to compare the loaded flowbyactivity total, after subsetting by
    activity and geography, with the final flowbysector output total at the
    target sector level. Not a direct comparison of the loaded FBA because
    FBAs are modified before being subset by activity.
    :param fba_load: df, FBA loaded, before being mapped
    :param fbs_load: df, final FBS df at target sector level
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param source_attr: dictionary, attribute data from method yaml for source data
    :param activity_attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :return: logs data differences between loaded FBA and FBS output totals
             by location to the validation log
    """

    vLog.info('Comparing Flow-By-Activity subset by activity and geography to '
              'the subset Flow-By-Sector FlowAmount total.')

    # load source catalog
    cat = load_source_catalog()
    src_info = cat[source_name]

    # extract relevant geoscale data or aggregate existing data
    fba = subset_df_by_geoscale(fba_load,
                                activity_attr['allocation_from_scale'],
                                method['target_geoscale'])
    if src_info['sector-like_activities']:
        # if activities are sector-like, run sector aggregation and then
        # subset df to only keep NAICS2
        fba = fba[[
            'Class', 'FlowAmount', 'Unit', 'Context', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Location', 'LocationSystem'
        ]]
        # rename the activity cols to sector cols for purposes of aggregation
        fba = fba.rename(
            columns={
                'ActivityProducedBy': 'SectorProducedBy',
                'ActivityConsumedBy': 'SectorConsumedBy'
            })
        group_cols_agg = [
            'Class', 'Context', 'Unit', 'Location', 'LocationSystem',
            'SectorProducedBy', 'SectorConsumedBy'
        ]
        fba = sector_aggregation(fba, group_cols_agg)
        # subset fba to only include NAICS2
        fba = replace_NoneType_with_empty_cells(fba)
        fba = fba[fba['SectorConsumedBy'].apply(lambda x: len(x) == 2)
                  | fba['SectorProducedBy'].apply(lambda x: len(x) == 2)]
    # subset/agg dfs
    col_subset = [
        'Class', 'FlowAmount', 'Unit', 'Context', 'Location', 'LocationSystem'
    ]
    group_cols = ['Class', 'Unit', 'Context', 'Location', 'LocationSystem']
    # check units
    compare_df_units(fba, fbs_load)
    # fba
    fba = fba[col_subset]
    fba_agg = aggregator(fba, group_cols).reset_index(drop=True)
    fba_agg = fba_agg.rename(columns={'FlowAmount': 'FBA_amount',
                                      'Unit': 'FBA_unit'})

    # fbs
    fbs = fbs_load[col_subset]
    fbs_agg = aggregator(fbs, group_cols)
    fbs_agg = fbs_agg.rename(columns={'FlowAmount': 'FBS_amount',
                                      'Unit': 'FBS_unit'})

    try:
        # merge FBA and FBS totals
        df_merge = fba_agg.merge(fbs_agg, how='left')
        df_merge['FlowAmount_difference'] = df_merge['FBA_amount'] - df_merge[
            'FBS_amount']
        df_merge['Percent_difference'] =\
            (df_merge['FlowAmount_difference']/df_merge['FBA_amount']) * 100

        # reorder
        df_merge = df_merge[[
            'Class', 'Context', 'Location', 'LocationSystem', 'FBA_amount',
            'FBA_unit', 'FBS_amount', 'FBS_unit', 'FlowAmount_difference',
            'Percent_difference'
        ]]
        df_merge = replace_NoneType_with_empty_cells(df_merge)

        # list of contexts and locations
        context_list = df_merge[['Context', 'Location']].values.tolist()

        # loop through the contexts and print results of comparison
        vLog.info(
            'Comparing FBA %s %s subset to FBS results. Details in Validation Log',
            activity_set, source_attr['geoscale_to_use'])
        for i, j in context_list:
            df_merge_subset = df_merge[(df_merge['Context'] == i) & (
                df_merge['Location'] == j)].reset_index(drop=True)
            diff_per = df_merge_subset['Percent_difference'][0]
            if np.isnan(diff_per):
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s '
                    'does not exist in the FBS', source_name, activity_set, i)
                continue
            # make reporting more manageable
            if abs(diff_per) > 0.01:
                diff_per = round(diff_per, 2)
            else:
                diff_per = round(diff_per, 6)

            if diff_per > 0:
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                    'less than the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j, str(abs(diff_per)))
            elif diff_per < 0:
                vLog.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is %s%% '
                    'more than the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j, str(abs(diff_per)))
            elif diff_per == 0:
                vLogDetailed.info(
                    'FlowBySector FlowAmount for %s %s %s at %s is '
                    'equal to the FlowByActivity FlowAmount', source_name,
                    activity_set, i, j)

        # subset the df to include in the validation log
        # only print rows where the percent difference does not round to 0
        df_v = df_merge[df_merge['Percent_difference'].apply(
            lambda x: round(x, 3) != 0)].reset_index(drop=True)

        # log output
        log.info(
            'Save the comparison of FlowByActivity load to FlowBySector '
            'total FlowAmounts for %s in validation log file', activity_set)
        # if the df is empty, log a summary string; otherwise log the table
        if df_v.empty:
            vLogDetailed.info('Percent difference for %s all round to 0',
                              activity_set)
        else:
            vLogDetailed.info(
                'Comparison of FBA load to FBS total '
                'FlowAmounts for %s: '
                '\n {}'.format(df_v.to_string()), activity_set)
    except Exception:
        vLog.info('Error occurred when comparing total FlowAmounts '
                  'for FlowByActivity and FlowBySector')
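The sign convention of the comparison, as a worked example:

# Percent_difference = (FBA_amount - FBS_amount) / FBA_amount * 100
# e.g. FBA_amount = 100, FBS_amount = 98:
#   FlowAmount_difference = 100 - 98 = 2
#   Percent_difference = 2 / 100 * 100 = 2  -> FBS total is 2% less than FBA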
Example #6
def compare_activity_to_sector_flowamounts(fba_load, fbs_load, activity_set,
                                           source_name, config):
    """
    Function to compare the loaded flowbyactivity with the final flowbysector
    output by activity name (where it exists) at the target sector level,
    checking for data loss
    :param fba_load: df, FBA loaded and mapped using FEDEFL
    :param fbs_load: df, final FBS df
    :param activity_set: str, activity set
    :param source_name: str, source name
    :param config: dictionary, method yaml
    :return: logs data differences between loaded FBA and FBS output
             to the validation log
    """
    if load_source_catalog()[source_name]['sector-like_activities']:
        vLog.debug(
            'Not comparing loaded FlowByActivity to FlowBySector ratios '
            'for a dataset with sector-like activities because if there '
            'are modifications to flowamounts for a sector, then the '
            'ratios will be different')
    else:
        # subset fba df
        fba = fba_load[[
            'Class', 'MetaSources', 'Flowable', 'Unit', 'FlowType',
            'ActivityProducedBy', 'ActivityConsumedBy', 'Context', 'Location',
            'LocationSystem', 'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)
        fba.loc[:, 'Location'] = US_FIPS
        group_cols = [
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year'
        ]
        fba_agg = aggregator(fba, group_cols)
        fba_agg.rename(columns={'FlowAmount': 'FBA_amount'}, inplace=True)

        # subset fbs df

        fbs = fbs_load[[
            'Class', 'SectorSourceName', 'Flowable', 'Unit', 'FlowType',
            'SectorProducedBy', 'SectorConsumedBy', 'ActivityProducedBy',
            'ActivityConsumedBy', 'Context', 'Location', 'LocationSystem',
            'Year', 'FlowAmount'
        ]].drop_duplicates().reset_index(drop=True)

        fbs = replace_NoneType_with_empty_cells(fbs)

        fbs['ProducedLength'] = fbs['SectorProducedBy'].str.len()
        fbs['ConsumedLength'] = fbs['SectorConsumedBy'].str.len()
        fbs['SectorLength'] = fbs[['ProducedLength',
                                   'ConsumedLength']].max(axis=1)
        fbs.loc[:, 'Location'] = US_FIPS
        group_cols = [
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
            'SectorLength'
        ]
        fbs_agg = aggregator(fbs, group_cols)
        fbs_agg.rename(columns={'FlowAmount': 'FBS_amount'}, inplace=True)

        # merge the fba and fbs aggregated totals
        df_merge = fba_agg.merge(fbs_agg,
                                 on=['ActivityProducedBy',
                                     'ActivityConsumedBy', 'Flowable', 'Unit',
                                     'FlowType', 'Context', 'Location',
                                     'LocationSystem', 'Year'],
                                 how='left')
        df_merge['Ratio'] = df_merge['FBS_amount'] / df_merge['FBA_amount']

        # reorder
        df_merge = df_merge[[
            'ActivityProducedBy', 'ActivityConsumedBy', 'Flowable', 'Unit',
            'FlowType', 'Context', 'Location', 'LocationSystem', 'Year',
            'SectorLength', 'FBA_amount', 'FBS_amount', 'Ratio'
        ]]

        # keep only rows of specified sector length
        comparison = df_merge[df_merge['SectorLength'] == sector_level_key[
            config['target_sector_level']]].reset_index(drop=True)

        tolerance = 0.01
        comparison2 = comparison[(comparison['Ratio'] < 1 - tolerance) |
                                 (comparison['Ratio'] > 1 + tolerance)]

        if len(comparison2) > 0:
            vLog.info(
                'There are %s combinations of flowable/context/sector length '
                'where the flowbyactivity to flowbysector ratio differs from '
                '1 by more than %s', len(comparison2), str(tolerance))

        # include df subset in the validation log;
        # only print rows where the rounded flow amount ratio is less than 1
        df_v = comparison2[comparison2['Ratio'].apply(
            lambda x: round(x, 3) < 1)].reset_index(drop=True)

        # save to validation log
        log.info(
            'Save the comparison of FlowByActivity load to FlowBySector ratios '
            'for %s in validation log', activity_set)
        # if the df is empty, log a summary string; otherwise log the table
        if df_v.empty:
            vLogDetailed.info('Ratios for %s all round to 1', activity_set)
        else:
            vLogDetailed.info(
                'Comparison of FlowByActivity load to FlowBySector ratios for %s: '
                '\n {}'.format(df_v.to_string()), activity_set)
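A worked example of the ratio check:

# Ratio = FBS_amount / FBA_amount; with tolerance = 0.01, rows are flagged
# when the ratio falls outside [0.99, 1.01]. For example, an FBA total of
# 200 allocated to sectors summing to 195 gives Ratio = 195 / 200 = 0.975,
# which is flagged and, since round(0.975, 3) < 1, also written to the
# validation log.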
Example #7
def calculate_flowamount_diff_between_dfs(dfa_load, dfb_load):
    """
    Calculate the differences in FlowAmounts between two dfs
    :param dfa_load: df, initial df
    :param dfb_load: df, modified df
    :return: df, comparing changes in flowamounts between 2 dfs
    """

    # subset the dataframes, only keeping data for easy comparison of flowamounts
    drop_cols = [
        'Year', 'MeasureofSpread', 'Spread', 'DistributionType', 'Min', 'Max',
        'DataReliability', 'DataCollection'
    ]
    # drop cols and rename, ignore error if a df does not contain a column to drop
    dfa = dfa_load.drop(
        drop_cols, axis=1,
        errors='ignore').rename(columns={'FlowAmount': 'FlowAmount_Original'})
    dfb = dfb_load.drop(
        drop_cols, axis=1,
        errors='ignore').rename(columns={'FlowAmount': 'FlowAmount_Modified'})
    # build a list of the two modified dfs
    df_list = []
    for df_x in [dfa, dfb]:
        # assign a new column of geoscale by which to aggregate
        df_x = df_x.assign(geoscale=np.where(
            df_x['Location'].apply(lambda x: x.endswith('000')),
            'state', 'county'))
        df_x = df_x.assign(geoscale=np.where(df_x['Location'] == '00000',
                                             'national', df_x['geoscale']))
        # ensure all nan/nones are filled consistently so the merge aligns
        df_x = replace_strings_with_NoneType(df_x)
        df_list.append(df_x)
    # merge the two dataframes
    df = df_list[0].merge(df_list[1], how='outer')

    # determine if any new data is negative
    dfn = df[df['FlowAmount_Modified'] < 0].reset_index(drop=True)
    if len(dfn) > 0:
        vLog.info(
            'There are negative FlowAmounts in new dataframe, see Validation Log'
        )
        vLogDetailed.info('Negative FlowAmounts in new dataframe: '
                          '\n {}'.format(dfn.to_string()))

    # Because code will sometimes change terminology, aggregate
    # data by context and flowable to compare df differences
    # subset df
    dfs = df[[
        'Flowable', 'Context', 'ActivityProducedBy', 'ActivityConsumedBy',
        'FlowAmount_Original', 'FlowAmount_Modified', 'Unit', 'geoscale'
    ]]
    agg_cols = [
        'Flowable', 'Context', 'ActivityProducedBy', 'ActivityConsumedBy',
        'Unit', 'geoscale'
    ]
    dfagg = dfs.groupby(agg_cols, dropna=False, as_index=False).agg(
        {'FlowAmount_Original': sum, 'FlowAmount_Modified': sum})
    # column calculating difference
    dfagg['FlowAmount_Difference'] = dfagg['FlowAmount_Modified'] - dfagg[
        'FlowAmount_Original']
    dfagg['Percent_Difference'] = (dfagg['FlowAmount_Difference'] /
                                   dfagg['FlowAmount_Original']) * 100
    # drop rows where difference = 0
    dfagg2 = dfagg[dfagg['FlowAmount_Difference'] != 0].reset_index(drop=True)
    if len(dfagg2) == 0:
        vLogDetailed.info('No FlowAmount differences')
    else:
        # subset df and aggregate, also print out the total aggregate diff at the geoscale
        dfagg3 = replace_strings_with_NoneType(dfagg).drop(columns=[
            'ActivityProducedBy', 'ActivityConsumedBy',
            'FlowAmount_Difference', 'Percent_Difference'
        ])
        dfagg4 = dfagg3.groupby(['Flowable', 'Context', 'Unit', 'geoscale'],
                                dropna=False,
                                as_index=False).agg({
                                    'FlowAmount_Original': sum,
                                    'FlowAmount_Modified': sum
                                })
        # column calculating difference
        dfagg4['FlowAmount_Difference'] = dfagg4[
            'FlowAmount_Modified'] - dfagg4['FlowAmount_Original']
        dfagg4['Percent_Difference'] = (dfagg4['FlowAmount_Difference'] /
                                        dfagg4['FlowAmount_Original']) * 100
        # drop rows where difference = 0
        dfagg5 = dfagg4[dfagg4['FlowAmount_Difference'] != 0].reset_index(
            drop=True)
        vLogDetailed.info('Total FlowAmount differences between dataframes: '
                          '\n {}'.format(dfagg5.to_string(index=False)))

        # save detail output in log file
        vLogDetailed.info('Total FlowAmount differences by Activity Columns: '
                          '\n {}'.format(dfagg2.to_string(index=False)))
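A minimal usage sketch with a hypothetical before/after pair, where the modification scales a national FlowAmount by 0.9:

import pandas as pd

cols = ['Flowable', 'Context', 'ActivityProducedBy', 'ActivityConsumedBy',
        'Unit', 'Location', 'FlowAmount']
before = pd.DataFrame([['Water', 'resource/water', 'Irrigation', None,
                        'Mgal', '00000', 10.0]], columns=cols)
after = before.assign(FlowAmount=before['FlowAmount'] * 0.9)
# logs a -10% difference at the national geoscale
calculate_flowamount_diff_between_dfs(before, after)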
Example #8
def compare_geographic_totals(df_subset, df_load, sourcename, attr,
                              activity_set, activity_names):
    """
    Check for any data loss between the geoscale used and published national data
    :param df_subset: df, after subset by geography
    :param df_load: df, loaded data, including published national data
    :param sourcename: str, source name
    :param attr: dictionary, attributes
    :param activity_set: str, activity set
    :param activity_names: list of names in the activity set by which
           to subset national level data
    :return: logs comparison of published national level data to the df subset
    """

    # subset df_load to national level
    nat = df_load[df_load['Location'] == US_FIPS].reset_index(
        drop=True).rename(columns={'FlowAmount': 'FlowAmount_nat'})
    # if df len is not 0, continue with comparison
    if len(nat) != 0:
        # subset national level data by activity set names
        nat = nat[(nat[fba_activity_fields[0]].isin(activity_names)) | (
            nat[fba_activity_fields[1]].isin(activity_names))].reset_index(
                drop=True)
        # drop the geoscale in df_subset and sum
        sub = df_subset.assign(Location=US_FIPS)
        # depending on the datasource, might need to rename some strings for national comparison
        sub = rename_column_values_for_comparison(sub, sourcename)
        sub2 = aggregator(sub, fba_default_grouping_fields).rename(
            columns={'FlowAmount': 'FlowAmount_sub'})

        # compare df
        merge_cols = [
            'Class', 'SourceName', 'FlowName', 'Unit', 'FlowType',
            'ActivityProducedBy', 'ActivityConsumedBy', 'Compartment',
            'Location', 'LocationSystem', 'Year'
        ]
        # compare units
        compare_df_units(nat, sub2)
        df_m = pd.merge(nat[merge_cols + ['FlowAmount_nat']],
                        sub2[merge_cols + ['FlowAmount_sub']],
                        how='outer')
        df_m = df_m.assign(FlowAmount_diff=df_m['FlowAmount_nat'] -
                           df_m['FlowAmount_sub'])
        df_m = df_m.assign(Percent_Diff=(
            abs(df_m['FlowAmount_diff'] / df_m['FlowAmount_nat']) * 100))
        df_m = df_m[df_m['FlowAmount_diff'] != 0].reset_index(drop=True)
        # subset the merged df to what to include in the validation df
        # include data where percent difference is > 1 or where value is nan
        df_m_sub = df_m[(df_m['Percent_Diff'] > 1) |
                        (df_m['Percent_Diff'].isna())].reset_index(drop=True)

        if len(df_m_sub) == 0:
            vLog.info(
                'No data loss greater than 1%% between national '
                'level data and %s subset', attr['allocation_from_scale'])
        else:
            vLog.info(
                'There are data differences between published national values '
                'and %s subset, saving to validation log',
                attr['allocation_from_scale'])

            vLogDetailed.info(
                'Comparison of National FlowAmounts to aggregated Data '
                'Subset for %s: \n {}'.format(df_m_sub.to_string()),
                activity_set)
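A worked note on the reporting threshold:

# Percent_Diff = abs(FlowAmount_diff / FlowAmount_nat) * 100. A national
# total of 1000 vs. a subset sum of 985 gives Percent_Diff = 1.5, which
# exceeds the 1% cutoff and is written to the validation log; NaN percent
# differences (rows present on only one side of the outer merge) are kept
# as well.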
Example #9
def main(**kwargs):
    """
    Creates a flowbysector dataset
    :param kwargs: dictionary of arguments, only argument is "method_name", the name of method
                   corresponding to flowbysector method yaml name
    :return: parquet, FBS save to local folder
    """
    if len(kwargs) == 0:
        kwargs = parse_args()

    method_name = kwargs['method']
    # assign arguments
    vLog.info("Initiating flowbysector creation for %s", method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # Create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows,
                             flow_by_activity_fields,
                             fba_fill_na_dict,
                             drop_description=False)

            # map flows to federal flow list or material flow list
            flows_mapped, mapping_files = map_fbs_flows(flows,
                                                        k,
                                                        v,
                                                        keep_fba_columns=True)

            # clean up fba, if specified in yaml
            if "clean_fba_df_fxn" in v:
                vLog.info("Cleaning up %s FlowByActivity", k)
                flows_mapped = dynamically_import_fxn(
                    k, v["clean_fba_df_fxn"])(flows_mapped)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath +
                                         v['activity_set_file'],
                                         dtype=str)
            else:
                aset_names = None

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = aset_names[aset_names['activity_set'] ==
                                       aset]['name']
                else:
                    names = attr['names']

                vLog.info("Preparing to handle %s in %s", aset, k)
                # subset fba data by activity
                flows_subset = flows_mapped[
                    (flows_mapped[fba_activity_fields[0]].isin(names)) |
                    (flows_mapped[fba_activity_fields[1]].isin(names))
                ].reset_index(drop=True)

                # if activities are sector-like, check sectors are valid
                if load_source_catalog()[k]['sector-like_activities']:
                    flows_subset2 =\
                        replace_naics_w_naics_from_another_year(flows_subset,
                                                                method['target_sector_source'])
                    # check impact on df FlowAmounts
                    vLog.info(
                        'Calculate FlowAmount difference caused by '
                        'replacing NAICS Codes with %s, saving difference in Validation log',
                        method['target_sector_source'],
                    )
                    calculate_flowamount_diff_between_dfs(
                        flows_subset, flows_subset2)
                else:
                    flows_subset2 = flows_subset.copy()

                # extract relevant geoscale data or aggregate existing data
                flows_subset_geo = subset_df_by_geoscale(
                    flows_subset2, v['geoscale_to_use'],
                    attr['allocation_from_scale'])
                # if loading data subnational geoscale, check for data loss
                if attr['allocation_from_scale'] != 'national':
                    compare_geographic_totals(flows_subset_geo, flows_mapped,
                                              k, attr, aset, names)

                # Add sectors to df activity, depending on level of specified sector aggregation
                log.info("Adding sectors to %s", k)
                flows_subset_wsec =\
                    add_sectors_to_flowbyactivity(flows_subset_geo,
                                                  sectorsourcename=method['target_sector_source'],
                                                  allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if "clean_fba_w_sec_df_fxn" in v:
                    vLog.info("Cleaning up %s FlowByActivity with sectors", k)
                    flows_subset_wsec = \
                        dynamically_import_fxn(k, v["clean_fba_w_sec_df_fxn"])(flows_subset_wsec,
                                                                               attr=attr,
                                                                               method=method)

                # rename SourceName to MetaSources and drop columns
                flows_mapped_wsec = flows_subset_wsec.\
                    rename(columns={'SourceName': 'MetaSources'}).\
                    drop(columns=['FlowName', 'Compartment'])

                # if allocation method is "direct", there is no need to create
                # allocation ratios; otherwise use an allocation dataframe to
                # create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    fbs = direct_allocation_method(flows_mapped_wsec, k, names,
                                                   method)
                # if the allocation method for an activity set requires a
                # specific function due to the complexity of the allocation,
                # call on that function here
                elif attr['allocation_method'] == 'allocation_function':
                    fbs = function_allocation_method(flows_mapped_wsec, k,
                                                     names, attr, fbs_list)
                else:
                    fbs =\
                        dataset_allocation_method(flows_mapped_wsec, attr,
                                                  names, method, k, v, aset,
                                                  method_name, aset_names)

                # drop rows where flowamount = 0 (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors being activity-like or not
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to %s level",
                         method['target_geoscale'])
                # determine from scale
                if fips_number_key[v['geoscale_to_use']] <\
                        fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']

                fbs_geo_agg = agg_by_geoscale(fbs, from_scale,
                                              method['target_geoscale'],
                                              groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg)

                # check if any sector information is lost before reaching
                # the target sector length, if so,
                # allocate values equally to disaggregated sectors
                vLog.info(
                    'Searching for and allocating FlowAmounts for any parent '
                    'NAICS that were dropped in the subset to '
                    '%s child NAICS', method['target_sector_level'])
                fbs_agg_2 = allocate_dropped_sector_data(
                    fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                compare_activity_to_sector_flowamounts(flows_mapped_wsec,
                                                       fbs_agg_2, aset, k,
                                                       method)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are
                # NAICS and can get duplicate rows
                fbs_1 = fbs_agg_2.loc[(fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                                      (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[(fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                                      (fbs_agg_2[fbs_activity_fields[1]].isnull())].\
                    reset_index(drop=True)
                fbs_3 = fbs_agg_2.loc[(fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                                      (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].\
                    reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'],
                    axis=1,
                    errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_geo_subset_and_fbs_output_totals(
                    flows_subset_geo, fbs_sector_subset, aset, k, v, attr,
                    method)

                log.info("Completed flowbysector for %s", aset)
                fbs_list.append(fbs_sector_subset)
        else:
            # if the loaded flow df is already in FBS format, append directly to list of FBS
            log.info("Append %s to FBS list", k)
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)
    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # add missing fields, ensure correct data types, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    # prior to aggregating, replace MetaSources string with all sources
    # that share context/flowable/sector values
    fbss = harmonize_FBS_columns(fbss)
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # ensure correct data types/order of columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable',
         'Context']).reset_index(drop=True)
    # temporarily reset data quality scores
    fbss = reset_fbs_dq_scores(fbss)
    # save parquet file
    meta = set_fb_meta(method_name, "FlowBySector")
    write_df_to_file(fbss, paths, meta)
    write_metadata(method_name, method, meta, "FlowBySector")
    # rename the log file saved to local directory
    rename_log_file(method_name, meta)
    log.info(
        'See the Validation log for detailed assessment of model results in %s',
        logoutputpath)
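Usage sketch; the method name must match a flowbysector method yaml and is illustrative here:

if __name__ == '__main__':
    main(method='Water_national_2015_m1')  # hypothetical method yaml name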