Example 1
def store_test_flowbyactivity(csvname, year=None):
    """Prints the data frame into a parquet file."""
    if year is not None:
        f = fbaoutputpath + csvname + "_" + str(year) + '.parquet'
    else:
        f = fbaoutputpath + csvname + '.parquet'
    try:
        result = pd.read_csv(datapath + csvname + '.csv', dtype='str')
        result = add_missing_flow_by_fields(result, flow_by_activity_fields)
        result.to_parquet(f, engine="pyarrow")
    except Exception:
        log.error('Failed to save ' + f)
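
A hypothetical invocation, assuming the module-level datapath and fbaoutputpath directory settings point at existing folders and a matching test csv exists:

# reads <datapath>/USDA_CoA_Cropland.csv and writes
# <fbaoutputpath>/USDA_CoA_Cropland_2017.parquet
store_test_flowbyactivity('USDA_CoA_Cropland', year=2017)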
Example 2
def prepare_stewi_fbs(df, inventory_dict, NAICS_level, geo_scale):
    """
    Prepares a stewi-derived dataframe in flowbysector format
    :param df: a dataframe of inventory flows with NAICS_lvl and Location columns
    :param inventory_dict: a dictionary of inventory types and years
    :param NAICS_level: desired NAICS aggregation level
    :param geo_scale: desired geographic aggregation level
    """
    from stewi.globals import weighted_average

    # update location to appropriate geoscale prior to aggregating
    df.dropna(subset=['Location'], inplace=True)
    df['Location'] = df['Location'].astype(str)
    df = update_geoscale(df, geo_scale)

    # assign grouping variables based on desired geographic aggregation level
    grouping_vars = ['NAICS_lvl', 'FlowName', 'Compartment', 'Location']
    if 'MetaSources' in df:
        grouping_vars.append('MetaSources')

    # aggregate by NAICS code, FlowName, compartment, and geographic level
    fbs = df.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'Year': 'first',
        'Unit': 'first'
    })

    # add reliability score
    fbs['DataReliability'] = weighted_average(df, 'DataReliability',
                                              'FlowAmount', grouping_vars)
    fbs.reset_index(inplace=True)

    # apply flow mapping
    fbs = map_elementary_flows(fbs, list(inventory_dict.keys()))

    # rename columns to match flowbysector format
    fbs = fbs.rename(columns={"NAICS_lvl": "SectorProducedBy"})

    # add hardcoded data; depending on the source data, some of these fields may need to change
    fbs['Class'] = 'Chemicals'
    fbs['SectorConsumedBy'] = 'None'
    fbs['SectorSourceName'] = 'NAICS_2012_Code'
    fbs['FlowType'] = 'ELEMENTARY_FLOW'

    fbs = assign_fips_location_system(fbs, list(inventory_dict.values())[0])

    # add missing flow by sector fields
    fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

    fbs = check_for_missing_sector_data(fbs, NAICS_level)

    # sort dataframe and reset index
    fbs = fbs.sort_values(list(
        flow_by_sector_fields.keys())).reset_index(drop=True)

    # check the sector codes to make sure they are NAICS 2012 codes
    fbs = replace_naics_w_naics_2012(fbs, 'NAICS_2012_Code')

    return fbs
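
The DataReliability score above is a flow-amount-weighted mean of the reliability column within each group. A minimal sketch of what the imported stewi.globals.weighted_average presumably computes (the actual helper may differ):

import pandas as pd

def weighted_average(df, value_col, weight_col, grouping_vars):
    # divide group-wise weighted sums by the group-wise total weights
    grouped = df.assign(_wtd=df[value_col] * df[weight_col]).groupby(grouping_vars)
    return grouped['_wtd'].sum() / grouped[weight_col].sum()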
Example 3
def bls_clean_allocation_fba_w_sec(df_w_sec, **kwargs):
    """
    clean up bls df with sectors by estimating suppresed data
    :param df_w_sec:
    :param attr:
    :param method:
    :return:
    """
    from flowsa.flowbyfunctions import estimate_suppressed_data, sector_disaggregation, sector_aggregation, \
        flow_by_activity_wsec_mapped_fields, add_missing_flow_by_fields, replace_strings_with_NoneType

    sector_column = 'SectorProducedBy'

    df = estimate_suppressed_data(df_w_sec, sector_column)

    df = add_missing_flow_by_fields(df, flow_by_activity_wsec_mapped_fields)
    df = replace_strings_with_NoneType(df)

    # df = sector_aggregation(df, fba_mapped_default_grouping_fields)
    # df = sector_disaggregation(df, fba_mapped_default_grouping_fields)

    return df
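
Cleaning functions like this are typically referenced by name in a method yaml and invoked by the framework; a hypothetical direct call, assuming attr and method dictionaries loaded from such a yaml:

df_clean = bls_clean_allocation_fba_w_sec(df_w_sec, attr=attr, method=method)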
Example 4
        return df


if __name__ == '__main__':
    # assign arguments
    args = parse_args()
    # assign yaml parameters (common.py fxn)
    config = load_sourceconfig(args['source'])
    # build the base url with strings that will be replaced
    build_url = build_url_for_query(config['url'])
    # replace parts of urls with specific instructions from source.py
    urls = assemble_urls_for_query(build_url, config, args)
    # create a list with data from all source urls
    dataframe_list = call_urls(urls, args)
    # concat the dataframes and parse data with specific instructions from source.py
    df = parse_data(dataframe_list, args)
    # log that data was retrieved
    log.info("Retrieved data for " + args['source'])
    # add any missing columns of data and cast to appropriate data type
    flow_df = add_missing_flow_by_fields(df, flow_by_activity_fields)
    # modify flow units
    flow_df = convert_fba_unit(flow_df)
    # sort df and reset index
    flow_df = flow_df.sort_values([
        'Class', 'Location', 'ActivityProducedBy', 'ActivityConsumedBy',
        'FlowName', 'Compartment'
    ]).reset_index(drop=True)
    # save as parquet file
    parquet_name = args['source'] + '_' + args['year']
    store_flowbyactivity(flow_df, parquet_name)
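
For reference, a sketch of the inputs this entry point consumes; the exact keys come from parse_args and the source yaml, and the values below are hypothetical:

# hypothetical argument and config shapes, for illustration only
args = {'source': 'USDA_CoA_Cropland', 'year': '2017'}
config = {'url': {'base_url': 'https://example.gov/api/'}}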
Example 5
def disaggregate_cropland(fba_w_sector, attr):
    """
    In the event there are 4 (or 5) digit naics for cropland at the county level, use state level harvested cropland to
    create ratios
    :param fba_w_sector:
    :param attr:
    :return:
    """

    import flowsa
    from flowsa.flowbyfunctions import generalize_activity_field_names, sector_aggregation, \
        sector_disaggregation, sector_ratios, fbs_default_grouping_fields, clean_df, \
        fba_fill_na_dict, add_missing_flow_by_fields
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # drop pastureland data
    crop = fba_w_sector.loc[fba_w_sector['Sector'].apply(lambda x: str(x[0:3]))
                            != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop['Sector'].apply(lambda x: len(x) > 3)].reset_index(
        drop=True)
    # create tmp location
    crop.loc[:, 'Location_tmp'] = crop['Location'].apply(lambda x: str(x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics_load = flowsa.getFlowByActivity(
        flowclass=['Land'],
        years=[attr['allocation_source_year']],
        datasource="USDA_CoA_Cropland_NAICS").reset_index(drop=True)
    # clean df
    naics = clean_df(naics_load, flow_by_activity_fields, fba_fill_na_dict)
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] ==
                  'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(naics,
                                          sectorsourcename='NAICS_2012_Code',
                                          levelofSectoragg='agg')
    # add missing fbs fields
    naics = add_missing_flow_by_fields(naics, flow_by_sector_fields)

    # aggregate sectors to create any missing naics levels
    naics = sector_aggregation(naics, fbs_default_grouping_fields)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics = sector_disaggregation(naics)
    # drop rows where sector consumed by is none and FlowAmount 0
    naics = naics[naics['SectorConsumedBy'].notnull()]
    naics = naics.loc[naics['FlowAmount'] != 0]
    # create ratios
    naics = sector_ratios(naics)
    # drop sectors < 4 digits
    #naics = naics[naics['SectorConsumedBy'].apply(lambda x: len(x) > 3)].reset_index(drop=True)
    # create temporary sector column to match the two dfs on
    naics.loc[:, 'Location_tmp'] = naics['Location'].apply(lambda x: str(x[0:2]))

    # for loop through naics lengths to determine naics 4 and 5 digits to disaggregate
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop['Sector'].apply(
            lambda x: i + 1 >= len(x) >= i)]
        crop_subset.loc[:, 'Sector_tmp'] = crop_subset['Sector'].apply(
            lambda x: x[0:i])
        # if duplicates drop all rows
        df = crop_subset.drop_duplicates(subset=['Location', 'Sector_tmp'],
                                         keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df['Sector'].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = naics.loc[naics['SectorConsumedBy'].apply(
            lambda x: len(x) == i + 1)].reset_index(drop=True)
        naics_subset.loc[:, 'Sector_tmp'] = naics_subset[
            'SectorConsumedBy'].apply(lambda x: x[0:i])
        # merge the two df based on locations
        df_subset = pd.merge(df_subset,
                             naics_subset[[
                                 'SectorConsumedBy', 'FlowAmountRatio',
                                 'Sector_tmp', 'Location_tmp'
                             ]],
                             how='left',
                             left_on=['Sector', 'Location_tmp'],
                             right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset[
            'FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[~df_subset['FlowAmount'].isna()].reset_index(
            drop=True)
        # drop columns
        df_subset = df_subset.drop(
            columns=['Sector', 'FlowAmountRatio', 'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(columns={"SectorConsumedBy": "Sector"})
        # add new rows of data to crop df
        crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])

    # pasture data
    pasture = fba_w_sector.loc[fba_w_sector['Sector'].apply(
        lambda x: str(x[0:3])) == '112'].reset_index(drop=True)

    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop], sort=True).reset_index(drop=True)

    return fba_w_sector
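
The loop above multiplies a shorter-NAICS county amount by the state-level ratios of its child sectors. A minimal sketch with hypothetical numbers, assuming one NAICS4 county record and two NAICS5 state ratios:

import pandas as pd

# county-level cropland reported only at NAICS4 (hypothetical values)
crop = pd.DataFrame({'Sector': ['1111'], 'Location': ['06001'],
                     'Location_tmp': ['06'], 'FlowAmount': [100.0]})
# state-level ratios for the child NAICS5 codes
naics = pd.DataFrame({'SectorConsumedBy': ['11111', '11112'],
                      'Sector_tmp': ['1111', '1111'],
                      'Location_tmp': ['06', '06'],
                      'FlowAmountRatio': [0.75, 0.25]})
merged = crop.merge(naics, left_on=['Sector', 'Location_tmp'],
                    right_on=['Sector_tmp', 'Location_tmp'])
# 11111 receives 75.0 and 11112 receives 25.0 for county 06001
merged['FlowAmount'] = merged['FlowAmount'] * merged['FlowAmountRatio']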
Example 6
                data["ActivityProducedBy"] = produced_by(entry)
                data["ActivityConsumedBy"] = None
                dataframe = pd.read_csv(external_datapath + "/" + entry,
                                        header=0,
                                        dtype=str)
                for col in dataframe.columns:
                    if "Percent" in str(col):
                        del dataframe[col]

                for index, row in dataframe.iterrows():
                    for column in dataframe.columns:
                        if "Material" != column:
                            col_string = column.split()
                            data["Unit"] = col_string[1]
                            data['FlowName'] = dataframe.iloc[index][
                                "Material"] + " " + col_string[0]
                            if dataframe.iloc[index][column] != "-":
                                data["FlowAmount"] = int(
                                    dataframe.iloc[index][column])
                                output = output.append(data, ignore_index=True)
                                output = assign_fips_location_system(
                                    output, '2014')
    # add missing dataframe fields (also converts columns to desired datatype)
    flow_df = add_missing_flow_by_fields(output, flow_by_activity_fields)
    flow_df = clean_df(flow_df,
                       flow_by_activity_fields,
                       fba_fill_na_dict,
                       drop_description=False)
    parquet_name = 'California_Commercial_bySector'
    store_flowbyactivity(flow_df, parquet_name, 2014)
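
The inner loop above parses wide-format column headers of the form '<flow> <unit>' and prefixes the Material value; a hypothetical header illustrates the split:

# hypothetical column header from the source csv
col_string = "Disposed Tons".split()
unit = col_string[1]                       # 'Tons'
flow_name = "Food" + " " + col_string[0]   # 'Food Disposed'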
Example 7
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector
    """

    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of water data and allocation datasets
    fbas = method['flowbyactivity_sources']
    # Create empty list for storing fbs files
    fbss = []
    for k, v in fbas.items():
        # pull water data for allocation
        log.info("Retrieving flowbyactivity for datasource " + k + " in year " + str(v['year']))
        flows = flowsa.getFlowByActivity(flowclass=[v['class']],
                                         years=[v['year']],
                                         datasource=k)

        # if necessary, standardize names in data set
        if v['activity_name_standardization_fxn'] != 'None':
            log.info("Standardizing activity names in " + k)
            flows = getattr(sys.modules[__name__], v['activity_name_standardization_fxn'])(flows)

        # drop description field
        flows = flows.drop(columns='Description')
        # fill null values
        flows = flows.fillna(value=fba_fill_na_dict)
        # map df to elementary flows - commented out until mapping complete
        # log.info("Mapping flows in " + k + ' to federal elementary flow list')
        # flows_mapped = map_elementary_flows(flows, k)
        # convert units; todo: think about unit conversion here
        log.info("Converting units in " + k)
        flows = convert_unit(flows)

        # create dictionary of allocation datasets for different activities
        activities = v['activity_sets']
        for aset, attr in activities.items():
            # subset by named activities
            names = [attr['names']]
            log.info("Preparing to handle subset of flownames " + ', '.join(map(str, names)) + " in " + k)
            # subset usgs data by activity
            flow_subset = flows[(flows[fba_activity_fields[0]].isin(names)) |
                                (flows[fba_activity_fields[1]].isin(names))]

            # Reset index values after subset
            flow_subset = flow_subset.reset_index(drop=True)

            # check if flowbyactivity data exists at specified geoscale to use
            log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at the " +
                     v['geoscale_to_use'] + ' level')
            geocheck = check_if_data_exists_at_geoscale(flow_subset, names, v['geoscale_to_use'])
            # aggregate geographically to the scale of the allocation dataset
            if geocheck == "Yes":
                activity_from_scale = v['geoscale_to_use']
            else:
                # if activity does not exist at specified geoscale, issue warning and use data at less aggregated
                # geoscale, and sum to specified geoscale
                log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at a less aggregated level")
                new_geoscale_to_use = check_if_data_exists_at_less_aggregated_geoscale(flow_subset, names,
                                                                                        v['geoscale_to_use'])
                activity_from_scale = new_geoscale_to_use

            activity_to_scale = attr['allocation_from_scale']
            # if usgs is less aggregated than allocation df, aggregate usgs activity to target scale
            if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
                log.info("Aggregating subset from " + activity_from_scale + " to " + activity_to_scale)
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, activity_to_scale, fba_default_grouping_fields, names)
            # else, aggregate to the geoscale we want to use
            elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]:
                log.info("Aggregating subset from " + activity_from_scale + " to " + v['geoscale_to_use'])
                flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, v['geoscale_to_use'], fba_default_grouping_fields, names)
            # else, if usgs is more aggregated than allocation table, filter relevant rows
            else:
                log.info("Filtering out " + activity_from_scale + " data")
                flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, names)

            # pad the location column with zeros, if necessary
            flow_subset['Location'] = flow_subset['Location'].apply(lambda x: x.ljust(3 + len(x), '0') if len(x) < 5
                                                                    else x
                                                                    )

            # Add sectors to usgs activity, creating two versions of the flow subset
            # the first version "flow_subset" is the most disaggregated version of the Sectors (NAICS)
            # the second version, "flow_subset_agg" includes only the most aggregated level of sectors
            log.info("Adding sectors to " + k + " for " + ', '.join(map(str, names)))
            flow_subset_wsec = add_sectors_to_flowbyactivity(flow_subset,
                                                             sectorsourcename=method['target_sector_source'])
            flow_subset_wsec_agg = add_sectors_to_flowbyactivity(flow_subset,
                                                                 sectorsourcename=method['target_sector_source'],
                                                                 levelofSectoragg='agg')

            # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation
            # dataframe to create sector allocation ratios
            if attr['allocation_method'] == 'direct':
                fbs = flow_subset_wsec_agg.copy()
            else:
                # determine appropriate allocation dataset
                log.info("Loading allocation flowbyactivity " + attr['allocation_source'] + " for year " + str(attr['allocation_source_year']))
                fba_allocation = flowsa.getFlowByActivity(flowclass=[attr['allocation_source_class']],
                                                          datasource=attr['allocation_source'],
                                                          years=[attr['allocation_source_year']]).reset_index(drop=True)

                # fill null values
                fba_allocation = fba_allocation.fillna(value=fba_fill_na_dict)
                # convert unit
                fba_allocation = convert_unit(fba_allocation)

                # subset based on yaml settings
                if attr['allocation_flow'] != 'None':
                    fba_allocation = fba_allocation.loc[fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                if attr['allocation_compartment'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['Compartment'].isin(attr['allocation_compartment'])]
                # reset index
                fba_allocation = fba_allocation.reset_index(drop=True)

                # check if allocation data exists at specified geoscale to use
                log.info("Checking if" + " allocation data exists for " + ', '.join(map(str, names)) +
                         " at the " + attr['allocation_from_scale'] + " level")
                check_if_data_exists_at_geoscale(fba_allocation, names, attr['allocation_from_scale'])

                # aggregate geographically to the scale of the flowbyactivty source, if necessary
                from_scale = attr['allocation_from_scale']
                to_scale = v['geoscale_to_use']
                # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale
                if fips_number_key[from_scale] > fips_number_key[to_scale]:
                    fba_allocation = agg_by_geoscale(fba_allocation, from_scale, to_scale, fba_default_grouping_fields, names)
                # else, if usgs is more aggregated than allocation table, use usgs as both to and from scale
                else:
                    fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names)

                # assign sector to allocation dataset
                log.info("Adding sectors to " + attr['allocation_source'])
                fba_allocation = add_sectors_to_flowbyactivity(fba_allocation,
                                                               sectorsourcename=method['target_sector_source'],
                                                               levelofSectoragg=attr[
                                                                   'allocation_sector_aggregation'])
                # subset fba datasets to only keep the naics associated with usgs activity subset
                log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k)
                fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names)
                # Reset index values after subset
                fba_allocation_subset = fba_allocation_subset.reset_index(drop=True)
                # generalize activity field names to enable link to water withdrawal table
                log.info("Generalizing activity names in subset of " + attr['allocation_source'])
                fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset)
                # drop columns
                fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity'])

                # if there is an allocation helper dataset, modify allocation df
                if attr['allocation_helper'] == 'yes':
                    log.info("Using the specified allocation help for subset of " + attr['allocation_source'])
                    fba_allocation_subset = allocation_helper(fba_allocation_subset, method, attr)

                # create flow allocation ratios
                log.info("Creating allocation ratios for " + attr['allocation_source'])
                flow_allocation = allocate_by_sector(fba_allocation_subset, attr['allocation_method'])

                # create a list of sectors in the flow allocation df and drop any
                # rows of data in the flow df that aren't in that list
                sector_list = flow_allocation['Sector'].unique().tolist()

                # subset fba allocation table to the values in the activity list, based on overlapping sectors
                flow_subset_wsec = flow_subset_wsec.loc[
                    (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) |
                    (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))]

                # check if fba and allocation dfs have the same LocationSystem
                log.info("Checking if flowbyactivity and allocation dataframes use the same location systems")
                check_if_location_systems_match(flow_subset_wsec, flow_allocation)

                # merge water withdrawal df w/flow allocation dataset
                log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                fbs = flow_subset_wsec.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorProducedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'], how='left')

                fbs = fbs.merge(
                    flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem', 'SectorConsumedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'], how='left')

                # drop rows where both sector produced/consumed by in flow allocation df are null
                fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index()

                # merge the flowamount columns
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y'])
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                # calculate flow amounts for each sector
                log.info("Calculating new flow amounts using flow ratios")
                fbs['FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                # drop columns
                log.info("Cleaning up new flow by sector")
                fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y',
                                        'FlowAmountRatio', 'ActivityProducedBy', 'ActivityConsumedBy'])

            # rename flow name to flowable
            fbs = fbs.rename(columns={"FlowName": 'Flowable',
                                      "Compartment": "Context"
                                      })

            # drop rows where flowamount = 0 (although this includes dropping suppressed data)
            fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)
            # add missing data columns
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)
            # fill null values
            fbs = fbs.fillna(value=fbs_fill_na_dict)

            # aggregate df geographically, if necessary
            log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level")
            if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                from_scale = v['geoscale_to_use']
            else:
                from_scale = attr['allocation_from_scale']

            to_scale = method['target_geoscale']

            fbs = agg_by_geoscale(fbs, from_scale, to_scale, fbs_default_grouping_fields, names)

            # aggregate data to every sector level
            log.info("Aggregating flowbysector to " + method['target_sector_level'])
            fbs = sector_aggregation(fbs, fbs_default_grouping_fields)

            # test agg by sector
            sector_agg_comparison = sector_flow_comparision(fbs)

            # return sector level specified in method yaml
            # load the crosswalk linking sector lengths
            cw = load_sector_length_crosswalk()
            sector_list = cw[method['target_sector_level']].unique().tolist()

            # add any non-NAICS sectors used with NAICS
            household = load_household_sector_codes()
            household = household.loc[household['NAICS_Level_to_Use_For'] == method['target_sector_level']]
            # add household sector to sector list
            sector_list.extend(household['Code'].tolist())
            # subset df
            fbs = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) |
                          (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)

            # add any missing columns of data and cast to appropriate data type
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

            log.info("Completed flowbysector for activity subset with flows " + ', '.join(map(str, names)))
            fbss.append(fbs)
    # create single df of all activities
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
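
allocate_by_sector converts the allocation source into sector shares (FlowAmountRatio) that the merges above apply to the flow amounts. A minimal sketch of one plausible proportional method, assuming ratios are each sector's share of the flow total within a location (the real function dispatches on the allocation_method argument):

import pandas as pd

def allocate_by_sector_proportional(df):
    # each sector's share of its location total becomes the allocation ratio
    totals = df.groupby(['Location', 'LocationSystem'])['FlowAmount'].transform('sum')
    df = df.assign(FlowAmountRatio=df['FlowAmount'] / totals)
    return df[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']]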
Example 8
def stewicombo_to_sector(inventory_dict, NAICS_level, geo_level, compartments):
    """
    This function takes the following inputs:
        - inventory_dict: a dictionary of inventory types and years (e.g., {'NEI':'2017', 'TRI':'2017'})
        - NAICS_level: desired NAICS aggregation level (2-6)
        - geo_level: desired geographic aggregation level ('National', 'State', 'County')
        - compartments: list of compartments to include (e.g., 'water', 'air', 'land')
    """

    ## run stewicombo to combine inventories, filter for LCI, remove overlap
    df = stewicombo.combineFullInventories(inventory_dict,
                                           filter_for_LCI=True,
                                           remove_overlap=True,
                                           compartments=compartments)

    ## create mapping to convert facility IDs --> NAICS codes
    facility_mapping = pd.DataFrame()
    # for all inventories in list:
    # - load facility data from stewi output directory, keeping only the facility IDs, NAICS codes, and geographic information
    # - create new column indicating inventory source (database and year)
    # - append data to master data frame
    for i in range(len(inventory_dict)):
        # define inventory name as inventory type + inventory year (e.g., NEI_2017)
        inventory_name = list(inventory_dict.keys())[i] + '_' + list(
            inventory_dict.values())[i]
        facilities = pd.read_csv(
            stw_output_dir + 'facility/' + inventory_name + '.csv',
            usecols=['FacilityID', 'NAICS', 'State', 'County'],
            dtype={
                'FacilityID': str,
                'NAICS': int
            })
        # rename counties as County + State (e.g., Bristol_MA), since some states share county names
        facilities['County'] = facilities['County'] + '_' + facilities['State']
        facilities['SourceYear'] = inventory_name
        facility_mapping = facility_mapping.append(facilities)

    ## merge dataframes to assign NAICS codes based on facility IDs
    df['SourceYear'] = df['Source'] + '_' + df['Year']
    df = pd.merge(df,
                  facility_mapping,
                  how='left',
                  left_on=['FacilityID', 'SourceYear'],
                  right_on=['FacilityID', 'SourceYear'])

    ## subtract emissions for air transportation from airports
    # PLACEHOLDER TO SUBTRACT EMISSIONS FOR AIR TRANSPORT

    ## aggregate data based on NAICS code and chemical ID
    # add levelized NAICS code
    df['NAICS_lvl'] = df['NAICS'].astype(str).str[0:NAICS_level]
    # assign grouping variables based on desired geographic aggregation level
    if geo_level == 'National':
        grouping_vars = ['NAICS_lvl', 'SRS_ID', 'Compartment']
    elif geo_level == 'State':
        grouping_vars = ['NAICS_lvl', 'SRS_ID', 'Compartment', 'State']
    elif geo_level == 'County':
        grouping_vars = ['NAICS_lvl', 'SRS_ID', 'Compartment', 'County']
    # aggregate by NAICS code, chemical ID, compartment, and geographic level
    fbs = df.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'NAICS_lvl': 'first',
        'Compartment': 'first',
        'FlowName': 'first',
        'Year': 'first',
        'Unit': 'first',
        'State': 'first',
        'County': 'first'
    })
    # add reliability score
    fbs['DataReliability'] = weighted_average(df, 'ReliabilityScore',
                                              'FlowAmount', grouping_vars)

    ## perform operations to match flowbysector format
    # rename columns to match flowbysector format
    fbs = fbs.rename(
        columns={
            "NAICS_lvl": "SectorProducedBy",
            "FlowName": "Flowable",
            "Compartment": "Context"
        })
    # add hardcoded data
    fbs['National'] = 'United States'
    fbs['Class'] = 'Chemicals'
    fbs['SectorConsumedBy'] = 'None'
    fbs['Location'] = fbs[geo_level]
    fbs = assign_fips_location_system(fbs, list(inventory_dict.values())[0])
    # add missing flow by sector fields
    fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)
    # sort dataframe and reset index
    fbs = fbs.sort_values(list(
        flow_by_sector_fields.keys())).reset_index(drop=True)

    ## save result to output directory
    fbs.to_csv(output_dir + 'Chemicals_' + geo_level + '.csv')
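
A hypothetical invocation, mirroring the docstring's parameter descriptions:

# combine 2017 NEI and TRI air emissions, aggregate to 6-digit NAICS by
# state, and write Chemicals_State.csv to the output directory
stewicombo_to_sector({'NEI': '2017', 'TRI': '2017'},
                     NAICS_level=6,
                     geo_level='State',
                     compartments=['air'])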
Example 9
def stewicombo_to_sector(inventory_dict, NAICS_level, geo_scale, compartments):
    """
    Returns emissions from stewicombo in fbs format
    :param inventory_dict: a dictionary of inventory types and years (e.g., 
                {'NEI':'2017', 'TRI':'2017'})
    :param NAICS_level: desired NAICS aggregation level, using sector_level_key,
                should match target_sector_level
    :param geo_scale: desired geographic aggregation level ('national', 'state',
                'county'), should match target_geoscale
    :param compartments: list of compartments to include (e.g., 'water', 'air',
                'soil'), use None to include all compartments
    """

    from stewi.globals import output_dir as stw_output_dir
    from stewi.globals import weighted_average
    import stewi
    import stewicombo
    import facilitymatcher
    from stewicombo.overlaphandler import remove_default_flow_overlaps
    from stewicombo.globals import addChemicalMatches
    from facilitymatcher import output_dir as fm_output_dir

    NAICS_level_value = sector_level_key[NAICS_level]
    ## run stewicombo to combine inventories, filter for LCI, remove overlap
    df = stewicombo.combineFullInventories(inventory_dict,
                                           filter_for_LCI=True,
                                           remove_overlap=True,
                                           compartments=compartments)
    df.drop(columns=['SRS_CAS', 'SRS_ID', 'FacilityIDs_Combined'],
            inplace=True)

    facility_mapping = pd.DataFrame()
    # load facility data from stewi output directory, keeping only the facility IDs and geographic information
    inventory_list = list(inventory_dict.keys())
    for i in range(len(inventory_dict)):
        # define inventory name as inventory type + inventory year (e.g., NEI_2017)
        inventory_name = inventory_list[i] + '_' + list(
            inventory_dict.values())[i]
        facilities = pd.read_csv(stw_output_dir + 'facility/' +
                                 inventory_name + '.csv',
                                 usecols=['FacilityID', 'State', 'County'],
                                 dtype={'FacilityID': str})
        if len(facilities[facilities.duplicated(subset='FacilityID',
                                                keep=False)]) > 0:
            log.info('Duplicate facilities in ' + inventory_name +
                     ' - keeping first listed')
            facilities.drop_duplicates(subset='FacilityID',
                                       keep='first',
                                       inplace=True)
        facility_mapping = facility_mapping.append(facilities)

    # Apply FIPS to facility locations
    facility_mapping = apply_county_FIPS(facility_mapping)

    ## merge dataframes to assign facility information based on facility IDs
    df = pd.merge(df, facility_mapping, how='left', on='FacilityID')

    ## Access NAICS From facility matcher and assign based on FRS_ID
    all_NAICS = facilitymatcher.get_FRS_NAICSInfo_for_facility_list(
        frs_id_list=None, inventories_of_interest_list=inventory_list)
    all_NAICS = all_NAICS.loc[all_NAICS['PRIMARY_INDICATOR'] == 'PRIMARY']
    all_NAICS.drop(columns=['PRIMARY_INDICATOR'], inplace=True)
    all_NAICS = naics_expansion(all_NAICS)
    if len(all_NAICS[all_NAICS.duplicated(subset=['FRS_ID', 'Source'],
                                          keep=False)]) > 0:
        log.info('Duplicate primary NAICS reported - keeping first')
        all_NAICS.drop_duplicates(subset=['FRS_ID', 'Source'],
                                  keep='first',
                                  inplace=True)
    df = pd.merge(df, all_NAICS, how='left', on=['FRS_ID', 'Source'])

    # add levelized NAICS code prior to aggregation
    df['NAICS_lvl'] = df['NAICS'].str[0:NAICS_level_value]

    ## subtract emissions for air transportation from airports in NEI
    airport_NAICS = '4881'
    air_transportation_SCC = '2275020000'
    air_transportation_naics = '481111'
    if 'NEI' in inventory_list:
        log.info('Reassigning emissions from air transportation from airports')

        # obtain and prepare SCC dataset
        df_airplanes = stewi.getInventory('NEI',
                                          inventory_dict['NEI'],
                                          stewiformat='flowbySCC')
        df_airplanes = df_airplanes[df_airplanes['SCC'] ==
                                    air_transportation_SCC]
        df_airplanes['Source'] = 'NEI'
        df_airplanes = addChemicalMatches(df_airplanes)
        df_airplanes = remove_default_flow_overlaps(df_airplanes, SCC=True)
        df_airplanes.drop(columns=['SCC'], inplace=True)

        # copy to avoid mutating a slice of df when dropping duplicates
        facility_mapping_air = df[['FacilityID', 'NAICS']].copy()
        facility_mapping_air.drop_duplicates(keep='first', inplace=True)
        df_airplanes = df_airplanes.merge(facility_mapping_air,
                                          how='left',
                                          on='FacilityID')

        df_airplanes['Year'] = inventory_dict['NEI']
        df_airplanes = df_airplanes[
            df_airplanes['NAICS'].str[0:len(airport_NAICS)] == airport_NAICS]

        # subtract airplane emissions from airport NAICS at individual facilities
        df_planeemissions = df_airplanes[[
            'FacilityID', 'FlowName', 'FlowAmount'
        ]]
        df_planeemissions = df_planeemissions.rename(
            columns={'FlowAmount': 'PlaneEmissions'})
        df = df.merge(df_planeemissions,
                      how='left',
                      on=['FacilityID', 'FlowName'])
        df[['PlaneEmissions']] = df[['PlaneEmissions']].fillna(value=0)
        df['FlowAmount'] = df['FlowAmount'] - df['PlaneEmissions']
        df.drop(columns=['PlaneEmissions'], inplace=True)

        # add airplane emissions under air transport NAICS
        df_airplanes.loc[:, 'NAICS_lvl'] = air_transportation_naics[
            0:NAICS_level_value]
        df = pd.concat([df, df_airplanes], ignore_index=True)

    # update location to appropriate geoscale prior to aggregating
    df.dropna(subset=['Location'], inplace=True)
    df['Location'] = df['Location'].astype(str)
    df = update_geoscale(df, geo_scale)

    # assign grouping variables based on desired geographic aggregation level
    grouping_vars = ['NAICS_lvl', 'FlowName', 'Compartment', 'Location']

    # aggregate by NAICS code, FlowName, compartment, and geographic level
    fbs = df.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'Year': 'first',
        'Unit': 'first'
    })

    # add reliability score
    fbs['DataReliability'] = weighted_average(df, 'ReliabilityScore',
                                              'FlowAmount', grouping_vars)
    fbs.reset_index(inplace=True)

    # apply flow mapping
    fbs = map_elementary_flows(fbs, inventory_list)

    # rename columns to match flowbysector format
    fbs = fbs.rename(columns={"NAICS_lvl": "SectorProducedBy"})

    # add hardcoded data; depending on the source data, some of these fields may need to change
    fbs['Class'] = 'Chemicals'
    fbs['SectorConsumedBy'] = 'None'
    fbs['SectorSourceName'] = 'NAICS_2012_Code'
    fbs['FlowType'] = 'ELEMENTARY_FLOW'

    fbs = assign_fips_location_system(fbs, list(inventory_dict.values())[0])

    # add missing flow by sector fields
    fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

    # sort dataframe and reset index
    fbs = fbs.sort_values(list(
        flow_by_sector_fields.keys())).reset_index(drop=True)

    return fbs
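
A hypothetical invocation; unlike the previous example, NAICS_level here is a key into sector_level_key (assumed to be of the form 'NAICS_6') and the geoscale is lowercase:

# hypothetical call, returning a flowbysector-formatted dataframe
fbs = stewicombo_to_sector({'NEI': '2017', 'TRI': '2017'},
                           NAICS_level='NAICS_6',
                           geo_scale='national',
                           compartments=['air'])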