def apply_tons_per_employee_per_year_to_states(fbs):
    """
    Calculates tons per employee per year based on BLS_QCEW employees
    by sector and applies that quantity to employees in all states.

    The ratio (TPEPY) is computed from California rows only
    (Location '06000') and then multiplied by the employee counts of
    every state to produce national estimates.
    :param fbs: df, flow-by-sector data; must include 'Year',
        'SectorProducedBy', and 'FlowAmount' columns
    :return: df, flows estimated for all states via the CA-derived ratio
    """
    # employment data for the same year as the fbs data
    bls = load_fba_w_standardized_units(datasource='BLS_QCEW',
                                        year=fbs['Year'].unique()[0],
                                        flowclass='Employment',
                                        geographic_level='state')
    bls = bls[bls['FlowName'] == 'Number of employees']
    # clean df
    bls = clean_bls_qcew_fba(bls)
    bls = add_sectors_to_flowbyactivity(bls)

    # Subset BLS dataset to the sectors present in fbs (None entries dropped)
    sector_list = list(filter(None, fbs['SectorProducedBy'].unique()))
    bls = get_fba_allocation_subset(bls, 'BLS_QCEW', sector_list)
    bls = bls.rename(columns={'FlowAmount': 'Employees'})
    bls = bls[['Employees', 'Location', 'Year', 'SectorProducedBy']]

    # Calculate tons per employee per year per material and sector in CA.
    # merge without `on=` joins on all shared columns; presumably
    # Location/Year/SectorProducedBy — verify against fbs schema
    bls_CA = bls[bls['Location'] == '06000']  # California
    tpepy = fbs.merge(bls_CA, how='inner')
    # guarded division: rows with 0 employees keep the 0 from `out`
    # NOTE(review): np.zeros_like inherits the Employees dtype — if that
    # column is integer-typed, true division into an int `out` array may
    # raise a casting error; confirm upstream dtype
    tpepy['TPEPY'] = np.divide(tpepy['FlowAmount'], tpepy['Employees'],
                               out=np.zeros_like(tpepy['Employees']),
                               where=tpepy['Employees'] != 0)
    tpepy = tpepy.drop(columns=['Employees', 'FlowAmount', 'Location'])

    # Apply TPEPY back to all employees in all states
    national_waste = tpepy.merge(bls, how='outer')
    national_waste['FlowAmount'] = \
        national_waste['Employees'] * national_waste['TPEPY']

    return national_waste
def scale_blackhurst_results_to_usgs_values(df_load, attr,
                                            download_FBA_if_missing):
    """
    Scale initial Blackhurst-based mining water withdrawal estimates so
    they total to published USGS values. Oil-based sectors absorb a larger
    share (2/3) of the gap between the initial estimates and the USGS
    totals, following the Water Satellite Table created by Yang and
    Ingwersen, 2017.
    :param df_load: df, fba dataframe to be modified
    :param attr: dictionary, attribute data from method yaml for activity set
    :param download_FBA_if_missing: bool, indicate if missing FBAs
        should be downloaded from Data Commons
    :return: scaled fba results
    """
    # published national withdrawal totals for usgs mining in the
    # FBS method year
    usgs = load_fba_w_standardized_units(
        datasource="USGS_NWIS_WU",
        year=str(attr['helper_source_year']),
        flowclass='Water',
        download_FBA_if_missing=download_FBA_if_missing)
    mining_mask = ((usgs['ActivityConsumedBy'] == 'Mining') &
                   (usgs['Compartment'] == 'total') &
                   (usgs['FlowName'] == 'total'))
    published = usgs[mining_mask].reset_index(drop=True)

    # attach the published value ('pv') to the Blackhurst estimates
    published = published.rename(columns={'FlowAmount': 'pv'})
    merged = df_load.merge(published[['Location', 'pv']], how='left')

    # per NAICS length: allocated total ('av') and the gap between
    # published and allocated values ('vd')
    merged = merged.assign(nLen=merged['SectorConsumedBy'].str.len())
    merged = merged.assign(
        av=merged.groupby('nLen')['FlowAmount'].transform('sum'))
    merged = merged.assign(vd=merged['pv'] - merged['av'])

    # split the gap between oil (NAICS 21111*) and non-oil sectors,
    # weighting oil sectors 2/3 and non-oil 1/3
    is_oil = merged['SectorConsumedBy'].str[0:5] == '21111'
    merged['sector_label'] = np.where(is_oil, 'oil', 'nonoil')
    merged['ratio'] = np.where(merged['sector_label'] == 'oil', 2 / 3, 1 / 3)
    merged['label_sum'] = merged.groupby(
        ['Location', 'nLen', 'sector_label'])['FlowAmount'].transform('sum')

    # distribute each group's share of the gap proportionally to row flow
    scaled = merged.copy()
    scaled.loc[:, 'FlowAmount'] = (
        scaled['FlowAmount'] +
        (scaled['FlowAmount'] / scaled['label_sum']) *
        (scaled['ratio'] * scaled['vd']))
    return scaled.drop(columns=['sector_label', 'ratio', 'nLen',
                                'label_sum', 'pv', 'av', 'vd'])
def load_map_clean_fba(method, attr, fba_sourcename, df_year, flowclass,
                       geoscale_from, geoscale_to, **kwargs):
    """
    Load, clean, and map a FlowByActivity df
    :param method: dictionary, FBS method yaml
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fba_sourcename: str, source name
    :param df_year: str, year
    :param flowclass: str, flowclass to subset df with
    :param geoscale_from: str, geoscale to use
    :param geoscale_to: str, geoscale to aggregate to
    :param kwargs: dictionary, can include parameters: 'flowname_subset',
        'compartment_subset', 'clean_fba', 'clean_fba_w_sec'
    :return: df, fba format
    """
    log.info("Loading allocation flowbyactivity %s for year %s",
             fba_sourcename, str(df_year))
    df = load_fba_w_standardized_units(
        datasource=fba_sourcename, year=df_year, flowclass=flowclass)

    # confirm allocation data exists at the requested geoscale
    log.info("Checking if allocation data exists at the %s level",
             geoscale_from)
    check_if_data_exists_at_geoscale(df, geoscale_from)

    # aggregate geographically to the scale of the flowbyactivity source
    df = subset_df_by_geoscale(df, geoscale_from, geoscale_to)

    # optional subsetting driven by the method yaml; the string 'None'
    # means "do not subset"
    if kwargs.get('flowname_subset', 'None') != 'None':
        df = df.loc[df['FlowName'].isin(kwargs['flowname_subset'])]
    if kwargs.get('compartment_subset', 'None') != 'None':
        df = df.loc[df['Compartment'].isin(kwargs['compartment_subset'])]

    # source-specific cleanup of the allocation df, if configured
    if 'clean_fba' in kwargs:
        log.info("Cleaning %s", fba_sourcename)
        clean_fxn = dynamically_import_fxn(fba_sourcename,
                                           kwargs["clean_fba"])
        df = clean_fxn(df, attr=attr)
    df = df.reset_index(drop=True)

    # assign sectors to the allocation dataset
    log.info("Adding sectors to %s", fba_sourcename)
    df_wsec = add_sectors_to_flowbyactivity(
        df, sectorsourcename=method['target_sector_source'])

    # further clean/disaggregate the sector-tagged allocation data,
    # if configured
    if 'clean_fba_w_sec' in kwargs:
        log.info("Further disaggregating sectors in %s", fba_sourcename)
        clean_wsec_fxn = dynamically_import_fxn(fba_sourcename,
                                                kwargs['clean_fba_w_sec'])
        df_wsec = clean_wsec_fxn(df_wsec, attr=attr, method=method,
                                 sourcename=fba_sourcename)

    return df_wsec
def iwms_aggregation(df_load, **kwargs): """ Before multiplying the USDA CoA Cropland data by IWMS data, first aggregate the two hay values from IWMS :param df_load: :param kwargs: :return: """ # load the acreage information for iwms land_load = load_fba_w_standardized_units( "USDA_IWMS", year=kwargs['attr']['helper_source_year'], flowclass="Land", geographic_level="state") # subset to hay and haylage land = land_load[land_load['ActivityConsumedBy'].isin( ['HAY & HAYLAGE, (EXCL ALFALFA)', 'HAY & HAYLAGE, ALFALFA'])] land_sub = land[['ActivityConsumedBy', 'FlowAmount', 'Location']].reset_index(drop=True) land_sub = land_sub.rename(columns={'FlowAmount': 'HelperFlow'}) # merge the two dfs df = pd.merge(df_load, land_sub, how='right') df['HelperFlow'] = df['HelperFlow'].fillna(1) # drop rows where flow is 0 df = df[df['FlowAmount'] != 0] # reset hay sectors and rename df['SectorConsumedBy'] = np.where( df['SectorConsumedBy'].isin(['111940A', '111940B']), '11194', df['SectorConsumedBy']) df['ActivityConsumedBy'] = np.where(df['SectorConsumedBy'] == '11194', 'HAY & HAYLAGE', df['ActivityConsumedBy']) wt_flow = df.groupby(df['Location']).apply(lambda x: np.average( x['FlowAmount'], weights=x['HelperFlow'])).reset_index() wt_flow = wt_flow.rename(columns={wt_flow.columns[1]: 'NewFlow'}) df2 = df.merge(wt_flow) # reset flowamount, drop duplicates, drop columns df2 = df2.assign(FlowAmount=df2['NewFlow']).drop( columns=['HelperFlow', 'NewFlow']) df3 = df2.drop_duplicates() # drop data from original, add in modifed data df_o = df_load[~df_load['SectorConsumedBy'].isin(['111940A', '111940B'])] df4 = pd.concat([df_o, df3], ignore_index=True) return df4
def scale_blackhurst_results_to_usgs_values(df_to_scale, attr):
    """
    Scale the initial estimates for Blackhurst-based mining estimates to
    USGS values. Oil-based sectors are allocated a larger percentage of the
    difference between initial water withdrawal estimates and published
    USGS values. This method is based off the Water Satellite Table created
    by Yang and Ingwersen, 2017
    :param df_to_scale: df, fba dataframe to be scaled (not modified)
    :param attr: dictionary, attribute data from method yaml for activity set
    :return: scaled fba results
    """
    # determine national level published withdrawal data for usgs mining
    # in FBS method year
    pv_load = load_fba_w_standardized_units(
        datasource="USGS_NWIS_WU", year=str(attr['helper_source_year']),
        flowclass='Water')
    pv_sub = pv_load[(pv_load['Location'] == str(US_FIPS)) &
                     (pv_load['ActivityConsumedBy'] ==
                      'Mining')].reset_index(drop=True)
    # usgs unit is Mgal, blackhurst unit is gal
    pv = pv_sub['FlowAmount'].loc[0] * 1000000

    # sum quantity of water withdrawals already allocated to sectors
    av = df_to_scale['FlowAmount'].sum()

    # calculate the difference between published value and allocated value
    vd = pv - av

    # work on a copy so the helper columns added below do not leak into
    # the caller's dataframe (bug fix: the input was previously mutated
    # in place and left polluted with the helper columns)
    df_scaled = df_to_scale.copy()

    # subset df to scale into oil (NAICS 21111*) and non-oil sectors;
    # oil sectors absorb 2/3 of the difference, non-oil 1/3
    df_scaled['sector_label'] = np.where(
        df_scaled['SectorConsumedBy'].apply(lambda x: x[0:5] == '21111'),
        'oil', 'nonoil')
    df_scaled['ratio'] = np.where(df_scaled['sector_label'] == 'oil',
                                  2 / 3, 1 / 3)
    df_scaled['label_sum'] = df_scaled.groupby(
        ['Location', 'sector_label'])['FlowAmount'].transform('sum')
    df_scaled.loc[:, 'value_difference'] = float(vd)

    # calculate revised water withdrawal allocation: distribute each
    # group's share of the difference proportionally to row flow size
    df_scaled.loc[:, 'FlowAmount'] = df_scaled['FlowAmount'] + \
        (df_scaled['FlowAmount'] / df_scaled['label_sum']) * \
        (df_scaled['ratio'] * df_scaled['value_difference'])
    df_scaled = df_scaled.drop(
        columns=['sector_label', 'ratio', 'label_sum', 'value_difference'])
    return df_scaled
def convert_blackhurst_data_to_kg_per_year(df, **kwargs): """ Load BEA Make After Redefinition data to convert Blackhurst IO dataframe units to gallon per year :param df: df, FBA format :param kwargs: kwargs includes "attr" - dictionary, attribute data from method yaml for activity set :return: transformed fba df """ # load the bea make table bmt = load_fba_w_standardized_units( datasource='BEA_Make_AR', year=kwargs['attr']['allocation_source_year'], flowclass='Money', download_FBA_if_missing=kwargs['download_FBA_if_missing']) # drop rows with flowamount = 0 bmt = bmt[bmt['FlowAmount'] != 0] # check on units of dfs before merge compare_df_units(df, bmt) bh_df_revised = pd.merge( df, bmt[['FlowAmount', 'ActivityProducedBy', 'Location']], left_on=['ActivityConsumedBy', 'Location'], right_on=['ActivityProducedBy', 'Location']) bh_df_revised.loc[:, 'FlowAmount'] = ((bh_df_revised['FlowAmount_x']) * (bh_df_revised['FlowAmount_y'])) bh_df_revised.loc[:, 'Unit'] = 'kg' # drop columns bh_df_revised = bh_df_revised.drop( columns=["FlowAmount_x", "FlowAmount_y", 'ActivityProducedBy_y']) bh_df_revised = bh_df_revised.rename( columns={"ActivityProducedBy_x": "ActivityProducedBy"}) return bh_df_revised
def disaggregate_cropland(fba_w_sector, attr, method, year, sector_column):
    """
    In the event there are 4 (or 5) digit naics for cropland at the county
    level, use state level harvested cropland to create ratios
    :param fba_w_sector: df, CoA cropland data, FBA format with sector
        columns
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param year: str, year of data
    :param sector_column: str, the sector column on which to make df
        modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, CoA cropland data disaggregated
    """
    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # drop pastureland data (NAICS '112' prefix); it is handled separately
    # and re-appended at the end
    crop = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop[sector_column].apply(lambda x: len(x) > 3)].reset_index(
        drop=True)
    # create tmp location (2-digit state FIPS from the full FIPS code)
    crop = crop.assign(Location_tmp=crop['Location'].apply(lambda x: x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics = load_fba_w_standardized_units(datasource="USDA_CoA_Cropland_NAICS",
                                          year=year, flowclass='Land')
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] ==
                  'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # drop the activities that include '&'
    naics = naics[~naics['ActivityConsumedBy'].str.contains('&')].reset_index(
        drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(
        naics, sectorsourcename=method['target_sector_source'])
    # estimate suppressed data by equally allocating parent to child naics
    naics = estimate_suppressed_data(naics, 'SectorConsumedBy', 3,
                                     'USDA_CoA_Cropland_NAICS')
    # add missing fbs fields
    naics = clean_df(naics, flow_by_sector_fields, fbs_fill_na_dict)

    # aggregate sectors to create any missing naics levels
    group_cols = fbs_default_grouping_fields
    # group_cols = [e for e in group_cols if e not in
    # ('SectorProducedBy', 'SectorConsumedBy')]
    # group_cols.append(sector_column)
    naics2 = sector_aggregation(naics, group_cols)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics3 = sector_disaggregation(naics2)
    # drop rows where FlowAmount 0
    # naics3 = naics3[~((naics3['SectorProducedBy'] == '') &
    # (naics3['SectorConsumedBy'] == ''))]
    naics3 = naics3.loc[naics3['FlowAmount'] != 0]
    # create ratios
    naics4 = sector_ratios(naics3, sector_column)
    # create temporary sector column to match the two dfs on
    naics4 = naics4.assign(
        Location_tmp=naics4['Location'].apply(lambda x: x[0:2]))
    # tmp drop Nonetypes
    naics4 = replace_NoneType_with_empty_cells(naics4)

    # check units in prep for merge
    compare_df_units(crop, naics4)
    # loop through naics lengths to determine naics 4 and 5 digits to
    # disaggregate; each pass pushes length-i sectors down to length i+1
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop[sector_column].apply(
            lambda x: i + 1 >= len(x) >= i)]
        crop_subset = crop_subset.assign(
            Sector_tmp=crop_subset[sector_column].apply(lambda x: x[0:i]))
        # if duplicates drop all rows (keep=False): the length-i parent is
        # only disaggregated when no length-i+1 child already exists
        df = crop_subset.drop_duplicates(subset=['Location', 'Sector_tmp'],
                                         keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = \
            naics4.loc[naics4[sector_column].apply(
                lambda x: len(x) == i + 1)].reset_index(drop=True)
        naics_subset = naics_subset.assign(
            Sector_tmp=naics_subset[sector_column].apply(lambda x: x[0:i]))
        # merge the two df based on locations
        df_subset = pd.merge(df_subset, naics_subset[[
            sector_column, 'FlowAmountRatio', 'Sector_tmp', 'Location_tmp'
        ]],
                             how='left',
                             left_on=[sector_column, 'Location_tmp'],
                             right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset[
            'FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[~df_subset['FlowAmount'].isna()].reset_index(
            drop=True)
        # drop columns
        df_subset = df_subset.drop(
            columns=[sector_column + '_x', 'FlowAmountRatio', 'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(
            columns={sector_column + '_y': sector_column})
        # tmp drop Nonetypes
        df_subset = replace_NoneType_with_empty_cells(df_subset)
        # add new rows of data to crop df (next pass can disaggregate them
        # one level further)
        crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])

    # equally allocate any further missing naics
    crop = allocate_dropped_sector_data(crop, 'NAICS_6')

    # pasture data, set aside at the top, is re-appended unchanged
    pasture = \
        fba_w_sector.loc[fba_w_sector[sector_column].apply(
            lambda x: x[0:3]) == '112'].reset_index(drop=True)
    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop],
                             sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
def disaggregate_pastureland(fba_w_sector, attr, method, year, sector_column):
    """
    The USDA CoA Cropland irrigated pastureland data only links to the 3
    digit NAICS '112'. This function uses state level CoA 'Land in Farms'
    to allocate the county level acreage data to 6 digit NAICS.
    :param fba_w_sector: df, the CoA Cropland dataframe after linked to
        sectors
    :param attr: dictionary, attribute data from method yaml for activity set
    :param method: dictionary, FBS method yaml
    :param year: str, year of data being disaggregated
    :param sector_column: str, the sector column on which to make df
        modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, the CoA cropland dataframe with disaggregated pastureland
        data
    """
    # tmp drop NoneTypes
    # NOTE(review): if no pastureland rows exist, the df is returned with
    # the empty-cell replacement applied but never reverted — confirm
    # callers expect that
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # subset the coa data so only pastureland (NAICS '112' prefix)
    p = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) == '112'].reset_index(drop=True)
    if len(p) != 0:
        # add temp loc column for state fips
        p = p.assign(Location_tmp=p['Location'].apply(lambda x: x[0:2]))

        # load usda coa cropland naics
        df_f = load_fba_w_standardized_units(
            datasource='USDA_CoA_Cropland_NAICS', year=year,
            flowclass='Land')
        # subset to land in farms data
        df_f = df_f[df_f['FlowName'] == 'FARM OPERATIONS']
        # subset to rows related to pastureland
        df_f = df_f.loc[df_f['ActivityConsumedBy'].apply(
            lambda x: x[0:3]) == '112']
        # drop rows with "&'
        df_f = df_f[~df_f['ActivityConsumedBy'].str.contains('&')]
        # create sector columns
        df_f = add_sectors_to_flowbyactivity(
            df_f, sectorsourcename=method['target_sector_source'])
        # estimate suppressed data by equal allocation
        df_f = estimate_suppressed_data(df_f, 'SectorConsumedBy', 3,
                                        'USDA_CoA_Cropland_NAICS')
        # create proportional ratios; activity columns are excluded from
        # the grouping so ratios are by sector
        group_cols = fba_wsec_default_grouping_fields
        group_cols = [
            e for e in group_cols
            if e not in ('ActivityProducedBy', 'ActivityConsumedBy')
        ]
        df_f = allocate_by_sector(df_f, 'proportional', group_cols)
        # tmp drop NoneTypes
        df_f = replace_NoneType_with_empty_cells(df_f)
        # drop naics = '11
        df_f = df_f[df_f[sector_column] != '11']
        # drop 000 in location (keep the 2-digit state FIPS)
        df_f = df_f.assign(Location=df_f['Location'].apply(lambda x: x[0:2]))

        # check units before merge
        compare_df_units(p, df_f)
        # merge the coa pastureland data with land in farm data
        df = p.merge(df_f[[sector_column, 'Location', 'FlowAmountRatio']],
                     how='left',
                     left_on="Location_tmp",
                     right_on="Location")
        # multiply the flowamount by the flowratio
        df.loc[:, 'FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
        # drop columns and rename
        df = df.drop(columns=[
            'Location_tmp', sector_column + '_x', 'Location_y',
            'FlowAmountRatio'
        ])
        df = df.rename(columns={
            sector_column + '_y': sector_column,
            "Location_x": 'Location'
        })

        # drop rows where sector = 112 and then concat with original
        # fba_w_sector
        fba_w_sector = fba_w_sector[fba_w_sector[sector_column].apply(
            lambda x: x[0:3]) != '112'].reset_index(drop=True)
        fba_w_sector = pd.concat([fba_w_sector, df],
                                 sort=True).reset_index(drop=True)

        # fill empty cells with NoneType
        fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
def convert_statcan_data_to_US_water_use(df, attr):
    """
    Use Canadian GDP data to convert 3 digit canadian water use to us
    water use:
    - canadian gdp
    - us gdp
    :param df: df, FBA format
    :param attr: dictionary, attribute data from method yaml for activity set
    :return: df, FBA format, flowamounts converted
    """
    # load Canadian GDP data
    gdp = load_fba_w_standardized_units(datasource='StatCan_GDP',
                                        year=attr['allocation_source_year'],
                                        flowclass='Money')
    # drop 31-33
    gdp = gdp[gdp['ActivityProducedBy'] != '31-33']
    gdp = gdp.rename(columns={"FlowAmount": "CanDollar"})

    # check units before merge
    compare_df_units(df, gdp)
    # merge df
    df_m = pd.merge(df,
                    gdp[['CanDollar', 'ActivityProducedBy']],
                    how='left',
                    left_on='ActivityConsumedBy',
                    right_on='ActivityProducedBy')
    df_m['CanDollar'] = df_m['CanDollar'].fillna(0)
    df_m = df_m.drop(columns=["ActivityProducedBy_y"])
    df_m = df_m.rename(columns={"ActivityProducedBy_x": "ActivityProducedBy"})
    # drop rows with no GDP match to avoid division by zero below
    df_m = df_m[df_m['CanDollar'] != 0]

    exchange_rate = get_Canadian_to_USD_exchange_rate(
        str(attr['allocation_source_year']))
    exchange_rate = float(exchange_rate)
    # convert to mgal/USD: water use per USD of Canadian output
    df_m.loc[:, 'FlowAmount'] = df_m['FlowAmount'] / (df_m['CanDollar'] /
                                                      exchange_rate)
    df_m.loc[:, 'Unit'] = 'Mgal/USD'

    df_m = df_m.drop(columns=["CanDollar"])

    # convert Location to US
    df_m.loc[:, 'Location'] = US_FIPS
    df_m = assign_fips_location_system(df_m,
                                       str(attr['allocation_source_year']))

    # load US GDP data (gross output by BEA detail code)
    us_gdp_load = load_fba_w_standardized_units(
        datasource='BEA_GDP_GrossOutput',
        year=attr['allocation_source_year'],
        flowclass='Money')

    # load bea crosswalk to map BEA detail codes to 3-digit NAICS
    cw_load = load_bea_crosswalk()
    cw = cw_load[['BEA_2012_Detail_Code', 'NAICS_2012_Code']].drop_duplicates()
    cw = cw[cw['NAICS_2012_Code'].apply(
        lambda x: len(str(x)) == 3)].drop_duplicates().reset_index(drop=True)

    # merge
    us_gdp = pd.merge(us_gdp_load,
                      cw,
                      how='left',
                      left_on='ActivityProducedBy',
                      right_on='BEA_2012_Detail_Code')
    us_gdp = us_gdp.drop(
        columns=['ActivityProducedBy', 'BEA_2012_Detail_Code'])
    # rename columns
    us_gdp = us_gdp.rename(columns={'NAICS_2012_Code': 'ActivityProducedBy'})
    # agg by naics
    us_gdp = aggregator(us_gdp, fba_default_grouping_fields)
    us_gdp = us_gdp.rename(columns={'FlowAmount': 'us_gdp'})

    # determine annual us water use: Mgal/USD intensity times US GDP
    df_m2 = pd.merge(df_m,
                     us_gdp[['ActivityProducedBy', 'us_gdp']],
                     how='left',
                     left_on='ActivityConsumedBy',
                     right_on='ActivityProducedBy')

    df_m2.loc[:, 'FlowAmount'] = df_m2['FlowAmount'] * (df_m2['us_gdp'])
    df_m2.loc[:, 'Unit'] = 'Mgal'
    df_m2 = df_m2.rename(
        columns={'ActivityProducedBy_x': 'ActivityProducedBy'})
    df_m2 = df_m2.drop(columns=['ActivityProducedBy_y', 'us_gdp'])

    return df_m2
def convert_blackhurst_data_to_kg_per_employee(df_wsec, attr, method, **kwargs): """ Load BLS employment data and use to transform original units to gallons per employee :param df_wsec: df, includes sector columns :param attr: dictionary, attribute data from method yaml for activity set :param method: dictionary, FBS method yaml :return: df, transformed fba dataframe with sector columns """ # load 2002 employment data bls = load_fba_w_standardized_units( datasource='BLS_QCEW', year='2002', flowclass='Employment', geographic_level='national', download_FBA_if_missing=kwargs['download_FBA_if_missing']) # clean df bls = clean_bls_qcew_fba(bls, attr=attr) # assign naics to allocation dataset bls_wsec = add_sectors_to_flowbyactivity( bls, sectorsourcename=method['target_sector_source']) # drop rows where sector = None ( does not occur with mining) bls_wsec = bls_wsec[~bls_wsec['SectorProducedBy'].isnull()] bls_wsec = bls_wsec.rename(columns={ 'SectorProducedBy': 'Sector', 'FlowAmount': 'HelperFlow' }) # check units before merge compare_df_units(df_wsec, bls_wsec) # merge the two dfs df = pd.merge(df_wsec, bls_wsec[['Location', 'Sector', 'HelperFlow']], how='left', left_on=['Location', 'SectorConsumedBy'], right_on=['Location', 'Sector']) # drop any rows where sector is None df = df[~df['Sector'].isnull()] # fill helperflow values with 0 df['HelperFlow'] = df['HelperFlow'].fillna(0) # calculate proportional ratios df_wratio = proportional_allocation_by_location_and_activity(df, 'Sector') df_wratio = df_wratio.rename(columns={ 'FlowAmountRatio': 'EmployeeRatio', 'HelperFlow': 'Employees' }) # drop rows where helperflow = 0 df_wratio = df_wratio[df_wratio['Employees'] != 0] # calculate gal/employee in 2002 df_wratio.loc[:, 'FlowAmount'] = \ (df_wratio['FlowAmount'] * df_wratio['EmployeeRatio']) / \ df_wratio['Employees'] df_wratio.loc[:, 'Unit'] = 'kg/p' # drop cols df_wratio = df_wratio.drop( columns=['Sector', 'Employees', 'EmployeeRatio']) return df_wratio