# Module-level imports inferred from how names are used in the function bodies below.
# These functions are excerpted from a larger flowsa module, which is assumed to pull
# shared names (log, US_FIPS, fips_number_key, flow_by_activity_fields,
# flow_by_sector_fields, the fill-na dictionaries, and the df helper functions) in at
# module level:
import sys

import numpy as np
import pandas as pd

import flowsa


def convert_blackhurst_data_to_gal_per_employee(df_wsec, attr, method):
    """
    Convert Blackhurst water withdrawal data to gal/employee in 2002, using
    BLS QCEW national employment by sector.
    :param df_wsec: Blackhurst df with sectors
    :param attr: activity set attributes from the method yaml
    :param method: the flowbysector method yaml
    :return: df with FlowAmount expressed as gal/employee
    """
    import flowsa
    from flowsa.mapping import add_sectors_to_flowbyactivity
    from flowsa.flowbyfunctions import clean_df, flow_by_activity_fields, fba_fill_na_dict, \
        agg_by_geoscale, fba_default_grouping_fields, sector_ratios, \
        proportional_allocation_by_location_and_sector, filter_by_geoscale
    from flowsa.BLS_QCEW import clean_bls_qcew_fba

    bls = flowsa.getFlowByActivity(flowclass=['Employment'], datasource='BLS_QCEW', years=[2002])

    # clean df
    bls = clean_df(bls, flow_by_activity_fields, fba_fill_na_dict)
    bls = clean_bls_qcew_fba(bls, attr)

    # bls_agg = agg_by_geoscale(bls, 'state', 'national', fba_default_grouping_fields)
    bls_agg = filter_by_geoscale(bls, 'national')

    # assign naics to allocation dataset
    bls_wsec = add_sectors_to_flowbyactivity(bls_agg, sectorsourcename=method['target_sector_source'])
    # drop rows where sector = None (does not occur with mining)
    bls_wsec = bls_wsec[~bls_wsec['SectorProducedBy'].isnull()]
    bls_wsec = bls_wsec.rename(columns={'SectorProducedBy': 'Sector'})

    # create list of sectors in the flow allocation df, drop any rows of data
    # in the flow df that aren't in the list
    sector_list = df_wsec['Sector'].unique().tolist()
    # subset fba allocation table to the values in the activity list, based on overlapping sectors
    bls_wsec = bls_wsec.loc[bls_wsec['Sector'].isin(sector_list)]
    # calculate proportional ratios
    bls_wsec = proportional_allocation_by_location_and_sector(bls_wsec, 'Sector')
    bls_wsec = bls_wsec.rename(columns={'FlowAmountRatio': 'EmployeeRatio',
                                        'FlowAmount': 'Employees'})

    # merge the two dfs
    df = pd.merge(df_wsec, bls_wsec[['Sector', 'EmployeeRatio', 'Employees']],
                  how='left', on='Sector')
    df['EmployeeRatio'] = df['EmployeeRatio'].fillna(0)
    # calculate gal/employee in 2002
    df.loc[:, 'FlowAmount'] = (df['FlowAmount'] * df['EmployeeRatio']) / df['Employees']
    df.loc[:, 'Unit'] = 'gal/employee'
    # drop cols
    df = df.drop(columns=['Employees', 'EmployeeRatio'])

    return df
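# Worked illustration of the conversion above (toy numbers, not Blackhurst data):
# if a sector's 2002 withdrawal is FlowAmount = 1,000,000 gal, its national
# EmployeeRatio is 0.25, and its Employees value is 5,000, then
#     FlowAmount = (1,000,000 * 0.25) / 5,000 = 50 gal/employee
# which is the coefficient later re-inflated to 'gal' by allocation_helper() below.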
def allocation_helper(df_w_sector, method, attr, v):
    """
    Used when two dfs are required to create allocation ratios
    :param df_w_sector:
    :param method:
    :param attr: activity set attributes; helper_method is currently written for
        'multiplication' and 'proportional'
    :param v: the datasource parameters from the method yaml
    :return:
    """
    from flowsa.Blackhurst_IO import scale_blackhurst_results_to_usgs_values
    from flowsa.BLS_QCEW import clean_bls_qcew_fba, bls_clean_allocation_fba_w_sec
    from flowsa.mapping import add_sectors_to_flowbyactivity

    helper_allocation = flowsa.getFlowByActivity(flowclass=[attr['helper_source_class']],
                                                 datasource=attr['helper_source'],
                                                 years=[attr['helper_source_year']])
    if 'clean_helper_fba' in attr:
        log.info("Cleaning " + attr['helper_source'] + ' FBA')
        # tmp hard coded - need to generalize
        if attr['helper_source'] == 'BLS_QCEW':
            helper_allocation = clean_bls_qcew_fba(helper_allocation, attr)
        # helper_allocation = getattr(sys.modules[__name__], attr["clean_helper_fba"])(helper_allocation, attr)
    # clean df
    helper_allocation = clean_df(helper_allocation, flow_by_activity_fields, fba_fill_na_dict)
    # drop rows with flowamount = 0
    helper_allocation = helper_allocation[helper_allocation['FlowAmount'] != 0]

    # agg data if necessary or filter
    # determine the geoscale to aggregate to
    to_scale = min(fips_number_key[attr['allocation_from_scale']],
                   fips_number_key[v['geoscale_to_use']])
    if fips_number_key[attr['helper_from_scale']] > to_scale:
        helper_allocation = agg_by_geoscale(
            helper_allocation, attr['helper_from_scale'],
            list(fips_number_key.keys())[list(fips_number_key.values()).index(to_scale)],
            fba_default_grouping_fields)
    else:
        helper_allocation = filter_by_geoscale(helper_allocation, attr['helper_from_scale'])

    # assign naics to allocation dataset
    helper_allocation = add_sectors_to_flowbyactivity(
        helper_allocation, sectorsourcename=method['target_sector_source'])
    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = generalize_activity_field_names(helper_allocation)

    # clean up helper fba with sectors
    if 'clean_helper_fba_wsec' in attr:
        log.info("Cleaning " + attr['helper_source'] + ' FBA with sectors')
        # tmp hard coded - need to generalize
        if attr['helper_source'] == 'BLS_QCEW':
            helper_allocation = bls_clean_allocation_fba_w_sec(helper_allocation, attr, method)
        # helper_allocation = getattr(sys.modules[__name__], attr["clean_helper_fba_wsec"])(helper_allocation, attr, method)

    # drop columns
    helper_allocation = helper_allocation.drop(columns=['Activity', 'Min', 'Max'])

    if attr['helper_method'] == 'proportional':
        # if calculating proportions, first subset the helper allocation df to only contain
        # relevant sectors: create list of sectors in the flow allocation df, drop any rows
        # of data in the flow df that aren't in the list
        sector_list = df_w_sector['Sector'].unique().tolist()
        # subset fba allocation table to the values in the activity list, based on overlapping sectors
        helper_allocation = helper_allocation.loc[helper_allocation['Sector'].isin(sector_list)]
        # calculate proportional ratios
        helper_allocation = proportional_allocation_by_location_and_sector(helper_allocation, 'Sector')

    # rename column
    helper_allocation = helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})
    merge_columns = [e for e in ['Location', 'Sector', 'HelperFlow', 'FlowAmountRatio']
                     if e in helper_allocation.columns.values.tolist()]

    # merge allocation df with helper df based on sectors, depending on geo scales of dfs
    if (attr['helper_from_scale'] == 'state') and (attr['allocation_from_scale'] == 'county'):
        helper_allocation.loc[:, 'Location_tmp'] = helper_allocation['Location'].apply(lambda x: x[0:2])
        df_w_sector.loc[:, 'Location_tmp'] = df_w_sector['Location'].apply(lambda x: x[0:2])
        merge_columns.append('Location_tmp')
        modified_fba_allocation = df_w_sector.merge(helper_allocation[merge_columns], how='left')
        modified_fba_allocation = modified_fba_allocation.drop(columns=['Location_tmp'])
    else:
        modified_fba_allocation = df_w_sector.merge(helper_allocation[merge_columns], how='left')

    # modify flow amounts using helper data
    if 'multiplication' in attr['helper_method']:
        # todo: modify so if missing data, replaced with value from one geoscale up instead of national
        # todo: modify year after merge if necessary
        # if missing values (na or 0), replace with national level values
        replacement_values = helper_allocation[
            helper_allocation['Location'] == US_FIPS].reset_index(drop=True)
        replacement_values = replacement_values.rename(columns={"HelperFlow": 'ReplacementValue'})
        modified_fba_allocation = modified_fba_allocation.merge(
            replacement_values[['Sector', 'ReplacementValue']], how='left')
        modified_fba_allocation.loc[:, 'HelperFlow'] = modified_fba_allocation['HelperFlow'].fillna(
            modified_fba_allocation['ReplacementValue'])
        modified_fba_allocation.loc[:, 'HelperFlow'] = np.where(
            modified_fba_allocation['HelperFlow'] == 0,
            modified_fba_allocation['ReplacementValue'],
            modified_fba_allocation['HelperFlow'])
        # replace non-existent helper flow values with 0 so the multiplied flow amount is not
        # incorrectly associated with the new unit
        modified_fba_allocation['HelperFlow'] = modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation = modified_fba_allocation.drop(columns=["HelperFlow", 'ReplacementValue'])
    elif attr['helper_method'] == 'proportional':
        modified_fba_allocation['FlowAmountRatio'] = modified_fba_allocation['FlowAmountRatio'].fillna(0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * modified_fba_allocation['FlowAmountRatio']
        modified_fba_allocation = modified_fba_allocation.drop(columns=["HelperFlow", 'FlowAmountRatio'])

    # drop rows of 0
    modified_fba_allocation = modified_fba_allocation[
        modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    # todo: change units
    modified_fba_allocation.loc[modified_fba_allocation['Unit'] == 'gal/employee', 'Unit'] = 'gal'

    # option to scale up fba values
    if 'scaled' in attr['helper_method']:
        log.info("Scaling " + attr['helper_source'] + ' to FBA values')
        # tmp hard coded - need to generalize
        if attr['helper_source'] == 'BLS_QCEW':
            modified_fba_allocation = scale_blackhurst_results_to_usgs_values(modified_fba_allocation, attr)
        # modified_fba_allocation = getattr(sys.modules[__name__], attr["scale_helper_results"])(modified_fba_allocation, attr)

    return modified_fba_allocation
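# Illustrative sketch (not part of the original module): the 'attr' mapping read by
# allocation_helper() comes from a flowbysector method yaml. Based solely on the keys
# accessed above, a minimal helper configuration might look like this (values are
# hypothetical):
#
# attr = {
#     'helper_source': 'BLS_QCEW',
#     'helper_source_class': 'Employment',
#     'helper_source_year': 2002,
#     'helper_from_scale': 'national',
#     'helper_method': 'multiplication',
#     'allocation_from_scale': 'county',
#     'clean_helper_fba': 'clean_bls_qcew_fba',
#     'clean_helper_fba_wsec': 'bls_clean_allocation_fba_w_sec',
# }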
def disaggregate_cropland(fba_w_sector, attr):
    """
    In the event there are 4 (or 5) digit naics for cropland at the county level,
    use state level harvested cropland to create ratios
    :param fba_w_sector:
    :param attr:
    :return:
    """
    import flowsa
    from flowsa.flowbyfunctions import generalize_activity_field_names, sector_aggregation, \
        sector_disaggregation, sector_ratios, fbs_default_grouping_fields, clean_df, \
        fba_fill_na_dict, add_missing_flow_by_fields
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # drop pastureland data
    crop = fba_w_sector.loc[fba_w_sector['Sector'].apply(
        lambda x: str(x[0:3])) != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop['Sector'].apply(lambda x: len(x) > 3)].reset_index(drop=True)
    # create tmp location
    crop.loc[:, 'Location_tmp'] = crop['Location'].apply(lambda x: str(x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics_load = flowsa.getFlowByActivity(flowclass=['Land'],
                                          years=[attr['allocation_source_year']],
                                          datasource="USDA_CoA_Cropland_NAICS").reset_index(drop=True)
    # clean df
    naics = clean_df(naics_load, flow_by_activity_fields, fba_fill_na_dict)
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] == 'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(naics, sectorsourcename='NAICS_2012_Code',
                                          levelofSectoragg='agg')
    # add missing fbs fields
    naics = add_missing_flow_by_fields(naics, flow_by_sector_fields)

    # aggregate sectors to create any missing naics levels
    naics = sector_aggregation(naics, fbs_default_grouping_fields)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics = sector_disaggregation(naics)
    # drop rows where sector consumed by is none and flow amount is 0
    naics = naics[naics['SectorConsumedBy'].notnull()]
    naics = naics.loc[naics['FlowAmount'] != 0]
    # create ratios
    naics = sector_ratios(naics)
    # drop sectors < 4 digits
    # naics = naics[naics['SectorConsumedBy'].apply(lambda x: len(x) > 3)].reset_index(drop=True)
    # create temporary sector column to match the two dfs on
    naics.loc[:, 'Location_tmp'] = naics['Location'].apply(lambda x: str(x[0:2]))

    # loop through naics lengths to determine naics 4 and 5 digits to disaggregate
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop['Sector'].apply(lambda x: i + 1 >= len(x) >= i)]
        crop_subset.loc[:, 'Sector_tmp'] = crop_subset['Sector'].apply(lambda x: x[0:i])
        # if duplicates, drop all rows
        df = crop_subset.drop_duplicates(subset=['Location', 'Sector_tmp'],
                                         keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df['Sector'].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = naics.loc[naics['SectorConsumedBy'].apply(
            lambda x: len(x) == i + 1)].reset_index(drop=True)
        naics_subset.loc[:, 'Sector_tmp'] = naics_subset['SectorConsumedBy'].apply(lambda x: x[0:i])
        # merge the two dfs based on location
        df_subset = pd.merge(df_subset,
                             naics_subset[['SectorConsumedBy', 'FlowAmountRatio',
                                           'Sector_tmp', 'Location_tmp']],
                             how='left',
                             left_on=['Sector', 'Location_tmp'],
                             right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset['FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[~df_subset['FlowAmount'].isna()].reset_index(drop=True)
        # drop columns
        df_subset = df_subset.drop(columns=['Sector', 'FlowAmountRatio', 'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(columns={"SectorConsumedBy": "Sector"})
        # add new rows of data to crop df
        crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])

    # pasture data
    pasture = fba_w_sector.loc[fba_w_sector['Sector'].apply(
        lambda x: str(x[0:3])) == '112'].reset_index(drop=True)
    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop], sort=True).reset_index(drop=True)

    return fba_w_sector
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector
    """
    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # create empty list for storing fbs files
    fbs_list = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # ensure correct datatypes and that all fields exist
            flows = clean_df(flows, flow_by_activity_fields, fba_fill_na_dict,
                             drop_description=False)

            # clean up fba, if specified in yaml
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                flows = getattr(sys.modules[__name__], v["clean_fba_df_fxn"])(flows)

            # if activity_sets are specified in a file, call them here
            if 'activity_set_file' in v:
                aset_names = pd.read_csv(flowbysectoractivitysetspath + v['activity_set_file'],
                                         dtype=str)

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                if 'activity_set_file' in v:
                    names = aset_names[aset_names['activity_set'] == aset]['name']
                else:
                    names = attr['names']
                log.info("Preparing to handle subset of flownames " +
                         ', '.join(map(str, names)) + " in " + k)

                # subset fba data by activity
                flows_subset = flows[(flows[fba_activity_fields[0]].isin(names)) |
                                     (flows[fba_activity_fields[1]].isin(names))].reset_index(drop=True)

                # extract relevant geoscale data or aggregate existing data
                log.info("Subsetting/aggregating dataframe to " +
                         attr['allocation_from_scale'] + " geoscale")
                flows_subset_geo = subset_df_by_geoscale(flows_subset, v['geoscale_to_use'],
                                                         attr['allocation_from_scale'])

                # add sectors to df activity, depending on level of specified sector aggregation
                log.info("Adding sectors to " + k)
                flow_subset_wsec = add_sectors_to_flowbyactivity(
                    flows_subset_geo,
                    sectorsourcename=method['target_sector_source'],
                    allocationmethod=attr['allocation_method'])
                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k + " FlowByActivity with sectors")
                    flow_subset_wsec = getattr(sys.modules[__name__],
                                               v["clean_fba_w_sec_df_fxn"])(flow_subset_wsec, attr=attr)

                # map df to elementary flows
                log.info("Mapping flows in " + k + ' to federal elementary flow list')
                if 'fedefl_mapping' in v:
                    mapping_files = v['fedefl_mapping']
                else:
                    mapping_files = k
                flow_subset_mapped = map_elementary_flows(flow_subset_wsec, mapping_files)

                # clean up mapped fba with sectors, if specified in yaml
                if "clean_mapped_fba_w_sec_df_fxn" in v:
                    log.info("Cleaning up " + k + " FlowByActivity with sectors")
                    flow_subset_mapped = getattr(sys.modules[__name__],
                                                 v["clean_mapped_fba_w_sec_df_fxn"])(
                        flow_subset_mapped, attr, method)

                # if allocation method is "direct", then no need to create allocation ratios,
                # else use the allocation dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' + ', '.join(map(str, names)) + ' to sectors')
                    fbs = flow_subset_mapped.copy()
                    # for each activity, if activities are not sector-like,
                    # check that there is no data loss
                    if load_source_catalog()[k]['sector-like_activities'] is False:
                        activity_list = []
                        for n in names:
                            log.info('Checking for ' + n + ' at ' + method['target_sector_level'])
                            fbs_subset = fbs[((fbs[fba_activity_fields[0]] == n) &
                                              (fbs[fba_activity_fields[1]] == n)) |
                                             (fbs[fba_activity_fields[0]] == n) |
                                             (fbs[fba_activity_fields[1]] == n)].reset_index(drop=True)
                            fbs_subset = check_if_losing_sector_data(fbs_subset,
                                                                     method['target_sector_level'])
                            activity_list.append(fbs_subset)
                        fbs = pd.concat(activity_list, ignore_index=True)
                # if the allocation method for an activity set requires a specific function
                # due to the complicated nature of the allocation, call on the function here
                elif attr['allocation_method'] == 'allocation_function':
                    log.info('Calling on function specified in method yaml to allocate ' +
                             ', '.join(map(str, names)) + ' to sectors')
                    fbs = getattr(sys.modules[__name__],
                                  attr['allocation_source'])(flow_subset_mapped, attr, fbs_list)
                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " + attr['allocation_source'] +
                             " for year " + str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(
                        flowclass=[attr['allocation_source_class']],
                        datasource=attr['allocation_source'],
                        years=[attr['allocation_source_year']]).reset_index(drop=True)

                    # clean df and harmonize units
                    fba_allocation = clean_df(fba_allocation, flow_by_activity_fields,
                                              fba_fill_na_dict)
                    fba_allocation = harmonize_units(fba_allocation)

                    # check if allocation data exists at specified geoscale to use
                    log.info("Checking if allocation data exists at the " +
                             attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(fba_allocation, attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the flowbyactivity source, if necessary
                    fba_allocation = subset_df_by_geoscale(fba_allocation,
                                                           attr['allocation_from_scale'],
                                                           v['geoscale_to_use'])

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(attr['allocation_compartment'])]

                    # clean up the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(sys.modules[__name__],
                                                 attr["clean_allocation_fba"])(fba_allocation, attr=attr)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # assign sector to allocation dataset
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation_wsec = add_sectors_to_flowbyactivity(
                        fba_allocation, sectorsourcename=method['target_sector_source'])

                    # call on fxn to further clean up/disaggregate the fba allocation data, if it exists
                    if 'clean_allocation_fba_w_sec' in attr:
                        log.info("Further disaggregating sectors in " + attr['allocation_source'])
                        fba_allocation_wsec = getattr(sys.modules[__name__],
                                                      attr["clean_allocation_fba_w_sec"])(
                            fba_allocation_wsec, attr=attr, method=method)

                    # subset fba datasets to only keep the sectors associated with activity subset
                    log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(
                        fba_allocation_wsec, k, names,
                        flowSubsetMapped=flow_subset_mapped,
                        allocMethod=attr['allocation_method'])

                    # if there is an allocation helper dataset, modify allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info("Using the specified allocation helper for subset of " +
                                 attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(fba_allocation_subset,
                                                                  attr, method, v)

                    # create flow allocation ratios for each activity
                    # if load_source_catalog()[k]['sector-like_activities']
                    flow_alloc_list = []
                    group_cols = fba_mapped_default_grouping_fields
                    group_cols = [e for e in group_cols
                                  if e not in ('ActivityProducedBy', 'ActivityConsumedBy')]
                    for n in names:
                        log.info("Creating allocation ratios for " + n)
                        fba_allocation_subset_2 = get_fba_allocation_subset(
                            fba_allocation_subset, k, [n],
                            flowSubsetMapped=flow_subset_mapped,
                            allocMethod=attr['allocation_method'])
                        if len(fba_allocation_subset_2) == 0:
                            log.info("No data found to allocate " + n)
                        else:
                            flow_alloc = allocate_by_sector(
                                fba_allocation_subset_2, k,
                                attr['allocation_source'],
                                attr['allocation_method'],
                                group_cols,
                                flowSubsetMapped=flow_subset_mapped)
                            flow_alloc = flow_alloc.assign(FBA_Activity=n)
                            flow_alloc_list.append(flow_alloc)
                    flow_allocation = pd.concat(flow_alloc_list, ignore_index=True)

                    # generalize activity field names to enable link to main fba source
                    log.info("Generalizing activity columns in subset of " +
                             attr['allocation_source'])
                    flow_allocation = collapse_activity_fields(flow_allocation)

                    # check for issues with allocation ratios
                    check_allocation_ratios(flow_allocation, aset, k, method_name)

                    # create list of sectors in the flow allocation df, drop any rows of data
                    # in the flow df that aren't in the list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # subset fba allocation table to the values in the activity list,
                    # based on overlapping sectors
                    flow_subset_mapped = flow_subset_mapped.loc[
                        (flow_subset_mapped[fbs_activity_fields[0]].isin(sector_list)) |
                        (flow_subset_mapped[fbs_activity_fields[1]].isin(sector_list))]

                    # check if fba and allocation dfs have the same LocationSystem
                    log.info("Checking if flowbyactivity and allocation dataframes "
                             "use the same location systems")
                    check_if_location_systems_match(flow_subset_mapped, flow_allocation)

                    # merge fba df w/flow allocation dataset
                    log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                    fbs = flow_subset_mapped.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio', 'FBA_Activity']],
                        left_on=['Location', 'SectorProducedBy', 'ActivityProducedBy'],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')
                    fbs = fbs.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio', 'FBA_Activity']],
                        left_on=['Location', 'SectorConsumedBy', 'ActivityConsumedBy'],
                        right_on=['Location', 'Sector', 'FBA_Activity'],
                        how='left')

                    # merge the flowamount columns
                    fbs.loc[:, 'FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(
                        fbs['FlowAmountRatio_y'])
                    # fill null rows with 0 because no allocation info
                    fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                    # check if fba and alloc dfs have data for the same geoscales -
                    # comment back in after addressing the 'todo'
                    # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(fbs, k, attr['names'])

                    # drop rows where there is no allocation data
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index()

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                    # drop columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y',
                                            'FlowAmountRatio_y', 'FlowAmountRatio',
                                            'FBA_Activity_x', 'FBA_Activity_y'])

                # drop rows where flowamount = 0 (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)

                # define grouping columns dependent on sectors being activity-like or not
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    groupingcols = fbs_grouping_fields_w_activities
                    groupingdict = flow_by_sector_fields_w_activity
                else:
                    groupingcols = fbs_default_grouping_fields
                    groupingdict = flow_by_sector_fields

                # clean df
                fbs = clean_df(fbs, groupingdict, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                # todo: replace with fxn return_from_scale
                log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level")
                if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']
                to_scale = method['target_geoscale']
                fbs_geo_agg = agg_by_geoscale(fbs, from_scale, to_scale, groupingcols)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs_sec_agg = sector_aggregation(fbs_geo_agg, groupingcols)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs_agg = sector_disaggregation(fbs_sec_agg, groupingdict)

                # check if any sector information is lost before reaching the target sector
                # length; if so, allocate values equally to disaggregated sectors
                log.info('Checking for data at ' + method['target_sector_level'])
                fbs_agg_2 = check_if_losing_sector_data(fbs_agg, method['target_sector_level'])

                # compare flowbysector with flowbyactivity
                # todo: modify fxn to work if activities are sector-like in df being allocated
                if load_source_catalog()[k]['sector-like_activities'] is False:
                    check_for_differences_between_fba_load_and_fbs_output(
                        flow_subset_mapped, fbs_agg_2, aset, k, method_name)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])

                # subset df, necessary because not all of the sectors are NAICS
                # and can get duplicate rows
                fbs_1 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)
                fbs_2 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isin(sector_list)) &
                    (fbs_agg_2[fbs_activity_fields[1]].isnull())].reset_index(drop=True)
                fbs_3 = fbs_agg_2.loc[
                    (fbs_agg_2[fbs_activity_fields[0]].isnull()) &
                    (fbs_agg_2[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2, fbs_3])

                # drop activity columns
                fbs_sector_subset = fbs_sector_subset.drop(
                    ['ActivityProducedBy', 'ActivityConsumedBy'], axis=1, errors='ignore')

                # save comparison of FBA total to FBS total for an activity set
                compare_fba_load_and_fbs_output_totals(flows_subset_geo, fbs_sector_subset,
                                                       aset, k, method_name, attr, method,
                                                       mapping_files)

                log.info("Completed flowbysector for activity subset with flows " +
                         ', '.join(map(str, names)))
                fbs_list.append(fbs_sector_subset)
        else:
            # if the loaded flow df is already in FBS format, append directly to list of FBS
            log.info("Append " + k + " to FBS list")
            # ensure correct field datatypes and add any missing fields
            flows = clean_df(flows, flow_by_sector_fields, fbs_fill_na_dict)
            fbs_list.append(flows)

    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbs_list, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same specified sector length
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    # add missing fields, ensure correct data type, reorder columns
    fbss = fbss.sort_values(['SectorProducedBy', 'SectorConsumedBy',
                             'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
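# Example invocation of main() above (the method name is hypothetical; any
# flowbysector method yaml available to load_method() would work the same way):
#
# if __name__ == '__main__':
#     main(method_name='Water_national_2015_m1')
#     # the stored parquet can then be read back, e.g. via flowsa.getFlowBySector()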
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector method yaml name
    :return: flowbysector
    """
    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of data and allocation datasets
    fb = method['source_names']
    # create empty list for storing fbs files
    fbss = []
    for k, v in fb.items():
        # pull fba data for allocation
        flows = load_source_dataframe(k, v)

        if v['data_format'] == 'FBA':
            # clean up fba, if specified in yaml
            if v["clean_fba_df_fxn"] != 'None':
                log.info("Cleaning up " + k + " FlowByActivity")
                flows = getattr(sys.modules[__name__], v["clean_fba_df_fxn"])(flows)

            flows = clean_df(flows, flow_by_activity_fields, fba_fill_na_dict)

            # create dictionary of allocation datasets for different activities
            activities = v['activity_sets']
            # subset activity data and allocate to sector
            for aset, attr in activities.items():
                # subset by named activities
                names = attr['names']
                log.info("Preparing to handle subset of flownames " +
                         ', '.join(map(str, names)) + " in " + k)

                # check if flowbyactivity data exists at specified geoscale to use
                flow_subset_list = []
                for n in names:
                    # subset usgs data by activity
                    flow_subset = flows[(flows[fba_activity_fields[0]] == n) |
                                        (flows[fba_activity_fields[1]] == n)].reset_index(drop=True)
                    log.info("Checking if flowbyactivity data exists for " + n +
                             " at the " + v['geoscale_to_use'] + ' level')
                    geocheck = check_if_data_exists_at_geoscale(flow_subset,
                                                                v['geoscale_to_use'],
                                                                activitynames=n)
                    # aggregate geographically to the scale of the allocation dataset
                    if geocheck == "Yes":
                        activity_from_scale = v['geoscale_to_use']
                    else:
                        # if the activity does not exist at the specified geoscale, issue a
                        # warning, use data at a less aggregated geoscale, and sum to the
                        # specified geoscale
                        log.info("Checking if flowbyactivity data exists for " + n +
                                 " at a less aggregated level")
                        activity_from_scale = check_if_data_exists_at_less_aggregated_geoscale(
                            flow_subset, v['geoscale_to_use'], n)

                    activity_to_scale = attr['allocation_from_scale']
                    # if df is less aggregated than allocation df, aggregate usgs activity
                    # to allocation geoscale
                    if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
                        log.info("Aggregating subset from " + activity_from_scale +
                                 " to " + activity_to_scale)
                        flow_subset = agg_by_geoscale(flow_subset, activity_from_scale,
                                                      activity_to_scale,
                                                      fba_default_grouping_fields, n)
                    # else, aggregate to the geoscale in use
                    elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]:
                        log.info("Aggregating subset from " + activity_from_scale +
                                 " to " + v['geoscale_to_use'])
                        flow_subset = agg_by_geoscale(flow_subset, activity_from_scale,
                                                      v['geoscale_to_use'],
                                                      fba_default_grouping_fields, n)
                    # else, if usgs is more aggregated than allocation table, filter relevant rows
                    else:
                        log.info("Subsetting " + activity_from_scale + " data")
                        flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, n)

                    # add sectors to df activity, depending on level of specified sector aggregation
                    log.info("Adding sectors to " + k + " for " + n)
                    flow_subset_wsec = add_sectors_to_flowbyactivity(
                        flow_subset,
                        sectorsourcename=method['target_sector_source'],
                        levelofSectoragg=attr['activity_sector_aggregation'])
                    flow_subset_list.append(flow_subset_wsec)
                flow_subset_wsec = pd.concat(flow_subset_list, sort=False).reset_index(drop=True)

                # clean up fba with sectors, if specified in yaml
                if v["clean_fba_w_sec_df_fxn"] != 'None':
                    log.info("Cleaning up " + k + " FlowByActivity with sectors")
                    flow_subset_wsec = getattr(sys.modules[__name__],
                                               v["clean_fba_w_sec_df_fxn"])(flow_subset_wsec, attr)

                # map df to elementary flows
                log.info("Mapping flows in " + k + ' to federal elementary flow list')
                flow_subset_wsec = map_elementary_flows(flow_subset_wsec, k)

                # if allocation method is "direct", then no need to create allocation ratios,
                # else use the allocation dataframe to create sector allocation ratios
                if attr['allocation_method'] == 'direct':
                    log.info('Directly assigning ' + ', '.join(map(str, names)) + ' to sectors')
                    fbs = flow_subset_wsec.copy()
                else:
                    # determine appropriate allocation dataset
                    log.info("Loading allocation flowbyactivity " + attr['allocation_source'] +
                             " for year " + str(attr['allocation_source_year']))
                    fba_allocation = flowsa.getFlowByActivity(
                        flowclass=[attr['allocation_source_class']],
                        datasource=attr['allocation_source'],
                        years=[attr['allocation_source_year']]).reset_index(drop=True)

                    fba_allocation = clean_df(fba_allocation, flow_by_activity_fields,
                                              fba_fill_na_dict)

                    # subset based on yaml settings
                    if attr['allocation_flow'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['FlowName'].isin(attr['allocation_flow'])]
                    if attr['allocation_compartment'] != 'None':
                        fba_allocation = fba_allocation.loc[
                            fba_allocation['Compartment'].isin(attr['allocation_compartment'])]

                    # clean up the fba allocation df, if necessary
                    if 'clean_allocation_fba' in attr:
                        log.info("Cleaning " + attr['allocation_source'])
                        fba_allocation = getattr(sys.modules[__name__],
                                                 attr["clean_allocation_fba"])(fba_allocation)
                    # reset index
                    fba_allocation = fba_allocation.reset_index(drop=True)

                    # check if allocation data exists at specified geoscale to use
                    log.info("Checking if allocation data exists at the " +
                             attr['allocation_from_scale'] + " level")
                    check_if_data_exists_at_geoscale(fba_allocation,
                                                     attr['allocation_from_scale'])

                    # aggregate geographically to the scale of the flowbyactivity source, if necessary
                    from_scale = attr['allocation_from_scale']
                    to_scale = v['geoscale_to_use']
                    # if allocation df is less aggregated than FBA df, aggregate allocation
                    # df to target scale
                    if fips_number_key[from_scale] > fips_number_key[to_scale]:
                        fba_allocation = agg_by_geoscale(fba_allocation, from_scale, to_scale,
                                                         fba_default_grouping_fields, names)
                    # else, if usgs is more aggregated than allocation table,
                    # use usgs as both to and from scale
                    else:
                        fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names)

                    # assign sector to allocation dataset
                    log.info("Adding sectors to " + attr['allocation_source'])
                    fba_allocation = add_sectors_to_flowbyactivity(
                        fba_allocation,
                        sectorsourcename=method['target_sector_source'],
                        levelofSectoragg=attr['allocation_sector_aggregation'])

                    # subset fba datasets to only keep the sectors associated with activity subset
                    log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k)
                    fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names)

                    # generalize activity field names to enable link to main fba source
                    log.info("Generalizing activity columns in subset of " +
                             attr['allocation_source'])
                    fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset)

                    # drop columns
                    fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity'])

                    # call on fxn to further disaggregate the fba allocation data, if it exists
                    if 'allocation_disaggregation_fxn' in attr:
                        log.info("Further disaggregating sectors in " + attr['allocation_source'])
                        fba_allocation_subset = getattr(
                            sys.modules[__name__],
                            attr["allocation_disaggregation_fxn"])(fba_allocation_subset, attr)

                    # if there is an allocation helper dataset, modify allocation df
                    if attr['allocation_helper'] == 'yes':
                        log.info("Using the specified allocation helper for subset of " +
                                 attr['allocation_source'])
                        fba_allocation_subset = allocation_helper(fba_allocation_subset,
                                                                  method, attr)

                    # create flow allocation ratios
                    log.info("Creating allocation ratios for " + attr['allocation_source'])
                    flow_allocation = allocate_by_sector(fba_allocation_subset,
                                                         attr['allocation_method'])

                    # create list of sectors in the flow allocation df, drop any rows of data
                    # in the flow df that aren't in the list
                    sector_list = flow_allocation['Sector'].unique().tolist()

                    # subset fba allocation table to the values in the activity list,
                    # based on overlapping sectors
                    flow_subset_wsec = flow_subset_wsec.loc[
                        (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) |
                        (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))]

                    # check if fba and allocation dfs have the same LocationSystem
                    log.info("Checking if flowbyactivity and allocation dataframes "
                             "use the same location systems")
                    check_if_location_systems_match(flow_subset_wsec, flow_allocation)

                    # merge fba df w/flow allocation dataset
                    log.info("Merge " + k + " and subset of " + attr['allocation_source'])
                    fbs = flow_subset_wsec.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio']],
                        left_on=['Location', 'SectorProducedBy'],
                        right_on=['Location', 'Sector'],
                        how='left')
                    fbs = fbs.merge(
                        flow_allocation[['Location', 'Sector', 'FlowAmountRatio']],
                        left_on=['Location', 'SectorConsumedBy'],
                        right_on=['Location', 'Sector'],
                        how='left')

                    # merge the flowamount columns
                    fbs.loc[:, 'FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(
                        fbs['FlowAmountRatio_y'])

                    # check if fba and alloc dfs have data for the same geoscales -
                    # comment back in after addressing the 'todo'
                    # log.info("Checking if flowbyactivity and allocation dataframes have data at the same locations")
                    # check_if_data_exists_for_same_geoscales(fbs, k, attr['names'])

                    # drop rows where there is no allocation data
                    fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index()

                    # calculate flow amounts for each sector
                    log.info("Calculating new flow amounts using flow ratios")
                    fbs.loc[:, 'FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio']

                    # drop columns
                    log.info("Cleaning up new flow by sector")
                    fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y',
                                            'FlowAmountRatio_y', 'FlowAmountRatio',
                                            'ActivityProducedBy', 'ActivityConsumedBy'])

                # drop rows where flowamount = 0 (although this includes dropping suppressed data)
                fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)
                # clean df
                fbs = clean_df(fbs, flow_by_sector_fields, fbs_fill_na_dict)

                # aggregate df geographically, if necessary
                log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level")
                if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]:
                    from_scale = v['geoscale_to_use']
                else:
                    from_scale = attr['allocation_from_scale']
                to_scale = method['target_geoscale']
                fbs = agg_by_geoscale(fbs, from_scale, to_scale,
                                      fbs_default_grouping_fields, names)

                # aggregate data to every sector level
                log.info("Aggregating flowbysector to all sector levels")
                fbs = sector_aggregation(fbs, fbs_default_grouping_fields)
                # add missing naics5/6 when only one naics5/6 associated with a naics4
                fbs = sector_disaggregation(fbs)

                # test agg by sector
                # sector_agg_comparison = sector_flow_comparision(fbs)

                # return sector level specified in method yaml
                # load the crosswalk linking sector lengths
                sector_list = get_sector_list(method['target_sector_level'])
                # add any non-NAICS sectors used with NAICS
                sector_list = add_non_naics_sectors(sector_list, method['target_sector_level'])

                # subset df, necessary because not all of the sectors are NAICS
                # and can get duplicate rows
                fbs_1 = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) &
                                (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)
                fbs_2 = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) |
                                (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True)
                fbs_sector_subset = pd.concat([fbs_1, fbs_2], sort=False)

                # set source name
                fbs_sector_subset.loc[:, 'SectorSourceName'] = method['target_sector_source']

                log.info("Completed flowbysector for activity subset with flows " +
                         ', '.join(map(str, names)))
                fbss.append(fbs_sector_subset)
        else:
            # if the loaded flow df is already in FBS format, append directly to list of FBS
            log.info("Append " + k + " to FBS list")
            fbss.append(flows)

    # create single df of all activities
    log.info("Concat data for all activities")
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    log.info("Clean final dataframe")
    # aggregate df as activities might have data for the same specified sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    log.info("Sort and store dataframe")
    fbss = fbss.replace({'nan': None})
    # add missing fields, ensure correct data type, reorder columns
    fbss = clean_df(fbss, flow_by_sector_fields, fbs_fill_na_dict)
    fbss = fbss.sort_values(['SectorProducedBy', 'SectorConsumedBy',
                             'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
def allocation_helper(df_w_sector, method, attr):
    """
    Used when two dfs are required to create an allocation ratio
    :param df_w_sector:
    :param method:
    :param attr: activity set attributes; currently written for the
        'multiplication' helper method
    :return:
    """
    from flowsa.mapping import add_sectors_to_flowbyactivity

    helper_allocation = flowsa.getFlowByActivity(flowclass=[attr['helper_source_class']],
                                                 datasource=attr['helper_source'],
                                                 years=[attr['helper_source_year']])
    # clean df
    helper_allocation = clean_df(helper_allocation, flow_by_activity_fields, fba_fill_na_dict)
    # drop rows with flowamount = 0
    helper_allocation = helper_allocation[helper_allocation['FlowAmount'] != 0]

    # assign naics to allocation dataset
    helper_allocation = add_sectors_to_flowbyactivity(
        helper_allocation,
        sectorsourcename=method['target_sector_source'],
        levelofSectoragg=attr['helper_sector_aggregation'])
    # generalize activity field names to enable link to water withdrawal table
    helper_allocation = generalize_activity_field_names(helper_allocation)
    # drop columns
    helper_allocation = helper_allocation.drop(columns=['Activity', 'Min', 'Max'])
    # rename column
    helper_allocation = helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})

    # merge allocation df with helper df based on sectors, depending on geo scales of dfs
    if attr['helper_from_scale'] == 'national':
        modified_fba_allocation = df_w_sector.merge(
            helper_allocation[['Sector', 'HelperFlow']], how='left')
    if (attr['helper_from_scale'] == 'state') and (attr['allocation_from_scale'] == 'state'):
        modified_fba_allocation = df_w_sector.merge(
            helper_allocation[['Sector', 'Location', 'HelperFlow']], how='left')
    if (attr['helper_from_scale'] == 'state') and (attr['allocation_from_scale'] == 'county'):
        helper_allocation.loc[:, 'Location_tmp'] = helper_allocation['Location'].apply(
            lambda x: str(x[0:2]))
        df_w_sector.loc[:, 'Location_tmp'] = df_w_sector['Location'].apply(lambda x: str(x[0:2]))
        modified_fba_allocation = df_w_sector.merge(
            helper_allocation[['Sector', 'Location_tmp', 'HelperFlow']], how='left')
        modified_fba_allocation = modified_fba_allocation.drop(columns=['Location_tmp'])

    # todo: modify so if missing data, replaced with value from one geoscale up instead of national
    # if missing values (na or 0), replace with national level values
    replacement_values = helper_allocation[
        helper_allocation['Location'] == US_FIPS].reset_index(drop=True)
    replacement_values = replacement_values.rename(columns={"HelperFlow": 'ReplacementValue'})
    modified_fba_allocation = modified_fba_allocation.merge(
        replacement_values[['Sector', 'ReplacementValue']], how='left')
    modified_fba_allocation.loc[:, 'HelperFlow'] = modified_fba_allocation['HelperFlow'].fillna(
        modified_fba_allocation['ReplacementValue'])
    modified_fba_allocation.loc[:, 'HelperFlow'] = np.where(
        modified_fba_allocation['HelperFlow'] == 0,
        modified_fba_allocation['ReplacementValue'],
        modified_fba_allocation['HelperFlow'])

    # modify flow amounts using helper data
    if attr['helper_method'] == 'multiplication':
        # replace non-existent helper flow values with 0 so the multiplied flow amount is not
        # incorrectly associated with the new unit
        modified_fba_allocation['HelperFlow'] = modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation = modified_fba_allocation.drop(
            columns=["HelperFlow", 'ReplacementValue'])

    # drop rows of 0 to speed up allocation
    modified_fba_allocation = modified_fba_allocation[
        modified_fba_allocation['FlowAmount'] != 0].reset_index(drop=True)

    # todo: modify the unit

    return modified_fba_allocation
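# Numeric sketch of the 'multiplication' helper above (hypothetical values): a county
# allocation row carrying 50 gal/employee, merged with a state HelperFlow of 1,200
# employees, yields FlowAmount = 50 * 1,200 = 60,000; per the todo above, the unit
# should then be updated from 'gal/employee' to 'gal'.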
def disaggregate_cropland(fba_w_sector, attr, method, years_list, sector_column):
    """
    In the event there are 4 (or 5) digit naics for cropland at the county level,
    use state level harvested cropland to create ratios
    :param fba_w_sector:
    :param attr:
    :param method:
    :param years_list:
    :param sector_column: The sector column on which to make df modifications
        (SectorProducedBy or SectorConsumedBy)
    :return:
    """
    import flowsa
    from flowsa.flowbyfunctions import sector_aggregation, \
        fbs_default_grouping_fields, clean_df, fba_fill_na_dict, fbs_fill_na_dict, \
        add_missing_flow_by_fields, sector_disaggregation, sector_ratios, \
        replace_strings_with_NoneType, replace_NoneType_with_empty_cells, harmonize_units
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # drop pastureland data
    crop = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop[sector_column].apply(lambda x: len(x) > 3)].reset_index(drop=True)
    # create tmp location
    crop = crop.assign(Location_tmp=crop['Location'].apply(lambda x: x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics_load = flowsa.getFlowByActivity(flowclass=['Land'],
                                          years=years_list,
                                          datasource="USDA_CoA_Cropland_NAICS").reset_index(drop=True)
    # clean df
    naics = clean_df(naics_load, flow_by_activity_fields, fba_fill_na_dict)
    naics = harmonize_units(naics)
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] == 'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # drop the activities that include '&'
    naics = naics[~naics['ActivityConsumedBy'].str.contains('&')].reset_index(drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(naics, sectorsourcename=method['target_sector_source'])
    # add missing fbs fields
    naics = clean_df(naics, flow_by_sector_fields, fbs_fill_na_dict)
    # drop cols and rename
    # naics = naics.drop(columns=["SectorProducedBy"])
    # naics = naics.rename(columns={"SectorConsumedBy": sector_column})

    # aggregate sectors to create any missing naics levels
    group_cols = fbs_default_grouping_fields
    # group_cols = [e for e in group_cols if e not in ('SectorProducedBy', 'SectorConsumedBy')]
    # group_cols.append(sector_column)
    naics2 = sector_aggregation(naics, group_cols)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics3 = sector_disaggregation(naics2, group_cols)
    # drop rows where flow amount = 0
    # naics3 = naics3[~((naics3['SectorProducedBy'] == '') & (naics3['SectorConsumedBy'] == ''))]
    naics3 = naics3.loc[naics3['FlowAmount'] != 0]
    # create ratios
    naics4 = sector_ratios(naics3, sector_column)
    # create temporary sector column to match the two dfs on
    naics4 = naics4.assign(Location_tmp=naics4['Location'].apply(lambda x: x[0:2]))
    # tmp drop NoneTypes
    naics4 = replace_NoneType_with_empty_cells(naics4)

    # loop through naics lengths to determine naics 4 and 5 digits to disaggregate
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop[sector_column].apply(lambda x: i + 1 >= len(x) >= i)]
        crop_subset = crop_subset.assign(
            Sector_tmp=crop_subset[sector_column].apply(lambda x: x[0:i]))
        # if duplicates, drop all rows
        df = crop_subset.drop_duplicates(subset=['Location', 'Sector_tmp'],
                                         keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = naics4.loc[naics4[sector_column].apply(
            lambda x: len(x) == i + 1)].reset_index(drop=True)
        naics_subset = naics_subset.assign(
            Sector_tmp=naics_subset[sector_column].apply(lambda x: x[0:i]))
        # merge the two dfs based on locations
        df_subset = pd.merge(df_subset,
                             naics_subset[[sector_column, 'FlowAmountRatio',
                                           'Sector_tmp', 'Location_tmp']],
                             how='left',
                             left_on=[sector_column, 'Location_tmp'],
                             right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = df_subset['FlowAmount'] * df_subset['FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[~df_subset['FlowAmount'].isna()].reset_index(drop=True)
        # drop columns
        df_subset = df_subset.drop(columns=[sector_column + '_x', 'FlowAmountRatio', 'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(columns={sector_column + '_y': sector_column})
        # tmp drop NoneTypes
        df_subset = replace_NoneType_with_empty_cells(df_subset)
        # add new rows of data to crop df
        crop = pd.concat([crop, df_subset], sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])

    # pasture data
    pasture = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) == '112'].reset_index(drop=True)
    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop], sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
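# Toy illustration of the disaggregation loop above (hypothetical values): a county
# cropland row reported at NAICS4 '1111' with FlowAmount = 100 acres, merged with
# state-level harvested-cropland ratios for its NAICS5 children, say
# {'11111': 0.6, '11112': 0.4}, is replaced by two NAICS5 rows of 60 and 40 acres;
# the same pass then repeats from NAICS5 to NAICS6.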
def disaggregate_pastureland(fba_w_sector, attr, method, years_list, sector_column):
    """
    The USDA CoA Cropland irrigated pastureland data only links to the 3 digit NAICS '112'.
    This function uses state level CoA 'Land in Farms' data to allocate the county level
    acreage data to 6 digit NAICS.
    :param fba_w_sector: The CoA Cropland dataframe after linked to sectors
    :param attr:
    :param method:
    :param years_list:
    :param sector_column: The sector column on which to make df modifications
        (SectorProducedBy or SectorConsumedBy)
    :return: The CoA cropland dataframe with disaggregated pastureland data
    """
    import flowsa
    from flowsa.flowbyfunctions import allocate_by_sector, clean_df, flow_by_activity_fields, \
        fba_fill_na_dict, replace_strings_with_NoneType, replace_NoneType_with_empty_cells, \
        fba_mapped_default_grouping_fields, harmonize_units
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # subset the coa data so only pastureland
    p = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) == '112'].reset_index(drop=True)
    if len(p) != 0:
        # add temp loc column for state fips
        p = p.assign(Location_tmp=p['Location'].apply(lambda x: x[0:2]))
        df_sourcename = pd.unique(p['SourceName'])[0]

        # load usda coa cropland naics
        df_class = ['Land']
        df_years = years_list
        df_allocation = 'USDA_CoA_Cropland_NAICS'
        df_f = flowsa.getFlowByActivity(flowclass=df_class, years=df_years,
                                        datasource=df_allocation)
        df_f = clean_df(df_f, flow_by_activity_fields, fba_fill_na_dict)
        df_f = harmonize_units(df_f)
        # subset to land in farms data
        df_f = df_f[df_f['FlowName'] == 'FARM OPERATIONS']
        # subset to rows related to pastureland
        df_f = df_f.loc[df_f['ActivityConsumedBy'].apply(lambda x: x[0:3]) == '112']
        # drop rows with '&'
        df_f = df_f[~df_f['ActivityConsumedBy'].str.contains('&')]
        # create sector columns
        df_f = add_sectors_to_flowbyactivity(df_f,
                                             sectorsourcename=method['target_sector_source'])
        # create proportional ratios
        group_cols = fba_mapped_default_grouping_fields
        group_cols = [e for e in group_cols
                      if e not in ('ActivityProducedBy', 'ActivityConsumedBy')]
        df_f = allocate_by_sector(df_f, df_sourcename, df_allocation, 'proportional', group_cols)
        # tmp drop NoneTypes
        df_f = replace_NoneType_with_empty_cells(df_f)
        # drop naics = '11'
        df_f = df_f[df_f[sector_column] != '11']
        # drop 000 in location
        df_f = df_f.assign(Location=df_f['Location'].apply(lambda x: x[0:2]))

        # merge the coa pastureland data with the land in farms data
        df = p.merge(df_f[[sector_column, 'Location', 'FlowAmountRatio']], how='left',
                     left_on="Location_tmp", right_on="Location")
        # multiply the flowamount by the flowratio
        df.loc[:, 'FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
        # drop columns and rename
        df = df.drop(columns=['Location_tmp', sector_column + '_x',
                              'Location_y', 'FlowAmountRatio'])
        df = df.rename(columns={sector_column + '_y': sector_column,
                                "Location_x": 'Location'})

        # drop rows where sector = 112 and then concat with original fba_w_sector
        fba_w_sector = fba_w_sector[fba_w_sector[sector_column].apply(
            lambda x: x[0:3]) != '112'].reset_index(drop=True)
        fba_w_sector = pd.concat([fba_w_sector, df], sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
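# Sketch of the pastureland disaggregation above (hypothetical values): a county row
# linked only to NAICS3 '112' with 900 acres, merged with its state's proportional
# 'Land in Farms' ratios for the child sectors, say 2/3 and 1/3, becomes two
# disaggregated rows of 600 and 300 acres carrying those child NAICS codes.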
def main(method_name): """ Creates a flowbysector dataset :param method_name: Name of method corresponding to flowbysector method yaml name :return: flowbysector """ log.info("Initiating flowbysector creation for " + method_name) # call on method method = load_method(method_name) # create dictionary of water data and allocation datasets fbas = method['flowbyactivity_sources'] # Create empty list for storing fbs files fbss = [] for k, v in fbas.items(): # pull water data for allocation log.info("Retrieving flowbyactivity for datasource " + k + " in year " + str(v['year'])) flows = flowsa.getFlowByActivity(flowclass=[v['class']], years=[v['year']], datasource=k) # if necessary, standardize names in data set if v['activity_name_standardization_fxn'] != 'None': log.info("Standardizing activity names in " + k) flows = getattr(sys.modules[__name__], v['activity_name_standardization_fxn'])(flows) # drop description field flows = flows.drop(columns='Description') # fill null values flows = flows.fillna(value=fba_fill_na_dict) # map df to elementary flows - commented out until mapping complete # log.info("Mapping flows in " + k + ' to federal elementary flow list') # flows_mapped = map_elementary_flows(flows, k) # convert unit todo: think about unit conversion here log.info("Converting units in " + k) flows = convert_unit(flows) # create dictionary of allocation datasets for different activities activities = v['activity_sets'] for aset, attr in activities.items(): # subset by named activities names = [attr['names']] log.info("Preparing to handle subset of flownames " + ', '.join(map(str, names)) + " in " + k) # subset usgs data by activity flow_subset = flows[(flows[fba_activity_fields[0]].isin(names)) | (flows[fba_activity_fields[1]].isin(names))] # Reset index values after subset flow_subset = flow_subset.reset_index(drop=True) # check if flowbyactivity data exists at specified geoscale to use log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at the " + v['geoscale_to_use'] + ' level') geocheck = check_if_data_exists_at_geoscale(flow_subset, names, v['geoscale_to_use']) # aggregate geographically to the scale of the allocation dataset if geocheck == "Yes": activity_from_scale = v['geoscale_to_use'] else: # if activity does not exist at specified geoscale, issue warning and use data at less aggregated # geoscale, and sum to specified geoscale log.info("Checking if flowbyactivity data exists for " + ', '.join(map(str, names)) + " at a less aggregated level") new_geoscale_to_use = check_if_data_exists_at_less_aggregated_geoscale(flow_subset, names, v['geoscale_to_use']) activity_from_scale = new_geoscale_to_use activity_to_scale = attr['allocation_from_scale'] # if usgs is less aggregated than allocation df, aggregate usgs activity to target scale if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]: log.info("Aggregating subset from " + activity_from_scale + " to " + activity_to_scale) flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, activity_to_scale, fba_default_grouping_fields, names) # else, aggregate to geoscale want to use elif fips_number_key[activity_from_scale] > fips_number_key[v['geoscale_to_use']]: log.info("Aggregating subset from " + activity_from_scale + " to " + v['geoscale_to_use']) flow_subset = agg_by_geoscale(flow_subset, activity_from_scale, v['geoscale_to_use'], fba_default_grouping_fields, names) # else, if usgs is more aggregated than allocation table, filter relevant rows else: log.info("Filtering 
out " + activity_from_scale + " data") flow_subset = filter_by_geoscale(flow_subset, activity_from_scale, names) # location column pad zeros if necessary flow_subset['Location'] = flow_subset['Location'].apply(lambda x: x.ljust(3 + len(x), '0') if len(x) < 5 else x ) # Add sectors to usgs activity, creating two versions of the flow subset # the first version "flow_subset" is the most disaggregated version of the Sectors (NAICS) # the second version, "flow_subset_agg" includes only the most aggregated level of sectors log.info("Adding sectors to " + k + " for " + ', '.join(map(str, names))) flow_subset_wsec = add_sectors_to_flowbyactivity(flow_subset, sectorsourcename=method['target_sector_source']) flow_subset_wsec_agg = add_sectors_to_flowbyactivity(flow_subset, sectorsourcename=method['target_sector_source'], levelofSectoragg='agg') # if allocation method is "direct", then no need to create alloc ratios, else need to use allocation # dataframe to create sector allocation ratios if attr['allocation_method'] == 'direct': fbs = flow_subset_wsec_agg.copy() else: # determine appropriate allocation dataset log.info("Loading allocation flowbyactivity " + attr['allocation_source'] + " for year " + str(attr['allocation_source_year'])) fba_allocation = flowsa.getFlowByActivity(flowclass=[attr['allocation_source_class']], datasource=attr['allocation_source'], years=[attr['allocation_source_year']]).reset_index(drop=True) # fill null values fba_allocation = fba_allocation.fillna(value=fba_fill_na_dict) # convert unit fba_allocation = convert_unit(fba_allocation) # subset based on yaml settings if attr['allocation_flow'] != 'None': fba_allocation = fba_allocation.loc[fba_allocation['FlowName'].isin(attr['allocation_flow'])] if attr['allocation_compartment'] != 'None': fba_allocation = fba_allocation.loc[ fba_allocation['Compartment'].isin(attr['allocation_compartment'])] # reset index fba_allocation = fba_allocation.reset_index(drop=True) # check if allocation data exists at specified geoscale to use log.info("Checking if" + " allocation data exists for " + ', '.join(map(str, names)) + " at the " + attr['allocation_from_scale'] + " level") check_if_data_exists_at_geoscale(fba_allocation, names, attr['allocation_from_scale']) # aggregate geographically to the scale of the flowbyactivty source, if necessary from_scale = attr['allocation_from_scale'] to_scale = v['geoscale_to_use'] # if allocation df is less aggregated than FBA df, aggregate allocation df to target scale if fips_number_key[from_scale] > fips_number_key[to_scale]: fba_allocation = agg_by_geoscale(fba_allocation, from_scale, to_scale, fba_default_grouping_fields, names) # else, if usgs is more aggregated than allocation table, use usgs as both to and from scale else: fba_allocation = filter_by_geoscale(fba_allocation, from_scale, names) # assign sector to allocation dataset log.info("Adding sectors to " + attr['allocation_source']) fba_allocation = add_sectors_to_flowbyactivity(fba_allocation, sectorsourcename=method['target_sector_source'], levelofSectoragg=attr[ 'allocation_sector_aggregation']) # subset fba datsets to only keep the naics associated with usgs activity subset log.info("Subsetting " + attr['allocation_source'] + " for sectors in " + k) fba_allocation_subset = get_fba_allocation_subset(fba_allocation, k, names) # Reset index values after subset fba_allocation_subset = fba_allocation_subset.reset_index(drop=True) # generalize activity field names to enable link to water withdrawal table log.info("Generalizing activity names 
in subset of " + attr['allocation_source']) fba_allocation_subset = generalize_activity_field_names(fba_allocation_subset) # drop columns fba_allocation_subset = fba_allocation_subset.drop(columns=['Activity']) # if there is an allocation helper dataset, modify allocation df if attr['allocation_helper'] == 'yes': log.info("Using the specified allocation help for subset of " + attr['allocation_source']) fba_allocation_subset = allocation_helper(fba_allocation_subset, method, attr) # create flow allocation ratios log.info("Creating allocation ratios for " + attr['allocation_source']) flow_allocation = allocate_by_sector(fba_allocation_subset, attr['allocation_method']) # create list of sectors in the flow allocation df, drop any rows of data in the flow df that \ # aren't in list sector_list = flow_allocation['Sector'].unique().tolist() # subset fba allocation table to the values in the activity list, based on overlapping sectors flow_subset_wsec = flow_subset_wsec.loc[ (flow_subset_wsec[fbs_activity_fields[0]].isin(sector_list)) | (flow_subset_wsec[fbs_activity_fields[1]].isin(sector_list))] # check if fba and allocation dfs have the same LocationSystem log.info("Checking if flowbyactivity and allocation dataframes use the same location systems") check_if_location_systems_match(flow_subset_wsec, flow_allocation) # merge water withdrawal df w/flow allocation dataset log.info("Merge " + k + " and subset of " + attr['allocation_source']) fbs = flow_subset_wsec.merge( flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']], left_on=['Location', 'LocationSystem', 'SectorProducedBy'], right_on=['Location', 'LocationSystem', 'Sector'], how='left') fbs = fbs.merge( flow_allocation[['Location', 'LocationSystem', 'Sector', 'FlowAmountRatio']], left_on=['Location', 'LocationSystem', 'SectorConsumedBy'], right_on=['Location', 'LocationSystem', 'Sector'], how='left') # drop columns where both sector produced/consumed by in flow allocation dif is null fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'], how='all').reset_index() # merge the flowamount columns fbs['FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(fbs['FlowAmountRatio_y']) fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0) # calculate flow amounts for each sector log.info("Calculating new flow amounts using flow ratios") fbs['FlowAmount'] = fbs['FlowAmount'] * fbs['FlowAmountRatio'] # drop columns log.info("Cleaning up new flow by sector") fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x', 'Sector_y', 'FlowAmountRatio_y', 'FlowAmountRatio', 'ActivityProducedBy', 'ActivityConsumedBy']) # rename flow name to flowable fbs = fbs.rename(columns={"FlowName": 'Flowable', "Compartment": "Context" }) # drop rows where flowamount = 0 (although this includes dropping suppressed data) fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True) # add missing data columns fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields) # fill null values fbs = fbs.fillna(value=fbs_fill_na_dict) # aggregate df geographically, if necessary log.info("Aggregating flowbysector to " + method['target_geoscale'] + " level") if fips_number_key[v['geoscale_to_use']] < fips_number_key[attr['allocation_from_scale']]: from_scale = v['geoscale_to_use'] else: from_scale = attr['allocation_from_scale'] to_scale = method['target_geoscale'] fbs = agg_by_geoscale(fbs, from_scale, to_scale, fbs_default_grouping_fields, names) # aggregate data to every sector level log.info("Aggregating flowbysector to " + method['target_sector_level']) fbs = 
sector_aggregation(fbs, fbs_default_grouping_fields) # test agg by sector sector_agg_comparison = sector_flow_comparision(fbs) # return sector level specified in method yaml # load the crosswalk linking sector lengths cw = load_sector_length_crosswalk() sector_list = cw[method['target_sector_level']].unique().tolist() # add any non-NAICS sectors used with NAICS household = load_household_sector_codes() household = household.loc[household['NAICS_Level_to_Use_For'] == method['target_sector_level']] # add household sector to sector list sector_list.extend(household['Code'].tolist()) # subset df fbs = fbs.loc[(fbs[fbs_activity_fields[0]].isin(sector_list)) | (fbs[fbs_activity_fields[1]].isin(sector_list))].reset_index(drop=True) # add any missing columns of data and cast to appropriate data type fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields) log.info("Completed flowbysector for activity subset with flows " + ', '.join(map(str, names))) fbss.append(fbs) # create single df of all activities fbss = pd.concat(fbss, ignore_index=True, sort=False) # aggregate df as activities might have data for the same specified sector length fbss = aggregator(fbss, fbs_default_grouping_fields) # sort df fbss = fbss.sort_values( ['SectorProducedBy', 'SectorConsumedBy', 'Flowable', 'Context']).reset_index(drop=True) # save parquet file store_flowbysector(fbss, method_name)
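
# The merge-and-multiply step above is the heart of 'proportional' allocation:
# ratios are merged onto the flow df by location and sector, then FlowAmount is
# scaled by FlowAmountRatio. The sketch below is illustrative only and not part
# of flowsa; the toy dataframes and the _demo_ function name are hypothetical,
# though the column names mirror the conventions used above.
def _demo_allocation_ratio_merge():
    import pandas as pd

    # hypothetical flows for one location, tagged with a produced-by sector
    flows = pd.DataFrame({
        'Location': ['00000', '00000'],
        'LocationSystem': ['FIPS', 'FIPS'],
        'SectorProducedBy': ['2122', '2123'],
        'FlowAmount': [100.0, 50.0]})
    # hypothetical allocation ratios by location and sector
    ratios = pd.DataFrame({
        'Location': ['00000'],
        'LocationSystem': ['FIPS'],
        'Sector': ['2123'],
        'FlowAmountRatio': [0.25]})
    # merge ratios onto the flows, mirroring the SectorProducedBy merge above
    merged = flows.merge(
        ratios,
        left_on=['Location', 'LocationSystem', 'SectorProducedBy'],
        right_on=['Location', 'LocationSystem', 'Sector'],
        how='left')
    # unmatched sectors receive a ratio of 0, as in the fillna(0) step above
    merged['FlowAmountRatio'] = merged['FlowAmountRatio'].fillna(0)
    # allocate: 100 * 0 = 0 for '2122'; 50 * 0.25 = 12.5 for '2123'
    merged['FlowAmount'] = merged['FlowAmount'] * merged['FlowAmountRatio']
    return merged
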
def iteratively_determine_flows_requiring_disaggregation(df_load, attr, method):
    """
    The MECS Land data provides FlowAmounts for NAICS3-6. BLS QCEW employment
    data is used to determine land use for different industries. To accurately
    estimate land use per industry, existing FlowAmounts for a particular NAICS
    level (NAICS6, for example) should be subtracted from the possible
    FlowAmounts for other NAICS6 codes that share the first 5 digits. For
    example, the 2014 dataset contains data for '311', '3112', and '311221'.
    FlowAmounts allocated by employment to NAICS6 codes are based on the
    provided '3112' FlowAmounts. However, because data already exist for one
    NAICS6 code (311221), that FlowAmount should be subtracted from the other
    NAICS6 codes to accurately depict the remaining FlowAmount that requires a
    secondary source (employment data) for allocation.
    :param df_load: a FlowByActivity dataframe with sector columns
    :param attr: activity set attributes from the method yaml
    :param method: the flowbysector method yaml
    :return: a dataframe with a column 'disaggregate_flag'; if '1', the row
             requires a secondary source to calculate FlowAmount, if '0', the
             FlowAmount does not require modification
    """

    from flowsa.flowbyfunctions import replace_strings_with_NoneType, replace_NoneType_with_empty_cells
    from flowsa.mapping import add_sectors_to_flowbyactivity

    # original df - subset
    # subset cols of original df
    dfo = df_load[['FlowAmount', 'Location', 'SectorConsumedBy']]
    # add a column of the sector with the last digit dropped
    dfo = dfo.assign(
        SectorMatch=dfo['SectorConsumedBy'].apply(lambda x: x[:len(x) - 1]))
    # sum flowamounts based on the sector match col
    dfo2 = dfo.groupby(['Location', 'SectorMatch'], as_index=False)['FlowAmount'] \
        .sum().rename(columns={'FlowAmount': 'SubtractFlow'})
    dfo2 = dfo2.assign(
        SectorLengthMatch=dfo2['SectorMatch'].apply(lambda x: len(x) + 1))

    # new df
    # in the original df, drop the sector columns and re-add sectors, this time
    # with sector level = 'aggregated'
    dfn = df_load.drop(columns=[
        'SectorProducedBy', 'ProducedBySectorType', 'SectorConsumedBy',
        'ConsumedBySectorType', 'SectorSourceName'
    ])
    dfn = add_sectors_to_flowbyactivity(
        dfn,
        sectorsourcename=method['target_sector_source'],
        overwrite_sectorlevel='aggregated')
    # add a column of sector length
    dfn = dfn.assign(
        SectorLength=dfn['SectorConsumedBy'].apply(lambda x: len(x)))
    # add a column noting that these rows require an allocation ratio
    dfn = dfn.assign(disaggregate_flag=1)

    # create a list of sectors to drop
    list_original = df_load['ActivityConsumedBy'].drop_duplicates().tolist()
    # drop values in the original df
    dfn2 = dfn[~dfn['SectorConsumedBy'].isin(list_original)].reset_index(drop=True)
    # sort the df by 'ActivityConsumedBy' and drop duplicated rows of
    # SectorConsumedBy, keeping the last entry (where ActivityConsumedBy has the
    # greater sector length)
    dfn2 = dfn2.sort_values(['ActivityConsumedBy', 'SectorConsumedBy'])
    dfn3 = dfn2.drop_duplicates('SectorConsumedBy', keep='last').reset_index(drop=True)

    # add columns on which to match
    dfn3 = dfn3.assign(NAICS3=dfn3.apply(
        lambda x: x['SectorConsumedBy'][0:3] if len(x['ActivityConsumedBy']) <= 3 else 0, axis=1))
    dfn3 = dfn3.assign(NAICS4=dfn3.apply(
        lambda x: x['SectorConsumedBy'][0:4] if len(x['ActivityConsumedBy']) <= 4 else 0, axis=1))
    dfn3 = dfn3.assign(NAICS5=dfn3.apply(
        lambda x: x['SectorConsumedBy'][0:5] if len(x['ActivityConsumedBy']) <= 5 else 0, axis=1))

    # merge the two dfs and create new flowamounts for allocation
    # first merge the new df with the subset original df where activity = sector match
    df = pd.merge(dfn3,
                  dfo2[['Location', 'SectorMatch', 'SubtractFlow']],
                  how='left',
                  left_on=['Location', 'ActivityConsumedBy'],
                  right_on=['Location', 'SectorMatch']
                  ).rename(columns={'SubtractFlow': 'SubtractFlow1'}).drop(columns='SectorMatch')

    # then match the new df with the subset original df a second time, this time
    # where a NAICS3-5 prefix of the row's sector equals the sector match
    def match_flows(row):
        # conditions: same location, a sector match no longer than the row's
        # sector, a sector match that differs from the activity consumed by,
        # and a NAICS3-5 prefix of the row's sector that equals the sector match
        condition1 = dfo2['Location'] == row['Location']
        condition2 = dfo2['SectorLengthMatch'] <= row['SectorLength']
        condition3 = dfo2['SectorMatch'] != row['ActivityConsumedBy']
        condition4 = ((row['NAICS3'] == dfo2['SectorMatch']) |
                      (row['NAICS4'] == dfo2['SectorMatch']) |
                      (row['NAICS5'] == dfo2['SectorMatch']))
        curr_df = dfo2[condition1 & condition2 & condition3 & condition4]
        try:
            row['SubtractFlow2'] = curr_df['SubtractFlow'].iloc[0]
        except IndexError:
            row['SubtractFlow2'] = 0
        return row

    df2 = df.apply(match_flows, axis=1)

    # calculate new flow amounts
    df2['SubtractFlow1'] = df2['SubtractFlow1'].fillna(0)
    df2['FlowAmount'] = df2['FlowAmount'] - df2['SubtractFlow1'] - df2['SubtractFlow2']
    # drop columns
    df3 = df2.drop(columns=[
        'SectorLength', 'NAICS3', 'NAICS4', 'NAICS5', 'SubtractFlow1', 'SubtractFlow2'
    ])

    # merge the original df with the modified df
    # add a disaggregate_flag column to the original df
    df_load = df_load.assign(disaggregate_flag=0)
    # concat the two dfs and sort
    df_c = pd.concat([df_load, df3],
                     ignore_index=True).sort_values(['SectorConsumedBy']).reset_index(drop=True)
    df_c = replace_strings_with_NoneType(df_c)

    return df_c
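
# Worked example of the subtraction logic in
# iteratively_determine_flows_requiring_disaggregation(). The flow values and
# the _demo_ function name are hypothetical; only the NAICS codes come from the
# docstring example above.
def _demo_subtract_reported_flows():
    # suppose the source reports 90 units for NAICS4 '3112' and 30 units for
    # NAICS6 '311221'; '311221' already has data, so only the remainder should
    # be allocated by employment across the unreported NAICS6 codes under '3112'
    parent_flow = 90.0           # hypothetical FlowAmount for '3112'
    reported_naics6_flow = 30.0  # hypothetical FlowAmount for '311221'
    remaining = parent_flow - reported_naics6_flow
    # 90 - 30 = 60 units remain for secondary-source allocation
    assert remaining == 60.0
    return remaining
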
def convert_blackhurst_data_to_gal_per_employee(df_wsec, attr, method):
    """
    Convert Blackhurst water withdrawal data to gallons per employee, using
    2002 BLS QCEW employment data as the helper dataset.
    :param df_wsec: Blackhurst water flows with sector columns
    :param attr: activity set attributes from the method yaml
    :param method: the flowbysector method yaml
    :return: a dataframe of water flows with units of gal/employee
    """

    import flowsa
    from flowsa.mapping import add_sectors_to_flowbyactivity
    from flowsa.flowbyfunctions import clean_df, fba_fill_na_dict, \
        proportional_allocation_by_location_and_activity, filter_by_geoscale, \
        harmonize_units
    from flowsa.BLS_QCEW import clean_bls_qcew_fba

    # load 2002 employment data and subset to the national level
    bls = flowsa.getFlowByActivity(flowclass=['Employment'],
                                   datasource='BLS_QCEW',
                                   years=[2002])
    bls = filter_by_geoscale(bls, 'national')

    # clean df
    bls = clean_df(bls, flow_by_activity_fields, fba_fill_na_dict)
    bls = harmonize_units(bls)
    bls = clean_bls_qcew_fba(bls, attr=attr)

    # assign naics to the allocation dataset
    bls_wsec = add_sectors_to_flowbyactivity(
        bls, sectorsourcename=method['target_sector_source'])
    # drop rows where sector = None (does not occur with mining)
    bls_wsec = bls_wsec[~bls_wsec['SectorProducedBy'].isnull()]
    bls_wsec = bls_wsec.rename(columns={
        'SectorProducedBy': 'Sector',
        'FlowAmount': 'HelperFlow'
    })

    # merge the two dfs
    df = pd.merge(df_wsec,
                  bls_wsec[['Location', 'Sector', 'HelperFlow']],
                  how='left',
                  left_on=['Location', 'SectorConsumedBy'],
                  right_on=['Location', 'Sector'])
    # drop any rows where sector is None
    df = df[~df['Sector'].isnull()]
    # fill HelperFlow values with 0
    df['HelperFlow'] = df['HelperFlow'].fillna(0)

    # calculate proportional ratios
    df_wratio = proportional_allocation_by_location_and_activity(df, 'Sector')
    df_wratio = df_wratio.rename(columns={
        'FlowAmountRatio': 'EmployeeRatio',
        'HelperFlow': 'Employees'
    })
    # drop rows where the employee count is 0
    df_wratio = df_wratio[df_wratio['Employees'] != 0]

    # calculate gal/employee in 2002
    df_wratio.loc[:, 'FlowAmount'] = \
        (df_wratio['FlowAmount'] * df_wratio['EmployeeRatio']) / df_wratio['Employees']
    df_wratio.loc[:, 'Unit'] = 'gal/employee'

    # drop cols
    df_wratio = df_wratio.drop(columns=['Sector', 'Employees', 'EmployeeRatio'])

    return df_wratio
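
# Minimal numeric sketch of the gal/employee conversion performed above. All
# values and the _demo_ function name are hypothetical: FlowAmount is scaled by
# the proportional allocation ratio (EmployeeRatio) and divided by the employee
# count to yield a 2002 water-use coefficient.
def _demo_gal_per_employee():
    flow_amount = 1000000.0  # hypothetical water withdrawal, gal
    employee_ratio = 0.4     # hypothetical proportional allocation ratio
    employees = 2000.0       # hypothetical BLS QCEW employee count
    # (1,000,000 * 0.4) / 2,000 = 200 gal/employee
    return (flow_amount * employee_ratio) / employees
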