def link_non_bls_naics_to_naics(df):
    """
    BLS uses some 6-digit NAICS codes that are not recognized by Census.
    Map those BLS codes to NAICS.
    :param df: df with BLS-based 'Activity' and 'Sector' columns
    :return: df with 'Sector' remapped for unrecognized 6-digit BLS codes
    """
    # load sector crosswalk
    cw = load_sector_length_crosswalk()
    cw_sub = cw[['NAICS_6']].drop_duplicates()

    # subset the df to the 6 digit naics
    df_sub = df[df['Activity'].apply(lambda x: len(x) == 6)]
    # create a list of 6 digit bls activities that are not naics
    unmapped_df = pd.merge(df_sub, cw_sub, indicator=True, how='outer',
                           left_on='Activity', right_on='NAICS_6'
                           ).query('_merge=="left_only"').drop(
        '_merge', axis=1).reset_index(drop=True)
    act_list = unmapped_df['Activity'].values.tolist()

    # if in the activity list, the sector should be modified so the
    # last digit is 0
    df['Sector'] = np.where(df['Activity'].isin(act_list),
                            df['Activity'].apply(lambda x: x[0:5]) + '0',
                            df['Sector'])

    return df
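# A minimal usage sketch for link_non_bls_naics_to_naics. The codes below are
# hypothetical and only illustrate the expected column layout; the function
# still requires flowsa's sector length crosswalk to be loadable.
#
#   bls = pd.DataFrame({'Activity': ['221111', '221118'],
#                       'Sector': ['221111', '221118']})
#   bls = link_non_bls_naics_to_naics(bls)
#   # if '221118' is not a Census NAICS_6 code, its Sector becomes '221110'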
def naics_expansion(facility_NAICS):
    """
    Modeled after sector_disaggregation in flowbyfunctions; updates NAICS
    to more granular sectors if there is only one child NAICS at the
    lower level
    :param facility_NAICS: df of facilities from facility matcher with NAICS
    :return: df of facilities with NAICS expanded to 5 and 6 digits where
             the child code is unique
    """
    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk()
    cw = cw_load[['NAICS_4', 'NAICS_5', 'NAICS_6']]

    # subset the naics 4 and 5 columns, keeping only the NAICS_4 codes with
    # a single NAICS_5 child; drop duplicate pairs first so a parent is not
    # discarded merely because it appears on multiple crosswalk rows
    cw4 = cw_load[['NAICS_4', 'NAICS_5']].drop_duplicates()
    cw4 = cw4.drop_duplicates(subset=['NAICS_4'],
                              keep=False).reset_index(drop=True)
    naics4 = cw4['NAICS_4'].values.tolist()

    # subset the naics 5 and 6 columns
    cw5 = cw_load[['NAICS_5', 'NAICS_6']].drop_duplicates()
    cw5 = cw5.drop_duplicates(subset=['NAICS_5'],
                              keep=False).reset_index(drop=True)
    naics5 = cw5['NAICS_5'].values.tolist()

    # loop through NAICS lengths 4 and 5, appending the next NAICS level
    # to the df where the child code is unique
    for i in range(4, 6):
        if i == 4:
            sector_list = naics4
            sector_merge = "NAICS_4"
            sector_add = "NAICS_5"
        elif i == 5:
            sector_list = naics5
            sector_merge = "NAICS_5"
            sector_add = "NAICS_6"

        # subset df to NAICS with length = i
        df_subset = facility_NAICS.loc[facility_NAICS["NAICS"].apply(
            lambda x: len(x) == i)]

        # subset the df to the rows where the NAICS codes are in the
        # unique-child list
        df_subset = df_subset.loc[df_subset['NAICS'].isin(sector_list)]
        # merge the naics cw; drop duplicate crosswalk pairs so the merge
        # does not duplicate facility rows
        new_naics = pd.merge(df_subset,
                             cw[[sector_merge, sector_add]].drop_duplicates(),
                             how='left', left_on=['NAICS'],
                             right_on=[sector_merge])
        # drop columns and rename new sector columns
        new_naics['NAICS'] = new_naics[sector_add]
        new_naics = new_naics.drop(columns=[sector_merge, sector_add])

        # drop records with NAICS that have now been expanded
        facility_NAICS = facility_NAICS[
            ~facility_NAICS['NAICS'].isin(sector_list)]

        # append new naics to df
        facility_NAICS = pd.concat([facility_NAICS, new_naics], sort=True)

    return facility_NAICS
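# A minimal sketch of the unique-child expansion, using made-up codes: if a
# 4-digit code '1234' had exactly one 5-digit child '12345' in the crosswalk,
# a facility assigned '1234' would be reassigned '12345' on the first pass,
# and then to a unique 6-digit child, if one exists, on the second pass.
#
#   facilities = pd.DataFrame({'FacilityID': ['F1'], 'NAICS': ['1234']})
#   facilities = naics_expansion(facilities)
#   # facilities['NAICS'] -> ['12345'] (and possibly 6 digits after pass 2)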
def sector_disaggregation_generalized(fbs):
    """
    Disaggregate sectors if there is only one NAICS code at the next
    (more granular) level; works for NAICS 4 and lower
    :param fbs: an FBS df with a single 'Sector' column
    :return: an FBS df with rows added for missing naics5 and naics6
    """
    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk()
    cw = cw_load[['NAICS_4', 'NAICS_5', 'NAICS_6']]

    # subset the naics 4 and 5 columns to NAICS_4 codes with a single
    # NAICS_5 child (dropping duplicate pairs first)
    cw4 = cw_load[['NAICS_4', 'NAICS_5']].drop_duplicates()
    cw4 = cw4.drop_duplicates(subset=['NAICS_4'],
                              keep=False).reset_index(drop=True)
    naics4 = cw4['NAICS_4'].values.tolist()

    # subset the naics 5 and 6 columns
    cw5 = cw_load[['NAICS_5', 'NAICS_6']].drop_duplicates()
    cw5 = cw5.drop_duplicates(subset=['NAICS_5'],
                              keep=False).reset_index(drop=True)
    naics5 = cw5['NAICS_5'].values.tolist()

    # loop through NAICS lengths 4 and 5, appending the missing, more
    # granular naics levels to the df
    for i in range(4, 6):
        if i == 4:
            sector_list = naics4
            sector_merge = "NAICS_4"
            sector_add = "NAICS_5"
        elif i == 5:
            sector_list = naics5
            sector_merge = "NAICS_5"
            sector_add = "NAICS_6"

        # subset df to sectors with length = i and length = i + 1
        df_subset = fbs.loc[fbs['Sector'].apply(
            lambda x: i + 1 >= len(x) >= i)]
        # create a new temp column that is length i
        df_subset = df_subset.assign(
            Sector_tmp=df_subset['Sector'].apply(lambda x: x[0:i]))
        # subset the df to the rows where the tmp sector column is in the
        # naics list
        df_subset = df_subset.loc[df_subset['Sector_tmp'].isin(sector_list)]
        # drop all rows with duplicate temp values, as a less aggregated
        # naics already exists
        df_subset = df_subset.drop_duplicates(
            subset=['FlowName', 'Compartment', 'Location', 'Sector_tmp'],
            keep=False).reset_index(drop=True)
        # merge the naics cw (dropping duplicate pairs to avoid duplicating
        # rows in the merge)
        new_naics = pd.merge(df_subset,
                             cw[[sector_merge, sector_add]].drop_duplicates(),
                             how='left', left_on=['Sector_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "ST"})
        new_naics = new_naics.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics = new_naics.drop(columns=["Sector", "Sector_tmp"])
        new_naics = new_naics.rename(columns={"ST": "Sector"})

        # append new naics to df
        fbs = pd.concat([fbs, new_naics], sort=True)

    return fbs
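# Sketch of the crosswalk filtering used above (hypothetical codes): after
# dropping duplicate parent/child pairs, drop_duplicates(keep=False) retains
# only parents with a single child, i.e. the sectors safe to disaggregate.
#
#   cw = pd.DataFrame({'NAICS_4': ['1111', '1111', '2222'],
#                      'NAICS_5': ['11111', '11112', '22221']})
#   cw = cw.drop_duplicates().drop_duplicates(subset=['NAICS_4'], keep=False)
#   # only ('2222', '22221') remains, so '2222' can be expanded to '22221'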
def get_sector_list(sector_level):
    """
    Create a list of unique sectors at the specified NAICS level
    :param sector_level: str, a sector length column name, e.g. 'NAICS_4'
    :return: list of unique sector codes at that level
    """
    cw = load_sector_length_crosswalk()
    sector_list = cw[sector_level].unique().tolist()

    return sector_list
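# Usage sketch (output abbreviated and illustrative):
#
#   get_sector_list('NAICS_2')
#   # -> ['11', '21', '22', '23', ...]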
def allocate_dropped_sector_data(df_load, target_sector_level):
    """
    Determine the rows of data that would be lost if the data were subset
    to the target sector level, and equally allocate parent NAICS to child
    NAICS where the child NAICS are missing
    :param df_load: df, FBS format
    :param target_sector_level: str, target NAICS level for FBS output
    :return: df, with all child NAICS at target sector level
    """
    # exclude nonsectors
    df = replace_NoneType_with_empty_cells(df_load)

    rows_lost = pd.DataFrame()
    for i in range(2, sector_level_key[target_sector_level]):
        # create df of i length
        df_x1 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i)) &
            (df[fbs_activity_fields[1]] == '')]
        df_x2 = df.loc[
            (df[fbs_activity_fields[0]] == '') &
            (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x3 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i)) &
            (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x = pd.concat([df_x1, df_x2, df_x3], ignore_index=True, sort=False)

        # create df of i + 1 length
        df_y1 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1) |
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y2 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1) &
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y = pd.concat([df_y1, df_y2], ignore_index=True, sort=False)

        # create temp sector columns in df y, that are i digits in length
        df_y.loc[:, 'spb_tmp'] = df_y[fbs_activity_fields[0]].apply(
            lambda x: x[0:i])
        df_y.loc[:, 'scb_tmp'] = df_y[fbs_activity_fields[1]].apply(
            lambda x: x[0:i])
        # don't modify household sector lengths or gov't transport
        df_y = df_y.replace({'F0': 'F010', 'F01': 'F010'})

        # merge the two dfs
        df_m = pd.merge(df_x,
                        df_y[['Class', 'Context', 'FlowType', 'Flowable',
                              'Location', 'LocationSystem', 'Unit', 'Year',
                              'spb_tmp', 'scb_tmp']],
                        how='left',
                        left_on=['Class', 'Context', 'FlowType', 'Flowable',
                                 'Location', 'LocationSystem', 'Unit', 'Year',
                                 'SectorProducedBy', 'SectorConsumedBy'],
                        right_on=['Class', 'Context', 'FlowType', 'Flowable',
                                  'Location', 'LocationSystem', 'Unit',
                                  'Year', 'spb_tmp', 'scb_tmp'])

        # extract the rows that are not disaggregated to more specific naics
        rl = df_m[(df_m['scb_tmp'].isnull()) &
                  (df_m['spb_tmp'].isnull())].reset_index(drop=True)
        # clean df
        rl = replace_strings_with_NoneType(rl)
        rl_list = rl[['SectorProducedBy',
                      'SectorConsumedBy']].drop_duplicates().values.tolist()

        # match sectors with target sector length sectors
        # import cw and subset to current sector length
        # and target sector length
        cw_load = load_sector_length_crosswalk()
        nlength = list(sector_level_key.keys())[
            list(sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & conditionally replace sector produced/consumed columns
        rl_m = pd.merge(rl, cw, how='left',
                        left_on=[fbs_activity_fields[0]], right_on=[nlength])
        rl_m.loc[rl_m[fbs_activity_fields[0]] != '',
                 fbs_activity_fields[0]] = rl_m[target_sector_level]
        rl_m = rl_m.drop(columns=[nlength, target_sector_level])

        rl_m2 = pd.merge(rl_m, cw, how='left',
                         left_on=[fbs_activity_fields[1]], right_on=[nlength])
        rl_m2.loc[rl_m2[fbs_activity_fields[1]] != '',
                  fbs_activity_fields[1]] = rl_m2[target_sector_level]
        rl_m2 = rl_m2.drop(columns=[nlength, target_sector_level])

        # create one sector count column
        rl_m2['sector_count_x'] = rl_m2['sector_count_x'].fillna(
            rl_m2['sector_count_y'])
        rl_m3 = rl_m2.rename(columns={'sector_count_x': 'sector_count'})
        rl_m3 = rl_m3.drop(columns=['sector_count_y'])

        # calculate new flow amounts, based on sector count,
        # allocating equally to the new sector length codes
        rl_m3['FlowAmount'] = rl_m3['FlowAmount'] / rl_m3['sector_count']
        rl_m3 = rl_m3.drop(columns=['sector_count'])

        # append to df
        if len(rl) != 0:
            vLogDetailed.warning('Data found at %s digit NAICS not '
                                 'represented in current data subset: %s',
                                 str(i), ' '.join(map(str, rl_list)))
            rows_lost = pd.concat([rows_lost, rl_m3], ignore_index=True)

    if len(rows_lost) != 0:
        vLogDetailed.info('Allocating FlowAmounts equally to each %s '
                          'associated with the sectors previously dropped',
                          target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_w_lost_data = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_w_lost_data = replace_strings_with_NoneType(df_w_lost_data)

    return df_w_lost_data
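# Worked example of the equal allocation above (hypothetical codes and
# amounts): if target_sector_level is 'NAICS_4' and parent '311' (NAICS_3)
# has three NAICS_4 children in the crosswalk, sector_count is 3 and a
# FlowAmount of 90 on '311' becomes 30 on each child:
#
#   FlowAmount_child = FlowAmount_parent / sector_count  # 90 / 3 = 30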
def sector_disaggregation(df):
    """
    Disaggregate sectors if there is only one NAICS code at the next
    (more granular) level; works for NAICS 4 and lower
    :param df: an FBS df
    :return: an FBS df with rows added for missing naics5 and naics6
    """
    # TODO: modify so this works with either an FBA with sectors or an FBS,
    #  because it is called in a function that accepts either

    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk()
    cw = cw_load[['NAICS_4', 'NAICS_5', 'NAICS_6']]

    # subset the naics 4 and 5 columns to NAICS_4 codes with a single
    # NAICS_5 child (dropping duplicate pairs first)
    cw4 = cw_load[['NAICS_4', 'NAICS_5']].drop_duplicates()
    cw4 = cw4.drop_duplicates(subset=['NAICS_4'],
                              keep=False).reset_index(drop=True)
    naics4 = cw4['NAICS_4'].values.tolist()

    # subset the naics 5 and 6 columns
    cw5 = cw_load[['NAICS_5', 'NAICS_6']].drop_duplicates()
    cw5 = cw5.drop_duplicates(subset=['NAICS_5'],
                              keep=False).reset_index(drop=True)
    naics5 = cw5['NAICS_5'].values.tolist()

    # loop through NAICS lengths 4 and 5, appending the missing, more
    # granular naics levels to the df
    for i in range(4, 6):
        df = clean_df(df, flow_by_sector_fields, fbs_fill_na_dict)

        if i == 4:
            sector_list = naics4
            sector_merge = "NAICS_4"
            sector_add = "NAICS_5"
        elif i == 5:
            sector_list = naics5
            sector_merge = "NAICS_5"
            sector_add = "NAICS_6"

        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: i + 1 >= len(x) >= i) |
            df[fbs_activity_fields[1]].apply(lambda x: i + 1 >= len(x) >= i)]
        # create new columns that are length i
        df_subset = df_subset.assign(
            SectorProduced_tmp=df_subset[fbs_activity_fields[0]].apply(
                lambda x: x[0:i]),
            SectorConsumed_tmp=df_subset[fbs_activity_fields[1]].apply(
                lambda x: x[0:i]))
        # subset the df to the rows where at least one tmp sector column is
        # in the naics list (a single subset; concatenating an additional
        # "and" subset would duplicate those rows and cause the keep=False
        # dedup below to drop them entirely)
        df_subset = df_subset.loc[
            (df_subset['SectorProduced_tmp'].isin(sector_list)) |
            (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        # drop all rows with duplicate temp values, as a less aggregated
        # naics already exists
        df_subset = df_subset.drop_duplicates(
            subset=['Flowable', 'Context', 'Location',
                    'SectorProduced_tmp', 'SectorConsumed_tmp'],
            keep=False).reset_index(drop=True)

        # merge the naics cw (dropping duplicate pairs to avoid duplicating
        # rows in the merge)
        new_naics = pd.merge(df_subset,
                             cw[[sector_merge, sector_add]].drop_duplicates(),
                             how='left', left_on=['SectorProduced_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SPB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        new_naics = pd.merge(new_naics,
                             cw[[sector_merge, sector_add]].drop_duplicates(),
                             how='left', left_on=['SectorConsumed_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SCB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics = new_naics.drop(
            columns=["SectorProducedBy", "SectorConsumedBy",
                     "SectorProduced_tmp", "SectorConsumed_tmp"])
        new_naics = new_naics.rename(
            columns={"SPB": "SectorProducedBy", "SCB": "SectorConsumedBy"})

        # append new naics to df
        df = pd.concat([df, new_naics], sort=True)

    return df
def sector_disaggregation(df):
    """
    Disaggregate sectors if there is only one NAICS code at the next
    (more granular) level; works for NAICS 4 and lower
    :param df: an FBS df, must have sector columns
    :return: an FBS df with values for the missing naics5 and naics6
    """
    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df)

    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk()

    # loop from the minimum sector length to 6 digits, where the minimum
    # length cannot be less than 2
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]]].apply(
        lambda x: x.str.len()).min().min()
    if length < 2:
        length = 2
    # appends missing naics levels to df
    for i in range(length, 6):
        sector_merge = 'NAICS_' + str(i)
        sector_add = 'NAICS_' + str(i + 1)

        # subset the crosswalk by naics length; drop duplicate pairs first
        # so a parent is not discarded merely because it appears on
        # multiple crosswalk rows
        cw = cw_load[[sector_merge, sector_add]].drop_duplicates()
        # only keep the rows where there is only one value in sector_add
        # for a value in sector_merge
        cw = cw.drop_duplicates(subset=[sector_merge],
                                keep=False).reset_index(drop=True)
        sector_list = cw[sector_merge].values.tolist()

        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: i + 1 >= len(x) >= i) |
            df[fbs_activity_fields[1]].apply(lambda x: i + 1 >= len(x) >= i)]
        # create new columns that are length i
        df_subset = df_subset.assign(
            SectorProduced_tmp=df_subset[fbs_activity_fields[0]].apply(
                lambda x: x[0:i]))
        df_subset = df_subset.assign(
            SectorConsumed_tmp=df_subset[fbs_activity_fields[1]].apply(
                lambda x: x[0:i]))
        # subset the df to the rows where the tmp sector columns are in
        # the naics list
        df_subset_1 = df_subset.loc[
            (df_subset['SectorProduced_tmp'].isin(sector_list)) &
            (df_subset['SectorConsumed_tmp'] == "")]
        df_subset_2 = df_subset.loc[
            (df_subset['SectorProduced_tmp'] == "") &
            (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        df_subset_3 = df_subset.loc[
            (df_subset['SectorProduced_tmp'].isin(sector_list)) &
            (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        # concat existing dfs
        df_subset = pd.concat([df_subset_1, df_subset_2, df_subset_3],
                              sort=False)

        # drop all rows with duplicate temp values, as a less aggregated
        # naics already exists
        # column headers used to identify duplicates, if they exist in
        # the df
        possible_column_headers = ('Flowable', 'FlowName', 'Unit', 'Context',
                                   'Compartment', 'Location', 'Year',
                                   'SectorProduced_tmp',
                                   'SectorConsumed_tmp')
        # list of column headers that do exist in the df being subset
        cols_to_drop = [e for e in possible_column_headers
                        if e in df_subset.columns.values.tolist()]

        df_subset = df_subset.drop_duplicates(
            subset=cols_to_drop, keep=False).reset_index(drop=True)

        # merge the naics cw
        new_naics = pd.merge(df_subset, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorProduced_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SPB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        new_naics = pd.merge(new_naics, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorConsumed_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SCB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics = new_naics.drop(
            columns=["SectorProducedBy", "SectorConsumedBy",
                     "SectorProduced_tmp", "SectorConsumed_tmp"])
        new_naics = new_naics.rename(
            columns={"SPB": "SectorProducedBy", "SCB": "SectorConsumedBy"})

        # append new naics to df
        new_naics['SectorConsumedBy'] = \
            new_naics['SectorConsumedBy'].replace({np.nan: ""})
        new_naics['SectorProducedBy'] = \
            new_naics['SectorProducedBy'].replace({np.nan: ""})
        new_naics = replace_NoneType_with_empty_cells(new_naics)
        df = pd.concat([df, new_naics], sort=True)

    # replace blank strings with None
    df = replace_strings_with_NoneType(df)

    return df
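# A minimal before/after sketch of sector_disaggregation (hypothetical
# codes): a flow reported only at a parent NAICS whose child is unique in
# the crosswalk gains an equivalent row at the child code, so downstream
# subsetting at a longer NAICS length does not drop the flow.
#
#   Before:  SectorProducedBy='12345',  FlowAmount=10.0
#   After:   SectorProducedBy='12345',  FlowAmount=10.0
#            SectorProducedBy='123456', FlowAmount=10.0   (sole child)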
def main(method_name):
    """
    Creates a flowbysector dataset
    :param method_name: Name of method corresponding to flowbysector
                        method yaml name
    :return: flowbysector
    """
    log.info("Initiating flowbysector creation for " + method_name)
    # call on method
    method = load_method(method_name)
    # create dictionary of water data and allocation datasets
    fbas = method['flowbyactivity_sources']
    # Create empty list for storing fbs files
    fbss = []
    for k, v in fbas.items():
        # pull water data for allocation
        log.info("Retrieving flowbyactivity for datasource " + k +
                 " in year " + str(v['year']))
        flows = flowsa.getFlowByActivity(flowclass=[v['class']],
                                         years=[v['year']],
                                         datasource=k)

        # if necessary, standardize names in data set
        if v['activity_name_standardization_fxn'] != 'None':
            log.info("Standardizing activity names in " + k)
            flows = getattr(sys.modules[__name__],
                            v['activity_name_standardization_fxn'])(flows)

        # drop description field
        flows = flows.drop(columns='Description')
        # fill null values
        flows = flows.fillna(value=fba_fill_na_dict)
        # map df to elementary flows - commented out until mapping complete
        # log.info("Mapping flows in " + k +
        #          ' to federal elementary flow list')
        # flows_mapped = map_elementary_flows(flows, k)
        # convert unit todo: think about unit conversion here
        log.info("Converting units in " + k)
        flows = convert_unit(flows)

        # create dictionary of allocation datasets for different activities
        activities = v['activity_sets']
        for aset, attr in activities.items():
            # subset by named activities
            names = [attr['names']]
            log.info("Preparing to handle subset of flownames " +
                     ', '.join(map(str, names)) + " in " + k)
            # subset usgs data by activity
            flow_subset = flows[
                (flows[fba_activity_fields[0]].isin(names)) |
                (flows[fba_activity_fields[1]].isin(names))]

            # Reset index values after subset
            flow_subset = flow_subset.reset_index(drop=True)

            # check if flowbyactivity data exists at specified geoscale
            log.info("Checking if flowbyactivity data exists for " +
                     ', '.join(map(str, names)) + " at the " +
                     v['geoscale_to_use'] + ' level')
            geocheck = check_if_data_exists_at_geoscale(
                flow_subset, names, v['geoscale_to_use'])
            # aggregate geographically to the scale of the allocation dataset
            if geocheck == "Yes":
                activity_from_scale = v['geoscale_to_use']
            else:
                # if activity does not exist at specified geoscale, issue
                # warning and use data at a less aggregated geoscale,
                # then sum to the specified geoscale
                log.info("Checking if flowbyactivity data exists for " +
                         ', '.join(map(str, names)) +
                         " at a less aggregated level")
                new_geoscale_to_use = \
                    check_if_data_exists_at_less_aggregated_geoscale(
                        flow_subset, names, v['geoscale_to_use'])
                activity_from_scale = new_geoscale_to_use

            activity_to_scale = attr['allocation_from_scale']
            # if usgs is less aggregated than allocation df, aggregate
            # usgs activity to target scale
            if fips_number_key[activity_from_scale] > \
                    fips_number_key[activity_to_scale]:
                log.info("Aggregating subset from " + activity_from_scale +
                         " to " + activity_to_scale)
                flow_subset = agg_by_geoscale(
                    flow_subset, activity_from_scale, activity_to_scale,
                    fba_default_grouping_fields, names)
            # else, aggregate to the geoscale want to use
            elif fips_number_key[activity_from_scale] > \
                    fips_number_key[v['geoscale_to_use']]:
                log.info("Aggregating subset from " + activity_from_scale +
                         " to " + v['geoscale_to_use'])
                flow_subset = agg_by_geoscale(
                    flow_subset, activity_from_scale, v['geoscale_to_use'],
                    fba_default_grouping_fields, names)
            # else, if usgs is more aggregated than allocation table,
            # filter relevant rows
            else:
                log.info("Filtering out " + activity_from_scale + " data")
                flow_subset = filter_by_geoscale(
                    flow_subset, activity_from_scale, names)

            # location column pad zeros if necessary
            flow_subset['Location'] = flow_subset['Location'].apply(
                lambda x: x.ljust(3 + len(x), '0') if len(x) < 5 else x)

            # Add sectors to usgs activity, creating two versions of the
            # flow subset: "flow_subset_wsec" is the most disaggregated
            # version of the Sectors (NAICS); "flow_subset_wsec_agg"
            # includes only the most aggregated level of sectors
            log.info("Adding sectors to " + k + " for " +
                     ', '.join(map(str, names)))
            flow_subset_wsec = add_sectors_to_flowbyactivity(
                flow_subset,
                sectorsourcename=method['target_sector_source'])
            flow_subset_wsec_agg = add_sectors_to_flowbyactivity(
                flow_subset,
                sectorsourcename=method['target_sector_source'],
                levelofSectoragg='agg')

            # if allocation method is "direct", then no need to create
            # allocation ratios; else need to use the allocation dataframe
            # to create sector allocation ratios
            if attr['allocation_method'] == 'direct':
                fbs = flow_subset_wsec_agg.copy()
            else:
                # determine appropriate allocation dataset
                log.info("Loading allocation flowbyactivity " +
                         attr['allocation_source'] + " for year " +
                         str(attr['allocation_source_year']))
                fba_allocation = flowsa.getFlowByActivity(
                    flowclass=[attr['allocation_source_class']],
                    datasource=attr['allocation_source'],
                    years=[attr['allocation_source_year']]
                ).reset_index(drop=True)

                # fill null values
                fba_allocation = fba_allocation.fillna(
                    value=fba_fill_na_dict)
                # convert unit
                fba_allocation = convert_unit(fba_allocation)

                # subset based on yaml settings
                if attr['allocation_flow'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['FlowName'].isin(
                            attr['allocation_flow'])]
                if attr['allocation_compartment'] != 'None':
                    fba_allocation = fba_allocation.loc[
                        fba_allocation['Compartment'].isin(
                            attr['allocation_compartment'])]
                # reset index
                fba_allocation = fba_allocation.reset_index(drop=True)

                # check if allocation data exists at specified geoscale
                log.info("Checking if allocation data exists for " +
                         ', '.join(map(str, names)) + " at the " +
                         attr['allocation_from_scale'] + " level")
                check_if_data_exists_at_geoscale(
                    fba_allocation, names, attr['allocation_from_scale'])

                # aggregate geographically to the scale of the
                # flowbyactivity source, if necessary
                from_scale = attr['allocation_from_scale']
                to_scale = v['geoscale_to_use']
                # if allocation df is less aggregated than FBA df,
                # aggregate allocation df to target scale
                if fips_number_key[from_scale] > fips_number_key[to_scale]:
                    fba_allocation = agg_by_geoscale(
                        fba_allocation, from_scale, to_scale,
                        fba_default_grouping_fields, names)
                # else, if usgs is more aggregated than allocation table,
                # use usgs as both to and from scale
                else:
                    fba_allocation = filter_by_geoscale(
                        fba_allocation, from_scale, names)

                # assign sector to allocation dataset
                log.info("Adding sectors to " + attr['allocation_source'])
                fba_allocation = add_sectors_to_flowbyactivity(
                    fba_allocation,
                    sectorsourcename=method['target_sector_source'],
                    levelofSectoragg=attr['allocation_sector_aggregation'])
                # subset fba datasets to only keep the naics associated
                # with the usgs activity subset
                log.info("Subsetting " + attr['allocation_source'] +
                         " for sectors in " + k)
                fba_allocation_subset = get_fba_allocation_subset(
                    fba_allocation, k, names)
                # Reset index values after subset
                fba_allocation_subset = \
                    fba_allocation_subset.reset_index(drop=True)
                # generalize activity field names to enable link to water
                # withdrawal table
                log.info("Generalizing activity names in subset of " +
                         attr['allocation_source'])
                fba_allocation_subset = generalize_activity_field_names(
                    fba_allocation_subset)
                # drop columns
                fba_allocation_subset = fba_allocation_subset.drop(
                    columns=['Activity'])

                # if there is an allocation helper dataset, modify
                # the allocation df
                if attr['allocation_helper'] == 'yes':
                    log.info("Using the specified allocation helper for "
                             "subset of " + attr['allocation_source'])
                    fba_allocation_subset = allocation_helper(
                        fba_allocation_subset, method, attr)

                # create flow allocation ratios
                log.info("Creating allocation ratios for " +
                         attr['allocation_source'])
                flow_allocation = allocate_by_sector(
                    fba_allocation_subset, attr['allocation_method'])

                # create list of sectors in the flow allocation df, drop
                # any rows of data in the flow df that aren't in the list
                sector_list = flow_allocation['Sector'].unique().tolist()

                # subset fba allocation table to the values in the
                # activity list, based on overlapping sectors
                flow_subset_wsec = flow_subset_wsec.loc[
                    (flow_subset_wsec[fbs_activity_fields[0]]
                     .isin(sector_list)) |
                    (flow_subset_wsec[fbs_activity_fields[1]]
                     .isin(sector_list))]

                # check if fba and allocation dfs have the same
                # LocationSystem
                log.info("Checking if flowbyactivity and allocation "
                         "dataframes use the same location systems")
                check_if_location_systems_match(flow_subset_wsec,
                                                flow_allocation)

                # merge water withdrawal df w/flow allocation dataset
                log.info("Merge " + k + " and subset of " +
                         attr['allocation_source'])
                fbs = flow_subset_wsec.merge(
                    flow_allocation[['Location', 'LocationSystem',
                                     'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem',
                             'SectorProducedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'],
                    how='left')

                fbs = fbs.merge(
                    flow_allocation[['Location', 'LocationSystem',
                                     'Sector', 'FlowAmountRatio']],
                    left_on=['Location', 'LocationSystem',
                             'SectorConsumedBy'],
                    right_on=['Location', 'LocationSystem', 'Sector'],
                    how='left')

                # drop rows where both sector produced/consumed by in the
                # flow allocation df are null
                fbs = fbs.dropna(subset=['Sector_x', 'Sector_y'],
                                 how='all').reset_index(drop=True)

                # merge the FlowAmountRatio columns
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio_x'].fillna(
                    fbs['FlowAmountRatio_y'])
                fbs['FlowAmountRatio'] = fbs['FlowAmountRatio'].fillna(0)

                # calculate flow amounts for each sector
                log.info("Calculating new flow amounts using flow ratios")
                fbs['FlowAmount'] = fbs['FlowAmount'] * \
                    fbs['FlowAmountRatio']

                # drop columns
                log.info("Cleaning up new flow by sector")
                fbs = fbs.drop(columns=['Sector_x', 'FlowAmountRatio_x',
                                        'Sector_y', 'FlowAmountRatio_y',
                                        'FlowAmountRatio',
                                        'ActivityProducedBy',
                                        'ActivityConsumedBy'])

            # rename flow name to flowable
            fbs = fbs.rename(columns={"FlowName": 'Flowable',
                                      "Compartment": "Context"})

            # drop rows where flowamount = 0 (although this includes
            # dropping suppressed data)
            fbs = fbs[fbs['FlowAmount'] != 0].reset_index(drop=True)
            # add missing data columns
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)
            # fill null values
            fbs = fbs.fillna(value=fbs_fill_na_dict)

            # aggregate df geographically, if necessary
            log.info("Aggregating flowbysector to " +
                     method['target_geoscale'] + " level")
            if fips_number_key[v['geoscale_to_use']] < \
                    fips_number_key[attr['allocation_from_scale']]:
                from_scale = v['geoscale_to_use']
            else:
                from_scale = attr['allocation_from_scale']

            to_scale = method['target_geoscale']

            fbs = agg_by_geoscale(fbs, from_scale, to_scale,
                                  fbs_default_grouping_fields, names)

            # aggregate data to every sector level
            log.info("Aggregating flowbysector to " +
                     method['target_sector_level'])
            fbs = sector_aggregation(fbs, fbs_default_grouping_fields)

            # test agg by sector
            sector_agg_comparison = sector_flow_comparision(fbs)

            # return sector level specified in method yaml
            # load the crosswalk linking sector lengths
            cw = load_sector_length_crosswalk()
            sector_list = \
                cw[method['target_sector_level']].unique().tolist()

            # add any non-NAICS sectors used with NAICS
            household = load_household_sector_codes()
            household = household.loc[
                household['NAICS_Level_to_Use_For'] ==
                method['target_sector_level']]
            # add household sector to sector list
            sector_list.extend(household['Code'].tolist())
            # subset df
            fbs = fbs.loc[
                (fbs[fbs_activity_fields[0]].isin(sector_list)) |
                (fbs[fbs_activity_fields[1]].isin(sector_list))
            ].reset_index(drop=True)

            # add any missing columns of data and cast to appropriate
            # data type
            fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

            log.info("Completed flowbysector for activity subset with "
                     "flows " + ', '.join(map(str, names)))
            fbss.append(fbs)
    # create single df of all activities
    fbss = pd.concat(fbss, ignore_index=True, sort=False)
    # aggregate df as activities might have data for the same specified
    # sector length
    fbss = aggregator(fbss, fbs_default_grouping_fields)
    # sort df
    fbss = fbss.sort_values(
        ['SectorProducedBy', 'SectorConsumedBy',
         'Flowable', 'Context']).reset_index(drop=True)
    # save parquet file
    store_flowbysector(fbss, method_name)
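# Minimal usage sketch for main. The method name below is illustrative; it
# must match a flowbysector method yaml available to flowsa.
#
#   main('Water_national_2015_m1')
#   # builds the FBS for every activity set in the method yaml and stores
#   # the result as a parquet file via store_flowbysector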
def check_for_missing_sector_data(df, target_sector_level):
    """
    Modeled after datachecks.py check_if_losing_sector_data
    Allocates flow amount equally across child NAICS when parent NAICS is
    not at target_sector_level
    :param df: df, FBS format
    :param target_sector_level: str, target NAICS level for FBS output
    :return: df with all sectors at target_sector_level
    """
    from flowsa.flowbyfunctions import replace_NoneType_with_empty_cells, \
        replace_strings_with_NoneType

    # temporarily replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    activity_field = "SectorProducedBy"
    rows_lost = pd.DataFrame()
    cw_load = load_sector_length_crosswalk()
    for i in range(3, sector_level_key[target_sector_level]):
        # create df of i length
        df_subset = df.loc[df[activity_field].apply(lambda x: len(x) == i)]

        # import cw and subset to current sector length and target
        # sector length
        nlength = list(sector_level_key.keys())[
            list(sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & replace sector produced columns
        df_x = pd.merge(df_subset, cw, how='left',
                        left_on=[activity_field], right_on=[nlength])
        df_x[activity_field] = df_x[target_sector_level]
        df_x = df_x.drop(columns=[nlength, target_sector_level])

        # calculate new flow amounts, based on sector count, allocating
        # equally to the new sector length codes
        df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
        df_x = df_x.drop(columns=['sector_count'])
        # replace null values with empty cells
        df_x = replace_NoneType_with_empty_cells(df_x)

        # append to df
        sector_list = df_subset[activity_field].drop_duplicates()
        if len(df_x) != 0:
            log.warning('Data found at %s digit NAICS to be allocated: %s',
                        str(i), ' '.join(map(str, sector_list)))
            rows_lost = pd.concat([rows_lost, df_x],
                                  ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('No data loss from NAICS in dataframe')
    else:
        log.info('Allocating FlowAmounts equally to each ' +
                 target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_allocated = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_allocated = df_allocated.loc[df_allocated[activity_field].apply(
        lambda x: len(x) == sector_level_key[target_sector_level])]
    df_allocated.reset_index(drop=True, inplace=True)

    # replace empty cells with NoneType (if dtype is object)
    df_allocated = replace_strings_with_NoneType(df_allocated)

    return df_allocated
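# Minimal usage sketch (hypothetical FBS df; sector_level_key maps e.g.
# 'NAICS_6' -> 6). A parent code shorter than the target level is replaced
# by its children with the flow split equally among them:
#
#   fbs = check_for_missing_sector_data(fbs, 'NAICS_6')
#   # a row at a 3-digit NAICS with FlowAmount 90 and three 6-digit
#   # children in the crosswalk becomes three rows with FlowAmount 30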