def compare_FBS_results(fbs1_load, fbs2_load, ignore_metasources=False):
    """
    Compare a parquet on Data Commons to a parquet stored locally
    :param fbs1_load: df, fbs format
    :param fbs2_load: df, fbs format
    :param ignore_metasources: bool, True to compare fbs without
        matching metasources
    :return: df, comparison of the two dfs
    """
    import flowsa

    # load first file (must be saved locally)
    df1 = flowsa.getFlowBySector(fbs1_load).rename(
        columns={'FlowAmount': 'FlowAmount_fbs1'})
    df1 = replace_strings_with_NoneType(df1)
    # load second file (must be saved locally)
    df2 = flowsa.getFlowBySector(fbs2_load).rename(
        columns={'FlowAmount': 'FlowAmount_fbs2'})
    df2 = replace_strings_with_NoneType(df2)
    # compare df
    merge_cols = ['Flowable', 'Class', 'SectorProducedBy',
                  'SectorConsumedBy', 'SectorSourceName', 'Context',
                  'Location', 'LocationSystem', 'Unit', 'FlowType',
                  'Year', 'MetaSources']
    if ignore_metasources:
        merge_cols.remove('MetaSources')
    # check units
    compare_df_units(df1, df2)
    df_m = pd.merge(df1[merge_cols + ['FlowAmount_fbs1']],
                    df2[merge_cols + ['FlowAmount_fbs2']],
                    how='outer')
    df_m = df_m.assign(
        FlowAmount_diff=df_m['FlowAmount_fbs2'] - df_m['FlowAmount_fbs1'])
    df_m = df_m.assign(
        Percent_Diff=(df_m['FlowAmount_diff'] /
                      df_m['FlowAmount_fbs1']) * 100)
    df_m = df_m[df_m['FlowAmount_diff'].apply(
        lambda x: round(abs(x), 2) != 0)].reset_index(drop=True)
    # if no differences, print; if differences, provide df subset
    if len(df_m) == 0:
        vLog.debug('No differences between dataframes')
    else:
        vLog.debug('Differences exist between dataframes')
        df_m = df_m.sort_values(['Location', 'SectorProducedBy',
                                 'SectorConsumedBy', 'Flowable',
                                 'Context']).reset_index(drop=True)

    return df_m
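# Hedged usage sketch for compare_FBS_results: the method names below are
# illustrative placeholders, and both FBS parquets must already be available
# locally (or downloadable) for flowsa.getFlowBySector to succeed.
def _example_compare_fbs_results():
    # compare two hypothetical FBS method outputs, ignoring MetaSources
    df_m = compare_FBS_results('Water_national_2015_m1',
                               'Water_national_2015_m2',
                               ignore_metasources=True)
    # rows returned are those where rounded FlowAmounts differ
    return df_m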
def proportional_allocation_by_location(df):
    """
    Creates a proportional allocation based on all the most
    aggregated sectors within a location
    Ensure that sectors are at 2 digit level - can run
    sector_aggregation() prior to using this function
    :param df: df, includes sector columns
    :return: df, with 'FlowAmountRatio' column
    """
    # tmp drop NoneType
    df = replace_NoneType_with_empty_cells(df)

    # find the shortest length sector
    denom_df = df.loc[(df['SectorProducedBy'].apply(lambda x: len(x) == 2)) |
                      (df['SectorConsumedBy'].apply(lambda x: len(x) == 2))]
    denom_df = denom_df.assign(Denominator=denom_df['FlowAmount'].groupby(
        denom_df['Location']).transform('sum'))
    denom_df_2 = denom_df[['Location', 'LocationSystem', 'Year',
                           'Denominator']].drop_duplicates()
    # merge the denominator column with fba_w_sector df
    allocation_df = df.merge(denom_df_2, how='left')
    # calculate ratio
    allocation_df.loc[:, 'FlowAmountRatio'] = \
        allocation_df['FlowAmount'] / allocation_df['Denominator']
    allocation_df = allocation_df.drop(
        columns=['Denominator']).reset_index()
    # add nonetypes
    allocation_df = replace_strings_with_NoneType(allocation_df)

    return allocation_df
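# A minimal sketch of proportional_allocation_by_location on toy data;
# assumes pandas is available as pd and the flowsa helpers used above are
# importable. All column values are illustrative.
def _example_proportional_allocation_by_location():
    import pandas as pd
    df = pd.DataFrame({
        'SectorProducedBy': ['11', '111', '112'],
        'SectorConsumedBy': ['', '', ''],
        'Location': ['00000'] * 3,
        'LocationSystem': ['FIPS_2015'] * 3,
        'Year': ['2015'] * 3,
        'FlowAmount': [100.0, 60.0, 40.0]})
    # the 2-digit sector flow (100) is the denominator, so the expected
    # FlowAmountRatio values are 1.0, 0.6, and 0.4
    return proportional_allocation_by_location(df)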
def collapse_activity_fields(df):
    """
    The 'activityconsumedby' and 'activityproducedby' columns from the
    allocation dataset do not always align with the dataframe being
    allocated. Generalize the allocation activity column.
    :param df: df, FBA used to allocate another FBA
    :return: df, single Activity column
    """
    df = replace_strings_with_NoneType(df)

    activity_consumed_list = \
        df['ActivityConsumedBy'].drop_duplicates().values.tolist()
    activity_produced_list = \
        df['ActivityProducedBy'].drop_duplicates().values.tolist()

    # if an activity field column is all 'none', drop the column and
    # rename the remaining activity columns to generalize
    if all(v is None for v in activity_consumed_list):
        df = df.drop(columns=['ActivityConsumedBy', 'SectorConsumedBy'])
        df = df.rename(columns={'ActivityProducedBy': 'Activity',
                                'SectorProducedBy': 'Sector'})
    elif all(v is None for v in activity_produced_list):
        df = df.drop(columns=['ActivityProducedBy', 'SectorProducedBy'])
        df = df.rename(columns={'ActivityConsumedBy': 'Activity',
                                'SectorConsumedBy': 'Sector'})
    else:
        log.error('Cannot generalize dataframe')

    # drop other columns
    df = df.drop(columns=['ProducedBySectorType', 'ConsumedBySectorType'])

    return df
def equal_allocation(fba_load):
    """
    Allocate an Activity in a FBA equally to all mapped sectors.
    Function only works if all mapped sectors are the same length
    :param fba_load: df, FBA with activity columns mapped to sectors
    :return: df, with FlowAmount equally allocated to all mapped sectors
    """
    # create groupby cols by which to determine allocation
    fba_cols = fba_load.select_dtypes([object]).columns.to_list()
    groupcols = [e for e in fba_cols if e not in
                 ['SectorProducedBy', 'SectorConsumedBy', 'Description']]
    # create counts of rows
    df_count = fba_load.groupby(
        groupcols, as_index=False, dropna=False).size().astype(str)
    df_count = replace_strings_with_NoneType(df_count)

    # merge dfs
    dfm = fba_load.merge(df_count, how='left')
    # calc new flowamounts
    dfm['FlowAmount'] = dfm['FlowAmount'] / dfm['size'].astype(int)
    dfm = dfm.drop(columns='size')

    return dfm
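# A minimal sketch of equal_allocation on toy data: one activity mapped to
# two sectors of the same length, so each mapped row receives half the
# FlowAmount. Assumes pandas is available as pd; values are illustrative.
def _example_equal_allocation():
    import pandas as pd
    fba = pd.DataFrame({
        'ActivityProducedBy': ['AG LAND', 'AG LAND'],
        'Location': ['00000', '00000'],
        'SectorProducedBy': ['1111', '1112'],
        'FlowAmount': [50.0, 50.0]})
    # groupcols resolve to the object columns other than the sector columns,
    # so each row's FlowAmount is divided by the mapped-row count (2)
    return equal_allocation(fba)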
def proportional_allocation_by_location_and_activity(df, sectorcolumn):
    """
    Creates a proportional allocation within each aggregated
    sector within a location
    :param df: df with sector columns
    :param sectorcolumn: str, sector column for which to create
        allocation ratios
    :return: df, with 'FlowAmountRatio' and 'HelperFlow' columns
    """
    # tmp replace NoneTypes with empty cells
    df = replace_NoneType_with_empty_cells(df)

    # denominator summed from highest level of sector grouped by location
    short_length = min(df[sectorcolumn].apply(
        lambda x: len(str(x))).unique())
    # want to create denominator based on short_length
    denom_df = df.loc[df[sectorcolumn].apply(
        lambda x: len(x) == short_length)].reset_index(drop=True)
    grouping_cols = [e for e in ['FlowName', 'Location', 'Activity',
                                 'ActivityConsumedBy', 'ActivityProducedBy']
                     if e in denom_df.columns.values.tolist()]
    denom_df.loc[:, 'Denominator'] = \
        denom_df.groupby(grouping_cols)['HelperFlow'].transform('sum')

    # list of column headers, that if exist in df, should be
    # aggregated using the weighted avg fxn
    possible_column_headers = ('Location', 'LocationSystem', 'Year',
                               'Activity', 'ActivityConsumedBy',
                               'ActivityProducedBy')
    # list of column headers that do exist in the df being aggregated
    column_headers = [e for e in possible_column_headers
                      if e in denom_df.columns.values.tolist()]
    merge_headers = column_headers.copy()
    column_headers.append('Denominator')
    # create subset of denominator values based on Locations and Activities
    denom_df_2 = \
        denom_df[column_headers].drop_duplicates().reset_index(drop=True)
    # merge the denominator column with fba_w_sector df
    allocation_df = df.merge(denom_df_2,
                             how='left',
                             left_on=merge_headers,
                             right_on=merge_headers)
    # calculate ratio
    allocation_df.loc[:, 'FlowAmountRatio'] = \
        allocation_df['HelperFlow'] / allocation_df['Denominator']
    allocation_df = allocation_df.drop(
        columns=['Denominator']).reset_index(drop=True)
    # fill empty cols with NoneType
    allocation_df = replace_strings_with_NoneType(allocation_df)
    # fill na values with 0
    allocation_df['HelperFlow'] = allocation_df['HelperFlow'].fillna(0)

    return allocation_df
def bls_clean_allocation_fba_w_sec_sat_table(df_w_sec, **kwargs):
    """
    clean up bls df with sectors by estimating suppressed data
    :param df_w_sec: df, FBA format BLS QCEW data
    :param kwargs: additional arguments can include 'attr', a dictionary
        of FBA method yaml parameters
    :return: df, BLS QCEW FBA with estimated suppressed data
    """
    df_w_sec = df_w_sec.reset_index(drop=True)
    df2 = add_missing_flow_by_fields(
        df_w_sec,
        flow_by_activity_mapped_wsec_fields).reset_index(drop=True)
    df3 = replace_strings_with_NoneType(df2)

    return df3
def proportional_allocation(df, attr):
    """
    Creates a proportional allocation based on all the most
    aggregated sectors within a location
    Ensure that sectors are at 2 digit level - can run
    sector_aggregation() prior to using this function
    :param df: df, includes sector columns
    :param attr: dictionary, attributes for an activity set
    :return: df, with 'FlowAmountRatio' column
    """
    # tmp drop NoneType
    df = replace_NoneType_with_empty_cells(df)

    # determine if any additional columns beyond location and sector by
    # which to base allocation ratios
    if 'allocation_merge_columns' in attr:
        groupby_cols = ['Location'] + attr['allocation_merge_columns']
        denom_subset_cols = ['Location', 'LocationSystem', 'Year',
                             'Denominator'] + \
            attr['allocation_merge_columns']
    else:
        groupby_cols = ['Location']
        denom_subset_cols = ['Location', 'LocationSystem', 'Year',
                             'Denominator']

    denom_df = df.loc[(df['SectorProducedBy'].apply(lambda x: len(x) == 2)) |
                      (df['SectorConsumedBy'].apply(lambda x: len(x) == 2))]

    # generate denominator based on identified groupby cols
    denom_df = denom_df.assign(Denominator=denom_df.groupby(
        groupby_cols)['FlowAmount'].transform('sum'))
    # subset select columns by which to generate ratios
    denom_df_2 = denom_df[denom_subset_cols].drop_duplicates()
    # merge the denominator column with fba_w_sector df
    allocation_df = df.merge(denom_df_2, how='left')
    # calculate ratio
    allocation_df.loc[:, 'FlowAmountRatio'] = \
        allocation_df['FlowAmount'] / allocation_df['Denominator']
    allocation_df = allocation_df.drop(
        columns=['Denominator']).reset_index()
    # add nonetypes
    allocation_df = replace_strings_with_NoneType(allocation_df)

    return allocation_df
def aggregator(df, groupbycols):
    """
    Aggregates flowbyactivity or flowbysector 'FlowAmount' column in df
    and generates weighted average values based on FlowAmount values
    for numeric columns
    :param df: df, Either flowbyactivity or flowbysector
    :param groupbycols: list, Either flowbyactivity or flowbysector columns
    :return: df, with aggregated columns
    """
    # reset index
    df = df.reset_index(drop=True)
    # tmp replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    # drop rows where flowamount = 0
    df = df[df['FlowAmount'] != 0]
    # list of column headers, that if exist in df, should be
    # aggregated using the weighted avg fxn
    possible_column_headers = \
        ('Spread', 'Min', 'Max', 'DataReliability', 'TemporalCorrelation',
         'GeographicalCorrelation', 'TechnologicalCorrelation',
         'DataCollection')

    # list of column headers that do exist in the df being aggregated
    column_headers = [e for e in possible_column_headers
                      if e in df.columns.values.tolist()]

    df_dfg = df.groupby(groupbycols).agg({'FlowAmount': ['sum']})

    # run through other columns creating weighted average
    for e in column_headers:
        df_dfg[e] = get_weighted_average(df, e, 'FlowAmount', groupbycols)

    df_dfg = df_dfg.reset_index()
    df_dfg.columns = df_dfg.columns.droplevel(level=1)

    # if datatypes are strings, ensure that Null values remain NoneType
    df_dfg = replace_strings_with_NoneType(df_dfg)

    return df_dfg
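# A minimal sketch of aggregator on toy data: FlowAmount is summed while
# DataReliability is collapsed to a flow-weighted average. Assumes pandas
# is available as pd and that get_weighted_average is the module helper
# used above; values are illustrative.
def _example_aggregator():
    import pandas as pd
    df = pd.DataFrame({
        'Location': ['00000', '00000'],
        'SectorProducedBy': ['111', '111'],
        'FlowAmount': [2.0, 3.0],
        'DataReliability': [1.0, 3.0]})
    # expected: one row with FlowAmount 5.0 and
    # DataReliability (2*1 + 3*3) / 5 = 2.2
    return aggregator(df, ['Location', 'SectorProducedBy'])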
def disaggregate_pastureland(fba_w_sector, attr, method, year,
                             sector_column):
    """
    The USDA CoA Cropland irrigated pastureland data only links
    to the 3 digit NAICS '112'. This function uses state
    level CoA 'Land in Farms' to allocate
    the county level acreage data to 6 digit NAICS.
    :param fba_w_sector: df, the CoA Cropland dataframe
        after linked to sectors
    :param attr: dictionary, attribute data from method yaml
        for activity set
    :param method: dictionary, FBS method yaml
    :param year: str, year of data being disaggregated
    :param sector_column: str, the sector column on which to make
        df modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, the CoA cropland dataframe with disaggregated
        pastureland data
    """
    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # subset the coa data so only pastureland
    p = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) == '112'].reset_index(drop=True)
    if len(p) != 0:
        # add temp loc column for state fips
        p = p.assign(Location_tmp=p['Location'].apply(lambda x: x[0:2]))

        # load usda coa cropland naics
        df_f = load_fba_w_standardized_units(
            datasource='USDA_CoA_Cropland_NAICS', year=year,
            flowclass='Land')
        # subset to land in farms data
        df_f = df_f[df_f['FlowName'] == 'FARM OPERATIONS']
        # subset to rows related to pastureland
        df_f = df_f.loc[df_f['ActivityConsumedBy'].apply(
            lambda x: x[0:3]) == '112']
        # drop rows with '&'
        df_f = df_f[~df_f['ActivityConsumedBy'].str.contains('&')]
        # create sector columns
        df_f = add_sectors_to_flowbyactivity(
            df_f, sectorsourcename=method['target_sector_source'])
        # estimate suppressed data by equal allocation
        df_f = estimate_suppressed_data(df_f, 'SectorConsumedBy', 3,
                                        'USDA_CoA_Cropland_NAICS')
        # create proportional ratios
        group_cols = fba_wsec_default_grouping_fields
        group_cols = [e for e in group_cols if e not in
                      ('ActivityProducedBy', 'ActivityConsumedBy')]
        df_f = allocate_by_sector(df_f, 'proportional', group_cols)
        # tmp drop NoneTypes
        df_f = replace_NoneType_with_empty_cells(df_f)
        # drop naics = '11'
        df_f = df_f[df_f[sector_column] != '11']
        # drop 000 in location
        df_f = df_f.assign(Location=df_f['Location'].apply(
            lambda x: x[0:2]))

        # check units before merge
        compare_df_units(p, df_f)
        # merge the coa pastureland data with land in farm data
        df = p.merge(df_f[[sector_column, 'Location', 'FlowAmountRatio']],
                     how='left',
                     left_on="Location_tmp",
                     right_on="Location")
        # multiply the flowamount by the flowratio
        df.loc[:, 'FlowAmount'] = df['FlowAmount'] * df['FlowAmountRatio']
        # drop columns and rename
        df = df.drop(columns=['Location_tmp', sector_column + '_x',
                              'Location_y', 'FlowAmountRatio'])
        df = df.rename(columns={sector_column + '_y': sector_column,
                                "Location_x": 'Location'})

        # drop rows where sector = 112 and then concat with
        # original fba_w_sector
        fba_w_sector = fba_w_sector[fba_w_sector[sector_column].apply(
            lambda x: x[0:3]) != '112'].reset_index(drop=True)
        fba_w_sector = pd.concat([fba_w_sector, df],
                                 sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
def allocation_helper(df_w_sector, attr, method, v, download_FBA_if_missing):
    """
    Function to help allocate activity names using secondary df
    :param df_w_sector: df, includes sector columns
    :param attr: dictionary, attribute data from method yaml
        for activity set
    :param method: dictionary, FBS method yaml
    :param v: dictionary, the datasource parameters
    :param download_FBA_if_missing: bool, indicate if missing FBAs
        should be downloaded from Data Commons or run locally
    :return: df, with modified fba allocation values
    """
    from flowsa.validation import compare_df_units

    # add parameters to dictionary if exist in method yaml
    fba_dict = {}
    if 'helper_flow' in attr:
        fba_dict['flowname_subset'] = attr['helper_flow']
    if 'clean_helper_fba' in attr:
        fba_dict['clean_fba'] = attr['clean_helper_fba']
    if 'clean_helper_fba_wsec' in attr:
        fba_dict['clean_fba_w_sec'] = attr['clean_helper_fba_wsec']

    # load the allocation FBA
    helper_allocation = \
        load_map_clean_fba(method, attr,
                           fba_sourcename=attr['helper_source'],
                           df_year=attr['helper_source_year'],
                           flowclass=attr['helper_source_class'],
                           geoscale_from=attr['helper_from_scale'],
                           geoscale_to=v['geoscale_to_use'],
                           download_FBA_if_missing=download_FBA_if_missing,
                           **fba_dict)

    # run sector disagg to capture any missing lower level naics
    helper_allocation = sector_disaggregation(helper_allocation)

    # generalize activity field names to enable link to water
    # withdrawal table
    helper_allocation = collapse_activity_fields(helper_allocation)
    # drop any rows not mapped
    helper_allocation = \
        helper_allocation[helper_allocation['Sector'].notnull()]
    # drop columns
    helper_allocation = \
        helper_allocation.drop(columns=['Activity', 'Min', 'Max'])
    # rename column
    helper_allocation = \
        helper_allocation.rename(columns={"FlowAmount": 'HelperFlow'})

    # determine the df_w_sector column to merge on
    df_w_sector = replace_strings_with_NoneType(df_w_sector)
    sec_consumed_list = \
        df_w_sector['SectorConsumedBy'].drop_duplicates().values.tolist()
    sec_produced_list = \
        df_w_sector['SectorProducedBy'].drop_duplicates().values.tolist()
    # if a sector field column is not all 'none', that is the column to merge
    if all(v is None for v in sec_consumed_list):
        sector_col_to_merge = 'SectorProducedBy'
    elif all(v is None for v in sec_produced_list):
        sector_col_to_merge = 'SectorConsumedBy'
    else:
        log.error('There is not a clear sector column to base '
                  'merge with helper allocation dataset')

    # merge allocation df with helper df based on sectors,
    # depending on geo scales of dfs
    if (attr['helper_from_scale'] == 'state') and \
            (attr['allocation_from_scale'] == 'county'):
        helper_allocation.loc[:, 'Location_tmp'] = \
            helper_allocation['Location'].apply(lambda x: x[0:2])
        df_w_sector.loc[:, 'Location_tmp'] = \
            df_w_sector['Location'].apply(lambda x: x[0:2])
        # merge_columns.append('Location_tmp')
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = \
            df_w_sector.merge(
                helper_allocation[['Location_tmp', 'Sector', 'HelperFlow']],
                how='left',
                left_on=['Location_tmp', sector_col_to_merge],
                right_on=['Location_tmp', 'Sector'])
        modified_fba_allocation = \
            modified_fba_allocation.drop(columns=['Location_tmp'])
    elif (attr['helper_from_scale'] == 'national') and \
            (attr['allocation_from_scale'] != 'national'):
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = \
            df_w_sector.merge(helper_allocation[['Sector', 'HelperFlow']],
                              how='left',
                              left_on=[sector_col_to_merge],
                              right_on=['Sector'])
    else:
        compare_df_units(df_w_sector, helper_allocation)
        modified_fba_allocation = \
            df_w_sector.merge(
                helper_allocation[['Location', 'Sector', 'HelperFlow']],
                left_on=['Location', sector_col_to_merge],
                right_on=['Location', 'Sector'],
                how='left')

    # load bea codes that sub for naics
    bea = return_bea_codes_used_as_naics()
    # replace sector column and helperflow value if the sector column to
    # merge is in the bea list to prevent dropped data
    modified_fba_allocation['Sector'] = \
        np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                 modified_fba_allocation[sector_col_to_merge],
                 modified_fba_allocation['Sector'])
    modified_fba_allocation['HelperFlow'] = \
        np.where(modified_fba_allocation[sector_col_to_merge].isin(bea),
                 modified_fba_allocation['FlowAmount'],
                 modified_fba_allocation['HelperFlow'])

    # modify flow amounts using helper data
    if 'multiplication' in attr['helper_method']:
        # if missing values (na or 0), replace with national level values
        replacement_values = \
            helper_allocation[helper_allocation['Location'] ==
                              US_FIPS].reset_index(drop=True)
        replacement_values = \
            replacement_values.rename(
                columns={"HelperFlow": 'ReplacementValue'})
        compare_df_units(modified_fba_allocation, replacement_values)
        modified_fba_allocation = modified_fba_allocation.merge(
            replacement_values[['Sector', 'ReplacementValue']], how='left')
        modified_fba_allocation.loc[:, 'HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(
                modified_fba_allocation['ReplacementValue'])
        modified_fba_allocation.loc[:, 'HelperFlow'] = \
            np.where(modified_fba_allocation['HelperFlow'] == 0,
                     modified_fba_allocation['ReplacementValue'],
                     modified_fba_allocation['HelperFlow'])

        # replace non-existent helper flow values with a 0,
        # so after multiplying, don't have incorrect value associated
        # with new unit
        modified_fba_allocation['HelperFlow'] = \
            modified_fba_allocation['HelperFlow'].fillna(value=0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['HelperFlow']
        # drop columns
        modified_fba_allocation = \
            modified_fba_allocation.drop(
                columns=["HelperFlow", 'ReplacementValue', 'Sector'])
    elif attr['helper_method'] == 'proportional':
        modified_fba_allocation = \
            proportional_allocation_by_location_and_activity(
                modified_fba_allocation, sector_col_to_merge)
        modified_fba_allocation['FlowAmountRatio'] = \
            modified_fba_allocation['FlowAmountRatio'].fillna(0)
        modified_fba_allocation.loc[:, 'FlowAmount'] = \
            modified_fba_allocation['FlowAmount'] * \
            modified_fba_allocation['FlowAmountRatio']
        modified_fba_allocation = \
            modified_fba_allocation.drop(
                columns=['FlowAmountRatio', 'HelperFlow', 'Sector'])
    elif attr['helper_method'] == 'proportional-flagged':
        # calculate denominators based on activity and 'flagged' column
        modified_fba_allocation = \
            modified_fba_allocation.assign(
                Denominator=modified_fba_allocation.groupby(
                    ['FlowName', 'ActivityConsumedBy', 'Location',
                     'disaggregate_flag'])['HelperFlow'].transform('sum'))
        modified_fba_allocation = modified_fba_allocation.assign(
            FlowAmountRatio=modified_fba_allocation['HelperFlow'] /
            modified_fba_allocation['Denominator'])
        modified_fba_allocation = \
            modified_fba_allocation.assign(
                FlowAmount=modified_fba_allocation['FlowAmount'] *
                modified_fba_allocation['FlowAmountRatio'])
        modified_fba_allocation = \
            modified_fba_allocation.drop(
                columns=['disaggregate_flag', 'Sector', 'HelperFlow',
                         'Denominator', 'FlowAmountRatio'])
        # run sector aggregation
        modified_fba_allocation = \
            sector_aggregation(modified_fba_allocation,
                               fba_wsec_default_grouping_fields)

    # drop rows of 0
    modified_fba_allocation = \
        modified_fba_allocation[
            modified_fba_allocation['FlowAmount'] != 0].reset_index(
            drop=True)

    modified_fba_allocation.loc[
        modified_fba_allocation['Unit'] == 'gal/employee', 'Unit'] = 'gal'

    # option to scale up fba values
    if 'scaled' in attr['helper_method']:
        log.info("Scaling %s to FBA values", attr['helper_source'])
        modified_fba_allocation = \
            dynamically_import_fxn(
                attr['allocation_source'], attr["scale_helper_results"])(
                modified_fba_allocation, attr,
                download_FBA_if_missing=download_FBA_if_missing)

    return modified_fba_allocation
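# A hedged sketch of the method-yaml keys allocation_helper reads from
# `attr`; every value below is illustrative, not a real FBS method entry.
_EXAMPLE_HELPER_ATTR = {
    'helper_source': 'BLS_QCEW',
    'helper_source_class': 'Employment',
    'helper_source_year': 2015,
    'helper_flow': ['Number of employees'],
    'helper_from_scale': 'state',
    'allocation_from_scale': 'county',
    'helper_method': 'proportional',
}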
def allocate_dropped_sector_data(df_load, target_sector_level):
    """
    Determine rows of data that will be lost if subset data at target
    sector level. Equally allocate parent NAICS to child NAICS where
    child NAICS missing
    :param df_load: df, FBS format
    :param target_sector_level: str, target NAICS level for FBS output
    :return: df, with all child NAICS at target sector level
    """
    # exclude nonsectors
    df = replace_NoneType_with_empty_cells(df_load)

    rows_lost = pd.DataFrame()
    for i in range(2, sector_level_key[target_sector_level]):
        # create df of i length
        df_x1 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i)) &
            (df[fbs_activity_fields[1]] == '')]
        df_x2 = df.loc[
            (df[fbs_activity_fields[0]] == '') &
            (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x3 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i)) &
            (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x = pd.concat([df_x1, df_x2, df_x3], ignore_index=True,
                         sort=False)

        # create df of i + 1 length
        df_y1 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1) |
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y2 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1) &
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y = pd.concat([df_y1, df_y2], ignore_index=True, sort=False)

        # create temp sector columns in df y, that are i digits in length
        df_y.loc[:, 'spb_tmp'] = \
            df_y[fbs_activity_fields[0]].apply(lambda x: x[0:i])
        df_y.loc[:, 'scb_tmp'] = \
            df_y[fbs_activity_fields[1]].apply(lambda x: x[0:i])
        # don't modify household sector lengths or gov't transport
        df_y = df_y.replace({'F0': 'F010', 'F01': 'F010'})

        # merge the two dfs
        df_m = pd.merge(
            df_x,
            df_y[['Class', 'Context', 'FlowType', 'Flowable', 'Location',
                  'LocationSystem', 'Unit', 'Year', 'spb_tmp', 'scb_tmp']],
            how='left',
            left_on=['Class', 'Context', 'FlowType', 'Flowable', 'Location',
                     'LocationSystem', 'Unit', 'Year', 'SectorProducedBy',
                     'SectorConsumedBy'],
            right_on=['Class', 'Context', 'FlowType', 'Flowable', 'Location',
                      'LocationSystem', 'Unit', 'Year', 'spb_tmp',
                      'scb_tmp'])

        # extract the rows that are not disaggregated to more specific naics
        rl = df_m[(df_m['scb_tmp'].isnull()) &
                  (df_m['spb_tmp'].isnull())].reset_index(drop=True)
        # clean df
        rl = replace_strings_with_NoneType(rl)
        rl_list = rl[['SectorProducedBy', 'SectorConsumedBy'
                      ]].drop_duplicates().values.tolist()

        # match sectors with target sector length sectors
        # import cw and subset to current sector length
        # and target sector length
        cw_load = load_sector_length_crosswalk()
        nlength = list(sector_level_key.keys())[
            list(sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = \
            cw.groupby(nlength)[nlength].transform('count')

        # merge df & conditionally replace sector produced/consumed columns
        rl_m = pd.merge(rl, cw, how='left',
                        left_on=[fbs_activity_fields[0]],
                        right_on=[nlength])
        rl_m.loc[rl_m[fbs_activity_fields[0]] != '',
                 fbs_activity_fields[0]] = rl_m[target_sector_level]
        rl_m = rl_m.drop(columns=[nlength, target_sector_level])

        rl_m2 = pd.merge(rl_m, cw, how='left',
                         left_on=[fbs_activity_fields[1]],
                         right_on=[nlength])
        rl_m2.loc[rl_m2[fbs_activity_fields[1]] != '',
                  fbs_activity_fields[1]] = rl_m2[target_sector_level]
        rl_m2 = rl_m2.drop(columns=[nlength, target_sector_level])

        # create one sector count column
        rl_m2['sector_count_x'] = \
            rl_m2['sector_count_x'].fillna(rl_m2['sector_count_y'])
        rl_m3 = rl_m2.rename(columns={'sector_count_x': 'sector_count'})
        rl_m3 = rl_m3.drop(columns=['sector_count_y'])

        # calculate new flow amounts, based on sector count,
        # allocating equally to the new sector length codes
        rl_m3['FlowAmount'] = rl_m3['FlowAmount'] / rl_m3['sector_count']
        rl_m3 = rl_m3.drop(columns=['sector_count'])

        # append to df
        if len(rl) != 0:
            vLogDetailed.warning(
                'Data found at %s digit NAICS not represented in current '
                'data subset: {}'.format(' '.join(map(str, rl_list))),
                str(i))
            rows_lost = rows_lost.append(rl_m3, ignore_index=True)

    if len(rows_lost) != 0:
        vLogDetailed.info(
            'Allocating FlowAmounts equally to each %s associated with '
            'the sectors previously dropped', target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_w_lost_data = pd.concat([df, rows_lost], ignore_index=True,
                               sort=True)
    df_w_lost_data = replace_strings_with_NoneType(df_w_lost_data)

    return df_w_lost_data
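# A minimal sketch of the equal-allocation arithmetic used above: a parent
# NAICS flow is split evenly among its target-length children using a
# sector-count column from the length crosswalk. The NAICS_6 codes are
# illustrative, not a real crosswalk extract. Assumes pandas as pd.
def _example_equal_parent_to_child_split():
    import pandas as pd
    rl = pd.DataFrame({'NAICS_3': ['311'], 'FlowAmount': [90.0]})
    cw = pd.DataFrame({'NAICS_3': ['311'] * 3,
                       'NAICS_6': ['311111', '311221', '311999']})
    cw['sector_count'] = cw.groupby('NAICS_3')['NAICS_3'].transform('count')
    out = rl.merge(cw, on='NAICS_3')
    # each child receives 90 / 3 = 30
    out['FlowAmount'] = out['FlowAmount'] / out['sector_count']
    return out.drop(columns='sector_count')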
def estimate_suppressed_data(df, sector_column, naics_level, sourcename):
    """
    Estimate data suppression, by equally allocating parent NAICS
    values to child NAICS
    :param df: df with sector columns
    :param sector_column: str, column to estimate suppressed data for
    :param naics_level: numeric, indicate at what NAICS length to base
        estimated suppressed data off (2 - 5)
    :param sourcename: str, sourcename
    :return: df, with estimated suppressed data
    """
    # exclude nonsectors
    df = replace_NoneType_with_empty_cells(df)

    # find the longest length sector
    max_length = max(df[sector_column].apply(
        lambda x: len(str(x))).unique())
    # loop through starting at naics_level, use most detailed level
    # possible to save time
    for i in range(naics_level, max_length):
        # create df of i length
        df_x = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # create df of i + 1 length
        df_y = df.loc[df[sector_column].apply(lambda x: len(x) == i + 1)]
        # create temp sector columns in df y, that are i digits in length
        df_y = df_y.assign(
            s_tmp=df_y[sector_column].apply(lambda x: x[0:i]))

        # create list of location and temp activity combos that contain a 0
        missing_sectors_df = df_y[df_y['FlowAmount'] == 0]
        missing_sectors_list = missing_sectors_df[
            ['Location', 's_tmp']].drop_duplicates().values.tolist()
        # subset the y df
        if len(missing_sectors_list) != 0:
            # new df of sectors that start with missing sectors.
            # drop last digit of the sector and sum flows set conditions
            suppressed_list = []
            for q, r in missing_sectors_list:
                c1 = df_y['Location'] == q
                c2 = df_y['s_tmp'] == r
                # subset data
                suppressed_list.append(df_y.loc[c1 & c2])
            suppressed_sectors = pd.concat(suppressed_list, sort=False,
                                           ignore_index=True)
            # add column of existing allocated data for length of i
            suppressed_sectors['alloc_flow'] = \
                suppressed_sectors.groupby(
                    ['Location', 's_tmp'])['FlowAmount'].transform('sum')
            # subset further so only keep rows of 0 value
            suppressed_sectors_sub = \
                suppressed_sectors[suppressed_sectors['FlowAmount'] == 0]
            # add count
            suppressed_sectors_sub = suppressed_sectors_sub.assign(
                sector_count=suppressed_sectors_sub.groupby(
                    ['Location', 's_tmp'])['s_tmp'].transform('count'))

            # merge suppressed sector subset with df x
            df_m = pd.merge(
                df_x,
                suppressed_sectors_sub[['Class', 'Compartment', 'FlowType',
                                        'FlowName', 'Location',
                                        'LocationSystem', 'Unit', 'Year',
                                        sector_column, 's_tmp',
                                        'alloc_flow', 'sector_count']],
                left_on=['Class', 'Compartment', 'FlowType', 'FlowName',
                         'Location', 'LocationSystem', 'Unit', 'Year',
                         sector_column],
                right_on=['Class', 'Compartment', 'FlowType', 'FlowName',
                          'Location', 'LocationSystem', 'Unit', 'Year',
                          's_tmp'],
                how='right')
            # drop any rows where flowamount is none
            df_m = df_m[~df_m['FlowAmount'].isna()]
            # calculate estimated flows by subtracting the flow amount
            # already allocated from total flow of sector one level up
            # and divide by number of sectors with suppressed data
            df_m.loc[:, 'FlowAmount'] = \
                (df_m['FlowAmount'] - df_m['alloc_flow']) / \
                df_m['sector_count']
            # only keep the suppressed sector subset activity columns
            df_m = df_m.drop(columns=[sector_column + '_x', 's_tmp',
                                      'alloc_flow', 'sector_count'])
            df_m = df_m.rename(
                columns={sector_column + '_y': sector_column})
            # reset activity columns
            if load_source_catalog()[sourcename]['sector-like_activities']:
                df_m = df_m.assign(
                    ActivityProducedBy=df_m['SectorProducedBy'])
                df_m = df_m.assign(
                    ActivityConsumedBy=df_m['SectorConsumedBy'])

            # drop the existing rows with suppressed data and append
            # the new estimates from fba df
            modified_df = \
                pd.merge(df, df_m[['FlowName', 'Location', sector_column]],
                         indicator=True,
                         how='outer').query(
                    '_merge=="left_only"').drop('_merge', axis=1)
            df = pd.concat([modified_df, df_m], ignore_index=True)

    df_w_estimated_data = replace_strings_with_NoneType(df)

    return df_w_estimated_data
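# A minimal sketch of the suppression estimate computed above: the flow for
# each suppressed child equals the parent flow minus the flow already
# published for its siblings, divided by the number of suppressed children.
def _example_suppression_estimate():
    parent_flow = 100.0            # flow published for the parent NAICS
    published_children = [40.0, 35.0]
    n_suppressed = 2               # children reported as 0 (suppressed)
    estimate = (parent_flow - sum(published_children)) / n_suppressed
    # each suppressed child is assigned (100 - 75) / 2 = 12.5
    return estimate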
def equally_allocate_suppressed_parent_to_child_naics(
        df_load, sector_column, groupcols):
    """
    Estimate data suppression, by equally allocating parent NAICS
    values to child NAICS
    :param df_load: df with sector columns
    :param sector_column: str, column to estimate suppressed data for
    :param groupcols: list, columns to group df by
    :return: df, with estimated suppressed data
    """
    df = sector_disaggregation(df_load)
    df = replace_NoneType_with_empty_cells(df)
    df = df[df[sector_column] != '']

    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are source like, drop from df,
    # add back in as copies of sector columns to keep
    if sector_like_activities:
        # subset df
        df_cols = [e for e in df.columns if e not in
                   ('ActivityProducedBy', 'ActivityConsumedBy')]
        df = df[df_cols]
        # drop activity from groupby
        groupcols = [e for e in groupcols if e not in
                     ['ActivityConsumedBy', 'ActivityProducedBy',
                      'Description']]

    # load naics 2 to naics 6 crosswalk
    cw_load = load_crosswalk('sector_length')
    cw_melt = cw_load.melt(
        id_vars=["NAICS_6"], var_name="NAICS_Length",
        value_name="NAICS_Match").drop(
        columns=['NAICS_Length']).drop_duplicates()

    df_sup = df[df['FlowAmount'] == 0].reset_index(drop=True)
    # merge the naics cw
    new_naics = pd.merge(df_sup, cw_melt, how='left',
                         left_on=[sector_column],
                         right_on=['NAICS_Match'])
    # drop rows where match is null because no additional naics to add
    new_naics = new_naics.dropna()
    new_naics[sector_column] = new_naics['NAICS_6'].copy()
    new_naics = new_naics.drop(columns=['NAICS_6', 'NAICS_Match'])

    # merge the new naics with the existing df, if data already
    # existed for a NAICS6, keep the original
    dfm = pd.merge(new_naics[groupcols], df, how='left', on=groupcols,
                   indicator=True).query(
        '_merge=="left_only"').drop('_merge', axis=1)
    dfm = replace_NoneType_with_empty_cells(dfm)
    dfm = dfm.fillna(0)
    df = pd.concat([df, dfm], sort=True, ignore_index=True)

    # add length column and subset the data;
    # subtract out existing data at NAICS6 from total data
    # at a length where there is no suppressed data
    df = df.assign(secLength=df[sector_column].apply(lambda x: len(x)))

    # add column for each state of sector length where
    # there are no missing values
    df_sup = df_sup.assign(
        secLength=df_sup[sector_column].apply(lambda x: len(x)))
    df_sup2 = (df_sup.groupby(
        ['FlowName', 'Compartment', 'Location'])['secLength'].agg(
        lambda x: x.min() - 1).reset_index(name='secLengthsup'))

    # merge the dfs and sub out the last sector lengths with all data
    # for each state; drop states that don't have suppressed data
    df1 = df.merge(df_sup2)
    df2 = df1[df1['secLength'] == 6].reset_index(drop=True)
    # determine sector to merge on
    df2.loc[:, 'mergeSec'] = df2.apply(
        lambda x: x[sector_column][:x['secLengthsup']], axis=1)

    sum_cols = [e for e in fba_default_grouping_fields if e not in
                ['ActivityConsumedBy', 'ActivityProducedBy']]
    sum_cols.append('mergeSec')
    df2 = df2.assign(
        FlowAlloc=df2.groupby(sum_cols)['FlowAmount'].transform('sum'))
    # rename columns for the merge and define merge cols
    df2 = df2.rename(columns={sector_column: 'NewNAICS',
                              'mergeSec': sector_column})
    # keep flows with 0 flow
    df3 = df2[df2['FlowAmount'] == 0].reset_index(drop=True)
    m_cols = groupcols + ['NewNAICS', 'FlowAlloc']
    # merge the two dfs
    dfe = df1.merge(df3[m_cols])
    # add count column used to divide the unallocated flows
    dfe = dfe.assign(
        secCount=dfe.groupby(groupcols)['NewNAICS'].transform('count'))
    dfe = dfe.assign(
        newFlow=(dfe['FlowAmount'] - dfe['FlowAlloc']) / dfe['secCount'])
    # reassign values and drop columns
    dfe = dfe.assign(FlowAmount=dfe['newFlow'])
    dfe[sector_column] = dfe['NewNAICS'].copy()
    dfe = dfe.drop(columns=['NewNAICS', 'FlowAlloc', 'secCount',
                            'newFlow'])

    # new df with estimated naics6
    dfn = pd.concat([df, dfe], ignore_index=True)
    dfn2 = dfn[dfn['FlowAmount'] != 0].reset_index(drop=True)
    dfn2 = dfn2.drop(columns=['secLength'])

    dff = sector_aggregation(dfn2, fba_wsec_default_grouping_fields)

    # if activities are source-like, set col values as copies
    # of the sector columns
    if sector_like_activities:
        dff = dff.assign(ActivityProducedBy=dff['SectorProducedBy'])
        dff = dff.assign(ActivityConsumedBy=dff['SectorConsumedBy'])
        # reindex columns
        dff = dff.reindex(df_load.columns, axis=1)

    # replace null values
    dff = replace_strings_with_NoneType(dff).reset_index(drop=True)

    return dff
def compare_geographic_totals(df_subset, df_load, sourcename, attr,
                              activity_set, activity_names):
    """
    Check for any data loss between the geoscale used and
    published national data
    :param df_subset: df, after subset by geography
    :param df_load: df, loaded data, including published national data
    :param sourcename: str, source name
    :param attr: dictionary, attributes
    :param activity_set: str, activity set
    :param activity_names: list of names in the activity set by which
        to subset national level data
    :return: df, comparing published national level data to df subset
    """
    # subset df_load to national level
    nat = df_load[df_load['Location'] == US_FIPS].reset_index(
        drop=True).rename(columns={'FlowAmount': 'FlowAmount_nat'})
    # if df len is not 0, continue with comparison
    if len(nat) != 0:
        # subset national level data by activity set names
        nat = nat[(nat[fba_activity_fields[0]].isin(activity_names)) |
                  (nat[fba_activity_fields[1]].isin(activity_names)
                   )].reset_index(drop=True)
        nat = replace_strings_with_NoneType(nat)
        # drop the geoscale in df_subset and sum
        sub = df_subset.assign(Location=US_FIPS)
        # depending on the datasource, might need to rename some
        # strings for national comparison
        sub = rename_column_values_for_comparison(sub, sourcename)
        sub2 = aggregator(sub, fba_default_grouping_fields).rename(
            columns={'FlowAmount': 'FlowAmount_sub'})

        # compare df
        merge_cols = ['Class', 'SourceName', 'FlowName', 'Unit',
                      'FlowType', 'ActivityProducedBy',
                      'ActivityConsumedBy', 'Compartment', 'Location',
                      'LocationSystem', 'Year']
        # compare units
        compare_df_units(nat, sub2)
        df_m = pd.merge(nat[merge_cols + ['FlowAmount_nat']],
                        sub2[merge_cols + ['FlowAmount_sub']],
                        how='outer')
        df_m = df_m.assign(
            FlowAmount_diff=df_m['FlowAmount_nat'] -
            df_m['FlowAmount_sub'])
        df_m = df_m.assign(Percent_Diff=(abs(
            df_m['FlowAmount_diff'] / df_m['FlowAmount_nat']) * 100))
        df_m = df_m[df_m['FlowAmount_diff'] != 0].reset_index(drop=True)
        # subset the merged df to what to include in the validation df
        # include data where percent difference is > 1 or where value is nan
        df_m_sub = df_m[(df_m['Percent_Diff'] > 1) |
                        (df_m['Percent_Diff'].isna())].reset_index(
            drop=True)

        if len(df_m_sub) == 0:
            vLog.info('No data loss greater than 1%% between national '
                      'level data and %s subset',
                      attr['allocation_from_scale'])
        else:
            vLog.info('There are data differences between published '
                      'national values and %s subset, saving to '
                      'validation log', attr['allocation_from_scale'])

            vLogDetailed.info(
                'Comparison of National FlowAmounts to aggregated data '
                'subset for %s: \n {}'.format(
                    df_m_sub.to_string()), activity_set)
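# A minimal sketch of the percent-difference screen used above: only rows
# where the subset diverges from the published national total by more than
# 1% (or where the ratio is undefined) are flagged. Assumes pandas as pd.
def _example_geographic_percent_diff():
    import pandas as pd
    df_m = pd.DataFrame({'FlowAmount_nat': [100.0, 50.0],
                         'FlowAmount_sub': [99.5, 40.0]})
    df_m['Percent_Diff'] = abs(
        (df_m['FlowAmount_nat'] - df_m['FlowAmount_sub']) /
        df_m['FlowAmount_nat']) * 100
    # the 0.5% row passes; the 20% row is flagged
    return df_m[(df_m['Percent_Diff'] > 1) | (df_m['Percent_Diff'].isna())]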
def replace_naics_w_naics_from_another_year(df_load, sectorsourcename):
    """
    Replace any non sectors with sectors.
    :param df_load: df with sector columns or sector-like activities
    :param sectorsourcename: str, sector source name (ex. NAICS_2012_Code)
    :return: df, with non-sectors replaced with sectors
    """
    # from flowsa.flowbyfunctions import aggregator

    # drop NoneType
    df = replace_NoneType_with_empty_cells(df_load).reset_index(drop=True)

    # load the master crosswalk and subset by sectorsourcename,
    # save values to list
    cw_load = load_crosswalk('sector_timeseries')
    cw = cw_load[sectorsourcename].drop_duplicates().tolist()

    # load melted crosswalk
    cw_melt = melt_naics_crosswalk()
    # drop the count column
    cw_melt = cw_melt.drop(columns='naics_count')

    # determine which headers are in the df
    if 'SectorConsumedBy' in df:
        column_headers = ['SectorProducedBy', 'SectorConsumedBy']
    else:
        column_headers = ['ActivityProducedBy', 'ActivityConsumedBy']

    # check if there are any sectors that are not in the naics 2012
    # crosswalk
    non_naics = check_if_sectors_are_naics(df, cw, column_headers)

    # loop through the df headers and determine if value is
    # not in crosswalk list
    if len(non_naics) != 0:
        vLog.debug('Checking if sectors represent a different '
                   'NAICS year, if so, replace with %s', sectorsourcename)
        for c in column_headers:
            # merge df with the melted sector crosswalk
            df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
            # if there is a value in the sectorsourcename column,
            # use that value to replace sector in column c if value in
            # column c is in the non_naics list
            df[c] = np.where(
                (df[c] == df['NAICS']) & (df[c].isin(non_naics)),
                df[sectorsourcename], df[c])
            # multiply the FlowAmount col by allocation_ratio
            df.loc[df[c] == df[sectorsourcename], 'FlowAmount'] = \
                df['FlowAmount'] * df['allocation_ratio']
            # drop columns
            df = df.drop(
                columns=[sectorsourcename, 'NAICS', 'allocation_ratio'])
        vLog.debug('Replaced NAICS with %s', sectorsourcename)

        # check if there are any sectors that are not in
        # the naics 2012 crosswalk
        vLog.debug('Check again for non NAICS 2012 Codes')
        nonsectors = check_if_sectors_are_naics(df, cw, column_headers)
        if len(nonsectors) != 0:
            vLog.debug('Dropping non-NAICS from dataframe')
            for c in column_headers:
                # drop rows where column value is in the nonnaics list
                df = df[~df[c].isin(nonsectors)]
        # aggregate data
        possible_column_headers = \
            ('FlowAmount', 'Spread', 'Min', 'Max', 'DataReliability',
             'TemporalCorrelation', 'GeographicalCorrelation',
             'TechnologicalCorrelation', 'DataCollection', 'Description')
        # list of column headers to group aggregation by
        groupby_cols = [e for e in df.columns.values.tolist()
                        if e not in possible_column_headers]
        df = aggregator(df, groupby_cols)

    # drop rows where both SectorConsumedBy and SectorProducedBy NoneType
    if 'SectorConsumedBy' in df:
        df_drop = df[(df['SectorConsumedBy'].isnull()) &
                     (df['SectorProducedBy'].isnull())]
        if len(df_drop) != 0:
            activities_dropped = pd.unique(
                df_drop[['ActivityConsumedBy',
                         'ActivityProducedBy']].values.ravel('K'))
            activities_dropped = list(
                filter(lambda x: x is not None, activities_dropped))
            vLog.debug('Dropping rows where the Activity columns '
                       'contain %s', ', '.join(activities_dropped))
        df = df[~((df['SectorConsumedBy'].isnull()) &
                  (df['SectorProducedBy'].isnull()))].reset_index(
            drop=True)
    else:
        df = df[~((df['ActivityConsumedBy'].isnull()) &
                  (df['ActivityProducedBy'].isnull()))].reset_index(
            drop=True)

    df = replace_strings_with_NoneType(df)

    return df
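# A minimal sketch of the crosswalk replacement performed above: an
# out-of-year code is swapped for its sectorsourcename equivalent and the
# FlowAmount is scaled by the allocation_ratio. The code pair and ratio
# below are placeholders, not a real crosswalk entry. Assumes pandas as pd.
def _example_naics_year_replacement():
    import pandas as pd
    cw_melt = pd.DataFrame({'NAICS': ['123456'],
                            'NAICS_2012_Code': ['123450'],
                            'allocation_ratio': [0.5]})
    df = pd.DataFrame({'SectorProducedBy': ['123456'],
                       'FlowAmount': [10.0]})
    dfm = df.merge(cw_melt, left_on='SectorProducedBy',
                   right_on='NAICS', how='left')
    dfm['SectorProducedBy'] = dfm['NAICS_2012_Code'].fillna(
        dfm['SectorProducedBy'])
    dfm['FlowAmount'] = dfm['FlowAmount'] * \
        dfm['allocation_ratio'].fillna(1)
    # the flow is split per the ratio: 10 * 0.5 = 5 assigned to '123450'
    return dfm[['SectorProducedBy', 'FlowAmount']]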
def disaggregate_cropland(fba_w_sector, attr, method, year, sector_column):
    """
    In the event there are 4 (or 5) digit naics for cropland
    at the county level, use state level harvested cropland to
    create ratios
    :param fba_w_sector: df, CoA cropland data, FBA format with
        sector columns
    :param attr: dictionary, attribute data from method yaml
        for activity set
    :param method: dictionary, FBS method yaml
    :param year: str, year of data
    :param sector_column: str, the sector column on which to make
        df modifications (SectorProducedBy or SectorConsumedBy)
    :return: df, CoA cropland data disaggregated
    """
    # tmp drop NoneTypes
    fba_w_sector = replace_NoneType_with_empty_cells(fba_w_sector)

    # drop pastureland data
    crop = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) != '112'].reset_index(drop=True)
    # drop sectors < 4 digits
    crop = crop[crop[sector_column].apply(
        lambda x: len(x) > 3)].reset_index(drop=True)
    # create tmp location
    crop = crop.assign(
        Location_tmp=crop['Location'].apply(lambda x: x[0:2]))

    # load the relevant state level harvested cropland by naics
    naics = load_fba_w_standardized_units(
        datasource="USDA_CoA_Cropland_NAICS", year=year, flowclass='Land')
    # subset the harvested cropland by naics
    naics = naics[naics['FlowName'] ==
                  'AG LAND, CROPLAND, HARVESTED'].reset_index(drop=True)
    # drop the activities that include '&'
    naics = naics[~naics['ActivityConsumedBy'].str.contains(
        '&')].reset_index(drop=True)
    # add sectors
    naics = add_sectors_to_flowbyactivity(
        naics, sectorsourcename=method['target_sector_source'])
    # estimate suppressed data by equally allocating parent to child naics
    naics = estimate_suppressed_data(naics, 'SectorConsumedBy', 3,
                                     'USDA_CoA_Cropland_NAICS')
    # add missing fbs fields
    naics = clean_df(naics, flow_by_sector_fields, fbs_fill_na_dict)

    # aggregate sectors to create any missing naics levels
    group_cols = fbs_default_grouping_fields
    # group_cols = [e for e in group_cols if e not in
    #               ('SectorProducedBy', 'SectorConsumedBy')]
    # group_cols.append(sector_column)
    naics2 = sector_aggregation(naics, group_cols)
    # add missing naics5/6 when only one naics5/6 associated with a naics4
    naics3 = sector_disaggregation(naics2)
    # drop rows where FlowAmount 0
    # naics3 = naics3[~((naics3['SectorProducedBy'] == '') &
    #                   (naics3['SectorConsumedBy'] == ''))]
    naics3 = naics3.loc[naics3['FlowAmount'] != 0]
    # create ratios
    naics4 = sector_ratios(naics3, sector_column)
    # create temporary sector column to match the two dfs on
    naics4 = naics4.assign(
        Location_tmp=naics4['Location'].apply(lambda x: x[0:2]))
    # tmp drop Nonetypes
    naics4 = replace_NoneType_with_empty_cells(naics4)

    # check units in prep for merge
    compare_df_units(crop, naics4)
    # for loop through naics lengths to determine
    # naics 4 and 5 digits to disaggregate
    for i in range(4, 6):
        # subset df to sectors with length = i and length = i + 1
        crop_subset = crop.loc[crop[sector_column].apply(
            lambda x: i + 1 >= len(x) >= i)]
        crop_subset = crop_subset.assign(
            Sector_tmp=crop_subset[sector_column].apply(lambda x: x[0:i]))
        # if duplicates drop all rows
        df = crop_subset.drop_duplicates(
            subset=['Location', 'Sector_tmp'],
            keep=False).reset_index(drop=True)
        # drop sector temp column
        df = df.drop(columns=["Sector_tmp"])
        # subset df to keep the sectors of length i
        df_subset = df.loc[df[sector_column].apply(lambda x: len(x) == i)]
        # subset the naics df where naics length is i + 1
        naics_subset = naics4.loc[naics4[sector_column].apply(
            lambda x: len(x) == i + 1)].reset_index(drop=True)
        naics_subset = naics_subset.assign(
            Sector_tmp=naics_subset[sector_column].apply(lambda x: x[0:i]))
        # merge the two df based on locations
        df_subset = pd.merge(
            df_subset,
            naics_subset[[sector_column, 'FlowAmountRatio', 'Sector_tmp',
                          'Location_tmp']],
            how='left',
            left_on=[sector_column, 'Location_tmp'],
            right_on=['Sector_tmp', 'Location_tmp'])
        # create flow amounts for the new NAICS based on the flow ratio
        df_subset.loc[:, 'FlowAmount'] = \
            df_subset['FlowAmount'] * df_subset['FlowAmountRatio']
        # drop rows of 0 and na
        df_subset = df_subset[df_subset['FlowAmount'] != 0]
        df_subset = df_subset[
            ~df_subset['FlowAmount'].isna()].reset_index(drop=True)
        # drop columns
        df_subset = df_subset.drop(
            columns=[sector_column + '_x', 'FlowAmountRatio',
                     'Sector_tmp'])
        # rename columns
        df_subset = df_subset.rename(
            columns={sector_column + '_y': sector_column})
        # tmp drop Nonetypes
        df_subset = replace_NoneType_with_empty_cells(df_subset)
        # add new rows of data to crop df
        crop = pd.concat([crop, df_subset],
                         sort=True).reset_index(drop=True)

    # clean up df
    crop = crop.drop(columns=['Location_tmp'])

    # equally allocate any further missing naics
    crop = allocate_dropped_sector_data(crop, 'NAICS_6')

    # pasture data
    pasture = fba_w_sector.loc[fba_w_sector[sector_column].apply(
        lambda x: x[0:3]) == '112'].reset_index(drop=True)

    # concat crop and pasture
    fba_w_sector = pd.concat([pasture, crop],
                             sort=True).reset_index(drop=True)

    # fill empty cells with NoneType
    fba_w_sector = replace_strings_with_NoneType(fba_w_sector)

    return fba_w_sector
def determine_flows_requiring_disaggregation(
        df_load, attr, method, sector_column):
    """
    The MECS Land data provides FlowAmounts for NAICS3-6. We use BLS QCEW
    employment data to determine land use for different industries. To
    accurately estimate land use per industry, existing FlowAmounts for a
    particular NAICS level (NAICS6) for example, should be subtracted from
    the possible FlowAmounts for other NAICS6 that share the first 5
    digits. For Example, there is data for '311', '3112', and '311221' in
    the 2014 dataset. FlowAmounts for allocation by employment for NAICS6
    are based on the provided '3112' FlowAmounts. However, since there is
    data at one NAICS6 (311221), the FlowAmount for that NAICS6 should be
    subtracted from other NAICS6 to accurately depict the remaining
    'FlowAmount' that requires a secondary source (Employment data) for
    allocation.
    :param df_load: df, EIA MECS Land FBA
    :param attr: dictionary, attribute data from method yaml
        for activity set
    :param method: dictionary, FBS method yaml
    :param sector_column: str, sector column to flag
        ('SectorProducedBy', 'SectorConsumedBy')
    :return: A dataframe with a column 'disaggregate_flag', if '1',
        row requires secondary source to calculate FlowAmount, if '0'
        FlowAmount does not require modifications
    """
    from flowsa.sectormapping import add_sectors_to_flowbyactivity

    df_load = replace_NoneType_with_empty_cells(df_load)
    # drop rows where there is no value in sector column, which might
    # occur if sector-like activities have a "-" in them
    df_load = df_load[df_load[sector_column] != '']

    # determine activity column
    if sector_column == 'SectorConsumedBy':
        activity_column = 'ActivityConsumedBy'
    else:
        activity_column = 'ActivityProducedBy'

    # original df - subset
    # subset cols of original df
    dfo = df_load[['FlowAmount', 'Location', sector_column]]
    # min and max length
    min_length = min(
        df_load[sector_column].apply(lambda x: len(str(x))).unique())
    max_length = max(
        df_load[sector_column].apply(lambda x: len(str(x))).unique())
    # subset by sector length, creating a df
    for s in range(min_length, max_length + 1):
        df_name = 'dfo_naics' + str(s)
        vars()[df_name] = dfo[dfo[sector_column].apply(
            lambda x: len(x) == s)].reset_index(drop=True)
        vars()[df_name] = vars()[df_name].assign(
            SectorMatch=vars()[df_name][sector_column].apply(
                lambda x: x[:len(x) - 1]))
    # loop through the dfs, merging by sector match. If there is a match,
    # subtract the value, if there is not a match, drop last digit in
    # sectormatch, add row to the next df, and repeat
    df_merged = pd.DataFrame()
    df_not_merged = pd.DataFrame()
    for s in range(max_length, min_length, -1):
        df_name_1 = 'dfo_naics' + str(s - 1)
        df_name_2 = 'dfo_naics' + str(s)
        # concat df 1 with df_not_merged
        df2 = pd.concat([vars()[df_name_2], df_not_merged])
        df2 = df2.rename(columns={'FlowAmount': 'SubtractFlow',
                                  sector_column: 'Sector'})
        df_m = pd.merge(
            vars()[df_name_1][['FlowAmount', 'Location', sector_column]],
            df2,
            left_on=['Location', sector_column],
            right_on=['Location', 'SectorMatch'],
            indicator=True,
            how='outer')
        # subset by merge and append to appropriate df
        df_both = df_m[df_m['_merge'] == 'both']
        if len(df_both) != 0:
            # drop columns
            df_both1 = df_both.drop(
                columns=['Sector', 'SectorMatch', '_merge'])
            # aggregate before subtracting
            df_both2 = df_both1.groupby(
                ['FlowAmount', 'Location', sector_column],
                as_index=False).agg({"SubtractFlow": sum})
            df_both3 = df_both2.assign(
                FlowAmount=df_both2['FlowAmount'] -
                df_both2['SubtractFlow'])
            df_both3 = df_both3.drop(columns=['SubtractFlow'])
            # drop rows where 0
            # df_both = df_both[df_both['FlowAmount'] != 0]
            df_merged = df_merged.append(df_both3, ignore_index=True)
        df_right = df_m[df_m['_merge'] == 'right_only']
        if len(df_right) != 0:
            df_right = df_right.drop(
                columns=['FlowAmount', sector_column, '_merge'])
            df_right = df_right.rename(
                columns={'SubtractFlow': 'FlowAmount',
                         'Sector': sector_column})
            # remove another digit from Sectormatch
            df_right = df_right.assign(
                SectorMatch=df_right[sector_column].apply(
                    lambda x: x[:(s - 2)]))
            # reorder
            df_right = df_right[['FlowAmount', 'Location', sector_column,
                                 'SectorMatch']]
            df_not_merged = df_not_merged.append(df_right,
                                                 ignore_index=True)
    # rename the flowamount column
    df_merged = df_merged.rename(
        columns={'FlowAmount': 'FlowAmountNew',
                 sector_column: activity_column})
    # In the original EIA MECS df, some of the NAICS 6-digit codes sum
    # to a value greater than published NAICS3, due to rounding. In these
    # cases, the new FlowAmount is a negative number. Reset neg numbers
    # to 0
    df_merged.loc[df_merged['FlowAmountNew'] < 0, 'FlowAmountNew'] = 0
    # in the original df, drop sector columns re-add sectors, this time
    # with sectors = 'aggregated'
    dfn = df_load.drop(columns=['SectorProducedBy', 'ProducedBySectorType',
                                'SectorConsumedBy', 'ConsumedBySectorType',
                                'SectorSourceName'])
    dfn = add_sectors_to_flowbyactivity(
        dfn,
        sectorsourcename=method['target_sector_source'],
        overwrite_sectorlevel='aggregated')
    # add column noting that these columns require an allocation ratio
    dfn = dfn.assign(disaggregate_flag=1)
    # create lists of sectors to drop
    list_original = df_load[activity_column].drop_duplicates().tolist()
    # drop values in original df
    dfn2 = dfn[~dfn[sector_column].isin(list_original)].sort_values(
        [activity_column, sector_column]).reset_index(drop=True)
    # drop the sectors that are duplicated by different naics being
    # mapped to naics6
    if len(dfn2[dfn2.duplicated(subset=['Location', sector_column],
                                keep=False)]) > 0:
        dfn2.drop_duplicates(subset=['Location', sector_column],
                             keep='last', inplace=True)
    # want to allocate at NAICS6, so drop all other sectors
    dfn2 = dfn2[dfn2[sector_column].apply(
        lambda x: len(x) == 6)].reset_index(
        drop=True).sort_values([sector_column])

    # merge revised flowamounts back with modified original df
    df_to_allocate = dfn2.merge(df_merged, how='left')
    # replace FlowAmount with newly calculated FlowAmount,
    # which represents Flows that are currently unaccounted for at NAICS6
    df_to_allocate['FlowAmount'] = np.where(
        df_to_allocate['FlowAmountNew'].notnull(),
        df_to_allocate['FlowAmountNew'], df_to_allocate['FlowAmount'])
    # drop rows where flow amount = 0 - flows are captured through
    # other NAICS6
    df_to_allocate2 = df_to_allocate[
        df_to_allocate['FlowAmount'] != 0].drop(
        columns='FlowAmountNew').reset_index(drop=True)

    # merge the original df with modified
    # add column to original df for disaggregate_flag
    df_load = df_load.assign(disaggregate_flag=0)

    # concat the two dfs and sort
    df_c = pd.concat([df_load, df_to_allocate2],
                     ignore_index=True).sort_values(
        [sector_column]).reset_index(drop=True)

    df_c = replace_strings_with_NoneType(df_c).sort_values(
        [sector_column])

    return df_c
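# A minimal sketch of the subtraction described in the docstring above:
# the published NAICS6 flow is removed from its parent so only the
# unaccounted remainder is allocated to the other NAICS6 by employment.
def _example_mecs_remainder():
    flow_3112 = 500.0    # published flow for the parent NAICS4
    flow_311221 = 120.0  # flow already published at one NAICS6
    # 380 remains to be disaggregated across the other NAICS6 children
    return flow_3112 - flow_311221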
def proportional_allocation_by_location_and_activity(df_load, sectorcolumn):
    """
    Creates a proportional allocation within each aggregated
    sector within a location
    :param df_load: df with sector columns
    :param sectorcolumn: str, sector column for which to create
        allocation ratios
    :return: df, with 'FlowAmountRatio' and 'HelperFlow' columns
    """
    # tmp replace NoneTypes with empty cells
    df = replace_NoneType_with_empty_cells(df_load).reset_index(drop=True)

    # want to create denominator based on shortest length naics for each
    # activity/location
    grouping_cols = [e for e in ['FlowName', 'Location', 'Activity',
                                 'ActivityConsumedBy', 'ActivityProducedBy',
                                 'Class', 'SourceName', 'Unit', 'FlowType',
                                 'Compartment', 'Year']
                     if e in df.columns.values.tolist()]
    activity_cols = [e for e in ['Activity', 'ActivityConsumedBy',
                                 'ActivityProducedBy']
                     if e in df.columns.values.tolist()]
    # trim whitespace
    df[sectorcolumn] = df[sectorcolumn].str.strip()
    # to create the denominator dataframe first add a column that captures
    # the sector length
    denom_df = df.assign(sLen=df[sectorcolumn].str.len())
    denom_df = denom_df[denom_df['sLen'] == denom_df.groupby(
        activity_cols)['sLen'].transform(min)].drop(columns='sLen')
    denom_df.loc[:, 'Denominator'] = \
        denom_df.groupby(grouping_cols)['HelperFlow'].transform('sum')

    # list of column headers, that if exist in df, should be aggregated
    # using the weighted avg fxn
    possible_column_headers = ('Location', 'LocationSystem', 'Year',
                               'Activity', 'ActivityConsumedBy',
                               'ActivityProducedBy')
    # list of column headers that do exist in the df being aggregated
    column_headers = [e for e in possible_column_headers
                      if e in denom_df.columns.values.tolist()]
    merge_headers = column_headers.copy()
    column_headers.append('Denominator')
    # create subset of denominator values based on Locations and Activities
    denom_df_2 = \
        denom_df[column_headers].drop_duplicates().reset_index(drop=True)
    # merge the denominator column with fba_w_sector df
    allocation_df = df.merge(denom_df_2,
                             how='left',
                             left_on=merge_headers,
                             right_on=merge_headers)
    # calculate ratio
    allocation_df.loc[:, 'FlowAmountRatio'] = \
        allocation_df['HelperFlow'] / allocation_df['Denominator']
    allocation_df = allocation_df.drop(
        columns=['Denominator']).reset_index(drop=True)

    # where parent NAICS are not found in the allocation dataset, make sure
    # those child NAICS are not dropped
    allocation_df['FlowAmountRatio'] = \
        allocation_df['FlowAmountRatio'].fillna(1)
    # fill empty cols with NoneType
    allocation_df = replace_strings_with_NoneType(allocation_df)
    # fill na values with 0
    allocation_df['HelperFlow'] = allocation_df['HelperFlow'].fillna(0)

    return allocation_df
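# A minimal sketch of proportional_allocation_by_location_and_activity on
# toy data; the denominator comes from the shortest sector within each
# activity/location. Values are illustrative; assumes pandas as pd.
def _example_proportional_allocation_by_activity():
    import pandas as pd
    df = pd.DataFrame({
        'FlowName': ['Employment'] * 3,
        'Location': ['00000'] * 3,
        'ActivityProducedBy': ['A'] * 3,
        'SectorProducedBy': ['11', '111', '112'],
        'HelperFlow': [10.0, 6.0, 4.0]})
    # denominator is the 2-digit HelperFlow (10), so the expected
    # FlowAmountRatio values are 1.0, 0.6, and 0.4
    return proportional_allocation_by_location_and_activity(
        df, 'SectorProducedBy')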
def calculate_flowamount_diff_between_dfs(dfa_load, dfb_load):
    """
    Calculate the differences in FlowAmounts between two dfs
    :param dfa_load: df, initial df
    :param dfb_load: df, modified df
    :return: None, logs the changes in flowamounts between the 2 dfs
    """
    # subset the dataframes, only keeping data for easy
    # comparison of flowamounts
    drop_cols = ['Year', 'MeasureofSpread', 'Spread', 'DistributionType',
                 'Min', 'Max', 'DataReliability', 'DataCollection']
    # drop cols and rename, ignore error if a df does not
    # contain a column to drop
    dfa = dfa_load.drop(drop_cols, axis=1, errors='ignore').rename(
        columns={'FlowAmount': 'FlowAmount_Original'})
    dfb = dfb_load.drop(drop_cols, axis=1, errors='ignore').rename(
        columns={'FlowAmount': 'FlowAmount_Modified'})
    # create list of modified dfs
    df_list = []
    for d in [dfa, dfb]:
        # assign new column of geoscale by which to aggregate
        d = d.assign(geoscale=np.where(
            d['Location'].apply(lambda x: x.endswith('000')),
            'state', 'county'))
        d = d.assign(geoscale=np.where(
            d['Location'] == '00000', 'national', d['geoscale']))
        # ensure all nan/nones filled/match
        d = replace_strings_with_NoneType(d)
        df_list.append(d)
    # merge the two dataframes
    df = df_list[0].merge(df_list[1], how='outer')

    # determine if any new data is negative
    dfn = df[df['FlowAmount_Modified'] < 0].reset_index(drop=True)
    if len(dfn) > 0:
        vLog.info('There are negative FlowAmounts in new dataframe, '
                  'see Validation Log')
        vLogDetailed.info('Negative FlowAmounts in new dataframe: '
                          '\n {}'.format(dfn.to_string()))

    # because code will sometimes change terminology, aggregate
    # data by context and flowable to compare df differences
    # subset df
    dfs = df[['Flowable', 'Context', 'ActivityProducedBy',
              'ActivityConsumedBy', 'FlowAmount_Original',
              'FlowAmount_Modified', 'Unit', 'geoscale']]
    agg_cols = ['Flowable', 'Context', 'ActivityProducedBy',
                'ActivityConsumedBy', 'Unit', 'geoscale']
    dfagg = dfs.groupby(agg_cols, dropna=False, as_index=False).agg(
        {'FlowAmount_Original': 'sum', 'FlowAmount_Modified': 'sum'})
    # column calculating difference
    dfagg['FlowAmount_Difference'] = \
        dfagg['FlowAmount_Modified'] - dfagg['FlowAmount_Original']
    dfagg['Percent_Difference'] = \
        (dfagg['FlowAmount_Difference'] /
         dfagg['FlowAmount_Original']) * 100
    # drop rows where difference = 0
    dfagg2 = dfagg[dfagg['FlowAmount_Difference'] != 0].reset_index(drop=True)
    if len(dfagg2) == 0:
        vLogDetailed.info('No FlowAmount differences')
    else:
        # subset df and aggregate, also print out the total
        # aggregate diff at the geoscale
        dfagg3 = replace_strings_with_NoneType(dfagg).drop(
            columns=['ActivityProducedBy', 'ActivityConsumedBy',
                     'FlowAmount_Difference', 'Percent_Difference'])
        dfagg4 = dfagg3.groupby(
            ['Flowable', 'Context', 'Unit', 'geoscale'],
            dropna=False, as_index=False).agg(
            {'FlowAmount_Original': 'sum', 'FlowAmount_Modified': 'sum'})
        # column calculating difference
        dfagg4['FlowAmount_Difference'] = \
            dfagg4['FlowAmount_Modified'] - dfagg4['FlowAmount_Original']
        dfagg4['Percent_Difference'] = \
            (dfagg4['FlowAmount_Difference'] /
             dfagg4['FlowAmount_Original']) * 100
        # drop rows where difference = 0
        dfagg5 = dfagg4[
            dfagg4['FlowAmount_Difference'] != 0].reset_index(drop=True)
        vLogDetailed.info('Total FlowAmount differences between dataframes: '
                          '\n {}'.format(dfagg5.to_string(index=False)))
        # save detail output in log file
        vLogDetailed.info('Total FlowAmount differences by Activity Columns: '
                          '\n {}'.format(dfagg2.to_string(index=False)))
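# Illustrative usage sketch with hypothetical data: only the columns the
# comparison reads are included. Differences are written to the validation
# logs rather than returned.
def _demo_calculate_flowamount_diff_between_dfs():
    base = pd.DataFrame({
        'Flowable': ['Water'], 'Context': ['resource/water'],
        'ActivityProducedBy': ['A1'], 'ActivityConsumedBy': ['A2'],
        'Location': ['06000'], 'Unit': ['kg'], 'FlowAmount': [100.0],
    })
    modified = base.assign(FlowAmount=90.0)
    # logs a -10% difference for Water at the state geoscale
    calculate_flowamount_diff_between_dfs(base, modified)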
def check_for_missing_sector_data(df, target_sector_level):
    """
    Modeled after validation.py check_if_losing_sector_data
    Allocates flow amount equally across child NAICS when parent
    NAICS is not target_level
    :param df: df
    :param target_sector_level: str, final sector level of FBS
        (ex. NAICS_6)
    :return: df with missing sector level data
    """
    from flowsa.dataclean import replace_NoneType_with_empty_cells
    from flowsa.dataclean import replace_strings_with_NoneType

    # temporarily replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    activity_field = "SectorProducedBy"
    rows_lost = pd.DataFrame()
    cw_load = load_crosswalk('sector_length')
    for i in range(3, sector_level_key[target_sector_level]):
        # create df of i length
        df_subset = df.loc[df[activity_field].apply(lambda x: len(x) == i)]

        # import cw and subset to current sector length and
        # target sector length
        nlength = list(sector_level_key.keys())[
            list(sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & replace sector produced columns
        df_x = pd.merge(df_subset, cw, how='left',
                        left_on=[activity_field], right_on=[nlength])
        df_x[activity_field] = df_x[target_sector_level]
        df_x = df_x.drop(columns=[nlength, target_sector_level])

        # calculate new flow amounts, based on sector count,
        # allocating equally to the new sector length codes
        df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
        df_x = df_x.drop(columns=['sector_count'])
        # replace null values with empty cells
        df_x = replace_NoneType_with_empty_cells(df_x)

        # append to df
        sector_list = df_subset[activity_field].drop_duplicates()
        if len(df_x) != 0:
            log.warning('Data found at %s digit NAICS to be allocated: %s',
                        str(i), ' '.join(map(str, sector_list)))
            rows_lost = pd.concat([rows_lost, df_x],
                                  ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('No data loss from NAICS in dataframe')
    else:
        log.info('Allocating FlowAmounts equally to each %s',
                 target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_allocated = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_allocated = df_allocated.loc[df_allocated[activity_field].apply(
        lambda x: len(x) == sector_level_key[target_sector_level])]
    df_allocated.reset_index(drop=True, inplace=True)

    # replace empty cells with NoneType (if dtype is object)
    df_allocated = replace_strings_with_NoneType(df_allocated)

    return df_allocated
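# Illustrative usage sketch (hypothetical data; assumes flowsa's
# sector-length crosswalk is available locally): a flow reported at a
# 4-digit parent NAICS is split equally across that parent's NAICS_6
# children, and only target-level rows are returned.
def _demo_check_for_missing_sector_data():
    fbs = pd.DataFrame({
        'SectorProducedBy': ['1111'],  # 4-digit parent NAICS
        'FlowAmount': [100.0],
    })
    return check_for_missing_sector_data(fbs, 'NAICS_6')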
def sector_aggregation(df_load, group_cols):
    """
    Function that checks if a sector length exists, and if not,
    sums the less aggregated sector
    :param df_load: Either a flowbyactivity df with sectors or
        a flowbysector df
    :param group_cols: columns by which to aggregate
    :return: df, with aggregated sector values
    """
    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        # load source catalog
        cat = load_source_catalog()
        s = pd.unique(df_load['SourceName'])[0]
        # load catalog info for source
        src_info = cat[s]
        sector_like_activities = src_info['sector-like_activities']

    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # if activities are source like, drop from df and group calls,
    # add back in as copies of sector columns to keep
    if sector_like_activities:
        group_cols = [e for e in group_cols
                      if e not in ('ActivityProducedBy',
                                   'ActivityConsumedBy')]
        # subset df
        df_cols = [e for e in df.columns
                   if e not in ('ActivityProducedBy',
                                'ActivityConsumedBy')]
        df = df[df_cols]

    # find the longest length sector
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]]].apply(
        lambda x: x.str.len()).max().max()
    length = int(length)
    # for loop in reverse order longest length naics minus 1 to 2
    # appends missing naics levels to df
    for i in range(length - 1, 1, -1):
        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[
            df[fbs_activity_fields[0]].apply(
                lambda x: i + 1 >= len(x) >= i) |
            df[fbs_activity_fields[1]].apply(
                lambda x: i + 1 >= len(x) >= i)]
        # create a list of i digit sectors in df subset
        sector_subset = df_subset[
            ['Location', fbs_activity_fields[0], fbs_activity_fields[1]]
            ].drop_duplicates().reset_index(drop=True)
        df_sectors = sector_subset.copy()
        df_sectors.loc[:, 'SectorProducedBy'] = \
            df_sectors['SectorProducedBy'].apply(lambda x: x[0:i])
        df_sectors.loc[:, 'SectorConsumedBy'] = \
            df_sectors['SectorConsumedBy'].apply(lambda x: x[0:i])
        sector_list = df_sectors.drop_duplicates().values.tolist()
        # create a list of sectors that are exactly i digits long
        # where either sector column is i digits in length
        df_existing_1 = sector_subset.loc[
            (sector_subset['SectorProducedBy'].apply(
                lambda x: len(x) == i)) |
            (sector_subset['SectorConsumedBy'].apply(
                lambda x: len(x) == i))]
        # where both sector columns are i digits in length
        df_existing_2 = sector_subset.loc[
            (sector_subset['SectorProducedBy'].apply(
                lambda x: len(x) == i)) &
            (sector_subset['SectorConsumedBy'].apply(
                lambda x: len(x) == i))]
        # concat existing dfs
        df_existing = pd.concat([df_existing_1, df_existing_2], sort=False)
        existing_sectors = \
            df_existing.drop_duplicates().dropna().values.tolist()
        # list of sectors of length i that are not in sector list
        missing_sectors = \
            [e for e in sector_list if e not in existing_sectors]
        if len(missing_sectors) != 0:
            # new df of sectors that start with missing sectors.
            # drop last digit of the sector and sum flows
            # set conditions
            agg_sectors_list = []
            for q, r, s in missing_sectors:
                c1 = df_subset['Location'] == q
                c2 = df_subset[fbs_activity_fields[0]].apply(
                    lambda x: x[0:i] == r)
                c3 = df_subset[fbs_activity_fields[1]].apply(
                    lambda x: x[0:i] == s)
                # subset data
                agg_sectors_list.append(df_subset.loc[c1 & c2 & c3])
            agg_sectors = pd.concat(agg_sectors_list, sort=False)
            agg_sectors = agg_sectors.loc[
                (agg_sectors[fbs_activity_fields[0]].apply(
                    lambda x: len(x) > i)) |
                (agg_sectors[fbs_activity_fields[1]].apply(
                    lambda x: len(x) > i))]
            agg_sectors.loc[:, fbs_activity_fields[0]] = \
                agg_sectors[fbs_activity_fields[0]].apply(lambda x: x[0:i])
            agg_sectors.loc[:, fbs_activity_fields[1]] = \
                agg_sectors[fbs_activity_fields[1]].apply(lambda x: x[0:i])
            # aggregate the new sector flow amounts
            agg_sectors = aggregator(agg_sectors, group_cols)
            # append to df
            agg_sectors = replace_NoneType_with_empty_cells(agg_sectors)
            df = pd.concat([df, agg_sectors],
                           sort=False).reset_index(drop=True)

    # manually modify non-NAICS codes that might exist in sector
    # columns (domestic/household)
    df.loc[:, 'SectorConsumedBy'] = np.where(
        df['SectorConsumedBy'].isin(['F0', 'F01']),
        'F010', df['SectorConsumedBy'])
    df.loc[:, 'SectorProducedBy'] = np.where(
        df['SectorProducedBy'].isin(['F0', 'F01']),
        'F010', df['SectorProducedBy'])
    # drop any duplicates created by modifying sector codes
    df = df.drop_duplicates()
    # if activities are source-like, set col values as
    # copies of the sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)
    # replace null values
    df = replace_strings_with_NoneType(df)

    return df
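# Illustrative usage sketch (hypothetical data and grouping columns; real
# calls typically pass the FBS default grouping fields): two NAICS-3 rows
# whose 2-digit parent is absent from the df are summed into a new '11'
# row alongside the originals.
def _demo_sector_aggregation():
    fbs = pd.DataFrame({
        'Location': ['00000', '00000'],
        'SectorProducedBy': ['111', '112'],
        'SectorConsumedBy': ['', ''],
        'FlowAmount': [4.0, 6.0],
    })
    group_cols = ['Location', 'SectorProducedBy', 'SectorConsumedBy']
    return sector_aggregation(fbs, group_cols)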
def sector_disaggregation(df):
    """
    Function to disaggregate sectors to the next NAICS level when a
    sector has only one child in the crosswalk; loops from the shortest
    sector length in the df up to NAICS 6
    :param df: A FBS df, must have sector columns
    :return: A FBS df with values for the missing naics5 and naics6
    """
    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df)

    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk()

    # for loop min length to 6 digits, where min length cannot be
    # less than 2
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]]].apply(
        lambda x: x.str.len()).min().min()
    length = int(length)
    if length < 2:
        length = 2
    # appends missing naics levels to df
    for i in range(length, 6):
        sector_merge = 'NAICS_' + str(i)
        sector_add = 'NAICS_' + str(i + 1)

        # subset the crosswalk by naics length
        cw = cw_load[[sector_merge, sector_add]]
        # only keep the rows where there is only one value in sector_add
        # for a value in sector_merge
        cw = cw.drop_duplicates(
            subset=[sector_merge], keep=False).reset_index(drop=True)
        sector_list = cw[sector_merge].values.tolist()

        # subset df to sectors with length = i and length = i + 1
        df_subset = df.loc[
            df[fbs_activity_fields[0]].apply(
                lambda x: i + 1 >= len(x) >= i) |
            df[fbs_activity_fields[1]].apply(
                lambda x: i + 1 >= len(x) >= i)]
        # create new columns that are length i
        df_subset = df_subset.assign(
            SectorProduced_tmp=df_subset[fbs_activity_fields[0]].apply(
                lambda x: x[0:i]))
        df_subset = df_subset.assign(
            SectorConsumed_tmp=df_subset[fbs_activity_fields[1]].apply(
                lambda x: x[0:i]))
        # subset the df to the rows where the tmp sector columns
        # are in naics list
        df_subset_1 = df_subset.loc[
            (df_subset['SectorProduced_tmp'].isin(sector_list)) &
            (df_subset['SectorConsumed_tmp'] == "")]
        df_subset_2 = df_subset.loc[
            (df_subset['SectorProduced_tmp'] == "") &
            (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        df_subset_3 = df_subset.loc[
            (df_subset['SectorProduced_tmp'].isin(sector_list)) &
            (df_subset['SectorConsumed_tmp'].isin(sector_list))]
        # concat existing dfs
        df_subset = pd.concat(
            [df_subset_1, df_subset_2, df_subset_3], sort=False)

        # drop all rows with duplicate temp values, as a less
        # aggregated naics exists
        # list of column headers, that if exist in df, should be
        # aggregated using the weighted avg fxn
        possible_column_headers = (
            'Flowable', 'FlowName', 'Unit', 'Context', 'Compartment',
            'Location', 'Year', 'SectorProduced_tmp', 'SectorConsumed_tmp')
        # list of column headers that do exist in the df being subset
        cols_to_drop = [e for e in possible_column_headers
                        if e in df_subset.columns.values.tolist()]
        df_subset = df_subset.drop_duplicates(
            subset=cols_to_drop, keep=False).reset_index(drop=True)

        # merge the naics cw
        new_naics = pd.merge(df_subset, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorProduced_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SPB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        new_naics = pd.merge(new_naics, cw[[sector_merge, sector_add]],
                             how='left', left_on=['SectorConsumed_tmp'],
                             right_on=[sector_merge])
        new_naics = new_naics.rename(columns={sector_add: "SCB"})
        new_naics = new_naics.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics = new_naics.drop(
            columns=["SectorProducedBy", "SectorConsumedBy",
                     "SectorProduced_tmp", "SectorConsumed_tmp"])
        new_naics = new_naics.rename(
            columns={"SPB": "SectorProducedBy",
                     "SCB": "SectorConsumedBy"})

        # append new naics to df
        new_naics['SectorConsumedBy'] = \
            new_naics['SectorConsumedBy'].replace({np.nan: ""})
        new_naics['SectorProducedBy'] = \
            new_naics['SectorProducedBy'].replace({np.nan: ""})
        new_naics = replace_NoneType_with_empty_cells(new_naics)
        df = pd.concat([df, new_naics], sort=True)

    # replace blank strings with None
    df = replace_strings_with_NoneType(df)

    return df
def update_naics_crosswalk():
    """
    Update the useeior crosswalk with crosswalks created for flowsa
    datasets - want to add any NAICS > 6 digits, as well as NAICS 2002
    :return: None, writes the updated crosswalk to csv
    """
    # read useeior master crosswalk, subset NAICS columns
    naics_load = import_useeior_mastercrosswalk()
    naics = naics_load[['NAICS_2007_Code', 'NAICS_2012_Code',
                        'NAICS_2017_Code']
                       ].drop_duplicates().reset_index(drop=True)
    # convert all rows to string
    naics = naics.astype(str)
    # ensure all None are NoneType
    naics = replace_strings_with_NoneType(naics)
    # drop rows where all None
    naics = naics.dropna(how='all')

    # drop naics > 6 in mastercrosswalk (all manufacturing) because
    # unused and slows functions
    naics = naics[naics['NAICS_2012_Code'].apply(
        lambda x: len(x) < 7)].reset_index(drop=True)

    # find any NAICS where length > 6 that are used for allocation
    # purposes and add to naics list
    missing_naics_df_list = []
    # read in all the crosswalk csv files (ends in toNAICS.csv)
    for file_name in glob.glob(datapath + "activitytosectormapping/" +
                               '*_toNAICS.csv'):
        # skip Statistics Canada GDP because not all sectors relevant
        if 'StatCan_GDP' not in file_name:
            df = pd.read_csv(file_name, low_memory=False, dtype=str)
            # convert all rows to string
            df = df.astype(str)
            # determine sector year, assuming a single sector source
            # name per mapping file
            naics_year = df['SectorSourceName'].unique()[0]
            # subset dataframe so only sector
            df = df[['Sector']].copy()
            # trim whitespace and cast as string, rename column
            df['Sector'] = df['Sector'].astype(str).str.strip()
            df = df.rename(columns={'Sector': naics_year})
            # extract sector year column from master crosswalk
            df_naics = naics[[naics_year]]
            # find any NAICS that are in source crosswalk but not
            # in mastercrosswalk
            common = df.merge(df_naics, on=naics_year)
            missing_naics = df[~df[naics_year].isin(common[naics_year])]
            # extract sectors where len > 6 and that do not include a '-'
            missing_naics = missing_naics[missing_naics[naics_year].apply(
                lambda x: len(x) > 6)]
            if len(missing_naics) != 0:
                missing_naics = missing_naics[
                    ~missing_naics[naics_year].str.contains('-')]
                # append to df list
                missing_naics_df_list.append(missing_naics)
    # concat df list and drop duplications
    missing_naics_df = pd.concat(
        missing_naics_df_list, ignore_index=True,
        sort=False).drop_duplicates().reset_index(drop=True)
    # sort df
    missing_naics_df = missing_naics_df.sort_values(['NAICS_2012_Code'])
    missing_naics_df = missing_naics_df.reset_index(drop=True)

    # add missing naics to master naics crosswalk
    total_naics = pd.concat([naics, missing_naics_df], ignore_index=True)

    # sort df
    total_naics = total_naics.sort_values(
        ['NAICS_2012_Code', 'NAICS_2007_Code']).drop_duplicates()
    total_naics = total_naics[
        ~total_naics['NAICS_2012_Code'].isin(
            ['None', 'unknown', 'nan', 'Unknown',
             np.nan])].reset_index(drop=True)

    # convert all columns to string
    total_naics = total_naics.astype(str)

    # add naics 2002
    naics_02 = load_naics_02_to_07_crosswalk()
    naics_cw = pd.merge(total_naics, naics_02, how='left')

    # ensure NoneType
    naics_cw = replace_strings_with_NoneType(naics_cw)

    # reorder
    naics_cw = naics_cw[['NAICS_2002_Code', 'NAICS_2007_Code',
                         'NAICS_2012_Code', 'NAICS_2017_Code']]

    # save as csv
    naics_cw.to_csv(datapath + "NAICS_Crosswalk.csv", index=False)
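# Usage sketch: rebuilds NAICS_Crosswalk.csv under the flowsa data
# directory from the useeior master crosswalk plus any >6-digit sectors
# found in the *_toNAICS.csv activity-to-sector mappings; run as a
# one-off maintenance step after adding or editing a mapping file, e.g.:
#
#     update_naics_crosswalk()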
def sector_aggregation(df_load, group_cols):
    """
    Function that checks if a sector length exists, and if not,
    sums the less aggregated sector
    :param df_load: Either a flowbyactivity df with sectors or
        a flowbysector df
    :param group_cols: columns by which to aggregate
    :return: df, with aggregated sector values
    """
    # ensure None values are not strings
    df = replace_NoneType_with_empty_cells(df_load)

    # determine if activities are sector-like,
    # if aggregating a df with a 'SourceName'
    sector_like_activities = False
    if 'SourceName' in df_load.columns:
        s = pd.unique(df_load['SourceName'])[0]
        sector_like_activities = check_activities_sector_like(s)

    # if activities are source like, drop from df and group calls,
    # add back in as copies of sector columns to keep
    if sector_like_activities:
        group_cols = [e for e in group_cols
                      if e not in ('ActivityProducedBy',
                                   'ActivityConsumedBy')]
        # subset df
        df_cols = [e for e in df.columns
                   if e not in ('ActivityProducedBy',
                                'ActivityConsumedBy')]
        df = df[df_cols]

    # find the longest length sector
    length = df[[fbs_activity_fields[0], fbs_activity_fields[1]]].apply(
        lambda x: x.str.len()).max().max()
    length = int(length)
    # loop in reverse order from the longest sector length down to 3;
    # appends missing naics levels to df
    for i in range(length, 2, -1):
        # df where either sector column is length i
        df1 = df[(df['SectorProducedBy'].apply(lambda x: len(x) == i)) |
                 (df['SectorConsumedBy'].apply(lambda x: len(x) == i))]
        # add new columns dropping last digit of sectors
        df1 = df1.assign(
            SPB=df1['SectorProducedBy'].apply(lambda x: x[0:i - 1]))
        df1 = df1.assign(
            SCB=df1['SectorConsumedBy'].apply(lambda x: x[0:i - 1]))
        # second dataframe where either sector column is length i - 1
        df2 = df[(df['SectorProducedBy'].apply(lambda x: len(x) == i - 1)) |
                 (df['SectorConsumedBy'].apply(lambda x: len(x) == i - 1))
                 ].rename(columns={'SectorProducedBy': 'SPB',
                                   'SectorConsumedBy': 'SCB'})
        # merge the dfs on the string (object) columns
        merge_cols = [col for col in df2.columns
                      if hasattr(df2[col], 'str')]
        # also drop activity and description cols
        merge_cols = [c for c in merge_cols
                      if c not in ['ActivityConsumedBy',
                                   'ActivityProducedBy', 'Description']]
        if len(df2) > 0:
            # anti-join: keep df1 rows whose parent sectors are not
            # already present in df2
            dfm = df1.merge(df2[merge_cols], how='outer', on=merge_cols,
                            indicator=True).query(
                '_merge=="left_only"').drop('_merge', axis=1)
        else:
            dfm = df1.copy(deep=True)

        if len(dfm) > 0:
            # replace the SPB and SCB columns, then aggregate
            # and add to df
            dfm['SectorProducedBy'] = dfm['SPB']
            dfm['SectorConsumedBy'] = dfm['SCB']
            dfm = dfm.drop(columns=(['SPB', 'SCB']))
            # aggregate the new sector flow amounts
            agg_sectors = aggregator(dfm, group_cols)
            # append to df
            agg_sectors = replace_NoneType_with_empty_cells(agg_sectors)
            df = pd.concat([df, agg_sectors],
                           sort=False).reset_index(drop=True)
    df = df.drop_duplicates()

    # if activities are source-like, set col values as
    # copies of the sector columns
    if sector_like_activities:
        df = df.assign(ActivityProducedBy=df['SectorProducedBy'])
        df = df.assign(ActivityConsumedBy=df['SectorConsumedBy'])
        # reindex columns
        df = df.reindex(df_load.columns, axis=1)
    # replace null values
    df = replace_strings_with_NoneType(df).reset_index(drop=True)

    return df
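# The merge with indicator=True followed by query('_merge=="left_only"')
# above is a pandas anti-join. A minimal standalone illustration with
# hypothetical frames:
def _demo_anti_join():
    left = pd.DataFrame({'k': ['11', '12', '13']})
    right = pd.DataFrame({'k': ['12', '13']})
    only_left = (left.merge(right, how='outer', on='k', indicator=True)
                 .query('_merge == "left_only"')
                 .drop('_merge', axis=1))
    return only_left  # contains only the row with k == '11'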