def check_if_data_exists_for_same_geoscales(fba_wsec_walloc, source, activity):
    """
    Determine if data exists at the same geoscales for the datasource and
    the allocation source
    :param fba_wsec_walloc: df, flowbyactivity with sectors and merged
        allocation data
    :param source: str, datasource name
    :param activity: list, activity names of interest
    :return: None; logs the result
    """
    # todo: modify so only returns warning if no value for entire location,
    #  not just no value for one of the possible sectors

    from flowsa.mapping import get_activitytosector_mapping

    # create list of highest sector level for which there should be data
    mapping = get_activitytosector_mapping(source)
    # filter by activity of interest
    mapping = mapping.loc[mapping['Activity'].isin(activity)]
    # add sectors to list
    sectors_list = pd.unique(mapping['Sector']).tolist()

    # subset fba w sectors and with merged allocation table so only have
    # rows with aggregated sector list
    df_subset = fba_wsec_walloc.loc[
        (fba_wsec_walloc[fbs_activity_fields[0]].isin(sectors_list)) |
        (fba_wsec_walloc[fbs_activity_fields[1]].isin(sectors_list))
        ].reset_index(drop=True)
    # only interested in total flows
    # df_subset = df_subset.loc[
    #     df_subset['FlowName'] == 'total'].reset_index(drop=True)
    # df_subset = df_subset.loc[
    #     df_subset['Compartment'] == 'total'].reset_index(drop=True)

    # create subset of fba where the allocation data is missing
    missing_alloc = df_subset.loc[
        df_subset['FlowAmountRatio'].isna()].reset_index(drop=True)
    # drop any rows where source flow value = 0
    missing_alloc = missing_alloc.loc[
        missing_alloc['FlowAmount'] != 0].reset_index(drop=True)
    # create list of locations with missing allocation data
    states_missing_data = pd.unique(missing_alloc['Location']).tolist()

    if len(missing_alloc) == 0:
        log.info("All aggregated sector flows have allocation flow ratio data")
    else:
        log.warning("Missing allocation flow ratio data for " +
                    ', '.join(states_missing_data))

    return None
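

# A minimal usage sketch (hypothetical inputs; assumes the caller has
# already built a flowbyactivity df with sector columns and merged-in
# allocation ratios, so 'FlowAmountRatio' and 'FlowAmount' exist):
#
#   fba_wsec_walloc = ...  # fba with sectors + allocation ratios merged in
#   check_if_data_exists_for_same_geoscales(fba_wsec_walloc,
#                                           source='USGS_NWIS_WU',
#                                           activity=['Irrigation Crop'])
#   # logs info if every aggregated sector flow has a FlowAmountRatio,
#   # otherwise warns and lists the locations missing allocation data
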
def geoscale_flow_comparison(flowclass, years, datasource,
                             activitynames=['all'], to_scale='national'):
    """
    Aggregate county data to the state and national levels, and state data
    to the national level, to allow comparison of flow totals for a given
    flowclass and industry across geoscales. Flownames are first assigned
    to NAICS sectors and units are standardized. Sectors are used for
    aggregation rather than FlowNames to negate any changes in flownames
    across time/geoscale.
    """
    # load parquet file for checking aggregation
    flows = flowsa.getFlowByActivity(flowclass=flowclass, years=years,
                                     datasource=datasource)
    # fill null values
    flows = flows.fillna(value=fba_fill_na_dict)
    # convert units
    flows = convert_unit(flows)

    # if activitynames is the default, compare aggregation for all
    # activities; otherwise subset to the activities of interest
    if activitynames == ['all']:
        flow_subset = flows.copy()
    else:
        flow_subset = flows[
            (flows[fba_activity_fields[0]].isin(activitynames)) |
            (flows[fba_activity_fields[1]].isin(activitynames))]

    # reset index values after subset
    flow_subset = flow_subset.reset_index(drop=True)

    # pull naics crosswalk for the (single) source in the subset
    mapping = get_activitytosector_mapping(
        flow_subset['SourceName'].unique()[0])

    # assign naics to activities
    # usgs datasource is not easily assigned to naics for checking totals,
    # so instead standardize activity names
    if datasource == 'USGS_NWIS_WU':
        flow_subset = standardize_usgs_nwis_names(flow_subset)
    else:
        flow_subset = pd.merge(flow_subset, mapping[['Activity', 'Sector']],
                               left_on='ActivityProducedBy',
                               right_on='Activity', how='left').rename(
            columns={'Sector': 'SectorProducedBy'})
        flow_subset = pd.merge(flow_subset, mapping[['Activity', 'Sector']],
                               left_on='ActivityConsumedBy',
                               right_on='Activity', how='left').rename(
            columns={'Sector': 'SectorConsumedBy'})
    flow_subset = flow_subset.drop(columns=['ActivityProducedBy',
                                            'ActivityConsumedBy',
                                            'Activity_x', 'Activity_y',
                                            'Description'], errors='ignore')
    flow_subset['SectorProducedBy'] = flow_subset['SectorProducedBy'].replace(
        {np.nan: None}).astype(str)
    flow_subset['SectorConsumedBy'] = flow_subset['SectorConsumedBy'].replace(
        {np.nan: None}).astype(str)

    # create list of geoscales for aggregation
    if to_scale == 'national':
        geoscales = ['national', 'state', 'county']
    elif to_scale == 'state':
        geoscales = ['state', 'county']
    else:
        raise ValueError("to_scale must be 'national' or 'state'")

    # create empty df list
    flow_dfs = []
    for i in geoscales:
        try:
            # filter by geoscale
            fba_from_scale = filter_by_geoscale(flow_subset, i)

            # swap activity columns for sector columns in the grouping fields
            group_cols = fba_default_grouping_fields.copy()
            for j in ['Location', 'ActivityProducedBy', 'ActivityConsumedBy']:
                group_cols.remove(j)
            for j in ['SectorProducedBy', 'SectorConsumedBy']:
                group_cols.append(j)

            # county sums to state and national, state sums to national
            if to_scale == 'state':
                fba_from_scale['Location'] = \
                    fba_from_scale['Location'].apply(lambda x: str(x[0:2]))
            elif to_scale == 'national':
                fba_from_scale['Location'] = US_FIPS

            # aggregate
            fba_agg = aggregator(fba_from_scale, group_cols)

            # rename flowamount column, based on geoscale
            fba_agg = fba_agg.rename(
                columns={"FlowAmount": "FlowAmount_" + i})

            # drop fields irrelevant to aggregated flow comparison
            drop_fields = ['MeasureofSpread', 'Spread', 'DistributionType',
                           'DataReliability', 'DataCollection']
            fba_agg = fba_agg.drop(columns=drop_fields, errors='ignore')

            # reset index
            fba_agg = fba_agg.reset_index(drop=True)

            flow_dfs.append(fba_agg)
        except Exception:
            # data not available at this geoscale; skip it
            pass

    # merge list of dfs by column
    flow_comparison = reduce(
        lambda left, right: pd.merge(
            left, right,
            on=['Class', 'SourceName', 'FlowName', 'Unit',
                'SectorProducedBy', 'SectorConsumedBy', 'Compartment',
                'Location', 'LocationSystem', 'Year'],
            how='outer'), flow_dfs)

    # sort df
    flow_comparison = flow_comparison.sort_values(
        ['Year', 'Location', 'SectorProducedBy', 'SectorConsumedBy',
         'FlowName', 'Compartment'])

    return flow_comparison
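

# A minimal usage sketch (hypothetical argument values; the list-style
# flowclass/years mirror the flowsa.getFlowByActivity call above):
#
#   comparison = geoscale_flow_comparison(flowclass=['Water'], years=[2015],
#                                         datasource='USGS_NWIS_WU',
#                                         to_scale='national')
#   # yields one row per sector/flow/location with FlowAmount_national,
#   # FlowAmount_state, and FlowAmount_county columns side by side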