def bh_parse(**kwargs): """ Combine, parse, and format the provided dataframes :param kwargs: potential arguments include: dataframe_list: list of dataframes to concat and format args: dictionary, used to run flowbyactivity.py ('year' and 'source') :return: df, parsed and partially formatted to flowbyactivity specifications """ # load arguments necessary for function dataframe_list = kwargs['dataframe_list'] # concat list of dataframes (info on each page) df = pd.concat(dataframe_list, sort=False) df = df.rename( columns={ "I-O code": "ActivityConsumedBy", "I-O description": "Description", "gal/$M": "FlowAmount", }) # hardcode df.loc[:, 'FlowAmount'] = df[ 'FlowAmount'] / 1000000 # original data in gal/million usd df['Unit'] = 'gal/USD' df['SourceName'] = 'Blackhurst_IO' df['Class'] = 'Water' df['FlowName'] = 'Water Withdrawals IO Vector' df['Location'] = US_FIPS df = assign_fips_location_system(df, '2002') df['Year'] = '2002' df['DataReliability'] = 5 # tmp df['DataCollection'] = 5 #tmp return df
def epa_cddpath_parse(*, df_list, year, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :param args: dictionary, used to run flowbyactivity.py ('year' and 'source') :return: df, parsed and partially formatted to flowbyactivity specifications """ # concat list of dataframes (info on each page) df = pd.concat(df_list, sort=False) # hardcode df['Class'] = 'Other' # confirm this df['SourceName'] = 'EPA_CDDPath' # confirm this df['Unit'] = 'short tons' df['FlowType'] = 'WASTE_FLOW' df.loc[df['ActivityProducedBy'].isna(), 'ActivityProducedBy'] = 'Buildings' # df['Compartment'] = 'waste' # confirm this df['Location'] = US_FIPS df = assign_fips_location_system(df, year) df['Year'] = year # df['MeasureofSpread'] = "NA" # none available df['DataReliability'] = 5 # confirm this df['DataCollection'] = 5 # confirm this return df
def bh_parse(*, df_list, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :return: df, parsed and partially formatted to flowbyactivity specifications """ # concat list of dataframes (info on each page) df = pd.concat(df_list, sort=False) df = df.rename( columns={ "I-O code": "ActivityConsumedBy", "I-O description": "Description", "gal/$M": "FlowAmount", }) # hardcode # original data in gal/million usd df.loc[:, 'FlowAmount'] = df['FlowAmount'] / 1000000 df['Unit'] = 'gal/USD' df['SourceName'] = 'Blackhurst_IO' df['Class'] = 'Water' df['FlowName'] = 'Water Withdrawals IO Vector' df['Location'] = US_FIPS df = assign_fips_location_system(df, '2002') df['Year'] = '2002' df['DataReliability'] = 5 # tmp df['DataCollection'] = 5 # tmp return df
def eia_cbecs_water_parse(dataframe_list, args): # concat dataframes df = pd.concat(dataframe_list, sort=False).dropna() # drop columns df = df.drop(columns=["Distribution of building 25th", "Distribution of building Median", "Distribution of building 75th"]) # use "melt" fxn to convert colummns into rows df = df.melt(id_vars=["PBA"], var_name="FlowName", value_name="FlowAmount") # rename column(s) df = df.rename(columns={'PBA': 'ActivityConsumedBy'}) # replace withdrawn code df.loc[df['FlowAmount'] == "Q", 'FlowAmount'] = withdrawn_keyword # add unit based on flowname df.loc[df['FlowName'] == 'Number of Buildings', 'Unit'] = 'p' df.loc[df['FlowName'] == "Total Floor Space", 'Unit'] = 'million square feet' df.loc[df['FlowName'] == "Total Consumption", 'Unit'] = 'billion gallons' df.loc[df['FlowName'] == "Consumption per Building", 'Unit'] = 'thousand gallons' df.loc[df['FlowName'] == "Consumption per square foot", 'Unit'] = 'gallons' df.loc[df['FlowName'] == "Consumption per worker", 'Unit'] = 'thousand gallons' # class type based on flowname/unit df["Class"] = 'Water' df.loc[df['FlowName'] == 'Number of Buildings', 'Class'] = 'Other' df.loc[df['FlowName'] == "Total Floor Space", 'Class'] = 'Other' # add location system based on year of data df = assign_fips_location_system(df, args['year']) # hardcode columns df["SourceName"] = 'EIA_CBECS_Water' df['Year'] = args["year"] df['Location'] = US_FIPS return df
def census_pop_parse(dataframe_list, args): # concat dataframes df = pd.concat(dataframe_list, sort=False) # Add year df['Year'] = args["year"] # drop puerto rico df = df[df['state'] != '72'] # replace null county cells with '000' df['county'] = df['county'].fillna('000') # Make FIPS as a combo of state and county codes df['Location'] = df['state'] + df['county'] # replace the null value representing the US with US fips df.loc[df['us'] == '1', 'Location'] = US_FIPS # drop columns df = df.drop(columns=['state', 'county', 'us']) # rename columns df = df.rename(columns={"POP": "FlowAmount"}) # add location system based on year of data df = assign_fips_location_system(df, args['year']) # hardcode dta df['Class'] = 'Other' df['SourceName'] = 'Census_PEP_Population' df['FlowName'] = 'Population' df['Unit'] = 'p' df['ActivityConsumedBy'] = 'All' # temporary data quality scores df['DataReliability'] = None df['DataCollection'] = None # sort df df = df.sort_values(['Location']) # reset index df.reset_index(drop=True, inplace=True) return df
def usgs_coef_parse(*, year, **_): """ Combine, parse, and format the provided dataframes :param year: year :return: df, parsed and partially formatted to flowbyactivity specifications """ # Read directly into a pandas df df_raw = pd.read_csv(externaldatapath + "USGS_WU_Coef_Raw.csv") # rename columns to match flowbyactivity format df = df_raw.rename(columns={"Animal Type": "ActivityConsumedBy", "WUC_Median": "FlowAmount", "WUC_Minimum": "Min", "WUC_Maximum": "Max" }) # drop columns df = df.drop(columns=["WUC_25th_Percentile", "WUC_75th_Percentile"]) # hardcode data df["Class"] = "Water" df["SourceName"] = "USGS_WU_Coef" df["Location"] = US_FIPS df['Year'] = year df = assign_fips_location_system(df, '2005') df["Unit"] = "gallons/animal/day" df['DataReliability'] = 5 # tmp df['DataCollection'] = 5 # tmp return df
def bea_make_parse(dataframe_list, args): # concat dataframes - tmp load from uploaded csv # df = pd.concat(dataframe_list, sort=False) df_load = pd.read_csv(datapath + "BEA_Make_Table_after_Redef_2002.csv", dtype="str") # strip whitespace df = df_load.apply(lambda x: x.str.strip()) # drop rows of data df = df[df['Industry'] == df['Commodity']].reset_index(drop=True) # drop columns df = df.drop(columns=['Commodity', 'CommodityDescription']) # rename columns df = df.rename( columns={ 'Industry': 'ActivityProducedBy', 'IndustryDescription': 'Description', 'ProVal': 'FlowAmount', 'IOYear': 'Year' }) df.loc[:, 'FlowAmount'] = df['FlowAmount'].astype(float) * 1000000 # hard code data df['Class'] = 'Money' df['SourceName'] = 'BEA_Make_Table' df['Unit'] = 'USD' df['Location'] = US_FIPS df = assign_fips_location_system(df, args['year']) df['FlowName'] = 'Gross Output Producer Value After Redef' return df
def usgs_ma_parse(*, df_list, source, year, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :param args: dictionary, used to run flowbyactivity.py ('year' and 'source') :return: df, parsed and partially formatted to flowbyactivity specifications """ data = {} row_to_use = ["Silicon carbide"] name = usgs_myb_name(source) des = name dataframe = pd.DataFrame() for df in df_list: for index, row in df.iterrows(): remove_digits = str.maketrans('', '', digits) product = df.iloc[index][ "Product"].strip().translate(remove_digits) if product in row_to_use: data = usgs_myb_static_varaibles() data["SourceName"] = source data["Year"] = str(year) data['FlowName'] = "Silicon carbide" data["ActivityProducedBy"] = "Silicon carbide" data["Unit"] = "Metric Tons" col_name = "quality_" + usgs_myb_year(SPAN_YEARS, year) col_name_array = col_name.split("_") data["Description"] = product + " " + col_name_array[0] data["FlowAmount"] = str(df.iloc[index][col_name]) dataframe = dataframe.append(data, ignore_index=True) dataframe = assign_fips_location_system( dataframe, str(year)) return dataframe
def census_vip_parse(*, df_list, year, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :param args: dictionary, used to run flowbyactivity.py ('year' and 'source') :return: df, parsed and partially formatted to flowbyactivity specifications """ df = pd.concat(df_list, sort=False) df['Year'] = df['Year'].astype(str) df = df[df['Year'] == year].reset_index(drop=True) df = df.rename(columns={'Type of Construction:': 'ActivityProducedBy'}) df['Class'] = 'Money' df['SourceName'] = 'Census_VIP' df['FlowName'] = 'Construction spending' # millions of dollars df['FlowAmount'] = df['FlowAmount'] * 1000000 df['Unit'] = 'USD' df['FlowType'] = "ELEMENTARY_FLOW" df['Compartment'] = None df['Location'] = US_FIPS df = assign_fips_location_system(df, year) df['Year'] = year # Add tmp DQ scores df['DataReliability'] = 5 df['DataCollection'] = 5 return df
def noaa_parse(*, year, **_): """ Combine, parse, and format the provided dataframes :param year: year :return: df, parsed and partially formatted to flowbyactivity specifications """ # Read directly into a pandas df df_raw = pd.read_csv(externaldatapath + "foss_landings.csv") # read state fips from common.py df_state = get_state_FIPS().reset_index(drop=True) df_state['State'] = df_state["State"].str.lower() # modify fish state names to match those from common df = df_raw.drop('Sum Pounds', axis=1) df['State'] = df["State"].str.lower() # filter by year df = df[df['Year'] == int(year)] # noaa differentiates between florida east and west, # which is not necessary for our purposes df['State'] = df['State'].str.replace(r'-east', '') df['State'] = df['State'].str.replace(r'-west', '') # sum florida data after casting rows as numeric df['Sum Dollars'] = df['Sum Dollars'].str.replace(r',', '') df["Sum Dollars"] = df["Sum Dollars"].apply(pd.to_numeric) df2 = df.groupby(['Year', 'State'], as_index=False).agg({"Sum Dollars": sum}) # new column includes state fips df3 = df2.merge(df_state[["State", "FIPS"]], how="left", left_on="State", right_on="State") # data includes "process at sea", which is not associated with any # fips, assign value of '99' if fips is nan, add the state name to # description and drop state name df3['Description'] = None df3.loc[df3['State'] == 'process at sea', 'Description'] = df3['State'] df3.loc[df3['State'] == 'process at sea', 'FIPS'] = 99 df4 = df3.drop('State', axis=1) # rename columns to match flowbyactivity format df4 = df4.rename(columns={"Sum Dollars": "FlowAmount", "FIPS": "Location"}) # hardcode data df4["Class"] = "Money" df4["SourceName"] = "NOAA_Landings" df4["FlowName"] = None df4 = assign_fips_location_system(df4, year) df4["Unit"] = "$" df4["ActivityProducedBy"] = "All Species" df4['DataReliability'] = 5 # tmp df4['DataCollection'] = 5 # tmp return df4
def eia_cbecs_land_parse(dataframe_list, args): """ :param dataframe_list: :param args: :return: """ # concat dataframes df_array = [] for dataframes in dataframe_list: # rename column(s) dataframes = dataframes.rename(columns={'Name': 'ActivityConsumedBy'}) if "Location" not in list(dataframes): dataframes["Location"] = US_FIPS dataframes = assign_fips_location_system(dataframes, args['year']) dataframes = dataframes.drop( dataframes[dataframes.Description == "Any elevators"].index) dataframes["Description"] = dataframes["Description"].apply( lambda x: x if 'All buildings' in x else x + " floors") else: dataframes = dataframes.drop(dataframes[ dataframes.ActivityConsumedBy == "Before 1920"].index) # rename location dataframes["Name"] = dataframes["Location"] + ' Division' dcodes = get_region_and_division_codes() dataframes = dataframes.merge( dcodes[['Division', 'Name', 'LocationSystem']], how='left') dataframes["Description"] = "All buildings" dataframes['Location'] = dataframes['Division'].replace( float("NaN"), US_FIPS) dataframes.loc[dataframes.Location == US_FIPS, "LocationSystem"] = "FIPS_2010" dataframes = dataframes.drop(columns=['Division', 'Name']) df_array.append(dataframes) df = pd.concat(df_array, sort=False, ignore_index=True) # trim whitespace and standardize Activity names df['ActivityConsumedBy'] = df['ActivityConsumedBy'].str.strip() df = standardize_eia_cbecs_land_activity_names( df, column_to_standardize='ActivityConsumedBy') # replace withdrawn code df.loc[df['FlowAmount'] == "Q", 'FlowAmount'] = withdrawn_keyword df.loc[df['FlowAmount'] == "N", 'FlowAmount'] = withdrawn_keyword df["Class"] = 'Land' df["SourceName"] = 'EIA_CBECS_Land' df['Year'] = args["year"] df['FlowName'] = "Commercial, " + df[ "ActivityConsumedBy"] + ", Total floorspace" df['Compartment'] = 'ground' df['Unit'] = "million square feet" df['MeasureofSpread'] = "RSE" df['FlowType'] = 'ELEMENTARY_FLOW' # drop any duplicates that arise from joining multiple excel files df = df.drop_duplicates() return df
def epa_nei_global_parse(dataframe_list, args): """ Modifies the raw data to meet the flowbyactivity criteria. Renames certain column headers to match flowbyactivity format. Adds a few additional columns with hardcoded data. Deletes all unnecessary columns. """ df = pd.concat(dataframe_list, sort=True) # rename columns to match flowbyactivity format if args['year'] == '2017': df = df.rename(columns={"pollutant desc": "FlowName", "total emissions": "FlowAmount", "scc": "ActivityProducedBy", "fips code": "Location", "emissions uom":"Unit", "pollutant code": "Description"}) elif args['year'] == '2014': df = df.rename(columns={"pollutant_desc": "FlowName", "total_emissions": "FlowAmount", "scc": "ActivityProducedBy", "state_and_county_fips_code": "Location", "uom":"Unit", "pollutant_cd": "Description"}) elif args['year'] == '2011' or args['year'] == '2008': df = df.rename(columns={"description": "FlowName", "total_emissions": "FlowAmount", "scc": "ActivityProducedBy", "state_and_county_fips_code": "Location", "uom":"Unit", "pollutant_cd": "Description"}) # make sure FIPS are string and 5 digits df['Location']=df['Location'].astype('str').apply('{:0>5}'.format) # remove records from certain FIPS excluded_fips = ['78','85','88'] df = df[~df['Location'].str[0:2].isin(excluded_fips)] excluded_fips2 = ['777'] df = df[~df['Location'].str[-3:].isin(excluded_fips2)] # drop all other columns df.drop(df.columns.difference(['FlowName', 'FlowAmount', 'ActivityProducedBy', 'Location', 'Unit', 'Description']), 1, inplace=True) # add hardcoded data df['FlowType']="ELEMENTARY_FLOW" df['Class']="Chemicals" df['SourceName'] = args['source'] df['Compartment'] = "air" df['Year'] = args['year'] df = assign_fips_location_system(df, args['year']) return df
def census_cbp_parse(*, df_list, year, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :param year: year :return: df, parsed and partially formatted to flowbyactivity specifications """ # concat dataframes df = pd.concat(df_list, sort=False) # Add year df['Year'] = year # convert county='999' to line for full state df.loc[df['county'] == '999', 'county'] = '000' # Make FIPS as a combo of state and county codes df['Location'] = df['state'] + df['county'] # now drop them df = df.drop(columns=['state', 'county']) # rename NAICS column and add NAICS year as description if 'NAICS2007' in df.columns: df = df.rename(columns={"NAICS2007": "ActivityProducedBy"}) df['Description'] = 'NAICS2007' if 'NAICS2012' in df.columns: df = df.rename(columns={"NAICS2012": "ActivityProducedBy"}) df['Description'] = 'NAICS2012' if 'NAICS2017' in df.columns: df = df.rename(columns={"NAICS2017": "ActivityProducedBy"}) df['Description'] = 'NAICS2017' # drop all sectors record df = df[df['ActivityProducedBy'] != "00"] # rename columns df = df.rename( columns={ 'ESTAB': 'Number of establishments', 'EMP': 'Number of employees', 'PAYANN': 'Annual payroll' }) # use "melt" fxn to convert colummns into rows df = df.melt( id_vars=["Location", "ActivityProducedBy", "Year", "Description"], var_name="FlowName", value_name="FlowAmount") # specify unit based on flowname df['Unit'] = np.where(df["FlowName"] == 'Annual payroll', "USD", "p") # specify class df.loc[df['FlowName'] == 'Number of employees', 'Class'] = 'Employment' df.loc[df['FlowName'] == 'Number of establishments', 'Class'] = 'Other' df.loc[df['FlowName'] == 'Annual payroll', 'Class'] = 'Money' # add location system based on year of data df = assign_fips_location_system(df, year) # hard code data df['SourceName'] = 'Census_CBP' # Add tmp DQ scores df['DataReliability'] = 5 df['DataCollection'] = 5 df['Compartment'] = None return df
def usgs_zinc_parse(*, df_list, source, year, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :param source: source :param year: year :return: df, parsed and partially formatted to flowbyactivity specifications """ data = {} row_to_use = ["Quantity", "Ores and concentrates, zinc content", "United States"] import_export = ["Exports:", "Imports for consumption:", "Recoverable zinc:"] prod = "" name = usgs_myb_name(source) dataframe = pd.DataFrame() for df in df_list: for index, row in df.iterrows(): if df.iloc[index]["Production"].strip() == "Exports:": prod = "exports" elif df.iloc[index]["Production"].strip() == \ "Imports for consumption:": prod = "imports" elif df.iloc[index]["Production"].strip() == "Recoverable zinc:": prod = "production" elif df.iloc[index]["Production"].strip() == "United States": prod = "production" if df.iloc[index]["Production"].strip() in row_to_use: product = df.iloc[index]["Production"].strip() data = usgs_myb_static_varaibles() data["SourceName"] = source data["Year"] = str(year) data["Unit"] = "Metric Tons" col_name = usgs_myb_year(SPAN_YEARS, year) data["FlowAmount"] = str(df.iloc[index][col_name]) if product.strip() == "Quantity": data["Description"] = "zinc in concentrate" data["ActivityProducedBy"] = "zinc in concentrate " data['FlowName'] = "zinc in concentrate " + prod elif product.strip() == "Ores and concentrates, zinc content": data["Description"] = "Ores and concentrates, zinc content" data["ActivityProducedBy"] = \ "Ores and concentrates, zinc content" data['FlowName'] = \ "Ores and concentrates, zinc content " + prod elif product.strip() == "United States": data["Description"] = "Zinc; Mine" data["ActivityProducedBy"] = name + " " + prod data['FlowName'] = "Zinc; Mine" dataframe = dataframe.append(data, ignore_index=True) dataframe = assign_fips_location_system( dataframe, str(year)) return dataframe
def usgs_platinum_parse(*, df_list, source, year, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :param source: source :param year: year :return: df, parsed and partially formatted to flowbyactivity specifications """ data = {} row_to_use = [ "Quantity", "Palladium, Pd content", "Platinum, includes coins, Pt content", "Platinum, Pt content", "Iridium, Ir content", "Osmium, Os content", "Rhodium, Rh content", "Ruthenium, Ru content", "Iridium, osmium, and ruthenium, gross weight", "Rhodium, Rh content" ] dataframe = pd.DataFrame() for df in df_list: previous_name = "" for index, row in df.iterrows(): if df.iloc[index]["Production"].strip() == "Exports, refined:": product = "exports" elif df.iloc[index]["Production"].strip() == \ "Imports for consumption, refined:": product = "imports" elif df.iloc[index]["Production"].strip() == "Mine production:2": product = "production" name_array = df.iloc[index]["Production"].strip().split(",") if product == "production": name_array = previous_name.split(",") previous_name = df.iloc[index]["Production"].strip() name = name_array[0] if df.iloc[index]["Production"].strip() in row_to_use: data = usgs_myb_static_varaibles() data["SourceName"] = source data["Year"] = str(year) data["Unit"] = "kilograms" data['FlowName'] = name + " " + product data["Description"] = name data["ActivityProducedBy"] = name col_name = usgs_myb_year(SPAN_YEARS, year) if str(df.iloc[index][col_name]) == "--": data["FlowAmount"] = str(0) else: data["FlowAmount"] = str(df.iloc[index][col_name]) dataframe = dataframe.append(data, ignore_index=True) dataframe = assign_fips_location_system(dataframe, str(year)) return dataframe
def usgs_fluorspar_parse(*, df_list, source, year, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :param args: dictionary, used to run flowbyactivity.py ('year' and 'source') :return: df, parsed and partially formatted to flowbyactivity specifications """ data = {} row_to_use = [ "Quantity", "Quantity3", "Total", "Hydrofluoric acid", "Metallurgical", "Production" ] prod = "" name = usgs_myb_name(source) dataframe = pd.DataFrame() for df in df_list: for index, row in df.iterrows(): if df.iloc[index]["Production"].strip() == "Exports:3": prod = "exports" des = name elif df.iloc[index]["Production"].strip() == \ "Imports for consumption:3": prod = "imports" des = name elif df.iloc[index]["Production"].strip() == "Fluorosilicic acid:": prod = "production" des = "Fluorosilicic acid:" if str(df.iloc[index]["type"]).strip() == "data_two": prod = "imports" des = df.iloc[index]["Production"].strip() elif str(df.iloc[index]["type"]).strip() == \ "Aluminum Fluoride" or \ str(df.iloc[index]["type"]).strip() == "Cryolite": prod = "imports" des = df.iloc[index]["type"].strip() if df.iloc[index]["Production"].strip() in row_to_use: data = usgs_myb_static_varaibles() data["SourceName"] = source data["Year"] = str(year) data["Unit"] = "Metric Tons" col_name = usgs_myb_year(SPAN_YEARS, year) if str(df.iloc[index][col_name]) == "W": data["FlowAmount"] = WITHDRAWN_KEYWORD else: data["FlowAmount"] = str(df.iloc[index][col_name]) data["Description"] = des data["ActivityProducedBy"] = name data['FlowName'] = name + " " + prod dataframe = dataframe.append(data, ignore_index=True) dataframe = assign_fips_location_system(dataframe, str(year)) return dataframe
def usgs_zirconium_parse(*, df_list, source, year, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :param source: source :param year: year :return: df, parsed and partially formatted to flowbyactivity specifications """ data = {} row_to_use = [ "Imports for consumption3", "Concentrates", "Exports", "Hafnium, unwrought, including powder, " "imports for consumption" ] dataframe = pd.DataFrame() name = usgs_myb_name(source) for df in df_list: for index, row in df.iterrows(): if df.iloc[index]["Production"].strip() == \ "Imports for consumption3": product = "imports" elif df.iloc[index]["Production"].strip() == "Concentrates": product = "production" elif df.iloc[index]["Production"].strip() == "Exports": product = "exports" if df.iloc[index]["Production"].strip() == \ "Hafnium, unwrought, including powder, imports for " \ "consumption": prod = "imports" des = df.iloc[index]["Production"].strip() else: des = name if df.iloc[index]["Production"].strip() in row_to_use: data = usgs_myb_static_varaibles() data["SourceName"] = source data["Year"] = str(year) data["Unit"] = "Metric Tons" data['FlowName'] = name + " " + product data["Description"] = des data["ActivityProducedBy"] = name col_name = usgs_myb_year(SPAN_YEARS, year) if str(df.iloc[index][col_name]) == "--" or \ str(df.iloc[index][col_name]) == "(3)": data["FlowAmount"] = str(0) elif str(df.iloc[index][col_name]) == "W": data["FlowAmount"] = WITHDRAWN_KEYWORD else: data["FlowAmount"] = str(df.iloc[index][col_name]) dataframe = dataframe.append(data, ignore_index=True) dataframe = assign_fips_location_system(dataframe, str(year)) return dataframe
def eia_mecs_land_parse(dataframe_list, args): df_array = [] for dataframes in dataframe_list: dataframes = dataframes.rename( columns={'NAICS Code(a)': 'ActivityConsumedBy'}) dataframes = dataframes.rename( columns={'Subsector and Industry': 'Description'}) dataframes.loc[dataframes.Description == "Total", "ActivityConsumedBy"] = "31-33" unit = [] for index, row in dataframes.iterrows(): if row["FlowName"] == "Establishments(b) (counts)": row["FlowName"] = "Establishments (counts)" flow_name_str = row["FlowName"] flow_name_array = flow_name_str.split("(") row["FlowName"] = flow_name_array[0] unit_text = flow_name_array[1] unit_text_array = unit_text.split(")") if unit_text_array[0] == "counts": unit.append(("p")) else: unit.append(unit_text_array[0]) ACB = row["ActivityConsumedBy"] ACB_str = str(ACB).strip() row["ActivityConsumedBy"] = ACB_str df_array.append(dataframes) df = pd.concat(df_array, sort=False) # trim whitespace associated with Activity df['Description'] = df['Description'].str.strip() # add manufacturing to end of description if missing df['Description'] = df['Description'].apply( lambda x: x + ' Manufacturing' if not x.endswith('Manufacturing') else x) # replace withdrawn code df.loc[df['FlowAmount'] == "Q", 'FlowAmount'] = withdrawn_keyword df.loc[df['FlowAmount'] == "N", 'FlowAmount'] = withdrawn_keyword df["Class"] = 'Land' df["SourceName"] = 'EIA_MECS_Land' df['Year'] = args["year"] df["Compartment"] = 'ground' df['MeasureofSpread'] = "RSE" df['Location'] = US_FIPS df['Unit'] = unit df = assign_fips_location_system(df, args['year']) df['FlowType'] = "ELEMENTARY_FLOW" # modify flowname df['FlowName'] = df['Description'] + ', ' + df['FlowName'].str.strip() return df
def iwms_parse(dataframe_list, args): """Modify the imported data so it meets the flowbyactivity criteria and only includes data on harvested acreage (irrigated and total).""" df = pd.concat(dataframe_list, sort=False) # only interested in total water applied, not water applied by type of irrigation df = df[df['domain_desc'] == 'TOTAL'] # drop unused columns df = df.drop(columns=[ 'CV (%)', 'agg_level_desc', 'location_desc', 'state_alpha', 'sector_desc', 'country_code', 'begin_code', 'watershed_code', 'reference_period_desc', 'asd_desc', 'county_name', 'source_desc', 'congr_district_code', 'asd_code', 'week_ending', 'freq_desc', 'load_time', 'zip_5', 'watershed_desc', 'region_desc', 'state_ansi', 'state_name', 'country_name', 'county_ansi', 'end_code', 'group_desc', 'util_practice_desc', 'class_desc' ]) # create FIPS column by combining existing columns df.loc[df['county_code'] == '', 'county_code'] = '000' # add county fips when missing df['Location'] = df['state_fips_code'] + df['county_code'] df.loc[df['Location'] == '99000', 'Location'] = US_FIPS # modify national level fips # create activityconsumedby column df['ActivityConsumedBy'] = df['short_desc'].str.split(', IRRIGATED').str[0] df['ActivityConsumedBy'] = df['ActivityConsumedBy'].str.replace( ", IN THE OPEN", "", regex=True) # not interested in all data from class_desc # rename columns to match flowbyactivity format df = df.rename( columns={ "Value": "FlowAmount", "unit_desc": "Unit", "year": "Year", "short_desc": "Description", "domaincat_desc": "Compartment" }) # drop remaining unused columns df = df.drop(columns=[ 'commodity_desc', 'state_fips_code', 'county_code', 'statisticcat_desc', 'prodn_practice_desc', 'domain_desc' ]) # modify contents of flowamount column, "D" is supressed data, "z" means less than half the unit is shown df['FlowAmount'] = df['FlowAmount'].str.strip() # trim whitespace df.loc[df['FlowAmount'] == "(D)", 'FlowAmount'] = withdrawn_keyword # df.loc[df['FlowAmount'] == "(Z)", 'FlowAmount'] = withdrawn_keyword df['FlowAmount'] = df['FlowAmount'].str.replace(",", "", regex=True) # add location system based on year of data df = assign_fips_location_system(df, args['year']) # # Add hardcoded data df['Class'] = "Water" df['SourceName'] = "USDA_IWMS" df['DataReliability'] = None #TODO score data qualtiy df['DataCollection'] = None return df
def usgs_strontium_parse(*, df_list, source, year, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :param source: source :param year: year :return: df, parsed and partially formatted to flowbyactivity specifications """ data = {} row_to_use = [ "Production, strontium minerals", "Strontium compounds3", "Celestite4", "Strontium carbonate" ] prod = "" name = usgs_myb_name(source) des = name dataframe = pd.DataFrame() col_name = usgs_myb_year(SPAN_YEARS, year) for df in df_list: for index, row in df.iterrows(): if df.iloc[index]["Production"].strip() == \ "Imports for consumption:2": product = "imports" elif df.iloc[index]["Production"].strip() == \ "Production, strontium minerals": product = "production" elif df.iloc[index]["Production"].strip() == "Exports:2": product = "exports" if df.iloc[index]["Production"].strip() in row_to_use: data = usgs_myb_static_varaibles() data["SourceName"] = source data["Year"] = str(year) data["Unit"] = "Metric Tons" if usgs_myb_remove_digits( df.iloc[index]["Production"].strip()) == "Celestite": data['FlowName'] = \ name + " " + product + " " + usgs_myb_remove_digits( df.iloc[index]["Production"].strip()) else: data['FlowName'] = name + " " + product data["Description"] = usgs_myb_remove_digits( df.iloc[index]["Production"].strip()) data["ActivityProducedBy"] = name col_name = usgs_myb_year(SPAN_YEARS, year) if str(df.iloc[index][col_name]) == "--" or \ str(df.iloc[index][col_name]) == "(3)": data["FlowAmount"] = str(0) else: data["FlowAmount"] = str(df.iloc[index][col_name]) dataframe = dataframe.append(data, ignore_index=True) dataframe = assign_fips_location_system(dataframe, str(year)) return dataframe
def prepare_stewi_fbs(df, inventory_dict, NAICS_level, geo_scale): from stewi.globals import weighted_average # update location to appropriate geoscale prior to aggregating df.dropna(subset=['Location'], inplace=True) df['Location'] = df['Location'].astype(str) df = update_geoscale(df, geo_scale) # assign grouping variables based on desired geographic aggregation level grouping_vars = ['NAICS_lvl', 'FlowName', 'Compartment', 'Location'] if 'MetaSources' in df: grouping_vars.append('MetaSources') # aggregate by NAICS code, FlowName, compartment, and geographic level fbs = df.groupby(grouping_vars).agg({ 'FlowAmount': 'sum', 'Year': 'first', 'Unit': 'first' }) # add reliability score fbs['DataReliability'] = weighted_average(df, 'DataReliability', 'FlowAmount', grouping_vars) fbs.reset_index(inplace=True) # apply flow mapping fbs = map_elementary_flows(fbs, list(inventory_dict.keys())) # rename columns to match flowbysector format fbs = fbs.rename(columns={"NAICS_lvl": "SectorProducedBy"}) # add hardcoded data, depending on the source data, some of these fields may need to change fbs['Class'] = 'Chemicals' fbs['SectorConsumedBy'] = 'None' fbs['SectorSourceName'] = 'NAICS_2012_Code' fbs['FlowType'] = 'ELEMENTARY_FLOW' fbs = assign_fips_location_system(fbs, list(inventory_dict.values())[0]) # add missing flow by sector fields fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields) fbs = check_for_missing_sector_data(fbs, NAICS_level) # sort dataframe and reset index fbs = fbs.sort_values(list( flow_by_sector_fields.keys())).reset_index(drop=True) # check the sector codes to make sure NAICS 2012 codes fbs = replace_naics_w_naics_2012(fbs, 'NAICS_2012_Code') return fbs
def usgs_cobalt_parse(*, df_list, source, year, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :param args: dictionary, used to run flowbyactivity.py ('year' and 'source') :return: df, parsed and partially formatted to flowbyactivity specifications """ data = {} name = usgs_myb_name(source) des = name row_to_use = [ "United Statese, 16, 17", "Mine productione", "Imports for consumption", "Exports" ] dataframe = pd.DataFrame() for df in df_list: for index, row in df.iterrows(): prod = "production" if df.iloc[index]["Production"].strip() == \ "United Statese, 16, 17": prod = "production" elif df.iloc[index]["Production"].strip() == \ "Imports for consumption": prod = "imports" elif df.iloc[index]["Production"].strip() == "Exports": prod = "exports" if df.iloc[index]["Production"].strip() in row_to_use: remove_digits = str.maketrans('', '', digits) product = df.iloc[index]["Production"].strip().translate( remove_digits) data = usgs_myb_static_varaibles() data["SourceName"] = source data["Year"] = str(year) data["Unit"] = "Thousand Metric Tons" col_name = usgs_myb_year(SPAN_YEARS, year) data["Description"] = des data["ActivityProducedBy"] = name data['FlowName'] = name + " " + prod data["FlowAmount"] = str(df.iloc[index][col_name]) remove_rows = ["(18)", "(2)"] if data["FlowAmount"] not in remove_rows: dataframe = dataframe.append(data, ignore_index=True) dataframe = assign_fips_location_system( dataframe, str(year)) return dataframe
def usgs_clay_parse(*, df_list, source, year, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :param args: dictionary, used to run flowbyactivity.py ('year' and 'source') :return: df, parsed and partially formatted to flowbyactivity specifications """ data = {} row_to_use = [ "Ball clay", "Bentonite", "Fire clay", "Kaolin", "Fuller’s earth", "Total", "Grand total", "Artificially activated clay and earth", "Clays, not elsewhere classified", "Clays, not elsewhere classified" ] dataframe = pd.DataFrame() for df in df_list: for index, row in df.iterrows(): if df.iloc[index]["type"].strip() == "import": product = "imports" elif df.iloc[index]["type"].strip() == "export": product = "exports" else: product = "production" if str(df.iloc[index]["Production"]).strip() in row_to_use: data = usgs_myb_static_varaibles() data["SourceName"] = source data["Year"] = str(year) data["Unit"] = "Metric Tons" if product == "production": data['FlowName'] = \ df.iloc[index]["type"].strip() + " " + product data["Description"] = df.iloc[index]["type"].strip() data["ActivityProducedBy"] = df.iloc[index]["type"].strip() else: data['FlowName'] = \ df.iloc[index]["Production"].strip() + " " + product data["Description"] = df.iloc[index]["Production"].strip() data["ActivityProducedBy"] = \ df.iloc[index]["Production"].strip() col_name = usgs_myb_year(SPAN_YEARS, year) if str(df.iloc[index][col_name]) == "--" or \ str(df.iloc[index][col_name]) == "(3)" or \ str(df.iloc[index][col_name]) == "(2)": data["FlowAmount"] = str(0) else: data["FlowAmount"] = str(df.iloc[index][col_name]) dataframe = dataframe.append(data, ignore_index=True) dataframe = assign_fips_location_system(dataframe, str(year)) return dataframe
def usgs_nickel_parse(*, df_list, source, year, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :param source: source :param year: year :return: df, parsed and partially formatted to flowbyactivity specifications """ data = {} row_to_use = [ "Ores and concentrates3", "United States, sulfide ore, concentrate" ] import_export = ["Exports:", "Imports for consumption:"] name = usgs_myb_name(source) des = name dataframe = pd.DataFrame() for df in df_list: prod = "production" for index, row in df.iterrows(): if df.iloc[index]["Production"].strip() == "Exports:": prod = "exports" elif df.iloc[index]["Production"].strip() == \ "Imports for consumption:": prod = "imports" if df.iloc[index]["Production"].strip() in row_to_use: remove_digits = str.maketrans('', '', digits) product = df.iloc[index]["Production"].strip().translate( remove_digits) data = usgs_myb_static_varaibles() data["SourceName"] = source data["Year"] = str(year) data["Unit"] = "Metric Tons" col_name = usgs_myb_year(SPAN_YEARS, year) if product.strip() == \ "United States, sulfide ore, concentrate": data["Description"] = \ "United States, sulfide ore, concentrate Nickel" data["ActivityProducedBy"] = name data['FlowName'] = name + " " + prod elif product.strip() == "Ores and concentrates": data["Description"] = "Ores and concentrates Nickel" data["ActivityProducedBy"] = name data['FlowName'] = name + " " + prod if str(df.iloc[index][col_name]) == "--" or \ str(df.iloc[index][col_name]) == "(4)": data["FlowAmount"] = str(0) else: data["FlowAmount"] = str(df.iloc[index][col_name]) dataframe = dataframe.append(data, ignore_index=True) dataframe = assign_fips_location_system(dataframe, str(year)) return dataframe
def eia_mecs_energy_parse(dataframe_list, args): from flowsa.common import assign_census_regions # concatenate dataframe list into single dataframe df = pd.concat(dataframe_list, sort=True) # rename columns to match standard flowbyactivity format df = df.rename( columns={ 'NAICS Code': 'ActivityConsumedBy', 'Subsector and Industry': 'Description' }) df['ActivityConsumedBy'] = df['ActivityConsumedBy'].str.strip() # add hardcoded data df["SourceName"] = args['source'] df["Compartment"] = None df['FlowType'] = 'TECHNOSPHERE_FLOWS' df['Year'] = args["year"] df['MeasureofSpread'] = "RSE" # assign location codes and location system df.loc[df['Location'] == 'Total United States', 'Location'] = US_FIPS df = assign_fips_location_system(df, args['year']) df = assign_census_regions(df) df.loc[df['Description'] == 'Total', 'ActivityConsumedBy'] = '31-33' # drop rows that reflect subtotals (only necessary in 2014) df.dropna(subset=['ActivityConsumedBy'], inplace=True) ## replace withheld/unavailable data # * = estimate is less than 0.5 # W = withheld to avoid disclosing data for individual establishments # Q = withheld because relative standard error is greater than 50 percent # NA = not available df.loc[df['FlowAmount'] == '*', 'FlowAmount'] = None df.loc[df['FlowAmount'] == 'W', 'FlowAmount'] = withdrawn_keyword df.loc[df['FlowAmount'] == 'Q', 'FlowAmount'] = withdrawn_keyword df.loc[df['FlowAmount'] == 'NA', 'FlowAmount'] = None # * = estimate is less than 0.5 # W = withheld to avoid disclosing data for individual establishments # Q = withheld because relative standard error is greater than 50 percent # NA = not available # X = not defined because relative standard error corresponds to a value of zero # at least one 'empty' cell appears to contain a space df.loc[df['Spread'] == '*', 'Spread'] = None df.loc[df['Spread'] == 'W', 'Spread'] = withdrawn_keyword df.loc[df['Spread'] == 'Q', 'Spread'] = withdrawn_keyword df.loc[df['Spread'] == 'NA', 'Spread'] = None df.loc[df['Spread'] == 'X', 'Spread'] = None df.loc[df['Spread'] == ' ', 'Spread'] = None return df
def soda_parse(**kwargs): """ Combine, parse, and format the provided dataframes :param kwargs: potential arguments include: dataframe_list: list of dataframes to concat and format args: dictionary, used to run flowbyactivity.py ('year' and 'source') :return: df, parsed and partially formatted to flowbyactivity specifications """ # load arguments necessary for function dataframe_list = kwargs['dataframe_list'] args = kwargs['args'] total_glass = 0 data = {} dataframe = pd.DataFrame() for df in dataframe_list: data["Class"] = "Chemicals" data['FlowType'] = "Elementary Type" data["Location"] = "00000" data["Compartment"] = " " data["SourceName"] = "USGS_MYB_SodaAsh" data["Year"] = str(args["year"]) data["Unit"] = "Thousand metric tons" data['FlowName'] = "Soda Ash" data["Context"] = "air" data['DataReliability'] = 5 # tmp data['DataCollection'] = 5 # tmp for index, row in df.iterrows(): data["Description"] = "" data['ActivityConsumedBy'] = description( df.iloc[index]["End use"], df.iloc[index]["NAICS code"]) if df.iloc[index]["End use"].strip() == "Glass:": total_glass = int(df.iloc[index]["NAICS code"]) elif data['ActivityConsumedBy'] == "Glass Total": data["Description"] = total_glass if not math.isnan(df.iloc[index]["Total"]): data["FlowAmount"] = int(df.iloc[index]["Total"]) data["ActivityProducedBy"] = None if not math.isnan(df.iloc[index]["NAICS code"]): des_str = str(df.iloc[index]["NAICS code"]) data["Description"] = des_str if df.iloc[index]["End use"].strip() != "Glass:": dataframe = dataframe.append(data, ignore_index=True) dataframe = assign_fips_location_system( dataframe, str(args["year"])) return dataframe
def usgs_boron_parse(*, df_list, source, year, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :param args: dictionary, used to run flowbyactivity.py ('year' and 'source') :return: df, parsed and partially formatted to flowbyactivity specifications """ data = {} row_to_use = ["B2O3 content", "Quantity"] prod = "" name = usgs_myb_name(source) des = name dataframe = pd.DataFrame() col_name = usgs_myb_year(SPAN_YEARS, year) for df in df_list: for index, row in df.iterrows(): if df.iloc[index]["Production"].strip() == "B2O3 content" or \ df.iloc[index]["Production"].strip() == "Quantity": product = "production" if df.iloc[index]["Production"].strip() == "Colemanite:4": des = "Colemanite" elif df.iloc[index]["Production"].strip() == "Ulexite:4": des = "Ulexite" if df.iloc[index]["Production"].strip() in row_to_use: data = usgs_myb_static_varaibles() data["SourceName"] = source data["Year"] = str(year) data["Unit"] = "Metric Tons" if des == name: data['FlowName'] = name + " " + product else: data['FlowName'] = name + " " + product + " " + des data["Description"] = des data["ActivityProducedBy"] = name if str(df.iloc[index][col_name]) == "--" or \ str(df.iloc[index][col_name]) == "(3)": data["FlowAmount"] = str(0) elif str(df.iloc[index][col_name]) == "W": data["FlowAmount"] = WITHDRAWN_KEYWORD else: data["FlowAmount"] = str(df.iloc[index][col_name]) dataframe = dataframe.append(data, ignore_index=True) dataframe = assign_fips_location_system(dataframe, str(year)) return dataframe
def calR_parse(**kwargs): """ Combine, parse, and format the provided dataframes :param kwargs: potential arguments include: dataframe_list: list of dataframes to concat and format args: dictionary, used to run flowbyactivity.py ('year' and 'source') :return: df, parsed and partially formatted to flowbyactivity specifications """ # load arguments necessary for function args = kwargs['args'] data = {} output = pd.DataFrame() for entry in os.listdir(externaldatapath): if os.path.isfile(os.path.join(externaldatapath, entry)): data["Class"] = "Other" data['FlowType'] = "Waste Flow" data["Location"] = "06000" data["Compartment"] = "ground" data["LocationSystem"] = "FIPS" data["SourceName"] = "California_Commercial_bySector" data["Year"] = args['year'] data['DataReliability'] = 5 # tmp data['DataCollection'] = 5 # tmp if "California_Commercial_bySector_2014" in entry and "Map" not in entry: data["ActivityProducedBy"] = produced_by(entry) data["ActivityConsumedBy"] = None dataframe = pd.read_csv(externaldatapath + "/" + entry, header=0, dtype=str) for col in dataframe.columns: if "Percent" in str(col): del dataframe[col] for index, row in dataframe.iterrows(): for column in dataframe.columns: if column != "Material": col_string = column.split() data["Unit"] = col_string[1] data['FlowName'] = dataframe.iloc[index]["Material"] + \ " " + col_string[0] if dataframe.iloc[index][column] != "-": data["FlowAmount"] = int( dataframe.iloc[index][column]) output = output.append(data, ignore_index=True) output = assign_fips_location_system( output, '2014') return output
def eia_cbecs_water_parse(**kwargs): """ Combine, parse, and format the provided dataframes :param kwargs: potential arguments include: dataframe_list: list of dataframes to concat and format args: dictionary, used to run flowbyactivity.py ('year' and 'source') :return: df, parsed and partially formatted to flowbyactivity specifications """ # load arguments necessary for function dataframe_list = kwargs['dataframe_list'] args = kwargs['args'] # concat dataframes df = pd.concat(dataframe_list, sort=False).dropna() # drop columns df = df.drop(columns=[ "Distribution of building 25th", "Distribution of building Median", "Distribution of building 75th" ]) # use "melt" fxn to convert colummns into rows df = df.melt(id_vars=["PBA"], var_name="FlowName", value_name="FlowAmount") # rename column(s) df = df.rename(columns={'PBA': 'ActivityConsumedBy'}) # replace withdrawn code df.loc[df['FlowAmount'] == "Q", 'FlowAmount'] = WITHDRAWN_KEYWORD # add unit based on flowname df.loc[df['FlowName'] == 'Number of Buildings', 'Unit'] = 'p' df.loc[df['FlowName'] == "Total Floor Space", 'Unit'] = 'million square feet' df.loc[df['FlowName'] == "Total Consumption", 'Unit'] = 'billion gallons' df.loc[df['FlowName'] == "Consumption per Building", 'Unit'] = 'thousand gallons' df.loc[df['FlowName'] == "Consumption per square foot", 'Unit'] = 'gallons' df.loc[df['FlowName'] == "Consumption per worker", 'Unit'] = 'thousand gallons' # class type based on flowname/unit df["Class"] = 'Water' df.loc[df['FlowName'] == 'Number of Buildings', 'Class'] = 'Other' df.loc[df['FlowName'] == "Total Floor Space", 'Class'] = 'Other' # add location system based on year of data df = assign_fips_location_system(df, args['year']) # hardcode columns df["SourceName"] = 'EIA_CBECS_Water' df['Year'] = args["year"] df['Location'] = US_FIPS df['DataReliability'] = 5 # tmp df['DataCollection'] = 5 #tmp return df
def usgs_gold_parse(*, df_list, source, year, **_): """ Combine, parse, and format the provided dataframes :param df_list: list of dataframes to concat and format :param source: source :param year: year :return: df, parsed and partially formatted to flowbyactivity specifications """ data = {} row_to_use = [ "Quantity", "Exports, refined bullion", "Imports for consumption, refined bullion" ] dataframe = pd.DataFrame() product = "production" name = usgs_myb_name(source) des = name for df in df_list: for index, row in df.iterrows(): if df.iloc[index]["Production"].strip() == "Quantity": product = "production" elif df.iloc[index]["Production"].strip() == \ "Exports, refined bullion": product = "exports" elif df.iloc[index]["Production"].strip() == \ "Imports for consumption, refined bullion": product = "imports" if df.iloc[index]["Production"].strip() in row_to_use: data = usgs_myb_static_varaibles() data["SourceName"] = source data["Year"] = str(year) data["Unit"] = "kilograms" data['FlowName'] = name + " " + product data["Description"] = des data["ActivityProducedBy"] = name col_name = usgs_myb_year(SPAN_YEARS, year) if str(df.iloc[index][col_name]) == "--": data["FlowAmount"] = str(0) else: data["FlowAmount"] = str(df.iloc[index][col_name]) dataframe = dataframe.append(data, ignore_index=True) dataframe = assign_fips_location_system(dataframe, str(year)) return dataframe