Example #1
def bh_parse(**kwargs):
    """
    Combine, parse, and format the provided dataframes
    :param kwargs: potential arguments include:
                   dataframe_list: list of dataframes to concat and format
                   args: dictionary, used to run flowbyactivity.py ('year' and 'source')
    :return: df, parsed and partially formatted to flowbyactivity specifications
    """
    # load arguments necessary for function
    dataframe_list = kwargs['dataframe_list']

    # concat list of dataframes (info on each page)
    df = pd.concat(dataframe_list, sort=False)
    df = df.rename(
        columns={
            "I-O code": "ActivityConsumedBy",
            "I-O description": "Description",
            "gal/$M": "FlowAmount",
        })
    # hardcode
    # original data in gal/million usd
    df.loc[:, 'FlowAmount'] = df['FlowAmount'] / 1000000
    df['Unit'] = 'gal/USD'
    df['SourceName'] = 'Blackhurst_IO'
    df['Class'] = 'Water'
    df['FlowName'] = 'Water Withdrawals IO Vector'
    df['Location'] = US_FIPS
    df = assign_fips_location_system(df, '2002')
    df['Year'] = '2002'
    df['DataReliability'] = 5  # tmp
    df['DataCollection'] = 5  # tmp

    return df
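A minimal standalone sketch of the concat → rename → rescale steps above, using made-up figures and plain pandas only (none of the flowsa imports such as US_FIPS or assign_fips_location_system), to show how the gal/$M column ends up expressed per USD:

import pandas as pd

# two toy "pages" mimicking the raw Blackhurst layout (hypothetical values)
page1 = pd.DataFrame({"I-O code": ["1111A0"],
                      "I-O description": ["Oilseed farming"],
                      "gal/$M": [25000.0]})
page2 = pd.DataFrame({"I-O code": ["1111B0"],
                      "I-O description": ["Grain farming"],
                      "gal/$M": [31000.0]})

df = pd.concat([page1, page2], sort=False)
df = df.rename(columns={"I-O code": "ActivityConsumedBy",
                        "I-O description": "Description",
                        "gal/$M": "FlowAmount"})
# source values are gal per million USD; divide to express them per USD
df["FlowAmount"] = df["FlowAmount"] / 1000000
print(df[["ActivityConsumedBy", "FlowAmount"]])   # 0.025 and 0.031 gal/USD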
Example #2
def epa_cddpath_parse(*, df_list, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param df_list: list of dataframes to concat and format
    :param year: year
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    # concat list of dataframes (info on each page)
    df = pd.concat(df_list, sort=False)

    # hardcode
    df['Class'] = 'Other'  # confirm this
    df['SourceName'] = 'EPA_CDDPath'  # confirm this
    df['Unit'] = 'short tons'
    df['FlowType'] = 'WASTE_FLOW'
    df.loc[df['ActivityProducedBy'].isna(), 'ActivityProducedBy'] = 'Buildings'
    # df['Compartment'] = 'waste'  # confirm this
    df['Location'] = US_FIPS
    df = assign_fips_location_system(df, year)
    df['Year'] = year
    # df['MeasureofSpread'] = "NA"  # none available
    df['DataReliability'] = 5  # confirm this
    df['DataCollection'] = 5  # confirm this

    return df
Example #3
def bh_parse(*, df_list, **_):
    """
    Combine, parse, and format the provided dataframes
    :param df_list: list of dataframes to concat and format
    :return: df, parsed and partially formatted to
        flowbyactivity specifications
    """
    # concat list of dataframes (info on each page)
    df = pd.concat(df_list, sort=False)
    df = df.rename(
        columns={
            "I-O code": "ActivityConsumedBy",
            "I-O description": "Description",
            "gal/$M": "FlowAmount",
        })
    # hardcode
    # original data in gal/million usd
    df.loc[:, 'FlowAmount'] = df['FlowAmount'] / 1000000
    df['Unit'] = 'gal/USD'
    df['SourceName'] = 'Blackhurst_IO'
    df['Class'] = 'Water'
    df['FlowName'] = 'Water Withdrawals IO Vector'
    df['Location'] = US_FIPS
    df = assign_fips_location_system(df, '2002')
    df['Year'] = '2002'
    df['DataReliability'] = 5  # tmp
    df['DataCollection'] = 5  # tmp

    return df
Example #4
def eia_cbecs_water_parse(dataframe_list, args):
    """
    Combine, parse, and format the provided dataframes
    :param dataframe_list: list of dataframes to concat and format
    :param args: dictionary, used to run flowbyactivity.py
        ('year' and 'source')
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    # concat dataframes
    df = pd.concat(dataframe_list, sort=False).dropna()
    # drop columns
    df = df.drop(columns=["Distribution of building 25th", "Distribution of building Median",
                          "Distribution of building 75th"])
    # use "melt" fxn to convert colummns into rows
    df = df.melt(id_vars=["PBA"],
                 var_name="FlowName",
                 value_name="FlowAmount")
    # rename column(s)
    df = df.rename(columns={'PBA': 'ActivityConsumedBy'})
    # replace withdrawn code
    df.loc[df['FlowAmount'] == "Q", 'FlowAmount'] = withdrawn_keyword
    # add unit based on flowname
    df.loc[df['FlowName'] == 'Number of Buildings', 'Unit'] = 'p'
    df.loc[df['FlowName'] == "Total Floor Space", 'Unit'] = 'million square feet'
    df.loc[df['FlowName'] == "Total Consumption", 'Unit'] = 'billion gallons'
    df.loc[df['FlowName'] == "Consumption per Building", 'Unit'] = 'thousand gallons'
    df.loc[df['FlowName'] == "Consumption per square foot", 'Unit'] = 'gallons'
    df.loc[df['FlowName'] == "Consumption per worker", 'Unit'] = 'thousand gallons'
    # class type based on flowname/unit
    df["Class"] = 'Water'
    df.loc[df['FlowName'] == 'Number of Buildings', 'Class'] = 'Other'
    df.loc[df['FlowName'] == "Total Floor Space", 'Class'] = 'Other'
    # add location system based on year of data
    df = assign_fips_location_system(df, args['year'])
    # hardcode columns
    df["SourceName"] = 'EIA_CBECS_Water'
    df['Year'] = args["year"]
    df['Location'] = US_FIPS
    return df
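The key step in this parser is the wide-to-long reshape with melt. A self-contained sketch with hypothetical CBECS-style numbers, showing how PBA becomes ActivityConsumedBy while the remaining columns turn into FlowName/FlowAmount pairs:

import pandas as pd

wide = pd.DataFrame({
    "PBA": ["Education", "Office"],
    "Number of Buildings": [386, 1012],     # hypothetical values
    "Total Consumption": [81.0, 133.0],
})
long = wide.melt(id_vars=["PBA"], var_name="FlowName", value_name="FlowAmount")
long = long.rename(columns={"PBA": "ActivityConsumedBy"})
print(long)
#   ActivityConsumedBy             FlowName  FlowAmount
# 0          Education  Number of Buildings       386.0
# 1             Office  Number of Buildings      1012.0
# 2          Education    Total Consumption        81.0
# 3             Office    Total Consumption       133.0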
Example #5
def census_pop_parse(dataframe_list, args):
    """
    Combine, parse, and format the provided dataframes
    :param dataframe_list: list of dataframes to concat and format
    :param args: dictionary, used to run flowbyactivity.py
        ('year' and 'source')
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    # concat dataframes
    df = pd.concat(dataframe_list, sort=False)
    # Add year
    df['Year'] = args["year"]
    # drop puerto rico
    df = df[df['state'] != '72']
    # replace null county cells with '000'
    df['county'] = df['county'].fillna('000')
    # Make FIPS as a combo of state and county codes
    df['Location'] = df['state'] + df['county']
    # replace the null value representing the US with US fips
    df.loc[df['us'] == '1', 'Location'] = US_FIPS
    # drop columns
    df = df.drop(columns=['state', 'county', 'us'])
    # rename columns
    df = df.rename(columns={"POP": "FlowAmount"})
    # add location system based on year of data
    df = assign_fips_location_system(df, args['year'])
    # hardcode dta
    df['Class'] = 'Other'
    df['SourceName'] = 'Census_PEP_Population'
    df['FlowName'] = 'Population'
    df['Unit'] = 'p'
    df['ActivityConsumedBy'] = 'All'
    # temporary data quality scores
    df['DataReliability'] = None
    df['DataCollection'] = None
    # sort df
    df = df.sort_values(['Location'])
    # reset index
    df.reset_index(drop=True, inplace=True)
    return df
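The Location code here is simply the state FIPS string concatenated with the county FIPS string, with the national row overwritten afterwards. A tiny sketch with invented rows (the real Census PEP response carries the same state/county/us columns):

import pandas as pd

US_FIPS = "00000"   # national-level FIPS code used throughout flowsa

# hypothetical rows: a county, a state total, and the US total
df = pd.DataFrame({"state":  ["06", "06", "00"],
                   "county": ["037", None, None],
                   "us":     [None, None, "1"],
                   "POP":    ["10039107", "39512223", "328239523"]})
df["county"] = df["county"].fillna("000")
df["Location"] = df["state"] + df["county"]
df.loc[df["us"] == "1", "Location"] = US_FIPS
print(df["Location"].tolist())   # ['06037', '06000', '00000']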
Example #6
def usgs_coef_parse(*, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param year: year
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    # Read directly into a pandas df
    df_raw = pd.read_csv(externaldatapath + "USGS_WU_Coef_Raw.csv")

    # rename columns to match flowbyactivity format
    df = df_raw.rename(columns={"Animal Type": "ActivityConsumedBy",
                                "WUC_Median": "FlowAmount",
                                "WUC_Minimum": "Min",
                                "WUC_Maximum": "Max"
                                })

    # drop columns
    df = df.drop(columns=["WUC_25th_Percentile", "WUC_75th_Percentile"])

    # hardcode data
    df["Class"] = "Water"
    df["SourceName"] = "USGS_WU_Coef"
    df["Location"] = US_FIPS
    df['Year'] = year
    df = assign_fips_location_system(df, '2005')
    df["Unit"] = "gallons/animal/day"
    df['DataReliability'] = 5  # tmp
    df['DataCollection'] = 5  # tmp

    return df
Example #7
def bea_make_parse(dataframe_list, args):
    """
    Combine, parse, and format the provided dataframes
    :param dataframe_list: list of dataframes to concat and format
    :param args: dictionary, used to run flowbyactivity.py
        ('year' and 'source')
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    # concat dataframes - tmp load from uploaded csv
    # df = pd.concat(dataframe_list, sort=False)
    df_load = pd.read_csv(datapath + "BEA_Make_Table_after_Redef_2002.csv",
                          dtype="str")
    # strip whitespace
    df = df_load.apply(lambda x: x.str.strip())
    # drop rows of data
    df = df[df['Industry'] == df['Commodity']].reset_index(drop=True)
    # drop columns
    df = df.drop(columns=['Commodity', 'CommodityDescription'])
    # rename columns
    df = df.rename(
        columns={
            'Industry': 'ActivityProducedBy',
            'IndustryDescription': 'Description',
            'ProVal': 'FlowAmount',
            'IOYear': 'Year'
        })
    df.loc[:, 'FlowAmount'] = df['FlowAmount'].astype(float) * 1000000
    # hard code data
    df['Class'] = 'Money'
    df['SourceName'] = 'BEA_Make_Table'
    df['Unit'] = 'USD'
    df['Location'] = US_FIPS
    df = assign_fips_location_system(df, args['year'])
    df['FlowName'] = 'Gross Output Producer Value After Redef'
    return df
Example #8
def usgs_ma_parse(*, df_list, source, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param df_list: list of dataframes to concat and format
    :param source: source
    :param year: year
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    data = {}
    row_to_use = ["Silicon carbide"]
    name = usgs_myb_name(source)
    des = name
    dataframe = pd.DataFrame()
    for df in df_list:
        for index, row in df.iterrows():
            remove_digits = str.maketrans('', '', digits)
            product = df.iloc[index][
                "Product"].strip().translate(remove_digits)
            if product in row_to_use:
                data = usgs_myb_static_varaibles()
                data["SourceName"] = source
                data["Year"] = str(year)
                data['FlowName'] = "Silicon carbide"
                data["ActivityProducedBy"] = "Silicon carbide"
                data["Unit"] = "Metric Tons"
                col_name = "quality_" + usgs_myb_year(SPAN_YEARS, year)
                col_name_array = col_name.split("_")
                data["Description"] = product + " " + col_name_array[0]
                data["FlowAmount"] = str(df.iloc[index][col_name])
                dataframe = dataframe.append(data, ignore_index=True)
                dataframe = assign_fips_location_system(
                    dataframe, str(year))
    return dataframe
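Note that the row-by-row DataFrame.append used in this and the other USGS Minerals Yearbook parsers was deprecated in pandas 1.4 and removed in pandas 2.0. If the pattern has to run on a newer pandas, the usual replacement is to collect the row dictionaries in a list and build the frame once; a sketch independent of the flowsa helpers, with hypothetical values:

import pandas as pd

rows = []
for product, amount in [("Silicon carbide", "35000"),
                        ("Fused aluminum oxide", "10000")]:
    rows.append({"FlowName": product, "FlowAmount": amount,
                 "Unit": "Metric Tons"})

# one DataFrame construction at the end instead of repeated .append calls
dataframe = pd.DataFrame(rows)
print(dataframe)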
Example #9
def census_vip_parse(*, df_list, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param df_list: list of dataframes to concat and format
    :param year: year
    :return: df, parsed and partially formatted to
        flowbyactivity specifications
    """
    df = pd.concat(df_list, sort=False)
    df['Year'] = df['Year'].astype(str)
    df = df[df['Year'] == year].reset_index(drop=True)
    df = df.rename(columns={'Type of Construction:': 'ActivityProducedBy'})

    df['Class'] = 'Money'
    df['SourceName'] = 'Census_VIP'
    df['FlowName'] = 'Construction spending'
    # millions of dollars
    df['FlowAmount'] = df['FlowAmount'] * 1000000
    df['Unit'] = 'USD'
    df['FlowType'] = "ELEMENTARY_FLOW"
    df['Compartment'] = None
    df['Location'] = US_FIPS
    df = assign_fips_location_system(df, year)
    df['Year'] = year
    # Add tmp DQ scores
    df['DataReliability'] = 5
    df['DataCollection'] = 5

    return df
Example #10
def noaa_parse(*, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param year: year
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    # Read directly into a pandas df
    df_raw = pd.read_csv(externaldatapath + "foss_landings.csv")

    # read state fips from common.py
    df_state = get_state_FIPS().reset_index(drop=True)
    df_state['State'] = df_state["State"].str.lower()

    # modify fish state names to match those from common
    df = df_raw.drop('Sum Pounds', axis=1)
    df['State'] = df["State"].str.lower()

    # filter by year
    df = df[df['Year'] == int(year)]
    # noaa differentiates between florida east and west,
    # which is not necessary for our purposes
    df['State'] = df['State'].str.replace(r'-east', '')
    df['State'] = df['State'].str.replace(r'-west', '')

    # sum florida data after casting rows as numeric
    df['Sum Dollars'] = df['Sum Dollars'].str.replace(r',', '')
    df["Sum Dollars"] = df["Sum Dollars"].apply(pd.to_numeric)
    df2 = df.groupby(['Year', 'State'],
                     as_index=False).agg({"Sum Dollars": sum})

    # new column includes state fips
    df3 = df2.merge(df_state[["State", "FIPS"]],
                    how="left",
                    left_on="State",
                    right_on="State")

    # data includes "process at sea", which is not associated with any
    # fips, assign value of '99' if fips is nan, add the state name to
    # description and drop state name
    df3['Description'] = None
    df3.loc[df3['State'] == 'process at sea', 'Description'] = df3['State']
    df3.loc[df3['State'] == 'process at sea', 'FIPS'] = 99
    df4 = df3.drop('State', axis=1)

    # rename columns to match flowbyactivity format
    df4 = df4.rename(columns={"Sum Dollars": "FlowAmount", "FIPS": "Location"})

    # hardcode data
    df4["Class"] = "Money"
    df4["SourceName"] = "NOAA_Landings"
    df4["FlowName"] = None
    df4 = assign_fips_location_system(df4, year)
    df4["Unit"] = "$"
    df4["ActivityProducedBy"] = "All Species"
    df4['DataReliability'] = 5  # tmp
    df4['DataCollection'] = 5  # tmp

    return df4
Example #11
def eia_cbecs_land_parse(dataframe_list, args):
    """

    :param dataframe_list:
    :param args:
    :return:
    """

    # concat dataframes
    df_array = []
    for dataframes in dataframe_list:
        # rename column(s)
        dataframes = dataframes.rename(columns={'Name': 'ActivityConsumedBy'})
        if "Location" not in list(dataframes):
            dataframes["Location"] = US_FIPS
            dataframes = assign_fips_location_system(dataframes, args['year'])
            dataframes = dataframes.drop(
                dataframes[dataframes.Description == "Any elevators"].index)
            dataframes["Description"] = dataframes["Description"].apply(
                lambda x: x if 'All buildings' in x else x + " floors")
        else:
            dataframes = dataframes.drop(dataframes[
                dataframes.ActivityConsumedBy == "Before 1920"].index)
            # rename location
            dataframes["Name"] = dataframes["Location"] + ' Division'
            dcodes = get_region_and_division_codes()
            dataframes = dataframes.merge(
                dcodes[['Division', 'Name', 'LocationSystem']], how='left')
            dataframes["Description"] = "All buildings"
            dataframes['Location'] = dataframes['Division'].replace(
                float("NaN"), US_FIPS)
            dataframes.loc[dataframes.Location == US_FIPS,
                           "LocationSystem"] = "FIPS_2010"
            dataframes = dataframes.drop(columns=['Division', 'Name'])
        df_array.append(dataframes)
    df = pd.concat(df_array, sort=False, ignore_index=True)

    # trim whitespace and standardize Activity names
    df['ActivityConsumedBy'] = df['ActivityConsumedBy'].str.strip()
    df = standardize_eia_cbecs_land_activity_names(
        df, column_to_standardize='ActivityConsumedBy')

    # replace withdrawn code
    df.loc[df['FlowAmount'] == "Q", 'FlowAmount'] = withdrawn_keyword
    df.loc[df['FlowAmount'] == "N", 'FlowAmount'] = withdrawn_keyword
    df["Class"] = 'Land'
    df["SourceName"] = 'EIA_CBECS_Land'
    df['Year'] = args["year"]
    df['FlowName'] = "Commercial, " + df[
        "ActivityConsumedBy"] + ", Total floorspace"
    df['Compartment'] = 'ground'
    df['Unit'] = "million square feet"
    df['MeasureofSpread'] = "RSE"
    df['FlowType'] = 'ELEMENTARY_FLOW'

    # drop any duplicates that arise from joining multiple excel files
    df = df.drop_duplicates()

    return df
Example #12
def epa_nei_global_parse(dataframe_list, args):
    """
    Modifies the raw data to meet the flowbyactivity criteria. 
    Renames certain column headers to match flowbyactivity format.
    Adds a few additional columns with hardcoded data.
    Deletes all unnecessary columns.
    """
    df = pd.concat(dataframe_list, sort=True)

    # rename columns to match flowbyactivity format
    if args['year'] == '2017':
        df = df.rename(columns={"pollutant desc": "FlowName",
                                "total emissions": "FlowAmount", 
                                "scc": "ActivityProducedBy", 
                                "fips code": "Location",
                                "emissions uom":"Unit",
                                "pollutant code": "Description"})
    
    elif args['year'] == '2014':
        df = df.rename(columns={"pollutant_desc": "FlowName",
                                "total_emissions": "FlowAmount", 
                                "scc": "ActivityProducedBy", 
                                "state_and_county_fips_code": "Location",
                                "uom":"Unit",
                                "pollutant_cd": "Description"})
    
    elif args['year'] == '2011' or args['year'] == '2008':
        df = df.rename(columns={"description": "FlowName",
                                "total_emissions": "FlowAmount", 
                                "scc": "ActivityProducedBy", 
                                "state_and_county_fips_code": "Location",
                                "uom":"Unit",
                                "pollutant_cd": "Description"})
    
    # make sure FIPS are string and 5 digits
    df['Location'] = df['Location'].astype('str').apply('{:0>5}'.format)
    # remove records from certain FIPS
    excluded_fips = ['78', '85', '88']
    df = df[~df['Location'].str[0:2].isin(excluded_fips)]
    excluded_fips2 = ['777']
    df = df[~df['Location'].str[-3:].isin(excluded_fips2)]
    
    # drop all other columns
    df.drop(df.columns.difference(['FlowName',
                                   'FlowAmount',
                                   'ActivityProducedBy',
                                   'Location',
                                   'Unit',
                                   'Description']), axis=1, inplace=True)
    
    # add hardcoded data
    df['FlowType'] = "ELEMENTARY_FLOW"
    df['Class'] = "Chemicals"
    df['SourceName'] = args['source']
    df['Compartment'] = "air"
    df['Year'] = args['year']
    df = assign_fips_location_system(df, args['year'])
   
    return df
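The '{:0>5}'.format call restores leading zeros on FIPS codes that were read as integers; str.zfill(5) does the same. A quick sketch with hypothetical codes:

import pandas as pd

fips = pd.Series([1001, 6037, 36061])             # codes that lost their leading zeros
padded = fips.astype(str).apply('{:0>5}'.format)  # or: fips.astype(str).str.zfill(5)
print(padded.tolist())                            # ['01001', '06037', '36061']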
Example #13
def census_cbp_parse(*, df_list, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param df_list: list of dataframes to concat and format
    :param year: year
    :return: df, parsed and partially formatted to
        flowbyactivity specifications
    """
    # concat dataframes
    df = pd.concat(df_list, sort=False)
    # Add year
    df['Year'] = year
    # convert county='999' to line for full state
    df.loc[df['county'] == '999', 'county'] = '000'
    # Make FIPS as a combo of state and county codes
    df['Location'] = df['state'] + df['county']
    # now drop them
    df = df.drop(columns=['state', 'county'])
    # rename NAICS column and add NAICS year as description
    if 'NAICS2007' in df.columns:
        df = df.rename(columns={"NAICS2007": "ActivityProducedBy"})
        df['Description'] = 'NAICS2007'
    if 'NAICS2012' in df.columns:
        df = df.rename(columns={"NAICS2012": "ActivityProducedBy"})
        df['Description'] = 'NAICS2012'
    if 'NAICS2017' in df.columns:
        df = df.rename(columns={"NAICS2017": "ActivityProducedBy"})
        df['Description'] = 'NAICS2017'
    # drop all sectors record
    df = df[df['ActivityProducedBy'] != "00"]
    # rename columns
    df = df.rename(
        columns={
            'ESTAB': 'Number of establishments',
            'EMP': 'Number of employees',
            'PAYANN': 'Annual payroll'
        })
    # use "melt" fxn to convert colummns into rows
    df = df.melt(
        id_vars=["Location", "ActivityProducedBy", "Year", "Description"],
        var_name="FlowName",
        value_name="FlowAmount")
    # specify unit based on flowname
    df['Unit'] = np.where(df["FlowName"] == 'Annual payroll', "USD", "p")
    # specify class
    df.loc[df['FlowName'] == 'Number of employees', 'Class'] = 'Employment'
    df.loc[df['FlowName'] == 'Number of establishments', 'Class'] = 'Other'
    df.loc[df['FlowName'] == 'Annual payroll', 'Class'] = 'Money'
    # add location system based on year of data
    df = assign_fips_location_system(df, year)
    # hard code data
    df['SourceName'] = 'Census_CBP'
    # Add tmp DQ scores
    df['DataReliability'] = 5
    df['DataCollection'] = 5
    df['Compartment'] = None
    return df
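The three near-identical NAICS-vintage branches above can also be written as a single scan over the candidate column names. A refactoring sketch with the same behavior, shown on a hypothetical slice of the data:

import pandas as pd

df = pd.DataFrame({'NAICS2012': ['23', '31-33'], 'ESTAB': [10, 20]})

for vintage in ('NAICS2007', 'NAICS2012', 'NAICS2017'):
    if vintage in df.columns:
        df = df.rename(columns={vintage: 'ActivityProducedBy'})
        df['Description'] = vintage
        break

print(df.columns.tolist())   # ['ActivityProducedBy', 'ESTAB', 'Description']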
Example #14
def usgs_zinc_parse(*, df_list, source, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param df_list: list of dataframes to concat and format
    :param source: source
    :param year: year
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    data = {}
    row_to_use = ["Quantity", "Ores and concentrates, zinc content",
                  "United States"]
    import_export = ["Exports:", "Imports for consumption:",
                     "Recoverable zinc:"]
    prod = ""
    name = usgs_myb_name(source)
    dataframe = pd.DataFrame()
    for df in df_list:
        for index, row in df.iterrows():
            if df.iloc[index]["Production"].strip() == "Exports:":
                prod = "exports"
            elif df.iloc[index]["Production"].strip() == \
                    "Imports for consumption:":
                prod = "imports"
            elif df.iloc[index]["Production"].strip() == "Recoverable zinc:":
                prod = "production"
            elif df.iloc[index]["Production"].strip() == "United States":
                prod = "production"

            if df.iloc[index]["Production"].strip() in row_to_use:
                product = df.iloc[index]["Production"].strip()
                data = usgs_myb_static_varaibles()
                data["SourceName"] = source
                data["Year"] = str(year)
                data["Unit"] = "Metric Tons"
                col_name = usgs_myb_year(SPAN_YEARS, year)
                data["FlowAmount"] = str(df.iloc[index][col_name])

                if product.strip() == "Quantity":
                    data["Description"] = "zinc in concentrate"
                    data["ActivityProducedBy"] = "zinc in concentrate "
                    data['FlowName'] = "zinc in concentrate " + prod
                elif product.strip() == "Ores and concentrates, zinc content":
                    data["Description"] = "Ores and concentrates, zinc content"
                    data["ActivityProducedBy"] = \
                        "Ores and concentrates, zinc content"
                    data['FlowName'] = \
                        "Ores and concentrates, zinc content " + prod
                elif product.strip() == "United States":
                    data["Description"] = "Zinc; Mine"
                    data["ActivityProducedBy"] = name + " " + prod
                    data['FlowName'] = "Zinc; Mine"

                dataframe = dataframe.append(data, ignore_index=True)
                dataframe = assign_fips_location_system(
                    dataframe, str(year))
    return dataframe
Example #15
def usgs_platinum_parse(*, df_list, source, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param df_list: list of dataframes to concat and format
    :param source: source
    :param year: year
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    data = {}
    row_to_use = [
        "Quantity", "Palladium, Pd content",
        "Platinum, includes coins, Pt content", "Platinum, Pt content",
        "Iridium, Ir content", "Osmium, Os content", "Rhodium, Rh content",
        "Ruthenium, Ru content",
        "Iridium, osmium, and ruthenium, gross weight", "Rhodium, Rh content"
    ]
    dataframe = pd.DataFrame()

    for df in df_list:
        previous_name = ""
        for index, row in df.iterrows():

            if df.iloc[index]["Production"].strip() == "Exports, refined:":
                product = "exports"
            elif df.iloc[index]["Production"].strip() == \
                    "Imports for consumption, refined:":
                product = "imports"
            elif df.iloc[index]["Production"].strip() == "Mine production:2":
                product = "production"

            name_array = df.iloc[index]["Production"].strip().split(",")

            if product == "production":
                name_array = previous_name.split(",")

            previous_name = df.iloc[index]["Production"].strip()
            name = name_array[0]

            if df.iloc[index]["Production"].strip() in row_to_use:
                data = usgs_myb_static_varaibles()
                data["SourceName"] = source
                data["Year"] = str(year)
                data["Unit"] = "kilograms"
                data['FlowName'] = name + " " + product
                data["Description"] = name
                data["ActivityProducedBy"] = name
                col_name = usgs_myb_year(SPAN_YEARS, year)
                if str(df.iloc[index][col_name]) == "--":
                    data["FlowAmount"] = str(0)
                else:
                    data["FlowAmount"] = str(df.iloc[index][col_name])
                dataframe = dataframe.append(data, ignore_index=True)
                dataframe = assign_fips_location_system(dataframe, str(year))
    return dataframe
Example #16
def usgs_fluorspar_parse(*, df_list, source, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param df_list: list of dataframes to concat and format
    :param source: source
    :param year: year
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    data = {}
    row_to_use = [
        "Quantity", "Quantity3", "Total", "Hydrofluoric acid", "Metallurgical",
        "Production"
    ]
    prod = ""
    name = usgs_myb_name(source)
    dataframe = pd.DataFrame()
    for df in df_list:
        for index, row in df.iterrows():
            if df.iloc[index]["Production"].strip() == "Exports:3":
                prod = "exports"
                des = name
            elif df.iloc[index]["Production"].strip() == \
                    "Imports for consumption:3":
                prod = "imports"
                des = name
            elif df.iloc[index]["Production"].strip() == "Fluorosilicic acid:":
                prod = "production"
                des = "Fluorosilicic acid:"

            if str(df.iloc[index]["type"]).strip() == "data_two":
                prod = "imports"
                des = df.iloc[index]["Production"].strip()
            elif str(df.iloc[index]["type"]).strip() == \
                    "Aluminum Fluoride" or \
                    str(df.iloc[index]["type"]).strip() == "Cryolite":
                prod = "imports"
                des = df.iloc[index]["type"].strip()

            if df.iloc[index]["Production"].strip() in row_to_use:
                data = usgs_myb_static_varaibles()
                data["SourceName"] = source
                data["Year"] = str(year)
                data["Unit"] = "Metric Tons"
                col_name = usgs_myb_year(SPAN_YEARS, year)
                if str(df.iloc[index][col_name]) == "W":
                    data["FlowAmount"] = WITHDRAWN_KEYWORD
                else:
                    data["FlowAmount"] = str(df.iloc[index][col_name])
                data["Description"] = des
                data["ActivityProducedBy"] = name
                data['FlowName'] = name + " " + prod
                dataframe = dataframe.append(data, ignore_index=True)
                dataframe = assign_fips_location_system(dataframe, str(year))
    return dataframe
Example #17
def usgs_zirconium_parse(*, df_list, source, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param df_list: list of dataframes to concat and format
    :param source: source
    :param year: year
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    data = {}
    row_to_use = [
        "Imports for consumption3", "Concentrates", "Exports",
        "Hafnium, unwrought, including powder, "
        "imports for consumption"
    ]
    dataframe = pd.DataFrame()
    name = usgs_myb_name(source)

    for df in df_list:
        for index, row in df.iterrows():
            if df.iloc[index]["Production"].strip() == \
                    "Imports for consumption3":
                product = "imports"
            elif df.iloc[index]["Production"].strip() == "Concentrates":
                product = "production"
            elif df.iloc[index]["Production"].strip() == "Exports":
                product = "exports"

            if df.iloc[index]["Production"].strip() == \
                    "Hafnium, unwrought, including powder, imports for " \
                    "consumption":
                prod = "imports"
                des = df.iloc[index]["Production"].strip()
            else:
                des = name

            if df.iloc[index]["Production"].strip() in row_to_use:
                data = usgs_myb_static_varaibles()
                data["SourceName"] = source
                data["Year"] = str(year)
                data["Unit"] = "Metric Tons"
                data['FlowName'] = name + " " + product
                data["Description"] = des
                data["ActivityProducedBy"] = name
                col_name = usgs_myb_year(SPAN_YEARS, year)
                if str(df.iloc[index][col_name]) == "--" or \
                        str(df.iloc[index][col_name]) == "(3)":
                    data["FlowAmount"] = str(0)
                elif str(df.iloc[index][col_name]) == "W":
                    data["FlowAmount"] = WITHDRAWN_KEYWORD
                else:
                    data["FlowAmount"] = str(df.iloc[index][col_name])
                dataframe = dataframe.append(data, ignore_index=True)
                dataframe = assign_fips_location_system(dataframe, str(year))
    return dataframe
Example #18
def eia_mecs_land_parse(dataframe_list, args):
    """
    Combine, parse, and format the provided dataframes
    :param dataframe_list: list of dataframes to concat and format
    :param args: dictionary, used to run flowbyactivity.py
        ('year' and 'source')
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    df_array = []
    for dataframes in dataframe_list:

        dataframes = dataframes.rename(
            columns={'NAICS Code(a)': 'ActivityConsumedBy'})
        dataframes = dataframes.rename(
            columns={'Subsector and Industry': 'Description'})
        dataframes.loc[dataframes.Description == "Total",
                       "ActivityConsumedBy"] = "31-33"
        unit = []
        for index, row in dataframes.iterrows():
            if row["FlowName"] == "Establishments(b) (counts)":
                row["FlowName"] = "Establishments (counts)"
            flow_name_str = row["FlowName"]
            flow_name_array = flow_name_str.split("(")
            row["FlowName"] = flow_name_array[0]
            unit_text = flow_name_array[1]
            unit_text_array = unit_text.split(")")
            if unit_text_array[0] == "counts":
                unit.append(("p"))
            else:
                unit.append(unit_text_array[0])
            ACB = row["ActivityConsumedBy"]
            ACB_str = str(ACB).strip()
            row["ActivityConsumedBy"] = ACB_str
        df_array.append(dataframes)
    df = pd.concat(df_array, sort=False)

    # trim whitespace associated with Activity
    df['Description'] = df['Description'].str.strip()

    # add manufacturing to end of description if missing
    df['Description'] = df['Description'].apply(
        lambda x: x + ' Manufacturing'
        if not x.endswith('Manufacturing') else x)

    # replace withdrawn code
    df.loc[df['FlowAmount'] == "Q", 'FlowAmount'] = withdrawn_keyword
    df.loc[df['FlowAmount'] == "N", 'FlowAmount'] = withdrawn_keyword
    df["Class"] = 'Land'
    df["SourceName"] = 'EIA_MECS_Land'
    df['Year'] = args["year"]
    df["Compartment"] = 'ground'
    df['MeasureofSpread'] = "RSE"
    df['Location'] = US_FIPS
    df['Unit'] = unit
    df = assign_fips_location_system(df, args['year'])
    df['FlowType'] = "ELEMENTARY_FLOW"

    # modify flowname
    df['FlowName'] = df['Description'] + ', ' + df['FlowName'].str.strip()

    return df
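The inner loop above peels the unit out of labels such as "Establishments(b) (counts)" by splitting on parentheses. A vectorized sketch of the same idea (not the flowsa implementation; it leaves footnote markers like "(b)" in the flow name, which the original handles with a special-case rename):

import pandas as pd

flows = pd.Series([
    "Approximate Enclosed Floorspace of All Buildings Onsite (million sq ft)",
    "Establishments(b) (counts)"])                      # hypothetical MECS labels
parts = flows.str.extract(r'^(?P<FlowName>.*?)\s*\((?P<Unit>[^()]*)\)\s*$')
parts['Unit'] = parts['Unit'].replace({'counts': 'p'})  # counts are reported as 'p'
print(parts)   # Unit column: 'million sq ft', 'p'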
Example #19
def iwms_parse(dataframe_list, args):
    """Modify the imported data so it meets the flowbyactivity criteria and only includes data on harvested acreage
    (irrigated and total)."""
    df = pd.concat(dataframe_list, sort=False)
    # only interested in total water applied, not water applied by type of irrigation
    df = df[df['domain_desc'] == 'TOTAL']
    # drop unused columns
    df = df.drop(columns=[
        'CV (%)', 'agg_level_desc', 'location_desc', 'state_alpha',
        'sector_desc', 'country_code', 'begin_code', 'watershed_code',
        'reference_period_desc', 'asd_desc', 'county_name', 'source_desc',
        'congr_district_code', 'asd_code', 'week_ending', 'freq_desc',
        'load_time', 'zip_5', 'watershed_desc', 'region_desc', 'state_ansi',
        'state_name', 'country_name', 'county_ansi', 'end_code', 'group_desc',
        'util_practice_desc', 'class_desc'
    ])
    # create FIPS column by combining existing columns
    df.loc[df['county_code'] == '',
           'county_code'] = '000'  # add county fips when missing
    df['Location'] = df['state_fips_code'] + df['county_code']
    df.loc[df['Location'] == '99000',
           'Location'] = US_FIPS  # modify national level fips
    # create activityconsumedby column
    df['ActivityConsumedBy'] = df['short_desc'].str.split(', IRRIGATED').str[0]
    df['ActivityConsumedBy'] = df['ActivityConsumedBy'].str.replace(
        ", IN THE OPEN", "",
        regex=True)  # not interested in all data from class_desc
    # rename columns to match flowbyactivity format
    df = df.rename(
        columns={
            "Value": "FlowAmount",
            "unit_desc": "Unit",
            "year": "Year",
            "short_desc": "Description",
            "domaincat_desc": "Compartment"
        })
    # drop remaining unused columns
    df = df.drop(columns=[
        'commodity_desc', 'state_fips_code', 'county_code',
        'statisticcat_desc', 'prodn_practice_desc', 'domain_desc'
    ])
    # modify contents of flowamount column:
    # "D" is suppressed data, "Z" means less than half the unit is shown
    df['FlowAmount'] = df['FlowAmount'].str.strip()  # trim whitespace
    df.loc[df['FlowAmount'] == "(D)", 'FlowAmount'] = withdrawn_keyword
    # df.loc[df['FlowAmount'] == "(Z)", 'FlowAmount'] = withdrawn_keyword
    df['FlowAmount'] = df['FlowAmount'].str.replace(",", "", regex=True)
    # add location system based on year of data
    df = assign_fips_location_system(df, args['year'])
    # Add hardcoded data
    df['Class'] = "Water"
    df['SourceName'] = "USDA_IWMS"
    df['DataReliability'] = None  # TODO: score data quality
    df['DataCollection'] = None
    return df
Example #20
def usgs_strontium_parse(*, df_list, source, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param df_list: list of dataframes to concat and format
    :param source: source
    :param year: year
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    data = {}
    row_to_use = [
        "Production, strontium minerals", "Strontium compounds3", "Celestite4",
        "Strontium carbonate"
    ]
    prod = ""
    name = usgs_myb_name(source)
    des = name
    dataframe = pd.DataFrame()
    col_name = usgs_myb_year(SPAN_YEARS, year)
    for df in df_list:
        for index, row in df.iterrows():
            if df.iloc[index]["Production"].strip() == \
                    "Imports for consumption:2":
                product = "imports"
            elif df.iloc[index]["Production"].strip() == \
                    "Production, strontium minerals":
                product = "production"
            elif df.iloc[index]["Production"].strip() == "Exports:2":
                product = "exports"

            if df.iloc[index]["Production"].strip() in row_to_use:
                data = usgs_myb_static_varaibles()
                data["SourceName"] = source
                data["Year"] = str(year)
                data["Unit"] = "Metric Tons"
                if usgs_myb_remove_digits(
                        df.iloc[index]["Production"].strip()) == "Celestite":
                    data['FlowName'] = \
                        name + " " + product + " " + usgs_myb_remove_digits(
                            df.iloc[index]["Production"].strip())
                else:
                    data['FlowName'] = name + " " + product
                data["Description"] = usgs_myb_remove_digits(
                    df.iloc[index]["Production"].strip())
                data["ActivityProducedBy"] = name
                col_name = usgs_myb_year(SPAN_YEARS, year)
                if str(df.iloc[index][col_name]) == "--" or \
                        str(df.iloc[index][col_name]) == "(3)":
                    data["FlowAmount"] = str(0)
                else:
                    data["FlowAmount"] = str(df.iloc[index][col_name])
                dataframe = dataframe.append(data, ignore_index=True)
                dataframe = assign_fips_location_system(dataframe, str(year))
    return dataframe
Example #21
def prepare_stewi_fbs(df, inventory_dict, NAICS_level, geo_scale):
    """
    Aggregate and format the provided StEWI dataframe to
    flowbysector specifications
    :param df: inventory dataframe with assigned sectors ('NAICS_lvl')
    :param inventory_dict: dictionary of inventory sources and years
    :param NAICS_level: desired NAICS aggregation level
    :param geo_scale: desired geographic aggregation level
    :return: df, formatted to flowbysector specifications
    """
    from stewi.globals import weighted_average

    # update location to appropriate geoscale prior to aggregating
    df.dropna(subset=['Location'], inplace=True)
    df['Location'] = df['Location'].astype(str)
    df = update_geoscale(df, geo_scale)

    # assign grouping variables based on desired geographic aggregation level
    grouping_vars = ['NAICS_lvl', 'FlowName', 'Compartment', 'Location']
    if 'MetaSources' in df:
        grouping_vars.append('MetaSources')

    # aggregate by NAICS code, FlowName, compartment, and geographic level
    fbs = df.groupby(grouping_vars).agg({
        'FlowAmount': 'sum',
        'Year': 'first',
        'Unit': 'first'
    })

    # add reliability score
    fbs['DataReliability'] = weighted_average(df, 'DataReliability',
                                              'FlowAmount', grouping_vars)
    fbs.reset_index(inplace=True)

    # apply flow mapping
    fbs = map_elementary_flows(fbs, list(inventory_dict.keys()))

    # rename columns to match flowbysector format
    fbs = fbs.rename(columns={"NAICS_lvl": "SectorProducedBy"})

    # add hardcoded data, depending on the source data, some of these fields may need to change
    fbs['Class'] = 'Chemicals'
    fbs['SectorConsumedBy'] = 'None'
    fbs['SectorSourceName'] = 'NAICS_2012_Code'
    fbs['FlowType'] = 'ELEMENTARY_FLOW'

    fbs = assign_fips_location_system(fbs, list(inventory_dict.values())[0])

    # add missing flow by sector fields
    fbs = add_missing_flow_by_fields(fbs, flow_by_sector_fields)

    fbs = check_for_missing_sector_data(fbs, NAICS_level)

    # sort dataframe and reset index
    fbs = fbs.sort_values(list(
        flow_by_sector_fields.keys())).reset_index(drop=True)

    # check the sector codes to make sure NAICS 2012 codes
    fbs = replace_naics_w_naics_2012(fbs, 'NAICS_2012_Code')

    return fbs
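weighted_average from stewi.globals is used here to carry DataReliability through the aggregation as a FlowAmount-weighted mean. A minimal plain-pandas equivalent (a sketch with a hypothetical helper name, not the stewi implementation):

import pandas as pd

def flow_weighted_average(df, value_col, weight_col, group_cols):
    # weight each value by FlowAmount, then divide the group sums
    grouped = df.assign(_wv=df[value_col] * df[weight_col]).groupby(group_cols)
    return grouped['_wv'].sum() / grouped[weight_col].sum()

demo = pd.DataFrame({'NAICS_lvl': ['111', '111', '112'],
                     'DataReliability': [1.0, 3.0, 2.0],
                     'FlowAmount': [10.0, 30.0, 5.0]})
print(flow_weighted_average(demo, 'DataReliability', 'FlowAmount', ['NAICS_lvl']))
# 111 -> (1*10 + 3*30) / 40 = 2.5 ; 112 -> 2.0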
Example #22
def usgs_cobalt_parse(*, df_list, source, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param df_list: list of dataframes to concat and format
    :param source: source
    :param year: year
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    data = {}
    name = usgs_myb_name(source)
    des = name
    row_to_use = [
        "United Statese, 16, 17", "Mine productione",
        "Imports for consumption", "Exports"
    ]
    dataframe = pd.DataFrame()
    for df in df_list:

        for index, row in df.iterrows():
            prod = "production"
            if df.iloc[index]["Production"].strip() == \
                    "United Statese, 16, 17":
                prod = "production"
            elif df.iloc[index]["Production"].strip() == \
                    "Imports for consumption":
                prod = "imports"
            elif df.iloc[index]["Production"].strip() == "Exports":
                prod = "exports"

            if df.iloc[index]["Production"].strip() in row_to_use:
                remove_digits = str.maketrans('', '', digits)
                product = df.iloc[index]["Production"].strip().translate(
                    remove_digits)
                data = usgs_myb_static_varaibles()

                data["SourceName"] = source
                data["Year"] = str(year)

                data["Unit"] = "Thousand Metric Tons"
                col_name = usgs_myb_year(SPAN_YEARS, year)
                data["Description"] = des
                data["ActivityProducedBy"] = name
                data['FlowName'] = name + " " + prod

                data["FlowAmount"] = str(df.iloc[index][col_name])
                remove_rows = ["(18)", "(2)"]
                if data["FlowAmount"] not in remove_rows:
                    dataframe = dataframe.append(data, ignore_index=True)
                    dataframe = assign_fips_location_system(
                        dataframe, str(year))
    return dataframe
Example #23
def usgs_clay_parse(*, df_list, source, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param df_list: list of dataframes to concat and format
    :param source: source
    :param year: year
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    data = {}
    row_to_use = [
        "Ball clay", "Bentonite", "Fire clay", "Kaolin", "Fuller’s earth",
        "Total", "Grand total", "Artificially activated clay and earth",
        "Clays, not elsewhere classified"
    ]
    dataframe = pd.DataFrame()
    for df in df_list:
        for index, row in df.iterrows():
            if df.iloc[index]["type"].strip() == "import":
                product = "imports"
            elif df.iloc[index]["type"].strip() == "export":
                product = "exports"
            else:
                product = "production"

            if str(df.iloc[index]["Production"]).strip() in row_to_use:
                data = usgs_myb_static_varaibles()
                data["SourceName"] = source
                data["Year"] = str(year)
                data["Unit"] = "Metric Tons"
                if product == "production":
                    data['FlowName'] = \
                        df.iloc[index]["type"].strip() + " " + product
                    data["Description"] = df.iloc[index]["type"].strip()
                    data["ActivityProducedBy"] = df.iloc[index]["type"].strip()
                else:
                    data['FlowName'] = \
                        df.iloc[index]["Production"].strip() + " " + product
                    data["Description"] = df.iloc[index]["Production"].strip()
                    data["ActivityProducedBy"] = \
                        df.iloc[index]["Production"].strip()

                col_name = usgs_myb_year(SPAN_YEARS, year)
                if str(df.iloc[index][col_name]) == "--" or \
                        str(df.iloc[index][col_name]) == "(3)" or \
                        str(df.iloc[index][col_name]) == "(2)":
                    data["FlowAmount"] = str(0)
                else:
                    data["FlowAmount"] = str(df.iloc[index][col_name])
                dataframe = dataframe.append(data, ignore_index=True)
                dataframe = assign_fips_location_system(dataframe, str(year))
    return dataframe
Example #24
def usgs_nickel_parse(*, df_list, source, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param df_list: list of dataframes to concat and format
    :param source: source
    :param year: year
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    data = {}
    row_to_use = [
        "Ores and concentrates3", "United States, sulfide ore, concentrate"
    ]
    import_export = ["Exports:", "Imports for consumption:"]
    name = usgs_myb_name(source)
    des = name
    dataframe = pd.DataFrame()
    for df in df_list:
        prod = "production"
        for index, row in df.iterrows():
            if df.iloc[index]["Production"].strip() == "Exports:":
                prod = "exports"
            elif df.iloc[index]["Production"].strip() == \
                    "Imports for consumption:":
                prod = "imports"
            if df.iloc[index]["Production"].strip() in row_to_use:
                remove_digits = str.maketrans('', '', digits)
                product = df.iloc[index]["Production"].strip().translate(
                    remove_digits)
                data = usgs_myb_static_varaibles()
                data["SourceName"] = source
                data["Year"] = str(year)
                data["Unit"] = "Metric Tons"
                col_name = usgs_myb_year(SPAN_YEARS, year)
                if product.strip() == \
                        "United States, sulfide ore, concentrate":
                    data["Description"] = \
                        "United States, sulfide ore, concentrate Nickel"
                    data["ActivityProducedBy"] = name
                    data['FlowName'] = name + " " + prod
                elif product.strip() == "Ores and concentrates":
                    data["Description"] = "Ores and concentrates Nickel"
                    data["ActivityProducedBy"] = name
                    data['FlowName'] = name + " " + prod
                if str(df.iloc[index][col_name]) == "--" or \
                        str(df.iloc[index][col_name]) == "(4)":
                    data["FlowAmount"] = str(0)
                else:
                    data["FlowAmount"] = str(df.iloc[index][col_name])
                dataframe = dataframe.append(data, ignore_index=True)
                dataframe = assign_fips_location_system(dataframe, str(year))
    return dataframe
Example #25
def eia_mecs_energy_parse(dataframe_list, args):
    """
    Combine, parse, and format the provided dataframes
    :param dataframe_list: list of dataframes to concat and format
    :param args: dictionary, used to run flowbyactivity.py
        ('year' and 'source')
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """

    from flowsa.common import assign_census_regions

    # concatenate dataframe list into single dataframe
    df = pd.concat(dataframe_list, sort=True)

    # rename columns to match standard flowbyactivity format
    df = df.rename(
        columns={
            'NAICS Code': 'ActivityConsumedBy',
            'Subsector and Industry': 'Description'
        })
    df['ActivityConsumedBy'] = df['ActivityConsumedBy'].str.strip()
    # add hardcoded data
    df["SourceName"] = args['source']
    df["Compartment"] = None
    df['FlowType'] = 'TECHNOSPHERE_FLOWS'
    df['Year'] = args["year"]
    df['MeasureofSpread'] = "RSE"
    # assign location codes and location system
    df.loc[df['Location'] == 'Total United States', 'Location'] = US_FIPS
    df = assign_fips_location_system(df, args['year'])
    df = assign_census_regions(df)
    df.loc[df['Description'] == 'Total', 'ActivityConsumedBy'] = '31-33'

    # drop rows that reflect subtotals (only necessary in 2014)
    df.dropna(subset=['ActivityConsumedBy'], inplace=True)

    # replace withheld/unavailable data
    # * = estimate is less than 0.5
    # W = withheld to avoid disclosing data for individual establishments
    # Q = withheld because relative standard error is greater than 50 percent
    # NA = not available
    df.loc[df['FlowAmount'] == '*', 'FlowAmount'] = None
    df.loc[df['FlowAmount'] == 'W', 'FlowAmount'] = withdrawn_keyword
    df.loc[df['FlowAmount'] == 'Q', 'FlowAmount'] = withdrawn_keyword
    df.loc[df['FlowAmount'] == 'NA', 'FlowAmount'] = None
    # Spread uses the same withheld/unavailable codes as above, plus:
    # X = not defined because relative standard error corresponds to
    #     a value of zero
    # at least one 'empty' cell appears to contain a space
    df.loc[df['Spread'] == '*', 'Spread'] = None
    df.loc[df['Spread'] == 'W', 'Spread'] = withdrawn_keyword
    df.loc[df['Spread'] == 'Q', 'Spread'] = withdrawn_keyword
    df.loc[df['Spread'] == 'NA', 'Spread'] = None
    df.loc[df['Spread'] == 'X', 'Spread'] = None
    df.loc[df['Spread'] == ' ', 'Spread'] = None

    return df
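The block of .loc assignments maps the MECS status codes one value at a time; the same mapping can be written once with Series.replace. A sketch, assuming withdrawn_keyword is a plain string (the placeholder below is hypothetical) and using NaN where the original assigns None:

import numpy as np
import pandas as pd

withdrawn_keyword = "W"   # hypothetical placeholder; flowsa defines the real constant

status_map = {'*': np.nan, 'NA': np.nan, 'X': np.nan, ' ': np.nan,
              'W': withdrawn_keyword, 'Q': withdrawn_keyword}
spread = pd.Series(['12.3', 'W', '*', ' ', 'Q'])
print(spread.replace(status_map).tolist())   # ['12.3', 'W', nan, nan, 'W']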
Example #26
def soda_parse(**kwargs):
    """
    Combine, parse, and format the provided dataframes
    :param kwargs: potential arguments include:
                   dataframe_list: list of dataframes to concat and format
                   args: dictionary, used to run flowbyactivity.py ('year' and 'source')
    :return: df, parsed and partially formatted to flowbyactivity specifications
    """
    # load arguments necessary for function
    dataframe_list = kwargs['dataframe_list']
    args = kwargs['args']

    total_glass = 0
    data = {}
    dataframe = pd.DataFrame()
    for df in dataframe_list:

        data["Class"] = "Chemicals"
        data['FlowType'] = "Elementary Type"
        data["Location"] = "00000"
        data["Compartment"] = " "
        data["SourceName"] = "USGS_MYB_SodaAsh"
        data["Year"] = str(args["year"])
        data["Unit"] = "Thousand metric tons"
        data['FlowName'] = "Soda Ash"
        data["Context"] = "air"
        data['DataReliability'] = 5  # tmp
        data['DataCollection'] = 5  # tmp

        for index, row in df.iterrows():
            data["Description"] = ""
            data['ActivityConsumedBy'] = description(
                df.iloc[index]["End use"], df.iloc[index]["NAICS code"])

            if df.iloc[index]["End use"].strip() == "Glass:":
                total_glass = int(df.iloc[index]["NAICS code"])
            elif data['ActivityConsumedBy'] == "Glass Total":
                data["Description"] = total_glass

            if not math.isnan(df.iloc[index]["Total"]):
                data["FlowAmount"] = int(df.iloc[index]["Total"])
                data["ActivityProducedBy"] = None
                if not math.isnan(df.iloc[index]["NAICS code"]):
                    des_str = str(df.iloc[index]["NAICS code"])
                    data["Description"] = des_str
            if df.iloc[index]["End use"].strip() != "Glass:":
                dataframe = dataframe.append(data, ignore_index=True)
                dataframe = assign_fips_location_system(
                    dataframe, str(args["year"]))
    return dataframe
Example #27
def usgs_boron_parse(*, df_list, source, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param df_list: list of dataframes to concat and format
    :param source: source
    :param year: year
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    data = {}
    row_to_use = ["B2O3 content", "Quantity"]
    prod = ""
    name = usgs_myb_name(source)
    des = name
    dataframe = pd.DataFrame()
    col_name = usgs_myb_year(SPAN_YEARS, year)

    for df in df_list:
        for index, row in df.iterrows():

            if df.iloc[index]["Production"].strip() == "B2O3 content" or \
                    df.iloc[index]["Production"].strip() == "Quantity":
                product = "production"

            if df.iloc[index]["Production"].strip() == "Colemanite:4":
                des = "Colemanite"
            elif df.iloc[index]["Production"].strip() == "Ulexite:4":
                des = "Ulexite"

            if df.iloc[index]["Production"].strip() in row_to_use:
                data = usgs_myb_static_varaibles()
                data["SourceName"] = source
                data["Year"] = str(year)
                data["Unit"] = "Metric Tons"
                if des == name:
                    data['FlowName'] = name + " " + product
                else:
                    data['FlowName'] = name + " " + product + " " + des
                data["Description"] = des
                data["ActivityProducedBy"] = name
                if str(df.iloc[index][col_name]) == "--" or \
                        str(df.iloc[index][col_name]) == "(3)":
                    data["FlowAmount"] = str(0)
                elif str(df.iloc[index][col_name]) == "W":
                    data["FlowAmount"] = WITHDRAWN_KEYWORD
                else:
                    data["FlowAmount"] = str(df.iloc[index][col_name])
                dataframe = dataframe.append(data, ignore_index=True)
                dataframe = assign_fips_location_system(dataframe, str(year))
    return dataframe
Example #28
def calR_parse(**kwargs):
    """
    Combine, parse, and format the provided dataframes
    :param kwargs: potential arguments include:
                   dataframe_list: list of dataframes to concat and format
                   args: dictionary, used to run flowbyactivity.py ('year' and 'source')
    :return: df, parsed and partially formatted to flowbyactivity specifications
    """
    # load arguments necessary for function
    args = kwargs['args']

    data = {}
    output = pd.DataFrame()

    for entry in os.listdir(externaldatapath):
        if os.path.isfile(os.path.join(externaldatapath, entry)):
            data["Class"] = "Other"
            data['FlowType'] = "Waste Flow"
            data["Location"] = "06000"
            data["Compartment"] = "ground"
            data["LocationSystem"] = "FIPS"
            data["SourceName"] = "California_Commercial_bySector"
            data["Year"] = args['year']
            data['DataReliability'] = 5  # tmp
            data['DataCollection'] = 5  # tmp

            if "California_Commercial_bySector_2014" in entry and "Map" not in entry:
                data["ActivityProducedBy"] = produced_by(entry)
                data["ActivityConsumedBy"] = None
                dataframe = pd.read_csv(externaldatapath + "/" + entry,
                                        header=0,
                                        dtype=str)
                for col in dataframe.columns:
                    if "Percent" in str(col):
                        del dataframe[col]

                for index, row in dataframe.iterrows():
                    for column in dataframe.columns:
                        if column != "Material":
                            col_string = column.split()
                            data["Unit"] = col_string[1]
                            data['FlowName'] = dataframe.iloc[index]["Material"] + \
                                               " " + col_string[0]
                            if dataframe.iloc[index][column] != "-":
                                data["FlowAmount"] = int(
                                    dataframe.iloc[index][column])
                                output = output.append(data, ignore_index=True)
                                output = assign_fips_location_system(
                                    output, '2014')
    return output
Example #29
def eia_cbecs_water_parse(**kwargs):
    """
    Combine, parse, and format the provided dataframes
    :param kwargs: potential arguments include:
                   dataframe_list: list of dataframes to concat and format
                   args: dictionary, used to run flowbyactivity.py ('year' and 'source')
    :return: df, parsed and partially formatted to flowbyactivity specifications
    """
    # load arguments necessary for function
    dataframe_list = kwargs['dataframe_list']
    args = kwargs['args']

    # concat dataframes
    df = pd.concat(dataframe_list, sort=False).dropna()
    # drop columns
    df = df.drop(columns=[
        "Distribution of building 25th", "Distribution of building Median",
        "Distribution of building 75th"
    ])
    # use "melt" fxn to convert colummns into rows
    df = df.melt(id_vars=["PBA"], var_name="FlowName", value_name="FlowAmount")
    # rename column(s)
    df = df.rename(columns={'PBA': 'ActivityConsumedBy'})
    # replace withdrawn code
    df.loc[df['FlowAmount'] == "Q", 'FlowAmount'] = WITHDRAWN_KEYWORD
    # add unit based on flowname
    df.loc[df['FlowName'] == 'Number of Buildings', 'Unit'] = 'p'
    df.loc[df['FlowName'] == "Total Floor Space",
           'Unit'] = 'million square feet'
    df.loc[df['FlowName'] == "Total Consumption", 'Unit'] = 'billion gallons'
    df.loc[df['FlowName'] == "Consumption per Building",
           'Unit'] = 'thousand gallons'
    df.loc[df['FlowName'] == "Consumption per square foot", 'Unit'] = 'gallons'
    df.loc[df['FlowName'] == "Consumption per worker",
           'Unit'] = 'thousand gallons'
    # class type based on flowname/unit
    df["Class"] = 'Water'
    df.loc[df['FlowName'] == 'Number of Buildings', 'Class'] = 'Other'
    df.loc[df['FlowName'] == "Total Floor Space", 'Class'] = 'Other'
    # add location system based on year of data
    df = assign_fips_location_system(df, args['year'])
    # hardcode columns
    df["SourceName"] = 'EIA_CBECS_Water'
    df['Year'] = args["year"]
    df['Location'] = US_FIPS
    df['DataReliability'] = 5  # tmp
    df['DataCollection'] = 5  # tmp

    return df
Example #30
def usgs_gold_parse(*, df_list, source, year, **_):
    """
    Combine, parse, and format the provided dataframes
    :param df_list: list of dataframes to concat and format
    :param source: source
    :param year: year
    :return: df, parsed and partially formatted to flowbyactivity
        specifications
    """
    data = {}
    row_to_use = [
        "Quantity", "Exports, refined bullion",
        "Imports for consumption, refined bullion"
    ]
    dataframe = pd.DataFrame()
    product = "production"
    name = usgs_myb_name(source)
    des = name
    for df in df_list:
        for index, row in df.iterrows():

            if df.iloc[index]["Production"].strip() == "Quantity":
                product = "production"
            elif df.iloc[index]["Production"].strip() == \
                    "Exports, refined bullion":
                product = "exports"
            elif df.iloc[index]["Production"].strip() == \
                    "Imports for consumption, refined bullion":
                product = "imports"

            if df.iloc[index]["Production"].strip() in row_to_use:
                data = usgs_myb_static_varaibles()
                data["SourceName"] = source
                data["Year"] = str(year)
                data["Unit"] = "kilograms"
                data['FlowName'] = name + " " + product

                data["Description"] = des
                data["ActivityProducedBy"] = name
                col_name = usgs_myb_year(SPAN_YEARS, year)
                if str(df.iloc[index][col_name]) == "--":
                    data["FlowAmount"] = str(0)
                else:
                    data["FlowAmount"] = str(df.iloc[index][col_name])
                dataframe = dataframe.append(data, ignore_index=True)
                dataframe = assign_fips_location_system(dataframe, str(year))
    return dataframe
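Several of the USGS parsers above repeat the same sentinel handling ("--", "(3)", "W", ...) when reading the year column. A small helper that centralizes it (not part of flowsa; the constant below is a hypothetical stand-in):

WITHDRAWN_KEYWORD = "W"   # hypothetical stand-in; flowsa defines the real constant

def parse_usgs_amount(raw, zero_sentinels=("--", "(2)", "(3)", "(4)"), withheld="W"):
    # map a USGS Minerals Yearbook cell to a FlowAmount string
    value = str(raw).strip()
    if value in zero_sentinels:
        return str(0)
    if value == withheld:
        return WITHDRAWN_KEYWORD
    return value

print(parse_usgs_amount("--"))      # '0'
print(parse_usgs_amount("W"))       # 'W'
print(parse_usgs_amount("1,230"))   # '1,230'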