Example #1
def check_if_sectors_are_naics(df, crosswalk_list, column_headers):
    """
    Check if activity-like sectors are in fact sectors. Also works for the Sector column
    :return:
    """

    # list of dfs of non-sectors to export
    non_sectors_df = []
    # list of dfs holding just the non-sector column
    non_sectors_list = []
    # loop through the df headers and determine if value is not in crosswalk list
    for c in column_headers:
        # create df where sectors do not exist in master crosswalk
        non_sectors = df[~df[c].isin(crosswalk_list)]
        # drop rows where c is empty
        non_sectors = non_sectors[non_sectors[c] != '']
        # subset to just the sector column
        if len(non_sectors) != 0:
            sectors = non_sectors[[c]].rename(columns={c: 'NonSectors'})
            non_sectors_df.append(non_sectors)
            non_sectors_list.append(sectors)

    if len(non_sectors_df) != 0:
        # concat the df and the df of sectors
        # ns_df = pd.concat(non_sectors_df, sort=False, ignore_index=True)
        ns_list = pd.concat(non_sectors_list, sort=False, ignore_index=True)
        # print the NonSectors
        non_sectors = ns_list['NonSectors'].drop_duplicates().tolist()
        log.info('There are sectors that are not NAICS 2012 Codes')
        print(non_sectors)
    else:
        log.info('All sectors are NAICS 2012 Codes')

    return non_sectors_df
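
A minimal usage sketch with a toy dataframe, assuming the function above is importable; the sector codes and column values below are illustrative, not taken from the real master crosswalk:

import pandas as pd

# toy crosswalk subset and df; real calls pass the full NAICS 2012 code list
crosswalk_list = ['111110', '111120', '111130']
df = pd.DataFrame({'SectorProducedBy': ['111110', '999999', ''],
                   'SectorConsumedBy': ['111120', '111130', '111110']})

# '999999' is not in the crosswalk, so it is logged and returned as a non-sector
non_sectors = check_if_sectors_are_naics(
    df, crosswalk_list, ['SectorProducedBy', 'SectorConsumedBy'])
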
Example #2
def getFlowByActivity(flowclass, years, datasource):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, 'Class' values of the flow. Required. E.g. ['Water'] or
        ['Land', 'Other']
    :param years: list, years of data, e.g. [2015] or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource
    :return: a pandas DataFrame in FlowByActivity format
    """
    fbas = pd.DataFrame()
    for y in years:
        # first try reading parquet from your local repo
        try:
            log.info('Loading ' + datasource + ' ' + str(y) +' parquet from local repository')
            fba = pd.read_parquet(fbaoutputpath + datasource + "_" + str(y) + ".parquet")
            fba = fba[fba['Class'].isin(flowclass)]
            fbas = pd.concat([fbas, fba], sort=False)
        except (OSError, FileNotFoundError):
            # if parquet does not exist in local repo, read file from Data Commons
            try:
                log.info(datasource + ' parquet not found in local repo, loading from Data Commons')
                fba = pd.read_parquet('https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowByActivity/' +
                                      datasource + "_" + str(y) + '.parquet')
                fba = fba[fba['Class'].isin(flowclass)]
                fbas = pd.concat([fbas, fba], sort=False)
            except FileNotFoundError:
                log.error("No parquet file found for datasource " + datasource + "and year " + str(
                    y) + " in flowsa or Data Commons")
    return fbas
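
An illustrative call of the list-based signature above, assuming flowsa is installed and the named FlowByActivity parquet exists locally or on Data Commons (the datasource code and year are only examples):

import flowsa

# load 2015 water data; the datasource code is illustrative
fba = flowsa.getFlowByActivity(flowclass=['Water'], years=[2015],
                               datasource='USGS_NWIS_WU')
print(fba.head())
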
Example #3
def map_fbs_flows(fbs, from_fba_source, v, **kwargs):
    """
    Identifies the mapping file and applies mapping to fbs flows
    :param fbs: flow-by-sector dataframe
    :param from_fba_source: str Source name of fba list to look for mappings
    :param v: dictionary, The datasource parameters
    :param kwargs: includes keep_unmapped_columns and keep_fba_columns
    :return fbs_mapped: df, with flows mapped using federal elementary
           flow list or material flow list
    :return mapping_files: str, name of mapping file
    """
    ignore_source_name = False
    if 'mfl_mapping' in v:
        mapping_files = v['mfl_mapping']
        log.info("Mapping flows in %s to material flow list", from_fba_source)
        flow_type = 'WASTE_FLOW'
        ignore_source_name = True
    else:
        log.info("Mapping flows in %s to federal elementary flow list",
                 from_fba_source)
        if 'fedefl_mapping' in v:
            mapping_files = v['fedefl_mapping']
            ignore_source_name = True
        else:
            mapping_files = from_fba_source
        flow_type = 'ELEMENTARY_FLOW'

    fbs_mapped = map_flows(fbs, mapping_files, flow_type, ignore_source_name,
                           **kwargs)

    return fbs_mapped, mapping_files
Example #4
def getFlowBySector(methodname):
    """
    Retrieves stored data in the FlowBySector format
    :param methodname: string, Name of an available method for the given class
    :return: dataframe in flow by sector format
    """
    fbs = pd.DataFrame()
    # first try reading parquet from your local repo
    try:
        log.info('Loading ' + methodname + ' parquet from local repository')
        fbs = pd.read_parquet(fbsoutputpath + methodname + ".parquet")
    except (OSError, FileNotFoundError):
        # if parquet does not exist in local repo, read file from Data Commons
        try:
            log.info(
                methodname +
                ' parquet not found in local repo, loading from Data Commons')
            fbs = pd.read_parquet(
                'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowBySector/'
                + methodname + ".parquet")
        except FileNotFoundError:
            log.error("No parquet file found for datasource " + methodname +
                      " in flowsa or Data Commons")

    return fbs
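
An illustrative call, assuming flowsa is installed and a parquet for the named method exists locally or on Data Commons (the method name is only an example):

import flowsa

# load a FlowBySector table by method name; the method name is illustrative
fbs = flowsa.getFlowBySector('Water_national_2015_m1')
print(len(fbs))
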
Example #5
def getFlowByActivity(flowclass,
                      years,
                      datasource,
                      geographic_level='all',
                      file_location='local'):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, 'Class' values of the flow. Required. E.g. ['Water'] or
        ['Land', 'Other']
    :param years: list, years of data, e.g. [2015] or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource
    :param geographic_level: 'all', 'national', 'state', 'county'. Default is 'all'
    :param file_location: 'local' or 'remote'. Default is 'local'
    :return: a pandas DataFrame in FlowByActivity format
    """
    fbas = pd.DataFrame()
    for y in years:
        # definitions
        fba_file = datasource + "_" + str(y) + ".parquet"
        local_file_path = fbaoutputpath + fba_file
        remote_file_path = 'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowByActivity/' + fba_file
        # load data
        if file_location == 'local':
            fba = load_file(fba_file, local_file_path, remote_file_path)
        else:
            log.info('Loading ' + datasource + ' from remote server')
            fba = pd.read_parquet(remote_file_path)
        fba = fba[fba['Class'].isin(flowclass)]
        # if geographic level specified, only load rows in geo level
        if geographic_level != 'all':
            fba = filter_by_geoscale(fba, geographic_level)
        # concat dfs
        fbas = pd.concat([fbas, fba], sort=False)
    return fbas
Example #6
def extract_facility_data(inventory_dict):
    """
    Load facility data from the stewi output directory, keeping only facility IDs
    and geographic information, then assign county FIPS codes
    :param inventory_dict: dictionary of stewi inventory names (keys) and years (values)
    :return: df of facilities with FacilityID, State, County, NAICS, and FIPS assigned
    """
    import stewi
    facility_mapping = pd.DataFrame()
    # load facility data from stewi output directory, keeping only
    # the facility IDs and geographic information
    inventory_list = list(inventory_dict.keys())

    for i in range(len(inventory_dict)):
        # define inventory name as inventory type + inventory year (e.g., NEI_2017)
        database = inventory_list[i]
        year = list(inventory_dict.values())[i]
        inventory_name = database + '_' + year
        facilities = stewi.getInventoryFacilities(database, year)
        facilities = facilities[['FacilityID', 'State', 'County', 'NAICS']]
        if len(facilities[facilities.duplicated(subset='FacilityID',
                                                keep=False)]) > 0:
            log.info('Duplicate facilities in ' + inventory_name +
                     ' - keeping first listed')
            facilities.drop_duplicates(subset='FacilityID',
                                       keep='first',
                                       inplace=True)
        facility_mapping = facility_mapping.append(facilities)

    # Apply FIPS to facility locations
    facility_mapping = apply_county_FIPS(facility_mapping)

    return facility_mapping
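
Based on the inventory naming comment above (e.g., NEI_2017), the expected input is a mapping of stewi inventory acronyms to year strings; an assumed example call:

# keys are stewi inventory acronyms, values are years as strings (assumed format)
inventory_dict = {'NEI': '2017', 'TRI': '2017'}
facility_df = extract_facility_data(inventory_dict)
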
Example #7
def check_if_activities_match_sectors(fba):
    """
    Checks if activities in flowbyactivity that appear to be like sectors are actually sectors
    :param fba: a flow by activity dataset
    :return: set of activities not matching the default sector list, or None if all activities match
    """
    # Get list of activities in a flowbyactivity file
    activities = []
    for f in fba_activity_fields:
        activities.extend(fba[f])
    #activities.remove("None")

    # Get list of module default sectors
    flowsa_sector_list = list(load_sector_crosswalk()[sector_source_name])
    activities_missing_sectors = set(activities) - set(flowsa_sector_list)

    if len(activities_missing_sectors) > 0:
        log.info(
            str(len(activities_missing_sectors)) +
            " activities not matching sectors in default " +
            sector_source_name + " list.")
        return activities_missing_sectors
    else:
        log.info("All activities match sectors in " + sector_source_name +
                 " list.")
        return None
Example #8
def return_activity_from_scale(df, provided_from_scale):
    """
    Determine the 'from scale' used for aggregation/df subsetting for each activity combo in a df
    :param df: flowbyactivity df
    :param activity_df: a df with the activityproducedby, activityconsumedby columns and
    a column 'exists' denoting if data is available at specified geoscale
    :param provided_from_scale: The scale to use specified in method yaml
    :return:
    """

    # determine the unique combinations of activityproduced/consumedby
    unique_activities = unique_activity_names(df)
    # filter by geoscale
    fips = create_geoscale_list(df, provided_from_scale)
    df_sub = df[df['Location'].isin(fips)]
    # determine unique activities after subsetting by geoscale
    unique_activities_sub = unique_activity_names(df_sub)

    # return df of the difference between unique_activities and unique_activities2
    df_missing = dataframe_difference(unique_activities, unique_activities_sub, which='left_only')
    # return df of the similarities between unique_activities and unique_activities2
    df_existing = dataframe_difference(unique_activities, unique_activities_sub, which='both')
    df_existing = df_existing.drop(columns='_merge')
    df_existing['activity_from_scale'] = provided_from_scale

    # loop through finer geoscales until data are found for each activity combo
    if provided_from_scale == 'national':
        geoscales = ['state', 'county']
    elif provided_from_scale == 'state':
        geoscales = ['county']
    elif provided_from_scale == 'county':
        # county is the finest scale, so there is no finer scale to fall back to
        geoscales = []
        log.info('No data - skipping')

    if len(df_missing) > 0:
        for i in geoscales:
            # filter by geoscale
            fips_i = create_geoscale_list(df, i)
            df_i = df[df['Location'].isin(fips_i)]

            # determine unique activities after subsetting by geoscale
            unique_activities_i = unique_activity_names(df_i)

            # return df of the difference between unique_activities subset and unique_activities for geoscale
            df_missing_i = dataframe_difference(unique_activities_sub, unique_activities_i, which='right_only')
            df_missing_i = df_missing_i.drop(columns='_merge')
            df_missing_i['activity_from_scale'] = i
            # return df of the similarities between unique_activities and unique_activities2
            df_existing_i = dataframe_difference(unique_activities_sub, unique_activities_i, which='both')

            # append unique activities and df with defined activity_from_scale
            unique_activities_sub = unique_activities_sub.append(df_missing_i[[fba_activity_fields[0],
                                                                               fba_activity_fields[1]]])
            df_existing = df_existing.append(df_missing_i)
            df_missing = dataframe_difference(df_missing[[fba_activity_fields[0],fba_activity_fields[1]]],
                                              df_existing_i[[fba_activity_fields[0],fba_activity_fields[1]]],
                                              which=None)

    return df_existing
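
The helper dataframe_difference is not shown in this example; a plausible minimal implementation consistent with how it is used above (an outer merge with an indicator column, optionally filtered to 'left_only', 'right_only', or 'both') might look like the sketch below. This is an assumption, not the actual flowsa helper.

import pandas as pd

def dataframe_difference(df1, df2, which=None):
    """Find rows that differ between two dataframes using a merge indicator.
    If which is None, return rows that are not shared by both dfs."""
    comparison_df = df1.merge(df2, indicator=True, how='outer')
    if which is None:
        diff_df = comparison_df[comparison_df['_merge'] != 'both']
    else:
        diff_df = comparison_df[comparison_df['_merge'] == which]
    return diff_df
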
Example #9
def load_map_clean_fba(method, attr, fba_sourcename, df_year, flowclass,
                       geoscale_from, geoscale_to, **kwargs):
    """
    Load, clean, and map a FlowByActivity df
    :param method: dictionary, FBS method yaml
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fba_sourcename: str, source name
    :param df_year: str, year
    :param flowclass: str, flowclass to subset df with
    :param geoscale_from: str, geoscale to use
    :param geoscale_to: str, geoscale to aggregate to
    :param kwargs: dictionary, can include parameters: 'flowname_subset',
                   'compartment_subset', 'clean_fba', 'clean_fba_w_sec'
    :return: df, fba format
    """

    log.info("Loading allocation flowbyactivity %s for year %s", fba_sourcename, str(df_year))
    fba = load_fba_w_standardized_units(datasource=fba_sourcename,
                                        year=df_year,
                                        flowclass=flowclass)

    # check if allocation data exists at specified geoscale to use
    log.info("Checking if allocation data exists at the %s level", geoscale_from)
    check_if_data_exists_at_geoscale(fba, geoscale_from)

    # aggregate geographically to the scale of the flowbyactivty source, if necessary
    fba = subset_df_by_geoscale(fba, geoscale_from, geoscale_to)

    # subset based on yaml settings
    if 'flowname_subset' in kwargs:
        if kwargs['flowname_subset'] != 'None':
            fba = fba.loc[fba['FlowName'].isin(kwargs['flowname_subset'])]

    if 'compartment_subset' in kwargs:
        if kwargs['compartment_subset'] != 'None':
            fba = fba.loc[fba['Compartment'].isin(kwargs['compartment_subset'])]
    # cleanup the fba allocation df, if necessary
    if 'clean_fba' in kwargs:
        log.info("Cleaning %s", fba_sourcename)
        fba = dynamically_import_fxn(fba_sourcename, kwargs["clean_fba"])(fba, attr=attr)
    # reset index
    fba = fba.reset_index(drop=True)

    # assign sector to allocation dataset
    log.info("Adding sectors to %s", fba_sourcename)
    fba_wsec = add_sectors_to_flowbyactivity(fba, sectorsourcename=method['target_sector_source'])

    # call on fxn to further clean up/disaggregate the fba allocation data, if exists
    if 'clean_fba_w_sec' in kwargs:
        log.info("Further disaggregating sectors in %s", fba_sourcename)
        fba_wsec = dynamically_import_fxn(fba_sourcename,
                                          kwargs['clean_fba_w_sec'])(fba_wsec, attr=attr,
                                                                     method=method,
                                                                     sourcename=fba_sourcename)

    return fba_wsec
Example #10
def writeFlowBySectorBibliography(methodname):
    """
    Generate bibliography for FlowBySectorMethod in local directory
    :param methodname: string, FBS methodname for which to create .bib file
    :return: .bib file saved to local directory
    """
    # Generate a single .bib file for a list of Flow-By-Sector method names
    # and save file to local directory
    log.info('Write bibliography to %s%s.bib', biboutputpath, methodname)
    generate_fbs_bibliography(methodname)
Example #11
def check_allocation_ratios(flow_alloc_df, activity_set, source_name,
                            method_name):
    """
    Check for issues with the flow allocation ratios
    :param df:
    :return:
    """

    # create column of sector lengths
    flow_alloc_df.loc[:, 'slength'] = flow_alloc_df['Sector'].apply(
        lambda x: len(x))
    # subset df
    flow_alloc_df2 = flow_alloc_df[[
        'FBA_Activity', 'Location', 'slength', 'FlowAmountRatio'
    ]]
    # sum the flow amount ratios by location and sector length
    flow_alloc_df3 = flow_alloc_df2.groupby(
        ['FBA_Activity', 'Location', 'slength'],
        as_index=False)[["FlowAmountRatio"]].agg("sum")
    # not interested in sector length > 6
    flow_alloc_df4 = flow_alloc_df3[flow_alloc_df3['slength'] <= 6]

    ua_count1 = len(flow_alloc_df4[flow_alloc_df4['FlowAmountRatio'] < 1])
    log.info(
        'There are ' + str(ua_count1) +
        ' instances at a sector length of 6 or less where the allocation ratio for a location and sector length is < 1'
    )
    ua_count2 = len(flow_alloc_df4[flow_alloc_df4['FlowAmountRatio'] < 0.99])
    log.info(
        'There are ' + str(ua_count2) +
        ' instances at a sector length of 6 or less where the allocation ratio for a location and sector length is < 0.99'
    )
    ua_count3 = len(flow_alloc_df4[flow_alloc_df4['FlowAmountRatio'] > 1])
    log.info(
        'There are ' + str(ua_count3) +
        ' instances at a sector length of 6 or less where the allocation ratio for a location and sector length is > 1'
    )
    ua_count4 = len(flow_alloc_df4[flow_alloc_df4['FlowAmountRatio'] > 1.01])
    log.info(
        'There are ' + str(ua_count4) +
        ' instances at a sector length of 6 or less where the allocation ratio for a location and sector length is > 1.01'
    )

    # save csv to output folder
    log.info(
        'Save the summary table of flow allocation ratios for each sector length for '
        + activity_set + ' in output folder')
    # output data for all sector lengths
    flow_alloc_df3.to_csv(outputpath + "FlowBySectorMethodAnalysis/" +
                          method_name + '_' + source_name +
                          "_allocation_ratios_" + activity_set + ".csv",
                          index=False)

    return None
Example #12
def check_if_location_systems_match(df1, df2):
    """
    Check if two dataframes share the same location system
    :param df1: fba or fbs df
    :param df2: fba or fbs df
    :return:
    """

    if df1["LocationSystem"].all() == df2["LocationSystem"].all():
        log.info("LocationSystems match")
    else:
        log.warning("LocationSystems do not match, might lose county level data")
Example #13
def reassign_airplane_emissions(df, year, NAICS_level_value):
    """
    Reassigns emissions from airplanes to NAICS associated with air
    transportation instead of the NAICS assigned to airports
    :param df: a dataframe of emissions and mapped facilities from stewicombo
    :param year: year as str
    :param NAICS_level_value: desired NAICS aggregation level, using sector_level_key,
                should match target_sector_level
    :return: df
    """
    import stewi
    from stewicombo.overlaphandler import remove_default_flow_overlaps
    from stewicombo.globals import addChemicalMatches

    ## subtract emissions for air transportation from airports in NEI
    airport_NAICS = '4881'
    air_transportation_SCC = '2275020000'
    air_transportation_naics = '481111'
    log.info('Reassigning emissions from air transportation from airports')

    # obtain and prepare SCC dataset
    df_airplanes = stewi.getInventory('NEI', year,
                                      stewiformat='flowbyprocess')
    df_airplanes = df_airplanes[df_airplanes['Process'] == air_transportation_SCC]
    df_airplanes['Source'] = 'NEI'
    df_airplanes = addChemicalMatches(df_airplanes)
    df_airplanes = remove_default_flow_overlaps(df_airplanes, SCC=True)
    df_airplanes.drop(columns=['Process'], inplace=True)

    facility_mapping_air = df[['FacilityID', 'NAICS']].copy()
    facility_mapping_air.drop_duplicates(keep='first', inplace=True)
    df_airplanes = df_airplanes.merge(facility_mapping_air, how='left',
                                      on='FacilityID')

    df_airplanes['Year'] = year
    df_airplanes = df_airplanes[
        df_airplanes['NAICS'].str[0:len(airport_NAICS)] == airport_NAICS]

    # subtract airplane emissions from airport NAICS at individual facilities
    df_planeemissions = df_airplanes[['FacilityID', 'FlowName', 'FlowAmount']]
    df_planeemissions.rename(columns={'FlowAmount': 'PlaneEmissions'}, inplace=True)
    df = df.merge(df_planeemissions, how='left',
                  on=['FacilityID', 'FlowName'])
    df[['PlaneEmissions']] = df[['PlaneEmissions']].fillna(value=0)
    df['FlowAmount'] = df['FlowAmount'] - df['PlaneEmissions']
    df.drop(columns=['PlaneEmissions'], inplace=True)

    # add airplane emissions under air transport NAICS
    df_airplanes.loc[:, 'NAICS_lvl'] = air_transportation_naics[0:NAICS_level_value]
    df = pd.concat([df, df_airplanes], ignore_index=True)

    return df
Example #14
def harmonize_FBS_columns(df):
    """
    For FBS use in USEEIOR, harmonize the values in the columns
    - LocationSystem: drop the year, so just 'FIPS'
    - MeasureofSpread: tmp set to NoneType as values currently misleading
    - Spread: tmp set to 0 as values currently misleading
    - DistributionType: tmp set to NoneType as values currently misleading
    - MetaSources: Combine strings for rows where class/context/flowtype/flowable/etc. are equal
    :param df: FBS dataframe with mixed values/strings in columns
    :return: FBS df with harmonized values/strings in columns
    """

    # harmonize LocationSystem column
    log.info('Drop year in LocationSystem')
    if df['LocationSystem'].str.contains('FIPS').all():
        df = df.assign(LocationSystem='FIPS')
    # harmonize MeasureofSpread
    log.info('Reset MeasureofSpread to NoneType')
    df = df.assign(MeasureofSpread=None)
    # reset spread, as current values are misleading
    log.info('Reset Spread to 0')
    df = df.assign(Spread=0)
    # harmonize Distributiontype
    log.info('Reset DistributionType to NoneType')
    df = df.assign(DistributionType=None)

    # harmonize metasources
    log.info('Harmonize MetaSources')
    df = replace_NoneType_with_empty_cells(df)

    # subset all string cols of the df and drop duplicates
    string_cols = ['Flowable', 'Class', 'SectorProducedBy', 'SectorConsumedBy',
                   'SectorSourceName', 'Context', 'Location', 'LocationSystem',
                   'Unit', 'FlowType', 'Year', 'MeasureofSpread', 'MetaSources']
    df_sub = df[string_cols].drop_duplicates().reset_index(drop=True)
    # sort df
    df_sub = df_sub.sort_values(['MetaSources', 'SectorProducedBy',
                                 'SectorConsumedBy']).reset_index(drop=True)

    # new group cols: all string cols except MetaSources
    group_no_meta = [e for e in string_cols if e != 'MetaSources']

    # combine/sum columns that share the same data other than Metasources,
    # combining MetaSources string in process
    df_sub = df_sub.groupby(group_no_meta)['MetaSources'].apply(', '.join).reset_index()
    # drop the MetaSources col in original df and replace with the MetaSources col in df_sub
    df = df.drop(columns='MetaSources')
    harmonized_df = df.merge(df_sub, how='left')
    harmonized_df = replace_strings_with_NoneType(harmonized_df)

    return harmonized_df
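
The MetaSources consolidation step can be seen on a toy frame: rows that are identical in every other column have their MetaSources strings joined into one. A minimal illustration, not flowsa data:

import pandas as pd

df_sub = pd.DataFrame({'Flowable': ['Water', 'Water'],
                       'Location': ['00000', '00000'],
                       'MetaSources': ['SourceA', 'SourceB']})
group_no_meta = ['Flowable', 'Location']
# the two rows collapse to one row with MetaSources 'SourceA, SourceB'
combined = df_sub.groupby(group_no_meta)['MetaSources'].apply(', '.join).reset_index()
print(combined)
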
Example #15
def check_allocation_ratios(flow_alloc_df_load, activity_set, config):
    """
    Check for issues with the flow allocation ratios
    :param flow_alloc_df_load: df, includes 'FlowAmountRatio' column
    :param activity_set: str, activity set
    :param config: dictionary, method yaml
    :return: print out information regarding allocation ratios,
             save csv of results to local directory
    """

    # create column of sector lengths
    flow_alloc_df =\
        flow_alloc_df_load.assign(SectorLength=flow_alloc_df_load['Sector'].str.len())
    # subset df
    flow_alloc_df2 = flow_alloc_df[[
        'FBA_Activity', 'Location', 'SectorLength', 'FlowAmountRatio'
    ]]
    # sum the flow amount ratios by location and sector length
    flow_alloc_df3 = \
        flow_alloc_df2.groupby(['FBA_Activity', 'Location', 'SectorLength'],
                               dropna=False, as_index=False).agg({"FlowAmountRatio": sum})
    # keep only rows of specified sector length
    flow_alloc_df4 = flow_alloc_df3[
        flow_alloc_df3['SectorLength'] == sector_level_key[
            config['target_sector_level']]].reset_index(drop=True)
    # keep rows where the flowamountratio differs from 1 by more than the tolerance
    tolerance = 0.01
    flow_alloc_df5 = flow_alloc_df4[
        (flow_alloc_df4['FlowAmountRatio'] < 1 - tolerance) |
        (flow_alloc_df4['FlowAmountRatio'] > 1 + tolerance)]

    if len(flow_alloc_df5) > 0:
        vLog.info(
            'There are %s instances at a sector length of %s '
            'where the allocation ratio for a location is greater '
            'than or less than 1 by at least %s. See Validation Log',
            len(flow_alloc_df5), config["target_sector_level"], str(tolerance))

    # add to validation log
    log.info(
        'Save the summary table of flow allocation ratios for each sector length for '
        '%s in validation log', activity_set)
    # if df not empty, print, if empty, print string
    if flow_alloc_df5.empty:
        vLogDetailed.info('Flow allocation ratios for %s all round to 1',
                          activity_set)

    else:
        vLogDetailed.info(
            'Flow allocation ratios for %s: '
            '\n {}'.format(flow_alloc_df5.to_string()), activity_set)
Example #16
def function_allocation_method(flow_subset_mapped, k, names, attr, fbs_list):
    """
    Allocate df activities to sectors using a function identified in the FBS method yaml
    :param flow_subset_mapped: df, FBA with flows converted using fedelemflowlist
    :param k: str, source name
    :param names: list, activity names in activity set
    :param attr: dictionary, attribute data from method yaml for activity set
    :param fbs_list: list, fbs dfs created running flowbysector.py
    :return: df, FBS, with allocated activity columns to sectors
    """
    log.info('Calling on function specified in method yaml to allocate '
             '%s to sectors', ', '.join(map(str, names)))
    fbs = dynamically_import_fxn(k, attr['allocation_source'])(flow_subset_mapped, attr, fbs_list)
    return fbs
Example #17
def compare_df_units(df1_load, df2_load):
    """
    Determine what units are in each df prior to merge
    :param df1_load:
    :param df2_load:
    :return:
    """
    df1 = df1_load['Unit'].drop_duplicates().tolist()
    df2 = df2_load['Unit'].drop_duplicates().tolist()

    # identify differences between unit lists
    list_comp = list(set(df1) ^ set(df2))
    # if list is not empty, print warning that units are different
    if list_comp:
        log.info('Merging df with %s and df with %s units', df1, df2)
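
The unit comparison relies on a symmetric set difference; a quick standalone illustration with assumed unit strings:

df1_units = ['kg', 'MJ']
df2_units = ['kg', 'gal']
# units present in one list but not the other
print(list(set(df1_units) ^ set(df2_units)))  # e.g. ['MJ', 'gal'] (order may vary)
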
Example #18
def obtain_NAICS_from_facility_matcher(inventory_list):
    """
    Access NAICS codes from facilitymatcher and assign them based on FRS_ID
    :param inventory_list: list, inventory names
    :return: df of primary NAICS codes by FRS_ID and Source
    """
    import facilitymatcher
    # access NAICS from facility matcher and assign based on FRS_ID
    all_NAICS = facilitymatcher.get_FRS_NAICSInfo_for_facility_list(
        frs_id_list=None, inventories_of_interest_list=inventory_list)
    all_NAICS = all_NAICS.loc[all_NAICS['PRIMARY_INDICATOR'] == 'PRIMARY']
    all_NAICS.drop(columns=['PRIMARY_INDICATOR'], inplace=True)
    all_NAICS = naics_expansion(all_NAICS)
    if len(all_NAICS[all_NAICS.duplicated(subset=['FRS_ID', 'Source'],
                                          keep=False)]) > 0:
        log.info('Duplicate primary NAICS reported - keeping first')
        all_NAICS.drop_duplicates(subset=['FRS_ID', 'Source'],
                                  keep='first',
                                  inplace=True)
    return all_NAICS
Example #19
def acup_call(*, resp, **_):
    """
    Convert response for calling url to pandas dataframe, begin parsing df
    into FBA format
    :param resp: response object from url call
    :return: pandas dataframe of original source data
    """
    response_json = json.loads(resp.text)
    # not all states have data, so return empty df if does not exist
    try:
        df = pd.DataFrame(data=response_json["data"])
    except KeyError:
        log.info('No data exists for state')
        df = pd.DataFrame()

    return df
Example #20
def main(**kwargs):
    """
    Generate FBA parquet(s)
    :param kwargs: 'source' and 'year'
    :return: parquet saved to local directory
    """
    # assign arguments
    if len(kwargs) == 0:
        kwargs = parse_args()

    # assign yaml parameters (common.py fxn)
    config = load_sourceconfig(kwargs['source'])

    log.info("Creating dataframe list")
    # a range of years may be defined, to support splitting output into multiple parquets
    if '-' in str(kwargs['year']):
        years = str(kwargs['year']).split('-')
        min_year = int(years[0])
        max_year = int(years[1]) + 1
        year_iter = list(range(min_year, max_year))
    else:
        # Else only a single year defined, create an array of one:
        year_iter = [kwargs['year']]

    for p_year in year_iter:
        kwargs['year'] = str(p_year)
        # build the base url with strings that will be replaced
        build_url = build_url_for_query(config, kwargs)
        # replace parts of urls with specific instructions from source.py
        urls = assemble_urls_for_query(build_url, config, kwargs)
        # create a list with data from all source urls
        dataframe_list = call_urls(urls, kwargs, config)
        # concat the dataframes and parse data with specific instructions from source.py
        log.info("Concat dataframe list and parse data")
        df = parse_data(dataframe_list, kwargs, config)
        if isinstance(df, list):
            for frame in df:
                if not len(frame.index) == 0:
                    try:
                        source_names = frame['SourceName']
                        source_name = source_names.iloc[0]
                    except KeyError:
                        source_name = kwargs['source']
                    process_data_frame(frame, source_name, kwargs['year'],
                                       config)
        else:
            process_data_frame(df, kwargs['source'], kwargs['year'], config)
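
The year handling above accepts either a single year or a hyphenated range; a standalone sketch of that parsing logic (the helper name is hypothetical):

def expand_year_arg(year_arg):
    """Return a list of years from '2015' or '2015-2017' style input (sketch)."""
    if '-' in str(year_arg):
        min_year, max_year = (int(y) for y in str(year_arg).split('-'))
        return list(range(min_year, max_year + 1))
    return [year_arg]

# expand_year_arg('2015-2017') -> [2015, 2016, 2017]
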
Example #21
def replace_naics_w_naics_2012(df, sectorsourcename):
    """
    Check if activity-like sectors are in fact sectors. Also works for the Sector column
    :return:
    """
    # test
    # df = mapping.copy()
    # drop NoneType
    df = replace_NoneType_with_empty_cells(df)

    # load the master crosswalk and subset by sectorsourcename, save values to list
    cw_load = load_sector_crosswalk()
    cw = cw_load[sectorsourcename].drop_duplicates().tolist()

    # load melted crosswalk
    cw_melt = melt_naics_crosswalk()
    # drop the count column
    cw_melt = cw_melt.drop(columns='naics_count')

    # determine which headers are in the df
    possible_column_headers = [
        'Sector', 'SectorProducedBy', 'SectorConsumedBy'
    ]
    # list of column headers that do exist in the df being aggregated
    column_headers = [
        e for e in possible_column_headers if e in df.columns.values.tolist()
    ]

    # check if there are any sectors that are not in the naics 2012 crosswalk
    non_naics2012 = check_if_sectors_are_naics(df, cw, column_headers)

    # loop through the df headers and determine if value is not in crosswalk list
    if len(non_naics2012) != 0:
        log.info(
            'Checking if sectors represent a different NAICS year, if so, replace with NAICS 2012'
        )
        for c in column_headers:
            # merge df with the melted sector crosswalk
            df = df.merge(cw_melt, left_on=c, right_on='NAICS', how='left')
            # if there is a value in the 'NAICS_2012_Code' column, use that value to replace sector in column c
            df.loc[df[c] == df['NAICS'], c] = df['NAICS_2012_Code']
            # multiply the FlowAmount col by allocation_ratio
            df.loc[df[c] == df['NAICS_2012_Code'],
                   'FlowAmount'] = df['FlowAmount'] * df['allocation_ratio']
            # drop columns
            df = df.drop(
                columns=['NAICS_2012_Code', 'NAICS', 'allocation_ratio'])
        log.info('Replaced NAICS with NAICS 2012 Codes')

        # check if there are any sectors that are not in the naics 2012 crosswalk
        log.info('Check again for non NAICS 2012 Codes')
        check_if_sectors_are_naics(df, cw, column_headers)

    else:
        log.info('No sectors require substitution')

    return df
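
An illustrative call, assuming a df whose sector columns may contain codes from other NAICS vintages and that the master crosswalk includes the 'NAICS_2012_Code' column referenced above:

# replace non-2012 sector codes with their NAICS 2012 equivalents, splitting
# FlowAmount by allocation_ratio where one old code maps to several 2012 codes
df = replace_naics_w_naics_2012(df, sectorsourcename='NAICS_2012_Code')
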
Example #22
def getFlowByActivity(flowclass, years, datasource, geographic_level='all'):
    """
    Retrieves stored data in the FlowByActivity format
    :param flowclass: list, 'Class' values of the flow. Required. E.g. ['Water'] or
        ['Land', 'Other']
    :param years: list, years of data, e.g. [2015] or [2010, 2011, 2012]
    :param datasource: str, the code of the datasource
    :param geographic_level: default 'all', which loads all geographic scales in the
        FlowByActivity; can specify 'national', 'state', or 'county'
    :return: a pandas DataFrame in FlowByActivity format
    """

    fbas = pd.DataFrame()
    for y in years:
        # first try reading parquet from your local repo
        try:
            log.info('Loading ' + datasource + ' ' + str(y) +
                     ' parquet from local repository')
            fba = pd.read_parquet(fbaoutputpath + datasource + "_" + str(y) +
                                  ".parquet")
            fba = fba[fba['Class'].isin(flowclass)]
            fbas = pd.concat([fbas, fba], sort=False)
        except (OSError, FileNotFoundError):
            # if parquet does not exist in local repo, read file from Data Commons
            try:
                log.info(
                    datasource +
                    ' parquet not found in local repo, loading from Data Commons'
                )
                fba = pd.read_parquet(
                    'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowByActivity/'
                    + datasource + "_" + str(y) + '.parquet')
                fba = fba[fba['Class'].isin(flowclass)]
                fbas = pd.concat([fbas, fba], sort=False)
            except FileNotFoundError:
                log.error("No parquet file found for datasource " +
                          datasource + "and year " + str(y) +
                          " in flowsa or Data Commons")

    # if geographic level specified, only load rows in geo level
    if geographic_level != 'all':
        fbas = filter_by_geoscale(fbas, geographic_level)

    return fbas
Example #23
def generate_list_of_sources_in_fbs_method(methodname):
    """
    Determine what FlowByActivities are used to generate a FlowBySector
    :param methodname: string, FlowBySector method
    :return: list, pairs of FlowByActivity source names and years
    """
    sources = []
    # load the fbs method yaml
    fbs_yaml = load_method(methodname)

    # create list of data and allocation data sets
    fbs = fbs_yaml['source_names']
    for fbs_k, fbs_v in fbs.items():
        try:
            sources.append([fbs_k, fbs_v['year']])
        except KeyError:
            log.info('Could not append %s to datasource list', fbs_k)
            continue
        activities = fbs_v['activity_sets']
        for aset, attr in activities.items():
            if attr['allocation_source'] != 'None':
                sources.append([
                    attr['allocation_source'], attr['allocation_source_year']
                ])
            if 'helper_source' in attr:
                sources.append(
                    [attr['helper_source'], attr['helper_source_year']])
            if 'literature_sources' in attr:
                for source, date in attr['literature_sources'].items():
                    sources.append([source, date])
    # load any additional fbas that are called in a fbs method within fxns
    try:
        fbas = load_fbs_methods_additional_fbas_config()[methodname]
        for s, acts_info in fbas.items():
            for acts, fxn_info in acts_info.items():
                for fxn, fba_info in fxn_info.items():
                    for fba, y in fba_info.items():
                        fxn_config = \
                            load_functions_loading_fbas_config()[fxn][fba]
                        sources.append([fxn_config['source'], y])
    except KeyError:
        pass

    return sources
Example #24
def check_if_data_exists_for_same_geoscales(
        fba_wsec_walloc, source, activity):  # fba_w_aggregated_sectors
    """
    Determine if data exists at the same scales for datasource and allocation source
    :param source_fba:
    :param allocation_fba:
    :return:
    """
    # todo: modify so only returns warning if no value for entire location, not just no value for one of the possible sectors

    from flowsa.mapping import get_activitytosector_mapping

    # create list of highest sector level for which there should be data
    mapping = get_activitytosector_mapping(source)
    # filter by activity of interest
    mapping = mapping.loc[mapping['Activity'].isin(activity)]
    # add sectors to list
    sectors_list = pd.unique(mapping['Sector']).tolist()

    # subset fba w sectors and with merged allocation table so only have rows with aggregated sector list
    df_subset = fba_wsec_walloc.loc[
        (fba_wsec_walloc[fbs_activity_fields[0]].isin(sectors_list)) |
        (fba_wsec_walloc[fbs_activity_fields[1]].isin(sectors_list)
         )].reset_index(drop=True)
    # only interested in total flows
    # df_subset = df_subset.loc[df_subset['FlowName'] == 'total'].reset_index(drop=True)
    # df_subset = df_subset.loc[df_subset['Compartment'] == 'total'].reset_index(drop=True)

    # create subset of fba where the allocation data is missing
    missing_alloc = df_subset.loc[
        df_subset['FlowAmountRatio'].isna()].reset_index(drop=True)
    # drop any rows where source flow value = 0
    missing_alloc = missing_alloc.loc[
        missing_alloc['FlowAmount'] != 0].reset_index(drop=True)
    # create list of locations with missing allocation data
    states_missing_data = pd.unique(missing_alloc['Location']).tolist()

    if len(missing_alloc) == 0:
        log.info("All aggregated sector flows have allocation flow ratio data")
    else:
        log.warning("Missing allocation flow ratio data for " +
                    ', '.join(states_missing_data))

    return None
Example #25
def load_file(datafile, local_file, remote_file):
    """
    Loads a preprocessed file
    :param datafile: a data file name with any preceeding relative file
    :param paths: instance of class Paths
    :return: a pandas dataframe of the datafile
    """
    if os.path.exists(local_file):
        log.info('Loading ' + datafile + ' from local repository')
        df = pd.read_parquet(local_file)
    else:
        try:
            log.info(
                datafile +
                ' not found in local folder; loading from remote server...')
            df = pd.read_parquet(remote_file)
        except FileNotFoundError:
            log.error("No file found for " + datafile)
    return df
Example #26
def getFlowByActivity(datasource,
                      year,
                      flowclass=None,
                      geographic_level=None,
                      download_if_missing=DEFAULT_DOWNLOAD_IF_MISSING):
    """
    Retrieves stored data in the FlowByActivity format
    :param datasource: str, the code of the datasource.
    :param year: int, a year, e.g. 2012
    :param flowclass: str, a 'Class' of the flow. Optional. E.g. 'Water'
    :param geographic_level: str, a geographic level of the data.
                             Optional. E.g. 'national', 'state', 'county'.
    :param download_if_missing: bool, if True will attempt to load from remote server
        prior to generating if file not found locally
    :return: a pandas DataFrame in FlowByActivity format
    """
    from esupy.processed_data_mgmt import download_from_remote
    # Set fba metadata
    name = flowsa.flowbyactivity.set_fba_name(datasource, year)
    fba_meta = set_fb_meta(name, "FlowByActivity")

    # Try to load a local version of fba; generate and load if missing
    fba = load_preprocessed_output(fba_meta, paths)
    # Remote download
    if fba is None and download_if_missing:
        log.info('%s %s not found in %s, downloading from remote source',
                 datasource, str(year), fbaoutputpath)
        download_from_remote(fba_meta, paths)
        fba = load_preprocessed_output(fba_meta, paths)

    if fba is None:
        log.info('%s %s not found in %s, running functions to generate FBA',
                 datasource, str(year), fbaoutputpath)
        # Generate the fba
        flowsa.flowbyactivity.main(year=year, source=datasource)
        # Now load the fba
        fba = load_preprocessed_output(fba_meta, paths)
        if fba is None:
            log.error('getFlowByActivity failed, FBA not found')
        else:
            log.info('Loaded %s %s from %s', datasource, str(year),
                     fbaoutputpath)
    else:
        log.info('Loaded %s %s from %s', datasource, str(year), fbaoutputpath)

    # Address optional parameters
    if flowclass is not None:
        fba = fba[fba['Class'] == flowclass]
    # if geographic level specified, only load rows in geo level
    if geographic_level is not None:
        fba = filter_by_geoscale(fba, geographic_level)
    return fba
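
Unlike the earlier list-based signatures, this version takes a single year and an optional flowclass string; an illustrative call (the datasource code is only an example):

import flowsa

# single-year call with an optional class filter and remote download fallback
fba = flowsa.getFlowByActivity(datasource='USDA_CoA_Cropland', year=2017,
                               flowclass='Land', download_if_missing=True)
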
Example #27
def getFlowBySector(methodname, file_location='local'):
    """
    Retrieves stored data in the FlowBySector format
    :param methodname: string, Name of an available method for the given class
    :param file_location: 'local' or 'remote'. Default is 'local'
    :return: dataframe in flow by sector format
    """

    # define fbs file
    fbs_file = methodname + ".parquet"
    local_file_path = fbsoutputpath + fbs_file
    remote_file_path = 'https://edap-ord-data-commons.s3.amazonaws.com/flowsa/FlowBySector/' + fbs_file

    if file_location == 'local':
        fbs = load_file(fbs_file, local_file_path, remote_file_path)
    else:
        log.info('Loading ' + methodname + ' from remote server')
        fbs = pd.read_parquet(remote_file_path)

    return fbs
Example #28
def subset_df_by_geoscale(df, activity_from_scale, activity_to_scale):
    """
    Subset a df by geoscale or agg to create data specified in method yaml
    :param flow_subset:
    :param activity_from_scale:
    :param activity_to_scale:
    :return:
    """

    # determine 'activity_from_scale' for use in df geoscale subset, by activity
    modified_from_scale = return_activity_from_scale(df, activity_from_scale)
    # add 'activity_from_scale' column to df
    df2 = pd.merge(df, modified_from_scale)

    # list of unique 'from' geoscales
    unique_geoscales = modified_from_scale['activity_from_scale'].drop_duplicates().values.tolist()

    # to scale
    if fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale]:
        to_scale = activity_to_scale
    else:
        to_scale = activity_from_scale

    df_subset_list = []
    # subset df based on activity 'from' scale
    for i in unique_geoscales:
        df3 = df2[df2['activity_from_scale'] == i]
        # if desired geoscale doesn't exist, aggregate existing data
        # if df is less aggregated than allocation df, aggregate fba activity to allocation geoscale
        if fips_number_key[i] > fips_number_key[to_scale]:
            log.info("Aggregating subset from " + i + " to " + to_scale)
            df_sub = agg_by_geoscale(df3, i, to_scale, fba_default_grouping_fields)
        # else filter relevant rows
        else:
            log.info("Subsetting " + i + " data")
            df_sub = filter_by_geoscale(df3, i)
        df_subset_list.append(df_sub)
    df_subset = pd.concat(df_subset_list)

    return df_subset
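
The from/to comparison relies on fips_number_key ranking geoscales from least to most detailed; assuming a mapping like the one below, that ordering determines whether rows are aggregated up or simply filtered:

# assumed ranking; the real flowsa constant may use different values
fips_number_key = {'national': 0, 'state': 1, 'county': 2}

activity_from_scale, activity_to_scale = 'county', 'state'
# county (2) > state (1), so county-level rows would be aggregated up to state
print(fips_number_key[activity_from_scale] > fips_number_key[activity_to_scale])  # True
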
Example #29
def load_source_dataframe(k, v):
    """
    Load the source dataframe. Data can be a FlowbyActivity or FlowBySector parquet stored in flowsa, or a FlowBySector
    formatted dataframe from another package.
    :param k: The datasource name
    :param v: The datasource parameters
    :return:
    """
    if v['data_format'] == 'FBA':
        # if yaml specifies a geoscale to load, use parameter to filter dataframe
        if 'source_fba_load_scale' in v:
            geo_level = v['source_fba_load_scale']
        else:
            geo_level = 'all'
        log.info("Retrieving flowbyactivity for datasource " + k +
                 " in year " + str(v['year']))
        flows_df = flowsa.getFlowByActivity(flowclass=[v['class']],
                                            years=[v['year']],
                                            datasource=k,
                                            geographic_level=geo_level)
    elif v['data_format'] == 'FBS':
        log.info("Retrieving flowbysector for datasource " + k)
        flows_df = flowsa.getFlowBySector(k)
    elif v['data_format'] == 'FBS_outside_flowsa':
        log.info("Retrieving flowbysector for datasource " + k)
        flows_df = getattr(sys.modules[__name__],
                           v["FBS_datapull_fxn"])(*v['parameters'])
    else:
        log.error("Data format not specified in method file for datasource " +
                  k)

    return flows_df
Example #30
def process_data_frame(*, df, source, year, config):
    """
    Process the given dataframe, cleaning, converting data, and
    writing the final parquet. This method was written to move code into a
    shared method, which was necessary to support the processing of a list
    of dataframes instead of a single dataframe.
    :param df: df, FBA format
    :param source: str, source name
    :param year: str, year
    :param config: dict, items in method yaml
    :return: None, parquet file and metadata saved to local directory
    """
    # log that data was retrieved
    log.info("Retrieved data for %s %s", source, year)
    # add any missing columns of data and cast to appropriate data type
    log.info("Add any missing columns and check field datatypes")
    flow_df = clean_df(df, flow_by_activity_fields, fba_fill_na_dict,
                       drop_description=False)
    # sort df and reset index
    flow_df = flow_df.sort_values(['Class', 'Location', 'ActivityProducedBy',
                                   'ActivityConsumedBy', 'FlowName',
                                   'Compartment']).reset_index(drop=True)
    # save as parquet file
    name_data = set_fba_name(source, year)
    meta = set_fb_meta(name_data, "FlowByActivity")
    write_df_to_file(flow_df, paths, meta)
    write_metadata(source, config, meta, "FlowByActivity", year=year)
    log.info("FBA generated and saved for %s", name_data)
    # rename the log file saved to local directory
    rename_log_file(name_data, meta)