Example No. 1
def sector_disaggregation_generalized(fbs, group_cols):
    """
    function to disaggregate sectors if there is only one naics at a lower level
    works for lower than naics 4
    :param df: A FBS df
    :return: A FBS df with missing naics5 and naics6
    """

    # load naics 2 to naics 6 crosswalk
    cw_load = load_sector_length_crosswalk_w_nonnaics()

    # for loop min length to 6 digits
    length = min(fbs['Sector'].apply(lambda x: len(x)).unique())
    # appends missing naics levels to df
    for i in range(length, 6):

        sector_merge = 'NAICS_' + str(i)
        sector_add = 'NAICS_' + str(i+1)

        # subset the df by naics length
        cw = cw_load[[sector_merge, sector_add]]
        # only keep the rows where there is only one value in sector_add for a value in sector_merge
        cw = cw.drop_duplicates(subset=[sector_merge], keep=False).reset_index(drop=True)
        sector_list = cw[sector_merge].values.tolist()

        # subset df to sectors with length = i and length = i + 1
        df_subset = fbs[fbs['Sector'].apply(lambda x: i + 1 >= len(x) >= i)]
        # create new columns that are length i
        df_subset = df_subset.assign(Sector_tmp=df_subset['Sector'].apply(lambda x: x[0:i]))
        # subset the df to the rows where the tmp sector columns are in naics list
        df_subset = df_subset.loc[df_subset['Sector_tmp'].isin(sector_list)]
        # drop all rows with duplicate temp values, as a less aggregated naics exists
        group_cols = [e for e in group_cols if e != 'Sector']
        group_cols.append('Sector_tmp')
        df_subset2 = df_subset.drop_duplicates(subset=group_cols,
                                               keep=False).reset_index(drop=True)
        # merge the naics cw
        new_naics = pd.merge(df_subset2, cw[[sector_merge, sector_add]],
                             how='left', left_on=['Sector_tmp'], right_on=[sector_merge])
        # add column counting the number of child naics associated with a parent
        new_naics = new_naics.assign(sector_count=new_naics.groupby(['Location', 'Sector_tmp'])['Sector_tmp'].transform('count'))
        # only keep the rows where the count is 1
        new_naics2 = new_naics[new_naics['sector_count'] == 1]
        del new_naics2['sector_count']
        # warn if rows with more than one child naics are dropped - those will need a method of estimation
        missing_naics = new_naics[new_naics['sector_count'] > 1]
        if len(missing_naics) > 0:
            missing_naics = missing_naics[['Location', 'Sector']].values.tolist()
            log.warning('There is data at sector length ' + str(i) + ' that is lost at sector length ' + str(i+1) +
                        ' for ' + str(missing_naics))
        new_naics2 = new_naics2.rename(columns={sector_add: "ST"})
        new_naics2 = new_naics2.drop(columns=[sector_merge])
        # drop columns and rename new sector columns
        new_naics2 = new_naics2.drop(columns=["Sector", "Sector_tmp"])
        new_naics2 = new_naics2.rename(columns={"ST": "Sector"})
        # append new naics to df
        if len(new_naics2) > 1:
            fbs = pd.concat([fbs, new_naics2], sort=True)

    return fbs
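
The crosswalk filter is the key step above: drop_duplicates(subset=[sector_merge], keep=False) keeps only parent codes with exactly one child, so disaggregation is unambiguous. A minimal, self-contained sketch of that step with a toy crosswalk (codes are illustrative only, not the real NAICS crosswalk):

import pandas as pd

# toy NAICS_5 -> NAICS_6 crosswalk; '11111' has two children, the others have one
cw = pd.DataFrame({'NAICS_5': ['11111', '11111', '22222', '33333'],
                   'NAICS_6': ['111110', '111120', '222220', '333330']})

# keep=False drops every row whose parent appears more than once,
# leaving only parents that map to a single child
single_child = cw.drop_duplicates(subset=['NAICS_5'], keep=False).reset_index(drop=True)
print(single_child)
#   NAICS_5 NAICS_6
# 0   22222  222220
# 1   33333  333330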
Example No. 2
def return_fba_method_meta(sourcename, **kwargs):
    """
    Return meta for a FlowByActivity method
    :param sourcename: string, the FlowByActivity sourcename
    :param kwargs: requires "year" defined
    :return: meta object
    """
    from flowsa.bibliography import load_source_dict

    # load info from either a FBA method yaml or the literature yaml
    fba = load_source_dict(sourcename)
    # initiate empty dictionary
    fba_dict = {}

    # add year if creating an FBA metafile
    if 'year' in kwargs:
        fba_dict['data_year'] = kwargs['year']

    try:
        # loop through the FBA yaml and add info
        for k, v in fba.items():
            # include bib_id because this info is pulled when generating a method bib
            if k in ('author', 'source_name', 'source_url',
                     'original_data_download_date', 'date_accessed', 'bib_id'):
                fba_dict[k] = str(v)
    except AttributeError:
        # fba is None when no source dictionary was found
        log.warning('No metadata found for %s', sourcename)
        fba_dict['meta_data'] = f'No metadata found for {sourcename}'

    return fba_dict
Example No. 3
def map_elementary_flows(fba, from_fba_source, keep_unmapped_rows=False):
    """
    Applies mapping from fedelemflowlist to convert flows to fedelemflowlist flows
    :param fba: df flow-by-activity or flow-by-sector with 'Flowable', 'Context', and 'Unit' fields
    :param from_fba_source: str Source name of fba list to look for mappings
    :param keep_unmapped_rows: bool, False to drop unmapped rows, True to retain them
    :return: df with flows mapped to fedelemflowlist flows
    """

    from fedelemflowlist import get_flowmapping

    # rename columns to match FBS formatting
    fba = fba.rename(columns={
        "FlowName": 'Flowable',
        "Compartment": "Context"
    })

    flowmapping = get_flowmapping(from_fba_source)
    mapping_fields = [
        "SourceListName", "SourceFlowName", "SourceFlowContext", "SourceUnit",
        "ConversionFactor", "TargetFlowName", "TargetFlowContext", "TargetUnit"
    ]
    if flowmapping.empty:
        log.warning("No mapping file in fedelemflowlist found for " +
                    from_fba_source)
        # return the original df but with columns renamed so can continue working on the FBS
        fba_mapped_df = fba.copy()
    else:
        flowmapping = flowmapping[mapping_fields]

        # define merge type based on keeping or dropping unmapped data
        if keep_unmapped_rows is False:
            merge_type = 'inner'
        else:
            merge_type = 'left'

        # merge fba with flows
        fba_mapped_df = pd.merge(
            fba,
            flowmapping,
            left_on=["Flowable", "Context"],
            right_on=["SourceFlowName", "SourceFlowContext"],
            how=merge_type)
        fba_mapped_df.loc[fba_mapped_df["TargetFlowName"].notnull(),
                          "Flowable"] = fba_mapped_df["TargetFlowName"]
        fba_mapped_df.loc[fba_mapped_df["TargetFlowName"].notnull(),
                          "Context"] = fba_mapped_df["TargetFlowContext"]
        fba_mapped_df.loc[fba_mapped_df["TargetFlowName"].notnull(),
                          "Unit"] = fba_mapped_df["TargetUnit"]
        fba_mapped_df.loc[fba_mapped_df["TargetFlowName"].notnull(), "FlowAmount"] = \
            fba_mapped_df["FlowAmount"] * fba_mapped_df["ConversionFactor"]

        # drop the mapping columns
        fba_mapped_df = fba_mapped_df.drop(columns=mapping_fields)

    return fba_mapped_df
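
The mapped/unmapped handling above is plain pandas boolean indexing: rows that found a match in the flow mapping (non-null TargetFlowName) get their Flowable, Context, and Unit overwritten and their FlowAmount rescaled; unmapped rows are left alone (or dropped entirely when the merge is 'inner'). A self-contained sketch of that pattern on a toy merge result (all values are illustrative):

import pandas as pd

df = pd.DataFrame({'Flowable': ['water', 'steam'],
                   'Unit': ['Mgal', 'kg'],
                   'FlowAmount': [10.0, 5.0],
                   'TargetFlowName': ['Water, fresh', None],  # None = unmapped
                   'TargetUnit': ['kg', None],
                   'ConversionFactor': [3785411.8, None]})

mapped = df['TargetFlowName'].notnull()
df.loc[mapped, 'Flowable'] = df['TargetFlowName']
df.loc[mapped, 'Unit'] = df['TargetUnit']
df.loc[mapped, 'FlowAmount'] = df['FlowAmount'] * df['ConversionFactor']
# the unmapped 'steam' row is untouched; an inner merge would have dropped it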
Example No. 4
def check_for_nonetypes_in_sector_col(df):
    """
    Check for NoneType in columns where datatype = string
    :param df: df with columns where datatype = object
    :return: warning message if there are NoneTypes
    """
    # if datatypes are strings, return warning message
    if df['Sector'].isnull().any():
        log.warning("There are NoneType values in the 'Sector' column")
    return df
Example No. 5
def map_flows(fba,
              from_fba_source,
              flow_type='ELEMENTARY_FLOW',
              ignore_source_name=False,
              **kwargs):
    """
    Applies mapping via esupy from fedelemflowlist or material flow list to convert flows to
    standardized list of flows
    :param fba: df flow-by-activity or flow-by-sector
    :param from_fba_source: str Source name of fba list to look for mappings
    :param flow_type: str either 'ELEMENTARY_FLOW', 'TECHNOSPHERE_FLOW',
        or 'WASTE_FLOW'
    :param ignore_source_name: bool, passed to apply_flow_mapping
    :param kwargs: optional - keep_unmapped_rows: False if want unmapped rows dropped,
        True if want to retain and keep_fba_columns: boolean,
        True or False, indicate if want to maintain
        'FlowName' and 'Compartment' columns in returned df
    :return: df, with flows mapped using federal elementary flow list or material flow list
    """

    # prior to mapping elementary flows, ensure all data are in an annual format
    fba = convert_units_to_annual(fba)

    keep_unmapped_rows = False

    # if need to maintain FBA columns, create copies of columns
    if kwargs != {}:
        if kwargs.get('keep_fba_columns') is True:
            fba['Flowable'] = fba['FlowName']
            fba['Context'] = fba['Compartment']
        # if keep unmapped rows identified in kwargs, then use
        if 'keep_unmapped_rows' in kwargs:
            keep_unmapped_rows = kwargs['keep_unmapped_rows']

    # else, rename
    else:
        fba = fba.rename(columns={
            'FlowName': 'Flowable',
            'Compartment': 'Context'
        })

    mapped_df = apply_flow_mapping(fba,
                                   from_fba_source,
                                   flow_type=flow_type,
                                   keep_unmapped_rows=keep_unmapped_rows,
                                   ignore_source_name=ignore_source_name)

    if mapped_df is None or len(mapped_df) == 0:
        # return the original df but with columns renamed so can continue working on the FBS
        log.warning("Error in flow mapping")
        mapped_df = fba.copy()
        mapped_df['FlowUUID'] = None

    return mapped_df
Example No. 6
def check_if_location_systems_match(df1, df2):
    """
    Check if two dataframes share the same location system
    :param df1: fba or fbs df
    :param df2: fba or fbs df
    :return:
    """

    if df1["LocationSystem"].all() == df2["LocationSystem"].all():
        log.info("LocationSystems match")
    else:
        log.warning("LocationSystems do not match, might lose county level data")
Example No. 7
def map_elementary_flows(fba, from_fba_source):
    """
    Applies mapping from fedelemflowlist to convert flows to fedelemflowlist flows
    :param fba: df flow-by-activity or flow-by-sector with 'Flowable', 'Context', and 'Unit' fields
    :param from_fba_source: str Source name of fba list to look for mappings
    :return: df with flows mapped to fedelemflowlist flows
    """

    from fedelemflowlist import get_flowmapping

    # rename flow name to flowable - remove this once elementary flows are mapped
    fba = fba.rename(columns={"FlowName": 'Flowable',
                              "Compartment": "Context"})

    flowmapping = get_flowmapping(from_fba_source)
    mapping_fields = ["SourceListName",
                      "SourceFlowName",
                      "SourceFlowContext",
                      "SourceUnit",
                      "ConversionFactor",
                      "TargetFlowName",
                      "TargetFlowContext",
                      "TargetUnit"]
    if flowmapping.empty:
        log.warning("No mapping file in fedelemflowlist found for " + from_fba_source)
        # return the original df but with columns renamed so can continue working on the FBS
        fba_mapped_df = fba.copy()
    else:
        flowmapping = flowmapping[mapping_fields]

        # merge fba with flows
        fba_mapped_df = pd.merge(fba, flowmapping,
                                 left_on=["Flowable", "Context"],
                                 right_on=["SourceFlowName", "SourceFlowContext"],
                                 how="left")
        fba_mapped_df.loc[fba_mapped_df["TargetFlowName"].notnull(), "Flowable"] = fba_mapped_df["TargetFlowName"]
        fba_mapped_df.loc[fba_mapped_df["TargetFlowName"].notnull(), "Context"] = fba_mapped_df["TargetFlowContext"]
        fba_mapped_df.loc[fba_mapped_df["TargetFlowName"].notnull(), "Unit"] = fba_mapped_df["TargetUnit"]
        fba_mapped_df.loc[fba_mapped_df["TargetFlowName"].notnull(), "FlowAmount"] = \
            fba_mapped_df["FlowAmount"] * fba_mapped_df["ConversionFactor"]

        # drop the mapping columns
        fba_mapped_df = fba_mapped_df.drop(columns=mapping_fields)

    return fba_mapped_df
Example No. 8
def getMetadata(source, year):
    """
    Use the esupy package functions to return the metadata for
    a FBA used to generate a FBS
    :param source: string, FBA source name
    :param year: string, year of FBA data
    :return: meta object, previously generated FBA meta
    """
    from flowsa.flowbyactivity import set_fba_name

    name = set_fba_name(source, year)
    meta = read_source_metadata(paths, set_fb_meta(name, 'FlowByActivity'))
    if meta is None:
        log.warning('No metadata found for %s', source)
        meta = {'source_meta': f'No metadata found for {source} {year}'}

    return meta
Example No. 9
def check_if_data_exists_for_same_geoscales(
        fba_wsec_walloc, source, activity):  # fba_w_aggregated_sectors
    """
    Determine if data exists at the same scales for the datasource and allocation source
    :param fba_wsec_walloc: FlowByActivity df with sectors and merged allocation data
    :param source: str, the datasource name
    :param activity: list of activity names of interest
    :return: None; logs whether allocation flow ratio data is missing
    """
    # todo: modify so only returns warning if no value for entire location, not just no value for one of the possible sectors

    from flowsa.mapping import get_activitytosector_mapping

    # create list of highest sector level for which there should be data
    mapping = get_activitytosector_mapping(source)
    # filter by activity of interest
    mapping = mapping.loc[mapping['Activity'].isin(activity)]
    # add sectors to list
    sectors_list = pd.unique(mapping['Sector']).tolist()

    # subset fba w sectors and with merged allocation table so only have rows with aggregated sector list
    df_subset = fba_wsec_walloc.loc[
        (fba_wsec_walloc[fbs_activity_fields[0]].isin(sectors_list)) |
        (fba_wsec_walloc[fbs_activity_fields[1]].isin(sectors_list)
         )].reset_index(drop=True)
    # only interested in total flows
    # df_subset = df_subset.loc[df_subset['FlowName'] == 'total'].reset_index(drop=True)
    # df_subset = df_subset.loc[df_subset['Compartment'] == 'total'].reset_index(drop=True)

    # create subset of fba where the allocation data is missing
    missing_alloc = df_subset.loc[
        df_subset['FlowAmountRatio'].isna()].reset_index(drop=True)
    # drop any rows where source flow value = 0
    missing_alloc = missing_alloc.loc[
        missing_alloc['FlowAmount'] != 0].reset_index(drop=True)
    # create list of locations with missing allocation data
    states_missing_data = pd.unique(missing_alloc['Location']).tolist()

    if len(missing_alloc) == 0:
        log.info("All aggregated sector flows have allocation flow ratio data")
    else:
        log.warning("Missing allocation flow ratio data for " +
                    ', '.join(states_missing_data))

    return None
Example No. 10
def assign_fips_location_system(df, year_of_data):
    """
    Add location system based on year of data. County level FIPS change over the years.
    :param df: df with FIPS location system
    :param year_of_data: str, year of data pulled
    :return: df with 'LocationSystem' column assigned
    """

    if '2015' <= year_of_data:
        df.loc[:, 'LocationSystem'] = 'FIPS_2015'
    elif '2013' <= year_of_data < '2015':
        df.loc[:, 'LocationSystem'] = 'FIPS_2013'
    elif '2010' <= year_of_data < '2013':
        df.loc[:, 'LocationSystem'] = 'FIPS_2010'
    elif year_of_data < '2010':
        log.warning(
            "Missing FIPS codes from crosswalk for " + year_of_data + ". Temporarily assigning to FIPS_2010")
        df.loc[:, 'LocationSystem'] = 'FIPS_2010'
    return df
Example No. 11
def assign_fips_location_system(df, year_of_data):
    """
    Add location system based on year of data. County level FIPS change over the years.
    :param df: df with FIPS location system
    :param year_of_data: str, year of data pulled
    :return: df, with 'LocationSystem' column values
    """

    if year_of_data >= '2015':
        df.loc[:, 'LocationSystem'] = 'FIPS_2015'
    elif '2013' <= year_of_data < '2015':
        df.loc[:, 'LocationSystem'] = 'FIPS_2013'
    elif '2010' <= year_of_data < '2013':
        df.loc[:, 'LocationSystem'] = 'FIPS_2010'
    elif year_of_data < '2010':
        log.warning(
            "Missing FIPS codes from crosswalk for %s. Assigning to FIPS_2010", year_of_data)
        df.loc[:, 'LocationSystem'] = 'FIPS_2010'

    return df
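
Both versions above rely on lexicographic string comparison, which orders four-digit year strings the same way as their numeric values. A short illustration of the bucketing (assumes the function above is in scope; the Location value is illustrative):

import pandas as pd

assert '2010' <= '2012' < '2013' < '2015'  # 4-digit year strings sort numerically

df = pd.DataFrame({'Location': ['06037'], 'FlowAmount': [1.0]})
df = assign_fips_location_system(df, '2012')
print(df['LocationSystem'].iloc[0])  # FIPS_2010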
Example No. 12
def check_for_missing_sector_data(df, target_sector_level):
    """
    Modeled after datachecks.py check_if_losing_sector_data
    Allocates flow amount equally across child NAICS when parent NAICS is not target_level
    :param df: a FlowBySector df with a 'SectorProducedBy' column
    :param target_sector_level: str, target sector level, e.g. 'NAICS_6'
    :return: df with FlowAmounts allocated equally to the target sector level
    """

    from flowsa.flowbyfunctions import replace_NoneType_with_empty_cells, replace_strings_with_NoneType

    # temporarily replace null values with empty cells
    df = replace_NoneType_with_empty_cells(df)

    activity_field = "SectorProducedBy"
    rows_lost = pd.DataFrame()
    cw_load = load_sector_length_crosswalk_w_nonnaics()
    for i in range(3, sector_level_key[target_sector_level]):
        # create df of i length
        df_subset = df.loc[df[activity_field].apply(lambda x: len(x) == i)]

        # import cw and subset to current sector length and target sector length

        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & replace sector produced columns
        df_x = pd.merge(df_subset,
                        cw,
                        how='left',
                        left_on=[activity_field],
                        right_on=[nlength])
        df_x[activity_field] = df_x[target_sector_level]
        df_x = df_x.drop(columns=[nlength, target_sector_level])

        # calculate new flow amounts, based on sector count, allocating equally to the new sector length codes
        df_x['FlowAmount'] = df_x['FlowAmount'] / df_x['sector_count']
        df_x = df_x.drop(columns=['sector_count'])
        # replace null values with empty cells
        df_x = replace_NoneType_with_empty_cells(df_x)

        # append to df
        sector_list = df_subset[activity_field].drop_duplicates()
        if len(df_x) != 0:
            log.warning('Data found at ' + str(i) +
                        ' digit NAICS to be allocated'
                        ': {}'.format(' '.join(map(str, sector_list))))
            rows_lost = pd.concat([rows_lost, df_x], ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info('No data loss from NAICS in dataframe')
    else:
        log.info('Allocating FlowAmounts equally to each ' +
                 target_sector_level)

    # add rows of missing data to the fbs sector subset
    df_allocated = pd.concat([df, rows_lost], ignore_index=True, sort=True)
    df_allocated = df_allocated.loc[df_allocated[activity_field].apply(
        lambda x: len(x) == sector_level_key[target_sector_level])]
    df_allocated.reset_index(drop=True, inplace=True)

    # replace empty cells with NoneType (if dtype is object)
    df_allocated = replace_strings_with_NoneType(df_allocated)

    return df_allocated
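
The equal-allocation step above boils down to a crosswalk merge plus a division by the number of children per parent. A self-contained toy version of just that arithmetic (hypothetical codes and amounts):

import pandas as pd

df = pd.DataFrame({'SectorProducedBy': ['1111'], 'FlowAmount': [90.0]})
cw = pd.DataFrame({'NAICS_4': ['1111', '1111', '1111'],
                   'NAICS_6': ['111110', '111120', '111130']})

# count the number of 6-digit children for each 4-digit parent
cw['sector_count'] = cw.groupby('NAICS_4')['NAICS_4'].transform('count')

out = df.merge(cw, how='left', left_on='SectorProducedBy', right_on='NAICS_4')
out['SectorProducedBy'] = out['NAICS_6']
out['FlowAmount'] = out['FlowAmount'] / out['sector_count']
# three rows, 30.0 each: the parent amount split equally across its children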
Example No. 13
def check_if_losing_sector_data(df, df_subset, target_sector_level):
    """
    Determine which rows of data would be lost if the df were subset at the
    target sector level, and allocate those amounts to target-level sectors.
    In some instances, not all data is captured at the target sector level.
    :param df: a FlowBySector df prior to subsetting by sector length
    :param df_subset: the FlowBySector df subset to the target sector level
    :param target_sector_level: str, e.g. 'NAICS_6'
    :return: df_subset with previously-dropped amounts allocated and appended
    """

    df = df.fillna(fbs_fill_na_dict)
    # exclude nonsectors
    df = df.replace({'nan': '', 'None': ''})

    rows_lost = pd.DataFrame()
    for i in range(2, sector_level_key[target_sector_level]):
        # create df of i length
        df_x1 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
            & (df[fbs_activity_fields[1]] == '')]
        df_x2 = df.loc[(df[fbs_activity_fields[0]] == '') & (
            df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x3 = df.loc[
            (df[fbs_activity_fields[0]].apply(lambda x: len(x) == i))
            & (df[fbs_activity_fields[1]].apply(lambda x: len(x) == i))]
        df_x = pd.concat([df_x1, df_x2, df_x3], ignore_index=True, sort=False)

        # create df of i + 1 length
        df_y1 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
            | df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y2 = df.loc[
            df[fbs_activity_fields[0]].apply(lambda x: len(x) == i + 1)
            & df[fbs_activity_fields[1]].apply(lambda x: len(x) == i + 1)]
        df_y = pd.concat([df_y1, df_y2], ignore_index=True, sort=False)

        # create temp sector columns in df y, that are i digits in length
        df_y.loc[:, 'spb_tmp'] = df_y[fbs_activity_fields[0]].apply(
            lambda x: x[0:i])
        df_y.loc[:, 'scb_tmp'] = df_y[fbs_activity_fields[1]].apply(
            lambda x: x[0:i])
        # don't modify household sector lengths
        df_y = df_y.replace({'F0': 'F010', 'F01': 'F010'})

        # merge the two dfs
        df_m = pd.merge(df_x,
                        df_y[[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'spb_tmp', 'scb_tmp'
                        ]],
                        how='left',
                        left_on=[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'SectorProducedBy', 'SectorConsumedBy'
                        ],
                        right_on=[
                            'Class', 'Context', 'FlowType', 'Flowable',
                            'Location', 'LocationSystem', 'Unit', 'Year',
                            'spb_tmp', 'scb_tmp'
                        ])

        # extract the rows that are not disaggregated to more specific naics
        rl = df_m[(df_m['scb_tmp'].isnull()) & (df_m['spb_tmp'].isnull())]
        # clean df
        rl = clean_df(rl, flow_by_sector_fields, fbs_fill_na_dict)
        rl_list = rl[['SectorProducedBy',
                      'SectorConsumedBy']].drop_duplicates().values.tolist()

        # match sectors with target sector length sectors

        # import cw and subset to current sector length and target sector length
        cw_load = load_sector_length_crosswalk_w_nonnaics()
        nlength = list(sector_level_key.keys())[list(
            sector_level_key.values()).index(i)]
        cw = cw_load[[nlength, target_sector_level]].drop_duplicates()
        # add column with counts
        cw['sector_count'] = cw.groupby(nlength)[nlength].transform('count')

        # merge df & conditionally replace sector produced/consumed columns
        rl_m = pd.merge(rl,
                        cw,
                        how='left',
                        left_on=[fbs_activity_fields[0]],
                        right_on=[nlength])
        rl_m.loc[rl_m[fbs_activity_fields[0]] != '',
                 fbs_activity_fields[0]] = rl_m[target_sector_level]
        rl_m = rl_m.drop(columns=[nlength, target_sector_level])

        rl_m2 = pd.merge(rl_m,
                         cw,
                         how='left',
                         left_on=[fbs_activity_fields[1]],
                         right_on=[nlength])
        rl_m2.loc[rl_m2[fbs_activity_fields[1]] != '',
                  fbs_activity_fields[1]] = rl_m2[target_sector_level]
        rl_m2 = rl_m2.drop(columns=[nlength, target_sector_level])

        # create one sector count column
        rl_m2['sector_count_x'] = rl_m2['sector_count_x'].fillna(
            rl_m2['sector_count_y'])
        rl_m3 = rl_m2.rename(columns={'sector_count_x': 'sector_count'})
        rl_m3 = rl_m3.drop(columns=['sector_count_y'])

        # calculate new flow amounts, based on sector count, allocating equally to the new sector length codes
        rl_m3['FlowAmount'] = rl_m3['FlowAmount'] / rl_m3['sector_count']
        rl_m3 = rl_m3.drop(columns=['sector_count'])

        # append to df
        if len(rl) != 0:
            log.warning('Data found at ' + str(i) +
                        ' digit NAICS not represented in current '
                        'data subset: {}'.format(' '.join(map(str, rl_list))))
            rows_lost = pd.concat([rows_lost, rl_m3], ignore_index=True, sort=True)

    if len(rows_lost) == 0:
        log.info(
            'No data loss from subsetting the dataframe by specified sector length'
        )
    else:
        log.info('Allocating FlowAmounts equally to each ' +
                 target_sector_level +
                 ' associated with the sectors previously being dropped')

    # add rows of missing data to the fbs sector subset
    df_w_lost_data = pd.concat([df_subset, rows_lost],
                               ignore_index=True,
                               sort=True)
    df_w_lost_data = df_w_lost_data.replace({'': None})

    return df_w_lost_data
Example No. 14
def check_for_negative_flowamounts(df):
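    """
    Check for negative values in the 'FlowAmount' column
    :param df: df with a 'FlowAmount' column
    :return: df, unchanged; a warning is logged if negative values are found
    """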

    if (df['FlowAmount'].values < 0).any():
        log.warning('There are negative FlowAmounts')

    return df
Example No. 15
def main(**kwargs):
    """
    Generate FBA parquet(s)
    :param kwargs: 'source' and 'year'
    :return: parquet saved to local directory
    """
    # assign arguments
    if len(kwargs) == 0:
        kwargs = parse_args()

    source = kwargs['source']
    year = kwargs['year']

    # assign yaml parameters (common.py fxn), drop any extensions to FBA
    # filename if run into error
    try:
        config = load_yaml_dict(source, flowbytype='FBA')
    except UnboundLocalError:
        log.info(f'Could not find Flow-By-Activity config file for {source}')
        source = get_flowsa_base_name(sourceconfigpath, source, "yaml")
        log.info(f'Generating FBA for {source}')
        config = load_yaml_dict(source, flowbytype='FBA')

    log.info("Creating dataframe list")
    # year input can either be sequential years (e.g. 2007-2009) or single year
    if '-' in str(year):
        years = str(year).split('-')
        year_iter = list(range(int(years[0]), int(years[1]) + 1))
    else:
        # Else only a single year defined, create an array of one:
        year_iter = [year]

    # check that year(s) are listed in the method yaml, return warning if not
    years_list = list(set(list(map(int, year_iter))
                          ).difference(config['years']))
    if len(years_list) != 0:
        log.warning(f'Years not listed in FBA method yaml: {years_list}, '
                    f'data might not exist')

    for p_year in year_iter:
        year = str(p_year)
        # replace parts of urls with specific instructions from source.py
        urls = assemble_urls_for_query(source=source, year=year, config=config)
        # create a list with data from all source urls
        df_list = call_urls(url_list=urls,
                            source=source, year=year, config=config)
        # concat the dataframes and parse data with specific
        # instructions from source.py
        log.info("Concat dataframe list and parse data")
        dfs = parse_data(df_list=df_list,
                         source=source, year=year, config=config)
        if isinstance(dfs, list):
            for frame in dfs:
                if not len(frame.index) == 0:
                    try:
                        source_names = frame['SourceName']
                        source_name = source_names.iloc[0]
                    except KeyError:
                        source_name = source
                    process_data_frame(df=frame,
                                       source=source_name, year=year,
                                       config=config)
        else:
            process_data_frame(df=dfs, source=source, year=year, config=config)
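
The year handling in main() accepts either a single year or a hyphenated range such as '2007-2009'. A standalone sketch of just that parsing (expand_years is a hypothetical helper name, not part of flowsa):

def expand_years(year):
    """Return a list of integer years from '2007-2009', '2009', or 2009."""
    if '-' in str(year):
        start, end = str(year).split('-')
        return list(range(int(start), int(end) + 1))
    return [int(year)]

print(expand_years('2007-2009'))  # [2007, 2008, 2009]
print(expand_years(2015))         # [2015]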
Example No. 16
def generate_fbs_bibliography(methodname):
    """
    Generate bibliography for a FlowBySector
    :param methodname: string, methodname to create a bibliography for
    :return: a .bib file saved in local directory
    """

    from flowsa.metadata import getMetadata

    # create list of sources in method
    sources = generate_list_of_sources_in_fbs_method(methodname)

    # loop through list of sources, load source method yaml, and create bib entry
    bib_list = []
    source_set = set()
    for source in sources:
        # drop list duplicates and any where year is None (because allocation
        # is a function, not a datasource)
        if source[1] != 'None':
            try:
                config = load_values_from_literature_citations_config()[
                    source[0]]
            except KeyError:
                try:
                    config = getMetadata(source[0], source[1])
                except (KeyError, AttributeError):
                    log.info('Could not find metadata for %s', source[0])
                    continue
            if config is not None:
                # ensure data sources are not duplicated when different source names
                try:
                    if (config['source_name'], config['author'], source[1],
                            config['source_url']) not in source_set:
                        source_set.add(
                            (config['source_name'], config['author'],
                             source[1], config['source_url']))

                        # if there is a date downloaded, use in citation over date generated
                        if 'original_data_download_date' in config:
                            bib_date = config['original_data_download_date']
                        elif 'date_accessed' in config:
                            bib_date = config['date_accessed']
                        else:
                            bib_date = config['date_created']

                        db = BibDatabase()
                        db.entries = [{
                            'title': config['source_name'] + ' ' + str(source[1]),
                            'author': config['author'],
                            'year': str(source[1]),
                            'url': config['source_url'],
                            'urldate': bib_date,
                            'ID': config['bib_id'] + '_' + str(source[1]),
                            'ENTRYTYPE': 'misc'
                        }]
                        # append each entry to a list of BibDatabase entries
                        bib_list.append(db)
                except KeyError:
                    log.warning(
                        'Missing information needed to create bib for %s, %s',
                        source[0], source[1])
                    continue

    # write out bibliography
    writer = BibTexWriter()
    # create directory if missing
    os.makedirs(outputpath + '/Bibliography', exist_ok=True)
    with open(f'{biboutputpath}{methodname}.bib', 'w') as bibfile:
        # loop through all entries in bib_list
        for b in bib_list:
            bibfile.write(writer.write(b))
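
The BibDatabase and BibTexWriter objects used above come from the bibtexparser package. A minimal standalone sketch of writing a single entry the same way (entry values are placeholders, not a real source):

from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase

db = BibDatabase()
db.entries = [{'ENTRYTYPE': 'misc',
               'ID': 'example_source_2017',
               'title': 'Example Source 2017',
               'author': 'Example Agency',
               'year': '2017',
               'url': 'https://example.gov/data',
               'urldate': '2020-01-01'}]

writer = BibTexWriter()
with open('example.bib', 'w') as bibfile:
    bibfile.write(writer.write(db))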