def drop_unnamed_columns(df: DataFrame, inplace: bool = False) -> DataFrame:
    """
    Drop columns whose label begins with 'Unnamed' from a DataFrame
    :param df: DataFrame to remove columns from
    :param inplace: when True modify and return df itself, otherwise return a new DataFrame
    :return: Updated DataFrame
    """
    # only string labels can start with 'Unnamed'; non-string labels
    # (e.g. integer headers) are kept rather than raising AttributeError
    unnamed = [
        column for column in df.columns
        if isinstance(column, str) and column.startswith('Unnamed')
    ]

    if inplace:
        df.drop(columns=unnamed, inplace=True)
        return df
    return df.drop(columns=unnamed)
# Example #2
# 0
def transform_table_desc_df(context, table_desc: DataFrame) -> DataFrame:
    """
    Clean up a table-description DataFrame in place: blank out missing values,
    remove comment rows and remove the 'ignore' helper column
    :param context: execution context (not used in this function)
    :param table_desc: pandas DataFrame describing a database table; must have an 'ignore' column
    :return: the same DataFrame object, modified in place
    :rtype: pandas.DataFrame
    """
    # missing cells become empty strings so the string test below sees text everywhere
    table_desc.fillna('', inplace=True)

    # a '#' anywhere in the 'ignore' cell marks the row as a comment line
    is_comment = table_desc['ignore'].str.contains('#')
    comment_rows = table_desc[is_comment].index

    # remove the comment rows first, then the 'ignore' column itself
    table_desc.drop(index=comment_rows, inplace=True)
    table_desc.drop(columns=['ignore'], inplace=True)

    return table_desc
# Example #3
# 0
def organise_creator(_, quickstate: dp.DataFrame):
    """
    Map creator names to Wikidata QIDs and export the table as QuickStatements commands.

    The "P170" column is replaced by the creator's QID (empty string when the name is
    not in the lookup table), the "date_accuracy" column is dropped and missing values
    become empty strings. Rows with a "qid" produce edit commands, rows without one
    produce CREATE/LAST commands. The two batches are written to
    data/output/quickstatements_edit.txt and data/output/quickstatements_create.txt.

    :param _: unused first argument
    :param quickstate: DataFrame with "qid", "P170" and "date_accuracy" columns
        ("P217" is also read whenever "P195" is non-empty); its "P170" column is
        overwritten in place
    :return: the transformed DataFrame indexed by "qid"
    """
    creators = {
        "Augusto Malta": "Q16495239",
        "Anônimo": "Q4233718",
        "Marc Ferrez": "Q3180571",
        "Georges Leuzinger": "Q5877879",
        "José dos Santos Affonso": "Q63993961",
        "N. Viggiani": "Q65619909",
        "Archanjo Sobrinho": "Q64009665",
        "F. Basto": "Q55089601",
        "J. Faria de Azevedo": "Q97570600",
        "S. H. Holland": "Q65619918",
        "Augusto Monteiro": "Q65619921",
        "Jorge Kfuri": "Q63166336",
        "Camillo Vedani": "Q63109123",
        "Fritz Büsch": "Q63109492",
        "Armando Pittigliani": "Q19607834",
        "Braz": "Q97487621",
        "Stahl & Wahnschaffe": "Q63109157",
        "Gomes Junior": "Q86942676",
        "A. Ruelle": "Q97570551",
        "Guilherme Santos": "Q55088608",
        "Albert Frisch": "Q21288396",
        "José Baptista Barreira Vianna": "Q63166517",
        "Alfredo Krausz": "Q63166405",
        "Therezio Mascarenhas": "Q97570728",
        "Torres": "Q65619905",
        "Theodor Preising": "Q63109140",
        "Augusto Stahl": "Q4821327",
        "Luiz Musso": "Q89538832",
        "Carlos Bippus": "Q63109147",
        "Thiele": "Q64825643",
        "Revert Henrique Klumb": "Q3791061",
        "Juan Gutierrez": "Q10312614",
        "F. Manzière": "Q65619915",
        "Antonio Luiz Ferreira": "Q97570558",
        "Etienne Farnier": "Q97570575",
        "José Francisco Corrêa": "Q10309433",
        "Chapelin": "Q97570376",
        "J. Teixeira": "Q89642578",
        "F. Garcia": "Q97570588",
        "A. de Barros Lobo": "Q97570363",
        "Bloch": "Q61041099",
    }

    # properties whose values are emitted as quoted strings
    str_props = ("Lpt-br", "Dpt-br", "Den", "P217", "P7835")
    # labels/descriptions: emitted without the S248/S813 reference suffix
    no_ref_props = ("Lpt-br", "Dpt-br", "Den")

    def name2qid(name):
        """
        Takes a string and returns the
        corresponding Wikidata QID ("" when the name is unknown)
        """
        return creators.get(f"{name}", "")

    def row2statements(row, subject, reference):
        """
        Build one QuickStatements command per non-empty cell of a row.

        :param row: dict of column name -> cell value (not modified)
        :param subject: first field of every command (an item QID or "LAST")
        :param reference: reference suffix appended to every referenced property
        :return: list of command strings
        """
        statements = []
        # NOTE(review): the "qid" column itself is also emitted as a statement
        # ("<qid>|qid|<qid>|...") when it is non-empty; kept as in the original,
        # confirm whether the consumer relies on or filters that line.
        for key, value in row.items():
            if not value:
                continue
            # quote into a local so the row keeps its raw values; mutating the row
            # made the P217 qualifier below double-quoted when P217 preceded P195
            if key in str_props:
                value = '"{0}"'.format(value)
            statement = "|".join([
                subject,
                str(key).replace("P31_a", "P31"),
                str(value),
            ])
            if key == "P217":
                statement += "|P195|Q71989864"
            if key == "P195":
                statement += '|P217|"{0}"'.format(row["P217"])
            if key not in no_ref_props:
                statement += reference
            statements.append(statement)
        return statements

    def df2quickstatements(df):
        """
        Convert every row of df to QuickStatements text.

        :return: dict with the "create" batch (rows without qid) and the
            "edit" batch (rows with qid), commands separated by "||"
        """
        # one retrieval date for the whole batch
        reference = "|S248|Q64995339|S813|+{0}Z/11".format(
            dt.now().strftime("%Y-%m-%dT00:00:00"))
        create_items = []
        edit_items = []
        for _index, series in df.iterrows():
            row = dict(series)
            if row["qid"]:
                edit_items.append("||".join(
                    row2statements(row, str(row["qid"]), reference)))
            else:
                create_items.append("||".join(
                    ["CREATE"] + row2statements(row, "LAST", reference)))
        return {
            "create": "||".join(create_items),
            "edit": "||".join(edit_items),
        }

    quickstate["P170"] = quickstate["P170"].apply(name2qid)
    quickstate = quickstate.drop(columns="date_accuracy")
    quickstate.name = "import_wikidata"

    quickstate.fillna("", inplace=True)

    # build both batches in a single pass instead of once per output file
    batches = df2quickstatements(quickstate)

    # explicit UTF-8: labels and descriptions may contain non-ASCII text
    with open("data/output/quickstatements_create.txt", "w",
              encoding="utf-8") as f:
        f.write(batches["create"])

    with open("data/output/quickstatements_edit.txt", "w",
              encoding="utf-8") as f:
        f.write(batches["edit"])

    return quickstate.set_index("qid")
def transform_ex_rates_per_usd(context, ex_rates_per_usd: DataFrame,
                               currency_eq_usd_df: DataFrame,
                               currency_codes_df: DataFrame, cur_config: Dict):
    """
    Fill in USD rates for required currencies that are missing from the currency
    equivalent DataFrame, using the per-USD exchange rates plus the supplementary
    rates from the configuration. For every date the monthly rate is used when it
    is numeric, otherwise the quarterly rate, otherwise the yearly rate.

    :param context: execution context
    :param ex_rates_per_usd: DataFrame from an 'IMF National Currency per U.S. Dollar, period average' file
    :param currency_eq_usd_df: panda Dataframe of currency to USD rates; modified in place
    :param currency_codes_df: IS0 4217 currency codes DataFrame
    :param cur_config: currency configuration
    :return: currency_eq_usd_df with a column added for each missing required currency
    :raises KeyError: if a missing currency's country has no row in the exchange rates
    """
    cfg = cur_config['value']
    date_col_name = cfg['date_col_name']
    supplementary_currency_rates = cfg['supplementary_currency_rates']
    currency_codes_cfg = cfg['currency_codes']
    country_attrib = currency_codes_cfg['country_attrib']
    currency_code_attrib = currency_codes_cfg['currency_code_attrib']

    context.log.info('Generating list of currencies missing USD rates')

    # make list of missing currencies and add columns to the currency equivalent usd's dataframe
    missing_currencies = []
    for code in cfg['currencies_required']:
        if code not in currency_eq_usd_df.columns:
            currency_eq_usd_df[code] = np.nan
            missing_currencies.append(code)

    # add temp columns with values to match ex_rates_per_usd column names,
    # e.g. '2020', '2020M03', '2020Q1'
    dates = currency_eq_usd_df[date_col_name]
    currency_eq_usd_df['year'] = dates.apply(lambda x: x.strftime('%Y'))
    currency_eq_usd_df['year_mth'] = dates.apply(
        lambda x: x.strftime('%YM%m'))
    # months 1-3 -> Q1, 4-6 -> Q2, 7-9 -> Q3, 10-12 -> Q4
    currency_eq_usd_df['year_qtr'] = dates.apply(
        lambda x: x.strftime('%Y') + 'Q' + str((x.month - 1) // 3 + 1))
    # order matters: most specific period first
    temp_period_columns = ['year_mth', 'year_qtr', 'year']

    context.log.info('Loading supplementary currency information')

    # add supplementary currency info to exchange rate per usd
    cidx_ex_rates_per_usd = ex_rates_per_usd.set_index(
        ex_rates_per_usd['Country'].str.lower())  # country name as index
    for code, suplm_currency in supplementary_currency_rates.items():
        country = currency_codes_df[currency_codes_df[currency_code_attrib]
                                    == code]
        country_name = None
        if len(country) > 0:
            country_name = country[country_attrib].iloc[0].lower()

        for suplm_time, suplm_currency_value in suplm_currency.items():
            if suplm_time not in cidx_ex_rates_per_usd.columns:
                # '...' is non-numeric, so it is treated as 'no value' below
                cidx_ex_rates_per_usd[suplm_time] = '...'

            if country_name is None:
                continue

            if country_name not in cidx_ex_rates_per_usd.index:
                # add a row for the new country (setting with enlargement);
                # replaces DataFrame.append which was removed in pandas 2.0
                cidx_ex_rates_per_usd.loc[country_name,
                                          'Country'] = country_name

            cidx_ex_rates_per_usd.at[country_name,
                                     suplm_time] = suplm_currency_value

    context.log.info('Updating list of currencies with missing USD rates')

    for currency_code in missing_currencies:
        currency = currency_codes_df[currency_codes_df[currency_code_attrib] ==
                                     currency_code]
        if len(currency) == 0:
            continue
        country_name = currency[country_attrib].iloc[0].lower()

        for alias in currency_codes_cfg['currency_name_aliases']:
            alias_lower = [x.lower() for x in alias]
            if country_name in alias_lower:
                idx = alias_lower.index(country_name)
                # 2 entries in list, get the one its not
                country_name = alias_lower[(idx + 1) % 2]

        ex_rate_country = cidx_ex_rates_per_usd.loc[
            country_name]  # series of country ex rates

        def get_time_col_value(col):
            # rate for the period label, NaN when absent or non-numeric
            value = np.nan
            if col in ex_rate_country.index:
                value = ex_rate_country.at[col]
                if not isinstance(value, (float, int)):
                    value = np.nan
            return value

        not_filled_mask = None
        for time_col in temp_period_columns:
            if not_filled_mask is None:
                # first pass: take the value of the most specific period
                currency_eq_usd_df[currency_code] = currency_eq_usd_df[
                    time_col].apply(get_time_col_value)
            else:
                # later passes: only fill rows still empty (NaN never compares
                # equal to itself, so the mask must come from isna())
                currency_eq_usd_df.loc[not_filled_mask, currency_code] = \
                    currency_eq_usd_df.loc[not_filled_mask, time_col].apply(
                        get_time_col_value)

            not_filled_mask = currency_eq_usd_df[currency_code].isna()
            if not not_filled_mask.any():
                break

    currency_eq_usd_df.drop(temp_period_columns, axis=1, inplace=True)

    return currency_eq_usd_df