示例#1
0
def organize_columns(context, df: dp.DataFrame):
    """
    Rename columns, remove file extension from identifiers,
    normalize creators names and drop duplicates.

    :param context: execution context (not used by this function)
    :param df: catalogue DataFrame holding the original Portuguese headers
        listed in the rename mapping below, plus "Record Name"
    :return: a new DataFrame restricted to the renamed columns, with one row
        per "Source ID" (the last occurrence wins); ``df`` is not modified
    """
    renamed_columns = {
        "Record Name": "Source ID",
        "CÓDIGO DE IDENTIFICAÇÃO PRELIMINAR": "preliminary id",
        "TÍTULO": "Title",
        "RESUMO": "Description (Portuguese)",
        "AUTORIA": "Creator",
        "DATA": "Date",
        "DATA LIMITE INFERIOR": "First Year",
        "DATA LIMITE SUPERIOR": "Last Year",
        "DIMENSÃO": "dimensions",
        "PROCESSO FORMADOR DA IMAGEM": "Fabrication Method",
        "DESIGNAÇÃO GENÉRICA": "Materials",
        "FORMATO PADRÃO": "format",
    }
    selected_columns = [
        "Source ID",
        "Title",
        "Description (Portuguese)",
        "Creator",
        "Date",
        "First Year",
        "Last Year",
        "Materials",
        "Fabrication Method",
        "format",
        "dimensions",
        "preliminary id",
    ]

    # Explicit copy: the column assignments below must write to our own
    # frame, not to a view of the selection (avoids SettingWithCopy issues).
    cumulus_df = df.rename(columns=renamed_columns)[selected_columns].copy()

    # remove file extension: keep only the text before the first "."
    cumulus_df["Source ID"] = cumulus_df["Source ID"].str.split(".", n=1).str[0]

    # remove duplicates, keeping the last row seen for each identifier
    cumulus_df = cumulus_df.drop_duplicates(subset="Source ID", keep="last")

    # reverse creator name: "Last, First" -> "First Last".
    # regex=True is required: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default and would silently do nothing.
    cumulus_df["Creator"] = cumulus_df["Creator"].str.replace(
        r"(.+),\s+(.+)", r"\2 \1", regex=True
    )

    return cumulus_df
def transform_imf_currency_tsv(context, currency_df: DataFrame,
                               cur_config: Dict):
    """
    Transform an IMF SDR per currency DataFrame.

    Processing steps: strip whitespace from headers and cells, add a row for
    every calendar day of the configured range that is absent, drop rows
    whose date cell does not match the configured pattern, parse and sort
    the dates, fill gaps (previous value first, then next value for leading
    gaps), convert the value columns to float and rename the columns that
    match the currency name pattern.

    The caller's DataFrame is left unmodified; all work is done on a copy.

    :param context: execution context (not used by this function)
    :param currency_df: DataFrame to process; cells are accessed through the
        ``.str`` accessor, so columns are expected to hold strings
    :param cur_config: currency configuration; its ``'value'`` entry must
        provide ``date_col_name``, ``currency_start_date``,
        ``currency_end_date``, ``to_datetime`` (strptime/strftime format),
        ``date_pattern`` and ``currency_name_pattern`` (regex with two
        groups: group 1 is stored as the name, group 2 becomes the column)
    :return: yields ``Output(currency_df, 'currency_df')`` followed by
        ``Output(currency_names, 'currency_names')``, where
        ``currency_names`` maps each new column name to regex group 1
    """
    cfg = cur_config['value']
    date_col_name = cfg['date_col_name']
    date_format = cfg['to_datetime']

    # clear whitespace; rename() returns a new frame, so the input DataFrame
    # is not mutated by this or any of the following assignments
    currency_df = currency_df.rename(columns=str.strip)
    for column in currency_df.columns:
        currency_df[column] = currency_df[column].str.strip()

    # make sure no dates are missing: collect every absent day first and add
    # them with one concat (DataFrame.append was removed in pandas 2.0 and
    # appending row by row is quadratic)
    start_date = datetime.strptime(cfg['currency_start_date'], date_format)
    end_date = datetime.strptime(cfg['currency_end_date'], date_format)
    known_dates = set(currency_df[date_col_name])
    all_days = (start_date + timedelta(days=offset)
                for offset in range((end_date - start_date).days + 1))
    missing_dates = [
        date_text
        for date_text in (day.strftime(date_format) for day in all_days)
        if date_text not in known_dates
    ]
    if missing_dates:
        currency_df = pd.concat(
            [currency_df, pd.DataFrame({date_col_name: missing_dates})],
            ignore_index=True)

    # drop non-data rows in date column; copy so later assignments write to
    # an independent frame rather than a slice of the unfiltered one
    currency_df[date_col_name] = currency_df[date_col_name].fillna('')
    is_data_row = currency_df[date_col_name].str.contains(cfg['date_pattern'])
    currency_df = currency_df[is_data_row].copy()

    # convert dates and sort
    currency_df[date_col_name] = pd.to_datetime(currency_df[date_col_name],
                                                format=date_format)
    currency_df = currency_df.sort_values(by=[date_col_name])

    # fill gaps with the previous value, or the next valid value when the
    # gap is on the first line(s), then convert to floats
    for column in currency_df.columns:
        if column != date_col_name:
            currency_df[column] = (
                currency_df[column].ffill().bfill().astype(float))

    # rename columns to currency code
    columns = []
    currency_names = {}
    regex = re.compile(cfg['currency_name_pattern'])
    for column in currency_df.columns:
        match = regex.search(column)
        if match:
            currency_names[match.group(2)] = match.group(1)
            columns.append(match.group(2))
        else:
            columns.append(column)

    currency_df.columns = columns

    yield Output(currency_df, 'currency_df')
    yield Output(currency_names, 'currency_names')