def organize_columns(context, df: dp.DataFrame):
    """
    Rename columns, remove file extension from identifiers,
    normalize creators names and drop duplicates

    The input frame is not modified; a new DataFrame is returned holding
    only the renamed columns selected below, with one row per "Source ID"
    (the last occurrence wins).
    """
    column_names = {
        "Record Name": "Source ID",
        "CÓDIGO DE IDENTIFICAÇÃO PRELIMINAR": "preliminary id",
        "TÍTULO": "Title",
        "RESUMO": "Description (Portuguese)",
        "AUTORIA": "Creator",
        "DATA": "Date",
        "DATA LIMITE INFERIOR": "First Year",
        "DATA LIMITE SUPERIOR": "Last Year",
        "DIMENSÃO": "dimensions",
        "PROCESSO FORMADOR DA IMAGEM": "Fabrication Method",
        "DESIGNAÇÃO GENÉRICA": "Materials",
        "FORMATO PADRÃO": "format",
    }
    selected_columns = [
        "Source ID",
        "Title",
        "Description (Portuguese)",
        "Creator",
        "Date",
        "First Year",
        "Last Year",
        "Materials",
        "Fabrication Method",
        "format",
        "dimensions",
        "preliminary id",
    ]

    # rename and select columns; the explicit copy makes the assignments
    # below operate on an independent frame (no SettingWithCopyWarning)
    cumulus_df = df.rename(columns=column_names)[selected_columns].copy()

    # remove file extension: keep everything before the first "."
    cumulus_df["Source ID"] = cumulus_df["Source ID"].str.split(".", n=1).str[0]

    # reverse creator name ("Last, First" -> "First Last"); regex=True is
    # required because pandas >= 2.0 treats the pattern literally by default
    cumulus_df["Creator"] = cumulus_df["Creator"].str.replace(
        r"(.+),\s+(.+)", r"\2 \1", regex=True
    )

    # remove duplicates, keeping the last record of each identifier
    return cumulus_df.drop_duplicates(subset="Source ID", keep="last")
def transform_imf_currency_tsv(context, currency_df: DataFrame, cur_config: Dict):
    """
    Transform an IMF SDR per currency DataFrame

    Steps, in order: strip whitespace from column names and cell values; add
    a row for every day between the configured start and end dates that is
    not already present; keep only rows whose date cell matches the
    configured date pattern; parse and sort the dates; fill gaps in the
    non-date columns (forward fill, then backward fill for leading gaps);
    convert the non-date columns to float; rename columns using the
    configured currency name pattern.

    The DataFrame passed in is not modified.

    :param context: execution context
    :param currency_df: DataFrame to process; the `.str` accessor is applied
        to every column, so all columns must hold strings
    :param cur_config: currency configuration; its 'value' entry must provide
        'date_col_name', 'currency_start_date', 'currency_end_date',
        'to_datetime' (date format string), 'date_pattern' (regex) and
        'currency_name_pattern' (regex with two groups: group 2 becomes the
        column label and group 1 is recorded as its name)
    :return: yields the transformed DataFrame as output 'currency_df' and the
        {column label: name} dict as output 'currency_names'
    """
    cfg = cur_config['value']
    date_col_name = cfg['date_col_name']
    date_format = cfg['to_datetime']

    # remove any whitespace in column names; rename() returns a new frame so
    # the caller's DataFrame is left untouched
    currency_df = currency_df.rename(columns=str.strip)
    # clear whitespace in the values
    for column in currency_df.columns:
        currency_df[column] = currency_df[column].str.strip()

    # make sure no dates are missing: collect every absent day first and add
    # them in one concat (DataFrame.append was removed in pandas 2.0)
    start_date = datetime.strptime(cfg['currency_start_date'], date_format)
    end_date = datetime.strptime(cfg['currency_end_date'], date_format)
    known_dates = set(currency_df[date_col_name].dropna())
    missing_dates = []
    for offset in range((end_date - start_date).days + 1):
        date_text = (start_date + timedelta(days=offset)).strftime(date_format)
        if date_text not in known_dates:
            known_dates.add(date_text)
            missing_dates.append(date_text)
    if missing_dates:
        currency_df = pd.concat(
            [currency_df, pd.DataFrame({date_col_name: missing_dates})],
            ignore_index=True)

    # drop non-data rows in date column; copy so later column assignments
    # do not write into a slice of the unfiltered frame
    currency_df[date_col_name] = currency_df[date_col_name].fillna(value='')
    is_date_row = currency_df[date_col_name].str.contains(cfg['date_pattern'])
    currency_df = currency_df[is_date_row].copy()

    # convert dates and sort
    currency_df[date_col_name] = pd.to_datetime(currency_df[date_col_name],
                                                format=date_format)
    currency_df = currency_df.sort_values(by=[date_col_name])

    # fill gaps with previous value, and if the gap is on the first line,
    # the next valid value; then convert to floats
    for column in currency_df.columns:
        if column != date_col_name:
            currency_df[column] = (
                currency_df[column].ffill().bfill().astype(float))

    # rename columns matching the currency name pattern
    columns = []
    currency_names = {}
    regex = re.compile(cfg['currency_name_pattern'])
    for column in currency_df.columns:
        match = regex.search(column)
        if match:
            currency_names[match.group(2)] = match.group(1)
            columns.append(match.group(2))
        else:
            columns.append(column)
    currency_df.columns = columns

    yield Output(currency_df, 'currency_df')
    yield Output(currency_names, 'currency_names')