def write_metadata(context, metadata: dp.DataFrame, to_tag):
    """
    Write available metadata, including GPS tags, to high-res JPEGs.

    Embeds IPTC and GPS fields into each ``.jpg`` file listed in ``to_tag``
    by invoking ExifTool (executable path taken from ``context.solid_config``).

    :param context: dagster execution context; ``solid_config`` is the
        path to the ExifTool executable
    :param metadata: DataFrame with one row per image; must contain the
        columns read below ("Source ID", "Date", "Creator", "Title", ...)
    :param to_tag: iterable of file paths to tag
    :return: ``to_tag`` unchanged, for the downstream upload step
    """
    # Normalize: blank out NaNs and make IDs case-insensitive via upper-case index.
    # NOTE: mutates the caller's DataFrame in place.
    metadata.fillna(value="", inplace=True)
    metadata["Source ID"] = metadata["Source ID"].str.upper()
    metadata.set_index("Source ID", inplace=True)
    for item in tqdm(to_tag, "Embedding metadata in files..."):
        # Only high-res JPEGs are tagged; other files pass through untouched.
        if item.endswith(".jpg"):
            # File name (without extension) doubles as the metadata row key.
            basename = os.path.split(item)[1]
            name = basename.split(".")[0]
            date = metadata.loc[name.upper(), "Date"]
            byline = metadata.loc[name.upper(), "Creator"]
            headline = metadata.loc[name.upper(), "Title"]
            caption = metadata.loc[name.upper(), "Description (Portuguese)"]
            objecttype = metadata.loc[name.upper(), "Type"]
            # dimensions = f'{metadata.loc[name.upper(), "image_width"]}cm x {metadata.loc[name.upper(), "image_height"]}cm'
            keywords = metadata.loc[name.upper(), "Depicts"].split("||")
            latitude = metadata.loc[name.upper(), "Latitude"]
            longitude = metadata.loc[name.upper(), "Longitude"]
            # altitude = metadata.loc[name.upper(), "Altitude"]
            # imgdirection = metadata.loc[name.upper(), "heading"]
            # One "-TAG=value" argument per field, ExifTool command-line style.
            params = [
                "-IPTC:Source=Instituto Moreira Salles/IMS",
                "-IPTC:CopyrightNotice=This image is in the Public Domain.",
                "-IPTC:City=Rio de Janeiro",
                "-IPTC:Province-State=RJ",
                "-IPTC:Country-PrimaryLocationName=Brasil",
                "-GPSLatitudeRef=S",
                "-GPSLongitudeRef=W",
                "-GPSAltitudeRef=0",
                "-GPSImgDirectionRef=T",
                f"-IPTC:DateCreated={date}",
                f"-IPTC:By-line={byline}",
                f"-IPTC:ObjectName={name}",
                f"-IPTC:Headline={headline}",
                f"-IPTC:Caption-Abstract={caption}",
                f"-IPTC:ObjectTypeReference={objecttype}",
                # f"-IPTC:Dimensions={dimensions}",
                # NOTE(review): `keywords` is a Python list here, so the f-string
                # renders its repr (e.g. "['a', 'b']") — confirm this is intended
                # rather than one -IPTC:Keywords argument per keyword.
                f"-IPTC:Keywords={keywords}",
                f"-GPSLatitude={latitude}",
                f"-GPSLongitude={longitude}",
                # f"-GPSAltitude={altitude}",
                # f"-GPSImgDirection={imgdirection}",
            ]
            with ExifTool(executable_=context.solid_config) as et:
                # NOTE(review): each tag triggers a separate execute() call,
                # i.e. one file rewrite per tag — batching all params into a
                # single call would be faster; verify et.execute's signature.
                for param in params:
                    param = param.encode(encoding="utf-8")
                    dest = item.encode(encoding="utf-8")
                    et.execute(param, dest)
    to_upload = to_tag
    return to_upload
def transform_plot_data(context, df: DataFrame, plot_config: Dict) -> Dict:
    """
    Perform any necessary transformations on the plot data.

    Fix over the original: each plot entry previously stored a reference to
    the SAME input DataFrame, which was re-labelled and mutated on every
    iteration — so all entries ended up sharing the last plot's columns.
    Each plot now works on its own copy, and the caller's ``df`` is left
    untouched.

    :param context: execution context
    :param df: pandas DataFrame of plot data
    :param plot_config: dict of plot configurations; each value must have a
        'header' list and may map column names to per-column settings
        (e.g. {'to_datetime': <format>})
    :return: dict mapping plot key -> {'df': DataFrame, 'config': dict}
    :raises NotImplementedError: if the 'all' plots shorthand is requested
    """
    plot_info = {}
    if 'all' in plot_config:
        raise NotImplementedError(
            'All plots functionality is not yet fully supported')
    plot_details = plot_config
    for plot_key, plot_cfg in plot_details.items():
        # Independent copy so each plot gets its own frame and the input
        # DataFrame is never mutated.
        plot_df = df.copy()
        plot_df.columns = plot_cfg['header']  # add the column names to the dataframe
        for column in plot_cfg['header']:
            column_config = plot_cfg.get(column)
            if column_config and 'to_datetime' in column_config:
                # convert to a date time
                plot_df[column] = pd.to_datetime(
                    plot_df[column], format=column_config['to_datetime'])
        plot_info[plot_key] = {'df': plot_df, 'config': plot_cfg}
    return plot_info
def drop_unnamed_columns(df: DataFrame, inplace: bool = False) -> DataFrame:
    """
    Drop columns beginning with 'Unnamed' from a DataFrame.

    Such columns typically appear when reading CSV/Excel files that contain
    stray empty columns. Fix over the original: non-string column labels
    (e.g. integer positions from a headerless read) no longer raise an
    AttributeError — they are simply kept.

    :param df: DataFrame to remove columns from
    :param inplace: Remove inplace flag
    :return: Updated DataFrame (the same object when ``inplace`` is True)
    """
    # Only string labels can be 'Unnamed...'; skip any other label type.
    unnamed = [column for column in df.columns
               if isinstance(column, str) and column.startswith('Unnamed')]
    if inplace:
        df.drop(unnamed, axis=1, inplace=True)
    else:
        df = df.drop(unnamed, axis=1, inplace=False)
    return df
def generate_test_data():
    """Generate a DataFrame of random integers (100 rows x 4 named columns)."""
    labels = ['col_a', 'col_b', 'col_c', 'col_d']
    values = np.random.randint(0, 100, size=(100, 4))
    frame = pd.DataFrame(values, columns=labels)
    return DataFrame(frame)
def organize_columns(context, df: dp.DataFrame):
    """
    Rename columns, remove file extension from identifiers, normalize
    creators names and drop duplicates.

    Fix over the original: ``str.replace`` is now called with an explicit
    ``regex=True`` — pandas 2.0 changed the default to ``regex=False``,
    which would silently stop reversing "Surname, Name" into "Name Surname".

    :param context: execution context
    :param df: raw Cumulus export with Portuguese column headers
    :return: cleaned DataFrame with one row per "Source ID"
    """
    # rename columns
    cumulus_df = df.rename(
        columns={
            "Record Name": "Source ID",
            "CÓDIGO DE IDENTIFICAÇÃO PRELIMINAR": "preliminary id",
            "TÍTULO": "Title",
            "RESUMO": "Description (Portuguese)",
            "AUTORIA": "Creator",
            "DATA": "Date",
            "DATA LIMITE INFERIOR": "First Year",
            "DATA LIMITE SUPERIOR": "Last Year",
            "DIMENSÃO": "dimensions",
            "PROCESSO FORMADOR DA IMAGEM": "Fabrication Method",
            "DESIGNAÇÃO GENÉRICA": "Materials",
            "FORMATO PADRÃO": "format",
        },
    )
    # select columns
    cumulus_df = cumulus_df[
        [
            "Source ID",
            "Title",
            "Description (Portuguese)",
            "Creator",
            "Date",
            "First Year",
            "Last Year",
            "Materials",
            "Fabrication Method",
            "format",
            "dimensions",
            "preliminary id",
        ]
    ]
    # remove file extension (keep everything before the first '.')
    cumulus_df["Source ID"] = cumulus_df["Source ID"].str.split(".", n=1, expand=True)[
        0
    ]
    # remove duplicates, keeping the most recent record for each ID
    cumulus_df = cumulus_df.drop_duplicates(subset="Source ID", keep="last")
    # reverse creator name: "Surname, Name" -> "Name Surname"
    cumulus_df["Creator"] = cumulus_df["Creator"].str.replace(
        r"(.+),\s+(.+)", r"\2 \1", regex=True
    )
    return cumulus_df
def generate_table_fields_str(context, table_desc: DataFrame): """ Upload a DataFrame to the Postgres server, creating the table if it doesn't exist :param context: execution context :param table_desc: pandas DataFrame containing details of the database table :return: panda DataFrame or None :rtype: panda.DataFrame """ # add fields from the table description create_columns = '' insert_columns = '' idx = 0 # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.itertuples.html#pandas.DataFrame.itertuples for row in table_desc.itertuples(index=False, name='FieldDef'): if row.save.lower() != 'y': # don't save this entry to database continue if idx > 0: create_columns += ', ' insert_columns += ', ' create_columns += f'{row.field} {row.datatype} ' insert_columns += f'{row.field} ' if row.primary_key.lower() == 'y': create_columns += 'PRIMARY KEY ' if row.not_null.lower() == 'y': create_columns += 'NOT NULL ' if row.default != '': # integer field default values may be real/str in table_desc dtype = row.datatype.lower() if dtype == 'integer' or dtype == 'smallint' or dtype == 'bigint' or dtype == 'serial': default = f'{int(row.default)}' else: default = f'{row.default}' create_columns += f'DEFAULT {default} ' idx += 1 yield Output(create_columns, 'create_columns') yield Output(insert_columns, 'insert_columns')
def transform_table_desc_df(context, table_desc: DataFrame) -> DataFrame:
    """
    Clean the database table-description DataFrame in place.

    Blanks out missing values, removes comment rows (those whose ``ignore``
    cell contains '#') and then drops the ``ignore`` column itself.

    :param context: execution context
    :param table_desc: panda DataFrame containing details of the Postgres database table
    :return: panda DataFrame containing details of the Postgres database table
    :rtype: panda.DataFrame
    """
    table_desc.fillna('', inplace=True)
    # rows whose 'ignore' cell contains '#' are comment lines
    comment_rows = table_desc.index[table_desc['ignore'].str.contains('#')]
    table_desc.drop(comment_rows, inplace=True)
    # the bookkeeping column itself is not part of the table description
    table_desc.drop(columns=['ignore'], inplace=True)
    return table_desc
def sum_solid(num_df: DataFrame) -> DataFrame:
    """Return a copy of *num_df* with a 'sum' column holding num1 + num2."""
    result = num_df.copy()
    result['sum'] = result['num1'].add(result['num2'])
    return result
def sum_sq_solid(sum_df: DataFrame) -> DataFrame:
    """Return a copy of *sum_df* with a 'sum_sq' column, the square of 'sum'."""
    squared = sum_df.copy()
    squared['sum_sq'] = sum_df['sum'].pow(2)
    return squared
def organize_columns_to_omeka(_, df: dp.DataFrame, smapshot: dp.DataFrame,
                              mapping: dp.DataFrame):
    """
    Shape the catalogue DataFrame into the column layout expected by Omeka.

    Filters to geolocated IMS items with complete source/media/date info,
    derives Dublin Core / schema.org columns, turns mapped labels into
    Wikidata URLs (English) and translated labels plus URLs (Portuguese),
    and flags items present in the Smapshot export.

    :param _: unused execution context
    :param df: catalogue DataFrame ("Source", "Latitude", "Materials", ...)
    :param smapshot: DataFrame with an "id" column of Smapshot item IDs
    :param mapping: label-to-Wikidata mapping with "Label:en", "Label:pt"
        and "Wiki ID" columns; NOTE: re-indexed in place below
    :return: DataFrame with Omeka-ready columns
    """
    def string2url(string):
        # Map a label to "http://wikidata.org/wiki/<QID> <label>", keeping a
        # fixed Stereoscopy suffix when the value carries one.
        if "||Stereoscopy" in string:
            string = string.split("||")[0]
            QID = mapping.loc[string, "Wiki ID"]
            return (f"http://wikidata.org/wiki/{QID} {string}" +
                    "||http://wikidata.org/wiki/Q35158 Stereoscopy")
        elif "||Estereoscopia" in string:
            string = string.split("||")[0]
            QID = mapping.loc[string, "Wiki ID"]
            return (f"http://wikidata.org/wiki/{QID} {string}" +
                    "||http://wikidata.org/wiki/Q35158 Estereoscopia")
        else:
            QID = mapping.loc[string, "Wiki ID"]
            return f"http://wikidata.org/wiki/{QID} {string}"

    def translateString(string):
        # English label -> Portuguese label via the mapping table.
        if "||Stereoscopy" in string:
            string = string.split("||")[0]
            kw = mapping.loc[string, "Label:pt"]
            return kw + "||Estereoscopia"
        else:
            kw = mapping.loc[string, "Label:pt"]
            return kw

    # filter items: IMS-sourced rows with coordinates, URLs and both years
    omeka_df = df.loc[(df["Source"] == "Instituto Moreira Salles")
                      & (df["Latitude"].notna() & df["Source URL"].notna()
                         & df["Media URL"].notna() & df["First Year"].notna()
                         & df["Last Year"].notna())]
    omeka_df = omeka_df.dropna(subset=["Collections"])
    # lookups below address the mapping by its English label
    mapping.set_index("Label:en", inplace=True)
    # years arrive as floats; render as plain integer strings
    omeka_df[["First Year", "Last Year"]] = omeka_df[[
        "First Year", "Last Year"
    ]].applymap(lambda x: str(int(x)), na_action="ignore")
    # create columns
    omeka_df.loc[omeka_df["First Year"].notna(),
                 "dcterms:available"] = (omeka_df["First Year"].astype(str) +
                                         "/" +
                                         omeka_df["Last Year"].astype(str))
    # duplicate source fields into language-tagged Dublin Core columns
    omeka_df["dcterms:format:en"] = omeka_df["Materials"]
    omeka_df["dcterms:format:pt"] = omeka_df["Materials"]
    omeka_df["dcterms:medium:en"] = omeka_df["Fabrication Method"]
    omeka_df["dcterms:medium:pt"] = omeka_df["Fabrication Method"]
    omeka_df["dcterms:type:en"] = omeka_df["Type"]
    omeka_df["dcterms:type:pt"] = omeka_df["Type"]
    # format data
    omeka_df["Source URL"] = omeka_df["Source URL"] + " " + omeka_df["Source"]
    omeka_df["Wikidata ID"] = ("www.wikidata.org/wiki/" +
                               omeka_df["Wikidata ID"] + " Wikidata")
    # items also present in Smapshot get an extra collection tag
    include = omeka_df["Source ID"].isin(smapshot["id"])
    omeka_df.loc[include, "Collections"] = omeka_df["Collections"] + "||Smapshot"
    # English columns: label -> Wikidata URL
    omeka_df[["dcterms:format:en", "dcterms:medium:en",
              "dcterms:type:en"]] = omeka_df[[
                  "dcterms:format:en", "dcterms:medium:en", "dcterms:type:en"
              ]].applymap(string2url, na_action="ignore")
    # Portuguese columns: first translate the labels...
    omeka_df[["dcterms:format:pt", "dcterms:medium:pt",
              "dcterms:type:pt"]] = omeka_df[[
                  "dcterms:format:pt", "dcterms:medium:pt", "dcterms:type:pt"
              ]].applymap(translateString, na_action="ignore")
    # ...then re-key the mapping by Portuguese label and build their URLs
    mapping = mapping.reset_index()
    mapping.set_index("Label:pt", inplace=True)
    omeka_df[["dcterms:format:pt", "dcterms:medium:pt",
              "dcterms:type:pt"]] = omeka_df[[
                  "dcterms:format:pt", "dcterms:medium:pt", "dcterms:type:pt"
              ]].applymap(string2url, na_action="ignore")
    # rename columns
    # NOTE(review): "Collection" is renamed to item_sets while the filtering
    # above uses "Collections" (plural) — confirm both columns exist and the
    # distinction is intentional.
    omeka_df = omeka_df.rename(
        columns={
            "Source ID": "dcterms:identifier",
            "Title": "dcterms:title",
            "Description (Portuguese)": "dcterms:description",
            "Creator": "dcterms:creator",
            "Date": "dcterms:date",
            "Width (mm)": "schema:width",
            "Height (mm)": "schema:height",
            "Rights": "dcterms:rights",
            "Attribution": "dcterms:bibliographicCitation",
            "Source URL": "dcterms:source",
            "Wikidata ID": "dcterms:hasVersion",
            "Depicts": "foaf:depicts",
            "Media URL": "media",
            "Latitude": "latitude",
            "Longitude": "longitude",
            "Collection": "item_sets",
        })
    # select columns
    omeka_df = omeka_df[[
        "dcterms:identifier",
        "dcterms:title",
        "dcterms:description",
        "dcterms:creator",
        "dcterms:date",
        "dcterms:available",
        "dcterms:type:en",
        "dcterms:type:pt",
        "dcterms:medium:pt",
        "dcterms:medium:en",
        "dcterms:format:pt",
        "dcterms:format:en",
        "dcterms:rights",
        "dcterms:bibliographicCitation",
        "dcterms:source",
        "dcterms:hasVersion",
        "latitude",
        "longitude",
        "foaf:depicts",
        "schema:width",
        "schema:height",
        "media",
        "item_sets",
    ]]
    return omeka_df
def organise_creator(_, quickstate: dp.DataFrame):
    """
    Map creator names to Wikidata QIDs and emit QuickStatements batch files.

    Replaces the "P170" (creator) column with QIDs from a hard-coded lookup,
    then serialises the frame into QuickStatements V1 command strings:
    rows that already have a ``qid`` become edit commands, rows without one
    become CREATE commands. Writes both to ``data/output/``.

    :param _: unused execution context
    :param quickstate: DataFrame of Wikidata statements (columns "qid",
        "P170", "P217", "date_accuracy", label/description columns, ...)
    :return: the processed DataFrame, indexed by "qid"
    """
    # Known IMS photographers -> Wikidata QIDs.
    creators = {
        "Augusto Malta": "Q16495239",
        "Anônimo": "Q4233718",
        "Marc Ferrez": "Q3180571",
        "Georges Leuzinger": "Q5877879",
        "José dos Santos Affonso": "Q63993961",
        "N. Viggiani": "Q65619909",
        "Archanjo Sobrinho": "Q64009665",
        "F. Basto": "Q55089601",
        "J. Faria de Azevedo": "Q97570600",
        "S. H. Holland": "Q65619918",
        "Augusto Monteiro": "Q65619921",
        "Jorge Kfuri": "Q63166336",
        "Camillo Vedani": "Q63109123",
        "Fritz Büsch": "Q63109492",
        "Armando Pittigliani": "Q19607834",
        "Braz": "Q97487621",
        "Stahl & Wahnschaffe": "Q63109157",
        "Gomes Junior": "Q86942676",
        "A. Ruelle": "Q97570551",
        "Guilherme Santos": "Q55088608",
        "Albert Frisch": "Q21288396",
        "José Baptista Barreira Vianna": "Q63166517",
        "Alfredo Krausz": "Q63166405",
        "Therezio Mascarenhas": "Q97570728",
        "Torres": "Q65619905",
        "Theodor Preising": "Q63109140",
        "Augusto Stahl": "Q4821327",
        "Luiz Musso": "Q89538832",
        "Carlos Bippus": "Q63109147",
        "Thiele": "Q64825643",
        "Revert Henrique Klumb": "Q3791061",
        "Juan Gutierrez": "Q10312614",
        "F. Manzière": "Q65619915",
        "Antonio Luiz Ferreira": "Q97570558",
        "Etienne Farnier": "Q97570575",
        "José Francisco Corrêa": "Q10309433",
        "Chapelin": "Q97570376",
        "J. Teixeira": "Q89642578",
        "F. Garcia": "Q97570588",
        "A. de Barros Lobo": "Q97570363",
        "Bloch": "Q61041099",
    }

    def name2qid(name):
        """
        Takes a string and returns the corresponding Wikidata QID
        """
        # Unknown creators fall back to an empty string (falsy -> skipped
        # later by the statement builder).
        try:
            qid = creators[f"{name}"]
        except KeyError:
            qid = ""
        return qid

    quickstate["P170"] = quickstate["P170"].apply(name2qid)
    quickstate = quickstate.drop(columns="date_accuracy")
    quickstate.name = "import_wikidata"

    def df2quickstatements(df):
        # Serialise each row into pipe-delimited QuickStatements commands.
        create_str = ""
        edit_str = ""
        # properties whose values must be quoted strings
        str_props = ["Lpt-br", "Dpt-br", "Den", "P217", "P7835"]
        # label/description properties get no source reference appended
        no_ref_props = ["Lpt-br", "Dpt-br", "Den"]
        for _, row in df.iterrows():
            row = dict(row)
            props = []
            if row["qid"]:
                # existing item -> edit statements addressed by QID
                for key in row.keys():
                    if row[key]:
                        if key in str_props:
                            row[key] = '"{0}"'.format(row[key])
                        prop_str = "|".join([
                            str(row["qid"]),
                            # P31_a is a second instance-of value; emit as P31
                            str(key).replace("P31_a", "P31"),
                            str(row[key]),
                        ])
                        if key == "P217":
                            # inventory number is qualified by the collection
                            prop_str += "|P195|Q71989864"
                        if key == "P195":
                            prop_str += "|P217|" + '"{0}"'.format(row["P217"])
                        if key not in no_ref_props:
                            # reference: stated in (S248) + retrieved (S813)
                            prop_str += "|S248|Q64995339|S813|+{0}Z/11".format(
                                dt.now().strftime("%Y-%m-%dT00:00:00"))
                        props.append(prop_str)
                item_str = "||".join(props)
                if not edit_str:
                    edit_str += item_str
                else:
                    edit_str += "||" + item_str
            else:
                # new item -> CREATE followed by LAST-addressed statements
                props.append("CREATE")
                for key in row.keys():
                    if row[key]:
                        if key in str_props:
                            row[key] = '"{0}"'.format(row[key])
                        prop_str = "|".join([
                            "LAST",
                            str(key).replace("P31_a", "P31"),
                            str(row[key]),
                        ])
                        if key == "P217":
                            prop_str += "|P195|Q71989864"
                        if key == "P195":
                            prop_str += "|P217|" + '"{0}"'.format(row["P217"])
                        if key not in no_ref_props:
                            prop_str += "|S248|Q64995339|S813|+{0}Z/11".format(
                                dt.now().strftime("%Y-%m-%dT00:00:00"))
                        props.append(prop_str)
                item_str = "||".join(props)
                if not create_str:
                    create_str += item_str
                else:
                    create_str += "||" + item_str
        return {"create": create_str, "edit": edit_str}

    quickstate.fillna("", inplace=True)
    # NOTE(review): df2quickstatements is run twice (once per file), doubling
    # the work — a single call with both results reused would suffice.
    with open("data/output/quickstatements_create.txt", "w+") as f:
        f.write(df2quickstatements(quickstate)["create"])
    with open("data/output/quickstatements_edit.txt", "w+") as f:
        f.write(df2quickstatements(quickstate)["edit"])
    return quickstate.set_index("qid")
def make_df_to_wikidata(_, df: dp.DataFrame, mapping: dp.DataFrame):
    """
    Build a Wikidata QuickStatements-style DataFrame from the catalogue.

    Filters to complete IMS records, then fills one column per Wikidata
    property (P571 inception with precision qualifiers, P1259 point of view
    with altitude/heading/tilt, P2049/P2048 dimensions, etc.), giving
    duplicate titles per creator a disambiguating inventory-number suffix.

    :param _: unused execution context
    :param df: catalogue DataFrame ("Source", "Latitude", "datetime",
        "date_accuracy", "Width (mm)", ...)
    :param mapping: label-to-QID mapping with "Label:en" and "Wiki ID"
        columns; NOTE: re-indexed in place below
    :return: DataFrame with one column per Wikidata property
    """
    def string2qid(string):
        # English label -> Wikidata QID via the mapping table.
        QID = mapping.loc[string, "Wiki ID"]
        return QID

    # filter items
    # NOTE(review): the last term is `df["Height (mm)"]` without `.notna()`,
    # unlike every other term — confirm whether truthiness of the raw values
    # is intended or `.notna()` was meant.
    df = df.loc[(df["Source"] == "Instituto Moreira Salles")
                & df["Latitude"].notna()
                & df["Source URL"].notna()
                & df["Media URL"].notna()
                & df["First Year"].notna()
                & df["Last Year"].notna()
                & df["Width (mm)"].notna()
                & df["Height (mm)"]]
    df = df.dropna(subset=["Collections"])
    # years arrive as floats; render as plain integer strings
    df[["First Year", "Last Year"]] = df[["First Year",
                                          "Last Year"]].applymap(
                                              lambda x: str(int(x)),
                                              na_action="ignore")
    mapping.set_index("Label:en", inplace=True)
    df["First Year"] = pd.to_datetime(df["First Year"])
    df["Last Year"] = pd.to_datetime(df["Last Year"])
    # split a trailing "||Stereoscopy"-style marker into its own column
    df[["Type", "Type_"]] = df["Type"].str.rsplit("||", n=1, expand=True)

    quickstate = pd.DataFrame(columns=[
        "qid",
        "P31",
        "P31_a",
        "Lpt-br",
        "Dpt-br",
        "Den",
        "P571",
        "qal1319",
        "qal1326",
        "P17",
        "P1259",
        "qal2044",
        "qal7787",
        "qal8208",
        "P170",
        "P186",
        "P195",
        "P217",
        "P2079",
        "P4036",
        "P2049",
        "P2048",
        "P7835",
    ])

    # date_accuracy decides the Wikidata date precision suffix (/8../11)
    quickstate["date_accuracy"] = df["date_accuracy"]
    circa = quickstate["date_accuracy"] == "circa"
    year = quickstate["date_accuracy"] == "year"
    month = quickstate["date_accuracy"] == "month"
    day = quickstate["date_accuracy"] == "day"
    quickstate["P571"] = df["datetime"].apply(dt.isoformat)
    quickstate.loc[circa, "P571"] = "+" + quickstate["P571"] + "Z/8"
    quickstate.loc[year, "P571"] = "+" + quickstate["P571"] + "Z/9"
    quickstate.loc[month, "P571"] = "+" + quickstate["P571"] + "Z/10"
    quickstate.loc[day, "P571"] = "+" + quickstate["P571"] + "Z/11"
    # earliest date
    # quickstate["qal1319"] = df["First Year"].apply(dt.isoformat) + "Z/9"
    # append start (P580) / end (P582) qualifiers from the year bounds
    quickstate["P571"] = (quickstate["P571"] + "|P580|+" +
                          df["First Year"].apply(dt.isoformat) + "Z/9"
                          "|P582|+" + df["Last Year"].apply(dt.isoformat) +
                          "Z/9")
    # latest date
    # quickstate["qal1326"] = df["Last Year"].apply(dt.isoformat) + "Z/9"
    # pt-br label
    quickstate["Lpt-br"] = df["Title"]
    # creator
    quickstate["P170"] = df["Creator"]
    # description
    # pt-br
    quickstate["Dpt-br"] = "Fotografia de " + df["Creator"]
    # en
    quickstate["Den"] = np.where(
        df["Creator"] != "Anônimo",
        "Photograph by " + df["Creator"],
        "Photograph by Unknown",
    )
    # inventory number
    quickstate["P217"] = df["Source ID"]
    # disambiguate duplicate titles per creator by appending the inventory
    # number to both descriptions
    list_creator = list(quickstate["P170"].unique())
    for author in list_creator:
        df_creator = quickstate.loc[quickstate["P170"] == author]
        duplicate = df_creator.duplicated(subset=["Lpt-br"], keep=False)
        df_creator.loc[duplicate, "Dpt-br"] = (
            "Fotografia de " + df_creator.loc[duplicate, "P170"] + " (" +
            df_creator.loc[duplicate, "P217"] + ")")
        df_creator.loc[duplicate, "Den"] = np.where(
            df_creator.loc[duplicate, "P170"] != "Anônimo",
            "Photograph by " + df_creator.loc[duplicate, "P170"] + " (" +
            df_creator.loc[duplicate, "P217"] + ")",
            "Photograph by Unknown" + " (" +
            df_creator.loc[duplicate, "P217"] + ")",
        )
        quickstate.loc[quickstate["P170"] == author,
                       ["Dpt-br", "Den"]] = df_creator[["Dpt-br", "Den"]]
    # Instance of
    quickstate["P31"] = "Q125191"
    quickstate["P31_a"] = df["Type_"].map({"Stereoscopy": "Q35158"})
    # country
    quickstate["P17"] = "Q155"
    # coordinate of POV, with altitude/heading/tilt qualifiers inline
    quickstate["P1259"] = (("@" + df["Latitude"].astype(str) + "/" +
                            df["Longitude"].astype(str)) + "|P2044|" +
                           df["altitude"].astype(str) + "U11573" + "|P7787|" +
                           df["heading"].astype(str) + "U28390" + "|P8208|" +
                           df["tilt"].astype(str) + "U28390")
    # altitude
    # quickstate["qal2044"] = df["altitude"].astype(str) + "P11573"
    # heading
    # quickstate["qal7787"] = df["heading"].astype(str) + "P28390"
    # tilt
    # quickstate["qal8208"] = df["tilt"].astype(str) + "P28390"
    # made from material
    quickstate["P186"] = df["Materials"]
    # collection
    quickstate["P195"] = "Q71989864"
    # fabrication method
    quickstate["P2079"] = df["Fabrication Method"]
    # field of view
    quickstate["P4036"] = df["fov"].astype(str) + "U28390"
    # width
    quickstate["P2049"] = df["Width (mm)"].astype(str) + "U174789"
    # height
    quickstate["P2048"] = df["Height (mm)"].astype(str) + "U174789"
    # IMS ID: first digit run in the source URL
    quickstate["P7835"] = df["Source URL"].str.extract(r"(\d+)").astype(int)
    # qid
    quickstate["qid"] = df["Wikidata ID"]
    # Copyright status
    # quickstate["P6216"]
    # format data P186 and P2079
    quickstate[["P186", "P2079"]] = quickstate[["P186", "P2079"]].applymap(
        string2qid, na_action="ignore")
    return quickstate
def sum_sq_solid(_, sum_df: DataFrame, mult_df: DataFrame) -> DataFrame:
    """Return a copy of *sum_df* with 'sum_sq' (sum squared) and
    'sum_mult_sq' (sum times mult) columns added."""
    combined = sum_df.copy()
    combined['sum_sq'] = sum_df['sum'].pow(2)
    combined['sum_mult_sq'] = sum_df['sum'].mul(mult_df['mult'])
    return combined
def mult_solid(_, num_df: DataFrame) -> DataFrame:
    """Return a copy of *num_df* with a 'mult' column holding num1 * num2."""
    product = num_df.copy()
    product['mult'] = num_df['num1'].mul(num_df['num2'])
    return product
def transform_ex_rates_per_usd(context, ex_rates_per_usd: DataFrame,
                               currency_eq_usd_df: DataFrame,
                               currency_codes_df: DataFrame,
                               cur_config: Dict):
    """
    Fill currencies missing a USD rate from the IMF per-USD rates table.

    For every required currency code absent from ``currency_eq_usd_df``,
    looks up its country's row in the IMF 'National Currency per U.S. Dollar'
    table (augmented with configured supplementary rates) and fills values by
    month, then quarter, then year.

    :param context: execution context
    :param ex_rates_per_usd: DataFrame from an 'IMF National Currency per U.S. Dollar,
                            period average' file
    :param currency_eq_usd_df: panda Dataframe of currency to USD rates
    :param currency_codes_df: IS0 4217 currency codes DataFrame
    :param cur_config: currency configuration
    :return: ``currency_eq_usd_df`` with the missing currency columns filled
    """
    cfg = cur_config['value']
    date_col_name = cfg['date_col_name']
    supplementary_currency_rates = cfg['supplementary_currency_rates']
    currency_codes_cfg = cfg['currency_codes']
    country_attrib = currency_codes_cfg['country_attrib']
    currency_code_attrib = currency_codes_cfg['currency_code_attrib']

    context.log.info(f'Generating list of currencies missing USD rates')
    # make list of missing currencies and add columns to the currency equivalent usd's dataframe
    missing_currencies = []
    for code in cfg['currencies_required']:
        if code not in currency_eq_usd_df.columns:
            currency_eq_usd_df[code] = np.nan
            missing_currencies.append({currency_code_attrib: code})

    # add temp columns with values to match ex_rates_per_usd column
    # (IMF columns are headed e.g. '2020', '2020M01', '2020Q1')
    currency_eq_usd_df['year'] = currency_eq_usd_df[date_col_name].apply(
        lambda x: x.strftime('%Y'))
    currency_eq_usd_df['year_mth'] = currency_eq_usd_df[date_col_name].apply(
        lambda x: x.strftime('%YM%m'))
    currency_eq_usd_df['year_qtr'] = currency_eq_usd_df[date_col_name].apply(
        lambda x: x.strftime('%Y') + 'Q' + str(int((x.month / 3) + 1)))
    # lookup order: most specific period first
    temp_period_columns = ['year_mth', 'year_qtr', 'year']

    context.log.info(f'Loading supplementary currency information')
    # add supplementary currency info to exchange rate per usd
    cidx_ex_rates_per_usd = ex_rates_per_usd.set_index(
        ex_rates_per_usd['Country'].str.lower())  # country name as index
    for code in supplementary_currency_rates.keys():
        suplm_currency = supplementary_currency_rates[code]
        for suplm_time in suplm_currency.keys():
            suplm_currency_value = suplm_currency[suplm_time]
            if suplm_time not in cidx_ex_rates_per_usd.columns:
                # '...' is the IMF file's own no-data marker
                cidx_ex_rates_per_usd[suplm_time] = '...'
            country = currency_codes_df[
                currency_codes_df[currency_code_attrib] == code]
            if len(country) > 0:
                country = country.reset_index(drop=True)
                country_name = country.at[0, country_attrib].lower()
                if country_name not in cidx_ex_rates_per_usd.index:
                    # add new country and set index (as append resets previous set)
                    # NOTE(review): DataFrame.append was removed in pandas 2.0;
                    # replace with pd.concat when upgrading.
                    cidx_ex_rates_per_usd = cidx_ex_rates_per_usd.append(
                        {'Country': country_name}, ignore_index=True)
                    cidx_ex_rates_per_usd = cidx_ex_rates_per_usd. \
                        set_index(cidx_ex_rates_per_usd['Country'].str.lower())
                cidx_ex_rates_per_usd.at[country_name,
                                         suplm_time] = suplm_currency_value

    context.log.info(f'Updating list of currencies with missing USD rates')
    for missing in missing_currencies:
        currency_code = missing[currency_code_attrib]
        currency = currency_codes_df[
            currency_codes_df[currency_code_attrib] == currency_code]
        if len(currency) > 0:
            currency = currency.reset_index(drop=True)
            country_name = currency.at[0, country_attrib].lower()
            # reconcile ISO country names with the IMF table's naming
            for alias in currency_codes_cfg['currency_name_aliases']:
                alias_lower = [x.lower() for x in alias]
                if country_name in alias_lower:
                    idx = alias_lower.index(country_name)
                    country_name = alias_lower[
                        (idx + 1) % 2]  # 2 entries in list, get the one its not
            ex_rate_country = cidx_ex_rates_per_usd.loc[
                country_name]  # series of country ex rates

            # set currency values
            def get_time_col_value(col):
                # rate for one period column; non-numeric cells (e.g. '...')
                # count as missing
                value = np.nan
                if col in ex_rate_country.index:
                    value = ex_rate_country.at[col]
                    if not isinstance(value, float) and not isinstance(
                            value, int):
                        value = np.nan
                return value

            not_filled_mask = None
            for time_col in temp_period_columns:
                # set values to value from time column
                if not_filled_mask is None:
                    currency_eq_usd_df[currency_code] = currency_eq_usd_df[
                        time_col].apply(get_time_col_value)
                else:
                    # NOTE(review): `== np.nan` is always False (NaN != NaN),
                    # so this .loc never selects rows and the quarter/year
                    # fallbacks never fill anything — `.isna()` (as used for
                    # not_filled_mask below) looks like the intent.
                    currency_eq_usd_df.loc[currency_eq_usd_df[currency_code] == np.nan, currency_code] = \
                        currency_eq_usd_df[time_col].apply(get_time_col_value)
                not_filled_mask = currency_eq_usd_df[currency_code].isna()
                if not not_filled_mask.any():
                    break
    # remove the helper period columns before returning
    currency_eq_usd_df.drop(temp_period_columns, axis=1, inplace=True)
    return currency_eq_usd_df
def transform_imf_currency_tsv(context, currency_df: DataFrame,
                               cur_config: Dict):
    """
    Transform an IMF SDR per currency DataFrame.

    Cleans whitespace, densifies the date range (one row per day between the
    configured start and end dates), forward/backward-fills rate gaps,
    converts rates to float and renames columns to bare currency codes.

    :param context: execution context
    :param currency_df: DataFrame to process
    :param cur_config: currency configuration
    :yield: Output 'currency_df' — the cleaned DataFrame, and
        Output 'currency_names' — dict of currency code -> currency name
    """
    cfg = cur_config['value']
    date_col_name = cfg['date_col_name']

    # clear whitespace
    currency_df.rename(columns=str.strip,
                       inplace=True)  # remove any whitespace in column names
    for column in currency_df.columns:
        currency_df[column] = currency_df[column].str.strip()

    # make sure no dates are missing: walk day by day through the configured
    # range and append a (rate-less) row for any absent date
    currency_date = datetime.strptime(cfg['currency_start_date'],
                                      cfg['to_datetime'])
    currency_end_date = datetime.strptime(cfg['currency_end_date'],
                                          cfg['to_datetime'])
    delta = timedelta(days=1)
    while currency_date <= currency_end_date:
        date_text = currency_date.strftime(cfg['to_datetime'])
        if date_text not in currency_df[date_col_name].values:
            # NOTE(review): DataFrame.append was removed in pandas 2.0;
            # replace with pd.concat when upgrading.
            currency_df = currency_df.append({date_col_name: date_text},
                                             ignore_index=True)
        currency_date += delta

    # drop non-data rows in data column (anything not matching date_pattern)
    currency_df[[date_col_name]] = currency_df[[date_col_name
                                                ]].fillna(value='')
    currency_df = currency_df[currency_df[date_col_name].str.contains(
        cfg['date_pattern'])]

    # convert dates and sort
    currency_df[date_col_name] = pd.to_datetime(currency_df[date_col_name],
                                                format=cfg['to_datetime'])
    currency_df = currency_df.sort_values(by=[date_col_name])

    # fill gaps with previous value
    # NOTE(review): fillna(method=...) is deprecated in pandas 2.x;
    # use .ffill()/.bfill() when upgrading.
    for column in currency_df.columns:
        if column != date_col_name:
            currency_df[column] = currency_df[column].fillna(method='ffill')
    # and if the gap is on the first line, the next valid value
    for column in currency_df.columns:
        if column != date_col_name:
            currency_df[column] = currency_df[column].fillna(method='bfill')
    # convert floats
    for column in currency_df.columns:
        if column != date_col_name:
            currency_df[column] = currency_df[column].astype(float)

    # rename columns to currency code; the pattern is expected to capture
    # (name, code) as groups 1 and 2
    columns = []
    currency_names = {}
    regex = re.compile(cfg['currency_name_pattern'])
    for column in currency_df.columns:
        match = regex.search(column)
        if match:
            currency_names[match.group(2)] = match.group(1)
            columns.append(match.group(2))
        else:
            columns.append(column)
    currency_df.columns = columns

    yield Output(currency_df, 'currency_df')
    yield Output(currency_names, 'currency_names')