Exemplo n.º 1
0
def write_metadata(context, metadata: dp.DataFrame, to_tag):
    """
    Write available metadata, including GPS tags,
    to high-res JPEGs

    :param context: execution context; ``context.solid_config`` is passed
        to ExifTool as the path of the exiftool executable
    :param metadata: DataFrame of item metadata; must contain a
        "Source ID" column plus the fields read below (Date, Creator,
        Title, Depicts, Latitude, ...). Mutated in place (fillna,
        re-index) — callers should not rely on its previous shape.
    :param to_tag: iterable of file paths to embed metadata into
    :return: the same paths, unchanged, for the downstream upload step
    """
    # Blank out missing values so the exiftool params below never
    # interpolate the string "nan"
    metadata.fillna(value="", inplace=True)
    # IDs are upper-cased so lookups by upper-cased file basename match
    metadata["Source ID"] = metadata["Source ID"].str.upper()
    metadata.set_index("Source ID", inplace=True)

    for item in tqdm(to_tag, "Embedding metadata in files..."):
        if item.endswith(".jpg"):
            # file basename (without extension) is the metadata index key
            basename = os.path.split(item)[1]
            name = basename.split(".")[0]
            date = metadata.loc[name.upper(), "Date"]
            byline = metadata.loc[name.upper(), "Creator"]
            headline = metadata.loc[name.upper(), "Title"]
            caption = metadata.loc[name.upper(), "Description (Portuguese)"]
            objecttype = metadata.loc[name.upper(), "Type"]
            # dimensions = f'{metadata.loc[name.upper(), "image_width"]}cm x {metadata.loc[name.upper(), "image_height"]}cm'
            keywords = metadata.loc[name.upper(), "Depicts"].split("||")
            latitude = metadata.loc[name.upper(), "Latitude"]
            longitude = metadata.loc[name.upper(), "Longitude"]
            # altitude = metadata.loc[name.upper(), "Altitude"]
            # imgdirection = metadata.loc[name.upper(), "heading"]

            # exiftool command-line arguments: fixed IPTC/GPS values first,
            # then the per-item fields looked up above
            params = [
                "-IPTC:Source=Instituto Moreira Salles/IMS",
                "-IPTC:CopyrightNotice=This image is in the Public Domain.",
                "-IPTC:City=Rio de Janeiro",
                "-IPTC:Province-State=RJ",
                "-IPTC:Country-PrimaryLocationName=Brasil",
                "-GPSLatitudeRef=S",
                "-GPSLongitudeRef=W",
                "-GPSAltitudeRef=0",
                "-GPSImgDirectionRef=T",
                f"-IPTC:DateCreated={date}",
                f"-IPTC:By-line={byline}",
                f"-IPTC:ObjectName={name}",
                f"-IPTC:Headline={headline}",
                f"-IPTC:Caption-Abstract={caption}",
                f"-IPTC:ObjectTypeReference={objecttype}",
                # f"-IPTC:Dimensions={dimensions}",
                # NOTE(review): `keywords` is a Python list, so this embeds
                # its repr (e.g. "['a', 'b']") as the keyword value —
                # confirm this is the intended exiftool format
                f"-IPTC:Keywords={keywords}",
                f"-GPSLatitude={latitude}",
                f"-GPSLongitude={longitude}",
                # f"-GPSAltitude={altitude}",
                # f"-GPSImgDirection={imgdirection}",
            ]
            # NOTE(review): execute() is called once per tag, so exiftool
            # processes the file once per parameter rather than applying
            # all tags in a single pass
            with ExifTool(executable_=context.solid_config) as et:
                for param in params:
                    param = param.encode(encoding="utf-8")
                    dest = item.encode(encoding="utf-8")
                    et.execute(param, dest)
    to_upload = to_tag
    return to_upload
def transform_plot_data(context, df: DataFrame, plot_config: Dict) -> Dict:
    """
    Perform any necessary transformations on the plot data
    :param context: execution context
    :param df: pandas DataFrame of plot data
    :param plot_config: dict of plot configurations
    :return: dict of pandas DataFrame of plot data
    :raises NotImplementedError: if the 'all' plots shortcut is requested
    """
    plot_info = {}
    if 'all' in plot_config:
        raise NotImplementedError(
            'All plots functionality is not yet fully supported')
    plot_details = plot_config

    for plot_key, plot_cfg in plot_details.items():
        # Work on a copy so each plot gets its own headers and dtypes.
        # Previously every plot_info entry referenced (and re-mutated)
        # the same DataFrame, so the last plot's header/datetime
        # conversion silently clobbered all earlier entries.
        plot_df = df.copy()
        plot_df.columns = plot_cfg['header']  # add the column names

        for column in plot_cfg['header']:
            column_config = plot_cfg.get(column)
            if column_config and 'to_datetime' in column_config:
                # convert to a date time using the configured format
                plot_df[column] = pd.to_datetime(
                    plot_df[column], format=column_config['to_datetime'])

        plot_info[plot_key] = {'df': plot_df, 'config': plot_cfg}

    return plot_info
def drop_unnamed_columns(df: DataFrame, inplace: bool = False) -> DataFrame:
    """
    Drop columns beginning with 'Unnamed' from a DataFrame
    :param df: DataFrame to remove columns from
    :param inplace: Remove inplace flag
    :return: Updated DataFrame
    """
    # collect the auto-generated 'Unnamed*' columns (typically produced
    # when reading CSVs with stray separators)
    unnamed_cols = [col for col in df.columns if col.startswith('Unnamed')]

    if inplace:
        df.drop(unnamed_cols, axis=1, inplace=True)
        return df
    return df.drop(unnamed_cols, axis=1, inplace=False)
Exemplo n.º 4
0
def generate_test_data():
    """
    Generate random data: a 100x4 DataFrame of integers in [0, 100).
    """
    columns = ['col_a', 'col_b', 'col_c', 'col_d']
    values = np.random.randint(0, 100, size=(100, 4))
    return DataFrame(pd.DataFrame(values, columns=columns))
Exemplo n.º 5
0
def organize_columns(context, df: dp.DataFrame):
    """
    Rename columns, remove file extension from identifiers,
    normalize creators names and drop duplicates

    :param context: execution context (unused)
    :param df: raw Cumulus export DataFrame with Portuguese column names
    :return: cleaned DataFrame with standardized English column names
    """
    # rename columns from the Portuguese export headers
    cumulus_df = df.rename(
        columns={
            "Record Name": "Source ID",
            "CÓDIGO DE IDENTIFICAÇÃO PRELIMINAR": "preliminary id",
            "TÍTULO": "Title",
            "RESUMO": "Description (Portuguese)",
            "AUTORIA": "Creator",
            "DATA": "Date",
            "DATA LIMITE INFERIOR": "First Year",
            "DATA LIMITE SUPERIOR": "Last Year",
            "DIMENSÃO": "dimensions",
            "PROCESSO FORMADOR DA IMAGEM": "Fabrication Method",
            "DESIGNAÇÃO GENÉRICA": "Materials",
            "FORMATO PADRÃO": "format",
        },
    )
    # select columns
    cumulus_df = cumulus_df[
        [
            "Source ID",
            "Title",
            "Description (Portuguese)",
            "Creator",
            "Date",
            "First Year",
            "Last Year",
            "Materials",
            "Fabrication Method",
            "format",
            "dimensions",
            "preliminary id",
        ]
    ]

    # remove file extension (keep everything before the first ".")
    cumulus_df["Source ID"] = cumulus_df["Source ID"].str.split(".", n=1, expand=True)[
        0
    ]

    # remove duplicates, keeping the most recent record
    cumulus_df = cumulus_df.drop_duplicates(subset="Source ID", keep="last")

    # reverse creator name: "Surname, Given" -> "Given Surname".
    # regex=True is required: pandas >= 2.0 defaults str.replace to a
    # literal match, which silently left names untouched.
    cumulus_df["Creator"] = cumulus_df["Creator"].str.replace(
        r"(.+),\s+(.+)", r"\2 \1", regex=True
    )

    return cumulus_df
Exemplo n.º 6
0
def generate_table_fields_str(context, table_desc: DataFrame):
    """
    Build SQL column fragments from a table-description DataFrame.

    Rows with save != 'y' are skipped. For each kept row a
    "<field> <datatype> [PRIMARY KEY] [NOT NULL] [DEFAULT <value>]"
    fragment is produced for CREATE TABLE, and "<field>" for INSERT.

    :param context: execution context
    :param table_desc: pandas DataFrame containing details of the database table
    :yield: Output 'create_columns' and Output 'insert_columns' strings
    """
    create_parts = []
    insert_parts = []
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.itertuples.html#pandas.DataFrame.itertuples
    for row in table_desc.itertuples(index=False, name='FieldDef'):
        if row.save.lower() != 'y':
            # don't save this entry to database
            continue

        col_def = f'{row.field} {row.datatype} '
        if row.primary_key.lower() == 'y':
            col_def += 'PRIMARY KEY '
        if row.not_null.lower() == 'y':
            col_def += 'NOT NULL '
        if row.default != '':
            # integer field default values may be real/str in table_desc
            if row.datatype.lower() in ('integer', 'smallint', 'bigint',
                                        'serial'):
                col_def += f'DEFAULT {int(row.default)} '
            else:
                col_def += f'DEFAULT {row.default} '

        create_parts.append(col_def)
        insert_parts.append(f'{row.field} ')

    yield Output(', '.join(create_parts), 'create_columns')
    yield Output(', '.join(insert_parts), 'insert_columns')
Exemplo n.º 7
0
def transform_table_desc_df(context, table_desc: DataFrame) -> DataFrame:
    """
    Clean a DataFrame describing a database table.

    Blanks out missing values, removes rows whose 'ignore' column
    contains '#' (comment lines), then drops the 'ignore' column.

    :param context: execution context
    :param table_desc: panda DataFrame containing details of the Postgres database table
    :return: panda DataFrame containing details of the Postgres database table
    :rtype: panda.DataFrame
    """
    table_desc.fillna('', inplace=True)

    # indices of comment rows: 'ignore' contains '#'
    comment_rows = table_desc.index[table_desc['ignore'].str.contains('#')]
    table_desc.drop(index=comment_rows, inplace=True)
    # the marker column itself is not table data
    table_desc.drop(columns=['ignore'], inplace=True)

    return table_desc
Exemplo n.º 8
0
def sum_solid(num_df: DataFrame) -> DataFrame:
    """Return a copy of num_df with a 'sum' column = num1 + num2."""
    return num_df.assign(sum=num_df['num1'] + num_df['num2'])
Exemplo n.º 9
0
def sum_sq_solid(sum_df: DataFrame) -> DataFrame:
    """Return a copy of sum_df with 'sum_sq' = sum squared."""
    result = sum_df.copy()
    result['sum_sq'] = result['sum'].pow(2)
    return result
Exemplo n.º 10
0
def organize_columns_to_omeka(_, df: dp.DataFrame, smapshot: dp.DataFrame,
                              mapping: dp.DataFrame):
    """
    Reshape the catalogue DataFrame into the layout expected by Omeka.

    Filters to Instituto Moreira Salles items that are georeferenced and
    have source/media URLs plus a year range, builds bilingual (en/pt)
    type/medium/format columns as Wikidata links, and renames columns to
    Dublin Core / schema.org terms.

    :param _: unused execution context
    :param df: catalogue DataFrame
    :param smapshot: DataFrame whose "id" column lists items in Smapshot
    :param mapping: DataFrame mapping "Label:en"/"Label:pt" to "Wiki ID";
        re-indexed in place below, so callers should not reuse it
    :return: DataFrame with Omeka-ready columns
    """
    def string2url(string):
        """Turn a label into 'http://wikidata.org/wiki/<QID> <label>',
        preserving a trailing Stereoscopy/Estereoscopia qualifier."""
        if "||Stereoscopy" in string:
            string = string.split("||")[0]
            QID = mapping.loc[string, "Wiki ID"]
            return (f"http://wikidata.org/wiki/{QID} {string}" +
                    "||http://wikidata.org/wiki/Q35158 Stereoscopy")
        elif "||Estereoscopia" in string:
            string = string.split("||")[0]
            QID = mapping.loc[string, "Wiki ID"]
            return (f"http://wikidata.org/wiki/{QID} {string}" +
                    "||http://wikidata.org/wiki/Q35158 Estereoscopia")
        else:
            QID = mapping.loc[string, "Wiki ID"]
            return f"http://wikidata.org/wiki/{QID} {string}"

    def translateString(string):
        """Translate an English label to Portuguese via the mapping's
        'Label:pt' column, preserving a stereoscopy qualifier."""
        if "||Stereoscopy" in string:
            string = string.split("||")[0]
            kw = mapping.loc[string, "Label:pt"]
            return kw + "||Estereoscopia"
        else:
            kw = mapping.loc[string, "Label:pt"]
            return kw

    # filter items: IMS source with coordinates, URLs and a year range
    omeka_df = df.loc[(df["Source"] == "Instituto Moreira Salles")
                      & (df["Latitude"].notna()
                         & df["Source URL"].notna()
                         & df["Media URL"].notna()
                         & df["First Year"].notna()
                         & df["Last Year"].notna())]
    omeka_df = omeka_df.dropna(subset=["Collections"])
    # key the mapping by English label for the lookups above
    mapping.set_index("Label:en", inplace=True)
    # years to integer strings (e.g. "1900"); NaNs are left untouched
    omeka_df[["First Year",
              "Last Year"]] = omeka_df[["First Year", "Last Year"
                                        ]].applymap(lambda x: str(int(x)),
                                                    na_action="ignore")

    # create columns
    # availability interval in "first/last" form
    omeka_df.loc[omeka_df["First Year"].notna(),
                 "dcterms:available"] = (omeka_df["First Year"].astype(str) +
                                         "/" +
                                         omeka_df["Last Year"].astype(str))
    # duplicate source fields into per-language columns, converted below
    omeka_df["dcterms:format:en"] = omeka_df["Materials"]
    omeka_df["dcterms:format:pt"] = omeka_df["Materials"]
    omeka_df["dcterms:medium:en"] = omeka_df["Fabrication Method"]
    omeka_df["dcterms:medium:pt"] = omeka_df["Fabrication Method"]
    omeka_df["dcterms:type:en"] = omeka_df["Type"]
    omeka_df["dcterms:type:pt"] = omeka_df["Type"]

    # format data
    omeka_df["Source URL"] = omeka_df["Source URL"] + " " + omeka_df["Source"]
    omeka_df["Wikidata ID"] = ("www.wikidata.org/wiki/" +
                               omeka_df["Wikidata ID"] + " Wikidata")
    # tag items that are also in Smapshot
    include = omeka_df["Source ID"].isin(smapshot["id"])
    omeka_df.loc[include,
                 "Collections"] = omeka_df["Collections"] + "||Smapshot"

    # English columns: label -> Wikidata URL
    omeka_df[["dcterms:format:en", "dcterms:medium:en",
              "dcterms:type:en"]] = omeka_df[[
                  "dcterms:format:en", "dcterms:medium:en", "dcterms:type:en"
              ]].applymap(string2url, na_action="ignore")
    # Portuguese columns: first translate the label...
    omeka_df[["dcterms:format:pt", "dcterms:medium:pt",
              "dcterms:type:pt"]] = omeka_df[[
                  "dcterms:format:pt", "dcterms:medium:pt", "dcterms:type:pt"
              ]].applymap(translateString, na_action="ignore")
    # ...then re-key the mapping by Portuguese label and build the URLs
    mapping = mapping.reset_index()
    mapping.set_index("Label:pt", inplace=True)
    omeka_df[["dcterms:format:pt", "dcterms:medium:pt",
              "dcterms:type:pt"]] = omeka_df[[
                  "dcterms:format:pt", "dcterms:medium:pt", "dcterms:type:pt"
              ]].applymap(string2url, na_action="ignore")

    # rename columns
    # NOTE(review): "Collections" (plural) is filtered/extended above, but
    # only "Collection" (singular) is renamed to item_sets — confirm both
    # columns exist upstream
    omeka_df = omeka_df.rename(
        columns={
            "Source ID": "dcterms:identifier",
            "Title": "dcterms:title",
            "Description (Portuguese)": "dcterms:description",
            "Creator": "dcterms:creator",
            "Date": "dcterms:date",
            "Width (mm)": "schema:width",
            "Height (mm)": "schema:height",
            "Rights": "dcterms:rights",
            "Attribution": "dcterms:bibliographicCitation",
            "Source URL": "dcterms:source",
            "Wikidata ID": "dcterms:hasVersion",
            "Depicts": "foaf:depicts",
            "Media URL": "media",
            "Latitude": "latitude",
            "Longitude": "longitude",
            "Collection": "item_sets",
        })

    # select columns
    omeka_df = omeka_df[[
        "dcterms:identifier",
        "dcterms:title",
        "dcterms:description",
        "dcterms:creator",
        "dcterms:date",
        "dcterms:available",
        "dcterms:type:en",
        "dcterms:type:pt",
        "dcterms:medium:pt",
        "dcterms:medium:en",
        "dcterms:format:pt",
        "dcterms:format:en",
        "dcterms:rights",
        "dcterms:bibliographicCitation",
        "dcterms:source",
        "dcterms:hasVersion",
        "latitude",
        "longitude",
        "foaf:depicts",
        "schema:width",
        "schema:height",
        "media",
        "item_sets",
    ]]

    return omeka_df
Exemplo n.º 11
0
def organise_creator(_, quickstate: dp.DataFrame):
    """
    Resolve creator names to Wikidata QIDs and emit QuickStatements files.

    Maps the "P170" (creator) column through a fixed name->QID table,
    serializes the frame into QuickStatements batch text and writes
    separate files for item creations and edits.

    :param _: unused execution context
    :param quickstate: DataFrame of Wikidata statements; must include a
        "qid" column (empty for items still to be created) and a
        "date_accuracy" column (dropped here)
    :return: the DataFrame indexed by "qid"
    """
    # known creators and their Wikidata QIDs; names not listed here are
    # mapped to an empty string by name2qid below
    creators = {
        "Augusto Malta": "Q16495239",
        "Anônimo": "Q4233718",
        "Marc Ferrez": "Q3180571",
        "Georges Leuzinger": "Q5877879",
        "José dos Santos Affonso": "Q63993961",
        "N. Viggiani": "Q65619909",
        "Archanjo Sobrinho": "Q64009665",
        "F. Basto": "Q55089601",
        "J. Faria de Azevedo": "Q97570600",
        "S. H. Holland": "Q65619918",
        "Augusto Monteiro": "Q65619921",
        "Jorge Kfuri": "Q63166336",
        "Camillo Vedani": "Q63109123",
        "Fritz Büsch": "Q63109492",
        "Armando Pittigliani": "Q19607834",
        "Braz": "Q97487621",
        "Stahl & Wahnschaffe": "Q63109157",
        "Gomes Junior": "Q86942676",
        "A. Ruelle": "Q97570551",
        "Guilherme Santos": "Q55088608",
        "Albert Frisch": "Q21288396",
        "José Baptista Barreira Vianna": "Q63166517",
        "Alfredo Krausz": "Q63166405",
        "Therezio Mascarenhas": "Q97570728",
        "Torres": "Q65619905",
        "Theodor Preising": "Q63109140",
        "Augusto Stahl": "Q4821327",
        "Luiz Musso": "Q89538832",
        "Carlos Bippus": "Q63109147",
        "Thiele": "Q64825643",
        "Revert Henrique Klumb": "Q3791061",
        "Juan Gutierrez": "Q10312614",
        "F. Manzière": "Q65619915",
        "Antonio Luiz Ferreira": "Q97570558",
        "Etienne Farnier": "Q97570575",
        "José Francisco Corrêa": "Q10309433",
        "Chapelin": "Q97570376",
        "J. Teixeira": "Q89642578",
        "F. Garcia": "Q97570588",
        "A. de Barros Lobo": "Q97570363",
        "Bloch": "Q61041099",
    }

    def name2qid(name):
        """
        Takes a string and returns the
        corresponding Wikidata QID
        """
        try:
            qid = creators[f"{name}"]
        except KeyError:
            # unknown creator: leave the field empty
            qid = ""
        return qid

    quickstate["P170"] = quickstate["P170"].apply(name2qid)
    quickstate = quickstate.drop(columns="date_accuracy")
    quickstate.name = "import_wikidata"

    def df2quickstatements(df):
        """Serialize *df* into QuickStatements batch text.

        Returns {"create": ..., "edit": ...}: rows with a "qid" become
        edit statements on that item; rows without become CREATE blocks
        whose statements use "LAST" as the subject.

        NOTE(review): the two branches below differ only in the subject
        column (qid vs "LAST") — candidates for a shared helper.
        """
        create_str = ""
        edit_str = ""
        # properties whose values must be wrapped in double quotes
        str_props = ["Lpt-br", "Dpt-br", "Den", "P217", "P7835"]
        # label/description props that carry no source reference
        no_ref_props = ["Lpt-br", "Dpt-br", "Den"]
        for _, row in df.iterrows():
            row = dict(row)
            props = []
            if row["qid"]:
                # existing item: edit statements keyed on its QID
                for key in row.keys():
                    if row[key]:
                        if key in str_props:
                            row[key] = '"{0}"'.format(row[key])
                        prop_str = "|".join([
                            str(row["qid"]),
                            str(key).replace("P31_a", "P31"),
                            str(row[key]),
                        ])
                        # inventory number (P217) and collection (P195)
                        # qualify each other
                        if key == "P217":
                            prop_str += "|P195|Q71989864"
                        if key == "P195":
                            prop_str += "|P217|" + '"{0}"'.format(row["P217"])
                        # append source (S248) and retrieval date (S813)
                        if key not in no_ref_props:
                            prop_str += "|S248|Q64995339|S813|+{0}Z/11".format(
                                dt.now().strftime("%Y-%m-%dT00:00:00"))
                        props.append(prop_str)
                item_str = "||".join(props)
                if not edit_str:
                    edit_str += item_str
                else:
                    edit_str += "||" + item_str
            else:
                # new item: CREATE block with "LAST" as the subject
                props.append("CREATE")
                for key in row.keys():
                    if row[key]:
                        if key in str_props:
                            row[key] = '"{0}"'.format(row[key])
                        prop_str = "|".join([
                            "LAST",
                            str(key).replace("P31_a", "P31"),
                            str(row[key]),
                        ])
                        if key == "P217":
                            prop_str += "|P195|Q71989864"
                        if key == "P195":
                            prop_str += "|P217|" + '"{0}"'.format(row["P217"])
                        if key not in no_ref_props:
                            prop_str += "|S248|Q64995339|S813|+{0}Z/11".format(
                                dt.now().strftime("%Y-%m-%dT00:00:00"))
                        props.append(prop_str)
                item_str = "||".join(props)
                if not create_str:
                    create_str += item_str
                else:
                    create_str += "||" + item_str

        return {"create": create_str, "edit": edit_str}

    # empty strings (not NaN) so the truthiness checks above skip blanks
    quickstate.fillna("", inplace=True)

    with open("data/output/quickstatements_create.txt", "w+") as f:
        f.write(df2quickstatements(quickstate)["create"])

    with open("data/output/quickstatements_edit.txt", "w+") as f:
        f.write(df2quickstatements(quickstate)["edit"])

    return quickstate.set_index("qid")
Exemplo n.º 12
0
def make_df_to_wikidata(_, df: dp.DataFrame, mapping: dp.DataFrame):
    """
    Build a QuickStatements-shaped DataFrame of Wikidata properties.

    Filters the catalogue to complete IMS items and maps its columns onto
    Wikidata property columns (P571 inception, P170 creator, P1259 point
    of view, etc.), formatting values as QuickStatements expects.

    :param _: unused execution context
    :param df: catalogue DataFrame
    :param mapping: label-to-"Wiki ID" mapping DataFrame; re-indexed by
        "Label:en" in place below
    :return: DataFrame with one column per Wikidata property
    """
    def string2qid(string):
        """Look up the Wikidata QID for an English label."""
        QID = mapping.loc[string, "Wiki ID"]
        return QID

    # filter items
    # NOTE(review): the final clause tests the raw "Height (mm)" values
    # for truthiness instead of .notna() like its siblings — rows with
    # height 0 are dropped and NaN handling differs; confirm whether
    # df["Height (mm)"].notna() was intended
    df = df.loc[(df["Source"] == "Instituto Moreira Salles")
                & df["Latitude"].notna()
                & df["Source URL"].notna()
                & df["Media URL"].notna()
                & df["First Year"].notna()
                & df["Last Year"].notna()
                & df["Width (mm)"].notna()
                & df["Height (mm)"]]
    df = df.dropna(subset=["Collections"])
    # years to integer strings; NaNs left untouched
    df[["First Year",
        "Last Year"]] = df[["First Year",
                            "Last Year"]].applymap(lambda x: str(int(x)),
                                                   na_action="ignore")

    # key the mapping by English label for string2qid
    mapping.set_index("Label:en", inplace=True)

    df["First Year"] = pd.to_datetime(df["First Year"])
    df["Last Year"] = pd.to_datetime(df["Last Year"])

    # split a trailing "||<qualifier>" (e.g. Stereoscopy) into Type_
    df[["Type", "Type_"]] = df["Type"].str.rsplit("||", n=1, expand=True)

    # one column per Wikidata property / QuickStatements field
    quickstate = pd.DataFrame(columns=[
        "qid",
        "P31",
        "P31_a",
        "Lpt-br",
        "Dpt-br",
        "Den",
        "P571",
        "qal1319",
        "qal1326",
        "P17",
        "P1259",
        "qal2044",
        "qal7787",
        "qal8208",
        "P170",
        "P186",
        "P195",
        "P217",
        "P2079",
        "P4036",
        "P2049",
        "P2048",
        "P7835",
    ])

    # date_accuracy drives the Wikidata date precision suffix (Z/8..Z/11)
    quickstate["date_accuracy"] = df["date_accuracy"]
    circa = quickstate["date_accuracy"] == "circa"
    year = quickstate["date_accuracy"] == "year"
    month = quickstate["date_accuracy"] == "month"
    day = quickstate["date_accuracy"] == "day"

    quickstate["P571"] = df["datetime"].apply(dt.isoformat)
    quickstate.loc[circa, "P571"] = "+" + quickstate["P571"] + "Z/8"
    quickstate.loc[year, "P571"] = "+" + quickstate["P571"] + "Z/9"
    quickstate.loc[month, "P571"] = "+" + quickstate["P571"] + "Z/10"
    quickstate.loc[day, "P571"] = "+" + quickstate["P571"] + "Z/11"
    # earliest date
    # quickstate["qal1319"] = df["First Year"].apply(dt.isoformat) + "Z/9"
    # qualify inception with start (P580) / end (P582) year bounds
    quickstate["P571"] = (quickstate["P571"] + "|P580|+" +
                          df["First Year"].apply(dt.isoformat) + "Z/9"
                          "|P582|+" + df["Last Year"].apply(dt.isoformat) +
                          "Z/9")
    # latest date
    # quickstate["qal1326"] = df["Last Year"].apply(dt.isoformat) + "Z/9"
    # pt-br label
    quickstate["Lpt-br"] = df["Title"]
    # creator
    quickstate["P170"] = df["Creator"]
    # description
    # pt-br
    quickstate["Dpt-br"] = "Fotografia de " + df["Creator"]
    # en
    quickstate["Den"] = np.where(
        df["Creator"] != "Anônimo",
        "Photograph by " + df["Creator"],
        "Photograph by Unknown",
    )
    # inventory number
    quickstate["P217"] = df["Source ID"]

    list_creator = list(quickstate["P170"].unique())

    # disambiguate duplicate titles per creator by appending the
    # inventory number (P217) to both descriptions
    for author in list_creator:
        df_creator = quickstate.loc[quickstate["P170"] == author]
        duplicate = df_creator.duplicated(subset=["Lpt-br"], keep=False)
        df_creator.loc[duplicate,
                       "Dpt-br"] = ("Fotografia de " +
                                    df_creator.loc[duplicate, "P170"] + " (" +
                                    df_creator.loc[duplicate, "P217"] + ")")
        df_creator.loc[duplicate, "Den"] = np.where(
            df_creator.loc[duplicate, "P170"] != "Anônimo",
            "Photograph by " + df_creator.loc[duplicate, "P170"] + " (" +
            df_creator.loc[duplicate, "P217"] + ")",
            "Photograph by Unknown" + " (" +
            df_creator.loc[duplicate, "P217"] + ")",
        )
        quickstate.loc[quickstate["P170"] == author,
                       ["Dpt-br", "Den"]] = df_creator[["Dpt-br", "Den"]]

    # Instance of
    quickstate["P31"] = "Q125191"
    quickstate["P31_a"] = df["Type_"].map({"Stereoscopy": "Q35158"})
    # country
    quickstate["P17"] = "Q155"
    # coordinate of POV, with altitude/heading/tilt qualifiers and units
    quickstate["P1259"] = (("@" + df["Latitude"].astype(str) + "/" +
                            df["Longitude"].astype(str)) + "|P2044|" +
                           df["altitude"].astype(str) + "U11573" + "|P7787|" +
                           df["heading"].astype(str) + "U28390" + "|P8208|" +
                           df["tilt"].astype(str) + "U28390")
    # altitude
    # quickstate["qal2044"] = df["altitude"].astype(str) + "P11573"
    # heading
    # quickstate["qal7787"] = df["heading"].astype(str) + "P28390"
    # tilt
    # quickstate["qal8208"] = df["tilt"].astype(str) + "P28390"
    # made from material
    quickstate["P186"] = df["Materials"]
    # collection
    quickstate["P195"] = "Q71989864"
    # fabrication method
    quickstate["P2079"] = df["Fabrication Method"]
    # field of view
    quickstate["P4036"] = df["fov"].astype(str) + "U28390"
    # width
    quickstate["P2049"] = df["Width (mm)"].astype(str) + "U174789"
    # height
    quickstate["P2048"] = df["Height (mm)"].astype(str) + "U174789"
    # IMS ID
    quickstate["P7835"] = df["Source URL"].str.extract(r"(\d+)").astype(int)
    # qid
    quickstate["qid"] = df["Wikidata ID"]
    # Copyright status
    # quickstate["P6216"]

    # format data P186 and P2079: English labels -> QIDs
    quickstate[["P186",
                "P2079"]] = quickstate[["P186",
                                        "P2079"]].applymap(string2qid,
                                                           na_action="ignore")

    return quickstate
Exemplo n.º 13
0
def sum_sq_solid(_, sum_df: DataFrame, mult_df: DataFrame) -> DataFrame:
    """Return a copy of sum_df with 'sum_sq' (sum squared) and
    'sum_mult_sq' (sum * mult) columns."""
    out = sum_df.copy()
    out['sum_sq'] = out['sum'].pow(2)
    out['sum_mult_sq'] = sum_df['sum'].mul(mult_df['mult'])
    return out
Exemplo n.º 14
0
def mult_solid(_, num_df: DataFrame) -> DataFrame:
    """Return a copy of num_df with a 'mult' column = num1 * num2."""
    return num_df.assign(mult=num_df['num1'] * num_df['num2'])
Exemplo n.º 15
0
def transform_ex_rates_per_usd(context, ex_rates_per_usd: DataFrame,
                               currency_eq_usd_df: DataFrame,
                               currency_codes_df: DataFrame, cur_config: Dict):
    """
    Fill missing currency-to-USD rates from an IMF per-USD averages table.

    :param context: execution context
    :param ex_rates_per_usd: DataFrame from an 'IMF National Currency per U.S. Dollar, period average' file
    :param currency_eq_usd_df: panda Dataframe of currency to USD rates
    :param currency_codes_df: IS0 4217 currency codes DataFrame
    :param cur_config: currency configuration
    :return: currency_eq_usd_df with required missing currencies filled in
    """
    cfg = cur_config['value']
    date_col_name = cfg['date_col_name']
    supplementary_currency_rates = cfg['supplementary_currency_rates']
    currency_codes_cfg = cfg['currency_codes']
    country_attrib = currency_codes_cfg['country_attrib']
    currency_code_attrib = currency_codes_cfg['currency_code_attrib']

    context.log.info('Generating list of currencies missing USD rates')

    # make list of missing currencies and add empty columns for them
    missing_currencies = []
    for code in cfg['currencies_required']:
        if code not in currency_eq_usd_df.columns:
            currency_eq_usd_df[code] = np.nan
            missing_currencies.append({currency_code_attrib: code})

    # add temp columns whose values match the IMF table's period column
    # names (e.g. '2020M01', '2020Q1', '2020')
    currency_eq_usd_df['year'] = currency_eq_usd_df[date_col_name].apply(
        lambda x: x.strftime('%Y'))
    currency_eq_usd_df['year_mth'] = currency_eq_usd_df[date_col_name].apply(
        lambda x: x.strftime('%YM%m'))
    currency_eq_usd_df['year_qtr'] = currency_eq_usd_df[date_col_name].apply(
        lambda x: x.strftime('%Y') + 'Q' + str(int((x.month / 3) + 1)))
    temp_period_columns = ['year_mth', 'year_qtr', 'year']

    context.log.info('Loading supplementary currency information')

    # add supplementary currency info to exchange rate per usd
    cidx_ex_rates_per_usd = ex_rates_per_usd.set_index(
        ex_rates_per_usd['Country'].str.lower())  # country name as index
    for code, suplm_currency in supplementary_currency_rates.items():
        for suplm_time, suplm_currency_value in suplm_currency.items():
            if suplm_time not in cidx_ex_rates_per_usd.columns:
                cidx_ex_rates_per_usd[suplm_time] = '...'

            country = currency_codes_df[
                currency_codes_df[currency_code_attrib] == code]
            if len(country) > 0:
                country = country.reset_index(drop=True)
                country_name = country.at[0, country_attrib].lower()

                if country_name not in cidx_ex_rates_per_usd.index:
                    # add new country and re-set the index (concat resets
                    # the previous one); DataFrame.append was removed in
                    # pandas 2.0, so concat a one-row frame instead
                    cidx_ex_rates_per_usd = pd.concat(
                        [cidx_ex_rates_per_usd,
                         pd.DataFrame([{'Country': country_name}])],
                        ignore_index=True)
                    cidx_ex_rates_per_usd = cidx_ex_rates_per_usd.set_index(
                        cidx_ex_rates_per_usd['Country'].str.lower())

                cidx_ex_rates_per_usd.at[country_name,
                                         suplm_time] = suplm_currency_value

    context.log.info('Updating list of currencies with missing USD rates')

    for missing in missing_currencies:
        currency_code = missing[currency_code_attrib]
        currency = currency_codes_df[
            currency_codes_df[currency_code_attrib] == currency_code]
        if len(currency) > 0:
            currency = currency.reset_index(drop=True)
            country_name = currency.at[0, country_attrib].lower()

            # map known country-name aliases onto the IMF table's naming
            for alias in currency_codes_cfg['currency_name_aliases']:
                alias_lower = [x.lower() for x in alias]
                if country_name in alias_lower:
                    idx = alias_lower.index(country_name)
                    # 2 entries in list, get the one it's not
                    country_name = alias_lower[(idx + 1) % 2]

            # series of this country's exchange rates per period
            ex_rate_country = cidx_ex_rates_per_usd.loc[country_name]

            def get_time_col_value(col):
                """Rate for period *col*, or NaN if absent/non-numeric."""
                value = np.nan
                if col in ex_rate_country.index:
                    value = ex_rate_country.at[col]
                    if not isinstance(value, float) and not isinstance(
                            value, int):
                        value = np.nan
                return value

            # fill from the most specific period (month) down to year
            not_filled_mask = None
            for time_col in temp_period_columns:
                if not_filled_mask is None:
                    currency_eq_usd_df[currency_code] = currency_eq_usd_df[
                        time_col].apply(get_time_col_value)
                else:
                    # only fill rows still missing; the original compared
                    # the column to np.nan here, which never matches
                    # (NaN != NaN), so fallback periods never applied
                    currency_eq_usd_df.loc[not_filled_mask, currency_code] = \
                        currency_eq_usd_df[time_col].apply(get_time_col_value)

                not_filled_mask = currency_eq_usd_df[currency_code].isna()
                if not not_filled_mask.any():
                    break

    currency_eq_usd_df.drop(temp_period_columns, axis=1, inplace=True)

    return currency_eq_usd_df
Exemplo n.º 16
0
def transform_imf_currency_tsv(context, currency_df: DataFrame,
                               cur_config: Dict):
    """
    Transform an IMF SDR per currency DataFrame.

    Strips whitespace, fills in any missing dates over the configured
    range, drops non-data rows, forward/back-fills gaps, converts values
    to float and renames rate columns to their currency codes.

    :param context: execution context
    :param currency_df: DataFrame to process
    :param cur_config: currency configuration
    :yield: Output 'currency_df' (cleaned DataFrame) and
            Output 'currency_names' (dict of code -> currency name)
    """
    cfg = cur_config['value']
    date_col_name = cfg['date_col_name']

    # clear whitespace
    currency_df.rename(columns=str.strip,
                       inplace=True)  # remove any whitespace in column names
    for column in currency_df.columns:
        currency_df[column] = currency_df[column].str.strip()

    # make sure no dates are missing: collect absent days, then add them
    # in one concat (DataFrame.append was removed in pandas 2.0)
    currency_date = datetime.strptime(cfg['currency_start_date'],
                                      cfg['to_datetime'])
    currency_end_date = datetime.strptime(cfg['currency_end_date'],
                                          cfg['to_datetime'])
    delta = timedelta(days=1)
    existing_dates = set(currency_df[date_col_name].values)
    missing_rows = []
    while currency_date <= currency_end_date:
        date_text = currency_date.strftime(cfg['to_datetime'])
        if date_text not in existing_dates:
            missing_rows.append({date_col_name: date_text})
        currency_date += delta
    if missing_rows:
        currency_df = pd.concat([currency_df, pd.DataFrame(missing_rows)],
                                ignore_index=True)

    # drop non-data rows in data column
    currency_df[[date_col_name]] = currency_df[[date_col_name
                                                ]].fillna(value='')
    currency_df = currency_df[currency_df[date_col_name].str.contains(
        cfg['date_pattern'])]

    # convert dates and sort
    currency_df[date_col_name] = pd.to_datetime(currency_df[date_col_name],
                                                format=cfg['to_datetime'])
    currency_df = currency_df.sort_values(by=[date_col_name])

    # fill gaps with the previous value, and if the gap is on the first
    # line, with the next valid value (ffill/bfill replace the
    # deprecated fillna(method=...) form)
    for column in currency_df.columns:
        if column != date_col_name:
            currency_df[column] = currency_df[column].ffill().bfill()

    # convert floats
    for column in currency_df.columns:
        if column != date_col_name:
            currency_df[column] = currency_df[column].astype(float)

    # rename columns to currency code
    columns = []
    currency_names = {}
    regex = re.compile(cfg['currency_name_pattern'])
    for column in currency_df.columns:
        match = regex.search(column)
        if match:
            # configured pattern captures the currency name (group 1)
            # and its code (group 2) — presumed; verify against config
            currency_names[match.group(2)] = match.group(1)
            columns.append(match.group(2))
        else:
            columns.append(column)

    currency_df.columns = columns

    yield Output(currency_df, 'currency_df')
    yield Output(currency_names, 'currency_names')