示例#1
0
def build_dimension_df(pid_meta, ind_theme_id, next_dim_id):
    """Assemble the gis.Dimension rows for one indicator theme.

    A synthetic "Date" dimension is prepended ahead of the product's own
    dimensions (taken from pid_meta). ind_theme_id is the theme the rows
    belong to and next_dim_id is the next free DimensionId in the database.
    Returns a dataframe with columns ordered for the insert.
    """
    en_names = ["Date"] + pid_meta["dimension_names"]["en"]
    fr_names = ["Date"] + pid_meta["dimension_names"]["fr"]
    df_dims = pd.DataFrame({
        "Dimension_EN": en_names,
        "Dimension_FR": fr_names
    })

    df_dims["IndicatorThemeId"] = int(ind_theme_id)
    # 1-based display counter plus sequential ids starting at next_dim_id
    df_dims["DisplayOrder"] = h.create_id_series(df_dims, 1)
    df_dims["DimensionId"] = h.create_id_series(df_dims, next_dim_id)

    # Every dimension acts as a "Filter" except the final one, the "Value".
    df_dims["DimensionType"] = "Filter"
    df_dims.loc[df_dims.index[-1], "DimensionType"] = "Value"

    # column order expected by the database insert
    ordered_cols = [
        "DimensionId", "IndicatorThemeId", "Dimension_EN", "Dimension_FR",
        "DisplayOrder", "DimensionType"
    ]
    return df_dims.loc[:, ordered_cols]
示例#2
0
def build_date_dimension_values_df(file_dates, existing_dates, dim_id,
                                   next_dim_val_id, next_dim_val_order):
    """Build DimensionValues rows for the "Date" dimension.

    Keeps only the file reference dates (file_dates) that do not already
    exist in the database (existing_dates).

    Args:
        file_dates: df of reference dates from the file; requires columns
            "ReferencePeriod" and "REF_DATE". NOTE: mutated in place.
        existing_dates: df of date dimension values already in the DB;
            requires columns "Display_EN" and "DimensionId".
            NOTE: mutated in place.
        dim_id: DimensionId of the Dates dimension.
        next_dim_val_id: next DimensionValueId to populate in the table.
        next_dim_val_order: next ValueDisplayOrder to populate in the table.

    Returns:
        DataFrame of new date DimensionValues rows (empty DataFrame when
        every file date already exists in the DB).
    """
    # join ref_dates from file to those found in the DB (ensure join column is trimmed string)
    file_dates["ReferencePeriod"] = file_dates["ReferencePeriod"].astype(
        "str").str.strip()
    existing_dates["Display_EN"] = existing_dates["Display_EN"].astype(
        "str").str.strip()
    # format any oddball dates in "date" dimension so they match the dates in the created reference periods (yyyy-mm-dd)
    existing_dates["ReferencePeriod"] = ""
    if existing_dates.shape[0] > 0:
        existing_dates["ReferencePeriod"] = existing_dates.apply(
            lambda x: h.fix_ref_date(x["Display_EN"], "%Y-%m-%d"), axis=1)
    joined_ref_dates_df = pd.merge(file_dates,
                                   existing_dates,
                                   on="ReferencePeriod",
                                   how="left")
    # rows left with a null DimensionId after the left join are dates the DB
    # does not have yet
    new_ref_dates_df = joined_ref_dates_df[joined_ref_dates_df[
        'DimensionId'].isnull()].copy()  # keeps only new dates

    ret_df = pd.DataFrame()
    if new_ref_dates_df.shape[0] > 0:
        # if there are new reference dates, build remaining columns
        new_ref_dates_df["DimensionValueId"] = h.create_id_series(
            new_ref_dates_df, next_dim_val_id)
        new_ref_dates_df["DimensionId"] = dim_id
        new_ref_dates_df["Display_EN"] = new_ref_dates_df[
            "REF_DATE"]  # raw file date becomes the EN display text
        new_ref_dates_df["Display_FR"] = new_ref_dates_df[
            "Display_EN"]  # duplicate date to FR
        new_ref_dates_df["ValueDisplayOrder"] = h.create_id_series(
            new_ref_dates_df, next_dim_val_order)
        ret_df = build_dimension_values_df_subset(new_ref_dates_df)
    return ret_df
示例#3
0
def build_dimension_values_df(pid_meta, df_dims, next_dim_val_id):
    """Build the DimensionValues dataframe for a product.

    pid_meta supplies the dimension/member metadata, df_dims supplies the
    DimensionId for each dimension name, and next_dim_val_id is the next
    free DimensionValueId in the database.
    """
    dv = create_dimension_member_df(
        pid_meta["dimensions_and_members"])  # dim/member metadata as df
    # rename member columns to match the db schema
    dv = dv.rename(columns={
        "MemberNameEn": "Display_EN",
        "MemberNameFr": "Display_FR"
    })
    # geography members are not stored as dimension values
    dv = dv[dv["DimNameEn"].str.lower() != "geography"].copy()
    dv["DimensionValueId"] = h.create_id_series(dv, next_dim_val_id)
    # attach DimensionId by joining on the english dimension name
    dv = pd.merge(dv,
                  df_dims,
                  how="left",
                  left_on="DimNameEn",
                  right_on="Dimension_EN")
    # counter that restarts for each dimension id
    dv = dv.sort_values(by=["DimPosId", "MemberId"])
    dv["ValueDisplayOrder"] = dv.groupby("DimensionId").cumcount() + 1
    # zero-padded prefix required by the web app, e.g. "01. "
    dv["MemberPrefix"] = dv["ValueDisplayOrder"].astype(str).str.zfill(2) + ". "
    dv["Display_EN"] = dv["MemberPrefix"] + dv["Display_EN"]
    dv["Display_FR"] = dv["MemberPrefix"] + dv["Display_FR"]
    # unable to determine whether field is being used, set null for now
    dv["ValueDisplayParent"] = None

    # enforce db column lengths, then order columns for insert
    dv["Display_EN"] = dv["Display_EN"].astype("str").str[:255]
    dv["Display_FR"] = dv["Display_FR"].astype("str").str[:255]
    return build_dimension_values_df_subset(dv)
示例#4
0
def build_indicator_values_df(edf, gdf, ndf, next_id, prod_id,
                              mixed_geo_justice_pids, is_sibling):
    """Build the dataframe for IndicatorValues.

    Args:
        edf: dataframe of the english csv file; requires columns DGUID,
            IndicatorCode, STATUS, VALUE, RefYear, GeographicLevelId.
            NOTE: mutated in place (rows are dropped) for justice products.
        gdf: GeographyReference ids (column "GeographyReferenceId").
        ndf: NullReason ids (columns "Symbol", "NullReasonId").
        next_id: first IndicatorValueId to assign.
        prod_id: product id (used to detect special justice tables).
        mixed_geo_justice_pids: justice product ids with mixed geo levels
            that have special date handling.
        is_sibling: True when this is a sibling table of a mixed-geo master.

    Returns:
        DataFrame with columns IndicatorValueId, VALUE, NullReasonId,
        IndicatorValueCode, FormattedValue_EN, FormattedValue_FR.
    """
    # Justice products with mixed geos
    if int(prod_id) in mixed_geo_justice_pids:
        # remove rows < 2017 if geolevel is not in national, provincial, regional level
        edf.drop(
            edf[(edf["RefYear"].astype("int16") < 2017)
                & (~edf["GeographicLevelId"].isin(["A0000", "A0001", "A0002"])
                   )].index,
            inplace=True)
        # for sibling tables with mixed geos, remove these same geolevels b/c they already exist in the master
        if is_sibling:
            edf.drop(edf[edf["GeographicLevelId"].isin(
                ["A0000", "A0001", "A0002"])].index,
                     inplace=True)

    df_iv = edf.loc[:, ["DGUID", "IndicatorCode", "STATUS",
                        "VALUE"]]  # subset of full en dataset
    df_iv["IndicatorValueId"] = h.create_id_series(edf,
                                                   next_id)  # populate IDs
    df_iv = pd.merge(df_iv,
                     gdf,
                     left_on="DGUID",
                     right_on="GeographyReferenceId",
                     how="left")  # join to geoRef for id

    # rows whose DGUID has no GeographyReference row are discarded
    df_iv.dropna(subset=["GeographyReferenceId"],
                 inplace=True)  # drop empty ids
    df_iv.drop(["GeographyReferenceId"], axis=1, inplace=True)
    df_iv["IndicatorValueCode"] = df_iv["DGUID"] + "." + df_iv[
        "IndicatorCode"]  # combine DGUID and IndicatorCode
    df_iv.drop(["DGUID", "IndicatorCode"], axis=1, inplace=True)
    df_iv = pd.merge(df_iv,
                     ndf,
                     left_on="STATUS",
                     right_on="Symbol",
                     how="left")  # join to NullReasonId for Symbol
    df_iv.drop(["STATUS", "Symbol"], axis=1, inplace=True)

    # format for locale while preserving decimals from datapoints, restore original locale setting when done.
    if df_iv.shape[0] > 0:
        df_iv["Value_Dec"] = df_iv.apply(
            lambda x: h.format_number_preserve_decimals(x["VALUE"]),
            axis=1)  # temp column
        orig_locale = h.get_locale()
        h.set_locale("en_ca")
        df_iv["FormattedValue_EN"] = df_iv.apply(
            lambda x: h.format_number_for_locale(x["Value_Dec"]), axis=1)
        h.set_locale("fr_ca")
        df_iv["FormattedValue_FR"] = df_iv.apply(
            lambda x: h.format_number_for_locale(x["Value_Dec"]), axis=1)
        h.set_locale(orig_locale)
    else:
        df_iv["FormattedValue_EN"] = df_iv[
            "VALUE"]  # work around to prevent error on empty chunk
        df_iv["FormattedValue_FR"] = df_iv["VALUE"]

    # set datatypes for db
    df_iv = df_iv.fillna(np.nan).replace(
        [np.nan],
        [None])  # workaround to set nan/na=None (prevents sql error 22003)
    df_iv["IndicatorValueCode"] = df_iv["IndicatorValueCode"].str[:100]
    df_iv["VALUE"] = df_iv["VALUE"].astype("float64")

    # Keep only the columns needed for insert
    df_iv = df_iv.loc[:, [
        "IndicatorValueId", "VALUE", "NullReasonId", "IndicatorValueCode",
        "FormattedValue_EN", "FormattedValue_FR"
    ]]
    return df_iv
示例#5
0
def build_indicator_df(product_id, release_dt, dim_members, uom_codeset,
                       ref_date_list, next_id, min_ref_year,
                       mixed_geo_justice_pids, freq_code):
    """Build the data frame for gis.Indicator.

    Args:
        product_id: product (indicator theme) id.
        release_dt: release date of the product.
        dim_members: dimension/member metadata for the product.
        uom_codeset: unit of measure code set information.
        ref_date_list: list of possible reference dates.
        next_id: next available indicator id.
        min_ref_year: generate rows from this year onward.
        mixed_geo_justice_pids: justice product ids with mixed geo levels
            that have special date handling.
        freq_code: frequency of publication, used to format the date shown
            in the web app popup.

    Returns:
        DataFrame of indicator rows (one per member combination per year).

    Raises:
        ValueError: if the member combination lists come back with
            mismatched lengths (inconsistent dimension metadata).
    """
    df = create_dimension_member_df(
        dim_members)  # turn dimension/member data into dataframe
    df.sort_values(
        by=["DimPosId", "MemberId"],
        inplace=True)  # Important to allow recombining columns in df later

    # prepare dictionaries for creating member combinations
    dim_mem_ids = {}  # for coordinates
    dim_mem_names_en = {}  # for english indicator name
    dim_mem_names_fr = {}  # for french indicator name
    dim_mem_uoms = {}  # for unit of measure (will only occur one per member)

    for index, row in df.iterrows():
        dim_id = row["DimPosId"]

        # skip dimension 1 (geography)
        if row["DimNameEn"] != "Geography":
            dim_mem_names_en.setdefault(dim_id, []).append(row["MemberNameEn"])
            dim_mem_names_fr.setdefault(dim_id, []).append(row["MemberNameFr"])
            dim_mem_ids.setdefault(dim_id, []).append(row["MemberId"])
            # empty string (not "nan") when the member has no uom code,
            # keeps "nan" from ending up in the combined uom string
            app_uom = str(row["MemberUomCode"]) if row["DimHasUom"] else ""
            dim_mem_uoms.setdefault(dim_id, []).append(app_uom)

    # build all possible member combinations
    mem_names_en = build_dimension_member_combos(dim_mem_names_en, " _ ")
    mem_names_fr = build_dimension_member_combos(dim_mem_names_fr, " _ ")
    mem_ids = build_dimension_member_combos(dim_mem_ids, ".")
    mem_uoms = build_dimension_member_combos(dim_mem_uoms, " ")

    # Because the dicts were built in sorted order, equal-length lists can
    # safely be recombined as columns of one dataframe. A length mismatch
    # means the metadata is inconsistent - fail loudly here rather than
    # crash on the first column access below.
    if not (len(mem_names_en) == len(mem_names_fr) == len(mem_ids) ==
            len(mem_uoms)):
        raise ValueError(
            "Mismatched dimension member combination lengths for product " +
            str(product_id))
    pre_df = pd.DataFrame(
        {
            "IndicatorNameLong_EN": mem_names_en,
            "IndicatorNameLong_FR": mem_names_fr,
            "Coordinate": mem_ids,
            "UOM_ID": mem_uoms
        },
        dtype=str)

    # UOM - Combining members may result in the uom field looking like "nan nan 229.0", we only want the 229 part.
    # Must go to float before int to prevent conversion error
    pre_df["UOM_ID"] = pre_df["UOM_ID"].str.replace("nan", "").str.replace(
        " ", "").astype("float").astype("int16")
    # Turn off inspection next 2 lines, false-positives from pycharm: see https://youtrack.jetbrains.com/issue/PY-43841
    # noinspection PyTypeChecker
    pre_df["UOM_EN"] = pre_df.apply(
        lambda x: h.get_uom_desc_from_code_set(x["UOM_ID"], uom_codeset, "en"),
        axis=1)
    # noinspection PyTypeChecker
    pre_df["UOM_FR"] = pre_df.apply(
        lambda x: h.get_uom_desc_from_code_set(x["UOM_ID"], uom_codeset, "fr"),
        axis=1)
    pre_df["IndicatorThemeID"] = product_id
    pre_df["ReleaseIndicatorDate"] = release_dt
    # Vector field exists in gis.Indicator but is not used. We will insert nulls.
    pre_df["Vector"] = np.nan
    # IndicatorNames seem to only be used for populating titles on related charts - 2nd last member for legend
    pre_df["IndicatorName_EN"] = pre_df.apply(
        lambda x: h.get_nth_item_from_string_list(x["IndicatorNameLong_EN"],
                                                  " _ ", -2),
        axis=1)
    pre_df["IndicatorName_FR"] = pre_df.apply(
        lambda x: h.get_nth_item_from_string_list(x["IndicatorNameLong_FR"],
                                                  " _ ", -2),
        axis=1)

    # Create new indicator data frame with a row for each year in the reference period
    ind_df = copy_data_frames_for_date_range(pre_df, ref_date_list,
                                             min_ref_year, product_id,
                                             mixed_geo_justice_pids)

    # add the remaining fields that required RefYear to be built first
    ind_df["RefYear"] = ind_df["RefYear"].astype("str")
    ind_df["IndicatorCode"] = str(product_id) + "." + ind_df[
        "Coordinate"] + "." + ind_df["ReferencePeriod"]

    # This field becomes the popup on the web app. Reformat date depending on publication frequency.
    freq_dict = h.build_freq_code_to_pd_dict()
    fmt = freq_dict[freq_code][
        "py_fmt"] if freq_code in freq_dict else "%Y-%m-%d"  # default to show full date
    ind_df["IndicatorDisplay_EN"] = build_dimension_ul(
        ind_df["ReferencePeriod"], fmt, ind_df["IndicatorNameLong_EN"])
    ind_df["IndicatorDisplay_FR"] = build_dimension_ul(
        ind_df["ReferencePeriod"], fmt, ind_df["IndicatorNameLong_FR"])

    ind_df["IndicatorId"] = h.create_id_series(ind_df, next_id)  # populate IDs
    # build fields needed later for IndicatorMetaData DimensionUniqueKey matching and RelatedCharts
    ind_df["IndicatorFmt"] = ind_df["ReferencePeriod"] + "-" + ind_df[
        "IndicatorNameLong_EN"].str.replace(" _ ", "-")
    ind_df["LastIndicatorMember_EN"] = ind_df.apply(
        lambda x: h.get_nth_item_from_string_list(x["IndicatorNameLong_EN"],
                                                  " _ "),
        axis=1)
    ind_df["LastIndicatorMember_FR"] = ind_df.apply(
        lambda x: h.get_nth_item_from_string_list(x["IndicatorNameLong_FR"],
                                                  " _ "),
        axis=1)

    # set datatypes for db
    ind_df["ReleaseIndicatorDate"] = ind_df["ReleaseIndicatorDate"].astype(
        "datetime64[ns]")
    ind_df["ReferencePeriod"] = ind_df["ReferencePeriod"].astype(
        "datetime64[ns]")
    ind_df["IndicatorCode"] = ind_df["IndicatorCode"].str[:100]
    return ind_df