示例#1
0
def compare_runtime(source_job: Job, target_job: Job) -> Result:
    """Compare the runtime of the source job against the target job.

    Args:
        source_job: the job whose runtime is being checked
        target_job: the job used as the baseline

    Returns:
        A Result holding a warning when either runtime is falsy. When the source
        ran longer than the target, the message severity follows
        `helpers.ratio_diff`: error above 0.2, warning above 0.1, info otherwise.
        In every other case an info message is added.
    """
    source_runtime = api.get_runtime(source_job)
    target_runtime = api.get_runtime(target_job)

    result = Result("Compare Runtime")
    if not source_runtime or not target_runtime:
        result.add_warning("Jobs are not finished")
    elif source_runtime > target_runtime:
        runtime_ratio_diff = helpers.ratio_diff(source_runtime, target_runtime)
        # The ratio is compared to 0.2 / 0.1 below, i.e. it is a fraction, so it
        # has to be rendered with the percent format rather than a bare "%".
        msg = (
            f"Sources differ on {runtime_ratio_diff:.2%} - "
            f"{helpers.ms_to_time(source_runtime)} and "
            f"{helpers.ms_to_time(target_runtime)}"
        )
        if runtime_ratio_diff > 0.2:
            result.add_error(msg)
        elif runtime_ratio_diff > 0.1:
            result.add_warning(msg)
        else:
            result.add_info(msg)
    else:
        result.add_info(
            f"Similar or better runtime - {helpers.ms_to_time(source_runtime)} and "
            f"{helpers.ms_to_time(target_runtime)}"
        )
    return result
示例#2
0
def compare_was_now(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]):
    """Compare price_was and price_now tagged fields

    Only the first field of each tag is used and both columns are cast to float
    before they are compared.

    Args:
        df: items data; its `_key` column is used to list the offending items
        tagged_fields: mapping of tag names to the field names carrying that tag

    Returns:
        A Result with an error for items whose past price is lower than the
        current one and a warning for items whose prices are equal, or an info
        message when either tag is absent or its first field is not in `df`.
    """

    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")
    items_number = len(df.index)

    result = Result("Compare Price Was And Now")

    if (price_was_fields and price_was_fields[0] in df.columns and price_fields
            and price_fields[0] in df.columns):
        price_field = price_fields[0]
        price_was_field = price_was_fields[0]
        prices = df.copy()
        prices[price_was_field] = prices[price_was_field].astype(float)
        prices[price_field] = prices[price_field].astype(float)

        df_prices_less = pd.DataFrame(
            prices[prices[price_was_field] < prices[price_field]],
            columns=["_key", price_was_field, price_field],
        )

        if not df_prices_less.empty:
            # Percentages are formatted only inside the non-empty branches: a
            # non-empty selection implies items_number > 0, so an empty `df`
            # can no longer trigger a ZeroDivisionError.
            price_less_percent = f"{len(df_prices_less) / items_number:.2%}"
            error = f"Past price is less than current for {len(df_prices_less)} items"
            result.add_error(
                f"{price_less_percent} ({len(df_prices_less)}) of "
                f"items with {price_was_field} < {price_field}",
                detailed=f"{error}:\n{list(df_prices_less['_key'])}",
            )

        df_prices_equals = pd.DataFrame(
            prices[prices[price_was_field] == prices[price_field]],
            columns=["_key", price_was_field, price_field],
        )

        if not df_prices_equals.empty:
            price_equal_percent = f"{len(df_prices_equals) / items_number:.2%}"
            result.add_warning(
                (f"{price_equal_percent} ({len(df_prices_equals)}) "
                 f"of items with {price_was_field} = {price_field}"),
                detailed=(f"Prices equal for {len(df_prices_equals)} items:\n"
                          f"{list(df_prices_equals['_key'])}"),
            )

        result.err_items_count = len(df_prices_equals) + len(df_prices_less)
        result.items_count = items_number

    else:
        result.add_info(
            "product_price_field or product_price_was_field tags were not "
            "found in schema")
    return result
示例#3
0
def compare_was_now(df: pd.DataFrame, tagged_fields: TaggedFields):
    """Compare price_was and price_now tagged fields

    Only the first field of each tag is used and both columns are cast to float
    before they are compared. Offending items are listed by their index labels.

    Args:
        df: items data
        tagged_fields: mapping of tag names to the field names carrying that tag

    Returns:
        A Result with an error for items whose past price is lower than the
        current one and a warning for items whose prices are equal, or a
        skipped info message when either tag has no fields.
    """

    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")
    items_number = len(df.index)

    result = Result("Compare Price Was And Now")

    if not price_was_fields or not price_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    price_field = price_fields[0]
    price_was_field = price_was_fields[0]
    prices = df.copy()
    prices[price_was_field] = prices[price_was_field].astype(float)
    prices[price_field] = prices[price_field].astype(float)

    df_prices_less = pd.DataFrame(
        prices[prices[price_was_field] < prices[price_field]],
        columns=[price_was_field, price_field],
    )

    if not df_prices_less.empty:
        # Percentages are formatted only inside the non-empty branches: a
        # non-empty selection implies items_number > 0, so an empty `df` can no
        # longer trigger a ZeroDivisionError.
        price_less_percent = f"{len(df_prices_less) / items_number:.2%}"
        error = f"Past price is less than current for {len(df_prices_less)} items"
        result.add_error(
            f"{price_less_percent} ({len(df_prices_less)}) of "
            f"items with {price_was_field} < {price_field}",
            detailed=f"{error}:\n{list(df_prices_less.index)}",
        )

    df_prices_equals = pd.DataFrame(
        prices[prices[price_was_field] == prices[price_field]],
        columns=[price_was_field, price_field],
    )

    if not df_prices_equals.empty:
        price_equal_percent = f"{len(df_prices_equals) / items_number:.2%}"
        result.add_warning(
            (
                f"{price_equal_percent} ({len(df_prices_equals)}) "
                f"of items with {price_was_field} = {price_field}"
            ),
            detailed=(
                f"Prices equal for {len(df_prices_equals)} items:\n"
                f"{list(df_prices_equals.index)}"
            ),
        )

    result.err_items_count = len(df_prices_equals) + len(df_prices_less)
    result.items_count = items_number

    return result
示例#4
0
def compare_spider_names(source_job: Job, target_job: Job) -> Result:
    """Warn when the `spider` entries of the two jobs' metadata differ.

    Args:
        source_job: the first job to inspect
        target_job: the second job to inspect

    Returns:
        A Result that carries a warning only if the spider names are not equal.
    """
    source_spider, target_spider = (
        job.metadata.get("spider") for job in (source_job, target_job)
    )

    result = Result("Spider Names")
    if source_spider == target_spider:
        return result

    warning = (
        f"{source_job.key} spider is {source_spider}, "
        f"{target_job.key} spider is {target_spider}"
    )
    result.add_warning(warning)
    return result
示例#5
0
def compare_spider_names(source_job: Job, target_job: Job) -> Result:
    """Check that both jobs report the same `spider` value in their metadata.

    Args:
        source_job: the first job to inspect
        target_job: the second job to inspect

    Returns:
        A Result; a warning naming both spiders is added when they differ.
    """
    spiders = [job.metadata.get("spider") for job in (source_job, target_job)]

    result = Result("Spider Names")
    names_match = spiders[0] == spiders[1]
    if not names_match:
        source_part = f"{source_job.key} spider is {spiders[0]}"
        target_part = f"{target_job.key} spider is {spiders[1]}"
        result.add_warning(f"{source_part}, {target_part}")
    return result
示例#6
0
def compare_boolean_fields(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    err_thr: float = 0.10,
    warn_thr: float = 0.05,
) -> Result:
    """Compare the distribution of boolean columns between two dataframes

    Args:
        source_df: the first dataframe
        target_df: the dataframe to compare against
        err_thr: absolute difference in the share of True above which an error
            is reported
        warn_thr: absolute difference in the share of True above which (and up
            to `err_thr`) a warning is reported

    Returns:
        A result containing a dataframe with both distributions and messages for
        the fields whose differences exceed the thresholds. The outcome is set
        to skipped when `fields_to_compare` reports nothing to compare.
    """
    src_flags = source_df.select_dtypes(include="bool")
    tgt_flags = target_df.select_dtypes(include="bool")

    result = Result("Boolean Fields")
    if not fields_to_compare(src_flags, tgt_flags):
        result.outcome = Outcome.SKIPPED
        return result

    # Empty frame contributing both boolean labels as columns, so the
    # concatenated frames always expose a True and a False column.
    template = pd.DataFrame(columns=[True, False])

    def distribution(flags):
        shares = flags.apply(pd.value_counts, normalize=True).T
        return pd.concat([template, shares], sort=False).fillna(0.0)

    src_dist = distribution(src_flags)
    tgt_dist = distribution(tgt_flags)
    true_gap = (src_dist - tgt_dist)[True]

    labelled = [
        src_dist.rename("{}_source".format),
        tgt_dist.rename("{}_target".format),
    ]
    coverage = pd.concat(labelled).sort_index()
    coverage.name = "Coverage for boolean fields"
    result.stats.append(coverage)

    magnitude = true_gap.abs()

    err_diffs = true_gap[magnitude > err_thr]
    if not err_diffs.empty:
        err_fields = ", ".join(err_diffs.index)
        result.add_error(
            f"{err_fields} relative frequencies differ by more than {err_thr:.0%}"
        )

    warn_diffs = true_gap[(magnitude > warn_thr) & (magnitude <= err_thr)]
    if not warn_diffs.empty:
        warn_fields = ", ".join(warn_diffs.index)
        result.add_warning(
            f"{warn_fields} relative frequencies differ by "
            f"{warn_thr:.0%}-{err_thr:.0%}"
        )

    return result
示例#7
0
def get_difference(
    source_df: pd.DataFrame,
    target_df: pd.DataFrame,
    category_names: List[str],
    source_key: str = "source",
    target_key: str = "target",
) -> Result:
    """Report how category coverage differs between two dataframes, nan included.
    Coverage here is the normalized value counts of a column.

    Args:
        source_df: a data you want to compare
        target_df: a data you want to compare with
        category_names: list of columns which values to compare
        source_key: label for `source_df`
        target_key: label for `target_df`

    Returns:
        A result instance whose stats hold, per column, a dataframe of normalized
        value counts and (when non-empty) a series of absolute differences above
        10%; a warning is added for columns with differences above 20%. If no
        column names were given, a skipped info message is added instead.
    """
    result = Result("Category Coverage Difference")
    warn_thr, err_thr = 0.10, 0.20

    for column in category_names:
        shares = {
            source_key: source_df[column].value_counts(dropna=False, normalize=True),
            target_key: target_df[column].value_counts(dropna=False, normalize=True),
        }
        coverage = pd.DataFrame(shares).fillna(0)
        coverage = coverage.sort_values(by=[source_key, target_key], kind="mergesort")
        coverage.name = f"Coverage for {column}"
        result.stats.append(coverage)

        gaps = (coverage[source_key] - coverage[target_key]).abs()
        notable = gaps[gaps > warn_thr]
        notable.name = f"Coverage difference more than {warn_thr:.0%} for {column}"
        if not notable.empty:
            result.stats.append(notable)

        severe = notable[notable > err_thr]
        if severe.empty:
            continue
        result.add_warning(
            f"The difference is greater than {err_thr:.0%} for {len(severe)} value(s) of {column}"
        )

    if not category_names:
        result.add_info(Outcome.SKIPPED)
    return result
示例#8
0
def compare_finish_time(source_job: Job, target_job: Job) -> Result:
    """Report how many days apart the two jobs finished.

    Args:
        source_job: the first job
        target_job: the second job

    Returns:
        A Result with an info message when the difference is 0 days, and a
        warning when the difference is None or any other value.
    """
    days_apart = api.get_finish_time_difference_in_days(source_job, target_job)

    result = Result("Finish Time")
    if days_apart == 0:
        result.add_info("Less than 1 day difference")
    elif days_apart is None:
        result.add_warning("Jobs are not finished")
    else:
        result.add_warning(f"{days_apart} day(s) difference between 2 jobs")

    return result
示例#9
0
def get_difference(source_job: Job,
                   target_job: Job,
                   err_thr: float = 0.10,
                   warn_thr: float = 0.05) -> Result:
    """Get difference between jobs coverages. The coverage is job fields counts
    divided on the job size.

    Args:
        source_job: a base job, the difference is calculated from it
        target_job: a job to compare
        err_thr: a threshold for errors
        warn_thr: a threshold for warnings

    Returns:
        A Result instance with huge dif and stats with fields counts coverage and dif
    """
    result = Result("Coverage Difference")
    f_counts = (pd.DataFrame({
        source_job.key: api.get_counts(source_job),
        target_job.key: api.get_counts(target_job),
    }).drop(index=["_type"]).fillna(0).sort_values(by=[source_job.key],
                                                   kind="mergesort"))
    f_counts[source_job.key] = f_counts[source_job.key].divide(
        api.get_items_count(source_job))
    f_counts[target_job.key] = f_counts[target_job.key].divide(
        api.get_items_count(target_job))
    f_counts.name = "Coverage from job stats fields counts"
    result.stats.append(f_counts)

    coverage_difs = f_counts[source_job.key] - f_counts[target_job.key]
    # "mergesort" is the stable sort kind; the previous "mergesoft" spelling was
    # a typo that is not a valid sort kind name.
    coverage_difs = coverage_difs[coverage_difs.abs() > warn_thr].sort_values(
        kind="mergesort")
    coverage_difs.name = f"Coverage difference more than {warn_thr:.0%}"
    if not coverage_difs.empty:
        result.stats.append(coverage_difs)

    # Differences are signed, so both thresholds are applied to the magnitude;
    # otherwise negative differences would never be reported as warnings.
    abs_difs = coverage_difs.abs()
    errs = coverage_difs[abs_difs > err_thr]
    if not errs.empty:
        result.add_error(
            f"The difference is greater than {err_thr:.0%} for {len(errs)} field(s)"
        )
    warns = coverage_difs[(abs_difs > warn_thr) & (abs_difs <= err_thr)]
    if not warns.empty:
        result.add_warning(
            f"The difference is between {warn_thr:.0%} and {err_thr:.0%} "
            f"for {len(warns)} field(s)")
    return result
示例#10
0
def compare_fields_counts(source_job, target_job):
    """Compare the relative difference between field counts to items count

    Args:
        source_job: a base job, the difference is calculated from it
        target_job: a job to compare

    Returns:
        A Result instance with an error for fields whose coverage differs by more
        than 10% and a warning for fields differing by more than 5% and up to 10%
    """
    source_items_count = get_items_count(source_job)
    target_items_count = get_items_count(target_job)
    result = Result("Fields Counts")

    source_fields = pd.DataFrame(
        {"Count1": source_job.items.stats().get("counts")})
    target_fields = pd.DataFrame(
        {"Count2": target_job.items.stats().get("counts")})
    # Fields present in only one job get a count of 0 in the other.
    fields = pd.concat([source_fields, target_fields], axis=1,
                       sort=True).fillna(0)
    fields["Difference, %"] = fields.apply(
        lambda row: ratio_diff(row["Count1"] / source_items_count, row[
            "Count2"] / target_items_count) * 100,
        axis=1,
    )
    fields["Difference, %"] = fields["Difference, %"].astype(int)
    # sort_values returns a new frame; the result must be kept, otherwise the
    # reports below are not ordered by difference.
    fields = fields.sort_values(by=["Difference, %"], ascending=False)

    err_diffs = fields[fields["Difference, %"] > 10]
    if not err_diffs.empty:
        result.add_error(
            f"Coverage difference is greater than 10% for "
            f"{len(err_diffs)} field(s)",
            err_diffs.to_string(columns=["Difference, %"]),
        )

    warn_diffs = fields[(fields["Difference, %"] > 5)
                        & (fields["Difference, %"] <= 10)]
    if not warn_diffs.empty:
        outcome_msg = (f"Coverage difference is between 5% and 10% for "
                       f"{len(warn_diffs)} field(s)")
        result.add_warning(outcome_msg,
                           warn_diffs.to_string(columns=["Difference, %"]))

    return result
示例#11
0
def compare_response_ratio(source_job: Job, target_job: Job) -> Result:
    """Compare the requests-per-item ratio of two jobs.

    Each ratio is the job's requests count divided by its items count, rounded
    to 2 digits.

    Args:
        source_job: the first job
        target_job: the job to compare against

    Returns:
        A Result with an error when `helpers.ratio_diff` of the two ratios is
        above 0.2, a warning when it is above 0.1, and no message otherwise.
    """

    def requests_per_item(job):
        return round(api.get_requests_count(job) / api.get_items_count(job), 2)

    s_ratio = requests_per_item(source_job)
    t_ratio = requests_per_item(target_job)
    gap = helpers.ratio_diff(s_ratio, t_ratio)
    msg = f"Difference is {gap:.2%} - {s_ratio} and {t_ratio}"

    result = Result("Compare Responses Per Item Ratio")
    if gap > 0.2:
        result.add_error(msg)
        return result
    if gap > 0.1:
        result.add_warning(msg)
    return result
示例#12
0
def compare_number_of_scraped_items(source_job: Job, target_job: Job) -> Result:
    """Compare the number of items scraped by two jobs.

    Args:
        source_job: the first job
        target_job: the job to compare against

    Returns:
        A Result with an info message when `helpers.ratio_diff` of the counts is
        in [0, 0.05), a warning when it is in [0.05, 0.10) and an error when it
        is 0.10 or more.
    """
    items_count1 = api.get_items_count(source_job)
    items_count2 = api.get_items_count(target_job)
    diff = helpers.ratio_diff(items_count1, items_count2)
    result = Result("Total Scraped Items")
    if 0 <= diff < 0.05:
        if diff == 0:
            msg = "Same number of items"
        else:
            msg = f"Almost the same number of items - {items_count1} and {items_count2}"
        result.add_info(msg)
    else:
        # Use the percent format instead of `diff * 100`, which leaks float
        # noise into the message (e.g. 7.000000000000001%).
        msg = f"{items_count1} differs from {items_count2} on {diff:.2%}"
        if 0.05 <= diff < 0.10:
            result.add_warning(msg)
        elif diff >= 0.10:
            result.add_error(msg)
    return result
示例#13
0
def compare_number_of_scraped_items(source_job: Job,
                                    target_job: Job) -> Result:
    """Grade how far apart the item counts of two jobs are.

    Args:
        source_job: the first job
        target_job: the job to compare against

    Returns:
        A Result with an info message when `helpers.ratio_diff` of the counts is
        in [0, 0.05), a warning when it is in [0.05, 0.10) and an error when it
        is 0.10 or more; no message is added for any other value.
    """
    source_total = api.get_items_count(source_job)
    target_total = api.get_items_count(target_job)
    gap = helpers.ratio_diff(source_total, target_total)
    result = Result("Total Scraped Items")

    if 0 <= gap < 0.05:
        result.add_info(
            "Same number of items"
            if gap == 0
            else f"Almost the same number of items - {source_total} and {target_total}"
        )
        return result

    differs = f"{source_total} differs from {target_total} on {gap:.2%}"
    if gap >= 0.10:
        result.add_error(differs)
    elif 0.05 <= gap < 0.10:
        result.add_warning(differs)
    return result
示例#14
0
def compare_response_ratio(source_job: Job, target_job: Job) -> Result:
    """Compare request with response per item ratio

    Each ratio is the job's requests count divided by its items count, rounded
    to 2 digits.

    Args:
        source_job: the first job
        target_job: the job to compare against

    Returns:
        A Result with an error when `helpers.ratio_diff` of the two ratios is
        above 0.2, a warning when it is above 0.1, and no message otherwise.
    """
    items_count1 = api.get_items_count(source_job)
    items_count2 = api.get_items_count(target_job)

    source_ratio = round(api.get_requests_count(source_job) / items_count1, 2)
    target_ratio = round(api.get_requests_count(target_job) / items_count2, 2)

    response_ratio_diff = helpers.ratio_diff(source_ratio, target_ratio)
    # Use the percent format instead of `diff * 100`, which leaks float noise
    # into the message (e.g. 28.999999999999996%).
    msg = f"Difference is {response_ratio_diff:.2%} - {source_ratio} and {target_ratio}"

    result = Result("Compare Responses Per Item Ratio")
    if response_ratio_diff > 0.2:
        result.add_error(msg)
    elif response_ratio_diff > 0.1:
        result.add_warning(msg)
    return result
示例#15
0
def compare_boolean_fields(source_df, target_df):
    """Compare relative frequencies of boolean columns between two dataframes.

    Args:
        source_df: the first dataframe
        target_df: the dataframe to compare against

    Returns:
        A Result with an error for fields whose differences are all above 10%,
        a warning for fields whose differences are all in (5%, 10%], and an info
        message when neither group has entries or there is nothing to compare.
    """
    source_bool = source_df.select_dtypes(include="bool")
    target_bool = target_df.select_dtypes(include="bool")

    result = Result("Boolean Fields")
    if not fields_to_compare(source_bool, target_bool):
        result.add_info("No fields to compare")
        return result

    # NOTE(review): presumably one row per field with a False and a True
    # column, judging by the two-item header used below - confirm against
    # get_bool_relative_frequency.
    source_freq = get_bool_relative_frequency(source_bool)
    target_freq = get_bool_relative_frequency(target_bool)
    relative_diffs = abs(source_freq - target_freq) * 100

    # A field is selected only when every column of its row matches.
    over_ten = (relative_diffs > 10).all(axis=1)
    five_to_ten = ((relative_diffs <= 10) & (relative_diffs > 5)).all(axis=1)

    err_diffs = relative_diffs[over_ten]
    if not err_diffs.empty:
        err_msg = f"{err_diffs.index.values} relative frequencies differ by more than 10%"
        result.add_error(err_msg, err_diffs.to_string())

    warn_diffs = relative_diffs[five_to_ten]
    if not warn_diffs.empty:
        warn_msg = f"{warn_diffs.index.values} relative frequencies differ by 5-10%"
        result.add_warning(warn_msg, warn_diffs.to_string())

    if err_diffs.empty and warn_diffs.empty:
        info_msg = (
            f"{relative_diffs.index.values} relative frequencies are equal "
            "or differ by less than 5%"
        )
        details = relative_diffs.to_string(
            header=["Difference in False, %", "Difference in True, %"]
        )
        result.add_info(info_msg, details)

    return result