示例#1
0
def find_by(df: pd.DataFrame, uniques: List[Union[str, List[str]]]) -> Result:
    """Report rows of `df` that share values in the columns named by `uniques`.

    Every element of `uniques` is checked on its own. A single column name
    flags rows holding the same value in that column; a list of column names
    flags rows whose values match in all of the listed columns at once.

    Rows in which every column returned by `flatten(uniques)` is missing are
    dropped before the checks run.

    Args:
        df: data to inspect; its index labels are reported for duplicated rows.
        uniques: column names and/or lists of column names to check.

    Returns:
        A "Duplicates" result holding one error per element of `uniques` that
        has duplicated values. `items_count` is the length of the original `df`.
    """
    report = Result("Duplicates")
    report.items_count = len(df)

    referenced_columns = list(set(flatten(uniques)))
    df = df.dropna(subset=referenced_columns, how="all")
    for columns in uniques:
        # Normalise to a list for column selection and message building;
        # `columns` itself is still what pandas receives below.
        if isinstance(columns, list):
            names = columns
        else:
            names = [columns]

        is_duplicated = df.duplicated(columns, keep=False)
        duplicates = df[is_duplicated][names]
        if duplicates.empty:
            continue

        groups = duplicates.groupby(columns)
        errors = {}
        for _, rows in groups:
            described = ", ".join(
                f"'{rows[name].iloc[0]}' `{name}`" for name in names
            )
            errors[f"same {described}"] = list(rows.index)

        summary = f"{', '.join(names)} contains {len(groups)} duplicated value(s)"
        report.add_error(summary, errors=errors)
    return report
示例#2
0
def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Check that every field tagged `unique` holds no repeated values.

    Args:
        df: data to inspect; its index labels are reported for duplicated rows.
        tagged_fields: mapping of tag name to field names; only the `unique`
            entry is read.

    Returns:
        A result with one error per field that has duplicates, mapping each
        repeated value to the index labels sharing it. When no field is tagged
        `unique`, the result only carries `Outcome.SKIPPED` as info.
    """
    fields = tagged_fields.get("unique", [])
    result = Result("Duplicates By **unique** Tag")

    if not fields:
        result.add_info(Outcome.SKIPPED)
        return result

    duplicated_keys = set()
    for field in fields:
        # Reassigned for every field, so the last field's non-null count wins.
        result.items_count = df[field].count()

        is_duplicated = df.duplicated(field, keep=False)
        duplicates = df[is_duplicated][[field]]
        if duplicates.empty:
            continue

        errors = {}
        for _, rows in duplicates.groupby([field]):
            row_keys = list(rows.index)
            errors[f"same '{rows[field].iloc[0]}' `{field}`"] = row_keys
            duplicated_keys.update(row_keys)

        distinct_values = len(duplicates[field].unique())
        result.add_error(
            f"{field} contains {distinct_values} duplicated value(s)",
            errors=errors,
        )

    result.err_items_count = len(duplicated_keys)
    return result
示例#3
0
def check_items(df: pd.DataFrame,
                tagged_fields: Dict[str, List[str]]) -> Result:
    """Find items that share both their name and their url.

    Args:
        df: items data; the `_key` column is used to list duplicated items.
        tagged_fields: mapping of tag name to field names. Only the first field
            of `name_field` and of `product_url_field` is used.

    Returns:
        A "Duplicated Items" result. If either tag is missing, it only carries
        an info message; otherwise it holds a single error mapping each
        name/url pair to the keys of the items sharing it (when any exist).
    """
    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    result = Result("Duplicated Items")

    if not (name_fields and url_fields):
        result.add_info(
            "'name_field' and 'product_url_field' tags were not found in schema"
        )
        return result

    result.items_count = len(df)
    name_field, url_field = name_fields[0], url_fields[0]
    pair = [name_field, url_field]

    subset = df[pair + ["_key"]]
    is_duplicated = subset[pair].duplicated(keep=False)
    duplicates = subset[is_duplicated]
    if duplicates.empty:
        return result

    result.err_items_count = len(duplicates)
    errors = {}
    for _, rows in duplicates.groupby(pair):
        name = rows[name_field].iloc[0]
        url = rows[url_field].iloc[0]
        errors[f"same '{name}' name and '{url}' url"] = list(rows["_key"])

    result.add_error(
        f"{len(duplicates)} duplicate(s) with same name and url",
        errors=errors,
    )
    return result
示例#4
0
def check_uniqueness(df: pd.DataFrame,
                     tagged_fields: Dict[str, List[str]]) -> Result:
    """Check that every field tagged `unique` holds no repeated values.

    Args:
        df: items data; the `_key` column is used to list duplicated items.
        tagged_fields: mapping of tag name to field names; only the `unique`
            entry is read.

    Returns:
        A "Uniqueness" result with one error per field that has duplicates,
        mapping each repeated value to the keys of the items sharing it. When
        no field is tagged `unique`, the result only carries an info message.
    """
    fields = tagged_fields.get("unique", [])
    result = Result("Uniqueness")

    if not fields:
        result.add_info("'unique' tag was not found in schema")
        return result

    duplicated_keys = set()
    for field in fields:
        # Reassigned for every field, so the last field's non-null count wins.
        result.items_count = df[field].count()

        is_duplicated = df[field].duplicated(keep=False)
        duplicates = df[is_duplicated][[field, "_key"]]
        if duplicates.empty:
            continue

        errors = {}
        for _, rows in duplicates.groupby([field]):
            item_keys = list(rows["_key"])
            errors[f"same '{rows[field].iloc[0]}' {field}"] = item_keys
            duplicated_keys.update(item_keys)

        distinct_values = len(duplicates[field].unique())
        result.add_error(
            f"'{field}' contains {distinct_values} duplicated value(s)",
            errors=errors,
        )

    result.err_items_count = len(duplicated_keys)
    return result
示例#5
0
def compare_was_now(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]):
    """Compare price_was and price_now tagged fields.

    Items whose past price is lower than the current price are reported as an
    error; items whose two prices are equal are reported as a warning. Both
    columns are cast to float before they are compared.

    Args:
        df: items data; the `_key` column is used to list offending items.
        tagged_fields: mapping of tag name to field names. Only the first field
            of `product_price_was_field` and of `product_price_field` is used.

    Returns:
        A "Compare Price Was And Now" result. If either tag is missing, or its
        first field is not a column of `df`, the result only carries an info
        message.
    """
    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")

    result = Result("Compare Price Was And Now")

    if not (price_was_fields and price_was_fields[0] in df.columns
            and price_fields and price_fields[0] in df.columns):
        result.add_info(
            "product_price_field or product_price_was_field tags were not "
            "found in schema")
        return result

    items_number = len(df.index)
    price_field = price_fields[0]
    price_was_field = price_was_fields[0]
    prices = df.copy()
    prices[price_was_field] = prices[price_was_field].astype(float)
    prices[price_field] = prices[price_field].astype(float)

    df_prices_less = pd.DataFrame(
        prices[prices[price_was_field] < prices[price_field]],
        columns=["_key", price_was_field, price_field],
    )

    if not df_prices_less.empty:
        # The share is computed only here: a non-empty selection guarantees
        # items_number > 0, so an empty `df` no longer divides by zero.
        price_less_percent = "{:.2%}".format(
            len(df_prices_less) / items_number)
        error = f"Past price is less than current for {len(df_prices_less)} items"
        result.add_error(
            f"{price_less_percent} ({len(df_prices_less)}) of "
            f"items with {price_was_field} < {price_field}",
            detailed=f"{error}:\n{list(df_prices_less['_key'])}",
        )

    df_prices_equals = pd.DataFrame(
        prices[prices[price_was_field] == prices[price_field]],
        columns=["_key", price_was_field, price_field],
    )

    if not df_prices_equals.empty:
        # Same reasoning as above: dividing is safe only when rows matched.
        price_equal_percent = "{:.2%}".format(
            len(df_prices_equals) / items_number)
        result.add_warning(
            (f"{price_equal_percent} ({len(df_prices_equals)}) "
             f"of items with {price_was_field} = {price_field}"),
            detailed=(f"Prices equal for {len(df_prices_equals)} items:\n"
                      f"{list(df_prices_equals['_key'])}"),
        )

    result.err_items_count = len(df_prices_equals) + len(df_prices_less)
    result.items_count = items_number
    return result
示例#6
0
def compare_was_now(df: pd.DataFrame, tagged_fields: TaggedFields):
    """Compare price_was and price_now tagged fields.

    Items whose past price is lower than the current price are reported as an
    error; items whose two prices are equal are reported as a warning. Both
    columns are cast to float before they are compared, and offending items
    are listed by their index labels in `df`.

    Args:
        df: items data to inspect.
        tagged_fields: mapping of tag name to field names. Only the first field
            of `product_price_was_field` and of `product_price_field` is used.

    Returns:
        A "Compare Price Was And Now" result. If either tag is missing, the
        result only carries `Outcome.SKIPPED` as info.
    """
    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")
    items_number = len(df.index)

    result = Result("Compare Price Was And Now")

    if not price_was_fields or not price_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    price_field = price_fields[0]
    price_was_field = price_was_fields[0]
    prices = df.copy()
    prices[price_was_field] = prices[price_was_field].astype(float)
    prices[price_field] = prices[price_field].astype(float)

    df_prices_less = pd.DataFrame(
        prices[prices[price_was_field] < prices[price_field]],
        columns=[price_was_field, price_field],
    )

    if not df_prices_less.empty:
        # The share is computed only here: a non-empty selection guarantees
        # items_number > 0, so an empty `df` no longer divides by zero.
        price_less_percent = "{:.2%}".format(len(df_prices_less) / items_number)
        error = f"Past price is less than current for {len(df_prices_less)} items"
        result.add_error(
            f"{price_less_percent} ({len(df_prices_less)}) of "
            f"items with {price_was_field} < {price_field}",
            detailed=f"{error}:\n{list(df_prices_less.index)}",
        )

    df_prices_equals = pd.DataFrame(
        prices[prices[price_was_field] == prices[price_field]],
        columns=[price_was_field, price_field],
    )

    if not df_prices_equals.empty:
        # Same reasoning as above: dividing is safe only when rows matched.
        price_equal_percent = "{:.2%}".format(
            len(df_prices_equals) / items_number
        )
        result.add_warning(
            (
                f"{price_equal_percent} ({len(df_prices_equals)}) "
                f"of items with {price_was_field} = {price_field}"
            ),
            detailed=(
                f"Prices equal for {len(df_prices_equals)} items:\n"
                f"{list(df_prices_equals.index)}"
            ),
        )

    result.err_items_count = len(df_prices_equals) + len(df_prices_less)
    result.items_count = items_number

    return result
示例#7
0
def create_result(
    rule_name, messages, stats=None, err_items_count=None, items_count=None
):
    """Build a `Result` for `rule_name` populated with the given messages.

    Args:
        rule_name: name passed to the `Result` constructor.
        messages: mapping of level to an iterable of message argument
            sequences; each sequence is unpacked into `Result.add_message`.
        stats: assigned to `result.stats` when truthy.
        err_items_count: assigned to `result.err_items_count` when truthy.
        items_count: assigned to `result.items_count` when truthy.

    Returns:
        The populated result.
    """
    result = Result(rule_name)
    for level, level_messages in messages.items():
        for message_args in level_messages:
            result.add_message(level, *message_args)

    # Falsy values (None, 0, empty) leave the attribute as `Result` set it.
    optional_attributes = (
        ("stats", stats),
        ("err_items_count", err_items_count),
        ("items_count", items_count),
    )
    for attribute, value in optional_attributes:
        if value:
            setattr(result, attribute, value)
    return result
示例#8
0
def create_result(
    rule_name: str,
    messages: Dict[Level, List[Message]],
    stats: Optional[List[Stat]] = None,
    items_count: Optional[int] = None,
) -> Result:
    """Build a `Result` for `rule_name` populated with the given messages.

    Args:
        rule_name: name passed to the `Result` constructor.
        messages: messages grouped by level; each message is unpacked into
            `Result.add_message` after its level.
        stats: assigned to `result.stats` when truthy.
        items_count: assigned to `result.items_count` when truthy.

    Returns:
        The populated result.
    """
    result = Result(rule_name)
    for level, level_messages in messages.items():
        for message_args in level_messages:
            result.add_message(level, *message_args)

    # Falsy values (None, 0, empty list) leave the attribute as `Result` set it.
    if stats:
        result.stats = stats
    if items_count:
        result.items_count = items_count
    return result
示例#9
0
def create_result(rule_name,
                  messages,
                  err_items_count=None,
                  checked_fields=None,
                  items_count=None):
    """Build a `Result` for `rule_name` populated with the given messages.

    Args:
        rule_name: name passed to the `Result` constructor.
        messages: mapping of level to an iterable of message argument
            sequences; each sequence is unpacked into `Result.add_message`.
        err_items_count: assigned to `result.err_items_count` when truthy.
        checked_fields: assigned to `result.checked_fields` when truthy.
        items_count: assigned to `result.items_count` when truthy.

    Returns:
        The populated result.
    """
    result = Result(rule_name)
    for level, level_messages in messages.items():
        for message_args in level_messages:
            result.add_message(level, *message_args)

    # Falsy values (None, 0, empty) leave the attribute as `Result` set it.
    optional_attributes = (
        ("err_items_count", err_items_count),
        ("checked_fields", checked_fields),
        ("items_count", items_count),
    )
    for attribute, value in optional_attributes:
        if value:
            setattr(result, attribute, value)
    return result
示例#10
0
def find_by(df: pd.DataFrame, columns: List[str]) -> Result:
    """Report rows of `df` whose values match in all of `columns`.

    Rows in which every one of `columns` is missing are dropped before the
    check runs.

    Args:
        df: data to inspect; its index labels are reported for duplicated rows.
        columns: names of the columns that together identify a duplicate.

    Returns:
        A "Duplicates" result. When duplicates exist it holds a single error
        mapping each repeated combination of values to the index labels
        sharing it. `items_count` is the length of the original `df`.
    """
    result = Result("Duplicates")
    result.items_count = len(df)

    candidates = df.dropna(subset=columns, how="all")
    is_duplicated = candidates.duplicated(columns, keep=False)
    duplicates = candidates[is_duplicated][columns]
    if duplicates.empty:
        return result

    errors = {}
    for _, rows in duplicates.groupby(columns):
        described = ", ".join(
            f"'{rows[name].iloc[0]}' `{name}`" for name in columns
        )
        errors[f"same {described}"] = list(rows.index)

    summary = f"{len(duplicates)} duplicate(s) with same {', '.join(columns)}"
    result.add_error(summary, errors=errors)
    return result