def find_by(df: pd.DataFrame, uniques: List[Union[str, List[str]]]) -> Result:
    """Find equal items rows in `df` by `uniques`.

    I.e. if two items have the same uniques's element value, they are
    considered duplicates.

    Args:
        uniques: list containing columns and list of columns to identify
            duplicates. List of columns means that all list columns values
            should be equal.

    Returns:
        Any duplicates
    """
    result = Result("Duplicates")
    result.items_count = len(df)
    # Rows missing every relevant field cannot match anything - drop them once.
    relevant = list(set(flatten(uniques)))
    df = df.dropna(subset=relevant, how="all")
    for spec in uniques:
        cols = spec if isinstance(spec, list) else [spec]
        dupes = df[df.duplicated(spec, keep=False)][cols]
        if dupes.empty:
            continue
        grouped = dupes.groupby(spec)
        errors = {}
        for _, group in grouped:
            parts = [f"'{group[c].iloc[0]}' `{c}`" for c in cols]
            errors[f"same {', '.join(parts)}"] = list(group.index)
        result.add_error(
            f"{', '.join(cols)} contains {len(grouped)} duplicated value(s)",
            errors=errors,
        )
    return result
def find_by_unique(df: pd.DataFrame, tagged_fields: TaggedFields) -> Result:
    """Verify if each item field tagged with `unique` is unique.

    Returns:
        A result containing field names and keys for non unique items
    """
    unique_fields = tagged_fields.get("unique", [])
    result = Result("Duplicates By **unique** Tag")
    if not unique_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    err_keys = set()
    for field in unique_fields:
        # NOTE: items_count reflects the last processed field (original behavior).
        result.items_count = df[field].count()
        dupes = df[df.duplicated(field, keep=False)][[field]]
        errors = {}
        for _, group in dupes.groupby([field]):
            indexes = list(group.index)
            errors[f"same '{group[field].iloc[0]}' `{field}`"] = indexes
            err_keys = err_keys.union(indexes)
        if not dupes.empty:
            distinct = len(dupes[field].unique())
            result.add_error(
                f"{field} contains {distinct} duplicated value(s)",
                errors=errors,
            )
    result.err_items_count = len(err_keys)
    return result
def check_items(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]) -> Result:
    """Check for items with the same name and url"""
    name_fields = tagged_fields.get("name_field")
    url_fields = tagged_fields.get("product_url_field")
    result = Result("Duplicated Items")
    # Both tags are required; otherwise there is nothing to compare.
    if not name_fields or not url_fields:
        result.add_info(
            "'name_field' and 'product_url_field' tags were not found in schema"
        )
        return result

    result.items_count = len(df)
    name_field, url_field = name_fields[0], url_fields[0]
    df = df[[name_field, url_field, "_key"]]
    duplicates = df[df[[name_field, url_field]].duplicated(keep=False)]
    if duplicates.empty:
        return result

    result.err_items_count = len(duplicates)
    errors = {}
    for _, group in duplicates.groupby([name_field, url_field]):
        key = (
            f"same '{group[name_field].iloc[0]}' name and '{group[url_field].iloc[0]}' url"
        )
        errors[key] = list(group["_key"])
    result.add_error(
        f"{len(duplicates)} duplicate(s) with same name and url", errors=errors
    )
    return result
def check_uniqueness(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]) -> Result:
    """Verify if each item field tagged with `unique` is unique.

    Returns:
        A result containing field names and keys for non unique items
    """
    unique_fields = tagged_fields.get("unique", [])
    result = Result("Uniqueness")
    if not unique_fields:
        result.add_info("'unique' tag was not found in schema")
        return result

    err_keys = set()
    for field in unique_fields:
        # NOTE: items_count reflects the last processed field (original behavior).
        result.items_count = df[field].count()
        dupes = df[df[field].duplicated(keep=False)][[field, "_key"]]
        errors = {}
        for _, group in dupes.groupby([field]):
            keys = list(group["_key"])
            errors[f"same '{group[field].iloc[0]}' {field}"] = keys
            err_keys = err_keys.union(keys)
        if not dupes.empty:
            distinct = len(dupes[field].unique())
            result.add_error(
                f"'{field}' contains {distinct} duplicated value(s)",
                errors=errors,
            )
    result.err_items_count = len(err_keys)
    return result
def compare_was_now(df: pd.DataFrame, tagged_fields: Dict[str, List[str]]):
    """Compare price_was and price_now tagged fields.

    Flags items whose past price is lower than the current price (error)
    and items whose prices are equal (warning).

    Args:
        df: items; expected to contain a "_key" column for reporting.
        tagged_fields: mapping of tag name to tagged field names.

    Returns:
        A Result with counts, errors and warnings.
    """
    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")
    items_number = len(df.index)
    result = Result("Compare Price Was And Now")
    if not (
        price_was_fields
        and price_was_fields[0] in df.columns
        and price_fields
        and price_fields[0] in df.columns
    ):
        result.add_info(
            "product_price_field or product_price_was_field tags were not "
            "found in schema")
        return result

    price_field = price_fields[0]
    price_was_field = price_was_fields[0]
    prices = df.copy()
    prices[price_was_field] = prices[price_was_field].astype(float)
    prices[price_field] = prices[price_field].astype(float)

    df_prices_less = pd.DataFrame(
        prices[prices[price_was_field] < prices[price_field]],
        columns=["_key", price_was_field, price_field],
    )
    if not df_prices_less.empty:
        # Percent is computed only here: a non-empty subset implies
        # items_number > 0, avoiding ZeroDivisionError on an empty df.
        price_less_percent = "{:.2%}".format(
            len(df_prices_less) / items_number)
        error = f"Past price is less than current for {len(df_prices_less)} items"
        result.add_error(
            f"{price_less_percent} ({len(df_prices_less)}) of "
            f"items with {price_was_field} < {price_field}",
            detailed=f"{error}:\n{list(df_prices_less['_key'])}",
        )

    df_prices_equals = pd.DataFrame(
        prices[prices[price_was_field] == prices[price_field]],
        columns=["_key", price_was_field, price_field],
    )
    if not df_prices_equals.empty:
        # Same lazy computation to avoid division by zero.
        price_equal_percent = "{:.2%}".format(
            len(df_prices_equals) / items_number)
        result.add_warning(
            (f"{price_equal_percent} ({len(df_prices_equals)}) "
             f"of items with {price_was_field} = {price_field}"),
            detailed=(f"Prices equal for {len(df_prices_equals)} items:\n"
                      f"{list(df_prices_equals['_key'])}"),
        )
    result.err_items_count = len(df_prices_equals) + len(df_prices_less)
    result.items_count = len(df.index)
    return result
def compare_was_now(df: pd.DataFrame, tagged_fields: TaggedFields):
    """Compare price_was and price_now tagged fields.

    Flags items whose past price is lower than the current price (error)
    and items whose prices are equal (warning).

    Args:
        df: items indexed by item key.
        tagged_fields: mapping of tag name to tagged field names.

    Returns:
        A Result with counts, errors and warnings.
    """
    price_was_fields = tagged_fields.get("product_price_was_field")
    price_fields = tagged_fields.get("product_price_field")

    items_number = len(df.index)
    result = Result("Compare Price Was And Now")
    if not price_was_fields or not price_fields:
        result.add_info(Outcome.SKIPPED)
        return result

    price_field = price_fields[0]
    price_was_field = price_was_fields[0]
    prices = df.copy()
    prices[price_was_field] = prices[price_was_field].astype(float)
    prices[price_field] = prices[price_field].astype(float)

    df_prices_less = pd.DataFrame(
        prices[prices[price_was_field] < prices[price_field]],
        columns=[price_was_field, price_field],
    )
    if not df_prices_less.empty:
        # Percent is computed only here: a non-empty subset implies
        # items_number > 0, avoiding ZeroDivisionError on an empty df.
        price_less_percent = "{:.2%}".format(len(df_prices_less) / items_number)
        error = f"Past price is less than current for {len(df_prices_less)} items"
        result.add_error(
            f"{price_less_percent} ({len(df_prices_less)}) of "
            f"items with {price_was_field} < {price_field}",
            detailed=f"{error}:\n{list(df_prices_less.index)}",
        )

    df_prices_equals = pd.DataFrame(
        prices[prices[price_was_field] == prices[price_field]],
        columns=[price_was_field, price_field],
    )
    if not df_prices_equals.empty:
        # Same lazy computation to avoid division by zero.
        price_equal_percent = "{:.2%}".format(len(df_prices_equals) / items_number)
        result.add_warning(
            (
                f"{price_equal_percent} ({len(df_prices_equals)}) "
                f"of items with {price_was_field} = {price_field}"
            ),
            detailed=(
                f"Prices equal for {len(df_prices_equals)} items:\n"
                f"{list(df_prices_equals.index)}"
            ),
        )
    result.err_items_count = len(df_prices_equals) + len(df_prices_less)
    result.items_count = len(df.index)
    return result
def create_result(
    rule_name, messages, stats=None, err_items_count=None, items_count=None
):
    """Build a Result named `rule_name` populated from `messages`.

    Args:
        rule_name: name for the new Result.
        messages: mapping of message level to a list of message tuples;
            each tuple is unpacked into `Result.add_message`.
        stats: optional stats to attach to the result.
        err_items_count: optional number of erroneous items.
        items_count: optional total number of checked items.

    Returns:
        The populated Result.
    """
    result = Result(rule_name)
    # A distinct loop name avoids shadowing the `messages` parameter.
    for level, level_messages in messages.items():
        for message in level_messages:
            result.add_message(level, *message)
    if stats:
        result.stats = stats
    if err_items_count:
        result.err_items_count = err_items_count
    if items_count:
        result.items_count = items_count
    return result
def create_result(
    rule_name: str,
    messages: Dict[Level, List[Message]],
    stats: Optional[List[Stat]] = None,
    items_count: Optional[int] = None,
) -> Result:
    """Build a Result named `rule_name` populated from `messages`.

    Args:
        rule_name: name for the new Result.
        messages: mapping of message level to a list of message tuples;
            each tuple is unpacked into `Result.add_message`.
        stats: optional stats to attach to the result.
        items_count: optional total number of checked items.

    Returns:
        The populated Result.
    """
    result = Result(rule_name)
    # A distinct loop name avoids shadowing the `messages` parameter.
    for level, level_messages in messages.items():
        for message in level_messages:
            result.add_message(level, *message)
    if stats:
        result.stats = stats
    if items_count:
        result.items_count = items_count
    return result
def create_result(rule_name, messages, err_items_count=None, checked_fields=None,
                  items_count=None):
    """Build a Result named `rule_name` populated from `messages`.

    Args:
        rule_name: name for the new Result.
        messages: mapping of message level to a list of message tuples;
            each tuple is unpacked into `Result.add_message`.
        err_items_count: optional number of erroneous items.
        checked_fields: optional fields the rule inspected.
        items_count: optional total number of checked items.

    Returns:
        The populated Result.
    """
    result = Result(rule_name)
    # A distinct loop name avoids shadowing the `messages` parameter.
    for level, level_messages in messages.items():
        for message in level_messages:
            result.add_message(level, *message)
    if err_items_count:
        result.err_items_count = err_items_count
    if checked_fields:
        result.checked_fields = checked_fields
    if items_count:
        result.items_count = items_count
    return result
def find_by(df: pd.DataFrame, columns: List[str]) -> Result:
    """Compare items rows in `df` by `columns`.

    Args:
        columns: columns which together identify a duplicate; rows where
            every listed column is missing are skipped.

    Returns:
        Any duplicates
    """
    # Plain literal: the original f-string had no placeholders (F541).
    result = Result("Duplicates")
    result.items_count = len(df)
    df = df.dropna(subset=columns, how="all")
    duplicates = df[df.duplicated(columns, keep=False)][columns]
    if duplicates.empty:
        return result
    errors = {}
    for _, d in duplicates.groupby(columns):
        msgs = [f"'{d[c].iloc[0]}' `{c}`" for c in columns]
        errors[f"same {', '.join(msgs)}"] = list(d.index)
    result.add_error(
        f"{len(duplicates)} duplicate(s) with same {', '.join(columns)}",
        errors=errors)
    return result