def create_figures(self, items, items_dicts):
    """Compute rule counters for ``items`` and feed them to the figure methods.

    The duplicate-items, uniqueness and was/now price rules are run on
    ``items.df``; their counters, the garbage-symbols result and the error
    count of the stored "JSON Schema Validation" result are passed to
    ``generate_quality_estimation``. The score, job summary, rules summary
    and coverage methods are then called in that order.

    ``items_dicts`` is accepted but never read by this method.
    """
    tags = JsonFields(self.schema).tagged
    validated_count = len(items.df.index)

    # Counters from the duplicate-items rule.
    dup_items = duplicate_rules.check_items(items.df, tags)
    dup_checked, dup_errors = dup_items.items_count, dup_items.err_items_count

    # Counters from the uniqueness rule.
    dup_skus = duplicate_rules.check_uniqueness(items.df, tags)
    sku_checked, sku_errors = dup_skus.items_count, dup_skus.err_items_count

    # Counters from the was/now price comparison.
    price_result = price_rules.compare_was_now(items.df, tags)
    price_warnings, price_checked = (
        price_result.err_items_count,
        price_result.items_count,
    )

    garbage_result = garbage_symbols(items)
    user = api.get_crawlera_user(items.job)

    # The schema validation result is expected to be stored on the report
    # already; there is no fallback if it is missing.
    schema_result = self.report.results.get("JSON Schema Validation")
    validation_warnings = schema_result.get_errors_count()

    # NOTE: the last positional argument is the total row count of
    # items.df, not the number of items checked by the price rule.
    estimation = generate_quality_estimation(
        items.job,
        user,
        validation_warnings,
        dup_errors,
        dup_checked,
        sku_errors,
        sku_checked,
        price_warnings,
        validated_count,
        tested=True,
        garbage_symbols=garbage_result,
    )
    quality, accuracy = estimation

    frame = self.drop_service_columns(items.df)
    self.score_table(quality, accuracy)
    self.job_summary_table(items.job)

    name_field = tags.get("name_field", "")
    url_field = tags.get("product_url_field", "")
    unique_fields = tags.get("unique", [])
    price_field = tags.get("product_price_field", "")
    price_was_field = tags.get("product_price_was_field", "")
    self.rules_summary_table(
        frame,
        validation_warnings,
        name_field,
        url_field,
        dup_checked,
        dup_errors,
        unique_fields,
        sku_checked,
        sku_errors,
        price_field,
        price_was_field,
        price_checked,
        price_warnings,
        garbage_symbols=garbage_result,
    )

    self.scraped_fields_coverage(items.job.key, frame)
    self.coverage_by_categories(frame, tags)
def run_customized_rules(self, items, tagged_fields):
    """Run the tag-based rules on ``items.df`` and save every result.

    Each rule result is passed to ``self.save_result`` as soon as it is
    produced, in this order: was/now price comparison, uniqueness,
    duplicate items, coverage per category.
    """
    price_result = price_rules.compare_was_now(items.df, tagged_fields)
    self.save_result(price_result)

    uniqueness_result = duplicate_rules.check_uniqueness(
        items.df, tagged_fields
    )
    self.save_result(uniqueness_result)

    duplicates_result = duplicate_rules.check_items(items.df, tagged_fields)
    self.save_result(duplicates_result)

    # The category rule receives only the "category" tag values
    # (an empty list when the tag is absent).
    coverage_result = category_rules.get_coverage_per_category(
        items.df, tagged_fields.get("category", [])
    )
    self.save_result(coverage_result)
def test_check_uniqueness(
    data, tagged_fields, expected_messages, expected_err_items_count
):
    """Check that ``check_uniqueness`` yields the expected "Uniqueness" result.

    A DataFrame is built from ``data``; the expected result is created with
    the frame's length as ``items_count`` and the given error item count.
    """
    frame = pd.DataFrame(data)

    actual = check_uniqueness(frame, tagged_fields)
    expected = create_result(
        "Uniqueness",
        expected_messages,
        items_count=len(frame),
        err_items_count=expected_err_items_count,
    )

    assert actual == expected
def create_figures(self, items):
    """Compute rule counters for ``items`` and feed them to the figure methods.

    The duplicate-items, uniqueness and was/now price rules are run on
    ``items.df``. The "JSON Schema Validation" and "Garbage Symbols" results
    are taken from ``self.report.results`` when stored there and computed
    only when missing. All counters are passed to
    ``generate_quality_estimation``, after which the score, job summary,
    rules summary and coverage methods are called in that order.
    """
    tagged_fields = Tags().get(self.schema)

    # Counters from the duplicate-items rule.
    dup_items_result = duplicate_rules.check_items(items.df, tagged_fields)
    no_of_checked_duplicated_items = dup_items_result.items_count
    no_of_duplicated_items = dup_items_result.err_items_count

    # Counters from the uniqueness rule.
    dup_skus_result = duplicate_rules.check_uniqueness(items.df, tagged_fields)
    no_of_checked_skus_items = dup_skus_result.items_count
    no_of_duplicated_skus = dup_skus_result.err_items_count

    # Counters from the was/now price comparison.
    price_was_now_result = price_rules.compare_was_now(items.df, tagged_fields)
    no_of_price_warns = price_was_now_result.err_items_count
    no_of_checked_price_items = price_was_now_result.items_count

    crawlera_user = api.get_crawlera_user(items.job)

    stored_results = self.report.results

    # dict.get(key, default) evaluates ``default`` before the lookup, so the
    # full (fast=False) validation used to run even when a stored result
    # existed. Look up first and validate only when nothing is stored.
    validation_result = stored_results.get("JSON Schema Validation")
    if validation_result is None:
        validation_result = schema_rules.validate(
            self.schema, raw_items=items.raw, fast=False
        )
    validation_errors = validation_result.get_errors_count()

    # Same lazy fallback for the garbage symbols rule.
    garbage_symbols_result = stored_results.get("Garbage Symbols")
    if garbage_symbols_result is None:
        garbage_symbols_result = garbage_symbols(items)

    quality_estimation, field_accuracy = generate_quality_estimation(
        items.job,
        crawlera_user,
        validation_errors,
        no_of_duplicated_items,
        no_of_checked_duplicated_items,
        no_of_duplicated_skus,
        no_of_checked_skus_items,
        no_of_price_warns,
        no_of_checked_price_items,
        tested=True,
        garbage_symbols=garbage_symbols_result,
    )

    self.score_table(quality_estimation, field_accuracy)
    self.job_summary_table(items.job)
    self.rules_summary_table(
        items.df,
        validation_errors,
        tagged_fields.get("name_field", ""),
        tagged_fields.get("product_url_field", ""),
        no_of_checked_duplicated_items,
        no_of_duplicated_items,
        tagged_fields.get("unique", []),
        no_of_checked_skus_items,
        no_of_duplicated_skus,
        tagged_fields.get("product_price_field", ""),
        tagged_fields.get("product_price_was_field", ""),
        no_of_checked_price_items,
        no_of_price_warns,
        garbage_symbols=garbage_symbols_result,
    )
    self.scraped_fields_coverage(items.df)
    self.coverage_by_categories(items.df, tagged_fields)