Exemplo n.º 1
0
 def run_general_rules(self):
     """Run the general rules on the source items and save each result.

     Two results are saved via ``self.save_result``: the garbage-symbols
     check over ``self.source_items``, and the field-coverage check over
     the items frame with the ``_type`` and ``_key`` columns dropped.
     """
     symbols_result = garbage_symbols(self.source_items)
     self.save_result(symbols_result)

     # Coverage is computed without the "_type" and "_key" columns.
     coverage_frame = self.source_items.df.drop(columns=["_type", "_key"])
     coverage_result = coverage_rules.check_fields_coverage(coverage_frame)
     self.save_result(coverage_result)
Exemplo n.º 2
0
 def run_general_rules(self):
     """Run the general rules on the source items frame and save each result.

     Saves, in order: the garbage-symbols result, the field-coverage result
     (computed without columns whose name starts with ``_``) and the
     categories result.
     """
     df = self.source_items.df
     self.save_result(garbage_symbols(df))

     # Columns prefixed with "_" are excluded from the coverage check.
     underscore_columns = df.columns[df.columns.str.startswith("_")]
     coverage_frame = df.drop(columns=underscore_columns)
     self.save_result(coverage_rules.check_fields_coverage(coverage_frame))

     self.save_result(category_rules.get_categories(df))
Exemplo n.º 3
0
def test_garbage_symbols(raw_items, expected_messages, expected_items_count):
    """Check ``garbage_symbols`` on a frame built from ``raw_items``.

    The result must equal a "Garbage Symbols" result created from
    ``expected_messages`` and ``expected_items_count``.
    """
    actual = garbage_symbols(pd.DataFrame(raw_items))
    expected = create_result(
        "Garbage Symbols", expected_messages, items_count=expected_items_count
    )
    assert_results_equal(actual, expected)
Exemplo n.º 4
0
    def create_figures(self, items, items_dicts):
        """Compute rule results for ``items`` and render the report tables.

        Runs the duplicate, uniqueness, price was/now and garbage-symbols
        rules, passes their counts to ``generate_quality_estimation`` and then
        calls this object's table and coverage methods.

        Args:
            items: items object; its ``df`` and ``job`` attributes are read.
            items_dicts: not used by this method.
        """
        tags = Tags().get(self.schema)
        frame = items.df
        validated_total = len(frame.index)

        item_dups = duplicate_rules.check_items(frame, tags)
        checked_for_dups, duplicated_total = (
            item_dups.items_count,
            item_dups.err_items_count,
        )

        sku_dups = duplicate_rules.check_uniqueness(frame, tags)
        checked_skus, duplicated_skus = (
            sku_dups.items_count,
            sku_dups.err_items_count,
        )

        price_check = price_rules.compare_was_now(frame, tags)
        price_warns, checked_prices = (
            price_check.err_items_count,
            price_check.items_count,
        )

        symbols_result = garbage_symbols(items)

        job = items.job
        crawlera_user = api.get_crawlera_user(job)
        schema_result = self.report.results.get("JSON Schema Validation")
        validation_warnings = schema_result.get_errors_count()

        quality_estimation, field_accuracy = generate_quality_estimation(
            job,
            crawlera_user,
            validation_warnings,
            duplicated_total,
            checked_for_dups,
            duplicated_skus,
            checked_skus,
            price_warns,
            validated_total,
            tested=True,
            garbage_symbols=symbols_result,
        )

        # The tables below work on the frame returned by drop_service_columns.
        cleaned = self.drop_service_columns(frame)

        self.score_table(quality_estimation, field_accuracy)
        self.job_summary_table(job)

        name_field = tags.get("name_field", "")
        url_field = tags.get("product_url_field", "")
        unique_fields = tags.get("unique", [])
        price_field = tags.get("product_price_field", "")
        price_was_field = tags.get("product_price_was_field", "")
        self.rules_summary_table(
            cleaned,
            validation_warnings,
            name_field,
            url_field,
            checked_for_dups,
            duplicated_total,
            unique_fields,
            checked_skus,
            duplicated_skus,
            price_field,
            price_was_field,
            checked_prices,
            price_warns,
            garbage_symbols=symbols_result,
        )
        self.scraped_fields_coverage(job.key, cleaned)
        self.coverage_by_categories(cleaned, tags)
Exemplo n.º 5
0
def test_garbage_symbols(get_job_items, expected_messages,
                         expected_items_count, expected_err_items_count):
    """Check that ``garbage_symbols(get_job_items)`` equals the expected result.

    The expected "Garbage Symbols" result is created from the given
    messages, items count and error items count.
    """
    actual = garbage_symbols(get_job_items)
    expected = create_result(
        "Garbage Symbols",
        expected_messages,
        items_count=expected_items_count,
        err_items_count=expected_err_items_count,
    )
    assert actual == expected
Exemplo n.º 6
0
def test_garbage_symbols(raw_items, expected_messages, expected_items_count,
                         expected_err_items_count):
    """Check ``garbage_symbols`` on a frame built from ``raw_items``.

    The result must compare equal to a "Garbage Symbols" result created from
    the expected messages, items count and error items count.
    """
    items_frame = pd.DataFrame(raw_items)
    actual = garbage_symbols(items_frame)
    expected = create_result(
        "Garbage Symbols",
        expected_messages,
        items_count=expected_items_count,
        err_items_count=expected_err_items_count,
    )
    assert actual == expected
Exemplo n.º 7
0
 def run_general_rules(self):
     """Run the general rules on the source items frame and save each result.

     Saves, in order: the garbage-symbols result, the field-coverage result
     (computed without columns whose name starts with ``_``) and the
     categories result. When this object has a truthy ``uniques`` attribute,
     the ``duplicate_rules.find_by`` result for it is saved as well.
     """
     df = self.source_items.df
     self.save_result(garbage_symbols(df))

     # Columns prefixed with "_" are excluded from the coverage check.
     underscore_columns = df.columns[df.columns.str.startswith("_")]
     coverage_frame = df.drop(columns=underscore_columns)
     self.save_result(coverage_rules.check_fields_coverage(coverage_frame))

     self.save_result(category_rules.get_categories(df))

     # "uniques" is optional: skipped when missing or empty.
     uniques = getattr(self, "uniques", None)
     if not uniques:
         return
     self.save_result(duplicate_rules.find_by(df, uniques))
Exemplo n.º 8
0
    def create_figures(self, items: JobItems):
        """Compute quality figures for ``items`` and render the report tables.

        Results stored in ``self.report.results`` under "Duplicates",
        "JSON Schema Validation" and "Garbage Symbols" are reused; the
        corresponding rule is executed only when no result is stored. The
        price was/now comparison is always computed.

        Args:
            items: job items; ``df``, ``raw`` and ``job`` are read from it.
        """
        stored = self.report.results
        # Python evaluates the fallback argument of ``.get(key, fallback)``
        # before the call, so passing the rule call directly would run every
        # rule even when its result is already stored. Look up with a
        # sentinel first and compute only on a miss.
        absent = object()

        dups = stored.get("Duplicates", absent)
        if dups is absent:
            dups = duplicate_rules.find_by_tags(items.df, self.schema.tags)

        price_was_now_result = price_rules.compare_was_now(items.df, self.schema.tags)
        no_of_price_warns = price_was_now_result.err_items_count
        no_of_checked_price_items = price_was_now_result.items_count

        crawlera_user = api.get_crawlera_user(items.job)

        validation_result = stored.get("JSON Schema Validation", absent)
        if validation_result is absent:
            validation_result = schema_rules.validate(
                self.schema.raw, raw_items=items.raw, keys=items.df.index, fast=False
            )
        validation_errors = validation_result.get_errors_count()

        garbage_symbols_result = stored.get("Garbage Symbols", absent)
        if garbage_symbols_result is absent:
            garbage_symbols_result = garbage_symbols(items.df)

        quality_estimation, field_accuracy = generate_quality_estimation(
            items.job,
            crawlera_user,
            validation_errors,
            dups.err_items_count,
            dups.items_count,
            no_of_price_warns,
            no_of_checked_price_items,
            tested=True,
            garbage_symbols=garbage_symbols_result,
        )

        self.score_table(quality_estimation, field_accuracy)
        self.job_summary_table(items.job)
        self.rules_summary_table(
            items.df,
            validation_errors,
            self.schema.tags.get("name_field", ""),
            self.schema.tags.get("product_url_field", ""),
            dups.err_items_count,
            dups.items_count,
            self.schema.tags.get("product_price_field", ""),
            self.schema.tags.get("product_price_was_field", ""),
            no_of_checked_price_items,
            no_of_price_warns,
            garbage_symbols=garbage_symbols_result,
        )
        self.scraped_fields_coverage(items.df)
        self.coverage_by_categories(items.df, self.schema.tags)
Exemplo n.º 9
0
    def create_figures(self, items):
        """Compute quality figures for ``items`` and render the report tables.

        Results stored in ``self.report.results`` under
        "JSON Schema Validation" and "Garbage Symbols" are reused; the
        corresponding rule is executed only when no result is stored. The
        duplicate, uniqueness and price was/now rules are always computed.

        Args:
            items: items object; ``df``, ``raw`` and ``job`` are read from it.
        """
        tagged_fields = Tags().get(self.schema)

        dup_items_result = duplicate_rules.check_items(items.df, tagged_fields)
        no_of_checked_duplicated_items = dup_items_result.items_count
        no_of_duplicated_items = dup_items_result.err_items_count

        dup_skus_result = duplicate_rules.check_uniqueness(
            items.df, tagged_fields)
        no_of_checked_skus_items = dup_skus_result.items_count
        no_of_duplicated_skus = dup_skus_result.err_items_count

        price_was_now_result = price_rules.compare_was_now(
            items.df, tagged_fields)
        no_of_price_warns = price_was_now_result.err_items_count
        no_of_checked_price_items = price_was_now_result.items_count

        crawlera_user = api.get_crawlera_user(items.job)

        stored = self.report.results
        # Python evaluates the fallback argument of ``.get(key, fallback)``
        # before the call, so passing the rule call directly would run the
        # rule even when its result is already stored. Look up with a
        # sentinel first and compute only on a miss.
        absent = object()

        validation_result = stored.get("JSON Schema Validation", absent)
        if validation_result is absent:
            validation_result = schema_rules.validate(
                self.schema, raw_items=items.raw, fast=False)
        validation_errors = validation_result.get_errors_count()

        garbage_symbols_result = stored.get("Garbage Symbols", absent)
        if garbage_symbols_result is absent:
            garbage_symbols_result = garbage_symbols(items)

        quality_estimation, field_accuracy = generate_quality_estimation(
            items.job,
            crawlera_user,
            validation_errors,
            no_of_duplicated_items,
            no_of_checked_duplicated_items,
            no_of_duplicated_skus,
            no_of_checked_skus_items,
            no_of_price_warns,
            no_of_checked_price_items,
            tested=True,
            garbage_symbols=garbage_symbols_result,
        )

        self.score_table(quality_estimation, field_accuracy)
        self.job_summary_table(items.job)
        self.rules_summary_table(
            items.df,
            validation_errors,
            tagged_fields.get("name_field", ""),
            tagged_fields.get("product_url_field", ""),
            no_of_checked_duplicated_items,
            no_of_duplicated_items,
            tagged_fields.get("unique", []),
            no_of_checked_skus_items,
            no_of_duplicated_skus,
            tagged_fields.get("product_price_field", ""),
            tagged_fields.get("product_price_was_field", ""),
            no_of_checked_price_items,
            no_of_price_warns,
            garbage_symbols=garbage_symbols_result,
        )
        self.scraped_fields_coverage(items.df)
        self.coverage_by_categories(items.df, tagged_fields)