Example #1
# Parametrized pytest test: data, tagged_fields and expected_messages are
# test parameters (presumably via @pytest.mark.parametrize), omitted here.
# `duplicates` is the arche rules module; `create_result` is a test helper.
import pandas as pd


def test_find_by_name_url(data, tagged_fields, expected_messages):
    df = pd.DataFrame(data)
    result = duplicates.find_by_name_url(df, tagged_fields)
    assert result == create_result(
        "Duplicates By **name_field, product_url_field** Tags",
        expected_messages,
        items_count=len(df),
    )
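
For context, here is a hedged sketch of calling the rule directly. The import path, DataFrame columns, and tag mapping are assumptions about typical arche inputs, not taken from this test suite; the err_items_count and items_count attributes on the returned result are the ones used in the examples below.

import pandas as pd
from arche.rules import duplicates  # assumed import path

# Two items sharing both name and URL should be reported as duplicates.
df = pd.DataFrame({
    "name": ["bag", "bag", "shoe"],
    "url": ["http://site/1", "http://site/1", "http://site/2"],
})
tagged_fields = {"name_field": ["name"], "product_url_field": ["url"]}

result = duplicates.find_by_name_url(df, tagged_fields)
print(f"{result.err_items_count} of {result.items_count} items flagged")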
Example #2
def run_customized_rules(self, items, tagged_fields):
    # Price sanity check on the fields tagged as current/previous price.
    self.save_result(price_rules.compare_was_now(items.df, tagged_fields))
    # Duplicate detection: by fields tagged "unique", then by the
    # name/product-URL pair.
    self.save_result(
        duplicate_rules.find_by_unique(items.df, tagged_fields))
    self.save_result(
        duplicate_rules.find_by_name_url(items.df, tagged_fields))
    # Coverage per category over tagged category fields plus schema enums.
    self.save_result(
        category_rules.get_coverage_per_category(
            items.df,
            tagged_fields.get("category", []) + self.schema.enums))
Example #3
    def create_figures(self, items: CloudItems):
        # Fetch each rule result from the report cache, recomputing it as
        # the fallback (see the note on eager defaults after this example).
        name_url_dups = self.report.results.get(
            "Duplicates By **name_field, product_url_field** Tags",
            duplicate_rules.find_by_name_url(items.df, self.schema.tags),
        )

        uniques = self.report.results.get(
            "Duplicates By **unique** Tag",
            duplicate_rules.find_by_unique(items.df, self.schema.tags),
        )

        # Compare "was" vs "now" prices; keep the warning and item counts
        # for the quality estimation below.
        price_was_now_result = price_rules.compare_was_now(
            items.df, self.schema.tags)
        no_of_price_warns = price_was_now_result.err_items_count
        no_of_checked_price_items = price_was_now_result.items_count

        crawlera_user = api.get_crawlera_user(items.job)

        # Count errors from a full (fast=False) JSON Schema validation
        # pass over the raw items.
        validation_errors = self.report.results.get(
            "JSON Schema Validation",
            schema_rules.validate(self.schema.raw,
                                  raw_items=items.raw,
                                  keys=items.df.index,
                                  fast=False),
        ).get_errors_count()

        garbage_symbols_result = self.report.results.get(
            "Garbage Symbols", garbage_symbols(items.df))

        # Fold the rule outcomes into an overall quality score and
        # per-field accuracy estimates.
        quality_estimation, field_accuracy = generate_quality_estimation(
            items.job,
            crawlera_user,
            validation_errors,
            name_url_dups.err_items_count,
            name_url_dups.items_count,
            uniques.err_items_count,
            uniques.items_count,
            no_of_price_warns,
            no_of_checked_price_items,
            tested=True,
            garbage_symbols=garbage_symbols_result,
        )

        # Render the summary tables and coverage figures.
        self.score_table(quality_estimation, field_accuracy)
        self.job_summary_table(items.job)
        self.rules_summary_table(
            items.df,
            validation_errors,
            self.schema.tags.get("name_field", ""),
            self.schema.tags.get("product_url_field", ""),
            name_url_dups.items_count,
            name_url_dups.err_items_count,
            self.schema.tags.get("unique", []),
            uniques.items_count,
            uniques.err_items_count,
            self.schema.tags.get("product_price_field", ""),
            self.schema.tags.get("product_price_was_field", ""),
            no_of_checked_price_items,
            no_of_price_warns,
            garbage_symbols=garbage_symbols_result,
        )
        self.scraped_fields_coverage(items.df)
        self.coverage_by_categories(items.df, self.schema.tags)
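
One wrinkle worth noting in the results.get(...) lookups above: dict.get evaluates its default argument eagerly, so each fallback rule call runs even when a cached result is returned. A minimal demonstration in plain Python:

def expensive_default():
    print("rule executed anyway")
    return "fresh result"

results = {"Garbage Symbols": "cached result"}

# The default is computed before get() looks up the key, so the print
# above fires even though the cached value is what gets returned.
value = results.get("Garbage Symbols", expensive_default())
print(value)  # -> cached result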