def create_figures(self, items, items_dicts):
    jf = JsonFields(self.schema)
    tagged_fields = jf.tagged
    no_of_validated_items = len(items.df.index)

    dup_items_result = duplicate_rules.check_items(items.df, tagged_fields)
    no_of_checked_duplicated_items = dup_items_result.items_count
    no_of_duplicated_items = dup_items_result.err_items_count

    dup_skus_result = duplicate_rules.check_uniqueness(items.df, tagged_fields)
    no_of_checked_skus_items = dup_skus_result.items_count
    no_of_duplicated_skus = dup_skus_result.err_items_count

    price_was_now_result = price_rules.compare_was_now(items.df, tagged_fields)
    no_of_price_warns = price_was_now_result.err_items_count
    no_of_checked_price_items = price_was_now_result.items_count

    garbage_symbols_result = garbage_symbols(items)
    crawlera_user = api.get_crawlera_user(items.job)
    no_of_validation_warnings = self.report.results.get(
        "JSON Schema Validation"
    ).get_errors_count()

    quality_estimation, field_accuracy = generate_quality_estimation(
        items.job,
        crawlera_user,
        no_of_validation_warnings,
        no_of_duplicated_items,
        no_of_checked_duplicated_items,
        no_of_duplicated_skus,
        no_of_checked_skus_items,
        no_of_price_warns,
        no_of_validated_items,
        tested=True,
        garbage_symbols=garbage_symbols_result,
    )

    cleaned_df = self.drop_service_columns(items.df)

    self.score_table(quality_estimation, field_accuracy)
    self.job_summary_table(items.job)
    self.rules_summary_table(
        cleaned_df,
        no_of_validation_warnings,
        tagged_fields.get("name_field", ""),
        tagged_fields.get("product_url_field", ""),
        no_of_checked_duplicated_items,
        no_of_duplicated_items,
        tagged_fields.get("unique", []),
        no_of_checked_skus_items,
        no_of_duplicated_skus,
        tagged_fields.get("product_price_field", ""),
        tagged_fields.get("product_price_was_field", ""),
        no_of_checked_price_items,
        no_of_price_warns,
        garbage_symbols=garbage_symbols_result,
    )
    self.scraped_fields_coverage(items.job.key, cleaned_df)
    self.coverage_by_categories(cleaned_df, tagged_fields)
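# Sketch of the result shape the rule calls above are read against: each
# `*_result` exposes `items_count` (items checked) and `err_items_count`
# (items that failed the check). `RuleResult` below is a hypothetical
# stand-in used only for illustration, not a class from this codebase.
from dataclasses import dataclass


@dataclass
class RuleResult:
    items_count: int = 0  # how many items the rule inspected
    err_items_count: int = 0  # how many of those items violated the rule

    @property
    def err_ratio(self) -> float:
        # Share of checked items that failed, guarding against an empty check.
        return self.err_items_count / self.items_count if self.items_count else 0.0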
def create_figures(self, items: JobItems):
    dups = self.report.results.get(
        "Duplicates", duplicate_rules.find_by_tags(items.df, self.schema.tags)
    )

    price_was_now_result = price_rules.compare_was_now(items.df, self.schema.tags)
    no_of_price_warns = price_was_now_result.err_items_count
    no_of_checked_price_items = price_was_now_result.items_count

    crawlera_user = api.get_crawlera_user(items.job)
    validation_errors = self.report.results.get(
        "JSON Schema Validation",
        schema_rules.validate(
            self.schema.raw, raw_items=items.raw, keys=items.df.index, fast=False
        ),
    ).get_errors_count()
    garbage_symbols_result = self.report.results.get(
        "Garbage Symbols", garbage_symbols(items.df)
    )

    quality_estimation, field_accuracy = generate_quality_estimation(
        items.job,
        crawlera_user,
        validation_errors,
        dups.err_items_count,
        dups.items_count,
        no_of_price_warns,
        no_of_checked_price_items,
        tested=True,
        garbage_symbols=garbage_symbols_result,
    )

    self.score_table(quality_estimation, field_accuracy)
    self.job_summary_table(items.job)
    self.rules_summary_table(
        items.df,
        validation_errors,
        self.schema.tags.get("name_field", ""),
        self.schema.tags.get("product_url_field", ""),
        dups.err_items_count,
        dups.items_count,
        self.schema.tags.get("product_price_field", ""),
        self.schema.tags.get("product_price_was_field", ""),
        no_of_checked_price_items,
        no_of_price_warns,
        garbage_symbols=garbage_symbols_result,
    )
    self.scraped_fields_coverage(items.df)
    self.coverage_by_categories(items.df, self.schema.tags)
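# Note on the `self.report.results.get(name, fallback)` calls above: `dict.get`
# evaluates its default argument eagerly, so the fallback rule runs even when a
# result for that name is already cached. A minimal lazy variant is sketched
# below; `get_or_compute` is a hypothetical helper, not part of this codebase,
# and whether the eager behaviour matters here is an assumption.
from typing import Any, Callable, Dict


def get_or_compute(
    results: Dict[str, Any], name: str, compute: Callable[[], Any]
) -> Any:
    # Call `compute` only when no result is stored under `name`.
    if name in results:
        return results[name]
    return compute()


# Usage sketch:
#     dups = get_or_compute(
#         self.report.results,
#         "Duplicates",
#         lambda: duplicate_rules.find_by_tags(items.df, self.schema.tags),
#     )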
def create_figures(self, items):
    tagged_fields = Tags().get(self.schema)

    dup_items_result = duplicate_rules.check_items(items.df, tagged_fields)
    no_of_checked_duplicated_items = dup_items_result.items_count
    no_of_duplicated_items = dup_items_result.err_items_count

    dup_skus_result = duplicate_rules.check_uniqueness(items.df, tagged_fields)
    no_of_checked_skus_items = dup_skus_result.items_count
    no_of_duplicated_skus = dup_skus_result.err_items_count

    price_was_now_result = price_rules.compare_was_now(items.df, tagged_fields)
    no_of_price_warns = price_was_now_result.err_items_count
    no_of_checked_price_items = price_was_now_result.items_count

    crawlera_user = api.get_crawlera_user(items.job)
    validation_errors = self.report.results.get(
        "JSON Schema Validation",
        schema_rules.validate(self.schema, raw_items=items.raw, fast=False),
    ).get_errors_count()
    garbage_symbols_result = self.report.results.get(
        "Garbage Symbols", garbage_symbols(items)
    )

    quality_estimation, field_accuracy = generate_quality_estimation(
        items.job,
        crawlera_user,
        validation_errors,
        no_of_duplicated_items,
        no_of_checked_duplicated_items,
        no_of_duplicated_skus,
        no_of_checked_skus_items,
        no_of_price_warns,
        no_of_checked_price_items,
        tested=True,
        garbage_symbols=garbage_symbols_result,
    )

    self.score_table(quality_estimation, field_accuracy)
    self.job_summary_table(items.job)
    self.rules_summary_table(
        items.df,
        validation_errors,
        tagged_fields.get("name_field", ""),
        tagged_fields.get("product_url_field", ""),
        no_of_checked_duplicated_items,
        no_of_duplicated_items,
        tagged_fields.get("unique", []),
        no_of_checked_skus_items,
        no_of_duplicated_skus,
        tagged_fields.get("product_price_field", ""),
        tagged_fields.get("product_price_was_field", ""),
        no_of_checked_price_items,
        no_of_price_warns,
        garbage_symbols=garbage_symbols_result,
    )
    self.scraped_fields_coverage(items.df)
    self.coverage_by_categories(items.df, tagged_fields)
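# The `tagged_fields.get(...)` lookups above assume a plain mapping from tag
# name to the tagged column(s). An illustrative value (field names invented
# for the example) could look like:
EXAMPLE_TAGGED_FIELDS = {
    "name_field": "title",
    "product_url_field": "url",
    "product_price_field": "price",
    "product_price_was_field": "price_was",
    "unique": ["sku"],  # a list, since several columns can be tagged as unique
}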
def job_summary_table(job) -> go.FigureWidget:
    # Gather job metadata and stats through the API wrapper.
    job_url = f"{SH_URL}/{job.key}"
    job_state = api.get_job_state(job)
    job_close_reason = api.get_job_close_reason(job)
    no_of_scraped_items = api.get_items_count(job)
    no_of_errors = api.get_errors_count(job)
    job_runtime = api.get_runtime_s(job) / 1000
    run_time = helpers.ms_to_time(job_runtime)
    crawling_speed = round(job_runtime / 60 / no_of_scraped_items, 3)
    request_success_ratio = round(
        api.get_requests_count(job) / float(no_of_scraped_items), 2
    )
    max_memusage = api.get_max_memusage(job)
    response_status_count = api.get_response_status_count(job)
    crawlera_stat_value = api.get_crawlera_user(job)
    if not crawlera_stat_value:
        crawlera_stat_value = "Not Used"

    # Two parallel columns: stat names and their values.
    job_stats_values = [
        "Job URL",
        "Spider State",
        "Spider Close Reason",
        "Number of Scraped Items",
        "Number of Errors",
        "Runtime",
        "Request Success Ratio [requests/scraped items]",
        "Crawling Speed [items/min]",
        "Crawlera user",
        "Max Memory Usage [Bytes]",
        "Response Status Count",
    ]
    stats_values = [
        f'<a href="{job_url}">{job_url}</a>',
        job_state,
        job_close_reason,
        no_of_scraped_items,
        no_of_errors,
        run_time,
        request_success_ratio,
        crawling_speed,
        crawlera_stat_value,
        max_memusage,
        f"200: {response_status_count[0]}<br>"
        f"301: {response_status_count[1]}<br>"
        f"404: {response_status_count[2]}<br>"
        f"503: {response_status_count[3]}<br>",
    ]

    trace = go.Table(
        columnorder=[1, 2],
        columnwidth=[300, 200],
        header=dict(
            values=["<b>Job Stat</b>", "<b>Stat Value</b>"],
            fill=dict(color="gray"),
            align=["left"] * 5,
            font=dict(color="black", size=14),
            height=30,
        ),
        cells=dict(
            values=[job_stats_values, stats_values],
            fill=dict(color="lightgrey"),
            font=dict(color="black", size=12),
            height=25,
            align=["left"] * 5,
        ),
    )
    spider = job.metadata.get("spider")
    layout = go.Layout(
        title=f"Summary for spider {spider}",
        autosize=True,
        margin=dict(t=40, b=25, l=0, r=0),
        height=445,
    )
    return go.FigureWidget(data=[trace], layout=layout)
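# Minimal, self-contained sketch of the same two-column table layout with
# placeholder values, handy for checking styling without live job data. The
# stats shown are invented for the example.
import plotly.graph_objs as go


def example_summary_table() -> go.FigureWidget:
    trace = go.Table(
        columnorder=[1, 2],
        columnwidth=[300, 200],
        header=dict(values=["<b>Job Stat</b>", "<b>Stat Value</b>"]),
        cells=dict(
            values=[["Number of Scraped Items", "Runtime"], [1000, "0:10:00"]]
        ),
    )
    layout = go.Layout(title="Summary for spider example", height=300)
    return go.FigureWidget(data=[trace], layout=layout)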