Example #1
def get_difference(source_job: Job,
                   target_job: Job,
                   err_thr: float = 0.10,
                   warn_thr: float = 0.05) -> Result:
    """Get difference between jobs coverages. The coverage is job fields counts
    divided on the job size.

    Args:
        source_job: a base job, the difference is calculated from it
        target_job: a job to compare
        err_thr: a threshold for errors
        warn_thr: a threshold for warnings

    Returns:
        A Result instance with huge dif and stats with fields counts coverage and dif
    """
    result = Result("Coverage Difference")
    f_counts = (pd.DataFrame({
        source_job.key: api.get_counts(source_job),
        target_job.key: api.get_counts(target_job),
    }).drop(index=["_type"]).fillna(0).sort_values(by=[source_job.key],
                                                   kind="mergesort"))
    f_counts[source_job.key] = f_counts[source_job.key].divide(
        api.get_items_count(source_job))
    f_counts[target_job.key] = f_counts[target_job.key].divide(
        api.get_items_count(target_job))
    f_counts.name = "Coverage from job stats fields counts"
    result.stats.append(f_counts)

    coverage_difs = f_counts[source_job.key] - f_counts[target_job.key]
    coverage_difs = coverage_difs[coverage_difs.abs() > warn_thr].sort_values(
        kind="mergesort")
    coverage_difs.name = f"Coverage difference more than {warn_thr:.0%}"
    if not coverage_difs.empty:
        result.stats.append(coverage_difs)

    errs = coverage_difs[coverage_difs.abs() > err_thr]
    if not errs.empty:
        result.add_error(
            f"The difference is greater than {err_thr:.0%} for {len(errs)} field(s)"
        )
    warns = coverage_difs[(coverage_difs.abs() > warn_thr)
                          & (coverage_difs.abs() <= err_thr)]
    if not warns.empty:
        result.add_warning(
            f"The difference is between {warn_thr:.0%} and {err_thr:.0%} "
            f"for {len(warns)} field(s)")
    return result
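A minimal usage sketch (the api module and Job handles come from the surrounding project; the job keys below are hypothetical):

source = api.get_job("123/45/6")  # hypothetical job key
target = api.get_job("123/45/7")  # hypothetical job key
result = get_difference(source, target, err_thr=0.10, warn_thr=0.05)
for stat in result.stats:  # the DataFrame/Series appended above
    print(stat.name)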
Example #2
def create_json_schema(source_key: str,
                       item_numbers: Optional[List[int]] = None
                       ) -> Optional[dict]:
    client = ScrapinghubClient()
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
    elif helpers.is_job_key(source_key):
        job = client.get_job(source_key)
        items_count = api.get_items_count(job)
        store = job.items
    else:
        logger.error(f"{source_key} is not a job or collection key")
        return

    if items_count == 0:
        logger.error(f"{source_key} does not have any items")
        return

    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        item_numbers.sort()
        if item_numbers[-1] >= items_count or item_numbers[0] < 0:
            logger.error(item_n_err.format(item_numbers[-1], items_count - 1))
            return
    else:
        item_numbers = set_item_no(items_count)

    samples = []
    for n in item_numbers:
        items = api.get_items(source_key, start_index=n, count=1)
        samples.append(items[0])

    return infer_schema(samples)
Example #3
def create_json_schema(source_key: str,
                       item_numbers: Optional[List[int]] = None) -> dict:
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
    elif helpers.is_job_key(source_key):
        job = ScrapinghubClient().get_job(source_key)
        items_count = api.get_items_count(job)
    else:
        raise ValueError(
            f"'{source_key}' is not a valid job or collection key")

    if items_count == 0:
        raise ValueError(f"'{source_key}' does not have any items")

    item_n_err = "{} is a bad item number, choose numbers between 0 and {}"
    if item_numbers:
        item_numbers.sort()
        if item_numbers[-1] >= items_count or item_numbers[0] < 0:
            raise ValueError(
                item_n_err.format(item_numbers[-1], items_count - 1))
    else:
        item_numbers = set_item_no(items_count)

    samples = []
    for n in item_numbers:
        items = api.get_items(source_key, start_index=n, count=1, p_bar=None)
        samples.append(items[0])

    return infer_schema(samples)
Example #4
def create_json_schema(source_key: str,
                       items_numbers: Optional[List[int]] = None) -> RawSchema:
    """Create schema based on sampled `source_key` items."""
    if helpers.is_collection_key(source_key):
        store = api.get_collection(source_key)
        items_count = store.count()
        start_mask = ""
    elif helpers.is_job_key(source_key):
        items_count = api.get_items_count(api.get_job(source_key))
        start_mask = f"{source_key}/"
    else:
        raise ValueError(
            f"'{source_key}' is not a valid job or collection key")

    if items_count == 0:
        raise ValueError(f"'{source_key}' does not have any items")

    items_numbers = items_numbers or set_item_no(items_count)
    if max(items_numbers) >= items_count or min(items_numbers) < 0:
        raise ValueError(
            f"Expected values between 0 and {items_count - 1}, "
            f"got '{items_numbers}'")

    samples = []
    for n in items_numbers:
        item = api.get_items(source_key,
                             count=1,
                             start_index=n,
                             start=f"{start_mask}{n}",
                             p_bar=None)[0]
        item.pop("_type", None)
        item.pop("_key", None)
        samples.append(item)

    return infer_schema(samples)
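set_item_no is defined elsewhere in the project. A plausible sketch, assuming it picks a small random sample of item indices (the sample size of 4 is a guess):

import random
from typing import List


def set_item_no(items_count: int, sample_size: int = 4) -> List[int]:
    # Up to sample_size distinct random indices in [0, items_count).
    return random.sample(range(items_count), min(sample_size, items_count))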
Example #5
def compare_fields_counts(source_job, target_job):
    """Compare the jobs' field coverage (each field's count relative to the
    job's items count).

    Args:
        source_job: the base job; the difference is calculated from it
        target_job: the job to compare against

    Returns:
        A Result instance
    """
    source_items_count = get_items_count(source_job)
    target_items_count = get_items_count(target_job)
    result = Result("Fields Counts")

    source_fields = pd.DataFrame(
        {"Count1": source_job.items.stats().get("counts", None)})
    target_fields = pd.DataFrame(
        {"Count2": target_job.items.stats().get("counts", None)})
    fields = pd.concat([source_fields, target_fields], axis=1,
                       sort=True).fillna(0)
    fields["Difference, %"] = fields.apply(
        lambda row: ratio_diff(row["Count1"] / source_items_count, row[
            "Count2"] / target_items_count) * 100,
        axis=1,
    )
    fields["Difference, %"] = fields["Difference, %"].astype(int)
    fields.sort_values(by=["Difference, %"], ascending=False)

    err_diffs = fields[fields["Difference, %"] > 10]
    if not err_diffs.empty:
        result.add_error(
            f"Coverage difference is greater than 10% for "
            f"{len(err_diffs)} field(s)",
            err_diffs.to_string(columns=["Difference, %"]),
        )

    warn_diffs = fields[(fields["Difference, %"] > 5)
                        & (fields["Difference, %"] <= 10)]
    if not warn_diffs.empty:
        outcome_msg = (f"Coverage difference is between 5% and 10% for "
                       f"{len(warn_diffs)} field(s)")
        result.add_warning(outcome_msg,
                           warn_diffs.to_string(columns=["Difference, %"]))

    return result
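helpers.ratio_diff is also external. The callers in these examples only rely on it returning a non-negative fraction (0.07 reads as a 7% difference), so a plausible sketch is a relative difference against the larger value:

def ratio_diff(source: float, target: float) -> float:
    # Assumed implementation: relative difference as a fraction of the
    # larger value; the real helper may round or normalize differently.
    if source == target:
        return 0.0
    return abs(source - target) / max(source, target)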
Example #6
def compare_response_ratio(source_job: Job, target_job: Job) -> Result:
    """Compare request with response per item ratio"""
    s_ratio = round(
        api.get_requests_count(source_job) / api.get_items_count(source_job),
        2)
    t_ratio = round(
        api.get_requests_count(target_job) / api.get_items_count(target_job),
        2)

    response_ratio_diff = helpers.ratio_diff(s_ratio, t_ratio)
    msg = f"Difference is {response_ratio_diff:.2%} - {s_ratio} and {t_ratio}"

    result = Result("Compare Responses Per Item Ratio")
    if response_ratio_diff > 0.2:
        result.add_error(msg)
    elif response_ratio_diff > 0.1:
        result.add_warning(msg)
    return result
Example #7
def compare_number_of_scraped_items(source_job: Job, target_job: Job) -> Result:
    items_count1 = api.get_items_count(source_job)
    items_count2 = api.get_items_count(target_job)
    diff = helpers.ratio_diff(items_count1, items_count2)
    result = Result("Total Scraped Items")
    if 0 <= diff < 0.05:
        if diff == 0:
            msg = "Same number of items"
        else:
            msg = f"Almost the same number of items - {items_count1} and {items_count2}"
        result.add_info(msg)
    else:
        msg = f"{items_count1} differs from {items_count2} on {diff * 100}%"
        if 0.05 <= diff < 0.10:
            result.add_warning(msg)
        elif diff >= 0.10:
            result.add_error(msg)
    return result
Example #8
def check_response_ratio(job: Job) -> Result:
    requests_number = api.get_requests_count(job)
    items_count = api.get_items_count(job)
    result = Result("Responses Per Item Ratio")
    result.add_info(
        f"Number of responses / Number of scraped items - "
        f"{round(requests_number / items_count, 2)}"
    )
    return result
Example #9
def compare_number_of_scraped_items(source_job: Job,
                                    target_job: Job) -> Result:
    s_count = api.get_items_count(source_job)
    t_count = api.get_items_count(target_job)
    diff = helpers.ratio_diff(s_count, t_count)
    result = Result("Total Scraped Items")
    if 0 <= diff < 0.05:
        if diff == 0:
            msg = "Same number of items"
        else:
            msg = f"Almost the same number of items - {s_count} and {t_count}"
        result.add_info(msg)
    else:
        msg = f"{s_count} differs from {t_count} on {diff:.2%}"
        if 0.05 <= diff < 0.10:
            result.add_warning(msg)
        elif diff >= 0.10:
            result.add_error(msg)
    return result
Example #10
def compare_response_ratio(source_job: Job, target_job: Job) -> Result:
    """Compare request with response per item ratio"""
    items_count1 = api.get_items_count(source_job)
    items_count2 = api.get_items_count(target_job)

    source_ratio = round(api.get_requests_count(source_job) / items_count1, 2)
    target_ratio = round(api.get_requests_count(target_job) / items_count2, 2)

    response_ratio_diff = helpers.ratio_diff(source_ratio, target_ratio)
    msg = "Difference is {}% - {} and {}".format(
        response_ratio_diff * 100, source_ratio, target_ratio
    )

    result = Result("Compare Responses Per Item Ratio")
    if response_ratio_diff > 0.2:
        result.add_error(msg)
    elif response_ratio_diff > 0.1:
        result.add_warning(msg)
    return result
Example #11
def test_get_items_count(metadata, stats, expected_count):
    assert api.get_items_count(Job(metadata=metadata,
                                   stats=stats)) == expected_count
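The metadata/stats/expected_count triple suggests a parametrized test; a decorator like the following could feed it. The dictionary shapes are placeholders, the real ones depend on the project's Job mock:

@pytest.mark.parametrize(
    "metadata, stats, expected_count",
    [
        # Placeholder case: a finished job whose stats report 10 items.
        ({"state": "finished"}, {"itemsCount": 10}, 10),
    ],
)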
Example #12
def generate_quality_estimation(job, crawlera_user, no_of_validation_warnings,
                                no_of_duplicated_items,
                                checked_dup_items_count, no_of_duplicated_skus,
                                no_of_checked_skus_items, no_of_price_warns,
                                no_of_checked_price_items, tested, **kwargs):
    no_of_scraped_items = api.get_items_count(job)
    no_of_errors = api.get_errors_count(job)

    job_state = api.get_job_state(job)
    job_close_reason = api.get_job_close_reason(job)
    response_status_count = api.get_response_status_count(job)

    adherence_to_schema_percent = float(
        get_adherence_to_schema_percent(no_of_validation_warnings,
                                        no_of_scraped_items))
    duplicated_items_percent = float(
        get_duplicated_items_percent(no_of_duplicated_items,
                                     no_of_scraped_items))
    duplicated_skus_percent = float(
        get_duplicated_skus_percent(no_of_duplicated_skus,
                                    no_of_scraped_items))

    crawlera_incapsula_percent = float(
        get_crawlera_incapsula_percent(crawlera_user))

    no_of_errors_percent = float(get_errors_count_percent(no_of_errors))
    price_was_price_now_comparison_percent = float(
        get_price_was_price_now_comparison_percent(no_of_price_warns,
                                                   no_of_scraped_items))
    outcome_percent = float(get_outcome_percent(job_state, job_close_reason))
    response_status_count_percent = float(
        get_response_status_count_percent(response_status_count))
    tested_percent = float(get_tested_percent(tested))

    if all([
            checked_dup_items_count == 0,
            no_of_checked_skus_items == 0,
            no_of_checked_price_items == 0,
    ]):
        quality_estimation = (adherence_to_schema_percent * 60 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)
    elif checked_dup_items_count == 0 and no_of_checked_skus_items == 0:
        quality_estimation = (adherence_to_schema_percent * 55 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              price_was_price_now_comparison_percent * 5 / 100
                              + outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)
    elif checked_dup_items_count == 0 and no_of_checked_price_items == 0:
        quality_estimation = (adherence_to_schema_percent * 55 / 100 +
                              duplicated_skus_percent * 5 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)
    elif no_of_checked_skus_items == 0 and no_of_checked_price_items == 0:
        quality_estimation = (adherence_to_schema_percent * 50 / 100 +
                              duplicated_items_percent * 10 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)
    elif checked_dup_items_count == 0:
        quality_estimation = (adherence_to_schema_percent * 50 / 100 +
                              duplicated_skus_percent * 5 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              price_was_price_now_comparison_percent * 5 / 100
                              + outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)
    elif no_of_checked_skus_items == 0:
        quality_estimation = (adherence_to_schema_percent * 45 / 100 +
                              duplicated_items_percent * 10 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              price_was_price_now_comparison_percent * 5 / 100
                              + outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)
    elif no_of_checked_price_items == 0:
        quality_estimation = (adherence_to_schema_percent * 45 / 100 +
                              duplicated_items_percent * 10 / 100 +
                              duplicated_skus_percent * 5 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)
    else:
        quality_estimation = (adherence_to_schema_percent * 40 / 100 +
                              duplicated_items_percent * 10 / 100 +
                              duplicated_skus_percent * 5 / 100 +
                              crawlera_incapsula_percent * 8 / 100 +
                              no_of_errors_percent * 5 / 100 +
                              price_was_price_now_comparison_percent * 5 / 100
                              + outcome_percent * 5 / 100 +
                              response_status_count_percent * 7 / 100 +
                              tested_percent * 15 / 100)

    field_accuracy = adherence_to_schema_percent  # full (100%) weight on schema adherence

    for rule_result in kwargs.values():
        if rule_result.err_items_count / rule_result.items_count < 0.1:
            quality_estimation = quality_estimation * 0.95
        else:
            quality_estimation = quality_estimation * 0.90

    return int(quality_estimation), int(field_accuracy)
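All eight branches above encode a single rule: every skipped check donates its weight to schema adherence (e.g. 60 = 40 + 10 + 5 + 5 when all three optional checks are skipped). A compact, behavior-equivalent sketch of that rule, offered as a reading aid rather than the project's code:

def weighted_quality(percents: dict, skipped: set) -> float:
    # Base weights from the full branch; they sum to 100.
    weights = {
        "adherence_to_schema": 40, "duplicated_items": 10,
        "duplicated_skus": 5, "crawlera_incapsula": 8, "errors": 5,
        "price_was_price_now": 5, "outcome": 5,
        "response_status_count": 7, "tested": 15,
    }
    # Each skipped component folds its weight into schema adherence,
    # which reproduces every branch of the ladder above.
    for name in skipped:
        weights["adherence_to_schema"] += weights.pop(name)
    return sum(percents[k] * w / 100 for k, w in weights.items())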
Example #13
@property  # decorator assumed: the cached-attribute pattern implies a read-only property
def limit(self) -> int:
    if not self._limit:
        self._limit = api.get_items_count(self.job)
    return self._limit
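The same lazy-caching pattern in modern form, assuming Python 3.8+. Note that functools.cached_property also caches a result of 0, which the "if not self._limit" check above would recompute:

from functools import cached_property


class JobSampler:  # hypothetical host class; the original class isn't shown
    def __init__(self, job):
        self.job = job

    @cached_property
    def limit(self) -> int:
        # Computed once on first access, then stored on the instance.
        return api.get_items_count(self.job)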
Example #14
def job_summary_table(job) -> go.FigureWidget:
    job_url = f"{SH_URL}/{job.key}"
    job_state = api.get_job_state(job)
    job_close_reason = api.get_job_close_reason(job)
    no_of_scraped_items = api.get_items_count(job)
    no_of_errors = api.get_errors_count(job)

    job_runtime = api.get_runtime_s(job) / 1000
    run_time = helpers.ms_to_time(job_runtime)
    # items per minute, matching the "Crawling Speed [items/min]" label below
    crawling_speed = round(no_of_scraped_items / (job_runtime / 60), 3)

    request_success_ratio = round(
        api.get_requests_count(job) / float(no_of_scraped_items), 2)

    max_memusage = api.get_max_memusage(job)
    response_status_count = api.get_response_status_count(job)

    crawlera_stat_value = api.get_crawlera_user(job)
    if not crawlera_stat_value:
        crawlera_stat_value = "Not Used"

    job_stats_values = [
        "Job URL",
        "Spider State",
        "Spider Close Reason",
        "Number of Scraped Items",
        "Number of Errors",
        "Runtime",
        "Request Success Ratio [requests/scraped items]",
        "Crawling Speed [items/min]",
        "Crawlera user",
        "Max Memory Usage [Bytes]",
        "Response Status Count",
    ]
    stats_values = [
        '<a href="' + job_url + '">' + job_url + "</a>",
        job_state,
        job_close_reason,
        no_of_scraped_items,
        no_of_errors,
        run_time,
        request_success_ratio,
        crawling_speed,
        crawlera_stat_value,
        max_memusage,
        "200: " + str(response_status_count[0]) + "<br>" + "301: " +
        str(response_status_count[1]) + "<br>" + "404: " +
        str(response_status_count[2]) + "<br>" + "503: " +
        str(response_status_count[3]) + "<br>",
    ]

    trace = go.Table(
        columnorder=[1, 2],
        columnwidth=[300, 200],
        header=dict(
            values=["<b>Job Stat</b>", "<b>Stat Value</b>"],
            fill=dict(color="gray"),
            align=["left"] * 5,
            font=dict(color="black", size=14),
            height=30,
        ),
        cells=dict(
            values=[job_stats_values, stats_values],
            fill=dict(color="lightgrey"),
            font=dict(color="black", size=12),
            height=25,
            align=["left"] * 5,
        ),
    )
    spider = job.metadata.get("spider")
    layout = go.Layout(
        title=f"Summary for spider {spider}",
        autosize=True,
        margin=dict(t=40, b=25, l=0, r=0),
        height=445,
    )

    return go.FigureWidget(data=[trace], layout=layout)
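A minimal usage sketch, assuming a Jupyter environment where go.FigureWidget renders inline and job is a project Job handle:

fig = job_summary_table(job)
fig  # displaying the widget renders the summary table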