Пример #1
0
def find_labels(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Collect label insights from regexes, keywords and logo annotations.

    Three sources are combined, in this order:

    1. every regex registered in ``LABELS_REGEX`` (predictor ``"regex"``);
    2. the keyword processor obtained from ``LABEL_KEYWORD_PROCESSOR_STORE``,
       through ``extract_label_flashtext``;
    3. when ``content`` is an ``OCRResult``, its logo annotations whose
       description is a key of ``LOGO_ANNOTATION_LABELS`` (predictor
       ``"google-cloud-vision"``, never processed automatically).

    :param content: OCR result or raw text to analyse
    :return: the label insights found, possibly an empty list
    """
    found = []

    # 1. Regex-based detection.
    for default_tag, label_regexes in LABELS_REGEX.items():
        for label_regex in label_regexes:
            regex_text = get_text(content, label_regex)

            if not regex_text:
                continue

            for hit in label_regex.regex.finditer(regex_text):
                if not label_regex.processing_func:
                    # No post-processing: the dictionary key is the tag.
                    tag = default_tag
                else:
                    tag = label_regex.processing_func(hit)

                    # The processing function rejected this match.
                    if tag is None:
                        continue

                regex_insight = RawInsight(
                    type=InsightType.label,
                    value_tag=tag,
                    predictor="regex",
                    data={"text": hit.group(), "notify": label_regex.notify},
                )
                found.append(regex_insight)

    # 2. Keyword-based detection on the default text.
    keyword_processor = LABEL_KEYWORD_PROCESSOR_STORE.get()
    full_text = get_text(content)
    found += extract_label_flashtext(keyword_processor, full_text)

    # 3. Logo annotations only exist on OCR results, not on plain strings.
    if not isinstance(content, OCRResult):
        return found

    for logo in content.logo_annotations:
        if logo.description not in LOGO_ANNOTATION_LABELS:
            continue

        logo_insight = RawInsight(
            type=InsightType.label,
            value_tag=LOGO_ANNOTATION_LABELS[logo.description],
            automatic_processing=False,
            predictor="google-cloud-vision",
            data={"confidence": logo.score},
        )
        found.append(logo_insight)

    return found
Пример #2
0
def find_nutrient_mentions(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Locate nutrient mentions using ``NUTRIENT_MENTIONS_REGEX``.

    Each match is stored under its regex code as a dict holding the matched
    text (``raw``), its ``[start, end]`` span and a ``languages`` list. The
    languages come from the name of the first named group that took part in
    the match: the part after the last underscore is dropped and the rest is
    split on underscores.

    :param content: OCR result or raw text to analyse
    :return: an empty list when nothing matched, otherwise a single
        ``nutrient_mention`` insight carrying all the mentions
    """
    mentions: JSONType = {}

    for code, mention_regex in NUTRIENT_MENTIONS_REGEX.items():
        source_text = get_text(content, mention_regex)

        if not source_text:
            continue

        for hit in mention_regex.regex.finditer(source_text):
            # Names of the named groups that actually captured something.
            matched_names = [
                name
                for name, captured in hit.groupdict().items()
                if captured is not None
            ]

            if matched_names:
                langs = matched_names[0].rsplit("_", maxsplit=1)[0].split("_")
            else:
                langs = []

            mentions.setdefault(code, []).append(
                {
                    "raw": hit.group(0),
                    "span": list(hit.span()),
                    "languages": langs,
                }
            )

    if not mentions:
        return []

    insight = RawInsight(
        type=InsightType.nutrient_mention,
        data={"mentions": mentions, "version": EXTRACTOR_VERSION},
    )
    return [insight]
Пример #3
0
def find_nutrient_values(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract nutrient values using ``NUTRIENT_VALUES_REGEX``.

    For every match, group 2 is taken as the value (with ``,`` replaced by
    ``.``) and group 3 as the unit. Matches are grouped by regex code.

    :param content: OCR result or raw text to analyse
    :return: an empty list when nothing matched, otherwise a single
        ``nutrient`` insight carrying all the extracted values
    """
    extracted: JSONType = {}

    for code, value_regex in NUTRIENT_VALUES_REGEX.items():
        source_text = get_text(content, value_regex)

        if not source_text:
            continue

        for hit in value_regex.regex.finditer(source_text):
            entry = {
                "raw": hit.group(0),
                "nutrient": code,
                # Normalise the decimal separator.
                "value": hit.group(2).replace(",", "."),
                "unit": hit.group(3),
            }
            extracted.setdefault(code, []).append(entry)

    if not extracted:
        return []

    insight = RawInsight(
        type=InsightType.nutrient,
        data={"nutrients": extracted, "version": EXTRACTOR_VERSION},
    )
    return [insight]
Пример #4
0
def find_packaging(content: Union[OCRResult, str]) -> List[Dict]:
    """Detect packaging keywords in the text of ``content``.

    The keyword processor from ``KEYWORD_PROCESSOR_STORE`` yields, for each
    hit, a ``;``-separated string of packaging names; one dict is produced
    per name, all sharing the same matched text.

    :param content: OCR result or raw text to analyse
    :return: one dict per detected packaging, possibly an empty list
    """
    text = get_text(content)

    if not text:
        return []

    found = []
    keyword_processor = KEYWORD_PROCESSOR_STORE.get()
    keyword_hits = keyword_processor.extract_keywords(text, span_info=True)

    for (joined_names, _), start, end in keyword_hits:
        # Every packaging of this hit refers to the same span of text.
        matched_text = text[start:end]
        found.extend(
            {
                "packaging_tag": get_tag(name),
                "packaging": name,
                "text": matched_text,
                "notify": True,
                "automatic_processing": True,
            }
            for name in joined_names.split(";")
        )

    return found
Пример #5
0
def find_packager_codes_regex(
        ocr_result: Union[OCRResult, str]) -> List[RawInsight]:
    """Find packager codes with the regexes of ``PACKAGER_CODE``.

    The insight value is the whole match, or the output of the regex's
    ``processing_func`` when one is defined.

    :param ocr_result: OCR result or raw text to analyse
    :return: one ``packager_code`` insight per match, possibly an empty list
    """
    found: List[RawInsight] = []

    for code_type, code_regex in PACKAGER_CODE.items():
        text = get_text(ocr_result, code_regex, code_regex.lowercase)

        if not text:
            continue

        for hit in code_regex.regex.finditer(text):
            transform = code_regex.processing_func
            code = hit.group(0) if transform is None else transform(hit)

            insight = RawInsight(
                type=InsightType.packager_code,
                value=code,
                automatic_processing=True,
                data={
                    "raw": hit.group(0),
                    "type": code_type,
                    "notify": code_regex.notify,
                },
            )
            found.append(insight)

    return found
Пример #6
0
def find_nutrient_values(content: Union[OCRResult, str]) -> List[Dict]:
    """Extract nutrient values using ``NUTRIENT_VALUES_REGEX``.

    For every match, group 2 is taken as the value (with ``,`` replaced by
    ``.``) and group 3 as the unit. Matches are grouped by regex code.

    :param content: OCR result or raw text to analyse
    :return: an empty list when nothing matched, otherwise a one-element
        list whose dict has the keys ``nutrients`` and ``notify``
    """
    by_nutrient: JSONType = {}

    for code, value_regex in NUTRIENT_VALUES_REGEX.items():
        source_text = get_text(content, value_regex)

        if not source_text:
            continue

        for hit in value_regex.regex.finditer(source_text):
            entry = {
                "raw": hit.group(0),
                "nutrient": code,
                # Normalise the decimal separator.
                "value": hit.group(2).replace(",", "."),
                "unit": hit.group(3),
            }
            by_nutrient.setdefault(code, []).append(entry)

    if by_nutrient:
        return [{"nutrients": by_nutrient, "notify": False}]

    return []
Пример #7
0
def predict_ocr_categories(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Predict categories from the text of an OCR and return insights.

    The best-scoring category is always returned. The runner-up is returned
    as well when the model "hesitates", i.e. when the scores of the two best
    categories differ by at most `HESITATION_THRESHOLD`.

    An empty list is returned when no text is available.
    """
    text = get_text(content)
    if not text:
        return []

    probabilities = Predictor(text=text).run()

    # argsort is ascending, so the best candidates sit at the end.
    ranking = np.argsort(probabilities)

    top_index = ranking[-1]
    top_proba = probabilities[top_index]

    runner_up_index = ranking[-2]
    runner_up_proba = probabilities[runner_up_index]

    predictions = [_get_raw_insight(top_proba, top_index)]

    hesitating = (top_proba - runner_up_proba) <= HESITATION_THRESHOLD
    if hesitating:
        predictions.append(_get_raw_insight(runner_up_proba, runner_up_index))

    return predictions
Пример #8
0
def find_traces(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Detect trace mentions that follow a ``TRACES_REGEX`` match.

    For each regex match (the "prompt"), the keyword processor from
    ``TRACE_KEYWORD_PROCESSOR_STORE`` is run on the 100 characters that come
    right after the match; every keyword found yields one ``trace`` insight.

    :param content: OCR result or raw text to analyse
    :return: the trace insights found, possibly an empty list
    """
    text = get_text(content, TRACES_REGEX)

    if not text:
        return []

    found = []
    keyword_processor = TRACE_KEYWORD_PROCESSOR_STORE.get()

    for prompt_match in TRACES_REGEX.regex.finditer(text):
        prompt_text = prompt_match.group()

        # Only look at a fixed-size window right after the prompt.
        window_start = prompt_match.end()
        window = text[window_start : window_start + 100]

        keyword_hits = keyword_processor.extract_keywords(window, span_info=True)

        for (tag, _), start, end in keyword_hits:
            found.append(
                RawInsight(
                    type=InsightType.trace,
                    value_tag=tag,
                    data={
                        "text": window[start:end],
                        "prompt": prompt_text,
                        "notify": False,
                    },
                )
            )

    return found
Пример #9
0
def find_stores(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Detect store names using ``STORE_REGEX``.

    For each match, the position of the first group that captured something
    is used as an index into ``SORTED_STORES`` to recover the store name.
    Matches where no group captured anything are ignored.

    :param content: OCR result or raw text to analyse
    :return: one ``store`` insight per usable match, possibly an empty list
    """
    text = get_text(content, STORE_REGEX)

    if not text:
        return []

    found = []

    for hit in STORE_REGEX.regex.finditer(text):
        # First capturing group that took part in the match, if any.
        first_captured = next(
            (
                (position, captured)
                for position, captured in enumerate(hit.groups())
                if captured is not None
            ),
            None,
        )

        if first_captured is None:
            continue

        position, matched_text = first_captured
        store_name, _ = SORTED_STORES[position]

        found.append(
            RawInsight(
                type=InsightType.store,
                value=store_name,
                value_tag=get_store_tag(store_name),
                data={
                    "text": matched_text,
                    "notify": store_name in NOTIFY_STORES
                },
            ))

    return found
Пример #10
0
def find_product_weight(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Extract product weights using ``PRODUCT_WEIGHT_REGEX``.

    Each match is handed to the regex's ``processing_func``; regexes without
    one, and matches for which it returns ``None``, are skipped. The returned
    mapping is completed with ``matcher_type``, ``priority`` and ``notify``;
    its ``text`` entry becomes the insight value and its optional
    ``automatic_processing`` entry is moved onto the insight.

    :param content: OCR result or raw text to analyse
    :return: one ``product_weight`` insight per usable match
    """
    found = []

    for matcher_type, weight_regex in PRODUCT_WEIGHT_REGEX.items():
        text = get_text(content, weight_regex)

        if not text:
            continue

        for hit in weight_regex.regex.finditer(text):
            parse = weight_regex.processing_func

            if parse is None:
                continue

            payload = parse(hit)

            if payload is None:
                continue

            payload.update(
                matcher_type=matcher_type,
                priority=weight_regex.priority,
                notify=weight_regex.notify,
            )

            # These two entries live on the insight itself, not in its data.
            weight_text = payload.pop("text")
            auto = payload.pop("automatic_processing", None)

            found.append(
                RawInsight(
                    type=InsightType.product_weight,
                    value=weight_text,
                    automatic_processing=auto,
                    data=payload,
                ))

    return found
Пример #11
0
def find_expiration_date(
    content: Union[OCRResult, str],
    *,
    min_year: int = 2015,
    max_year: int = 2025,
) -> List[Dict]:
    """Extract expiration dates using ``EXPIRATION_DATE_REGEX``.

    Each match is handed to the regex's ``processing_func``; regexes without
    one, and matches for which it returns ``None``, are skipped. Dates whose
    year falls outside ``[min_year, max_year]`` are discarded as implausible.

    The accepted window used to be hard-coded to 2015-2025; it is now
    configurable so that callers can move it forward without editing this
    function. The defaults keep the historical behaviour.

    :param content: OCR result or raw text to analyse
    :param min_year: earliest accepted year (inclusive)
    :param max_year: latest accepted year (inclusive)
    :return: one dict per accepted date with the keys ``raw`` (matched
        text), ``text`` (date formatted as ``YYYY-MM-DD``), ``type`` and
        ``notify``
    """
    # Example of a phrase introducing an expiration date (French for
    # "best before"): "À consommer de préférence avant"
    results: List[Dict] = []

    for type_, ocr_regex in EXPIRATION_DATE_REGEX.items():
        text = get_text(content, ocr_regex)

        if not text:
            continue

        for match in ocr_regex.regex.finditer(text):
            raw = match.group(0)

            if not ocr_regex.processing_func:
                continue

            date = ocr_regex.processing_func(match)

            if date is None:
                continue

            # Reject dates outside the plausible window.
            if not min_year <= date.year <= max_year:
                continue

            # Format dates according to ISO 8601
            value = date.strftime("%Y-%m-%d")

            results.append(
                {
                    "raw": raw,
                    "text": value,
                    "type": type_,
                    "notify": ocr_regex.notify,
                }
            )

    return results
Пример #12
0
def find_packaging(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Detect packaging keywords in the text of ``content``.

    The keyword processor from ``KEYWORD_PROCESSOR_STORE`` yields, for each
    hit, a ``;``-separated string of packaging names; one ``packaging``
    insight is produced per name, all sharing the same matched text.

    :param content: OCR result or raw text to analyse
    :return: one insight per detected packaging, possibly an empty list
    """
    text = get_text(content)

    if not text:
        return []

    found = []
    keyword_processor = KEYWORD_PROCESSOR_STORE.get()
    keyword_hits = keyword_processor.extract_keywords(text, span_info=True)

    for (joined_names, _), start, end in keyword_hits:
        # Every packaging of this hit refers to the same span of text.
        matched_text = text[start:end]

        for name in joined_names.split(";"):
            insight = RawInsight(
                type=InsightType.packaging,
                value_tag=get_tag(name),
                value=name,
                data={"text": matched_text, "notify": False},
                automatic_processing=True,
            )
            found.append(insight)

    return found
Пример #13
0
def find_brands(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Gather brand insights from every available source.

    When text is available, ``extract_brands`` is run with
    ``BRAND_PROCESSOR`` (``"curated-list"``) and then with
    ``TAXONOMY_BRAND_PROCESSOR`` (``"taxonomy"``). When ``content`` is an
    ``OCRResult``, the output of ``extract_brands_google_cloud_vision`` is
    appended as well.

    :param content: OCR result or raw text to analyse
    :return: the brand insights found, possibly an empty list
    """
    text = get_text(content)
    found: List[RawInsight] = []

    if text:
        text_sources = (
            (BRAND_PROCESSOR, "curated-list"),
            (TAXONOMY_BRAND_PROCESSOR, "taxonomy"),
        )
        for processor, source_name in text_sources:
            found.extend(extract_brands(processor, text, source_name))

    if isinstance(content, OCRResult):
        found.extend(extract_brands_google_cloud_vision(content))

    return found
Пример #14
0
def flag_image(content: Union[OCRResult, str]) -> List[RawInsight]:
    """Produce ``image_flag`` insights for ``content``.

    Up to three checks are performed:

    * a text check through ``extract_image_flag_flashtext``;
    * for non-string content, the ``adult`` and ``violence`` fields of the
      safe-search annotation, flagged when at least ``VERY_LIKELY``;
    * for non-string content, the label annotations: the first one whose
      description is in ``LABELS_TO_FLAG`` with a score of at least 0.6 is
      flagged, and the others are not examined.

    :param content: OCR result or raw text to analyse
    :return: the flag insights found, possibly an empty list
    """
    found: List[RawInsight] = []

    text_flag = extract_image_flag_flashtext(PROCESSOR, get_text(content))

    if text_flag is not None:
        found.append(text_flag)

    # Plain strings carry no annotations: nothing more to check.
    if isinstance(content, str):
        return found

    safe_search = content.get_safe_search_annotation()
    labels = content.get_label_annotations()

    if safe_search:
        for category in ("adult", "violence"):
            likelihood = getattr(safe_search, category)

            if likelihood >= SafeSearchAnnotationLikelihood.VERY_LIKELY:
                found.append(
                    RawInsight(
                        type=InsightType.image_flag,
                        data={
                            "type": "safe_search_annotation",
                            "label": category,
                            "likelihood": likelihood.name,
                        },
                    )
                )

    # Only the first offending label is reported.
    flagged_label = next(
        (
            annotation
            for annotation in labels
            if annotation.description in LABELS_TO_FLAG
            and annotation.score >= 0.6
        ),
        None,
    )

    if flagged_label is not None:
        found.append(
            RawInsight(
                type=InsightType.image_flag,
                data={
                    "type": "label_annotation",
                    "label": flagged_label.description.lower(),
                    "likelihood": flagged_label.score,
                },
            )
        )

    return found
Пример #15
0
def find_packager_codes(ocr_result: Union[OCRResult, str]) -> List[Dict]:
    """Find packager codes with the regexes of ``PACKAGER_CODE``.

    Only regexes that define a ``processing_func`` produce results; the
    function's output is stored under ``text`` next to the raw match.

    :param ocr_result: OCR result or raw text to analyse
    :return: one dict per match with the keys ``raw``, ``text``, ``type``
        and ``notify``; possibly an empty list
    """
    found: List[Dict] = []

    for code_type, code_regex in PACKAGER_CODE.items():
        text = get_text(ocr_result, code_regex)

        if not text:
            continue

        for hit in code_regex.regex.finditer(text):
            transform = code_regex.processing_func

            # Matches of regexes without a processing function are ignored.
            if transform is None:
                continue

            code = transform(hit)
            found.append(
                {
                    "raw": hit.group(0),
                    "text": code,
                    "type": code_type,
                    "notify": code_regex.notify,
                }
            )

    return found
Пример #16
0
def find_stores(content: Union[OCRResult, str]) -> List[Dict]:
    """Detect store names using ``STORE_REGEX``.

    For each match, the position of the first group that captured something
    is used as an index into ``SORTED_STORES`` to recover the store name.
    Matches where no group captured anything are ignored.

    :param content: OCR result or raw text to analyse
    :return: one dict per usable match with the keys ``value``,
        ``value_tag``, ``text`` and ``notify``; possibly an empty list
    """
    text = get_text(content, STORE_REGEX)

    if not text:
        return []

    found = []

    for hit in STORE_REGEX.regex.finditer(text):
        # First capturing group that took part in the match, if any.
        first_captured = next(
            (
                (position, captured)
                for position, captured in enumerate(hit.groups())
                if captured is not None
            ),
            None,
        )

        if first_captured is None:
            continue

        position, matched_text = first_captured
        store_name, _ = SORTED_STORES[position]

        found.append({
            "value": store_name,
            "value_tag": get_store_tag(store_name),
            "text": matched_text,
            "notify": store_name in NOTIFY_STORES,
        })

    return found
Пример #17
0
def find_product_weight(content: Union[OCRResult, str]) -> List[Dict]:
    """Extract product weights using ``PRODUCT_WEIGHT_REGEX``.

    Each match is handed to the regex's ``processing_func``; regexes without
    one, and matches for which it returns ``None``, are skipped. The returned
    mapping is completed with ``matcher_type``, ``priority`` and ``notify``
    before being collected.

    :param content: OCR result or raw text to analyse
    :return: one mapping per usable match, possibly an empty list
    """
    found = []

    for matcher_type, weight_regex in PRODUCT_WEIGHT_REGEX.items():
        text = get_text(content, weight_regex)

        if not text:
            continue

        for hit in weight_regex.regex.finditer(text):
            parse = weight_regex.processing_func

            if parse is None:
                continue

            payload = parse(hit)

            if payload is None:
                continue

            # Record which matcher produced the result and its settings.
            payload.update(
                matcher_type=matcher_type,
                priority=weight_regex.priority,
                notify=weight_regex.notify,
            )
            found.append(payload)

    return found
Пример #18
0
def find_packager_codes(ocr_result: Union[OCRResult, str]) -> List[RawInsight]:
    """Find packager codes using both regexes and fishing-code keywords.

    The regex results of ``find_packager_codes_regex`` come first, followed
    by the output of ``extract_fishing_code`` run with the keyword processor
    from ``FISHING_KEYWORD_PROCESSOR_STORE``.

    :param ocr_result: OCR result or raw text to analyse
    :return: the combined list of insights
    """
    found = find_packager_codes_regex(ocr_result)

    fishing_processor = FISHING_KEYWORD_PROCESSOR_STORE.get()
    full_text = get_text(ocr_result)
    found.extend(extract_fishing_code(fishing_processor, full_text))

    return found