Example #1
def extract_brands(
    processor: KeywordProcessor,
    text: str,
    data_source_name: str,
    automatic_processing: bool,
) -> List[Prediction]:
    predictions = []

    for (brand_tag, brand), span_start, span_end in processor.extract_keywords(
            text, span_info=True):
        match_str = text[span_start:span_end]
        predictions.append(
            Prediction(
                type=PredictionType.brand,
                value=brand,
                value_tag=brand_tag,
                automatic_processing=automatic_processing,
                predictor=data_source_name,
                data={
                    "text": match_str,
                    "notify": False
                },
            ))

    return predictions
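
A minimal usage sketch for the function above, assuming `processor` is a flashtext KeywordProcessor whose clean names are (brand_tag, brand) tuples, exactly what the loop unpacks; the brand entry and text below are illustrative only.

from flashtext import KeywordProcessor

# Illustrative entry: the clean name is a (brand_tag, brand) tuple, matching
# the unpacking done by extract_brands.
processor = KeywordProcessor()
processor.add_keyword("coca cola", ("en:coca-cola", "Coca-Cola"))

predictions = extract_brands(
    processor,
    "Produced under license of Coca Cola Europe",
    data_source_name="taxonomy",
    automatic_processing=False,
)
# One Prediction with value="Coca-Cola", value_tag="en:coca-cola",
# data={"text": "Coca Cola", "notify": False}
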
Example #2
def get_image_lang(ocr_result: Union[OCRResult, str]) -> List[Prediction]:
    if isinstance(ocr_result, str):
        return []

    image_lang: Optional[Dict[str, int]] = ocr_result.get_languages()

    if image_lang is None:
        return []

    words = image_lang["words"]
    percents = {}
    for key, count in image_lang.items():
        if key == "words":
            continue

        percents[key] = count * 100 / words

    return [
        Prediction(
            type=PredictionType.image_lang,
            data={
                "count": image_lang,
                "percent": percents
            },
        )
    ]
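
For clarity, a small worked example of the percentage computation above, with a hypothetical get_languages() payload:

# Hypothetical languages payload: total word count plus per-language counts.
image_lang = {"words": 50, "en": 30, "fr": 20}
words = image_lang["words"]
percents = {k: c * 100 / words for k, c in image_lang.items() if k != "words"}
# percents == {"en": 60.0, "fr": 40.0}; the Prediction stores both the raw
# counts ("count") and the derived percentages ("percent").
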
Example #3
def find_product_weight(content: Union[OCRResult, str]) -> List[Prediction]:
    results = []

    for type_, ocr_regex in PRODUCT_WEIGHT_REGEX.items():
        text = get_text(content, ocr_regex)

        if not text:
            continue

        for match in ocr_regex.regex.finditer(text):
            if ocr_regex.processing_func is None:
                continue

            result = ocr_regex.processing_func(match)

            if result is None:
                continue

            result["matcher_type"] = type_
            result["priority"] = ocr_regex.priority
            result["notify"] = ocr_regex.notify
            value = result.pop("text")
            results.append(
                Prediction(
                    value=value,
                    type=PredictionType.product_weight,
                    automatic_processing=result["automatic_processing"],
                    data=result,
                ))

    return results
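
A hedged sketch of the contract find_product_weight relies on: each OCRRegex entry is assumed to carry a compiled regex plus a processing_func that turns a match into a dict containing at least "text" (popped as the prediction value) and "automatic_processing". The pattern and field names below are illustrative, not the real Robotoff definitions.

import re

def process_weight_match(match: re.Match) -> dict:
    # Illustrative: group(1) = numeric value, group(2) = unit.
    return {
        "text": match.group(0),          # popped and used as Prediction.value
        "value": match.group(1),
        "unit": match.group(2),
        "automatic_processing": False,   # read back by find_product_weight
    }

weight_pattern = re.compile(r"(\d+[,.]?\d*)\s?(g|kg|ml|l)\b", re.I)
result = process_weight_match(weight_pattern.search("poids net: 500 g"))
# result["text"] == "500 g", result["value"] == "500", result["unit"] == "g"
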
Example #4
    def test_import_insights_single_product(self, mocker):
        prediction_dict = {
            "barcode": DEFAULT_BARCODE,
            "type": PredictionType.category.name,
            "data": {},
        }
        prediction = Prediction(
            barcode=DEFAULT_BARCODE,
            type=PredictionType.category,
            data={},
        )
        get_product_predictions_mock = mocker.patch(
            "robotoff.insights.importer.get_product_predictions",
            return_value=[
                prediction_dict,
            ],
        )
        import_insights_mock = mocker.patch(
            "robotoff.insights.importer.InsightImporter.import_insights",
            return_value=1,
        )
        product_store = FakeProductStore()
        imported = import_insights_for_products(
            {DEFAULT_BARCODE: {PredictionType.category}},
            DEFAULT_SERVER_DOMAIN,
            automatic=True,
            product_store=product_store,
        )
        assert imported == 1
        get_product_predictions_mock.assert_called_once()
        import_insights_mock.assert_called_once_with([prediction],
                                                     DEFAULT_SERVER_DOMAIN,
                                                     True, product_store)
Example #5
def test_notify_image_flag_public(mocker, monkeypatch):
    """Test notifying a potentially sensitive public image"""
    mock = mocker.patch("robotoff.slack.http_session.post",
                        return_value=MockSlackResponse())
    monkeypatch.delenv("ROBOTOFF_SCHEME",
                       raising=False)  # force defaults to apply

    notifier = slack.SlackNotifier("")

    notifier.notify_image_flag(
        [
            Prediction(
                type=PredictionType.image_flag,
                data={
                    "text": "bad_word",
                    "type": "SENSITIVE",
                    "label": "flagged"
                },
            )
        ],
        "/source_image",
        "123",
    )

    mock.assert_called_once_with(
        notifier.POST_MESSAGE_URL,
        data=PartialRequestMatcher(
            f"type: SENSITIVE\nlabel: *flagged*, match: bad_word\n\n <{settings.OFF_IMAGE_BASE_URL}/source_image|Image> -- <https://world.{settings._robotoff_domain}/cgi/product.pl?type=edit&code=123|*Edit*>",
            notifier.ROBOTOFF_PUBLIC_IMAGE_ALERT_CHANNEL,
            f"{settings.OFF_IMAGE_BASE_URL}/source_image",
        ),
    )
Example #6
def test_notify_image_flag_private(mocker, monkeypatch):
    """Test notifying a potentially sensitive private image"""
    mock = mocker.patch("robotoff.slack.http_session.post",
                        return_value=MockSlackResponse())
    monkeypatch.delenv("ROBOTOFF_SCHEME",
                       raising=False)  # force defaults to apply

    notifier = slack.SlackNotifier("")

    notifier.notify_image_flag(
        [
            Prediction(
                type=PredictionType.image_flag,
                data={
                    "type": "label_annotation",
                    "label": "face",
                    "likelihood": 0.8
                },
            )
        ],
        "/source_image",
        "123",
    )

    mock.assert_called_once_with(
        notifier.POST_MESSAGE_URL,
        data=PartialRequestMatcher(
            f"type: label_annotation\nlabel: *face*, score: 0.8\n\n <{settings.OFF_IMAGE_BASE_URL}/source_image|Image> -- <https://world.{settings._robotoff_domain}/cgi/product.pl?type=edit&code=123|*Edit*>",
            notifier.ROBOTOFF_PRIVATE_IMAGE_ALERT_CHANNEL,
            f"{settings.OFF_IMAGE_BASE_URL}/source_image",
        ),
    )
Example #7
def find_packaging(content: Union[OCRResult, str]) -> List[Prediction]:
    predictions = []

    text = get_text(content)

    if not text:
        return []

    processor = KEYWORD_PROCESSOR_STORE.get()

    for (packaging_str, _), span_start, span_end in processor.extract_keywords(
            text, span_info=True):
        packagings = packaging_str.split(";")

        for packaging in packagings:
            match_str = text[span_start:span_end]
            predictions.append(
                Prediction(
                    type=PredictionType.packaging,
                    value_tag=get_tag(packaging),
                    value=packaging,
                    data={
                        "text": match_str,
                        "notify": False
                    },
                    automatic_processing=True,
                ))

    return predictions
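
As with Example #1, a hedged sketch of what find_packaging expects from its keyword processor: clean names are assumed to be (packaging_str, lang) tuples, where packaging_str may bundle several values separated by ";" (hence the split above); the entry is illustrative.

from flashtext import KeywordProcessor

processor = KeywordProcessor()
# One keyword can map to several packaging values at once.
processor.add_keyword("pot en verre", ("glass;jar", "fr"))

# For the text "Pot en verre recyclable", the loop above would then emit two
# predictions sharing data["text"] == "Pot en verre": one with value="glass"
# and one with value="jar" (value_tag derived via get_tag).
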
Example #8
def find_traces(content: Union[OCRResult, str]) -> List[Prediction]:
    predictions = []

    text = get_text(content, TRACES_REGEX)

    if not text:
        return []

    processor = TRACE_KEYWORD_PROCESSOR_STORE.get()

    for match in TRACES_REGEX.regex.finditer(text):
        prompt = match.group()
        end_idx = match.end()
        captured = text[end_idx:end_idx + 100]

        for (trace_tag, _), span_start, span_end in processor.extract_keywords(
                captured, span_info=True):
            match_str = captured[span_start:span_end]
            predictions.append(
                Prediction(
                    type=PredictionType.trace,
                    value_tag=trace_tag,
                    data={
                        "text": match_str,
                        "prompt": prompt,
                        "notify": False
                    },
                ))

    return predictions
Example #9
def find_stores(content: Union[OCRResult, str]) -> List[Prediction]:
    results = []

    text = get_text(content, STORE_REGEX)

    if not text:
        return []

    for match in STORE_REGEX.regex.finditer(text):
        groups = match.groups()

        for idx, match_str in enumerate(groups):
            if match_str is not None:
                store, _ = SORTED_STORES[idx]
                results.append(
                    Prediction(
                        type=PredictionType.store,
                        value=store,
                        value_tag=get_store_tag(store),
                        data={"text": match_str, "notify": store in NOTIFY_STORES},
                    )
                )
                break

    return results
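
find_stores couples the regex to SORTED_STORES positionally: STORE_REGEX is assumed to expose one capturing group per store, in the same order, so the index of the first non-None group identifies the matched store. A hedged illustration with stand-in data:

import re

# Stand-ins, not the real Robotoff data.
SORTED_STORES = [("Carrefour", "carrefour"), ("Auchan", "auchan")]
store_pattern = re.compile(r"(carrefour)|(auchan)", re.I)

match = store_pattern.search("Distribué par Auchan France")
# match.groups() == (None, "Auchan"): index 1 -> SORTED_STORES[1] -> "Auchan"
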
Example #10
def find_packager_codes_regex(
        ocr_result: Union[OCRResult, str]) -> List[Prediction]:
    results: List[Prediction] = []

    for regex_code, ocr_regex in PACKAGER_CODE.items():
        text = get_text(ocr_result, ocr_regex, ocr_regex.lowercase)

        if not text:
            continue

        for match in ocr_regex.regex.finditer(text):
            if ocr_regex.processing_func is None:
                value = match.group(0)
            else:
                value = ocr_regex.processing_func(match)

            results.append(
                Prediction(
                    value=value,
                    data={
                        "raw": match.group(0),
                        "type": regex_code,
                        "notify": ocr_regex.notify,
                    },
                    type=PredictionType.packager_code,
                    automatic_processing=True,
                ))

    return results
Example #11
    def generate_insights(
        self,
        max_errors: Optional[int] = None,
        lang: str = "fr",
        limit: Optional[int] = None,
    ) -> Iterable[Prediction]:
        dataset = ProductDataset(settings.JSONL_DATASET_PATH)
        product_iter = (
            dataset.stream()
            .filter_by_country_tag("en:france")
            .filter_text_field("lang", lang)
            .filter_nonempty_text_field("ingredients_text_fr")
            .iter()
        )

        insights_count = 0
        for product in product_iter:
            if self.is_product_valid(product, max_errors=max_errors):
                insight = self.predict_insight(product["ingredients_text_fr"])
                if insight is not None:
                    insight["lang"] = lang
                    yield Prediction(
                        type=PredictionType.ingredient_spellcheck,
                        data=insight,
                        barcode=product["code"],
                    )

                    insights_count += 1
                    if limit is not None and insights_count >= limit:
                        break
Example #12
def generate_prediction(
    logo_type: str,
    logo_value: Optional[str],
    automatic_processing: Optional[bool] = False,
    **kwargs,
) -> Optional[Prediction]:
    if logo_type not in LOGO_TYPE_MAPPING:
        return None

    prediction_type = LOGO_TYPE_MAPPING[logo_type]

    value_tag = None
    value = None

    if prediction_type == PredictionType.brand:
        value_tag = value = logo_value
        if value is None:
            return None

    elif prediction_type == PredictionType.label:
        value_tag = logo_value
        if value_tag is None:
            return None

    return Prediction(
        type=prediction_type,
        value_tag=value_tag,
        value=value,
        automatic_processing=automatic_processing,
        predictor="universal-logo-detector",
        data=kwargs,
    )
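
A hedged usage sketch for generate_prediction, assuming LOGO_TYPE_MAPPING maps "brand" to PredictionType.brand as the branching suggests; the extra keyword arguments are illustrative and simply end up in Prediction.data via **kwargs.

prediction = generate_prediction(
    logo_type="brand",
    logo_value="en:some-brand",
    automatic_processing=False,
    confidence=0.92,   # illustrative extra field forwarded into data
    logo_id=1234,      # illustrative extra field forwarded into data
)
# prediction.value == prediction.value_tag == "en:some-brand"
# prediction.data == {"confidence": 0.92, "logo_id": 1234}
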
Example #13
def import_insights_for_products(
    prediction_types_by_barcode: Dict[str, Set[PredictionType]],
    server_domain: str,
    automatic: bool,
    product_store: DBProductStore,
) -> int:
    """Re-compute insights for products with new predictions.

    :param prediction_types_by_barcode: a dict that associates each barcode
    with the set of prediction types that were updated
    :return: Number of imported insights
    """
    imported = 0
    for importer in IMPORTERS:
        required_prediction_types = importer.get_required_prediction_types()
        selected_barcodes: List[str] = []
        for barcode, prediction_types in prediction_types_by_barcode.items():
            if prediction_types >= required_prediction_types:
                selected_barcodes.append(barcode)

        if selected_barcodes:
            predictions = [
                Prediction(**p) for p in get_product_predictions(
                    selected_barcodes, list(required_prediction_types))
            ]
            imported += importer.import_insights(predictions, server_domain,
                                                 automatic, product_store)
    return imported
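
The barcode selection above relies on Python's set operators: `prediction_types >= required_prediction_types` is a superset test, i.e. a barcode is selected only if every prediction type the importer requires was updated. A plain-string illustration:

required = {"category"}
assert {"category", "label"} >= required        # superset -> barcode selected
assert not ({"label"} >= required)              # "category" missing -> skipped
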
Example #14
def find_nutrient_values(content: Union[OCRResult, str]) -> List[Prediction]:
    nutrients: JSONType = {}

    for regex_code, ocr_regex in NUTRIENT_VALUES_REGEX.items():
        text = get_text(content, ocr_regex)

        if not text:
            continue

        for match in ocr_regex.regex.finditer(text):
            value = match.group(2).replace(",", ".")
            unit = match.group(3)
            nutrients.setdefault(regex_code, [])
            nutrients[regex_code].append({
                "raw": match.group(0),
                "nutrient": regex_code,
                "value": value,
                "unit": unit,
            })

    if not nutrients:
        return []

    return [
        Prediction(
            type=PredictionType.nutrient,
            data={
                "nutrients": nutrients,
                "version": EXTRACTOR_VERSION
            },
        )
    ]
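
A hedged sketch of the group layout find_nutrient_values reads from each NUTRIENT_VALUES_REGEX entry: group(2) is assumed to be the numeric value and group(3) the unit; the pattern below is illustrative only.

import re

energy_pattern = re.compile(
    r"(energie|energy)\s*:?\s*(\d+[,.]?\d*)\s*(kj|kcal)", re.I)

m = energy_pattern.search("Energy: 1500 kJ")
# m.group(0) == "Energy: 1500 kJ" (stored as "raw"),
# m.group(2) == "1500" (value, commas later replaced by dots),
# m.group(3) == "kJ" (unit)
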
Example #15
def test_add_category_insight_with_ml_insights(mocker):
    expected_prediction = Prediction(
        barcode="123",
        type=PredictionType.category,
        value_tag="en:chicken",
        data={
            "lang": "xx",
            "model": "neural",
            "confidence": 0.9
        },
        automatic_processing=True,
    )
    mocker.patch(
        "robotoff.workers.tasks.product_updated.predict_category_from_product_es",
        return_value=None,
    )
    mocker.patch(
        "robotoff.workers.tasks.product_updated.CategoryClassifier.predict",
        return_value=[expected_prediction],
    )
    import_insights_mock = mocker.patch(
        "robotoff.workers.tasks.product_updated.import_insights",
        return_value=1,
    )
    server_domain = settings.BaseURLProvider().get()
    imported = add_category_insight("123", {"code": "123"}, server_domain)

    import_insights_mock.assert_called_once_with(
        [
            Prediction(
                barcode="123",
                type=PredictionType.category,
                value_tag="en:chicken",
                data={
                    "lang": "xx",
                    "model": "neural",
                    "confidence": 0.9
                },
                automatic_processing=True,
            )
        ],
        server_domain,
        automatic=True,
    )

    assert imported
Example #16
    def to_prediction(self) -> Prediction:
        """Converts this category prediction to a Prediction."""
        return Prediction(
            type=PredictionType.category,
            value_tag=self.category,
            data={"lang": "xx", "model": "neural", "confidence": self.confidence},
            automatic_processing=self.confidence >= self.NEURAL_CONFIDENCE_THRESHOLD,
        )
Example #17
def flag_image(content: Union[OCRResult, str]) -> List[Prediction]:
    predictions: List[Prediction] = []

    text = get_text(content)
    prediction = extract_image_flag_flashtext(PROCESSOR, text)

    if prediction is not None:
        predictions.append(prediction)

    if isinstance(content, str):
        return predictions

    safe_search_annotation = content.get_safe_search_annotation()
    label_annotations = content.get_label_annotations()

    if safe_search_annotation:
        for key in ("adult", "violence"):
            value: SafeSearchAnnotationLikelihood = getattr(
                safe_search_annotation, key)
            if value >= SafeSearchAnnotationLikelihood.VERY_LIKELY:
                predictions.append(
                    Prediction(
                        type=PredictionType.image_flag,
                        data={
                            "type": "safe_search_annotation",
                            "label": key,
                            "likelihood": value.name,
                        },
                    ))

    for label_annotation in label_annotations:
        if (label_annotation.description in LABELS_TO_FLAG
                and label_annotation.score >= 0.6):
            predictions.append(
                Prediction(
                    type=PredictionType.image_flag,
                    data={
                        "type": "label_annotation",
                        "label": label_annotation.description.lower(),
                        "likelihood": label_annotation.score,
                    },
                ))
            break

    return predictions
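
The safe-search filter above compares enum members with >=, which only works if SafeSearchAnnotationLikelihood is an ordered integer enum. A hedged stand-in mirroring the Google Cloud Vision Likelihood scale it presumably follows:

from enum import IntEnum

class SafeSearchAnnotationLikelihood(IntEnum):
    UNKNOWN = 0
    VERY_UNLIKELY = 1
    UNLIKELY = 2
    POSSIBLE = 3
    LIKELY = 4
    VERY_LIKELY = 5

# Only the top of the scale passes the flag_image filter:
assert SafeSearchAnnotationLikelihood.VERY_LIKELY >= SafeSearchAnnotationLikelihood.VERY_LIKELY
assert not (SafeSearchAnnotationLikelihood.LIKELY
            >= SafeSearchAnnotationLikelihood.VERY_LIKELY)
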
Example #18
def generate_prediction(value,
                        data: Dict[str, Any],
                        automatic_processing: Optional[bool] = None):
    return Prediction(
        barcode=DEFAULT_BARCODE,
        value=value,
        type=PredictionType.product_weight,
        data=data,
        automatic_processing=automatic_processing,
        predictor="ocr",
    )
Example #19
def matcher_prediction(category):
    return Prediction(
        barcode=barcode1,
        type=PredictionType.category,
        value_tag=category,
        data={
            "lang": "en",
            "product_name": "test",
            "model": "matcher",
        },
        automatic_processing=False,
    )
Example #20
def neural_prediction(category, confidence=0.7, auto=False):
    return Prediction(
        barcode=barcode1,
        type=PredictionType.category,
        value_tag=category,
        data={
            "lang": "xx",
            "model": "neural",
            "confidence": confidence
        },
        automatic_processing=auto,
    )
Example #21
    def test_generate_candidates(self):
        prediction = Prediction(type=PredictionType.packager_code,
                                value="fr 40.261.001 ce")
        selected = list(
            PackagerCodeInsightImporter.generate_candidates(
                Product({"emb_codes_tags": ["FR 50.200.000 CE"]}),
                [prediction],
            ))
        assert len(selected) == 1
        insight = selected[0]
        assert isinstance(insight, ProductInsight)
        assert insight.value == prediction.value
        assert insight.type == InsightType.packager_code
Example #22
def test_category_prediction_to_prediction():
    category_prediction = CategoryPrediction("category", 0.5)

    assert category_prediction.to_prediction() == Prediction(
        type=InsightType.category,
        value_tag="category",
        data={
            "lang": "xx",
            "model": "neural",
            "confidence": 0.5
        },
        automatic_processing=False,
    )
Example #23
def test_category_prediction_to_prediction_auto(monkeypatch):
    monkeypatch.setattr(CategoryPrediction, "NEURAL_CONFIDENCE_THRESHOLD", 0.9)
    category_prediction = CategoryPrediction("category", 0.9)

    assert category_prediction.to_prediction() == Prediction(
        type=InsightType.category,
        value_tag="category",
        data={
            "lang": "xx",
            "model": "neural",
            "confidence": 0.9
        },
        automatic_processing=True,
    )
Example #24
def find_image_orientation(
        ocr_result: Union[OCRResult, str]) -> List[Prediction]:
    if isinstance(ocr_result, str):
        return []

    orientation_result = ocr_result.get_orientation()

    if orientation_result is None:
        return []

    prediction = orientation_result.to_json()
    prediction["rotation"] = get_rotation_angle_from_orientation(
        orientation_result.orientation)
    return [Prediction(type=PredictionType.image_orientation, data=prediction)]
Example #25
    def test_import_insights_invalid_types(self):
        class FakeImporter(InsightImporter):
            @staticmethod
            def get_required_prediction_types():
                return {PredictionType.category, PredictionType.image_flag}

        with pytest.raises(ValueError,
                           match="unexpected prediction type: 'label'"):
            FakeImporter.import_insights(
                [Prediction(type=PredictionType.label)],
                DEFAULT_SERVER_DOMAIN,
                automatic=True,
                product_store=FakeProductStore(),
            )
Example #26
def extract_image_flag_flashtext(processor: KeywordProcessor,
                                 text: str) -> Optional[Prediction]:
    for (_, key), span_start, span_end in processor.extract_keywords(
            text, span_info=True):
        match_str = text[span_start:span_end]
        return Prediction(
            type=PredictionType.image_flag,
            data={
                "text": match_str,
                "type": "text",
                "label": key
            },
        )

    return None
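
As in Example #1, a hedged sketch of the keyword processor this expects: clean names are assumed to be 2-tuples whose second element becomes the flag label; the entry and text are illustrative.

from flashtext import KeywordProcessor

processor = KeywordProcessor()
processor.add_keyword("beer", ("en:beer", "beverages"))  # illustrative entry

prediction = extract_image_flag_flashtext(processor, "craft beer bottle")
# prediction.data == {"text": "beer", "type": "text", "label": "beverages"}
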
Example #27
def predict(client, product: Dict) -> Optional[Prediction]:
    """Predict product categories using ES.

    :param elasticsearch.Elasticsearch client: connection to ES instance
    :param product: product properties

    :return: a category Prediction or None if no prediction was available
    """
    predictions = []

    for lang in product.get("languages_codes", []):
        product_name = product.get("product_name_{}".format(lang))

        if not product_name:
            continue

        prediction = predict_category(client, product_name, lang)

        if prediction is None:
            continue

        category, score = prediction
        predictions.append((lang, category, product_name, score))

    if predictions:
        # Sort by descending score
        sorted_predictions = sorted(predictions,
                                    key=operator.itemgetter(3),
                                    reverse=True)

        p = sorted_predictions[0]
        lang, category, product_name, score = p

        return Prediction(
            type=PredictionType.category,
            barcode=product["code"],
            value_tag=category,
            data={
                "lang": lang,
                "product_name": product_name,
                "model": "matcher",
            },
            automatic_processing=False,
        )

    return None
Example #28
    def extract_addresses(self, content: Union[str,
                                               OCRResult]) -> List[Prediction]:
        """Extract addresses from the given OCR result.

        Args:
            content (OCRResult or str): a string or the OCR result to process.

        Returns:
            list of Prediction: one prediction per address extracted from the
            text. Each prediction's data dict contains the items: country_code
            (always "fr"), city_name, postal_code and text_extract.
        """
        if isinstance(content, OCRResult):
            text = self.get_text(content)
        else:
            text = content

        text = self.normalize_text(text)
        city_matches = self.find_city_names(text)

        locations = []
        for city, city_start, city_end in city_matches:
            pc_match = self.find_nearby_postal_code(text, city, city_start,
                                                    city_end)
            if pc_match is None:
                continue

            pc, pc_start, pc_end = pc_match
            address_start = min(city_start,
                                pc_start) - self.text_extract_distance
            address_end = max(city_end, pc_end) + self.text_extract_distance
            text_extract = text[
                max(0, address_start):min(len(text), address_end)]

            locations.append(
                Prediction(
                    type=PredictionType.location,
                    data={
                        "country_code": "fr",
                        "city_name": city.name,
                        "postal_code": city.postal_code,
                        "text_extract": text_extract,
                    },
                ))

        return locations
Example #29
    def test_generate_insights_missing_product_no_references(self, mocker):
        get_existing_insight_mock = mocker.patch(
            "robotoff.insights.importer.get_existing_insight", return_value=[])
        assert (list(
            InsightImporter.generate_insights(
                [
                    Prediction(
                        type=PredictionType.category,
                        barcode=DEFAULT_BARCODE,
                        data={},
                    )
                ],
                DEFAULT_SERVER_DOMAIN,
                automatic=True,
                product_store=FakeProductStore(),
            )) == [])
        get_existing_insight_mock.assert_called_once()
Example #30
def refresh_insights(
    barcode: str,
    server_domain: str,
    automatic: bool,
    product_store: Optional[DBProductStore] = None,
) -> int:
    """Refresh all insights for specific product.

    All predictions are fetched, and insights are created/deleted by each
    InsightImporter.

    This is different from `import_insights`, because here there is no
    prediction creation: it is just a refresh based on the current database
    predictions. It's useful to refresh insights after a Product Opener
    update (some insights may be invalid).

    :param barcode: Barcode of the product.
    :param server_domain: The server domain associated with the predictions.
    :param automatic: If False, no insight is applied automatically.
    :param product_store: The product store to use, defaults to None
    :return: The number of imported insights.
    """
    if product_store is None:
        product_store = get_product_store()

    predictions = [Prediction(**p) for p in get_product_predictions([barcode])]
    prediction_types = set(p.type for p in predictions)

    imported = 0
    for importer in IMPORTERS:
        required_prediction_types = importer.get_required_prediction_types()
        if prediction_types >= required_prediction_types:
            imported += importer.import_insights(
                [
                    p
                    for p in predictions if p.type in required_prediction_types
                ],
                server_domain,
                automatic,
                product_store,
            )

    return imported