示例#1
0
def test_product_insights_merge():
    """Merging two compatible ``ProductInsights`` concatenates their insights.

    Both inputs share the same barcode, type and source image; the merged
    object must keep those three fields and hold the insights of the first
    input followed by those of the second.
    """
    # Fields that are identical for both inputs.
    shared_fields = {
        "barcode": "123",
        "type": InsightType.label,
        "source_image": "/123/1.jpg",
    }
    organic = [
        RawInsight(type=InsightType.label, data={}, value_tag="en:organic")
    ]
    pgi = [RawInsight(type=InsightType.label, data={}, value_tag="en:pgi")]

    merged = ProductInsights.merge([
        ProductInsights(insights=organic, **shared_fields),
        ProductInsights(insights=pgi, **shared_fields),
    ])

    assert merged.type == InsightType.label
    assert merged.barcode == "123"
    assert merged.source_image == "/123/1.jpg"
    assert merged.insights == organic + pgi
示例#2
0
def add_category_insight(barcode: str, product: JSONType, server_domain: str) -> bool:
    """Predict category insights for a product and import them.

    ``predict_category_from_product_es`` and then
    ``predict_category_from_product_ml`` (with ``filter_blacklisted=True``)
    are called on the product. Their non-``None`` results are merged into a
    single ``ProductInsights`` and passed to the category insight importer
    with ``automatic=False``.

    Args:
        barcode: product barcode; only used in the log message.
        product: product data handed to both predictors.
        server_domain: server domain, used to resolve the server type and
            forwarded to the importer.

    Returns:
        ``True`` when the importer returned a truthy value. ``False`` when
        the server type is not ``ServerType.off``, when neither predictor
        produced a result, or when the importer returned a falsy value.
    """
    if get_server_type(server_domain) != ServerType.off:
        return False

    # Tuple elements are evaluated left to right: ES predictor first, then ML.
    candidates = (
        predict_category_from_product_es(product),
        predict_category_from_product_ml(product, filter_blacklisted=True),
    )
    predictions = [
        candidate for candidate in candidates if candidate is not None
    ]

    if len(predictions) == 0:
        return False

    merged = ProductInsights.merge(predictions)
    importer = InsightImporterFactory.create(
        InsightType.category, get_product_store()
    )
    import_result = importer.import_insights(
        [merged], server_domain=server_domain, automatic=False
    )

    if not import_result:
        return False

    logger.info("Category insight imported for product {}".format(barcode))
    return True
示例#3
0
def extract_ocr_insights(
    ocr_url: str, insight_types: Iterable[InsightType]
) -> Dict[InsightType, ProductInsights]:
    """Extract insights of the requested types from a single OCR result.

    Args:
        ocr_url: URL of the OCR result; the source image and the barcode
            are both derived from it.
        insight_types: insight types to run the extraction for.

    Returns:
        A mapping from insight type to a ``ProductInsights`` holding the
        insights found for that type. Types without any insight are left
        out. An empty dict is returned when ``get_ocr_result`` yields
        ``None``.

    Raises:
        ValueError: if no barcode can be extracted from ``ocr_url``.
    """
    source_image = get_source_from_ocr_url(ocr_url)
    barcode = get_barcode_from_url(ocr_url)

    if barcode is None:
        raise ValueError("cannot extract barcode from URL: {}".format(ocr_url))

    ocr_result = get_ocr_result(ocr_url)

    if ocr_result is None:
        logger.info("Error during OCR extraction: {}".format(ocr_url))
        return {}

    results: Dict[InsightType, ProductInsights] = {}

    for insight_type in insight_types:
        insights = ocr.extract_insights(ocr_result, insight_type)

        # Only record types that actually produced something.
        if insights:
            results[insight_type] = ProductInsights(
                barcode=barcode,
                insights=insights,
                source_image=source_image,
                type=insight_type,
            )

    return results
示例#4
0
def get_insights_from_image(
        barcode: str, image_url: str,
        ocr_url: str) -> Dict[InsightType, ProductInsights]:
    """Gather OCR and image-ML insights for an image, merged per type.

    OCR insights are extracted first for ``IMAGE_IMPORT_INSIGHT_TYPES``.
    The OCR label insights (if any) are passed to ``has_nutriscore_insight``
    and its result is forwarded as ``extract_nutriscore`` to the image-ML
    extraction. For every insight type present in either source, the
    available ``ProductInsights`` are merged (OCR first, then image ML).

    Args:
        barcode: not used by this function.
        image_url: URL of the image given to the image-ML extraction.
        ocr_url: URL of the OCR result given to the OCR extraction.

    Returns:
        A mapping from insight type to the merged ``ProductInsights``. An
        empty dict is returned when the OCR extraction raises a
        ``requests`` ``RequestException`` or an ``OCRParsingException``.
    """
    try:
        from_ocr = extract_ocr_insights(ocr_url, IMAGE_IMPORT_INSIGHT_TYPES)
    except requests.exceptions.RequestException as e:
        logger.info("error during OCR JSON download", exc_info=e)
        return {}
    except OCRParsingException as e:
        logger.error("OCR JSON Parsing error", exc_info=e)
        return {}

    ocr_labels = from_ocr.get(InsightType.label)
    from_image_ml = extract_image_ml_insights(
        image_url, extract_nutriscore=has_nutriscore_insight(ocr_labels))

    # Order matters: OCR insights come before image-ML ones in each merge.
    sources = (from_ocr, from_image_ml)

    return {
        insight_type: ProductInsights.merge(
            [source[insight_type]
             for source in sources if insight_type in source])
        for insight_type in set(from_ocr).union(from_image_ml)
    }
示例#5
0
def extract_image_ml_insights(
        image_url: str,
        extract_nutriscore: bool = True) -> Dict[InsightType, ProductInsights]:
    """Run image-based ML extraction on an image and wrap the result.

    Currently only the nutriscore label extraction is performed, and only
    when ``extract_nutriscore`` is true.

    Args:
        image_url: URL of the image; the barcode and the source image are
            derived from it.
        extract_nutriscore: whether to download the image and run
            ``extract_nutriscore_label`` on it.

    Returns:
        A mapping with at most one entry, ``InsightType.label``, holding
        the nutriscore insight. The mapping is empty when extraction is
        disabled or yields a falsy result.

    Raises:
        ValueError: if no barcode can be extracted from ``image_url``.
    """
    barcode = get_barcode_from_url(image_url)
    if barcode is None:
        # Report the offending URL; ``barcode`` is always None on this path.
        raise ValueError(
            "cannot extract barcode from URL: {}".format(image_url))

    results: Dict[InsightType, ProductInsights] = {}

    if extract_nutriscore:
        image = get_image_from_url(image_url,
                                   error_raise=True,
                                   session=http_session)
        nutriscore_insight = extract_nutriscore_label(image,
                                                      manual_threshold=0.5,
                                                      automatic_threshold=0.9)

        if not nutriscore_insight:
            return results

        source_image = get_source_from_image_url(image_url)
        results[InsightType.label] = ProductInsights(
            insights=[nutriscore_insight],
            barcode=barcode,
            source_image=source_image,
            type=InsightType.label,
        )

    return results
示例#6
0
def generate_insights_from_annotated_logos(logos: List[LogoAnnotation],
                                           server_domain: str):
    """Build insights from annotated logos and import them.

    For each logo a raw insight is generated with ``confidence=1.0``. Logos
    for which no raw insight can be generated, or for which
    ``is_automatically_processable`` raises ``InvalidInsight``, are skipped
    so that one unusable logo does not discard the rest of the batch.

    Args:
        logos: annotated logos; each must expose ``annotation_type``,
            ``taxonomy_value``, ``id`` and ``image_prediction.image``.
        server_domain: server domain forwarded to ``import_insights``.
    """
    product_insights: List[ProductInsights] = []
    for logo in logos:
        raw_insight = generate_raw_insight(logo.annotation_type,
                                           logo.taxonomy_value,
                                           confidence=1.0,
                                           logo_id=logo.id)

        if raw_insight is None:
            # Nothing to import for this logo; keep processing the others.
            continue

        image = logo.image_prediction.image

        try:
            raw_insight.automatic_processing = is_automatically_processable(
                image.barcode, image.source_image, datetime.timedelta(days=30))
        except InvalidInsight:
            # Skip only the invalid insight, not the whole batch.
            continue

        if raw_insight.automatic_processing:
            raw_insight.data["notify"] = True

        product_insights.append(
            ProductInsights(
                insights=[raw_insight],
                type=raw_insight.type,
                barcode=image.barcode,
                source_image=image.source_image,
            ))

    imported = import_insights(product_insights, server_domain, automatic=True)

    if imported:
        logger.info(f"{imported} logo insights imported after annotation")
示例#7
0
def get_insights_from_product_name(
        barcode: str, product_name: str) -> Dict[InsightType, ProductInsights]:
    """Extract insights from a product name, grouped by insight type.

    For every type in ``PRODUCT_NAME_INSIGHT_TYPES`` the product name is run
    through ``ocr.extract_insights``. Each insight found is tagged with
    ``data["source"] = "product_name"``.

    Args:
        barcode: barcode stored on every returned ``ProductInsights``.
        product_name: text the insights are extracted from.

    Returns:
        A mapping from insight type to ``ProductInsights``; types without
        any insight are omitted.
    """
    by_type = {}

    for insight_type in PRODUCT_NAME_INSIGHT_TYPES:
        extracted = ocr.extract_insights(product_name, insight_type)

        if not extracted:
            continue

        # Record where the insight came from.
        for item in extracted:
            item.data["source"] = "product_name"

        by_type[insight_type] = ProductInsights(
            insights=extracted,
            barcode=barcode,
            type=insight_type,
        )

    return by_type
示例#8
0
def predict_logo_insights(
    logos: List[LogoAnnotation],
    logo_probs: List[Dict[LogoLabelType, float]],
) -> List[ProductInsights]:
    """Turn per-logo label probabilities into grouped ``ProductInsights``.

    For each logo, the most probable label other than ``UNKNOWN_LABEL`` is
    selected and converted to a raw insight via ``generate_raw_insight``.
    Raw insights are then grouped by (barcode, source image, insight type),
    and one ``ProductInsights`` is emitted per group.

    Args:
        logos: logos, paired positionally with ``logo_probs``.
        logo_probs: for each logo, a mapping from label to probability.
            Logos with an empty mapping, or with no label other than
            ``UNKNOWN_LABEL``, are ignored.

    Returns:
        One ``ProductInsights`` per (barcode, source image, insight type)
        group, in first-seen order of the groups.
    """
    by_group: Dict[Tuple[str, str, InsightType], List[RawInsight]] = {}

    for logo, probs in zip(logos, logo_probs):
        if not probs:
            continue

        known = [
            (candidate, prob)
            for candidate, prob in probs.items()
            if candidate != UNKNOWN_LABEL
        ]

        if not known:
            # Only the unknown label was proposed for this logo.
            continue

        best_label, best_prob = max(known, key=operator.itemgetter(1))
        raw_insight = generate_raw_insight(best_label[0],
                                           best_label[1],
                                           confidence=best_prob,
                                           logo_id=logo.id)

        if raw_insight is None:
            continue

        image = logo.image_prediction.image
        group = (image.barcode, image.source_image, raw_insight.type)
        by_group.setdefault(group, []).append(raw_insight)

    return [
        ProductInsights(
            insights=members,
            type=insight_type,
            barcode=barcode,
            source_image=source_image,
        )
        for (barcode, source_image, insight_type), members in by_group.items()
    ]
示例#9
0
def predict(client, product: Dict) -> Optional[ProductInsights]:
    """Predict a category for a product from its localized product names.

    For every language code in ``product["languages_codes"]`` with a
    non-empty ``product_name_<lang>`` entry, ``predict_category`` is called.
    The prediction with the highest score is kept; on ties the earliest
    language wins.

    Args:
        client: passed through unchanged to ``predict_category``.
        product: product data; ``code`` is read when a prediction is
            returned, ``languages_codes`` and ``product_name_<lang>`` are
            read to build predictions.

    Returns:
        A ``ProductInsights`` of type ``InsightType.category`` holding a
        single raw insight tagged with ``"model": "matcher"``, or ``None``
        when no language produced a prediction.
    """
    predictions = []

    for lang in product.get("languages_codes", []):
        product_name = product.get("product_name_{}".format(lang))

        if not product_name:
            continue

        prediction = predict_category(client, product_name, lang)

        if prediction is None:
            continue

        category, score = prediction
        predictions.append((lang, category, product_name, score))

    if not predictions:
        return None

    # Each entry is (lang, category, product_name, score): the score is at
    # index 3. Index 2 would rank entries by product name instead.
    lang, category, product_name, _score = max(
        predictions, key=operator.itemgetter(3)
    )

    return ProductInsights(
        barcode=product["code"],
        type=InsightType.category,
        insights=[
            RawInsight(
                type=InsightType.category,
                value_tag=category,
                data={
                    "lang": lang,
                    "product_name": product_name,
                    "model": "matcher",
                },
            )
        ],
    )
示例#10
0
def format_predictions(product: Dict, predictions: List[CategoryPrediction],
                       lang: str) -> ProductInsights:
    """Wrap category predictions into a ``ProductInsights``.

    Args:
        product: product data; only ``product["code"]`` is read and used as
            the barcode.
        predictions: iterable of ``(category, confidence)`` pairs.
        lang: language stored in each insight's data.

    Returns:
        A ``ProductInsights`` of type ``InsightType.category`` with one raw
        insight per prediction, each tagged with ``"model": "neural"``.
    """
    raw_insights = [
        RawInsight(
            type=InsightType.category,
            value_tag=predicted_category,
            data={
                "lang": lang,
                "model": "neural",
                "confidence": predicted_confidence,
            },
        )
        for predicted_category, predicted_confidence in predictions
    ]

    return ProductInsights(
        barcode=product["code"],
        type=InsightType.category,
        insights=raw_insights,
    )
示例#11
0
def generate_from_ocr_archive(
    input_: Union[str, TextIO, pathlib.Path],
    insight_type: InsightType,
    keep_empty: bool = False,
) -> Iterable[ProductInsights]:
    """Yield ``ProductInsights`` for each usable entry of an OCR archive.

    Entries are skipped when the source image is ``None``, when no barcode
    can be derived from the source image path (a message is echoed to
    stderr in that case), or when the OCR JSON cannot be turned into an
    ``OCRResult``.

    Args:
        input_: archive to iterate over, passed to ``ocr_iter``.
        insight_type: type of insight to extract from each OCR result.
        keep_empty: when true, also yield entries for which no insight was
            extracted.

    Yields:
        One ``ProductInsights`` per retained archive entry.
    """
    for source_image, ocr_json in ocr_iter(input_):
        if source_image is None:
            continue

        barcode: Optional[str] = get_barcode_from_path(source_image)

        if barcode is None:
            message = "cannot extract barcode from source {}".format(
                source_image
            )
            click.echo(message, err=True)
            continue

        parsed: Optional[OCRResult] = OCRResult.from_json(ocr_json)

        if parsed is None:
            continue

        extracted = extract_insights(parsed, insight_type)

        # Empty results are only emitted when explicitly requested.
        if keep_empty or extracted:
            yield ProductInsights(
                insights=extracted,
                barcode=barcode,
                type=insight_type,
                source_image=source_image,
            )
示例#12
0
def insights_iter(file_path: pathlib.Path) -> Iterable[ProductInsights]:
    """Lazily yield a ``ProductInsights`` for each record of a JSONL file.

    Every item produced by ``jsonl_iter(file_path)`` is converted with
    ``ProductInsights.from_dict``.
    """
    yield from (
        ProductInsights.from_dict(record) for record in jsonl_iter(file_path)
    )
示例#13
0
def test_product_insights_failed_merge():
    """``ProductInsights.merge`` raises ``ValueError`` on invalid input.

    Covered cases: an empty list, and pairs whose second member differs
    from the first in barcode, in type, in source image, or in both type
    and source image.
    """
    with pytest.raises(ValueError):
        ProductInsights.merge([])

    # Fields of the first member of every pair.
    reference = {
        "barcode": "123",
        "type": InsightType.label,
        "source_image": "/123/1.jpg",
    }
    # Fields overridden on the second member, one entry per failing case.
    mismatches = [
        {"barcode": "234"},
        {"type": InsightType.category},
        {"source_image": "/123/2.jpg"},
        {"type": InsightType.category, "source_image": "/123/2.jpg"},
    ]

    for override in mismatches:
        # Objects are built inside the context manager so that the guarded
        # region covers construction as well as the merge call.
        with pytest.raises(ValueError):
            ProductInsights.merge([
                ProductInsights(insights=[], **reference),
                ProductInsights(insights=[], **{**reference, **override}),
            ])