def test_product_insights_merge():
    """Merging two compatible ``ProductInsights`` keeps the shared fields
    and concatenates the insight lists in input order."""
    organic = [RawInsight(type=InsightType.label, data={}, value_tag="en:organic")]
    first = ProductInsights(
        insights=organic,
        barcode="123",
        type=InsightType.label,
        source_image="/123/1.jpg",
    )

    pgi = [RawInsight(type=InsightType.label, data={}, value_tag="en:pgi")]
    second = ProductInsights(
        insights=pgi,
        barcode="123",
        type=InsightType.label,
        source_image="/123/1.jpg",
    )

    merged = ProductInsights.merge([first, second])

    # Shared metadata is carried over unchanged.
    assert merged.type == InsightType.label
    assert merged.barcode == "123"
    assert merged.source_image == "/123/1.jpg"
    # Insights of both inputs are concatenated, first input first.
    assert merged.insights == organic + pgi
def add_category_insight(barcode: str, product: JSONType, server_domain: str) -> bool:
    """Predict category insights for ``product`` and import them.

    Both the ``_es`` and the ``_ml`` category predictors are queried; their
    non-``None`` results are merged into a single ``ProductInsights`` and
    handed to the category insight importer with ``automatic=False``.

    Args:
        barcode: product barcode, only used in the log message.
        product: product data passed to both predictors.
        server_domain: server domain; anything whose server type is not
            ``ServerType.off`` is skipped.

    Returns:
        ``True`` if the importer reported a truthy result, ``False``
        otherwise (including when nothing was predicted or the server type
        is not ``ServerType.off``).
    """
    if get_server_type(server_domain) != ServerType.off:
        return False

    # Query the predictors in the same order as before: ES first, then ML.
    es_prediction = predict_category_from_product_es(product)
    ml_prediction = predict_category_from_product_ml(product, filter_blacklisted=True)
    candidates = [
        prediction
        for prediction in (es_prediction, ml_prediction)
        if prediction is not None
    ]
    if not candidates:
        return False

    merged = ProductInsights.merge(candidates)
    importer = InsightImporterFactory.create(InsightType.category, get_product_store())
    imported = importer.import_insights(
        [merged],
        server_domain=server_domain,
        automatic=False,
    )

    if imported:
        logger.info("Category insight imported for product {}".format(barcode))

    return bool(imported)
def extract_ocr_insights(
    ocr_url: str, insight_types: Iterable[InsightType]
) -> Dict[InsightType, ProductInsights]:
    """Extract insights of the requested types from an OCR result.

    Args:
        ocr_url: URL of the OCR result; the source image and the barcode are
            derived from it.
        insight_types: insight types to extract.

    Returns:
        A mapping from insight type to the ``ProductInsights`` extracted for
        that type. Types for which nothing was extracted are omitted. An
        empty dict is returned when no OCR result could be obtained.

    Raises:
        ValueError: if no barcode can be extracted from ``ocr_url``.
    """
    source_image = get_source_from_ocr_url(ocr_url)
    barcode = get_barcode_from_url(ocr_url)

    if barcode is None:
        # Message typo fixed ("fro" -> "from").
        raise ValueError("cannot extract barcode from URL: {}".format(ocr_url))

    ocr_result = get_ocr_result(ocr_url)

    if ocr_result is None:
        logger.info("Error during OCR extraction: {}".format(ocr_url))
        return {}

    results: Dict[InsightType, ProductInsights] = {}

    for insight_type in insight_types:
        insights = ocr.extract_insights(ocr_result, insight_type)

        # Only report types that actually produced insights.
        if insights:
            results[insight_type] = ProductInsights(
                barcode=barcode,
                insights=insights,
                source_image=source_image,
                type=insight_type,
            )

    return results
def get_insights_from_image(
    barcode: str, image_url: str, ocr_url: str
) -> Dict[InsightType, ProductInsights]:
    """Collect OCR-based and image-ML-based insights for one image.

    OCR insights are extracted first. Whether the nutriscore extraction of
    the image ML step runs is decided by ``has_nutriscore_insight`` applied
    to the OCR label insights. The results of both sources are then merged
    per insight type.

    Args:
        barcode: product barcode (not used by this function's body).
        image_url: URL of the image passed to the image ML extraction.
        ocr_url: URL of the OCR result passed to the OCR extraction.

    Returns:
        A mapping from insight type to the merged ``ProductInsights``; an
        empty dict if the OCR download or the OCR parsing failed.
    """
    try:
        from_ocr = extract_ocr_insights(ocr_url, IMAGE_IMPORT_INSIGHT_TYPES)
    except requests.exceptions.RequestException as e:
        logger.info("error during OCR JSON download", exc_info=e)
        return {}
    except OCRParsingException as e:
        logger.error("OCR JSON Parsing error", exc_info=e)
        return {}

    label_insights = from_ocr.get(InsightType.label, None)
    from_image_ml = extract_image_ml_insights(
        image_url, extract_nutriscore=has_nutriscore_insight(label_insights)
    )

    merged: Dict[InsightType, ProductInsights] = {}

    for insight_type in set(from_ocr.keys()).union(from_image_ml.keys()):
        # OCR result first, then image ML result, when present.
        available: List[ProductInsights] = [
            source[insight_type]
            for source in (from_ocr, from_image_ml)
            if insight_type in source
        ]
        merged[insight_type] = ProductInsights.merge(available)

    return merged
def extract_image_ml_insights(
    image_url: str, extract_nutriscore: bool = True
) -> Dict[InsightType, ProductInsights]:
    """Run image-based ML extraction on the image at ``image_url``.

    Currently only the nutriscore label extraction is performed, and only
    when ``extract_nutriscore`` is true.

    Args:
        image_url: URL of the image; the barcode and the source image are
            derived from it.
        extract_nutriscore: whether to run the nutriscore label extraction.

    Returns:
        A mapping holding a ``ProductInsights`` under ``InsightType.label``
        when a nutriscore insight was found, otherwise an empty dict.

    Raises:
        ValueError: if no barcode can be extracted from ``image_url``.
    """
    barcode = get_barcode_from_url(image_url)

    if barcode is None:
        # Report the offending URL; ``barcode`` is always None on this branch.
        raise ValueError("cannot extract barcode from URL: {}".format(image_url))

    results: Dict[InsightType, ProductInsights] = {}

    if extract_nutriscore:
        image = get_image_from_url(image_url, error_raise=True, session=http_session)
        nutriscore_insight = extract_nutriscore_label(
            image, manual_threshold=0.5, automatic_threshold=0.9
        )

        if not nutriscore_insight:
            return results

        source_image = get_source_from_image_url(image_url)
        results[InsightType.label] = ProductInsights(
            insights=[nutriscore_insight],
            barcode=barcode,
            source_image=source_image,
            type=InsightType.label,
        )

    return results
def generate_insights_from_annotated_logos(logos: List[LogoAnnotation], server_domain: str):
    """Build one insight per annotated logo and import them.

    For every logo a raw insight is generated from its annotation type and
    taxonomy value with ``confidence=1.0``. Logos that yield no raw insight,
    or for which ``is_automatically_processable`` raises ``InvalidInsight``,
    are skipped so that one unusable logo does not discard the insights
    already collected for the other logos of the batch.

    Args:
        logos: annotated logos to turn into insights.
        server_domain: server domain forwarded to ``import_insights``.
    """
    product_insights: List[ProductInsights] = []

    for logo in logos:
        raw_insight = generate_raw_insight(
            logo.annotation_type, logo.taxonomy_value, confidence=1.0, logo_id=logo.id
        )

        if raw_insight is None:
            # Nothing can be generated for this logo: skip it, keep the rest.
            continue

        image = logo.image_prediction.image

        try:
            raw_insight.automatic_processing = is_automatically_processable(
                image.barcode, image.source_image, datetime.timedelta(days=30)
            )
        except InvalidInsight:
            # Skip only this logo instead of aborting the whole batch.
            continue

        if raw_insight.automatic_processing:
            raw_insight.data["notify"] = True

        product_insights.append(
            ProductInsights(
                insights=[raw_insight],
                type=raw_insight.type,
                barcode=image.barcode,
                source_image=image.source_image,
            )
        )

    imported = import_insights(product_insights, server_domain, automatic=True)

    if imported:
        logger.info(f"{imported} logo insights imported after annotation")
def get_insights_from_product_name(
    barcode: str, product_name: str
) -> Dict[InsightType, ProductInsights]:
    """Extract insights from a product name.

    For each type in ``PRODUCT_NAME_INSIGHT_TYPES`` the extractor is run on
    ``product_name``; every extracted insight gets ``data["source"]`` set to
    ``"product_name"``.

    Args:
        barcode: barcode stored on the resulting ``ProductInsights``.
        product_name: text the insights are extracted from.

    Returns:
        A mapping from insight type to its ``ProductInsights``; types with
        no extracted insight are omitted.
    """
    by_type = {}

    for insight_type in PRODUCT_NAME_INSIGHT_TYPES:
        extracted = ocr.extract_insights(product_name, insight_type)

        if not extracted:
            continue

        # Tag each insight with its origin.
        for raw_insight in extracted:
            raw_insight.data["source"] = "product_name"

        by_type[insight_type] = ProductInsights(
            insights=extracted,
            barcode=barcode,
            type=insight_type,
        )

    return by_type
def predict_logo_insights(
    logos: List[LogoAnnotation],
    logo_probs: List[Dict[LogoLabelType, float]],
) -> List[ProductInsights]:
    """Turn per-logo label probabilities into grouped ``ProductInsights``.

    For each logo the most probable label other than ``UNKNOWN_LABEL`` is
    selected and converted to a raw insight. Raw insights are grouped by
    ``(barcode, source_image, insight type)`` and one ``ProductInsights`` is
    produced per group.

    Args:
        logos: logos to predict insights for.
        logo_probs: label probabilities, paired with ``logos`` by position.

    Returns:
        One ``ProductInsights`` per group, in order of first appearance.
    """
    grouped: Dict[Tuple[str, str, InsightType], List[RawInsight]] = {}

    for logo, probs in zip(logos, logo_probs):
        if not probs:
            continue

        # Only labels other than the "unknown" one can produce an insight.
        known = [
            (label, prob) for label, prob in probs.items() if label != UNKNOWN_LABEL
        ]
        if not known:
            continue

        label, max_prob = max(known, key=operator.itemgetter(1))
        raw_insight = generate_raw_insight(
            label[0], label[1], confidence=max_prob, logo_id=logo.id
        )
        if raw_insight is None:
            continue

        image = logo.image_prediction.image
        source_image = image.source_image
        barcode = image.barcode
        grouped.setdefault((barcode, source_image, raw_insight.type), []).append(
            raw_insight
        )

    return [
        ProductInsights(
            insights=raw_insights,
            type=insight_type,
            barcode=barcode,
            source_image=source_image,
        )
        for (barcode, source_image, insight_type), raw_insights in grouped.items()
    ]
def predict(client, product: Dict) -> Optional[ProductInsights]:
    """Predict the category of ``product`` from its localized product names.

    For every language listed under ``"languages_codes"`` the matching
    ``"product_name_<lang>"`` is sent to ``predict_category``; the prediction
    with the highest score is kept.

    Args:
        client: client object forwarded to ``predict_category``.
        product: product data; ``"code"`` is read when a prediction exists.

    Returns:
        A ``ProductInsights`` with a single category insight for the
        best-scoring prediction, or ``None`` if no prediction was obtained.
    """
    predictions = []

    for lang in product.get("languages_codes", []):
        product_name = product.get("product_name_{}".format(lang))

        if not product_name:
            continue

        prediction = predict_category(client, product_name, lang)

        if prediction is None:
            continue

        category, score = prediction
        predictions.append((lang, category, product_name, score))

    if not predictions:
        return None

    # Tuples are (lang, category, product_name, score): rank on the score
    # (index 3), not on the product name (index 2). On ties the first
    # prediction wins, as with a stable descending sort.
    lang, category, product_name, _score = max(
        predictions, key=operator.itemgetter(3)
    )

    return ProductInsights(
        barcode=product["code"],
        type=InsightType.category,
        insights=[
            RawInsight(
                type=InsightType.category,
                value_tag=category,
                data={
                    "lang": lang,
                    "product_name": product_name,
                    "model": "matcher",
                },
            )
        ],
    )
def format_predictions(product: Dict, predictions: List[CategoryPrediction],
                       lang: str) -> ProductInsights:
    """Wrap category predictions into a ``ProductInsights``.

    Args:
        product: product data; its ``"code"`` is used as barcode.
        predictions: ``(category, confidence)`` pairs.
        lang: language stored in each insight's data.

    Returns:
        A category ``ProductInsights`` holding one raw insight per
        prediction, in input order.
    """
    raw_insights = [
        RawInsight(
            type=InsightType.category,
            value_tag=category,
            data={"lang": lang, "model": "neural", "confidence": confidence},
        )
        for category, confidence in predictions
    ]

    return ProductInsights(
        barcode=product["code"],
        type=InsightType.category,
        insights=raw_insights,
    )
def generate_from_ocr_archive(
    input_: Union[str, TextIO, pathlib.Path],
    insight_type: InsightType,
    keep_empty: bool = False,
) -> Iterable[ProductInsights]:
    """Yield the insights of one type for every entry of an OCR archive.

    Entries without a source image, without an extractable barcode (a
    message is echoed to stderr in that case) or without a parsable OCR
    result are skipped.

    Args:
        input_: archive passed to ``ocr_iter``.
        insight_type: type of insight to extract.
        keep_empty: when true, entries with no extracted insight are still
            yielded.

    Yields:
        One ``ProductInsights`` per retained archive entry.
    """
    for source_image, ocr_json in ocr_iter(input_):
        if source_image is None:
            continue

        barcode: Optional[str] = get_barcode_from_path(source_image)

        if barcode is None:
            message = "cannot extract barcode from source {}".format(source_image)
            click.echo(message, err=True)
            continue

        ocr_result: Optional[OCRResult] = OCRResult.from_json(ocr_json)

        if ocr_result is None:
            continue

        insights = extract_insights(ocr_result, insight_type)

        # Empty results are only emitted when explicitly requested.
        if keep_empty or insights:
            yield ProductInsights(
                insights=insights,
                barcode=barcode,
                type=insight_type,
                source_image=source_image,
            )
def insights_iter(file_path: pathlib.Path) -> Iterable[ProductInsights]:
    """Lazily yield a ``ProductInsights`` for each item produced by
    ``jsonl_iter(file_path)``, built with ``ProductInsights.from_dict``."""
    yield from (
        ProductInsights.from_dict(item) for item in jsonl_iter(file_path)
    )
def test_product_insights_failed_merge():
    """``ProductInsights.merge`` raises ``ValueError`` on empty input and on
    inputs whose barcode, type or source image disagree."""

    def build(barcode, insight_type, source_image):
        # Empty ProductInsights carrying only the fields under test.
        return ProductInsights(
            insights=[],
            barcode=barcode,
            type=insight_type,
            source_image=source_image,
        )

    # Nothing to merge at all.
    with pytest.raises(ValueError):
        ProductInsights.merge([])

    # Each entry conflicts with the reference ("123", label, "/123/1.jpg"):
    # different barcode, different type, different source image, and finally
    # both type and source image different.
    conflicting = [
        ("234", InsightType.label, "/123/1.jpg"),
        ("123", InsightType.category, "/123/1.jpg"),
        ("123", InsightType.label, "/123/2.jpg"),
        ("123", InsightType.category, "/123/2.jpg"),
    ]

    for barcode, insight_type, source_image in conflicting:
        with pytest.raises(ValueError):
            ProductInsights.merge([
                build("123", InsightType.label, "/123/1.jpg"),
                build(barcode, insight_type, source_image),
            ])