示例#1
0
def generate_dataset(stream: ProductStream,
                     lang: Optional[str]) -> Iterator[JSONType]:
    """Yield one dataset record per product with at least one inferred category.

    :param stream: product stream; every item returned by ``stream.iter()``
        is read like a mapping
    :param lang: optional language code; when truthy, the language-suffixed
        ``ingredients_text_<lang>`` and ``product_name_<lang>`` fields are
        read instead of the unsuffixed ones
    :return: an iterator of dicts describing the kept products
    """
    category_taxonomy = get_taxonomy('category')
    ingredient_taxonomy = get_taxonomy('ingredient')

    # The field names only depend on `lang`, so they are resolved once.
    if lang:
        ingredients_text_field = 'ingredients_text_{}'.format(lang)
        product_name_field = 'product_name_{}'.format(lang)
    else:
        ingredients_text_field = 'ingredients_text'
        product_name_field = 'product_name'

    for product in stream.iter():
        raw_category_tags: List[str] = product['categories_tags']
        inferred_nodes: List[TaxonomyNode] = list(
            infer_category_tags(raw_category_tags, category_taxonomy))

        # Products for which no category could be inferred are skipped.
        if not inferred_nodes:
            continue

        # Drop falsy (e.g. empty) ingredient tags.
        ingredient_tags = [
            tag for tag in product.get('ingredients_tags', []) if tag
        ]
        known_ingredient_tags = [
            tag for tag in ingredient_tags if tag in ingredient_taxonomy
        ]
        # A missing or empty text is normalized to None.
        ingredients_text = product.get(ingredients_text_field, None) or None

        yield {
            'code': product['code'],
            'product_name': product[product_name_field],
            'categories_tags': [node.id for node in inferred_nodes],
            'ingredient_tags': ingredient_tags,
            'known_ingredient_tags': known_ingredient_tags,
            'ingredients_text': ingredients_text,
            'lang': product.get('lang', None),
        }
示例#2
0
def generate_dataset(stream: ProductStream,
                     lang: Optional[str]) -> Iterator[JSONType]:
    """Yield one dataset record per product with at least one inferred category.

    Besides the textual fields, each record carries the product's
    ``nutriments`` and ``images`` (``None`` when missing or empty).

    :param stream: product stream; every item returned by ``stream.iter()``
        is read like a mapping
    :param lang: optional language code; when truthy, the language-suffixed
        ``ingredients_text_<lang>`` and ``product_name_<lang>`` fields are
        read instead of the unsuffixed ones
    :return: an iterator of dicts describing the kept products
    """
    category_taxonomy = get_taxonomy("category")
    ingredient_taxonomy = get_taxonomy("ingredient")

    # Both field names only depend on `lang`; compute them a single time.
    suffix = "_{}".format(lang) if lang else ""
    ingredients_text_field = "ingredients_text" + suffix
    product_name_field = "product_name" + suffix

    for product in stream.iter():
        raw_category_tags: List[str] = product["categories_tags"]
        inferred_nodes: List[TaxonomyNode] = list(
            infer_category_tags(raw_category_tags, category_taxonomy))

        # Nothing is emitted for products without any inferred category.
        if not inferred_nodes:
            continue

        # Falsy (e.g. empty) ingredient tags are discarded.
        ingredient_tags = [
            tag for tag in product.get("ingredients_tags", []) if tag
        ]
        known_ingredient_tags = [
            tag for tag in ingredient_tags if tag in ingredient_taxonomy
        ]
        # A missing or empty text is normalized to None.
        ingredients_text = product.get(ingredients_text_field, None) or None

        yield {
            "code": product["code"],
            "nutriments": product.get("nutriments") or None,
            "images": product.get("images", {}) or None,
            "product_name": product[product_name_field],
            "categories_tags": [node.id for node in inferred_nodes],
            "ingredient_tags": ingredient_tags,
            "known_ingredient_tags": known_ingredient_tags,
            "ingredients_text": ingredients_text,
            "lang": product.get("lang", None),
        }
def train(model_output_dir: pathlib.Path, comment: Optional[str] = None):
    """Train the category classifier, save it and write its metadata.

    The model is saved under `model_output_dir`, together with a
    `meta.json` file holding the test metrics, a dataset identifier and the
    sizes of the train/test splits.

    :param model_output_dir: directory the model and `meta.json` are
        written to
    :param comment: optional note stored under the `comment` key of the
        metadata when truthy
    """
    taxonomy: Taxonomy = get_taxonomy(TaxonomyType.category.name)
    classifier = CategoryClassifier(taxonomy)
    dataset: ProductDataset = ProductDataset.load()
    train_df, test_df = classifier.train(dataset)

    classifier.save(str(model_output_dir))
    test_metrics = classifier.evaluate(test_df)

    # The dataset is identified by the modification date of the JSONL file.
    dataset_mtime = os.path.getmtime(settings.JSONL_DATASET_PATH)
    dataset_id = datetime.datetime.fromtimestamp(
        dataset_mtime).date().isoformat()

    meta = {
        'metrics': {'test': test_metrics},
        'dataset_id': dataset_id,
        'training_set_count': len(train_df),
        'test_set_count': len(test_df),
    }

    if comment:
        meta['comment'] = comment

    meta_path = model_output_dir / 'meta.json'
    with open(str(meta_path), 'w') as f:
        json.dump(meta, f)
示例#4
0
    def format_question(self, insight: ProductInsight, lang: str) -> Question:
        """Build the binary question asked about a label insight.

        :param insight: insight whose `value_tag` is the label in question
        :param lang: language used to localize the question and label name
        :return: an `AddBinaryQuestion` wrapping the insight
        """
        label_tag: str = insight.value_tag
        # Only labels listed in LABEL_IMAGES get an illustrative image.
        image_url = (LABEL_IMAGES[label_tag]
                     if label_tag in LABEL_IMAGES else None)

        label_taxonomy: Taxonomy = get_taxonomy(TaxonomyType.label.name)
        label_name: str = label_taxonomy.get_localized_name(label_tag, lang)
        question_text = self.translation_store.gettext(lang, self.question)

        if insight.source_image:
            display_image = get_display_image(insight.source_image)
            source_image_url = settings.OFF_IMAGE_BASE_URL + display_image
        else:
            source_image_url = None

        return AddBinaryQuestion(
            question=question_text,
            value=label_name,
            value_tag=label_tag,
            insight=insight,
            image_url=image_url,
            source_image_url=source_image_url,
        )
示例#5
0
    def is_valid(self, barcode: str, label_tag: str,
                 label_seen: Set[str]) -> bool:
        """Tell whether a predicted label should be kept for a product.

        The label is rejected when the product already has it, when it is in
        `label_seen`, or when it is a parent (in the label taxonomy) of a
        label the product has or that is in `label_seen`.

        :param barcode: key used to look the product up in `product_store`
        :param label_tag: the predicted label tag
        :param label_seen: label tags that were already seen
        :return: False when the label is rejected, True otherwise
        """
        product = self.product_store[barcode]
        existing_tags = getattr(product, 'labels_tags', [])

        if label_tag in existing_tags or label_tag in label_seen:
            return False

        taxonomy: Taxonomy = get_taxonomy(InsightType.label.name)

        if label_tag not in taxonomy:
            # Without a taxonomy node there is no hierarchy to check.
            return True

        predicted_node: TaxonomyNode = taxonomy[label_tag]
        tags_to_check = set(existing_tags).union(label_seen)

        # Lookups that yield None are skipped.
        is_parent = any(
            node is not None and node.is_child_of(predicted_node)
            for node in (taxonomy[tag] for tag in tags_to_check))
        return not is_parent
示例#6
0
    def is_latent(product: Optional[Product], barcode: str, tag: str,
                  seen_set: Set[str]) -> bool:
        """Tell whether a predicted label is already covered for a product.

        A tag is latent when the product already has it, when it is in
        `seen_set`, or when it is a parent (in the label taxonomy) of a label
        the product has or that is in `seen_set`.

        :param product: the product, or None; a missing `labels_tags`
            attribute is treated as an empty list
        :param barcode: not used by this function
        :param tag: the predicted label tag
        :param seen_set: label tags that were already seen
        :return: True when the tag is latent, False otherwise
        """
        existing_tags = getattr(product, "labels_tags", [])

        if tag in existing_tags or tag in seen_set:
            return True

        taxonomy: Taxonomy = get_taxonomy(InsightType.label.name)

        if tag not in taxonomy:
            # Without a taxonomy node there is no hierarchy to check.
            return False

        predicted_node: TaxonomyNode = taxonomy[tag]
        tags_to_check = set(existing_tags).union(seen_set)

        for other_tag in tags_to_check:
            other_node = taxonomy[other_tag]
            # Lookups that yield None are skipped.
            if other_node is None:
                continue
            if other_node.is_child_of(predicted_node):
                return True

        return False
示例#7
0
def categorize(
    barcode: str,
    deepest_only: bool = False,
) -> None:
    """Print the neural category classifier's predictions for a product.

    One line is printed per prediction (value tag and confidence); a
    message is printed instead when the product is not found or when
    nothing is predicted.

    :param barcode: barcode of the product to fetch and classify
    :param deepest_only: forwarded to the classifier; controls whether only
        the deepest category of each predicted taxonomy chain is returned.
        For example, if 'fresh vegetables' -> 'legumes' -> 'beans' is
        predicted, deepest_only=True keeps only 'beans'.
    """
    from robotoff.prediction.category.neural.category_classifier import (
        CategoryClassifier, )
    from robotoff.products import get_product
    from robotoff.taxonomy import TaxonomyType, get_taxonomy

    product = get_product(barcode)
    if product is None:
        print(f"Product {barcode} not found")
        return

    category_taxonomy = get_taxonomy(TaxonomyType.category.name)
    classifier = CategoryClassifier(category_taxonomy)
    predictions = classifier.predict(product, deepest_only)

    if not predictions:
        print(f"Nothing predicted for product {barcode}")
        return

    for prediction in predictions:
        print(f"{prediction.value_tag}: {prediction.data['confidence']}")
示例#8
0
def extract_ingredients_from_taxonomy(lang: str):
    """Collect the lowercased synonyms of every ingredient taxonomy node.

    :param lang: language code passed to each node's `get_synonyms`
    :return: a set of lowercased synonym strings
    """
    ingredient_taxonomy = get_taxonomy(TaxonomyType.ingredient.name)
    return {
        synonym.lower()
        for node in ingredient_taxonomy.nodes.values()
        for synonym in node.get_synonyms(lang)
    }
示例#9
0
 def format_question(self, insight: ProductInsight, lang: str) -> Question:
     """Build the binary question asked about a category insight.

     :param insight: insight whose `value_tag` is the category in question
     :param lang: language used to localize the question and category name
     :return: an `AddBinaryQuestion` wrapping the insight
     """
     category_tag: str = insight.value_tag
     category_taxonomy: Taxonomy = get_taxonomy(TaxonomyType.category.name)
     category_name: str = category_taxonomy.get_localized_name(
         category_tag, lang)
     question_text = self.translation_store.gettext(lang, self.question)
     image_url = self.get_source_image_url(insight.barcode)
     return AddBinaryQuestion(
         question=question_text,
         value=category_name,
         insight=insight,
         source_image_url=image_url,
     )
示例#10
0
def category_export():
    """Replace the categories stored in Elasticsearch with the current taxonomy.

    Existing categories are deleted first, then the data generated from the
    category taxonomy is exported to the index named by
    `settings.ELASTICSEARCH_CATEGORY_INDEX`. Progress is logged at INFO
    level, including the number of rows inserted.
    """
    logger.info("Starting category export to Elasticsearch...")
    client = get_es_client()
    category_taxonomy: Taxonomy = get_taxonomy(InsightType.category.name)
    logger.info("Deleting existing categories...")
    delete_categories(client)
    logger.info("Starting export...")
    category_data = generate_category_data(category_taxonomy)
    rows_inserted = perform_export(client, category_data,
                                   settings.ELASTICSEARCH_CATEGORY_INDEX)
    # Pass the value as a logging argument so formatting is deferred to the
    # logging framework and only happens when the record is emitted.
    logger.info("%d rows inserted", rows_inserted)
示例#11
0
 def generate_candidates(
     cls,
     product: Product,
     predictions: List[Prediction],
 ) -> Iterator[ProductInsight]:
     """Yield category insights for the deepest valid predictions.

     Predictions rejected by `cls.is_prediction_valid` are dropped, then
     `select_deepest_taxonomized_candidates` picks the ones to keep using
     the category taxonomy.

     :param product: product the predictions are about
     :param predictions: predictions to turn into insights
     :return: an iterator of `ProductInsight` built from the kept predictions
     """
     valid_predictions = [
         prediction for prediction in predictions
         if cls.is_prediction_valid(
             product, prediction.value_tag)  # type: ignore
     ]
     category_taxonomy = get_taxonomy(InsightType.category.name)
     kept_predictions = select_deepest_taxonomized_candidates(
         valid_predictions, category_taxonomy)
     for kept in kept_predictions:
         yield ProductInsight(**kept.to_dict())
示例#12
0
    def on_get(self, req: falcon.Request, resp: falcon.Response):
        """Respond with the predicted categories of the requested product.

        Reads the required `barcode` parameter and the optional boolean
        `deepest_only` parameter (default False). The response media is
        `{"categories": [...]}`; the list is empty when `get_product`
        returns a falsy value.
        """
        barcode = req.get_param("barcode", required=True)
        deepest_only = req.get_param_as_bool("deepest_only", default=False)

        product = get_product(barcode)
        if product:
            category_taxonomy = get_taxonomy(TaxonomyType.category.name)
            classifier = CategoryClassifier(category_taxonomy)
            categories = [
                prediction.to_dict()
                for prediction in classifier.predict(product, deepest_only)
            ]
        else:
            categories = []

        resp.media = {"categories": categories}
示例#13
0
def generate_category_data() -> Iterable[Tuple[str, Dict]]:
    """Yield an `(id, data)` pair for every node of the category taxonomy.

    `data` maps `"<lang>:name"` to the node name for each language of the
    node that is in `SUPPORTED_LANG`, plus the node id under `"id"`.
    The yielded id is the SHA-256 hex digest of the UTF-8 encoded node id.
    """
    taxonomy: Taxonomy = get_taxonomy(InsightType.category.name)

    for node in taxonomy.iter_nodes():
        document = {
            f"{lang}:name": node.names[lang]
            for lang in node.names
            if lang in SUPPORTED_LANG
        }
        document["id"] = node.id

        encoded_id = node.id.encode("utf-8")
        yield hashlib.sha256(encoded_id).hexdigest(), document
示例#14
0
 def generate_candidates(
     cls,
     product: Product,
     predictions: List[Prediction],
 ) -> Iterator[ProductInsight]:
     """Yield label insights for the deepest valid predictions.

     Predictions rejected by `cls.is_prediction_valid` are dropped, then
     `select_deepest_taxonomized_candidates` picks the ones to keep using
     the label taxonomy. When an insight has no `automatic_processing`
     value, it is set to whether the label is in `AUTHORIZED_LABELS_STORE`.

     :param product: product the predictions are about
     :param predictions: predictions to turn into insights
     :return: an iterator of `ProductInsight` built from the kept predictions
     """
     valid_predictions = [
         prediction for prediction in predictions
         if cls.is_prediction_valid(
             product, prediction.value_tag)  # type: ignore
     ]
     label_taxonomy = get_taxonomy(InsightType.label.name)
     kept_predictions = select_deepest_taxonomized_candidates(
         valid_predictions, label_taxonomy)
     for kept in kept_predictions:
         insight = ProductInsight(**kept.to_dict())
         # An explicit value already set on the insight is left untouched.
         if insight.automatic_processing is None:
             authorized_labels = AUTHORIZED_LABELS_STORE.get()
             insight.automatic_processing = (
                 kept.value_tag in authorized_labels)
         yield insight
示例#15
0
def add_category_insight(barcode: str, product: JSONType,
                         server_domain: str) -> bool:
    """Predict categories for product and import predicted category insight.

    Predictions come from two sources: an Elasticsearch-based prediction and
    the neural category classifier. An HTTP error from the classifier is
    logged and its predictions are skipped.

    :param barcode: product barcode
    :param product: product as retrieved from application
    :param server_domain: the server the product belongs to
    :return: True if at least one category insight was imported
    """
    if get_server_type(server_domain) != ServerType.off:
        return False

    logger.info("Predicting product categories...")
    # predict category using Elasticsearch on title
    product_predictions = []
    es_prediction = predict_category_from_product_es(product)

    if es_prediction is not None:
        product_predictions.append(es_prediction)

    # predict category using neural model
    neural_predictions = []
    try:
        neural_predictions = CategoryClassifier(
            get_taxonomy(TaxonomyType.category.name)).predict(product)
    except requests.exceptions.HTTPError as e:
        resp = e.response
        # Use logging arguments only: interpolating part of the message with
        # an f-string would make any '%' in the interpolated value be parsed
        # as a format directive.
        logger.error("Category classifier returned an error: %s: %s",
                     resp.status_code, resp.text)

    for neural_prediction in neural_predictions:
        # The barcode is set here on each neural prediction before import.
        neural_prediction.barcode = barcode
        product_predictions.append(neural_prediction)

    if not product_predictions:
        return False

    imported = import_insights(product_predictions,
                               server_domain,
                               automatic=True)
    logger.info("%s category insight imported for product %s", imported,
                barcode)

    return bool(imported)
示例#16
0
    def is_valid(self,
                 insight: ProductInsight,
                 product: Optional[Product] = None) -> bool:
        """Tell whether a category insight is still relevant for its product.

        The insight is invalid when the product already has the category, or
        when the category is a parent (in the category taxonomy) of one of
        the product's categories.

        :param insight: insight whose `value_tag` is the predicted category
        :param product: the product; looked up in `product_store` by the
            insight barcode when None
        :return: False when the insight is invalid, True otherwise
        """
        if product is None:
            product = self.product_store[insight.barcode]

        existing_tags = getattr(product, "categories_tags", [])
        predicted_tag = insight.value_tag

        if predicted_tag in existing_tags:
            return False

        taxonomy: Taxonomy = get_taxonomy(InsightType.category.name)

        # Tags absent from the taxonomy cannot be a parent of anything.
        if predicted_tag not in taxonomy:
            return True

        if taxonomy.is_parent_of_any(predicted_tag, existing_tags):
            return False

        return True
示例#17
0
    def is_valid(self,
                 insight: ProductInsight,
                 product: Optional[Product] = None) -> bool:
        """Tell whether a label insight is still relevant for its product.

        The insight is invalid when the product already has the label, or
        when the label is a parent (in the label taxonomy) of one of the
        product's labels.

        :param insight: insight whose `value_tag` is the predicted label
        :param product: the product; looked up in `product_store` by the
            insight barcode when None
        :return: False when the insight is invalid, True otherwise
        """
        if product is None:
            product = self.product_store[insight.barcode]

        existing_tags = getattr(product, 'labels_tags', [])
        predicted_tag = insight.value_tag

        if predicted_tag in existing_tags:
            return False

        taxonomy: Taxonomy = get_taxonomy(InsightType.label.name)

        # Tags absent from the taxonomy cannot be a parent of anything.
        if predicted_tag not in taxonomy:
            return True

        if taxonomy.is_parent_of_any(predicted_tag, existing_tags):
            return False

        return True
示例#18
0
    def is_valid(
        self,
        product: Optional[Product],
        barcode: str,
        category: str,
        seen_set: Set[str],
    ):
        """Tell whether a predicted category should be kept for a product.

        The category is rejected when the product already has it, when it is
        in `seen_set`, or when it is a parent (in the category taxonomy) of a
        category the product has or that is in `seen_set`. The reason for a
        rejection is logged at DEBUG level.

        :param product: the product, or None; a missing `categories_tags`
            attribute is treated as an empty list
        :param barcode: not used by this method
        :param category: the predicted category tag
        :param seen_set: category tags that were already seen
        :return: False when the category is rejected, True otherwise
        """
        existing_tags = getattr(product, "categories_tags", [])

        if category in existing_tags:
            logger.debug("The product already belongs to this category, "
                         "considering the insight as invalid")
            return False

        if category in seen_set:
            logger.debug("An insight already exists for this product and "
                         "category, considering the insight as invalid")
            return False

        taxonomy: Taxonomy = get_taxonomy(InsightType.category.name)

        if category not in taxonomy:
            # Without a taxonomy node there is no hierarchy to check.
            return True

        predicted_node: TaxonomyNode = taxonomy[category]
        tags_to_check = set(existing_tags).union(seen_set)

        for other_tag in tags_to_check:
            other_node = taxonomy[other_tag]
            # Lookups that yield None are skipped.
            if other_node is None or not other_node.is_child_of(
                    predicted_node):
                continue

            logger.debug(
                "The predicted category is a parent of the product "
                "category or of the predicted category of an insight, "
                "considering the insight as invalid")
            return False

        return True
示例#19
0
 def is_parent_label(cls, tag: str, to_check_labels: Set[str]) -> bool:
     """Return whether `tag` is a parent of any label in `to_check_labels`.

     The check is delegated to the label taxonomy's `is_parent_of_any`,
     called with `raises=False`.
     """
     label_taxonomy = get_taxonomy(InsightType.label.name)
     return label_taxonomy.is_parent_of_any(
         tag,
         to_check_labels,
         raises=False,
     )
示例#20
0
 def is_parent_category(cls, category: str, to_check_categories: Set[str]):
     """Return whether `category` is a parent of any of `to_check_categories`.

     The check is delegated to the category taxonomy's `is_parent_of_any`,
     called with `raises=False`.
     """
     category_taxonomy = get_taxonomy(InsightType.category.name)
     return category_taxonomy.is_parent_of_any(
         category,
         to_check_categories,
         raises=False,
     )
示例#21
0
from robotoff.taxonomy import get_taxonomy
import csv

# Dump the ingredient, category and label taxonomies to "<name>.tsv" files,
# one row per node, with the English name used as both name and description.
for taxonomy_name in ("ingredient", "category", "label"):
    taxonomy = get_taxonomy(taxonomy_name)

    # UTF-8 is requested explicitly: names may contain non-ASCII characters
    # and the platform default encoding is not guaranteed to support them.
    with open(f"{taxonomy_name}.tsv", "w", newline="",
              encoding="utf-8") as f:
        # The output file is a .tsv, so columns are tab-separated rather
        # than using the csv module's default comma delimiter.
        writer = csv.DictWriter(f,
                                fieldnames=["id", "name", "description"],
                                delimiter="\t")
        writer.writeheader()

        for node in taxonomy.iter_nodes():
            name = node.get_localized_name("en")

            # Nodes whose English name equals their id are skipped
            # (presumably the id is the fallback when no English name
            # exists -- confirm against get_localized_name).
            if name != node.id:
                writer.writerow({"id": node.id, "name": name, "description": name})
示例#22
0
def test_select_deepest_taxonomized_candidates(candidates, taxonomy_name,
                                               kept_indices):
    """Check that only the candidates at `kept_indices` are selected."""
    taxonomy = get_taxonomy(taxonomy_name)
    selected = select_deepest_taxonomized_candidates(candidates, taxonomy)
    expected = [candidates[idx] for idx in kept_indices]
    assert selected == expected