Exemplo n.º 1
0
    def generate_insights(
        self,
        max_errors: Optional[int] = None,
        lang: str = "fr",
        limit: Optional[int] = None,
    ) -> Iterable[Prediction]:
        """Yield ingredient spellcheck predictions built from the JSONL dataset.

        The dataset at ``settings.JSONL_DATASET_PATH`` is streamed through
        three filters: country tag ``en:france``, text field ``lang`` against
        `lang`, and non-empty ``ingredients_text_fr``.

        Args:
            max_errors: forwarded as-is to ``self.is_product_valid``.
            lang: value passed to the ``lang`` text-field filter; it is also
                stored under the ``"lang"`` key of every generated insight.
            limit: stop after this many predictions have been yielded;
                ``None`` means no limit.

        Yields:
            One ``Prediction`` of type ``ingredient_spellcheck`` per product
            for which ``self.predict_insight`` returned something.
        """
        stream = ProductDataset(settings.JSONL_DATASET_PATH).stream()
        stream = stream.filter_by_country_tag("en:france")
        stream = stream.filter_text_field("lang", lang)
        stream = stream.filter_nonempty_text_field("ingredients_text_fr")

        generated = 0
        for product in stream.iter():
            if not self.is_product_valid(product, max_errors=max_errors):
                continue

            insight = self.predict_insight(product["ingredients_text_fr"])
            if insight is None:
                continue

            insight["lang"] = lang
            yield Prediction(
                type=PredictionType.ingredient_spellcheck,
                data=insight,
                barcode=product["code"],
            )

            # The counter only tracks predictions actually yielded.
            generated += 1
            if limit is not None and generated >= limit:
                return
Exemplo n.º 2
0
def train(model_output_dir: pathlib.Path, comment: Optional[str] = None):
    """Train the category classifier, save it and write a ``meta.json`` file.

    Args:
        model_output_dir: directory passed to ``CategoryClassifier.save``;
            ``meta.json`` is written inside it.
        comment: optional free text stored under the ``"comment"`` key of the
            metadata (omitted when empty or ``None``).
    """
    taxonomy: Taxonomy = get_taxonomy(TaxonomyType.category.name)
    classifier = CategoryClassifier(taxonomy)
    product_dataset: ProductDataset = ProductDataset.load()
    train_df, test_df = classifier.train(product_dataset)

    classifier.save(str(model_output_dir))
    metrics = classifier.evaluate(test_df)

    # The dataset is identified by the modification date of the JSONL dump.
    dump_mtime = os.path.getmtime(settings.JSONL_DATASET_PATH)
    dataset_id = datetime.datetime.fromtimestamp(dump_mtime).date().isoformat()

    meta = {
        "metrics": {"test": metrics},
        "dataset_id": dataset_id,
        "training_set_count": len(train_df),
        "test_set_count": len(test_df),
    }
    if comment:
        meta["comment"] = comment

    meta_path = model_output_dir / "meta.json"
    with meta_path.open("w") as meta_file:
        json.dump(meta, meta_file)
Exemplo n.º 3
0
def main():
    """Generate category insights for the products of one language.

    NOTE(review): ``lang`` is not defined in this function; it is presumably
    a module-level name -- confirm against the rest of the file.
    """
    stream = ProductDataset.load().stream()
    stream = stream.filter_text_field("lang", lang)
    stream = stream.filter_nonempty_text_field("product_name_{}".format(lang))

    generate_category_insights(stream.iter(), batch_size=1024)
Exemplo n.º 4
0
 def generate_prediction_df(self, dataset: ProductDataset) -> pd.DataFrame:
     """Build a DataFrame with one transformed row per selected product.

     Products are taken from `dataset`, filtered on the ``en:france``
     country tag and on a non-empty ``product_name`` field; each one is
     passed through ``self.transform_product``.
     """
     stream = dataset.stream()
     stream = stream.filter_by_country_tag("en:france")
     stream = stream.filter_nonempty_text_field("product_name")

     # Kept as a generator expression: rows are produced lazily for pandas.
     rows = (self.transform_product(product) for product in stream)
     return pd.DataFrame(rows)
Exemplo n.º 5
0
def train(model_output_dir: pathlib.Path, comment: Optional[str] = None):
    """Fit a category classifier, persist it and record training metadata.

    The classifier is saved in `model_output_dir`, then evaluated on the test
    split. A ``meta.json`` file holding the test metrics, a dataset id and
    the train/test sizes is written next to it.

    Args:
        model_output_dir: output directory for the model and ``meta.json``.
        comment: optional note added to the metadata when non-empty.
    """
    taxonomy_store = TAXONOMY_STORES[TaxonomyType.category.name]
    classifier = CategoryClassifier(taxonomy_store.get())
    train_df, test_df = classifier.train(ProductDataset.load())

    classifier.save(str(model_output_dir))
    evaluation = classifier.evaluate(test_df)

    # Modification time of the JSONL dump, reduced to a date, is the id.
    modified_at = datetime.datetime.fromtimestamp(
        os.path.getmtime(settings.JSONL_DATASET_PATH))

    meta = {'metrics': {'test': evaluation}}
    meta['dataset_id'] = modified_at.date().isoformat()
    meta['training_set_count'] = len(train_df)
    meta['test_set_count'] = len(test_df)

    if comment:
        meta['comment'] = comment

    with (model_output_dir / 'meta.json').open('w') as meta_file:
        json.dump(meta, meta_file)
Exemplo n.º 6
0
 def predict_category(output: str):
     """Write the category predictions for the JSONL dataset to `output`.

     Args:
         output: destination handed to ``dump_jsonl``.
     """
     # Imports are local to the command, as in the rest of this module.
     from robotoff.elasticsearch.category.predict import predict_from_dataset
     from robotoff.utils import dump_jsonl
     from robotoff.products import ProductDataset
     from robotoff import settings

     predictions = predict_from_dataset(
         ProductDataset(settings.JSONL_DATASET_PATH))
     dump_jsonl(output, predictions)
Exemplo n.º 7
0
def compute_brand_prefix(
    product_dataset: ProductDataset, threshold: Optional[int] = None
) -> Dict[Tuple[str, str], int]:
    """Count how often each (brand tag, barcode prefix) pair occurs.

    Only products with a non-empty ``brands_tags`` field, a non-empty
    ``code`` field and a 13-character barcode are counted.

    Args:
        product_dataset: dataset whose products are streamed.
        threshold: when truthy, pairs seen fewer than `threshold` times are
            dropped from the result.

    Returns:
        A dict mapping ``(brand_tag, barcode_prefix)`` to its count.
    """
    pair_counts: Dict[Tuple[str, str], int] = {}

    stream = product_dataset.stream()
    stream = stream.filter_nonempty_tag_field("brands_tags")
    stream = stream.filter_nonempty_text_field("code")

    for product in stream:
        # Empty tags are ignored; the set removes duplicated brand tags.
        brand_tags = {tag for tag in product["brands_tags"] if tag}
        barcode = product["code"]

        if len(barcode) != 13:
            continue

        prefix = generate_barcode_prefix(barcode)
        for brand_tag in brand_tags:
            pair = (brand_tag, prefix)
            pair_counts[pair] = pair_counts.get(pair, 0) + 1

    if not threshold:
        return pair_counts

    return {
        pair: occurrences
        for pair, occurrences in pair_counts.items()
        if occurrences >= threshold
    }
Exemplo n.º 8
0
def predict_from_dataset(
        dataset: ProductDataset,
        from_datetime: Optional[datetime.datetime] = None
) -> Iterable[JSONType]:
    """Yield category insights for the products of `dataset`.

    The stream keeps products with non-empty ``code``, ``product_name``,
    ``countries_tags`` and ``languages_codes`` fields and an empty
    ``categories_tags`` field.

    Args:
        dataset: a ProductDataset
        from_datetime: datetime threshold: when provided, only products
            modified after `from_datetime` are kept
    """
    stream = dataset.stream()
    for text_field in ("code", "product_name"):
        stream = stream.filter_nonempty_text_field(text_field)

    stream = stream.filter_empty_tag_field("categories_tags")

    for tag_field in ("countries_tags", "languages_codes"):
        stream = stream.filter_nonempty_tag_field(tag_field)

    if from_datetime:
        stream = stream.filter_by_modified_datetime(from_t=from_datetime)

    products = stream.iter()
    logger.info("Performing prediction on products without categories")

    yield from predict_from_iterable(get_es_client(), products)
Exemplo n.º 9
0
def save_brand_prefix(count_threshold: int):
    """Compute the brand/barcode-prefix pairs and dump them as JSON.

    The keys returned by ``compute_brand_prefix`` for the JSONL dataset are
    written to ``settings.BRAND_PREFIX_PATH``.

    Args:
        count_threshold: passed as ``threshold`` to ``compute_brand_prefix``.
    """
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    # Only the keys are persisted; the counts themselves are discarded.
    kept_pairs = list(compute_brand_prefix(dataset, threshold=count_threshold))

    with settings.BRAND_PREFIX_PATH.open("w") as output_file:
        json.dump(kept_pairs, output_file)
Exemplo n.º 10
0
def main():
    """Feed the products of one language to the category insight updater.

    NOTE(review): ``lang`` is not defined in this function; it is presumably
    a module-level name -- confirm against the rest of the file.
    """
    stream = ProductDataset.load().stream()
    stream = stream.filter_text_field('lang', lang)
    stream = stream.filter_nonempty_text_field('product_name_{}'.format(lang))

    products = stream.iter()
    updated_product_add_category_insight(products, batch_size=1024)
Exemplo n.º 11
0
    def predict_category(output: str):
        """Dump the category insights of the JSONL dataset, as dicts.

        Args:
            output: destination handed to ``dump_jsonl``.
        """
        from robotoff.elasticsearch.category.predict import predict_from_dataset
        from robotoff.utils import dump_jsonl
        from robotoff.products import ProductDataset
        from robotoff import settings

        predicted = predict_from_dataset(
            ProductDataset(settings.JSONL_DATASET_PATH))
        # Each insight is converted with to_dict() lazily, while dumping.
        dump_jsonl(output, (insight.to_dict() for insight in predicted))
Exemplo n.º 12
0
def product_export():
    """Export normalized French ingredient lists to Elasticsearch.

    Products come from the JSONL dataset, filtered on the ``en:france``
    country tag, a non-empty ``ingredients_text_fr`` field and the
    ``en:complete`` state tag. Products whose ``quality_tags`` contain
    ``ingredients-unknown-score-above-0`` are skipped.
    """
    stream = ProductDataset(settings.JSONL_DATASET_PATH).stream()
    stream = stream.filter_by_country_tag('en:france')
    stream = stream.filter_nonempty_text_field('ingredients_text_fr')
    stream = stream.filter_by_state_tag('en:complete')
    products = stream.iter()

    def iter_documents():
        # Yields (barcode, document) pairs, built lazily during the export.
        for product in products:
            if ('ingredients-unknown-score-above-0'
                    in product.get('quality_tags', [])):
                continue

            normalized = normalize_ingredient_list(
                product['ingredients_text_fr'])
            yield product['code'], {'ingredients_text_fr': normalized}

    data = iter_documents()

    logger.info("Importing products")

    perform_export(get_es_client(), data,
                   settings.ELASTICSEARCH_PRODUCT_INDEX)
Exemplo n.º 13
0
def generate_product_data() -> Iterable[Tuple[str, Dict]]:
    """Return a lazy iterable of ``(barcode, document)`` pairs.

    Products come from the JSONL dataset, filtered on ``lang`` = ``fr``, the
    ``en:france`` country tag, a non-empty ``ingredients_text_fr`` field and
    the ``en:complete`` state tag. Only products whose
    ``unknown_ingredients_n`` (0 when absent) equals 0 are kept. Each
    document holds the normalized ``ingredients_text_fr``.
    """
    stream = ProductDataset(settings.JSONL_DATASET_PATH).stream()
    stream = stream.filter_text_field("lang", "fr")
    stream = stream.filter_by_country_tag("en:france")
    stream = stream.filter_nonempty_text_field("ingredients_text_fr")
    stream = stream.filter_by_state_tag("en:complete")
    products = stream.iter()

    def iter_documents():
        for product in products:
            if int(product.get("unknown_ingredients_n", 0)) != 0:
                continue

            normalized = normalize_ingredient_list(
                product["ingredients_text_fr"])
            yield product["code"], {"ingredients_text_fr": normalized}

    return iter_documents()
Exemplo n.º 14
0
def generate_insights(client, confidence=1):
    """Yield one spellcheck insight dict per product that has corrections.

    Products come from the JSONL dataset, filtered on the ``en:france``
    country tag and a non-empty ``ingredients_text_fr`` field.

    Args:
        client: forwarded to ``generate_corrections``.
        confidence: forwarded to ``generate_corrections``.

    Yields:
        Dicts with the keys ``corrections``, ``text``, ``corrected`` and
        ``barcode``.
    """
    stream = ProductDataset(settings.JSONL_DATASET_PATH).stream()
    stream = stream.filter_by_country_tag("en:france")
    stream = stream.filter_nonempty_text_field("ingredients_text_fr")

    for product in stream.iter():
        text = product["ingredients_text_fr"]
        corrections = generate_corrections(client, text, confidence=confidence)

        if corrections:
            # Flatten the term corrections of every correction into one list.
            term_corrections = [
                term_correction
                for correction in corrections
                for term_correction in correction.term_corrections
            ]

            yield {
                "corrections": [
                    dataclasses.asdict(tc) for tc in term_corrections
                ],
                "text": text,
                "corrected": generate_corrected_text(term_corrections, text),
                "barcode": product["code"],
            }
Exemplo n.º 15
0
def generate_insights():
    """Generate and import category insights from the latest dataset dump.

    Only products modified since the start of the previous UTC day are
    considered (the threshold is passed to ``predict_from_dataset``).
    """
    logger.info("Generating new category insights")
    product_store: ProductStore = CACHED_PRODUCT_STORE.get()
    importer = CategoryImporter(product_store)

    # Naive UTC midnight of the current day, moved back by one day.
    today_midnight = datetime.datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0)
    datetime_threshold = today_midnight - datetime.timedelta(days=1)

    insights = predict_from_dataset(
        ProductDataset(settings.JSONL_DATASET_PATH), datetime_threshold)

    imported = importer.import_insights(insights)
    logger.info("{} category insights imported".format(imported))
Exemplo n.º 16
0
def generate_insights(client, confidence=1):
    """Generate spellcheck insights for French ingredient lists.

    The JSONL dataset is streamed, keeping products with the ``en:france``
    country tag and a non-empty ``ingredients_text_fr`` field. Products for
    which ``generate_corrections`` returns nothing are skipped.

    Args:
        client: forwarded to ``generate_corrections``.
        confidence: forwarded to ``generate_corrections``.

    Yields:
        Dicts with the keys ``corrections``, ``text``, ``corrected`` and
        ``barcode``.
    """
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    france_stream = dataset.stream().filter_by_country_tag('en:france')
    products = france_stream.filter_nonempty_text_field(
        'ingredients_text_fr').iter()

    for product in products:
        text = product['ingredients_text_fr']
        corrections = generate_corrections(client, text, confidence=confidence)

        if not corrections:
            continue

        # One flat list with the term corrections of all corrections.
        term_corrections = []
        for correction in corrections:
            term_corrections.extend(correction.term_corrections)

        serialized = [dataclasses.asdict(tc) for tc in term_corrections]
        corrected = generate_corrected_text(term_corrections, text)

        yield {
            'corrections': serialized,
            'text': text,
            'corrected': corrected,
            'barcode': product['code'],
        }
Exemplo n.º 17
0
def images_dimension_iter():
    """Yield ``[width, height, barcode, image_id]`` for full-size images.

    Products with a non-empty ``code`` are streamed from the dataset. Only
    images whose id is made of digits and whose ``sizes`` contain a ``full``
    entry are reported.
    """
    products = ProductDataset.load().stream().filter_nonempty_text_field("code")

    for product in products:
        for image_id, image_data in product.get("images", {}).items():
            # Short-circuit: "sizes" is only read for digit-only image ids.
            if image_id.isdigit() and "full" in image_data["sizes"]:
                full_size = image_data["sizes"]["full"]
                width, height = full_size["w"], full_size["h"]
                yield [int(width), int(height), product["code"], str(image_id)]
Exemplo n.º 18
0
def product_export():
    """Export normalized French ingredient lists to Elasticsearch.

    Products come from the JSONL dataset, filtered on the ``en:france``
    country tag, a non-empty ``ingredients_text_fr`` field and the
    ``en:complete`` state tag. Products whose ``quality_tags`` contain
    ``ingredients-unknown-score-above-0`` are left out. The value returned
    by ``perform_export`` is logged as the number of inserted rows.
    """
    stream = ProductDataset(settings.JSONL_DATASET_PATH).stream()
    stream = stream.filter_by_country_tag("en:france")
    stream = stream.filter_nonempty_text_field("ingredients_text_fr")
    stream = stream.filter_by_state_tag("en:complete")
    products = stream.iter()

    def iter_documents():
        # Yields (barcode, document) pairs, built lazily during the export.
        for product in products:
            quality_tags = product.get("quality_tags", [])
            if "ingredients-unknown-score-above-0" in quality_tags:
                continue

            normalized = normalize_ingredient_list(
                product["ingredients_text_fr"])
            yield product["code"], {"ingredients_text_fr": normalized}

    data = iter_documents()

    logger.info("Importing products")

    inserted = perform_export(
        get_es_client(), data, settings.ELASTICSEARCH_PRODUCT_INDEX)
    logger.info("{} rows inserted".format(inserted))
Exemplo n.º 19
0
def update_recycling(username: str, password: str) -> None:
    """
    Select a "Recycle" (packaging) image for products, based on trigger words.

    Only products whose ``states`` contain ``packaging-photo-to-be-selected``
    are processed. For each candidate image and each language, the image is
    selected for the ``packaging_<lang>`` field when ``check_image_in_pack``
    is falsy and ``check_trigger_in_text`` is truthy.

    Args:
        username: forwarded to ``select_image``.
        password: forwarded to ``select_image``.
    """
    # Trigger words per language code.
    triggers_by_lang = {
        "en": ["throw away", "recycle"],
        "fr": ["consignesdetri.fr", "recycler", "jeter", "bouteille"],
    }
    products = ProductDataset.load().stream().filter_nonempty_text_field("code")

    for product in products:
        if "packaging-photo-to-be-selected" not in product.get("states", ""):
            continue

        product_code = product.get("code")
        if not product_code:
            continue

        images = get_images(product_code)
        if not images:
            continue

        image_items = images.get("product", {}).get("images", {}).items()
        # Candidate images are the entries without an "imgid" value.
        candidate_ids = {
            key for key, image in image_items if not image.get("imgid")
        }
        pack_images = {
            key: image for key, image in image_items if "packaging" in key
        }

        for image_id in candidate_ids:
            for lang, triggers in triggers_by_lang.items():
                field = "packaging_{}".format(lang)

                if check_image_in_pack(image_id, field, pack_images):
                    continue

                if check_trigger_in_text(product_code, image_id, triggers):
                    select_image(product_code, image_id, field, pack_images,
                                 username, password)
Exemplo n.º 20
0
def generate_insights():
    """Generate and import category insights from the latest dataset dump.

    Only products modified since the start of the previous UTC day are
    considered. Insights are imported for ``settings.OFF_SERVER_DOMAIN``
    with ``automatic=False``.
    """
    logger.info("Generating new category insights")

    # Naive UTC "now", truncated to midnight, then moved back by one day.
    utc_midnight = datetime.datetime.utcnow().replace(
        hour=0, minute=0, second=0, microsecond=0)
    one_day = datetime.timedelta(days=1)
    datetime_threshold = utc_midnight - one_day

    predictions = predict_from_dataset(
        ProductDataset(settings.JSONL_DATASET_PATH), datetime_threshold)

    imported = import_insights(
        predictions,
        server_domain=settings.OFF_SERVER_DOMAIN,
        automatic=False,
    )
    logger.info("{} category insights imported".format(imported))
Exemplo n.º 21
0
def run(lang: Optional[str] = None):
    """Dump the category dataset for `lang` to a JSONL file and print its size.

    The output file is ``datasets/category/category_<lang>.jsonl`` under
    ``settings.PROJECT_DIR``; ``xx`` replaces the language when `lang` is
    ``None`` or empty.

    Args:
        lang: when given, products are filtered on the ``lang`` field and on
            a non-empty ``product_name_<lang>``; otherwise only a non-empty
            ``product_name`` is required.
    """
    stream = ProductDataset.load().stream()
    stream = stream.filter_nonempty_tag_field('categories_tags')

    if lang is None:
        stream = stream.filter_nonempty_text_field('product_name')
    else:
        stream = stream.filter_text_field('lang', lang)
        stream = stream.filter_nonempty_text_field(
            'product_name_{}'.format(lang))

    dataset_iter = generate_dataset(stream, lang)
    file_name = 'category_{}.jsonl'.format(lang or 'xx')
    output_path = settings.PROJECT_DIR / 'datasets' / 'category' / file_name
    count = dump_jsonl(output_path, dataset_iter)
    print(count)
Exemplo n.º 22
0
def run(lang: Optional[str] = None):
    """Dump the category dataset for `lang` to a JSONL file and log its size.

    The output file is ``datasets/category/category_<lang>.jsonl`` under
    ``settings.PROJECT_DIR``; ``xx`` replaces the language when `lang` is
    ``None`` or empty.

    Args:
        lang: when given, products are filtered on the ``lang`` field and on
            a non-empty ``product_name_<lang>``; otherwise only a non-empty
            ``product_name`` is required.
    """
    lang_label = lang or "xx"
    logger.info("Generating category dataset for lang {}".format(lang_label))

    stream = ProductDataset.load().stream()
    stream = stream.filter_nonempty_tag_field("categories_tags")

    if lang is None:
        stream = stream.filter_nonempty_text_field("product_name")
    else:
        stream = stream.filter_text_field("lang", lang)
        stream = stream.filter_nonempty_text_field(
            "product_name_{}".format(lang))

    dataset_iter = generate_dataset(stream, lang)
    category_dir = settings.PROJECT_DIR / "datasets" / "category"
    count = dump_jsonl(
        category_dir / "category_{}.jsonl".format(lang_label),
        dataset_iter,
    )
    logger.info("{} items for lang {}".format(count, lang_label))
Exemplo n.º 23
0
    def generate_training_df(self, dataset: ProductDataset) -> pd.DataFrame:
        """Build the training DataFrame from `dataset`.

        Products are filtered on the ``en:france`` country tag, a non-empty
        ``product_name`` and a non-empty ``categories_tags`` field. Each one
        is transformed with ``self.transform_product(..., add_category=True)``
        and kept only when the result has a ``deepest_category`` key.
        """
        stream = dataset.stream()
        stream = stream.filter_by_country_tag('en:france')
        stream = stream.filter_nonempty_text_field('product_name')
        stream = stream.filter_nonempty_tag_field('categories_tags')

        kept = []
        discarded = 0

        for product in stream:
            sample = self.transform_product(product, add_category=True)

            if 'deepest_category' in sample:
                kept.append(sample)
            else:
                discarded += 1

        message = ("{} training samples discarded (category not in "
                   "taxonomy), {} remaining")
        logger.info(message.format(discarded, len(kept)))
        return pd.DataFrame(kept)
Exemplo n.º 24
0
import pathlib
from typing import Set

import requests

from robotoff.off import generate_image_url
from robotoff.products import ProductDataset
from robotoff import settings
from robotoff.utils import get_logger
from robotoff.utils.types import JSONType

# Module-level logger obtained from the project's get_logger() helper.
logger = get_logger()

# Gzipped JSONL product file inside settings.DATASET_DIR (presumably a
# shuffled copy of the product dump, judging by the name -- TODO confirm).
JSONL_SHUF_DATASET_PATH = settings.DATASET_DIR / 'products-shuf.jsonl.gz'
# Dataset object created at import time on top of that file.
ds = ProductDataset(JSONL_SHUF_DATASET_PATH)
# Directory under the project root; 'dataset.txt' is read from it below.
IMAGE_DATASET_DIR = settings.PROJECT_DIR / 'image_dataset'
# Sub-directory of IMAGE_DATASET_DIR (presumably for nutrition table images;
# its usage is not visible in this part of the file).
NUTRITION_TABLE_IMAGE_DIR = IMAGE_DATASET_DIR / 'nutrition-table-2'


def load_seen_set() -> Set[str]:
    """Return the barcodes listed in ``IMAGE_DATASET_DIR / 'dataset.txt'``.

    Each non-blank line must have the form ``<barcode>_<suffix>``; only the
    barcode part is kept. Blank (or whitespace-only) lines are ignored.

    Raises:
        ValueError: if a non-blank line does not contain exactly one ``_``.
    """
    seen_set: Set[str] = set()

    with open(IMAGE_DATASET_DIR / 'dataset.txt') as f:
        for line in f:
            # Lines read from a file keep their trailing newline, so a blank
            # line is "\n" (truthy): strip first, then test for emptiness.
            line = line.strip()
            if not line:
                continue

            barcode, _ = line.split('_')
            seen_set.add(barcode)

    return seen_set
Exemplo n.º 25
0
import csv

from robotoff import settings
from robotoff.insights.ocr.core import get_source
from robotoff.products import ProductDataset

# Dataset loaded at import time through ProductDataset.load().
ds = ProductDataset.load()

# Iterator over the products that pass three chained filters: the
# 'en:france' country tag, a non-empty 'ingredients_text_fr' field and a
# numeric filter on 'unknown_ingredients_n'.
# NOTE(review): the numeric filter presumably keeps values >= 2, using 0 when
# the field is missing -- confirm against filter_number_field's signature.
product_iter = (
    ds.stream().filter_by_country_tag('en:france').filter_nonempty_text_field(
        'ingredients_text_fr').filter_number_field('unknown_ingredients_n', 2,
                                                   0, 'geq').iter())

with open('spellcheck_test_fr.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=',', dialect='unix')

    for product in product_iter:
        if 'images' not in product:
            continue

        images = product['images']

        if 'ingredients_fr' not in images:
            continue

        print(product['unknown_ingredients_n'])
        barcode = product['code']
        url = 'https://world.openfoodfacts.org/product/{}'.format(barcode)

        rev_id = nutrition_fr_image_url = images['ingredients_fr']['rev']
        image_name = "ingredients_fr.{}.400".format(rev_id)