Пример #1
0
def train(model_output_dir: pathlib.Path, comment: Optional[str] = None):
    """Train the category classifier, save it and write a ``meta.json`` file.

    The classifier is built from the category taxonomy, trained on the
    loaded product dataset, saved into ``model_output_dir`` and evaluated
    on the test split returned by ``CategoryClassifier.train``.

    Args:
        model_output_dir: Directory passed to ``CategoryClassifier.save``;
            ``meta.json`` is written inside it.
        comment: Optional note; stored under the ``"comment"`` key of the
            metadata only when it is truthy.
    """
    taxonomy: Taxonomy = get_taxonomy(TaxonomyType.category.name)
    classifier = CategoryClassifier(taxonomy)
    products: ProductDataset = ProductDataset.load()
    train_split, test_split = classifier.train(products)

    # Save first, then evaluate on the test split (same order as before).
    classifier.save(str(model_output_dir))
    evaluation = classifier.evaluate(test_split)

    # The dataset id is the modification date of the JSONL dataset file.
    dataset_mtime = os.path.getmtime(settings.JSONL_DATASET_PATH)
    dataset_day = datetime.datetime.fromtimestamp(dataset_mtime).date()

    meta = {
        "metrics": {"test": evaluation},
        "dataset_id": dataset_day.isoformat(),
        "training_set_count": len(train_split),
        "test_set_count": len(test_split),
    }
    if comment:
        meta["comment"] = comment

    meta_path = model_output_dir / "meta.json"
    with open(str(meta_path), "w") as meta_file:
        json.dump(meta, meta_file)
Пример #2
0
def train(model_output_dir: pathlib.Path, comment: Optional[str] = None):
    """Train the category classifier, save it and write a ``meta.json`` file.

    The category taxonomy is fetched from ``TAXONOMY_STORES``; the classifier
    is trained on the loaded product dataset, saved into
    ``model_output_dir`` and evaluated on the test split returned by
    ``CategoryClassifier.train``.

    Args:
        model_output_dir: Directory passed to ``CategoryClassifier.save``;
            ``meta.json`` is written inside it.
        comment: Optional note; stored under the ``'comment'`` key of the
            metadata only when it is truthy.
    """
    taxonomy_store = TAXONOMY_STORES[TaxonomyType.category.name]
    taxonomy: Taxonomy = taxonomy_store.get()
    classifier = CategoryClassifier(taxonomy)
    products: ProductDataset = ProductDataset.load()
    train_split, test_split = classifier.train(products)

    # Save first, then evaluate on the test split (same order as before).
    classifier.save(str(model_output_dir))
    evaluation = classifier.evaluate(test_split)

    # The dataset id is the modification date of the JSONL dataset file.
    dataset_mtime = os.path.getmtime(settings.JSONL_DATASET_PATH)
    dataset_day = datetime.datetime.fromtimestamp(dataset_mtime).date()

    meta = {
        'metrics': {'test': evaluation},
        'dataset_id': dataset_day.isoformat(),
        'training_set_count': len(train_split),
        'test_set_count': len(test_split),
    }
    if comment:
        meta['comment'] = comment

    meta_path = model_output_dir / 'meta.json'
    with open(str(meta_path), 'w') as meta_file:
        json.dump(meta, meta_file)
Пример #3
0
def main():
    """Pass a language-filtered product stream to ``generate_category_insights``.

    ``lang`` is not a parameter of this function: it is read from the
    enclosing scope.
    """
    products = ProductDataset.load()

    # Filter on the "lang" text field, then require a non-empty
    # product_name_<lang> field.
    by_lang = products.stream().filter_text_field("lang", lang)
    training_stream = by_lang.filter_nonempty_text_field(
        "product_name_{}".format(lang))

    generate_category_insights(training_stream.iter(), batch_size=1024)
Пример #4
0
def main():
    """Pass a language-filtered product stream to
    ``updated_product_add_category_insight``.

    ``lang`` is not a parameter of this function: it is read from the
    enclosing scope.
    """
    products = ProductDataset.load()

    # Filter on the 'lang' text field, then require a non-empty
    # product_name_<lang> field.
    by_lang = products.stream().filter_text_field('lang', lang)
    training_stream = by_lang.filter_nonempty_text_field(
        'product_name_{}'.format(lang))

    updated_product_add_category_insight(training_stream.iter(),
                                         batch_size=1024)
Пример #5
0
def images_dimension_iter():
    """Yield ``[width, height, code, image_id]`` for digit-named images.

    Iterates over the products of the loaded dataset that pass the
    non-empty ``"code"`` filter. Image entries whose id is not made of
    digits, or whose ``"sizes"`` mapping has no ``"full"`` entry, are
    skipped.

    Yields:
        list: ``[int(width), int(height), product["code"], str(image_id)]``,
        with width and height taken from the ``"full"`` size entry.
    """
    products = ProductDataset.load().stream().filter_nonempty_text_field(
        "code")

    for product in products:
        for image_id, image_data in product.get("images", {}).items():
            # Same short-circuit order as two separate guards: the id test
            # runs first, "sizes" is only looked up for digit ids.
            if not image_id.isdigit() or "full" not in image_data["sizes"]:
                continue

            full_size = image_data["sizes"]["full"]
            width, height = full_size["w"], full_size["h"]
            yield [int(width), int(height), product["code"], str(image_id)]
Пример #6
0
def update_recycling(username: str, password: str) -> None:
    """Select "Recycle" packaging images for products based on triggers.

    For every product whose ``states`` contain
    ``packaging-photo-to-be-selected``, each image entry without a truthy
    ``imgid`` is examined once per trigger language. The image is passed to
    ``select_image`` unless ``check_image_in_pack`` returns a truthy value
    or ``check_trigger_in_text`` returns a falsy one.

    Args:
        username: Forwarded unchanged to ``select_image``.
        password: Forwarded unchanged to ``select_image``.
    """
    # Trigger words, keyed by language code.
    recycling_triggers = {
        "en": ["throw away", "recycle"],
        "fr": ["consignesdetri.fr", "recycler", "jeter", "bouteille"],
    }

    products = ProductDataset.load().stream().filter_nonempty_text_field(
        "code")

    for product in products:
        if "packaging-photo-to-be-selected" not in product.get("states", ""):
            continue

        product_code = product.get("code")
        if not product_code:
            continue

        images = get_images(product_code)
        if not images:
            continue

        image_entries = images.get("product", {}).get("images", {}).items()
        # Ids of the entries that carry no truthy "imgid".
        candidate_ids = {
            image_id
            for image_id, image_data in image_entries
            if not image_data.get("imgid")
        }
        # Entries whose key contains "packaging".
        pack_images = {
            key: image_data
            for key, image_data in image_entries
            if "packaging" in key
        }

        for image_id in candidate_ids:
            for lang, triggers in recycling_triggers.items():
                field = "packaging_{}".format(lang)

                if check_image_in_pack(image_id, field, pack_images):
                    continue

                if not check_trigger_in_text(product_code, image_id,
                                             triggers):
                    continue

                select_image(product_code, image_id, field, pack_images,
                             username, password)
Пример #7
0
def run(lang: Optional[str] = None):
    """Dump the category dataset to a JSONL file and print the result.

    The output file is ``datasets/category/category_<lang>.jsonl`` under
    ``settings.PROJECT_DIR``, with ``xx`` used in place of a falsy ``lang``.
    The value returned by ``dump_jsonl`` is printed.

    Args:
        lang: Optional language code. When not ``None``, the stream is
            filtered on the ``'lang'`` field and on a non-empty
            ``product_name_<lang>`` field; otherwise only a non-empty
            ``product_name`` field is required.
    """
    tagged_stream = ProductDataset.load().stream().filter_nonempty_tag_field(
        'categories_tags')

    if lang is None:
        training_stream = tagged_stream.filter_nonempty_text_field(
            'product_name')
    else:
        by_lang = tagged_stream.filter_text_field('lang', lang)
        training_stream = by_lang.filter_nonempty_text_field(
            'product_name_{}'.format(lang))

    dataset_iter = generate_dataset(training_stream, lang)
    output_path = (settings.PROJECT_DIR / 'datasets' / 'category' /
                   'category_{}.jsonl'.format(lang or 'xx'))
    count = dump_jsonl(output_path, dataset_iter)
    print(count)
Пример #8
0
def run(lang: Optional[str] = None):
    """Dump the category dataset to a JSONL file, logging start and result.

    The output file is ``datasets/category/category_<lang>.jsonl`` under
    ``settings.PROJECT_DIR``, with ``xx`` used in place of a falsy ``lang``.
    The value returned by ``dump_jsonl`` is logged at info level.

    Args:
        lang: Optional language code. When not ``None``, the stream is
            filtered on the ``"lang"`` field and on a non-empty
            ``product_name_<lang>`` field; otherwise only a non-empty
            ``product_name`` field is required.
    """
    # Label used in both log messages and in the output file name.
    lang_label = lang or "xx"
    logger.info("Generating category dataset for lang {}".format(lang_label))

    tagged_stream = ProductDataset.load().stream().filter_nonempty_tag_field(
        "categories_tags")

    if lang is None:
        training_stream = tagged_stream.filter_nonempty_text_field(
            "product_name")
    else:
        by_lang = tagged_stream.filter_text_field("lang", lang)
        training_stream = by_lang.filter_nonempty_text_field(
            "product_name_{}".format(lang))

    dataset_iter = generate_dataset(training_stream, lang)
    output_path = (settings.PROJECT_DIR / "datasets" / "category" /
                   "category_{}.jsonl".format(lang_label))
    count = dump_jsonl(output_path, dataset_iter)
    logger.info("{} items for lang {}".format(count, lang_label))
Пример #9
0
import csv

from robotoff import settings
from robotoff.insights.ocr.core import get_source
from robotoff.products import ProductDataset

# Load the product dataset.
ds = ProductDataset.load()

# Build the product iterator one filter at a time: country tag 'en:france',
# then a non-empty 'ingredients_text_fr' field, then the numeric filter.
_france_stream = ds.stream().filter_by_country_tag('en:france')
_with_ingredients = _france_stream.filter_nonempty_text_field(
    'ingredients_text_fr')
# NOTE(review): presumably keeps products with unknown_ingredients_n >= 2,
# with 0 used as the default for a missing value — confirm against the
# signature of filter_number_field.
product_iter = _with_ingredients.filter_number_field(
    'unknown_ingredients_n', 2, 0, 'geq').iter()

with open('spellcheck_test_fr.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=',', dialect='unix')

    for product in product_iter:
        if 'images' not in product:
            continue

        images = product['images']

        if 'ingredients_fr' not in images:
            continue

        print(product['unknown_ingredients_n'])
        barcode = product['code']
        url = 'https://world.openfoodfacts.org/product/{}'.format(barcode)

        rev_id = nutrition_fr_image_url = images['ingredients_fr']['rev']
        image_name = "ingredients_fr.{}.400".format(rev_id)