import datetime
import json
import os
import pathlib
from typing import Optional

from robotoff import settings
from robotoff.products import ProductDataset
from robotoff.taxonomy import Taxonomy, TaxonomyType, get_taxonomy

# CategoryClassifier is assumed to be importable from Robotoff's category ML
# module; the exact import path differs across Robotoff versions.


def train(model_output_dir: pathlib.Path, comment: Optional[str] = None):
    category_taxonomy: Taxonomy = get_taxonomy(TaxonomyType.category.name)
    category_classifier = CategoryClassifier(category_taxonomy)
    dataset: ProductDataset = ProductDataset.load()
    train_df, test_df = category_classifier.train(dataset)
    category_classifier.save(str(model_output_dir))
    test_metrics = category_classifier.evaluate(test_df)

    dataset_timestamp = datetime.datetime.fromtimestamp(
        os.path.getmtime(settings.JSONL_DATASET_PATH)
    )
    meta = {
        "metrics": {"test": test_metrics},
        "dataset_id": dataset_timestamp.date().isoformat(),
        "training_set_count": len(train_df),
        "test_set_count": len(test_df),
    }

    if comment:
        meta["comment"] = comment

    with open(str(model_output_dir / "meta.json"), "w") as f:
        json.dump(meta, f)
# Variant of train() above that resolves the category taxonomy through the
# TAXONOMY_STORES cached store instead of get_taxonomy().
def train(model_output_dir: pathlib.Path, comment: Optional[str] = None):
    category_taxonomy: Taxonomy = TAXONOMY_STORES[
        TaxonomyType.category.name].get()
    category_classifier = CategoryClassifier(category_taxonomy)
    dataset: ProductDataset = ProductDataset.load()
    train_df, test_df = category_classifier.train(dataset)
    category_classifier.save(str(model_output_dir))
    test_metrics = category_classifier.evaluate(test_df)

    dataset_timestamp = datetime.datetime.fromtimestamp(
        os.path.getmtime(settings.JSONL_DATASET_PATH))

    meta = {
        'metrics': {
            'test': test_metrics,
        },
        'dataset_id': dataset_timestamp.date().isoformat(),
        'training_set_count': len(train_df),
        'test_set_count': len(test_df),
    }

    if comment:
        meta['comment'] = comment

    with open(str(model_output_dir / 'meta.json'), 'w') as f:
        json.dump(meta, f)
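# Minimal usage sketch for the train() entry point above. The output
# directory and comment are hypothetical placeholders, not values from the
# original code.
if __name__ == "__main__":
    output_dir = pathlib.Path("models/category")  # hypothetical path
    output_dir.mkdir(parents=True, exist_ok=True)
    train(output_dir, comment="baseline category classifier")
    # After the run, output_dir contains the saved model plus meta.json with
    # the test metrics, dataset id, and train/test set sizes.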
def main(lang: str):
    # lang: language code to filter on (e.g. "fr"); assumed to be supplied by
    # the caller.
    dataset = ProductDataset.load()
    training_stream = (
        dataset.stream()
        .filter_text_field("lang", lang)
        .filter_nonempty_text_field("product_name_{}".format(lang))
    )
    generate_category_insights(training_stream.iter(), batch_size=1024)
# Variant of main() above, calling updated_product_add_category_insight
# instead of generate_category_insights.
def main(lang: str):
    # lang: language code to filter on (e.g. 'fr'); assumed to be supplied by
    # the caller.
    dataset = ProductDataset.load()
    training_stream = (
        dataset.stream()
        .filter_text_field('lang', lang)
        .filter_nonempty_text_field('product_name_{}'.format(lang))
    )
    updated_product_add_category_insight(training_stream.iter(),
                                         batch_size=1024)
def images_dimension_iter():
    dataset = ProductDataset.load()

    for product in dataset.stream().filter_nonempty_text_field("code"):
        images = product.get("images", {})

        for image_id, image_data in images.items():
            if not image_id.isdigit():
                continue

            if "full" not in image_data["sizes"]:
                continue

            width = image_data["sizes"]["full"]["w"]
            height = image_data["sizes"]["full"]["h"]
            yield [int(width), int(height), product["code"], str(image_id)]
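# Sketch of one way to consume images_dimension_iter(), e.g. dumping the
# dimensions to a CSV file; the output path is a hypothetical placeholder.
import csv

def dump_image_dimensions(output_path: str = "image_dimensions.csv") -> None:
    with open(output_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["width", "height", "barcode", "image_id"])
        for row in images_dimension_iter():
            writer.writerow(row)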
def update_recycling(username: str, password: str) -> None:
    """Select a packaging ("recycling") image for products whose packaging
    photo is still to be selected, based on trigger words found in the image
    text."""
    recycling_triggers = {
        "en": ["throw away", "recycle"],
        "fr": ["consignesdetri.fr", "recycler", "jeter", "bouteille"],
    }
    # get products dataset
    dataset = ProductDataset.load()

    # iterate products
    for product in dataset.stream().filter_nonempty_text_field("code"):
        if "packaging-photo-to-be-selected" not in product.get("states", ""):
            continue

        product_code = product.get("code")
        if not product_code:
            continue

        images = get_images(product_code)
        if not images:
            continue

        product_images_items = images.get("product", {}).get("images", {}).items()
        # ids of images without an "imgid" field
        images_ids = {i for i, j in product_images_items if not j.get("imgid")}
        pack_images = {i: j for i, j in product_images_items if "packaging" in i}

        for i in images_ids:  # i: image id
            for lang in recycling_triggers.keys():
                field = "packaging_{}".format(lang)

                if check_image_in_pack(i, field, pack_images):
                    continue

                if not check_trigger_in_text(product_code, i,
                                             recycling_triggers[lang]):
                    continue

                select_image(product_code, i, field, pack_images,
                             username, password)
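# Invocation sketch: update_recycling() expects the credentials of the
# Open Food Facts account used to select images. The helpers it calls
# (get_images, check_image_in_pack, check_trigger_in_text, select_image)
# are assumed to be defined in the same module.
if __name__ == "__main__":
    update_recycling("my_username", "my_password")  # placeholder credentials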
def run(lang: Optional[str] = None):
    dataset = ProductDataset.load()
    training_stream = dataset.stream().filter_nonempty_tag_field(
        'categories_tags')

    if lang is not None:
        training_stream = (training_stream
                           .filter_text_field('lang', lang)
                           .filter_nonempty_text_field(
                               'product_name_{}'.format(lang)))
    else:
        training_stream = training_stream.filter_nonempty_text_field(
            'product_name')

    dataset_iter = generate_dataset(training_stream, lang)
    count = dump_jsonl(
        settings.PROJECT_DIR / 'datasets' / 'category' /
        'category_{}.jsonl'.format(lang or 'xx'),
        dataset_iter)
    print(count)
# Variant of run() above that logs progress instead of printing the raw count.
def run(lang: Optional[str] = None):
    logger.info("Generating category dataset for lang {}".format(lang or "xx"))
    dataset = ProductDataset.load()
    training_stream = dataset.stream().filter_nonempty_tag_field(
        "categories_tags")

    if lang is not None:
        training_stream = training_stream.filter_text_field(
            "lang", lang
        ).filter_nonempty_text_field("product_name_{}".format(lang))
    else:
        training_stream = training_stream.filter_nonempty_text_field(
            "product_name")

    dataset_iter = generate_dataset(training_stream, lang)
    count = dump_jsonl(
        settings.PROJECT_DIR
        / "datasets"
        / "category"
        / "category_{}.jsonl".format(lang or "xx"),
        dataset_iter,
    )
    logger.info("{} items for lang {}".format(count, lang or "xx"))
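# Usage sketch for run(): one JSONL file per language, with an "xx" suffix as
# the fallback when no language filter is given.
run("fr")  # writes datasets/category/category_fr.jsonl
run()      # no language filter: writes datasets/category/category_xx.jsonl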
import csv

from robotoff import settings
from robotoff.insights.ocr.core import get_source
from robotoff.products import ProductDataset

ds = ProductDataset.load()
product_iter = (
    ds.stream()
    .filter_by_country_tag('en:france')
    .filter_nonempty_text_field('ingredients_text_fr')
    .filter_number_field('unknown_ingredients_n', 2, 0, 'geq')
    .iter())

with open('spellcheck_test_fr.csv', 'w', newline='') as f:
    writer = csv.writer(f, delimiter=',', dialect='unix')

    for product in product_iter:
        if 'images' not in product:
            continue

        images = product['images']

        if 'ingredients_fr' not in images:
            continue

        print(product['unknown_ingredients_n'])
        barcode = product['code']
        url = 'https://world.openfoodfacts.org/product/{}'.format(barcode)
        rev_id = images['ingredients_fr']['rev']
        image_name = "ingredients_fr.{}.400".format(rev_id)
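        # The excerpt ends here without using `writer`, `settings`, or
        # `get_source`. A plausible continuation (an assumption, not the
        # original script's verified behavior) would resolve the image path
        # via get_source(), prefix it with the static image base URL from
        # settings, and write one row per matching product:
        image_url = settings.OFF_IMAGE_BASE_URL + get_source(image_name,
                                                             barcode=barcode)
        writer.writerow([barcode, url, image_url,
                         product['ingredients_text_fr']])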