def predict_from_dataset(
    dataset: ProductDataset, from_datetime: Optional[datetime.datetime] = None
) -> Iterable[JSONType]:
    """Yield category insights for products of the provided dataset.

    Only products that have a barcode, a product name, country tags and
    language codes but *no* category tags are considered.

    Args:
        dataset: a ProductDataset to scan
        from_datetime: if provided, only keep products modified after this
            datetime
    """
    # Build the candidate stream one filter at a time.
    stream = dataset.stream()
    stream = stream.filter_nonempty_text_field("code")
    stream = stream.filter_nonempty_text_field("product_name")
    stream = stream.filter_empty_tag_field("categories_tags")
    stream = stream.filter_nonempty_tag_field("countries_tags")
    stream = stream.filter_nonempty_tag_field("languages_codes")

    if from_datetime:
        stream = stream.filter_by_modified_datetime(from_t=from_datetime)

    candidates = stream.iter()
    logger.info("Performing prediction on products without categories")
    es_client = get_es_client()
    yield from predict_from_iterable(es_client, candidates)
def generate_insights(
    self,
    max_errors: Optional[int] = None,
    lang: str = "fr",
    limit: Optional[int] = None,
) -> Iterable[Prediction]:
    """Yield ingredient-spellcheck predictions for French products.

    Args:
        max_errors: maximum error count for a product to be considered
            valid (None means no maximum)
        lang: language code stored on each generated insight
        limit: stop after this many predictions (None means unbounded)
    """
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    products = (
        dataset.stream()
        .filter_by_country_tag("en:france")
        .filter_text_field("lang", lang)
        .filter_nonempty_text_field("ingredients_text_fr")
        .iter()
    )
    yielded = 0
    for product in products:
        # Skip products that fail validation or produce no insight.
        if not self.is_product_valid(product, max_errors=max_errors):
            continue
        insight = self.predict_insight(product["ingredients_text_fr"])
        if insight is None:
            continue

        insight["lang"] = lang
        yield Prediction(
            type=PredictionType.ingredient_spellcheck,
            data=insight,
            barcode=product["code"],
        )
        yielded += 1
        if limit is not None and yielded >= limit:
            break
def compute_brand_prefix(
    product_dataset: ProductDataset, threshold: Optional[int] = None
) -> Dict[Tuple[str, str], int]:
    """Count occurrences of (brand_tag, barcode_prefix) pairs in the dataset.

    Only products with a 13-digit barcode contribute. If `threshold` is
    provided, pairs seen fewer than `threshold` times are dropped.

    Args:
        product_dataset: dataset of products to scan
        threshold: minimum count for a pair to be kept (None keeps all)

    Returns:
        Mapping of (brand_tag, barcode_prefix) to occurrence count.
    """
    counts: Dict[Tuple[str, str], int] = {}
    stream = (
        product_dataset.stream()
        .filter_nonempty_tag_field("brands_tags")
        .filter_nonempty_text_field("code")
    )
    for product in stream:
        barcode = product["code"]
        # Only EAN-13 barcodes have a meaningful prefix here.
        if len(barcode) != 13:
            continue
        prefix = generate_barcode_prefix(barcode)
        for brand_tag in {tag for tag in product["brands_tags"] if tag}:
            pair = (brand_tag, prefix)
            counts[pair] = counts.get(pair, 0) + 1

    if threshold:
        counts = {pair: n for pair, n in counts.items() if n >= threshold}

    return counts
def generate_prediction_df(self, dataset: ProductDataset) -> pd.DataFrame:
    """Build a prediction DataFrame from French products with a product name.

    Each row is the result of `transform_product` applied to one product.
    """
    products = (
        dataset.stream()
        .filter_by_country_tag("en:france")
        .filter_nonempty_text_field("product_name")
    )
    rows = (self.transform_product(product) for product in products)
    return pd.DataFrame(rows)
def generate_product_data() -> Iterable[Tuple[str, Dict]]:
    """Return (barcode, document) pairs for complete French products whose
    ingredients are all recognized.

    The document maps "ingredients_text_fr" to the normalized ingredient
    list of the product.
    """
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    stream = (
        dataset.stream()
        .filter_text_field("lang", "fr")
        .filter_by_country_tag("en:france")
        .filter_nonempty_text_field("ingredients_text_fr")
        .filter_by_state_tag("en:complete")
    )
    # Keep only products with zero unknown ingredients.
    known_only = (
        product
        for product in stream.iter()
        if int(product.get("unknown_ingredients_n", 0)) == 0
    )
    return (
        (
            product["code"],
            {
                "ingredients_text_fr": normalize_ingredient_list(
                    product["ingredients_text_fr"]
                )
            },
        )
        for product in known_only
    )
def product_export():
    """Export complete French products with fully-known ingredients to
    Elasticsearch.

    Products carrying the 'ingredients-unknown-score-above-0' quality tag
    are skipped. Logs the number of rows inserted.
    """
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    product_iter = (
        dataset.stream()
        .filter_by_country_tag('en:france')
        .filter_nonempty_text_field('ingredients_text_fr')
        .filter_by_state_tag('en:complete')
        .iter()
    )
    # Keep only products whose ingredients are all recognized.
    product_iter = (
        p for p in product_iter
        if 'ingredients-unknown-score-above-0' not in p.get('quality_tags', [])
    )
    data = (
        (
            product['code'],
            {
                'ingredients_text_fr':
                    normalize_ingredient_list(product['ingredients_text_fr'])
            },
        )
        for product in product_iter
    )
    logger.info("Importing products")
    es_client = get_es_client()
    # Fix: the insertion count returned by perform_export was previously
    # discarded; log it, consistent with the other product_export variant.
    inserted = perform_export(es_client, data,
                              settings.ELASTICSEARCH_PRODUCT_INDEX)
    logger.info("{} rows inserted".format(inserted))
def generate_training_df(self, dataset: ProductDataset) -> pd.DataFrame:
    """Build the training DataFrame from French products with categories.

    Products whose transformation yields no 'deepest_category' (category
    not found in the taxonomy) are discarded; the discard count is logged.
    """
    stream = (
        dataset.stream()
        .filter_by_country_tag('en:france')
        .filter_nonempty_text_field('product_name')
        .filter_nonempty_tag_field('categories_tags')
    )
    samples = []
    total = 0
    for product in stream:
        total += 1
        row = self.transform_product(product, add_category=True)
        if 'deepest_category' in row:
            samples.append(row)

    logger.info("{} training samples discarded (category not in "
                "taxonomy), {} remaining"
                "".format(total - len(samples), len(samples)))
    return pd.DataFrame(samples)
def generate_insights(client, confidence=1):
    """Yield spellcheck insight dicts for French product ingredient texts.

    Args:
        client: client used by `generate_corrections`
        confidence: minimum confidence threshold for corrections

    Yields:
        Dicts with the raw corrections, the original and corrected text,
        and the product barcode.
    """
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    products = (
        dataset.stream()
        .filter_by_country_tag("en:france")
        .filter_nonempty_text_field("ingredients_text_fr")
        .iter()
    )
    for product in products:
        text = product["ingredients_text_fr"]
        corrections = generate_corrections(client, text, confidence=confidence)
        if not corrections:
            continue

        # Flatten per-correction term lists into a single list.
        term_corrections = [
            term
            for correction in corrections
            for term in correction.term_corrections
        ]
        yield {
            "corrections": [dataclasses.asdict(c) for c in term_corrections],
            "text": text,
            "corrected": generate_corrected_text(term_corrections, text),
            "barcode": product["code"],
        }
def generate_insights(client, confidence=1):
    """Yield spellcheck insight dicts for French product ingredient texts.

    Args:
        client: client used by `generate_corrections`
        confidence: minimum confidence threshold for corrections

    Yields:
        Dicts with the raw corrections, the original and corrected text,
        and the product barcode.
    """
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    stream = dataset.stream()
    stream = stream.filter_by_country_tag('en:france')
    stream = stream.filter_nonempty_text_field('ingredients_text_fr')

    for product in stream.iter():
        text = product['ingredients_text_fr']
        corrections = generate_corrections(client, text, confidence=confidence)
        if not corrections:
            continue

        # Flatten the per-correction term lists into one list.
        term_corrections = list(
            itertools.chain.from_iterable(
                correction.term_corrections for correction in corrections
            )
        )
        yield {
            'corrections': [dataclasses.asdict(c) for c in term_corrections],
            'text': text,
            'corrected': generate_corrected_text(term_corrections, text),
            'barcode': product['code'],
        }
def product_export():
    """Export complete French products with fully-known ingredients to
    Elasticsearch, logging the number of rows inserted.

    Products carrying the "ingredients-unknown-score-above-0" quality tag
    are skipped.
    """
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)
    stream = (
        dataset.stream()
        .filter_by_country_tag("en:france")
        .filter_nonempty_text_field("ingredients_text_fr")
        .filter_by_state_tag("en:complete")
    )
    # Keep only products whose ingredients are all recognized.
    kept = (
        product
        for product in stream.iter()
        if "ingredients-unknown-score-above-0"
        not in product.get("quality_tags", [])
    )
    data = (
        (
            product["code"],
            {
                "ingredients_text_fr": normalize_ingredient_list(
                    product["ingredients_text_fr"]
                )
            },
        )
        for product in kept
    )
    logger.info("Importing products")
    es_client = get_es_client()
    inserted = perform_export(es_client, data,
                              settings.ELASTICSEARCH_PRODUCT_INDEX)
    logger.info("{} rows inserted".format(inserted))
image_url = generate_image_url(barcode, image_name) logger.info("Downloading image {}".format(image_url)) r = requests.get(image_url) with open(str(image_path), 'wb') as fd: logger.info("Saving image in {}".format(image_path)) for chunk in r.iter_content(chunk_size=128): fd.write(chunk) seen_set = load_seen_set() count = 0 for product in (ds.stream().filter_by_state_tag('en:complete'). filter_by_country_tag('en:france').filter_nonempty_text_field( 'code').filter_nonempty_tag_field('images')): barcode = product['code'] if barcode in seen_set: print("Product already seen: {}".format(barcode)) continue has_nutrition = False has_front = False for image_key, image_meta in product.get('images', {}).items(): if not has_nutrition and image_key.startswith('nutrition'): has_nutrition = True save_image(NUTRITION_TABLE_IMAGE_DIR, image_meta, barcode) count += 1
image_url = generate_image_url(barcode, image_name) logger.info("Downloading image {}".format(image_url)) r = http_session.get(image_url) with open(str(image_path), "wb") as fd: logger.info("Saving image in {}".format(image_path)) for chunk in r.iter_content(chunk_size=128): fd.write(chunk) seen_set = load_seen_set() count = 0 for product in (ds.stream().filter_by_state_tag("en:complete"). filter_by_country_tag("en:france").filter_nonempty_text_field( "code").filter_nonempty_tag_field("images")): barcode = product["code"] if barcode in seen_set: print("Product already seen: {}".format(barcode)) continue has_nutrition = False has_front = False for image_key, image_meta in product.get("images", {}).items(): if not has_nutrition and image_key.startswith("nutrition"): has_nutrition = True save_image(NUTRITION_TABLE_IMAGE_DIR, image_meta, barcode) count += 1