Example #1
def predict_from_dataset(
        dataset: ProductDataset,
        from_datetime: Optional[datetime.datetime] = None
) -> Iterable[JSONType]:
    """Return an iterable of category insights, using the provided dataset.

    Args:
        dataset: a ProductDataset
        from_datetime: datetime threshold: only keep products modified after
            `from_datetime`
    """
    product_stream = (
        dataset.stream()
        .filter_nonempty_text_field("code")
        .filter_nonempty_text_field("product_name")
        .filter_empty_tag_field("categories_tags")
        .filter_nonempty_tag_field("countries_tags")
        .filter_nonempty_tag_field("languages_codes")
    )

    if from_datetime:
        product_stream = product_stream.filter_by_modified_datetime(
            from_t=from_datetime)

    product_iter = product_stream.iter()
    logger.info("Performing prediction on products without categories")

    es_client = get_es_client()
    yield from predict_from_iterable(es_client, product_iter)
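A minimal usage sketch (not part of the original example), assuming as in the other snippets that `settings.JSONL_DATASET_PATH` points at the JSONL product dump; the 30-day cutoff is purely illustrative:

import datetime

dataset = ProductDataset(settings.JSONL_DATASET_PATH)
# Illustrative cutoff: only predict for products modified during the last 30 days.
cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=30)
for insight in predict_from_dataset(dataset, from_datetime=cutoff):
    print(insight)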
Example #2
    def generate_insights(
        self,
        max_errors: Optional[int] = None,
        lang: str = "fr",
        limit: Optional[int] = None,
    ) -> Iterable[Prediction]:
        dataset = ProductDataset(settings.JSONL_DATASET_PATH)
        product_iter = (
            dataset.stream()
            .filter_by_country_tag("en:france")
            .filter_text_field("lang", lang)
            .filter_nonempty_text_field("ingredients_text_fr")
            .iter()
        )

        insights_count = 0
        for product in product_iter:
            if self.is_product_valid(product, max_errors=max_errors):
                insight = self.predict_insight(product["ingredients_text_fr"])
                if insight is not None:
                    insight["lang"] = lang
                    yield Prediction(
                        type=PredictionType.ingredient_spellcheck,
                        data=insight,
                        barcode=product["code"],
                    )

                    insights_count += 1
                    if limit is not None and insights_count >= limit:
                        break
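This is an instance method, and the class that defines it is not part of this excerpt; a hedged driver sketch that takes such an instance as a parameter:

def collect_spellcheck_predictions(generator, limit=100):
    # `generator` is assumed to be an instance of the (unshown) class that
    # defines generate_insights above; max_errors=2 is an arbitrary example value.
    return list(generator.generate_insights(max_errors=2, lang="fr", limit=limit))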
Example #3
def compute_brand_prefix(
    product_dataset: ProductDataset, threshold: Optional[int] = None
) -> Dict[Tuple[str, str], int]:
    count: Dict[Tuple[str, str], int] = {}

    for product in (
        product_dataset.stream()
        .filter_nonempty_tag_field("brands_tags")
        .filter_nonempty_text_field("code")
    ):
        brand_tags = set(x for x in product["brands_tags"] if x)
        barcode = product["code"]

        if len(barcode) == 13:
            barcode_prefix = generate_barcode_prefix(barcode)

            for brand_tag in brand_tags:
                key = (brand_tag, barcode_prefix)
                count.setdefault(key, 0)
                count[key] += 1

    if threshold:
        for key in list(count.keys()):
            if count[key] < threshold:
                count.pop(key)

    return count
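A hedged sketch of calling `compute_brand_prefix` on the full dump; the threshold of 10 is an arbitrary example value:

dataset = ProductDataset(settings.JSONL_DATASET_PATH)
# Keep only (brand_tag, barcode_prefix) pairs seen at least 10 times.
brand_prefix_count = compute_brand_prefix(dataset, threshold=10)
for (brand_tag, barcode_prefix), occurrences in brand_prefix_count.items():
    print(brand_tag, barcode_prefix, occurrences)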
Example #4
    def generate_prediction_df(self, dataset: ProductDataset) -> pd.DataFrame:
        dataset_iter = (
            dataset.stream()
            .filter_by_country_tag("en:france")
            .filter_nonempty_text_field("product_name")
        )
        return pd.DataFrame((self.transform_product(p) for p in dataset_iter))
Example #5
def generate_product_data() -> Iterable[Tuple[str, Dict]]:
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)

    product_stream = (
        dataset.stream()
        .filter_text_field("lang", "fr")
        .filter_by_country_tag("en:france")
        .filter_nonempty_text_field("ingredients_text_fr")
        .filter_by_state_tag("en:complete")
    )

    product_iter = product_stream.iter()
    product_iter = (p for p in product_iter
                    if int(p.get("unknown_ingredients_n", 0)) == 0)

    return ((
        product["code"],
        {
            "ingredients_text_fr":
            normalize_ingredient_list(product["ingredients_text_fr"])
        },
    ) for product in product_iter)
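A short sketch of consuming the generator above, printing each barcode together with its normalized ingredient text:

for barcode, data in generate_product_data():
    print(barcode, data["ingredients_text_fr"])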
Example #6
File: dump.py  Project: hangy/robotoff
def product_export():
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)

    product_iter = (
        dataset.stream()
        .filter_by_country_tag('en:france')
        .filter_nonempty_text_field('ingredients_text_fr')
        .filter_by_state_tag('en:complete')
        .iter()
    )
    product_iter = (p for p in product_iter
                    if 'ingredients-unknown-score-above-0' not in p.get(
                        'quality_tags', []))

    data = ((product['code'], {
        'ingredients_text_fr':
        normalize_ingredient_list(product['ingredients_text_fr'])
    }) for product in product_iter)

    logger.info("Importing products")

    es_client = get_es_client()
    perform_export(es_client, data, settings.ELASTICSEARCH_PRODUCT_INDEX)
Example #7
    def generate_training_df(self, dataset: ProductDataset) -> pd.DataFrame:
        training_dataset_iter = (
            dataset.stream()
            .filter_by_country_tag('en:france')
            .filter_nonempty_text_field('product_name')
            .filter_nonempty_tag_field('categories_tags')
        )

        training_dataset = []

        processed = 0
        for product in training_dataset_iter:
            processed += 1
            transformed_product = self.transform_product(product,
                                                         add_category=True)

            if 'deepest_category' in transformed_product:
                training_dataset.append(transformed_product)

        logger.info("{} training samples discarded (category not in "
                    "taxonomy), {} remaining"
                    "".format(processed - len(training_dataset),
                              len(training_dataset)))
        return pd.DataFrame(training_dataset)
Example #8
def generate_insights(client, confidence=1):
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)

    product_iter = (
        dataset.stream()
        .filter_by_country_tag("en:france")
        .filter_nonempty_text_field("ingredients_text_fr")
        .iter()
    )

    for product in product_iter:
        text = product["ingredients_text_fr"]
        corrections = generate_corrections(client, text, confidence=confidence)

        if not corrections:
            continue

        term_corrections = list(
            itertools.chain.from_iterable(
                (c.term_corrections for c in corrections)))

        yield {
            "corrections": [dataclasses.asdict(c) for c in term_corrections],
            "text": text,
            "corrected": generate_corrected_text(term_corrections, text),
            "barcode": product["code"],
        }
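A hedged usage sketch; based on the other snippets in this listing, the `client` argument is assumed to be the Elasticsearch client returned by `get_es_client()`:

# Assumption: `client` is the Elasticsearch client used elsewhere in these examples.
es_client = get_es_client()
for insight in generate_insights(es_client, confidence=1):
    print(insight["barcode"], insight["corrected"])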
Example #9
def generate_insights(client, confidence=1):
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)

    product_iter = (
        dataset.stream()
        .filter_by_country_tag('en:france')
        .filter_nonempty_text_field('ingredients_text_fr')
        .iter()
    )

    for product in product_iter:
        text = product['ingredients_text_fr']
        corrections = generate_corrections(client, text, confidence=confidence)

        if not corrections:
            continue

        term_corrections = list(
            itertools.chain.from_iterable(
                (c.term_corrections for c in corrections)))

        yield {
            'corrections': [dataclasses.asdict(c) for c in term_corrections],
            'text': text,
            'corrected': generate_corrected_text(term_corrections, text),
            'barcode': product['code'],
        }
Example #10
File: dump.py  Project: Wauplin/robotoff
def product_export():
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)

    product_iter = (
        dataset.stream()
        .filter_by_country_tag("en:france")
        .filter_nonempty_text_field("ingredients_text_fr")
        .filter_by_state_tag("en:complete")
        .iter()
    )
    product_iter = (p for p in product_iter
                    if "ingredients-unknown-score-above-0" not in p.get(
                        "quality_tags", []))

    data = ((
        product["code"],
        {
            "ingredients_text_fr":
            normalize_ingredient_list(product["ingredients_text_fr"])
        },
    ) for product in product_iter)

    logger.info("Importing products")

    es_client = get_es_client()
    inserted = perform_export(es_client, data,
                              settings.ELASTICSEARCH_PRODUCT_INDEX)
    logger.info("{} rows inserted".format(inserted))
Example #11
    image_url = generate_image_url(barcode, image_name)
    logger.info("Downloading image {}".format(image_url))
    r = requests.get(image_url)

    with open(str(image_path), 'wb') as fd:
        logger.info("Saving image in {}".format(image_path))
        for chunk in r.iter_content(chunk_size=128):
            fd.write(chunk)


seen_set = load_seen_set()
count = 0

for product in (
    ds.stream()
    .filter_by_state_tag('en:complete')
    .filter_by_country_tag('en:france')
    .filter_nonempty_text_field('code')
    .filter_nonempty_tag_field('images')
):
    barcode = product['code']

    if barcode in seen_set:
        print("Product already seen: {}".format(barcode))
        continue

    has_nutrition = False
    has_front = False

    for image_key, image_meta in product.get('images', {}).items():
        if not has_nutrition and image_key.startswith('nutrition'):
            has_nutrition = True
            save_image(NUTRITION_TABLE_IMAGE_DIR, image_meta, barcode)
            count += 1
Example #12
    image_url = generate_image_url(barcode, image_name)
    logger.info("Downloading image {}".format(image_url))
    r = http_session.get(image_url)

    with open(str(image_path), "wb") as fd:
        logger.info("Saving image in {}".format(image_path))
        for chunk in r.iter_content(chunk_size=128):
            fd.write(chunk)


seen_set = load_seen_set()
count = 0

for product in (
    ds.stream()
    .filter_by_state_tag("en:complete")
    .filter_by_country_tag("en:france")
    .filter_nonempty_text_field("code")
    .filter_nonempty_tag_field("images")
):
    barcode = product["code"]

    if barcode in seen_set:
        print("Product already seen: {}".format(barcode))
        continue

    has_nutrition = False
    has_front = False

    for image_key, image_meta in product.get("images", {}).items():
        if not has_nutrition and image_key.startswith("nutrition"):
            has_nutrition = True
            save_image(NUTRITION_TABLE_IMAGE_DIR, image_meta, barcode)
            count += 1