Пример #1
0
def update_insight_attributes(product: Product,
                              insight: ProductInsight) -> bool:
    """Synchronise brand, country and scan-count data of an insight.

    Each of the three insight attributes is compared with its product
    counterpart. Attributes that differ are logged and overwritten with
    the product value. The insight is saved once, and only when at least
    one attribute changed.

    :param product: object read for ``brands_tags``, ``countries_tags``,
        ``unique_scans_n`` and ``barcode``.
    :param insight: insight whose ``brands``, ``countries`` and
        ``unique_scans_n`` attributes are brought in line with the product.
    :return: True if the insight was modified and saved, False otherwise.
    """
    # (insight attribute, product attribute, log message template)
    synced_fields = (
        ("brands", "brands_tags", "Updating brand {} -> {} ({})"),
        ("countries", "countries_tags", "Updating countries {} -> {} ({})"),
        ("unique_scans_n", "unique_scans_n",
         "Updating unique scan count {} -> {} ({})"),
    )
    changed = False

    for insight_attr, product_attr, template in synced_fields:
        current = getattr(insight, insight_attr)
        wanted = getattr(product, product_attr)
        if current != wanted:
            logger.info(template.format(current, wanted, product.barcode))
            changed = True
            setattr(insight, insight_attr, wanted)

    if changed:
        insight.save()

    return changed
Пример #2
0
    def annotate(
        self,
        insight: ProductInsight,
        annotation: int,
        update=True,
        auth: Optional[OFFAuthentication] = None,
    ) -> AnnotationResult:
        """Record an annotation on an insight and optionally update the product.

        The annotation value and the completion time are saved inside a
        single ``db.atomic()`` block, together with a ``UserAnnotation`` row
        when a username could be determined from ``auth``.

        :param insight: the insight being annotated.
        :param annotation: value to store; ``1`` triggers the product update
            when ``update`` is truthy.
        :param update: whether ``update_product`` may be called.
        :param auth: optional credentials; a session cookie, when present,
            takes precedence over ``auth.username``.
        :return: the result of ``update_product`` when it is called,
            ``SAVED_ANNOTATION_RESULT`` otherwise.
        """
        if auth is None:
            annotator_name: Optional[str] = None
        else:
            annotator_name = auth.username
            if auth.session_cookie:
                annotator_name = extract_username(auth.session_cookie)

        with db.atomic():
            insight.annotation = annotation
            insight.completed_at = datetime.datetime.utcnow()
            insight.save()

            if annotator_name:
                UserAnnotation.create(insight=insight,
                                      username=annotator_name)

        if annotation != 1 or not update:
            return SAVED_ANNOTATION_RESULT

        return self.update_product(insight, auth=auth)
Пример #3
0
def test_mark_insights():
    """mark_insights() marks only automatic, unmarked, unannotated insights."""
    created_at = datetime.utcnow()
    # automatic processing disabled: must be ignored
    manual = ProductInsightFactory(automatic_processing=False)
    # process_after already set: must be left alone
    scheduled = ProductInsightFactory(
        automatic_processing=True,
        annotation=None,
        process_after=created_at - timedelta(minutes=2),
    )
    # already annotated: must be ignored
    done = ProductInsightFactory(automatic_processing=True, annotation=1)
    # the two insights expected to be marked
    pending_a = ProductInsightFactory(automatic_processing=True)
    pending_b = ProductInsightFactory(automatic_processing=True)

    def process_after_of(ref):
        # fetch the insight again through ProductInsight.get
        return ProductInsight.get(id=ref).process_after

    delay = timedelta(minutes=10)
    before = datetime.utcnow()
    first_run = scheduler.mark_insights()
    after = datetime.utcnow()

    # exactly the two pending insights were marked, about ten minutes ahead
    assert first_run == 2
    for pending in (pending_a, pending_b):
        assert before + delay < process_after_of(pending.id) < after + delay

    # the other insights are unchanged
    assert process_after_of(manual) is None
    assert process_after_of(done) is None
    assert process_after_of(scheduled) < before

    # a second run has nothing left to mark
    assert scheduler.mark_insights() == 0
Пример #4
0
    def import_insights(
        cls,
        predictions: List[Prediction],
        server_domain: str,
        automatic: bool,
        product_store: DBProductStore,
    ) -> int:
        """Import insights built from predictions; main entry point.

        Every prediction must have a type listed by
        ``get_required_prediction_types``. For each (to_create, to_delete)
        pair produced by ``generate_insights``, the insights to delete are
        removed by ID and the insights to create are inserted in batches
        of 50.

        :raises ValueError: if a prediction has an unexpected type.
        :return: the number of insights that were imported.
        """
        allowed_types = cls.get_required_prediction_types()
        unexpected = next(
            (p for p in predictions if p.type not in allowed_types), None)
        if unexpected is not None:
            raise ValueError(
                f"unexpected prediction type: '{unexpected.type}'")

        imported = 0
        batches = cls.generate_insights(predictions, server_domain,
                                        automatic, product_store)
        for insights_to_create, insights_to_delete in batches:
            if insights_to_delete:
                stale_ids = [stale.id for stale in insights_to_delete]
                logger.info(
                    f"Deleting insight IDs: {[str(x) for x in stale_ids]}")
                delete_query = ProductInsight.delete().where(
                    ProductInsight.id.in_(stale_ids))
                delete_query.execute()
            if insights_to_create:
                rows = (model_to_dict(new) for new in insights_to_create)
                imported += batch_insert(ProductInsight, rows, 50)

        return imported
Пример #5
0
    def generate_candidates(
        cls,
        product: Product,
        predictions: List[Prediction],
    ) -> Iterator[ProductInsight]:
        """Yield at most one candidate insight, built from the first prediction.

        Nothing is yielded when the product already has a quantity or when
        there are no predictions. The candidate is flagged as not
        automatically processable when the predictions sharing its matcher
        type carry more than one distinct value, or when its data source is
        ``"product_name"``.
        """
        if product.quantity is not None or not predictions:
            # Quantity already known, or nothing to build a candidate from
            return

        # Only the first prediction is turned into an insight; the caller
        # is assumed to provide predictions sorted best-first.
        best = predictions[0]
        grouped = cls.group_by_subtype(predictions)

        candidate = ProductInsight(**best.to_dict())
        same_subtype = grouped[candidate.data["matcher_type"]]
        distinct_values = {item.value for item in same_subtype}

        if (len(distinct_values) > 1
                or candidate.data.get("source") == "product_name"):
            # Several different values for the same subtype, or a value
            # taken from the product name (less accurate than OCR data)
            # -> don't process automatically
            candidate.automatic_processing = False

        yield candidate
Пример #6
0
    def test_get_insight_update_annotated_reference(self):
        """An annotated reference insight is not deleted by get_insight_update.

        The reference has ``annotation=0`` and a different ``value_tag``
        than the candidate, so the candidate must be created and nothing
        must be deleted.
        """
        # Declared under the name the call below uses, so the importer
        # exercised by this test is the one defined here.
        class InsightImporterWithIsConflictingInsight(InsightImporter):
            @classmethod
            def is_conflicting_insight(cls, candidate, reference):
                # Two insights conflict when they carry the same value tag
                return candidate.value_tag == reference.value_tag

        references = [
            ProductInsight(
                barcode=DEFAULT_BARCODE,
                type=InsightType.label,
                value_tag="tag1",
                id=uuid.UUID("a6aa784b-4d39-4baa-a16c-b2f1c9dac9f9"),
                annotation=0,
            ),
        ]
        candidates = [
            ProductInsight(
                barcode=DEFAULT_BARCODE,
                type=InsightType.label,
                value_tag="tag2",
                id=uuid.UUID("c984b252-fb31-41ea-b78e-6ca08b9f5e4b"),
            ),
        ]
        (
            to_create,
            to_delete,
        ) = InsightImporterWithIsConflictingInsight.get_insight_update(
            candidates, references)
        assert to_create == candidates
        # Annotated existing insight should not be deleted
        assert to_delete == []
def test_process_insight_category(mocker):
    """process_insights() annotates a pending category insight once."""
    mocker.patch("robotoff.insights.annotate.get_product",
                 return_value={"categories_tags": []})
    update_product_mock = mocker.patch("robotoff.off.update_product")

    # one insight that is already completed and annotated ...
    completed_at = datetime.utcnow() - timedelta(minutes=10)
    done_id, _ = _create_insight(type="category",
                                 completed_at=completed_at,
                                 annotation=1)
    # ... and one that still has to be processed
    pending_id, pending_code = _create_insight(type="category")

    process_insights()

    # the completed insight keeps its original completion date
    assert ProductInsight.get(id=done_id).completed_at == completed_at

    # the pending insight is now completed and annotated
    processed = ProductInsight.get(id=pending_id)
    assert processed.completed_at is not None
    assert processed.completed_at <= datetime.utcnow()
    assert processed.annotation == 1

    # update_product was called exactly once, for the pending insight
    expected_params = {
        "code": pending_code,
        "add_categories": "en:Salmons",
        "comment": f"[robotoff] Adding category 'en:Salmons', ID: {pending_id}",
    }
    update_product_mock.assert_called_once_with(
        expected_params,
        auth=None,
        server_domain=settings.OFF_SERVER_DOMAIN,
    )
Пример #8
0
def test_popular_question_pagination(client, mocker):
    """The popular-questions endpoint pages through 12 insights, 5 per page."""
    mocker.patch("robotoff.insights.question.get_product", return_value={})
    ProductInsight.delete().execute()  # remove default sample
    for rank in range(12):
        ProductInsightFactory(barcode=rank, unique_scans_n=100 - rank)

    # (request URL, expected status, expected barcodes in order)
    expected_pages = (
        ("/api/v1/questions/popular?count=5&page=1", "found",
         ["0", "1", "2", "3", "4"]),
        ("/api/v1/questions/popular?count=5&page=2", "found",
         ["5", "6", "7", "8", "9"]),
        ("/api/v1/questions/popular?count=5&page=3", "found",
         ["10", "11"]),
        # past the last page: no question left
        ("/api/v1/questions/popular?count=5&page=4", "no_questions", []),
    )

    for url, expected_status, expected_barcodes in expected_pages:
        response = client.simulate_get(url)
        assert response.status_code == 200
        payload = response.json
        assert payload["count"] == 12
        assert payload["status"] == expected_status
        returned = [q["barcode"] for q in payload["questions"]]
        assert returned == expected_barcodes
Пример #9
0
 def generate_candidates(
     cls,
     product: Product,
     predictions: List[Prediction],
 ) -> Iterator[ProductInsight]:
     """Yield one insight per prediction, flagged for automatic processing.

     ``product`` is not read by this implementation.
     """
     for source_prediction in predictions:
         fields = source_prediction.to_dict()
         candidate = ProductInsight(**fields)
         candidate.automatic_processing = True
         yield candidate
Пример #10
0
 def test_import_one(self, predictions):
     """Importing the given predictions stores one non-automatic insight."""
     import_count = self._run_import(predictions)
     assert import_count == 1
     # exactly one insight row is present after the import
     assert ProductInsight.select().count() == 1
     created = ProductInsight.get(ProductInsight.id != insight_id1)
     assert created.value_tag == "en:smoked-salmons"
     assert created.server_domain == settings.OFF_SERVER_DOMAIN
     assert not created.automatic_processing
Пример #11
0
def delete_invalid_insight(insight: ProductInsight,
                           validator: Optional[InsightValidator]) -> bool:
    """Delete the insight when the validator rejects it.

    :param insight: insight to check, and to delete when invalid.
    :param validator: validator to consult; with ``None`` nothing is
        checked and nothing is deleted.
    :return: True if the insight was deleted, False otherwise.
    """
    if validator is None or validator.is_valid(insight):
        return False

    insight.delete_instance()
    return True
Пример #12
0
    def annotate(self, insight: ProductInsight, annotation: int, update=True) \
            -> AnnotationResult:
        """Store the annotation on the insight, then optionally update the product.

        :param insight: insight to annotate; it is saved with the annotation
            value and the current UTC time as completion date.
        :param annotation: value to store; ``1`` triggers the product update
            when ``update`` is truthy.
        :param update: whether ``update_product`` may be called.
        :return: the result of ``update_product`` when it is called,
            ``SAVED_ANNOTATION_RESULT`` otherwise.
        """
        insight.annotation = annotation
        insight.completed_at = datetime.datetime.utcnow()
        insight.save()

        if annotation != 1 or not update:
            return SAVED_ANNOTATION_RESULT

        return self.update_product(insight)
Пример #13
0
    def process_annotation(
        self,
        insight: ProductInsight,
        data: Optional[Dict] = None,
        auth: Optional[OFFAuthentication] = None,
    ) -> AnnotationResult:
        """Attach the annotation payload to the insight and save it.

        :param insight: insight whose ``data`` receives the payload.
        :param data: payload stored under the ``"annotation"`` key.
        :param auth: accepted but not read by this implementation.
        :return: always ``SAVED_ANNOTATION_RESULT``.
        """
        insight_data = insight.data
        insight_data["annotation"] = data
        insight.save()

        return SAVED_ANNOTATION_RESULT
Пример #14
0
 def test_import_auto(self):
     """An ``auto=True`` neural prediction is imported as an automatic insight."""
     auto_prediction = neural_prediction("en:smoked-salmons",
                                         confidence=0.91,
                                         auto=True)
     import_count = self._run_import([auto_prediction])
     assert import_count == 1
     # exactly one insight row is present after the import
     assert ProductInsight.select().count() == 1
     created = ProductInsight.get(ProductInsight.id != insight_id1)
     assert created.value_tag == "en:smoked-salmons"
     assert created.server_domain == settings.OFF_SERVER_DOMAIN
     assert created.automatic_processing
Пример #15
0
    def annotate(self,
                 insight: ProductInsight,
                 annotation: int,
                 update=True,
                 session_cookie: Optional[str] = None) -> AnnotationResult:
        """Store the annotation on the insight, then optionally update the product.

        :param insight: insight to annotate; it is saved with the annotation
            value and the current UTC time as completion date.
        :param annotation: value to store; ``1`` triggers the product update
            when ``update`` is truthy.
        :param update: whether ``update_product`` may be called.
        :param session_cookie: forwarded unchanged to ``update_product``.
        :return: the result of ``update_product`` when it is called,
            ``SAVED_ANNOTATION_RESULT`` otherwise.
        """
        insight.annotation = annotation
        insight.completed_at = datetime.datetime.utcnow()
        insight.save()

        if annotation != 1 or not update:
            return SAVED_ANNOTATION_RESULT

        return self.update_product(insight, session_cookie=session_cookie)
def test_process_insight_update_product_raises(mocker):
    """A failing product update leaves that insight pending; others proceed."""
    def raise_for_salmons(params, *args, **kwargs):
        # Fail only for the update that adds the "en:Salmons" category
        if "en:Salmons" not in params.values():
            return
        raise Exception("Boom !")

    mocker.patch("robotoff.insights.annotate.get_product",
                 return_value={"categories_tags": []})
    update_product_mock = mocker.patch("robotoff.off.update_product",
                                       side_effect=raise_for_salmons)
    # first insight: its product update raises
    failing_id, failing_code = _create_insight(type="category")
    # second insight: its product update succeeds
    passing_id, passing_code = _create_insight(type="category",
                                               value_tag="en:Tuna")

    started = datetime.utcnow()
    process_insights()
    finished = datetime.utcnow()

    # the failing insight is still pending ...
    failing = ProductInsight.get(id=failing_id)
    assert failing.completed_at is None
    assert failing.annotation is None
    # ... although update_product was attempted for it
    failing_params = {
        "code": failing_code,
        "add_categories": "en:Salmons",
        "comment": f"[robotoff] Adding category 'en:Salmons', ID: {failing_id}",
    }
    update_product_mock.assert_any_call(
        failing_params,
        auth=None,
        server_domain=settings.OFF_SERVER_DOMAIN,
    )

    # the passing insight was processed during the run ...
    passing = ProductInsight.get(id=passing_id)
    assert passing.completed_at is not None
    assert started < passing.completed_at < finished
    assert passing.annotation == 1
    # ... and update_product was called for it too
    passing_params = {
        "code": passing_code,
        "add_categories": "en:Tuna",
        "comment": f"[robotoff] Adding category 'en:Tuna', ID: {passing_id}",
    }
    update_product_mock.assert_any_call(
        passing_params,
        auth=None,
        server_domain=settings.OFF_SERVER_DOMAIN,
    )
    # no call besides those two
    assert update_product_mock.call_count == 2
Пример #17
0
 def generate_insights(cls, predictions, server_domain, automatic,
                       product_store):
     """Yield one fixed (to_create, to_delete) pair of label insights.

     The arguments are ignored: the insight to create carries ``"tag1"``
     and the insight to delete carries ``"tag2"``.
     """
     label_type = InsightType.label.name
     to_create = [
         ProductInsight(barcode=DEFAULT_BARCODE,
                        type=label_type,
                        value_tag="tag1"),
     ]
     to_delete = [
         ProductInsight(barcode=DEFAULT_BARCODE,
                        type=label_type,
                        value_tag="tag2"),
     ]
     yield to_create, to_delete
Пример #18
0
    def process_product_insights(self, barcode: str,
                                 insights: List[JSONType]) \
            -> Iterable[JSONType]:
        """Yield insert payloads for the valid, not-yet-seen codes of a product.

        The codes already stored for this barcode and insight type (the
        ``text`` entry of the stored insight data) are loaded first. Each
        raw insight whose code passes ``is_valid`` against that set is
        yielded and its code is then added to the set.

        :param barcode: barcode of the product being processed.
        :param insights: raw insights, each with ``content`` and ``source``.
        """
        stored_rows = ProductInsight.select(
            ProductInsight.data['text'].as_json().alias('text')).where(
                ProductInsight.type == self.get_type(),
                ProductInsight.barcode == barcode)
        known_codes: Set[str] = {row.text for row in stored_rows.iterator()}

        for raw_insight in insights:
            content = raw_insight['content']
            emb_code = content['text']

            if self.is_valid(barcode, emb_code, known_codes):
                source = raw_insight['source']
                payload = {
                    'source': source,
                    'matcher_type': content['type'],
                    'raw': content['raw'],
                    'text': emb_code,
                    'notify': content['notify'],
                }
                yield {'source_image': source, 'data': payload}
                known_codes.add(emb_code)
Пример #19
0
def apply_insights(insight_type: str, max_timedelta: datetime.timedelta):
    """Automatically annotate the pending insights of one type.

    Unannotated insights of ``insight_type`` are visited in random order.
    An insight is skipped while its ``process_after`` date has not passed,
    or when it is a label insight whose ``value_tag`` is not in the
    authorized label store. Insights for which
    ``is_automatically_processable`` raises ``InvalidInsight`` are deleted;
    those it accepts are annotated with value ``1``.

    :param insight_type: type of the insights to process.
    :param max_timedelta: forwarded to ``is_automatically_processable``.
    """
    logger.info("Timedelta: {}".format(max_timedelta))

    annotator = InsightAnnotatorFactory.get(insight_type)
    authorized_labels: Set[str] = AUTHORIZED_LABELS_STORE.get()
    annotated = 0

    pending = ProductInsight.select().where(
        ProductInsight.type == insight_type,
        ProductInsight.annotation.is_null(),
    ).order_by(fn.Random())

    for insight in pending:
        not_before = insight.process_after
        if (not_before is not None
                and not_before >= datetime.datetime.utcnow()):
            # scheduled for later
            continue

        if (insight_type == InsightType.label.name
                and insight.value_tag not in authorized_labels):
            continue

        try:
            processable = is_automatically_processable(
                insight, max_timedelta)
        except InvalidInsight:
            logger.info("Deleting insight {}".format(insight.id))
            insight.delete_instance()
        else:
            if processable:
                logger.info("Annotating insight {} (barcode: {})".format(
                    insight.value_tag or insight.value, insight.barcode))
                annotator.annotate(insight, 1, update=True, automatic=True)
                annotated += 1

    logger.info("Annotated insights: {}".format(annotated))
Пример #20
0
def batch_annotate(insight_type: str,
                   dry: bool = True,
                   json_contains: Optional[Dict] = None):
    """Annotate every pending insight of one type, or only count them.

    :param insight_type: type of the insights to select.
    :param dry: when True (the default), print the number of matching
        insights and annotate nothing.
    :param json_contains: optional filter; when given, only insights whose
        ``data`` contains it are selected.
    """
    annotator = InsightAnnotatorFactory.get(insight_type)

    filters = [
        ProductInsight.type == insight_type,
        ProductInsight.annotation.is_null()
    ]
    if json_contains is not None:
        filters.append(ProductInsight.data.contains(json_contains))

    pending = ProductInsight.select().where(*filters)

    if dry:
        print("-- dry run --\n"
              "{} items matching filter:\n"
              "   insight type: {}\n"
              "   filter: {}"
              "".format(pending.count(), insight_type, json_contains))
        return

    for position, insight in enumerate(pending, start=1):
        print("Insight %d" % position)
        print("Add label {} to https://fr.openfoodfacts.org/produit/{}"
              "".format(insight.data,
                        insight.barcode))
        print(insight.data)

        annotator.annotate(insight, 1, update=True)
Пример #21
0
def save_insight(
    insight_id: str,
    annotation: int,
    update: bool = True,
    data: Optional[Dict] = None,
    auth: Optional[OFFAuthentication] = None,
) -> AnnotationResult:
    """Annotate the insight with the given ID through its type's annotator.

    :param insight_id: ID of the insight to annotate.
    :param annotation: annotation value handed to the annotator.
    :param update: forwarded to the annotator.
    :param data: forwarded to the annotator.
    :param auth: forwarded to the annotator.
    :return: ``UNKNOWN_INSIGHT_RESULT`` when no insight has this ID,
        ``ALREADY_ANNOTATED_RESULT`` when the insight already carries an
        annotation, otherwise the annotator's result.
    """
    try:
        target = ProductInsight.get_by_id(insight_id)
    except ProductInsight.DoesNotExist:
        return UNKNOWN_INSIGHT_RESULT

    if not target:
        return UNKNOWN_INSIGHT_RESULT

    if target.annotation is not None:
        return ALREADY_ANNOTATED_RESULT

    annotator = InsightAnnotatorFactory.get(target.type)
    return annotator.annotate(target, annotation, update,
                              data=data, auth=auth)
Пример #22
0
def process_insights():
    """Annotate every unannotated insight whose ``process_after`` date has passed.

    Each selected insight is annotated with value ``1`` through the
    annotator of its type. When the annotation result is
    ``UPDATED_ANNOTATION_RESULT`` and the insight data has a truthy
    ``"notify"`` entry, the Slack notifier is informed. An exception raised
    while handling one insight is logged and does not stop the loop.
    """
    processed = 0
    for insight in (ProductInsight.select().where(
            ProductInsight.annotation.is_null(),
            ProductInsight.process_after.is_null(False),
            ProductInsight.process_after <= datetime.datetime.utcnow(),
    ).iterator()):
        try:
            annotator = InsightAnnotatorFactory.get(insight.type)
            logger.info("Annotating insight %s (product: %s)", insight.id,
                        insight.barcode)
            annotation_result = annotator.annotate(insight, 1, update=True)
            processed += 1

            if annotation_result == UPDATED_ANNOTATION_RESULT and insight.data.get(
                    "notify", False):
                slack.NotifierFactory.get_notifier(
                ).notify_automatic_processing(insight)
        except Exception as e:
            # continue to the next one
            # Note: annotator already rolled-back the transaction
            # The exception is passed as a %-style argument rather than
            # interpolated into the template, so a "%" in its text cannot
            # corrupt the format string.
            logger.exception(
                "exception %s while handling annotation of insight %s (product) %s",
                e,
                insight.id,
                insight.barcode,
            )
    logger.info("%d insights processed", processed)
Пример #23
0
    def on_get(self, req: falcon.Request, resp: falcon.Response, insight_id: str):
        """Respond with the dictionary form of the requested insight.

        :raises falcon.HTTPNotFound: when no insight has this ID.
        """
        try:
            found: ProductInsight = ProductInsight.get_by_id(insight_id)
        except ProductInsight.DoesNotExist:
            raise falcon.HTTPNotFound()
        else:
            resp.media = found.to_dict()
Пример #24
0
def test_annotate_insight_not_enough_votes(client):
    """A single anonymous vote is recorded without annotating the insight."""
    request_params = {
        "insight_id": insight_id,
        "annotation": -1,
        "device_id": "voter1",
    }
    response = client.simulate_post("/api/v1/insights/annotate",
                                    params=request_params)

    assert response.status_code == 200
    assert response.json == {
        "description": "the annotation vote was saved",
        "status": "vote_saved",
    }

    # Without authentication only a vote is cast: one row, no username.
    recorded_votes = list(AnnotationVote.select().dicts())
    assert len(recorded_votes) == 1

    vote = recorded_votes[0]
    assert vote["value"] == -1
    assert vote["username"] is None
    assert vote["device_id"] == "voter1"

    matching_rows = ProductInsight.select().where(
        ProductInsight.id == insight_id).dicts().iterator()
    stored = next(matching_rows)

    # The insight itself is untouched apart from its vote counter.
    untouched_keys = ("username", "completed_at", "annotation")
    assert not any(stored[key] for key in untouched_keys)
    assert stored.items() > {"n_votes": 1}.items()
Пример #25
0
def updated_product_update_insights(barcode: str):
    """Refresh the insights of a product after the product was updated.

    The product is fetched with ``get_product``; when it does not exist a
    warning is logged and nothing else happens. Otherwise a category
    insight may be added, and every unannotated insight of the product is
    checked with the validator of its type and deleted when invalid.

    :param barcode: barcode of the updated product.
    """
    product_dict = get_product(barcode)

    if product_dict is None:
        # Logger.warn is a deprecated alias of Logger.warning
        logger.warning("Updated product does not exist: {}".format(barcode))
        return

    category_added = updated_product_add_category_insight(
        barcode, product_dict)

    if category_added:
        logger.info("Product {} updated".format(barcode))

    product = Product(product_dict)
    # One validator per insight type, created on first use
    validators: Dict[str, InsightValidator] = {}

    for insight in (ProductInsight.select().where(
            ProductInsight.annotation.is_null(),
            ProductInsight.barcode == barcode).iterator()):
        if insight.type not in validators:
            validators[insight.type] = InsightValidatorFactory.create(
                insight.type, None)

        validator = validators[insight.type]
        insight_deleted = delete_invalid_insight(insight,
                                                 validator=validator,
                                                 product=product)
        if insight_deleted:
            logger.info("Insight {} deleted (type: {})".format(
                insight.id, insight.type))
Пример #26
0
def refresh_insights(with_deletion: bool = False):
    """Delete invalid insights and refresh the attributes of the others.

    The job runs only when the dataset file at
    ``settings.JSONL_MIN_DATASET_PATH`` was modified today; otherwise a
    warning is logged and nothing is changed. Unannotated insights of the
    configured server domain, created up to the start of the current day,
    are then processed inside a single transaction.

    :param with_deletion: when True, insights whose product is missing
        from the product store are deleted as well.
    """
    deleted = 0
    updated = 0
    product_store = CACHED_PRODUCT_STORE.get()

    # Midnight of the current day (naive UTC)
    datetime_threshold = datetime.datetime.utcnow().replace(hour=0,
                                                            minute=0,
                                                            second=0,
                                                            microsecond=0)
    # NOTE(review): fromtimestamp() yields local time while the threshold
    # above is UTC; the date comparison below assumes both agree - confirm.
    dataset_datetime = datetime.datetime.fromtimestamp(
        os.path.getmtime(settings.JSONL_MIN_DATASET_PATH))

    if dataset_datetime.date() != datetime_threshold.date():
        # Logger.warn is a deprecated alias of Logger.warning
        logger.warning(
            "Dataset version is not up to date, aborting insight removal job")
        return

    # One validator per insight type, created on first use
    validators: Dict[str, InsightValidator] = {}

    with db:
        with db.atomic():
            for insight in (ProductInsight.select().where(
                    ProductInsight.annotation.is_null(),
                    ProductInsight.timestamp <= datetime_threshold,
                    ProductInsight.server_domain == settings.OFF_SERVER_DOMAIN,
            ).iterator()):
                product: Product = product_store[insight.barcode]

                if product is None:
                    if with_deletion:
                        # Product has been deleted from OFF
                        logger.info("Product with barcode {} deleted"
                                    "".format(insight.barcode))
                        deleted += 1
                        insight.delete_instance()
                else:
                    if insight.type not in validators:
                        validators[
                            insight.type] = InsightValidatorFactory.create(
                                insight.type, product_store)

                    validator = validators[insight.type]
                    insight_deleted = delete_invalid_insight(
                        insight, validator)

                    if insight_deleted:
                        deleted += 1
                        logger.info(
                            "invalid insight {} (type: {}), deleting..."
                            "".format(insight.id, insight.type))
                        continue

                    # Still valid: bring its attributes in line with the
                    # product
                    insight_updated = update_insight_attributes(
                        product, insight)

                    if insight_updated:
                        updated += 1

    logger.info("{} insights deleted".format(deleted))
    logger.info("{} insights updated".format(updated))
Пример #27
0
def get_insights(barcode: Optional[str] = None,
                 keep_types: Optional[List[str]] = None,
                 country: Optional[str] = None,
                 brands: Optional[List[str]] = None,
                 count=25) -> Iterable[ProductInsight]:
    """Return up to ``count`` unannotated insights, in random order.

    Every filter is optional and only applied when provided.

    :param barcode: restrict to insights with this barcode (ignored when
        empty).
    :param keep_types: restrict to these insight types (ignored when empty).
    :param country: restrict to insights whose ``countries`` contain this
        value (ignored only when ``None``).
    :param brands: restrict to insights whose ``brands`` contain any of
        these values (ignored when empty).
    :param count: maximum number of insights returned.
    :return: an iterator over the matching insights.
    """
    where_clauses = [
        ProductInsight.annotation.is_null(),
    ]

    if barcode:
        where_clauses.append(ProductInsight.barcode == barcode)

    if keep_types:
        where_clauses.append(ProductInsight.type.in_(keep_types))

    if country is not None:
        where_clauses.append(ProductInsight.countries.contains(
            country))

    if brands:
        where_clauses.append(ProductInsight.brands.contains_any(
            brands))

    query = (ProductInsight.select()
                           .where(*where_clauses)
                           .limit(count)
                           .order_by(peewee.fn.Random()))
    return query.iterator()
Пример #28
0
    def process_product_insights(self, barcode: str,
                                 insights: List[JSONType]) \
            -> Iterable[JSONType]:
        """Yield insert payloads for the valid, not-yet-seen labels of a product.

        The value tags already stored for this barcode and insight type are
        loaded first. Each raw insight is validated with its own
        ``barcode`` entry and its ``label_tag``; accepted ones are yielded
        and their tag is then added to the seen set. The
        ``automatic_processing`` entry is removed from the insight content
        and, when it is not ``None``, copied to the top level of the payload.

        :param barcode: barcode used to look up the stored insights.
        :param insights: raw insights, each with ``barcode``, ``content``
            and ``source``.
        """
        stored_rows = ProductInsight.select(ProductInsight.value_tag).where(
            ProductInsight.type == self.get_type(),
            ProductInsight.barcode == barcode)
        known_labels: Set[str] = {
            row.value_tag for row in stored_rows.iterator()
        }

        for raw_insight in insights:
            insight_barcode = raw_insight['barcode']
            content = raw_insight['content']
            label_tag = content['label_tag']

            if not self.is_valid(insight_barcode, label_tag, known_labels):
                continue

            source = raw_insight['source']
            # pop() keeps the flag out of the 'data' payload built below
            auto_flag = content.pop('automatic_processing', None)
            payload = {
                'value_tag': label_tag,
                'source_image': source,
                'data': {'source': source, **content},
            }
            if auto_flag is not None:
                payload['automatic_processing'] = auto_flag

            yield payload
            known_labels.add(label_tag)
Пример #29
0
    def process_product_insights(self, barcode: str,
                                 insights: List[JSONType]) \
            -> Iterable[JSONType]:
        """Yield at most one insert payload for the product.

        Nothing is yielded when more than one raw insight is given (this is
        logged), or when an insight of this type is already stored for the
        barcode. Otherwise the first raw insight for which
        ``is_valid(barcode)`` holds is yielded.

        :param barcode: barcode of the product being processed.
        :param insights: raw insights, each with ``content`` and ``source``.
        """
        candidate_count = len(insights)
        if candidate_count > 1:
            logger.info("{} distinct expiration dates found for product "
                        "{}, aborting import".format(candidate_count, barcode))
            return

        already_stored = ProductInsight.select().where(
            ProductInsight.type == self.get_type(),
            ProductInsight.barcode == barcode).count()
        if already_stored:
            return

        for raw_insight in insights:
            content = raw_insight['content']

            if self.is_valid(barcode):
                source = raw_insight['source']
                payload = {
                    'source': source,
                    'notify': content['notify'],
                    **content
                }
                yield {'source_image': source, 'data': payload}
                # a single insight per product is enough
                return
Пример #30
0
    def process_insights(
        self,
        data: Iterable[ProductInsights],
        server_domain: str,
        automatic: bool = False,
    ) -> Iterator[Insight]:
        """Yield one insight per (barcode, lang) pair not seen before.

        The pairs of the unannotated insights of this type already stored
        for ``server_domain`` are loaded first. Raw insights whose pair is
        already known are skipped; the others are recorded and converted
        with ``Insight.from_raw_insight``.

        :param data: per-product groups of raw insights.
        :param server_domain: domain used to select the stored insights.
        :param automatic: not read by this implementation.
        """
        stored_rows = ProductInsight.select(
            ProductInsight.barcode, ProductInsight.data).where(
                ProductInsight.type == self.get_type(),
                ProductInsight.server_domain == server_domain,
                ProductInsight.annotation.is_null(True),
            )
        known_pairs: Set[Tuple[str, str]] = {
            (row.barcode, row.data["lang"]) for row in stored_rows.iterator()
        }

        for product_insights in data:
            barcode = product_insights.barcode

            for raw_insight in product_insights.insights:
                pair = (barcode, raw_insight.data["lang"])
                if pair in known_pairs:
                    continue

                known_pairs.add(pair)
                yield Insight.from_raw_insight(raw_insight,
                                               product_insights,
                                               latent=False)