Exemplo n.º 1
0
 def predict_category(output: str):
     """Predict categories for the full JSONL product dataset and dump the
     predictions to *output* as JSONL."""
     from robotoff import settings
     from robotoff.elasticsearch.category.predict import predict_from_dataset
     from robotoff.products import ProductDataset
     from robotoff.utils import dump_jsonl

     product_dataset = ProductDataset(settings.JSONL_DATASET_PATH)
     dump_jsonl(output, predict_from_dataset(product_dataset))
Exemplo n.º 2
0
    def export_logo_annotation(
        output: pathlib.Path,
        server_domain: Optional[str] = None,
        annotated: Optional[bool] = None,
    ):
        """Export logo annotations to *output* as JSONL, optionally filtered
        by server domain and/or annotation status."""
        from robotoff.models import db, LogoAnnotation, ImageModel, ImagePrediction
        from robotoff.utils import dump_jsonl

        with db:
            filters = []
            if server_domain is not None:
                filters.append(ImageModel.server_domain == server_domain)
            if annotated is not None:
                # annotated=True keeps rows whose annotation_value is NOT null,
                # annotated=False keeps rows where it IS null.
                filters.append(LogoAnnotation.annotation_value.is_null(not annotated))

            query = (
                LogoAnnotation.select().join(ImagePrediction).join(ImageModel)
            )
            if filters:
                query = query.where(*filters)

            dump_jsonl(
                output,
                (annotation.to_dict() for annotation in query.iterator()),
            )
Exemplo n.º 3
0
def generate_category_insights(products: Iterable[JSONType], batch_size: int):
    """Predict category insights for *products* in batches of *batch_size*
    and write them to a per-language JSONL file under the project directory.

    NOTE(review): ``lang`` is a free variable here — presumably bound in an
    enclosing scope not shown in this snippet; confirm before reusing.
    """
    batch_insights = predict_from_product_batch(
        products,
        allowed_lang={lang},
        filter_blacklisted=True,
        batch_size=batch_size,
    )
    dump_jsonl(
        settings.PROJECT_DIR / "category_insights.{}.jsonl".format(lang),
        batch_insights,
    )
Exemplo n.º 4
0
    def predict_category(output: str):
        """Predict categories for the full JSONL product dataset and dump
        each insight, serialized to a dict, to *output* as JSONL."""
        from robotoff import settings
        from robotoff.elasticsearch.category.predict import predict_from_dataset
        from robotoff.products import ProductDataset
        from robotoff.utils import dump_jsonl

        product_dataset = ProductDataset(settings.JSONL_DATASET_PATH)
        dump_jsonl(
            output,
            (insight.to_dict() for insight in predict_from_dataset(product_dataset)),
        )
Exemplo n.º 5
0
    def generate_spellcheck_insights(output: str, confidence: float):
        """Generate spellcheck insights from Elasticsearch (at the given
        confidence threshold) and dump them to *output* as JSONL."""
        from robotoff.ingredients import generate_insights
        from robotoff.utils import dump_jsonl, get_logger
        from robotoff.utils.es import get_es_client

        get_logger()  # initialize logging; return value unused
        es_client = get_es_client()
        dump_jsonl(output, generate_insights(es_client, confidence=confidence))
Exemplo n.º 6
0
def generate_spellcheck_insights(
    output: str,
    index_name: str = "product_all",
    confidence: float = 0.5,
    max_errors: Optional[int] = None,
    limit: Optional[int] = None,
) -> None:
    """Run the spellchecker against the given Elasticsearch index and dump
    the generated insights to *output* as JSONL.

    ``max_errors`` and ``limit`` are forwarded to the insight generator;
    ``None`` means unbounded.
    """
    from robotoff.spellcheck import Spellchecker
    from robotoff.utils import dump_jsonl, get_logger
    from robotoff.utils.es import get_es_client

    logger = get_logger()
    logger.info("Max errors: {}".format(max_errors))

    spellchecker = Spellchecker.load(
        client=get_es_client(),
        confidence=confidence,
        index_name=index_name,
    )
    insights_iter = spellchecker.generate_insights(max_errors=max_errors, limit=limit)
    dump_jsonl(output, insights_iter)
Exemplo n.º 7
0
def run(lang: Optional[str] = None):
    """Build the category training dataset for *lang* (or every language,
    as 'xx', when None) and print the number of dumped items."""
    dataset = ProductDataset.load()
    stream = dataset.stream().filter_nonempty_tag_field('categories_tags')

    if lang is None:
        # No language restriction: just require a generic product name.
        stream = stream.filter_nonempty_text_field('product_name')
    else:
        stream = stream.filter_text_field('lang', lang)
        stream = stream.filter_nonempty_text_field('product_name_{}'.format(lang))

    output_path = (settings.PROJECT_DIR / 'datasets' / 'category' /
                   'category_{}.jsonl'.format(lang or 'xx'))
    count = dump_jsonl(output_path, generate_dataset(stream, lang))
    print(count)
Exemplo n.º 8
0
def run(lang: Optional[str] = None):
    """Build the category training dataset for *lang* (or every language,
    as 'xx', when None) and log the number of dumped items."""
    lang_label = lang or "xx"
    logger.info("Generating category dataset for lang {}".format(lang_label))

    stream = ProductDataset.load().stream().filter_nonempty_tag_field(
        "categories_tags")

    if lang is None:
        # No language restriction: just require a generic product name.
        stream = stream.filter_nonempty_text_field("product_name")
    else:
        stream = stream.filter_text_field("lang", lang)
        stream = stream.filter_nonempty_text_field("product_name_{}".format(lang))

    target = (
        settings.PROJECT_DIR / "datasets" / "category" /
        "category_{}.jsonl".format(lang_label)
    )
    count = dump_jsonl(target, generate_dataset(stream, lang))
    logger.info("{} items for lang {}".format(count, lang_label))
Exemplo n.º 9
0
from robotoff import settings
from robotoff.products import ProductDataset
from robotoff.utils import dump_jsonl, get_logger

logger = get_logger()


def images_dimension_iter():
    """Yield ``[width, height, barcode, image_id]`` for the full-size
    version of every image of every product that has a non-empty code."""
    dataset = ProductDataset.load()

    for product in dataset.stream().filter_nonempty_text_field("code"):
        for image_id, image_data in product.get("images", {}).items():
            # Skip non-numeric image ids and images that have no recorded
            # full-size resolution.
            if not image_id.isdigit() or "full" not in image_data["sizes"]:
                continue

            full_size = image_data["sizes"]["full"]
            yield [int(full_size["w"]), int(full_size["h"]),
                   product["code"], str(image_id)]


dump_jsonl(settings.PROJECT_DIR / "images_dimension.jsonl", images_dimension_iter())
Exemplo n.º 10
0
def dump_insights():
    """Dump every insight (annotated or not, no limit) to the configured
    insight dump path, logging how many were written."""
    logger.info("Dumping insights...")
    insights = transform_insight_iter(
        get_insights(as_dict=True, annotated=None, limit=None)
    )
    dumped = dump_jsonl(settings.INSIGHT_DUMP_PATH, insights)
    logger.info("Dump finished, {} insights dumped".format(dumped))
Exemplo n.º 11
0
import pathlib
from random import shuffle

from robotoff import settings
from robotoff.utils import dump_jsonl, jsonl_iter

lang = "pt"
input_path: pathlib.Path = (settings.DATASET_DIR / "category" /
                            "category_{}.jsonl".format(lang))

# Shuffle the dataset, then carve off 10% for validation, the next 10% for
# test, and keep the remaining 80% for training.
items = list(jsonl_iter(input_path))
shuffle(items)

val_count = len(items) // 10
val_items, test_items, train_items = (
    items[:val_count],
    items[val_count:2 * val_count],
    items[2 * val_count:],
)

dump_jsonl(input_path.with_name("category_{}.val.jsonl".format(lang)), val_items)
dump_jsonl(input_path.with_name("category_{}.test.jsonl".format(lang)), test_items)
dump_jsonl(input_path.with_name("category_{}.train.jsonl".format(lang)), train_items)