Example No. 1
def predict_from_dataset(
        dataset: ProductDataset,
        from_datetime: Optional[datetime.datetime] = None
) -> Iterable[JSONType]:
    """Return an iterable of category insights, using the provided dataset.

    Args:
        dataset: a ProductDataset
        from_datetime: datetime threshold: only keep products modified after
            `from_datetime`
    """
    product_stream = (
        dataset.stream()
        .filter_nonempty_text_field("code")
        .filter_nonempty_text_field("product_name")
        .filter_empty_tag_field("categories_tags")
        .filter_nonempty_tag_field("countries_tags")
        .filter_nonempty_tag_field("languages_codes")
    )

    if from_datetime:
        product_stream = product_stream.filter_by_modified_datetime(
            from_t=from_datetime)

    product_iter = product_stream.iter()
    logger.info("Performing prediction on products without categories")

    es_client = get_es_client()
    yield from predict_from_iterable(es_client, product_iter)
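A possible way to drive this generator, assuming ProductDataset is importable from robotoff.products and that dump_jsonl behaves as in the spellcheck examples further down (both are assumptions, not shown on this page):

import datetime

from robotoff import settings
from robotoff.products import ProductDataset  # import path assumed
from robotoff.utils import dump_jsonl

# Dump category insights for products modified in the last 7 days to a JSONL file.
dataset = ProductDataset(settings.JSONL_DATASET_PATH)
since = datetime.datetime.utcnow() - datetime.timedelta(days=7)
insights = predict_from_dataset(dataset, from_datetime=since)
dump_jsonl("category_insights.jsonl", insights)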
Example No. 2
    def init_elasticsearch(index: bool, data: bool, product: bool,
                           category: bool, product_version: str):
        import orjson

        from robotoff import settings
        from robotoff.utils.es import get_es_client
        from robotoff.elasticsearch.product.dump import product_export
        from robotoff.elasticsearch.category.dump import category_export

        if index:
            with settings.ELASTICSEARCH_PRODUCT_INDEX_CONFIG_PATH.open(
                    "rb") as f:
                product_index_config = orjson.loads(f.read())

            with settings.ELASTICSEARCH_CATEGORY_INDEX_CONFIG_PATH.open(
                    "rb") as f:
                category_index_config = orjson.loads(f.read())

            client = get_es_client()

            if product:
                client.indices.create(product_version, product_index_config)

            if category:
                client.indices.create("category", category_index_config)

        if data:
            if product:
                product_export(version=product_version)

            if category:
                category_export()
Example No. 3
def init_elasticsearch(
    load_index: bool = False,
    load_data: bool = True,
    to_load: Optional[List[str]] = None,
) -> None:
    """
    This command is used for manual insertion of the Elasticsearch data and/or indexes
    for products and categorties.

    to_load specifies which indexes/data should be loaded - supported values are
    in robotoff.settings.ElasticsearchIndex.
    """
    from robotoff.elasticsearch.export import ElasticsearchExporter
    from robotoff.settings import ElasticsearchIndex
    from robotoff.utils import get_logger
    from robotoff.utils.es import get_es_client

    logger = get_logger()

    es_exporter = ElasticsearchExporter(get_es_client())

    if not to_load:
        return

    for item in to_load:
        if item not in ElasticsearchIndex.SUPPORTED_INDICES:
            logger.error(f"Skipping over unknown Elasticsearch type: '{item}'")
            continue
        if load_index:
            es_exporter.load_index(item,
                                   ElasticsearchIndex.SUPPORTED_INDICES[item])
        if load_data:
            es_exporter.export_index_data(item)
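For example, recreating and repopulating both indices could look like the call below (the "product" and "category" keys are assumed from the surrounding examples to be the supported index names):

init_elasticsearch(load_index=True, load_data=True, to_load=["product", "category"])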
Example No. 4
    def init_elasticsearch(index: bool, data: bool, product: bool,
                           category: bool):
        import json
        from robotoff import settings
        from robotoff.utils.es import get_es_client
        from robotoff.elasticsearch.product.dump import product_export
        from robotoff.elasticsearch.category.dump import category_export

        if index:
            with settings.ELASTICSEARCH_PRODUCT_INDEX_CONFIG_PATH.open(
                    'r') as f:
                product_index_config = json.load(f)

            with settings.ELASTICSEARCH_CATEGORY_INDEX_CONFIG_PATH.open(
                    'r') as f:
                category_index_config = json.load(f)

            client = get_es_client()

            if product:
                client.indices.create('product', product_index_config)

            if category:
                client.indices.create('category', category_index_config)

        if data:
            if product:
                product_export()

            if category:
                category_export()
Example No. 5
def test_load_index_already_exists(mocker):
    mocker.patch("elasticsearch.client.IndicesClient.exists",
                 return_value=True)
    create_call = mocker.patch("elasticsearch.client.IndicesClient.create")

    exporter = ElasticsearchExporter(get_es_client())
    exporter.load_index("category", "filepath/")

    create_call.assert_not_called()
Example No. 6
    def generate_spellcheck_insights(output: str, confidence: float):
        from robotoff.ingredients import generate_insights
        from robotoff.utils import dump_jsonl, get_logger
        from robotoff.utils.es import get_es_client

        get_logger()

        client = get_es_client()
        insights_iter = generate_insights(client, confidence=confidence)
        dump_jsonl(output, insights_iter)
Example No. 7
def _refresh_elasticsearch():
    logger.info("Refreshing Elasticsearch data")

    es_client = get_es_client()
    exporter = ElasticsearchExporter(es_client)

    supported_indices = settings.ElasticsearchIndex.SUPPORTED_INDICES
    for index, config_path in supported_indices.items():
        exporter.load_index(index, config_path)
        exporter.export_index_data(index)
Example No. 8
def test_load_index(mocker):
    mocker.patch("elasticsearch.client.IndicesClient.exists",
                 return_value=False)
    create_call = mocker.patch("elasticsearch.client.IndicesClient.create")

    exporter = ElasticsearchExporter(get_es_client())

    with patch("builtins.open", mocker.mock_open(read_data='{"a":"b"}')):
        exporter.load_index("category", "filepath/")

    create_call.assert_called_once()
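The two load_index tests (Examples No. 5 and No. 8) pin down the expected behavior of ElasticsearchExporter.load_index: read the index configuration from a file and create the index only if it does not already exist. A minimal sketch consistent with those tests, not the actual implementation (attribute and file-handling details are assumptions):

import json

from elasticsearch import Elasticsearch


class ElasticsearchExporter:
    """Hedged sketch reconstructed from the tests above."""

    def __init__(self, es_client: Elasticsearch):
        self.es_client = es_client

    def load_index(self, index: str, index_config_path: str) -> None:
        # Skip creation when the index already exists (Example No. 5).
        if self.es_client.indices.exists(index):
            return
        # Otherwise read the JSON index configuration and create it (Example No. 8).
        with open(index_config_path) as f:
            config = json.load(f)
        self.es_client.indices.create(index, config)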
Example No. 9
def category_export():
    logger.info("Starting category export to Elasticsearch...")
    client = get_es_client()
    category_taxonomy: Taxonomy = get_taxonomy(InsightType.category.name)
    logger.info("Deleting existing categories...")
    delete_categories(client)
    logger.info("Starting export...")
    category_data = generate_category_data(category_taxonomy)
    rows_inserted = perform_export(client, category_data,
                                   settings.ELASTICSEARCH_CATEGORY_INDEX)
    logger.info("%d rows inserted" % rows_inserted)
Example No. 10
    def test_spellcheck(text: str, confidence: float):
        import json
        from robotoff.utils.es import get_es_client
        from robotoff.spellcheck import Spellchecker
        from robotoff.utils import get_logger

        get_logger()
        client = get_es_client()
        result = Spellchecker.load(
            client=client, confidence=confidence
        ).predict_insight(text, detailed=True)
        print(json.dumps(result, indent=5))
Example No. 11
def test_export_category_index_data(mocker):
    del_by_query = mocker.patch(
        "robotoff.elasticsearch.export.Elasticsearch.delete_by_query",
        return_value={"deleted": 10},
    )
    bulk_insert = mocker.patch(
        "robotoff.utils.es.elasticsearch.Elasticsearch.bulk")
    mocker.patch(
        "robotoff.elasticsearch.category.dump.get_taxonomy",
        return_value=_category_taxonomy(),
    )

    exporter = ElasticsearchExporter(get_es_client())
    inserted = exporter.export_index_data("category")

    del_by_query.assert_called_once()
    bulk_insert.assert_called_once()
    assert inserted == 1
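Together with the exporter sketch after Example No. 8, this test constrains export_index_data: wipe the index with delete_by_query, regenerate its documents, push them through a single bulk insert, and return the number of rows written. A hedged method-level sketch continuing that class; the data-generation helpers are the ones used in the category_export example above (Example No. 9) and are otherwise assumptions:

    def export_index_data(self, index: str) -> int:
        # Clear out every existing document in the index first.
        self.es_client.delete_by_query(
            body={"query": {"match_all": {}}}, index=index
        )
        # For the category index, documents come from the category taxonomy,
        # as in the category_export example; other indices are analogous.
        taxonomy = get_taxonomy(InsightType.category.name)
        data = generate_category_data(taxonomy)
        # perform_export is assumed to bulk-insert the documents and return the count.
        return perform_export(self.es_client, data, index)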
Example No. 12
def product_export():
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)

    product_iter = (
        dataset.stream()
        .filter_by_country_tag('en:france')
        .filter_nonempty_text_field('ingredients_text_fr')
        .filter_by_state_tag('en:complete')
        .iter()
    )
    product_iter = (
        p for p in product_iter
        if 'ingredients-unknown-score-above-0' not in p.get('quality_tags', [])
    )

    data = (
        (
            product['code'],
            {
                'ingredients_text_fr': normalize_ingredient_list(
                    product['ingredients_text_fr']
                )
            },
        )
        for product in product_iter
    )

    logger.info("Importing products")

    es_client = get_es_client()
    perform_export(es_client, data, settings.ELASTICSEARCH_PRODUCT_INDEX)
Example No. 13
def generate_spellcheck_insights(
    output: str,
    index_name: str = "product_all",
    confidence: float = 0.5,
    max_errors: Optional[int] = None,
    limit: Optional[int] = None,
) -> None:
    from robotoff.spellcheck import Spellchecker
    from robotoff.utils import dump_jsonl, get_logger
    from robotoff.utils.es import get_es_client

    logger = get_logger()
    logger.info("Max errors: {}".format(max_errors))

    client = get_es_client()
    insights_iter = Spellchecker.load(
        client=client, confidence=confidence, index_name=index_name
    ).generate_insights(max_errors=max_errors, limit=limit)

    dump_jsonl(output, insights_iter)
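A hypothetical invocation writing at most 1000 spellcheck insights to a JSONL file (the exact semantics of max_errors and limit are not shown on this page):

generate_spellcheck_insights(
    "spellcheck_insights.jsonl", confidence=0.7, max_errors=2, limit=1000
)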
Example No. 14
def product_export():
    dataset = ProductDataset(settings.JSONL_DATASET_PATH)

    product_iter = (
        dataset.stream()
        .filter_by_country_tag("en:france")
        .filter_nonempty_text_field("ingredients_text_fr")
        .filter_by_state_tag("en:complete")
        .iter()
    )
    product_iter = (
        p for p in product_iter
        if "ingredients-unknown-score-above-0" not in p.get("quality_tags", [])
    )

    data = (
        (
            product["code"],
            {
                "ingredients_text_fr": normalize_ingredient_list(
                    product["ingredients_text_fr"]
                )
            },
        )
        for product in product_iter
    )

    logger.info("Importing products")

    es_client = get_es_client()
    inserted = perform_export(es_client, data,
                              settings.ELASTICSEARCH_PRODUCT_INDEX)
    logger.info("{} rows inserted".format(inserted))
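Both product export examples rely on a perform_export(client, data, index) helper that returns the number of inserted rows, and the bulk-insert test in Example No. 11 shows the documents go through the Elasticsearch bulk API. A minimal sketch using the high-level bulk helper from elasticsearch-py; the real helper may build the bulk request differently:

from typing import Dict, Iterable, Tuple

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk


def perform_export(
    client: Elasticsearch, data: Iterable[Tuple[str, Dict]], index: str
) -> int:
    # Turn each (document id, document body) pair into a bulk "index" action.
    actions = (
        {"_index": index, "_id": doc_id, "_source": source}
        for doc_id, source in data
    )
    inserted, _errors = bulk(client, actions)
    return inserted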
Example No. 15
def predict_from_product(product: Dict) -> Optional[Dict]:
    client = get_es_client()
    return predict(client, product)
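All of these snippets obtain their connection through get_es_client(), which is never shown on this page. A minimal version, assuming the cluster address is configured in robotoff.settings (the ELASTICSEARCH_HOSTS name below is hypothetical):

from elasticsearch import Elasticsearch

from robotoff import settings


def get_es_client() -> Elasticsearch:
    # Single construction point so every caller shares the same cluster configuration.
    return Elasticsearch(hosts=settings.ELASTICSEARCH_HOSTS)  # setting name assumed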
Example No. 16

def match(client, query: str, lang: str):
    body = generate_request(query, lang)
    return client.search(
        index=settings.ELASTICSEARCH_CATEGORY_INDEX,
        doc_type=settings.ELASTICSEARCH_TYPE,
        body=body,
        _source=True,
    )


def generate_request(query: str, lang: str):
    return {
        "query": {
            "match_phrase": {"{}:name.stemmed".format(lang): {"query": query}}
        }
    }


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("query", help="query to search")
    parser.add_argument("--lang", help="language of the query", default="fr")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    es_client = get_es_client()
    results = match(es_client, args.query, args.lang)
    print(json.dumps(results["hits"], indent=4))
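For reference, the body produced by generate_request is a single match_phrase query against the language-specific stemmed name field:

>>> generate_request("pâte à tartiner", "fr")
{'query': {'match_phrase': {'fr:name.stemmed': {'query': 'pâte à tartiner'}}}}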