Example #1
def select_deepest_taxonomized_candidates(candidates: List[Prediction],
                                          taxonomy: Taxonomy):
    """Filter predictions to only keep the deepest items in the taxonomy.

    For instance, for a list of category predictions, the prediction with
    `value_tag` 'en:meat' will be removed if a prediction with `value_tag`
    'en:pork' is in the `candidates` list.

    :param candidates: The list of candidates to filter
    :param taxonomy: The taxonomy to use
    """
    value_tags = set()

    for candidate in candidates:
        if candidate.value_tag is None:
            logger.warning("Unexpected None `value_tag` (candidate: %s)",
                           candidate)
        else:
            value_tags.add(candidate.value_tag)

    nodes = [taxonomy[node] for node in value_tags if node in taxonomy]
    selected_node_ids = set(node.id
                            for node in taxonomy.find_deepest_nodes(nodes))
    return [
        candidate for candidate in candidates
        if candidate.value_tag in selected_node_ids
    ]
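
A minimal usage sketch of the filter above, assuming a toy taxonomy built with
Taxonomy.from_dict (same shape as in Example #7) and using
types.SimpleNamespace as a stand-in for Prediction, since only the `value_tag`
attribute is accessed here:

from types import SimpleNamespace

# Toy taxonomy mirroring the docstring: en:pork is a child of en:meat.
taxonomy = Taxonomy.from_dict({
    "en:meat": {"names": "meat"},
    "en:pork": {"names": "pork", "parents": ["en:meat"]},
})
candidates = [
    SimpleNamespace(value_tag="en:meat"),  # ancestor, should be dropped
    SimpleNamespace(value_tag="en:pork"),  # deepest node, should be kept
]
kept = select_deepest_taxonomized_candidates(candidates, taxonomy)
assert [c.value_tag for c in kept] == ["en:pork"]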
Example #2

def get_deepest_categories(
        taxonomy: Taxonomy,
        categories_tags: Iterable[List[str]]) -> List[List[str]]:
    return [
        sorted((x.id for x in taxonomy.find_deepest_nodes(
            [taxonomy[c] for c in categories])))
        for categories in categories_tags
    ]
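
As in Example #1, a hedged sketch of the expected behavior on a toy taxonomy
(same names/parents shape as in Example #7):

taxonomy = Taxonomy.from_dict({
    "en:fish": {"names": "fish"},
    "en:salmon": {"names": "salmon", "parents": ["en:fish"]},
})
# For each inner list of category tags, only the deepest nodes survive,
# returned sorted by node id.
assert get_deepest_categories(taxonomy, [["en:fish", "en:salmon"]]) == [["en:salmon"]]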
Example #3
    def __init__(self, category_taxonomy: Taxonomy):
        self.category_taxonomy: Taxonomy = category_taxonomy
        self.categories_set: Set[str] = set(category_taxonomy.keys())
        self.categories: List[str] = sorted(self.categories_set)
        self.categories_to_index: Dict[str, int] = {
            cat: i for (i, cat) in enumerate(self.categories)
        }
        self.transformer: Optional[ColumnTransformer] = None
        self.classifier: Optional[HierarchicalClassifier] = None
Example #4
    @classmethod
    def load(cls, model_dir: str) -> 'CategoryClassifier':
        model_dir_path = pathlib.Path(model_dir)
        transformer = joblib.load(str(model_dir_path / cls.TRANSFORMER_PATH))
        classifier = joblib.load(str(model_dir_path / cls.CLASSIFIER_PATH))

        with open(str(model_dir_path / cls.CATEGORY_TAXONOMY_PATH), 'r') as f:
            category_taxonomy_data = json.load(f)

        category_taxonomy = Taxonomy.from_dict(category_taxonomy_data)
        instance = cls(category_taxonomy)
        instance.transformer = transformer
        instance.classifier = classifier
        return instance
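
A usage sketch for `load`, assuming it is exposed as a classmethod on
CategoryClassifier (implied by the `cls` parameter and the decorator above)
and that a trained model directory exists at a hypothetical path:

# "models/category" is a hypothetical directory containing the joblib dumps
# and the taxonomy JSON referenced by the class-level *_PATH constants.
classifier = CategoryClassifier.load("models/category")
assert classifier.transformer is not None
assert classifier.classifier is not None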
Example #5
def generate_category_data(
        category_taxonomy: Taxonomy) -> Iterable[Tuple[str, Dict]]:
    for category_node in category_taxonomy.iter_nodes():
        supported_langs = [
            lang for lang in category_node.names if lang in SUPPORTED_LANG
        ]

        data = {
            "{}:name".format(lang): category_node.names[lang]
            for lang in supported_langs
        }
        data["id"] = category_node.id

        id_ = hashlib.sha256(category_node.id.encode("utf-8")).hexdigest()

        yield id_, data
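
A hedged sketch of the generator's output, assuming SUPPORTED_LANG contains
"en" and that Taxonomy.from_dict accepts a lang -> name mapping under "names"
(on real taxonomy nodes, `names` is indexed by language above):

taxonomy = Taxonomy.from_dict({"en:meat": {"names": {"en": "meat"}}})
for id_, data in generate_category_data(taxonomy):
    # id_ is the SHA-256 hex digest of the node id; data carries the id plus
    # one "<lang>:name" entry per supported language.
    assert data["id"] == "en:meat"
    assert data["en:name"] == "meat"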
Example #6
    @staticmethod
    def process_predictions(
        y_pred: np.ndarray,
        category_names: List[str],
        taxonomy: Taxonomy,
        threshold: float = 0.5,
        deepest_only: bool = False,
    ) -> List[List[CategoryPrediction]]:
        """Turn raw category probabilities into per-product lists of
        (category, confidence) pairs: binarize `y_pred` at `threshold`,
        propagate positives to taxonomy ancestors with `fill_ancestors`,
        and, if `deepest_only` is set, keep only the deepest predicted
        nodes in the taxonomy.
        """
        y_pred_int = (y_pred > threshold).astype(y_pred.dtype)
        y_pred_int_filled = fill_ancestors(y_pred_int,
                                           taxonomy=taxonomy,
                                           category_names=category_names)

        predicted = []
        for i in range(y_pred_int_filled.shape[0]):
            predicted_categories_ids = y_pred_int_filled[i].nonzero()[0]
            predicted_categories = [
                category_names[id_] for id_ in predicted_categories_ids
            ]

            product_predicted = []
            for predicted_category_id, predicted_category in zip(
                    predicted_categories_ids, predicted_categories):
                confidence = y_pred[i, predicted_category_id]
                product_predicted.append(
                    (predicted_category, float(confidence)))

            product_predicted = sorted(product_predicted,
                                       key=operator.itemgetter(1),
                                       reverse=True)

            if deepest_only:
                category_to_confidence = dict(product_predicted)
                product_predicted = [
                    (x.id, category_to_confidence[x.id])
                    for x in taxonomy.find_deepest_nodes(
                        [taxonomy[c] for c, confidence in product_predicted])
                ]
            predicted.append(product_predicted)

        return predicted
Example #7
def test_predict(mocker, deepest_only, mock_response, expected_values):
    category_taxonomy = Taxonomy.from_dict({
        "en:meat": {
            "names": "meat",
        },
        "en:fish": {
            "names": "fish",
        },
        "en:salmon": {
            "names": "salmon",
            "parents": ["en:fish"],
        },
        "en:smoked-salmon": {
            "names": "salmon",
            "parents": ["en:salmon"],
        },
    })

    classifier = CategoryClassifier(category_taxonomy)

    mocker.patch(
        "robotoff.prediction.category.neural.category_classifier.http_session.post",
        return_value=mock_response,
    )

    predictions = classifier.predict(
        {
            "ingredients_tags": ["ingredient1"],
            "product_name": "Test Product"
        },
        deepest_only,
    )

    assert len(predictions) == len(expected_values)

    for prediction, (value_tag, confidence) in zip(predictions,
                                                   expected_values):
        assert prediction.value_tag == value_tag
        assert prediction.data.get("confidence") == confidence
Example #8
def load_taxonomy(model_dir: pathlib.Path) -> Taxonomy:
    return Taxonomy.from_json(model_dir / CATEGORY_TAXONOMY_NAME)
Example #9
    def test_is_child_of_any(self, taxonomy: Taxonomy, item: str,
                             candidates: List, output: bool):
        assert taxonomy.is_parent_of_any(item, candidates) is output
Example #10
from typing import List

import pytest

from robotoff import settings
from robotoff.taxonomy import Taxonomy

label_taxonomy = Taxonomy.from_json(settings.TAXONOMY_LABEL_PATH)


class TestTaxonomy:
    @pytest.mark.parametrize('taxonomy,item,candidates,output', [
        (label_taxonomy, 'en:organic', {'en:fr-bio-01'}, True),
        (label_taxonomy, 'en:fr-bio-01', {'en:organic'}, False),
        (label_taxonomy, 'en:fr-bio-01', [], False),
        (label_taxonomy, 'en:organic', {'en:gluten-free'}, False),
        (label_taxonomy, 'en:organic',
         {'en:gluten-free', 'en:no-additives', 'en:vegan'}, False),
        (label_taxonomy, 'en:organic',
         {'en:gluten-free', 'en:no-additives', 'en:fr-bio-16'}, True),
    ])
    def test_is_child_of_any(self, taxonomy: Taxonomy, item: str,
                             candidates: List, output: bool):
        assert taxonomy.is_parent_of_any(item, candidates) is output

    def test_is_child_of_any_unknown_item(self):
        with pytest.raises(ValueError):
            label_taxonomy.is_parent_of_any("unknown-id", set())
Example #11
def _category_taxonomy() -> Taxonomy:
    return Taxonomy.from_dict({"en:mushrooms": {"lang": "fr"}})
        print("Product {} not found".format(barcode))
        continue

    X = generate_data(
        product=product,
        ingredient_to_id=ingredient_to_id,
        product_name_token_to_int=product_name_vocabulary,
        nlp=nlp,
        product_name_max_length=config.model_config.product_name_max_length,
        product_name_preprocessing_config=config.product_name_preprocessing_config,
    )

    y_pred = model.predict(X)
    y_pred_int = (y_pred > 0.5).astype(y_pred.dtype)
    taxonomy = Taxonomy.from_json(settings.CATEGORY_TAXONOMY_PATH)
    y_pred_int_filled = fill_ancestors(y_pred_int,
                                       taxonomy=taxonomy,
                                       category_names=category_names)

    predicted_categories_ids = y_pred_int_filled[0].nonzero()[0]
    predicted_categories = [
        category_names[id_] for id_ in predicted_categories_ids
    ]

    predicted = []
    for predicted_category_id, predicted_category in zip(
            predicted_categories_ids, predicted_categories):
        confidence = y_pred[0, predicted_category_id]
        predicted.append((predicted_category, confidence))
Example #13

def main():
    args = parse_args()
    config: Config = get_config(args)
    model_config = config.model_config

    output_dir = args.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)

    category_taxonomy = Taxonomy.from_json(settings.CATEGORY_TAXONOMY_PATH)
    ingredient_taxonomy = Taxonomy.from_json(
        settings.INGREDIENTS_TAXONOMY_PATH)

    train_df = create_dataframe("train", args.lang)
    test_df = create_dataframe("test", args.lang)
    val_df = create_dataframe("val", args.lang)

    categories_count = count_categories(train_df)
    ingredients_count = count_ingredients(train_df)

    selected_categories = set((cat for cat, count in categories_count.items()
                               if count >= config.category_min_count))
    selected_ingredients = set(
        (ingredient for ingredient, count in ingredients_count.items()
         if count >= config.ingredient_min_count))
    print("{} categories selected".format(len(selected_categories)))
    print("{} ingredients selected".format(len(selected_ingredients)))

    category_names = [
        x for x in sorted(category_taxonomy.keys()) if x in selected_categories
    ]

    ingredient_names = [
        x for x in sorted(ingredient_taxonomy.keys())
        if x in selected_ingredients
    ]

    category_to_id = {name: idx for idx, name in enumerate(category_names)}
    ingredient_to_id = {name: idx for idx, name in enumerate(ingredient_names)}

    nlp = get_nlp(lang=config.lang)

    preprocess_product_name_func = functools.partial(
        preprocess_product_name,
        lower=config.product_name_preprocessing_config.lower,
        strip_accent=config.product_name_preprocessing_config.strip_accent,
        remove_punct=config.product_name_preprocessing_config.remove_punct,
        remove_digit=config.product_name_preprocessing_config.remove_digit,
    )
    preprocessed_product_names_iter = (
        preprocess_product_name_func(product_name)
        for product_name in train_df.product_name)
    train_tokens_iter = tokenize_batch(preprocessed_product_names_iter, nlp)
    product_name_to_int = extract_vocabulary(train_tokens_iter,
                                             config.product_name_min_count)

    model_config.ingredient_voc_size = len(ingredient_to_id)
    model_config.output_dim = len(category_to_id)
    model_config.product_name_voc_size = len(product_name_to_int)

    print("Selected vocabulary: {}".format(len(product_name_to_int)))

    generate_data_partial = functools.partial(
        generate_data_from_df,
        ingredient_to_id=ingredient_to_id,
        category_to_id=category_to_id,
        product_name_max_length=model_config.product_name_max_length,
        product_name_token_to_int=product_name_to_int,
        nlp=nlp,
        product_name_preprocessing_config=config.product_name_preprocessing_config,
        nutriment_input=config.model_config.nutriment_input,
    )

    replicates = args.repeat
    if replicates == 1:
        save_dirs = [output_dir]
    else:
        save_dirs = [output_dir / str(i) for i in range(replicates)]

    for i, save_dir in enumerate(save_dirs):
        model = create_model(config)
        save_dir.mkdir(exist_ok=True)
        config.train_config.start_datetime = str(datetime.datetime.utcnow())
        print("Starting training repeat {}".format(i))
        save_product_name_vocabulary(product_name_to_int, save_dir)
        save_config(config, save_dir)
        copy_category_taxonomy(settings.CATEGORY_TAXONOMY_PATH, save_dir)
        save_category_vocabulary(category_to_id, save_dir)
        save_ingredient_vocabulary(ingredient_to_id, save_dir)

        X_train, y_train = generate_data_partial(train_df)
        X_val, y_val = generate_data_partial(val_df)
        X_test, y_test = generate_data_partial(test_df)

        train(
            (X_train, y_train),
            (X_val, y_val),
            (X_test, y_test),
            model,
            save_dir,
            config,
            category_taxonomy,
            category_names,
        )

        config.train_config.end_datetime = str(datetime.datetime.utcnow())
        save_config(config, save_dir)
        config.train_config.start_datetime = None
        config.train_config.end_datetime = None
Example #14
    def test_find_deepest_nodes(self, taxonomy: Taxonomy,
                                items: List[str],
                                output: List[str]):
        item_nodes = [taxonomy[item] for item in items]
        output_nodes = [taxonomy[o] for o in output]
        assert taxonomy.find_deepest_nodes(item_nodes) == output_nodes
Example #15
from typing import List, Set

import pytest

from robotoff import settings
from robotoff.taxonomy import Taxonomy


label_taxonomy = Taxonomy.from_json(settings.TAXONOMY_LABEL_PATH)
category_taxonomy = Taxonomy.from_json(settings.TAXONOMY_CATEGORY_PATH)


class TestTaxonomy:
    @pytest.mark.parametrize('taxonomy,item,candidates,output', [
        (label_taxonomy, 'en:organic', {'en:fr-bio-01'}, True),
        (label_taxonomy, 'en:fr-bio-01', {'en:organic'}, False),
        (label_taxonomy, 'en:fr-bio-01', [], False),
        (label_taxonomy, 'en:organic', {'en:gluten-free'}, False),
        (label_taxonomy, 'en:organic',
         {'en:gluten-free', 'en:no-additives', 'en:vegan'}, False),
        (label_taxonomy, 'en:organic',
         {'en:gluten-free', 'en:no-additives', 'en:fr-bio-16'}, True),
    ])
    def test_is_child_of_any(self, taxonomy: Taxonomy, item: str,
                             candidates: List, output: bool):
        assert taxonomy.is_parent_of_any(item, candidates) is output

    def test_is_child_of_any_unknown_item(self):
        with pytest.raises(ValueError):
            label_taxonomy.is_parent_of_any("unknown-id", set())
Example #16
def generate_X_y(  # hypothetical name; the snippet's original `def` line is missing
    df: pd.DataFrame,
    category_to_id: Dict,
    ingredient_to_id: Dict,
    vectorizer: CountVectorizer,
):
    y = generate_y(df.categories_tags, category_to_id)
    X = generate_X(df, ingredient_to_id, vectorizer)
    return X, y


def generate_X(df: pd.DataFrame, ingredient_to_id: Dict, vectorizer: CountVectorizer):
    product_name_matrix = vectorizer.transform(df.product_name)
    ingredient_matrix = process_ingredients(df.known_ingredient_tags, ingredient_to_id)
    return np.concatenate((product_name_matrix.toarray(), ingredient_matrix), axis=1)


category_taxonomy = Taxonomy.from_json(settings.CATEGORY_TAXONOMY_PATH)
ingredient_taxonomy = Taxonomy.from_json(settings.INGREDIENTS_TAXONOMY_PATH)

CATEGORY_NAMES = sorted(category_taxonomy.keys())
INGREDIENT_NAMES = sorted(ingredient_taxonomy.keys())

CATEGORY_TO_ID = {name: idx for idx, name in enumerate(CATEGORY_NAMES)}
INGREDIENT_TO_ID = {name: idx for idx, name in enumerate(INGREDIENT_NAMES)}

train_df = pd.DataFrame(gzip_jsonl_iter(settings.CATEGORY_FR_TRAIN_PATH)).head(1000)
test_df = pd.DataFrame(gzip_jsonl_iter(settings.CATEGORY_FR_TEST_PATH)).head(100)
val_df = pd.DataFrame(gzip_jsonl_iter(settings.CATEGORY_FR_VAL_PATH)).head(100)

count_vectorizer = CountVectorizer(min_df=5, preprocessor=preprocess_product_name)
count_vectorizer.fit(train_df.product_name)