def select_deepest_taxonomized_candidates(candidates: List[Prediction], taxonomy: Taxonomy):
    """Keep only the predictions that sit deepest in the taxonomy.

    For instance, for a list of category predictions, a prediction whose
    `value_tag` is 'en:meat' is dropped when another prediction with
    `value_tag` 'en:pork' is also present in `candidates`.

    Candidates with a `None` `value_tag` are reported through a warning.
    Only tags found in `taxonomy` are turned into nodes and handed to
    `taxonomy.find_deepest_nodes`; a candidate is returned when its
    `value_tag` matches the id of one of the nodes that call returns.

    :param candidates: The list of candidates to filter
    :param taxonomy: The taxonomy to use
    :return: the retained candidates, in their original order
    """
    known_tags = set()
    for prediction in candidates:
        tag = prediction.value_tag
        if tag is not None:
            known_tags.add(tag)
            continue
        logger.warning("Unexpected None `value_tag` (candidate: %s)", prediction)

    # Tags unknown to the taxonomy cannot be resolved to a node, skip them.
    taxonomy_nodes = [taxonomy[tag] for tag in known_tags if tag in taxonomy]
    deepest_ids = {
        node.id for node in taxonomy.find_deepest_nodes(taxonomy_nodes)
    }
    return [
        prediction
        for prediction in candidates
        if prediction.value_tag in deepest_ids
    ]
def get_deepest_categories(
        taxonomy: Taxonomy,
        categories_tags: Iterable[List[str]]) -> List[List[str]]:
    """Reduce each list of category tags to its deepest taxonomy nodes.

    Every tag is looked up in `taxonomy` (an unknown tag therefore raises
    whatever `taxonomy[...]` raises), the nodes are passed to
    `taxonomy.find_deepest_nodes`, and the ids of the nodes it returns are
    sorted.

    :param taxonomy: taxonomy used to resolve tags and find deepest nodes
    :param categories_tags: an iterable of tag lists, one list per item
    :return: one sorted list of node ids per input list, in input order
    """
    deepest_per_entry = []
    for tags in categories_tags:
        nodes = [taxonomy[tag] for tag in tags]
        deepest_ids = [node.id for node in taxonomy.find_deepest_nodes(nodes)]
        deepest_ids.sort()
        deepest_per_entry.append(deepest_ids)
    return deepest_per_entry
def __init__(self, category_taxonomy: Taxonomy):
    """Prepare the category vocabulary; no model is attached yet.

    The keys of `category_taxonomy` are stored both as a set and as a sorted
    list, together with a mapping from each category to its position in
    that sorted list. `transformer` and `classifier` start as `None`.

    :param category_taxonomy: taxonomy whose keys are the category ids
    """
    self.category_taxonomy: Taxonomy = category_taxonomy

    category_ids = set(category_taxonomy.keys())
    self.categories_set: Set[str] = category_ids

    # Sorting gives every category a stable, reproducible index.
    ordered_ids = sorted(category_ids)
    self.categories: List[str] = ordered_ids
    self.categories_to_index: Dict[str, int] = dict(
        zip(ordered_ids, range(len(ordered_ids)))
    )

    self.transformer: Optional[ColumnTransformer] = None
    self.classifier: Optional[HierarchicalClassifier] = None
def load(cls, model_dir: str) -> 'CategoryClassifier':
    """Rebuild a classifier from the artifacts stored in `model_dir`.

    Three files are read, located through the class attributes
    `TRANSFORMER_PATH`, `CLASSIFIER_PATH` and `CATEGORY_TAXONOMY_PATH`:
    the transformer and the classifier (via `joblib.load`) and the category
    taxonomy (JSON, turned into a `Taxonomy` with `Taxonomy.from_dict`).

    :param model_dir: directory containing the saved model files
    :return: a new instance of `cls` with `transformer` and `classifier` set
    """
    model_dir_path = pathlib.Path(model_dir)

    # NOTE(review): joblib.load unpickles its input, which can execute
    # arbitrary code -- only load model directories from a trusted source.
    transformer = joblib.load(str(model_dir_path / cls.TRANSFORMER_PATH))
    classifier = joblib.load(str(model_dir_path / cls.CLASSIFIER_PATH))

    # Read the JSON explicitly as UTF-8 instead of relying on the platform
    # default encoding, which is not UTF-8 everywhere.
    taxonomy_path = model_dir_path / cls.CATEGORY_TAXONOMY_PATH
    with open(str(taxonomy_path), 'r', encoding='utf-8') as f:
        category_taxonomy_data = json.load(f)

    category_taxonomy = Taxonomy.from_dict(category_taxonomy_data)
    instance = cls(category_taxonomy)
    instance.transformer = transformer
    instance.classifier = classifier
    return instance
def generate_category_data(
        category_taxonomy: Taxonomy) -> Iterable[Tuple[str, Dict]]:
    """Yield one `(document id, document)` pair per taxonomy node.

    The document holds a `"<lang>:name"` entry for every language of the
    node that is listed in `SUPPORTED_LANG`, plus the node id under `"id"`.
    The document id is the hexadecimal SHA-256 digest of the UTF-8 encoded
    node id.

    :param category_taxonomy: taxonomy whose nodes are exported
    :return: an iterator of `(sha256 hex digest, document dict)` tuples
    """
    for node in category_taxonomy.iter_nodes():
        document = {}
        for lang in node.names:
            if lang in SUPPORTED_LANG:
                document["{}:name".format(lang)] = node.names[lang]
        document["id"] = node.id

        digest = hashlib.sha256(node.id.encode("utf-8"))
        yield digest.hexdigest(), document
def process_predictions(
    y_pred: np.ndarray,
    category_names: List[str],
    taxonomy: Taxonomy,
    threshold: float = 0.5,
    deepest_only: bool = False,
) -> List[List[CategoryPrediction]]:
    """Turn raw model scores into per-row lists of predicted categories.

    Scores strictly greater than `threshold` are binarized, the result is
    passed through `fill_ancestors`, and for every row the non-zero columns
    are mapped to `(category name, confidence)` pairs, where the confidence
    is the raw score from `y_pred`. Pairs are sorted by decreasing
    confidence.

    With `deepest_only`, the pairs are replaced by one pair per node returned
    by `taxonomy.find_deepest_nodes`, in the order that call returns them.

    NOTE(review): the elements built here are plain tuples, although the
    return annotation mentions `CategoryPrediction` -- confirm which one
    callers expect.

    :param y_pred: score matrix, indexed as `y_pred[row, category index]`
    :param category_names: category name for each column index
    :param taxonomy: taxonomy used to fill ancestors and find deepest nodes
    :param threshold: scores above this value count as predicted
    :param deepest_only: keep only the deepest predicted categories
    :return: one list of `(category, confidence)` pairs per row of `y_pred`
    """
    by_confidence = operator.itemgetter(1)

    binarized = (y_pred > threshold).astype(y_pred.dtype)
    binarized_filled = fill_ancestors(binarized,
                                      taxonomy=taxonomy,
                                      category_names=category_names)

    all_rows = []
    for row in range(binarized_filled.shape[0]):
        active_columns = binarized_filled[row].nonzero()[0]
        names = [category_names[col] for col in active_columns]

        scored = [
            (name, float(y_pred[row, col]))
            for col, name in zip(active_columns, names)
        ]
        scored.sort(key=by_confidence, reverse=True)

        if deepest_only:
            confidence_of = dict(scored)
            deepest_nodes = taxonomy.find_deepest_nodes(
                [taxonomy[name] for name, _ in scored])
            scored = [
                (node.id, confidence_of[node.id]) for node in deepest_nodes
            ]

        all_rows.append(scored)

    return all_rows
def test_predict(mocker, deepest_only, mock_response, expected_values):
    """Check `CategoryClassifier.predict` against a patched HTTP response.

    The HTTP `post` call used by the classifier module is replaced so that it
    returns `mock_response`; the predictions must then match
    `expected_values` pair by pair (value tag and confidence).
    """
    taxonomy_data = {
        "en:meat": {"names": "meat"},
        "en:fish": {"names": "fish"},
        "en:salmon": {"names": "salmon", "parents": ["en:fish"]},
        "en:smoked-salmon": {"names": "salmon", "parents": ["en:salmon"]},
    }
    classifier = CategoryClassifier(Taxonomy.from_dict(taxonomy_data))

    mocker.patch(
        "robotoff.prediction.category.neural.category_classifier.http_session.post",
        return_value=mock_response,
    )

    product = {
        "ingredients_tags": ["ingredient1"],
        "product_name": "Test Product",
    }
    predictions = classifier.predict(product, deepest_only)

    assert len(predictions) == len(expected_values)

    for prediction, expected in zip(predictions, expected_values):
        expected_tag, expected_confidence = expected
        assert prediction.value_tag == expected_tag
        assert prediction.data.get("confidence") == expected_confidence
def load_taxonomy(model_dir: pathlib.Path) -> Taxonomy:
    """Load the taxonomy JSON file named `CATEGORY_TAXONOMY_NAME` in `model_dir`.

    :param model_dir: directory that contains the taxonomy file
    :return: the taxonomy built by `Taxonomy.from_json`
    """
    taxonomy_path = model_dir / CATEGORY_TAXONOMY_NAME
    return Taxonomy.from_json(taxonomy_path)
def test_is_child_of_any(self, taxonomy: Taxonomy, item: str, candidates: List, output: bool):
    """Check that `taxonomy.is_parent_of_any(item, candidates)` is `output`.

    Despite the test name, the method exercised is `is_parent_of_any`.
    """
    result = taxonomy.is_parent_of_any(item, candidates)
    assert result is output
from typing import List

import pytest

from robotoff import settings
from robotoff.taxonomy import Taxonomy

# Label taxonomy shared by the tests below; loaded once, at import time.
label_taxonomy = Taxonomy.from_json(settings.TAXONOMY_LABEL_PATH)


class TestTaxonomy:
    """Tests for `Taxonomy.is_parent_of_any` on the label taxonomy."""

    @pytest.mark.parametrize(
        "taxonomy,item,candidates,output",
        [
            (label_taxonomy, "en:organic", {"en:fr-bio-01"}, True),
            (label_taxonomy, "en:fr-bio-01", {"en:organic"}, False),
            (label_taxonomy, "en:fr-bio-01", [], False),
            (label_taxonomy, "en:organic", {"en:gluten-free"}, False),
            (
                label_taxonomy,
                "en:organic",
                {"en:gluten-free", "en:no-additives", "en:vegan"},
                False,
            ),
            (
                label_taxonomy,
                "en:organic",
                {"en:gluten-free", "en:no-additives", "en:fr-bio-16"},
                True,
            ),
        ],
    )
    def test_is_child_of_any(self, taxonomy: Taxonomy, item: str,
                             candidates: List, output: bool):
        """`is_parent_of_any(item, candidates)` must be exactly `output`."""
        result = taxonomy.is_parent_of_any(item, candidates)
        assert result is output

    def test_is_child_of_any_unknwon_item(self):
        """An item id that is not in the taxonomy must raise `ValueError`."""
        no_candidates = set()
        with pytest.raises(ValueError):
            label_taxonomy.is_parent_of_any("unknown-id", no_candidates)
def _category_taxonomy() -> Taxonomy:
    """Build a taxonomy with the single entry `en:mushrooms`."""
    raw_taxonomy = {"en:mushrooms": {"lang": "fr"}}
    return Taxonomy.from_dict(raw_taxonomy)
print("Product {} not found".format(barcode)) continue X = generate_data( product=product, ingredient_to_id=ingredient_to_id, product_name_token_to_int=product_name_vocabulary, nlp=nlp, product_name_max_length=config.model_config.product_name_max_length, product_name_preprocessing_config=config. product_name_preprocessing_config, ) y_pred = model.predict(X) y_pred_int = (y_pred > 0.5).astype(y_pred.dtype) taxonomy = Taxonomy.from_json(settings.CATEGORY_TAXONOMY_PATH) y_pred_int_filled = fill_ancestors(y_pred_int, taxonomy=taxonomy, category_names=category_names) predicted_categories_ids = y_pred_int_filled[0].nonzero()[0] predicted_categories = [ category_names[id_] for id_ in predicted_categories_ids ] predicted = [] for predicted_category_id, predicted_category in zip( predicted_categories_ids, predicted_categories): confidence = y_pred[0, predicted_category_id] predicted.append((predicted_category, confidence))
def main():
    """Train the category model one or several times and save each run.

    Steps, in order:

    1. parse the command line and build the configuration;
    2. load the category and ingredient taxonomies and the train/test/val
       dataframes;
    3. select the categories and ingredients whose count on the training
       dataframe reaches the configured minimum, and index them;
    4. build the product name vocabulary from the training product names;
    5. for each repeat, create a model, save the vocabularies/config/taxonomy
       in the repeat directory, generate the data and call `train`.

    With `args.repeat == 1` everything is written directly to
    `args.output_dir`; otherwise each repeat gets its own numbered
    sub-directory.
    """
    args = parse_args()
    config: Config = get_config(args)
    model_config = config.model_config

    output_dir = args.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)

    category_taxonomy = Taxonomy.from_json(settings.CATEGORY_TAXONOMY_PATH)
    ingredient_taxonomy = Taxonomy.from_json(
        settings.INGREDIENTS_TAXONOMY_PATH)

    # NOTE(review): the dataframes are selected with `args.lang` while the
    # NLP pipeline below uses `config.lang` -- confirm both always agree.
    train_df = create_dataframe("train", args.lang)
    test_df = create_dataframe("test", args.lang)
    val_df = create_dataframe("val", args.lang)

    # Counts are computed on the training dataframe only.
    categories_count = count_categories(train_df)
    ingredients_count = count_ingredients(train_df)

    # Keep the entries whose count reaches the configured minimum.
    selected_categories = set((cat for cat, count in categories_count.items()
                               if count >= config.category_min_count))
    selected_ingredients = set(
        (ingredient for ingredient, count in ingredients_count.items()
         if count >= config.ingredient_min_count))
    print("{} categories selected".format(len(selected_categories)))
    print("{} ingredients selected".format(len(selected_ingredients)))

    # Iterating over the sorted taxonomy keys makes the name -> index
    # mappings below deterministic, and drops selected entries that are not
    # taxonomy keys.
    category_names = [
        x for x in sorted(category_taxonomy.keys()) if x in selected_categories
    ]
    ingredient_names = [
        x for x in sorted(ingredient_taxonomy.keys())
        if x in selected_ingredients
    ]
    category_to_id = {name: idx for idx, name in enumerate(category_names)}
    ingredient_to_id = {name: idx for idx, name in enumerate(ingredient_names)}

    nlp = get_nlp(lang=config.lang)
    # Bind the preprocessing options once so the same settings are applied
    # to every product name.
    preprocess_product_name_func = functools.partial(
        preprocess_product_name,
        lower=config.product_name_preprocessing_config.lower,
        strip_accent=config.product_name_preprocessing_config.strip_accent,
        remove_punct=config.product_name_preprocessing_config.remove_punct,
        remove_digit=config.product_name_preprocessing_config.remove_digit,
    )
    # Generator: product names are preprocessed lazily, as they are consumed
    # by `tokenize_batch`.
    preprocessed_product_names_iter = (
        preprocess_product_name_func(product_name)
        for product_name in train_df.product_name)
    train_tokens_iter = tokenize_batch(preprocessed_product_names_iter, nlp)
    product_name_to_int = extract_vocabulary(train_tokens_iter,
                                             config.product_name_min_count)

    # The model dimensions depend on the vocabularies computed above, so they
    # are written into the model configuration before any model is created.
    model_config.ingredient_voc_size = len(ingredient_to_id)
    model_config.output_dim = len(category_to_id)
    model_config.product_name_voc_size = len(product_name_to_int)

    print("Selected vocabulary: {}".format(len(product_name_to_int)))

    # Same generation settings for the train, validation and test dataframes.
    generate_data_partial = functools.partial(
        generate_data_from_df,
        ingredient_to_id=ingredient_to_id,
        category_to_id=category_to_id,
        product_name_max_length=model_config.product_name_max_length,
        product_name_token_to_int=product_name_to_int,
        nlp=nlp,
        product_name_preprocessing_config=config.
        product_name_preprocessing_config,
        nutriment_input=config.model_config.nutriment_input,
    )

    replicates = args.repeat
    if replicates == 1:
        # Single run: write directly in the output directory.
        save_dirs = [output_dir]
    else:
        # Several runs: one sub-directory per repeat, named after its index.
        save_dirs = [output_dir / str(i) for i in range(replicates)]

    for i, save_dir in enumerate(save_dirs):
        # A new model is created for every repeat.
        model = create_model(config)
        save_dir.mkdir(exist_ok=True)
        # NOTE(review): `datetime.utcnow()` is deprecated since Python 3.12;
        # switching to an aware datetime would change the saved string.
        config.train_config.start_datetime = str(datetime.datetime.utcnow())
        print("Starting training repeat {}".format(i))

        # Save everything needed to reuse the model next to it, before
        # training starts.
        save_product_name_vocabulary(product_name_to_int, save_dir)
        save_config(config, save_dir)
        copy_category_taxonomy(settings.CATEGORY_TAXONOMY_PATH, save_dir)
        save_category_vocabulary(category_to_id, save_dir)
        save_ingredient_vocabulary(ingredient_to_id, save_dir)

        X_train, y_train = generate_data_partial(train_df)
        X_val, y_val = generate_data_partial(val_df)
        X_test, y_test = generate_data_partial(test_df)

        train(
            (X_train, y_train),
            (X_val, y_val),
            (X_test, y_test),
            model,
            save_dir,
            config,
            category_taxonomy,
            category_names,
        )

        # Save the config a second time so that it includes the end time.
        config.train_config.end_datetime = str(datetime.datetime.utcnow())
        save_config(config, save_dir)
        # Reset the timestamps on the shared config object.
        # NOTE(review): confirm this reset is meant to run once per repeat.
        config.train_config.start_datetime = None
        config.train_config.end_datetime = None
def test_find_deepest_nodes(self, taxonomy: Taxonomy, items: List[str], output: List[str]):
    """Check `taxonomy.find_deepest_nodes` on the nodes named by `items`.

    Both `items` and `output` are node ids; they are resolved to nodes
    through `taxonomy[...]` and the returned list must equal the expected
    nodes, order included.
    """
    given_nodes = [taxonomy[node_id] for node_id in items]
    expected_nodes = [taxonomy[node_id] for node_id in output]

    deepest_nodes = taxonomy.find_deepest_nodes(given_nodes)

    assert deepest_nodes == expected_nodes
from typing import List, Set

import pytest

from robotoff import settings
from robotoff.taxonomy import Taxonomy

# Taxonomies shared by the tests of this module; loaded once, at import time.
label_taxonomy = Taxonomy.from_json(settings.TAXONOMY_LABEL_PATH)
category_taxonomy = Taxonomy.from_json(settings.TAXONOMY_CATEGORY_PATH)


class TestTaxonomy:
    """Tests for `Taxonomy.is_parent_of_any` on the label taxonomy."""

    @pytest.mark.parametrize(
        "taxonomy,item,candidates,output",
        [
            (label_taxonomy, "en:organic", {"en:fr-bio-01"}, True),
            (label_taxonomy, "en:fr-bio-01", {"en:organic"}, False),
            (label_taxonomy, "en:fr-bio-01", [], False),
            (label_taxonomy, "en:organic", {"en:gluten-free"}, False),
            (
                label_taxonomy,
                "en:organic",
                {"en:gluten-free", "en:no-additives", "en:vegan"},
                False,
            ),
            (
                label_taxonomy,
                "en:organic",
                {"en:gluten-free", "en:no-additives", "en:fr-bio-16"},
                True,
            ),
        ],
    )
    def test_is_child_of_any(self, taxonomy: Taxonomy, item: str,
                             candidates: List, output: bool):
        """`is_parent_of_any(item, candidates)` must be exactly `output`."""
        result = taxonomy.is_parent_of_any(item, candidates)
        assert result is output

    def test_is_child_of_any_unknwon_item(self):
        """An item id that is not in the taxonomy must raise `ValueError`."""
        no_candidates = set()
        with pytest.raises(ValueError):
            label_taxonomy.is_parent_of_any("unknown-id", no_candidates)
category_to_id: Dict, ingredient_to_id: Dict, vectorizer: CountVectorizer, ):
    # Targets come from the category tags, features from `generate_X`.
    y = generate_y(df.categories_tags, category_to_id)
    X = generate_X(df, ingredient_to_id, vectorizer)
    return X, y


def generate_X(df: pd.DataFrame, ingredient_to_id: Dict, vectorizer: CountVectorizer):
    """Build the feature matrix for the products in `df`.

    The product names are transformed with `vectorizer` and converted to a
    dense array, then concatenated along axis 1 with the matrix returned by
    `process_ingredients` for `df.known_ingredient_tags`.

    :param df: dataframe with `product_name` and `known_ingredient_tags`
        columns
    :param ingredient_to_id: mapping passed to `process_ingredients`
    :param vectorizer: vectorizer used to transform the product names
    :return: the concatenated array (presumably one row per product --
        TODO confirm against `process_ingredients`)
    """
    product_name_matrix = vectorizer.transform(df.product_name)
    ingredient_matrix = process_ingredients(df.known_ingredient_tags, ingredient_to_id)
    # `toarray()` densifies the vectorizer output so it can be concatenated
    # with the ingredient matrix.
    return np.concatenate((product_name_matrix.toarray(), ingredient_matrix), axis=1)


# Everything below runs at import time.
category_taxonomy = Taxonomy.from_json(settings.CATEGORY_TAXONOMY_PATH)
ingredient_taxonomy = Taxonomy.from_json(settings.INGREDIENTS_TAXONOMY_PATH)

# Sorted names give every category / ingredient a deterministic index.
CATEGORY_NAMES = sorted(category_taxonomy.keys())
INGREDIENT_NAMES = sorted(ingredient_taxonomy.keys())

CATEGORY_TO_ID = {name: idx for idx, name in enumerate(CATEGORY_NAMES)}
INGREDIENT_TO_ID = {name: idx for idx, name in enumerate(INGREDIENT_NAMES)}

# Only the first rows of each split are kept (1000 for train, 100 for test
# and validation).
train_df = pd.DataFrame(gzip_jsonl_iter(settings.CATEGORY_FR_TRAIN_PATH)).head(1000)
test_df = pd.DataFrame(gzip_jsonl_iter(settings.CATEGORY_FR_TEST_PATH)).head(100)
val_df = pd.DataFrame(gzip_jsonl_iter(settings.CATEGORY_FR_VAL_PATH)).head(100)

# The vectorizer is fitted on the training product names only.
count_vectorizer = CountVectorizer(min_df=5, preprocessor=preprocess_product_name)
count_vectorizer.fit(train_df.product_name)