def load_taxonomy(model_dir: pathlib.Path) -> Taxonomy:
    """Load the taxonomy file stored in a model directory.

    Args:
        model_dir: Directory containing the file named by
            ``CATEGORY_TAXONOMY_NAME``.

    Returns:
        The ``Taxonomy`` built by ``Taxonomy.from_json`` from that file.
    """
    taxonomy_path = model_dir / CATEGORY_TAXONOMY_NAME
    return Taxonomy.from_json(taxonomy_path)
from typing import List

import pytest

from robotoff import settings
from robotoff.taxonomy import Taxonomy

# Loaded once at import time and shared by every test in this module.
label_taxonomy = Taxonomy.from_json(settings.TAXONOMY_LABEL_PATH)


class TestTaxonomy:
    """Tests for ``Taxonomy.is_parent_of_any`` on the label taxonomy."""

    # NOTE: the test names say "is_child_of_any", but the method exercised is
    # ``is_parent_of_any``. The names are kept so existing test IDs stay stable.
    @pytest.mark.parametrize(
        'taxonomy,item,candidates,output',
        [
            # Single candidate for which the expected answer is True.
            (label_taxonomy, 'en:organic', {'en:fr-bio-01'}, True),
            # Same pair with the roles swapped: expected answer is False.
            (label_taxonomy, 'en:fr-bio-01', {'en:organic'}, False),
            # No candidates at all.
            (label_taxonomy, 'en:fr-bio-01', [], False),
            # Candidates for which the expected answer is False.
            (label_taxonomy, 'en:organic', {'en:gluten-free'}, False),
            (
                label_taxonomy,
                'en:organic',
                {'en:gluten-free', 'en:no-additives', 'en:vegan'},
                False,
            ),
            # One matching candidate among several is enough for True.
            (
                label_taxonomy,
                'en:organic',
                {'en:gluten-free', 'en:no-additives', 'en:fr-bio-16'},
                True,
            ),
        ],
    )
    def test_is_child_of_any(
        self,
        taxonomy: Taxonomy,
        item: str,
        candidates: List,
        output: bool,
    ):
        """Check ``is_parent_of_any`` returns exactly the expected boolean."""
        result = taxonomy.is_parent_of_any(item, candidates)
        assert result is output

    def test_is_child_of_any_unknwon_item(self):
        """An item id that the test expects to be rejected with ``ValueError``."""
        with pytest.raises(ValueError):
            label_taxonomy.is_parent_of_any("unknown-id", set())
print("Product {} not found".format(barcode)) continue X = generate_data( product=product, ingredient_to_id=ingredient_to_id, product_name_token_to_int=product_name_vocabulary, nlp=nlp, product_name_max_length=config.model_config.product_name_max_length, product_name_preprocessing_config=config. product_name_preprocessing_config, ) y_pred = model.predict(X) y_pred_int = (y_pred > 0.5).astype(y_pred.dtype) taxonomy = Taxonomy.from_json(settings.CATEGORY_TAXONOMY_PATH) y_pred_int_filled = fill_ancestors(y_pred_int, taxonomy=taxonomy, category_names=category_names) predicted_categories_ids = y_pred_int_filled[0].nonzero()[0] predicted_categories = [ category_names[id_] for id_ in predicted_categories_ids ] predicted = [] for predicted_category_id, predicted_category in zip( predicted_categories_ids, predicted_categories): confidence = y_pred[0, predicted_category_id] predicted.append((predicted_category, confidence))
def main():
    """Prepare the training data and train one model per requested repeat.

    Steps, in order:

    1. Parse the command-line arguments, build the configuration and create
       the output directory.
    2. Load the category and ingredient taxonomies and the train/test/val
       dataframes.
    3. Keep only the categories and ingredients whose count in the training
       dataframe reaches the configured minimum, and index them in sorted
       taxonomy-key order.
    4. Extract the product name vocabulary from the training dataframe.
    5. For every repeat: create a model, save the vocabularies, the
       configuration and the category taxonomy in the repeat's directory,
       generate the datasets and call ``train``.
    """
    args = parse_args()
    config: Config = get_config(args)
    model_config = config.model_config

    output_dir = args.output_dir
    output_dir.mkdir(parents=True, exist_ok=True)

    category_taxonomy = Taxonomy.from_json(settings.CATEGORY_TAXONOMY_PATH)
    ingredient_taxonomy = Taxonomy.from_json(
        settings.INGREDIENTS_TAXONOMY_PATH
    )

    train_df = create_dataframe("train", args.lang)
    test_df = create_dataframe("test", args.lang)
    val_df = create_dataframe("val", args.lang)

    categories_count = count_categories(train_df)
    ingredients_count = count_ingredients(train_df)

    # Only labels seen often enough in the training dataframe are kept.
    selected_categories = {
        category
        for category, occurrences in categories_count.items()
        if occurrences >= config.category_min_count
    }
    selected_ingredients = {
        ingredient
        for ingredient, occurrences in ingredients_count.items()
        if occurrences >= config.ingredient_min_count
    }

    print("{} categories selected".format(len(selected_categories)))
    print("{} ingredients selected".format(len(selected_ingredients)))

    # Sorting the taxonomy keys makes the name -> index mapping deterministic.
    category_names = [
        key
        for key in sorted(category_taxonomy.keys())
        if key in selected_categories
    ]
    ingredient_names = [
        key
        for key in sorted(ingredient_taxonomy.keys())
        if key in selected_ingredients
    ]

    category_to_id = {
        label: position for position, label in enumerate(category_names)
    }
    ingredient_to_id = {
        label: position for position, label in enumerate(ingredient_names)
    }

    nlp = get_nlp(lang=config.lang)

    name_preprocessing = config.product_name_preprocessing_config
    preprocess_product_name_func = functools.partial(
        preprocess_product_name,
        lower=name_preprocessing.lower,
        strip_accent=name_preprocessing.strip_accent,
        remove_punct=name_preprocessing.remove_punct,
        remove_digit=name_preprocessing.remove_digit,
    )
    # Lazy pipeline: names are preprocessed as the tokenizer consumes them.
    preprocessed_names = map(
        preprocess_product_name_func, train_df.product_name
    )
    train_tokens = tokenize_batch(preprocessed_names, nlp)
    product_name_to_int = extract_vocabulary(
        train_tokens, config.product_name_min_count
    )

    # Model dimensions follow the sizes of the vocabularies built above.
    model_config.ingredient_voc_size = len(ingredient_to_id)
    model_config.output_dim = len(category_to_id)
    model_config.product_name_voc_size = len(product_name_to_int)

    print("Selected vocabulary: {}".format(len(product_name_to_int)))

    generate_data_partial = functools.partial(
        generate_data_from_df,
        ingredient_to_id=ingredient_to_id,
        category_to_id=category_to_id,
        product_name_max_length=model_config.product_name_max_length,
        product_name_token_to_int=product_name_to_int,
        nlp=nlp,
        product_name_preprocessing_config=(
            config.product_name_preprocessing_config
        ),
        nutriment_input=config.model_config.nutriment_input,
    )

    # A single repeat writes directly into output_dir; several repeats each
    # get a numbered sub-directory.
    repeat_count = args.repeat
    run_dirs = (
        [output_dir]
        if repeat_count == 1
        else [output_dir / str(idx) for idx in range(repeat_count)]
    )

    for repeat_idx, run_dir in enumerate(run_dirs):
        model = create_model(config)
        run_dir.mkdir(exist_ok=True)
        config.train_config.start_datetime = str(datetime.datetime.utcnow())
        print("Starting training repeat {}".format(repeat_idx))

        save_product_name_vocabulary(product_name_to_int, run_dir)
        save_config(config, run_dir)
        copy_category_taxonomy(settings.CATEGORY_TAXONOMY_PATH, run_dir)
        save_category_vocabulary(category_to_id, run_dir)
        save_ingredient_vocabulary(ingredient_to_id, run_dir)

        train_X, train_y = generate_data_partial(train_df)
        val_X, val_y = generate_data_partial(val_df)
        test_X, test_y = generate_data_partial(test_df)

        train(
            (train_X, train_y),
            (val_X, val_y),
            (test_X, test_y),
            model,
            run_dir,
            config,
            category_taxonomy,
            category_names,
        )

        # Save the configuration again now that the end time is known, then
        # clear both timestamps on the shared config object.
        config.train_config.end_datetime = str(datetime.datetime.utcnow())
        save_config(config, run_dir)
        config.train_config.start_datetime = None
        config.train_config.end_datetime = None
from typing import List, Set

import pytest

from robotoff import settings
from robotoff.taxonomy import Taxonomy

# Loaded once at import time and shared by the tests in this module.
label_taxonomy = Taxonomy.from_json(settings.TAXONOMY_LABEL_PATH)
category_taxonomy = Taxonomy.from_json(settings.TAXONOMY_CATEGORY_PATH)


class TestTaxonomy:
    """Tests for ``Taxonomy.is_parent_of_any`` on the label taxonomy."""

    # NOTE: the test names say "is_child_of_any", but the method exercised is
    # ``is_parent_of_any``. The names are kept so existing test IDs stay stable.
    @pytest.mark.parametrize(
        'taxonomy,item,candidates,output',
        [
            # Single candidate for which the expected answer is True.
            (label_taxonomy, 'en:organic', {'en:fr-bio-01'}, True),
            # Same pair with the roles swapped: expected answer is False.
            (label_taxonomy, 'en:fr-bio-01', {'en:organic'}, False),
            # No candidates at all.
            (label_taxonomy, 'en:fr-bio-01', [], False),
            # Candidates for which the expected answer is False.
            (label_taxonomy, 'en:organic', {'en:gluten-free'}, False),
            (
                label_taxonomy,
                'en:organic',
                {'en:gluten-free', 'en:no-additives', 'en:vegan'},
                False,
            ),
            # One matching candidate among several is enough for True.
            (
                label_taxonomy,
                'en:organic',
                {'en:gluten-free', 'en:no-additives', 'en:fr-bio-16'},
                True,
            ),
        ],
    )
    def test_is_child_of_any(
        self,
        taxonomy: Taxonomy,
        item: str,
        candidates: List,
        output: bool,
    ):
        """Check ``is_parent_of_any`` returns exactly the expected boolean."""
        result = taxonomy.is_parent_of_any(item, candidates)
        assert result is output

    def test_is_child_of_any_unknwon_item(self):
        """An item id that the test expects to be rejected with ``ValueError``."""
        with pytest.raises(ValueError):
            label_taxonomy.is_parent_of_any("unknown-id", set())
category_to_id: Dict, ingredient_to_id: Dict, vectorizer: CountVectorizer, ): y = generate_y(df.categories_tags, category_to_id) X = generate_X(df, ingredient_to_id, vectorizer) return X, y def generate_X(df: pd.DataFrame, ingredient_to_id: Dict, vectorizer: CountVectorizer): product_name_matrix = vectorizer.transform(df.product_name) ingredient_matrix = process_ingredients(df.known_ingredient_tags, ingredient_to_id) return np.concatenate((product_name_matrix.toarray(), ingredient_matrix), axis=1) category_taxonomy = Taxonomy.from_json(settings.CATEGORY_TAXONOMY_PATH) ingredient_taxonomy = Taxonomy.from_json(settings.INGREDIENTS_TAXONOMY_PATH) CATEGORY_NAMES = sorted(category_taxonomy.keys()) INGREDIENT_NAMES = sorted(ingredient_taxonomy.keys()) CATEGORY_TO_ID = {name: idx for idx, name in enumerate(CATEGORY_NAMES)} INGREDIENT_TO_ID = {name: idx for idx, name in enumerate(INGREDIENT_NAMES)} train_df = pd.DataFrame(gzip_jsonl_iter(settings.CATEGORY_FR_TRAIN_PATH)).head(1000) test_df = pd.DataFrame(gzip_jsonl_iter(settings.CATEGORY_FR_TEST_PATH)).head(100) val_df = pd.DataFrame(gzip_jsonl_iter(settings.CATEGORY_FR_VAL_PATH)).head(100) count_vectorizer = CountVectorizer(min_df=5, preprocessor=preprocess_product_name) count_vectorizer.fit(train_df.product_name)