Exemplo n.º 1
0
    def getAllIngredients(self):
        """Collect the unique ingredient names found in ``self.df_AllergenData``.

        Each row's ``IngredientsInfo`` text is split into candidate
        ingredient phrases (runs of letter-only words separated by
        whitespace).  Phrases shorter than three characters are ignored.
        For rows whose ``LanguageInfo`` is not ``'en'`` every phrase is
        passed through ``Translate().TranslateWord(phrase, language)`` and
        the translated text is what gets collected; English rows are
        collected as-is.

        Side effects:
            Writes ``frame.csv`` to the current working directory.  The
            file has one row per original phrase with the columns
            ``Translation`` and ``Language``.

        Returns:
            set: lower-cased ingredient names (translated where the row
            was not English).

        Rows that raise while being parsed or translated are skipped
        (best effort); the number of skipped rows is reported through the
        ``logging`` module.
        """
        import logging

        df = self.df_AllergenData
        ingredients = df['IngredientsInfo'].tolist()
        languageInfo = df['LanguageInfo'].tolist()

        # The first alternative consumes "(E123)"-style codes without
        # capturing them, so findall() yields '' for those matches; the
        # length filter below drops them.
        ingredientPattern = re.compile(
            r'\(E\d+\)|([^\W\d]+(?:\s+[^\W\d]+)*)')

        wordToTranslation = {}
        uniqueIngredients = set()
        skippedRows = 0
        for text, language in zip(ingredients, languageInfo):
            try:
                vals = ingredientPattern.findall(text)
                if language != 'en':
                    instance = Translate()
                    for item in vals:
                        if len(item) < 3:
                            continue
                        translatedItem = instance.TranslateWord(
                            item, language)
                        uniqueIngredients.add(translatedItem.lower())
                        wordToTranslation[item] = [
                            translatedItem,
                            instance.DetectLanguage(translatedItem),
                        ]
                else:
                    for item in vals:
                        if len(item) < 3:
                            continue
                        uniqueIngredients.add(item.lower())
                        wordToTranslation[item] = [item, 'en']
            except Exception:
                # Deliberately best effort: one bad row (e.g. a non-string
                # cell or a failing translation call) must not abort the
                # whole extraction.
                skippedRows += 1

        if skippedRows:
            logging.getLogger(__name__).warning(
                'getAllIngredients: skipped %d row(s) that could not be '
                'parsed or translated', skippedRows)

        # NOTE(review): latin-1 cannot encode every character; phrases
        # containing characters outside it would make to_csv raise.
        # Kept as-is because consumers of frame.csv may rely on it.
        frame = pd.DataFrame(wordToTranslation,
                             index=['Translation', 'Language']).transpose()
        frame.to_csv('frame.csv', sep=',', encoding='latin-1')
        return uniqueIngredients