def process_item(self, good_item, spider):
    """Store the E-additives parsed from the item's ingredients text.

    An item without an ``'ingredients'`` key is handed back untouched.
    Otherwise the value under ``'ingredients'`` is passed to
    ``string_processor.parse_e_additives`` and the result is written to
    ``good_item['e_additives']``.

    ``spider`` is accepted but not used here.  The same ``good_item``
    object that was passed in is returned.
    """
    if 'ingredients' not in good_item:
        return good_item

    good_item['e_additives'] = string_processor.parse_e_additives(
        good_item['ingredients'])
    return good_item
def test_additive_tailing_russian_A(self):
    # Full ingredient list in which one colourant code carries a trailing
    # upper-case letter; six codes are expected back, the suffixed one
    # reported as ``E160a``.
    ingredients_text = (
        u"сахар, сироп глюкозы, вода, желатин, ароматические "
        u"вещества, кислота(лимонная), порошок лакриц, "
        u"красители (Е100,Е120,Е133,Е153, Е160А), вещества "
        u"наносимые на поверхность(растительные масла, Е903). "
        u"Возможно незначительное содержание лесного ореха.")
    actual_codes = parse_e_additives(ingredients_text)
    expected_codes = [
        u'E100', u'E120', u'E133', u'E153', u'E160a', u'E903']
    self.assertEqual(actual_codes, expected_codes)
def test_additive_tailing_russian_A(self):
    # The sample text lists six additive codes, one of them followed by an
    # upper-case letter; the parser is expected to return all six in input
    # order with that suffix lower-cased (``E160a``).
    sample = (
        u"сахар, сироп глюкозы, вода, желатин, ароматические "
        u"вещества, кислота(лимонная), порошок лакриц, "
        u"красители (Е100,Е120,Е133,Е153, Е160А), вещества "
        u"наносимые на поверхность(растительные масла, Е903). "
        u"Возможно незначительное содержание лесного ореха.")
    parsed = parse_e_additives(sample)
    wanted = [u'E100', u'E120', u'E133', u'E153', u'E160a', u'E903']
    self.assertEqual(parsed, wanted)
def process_item(self, good_item, spider):
    """Match ingredient fragments of the item against the agrovoc graph.

    An item without an ``'ingredients'`` key is returned untouched.
    Otherwise the ingredients text has its parenthesised substrings removed
    and is split into fragments via ``string_processor``.  Every fragment
    in which ``parse_e_additives`` finds nothing is cleaned of weight and
    percent annotations and looked up (stripped) with
    ``self.agrovoc_graph.find_ingredient_by_name``:

    * a match is appended to ``good_item['agrovoc_ingredients']``;
    * a miss increments the fragment's counter in
      ``self.not_parsed_fragments``.

    ``spider`` is not used.  Returns the same ``good_item`` object.
    """
    if 'ingredients' not in good_item:
        return good_item

    cleaned_text = string_processor.remove_substring_in_paranthesis(
        good_item['ingredients'])

    for piece in string_processor.split_ingredients(cleaned_text):
        # Fragments that contain E-additives are not ingredient names.
        if string_processor.parse_e_additives(piece):
            continue

        piece = string_processor.remove_weight(piece)
        piece = string_processor.remove_percents(piece)
        match = self.agrovoc_graph.find_ingredient_by_name(piece.strip())

        if match:
            already_found = good_item.get('agrovoc_ingredients', [])
            good_item['agrovoc_ingredients'] = already_found + [match]
        else:
            # The counter is keyed by the cleaned but unstripped fragment.
            misses = self.not_parsed_fragments.get(piece, 0)
            self.not_parsed_fragments[piece] = misses + 1

    return good_item
def process_item(self, good_item, spider):
    """Resolve the item's ingredient fragments through the agrovoc graph.

    Items with no ``'ingredients'`` key pass through unchanged.  For the
    rest, ``string_processor`` strips parenthesised substrings from the
    ingredients text and splits it into fragments.  Fragments for which
    ``parse_e_additives`` returns something truthy are skipped; the others
    lose their weight and percent annotations and are looked up, stripped,
    with ``self.agrovoc_graph.find_ingredient_by_name``.  Hits are
    accumulated under ``good_item['agrovoc_ingredients']``; misses are
    tallied per fragment in ``self.not_parsed_fragments``.

    ``spider`` is unused.  The same ``good_item`` object is returned.
    """
    if 'ingredients' not in good_item:
        return good_item

    without_parentheses = string_processor.remove_substring_in_paranthesis(
        good_item['ingredients'])
    fragments = string_processor.split_ingredients(without_parentheses)

    for raw_fragment in fragments:
        if string_processor.parse_e_additives(raw_fragment):
            # Additive codes are not looked up as ingredients.
            continue

        name = string_processor.remove_percents(
            string_processor.remove_weight(raw_fragment))
        hit = self.agrovoc_graph.find_ingredient_by_name(name.strip())

        if not hit:
            # Tally under the cleaned (not stripped) fragment text.
            previous_count = self.not_parsed_fragments.get(name, 0)
            self.not_parsed_fragments[name] = previous_count + 1
            continue

        known = good_item.get('agrovoc_ingredients', [])
        good_item['agrovoc_ingredients'] = known + [hit]

    return good_item
def test_no_additives(self):
    # "a303" is expected to produce no additive codes at all.
    found = parse_e_additives(u"a303")
    self.assertEqual(found, [])
def test_no_additives_if_additive_is_not_separate_word(self):
    # A code-like tail glued onto a longer word must not be reported.
    found = parse_e_additives(u"abce304")
    self.assertEqual(found, [])
def test_one_additive_in_lowercase(self):
    # A single code with a lower-case leading letter comes back as ``E100``.
    found = parse_e_additives(u"е100")
    self.assertEqual(found, [u'E100'])
def test_additive_tailing_russian_non_latin_character_is_ignored(self):
    # The trailing letter of this input is expected to be dropped, leaving
    # only the bare numeric code ``E160``.
    code_with_suffix = u"E160б"
    found = parse_e_additives(code_with_suffix)
    self.assertEqual(found, [u'E160'])
def test_additive_russian_tailing_russian_c_in_lowercase(self):
    # A lower-case trailing letter (Russian, per the test name) is expected
    # to be kept and reported as ``E160c``.
    code_with_suffix = u"E160с"
    found = parse_e_additives(code_with_suffix)
    self.assertEqual(found, [u'E160c'])
def test_one_additive_in_uppercase(self):
    # A single code with an upper-case leading letter comes back as ``E200``.
    found = parse_e_additives(u"Е200")
    self.assertEqual(found, [u'E200'])
def test_additive_with_extra_digit(self):
    # A four-digit code must be returned whole, not truncated.
    found = parse_e_additives(u"Е1525")
    self.assertEqual(found, [u'E1525'])
def test_additive_russian_tailing_russian_C_in_uppercase(self):
    # An upper-case trailing letter (Russian, per the test name) is expected
    # to be reported lower-cased, as ``E160c``.
    code_with_suffix = u"Е160С"
    found = parse_e_additives(code_with_suffix)
    self.assertEqual(found, [u'E160c'])
def test_two_additives_in_lower_and_upper_cases(self):
    # Two comma-separated codes with differently-cased leading letters are
    # both expected, in input order.
    found = parse_e_additives(u"е201, Е202")
    expected = [u'E201', u'E202']
    self.assertEqual(found, expected)
def test_additive_with_hyphen(self):
    # A hyphen between the letter and the digits is expected to be dropped.
    found = parse_e_additives(u"е-100")
    self.assertEqual(found, [u'E100'])
def test_additive_with_extra_letter(self):
    # A trailing upper-case "B" is expected to be kept, lower-cased.
    found = parse_e_additives(u"е201B")
    self.assertEqual(found, [u'E201b'])
def test_additive_russian_leading_russian_E_in_lowercase(self):
    # A lower-case leading letter (Russian, per the test name) is expected
    # to be normalised so the result reads ``E160``.
    lowercase_code = u"е160"
    found = parse_e_additives(lowercase_code)
    self.assertEqual(found, [u'E160'])
def test_additive_with_space(self):
    # Codes written with a space between the letter and the digits are
    # expected to be recognised and returned without the space, in input
    # order.
    ingredients_text = (
        u"влагоудерживающий агент E 452, регулятор кислотности E 451, "
        u"специи, декстроза, загустители E 407, E 412; "
    )
    actual_codes = parse_e_additives(ingredients_text)
    expected_codes = [u'E452', u'E451', u'E407', u'E412']
    self.assertEqual(actual_codes, expected_codes)
def test_additive_with_space(self):
    # Four codes appear in the sample, each with a space separating the
    # letter from the number; all four are expected back in compact form.
    sample = (
        u"влагоудерживающий агент E 452, регулятор кислотности E 451, "
        u"специи, декстроза, загустители E 407, E 412; ")
    parsed = parse_e_additives(sample)
    wanted = [u'E452', u'E451', u'E407', u'E412']
    self.assertEqual(parsed, wanted)