def process_item(self, good_item, spider):
    """Match the item's ingredient fragments against the AGROVOC graph.

    If ``good_item`` has an ``'ingredients'`` entry, parenthesised
    substrings are removed from it and the remainder is split into
    fragments. Each fragment for which
    ``string_processor.parse_e_additives`` returns a falsy value is
    stripped of weights and percents and looked up through
    ``self.agrovoc_graph.find_ingredient_by_name``. Matches are appended to
    ``good_item['agrovoc_ingredients']``; fragments without a match are
    counted in ``self.not_parsed_fragments``.

    ``spider`` is accepted but not used here. ``good_item`` is returned in
    every case.
    """
    if 'ingredients' not in good_item:
        return good_item

    without_parens = string_processor.remove_substring_in_paranthesis(
        good_item['ingredients'])

    for piece in string_processor.split_ingredients(without_parens):
        if string_processor.parse_e_additives(piece):
            # Fragments recognised by parse_e_additives are skipped.
            continue

        piece = string_processor.remove_percents(
            string_processor.remove_weight(piece))
        match = self.agrovoc_graph.find_ingredient_by_name(piece.strip())

        if not match:
            # The counter is keyed by the cleaned fragment *before*
            # stripping, while the lookup above uses the stripped form.
            misses = self.not_parsed_fragments
            misses[piece] = misses.get(piece, 0) + 1
            continue

        # A new list is built and assigned back on every match.
        already_matched = good_item.get('agrovoc_ingredients', [])
        good_item['agrovoc_ingredients'] = already_matched + [match]

    return good_item
def process_item(self, good_item, spider):
    """Match the item's ingredient fragments against the AGROVOC graph.

    If ``good_item`` has an ``'ingredients'`` entry, parenthesised
    substrings are removed from it and the remainder is split into
    fragments. Each fragment for which
    ``string_processor.parse_e_additives`` returns a falsy value is
    stripped of weights and percents and looked up through
    ``self.agrovoc_graph.find_ingredient_by_name``. Matches are appended to
    ``good_item['agrovoc_ingredients']``; fragments without a match are
    counted in ``self.not_parsed_fragments``.

    ``spider`` is accepted but not used here. ``good_item`` is returned in
    every case.
    """
    if 'ingredients' not in good_item:
        return good_item

    without_parens = string_processor.remove_substring_in_paranthesis(
        good_item['ingredients'])

    for piece in string_processor.split_ingredients(without_parens):
        if string_processor.parse_e_additives(piece):
            # Fragments recognised by parse_e_additives are skipped.
            continue

        piece = string_processor.remove_percents(
            string_processor.remove_weight(piece))
        match = self.agrovoc_graph.find_ingredient_by_name(piece.strip())

        if not match:
            # The counter is keyed by the cleaned fragment *before*
            # stripping, while the lookup above uses the stripped form.
            misses = self.not_parsed_fragments
            misses[piece] = misses.get(piece, 0) + 1
            continue

        # A new list is built and assigned back on every match.
        already_matched = good_item.get('agrovoc_ingredients', [])
        good_item['agrovoc_ingredients'] = already_matched + [match]

    return good_item
def test_nested_paranthesis(self):
    # Nested parentheses: the expected result drops the text from the
    # first "(" through the first ")", so the trailing "f)" remains.
    source = "abc(d(e)f)"
    expected = "abcf)"
    self.assertEqual(remove_substring_in_paranthesis(source), expected)
def test_brakets(self):
    # Square brackets are not parentheses: the input is expected back
    # unchanged.
    source = "abc[ddd]"
    expected = "abc[ddd]"
    self.assertEqual(remove_substring_in_paranthesis(source), expected)
def test_not_oppened_paranthesis(self):
    # A ")" with no "(" before it: the input is expected back unchanged.
    source = "abcdd)d"
    expected = "abcdd)d"
    self.assertEqual(remove_substring_in_paranthesis(source), expected)
def test_not_closed_paranthesis(self):
    # A "(" that is never closed: the input is expected back unchanged.
    source = "abc(ddd"
    expected = "abc(ddd"
    self.assertEqual(remove_substring_in_paranthesis(source), expected)
def test_one_paranthesis(self):
    # A single balanced pair: the parentheses and their content are
    # expected to be removed.
    source = "abc(ddd)"
    expected = "abc"
    self.assertEqual(remove_substring_in_paranthesis(source), expected)
def test_nested_paranthesis(self):
    # Nested parentheses: the expected result drops the text from the
    # first "(" through the first ")", so the trailing "f)" remains.
    source = "abc(d(e)f)"
    expected = "abcf)"
    self.assertEqual(remove_substring_in_paranthesis(source), expected)
def test_brakets(self):
    # Square brackets are not parentheses: the input is expected back
    # unchanged.
    source = "abc[ddd]"
    expected = "abc[ddd]"
    self.assertEqual(remove_substring_in_paranthesis(source), expected)
def test_not_oppened_paranthesis(self):
    # A ")" with no "(" before it: the input is expected back unchanged.
    source = "abcdd)d"
    expected = "abcdd)d"
    self.assertEqual(remove_substring_in_paranthesis(source), expected)
def test_not_closed_paranthesis(self):
    # A "(" that is never closed: the input is expected back unchanged.
    source = "abc(ddd"
    expected = "abc(ddd"
    self.assertEqual(remove_substring_in_paranthesis(source), expected)
def test_one_paranthesis(self):
    # A single balanced pair: the parentheses and their content are
    # expected to be removed.
    source = "abc(ddd)"
    expected = "abc"
    self.assertEqual(remove_substring_in_paranthesis(source), expected)