def translate_row(row):
    """Translates a row of labeled data into CRF++-compatible tag strings.

    Args:
        row: A row of data from the input CSV of labeled ingredient data.

    Returns:
        The row of input converted to CRF++-compatible tags, e.g.

            2\tI1\tL4\tNoCAP\tNoPAREN\tB-QTY
            cups\tI2\tL4\tNoCAP\tNoPAREN\tB-UNIT
            flour\tI3\tL4\tNoCAP\tNoPAREN\tB-NAME
    """
    # extract the display name
    display_input = utils.cleanUnicodeFractions(row['input'])
    tokens = tokenizer.tokenize(display_input)

    labels = _row_to_labels(row)
    label_data = _addPrefixes([(t, _matchUp(t, labels)) for t in tokens])

    translated = ''
    for i, (token, tags) in enumerate(label_data):
        features = utils.getFeatures(token, i + 1, tokens)
        translated += utils.joinLine(
            [token] + features + [_bestTag(tags)]) + '\n'
    return translated
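A minimal usage sketch, assuming a hand-built row dict carrying the CSV columns the function reads; the sample values and exact column set here are illustrative, not taken from the real labeled data:

# Hypothetical row mimicking one record of the labeled-data CSV.
sample_row = {
    'input': '2 cups flour',
    'name': 'flour',
    'qty': 2.0,
    'range_end': 0.0,
    'unit': 'cup',
    'comment': '',
}

print(translate_row(sample_row))
# Expected shape: one tab-separated line per token, ending in a tag,
# e.g. "2\tI1\tL4\tNoCAP\tNoPAREN\tB-QTY".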
def _matchUp(token, labels):
    """Returns our best guess of the match between the tags and the
    words from the display text.

    This problem is difficult for the following reasons:
        * not all the words in the display name have associated tags
        * the quantity field is stored as a number, but it appears as a
          string in the display name
        * the comment is often a compilation of different comments in
          the display name
    """
    ret = []

    # strip parens from the token, since they often appear in the
    # display_name, but are removed from the comment.
    token = utils.normalizeToken(token)
    decimalToken = _parseNumbers(token)

    # Iterate through the labels in descending order of label importance.
    for label_key in ['name', 'unit', 'qty', 'comment', 'range_end']:
        label_value = labels[label_key]
        if isinstance(label_value, str):
            for vt in tokenizer.tokenize(label_value):
                if utils.normalizeToken(vt) == token:
                    ret.append(label_key.upper())
        elif decimalToken is not None:
            if label_value == decimalToken:
                ret.append(label_key.upper())

    return ret
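A hedged sketch of the matching behavior, assuming a labels dict shaped like the output of _row_to_labels (strings for text labels, numbers for quantities); the values below are made up for illustration:

# Hypothetical labels dict in the shape _matchUp expects.
labels = {
    'name': 'flour',
    'unit': 'cup',
    'qty': 2.0,
    'comment': 'sifted',
    'range_end': 0.0,
}

_matchUp('flour', labels)   # -> ['NAME']
_matchUp('2', labels)       # -> ['QTY'], matched numerically via _parseNumbers
_matchUp('melted', labels)  # -> [], no label contains this token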
def test_tokenizer_splits_slash_separated_alternatives(self):
    pairs = [
        ('2 tablespoons/30 milliliters milk or cream',
         ['2', 'tablespoons', '30', 'milliliters', 'milk', 'or', 'cream']),
        ('2 1/2 cups/300 grams all-purpose flour',
         ['2$1/2', 'cups', '300', 'grams', 'all-purpose', 'flour']),
    ]
    for ingredient, tokens_expected in pairs:
        tokens_actual = tokenizer.tokenize(ingredient)
        self.assertEqual(tokens_expected, tokens_actual)
def test_tokenizer_commas(self):
    expected = [
        'Half', 'a', 'vanilla', 'bean', ',', 'split', 'lengthwise', ',',
        'seeds', 'scraped'
    ]
    tokens_actual = tokenizer.tokenize(
        'Half a vanilla bean, split lengthwise, seeds scraped')
    self.assertEqual(expected, tokens_actual)
def test_tokenizer_parens_and_commas(self):
    expected = [
        '1', 'cup', 'peeled', 'and', 'cooked', 'fresh', 'chestnuts', '(',
        'about', '20', ')', ',', 'or', '1', 'cup', 'canned', ',',
        'unsweetened', 'chestnuts'
    ]
    tokens_actual = tokenizer.tokenize(
        '1 cup peeled and cooked fresh chestnuts (about 20), '
        'or 1 cup canned, unsweetened chestnuts')
    self.assertEqual(expected, tokens_actual)
def test_tokenizer_expands_unit_abbreviations(self):
    pairs = [
        ('100g melted chocolate', ['100', 'grams', 'melted', 'chocolate']),
        ('8oz diet coke', ['8', 'ounces', 'diet', 'coke']),
        ('16oz. of coconut oil', ['16', 'ounces', 'of', 'coconut', 'oil']),
        ('5lbs yellow butter', ['5', 'pounds', 'yellow', 'butter']),
        ('15lb. chicken', ['15', 'pounds', 'chicken']),
        ('5ml corn sugar', ['5', 'milliliters', 'corn', 'sugar']),
        ('4tsp sugar', ['4', 'teaspoons', 'sugar']),
        ('2tbsp cinnamon', ['2', 'tablespoons', 'cinnamon']),
    ]
    for ingredient, tokens_expected in pairs:
        tokens_actual = tokenizer.tokenize(ingredient)
        self.assertEqual(tokens_expected, tokens_actual)
def best_tag(tags):
    if len(tags) == 1:
        return tags[0]
    # if there are multiple tags, pick the first which isn't COMMENT
    else:
        for t in tags:
            if (t != 'COMMENT') and (t != 'OTHER'):
                return t
    # we have no idea what to guess
    return 'OTHER'


# count tokens and tags
token_count = 0
tag_count = {}
for index, row in data.iterrows():
    # clean and tokenize raw sentence / ingredient phrase
    raw = utils.cleanUnicodeFractions(row['input'].decode('utf-8'))
    tokens = tokenizer.tokenize(raw)

    # match tokens to labels and count labels
    labels = translator._row_to_labels(row)
    for token in tokens:
        # count tokens
        token_count += 1

        # compute tag
        possible_tags = translator._matchUp(token, labels)
        tag = best_tag(possible_tags)

        # count tags
        tag_count[tag] = tag_count.get(tag, 0) + 1

# print total number of tokens in our data set
print(token_count)
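With the counts collected, a short sketch of reporting the tag distribution alongside the token total; the formatting choices here are illustrative:

# Report the tag distribution gathered above, most frequent first.
for tag, count in sorted(tag_count.items(), key=lambda kv: kv[1], reverse=True):
    print('%-10s %6d  (%.1f%%)' % (tag, count, 100.0 * count / token_count))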
def test_tokenizer_parens(self):
    expected = ['2', 'tablespoons', 'milk', '(', 'or', 'cream', ')']
    tokens_actual = tokenizer.tokenize('2 tablespoons milk (or cream)')
    self.assertEqual(expected, tokens_actual)