import tokenizer  # project tokenizer module (assumed importable as-is)
import utils      # project cleanup/feature helpers (assumed importable as-is)


def translate_row(row):
    """Translates a row of labeled data into CRF++-compatible tag strings.

    Args:
        row: A row of data from the input CSV of labeled ingredient data.

    Returns:
        The row of input converted to CRF++-compatible tags, e.g.

            2\tI1\tL4\tNoCAP\tNoPAREN\tB-QTY
            cups\tI2\tL4\tNoCAP\tNoPAREN\tB-UNIT
            flour\tI3\tL4\tNoCAP\tNoPAREN\tB-NAME
    """
    # extract the display name
    display_input = utils.cleanUnicodeFractions(row['input'])
    tokens = tokenizer.tokenize(display_input)

    labels = _row_to_labels(row)
    label_data = _addPrefixes([(t, _matchUp(t, labels)) for t in tokens])

    translated = ''
    for i, (token, tags) in enumerate(label_data):
        features = utils.getFeatures(token, i + 1, tokens)
        translated += utils.joinLine([token] + features +
                                     [_bestTag(tags)]) + '\n'
    return translated
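

# A minimal usage sketch, assuming `row` carries the labeled-CSV columns
# that _row_to_labels consumes (the exact column set here is an assumption):
def _demo_translate_row():
    example_row = {'input': '2 cups flour', 'name': 'flour', 'qty': 2.0,
                   'unit': 'cup', 'comment': '', 'range_end': ''}
    print(translate_row(example_row))
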
def _matchUp(token, labels):
    """
    Returns our best guess of the match between the tags and the
    words from the display text.

    This problem is difficult for the following reasons:
        * not all the words in the display name have associated tags
        * the quantity field is stored as a number, but it appears
          as a string in the display name
        * the comment is often a compilation of different comments in
          the display name

    """
    ret = []

    # strip parens from the token, since they often appear in the
    # display_name, but are removed from the comment.
    token = utils.normalizeToken(token)
    decimalToken = _parseNumbers(token)

    # Iterate through the labels in descending order of label importance.
    for label_key in ['name', 'unit', 'qty', 'comment', 'range_end']:
        label_value = labels[label_key]
        if isinstance(label_value, str):
            for vt in tokenizer.tokenize(label_value):
                if utils.normalizeToken(vt) == token:
                    ret.append(label_key.upper())

        elif decimalToken is not None:
            if label_value == decimalToken:
                ret.append(label_key.upper())

    return ret
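

# A minimal usage sketch for _matchUp (illustrative labels, not from the
# source): string-valued labels are matched token-by-token, while the
# numeric 'qty' label is matched through _parseNumbers, so the token '2'
# is expected to come back tagged ['QTY'].
def _demo_match_up():
    labels = {'name': 'flour', 'unit': 'cup', 'qty': 2.0,
              'comment': '', 'range_end': ''}
    print(_matchUp('2', labels))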


import unittest


class TokenizerTest(unittest.TestCase):

    def test_tokenizer_expands_unit_abbreviations(self):
        pairs = [
            ('100g melted chocolate', ['100', 'grams', 'melted', 'chocolate']),
            ('8oz diet coke', ['8', 'ounces', 'diet', 'coke']),
            ('16oz. of coconut oil', ['16', 'ounces', 'of', 'coconut', 'oil']),
            ('5lbs  yellow butter', ['5', 'pounds', 'yellow', 'butter']),
            ('15lb. chicken', ['15', 'pounds', 'chicken']),
            ('5ml corn sugar', ['5', 'milliliters', 'corn', 'sugar']),
            ('4tsp sugar', ['4', 'teaspoons', 'sugar']),
            ('2tbsp cinnamon', ['2', 'tablespoons', 'cinnamon']),
        ]
        for ingredient, tokens_expected in pairs:
            tokens_actual = tokenizer.tokenize(ingredient)
            self.assertEqual(tokens_expected, tokens_actual)

    def test_tokenizer_splits_slash_separated_alternatives(self):
        pairs = [
            (
                '2 tablespoons/30 milliliters milk or cream',
                ['2', 'tablespoons', '30', 'milliliters', 'milk', 'or',
                 'cream'],
            ),
            (
                '2 1/2 cups/300 grams all-purpose flour',
                ['2$1/2', 'cups', '300', 'grams', 'all-purpose', 'flour'],
            ),
        ]
        for ingredient, tokens_expected in pairs:
            tokens_actual = tokenizer.tokenize(ingredient)
            self.assertEqual(tokens_expected, tokens_actual)

    def test_tokenizer_commas(self):
        expected = [
            'Half', 'a', 'vanilla', 'bean', ',', 'split', 'lengthwise', ',',
            'seeds', 'scraped'
        ]
        tokens_actual = tokenizer.tokenize(
            'Half a vanilla bean, split lengthwise, seeds scraped')
        self.assertEqual(expected, tokens_actual)

    def test_tokenizer_parens(self):
        expected = ['2', 'tablespoons', 'milk', '(', 'or', 'cream', ')']
        tokens_actual = tokenizer.tokenize('2 tablespoons milk (or cream)')
        self.assertEqual(expected, tokens_actual)

    def test_tokenizer_parens_and_commas(self):
        expected = [
            '1', 'cup', 'peeled', 'and', 'cooked', 'fresh', 'chestnuts', '(',
            'about', '20', ')', ',', 'or', '1', 'cup', 'canned', ',',
            'unsweetened', 'chestnuts'
        ]
        tokens_actual = tokenizer.tokenize(
            '1 cup peeled and cooked fresh chestnuts (about 20), or 1 cup canned, unsweetened chestnuts'
        )
        self.assertEqual(expected, tokens_actual)


def best_tag(tags):
    """Picks a single display tag from _matchUp's candidate list."""
    # a single candidate is an unambiguous match
    if len(tags) == 1:
        return tags[0]

    # if there are multiple tags, pick the first which isn't COMMENT
    else:
        for t in tags:
            if t != "COMMENT":
                return t

    # we have no idea what to guess
    return "OTHER"

# count tokens and tags across the labeled data set; `data` is assumed to be
# a pandas DataFrame of the labeled ingredient CSV, e.g.
# data = pd.read_csv('nyt-ingredients-snapshot-2015.csv')
token_count = 0
tag_count = {}
for _, row in data.iterrows():
    # clean and tokenize the raw sentence / ingredient phrase
    raw = utils.cleanUnicodeFractions(row['input'])
    tokens = tokenizer.tokenize(raw)

    # match tokens to labels and count labels
    labels = translator._row_to_labels(row)
    for token in tokens:
        # count tokens
        token_count += 1

        # compute the best tag for this token
        possible_tags = translator._matchUp(token, labels)
        tag = best_tag(possible_tags)

        # count tags
        tag_count[tag] = tag_count.get(tag, 0) + 1

# print the total number of tokens in our data set
print('%d total tokens' % token_count)
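
# a small follow-up: summarize the tag distribution collected above,
# most frequent tags first
for tag, count in sorted(tag_count.items(), key=lambda kv: kv[1], reverse=True):
    print('%-12s %6d' % (tag, count))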