Example #1
def translate_row(row):
    """Translates a row of labeled data into CRF++-compatible tag strings.

    Args:
        row: A row of data from the input CSV of labeled ingredient data.

    Returns:
        The row of input converted to CRF++-compatible tags, e.g.

            2\tI1\tL4\tNoCAP\tNoPAREN\tB-QTY
            cups\tI2\tL4\tNoCAP\tNoPAREN\tB-UNIT
            flour\tI3\tL4\tNoCAP\tNoPAREN\tB-NAME
    """
    # extract the display name
    display_input = utils.cleanUnicodeFractions(row['input'])
    tokens = tokenizer.tokenize(display_input)

    labels = _row_to_labels(row)
    label_data = _addPrefixes([(t, _matchUp(t, labels)) for t in tokens])

    translated = ''
    for i, (token, tags) in enumerate(label_data):
        features = utils.getFeatures(token, i + 1, tokens)
        translated += utils.joinLine([token] + features +
                                     [_bestTag(tags)]) + '\n'
    return translated
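A minimal driver sketch for translate_row, assuming the labeled CSV used by the later examples (nyt-ingredients-snapshot-2015.csv); the pandas loop, the output path train.crf and the helper name write_crf_training_file are illustrative assumptions, not part of the original module.

import pandas as pd


def write_crf_training_file(csv_path="nyt-ingredients-snapshot-2015.csv",
                            out_path="train.crf"):
    # Illustrative driver: translate every labeled row and write a CRF++ training
    # file, with a blank line between ingredient phrases as CRF++ expects.
    df = pd.read_csv(csv_path).fillna("")
    with open(out_path, "w") as out:
        for _, row in df.iterrows():
            out.write(translate_row(row))
            out.write("\n")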
Example #2
    def generate_data(self, count, offset):
        """
        Generates training data in the CRF++ format for the ingredient
        tagging task
        """
        df = pd.read_csv("nyt-ingredients-snapshot-2015.csv")
        df = df.fillna("")

        start = int(offset)
        end = int(offset) + int(count)

        df_slice = df.iloc[start:end]
        s = ""
        for index, row in df_slice.iterrows():
            try:
                # extract the display name
                display_input = utils.cleanUnicodeFractions(row["input"])

                tokens = utils.tokenize(display_input)
                del (row["input"])
                rowData = self.addPrefixes([(t, self.matchUp(t, row))
                                            for t in tokens])

                for i, (token, tags) in enumerate(rowData):
                    features = utils.getFeatures(token, i + 1, tokens)
                    s = s + utils.joinLine([token] + features +
                                           [self.bestTag(tags)]) + '\n'

            # ToDo: deal with this
            except UnicodeDecodeError:
                pass

            s += "\n"  # blank line separates ingredient phrases in the CRF++ file
        self.writeTempFile(s)
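For context, the file written by writeTempFile above is what CRF++'s crf_learn trains on; a sketch of that follow-up step, in which the template, training-file and model paths are illustrative placeholders rather than paths from this codebase.

import subprocess

# crf_learn's positional arguments are: feature template, training file, output model.
subprocess.run(["crf_learn", "template", "train.crf", "model.crfmodel"], check=True)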
Example #3
def predictIngredientTag(ingredient):
    display_input = utils.cleanUnicodeFractions(ingredient)

    tokens = utils.tokenizeWithoutPunctuation(display_input)
    # `words`, `arr` and `tags` are assumed to be module-level globals produced by
    # the training step: the vocabulary, the training sequences and the tag list.
    word2idx = {w: i for i, w in enumerate(words)}
    X = [[word2idx[w[0]] for w in s] for s in arr]

    max_len = max([len(x) for x in X])
    x_testData = pad_sequences(sequences=[[word2idx.get(w, 0)
                                           for w in tokens]],
                               padding="post",
                               value=0,
                               maxlen=max_len)

    loadedModel = loadTrainedModel()
    p = loadedModel.predict(np.array([x_testData[0]]))
    p = np.argmax(p, axis=-1)
    retArr = []
    for w, pred in zip(tokens, p[0]):
        print("{:15}: {:5}".format(w, tags[pred]))
        retArr.append((w, tags[pred]))
    return retArr


#print(predictIngredientTag("1 tomato"))

#trainAndSaveModel()
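The (token, tag) pairs returned above can be folded back into named fields by stripping the B-/I- prefixes (the tag scheme is the one shown in Example #1); a purely illustrative helper, not part of the original script.

def pairsToIngredient(pairs):
    # Collapse pairs such as ("2", "B-QTY") into {"qty": "2", ...}; tags without
    # a B-/I- prefix are treated as background. Illustrative only.
    fields = {}
    for token, tag in pairs:
        if "-" not in tag:
            continue
        field = tag.split("-")[-1].lower()
        fields[field] = (fields.get(field, "") + " " + token).strip()
    return fields

# e.g. pairsToIngredient(predictIngredientTag("2 cups flour"))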
Example #4
    def generate_data(self, count, offset):
        """
        Generates training data in the CRF++ format for the ingredient
        tagging task
        """
        df = pd.read_csv(self.opts.data_path)
        df = df.fillna("")

        start = int(offset)
        end = int(offset) + int(count)

        df_slice = df.iloc[start: end]

        for index, row in df_slice.iterrows():
            try:
                # extract the display name
                display_input = utils.cleanUnicodeFractions(row["input"])
                tokens = utils.tokenize(display_input)
                del(row["input"])

                rowData = self.addPrefixes([(t, self.matchUp(t, row)) for t in tokens])

                for i, (token, tags) in enumerate(rowData):
                    features = utils.getFeatures(token, i+1, tokens)
                    print utils.joinLine([token] + features + [self.bestTag(tags)])

            # ToDo: deal with this
            except UnicodeDecodeError:
                pass

            print  # blank line between ingredient phrases (CRF++ sentence separator)
Example #5
def readIngredientDataForExtract():
    df = pd.read_csv("nyt-ingredients-snapshot-2015.csv")
    df = df.fillna("fillna")
    retArr = []
    for index, row in df.iterrows():
        try:
            # extract the display name
            display_input = utils.cleanUnicodeFractions(row["name"])
            display_input = str(re.sub(r'[^\w\s]', '', display_input)).lower()
            retArr.append(display_input)
        # ToDo: deal with this
        except UnicodeDecodeError:
            pass
    retArr2 = []
    if len(retArr) > 0:
        for word in retArr:
            length = len([w for w in retArr if w == word])
            if (word, length) not in retArr2:
                retArr2.append((word, length))
        retArr2.sort(key=lambda x: x[1])
    if len(retArr2) > 0:
        with open('ingredients.txt', 'a') as the_file:
            for (word, count) in retArr2:
                the_file.write(word + " " + str(count) + '\n')
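The same frequency count can be written with collections.Counter; a short alternative sketch over the retArr list built above, shown for comparison only, not as the original code.

from collections import Counter


def writeIngredientCounts(names, path="ingredients.txt"):
    # Equivalent to the tail of readIngredientDataForExtract: one "word count"
    # line per distinct name, sorted by count. Illustrative only.
    with open(path, "a") as the_file:
        for word, count in sorted(Counter(names).items(), key=lambda kv: kv[1]):
            the_file.write(word + " " + str(count) + "\n")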
Example #6
    def _generate_data_worker(self, args):
        index, row = args
        out = []
        try:
            # extract the display name
            display_input = utils.cleanUnicodeFractions(row["input"])
            tokens = utils.tokenize(display_input)
            del(row["input"])

            rowData = self.addPrefixes([(t, self.matchUp(t, row)) for t in tokens])

            for i, (token, tags) in enumerate(rowData):
                features = utils.getFeatures(token, i+1, tokens)
                out.append(utils.joinLine([token] + features + [self.bestTag(tags)]))

        # ToDo: deal with this
        except UnicodeDecodeError:
            pass

        if out:
            self.output_queue.put('\n'.join(out))
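A hypothetical driver for the worker above, assuming it sits on the same class as the other generate_data variants; the thread pool, the queue and the method name generate_data_parallel are illustrative assumptions, not part of the original code.

from multiprocessing.dummy import Pool  # thread pool, so self.output_queue is shared
from queue import Queue

import pandas as pd


def generate_data_parallel(self, count, offset):
    # Fan rows out to _generate_data_worker and print the collected CRF++ blocks,
    # separated by blank lines. Meant to live on the same class as the worker.
    df = pd.read_csv(self.opts.data_path).fillna("")
    df_slice = df.iloc[int(offset):int(offset) + int(count)]

    self.output_queue = Queue()
    with Pool() as pool:
        pool.map(self._generate_data_worker, df_slice.iterrows())

    blocks = []
    while not self.output_queue.empty():
        blocks.append(self.output_queue.get())
    print("\n\n".join(blocks))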
Example #7
def readIngredientData():
    df = pd.read_csv("nyt-ingredients-snapshot-2015.csv")
    df = df.fillna("fillna")
    retArr = []
    for index, row in df.iterrows():
        try:
            # extract the display name
            display_input = utils.cleanUnicodeFractions(row["input"])

            tokens = utils.tokenizeWithoutPunctuation(display_input)
            del (row["input"])
            rowData = [(t, matchUp(t, row)) for t in tokens]
            tupleData = convertTupleArray(rowData, tokens)
            retArr.append(tupleData)

        # ToDo: deal with this
        except UnicodeDecodeError:
            pass
        if index == 5000:
            break
    return retArr
Example #8
    def generate_data(self, count, offset):
        """
        Generates training data in the CRF++ format for the ingredient
        tagging task
        """
        df = pd.read_csv(self.opts.data_path)
        df = df.fillna("")

        start = int(offset)
        end = int(offset) + int(count)

        df_slice = df.iloc[start:end]

        for index, row in df_slice.iterrows():

            prev_tag = None
            try:
                # extract the display name
                display_input = utils.cleanUnicodeFractions(row["input"])
                tokens = utils.tokenize(display_input)
                del (row["input"])

                taggedTokens = [(t, self.matchUp(t, row)) for t in tokens]
                rowData = self.addPrefixes(taggedTokens)

                for i, (token, tags) in enumerate(rowData):
                    features = utils.getFeatures(token, i + 1, tokens)
                    best_tag = self.bestTag(tags)
                    # Promote a stray I- tag to B- when it starts a new chunk
                    # (prev_tag is None on the first token of a phrase).
                    if best_tag.startswith("I-") and (
                            prev_tag is None
                            or best_tag.split("-")[-1] != prev_tag.split("-")[-1]):
                        best_tag = best_tag.replace("I-", "B-")

                    print utils.joinLine([token] + features + [best_tag])
                    prev_tag = best_tag

            # ToDo: deal with this
            except UnicodeDecodeError:
                pass

            print
Example #9
    def generate_data(self, count, offset):
        """
        Generates training data in the CRF++ format for the ingredient
        tagging task
        """
        data = []
        with open(self.opts.data_path, "r") as csvfile:
            reader = csv.DictReader(csvfile)
            for line in reader:
                data.append(line)

        start = int(offset)
        end = int(offset) + int(count)

        data_slice = data[start:end]

        for row in data_slice:
            try:
                # extract the display name
                display_input = utils.cleanUnicodeFractions(row["input"])
                tokens = utils.tokenize(display_input)
                del (row["input"])

                rowData = self.addPrefixes([(t, self.matchUp(t, row))
                                            for t in tokens])

                for i, (token, tags) in enumerate(rowData):
                    features = utils.getFeatures(token, i + 1, tokens)
                    print utils.joinLine([token] + features +
                                         [self.bestTag(tags)])

            # ToDo: deal with this
            except UnicodeDecodeError:
                pass

            print