Exemplo n.º 1
0
def idx_xy(X, Y, word2index=None, tag2index=None):
    """Convert sentences and tag sequences to their indexed form.

    When a mapping is not supplied (or is empty), it is built from the data:
    the vocabulary from every word in ``X``, and the tag set from every tag
    in ``Y`` with ``'START'`` prepended and ``'STOP'`` appended.

    Args:
        X: Iterable of sentences, each an iterable of hashable words. It is
            traversed more than once when ``word2index`` has to be built.
        Y: Iterable of tag sequences, each an iterable of hashable tags. It
            is traversed more than once when ``tag2index`` has to be built.
        word2index: Optional existing word mapping to reuse.
        tag2index: Optional existing tag mapping to reuse.

    Returns:
        The tuple ``(X_idx, X, Y_idx, Y, word2index, tag2index, index2tag)``
        where ``X_idx`` / ``Y_idx`` are the results of ``tokenize`` /
        ``tag2idx`` applied per sentence, and ``index2tag`` is
        ``dictReverse(tag2index)``.
    """
    if not word2index:
        # dict.fromkeys de-duplicates while keeping first-occurrence order.
        # list(set(...)) iterates strings in a hash-seed-dependent order, so
        # the resulting indices could change from one run to the next.
        vocabulary = list(
            dict.fromkeys(word for sentence in X for word in sentence)
        )
        # NOTE(review): list2dict is defined elsewhere; presumably it maps
        # each item to its position in the list -- confirm.
        word2index = list2dict(vocabulary)
    if not tag2index:
        unique_tags = list(
            dict.fromkeys(tag for tag_seq in Y for tag in tag_seq)
        )
        # Sentence-boundary markers surround the tags observed in the data.
        tags = ['START'] + unique_tags + ['STOP']
        tag2index = list2dict(tags)

    # NOTE(review): dictReverse is defined elsewhere; presumably it swaps
    # keys and values -- confirm.
    index2tag = dictReverse(tag2index)

    X_idx = [tokenize(sentence, word2index) for sentence in X]
    Y_idx = [tag2idx(tags, tag2index) for tags in Y]

    return X_idx, X, Y_idx, Y, word2index, tag2index, index2tag
Exemplo n.º 2
0
def feature2idx(words, tags):
    """Build the feature-name mapping for every transition and emission.

    Feature names are generated first for every ordered pair of tag
    positions (via ``str_transition``), then for every (word position,
    tag position) pair (via ``str_emission``). The combined list is handed
    to ``list2dict``.

    Args:
        words: Sized collection of words; only its length is used.
        tags: Sized collection of tags; only its length is used.

    Returns:
        Whatever ``list2dict`` returns for the ordered feature-name list.
    """
    n_tags = len(tags)
    n_words = len(words)

    # Nested comprehension order (outer loop slowest) matches the row-major
    # order in which a Cartesian product would enumerate the pairs.
    transition_names = [
        str_transition(tag_a, tag_b)
        for tag_a in range(n_tags)
        for tag_b in range(n_tags)
    ]
    emission_names = [
        str_emission(word_i, tag_i)
        for word_i in range(n_words)
        for tag_i in range(n_tags)
    ]

    # Transitions come before emissions in the final feature ordering.
    return list2dict(transition_names + emission_names)