def idx_xy(X, Y, word2index=None, tag2index=None):
    """Convert sentences and tag sequences to index form, building lookups on demand.

    Args:
        X: Collection of sentences, each an iterable of hashable words. It is
            iterated more than once when ``word2index`` has to be built, so it
            must be re-iterable (e.g. a list).
        Y: Collection of tag sequences, one per sentence. Same re-iterability
            requirement as ``X`` when ``tag2index`` has to be built.
        word2index: Optional existing word lookup. When falsy (``None`` or
            empty) a new one is built from the distinct words in ``X`` via
            ``list2dict``.
        tag2index: Optional existing tag lookup. When falsy a new one is built
            via ``list2dict`` from ``'START'``, the distinct tags in ``Y`` and
            ``'STOP'``, in that order.

    Returns:
        The tuple ``(X_idx, X, Y_idx, Y, word2index, tag2index, index2tag)``
        where ``X_idx`` holds ``tokenize(sentence, word2index)`` for every
        sentence, ``Y_idx`` holds ``tag2idx(tags, tag2index)`` for every tag
        sequence, and ``index2tag`` is ``dictReverse(tag2index)``.
    """
    if not word2index:
        # dict.fromkeys de-duplicates while keeping first-seen order (on
        # Python 3.7+), so the vocabulary order is reproducible between runs.
        # Iterating a set of strings is not, because of hash randomization.
        vocabulary = list(
            dict.fromkeys(word for sentence in X for word in sentence)
        )
        word2index = list2dict(vocabulary)

    if not tag2index:
        unique_tags = list(
            dict.fromkeys(tag for tag_seq in Y for tag in tag_seq)
        )
        # NOTE(review): if Y already contains 'START' or 'STOP' they appear
        # twice in this list; how list2dict treats duplicates is not visible
        # here -- confirm.
        tag2index = list2dict(['START'] + unique_tags + ['STOP'])

    index2tag = dictReverse(tag2index)

    X_idx = [tokenize(sentence, word2index) for sentence in X]
    Y_idx = [tag2idx(tag_seq, tag2index) for tag_seq in Y]
    return X_idx, X, Y_idx, Y, word2index, tag2index, index2tag
def feature2idx(words, tags):
    """Build the feature lookup for every transition and emission feature.

    One feature name is produced by ``str_transition`` for each ordered pair
    of tag positions, followed by one from ``str_emission`` for each
    (word position, tag position) pair. Both groups are enumerated in
    row-major order. The combined list is handed to ``list2dict``.

    Args:
        words: Sized collection of words; only its length is used.
        tags: Sized collection of tags; only its length is used.

    Returns:
        Whatever ``list2dict`` returns for the ordered feature-name list
        (presumably a name -> position mapping -- confirm against
        ``list2dict``).
    """
    n_tags = len(tags)
    n_words = len(words)

    # Every ordered pair of tag positions, including a tag with itself.
    transition_names = [
        str_transition(first, second)
        for first, second in itertools.product(range(n_tags), repeat=2)
    ]
    # Every (word position, tag position) combination.
    emission_names = [
        str_emission(word_pos, tag_pos)
        for word_pos in range(n_words)
        for tag_pos in range(n_tags)
    ]

    return list2dict(transition_names + emission_names)