def __init__(self, item):
    """Compute and store every feature of a single token.

    ``item[0]`` is the word the features are derived from and ``item[1]``
    is stored unchanged as ``self.pos``.

    Besides the individual attributes, three aggregates are built:

    * ``self.features``      -- tuple of all feature values (booleans cast
      to ``int``), with ``pos`` as the last entry.
    * ``self.feature_names`` -- tuple of labels, parallel to
      ``self.features`` (last label is ``'POS'``).
    * ``self.features_dict`` -- label -> value mapping for every entry
      except the last one (``'POS'``).
    """
    token = item[0]

    # Boolean surface cues.
    self.capital = ft.contains_capital(token)
    self.digit = ft.contains_digit(token)
    self.hyphen = ft.contains_hyphen(token)

    # Prefixes and suffixes of length 1 through 4.
    affix_lengths = (1, 2, 3, 4)
    (self.prefix1, self.prefix2,
     self.prefix3, self.prefix4) = [ft.prefix(token, n) for n in affix_lengths]
    (self.suffix1, self.suffix2,
     self.suffix3, self.suffix4) = [ft.suffix(token, n) for n in affix_lengths]

    # Two shape descriptors provided by the ``ft`` helpers.
    self.shape1 = ft.shape1(token)
    self.shape2 = ft.shape2(token)

    # Raw token, its lower-cased form and the second element of ``item``.
    self.word = token
    self.word_lower = token.lower()
    self.pos = item[1]

    self.features = (
        self.word,
        self.word_lower,
        int(self.capital),
        int(self.digit),
        int(self.hyphen),
        self.prefix1,
        self.prefix2,
        self.prefix3,
        self.prefix4,
        self.suffix1,
        self.suffix2,
        self.suffix3,
        self.suffix4,
        self.shape1,
        self.shape2,
        self.pos,
    )
    self.feature_names = (
        'token',
        'token_lower',
        'capital',
        'digit',
        'hyphen',
        'prefix1',
        'prefix2',
        'prefix3',
        'prefix4',
        'suffix1',
        'suffix2',
        'suffix3',
        'suffix4',
        'shape1',
        'shape2',
        'POS',
    )

    # The trailing POS entry is left out of the dictionary view.
    self.features_dict = {
        label: value
        for label, value in zip(self.feature_names[:-1], self.features[:-1])
    }
def infer_features(self, x, y):
    """Register feature functions and the string keys used to look them up.

    One callable taking ``(words, curr_tag, prev_tag, i)`` is appended to
    ``self.func_list`` (which must already exist) for each of:

    * every ordered pair of tags from ``self.unique_y``
      (delegates to ``features.bigram_tag_label``);
    * every (tag, word) pair from ``self.unique_y`` / ``self.unique_x`` for
      ``ind`` in -1, 0, 1, evaluated at position ``i - ind``
      (delegates to ``features.atomic_tag_label``);
    * every (tag, word) pair again, using the last 3 and then the last 4
      characters of the word (delegates to ``features.suffix``).

    ``self.func_filter_dict`` is rebuilt to map a key string to the index of
    the corresponding callable in ``self.func_list``. ``self.reg_exp`` is
    rebuilt as a list of callables ``(x_j, cur_tag, prev_tag, i)`` that
    produce a key string in the same format for a concrete position.

    ``x`` and ``y`` are accepted for interface compatibility but are not
    read by this method. Returns ``None``.
    """
    self.func_filter_dict = {}
    self.reg_exp = []

    # ---- tag-bigram features: key 'ct<tag>pt<tag>' -----------------------
    reg_exp = 'ct{}pt{}'
    self.reg_exp.append(
        lambda x_j, cur_tag, prev_tag, i, reg_exp=reg_exp:
            reg_exp.format(cur_tag, prev_tag)
    )
    for y1 in self.unique_y:
        for y2 in self.unique_y:
            # Loop values are frozen as default arguments so each function
            # keeps its own pair of tags.
            new_func = (
                lambda words, curr_tag, prev_tag, i, y1=y1, y2=y2:
                    features.bigram_tag_label(words, curr_tag, prev_tag, i,
                                              tag=y1, tag2=y2)
            )
            self.func_list.append(new_func)
            self.func_filter_dict[reg_exp.format(y1, y2)] = len(self.func_list) - 1

    # ---- tag/word features: key 'ct<tag>w<word>i<ind>' -------------------
    for ind in range(-1, 2):
        reg_exp = 'ct{}w{}i{}'
        # NOTE(review): x_j[i - ind] is not bounds-checked here; an index of
        # -1 wraps to the last word and an index of len(x_j) raises
        # IndexError -- confirm callers guard the sentence edges.
        self.reg_exp.append(
            lambda x_j, cur_tag, prev_tag, i, reg_exp=reg_exp, ind=ind:
                reg_exp.format(cur_tag, x_j[i - ind].lower(), ind)
        )
        for y1 in self.unique_y:
            for x1 in self.unique_x:
                new_func = (
                    lambda words, curr_tag, prev_tag, i, y1=y1, x1=x1, ind=ind:
                        features.atomic_tag_label(words, curr_tag, prev_tag,
                                                  i - ind, tag=y1, word=x1)
                )
                self.func_list.append(new_func)
                self.func_filter_dict[reg_exp.format(y1, x1, ind)] = len(self.func_list) - 1

    # ---- tag/suffix features: key 'ct<tag>s<suffix>l<length>' ------------
    for slength in range(3, 5):
        reg_exp = 'ct{}s{}l{}'
        self.reg_exp.append(
            lambda x_j, cur_tag, prev_tag, i, l=slength, reg_exp=reg_exp:
                reg_exp.format(cur_tag, x_j[i][-l:].lower(), l)
        )
        for y1 in self.unique_y:
            for x1 in self.unique_x:
                # Words shorter than ``slength`` contribute the whole word.
                # Words sharing a suffix produce the same key, so the dict
                # keeps the index of the most recently appended function.
                s = x1[-slength:]
                # The tag must be read from the ``tag`` default argument.
                # Reading the loop variable ``y1`` from inside the lambda
                # would be resolved at call time, when it holds only the
                # final tag of ``self.unique_y``.
                new_func = (
                    lambda words, curr_tag, prev_tag, i,
                    suff_length=slength, suffix=s, tag=y1:
                        features.suffix(words, curr_tag, prev_tag, i,
                                        suff_length=suff_length,
                                        suffix=suffix, tag=tag)
                )
                self.func_list.append(new_func)
                self.func_filter_dict[reg_exp.format(y1, s, slength)] = len(self.func_list) - 1
def infer_features(self, x, y):
    """Register feature functions and the string keys used to look them up.

    One callable taking ``(words, curr_tag, prev_tag, i)`` is appended to
    ``self.func_list`` (which must already exist) for each of:

    * every ordered pair of tags from ``self.unique_y``
      (delegates to ``features.bigram_tag_label``);
    * every (tag, word) pair from ``self.unique_y`` / ``self.unique_x`` for
      ``ind`` in -1, 0, 1, evaluated at position ``i - ind``
      (delegates to ``features.atomic_tag_label``);
    * every (tag, word) pair again, using the last 3 and then the last 4
      characters of the word (delegates to ``features.suffix``).

    ``self.func_filter_dict`` is rebuilt to map a key string to the index of
    the corresponding callable in ``self.func_list``. ``self.reg_exp`` is
    rebuilt as a list of callables ``(x_j, cur_tag, prev_tag, i)`` that
    produce a key string in the same format for a concrete position.

    ``x`` and ``y`` are accepted for interface compatibility but are not
    read by this method. Returns ``None``.
    """
    self.func_filter_dict = {}
    self.reg_exp = []

    # ---- tag-bigram features: key "ct<tag>pt<tag>" -----------------------
    reg_exp = "ct{}pt{}"
    self.reg_exp.append(
        lambda x_j, cur_tag, prev_tag, i, reg_exp=reg_exp: reg_exp.format(cur_tag, prev_tag)
    )
    for y1 in self.unique_y:
        for y2 in self.unique_y:
            # Loop values are frozen as default arguments so each function
            # keeps its own pair of tags.
            new_func = lambda words, curr_tag, prev_tag, i, y1=y1, y2=y2: features.bigram_tag_label(
                words, curr_tag, prev_tag, i, tag=y1, tag2=y2
            )
            self.func_list.append(new_func)
            self.func_filter_dict[reg_exp.format(y1, y2)] = len(self.func_list) - 1

    # ---- tag/word features: key "ct<tag>w<word>i<ind>" -------------------
    for ind in range(-1, 2):
        reg_exp = "ct{}w{}i{}"
        # NOTE(review): x_j[i - ind] is not bounds-checked here; an index of
        # -1 wraps to the last word and an index of len(x_j) raises
        # IndexError -- confirm callers guard the sentence edges.
        self.reg_exp.append(
            lambda x_j, cur_tag, prev_tag, i, reg_exp=reg_exp, ind=ind: reg_exp.format(
                cur_tag, x_j[i - ind].lower(), ind
            )
        )
        for y1 in self.unique_y:
            for x1 in self.unique_x:
                new_func = lambda words, curr_tag, prev_tag, i, y1=y1, x1=x1, ind=ind: features.atomic_tag_label(
                    words, curr_tag, prev_tag, i - ind, tag=y1, word=x1
                )
                self.func_list.append(new_func)
                self.func_filter_dict[reg_exp.format(y1, x1, ind)] = len(self.func_list) - 1

    # ---- tag/suffix features: key "ct<tag>s<suffix>l<length>" ------------
    for slength in range(3, 5):
        reg_exp = "ct{}s{}l{}"
        self.reg_exp.append(
            lambda x_j, cur_tag, prev_tag, i, l=slength, reg_exp=reg_exp: reg_exp.format(
                cur_tag, x_j[i][-l:].lower(), l
            )
        )
        for y1 in self.unique_y:
            for x1 in self.unique_x:
                # Words shorter than ``slength`` contribute the whole word.
                # Words sharing a suffix produce the same key, so the dict
                # keeps the index of the most recently appended function.
                s = x1[-slength:]
                # The tag must be read from the ``tag`` default argument.
                # Reading the loop variable ``y1`` from inside the lambda
                # would be resolved at call time, when it holds only the
                # final tag of ``self.unique_y``.
                new_func = lambda words, curr_tag, prev_tag, i, suff_length=slength, suffix=s, tag=y1: features.suffix(
                    words, curr_tag, prev_tag, i, suff_length=suff_length, suffix=suffix, tag=tag
                )
                self.func_list.append(new_func)
                self.func_filter_dict[reg_exp.format(y1, s, slength)] = len(self.func_list) - 1