Exemplo n.º 1
0
    def __init__(self, item):
        """
        Compute and store every feature of a single token.

        ``item[0]`` is the token text and ``item[1]`` is kept as ``self.pos``.
        Besides the individual attributes, three aggregate views are built:
        ``features`` and ``feature_names`` (parallel tuples whose last entry
        is the 'POS' one) and ``features_dict`` (name -> value, without 'POS').
        """
        token = item[0]

        # Surface cues reported by the ``ft`` helpers.
        self.capital = ft.contains_capital(token)
        self.digit = ft.contains_digit(token)
        self.hyphen = ft.contains_hyphen(token)

        # Prefixes and suffixes for lengths 1 through 4.
        affix_lengths = (1, 2, 3, 4)
        self.prefix1, self.prefix2, self.prefix3, self.prefix4 = [
            ft.prefix(token, size) for size in affix_lengths
        ]
        self.suffix1, self.suffix2, self.suffix3, self.suffix4 = [
            ft.suffix(token, size) for size in affix_lengths
        ]

        self.shape1 = ft.shape1(token)
        self.shape2 = ft.shape2(token)
        self.word = token
        self.word_lower = token.lower()

        self.pos = item[1]

        # Single source of truth for the name/value pairing; the boolean cues
        # are stored as integers in the aggregate views.
        named_values = [
            ('token', self.word),
            ('token_lower', self.word_lower),
            ('capital', int(self.capital)),
            ('digit', int(self.digit)),
            ('hyphen', int(self.hyphen)),
            ('prefix1', self.prefix1),
            ('prefix2', self.prefix2),
            ('prefix3', self.prefix3),
            ('prefix4', self.prefix4),
            ('suffix1', self.suffix1),
            ('suffix2', self.suffix2),
            ('suffix3', self.suffix3),
            ('suffix4', self.suffix4),
            ('shape1', self.shape1),
            ('shape2', self.shape2),
            ('POS', self.pos),
        ]
        names, values = zip(*named_values)
        self.features = values
        self.feature_names = names
        # The trailing 'POS' pair is deliberately left out of the dict view.
        self.features_dict = dict(named_values[:-1])
    def infer_features(self, x, y):
        """
        Build the feature functions and the tables used to look them up.

        Three feature families are generated, in this order, and appended to
        ``self.func_list`` (which must already exist as a list):

        * tag bigrams: one function per ordered pair of tags in
          ``self.unique_y``;
        * tag/word pairs at the offsets ``ind`` in (-1, 0, 1): one function
          per (tag, word) drawn from ``self.unique_y`` and ``self.unique_x``;
        * tag/suffix pairs for suffix lengths 3 and 4: one function per
          (tag, word), using the last ``slength`` characters of the word.

        ``self.func_filter_dict`` maps a string key to the index of the
        matching function in ``self.func_list``. ``self.reg_exp`` receives one
        callable ``(x_j, cur_tag, prev_tag, i)`` per family/offset/length that
        formats a key with the same template.

        :param x: not used here; words are read from ``self.unique_x``.
        :param y: not used here; tags are read from ``self.unique_y``.
        """
        self.func_filter_dict = {}
        self.reg_exp = []

        # Family 1: (current tag, previous tag).
        reg_exp = 'ct{}pt{}'
        self.reg_exp.append(lambda x_j, cur_tag, prev_tag, i, reg_exp=reg_exp: reg_exp.format(cur_tag, prev_tag))
        for y1 in self.unique_y:
            for y2 in self.unique_y:
                # Loop values are frozen through default arguments so each
                # function keeps its own (y1, y2).
                new_func = lambda words, curr_tag, prev_tag, i, y1=y1, y2=y2: features.bigram_tag_label(words, curr_tag, prev_tag, i, tag=y1, tag2=y2)
                self.func_list.append(new_func)
                self.func_filter_dict[reg_exp.format(y1, y2)] = len(self.func_list) - 1

        # Family 2: (current tag, word at position i - ind).
        # NOTE(review): x_j[i - ind] is not bounds-checked; i - ind == -1 wraps
        # to the last word and an index past the end raises IndexError.
        # NOTE(review): the key lambda lower-cases the word while the dict keys
        # use x1 as-is; they only agree if self.unique_x is lower-case - confirm.
        for ind in range(-1, 2):
            reg_exp = 'ct{}w{}i{}'
            self.reg_exp.append(lambda x_j, cur_tag, prev_tag, i, reg_exp=reg_exp, ind=ind: reg_exp.format(cur_tag, x_j[i - ind].lower(), ind))
            for y1 in self.unique_y:
                for x1 in self.unique_x:
                    new_func = lambda words, curr_tag, prev_tag, i, y1=y1, x1=x1, ind=ind: features.atomic_tag_label(words, curr_tag, prev_tag, i - ind, tag=y1, word=x1)
                    self.func_list.append(new_func)
                    self.func_filter_dict[reg_exp.format(y1, x1, ind)] = len(self.func_list) - 1

        # Family 3: (current tag, suffix of the current word).
        for slength in range(3, 5):
            reg_exp = 'ct{}s{}l{}'
            self.reg_exp.append(lambda x_j, cur_tag, prev_tag, i, l=slength, reg_exp=reg_exp: reg_exp.format(cur_tag, x_j[i][-l:].lower(), l))
            for y1 in self.unique_y:
                for x1 in self.unique_x:
                    # Words that share a suffix each append a function; the
                    # dict entry ends up pointing at the last one appended.
                    s = x1[-slength:]
                    # Pass the bound default ``tag``, not the loop variable
                    # ``y1``: a free ``y1`` is looked up when the function is
                    # called, so every suffix feature would use the last tag.
                    new_func = lambda words, curr_tag, prev_tag, i, suff_length=slength, suffix=s, tag=y1: features.suffix(words, curr_tag, prev_tag, i, suff_length=suff_length, suffix=suffix, tag=tag)
                    self.func_list.append(new_func)
                    self.func_filter_dict[reg_exp.format(y1, s, slength)] = len(self.func_list) - 1
    def infer_features(self, x, y):
        """
        Generate the feature functions and index them by string key.

        Functions are appended to ``self.func_list`` (expected to exist
        already) in three groups:

        1. one per ordered tag pair from ``self.unique_y``;
        2. for each offset ``ind`` in (-1, 0, 1), one per (tag, word) taken
           from ``self.unique_y`` and ``self.unique_x``;
        3. for each suffix length 3 and 4, one per (tag, word), built from the
           last ``slength`` characters of the word.

        ``self.func_filter_dict`` is reset and filled with ``key -> index into
        self.func_list``. ``self.reg_exp`` is reset and filled with callables
        ``(x_j, cur_tag, prev_tag, i)`` that format a key using the same
        template as the corresponding group.

        :param x: not used here; words are read from ``self.unique_x``.
        :param y: not used here; tags are read from ``self.unique_y``.
        """
        self.func_filter_dict = {}
        self.reg_exp = []

        # Group 1: current tag together with the previous tag.
        reg_exp = "ct{}pt{}"
        self.reg_exp.append(lambda x_j, cur_tag, prev_tag, i, reg_exp=reg_exp: reg_exp.format(cur_tag, prev_tag))
        for y1 in self.unique_y:
            for y2 in self.unique_y:
                # Default arguments freeze the loop values for each function.
                new_func = lambda words, curr_tag, prev_tag, i, y1=y1, y2=y2: features.bigram_tag_label(
                    words, curr_tag, prev_tag, i, tag=y1, tag2=y2
                )
                self.func_list.append(new_func)
                self.func_filter_dict[reg_exp.format(y1, y2)] = len(self.func_list) - 1

        # Group 2: current tag together with the word at position i - ind.
        # NOTE(review): x_j[i - ind] is not bounds-checked; an index of -1
        # wraps to the last word and one past the end raises IndexError.
        # NOTE(review): keys built here use x1 unchanged, whereas the key
        # lambda lower-cases the word; confirm self.unique_x is lower-case.
        for ind in range(-1, 2):
            reg_exp = "ct{}w{}i{}"
            self.reg_exp.append(
                lambda x_j, cur_tag, prev_tag, i, reg_exp=reg_exp, ind=ind: reg_exp.format(
                    cur_tag, x_j[i - ind].lower(), ind
                )
            )
            for y1 in self.unique_y:
                for x1 in self.unique_x:
                    new_func = lambda words, curr_tag, prev_tag, i, y1=y1, x1=x1, ind=ind: features.atomic_tag_label(
                        words, curr_tag, prev_tag, i - ind, tag=y1, word=x1
                    )
                    self.func_list.append(new_func)
                    self.func_filter_dict[reg_exp.format(y1, x1, ind)] = len(self.func_list) - 1

        # Group 3: current tag together with the suffix of the current word.
        for slength in range(3, 5):
            reg_exp = "ct{}s{}l{}"
            self.reg_exp.append(
                lambda x_j, cur_tag, prev_tag, i, l=slength, reg_exp=reg_exp: reg_exp.format(
                    cur_tag, x_j[i][-l:].lower(), l
                )
            )
            for y1 in self.unique_y:
                for x1 in self.unique_x:
                    # Several words may yield the same suffix: each still adds
                    # a function, and the dict keeps the most recent index.
                    s = x1[-slength:]
                    # Forward the bound default ``tag``. Using the loop
                    # variable ``y1`` in the body would defer the lookup to
                    # call time and make every suffix feature use the last tag.
                    new_func = lambda words, curr_tag, prev_tag, i, suff_length=slength, suffix=s, tag=y1: features.suffix(
                        words, curr_tag, prev_tag, i, suff_length=suff_length, suffix=suffix, tag=tag
                    )
                    self.func_list.append(new_func)
                    self.func_filter_dict[reg_exp.format(y1, s, slength)] = len(self.func_list) - 1