Exemplo n.º 1
0
    def corpus_generator(self):
        '''
        Yield one token string per position of every sequence in the dataset.

        Tokens found in NO_LABELS are yielded unchanged. Any other token is
        passed through permanentFilter; if the result is neither in
        filter_names nor (lowercased) in self.word_reference, it is replaced
        by assignFilterTag of its lowercase form. When self.lower is truthy
        and the token is still equal to the raw one, it is lowercased.
        '''
        for doc in self.dataset.seq_list:
            for word_id in doc.x:
                raw = doc.sequence_list.x_dict.get_label_name(word_id)
                if raw in NO_LABELS:
                    yield raw
                    continue

                filtered = permanentFilter(raw)
                is_known = (filtered in filter_names
                            or filtered.lower() in self.word_reference)
                if not is_known:
                    filtered = assignFilterTag(filtered.lower())

                # Only lowercase tokens that no filter has rewritten.
                if self.lower and filtered == raw:
                    filtered = filtered.lower()
                yield filtered
    def get_word_cluster_features(self, lower_word, pos, y_name, features):
        '''
        Append cluster-prefix feature ids for a word to `features`.

        BR, START and END get no cluster features. A word that is neither a
        filter name nor a key of self.word_clusters is mapped through
        assignFilterTag; if it is still not a key, RARE is used instead.
        For each prefix length in cluster_prefixes that is strictly shorter
        than the word's bit string, a feature named after the prefix, `pos`
        and `y_name` is registered via self.add_feature; ids equal to -1
        are skipped. Iteration stops at the first prefix length that is not
        shorter than the bit string.

        Returns the same `features` list that was passed in.
        '''
        if lower_word in (BR, START, END):
            return features

        key = lower_word
        if not (key in filter_names or key in self.word_clusters):
            key = assignFilterTag(key)
        if key not in self.word_clusters:
            key = RARE
        bits = self.word_clusters[key]
        n_bits = len(bits)

        for width in cluster_prefixes:
            if width >= n_bits:
                break
            name = "cluster::pref_%i:%i:%s::%s" % (width, pos, y_name, bits[:width])
            fid = self.add_feature(name)
            if fid != -1:
                features.append(fid)
        return features
    def get_word_clusters(self, lower_word):
        '''
        Return the list of cluster bit-string prefixes for a word.

        BR, START and END yield an empty list. A word that is neither a
        filter name nor a key of self.word_clusters is mapped through
        assignFilterTag first; if the result is still not a key, an empty
        list is returned. Otherwise one prefix is collected for each length
        in cluster_prefixes that is strictly shorter than the bit string,
        stopping at the first length that is not.
        '''
        if lower_word in (BR, START, END):
            return []

        key = lower_word
        if not (key in filter_names or key in self.word_clusters):
            key = assignFilterTag(key)
        # Some assignFilterTag outputs never show up in training, so the
        # lookup can still miss here.
        if key not in self.word_clusters:
            return []

        bits = self.word_clusters[key]
        n_bits = len(bits)
        prefixes = []
        for width in cluster_prefixes:
            if width >= n_bits:
                break
            prefixes.append(bits[:width])
        return prefixes
    def get_word_clusters(self, lower_word):
        '''
        Collect the cluster bit-string prefixes available for a word.

        Returns [] for BR, START and END, and for words that (after an
        optional assignFilterTag mapping of words that are neither filter
        names nor keys of self.word_clusters) are not keys of
        self.word_clusters. Otherwise returns bitstream[:n] for each n in
        cluster_prefixes with n < len(bitstream), stopping at the first n
        that fails that test.
        '''
        if lower_word in (BR, START, END):
            return []

        unseen = not (lower_word in filter_names
                      or lower_word in self.word_clusters)
        if unseen:
            lower_word = assignFilterTag(lower_word)
        # assignFilterTag may return tags that were never seen in training.
        if lower_word not in self.word_clusters:
            return []

        bit_string = self.word_clusters[lower_word]
        limit = len(bit_string)
        collected = []
        for size in cluster_prefixes:
            if size >= limit:
                break
            collected.append(bit_string[:size])
        return collected
Exemplo n.º 5
0
    def get_word_cluster_features(self, lower_word, pos, y_name, features):
        '''
        Extend `features` with the ids of the word's cluster-prefix features.

        Nothing is added for BR, START or END. Words that are neither filter
        names nor keys of self.word_clusters go through assignFilterTag, and
        fall back to RARE when the result is still not a key. A feature is
        registered with self.add_feature for every length in cluster_prefixes
        strictly below the bit-string length (stopping at the first length
        that is not); ids equal to -1 are not appended.

        Returns `features` (the list passed in, mutated in place).
        '''
        if lower_word in (BR, START, END):
            return features

        unseen = not (lower_word in filter_names
                      or lower_word in self.word_clusters)
        if unseen:
            lower_word = assignFilterTag(lower_word)
        if lower_word not in self.word_clusters:
            lower_word = RARE

        bit_string = self.word_clusters[lower_word]
        limit = len(bit_string)
        for size in cluster_prefixes:
            if size >= limit:
                break
            feat_id = self.add_feature(
                "cluster::pref_%i:%i:%s::%s" % (size, pos, y_name, bit_string[:size]))
            if feat_id != -1:
                features.append(feat_id)
        return features
    def update_tw(self, sequence, pos_current):
        '''
        Update the trigger-word / trigger-POS counters for one position.

        if B: update outer context (window positions before pos_current)
              and inner
        elif I: update inner
        elif O: only update outer context (window positions from
                pos_current onwards) if it's the first O after an I or B

        The outer window spans TW_WINDOW positions on each side of
        pos_current, clipped to the sequence. Inner counters use the word at
        pos_current itself and are skipped when the label equals BR.
        Words are lowercased, NFKD-normalized and reduced to ASCII; a word
        whose stem is not in self.dataset.stem_vocabulary is replaced by
        assignFilterTag(word). POS tags unknown to self.dataset.pos_dict are
        replaced by NOUN, and tags starting with 's', 'c' or 'd' are not
        counted.

        Counters are nested dicts: label name -> word (or POS tag) -> count.

        NOTE(review): for pos_current == 0 the "previous" label is read from
        index -1, i.e. the last label of the sequence -- confirm callers
        account for that.
        '''
        length = len(sequence.x)
        y_name = sequence.sequence_list.y_dict.get_label_name(sequence.y[pos_current])
        y_1_name = sequence.sequence_list.y_dict.get_label_name(sequence.y[pos_current-1])

        TW_WINDOW = 5
        extremos = range(max(0, pos_current-TW_WINDOW), min(pos_current+TW_WINDOW + 1, length))

        ## outer TRIGGER WORD & POS
        if any(['B'==y_name[0],
                'O'==y_name[0] and 'I'==y_1_name[0],
                'O'==y_name[0] and 'B'==y_1_name[0]]):
            for pos in extremos:
                # After an entity (O): only look at the right-hand context.
                if y_name[0] == 'O' and pos < pos_current:
                    continue
                # At an entity start (B): only look at the left-hand context.
                if y_name[0] == 'B' and pos >= pos_current:
                    continue

                x = sequence.x[pos]
                word = sequence.sequence_list.x_dict.get_label_name(x).lower()
                # NOTE(review): 'unicode_escape' also interprets backslash
                # sequences present in the word -- confirm that is intended.
                word = unicodedata.normalize('NFKD', word).encode('ascii','ignore').decode('unicode_escape')
                stem = stemAugmented(word)
                if stem not in filter_names and stem not in self.dataset.stem_vocabulary:
                    word = assignFilterTag(word)

                pos_id = sequence.pos[pos]
                pos_tag = sequence.sequence_list.pos_dict.get_label_name(pos_id)
                if self.dataset.pos_dict.get_label_id(pos_tag) == -1:
                    pos_tag = NOUN

                # Skip PREPOS ('s'), CONJ ('c') and DETERM ('d').
                if pos_tag[0] in ('s', 'c', 'd'):
                    continue

                word_counts = self.outer_trigger_words.setdefault(y_name, {})
                pos_counts = self.outer_trigger_pos.setdefault(y_name, {})
                # TRIGGER WORD
                word_counts[word] = word_counts.get(word, 0) + 1
                # TRIGGER POS
                pos_counts[pos_tag] = pos_counts.get(pos_tag, 0) + 1

        ## INNER TRIGGER WORD & POS
        if y_name[0] != 'O' and y_name!=BR:
            x = sequence.x[pos_current]
            word = sequence.sequence_list.x_dict.get_label_name(x).lower()
            word = unicodedata.normalize('NFKD', word).encode('ascii','ignore').decode('unicode_escape')

            stem = stemAugmented(word)
            if stem not in self.dataset.stem_vocabulary:
                word = assignFilterTag(word)

            pos_id = sequence.pos[pos_current]
            pos_tag = sequence.sequence_list.pos_dict.get_label_name(pos_id)
            if self.dataset.pos_dict.get_label_id(pos_tag) == -1:
                pos_tag = NOUN

            # Skip PREPOS ('s'), CONJ ('c') and DETERM ('d').
            if pos_tag[0] not in ('s', 'c', 'd'):
                word_counts = self.inner_trigger_words.setdefault(y_name, {})
                pos_counts = self.inner_trigger_pos.setdefault(y_name, {})
                # TRIGGER WORD
                # The count is keyed on the word. The previous membership
                # test used the label, which reset the count to 0 on every
                # call so it never went above 1.
                word_counts[word] = word_counts.get(word, 0) + 1
                # TRIGGER POS
                pos_counts[pos_tag] = pos_counts.get(pos_tag, 0) + 1
Exemplo n.º 7
0
    def update_tw(self, sequence, pos_current):
        '''
        Update the trigger-word / trigger-POS counters for one position.

        if B: update outer context (window positions before pos_current)
              and inner
        elif I: update inner
        elif O: only update outer context (window positions from
                pos_current onwards) if it's the first O after an I or B

        The outer window spans TW_WINDOW positions on each side of
        pos_current, clipped to the sequence. Inner counters use the word at
        pos_current itself and are skipped when the label equals BR.
        Words are lowercased, NFKD-normalized and reduced to ASCII; a word
        whose stem is not in self.dataset.stem_vocabulary is replaced by
        assignFilterTag(word). POS tags unknown to self.dataset.pos_dict are
        replaced by NOUN, and tags starting with 's', 'c' or 'd' are not
        counted.

        Counters are nested dicts: label name -> word (or POS tag) -> count.

        NOTE(review): for pos_current == 0 the "previous" label is read from
        index -1, i.e. the last label of the sequence -- confirm callers
        account for that.
        '''
        length = len(sequence.x)
        y_name = sequence.sequence_list.y_dict.get_label_name(
            sequence.y[pos_current])
        y_1_name = sequence.sequence_list.y_dict.get_label_name(
            sequence.y[pos_current - 1])

        TW_WINDOW = 5
        extremos = range(max(0, pos_current - TW_WINDOW),
                         min(pos_current + TW_WINDOW + 1, length))

        ## outer TRIGGER WORD & POS
        if any([
                'B' == y_name[0], 'O' == y_name[0] and 'I' == y_1_name[0],
                'O' == y_name[0] and 'B' == y_1_name[0]
        ]):
            for pos in extremos:
                # After an entity (O): only look at the right-hand context.
                if y_name[0] == 'O' and pos < pos_current:
                    continue
                # At an entity start (B): only look at the left-hand context.
                if y_name[0] == 'B' and pos >= pos_current:
                    continue

                x = sequence.x[pos]
                word = sequence.sequence_list.x_dict.get_label_name(x).lower()
                # NOTE(review): 'unicode_escape' also interprets backslash
                # sequences present in the word -- confirm that is intended.
                word = unicodedata.normalize('NFKD', word).encode(
                    'ascii', 'ignore').decode('unicode_escape')
                stem = stemAugmented(word)
                if stem not in filter_names and stem not in self.dataset.stem_vocabulary:
                    word = assignFilterTag(word)

                pos_id = sequence.pos[pos]
                pos_tag = sequence.sequence_list.pos_dict.get_label_name(
                    pos_id)
                if self.dataset.pos_dict.get_label_id(pos_tag) == -1:
                    pos_tag = NOUN

                # Skip PREPOS ('s'), CONJ ('c') and DETERM ('d').
                if pos_tag[0] in ('s', 'c', 'd'):
                    continue

                word_counts = self.outer_trigger_words.setdefault(y_name, {})
                pos_counts = self.outer_trigger_pos.setdefault(y_name, {})
                # TRIGGER WORD
                word_counts[word] = word_counts.get(word, 0) + 1
                # TRIGGER POS
                pos_counts[pos_tag] = pos_counts.get(pos_tag, 0) + 1

        ## INNER TRIGGER WORD & POS
        if y_name[0] != 'O' and y_name != BR:
            x = sequence.x[pos_current]
            word = sequence.sequence_list.x_dict.get_label_name(x).lower()
            word = unicodedata.normalize('NFKD', word).encode(
                'ascii', 'ignore').decode('unicode_escape')

            stem = stemAugmented(word)
            if stem not in self.dataset.stem_vocabulary:
                word = assignFilterTag(word)

            pos_id = sequence.pos[pos_current]
            pos_tag = sequence.sequence_list.pos_dict.get_label_name(pos_id)
            if self.dataset.pos_dict.get_label_id(pos_tag) == -1:
                pos_tag = NOUN

            # Skip PREPOS ('s'), CONJ ('c') and DETERM ('d').
            if pos_tag[0] not in ('s', 'c', 'd'):
                word_counts = self.inner_trigger_words.setdefault(y_name, {})
                pos_counts = self.inner_trigger_pos.setdefault(y_name, {})
                # TRIGGER WORD
                # The count is keyed on the word. The previous membership
                # test used the label, which reset the count to 0 on every
                # call so it never went above 1.
                word_counts[word] = word_counts.get(word, 0) + 1
                # TRIGGER POS
                pos_counts[pos_tag] = pos_counts.get(pos_tag, 0) + 1