def corpus_generator(self):
    """Yield the training corpus one normalised token at a time.

    Walks every sequence in ``self.dataset.seq_list`` and every word id in
    its ``x`` list, yielding exactly one string per token:

    * tokens found in ``NO_LABELS`` are yielded untouched;
    * every other token goes through ``permanentFilter``; if the result is
      neither a filter name nor (lower-cased) present in
      ``self.word_reference`` it is replaced by
      ``assignFilterTag`` of its lower-cased form;
    * when ``self.lower`` is truthy and the token came through unchanged,
      it is lower-cased before being yielded.
    """
    for sequence in self.dataset.seq_list:
        for word_id in sequence.x:
            raw_token = sequence.sequence_list.x_dict.get_label_name(word_id)

            # Structural markers bypass all filtering.
            if raw_token in NO_LABELS:
                yield raw_token
                continue

            filtered = permanentFilter(raw_token)
            is_filter_tag = filtered in filter_names
            if not is_filter_tag and filtered.lower() not in self.word_reference:
                filtered = assignFilterTag(filtered.lower())

            # Only lower-case tokens that no filter rewrote.
            if self.lower and filtered == raw_token:
                filtered = filtered.lower()
            yield filtered
def get_word_cluster_features(self, lower_word, pos, y_name, features):
    """Append cluster-prefix feature ids for ``lower_word`` to ``features``.

    Args:
        lower_word: the (already lower-cased) word to look up.
        pos: integer position written into the feature name.
        y_name: label name written into the feature name.
        features: list of feature ids; extended in place.

    Returns:
        The same ``features`` list that was passed in.

    The sentinel tokens ``BR``, ``START`` and ``END`` produce no features.
    A word missing from ``self.word_clusters`` is first mapped through
    ``assignFilterTag`` (unless it is already a filter name) and, if still
    missing, replaced by ``RARE``.
    """
    if lower_word in (BR, START, END):
        return features

    lookup_key = lower_word
    if lookup_key not in filter_names and lookup_key not in self.word_clusters:
        lookup_key = assignFilterTag(lookup_key)
    if lookup_key not in self.word_clusters:
        lookup_key = RARE

    # NOTE(review): assumes RARE always has an entry in self.word_clusters.
    bitstream = self.word_clusters[lookup_key]
    for pref in cluster_prefixes:
        # Stop at the first prefix length that is not strictly shorter
        # than the cluster string itself.
        if pref >= len(bitstream):
            break
        feat_id = self.add_feature(
            "cluster::pref_%i:%i:%s::%s" % (pref, pos, y_name, bitstream[:pref])
        )
        # add_feature signals "feature not registered" with -1.
        if feat_id != -1:
            features.append(feat_id)
    return features
def get_word_clusters(self, lower_word):
    """Return the cluster-string prefixes available for ``lower_word``.

    Args:
        lower_word: the (already lower-cased) word to look up.

    Returns:
        A list with ``bitstream[:pref]`` for each leading entry of
        ``cluster_prefixes`` that is strictly smaller than the length of
        the word's cluster string. The list is empty for the sentinel
        tokens ``BR``/``START``/``END`` and for words that have no entry
        in ``self.word_clusters`` even after filter tagging.
    """
    if lower_word in (BR, START, END):
        return []

    key = lower_word
    if key not in filter_names and key not in self.word_clusters:
        key = assignFilterTag(key)

    # Some tags produced by assignFilterTag were never seen in training,
    # so the tagged form can still be missing from the cluster table.
    if key not in self.word_clusters:
        return []

    bitstream = self.word_clusters[key]
    prefixes = []
    for width in list(cluster_prefixes):
        if width >= len(bitstream):
            break
        prefixes.append(bitstream[:width])
    return prefixes
def get_word_clusters(self, lower_word):
    """Collect the prefixes of the cluster string assigned to a word.

    Args:
        lower_word: the (already lower-cased) word to look up.

    Returns:
        ``[bitstream[:p], ...]`` for the leading values ``p`` of
        ``cluster_prefixes`` that are strictly less than
        ``len(bitstream)``; an empty list when the word is one of
        ``BR``/``START``/``END`` or cannot be found in
        ``self.word_clusters``.
    """
    if lower_word in (BR, START, END):
        return []

    unknown = lower_word not in self.word_clusters
    if lower_word not in filter_names and unknown:
        lower_word = assignFilterTag(lower_word)

    # assignFilterTag can return tags that never occurred in training, in
    # which case there is still nothing to look up.
    if lower_word not in self.word_clusters:
        return []

    bitstream = self.word_clusters[lower_word]
    cluster_length = len(bitstream)
    collected = []
    for prefix_len in list(cluster_prefixes):
        if not prefix_len < cluster_length:
            break
        collected.append(bitstream[:prefix_len])
    return collected
def get_word_cluster_features(self, lower_word, pos, y_name, features):
    """Register cluster-prefix features for a word and collect their ids.

    Args:
        lower_word: the (already lower-cased) word to look up.
        pos: integer position embedded in each feature name.
        y_name: label name embedded in each feature name.
        features: list that receives the new feature ids (mutated in place).

    Returns:
        ``features`` itself, after any ids have been appended.

    Nothing is added for ``BR``, ``START`` or ``END``. Words absent from
    ``self.word_clusters`` are retried as their ``assignFilterTag`` form
    (unless already a filter name) and finally fall back to ``RARE``.
    """
    if lower_word in (BR, START, END):
        return features

    clusters = self.word_clusters
    if lower_word not in filter_names and lower_word not in clusters:
        lower_word = assignFilterTag(lower_word)
    if lower_word not in clusters:
        lower_word = RARE

    # NOTE(review): assumes RARE always has an entry in self.word_clusters.
    bitstream = clusters[lower_word]
    total_bits = len(bitstream)
    for pref in cluster_prefixes:
        # Prefix lengths are consumed in order until one no longer fits
        # strictly inside the cluster string.
        if not pref < total_bits:
            break
        feat_name = "cluster::pref_%i:%i:%s::%s" % (
            pref, pos, y_name, bitstream[:pref])
        feat_id = self.add_feature(feat_name)
        if feat_id == -1:
            # -1 from add_feature means there is no id to record.
            continue
        features.append(feat_id)
    return features
def update_tw(self, sequence, pos_current):
    """Update the trigger-word / trigger-POS counters for one token.

    The first character of the label at ``pos_current`` selects what is
    counted:

    * ``B``: tokens up to ``TW_WINDOW`` positions to the left feed the
      *outer* counters; the token itself feeds the *inner* counters.
    * ``O`` directly after a ``B``/``I`` label: the token itself and up to
      ``TW_WINDOW`` tokens to its right feed the *outer* counters.
    * any other label not starting with ``O`` and different from ``BR``:
      only the token itself feeds the *inner* counters.
    * any other ``O``: nothing is updated.

    Tokens whose POS tag starts with ``s``, ``c`` or ``d`` are never
    counted. Counts are stored as ``{label: {key: count}}`` in
    ``self.outer_trigger_words``, ``self.outer_trigger_pos``,
    ``self.inner_trigger_words`` and ``self.inner_trigger_pos``.

    Args:
        sequence: sequence object exposing ``x``, ``y``, ``pos`` and
            ``sequence_list`` (with ``x_dict``, ``y_dict``, ``pos_dict``).
        pos_current: index of the token being processed.
    """
    length = len(sequence.x)
    y_dict = sequence.sequence_list.y_dict
    y_name = y_dict.get_label_name(sequence.y[pos_current])
    # NOTE(review): for pos_current == 0 this reads sequence.y[-1], i.e.
    # the label of the last token; confirm sequences start with padding.
    y_1_name = y_dict.get_label_name(sequence.y[pos_current - 1])

    TW_WINDOW = 5
    extremos = range(max(0, pos_current - TW_WINDOW),
                     min(pos_current + TW_WINDOW + 1, length))
    skipped_pos_initials = ('s', 'c', 'd')  # PREPOS, CONJ, DETERM

    ## OUTER TRIGGER WORD & POS
    opens_entity = y_name[0] == 'B'
    follows_entity = y_name[0] == 'O' and y_1_name[0] in ('I', 'B')
    if opens_entity or follows_entity:
        for pos in extremos:
            # After an entity only the right-hand context counts ...
            if follows_entity and pos < pos_current:
                continue
            # ... and before an entity only the left-hand context.
            if opens_entity and pos >= pos_current:
                continue

            word = sequence.sequence_list.x_dict.get_label_name(
                sequence.x[pos]).lower()
            # Strip accents / non-ASCII characters from the word.
            word = unicodedata.normalize('NFKD', word).encode(
                'ascii', 'ignore').decode('unicode_escape')
            stem = stemAugmented(word)
            if stem not in filter_names and stem not in self.dataset.stem_vocabulary:
                word = assignFilterTag(word)

            pos_tag = sequence.sequence_list.pos_dict.get_label_name(
                sequence.pos[pos])
            # POS tags unknown to the dataset dictionary default to NOUN.
            if self.dataset.pos_dict.get_label_id(pos_tag) == -1:
                pos_tag = NOUN
            if pos_tag[0] in skipped_pos_initials:
                continue

            word_counts = self.outer_trigger_words.setdefault(y_name, {})
            pos_counts = self.outer_trigger_pos.setdefault(y_name, {})
            word_counts[word] = word_counts.get(word, 0) + 1
            pos_counts[pos_tag] = pos_counts.get(pos_tag, 0) + 1

    ## INNER TRIGGER WORD & POS
    if y_name[0] != 'O' and y_name != BR:
        word = sequence.sequence_list.x_dict.get_label_name(
            sequence.x[pos_current]).lower()
        word = unicodedata.normalize('NFKD', word).encode(
            'ascii', 'ignore').decode('unicode_escape')
        stem = stemAugmented(word)
        if stem not in self.dataset.stem_vocabulary:
            word = assignFilterTag(word)

        pos_tag = sequence.sequence_list.pos_dict.get_label_name(
            sequence.pos[pos_current])
        if self.dataset.pos_dict.get_label_id(pos_tag) == -1:
            pos_tag = NOUN

        if pos_tag[0] not in skipped_pos_initials:
            word_counts = self.inner_trigger_words.setdefault(y_name, {})
            pos_counts = self.inner_trigger_pos.setdefault(y_name, {})
            # The count is keyed on the word. The previous code tested
            # ``y_name`` for membership here, which reset the count to 0
            # on every call and left every inner word count at 1.
            word_counts[word] = word_counts.get(word, 0) + 1
            pos_counts[pos_tag] = pos_counts.get(pos_tag, 0) + 1
def update_tw(self, sequence, pos_current):
    """Accumulate trigger-word and trigger-POS counts around one token.

    Which counters change depends on the first character of the label at
    ``pos_current``:

    * ``B``: the left context (up to ``TW_WINDOW`` tokens) is added to the
      *outer* counters and the token itself to the *inner* counters.
    * ``O`` whose previous label starts with ``B`` or ``I``: the token and
      its right context (up to ``TW_WINDOW`` tokens) are added to the
      *outer* counters.
    * any other label that does not start with ``O`` and is not ``BR``:
      only the token itself is added to the *inner* counters.
    * any other ``O``: no counter changes.

    Tokens whose POS tag begins with ``s``, ``c`` or ``d`` are skipped.
    All four counters (``self.outer_trigger_words``,
    ``self.outer_trigger_pos``, ``self.inner_trigger_words``,
    ``self.inner_trigger_pos``) map ``label -> {key: count}``.

    Args:
        sequence: sequence object exposing ``x``, ``y``, ``pos`` and
            ``sequence_list`` (with ``x_dict``, ``y_dict``, ``pos_dict``).
        pos_current: index of the token being processed.
    """
    length = len(sequence.x)
    label_dict = sequence.sequence_list.y_dict
    y_name = label_dict.get_label_name(sequence.y[pos_current])
    # NOTE(review): when pos_current is 0 the index below is -1, which
    # wraps to the final label of the sequence; verify against callers.
    y_1_name = label_dict.get_label_name(sequence.y[pos_current - 1])

    TW_WINDOW = 5
    extremos = range(max(0, pos_current - TW_WINDOW),
                     min(pos_current + TW_WINDOW + 1, length))
    ignored_pos_initials = ('s', 'c', 'd')  # PREPOS, CONJ, DETERM

    ## OUTER TRIGGER WORD & POS
    is_begin = y_name[0] == 'B'
    is_first_outside = y_name[0] == 'O' and y_1_name[0] in ('I', 'B')
    if is_begin or is_first_outside:
        for pos in extremos:
            # First O after an entity: keep only positions >= pos_current.
            if is_first_outside and pos < pos_current:
                continue
            # B: keep only positions strictly before pos_current.
            if is_begin and pos >= pos_current:
                continue

            x = sequence.x[pos]
            word = sequence.sequence_list.x_dict.get_label_name(x).lower()
            # Strip accents / non-ASCII characters from the word.
            word = unicodedata.normalize('NFKD', word).encode(
                'ascii', 'ignore').decode('unicode_escape')
            stem = stemAugmented(word)
            if stem not in filter_names and stem not in self.dataset.stem_vocabulary:
                word = assignFilterTag(word)

            pos_id = sequence.pos[pos]
            pos_tag = sequence.sequence_list.pos_dict.get_label_name(pos_id)
            # POS tags unknown to the dataset dictionary default to NOUN.
            if self.dataset.pos_dict.get_label_id(pos_tag) == -1:
                pos_tag = NOUN
            if pos_tag[0] in ignored_pos_initials:
                continue

            outer_words = self.outer_trigger_words.setdefault(y_name, {})
            outer_pos = self.outer_trigger_pos.setdefault(y_name, {})
            outer_words[word] = outer_words.get(word, 0) + 1
            outer_pos[pos_tag] = outer_pos.get(pos_tag, 0) + 1

    ## INNER TRIGGER WORD & POS
    if y_name[0] != 'O' and y_name != BR:
        x = sequence.x[pos_current]
        word = sequence.sequence_list.x_dict.get_label_name(x).lower()
        word = unicodedata.normalize('NFKD', word).encode(
            'ascii', 'ignore').decode('unicode_escape')
        stem = stemAugmented(word)
        if stem not in self.dataset.stem_vocabulary:
            word = assignFilterTag(word)

        pos_id = sequence.pos[pos_current]
        pos_tag = sequence.sequence_list.pos_dict.get_label_name(pos_id)
        if self.dataset.pos_dict.get_label_id(pos_tag) == -1:
            pos_tag = NOUN

        if pos_tag[0] not in ignored_pos_initials:
            inner_words = self.inner_trigger_words.setdefault(y_name, {})
            inner_pos = self.inner_trigger_pos.setdefault(y_name, {})
            # Fixed: the membership test used to check ``y_name`` instead
            # of ``word``, so the count was zeroed before each increment
            # and every inner word count stayed at 1.
            inner_words[word] = inner_words.get(word, 0) + 1
            inner_pos[pos_tag] = inner_pos.get(pos_tag, 0) + 1