def feature_extractor(X):
    """Apply the attribute templates, then add per-item character features.

    Every item of ``X`` is expected to expose its character under ``'c'`` and
    its feature list under ``'F'``; two strings are appended to that list:
    ``"punct=" + is_punct(c)`` and ``"class=" + char_class(c)``.

    Args:
        X: Sequence of item dicts; modified in place.
    """
    # Apply attribute templates to obtain features.
    crfutils.apply_templates(X, templates)

    # Plain loops rather than map(): on Python 3 map() is lazy, so calling it
    # only for its side effects would silently append nothing.

    # Add the is_punct() feature.
    for x in X:
        x['F'].append("punct=" + is_punct(x['c']))

    # Add the char_class() feature.
    for x in X:
        x['F'].append("class=" + char_class(x['c']))
def feature_extractor(X):
    """Generate features (in fact, attributes) for ``X`` from the templates.

    After the templates have been applied, the first item's ``'F'`` list
    receives ``'__BOS__'`` and the last item's ``'F'`` list receives
    ``'__EOS__'``. ``X`` is modified in place.
    """
    crfutils.apply_templates(X, templates)

    # Nothing to mark on an empty sequence.
    if not X:
        return

    # Boundary features are added by hand.
    head = X[0]
    head['F'].append('__BOS__')  # beginning of sequence
    tail = X[-1]
    tail['F'].append('__EOS__')  # end of sequence
def feature_extractor(X):
    """Run ``observation`` on every item, apply templates, mark boundaries.

    ``X`` is modified in place: the first item's ``'F'`` list gets
    ``'__BOS__'`` and the last item's gets ``'__EOS__'``.
    """
    for item in X:
        observation(item)

    crfutils.apply_templates(X, templates)

    if not X:
        return

    # Sequence-boundary markers.
    for item, marker in ((X[0], '__BOS__'), (X[-1], '__EOS__')):
        item['F'].append(marker)
def FeatureExtractor(X):
    """Apply attribute templates to obtain features (in fact, attributes).

    ``Featurizer`` is then run over ``X``, and finally the boundary markers
    ``'__BOS__'`` / ``'__EOS__'`` are appended to the ``'F'`` lists of the
    first and last items. ``X`` is modified in place.
    """
    crfutils.apply_templates(X, templates)
    Featurizer(X)

    if not X:
        return

    first_item = X[0]
    first_item['F'].append('__BOS__')  # BOS feature
    last_item = X[-1]
    last_item['F'].append('__EOS__')  # EOS feature
def FeatureExtractor(X):
    """Apply attribute templates to obtain features (in fact, attributes).

    Steps, all operating on ``X`` in place:
      1. ``crfutils.apply_templates`` with the module-level templates.
      2. ``Featurizer``.
      3. ``'__BOS__'`` / ``'__EOS__'`` appended to the ``'F'`` lists of the
         first and last items when ``X`` is non-empty.
    """
    crfutils.apply_templates(X, templates)
    Featurizer(X)

    if X:
        for item, boundary in ((X[0], '__BOS__'), (X[-1], '__EOS__')):
            item['F'].append(boundary)
def feature_extractor(X):
    """Add observations, template features and boundary markers to ``X``.

    ``X`` is modified in place; nothing is returned.
    """
    # Observations come first so the templates can refer to them.
    for token in X:
        observation(token)

    # Feature templates.
    crfutils.apply_templates(X, templates)

    if not X:
        return

    # BOS / EOS features are appended manually.
    X[0]['F'].append('__BOS__')
    X[-1]['F'].append('__EOS__')
def feature_extractor(X):
    """Add observations, template, disjunctive and boundary features to ``X``.

    For every position the ``disjunctive`` helper is called on field ``'w'``
    twice: once for the offsets -4..-1 and once for the offsets 1..4.
    ``X`` is modified in place.
    """
    # Observations.
    for token in X:
        observation(token)

    # Feature templates.
    crfutils.apply_templates(X, templates)

    # Disjunctive features over the left and right context windows.
    context_windows = ((-4, -1), (1, 4))
    for position in range(len(X)):
        for start, end in context_windows:
            disjunctive(X, position, 'w', start, end)

    # BOS and EOS features.
    if not X:
        return
    X[0]['F'].append('__BOS__')
    X[-1]['F'].append('__EOS__')
def feature_extractor(self, X):
    """Populate the feature lists of ``X`` in place.

    Order of operations:
      1. ``self.observation`` on each item, then the regex and gazetteer
         observation generators (the latter with ``self.gazetteer``).
      2. ``crfutils.apply_templates`` with ``self.templates``.
      3. Per position: disjunctive features on ``'w'`` for offsets -4..-1 and
         1..4, plus ``regexp_features`` when ``self.use_fregex`` is truthy.
      4. ``'__BOS__'`` / ``'__EOS__'`` on the first / last item's ``'F'`` list.
    """
    # Observations.
    for token in X:
        self.observation(token)
    gen_regex_observation(X)
    gen_gazetteer_observation(X, gazetteer=self.gazetteer)

    # Feature templates.
    crfutils.apply_templates(X, self.templates)

    # Disjunctive (and optional regex) features.
    context_windows = ((-4, -1), (1, 4))
    for position in range(len(X)):
        for start, end in context_windows:
            disjunctive(X, position, 'w', start, end)
        if self.use_fregex:
            regexp_features(X, position)

    # BOS and EOS features.
    if not X:
        return
    X[0]['F'].append('__BOS__')
    X[-1]['F'].append('__EOS__')
def feature_extractor(X):
    """Apply the templates, then add a punctuation feature to every item.

    Every item of ``X`` is expected to expose its character under ``'c'`` and
    its feature list under ``'F'``; ``"punct=" + is_punct(c)`` is appended to
    that list.

    Args:
        X: Sequence of item dicts; modified in place.
    """
    # Apply templates to obtain features.
    crfutils.apply_templates(X, templates)

    # Add the is_punct() feature. A plain loop is used rather than map():
    # on Python 3 map() is lazy, so the appends would never be executed.
    for x in X:
        x['F'].append("punct=" + is_punct(x['c']))
def feature_extractor(sentence):
    """Apply the module-level ``attribute_templates`` to ``sentence``.

    The work is delegated entirely to ``crfutils.apply_templates``; this
    function itself returns nothing.
    """
    active_templates = attribute_templates
    crfutils.apply_templates(sentence, active_templates)
def extract_features(self, seq, content_type):
    """Build one ``"label<TAB>feature<TAB>feature..."`` line per token.

    Which feature families are produced is read from ``self.params`` using a
    key prefix that depends on ``content_type`` (e.g. ``blog.use_postag``).

    Args:
        seq: Sequence of entries indexed as ``[0]`` word, ``[1]`` POS tag and
            ``[2]`` label. The first and last two entries are used only as
            context for a five-token window and produce no output line
            (presumably padding added by the caller -- confirm).
        content_type: One of ``'Blog'``, ``'SocialMediaPosting'``,
            ``'NewsArticle'`` or ``'Post'``.

    Returns:
        A list of strings, one per non-context token, in sequence order.

    Raises:
        ValueError: If ``content_type`` is not one of the supported values.
            (Previously this surfaced as an ``UnboundLocalError``.)
    """
    # Configuration-key prefix for each supported content type.
    config_prefixes = {
        'Blog': 'blog',
        'SocialMediaPosting': 'tweet',
        'NewsArticle': 'news',
        'Post': 'dw',
    }
    prefix = config_prefixes.get(content_type)
    if prefix is None:
        raise ValueError("unsupported content_type: %r" % (content_type,))

    def enabled(option):
        return self.params.get_boolean('%s.%s' % (prefix, option))

    use_brown_clusters = enabled('use_brown_clusters')
    use_traditional_features = enabled('use_traditional_features')
    use_idf_word_features = enabled('use_idf_word_features')
    use_embeddings = enabled('use_embeddings')
    use_lowercase_embeddings = enabled('use_lowercase_embeddings')
    use_postag = enabled('use_postag')

    if use_traditional_features:
        # ner_seq has the same length as seq; ner_seq[i]['F'] is the list of
        # template-generated features for token i.
        ner_seq = [{'w': x[0], 'F': []} for x in seq]
        for x in ner_seq:
            ner.observation(x)
        crfutils.apply_templates(ner_seq, self.templates)

    # Five-token window: feature-name prefix and offset relative to i.
    window_names = ("U00", "U01", "U02", "U03", "U04")
    window_offsets = (-2, -1, 0, 1, 2)

    ret = []
    for i in range(2, len(seq) - 2):
        fs = []  # list of features for this word

        if use_traditional_features:
            fs.extend(ner_seq[i]['F'])

        if use_idf_word_features:
            # Subtract two to get the "real" index.
            fs.extend(self.get_idf_word_features(seq[i][0], i - 2))

        # This will only be non-empty if word lists are specified.
        fs.extend(self.get_word_list_features(seq[i][0]))

        if use_brown_clusters:
            # Size is the number of bits in the brown prefix returned.
            for size in [8, 12, 16, 20]:
                # Get results for the five-word window.
                for index in window_offsets:
                    bc = self.get_brown_prefix(seq[i + index][0], size)
                    if bc:
                        fs.append("brown.%d.%d=%s" % (size, index, bc))

        # Embedding features look like "U02e0-17=1:<value>": window slot,
        # embedding table index, dimension index, then the value.
        # TODO : try changing the %g to %f
        if use_embeddings:
            for j, embedding in enumerate(self.word_to_embedding[content_type]):
                for name, offset in zip(window_names, window_offsets):
                    w = seq[i + offset][0]  # word at that position
                    if w not in embedding:
                        # Default all OOV words to the UNKNOWN embedding.
                        w = "*UNKNOWN*"
                    for d, value in enumerate(embedding[w]):
                        fs.append("%se%d-%d=1:%g" % (name, j, d, value))

        if use_lowercase_embeddings:
            for j, embedding in enumerate(self.word_to_embedding[content_type]):
                for name, offset in zip(window_names, window_offsets):
                    wlc = seq[i + offset][0].lower()
                    if wlc not in embedding:
                        # Default all OOV words to the UNKNOWN embedding.
                        wlc = "*UNKNOWN*"
                    for d, value in enumerate(embedding[wlc]):
                        fs.append("%slce%d-%d=1:%g" % (name, j, d, value))

        if use_postag:
            # POS tags of the window: two left, current, two right.
            left2 = seq[i - 2][1]
            left1 = seq[i - 1][1]
            current = seq[i][1]
            right1 = seq[i + 1][1]
            right2 = seq[i + 2][1]
            # Unigrams.
            fs.append('U10=%s' % left2)
            fs.append('U11=%s' % left1)
            fs.append('U12=%s' % current)
            fs.append('U13=%s' % right1)
            fs.append('U14=%s' % right2)
            # Bigrams.
            fs.append('U15=%s/%s' % (left2, left1))
            fs.append('U16=%s/%s' % (left1, current))
            fs.append('U17=%s/%s' % (current, right1))
            fs.append('U18=%s/%s' % (right1, right2))
            # Trigrams.
            fs.append('U20=%s/%s/%s' % (left2, left1, current))
            fs.append('U21=%s/%s/%s' % (left1, current, right1))
            fs.append('U22=%s/%s/%s' % (current, right1, right2))

        # Example label, followed by the feature vector.
        ret.append("%s\t%s" % (seq[i][2], '\t'.join(fs)))
    return ret
def feature_extractor(X):
    """Apply the templates to ``X`` and mark its first and last items.

    ``'__BOS__'`` is appended to the first item's ``'F'`` list and
    ``'__EOS__'`` to the last item's. An empty ``X`` gets no markers.
    """
    crfutils.apply_templates(X, templates)

    if not X:
        return

    boundary_markers = (
        (X[0], '__BOS__'),   # BOS feature
        (X[-1], '__EOS__'),  # EOS feature
    )
    for item, marker in boundary_markers:
        item['F'].append(marker)
# Map each feature name to its list of attribute templates, preserving the
# order given by feature_set.
features = OrderedDict()
for entry in feature_set:
    features[entry[0]] = entry[1]

# Features listed in `trigrams` get three single-attribute templates, at
# offsets -1, 0 and +1.
for w in list(features):
    if w in trigrams:
        features[w] = [[[w, -1]], [[w, 0]], [[w, 1]]]

feature_keys = features.keys()
feature_items = features.values()

# Input column layout: one column per feature, then 'chunk' and 'y'.
input_columns = ' '.join(feature_keys) + ' chunk y'

# Flatten the per-feature template lists into one list.
attribute_templates = []
for template_group in feature_items:
    attribute_templates += template_group
print("Using features: {}.".format(str(attribute_templates)))


def feature_extractor(x):
    """Apply the selected attribute templates to the sequence ``x``."""
    return crfutils.apply_templates(x, attribute_templates)


try:
    for fi, txt in [(train_csv, "train"), (devel_csv, "devel"), (test_csv, "test")]:
        write_to = path.join(trigram_path, txt + "_trigrams_" + str(mode) + ".crfsuite")
        print("Writing to {}...".format(write_to))
        # The context manager closes the output even if crfutils.main raises.
        with open(write_to, "w+") as fo:
            crfutils.main(feature_extractor, fi, fo, fields=input_columns, sep='\t')
finally:
    # Release the input files on failure as well as on success.
    train_csv.close()
    devel_csv.close()
    test_csv.close()
def feature_extractor(X):
    """Apply the module-level attribute templates to ``X``.

    The templates yield the features (in fact, attributes); the work is
    delegated to ``crfutils.apply_templates`` and nothing is returned.
    """
    active_templates = templates
    crfutils.apply_templates(X, active_templates)