def generate_features(self):
    """Build one feature vector per post for the configured topic.

    Iterates every 'convinceme' discussion whose topic annotation matches
    ``self.topic``, loads the saved parse (POS tags, parse tree,
    dependencies) for each post from disk, derives boundary/unigram/
    dependency features, and appends each resulting ``defaultdict(int)``
    to ``self.feature_vectors``. Posts whose parse file is missing, or
    whose text cannot be wrapped in a ``TextObj``, are skipped.
    """
    dataset = Dataset('convinceme',
                      annotation_list=['topic', 'dependencies',
                                       'used_in_wassa2011', 'side'])
    directory = "{}/convinceme/output_by_thread".format(data_root_dir)
    for discussion in dataset.get_discussions(annotation_label='topic'):
        if self.topic != discussion.annotations['topic']:
            continue
        for post in discussion.get_posts():
            feature_vector = defaultdict(int)
            post.discussion_id = discussion.id
            post.topic_side = get_topic_side(discussion, post.side)
            post.key = str((discussion.id, post.id))
            # the class label for this post (which side of the topic it argues)
            feature_vector[self.classification_feature] = post.topic_side
            try:
                json_file = "{}/{}/{}.json".format(directory,
                                                   discussion.id, post.id)
                # context manager so the file handle is closed promptly
                # (the original leaked an open file per post)
                with open(json_file, 'r') as fh:
                    pos, parsetree, dep, ident = json.load(fh)
                result = sorted(feat_vect(dep, pos, feature_vector),
                                key=operator.itemgetter(0))
                try:
                    text = TextObj(post.text.decode('utf-8', 'replace'))
                except Exception:
                    # best-effort: skip posts whose text TextObj rejects
                    continue
                self.bounds.add(discussion_id=discussion.id,
                                post_id=post.id,
                                text=text.text, tuples=result)
                uni_from_boundaries(text.text, result, feature_vector)
                dependency_list = (post.annotations['dependencies']
                                   if 'dependencies' in post.annotations
                                   else None)
                if 'unigram' in self.features:
                    ngrams_from_text(text.text.lower(), feature_vector,
                                     prefix="uni_lower_", n=1, style='float')
                    ngrams_from_text(text.text, feature_vector,
                                     prefix="uni_caps_", n=1, style='float')
                # everything except 'unigram' is handled generically
                feats = set(self.features).difference(set(['unigram']))
                get_features_by_type(feature_vector=feature_vector,
                                     features=feats, text_obj=text,
                                     dependency_list=dependency_list)
                if dependency_list is None:
                    continue
                if 'dependencies' in self.features:
                    get_dependency_features(feature_vector, dependency_list,
                                            generalization='opinion')
                if DELETE_QUOTE:
                    # drop quote-scoped duplicates of plain unigram features
                    unigrams = [k[8:] for k in feature_vector.keys()
                                if k.startswith('unigram:')]
                    for unigram in unigrams:
                        key = 'quote: {}'.format(unigram)
                        if key in feature_vector:
                            del feature_vector[key]
                self.feature_vectors.append(feature_vector)
            except IOError:
                # XXX TODO: not all parses are saved to disk, so the
                # open/json.load above sometimes fails; skip such posts.
                pass
def uni_from_boundaries(text, boundaries, features):
    """Add cased and lowercased unigram features for each boundary span.

    Each entry of *boundaries* is indexed as (start, end, label, ...);
    the span text is sliced from *text* with carriage returns removed,
    and unigram features are written into *features* under the prefixes
    "<LABEL>_uni_caps_" and "<LABEL>_uni_lower_".
    """
    # Strip '\r' once, outside the loop — the original recomputed
    # re.sub(r'\r', '', text) for every boundary.
    clean = text.replace('\r', '')
    for bound in boundaries:
        label = bound[2].upper()
        span = clean[bound[0]:bound[1]]
        ngrams_from_text(span, features,
                         prefix=label + "_uni_caps_", n=1, style='float')
        ngrams_from_text(span.lower(), features,
                         prefix=label + "_uni_lower_", n=1, style='float')