from collections import Counter  # required by the counting features below
# (`Part`, `masked_text`, `Path`, and `compute_shortest_path` are assumed to be
# imported elsewhere in the enclosing module.)


def generate(self, corpus, f_set, use_gold, use_pred):
    """
    Generate sentence-level features for every candidate edge in the corpus:
    entity counts, bag-of-words / POS features, token counts, negation,
    main verbs, per-entity occurrence counts, and same-sentence
    co-occurrence counts.
    """
    assert not (use_gold and use_pred), "No support for both"

    self.extract_abbreviation_synonyms(corpus, use_gold, use_pred)

    for docid, document in corpus.documents.items():
        for edge in document.edges():
            sentence = edge.get_combined_sentence()

            entities_in_sentences = edge.get_any_entities_in_sentences(predicted=use_pred)

            total_count = 0
            # We sort to create the features in a deterministic order
            for e_class_id in sorted(entities_in_sentences):
                entities = entities_in_sentences[e_class_id]
                # TODO this is wrong for other entity types not appearing in the edge
                # TODO also, what if the same entity type appears on both ends of the edge,
                #      as in a protein-protein relation? --> Just subtract the counts of the edge's entities
                individual_count = len(entities) - 1  # subtract 1, as one entity is already one of the edge's entities
                assert individual_count >= 0
                total_count += individual_count
                self.add_with_value(f_set, edge, 'f_counts_individual', individual_count, 'int', 'individual', e_class_id)

            self.add_with_value(f_set, edge, 'f_counts_total', total_count, 'int', 'total (all classes)')

            entities_between_entities = edge.get_any_entities_between_entities(predicted=use_pred)

            total_count = 0
            # We sort to create the features in a deterministic order
            for e_class_id in sorted(entities_between_entities):
                entities = entities_between_entities[e_class_id]
                individual_count = len(entities)
                total_count += individual_count
                self.add_with_value(f_set, edge, 'f_counts_in_between_individual', individual_count, 'int', 'individual', e_class_id)

            self.add_with_value(f_set, edge, 'f_counts_in_between_total', total_count, 'int', 'total (all classes)')

            order = edge.entity1.class_id < edge.entity2.class_id
            if order:
                self.add(f_set, edge, 'f_order')

            for token in sentence:
                self.add(f_set, edge, 'f_bow',
                         masked_text(token, edge.same_part, use_gold, use_pred,
                                     token_map=lambda t: t.features['lemma'],
                                     token_is_number_fun=lambda _: "NUM"))
                self.add(f_set, edge, 'f_pos', token.features['coarsed_pos'])

            self.add_with_value(f_set, edge, 'f_tokens_count', len(sentence))

            # Remember, the edge's entities are sorted, i.e. e1.offset < e2.offset
            _e1_first_token_index = edge.entity1.tokens[0].features['tmp_id']
            _e2_last_token_index = edge.entity2.tokens[-1].features['tmp_id']
            assert _e1_first_token_index < _e2_last_token_index, (docid, sentence, edge.entity1.text, edge.entity2.text, _e1_first_token_index, _e2_last_token_index)

            self.add_with_value(f_set, edge, 'f_tokens_count_before', len(sentence[:_e1_first_token_index]))
            self.add_with_value(f_set, edge, 'f_tokens_count_after', len(sentence[(_e2_last_token_index + 1):]))

            if Part.is_negated(sentence):
                self.add(f_set, edge, "f_sentence_is_negated")

            verbs = set(Part.get_main_verbs(sentence, token_map=lambda t: t.features["lemma"]))
            if len(verbs) == 0:
                self.add(f_set, edge, "f_main_verbs", "NO_MAIN_VERB")
            else:
                for v in verbs:
                    self.add(f_set, edge, "f_main_verbs", v)

            # Count the document-wide occurrences of each individual entity (keyed via entity2key)
            counters = {}
            for part in document:
                for entity in (part.annotations if use_gold else part.predicted_annotations):
                    ent_type_counter = counters.get(entity.class_id, Counter())
                    ent_key = __class__.entity2key(entity)
                    ent_type_counter.update([ent_key])
                    counters[entity.class_id] = ent_type_counter

            e1_key = __class__.entity2key(edge.entity1)
            e1_count = counters[edge.entity1.class_id][e1_key]
            self.add_with_value(f_set, edge, 'f_entity1_count', e1_count)

            e2_key = __class__.entity2key(edge.entity2)
            e2_count = counters[edge.entity2.class_id][e2_key]
            self.add_with_value(f_set, edge, 'f_entity2_count', e2_count)

            # Count in how many different sentences the edge's entity pair occurs together
            together_counter = Counter()
            diff_sentences = {}
            for aux_edge in document.edges():
                if aux_edge.e1_sentence_id == aux_edge.e2_sentence_id:
                    together_key = __class__.edge2key(aux_edge)
                    sents = diff_sentences.get(together_key, [])
                    if aux_edge.e1_sentence_id not in sents:
                        sents.append(aux_edge.e1_sentence_id)
                        diff_sentences[together_key] = sents
                        together_counter.update([together_key])

            together_key = __class__.edge2key(edge)
            together_count = together_counter[together_key]
            if together_count > 0:
                self.add_with_value(f_set, edge, 'f_diff_sents_together_count', together_count)
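
# A minimal usage sketch for the sentence-level generator above. The class name
# `SentenceFeatureGenerator` and the plain-dict feature set are assumptions for
# illustration, not taken from this file; `generate` itself only requires that
# gold and predicted annotations are not requested at the same time:
#
#     f_set = {}  # filled in by self.add / self.add_with_value
#     generator = SentenceFeatureGenerator(...)
#     generator.generate(corpus, f_set, use_gold=True, use_pred=False)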
def generate(self, corpus, f_set, use_gold, use_pred):
    """
    Generate features over the token paths between an edge's two entities:
    the linear path between the entities (LD) and the shortest dependency
    path between their head tokens (PD): n-grams, token counts, and negation.
    """
    assert not (use_gold and use_pred), "No support for both"

    for docid, document in corpus.documents.items():
        for edge in document.edges():
            sentence = edge.get_combined_sentence()

            # Remember, the edge's entities are sorted, i.e. e1.offset < e2.offset
            _e1_last_token_index = edge.entity1.tokens[-1].features['tmp_id']
            _e2_first_token_index = edge.entity2.tokens[0].features['tmp_id']
            assert _e1_last_token_index < _e2_first_token_index, (docid, sentence, edge.entity1.text, edge.entity2.text, _e1_last_token_index, _e2_first_token_index)

            _e1_head_token_index = edge.entity1.head_token.features['tmp_id']
            _e2_head_token_index = edge.entity2.head_token.features['tmp_id']
            assert _e1_head_token_index < _e2_head_token_index, (docid, sentence, edge.entity1.text, edge.entity2.text, _e1_head_token_index, _e2_head_token_index)

            dependency_paths = [
                # Commented out for now: they do not seem to provide any performance benefit, yet they cost running time
                #
                # Path(
                #     name='OW1',
                #     tokens=edge.entity1.prev_tokens(sentence, n=self.h_ow_size, include_ent_first_token=True, mk_reversed=True),
                #     is_edge_type_constant=True,
                #     there_is_target=False,
                #     default_n_grams=self.h_ow_grams,
                # ),
                # Path(
                #     name='IW1',
                #     tokens=edge.entity1.next_tokens(sentence, n=self.h_iw_size, include_ent_last_token=True),
                #     is_edge_type_constant=True,
                #     there_is_target=False,
                #     default_n_grams=self.h_iw_grams,
                # ),
                # Path(
                #     name='IW2',
                #     tokens=edge.entity2.prev_tokens(sentence, n=self.h_iw_size, include_ent_first_token=True, mk_reversed=True),
                #     is_edge_type_constant=True,
                #     there_is_target=False,
                #     default_n_grams=self.h_iw_grams,
                # ),
                # Path(
                #     name='OW2',
                #     tokens=edge.entity2.next_tokens(sentence, n=self.h_ow_size, include_ent_last_token=True),
                #     is_edge_type_constant=True,
                #     there_is_target=False,
                #     default_n_grams=self.h_ow_grams,
                # ),

                Path(
                    name='LD',
                    tokens=sentence[_e1_last_token_index:_e2_first_token_index + 1],
                    is_edge_type_constant=True,
                    default_n_grams=self.h_ld_grams,
                ),

                compute_shortest_path(sentence, _e1_head_token_index, _e2_head_token_index)
                    .change_name('PD')
                    .change_default_n_grams(self.h_pd_grams),
            ]

            for dep_path in dependency_paths:
                dep_type = dep_path.name

                for n_gram in dep_path.default_n_grams:
                    self.add_n_grams(f_set, use_gold, use_pred, edge, dep_path, dep_type, n_gram)

                count = len(dep_path.middle)
                count_without_punct = len(list(filter(lambda node: not node.token.features['is_punct'], dep_path.middle)))

                self.add_with_value(f_set, edge, self.f('f_XX_tokens_count', dep_type), count, dep_type)
                self.add_with_value(f_set, edge, self.f('f_XX_tokens_count_without_punct', dep_type), count_without_punct, dep_type)

                if Part.is_negated([node.token for node in dep_path.middle]):
                    self.add(f_set, edge, self.f('f_XX_is_negated', dep_type), dep_type)
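
# A minimal usage sketch for the path-based generator above. The class name
# `DependencyFeatureGenerator` and the concrete n-gram values are assumptions
# for illustration; the code only requires that `h_ld_grams` and `h_pd_grams`
# (and, if the window paths are re-enabled, the `h_ow_*` / `h_iw_*`
# hyperparameters) are set on the instance:
#
#     generator = DependencyFeatureGenerator(h_ld_grams=[1, 2], h_pd_grams=[1, 2, 3])
#     generator.generate(corpus, f_set, use_gold=False, use_pred=True)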