def get_generic_def_mention(sentence, span, length_bin_size=5):
    """Yield features for a definition mention in a sentence.

    Args:
        sentence: a list of Word objects
        span: a Span namedtuple locating the mention
        length_bin_size: the size of the bins for the length feature
    """
    # Indicator features: is (a substring of) the mention in a dictionary?
    for feat in _get_dictionary_indicator_features(sentence, span):
        yield feat
    # Dependency path(s) from the mention to dictionary keywords, with the
    # various transformations applied by the helper.
    mention_begin = span.begin_word_id
    mention_end = span.begin_word_id + span.length
    for (start, stop) in _get_substring_indices(len(sentence), MAX_KW_LENGTH):
        # Skip candidate keyword windows that overlap the mention itself.
        if mention_begin <= start < mention_end:
            continue
        if mention_begin < stop < mention_end:
            continue
        matched_dict = False
        # The lemma string is the same for every dictionary, so build it once.
        keyword = " ".join(map(lambda x: str(x.lemma), sentence[start:stop]))
        for dict_id in dictionaries:
            if keyword in dictionaries[dict_id]:
                matched_dict = True
                yield "KW_IND_[" + dict_id + "]"
                break
        if matched_dict:
            kw_span = Span(begin_word_id=start, length=stop - start)
            for feat in _get_min_dep_path_features(
                    sentence, span, kw_span, "KW"):
                yield feat
    # Binned character length of the mention text.
    mention_text = " ".join(materialize_span(sentence, span, lambda x: x.word))
    yield "LENGTH_" + str(len(mention_text) // length_bin_size)
def get_seq_definition_features(sentence, span1, span2, length_bin_size=5):
    """Yield features for a (definiendum, explanation) span pair.

    Args:
        sentence: a list of Word objects
        span1: the Span of the term being defined
        span2: the Span of the explaining text
        length_bin_size: unused; kept for signature compatibility with the
            other feature generators in this module
    """
    # POS of the first word(s) of the pair.
    for feat in _get_first_pos_features(sentence, span1, span2):
        yield feat
    # Length features for each span, tagged with its role.
    for feat in _get_length_span_feature(span1):
        yield "DEFINE_" + feat
    for feat in _get_length_span_feature_2(span2):
        yield "EXPLAIN_" + feat
    # Window features around the single word just before the explanation.
    preceding = Span(begin_word_id=(span2.begin_word_id - 1), length=1)
    for feat in _get_window_features(sentence, preceding):
        yield feat
    # Tokens shared between the two spans.
    for feat in _get_same_token_definition_feature(sentence, span1, span2):
        yield feat
    # Capitalization of the explanation's first word.
    starts_upper = is_upper(sentence[span2.begin_word_id].word)
    yield ("EXPLAIN_STARTS_WITH_CAPITAL" if starts_upper
           else "EXPLAIN_NOT_STARTS_WITH_CAPITAL")
    # Whether the pair looks like a definition candidate at all.
    yield ("IS_CANDIDATE" if is_candidate(sentence, span1, span2)
           else "NOT_CANDIDATE")
def get_span(span_begin, span_length):
    """Build and return a Span.

    Args:
        span_begin: index of the first word of the span
        span_length: number of words in the span
    """
    new_span = Span(begin_word_id=span_begin, length=span_length)
    return new_span
def get_generic_features_mention(sentence, span, length_bin_size=5):
    """Yield 'generic' features for a mention in a sentence.

    Args:
        sentence: a list of Word objects
        span: a Span namedtuple
        length_bin_size: the size of the bins for the length feature
    """
    # Sequence features over the mention (words, lemmas, ners, poses).
    for feat in _get_seq_features(sentence, span):
        yield feat
    # Left/right windows (up to size 3, with combinations) around the mention.
    for feat in _get_window_features(sentence, span):
        yield feat
    # Indicator features: is (a substring of) the mention in a dictionary?
    for feat in _get_dictionary_indicator_features(sentence, span):
        yield feat
    # Dependency path(s) from the mention to dictionary keywords, with the
    # various transformations applied by the helper.
    mention_begin = span.begin_word_id
    mention_end = span.begin_word_id + span.length
    for (start, stop) in _get_substring_indices(len(sentence), MAX_KW_LENGTH):
        # Skip candidate keyword windows that overlap the mention itself.
        if mention_begin <= start < mention_end:
            continue
        if mention_begin < stop < mention_end:
            continue
        matched_dict = False
        # The lemma string is the same for every dictionary, so build it once.
        keyword = " ".join(map(lambda x: str(x.lemma), sentence[start:stop]))
        for dict_id in dictionaries:
            if keyword in dictionaries[dict_id]:
                matched_dict = True
                yield "KW_IND_[" + dict_id + "]"
                break
        if matched_dict:
            kw_span = Span(begin_word_id=start, length=stop - start)
            for feat in _get_min_dep_path_features(
                    sentence, span, kw_span, "KW"):
                yield feat
    # Capitalization of the mention's first word.
    if sentence[span.begin_word_id].word[0].isupper():
        yield "STARTS_WITH_CAPITAL"
    # Binned character length of the mention text.
    mention_text = " ".join(materialize_span(sentence, span, lambda x: x.word))
    yield "LENGTH_" + str(len(mention_text) // length_bin_size)
def get_generic_features_relation(sentence, span1, span2, length_bin_size=5):
    """Yield 'generic' features for a relation in a sentence.

    Args:
        sentence: a list of Word objects
        span1: the first Span of the relation
        span2: the second Span of the relation
        length_bin_size: the size of the bins for the length feature
    """
    # Check whether the order of the spans is inverted. We use this information
    # to add a prefix to *all* the features.
    order = sorted([
        span1.begin_word_id, span1.begin_word_id + span1.length,
        span2.begin_word_id, span2.begin_word_id + span2.length])
    begin = order[0]
    betw_begin = order[1]
    betw_end = order[2]
    end = order[3]
    if begin == span2.begin_word_id:
        inverted = "INV_"
        yield "IS_INVERTED"
    else:
        inverted = ""
    betw_span = Span(begin_word_id=betw_begin, length=betw_end - betw_begin)
    covering_span = Span(begin_word_id=begin, length=end - begin)
    # Words, Lemmas, Ners, and Poses sequence between the mentions
    for seq_feat in _get_seq_features(sentence, betw_span):
        yield inverted + seq_feat
    # Window feature (left and right, up to size 3, combined)
    for window_feat in _get_window_features(
            sentence, covering_span, isolated=False):
        yield inverted + window_feat
    # Ngrams of up to size 3 between the mentions
    for ngram_feat in _get_ngram_features(sentence, betw_span):
        yield inverted + ngram_feat
    # Indicator features of whether the mentions are in dictionaries.
    # One feature is emitted per (span1 dictionary, span2 dictionary) pair,
    # with "_[_NONE]" standing in when one side matches no dictionary.
    found1 = False
    for feat1 in _get_dictionary_indicator_features(
            sentence, span1, prefix=inverted + "IN_DICT"):
        found1 = True
        found2 = False
        for feat2 in _get_dictionary_indicator_features(
                sentence, span2, prefix=""):
            found2 = True
            yield feat1 + feat2
        if not found2:
            yield feat1 + "_[_NONE]"
    if not found1:
        for feat2 in _get_dictionary_indicator_features(
                sentence, span2, prefix=""):
            found2 = True
            yield inverted + "IN_DICT_[_NONE]" + feat2
    # Dependency path (and transformations) between the mention
    for betw_dep_path_feature in _get_min_dep_path_features(
            sentence, span1, span2, inverted + "BETW"):
        yield betw_dep_path_feature
    # Dependency paths (and transformations) between the mentions and keywords
    for (i, j) in _get_substring_indices(len(sentence), MAX_KW_LENGTH):
        # Skip keyword windows overlapping either mention.
        if (i >= begin and i < betw_begin) or (i >= betw_end and i < end):
            continue
        if (j > begin and j <= betw_begin) or (j > betw_end and j <= end):
            continue
        is_in_dictionary = False
        for dict_id in dictionaries:
            if " ".join(map(lambda x: str(x.lemma), sentence[i:j])) in \
                    dictionaries[dict_id]:
                is_in_dictionary = True
                yield inverted + "KW_IND_[" + dict_id + "]"
                break
        if is_in_dictionary:
            kw_span = Span(begin_word_id=i, length=j - i)
            # Shortest dependency path from each mention to the keyword,
            # collected as parallel lists of edge labels and target lemmas.
            path1 = _get_min_dep_path(sentence, span1, kw_span)
            lemmas1 = []
            labels1 = []
            for edge in path1:
                lemmas1.append(str(edge.word2.lemma))
                labels1.append(edge.label)
            both1 = []
            # NOTE(review): the loops below reuse 'j' as an index, clobbering
            # the outer substring index; harmless because kw_span was built
            # above and the outer loop reassigns (i, j) — but fragile.
            for j in range(len(labels1)):
                both1.append(labels1[j])
                both1.append(lemmas1[j])
            # Drop the trailing lemma (the keyword itself).
            both1 = both1[:-1]
            path2 = _get_min_dep_path(sentence, span2, kw_span)
            lemmas2 = []
            labels2 = []
            for edge in path2:
                lemmas2.append(str(edge.word2.lemma))
                labels2.append(edge.label)
            both2 = []
            for j in range(len(labels2)):
                both2.append(labels2[j])
                both2.append(lemmas2[j])
            both2 = both2[:-1]
            # Full paths (labels + lemmas), labels-only paths, and paths with
            # lemmas generalized to their dictionary ids.
            yield inverted + "KW_[" + " ".join(both1) + "]_[" + \
                " ".join(both2) + "]"
            yield inverted + "KW_L_[" + " ".join(labels1) + "]_[" + \
                " ".join(labels2) + "]"
            for j in range(1, len(both1), 2):
                for dict_id in dictionaries:
                    if both1[j] in dictionaries[dict_id]:
                        both1[j] = "DICT_" + str(dict_id)
                        break  # Picking up the first dictionary we find
            for j in range(1, len(both2), 2):
                for dict_id in dictionaries:
                    if both2[j] in dictionaries[dict_id]:
                        both2[j] = "DICT_" + str(dict_id)
                        break  # Picking up the first dictionary we find
            yield inverted + "KW_D_[" + " ".join(both1) + "]_[" + \
                " ".join(both2) + "]"
    # The mentions start with a capital letter
    first_capital = sentence[span1.begin_word_id].word[0].isupper()
    second_capital = sentence[span2.begin_word_id].word[0].isupper()
    capital_feat = inverted + "STARTS_WITH_CAPITAL_[" + str(first_capital) + \
        "_" + str(second_capital) + "]"
    yield capital_feat
    # The lengths of the mentions
    first_length = len(" ".join(
        materialize_span(sentence, span1, lambda x: str(x.word))))
    second_length = len(" ".join(
        materialize_span(sentence, span2, lambda x: str(x.word))))
    first_bin_id = first_length // length_bin_size
    second_bin_id = second_length // length_bin_size
    length_feat = inverted + "LENGTHS_[" + str(first_bin_id) + "_" + \
        str(second_bin_id) + "]"
    yield length_feat