def extract_sub_relations(self, mention_x: MentionDataLight, mention_y: MentionDataLight,
                          relation: RelationType) -> RelationType:
    """Check whether the two mentions match by word embedding.

    Args:
        mention_x: MentionDataLight
        mention_y: MentionDataLight
        relation: RelationType, only WORD_EMBEDDING_MATCH is handled here

    Returns:
        RelationType: RelationType.WORD_EMBEDDING_MATCH when the mentions match,
            RelationType.NO_RELATION_FOUND otherwise
    """
    if relation is not RelationType.WORD_EMBEDDING_MATCH:
        return RelationType.NO_RELATION_FOUND

    x_lower = mention_x.tokens_str.lower()
    y_lower = mention_y.tokens_str.lower()
    if StringUtils.is_pronoun(x_lower) or StringUtils.is_pronoun(y_lower):
        # Pronoun mentions are only comparable with a contextual embedder,
        # and only when both mentions carry a context.
        if not self.contextual:
            return RelationType.NO_RELATION_FOUND
        if mention_x.mention_context is None or mention_y.mention_context is None:
            return RelationType.NO_RELATION_FOUND

    return (RelationType.WORD_EMBEDDING_MATCH
            if self.is_word_embed_match(mention_x, mention_y)
            else RelationType.NO_RELATION_FOUND)
def extract_vocab(mentions: List[MentionData], filter_stop_words: bool) -> List[str]:
    """Build a vocabulary list from all mentions.

    For every mention, the head, the head lemma and the full token string are
    collected (deduplicated via a set).

    Args:
        mentions: mentions to extract vocabulary strings from
        filter_stop_words: when True, strings that are stop words are skipped

    Returns:
        List[str]: the unique vocabulary strings (unordered)
    """
    vocab = set()
    for mention in mentions:
        candidates = (mention.mention_head,
                      mention.mention_head_lemma,
                      mention.tokens_str)
        if filter_stop_words:
            vocab.update(cand for cand in candidates if not StringUtils.is_stop(cand))
        else:
            vocab.update(candidates)
    return list(vocab)
def extract_sub_relations(self, mention_x: MentionDataLight, mention_y: MentionDataLight,
                          relation: RelationType) -> RelationType:
    """Check whether the two mentions are related in VerbOcean.

    Args:
        mention_x: MentionDataLight
        mention_y: MentionDataLight
        relation: RelationType, only VERBOCEAN_MATCH is handled here

    Returns:
        RelationType: RelationType.VERBOCEAN_MATCH when a VerbOcean relation holds,
            RelationType.NO_RELATION_FOUND otherwise
    """
    if relation is not RelationType.VERBOCEAN_MATCH:
        return RelationType.NO_RELATION_FOUND

    # Pronouns carry no verb semantics to look up.
    if StringUtils.is_pronoun(mention_x.tokens_str.lower()) or \
            StringUtils.is_pronoun(mention_y.tokens_str.lower()):
        return RelationType.NO_RELATION_FOUND

    return (RelationType.VERBOCEAN_MATCH
            if self.is_verbocean_relation(mention_x, mention_y)
            else RelationType.NO_RELATION_FOUND)
def is_both_data_or_time(mention1: MentionDataLight, mention2: MentionDataLight) -> bool:
    """Check whether both mentions refer to a time or a date.

    When a mention carries no NER annotation, the NER tag is derived from the
    mention's token string.

    Returns:
        bool: True when both mentions are tagged DATE or TIME
    """
    ner1 = mention1.mention_ner
    ner2 = mention2.mention_ner
    if ner1 is None:
        _, _, _, ner1 = StringUtils.find_head_lemma_pos_ner(mention1.tokens_str)
    if ner2 is None:
        _, _, _, ner2 = StringUtils.find_head_lemma_pos_ner(mention2.tokens_str)

    first_is_temporal = 'DATE' in ner1 or 'TIME' in ner1
    second_is_temporal = 'DATE' in ner2 or 'TIME' in ner2
    return first_is_temporal and second_is_temporal
def extract_sub_relations(self, mention_x: MentionDataLight, mention_y: MentionDataLight,
                          relation: RelationType) -> RelationType:
    """Check whether the two mentions hold the requested WordNet relation.

    Args:
        mention_x: MentionDataLight
        mention_y: MentionDataLight
        relation: one of WORDNET_DERIVATIONALLY, WORDNET_PARTIAL_SYNSET_MATCH,
            WORDNET_SAME_SYNSET

    Returns:
        RelationType: the requested relation when it holds,
            RelationType.NO_RELATION_FOUND otherwise
    """
    if StringUtils.is_pronoun(mention_x.tokens_str.lower()) or \
            StringUtils.is_pronoun(mention_y.tokens_str.lower()):
        return RelationType.NO_RELATION_FOUND

    page_x = self.wordnet_impl.get_pages(mention_x)
    page_y = self.wordnet_impl.get_pages(mention_y)
    if page_x and page_y:
        # Dispatch table instead of an if/elif chain.
        dispatch = {
            RelationType.WORDNET_DERIVATIONALLY: self.extract_derivation,
            RelationType.WORDNET_PARTIAL_SYNSET_MATCH: self.extract_partial_synset_match,
            RelationType.WORDNET_SAME_SYNSET: self.extract_same_synset_entity,
        }
        handler = dispatch.get(relation)
        if handler is not None:
            return handler(page_x, page_y)
    return RelationType.NO_RELATION_FOUND
def extract_sub_relations(self, mention_x: MentionDataLight, mention_y: MentionDataLight,
                          relation: RelationType) -> RelationType:
    """Check whether the two mentions hold the requested string/head relation.

    Args:
        mention_x: MentionDataLight
        mention_y: MentionDataLight
        relation: one of EXACT_STRING, FUZZY_FIT, FUZZY_HEAD_FIT, SAME_HEAD_LEMMA

    Returns:
        RelationType: the requested relation when it holds,
            RelationType.NO_RELATION_FOUND otherwise
    """
    if StringUtils.is_pronoun(mention_x.tokens_str.lower()) or \
            StringUtils.is_pronoun(mention_y.tokens_str.lower()):
        return RelationType.NO_RELATION_FOUND

    if relation == RelationType.EXACT_STRING:
        return self.extract_exact_string(mention_x, mention_y)
    if relation == RelationType.FUZZY_FIT:
        return self.extract_fuzzy_fit(mention_x, mention_y)
    if relation == RelationType.FUZZY_HEAD_FIT:
        return self.extract_fuzzy_head_fit(mention_x, mention_y)
    if relation == RelationType.SAME_HEAD_LEMMA:
        lemma_result = self.extract_same_head_lemma(mention_x, mention_y)
        if lemma_result != RelationType.NO_RELATION_FOUND:
            return relation
    return RelationType.NO_RELATION_FOUND
def extract_all_relations(
        self, mention_x: MentionDataLight,
        mention_y: MentionDataLight) -> Set[RelationType]:
    """Collect every string/head relation this extractor supports between the mentions.

    Args:
        mention_x: MentionDataLight
        mention_y: MentionDataLight

    Returns:
        Set[RelationType]: one or more of RelationType.EXACT_STRING,
            RelationType.FUZZY_FIT, RelationType.FUZZY_HEAD_FIT,
            RelationType.SAME_HEAD_LEMMA (or RelationType.NO_RELATION_FOUND)
    """
    # Pronoun mentions are never matched by string heuristics.
    if StringUtils.is_pronoun(mention_x.tokens_str.lower()) or \
            StringUtils.is_pronoun(mention_y.tokens_str.lower()):
        return {RelationType.NO_RELATION_FOUND}

    extractors = (self.extract_exact_string,
                  self.extract_fuzzy_fit,
                  self.extract_fuzzy_head_fit,
                  self.extract_same_head_lemma)
    relations = {extract(mention_x, mention_y) for extract in extractors}
    if not relations:
        relations.add(RelationType.NO_RELATION_FOUND)
    return relations
def __init__(self, orig_phrase: str = None, orig_phrase_norm: str = None,
             wiki_title: str = None, wiki_title_norm: str = None, score: int = 0,
             pageid: int = 0, description: str = None,
             relations: WikipediaPageExtractedRelations = None) -> None:
    """
    Object represent a Wikipedia Page and extracted fields.

    Args:
        orig_phrase (str): original search phrase
        orig_phrase_norm (str): original search phrase normalized
        wiki_title (str): page title
        wiki_title_norm (str): page title normalized
        score (int): score for getting wiki_title from orig_phrase
        pageid (int): the unique page identifier (coerced to int)
        description (str, optional): the page description
        relations (WikipediaPageExtractedRelations): Object that represent all
            extracted Wikipedia relations
    """
    self.orig_phrase = orig_phrase
    if orig_phrase_norm is None:
        self.orig_phrase_norm = StringUtils.normalize_str(orig_phrase)
    else:
        self.orig_phrase_norm = orig_phrase_norm

    # Bug fix: wiki_title defaults to None, but .replace() used to be called
    # unconditionally and raised AttributeError. Only strip the
    # disambiguation marker from an actual title string.
    if wiki_title is None:
        self.wiki_title = None
    else:
        self.wiki_title = wiki_title.replace(DISAMBIGUATION_TITLE, '')

    if wiki_title_norm is None:
        self.wiki_title_norm = StringUtils.normalize_str(wiki_title)
    else:
        self.wiki_title_norm = wiki_title_norm

    self.score = score
    self.pageid = int(pageid)  # may arrive as a string from the API/json
    self.description = description
    self.relations = relations
def is_both_opposite_personal_pronouns(phrase1: str, phrase2: str) -> bool:
    """Check whether both phrases are pronouns.

    Returns:
        bool: True when both phrases are pronouns
    """
    return StringUtils.is_pronoun(phrase1.lower()) and \
        StringUtils.is_pronoun(phrase2.lower())
def extract_synonyms_and_derivation(word):
    """Collect WordNet synonyms and derivationally related forms for *word*.

    Args:
        word: the word to look up in WordNet

    Returns:
        tuple(set, set): (lemma_names, derivationally_related_forms), both with
        underscores replaced by spaces and stop words filtered out
    """
    lemma_names = set()
    derivationally_related_forms = set()
    for synset in wn.synsets(word):
        for lemma in synset.lemmas():
            lemma_name = lemma.name().replace('_', ' ')
            if not StringUtils.is_stop(lemma_name.lower()):
                lemma_names.add(lemma_name)
                # also gather this lemma's derivationally related forms,
                # again skipping stop words
                # NOTE(review): assumes the derivation update belongs inside the
                # stop-word guard -- confirm against the original formatting
                derivationally_related_forms.update(
                    [l.name().replace('_', ' ')
                     for l in lemma.derivationally_related_forms()
                     if not StringUtils.is_stop(l.name().lower())])
    return lemma_names, derivationally_related_forms
def __init__(
    self,
    tokens_str: str,
    mention_context: str = None,
    mention_head: str = None,
    mention_head_lemma: str = None,
    mention_pos: str = None,
    mention_ner: str = None,
):
    """
    Object represent a mention with only text values.

    Args:
        tokens_str: str the tokens combine text (join with space)
        mention_context: str surrounding context of the mention
        mention_head: str
        mention_head_lemma: str
        mention_pos: str part-of-speech tag of the head
        mention_ner: str NER tag of the head
    """
    self.tokens_str = tokens_str
    self.mention_context = mention_context
    if mention_head or mention_head_lemma:
        self.mention_head = mention_head
        self.mention_head_lemma = mention_head_lemma
        self.mention_head_pos = mention_pos
        self.mention_ner = mention_ner
    else:
        # Neither head nor lemma supplied: derive head, lemma, POS and NER
        # from the mention's token string.
        (
            self.mention_head,
            self.mention_head_lemma,
            self.mention_head_pos,
            self.mention_ner,
        ) = StringUtils.find_head_lemma_pos_ner(str(tokens_str))
def get_pages(self, mention):
    """Return the WordnetPage for a mention, building and caching it on first use.

    The cache is keyed by the mention's token string.
    """
    key = mention.tokens_str
    if key in self.cache:
        return self.cache[key]

    head_syn, head_deriv = self.extract_synonyms_and_derivation(mention.mention_head)
    lemma_syn, lemma_deriv = self.extract_synonyms_and_derivation(
        mention.mention_head_lemma)
    normalized_phrase = StringUtils.normalize_str(key)
    per_word_synonyms = self.all_clean_words_synonyms(normalized_phrase)

    page = WordnetPage(
        key,
        normalized_phrase,
        mention.mention_head,
        mention.mention_head_lemma,
        head_syn,
        lemma_syn,
        head_deriv,
        lemma_deriv,
        per_word_synonyms,
    )
    self.cache[key] = page
    return page
def extract_same_head_lemma(mention_x: MentionDataLight,
                            mention_y: MentionDataLight) -> RelationType:
    """Check whether the two mentions share the same head lemma.

    Args:
        mention_x: MentionDataLight
        mention_y: MentionDataLight

    Returns:
        RelationType.SAME_HEAD_LEMMA or RelationType.NO_RELATION_FOUND
    """
    lemma_x = mention_x.mention_head_lemma.lower()
    lemma_y = mention_y.mention_head_lemma.lower()
    # Prepositions as head lemmas are too weak a signal for coreference.
    if StringUtils.is_preposition(lemma_x) or StringUtils.is_preposition(lemma_y):
        return RelationType.NO_RELATION_FOUND
    return (RelationType.SAME_HEAD_LEMMA
            if lemma_x == lemma_y
            else RelationType.NO_RELATION_FOUND)
def all_clean_words_synonyms(clean_phrase):
    """For each word of the phrase, collect its lowercase WordNet lemma names.

    Stop-word lemmas are skipped and underscores are replaced with spaces.

    Returns:
        list[set[str]]: one synonym set per word, in phrase order
    """
    per_word_sets = []
    for word in clean_phrase.split():
        synonyms = set()
        for synset in wn.synsets(word):
            for lemma in synset.lemma_names():
                lowered = lemma.lower()
                if not StringUtils.is_stop(lowered):
                    synonyms.add(lowered.replace("_", " "))
        per_word_sets.append(synonyms)
    return per_word_sets
def extract_all_relations(
        self, mention_x: MentionDataLight,
        mention_y: MentionDataLight) -> Set[RelationType]:
    """Collect every WordNet relation that holds between the two mentions.

    Args:
        mention_x: MentionDataLight
        mention_y: MentionDataLight

    Returns:
        Set[RelationType]: one or more of the WORDNET_* relations
            (or RelationType.NO_RELATION_FOUND when none holds)
    """
    if StringUtils.is_pronoun(mention_x.tokens_str.lower()) or \
            StringUtils.is_pronoun(mention_y.tokens_str.lower()):
        return {RelationType.NO_RELATION_FOUND}

    relations = set()
    page_x = self.wordnet_impl.get_pages(mention_x)
    page_y = self.wordnet_impl.get_pages(mention_y)
    if page_x and page_y:
        # Run every sub-extractor and keep only real matches.
        for extract in (self.extract_derivation,
                        self.extract_partial_synset_match,
                        self.extract_same_synset_entity):
            found = extract(page_x, page_y)
            if found != RelationType.NO_RELATION_FOUND:
                relations.add(found)

    if not relations:
        relations.add(RelationType.NO_RELATION_FOUND)
    return relations
def extract_fuzzy_head_fit(mention_x: MentionDataLight,
                           mention_y: MentionDataLight) -> RelationType:
    """Check whether one mention's head appears among the other's tokens.

    Args:
        mention_x: MentionDataLight
        mention_y: MentionDataLight

    Returns:
        RelationType.FUZZY_HEAD_FIT or RelationType.NO_RELATION_FOUND
    """
    if StringUtils.is_preposition(mention_x.mention_head_lemma.lower()) or \
            StringUtils.is_preposition(mention_y.mention_head_lemma.lower()):
        return RelationType.NO_RELATION_FOUND

    tokens_of_x = mention_x.tokens_str.split()
    tokens_of_y = mention_y.tokens_str.split()
    head_crosses = (mention_x.mention_head in tokens_of_y
                    or mention_y.mention_head in tokens_of_x)
    return RelationType.FUZZY_HEAD_FIT if head_crosses else RelationType.NO_RELATION_FOUND
def extract_relations_from_text_v0(self, text):
    """Parse raw page text line by line and populate the relation sets.

    Resets all category/link/parenthesis/be-comp sets, scans each line for
    categories and links, flags part-name pages, and — for disambiguation
    pages — stores the collected links and parenthesis titles.
    """
    # Reset all extracted-relation state before a fresh parse.
    self.disambiguation_links = set()
    self.categories = set()
    self.title_parenthesis = set()
    self.be_comp = set()
    self.disambiguation_links_norm = set()
    self.categories_norm = set()
    self.title_parenthesis_norm = set()
    self.be_comp_norm = set()

    collected_links = set()
    collected_parenthesis = set()
    for line in text.split('\n'):
        cat_links = self.extract_categories(line)

        if not self.is_part_name:
            self.is_part_name = self.is_name_part(line)
        # A category hit from PART_NAME_CATEGORIES also marks a part-name page.
        if not self.is_part_name and \
                any(cat in cat_links for cat in PART_NAME_CATEGORIES):
            self.is_part_name = True

        self.categories.update(cat_links)
        self.categories_norm.update(StringUtils.normalize_string_list(cat_links))

        links, parenthesis_links = self.extract_links_and_parenthesis(line)
        collected_links.update(links)
        collected_parenthesis.update(parenthesis_links)

    # Only disambiguation pages keep the raw link/parenthesis collections.
    if self.is_disambiguation:
        self.disambiguation_links = collected_links
        self.disambiguation_links_norm = StringUtils.normalize_string_list(
            collected_links)
        self.title_parenthesis = collected_parenthesis
        self.title_parenthesis_norm = StringUtils.normalize_string_list(
            collected_parenthesis)
def extract_exact_string(mention_x: MentionDataLight,
                         mention_y: MentionDataLight) -> RelationType:
    """Check whether the two mentions are the same string (case-insensitive).

    Args:
        mention_x: MentionDataLight
        mention_y: MentionDataLight

    Returns:
        RelationType.EXACT_STRING or RelationType.NO_RELATION_FOUND
    """
    text_x = mention_x.tokens_str
    text_y = mention_y.tokens_str
    if StringUtils.is_preposition(text_x.lower()) or \
            StringUtils.is_preposition(text_y.lower()):
        return RelationType.NO_RELATION_FOUND
    return (RelationType.EXACT_STRING
            if text_x.lower() == text_y.lower()
            else RelationType.NO_RELATION_FOUND)
def test_is_preposition():
    """Preposition detection: 'on' is one, 'the' is not."""
    assert StringUtils.is_preposition("on")
    assert StringUtils.is_preposition("the") is False
def read_json_mention_data_line(mention_line: dict):
    """Build a MentionData object from a single json mention entry.

    Args:
        mention_line: dict holding the Json representation of a single mention
            ('tokens_str' is the only mandatory key)

    Returns:
        MentionData object

    Raises:
        Exception: when the entry is missing mandatory data or is malformed
    """
    mention_data = None
    try:
        topic_id = None
        coref_chain = None
        doc_id = None
        sent_id = None
        tokens_numbers = None
        score = -1
        mention_type = None
        predicted_coref_chain = None
        mention_context = None
        is_continue = False
        is_singleton = False
        mention_pos = None
        mention_ner = None

        mention_text = mention_line['tokens_str']

        if 'topic_id' in mention_line:
            topic_id = mention_line['topic_id']
        if 'coref_chain' in mention_line:
            coref_chain = mention_line['coref_chain']
        if 'doc_id' in mention_line:
            doc_id = mention_line['doc_id']
            # normalize document ids to carry the .xml suffix
            if '.xml' not in doc_id:
                doc_id = doc_id + '.xml'
        if 'sent_id' in mention_line:
            sent_id = mention_line['sent_id']
        if 'tokens_number' in mention_line:
            tokens_numbers = mention_line['tokens_number']
        if 'mention_context' in mention_line:
            mention_context = mention_line['mention_context']

        if 'mention_head' in mention_line and 'mention_head_lemma' in mention_line:
            mention_head = mention_line['mention_head']
            mention_head_lemma = mention_line['mention_head_lemma']
            if 'mention_head_pos' in mention_line:
                mention_pos = mention_line['mention_head_pos']
            if 'mention_ner' in mention_line:
                mention_ner = mention_line['mention_ner']
        else:
            # head info absent from the json -- derive it from the mention text
            mention_head, mention_head_lemma, mention_pos, \
                mention_ner = StringUtils.find_head_lemma_pos_ner(str(mention_text))

        if 'mention_type' in mention_line:
            mention_type = mention_line['mention_type']
        if 'score' in mention_line:
            score = mention_line['score']
        if 'is_continuous' in mention_line:
            is_continue = mention_line['is_continuous']
        if 'is_singleton' in mention_line:
            is_singleton = mention_line['is_singleton']
        if 'predicted_coref_chain' in mention_line:
            predicted_coref_chain = mention_line['predicted_coref_chain']

        mention_data = MentionData(
            topic_id,
            doc_id,
            sent_id,
            tokens_numbers,
            mention_text,
            mention_context,
            mention_head,
            mention_head_lemma,
            coref_chain,
            mention_type,
            is_continue,
            is_singleton,
            score,
            predicted_coref_chain,
            mention_pos,
            mention_ner)
    except Exception as err:
        print('Unexpected error:', sys.exc_info()[0])
        # Bug fix: chain the original exception so the root cause and its
        # traceback are not lost when re-raising.
        raise Exception('failed reading json line-' + str(mention_line)) from err
    return mention_data
def test_is_stopword():
    """Stop-word detection: 'always' is one, 'sunday' is not."""
    assert StringUtils.is_stop("sunday") is False
    assert StringUtils.is_stop("always")
def read_json_mention_data_line(mention_line: dict):
    """Build a MentionData object from a single json mention entry.

    Args:
        mention_line: dict holding the Json representation of a single mention
            ("tokens_str" is the only mandatory key)

    Returns:
        MentionData object

    Raises:
        Exception: when the entry is missing mandatory data or is malformed
    """
    # pylint: disable=too-many-branches
    try:
        topic_id = None
        coref_chain = None
        doc_id = None
        sent_id = None
        tokens_numbers = None
        score = -1
        mention_type = None
        predicted_coref_chain = None
        mention_context = None
        is_continue = False
        is_singleton = False
        mention_pos = None
        mention_ner = None
        mention_index = -1

        mention_text = mention_line["tokens_str"]

        if "topic_id" in mention_line:
            topic_id = mention_line["topic_id"]
        if "coref_chain" in mention_line:
            coref_chain = mention_line["coref_chain"]
        if "doc_id" in mention_line:
            doc_id = mention_line["doc_id"]
            # normalize document ids to carry the .xml suffix
            if ".xml" not in doc_id:
                doc_id = doc_id + ".xml"
        if "sent_id" in mention_line:
            sent_id = mention_line["sent_id"]
        if "tokens_number" in mention_line:
            tokens_numbers = mention_line["tokens_number"]
        if "mention_context" in mention_line:
            mention_context = mention_line["mention_context"]

        if "mention_head" in mention_line and "mention_head_lemma" in mention_line:
            mention_head = mention_line["mention_head"]
            mention_head_lemma = mention_line["mention_head_lemma"]
            if "mention_head_pos" in mention_line:
                mention_pos = mention_line["mention_head_pos"]
            if "mention_ner" in mention_line:
                mention_ner = mention_line["mention_ner"]
        else:
            # head info absent from the json -- derive it from the mention text
            (
                mention_head,
                mention_head_lemma,
                mention_pos,
                mention_ner,
            ) = StringUtils.find_head_lemma_pos_ner(str(mention_text))

        if "mention_type" in mention_line:
            mention_type = mention_line["mention_type"]
        if "score" in mention_line:
            score = mention_line["score"]
        if "is_continuous" in mention_line:
            is_continue = mention_line["is_continuous"]
        if "is_singleton" in mention_line:
            is_singleton = mention_line["is_singleton"]
        if "predicted_coref_chain" in mention_line:
            predicted_coref_chain = mention_line["predicted_coref_chain"]
        if "mention_index" in mention_line:
            mention_index = mention_line["mention_index"]

        mention_data = MentionData(
            topic_id,
            doc_id,
            sent_id,
            tokens_numbers,
            mention_text,
            mention_context,
            mention_head,
            mention_head_lemma,
            coref_chain,
            mention_type,
            is_continue,
            is_singleton,
            score,
            predicted_coref_chain,
            mention_pos,
            mention_ner,
            mention_index,
        )
    except Exception as err:
        print("Unexpected error:", sys.exc_info()[0])
        # Bug fix: chain the original exception so the root cause and its
        # traceback are not lost when re-raising.
        raise Exception("failed reading json line-" + str(mention_line)) from err
    return mention_data
def test_is_stopword():
    """Stop-word detection: 'always' is one, 'sunday' is not."""
    assert StringUtils.is_stop('sunday') is False
    assert StringUtils.is_stop('always')
def test_is_pronoun():
    """Pronoun detection: 'anybody' is one, 'the' is not."""
    assert StringUtils.is_pronoun('the') is False
    assert StringUtils.is_pronoun('anybody')
def test_is_determiner():
    """Determiner detection: 'the' is one, 'on' is not."""
    assert StringUtils.is_determiner('on') is False
    assert StringUtils.is_determiner('the')
def test_is_pronoun():
    """Pronoun detection: 'anybody' is one, 'the' is not."""
    assert StringUtils.is_pronoun("the") is False
    assert StringUtils.is_pronoun("anybody")
def test_is_preposition():
    """Preposition detection: 'on' is one, 'the' is not."""
    assert StringUtils.is_preposition('on')
    assert StringUtils.is_preposition('the') is False
def test_is_determiner():
    """Determiner detection: 'the' is one, 'on' is not."""
    assert StringUtils.is_determiner("on") is False
    assert StringUtils.is_determiner("the")