def is_bare_np(graph_builder, constituent):
    """Check whether a constituent is a bare noun phrase.

    A constituent qualifies when its head is a singular common noun that is
    not a temporal expression, the constituent is a single word or starts
    with an adjective, and the words surrounding it in the sentence do not
    disqualify it (see the inline comments below).

    :param graph_builder: Graph accessor used to read the head word, the
        words of the constituent and the words of its root.
    :param constituent: The constituent to test.
    :return: True if the constituent is considered a bare NP, False otherwise.
    """
    # Whole-token forms that disqualify the constituent when they appear
    # right before / right after it.
    blocking_previous = ("al", "del", "y", "e", "ni")
    blocking_next = ("y", "e", "ni")

    head_word = graph_builder.get_head_word(constituent)
    head_form = head_word[FORM].lower()
    head_word_pos = head_word[POS]
    words = graph_builder.get_words(constituent)
    sentence_words = graph_builder.get_words(
        graph_builder.get_root(constituent))
    # Both boundary lookups are guarded: a word that is missing from the
    # root's word list means the constituent does not fit in that sentence.
    try:
        first_word_index = sentence_words.index(words[0])
        last_word_index = sentence_words.index(words[-1])
    except ValueError:
        logger.warning("Multisentence entity %s", constituent[ID])
        return False

    if pronouns.all(head_form) or pos_tags.pronoun(head_word_pos):
        return False
    if not pos_tags.singular_common_noun(head_word_pos):
        return False
    if temporals.temporals(head_form):
        return False
    if len(words) != 1 and not pos_tags.adjective(words[0][POS]):
        return False

    if first_word_index > 0:
        prev_form = sentence_words[first_word_index - 1][FORM].lower()
        # Compare the complete token. Indexing a single character could
        # never equal the multi-character entries and matched unrelated
        # words that merely end in "y" or "e".
        if prev_form in blocking_previous:
            return False
        # A token ending in a comma before the constituent always
        # disqualifies it.
        # NOTE(review): the source also inspected the syntactic parent's
        # form for "y"/"e"/"ni" here, but every outcome was False; confirm
        # whether comma-separated NPs without a conjunction should pass.
        if prev_form.endswith(","):
            return False

    if last_word_index + 1 < len(sentence_words):
        next_form = sentence_words[last_word_index + 1][FORM].lower()
        # Same whole-token comparison for the word that follows.
        if next_form in blocking_next:
            return False
        # A following token that starts with a comma always disqualifies
        # the constituent (same review note as for the preceding comma).
        if next_form.startswith(","):
            return False

    logger.debug(
        "Mention is bare NP: %s(%s)", constituent[FORM], constituent[ID])
    return True
def _get_number(self, mention):
    """Work out the grammatical number of a mention.

    The sources are consulted in this order: pronoun dictionaries, the
    enumeration mention type, the head POS tag, the Bergsma word lists
    (only when enabled) and finally the NER tag.

    :param mention: The mention whose number is wanted.
    :return: PLURAL, SINGULAR or UNKNOWN constant.
    """
    head = self.graph_builder.get_head_word(mention)
    head_pos = head.get(POS)
    mention_ner = mention.get(NER)

    # Lower-cased head form; both names hold lower-cased text.
    form = head[FORM].lower()
    form_lower = form.lower()

    # Pronouns are decided by the pronoun dictionaries alone.
    if pos_tags.pronoun(head_pos) or pronouns.all(form_lower):
        self.logger.debug("Number: Pronoun")
        if pronouns.plural(form_lower):
            return PLURAL
        if pronouns.singular(form_lower):
            return SINGULAR
        return UNKNOWN

    # Enumerations are plural; a mention without a type is only reported.
    try:
        mention_type = mention[MENTION]
    except KeyError:
        self.logger.warning("Number without TYPE")
    else:
        if mention_type == ENUMERATION_MENTION:
            self.logger.debug("Number: Enumeration")
            return PLURAL

    # Head POS tag.
    for pos_check, number in ((pos_tags.singular, SINGULAR),
                              (pos_tags.plural, PLURAL)):
        if pos_check(head_pos):
            self.logger.debug("Number: POS")
            return number

    # Bergsma lists.
    if self.use_bergsma_number_lists:
        for word_list, number in ((singular_words, SINGULAR),
                                  (plural_words, PLURAL)):
            if form in word_list:
                self.logger.debug("Number: Bergsma list")
                return number

    # Last resort: the NER tag can mark the mention as singular.
    if ner_tags.singular(mention_ner):
        self.logger.debug("Number: NER")
        return SINGULAR

    return UNKNOWN
def _get_animacy(self, mention):
    """Determine the animacy of the mention.

    The sources are consulted in this order: pronoun dictionaries, the
    mention NER tag, the head POS tag and (only when enabled) the Bergsma
    word lists.

    :param mention: The mention which animacy is wanted.
    :return: ANIMATE, INANIMATE or UNKNOWN constant.
    """
    head_word = self.graph_builder.get_head_word(mention)
    word_form = rules.get_head_word_form(self.graph_builder, mention)
    word_ner = mention.get(NER)
    word_pos = head_word.get(POS)

    # Normalize parameters: lower-case the form and map every digit to "0".
    # The pattern is a raw string so that "\d" is not an invalid escape.
    normalized_form = re.sub(r"\d", "0", word_form.lower())
    # The POS tag is read with .get() and may be absent; only strip the "$"
    # marker when there is a tag to strip it from.
    if word_pos is not None:
        normalized_pos = word_pos.replace("$", "")
    else:
        normalized_pos = None

    # Pronouns
    if pos_tags.pronoun(normalized_pos) or pronouns.all(normalized_form):
        if pronouns.inanimate(normalized_form):
            return INANIMATE
        if pronouns.animate(normalized_form):
            return ANIMATE
        return UNKNOWN

    # NER
    if ner_tags.animate(word_ner):
        return ANIMATE
    if ner_tags.inanimate(word_ner):
        return INANIMATE

    # Use the mention POS to determine the feature
    if pos_tags.inanimate(word_pos):
        return INANIMATE
    if pos_tags.animate(word_pos):
        return ANIMATE

    # Bergsma Lists
    # NOTE(review): the lookup uses the un-normalized form and is gated by
    # the *number* lists flag; confirm both are intended.
    if self.use_bergsma_number_lists:
        if word_form in animate_words:
            return ANIMATE
        if word_form in inanimate_words:
            return INANIMATE

    return UNKNOWN
def extract_and_mark(self, mention):
    """Classify the mention and flag how it starts.

    Sets the RELATIVE_PRONOUN, STARTED_BY_INDEFINITE_ARTICLE and
    STARTED_BY_INDEFINITE_PRONOUN flags on the mention, then assigns one of
    the enumeration, pronoun, proper or nominal mention types.

    :param mention: The mention to be classified.
    """
    mention_words = self.graph_builder.get_words(mention)
    head_word = self.graph_builder.get_head_word(mention)
    pos = head_word[POS]
    form = head_word[FORM].lower()
    # Read but not consulted by the active classification rules below.
    head_word_ner = head_word.get(HEAD_OF_NER)
    opening_form = mention_words[0][FORM].lower()
    is_single_word = len(mention_words) == 1

    # Flags derived from the first word of the mention.
    mention[RELATIVE_PRONOUN] = bool(
        pronouns.relative(opening_form) and is_single_word)
    mention[STARTED_BY_INDEFINITE_ARTICLE] = bool(
        determiners.indefinite_articles(opening_form))
    mention[STARTED_BY_INDEFINITE_PRONOUN] = bool(
        pronouns.indefinite(opening_form))

    # Mention type: the first matching rule wins.
    if rules.is_enumeration(self.graph_builder, mention):
        mention_type = ENUMERATION_MENTION
    elif is_single_word and (
            pos_tags.pronoun(pos)
            or pronouns.all(form)
            or pronouns.relative(form)):
        # A one-word mention that is a pronoun by tag or by dictionary.
        mention_type = PRONOUN_MENTION
    elif pos_tags.proper_noun(pos):
        mention_type = PROPER_MENTION
    else:
        # Anything else is treated as nominal.
        mention_type = NOMINAL_MENTION
    self._set_mention_type(mention, mention_type)
def filter(self, mention, prev_mentions):
    """Check whether a relative-pronoun mention must be filtered out.

    A mention whose POS is a relative pronoun is filtered when the word
    before it is a determinant, or when the mention form is "que" and the
    word after it is a pronoun.

    :param mention: The mention to test.
    :param prev_mentions: Not used by this filter.
    :return: True or False.
    """
    if not pos_tags.relative_pronoun(mention.get(POS, "")):
        return False

    words = self.graph_builder.get_words(
        self.graph_builder.get_root(mention))
    mention_words = self.graph_builder.get_words(mention)
    first_word_index = words.index(mention_words[0])
    last_word_index = words.index(mention_words[-1])

    # Preceded by a determinant.
    if first_word_index > 0 and pos_tags.determinant(
            words[first_word_index - 1][POS]):
        return True

    # The mention may close the sentence; there is no next word to inspect
    # in that case, so indexing past the end must be avoided.
    next_word_index = last_word_index + 1
    if next_word_index >= len(words):
        return False

    next_word = words[next_word_index]
    # NOTE(review): the next word form is passed without lower-casing;
    # confirm the pronoun dictionary expects that.
    if pos_tags.pronoun(next_word[POS]) or pronouns.all(next_word[FORM]):
        if mention[FORM].lower() == "que":
            self.logger.debug(
                "Mention is relative %s(%s)",
                mention[ID], self.graph_builder.get_root(mention)[FORM])
            return True
    return False
def process_graph(self):
    """Prepare the graph for output.

    Stores the document type and per-sentence histograms in ``self.meta``,
    runs the feature extractor over every mention, resolves the
    coreference and finally stores document-wide counters under
    ``self.meta["overall"]``.
    """
    # Function-scope imports are kept from the original implementation.
    from corefgraph.multisieve.features.constants import MENTION
    from corefgraph.resources.tagset import pos_tags
    from corefgraph.resources.dictionaries import pronouns

    graph_builder = self.graph_builder

    def is_pronoun(word):
        # Pronoun by POS tag or by either pronoun dictionary.
        return (pos_tags.pronoun(word[POS])
                or pronouns.all(word[FORM])
                or pronouns.relative(word[FORM]))

    self.meta[graph_builder.doc_type] = graph_builder.get_doc_type()

    # Fetch the sentences and their words once; every histogram below
    # walks the same sentences in the same order.
    sentences = list(graph_builder.get_all_sentences())
    words_per_sentence = [
        graph_builder.get_words(sentence) for sentence in sentences]
    self.meta["sentences"] = {
        'words_histogram': [len(words) for words in words_per_sentence],
        'pronouns_histogram': [
            sum(1 for word in words if is_pronoun(word))
            for words in words_per_sentence],
        'named_entities_histogram': [
            len(graph_builder.get_sentence_named_entities(sentence))
            for sentence in sentences],
        'mentions_histogram': [
            len(graph_builder.get_sentence_gold_mentions(sentence))
            for sentence in sentences],
    }
    self.meta["features"] = {
        'counters': defaultdict(Counter),
        'mentions': defaultdict(dict)}

    # Characterize every mention, sentence by sentence.
    for index, sentence in enumerate(
            self.coreference_processor.mentions_textual_order):
        self.logger.debug("Featuring Sentence %d", index)
        for mention in sentence:
            self.feature_extractor.characterize_mention(mention)

    # Resolve the coreference
    self.logger.debug("Resolve Coreference...")
    self.coreference_processor.resolve_text()

    # The gold mentions are read after resolution, once, for all the
    # mention statistics.
    gold_mentions = list(graph_builder.get_all_gold_mentions())
    self.meta["overall"] = {
        'words': Counter(
            word[POS] for word in graph_builder.get_all_words()),
        'namedEntities': Counter(
            ne[NER] for ne in graph_builder.get_all_named_entities()),
        'constituents': Counter(
            constituent[TAG]
            for constituent in graph_builder.get_all_constituents()),
        'mentions': Counter(
            mention.get(MENTION) for mention in gold_mentions),
        'mentions_size': [
            len(graph_builder.get_words(mention))
            for mention in gold_mentions],
        # Mentions without a constituent are reported with depth -1.
        'mentions_deep': [
            mention.get(CONSTITUENT, {DEEP: -1})[DEEP]
            for mention in gold_mentions],
        'mentions_per_entity': Counter(
            mention[GOLD_ENTITY] for mention in gold_mentions).values(),
    }