Exemplo n.º 1
0
def is_bare_np(graph_builder, constituent):
    """Check whether a constituent is a bare noun phrase.

    A constituent is accepted when all of the following hold:

    * its head word is not a pronoun (by dictionary or by POS tag);
    * the head POS passes ``pos_tags.singular_common_noun`` and the head form
      is not matched by ``temporals.temporals``;
    * it is a single word, or its first word is tagged as an adjective;
    * it starts at the first word of its sentence;
    * the token that follows it (if any) is not a comma and is not one of the
      coordinating conjunction forms ``y``, ``e`` or ``ni``.

    :param graph_builder: The graph builder used to reach the head word, the
        words of the constituent and the words of its sentence.
    :param constituent: The constituent to test.
    :return: True if the constituent is a bare NP, False otherwise.
    """
    conjunction_forms = ("y", "e", "ni")

    head_word = graph_builder.get_head_word(constituent)
    head_form = head_word[FORM].lower()
    head_word_pos = head_word[POS]
    words = graph_builder.get_words(constituent)
    sentence_words = graph_builder.get_words(
        graph_builder.get_root(constituent))
    # Both boundary lookups can fail when the constituent spans more than one
    # sentence, so both are guarded (the first one used to be outside the try).
    try:
        first_word_index = sentence_words.index(words[0])
        last_word_index = sentence_words.index(words[-1])
    except ValueError:
        logger.warning("Multisentence entity %s", constituent[ID])
        return False

    if pronouns.all(head_form) or pos_tags.pronoun(head_word_pos):
        return False

    if not pos_tags.singular_common_noun(head_word_pos):
        return False
    if temporals.temporals(head_form):
        return False
    if len(words) != 1 and not pos_tags.adjective(words[0][POS]):
        return False

    if first_word_index > 0:
        # NOTE(review): the previous implementation inspected the preceding
        # token ('al', 'del', 'y', 'e', 'ni' or a comma) but then returned
        # False on every path, so only that outcome is kept here. Confirm
        # whether rejecting every non sentence-initial constituent is intended.
        return False

    if last_word_index + 1 < len(sentence_words):
        next_form = sentence_words[last_word_index + 1][FORM].lower()
        # Compare the whole token: testing only its first character could
        # never match "ni" and wrongly matched any word starting with y/e.
        if next_form in conjunction_forms or next_form.startswith(","):
            return False

    logger.debug("Mention is bare NP: %s(%s)", constituent[FORM],
                 constituent[ID])
    return True
Exemplo n.º 2
0
    def _get_number(self, mention):
        """Work out the grammatical number of a mention.

        Evidence is consulted in a fixed order and the first source that gives
        an answer wins: pronoun dictionaries, the enumeration mention type, the
        head word POS tag, the Bergsma word lists (only when
        ``self.use_bergsma_number_lists`` is set) and finally the NER tag.

        :param mention: The mention whose number is wanted.
        :return: PLURAL, SINGULAR or UNKNOWN constant.
        """
        head = self.graph_builder.get_head_word(mention)
        head_pos = head.get(POS)
        mention_ner = mention.get(NER)
        # Every dictionary lookup below works on the lower-cased head form.
        lowered_form = head[FORM].lower()

        # Pronouns: once the head is a pronoun nothing else is consulted.
        if pos_tags.pronoun(head_pos) or pronouns.all(lowered_form):
            self.logger.debug("Number: Pronoun")
            if pronouns.plural(lowered_form):
                return PLURAL
            if pronouns.singular(lowered_form):
                return SINGULAR
            return UNKNOWN

        # Enumerations are plural; a mention without type is only reported.
        try:
            mention_type = mention[MENTION]
        except KeyError:
            self.logger.warning("Number without TYPE")
        else:
            if mention_type == ENUMERATION_MENTION:
                self.logger.debug("Number: Enumeration")
                return PLURAL

        # POS tag of the head word, singular checked before plural.
        for pos_check, number in ((pos_tags.singular, SINGULAR),
                                  (pos_tags.plural, PLURAL)):
            if pos_check(head_pos):
                self.logger.debug("Number: POS")
                return number

        # Bergsma lists, singular checked before plural.
        if self.use_bergsma_number_lists:
            for vocabulary, number in ((singular_words, SINGULAR),
                                       (plural_words, PLURAL)):
                if lowered_form in vocabulary:
                    self.logger.debug("Number: Bergsma list")
                    return number

        # Last resort: the NER tag can only vouch for singular.
        if not ner_tags.singular(mention_ner):
            return UNKNOWN
        self.logger.debug("Number: NER")
        return SINGULAR
Exemplo n.º 3
0
    def _get_animacy(self, mention):
        """Determine the animacy of a mention.

        Sources are consulted in order and the first one that answers wins:
        pronoun dictionaries, the mention NER tag, the head word POS tag and,
        when ``self.use_bergsma_number_lists`` is set, the animate/inanimate
        word lists.

        :param mention: The mention which animacy is wanted.
        :return: ANIMATE, INANIMATE or UNKNOWN constant
        """
        head_word = self.graph_builder.get_head_word(mention)
        word_form = rules.get_head_word_form(self.graph_builder, mention)
        word_ner = mention.get(NER)
        word_pos = head_word.get(POS)
        # Normalize parameters: lower-case the form, turn every digit into
        # "0" and drop "$" from the POS tag. The pattern is a raw string so
        # that "\d" is not treated as an (invalid) string escape.
        normalized_form = re.sub(r"\d", "0", word_form.lower())
        normalized_pos = word_pos.replace("$", "")
        # Pronouns: once the head is a pronoun nothing else is consulted.
        if pos_tags.pronoun(normalized_pos) or pronouns.all(normalized_form):
            if pronouns.inanimate(normalized_form):
                return INANIMATE
            elif pronouns.animate(normalized_form):
                return ANIMATE
            else:
                return UNKNOWN
        # NER
        if ner_tags.animate(word_ner):
            return ANIMATE
        if ner_tags.inanimate(word_ner):
            return INANIMATE

        # Use the mention POS to determine the feature
        if pos_tags.inanimate(word_pos):
            return INANIMATE
        if pos_tags.animate(word_pos):
            return ANIMATE
        # Bergsma Lists
        # NOTE(review): the lookup uses the raw head form, not
        # normalized_form, and is gated by the *number* lists flag; confirm
        # both are intended.
        if self.use_bergsma_number_lists:
            if word_form in animate_words:
                return ANIMATE
            if word_form in inanimate_words:
                return INANIMATE
        return UNKNOWN
Exemplo n.º 4
0
    def extract_and_mark(self, mention):
        """Classify the mention and flag a few surface features on it.

        Three boolean features are stored on the mention (relative pronoun,
        started by indefinite article, started by indefinite pronoun) and then
        its type is set to enumeration, pronoun, proper or nominal, tested in
        that order.

        :param mention: The mention to be classified.
        """
        tokens = self.graph_builder.get_words(mention)
        head_token = self.graph_builder.get_head_word(mention)
        head_pos = head_token[POS]
        head_form = head_token[FORM].lower()
        # Only the NER based conditions would need this value and none of
        # them is active in the classification below.
        head_word_ner = head_token.get(HEAD_OF_NER)
        opening_form = tokens[0][FORM].lower()
        single_token = len(tokens) == 1

        # Surface features, always stored as plain booleans.
        mention[RELATIVE_PRONOUN] = bool(
            pronouns.relative(opening_form) and single_token)
        mention[STARTED_BY_INDEFINITE_ARTICLE] = bool(
            determiners.indefinite_articles(opening_form))
        mention[STARTED_BY_INDEFINITE_PRONOUN] = bool(
            pronouns.indefinite(opening_form))

        # Mention type: the first matching category wins.
        if rules.is_enumeration(self.graph_builder, mention):
            mention_type = ENUMERATION_MENTION
        elif single_token and (pos_tags.pronoun(head_pos) or
                               pronouns.all(head_form) or
                               pronouns.relative(head_form)):
            mention_type = PRONOUN_MENTION
        elif pos_tags.proper_noun(head_pos):
            mention_type = PROPER_MENTION
        else:
            # Anything that is not one of the above is nominal.
            mention_type = NOMINAL_MENTION
        self._set_mention_type(mention, mention_type)
Exemplo n.º 5
0
    def filter(self, mention, prev_mentions):
        """Check whether a relative pronoun mention must be filtered out.

        Only mentions whose POS passes ``pos_tags.relative_pronoun`` are
        examined. Such a mention is filtered when the word right before it is
        a determinant, or when its form is "que" and the word right after it
        is a pronoun (by POS tag or by dictionary).

        :param mention: The mention to test.
        :param prev_mentions: Not used by this filter.
        :return: True or False.
        """
        if not pos_tags.relative_pronoun(mention.get(POS, "")):
            return False

        root = self.graph_builder.get_root(mention)
        words = self.graph_builder.get_words(root)
        mention_words = self.graph_builder.get_words(mention)
        first_word_index = words.index(mention_words[0])
        last_word_index = words.index(mention_words[-1])

        if first_word_index > 0 and pos_tags.determinant(
                words[first_word_index - 1][POS]):
            return True

        # A mention that closes the sentence has no following word; looking
        # it up unconditionally raised IndexError.
        if last_word_index + 1 >= len(words):
            return False

        next_word = words[last_word_index + 1]
        followed_by_pronoun = pos_tags.pronoun(next_word[POS]) or \
            pronouns.all(next_word[FORM])
        if followed_by_pronoun and mention[FORM].lower() == "que":
            self.logger.debug(
                "Mention is relative  %s(%s)", mention[ID], root[FORM])
            return True
        return False
Exemplo n.º 6
0
    def process_graph(self):
        """ Prepare the graph for output.

        Fills ``self.meta`` with per-sentence histograms, characterizes every
        mention of ``self.coreference_processor.mentions_textual_order``,
        resolves the coreference and finally stores document-wide counters
        under ``self.meta["overall"]``.
        """
        # Imports are kept local to the method, as in the original code.
        from corefgraph.multisieve.features.constants import MENTION
        from corefgraph.resources.tagset import pos_tags
        from corefgraph.resources.dictionaries import pronouns

        graph_builder = self.graph_builder
        self.meta[graph_builder.doc_type] = graph_builder.get_doc_type()

        # Query the graph once and reuse the results for every histogram.
        sentences = list(graph_builder.get_all_sentences())
        words_by_sentence = [
            graph_builder.get_words(sentence) for sentence in sentences]

        def is_pronoun(word):
            # A word counts as pronoun by POS tag or by either dictionary.
            return (pos_tags.pronoun(word[POS]) or
                    pronouns.all(word[FORM]) or
                    pronouns.relative(word[FORM]))

        self.meta["sentences"] = {
            'words_histogram': [len(words) for words in words_by_sentence],
            'pronouns_histogram': [
                len([word for word in words if is_pronoun(word)])
                for words in words_by_sentence],
            'named_entities_histogram': [
                len(graph_builder.get_sentence_named_entities(sentence))
                for sentence in sentences],
            'mentions_histogram': [
                len(graph_builder.get_sentence_gold_mentions(sentence))
                for sentence in sentences],
        }

        self.meta["features"] = {
            'counters': defaultdict(Counter),
            'mentions': defaultdict(dict)}
        for index, sentence in enumerate(
                self.coreference_processor.mentions_textual_order):
            self.logger.debug("Featuring Sentence %d", index)
            for mention in sentence:
                self.feature_extractor.characterize_mention(mention)
        # Resolve the coreference
        self.logger.debug("Resolve Coreference...")
        self.coreference_processor.resolve_text()

        # Gold mentions are read after the resolution step, exactly where the
        # original code queried them.
        gold_mentions = list(graph_builder.get_all_gold_mentions())
        self.meta["overall"] = {
            'words': Counter(
                word[POS] for word in graph_builder.get_all_words()),
            'namedEntities': Counter(
                ne[NER] for ne in graph_builder.get_all_named_entities()),
            'constituents': Counter(
                constituent[TAG]
                for constituent in graph_builder.get_all_constituents()),
            'mentions': Counter(
                mention.get(MENTION) for mention in gold_mentions),
            'mentions_size': [
                len(graph_builder.get_words(mention))
                for mention in gold_mentions],
            'mentions_deep': [
                mention.get(CONSTITUENT, {DEEP: -1})[DEEP]
                for mention in gold_mentions],
            # list() keeps this a concrete list instead of a dict view.
            'mentions_per_entity': list(Counter(
                mention[GOLD_ENTITY] for mention in gold_mentions).values()),
        }