Example #1
def sense_tokenize(text_block, annotator, stemmer, stop_words, phrase_tags):
    """
    tokenize a block into sentences which are word tokenized, preserving the sense of the words
    (see the original paper for details)
    :param text_block: block of text (string)
    :param annotator: senna annotator
    :param stemmer: porter stemmer instance
    :param stop_words: list of stopwords to use
    :param phrase_tags: tags of the phrases to parse
    :return: list of sentences each tokenized into words
    """
    sentences = SENT_RE.findall(text_block)
    sense_phrases = []

    for sentence in sentences:
        # strip quotes and brackets; normalize "/" and "-" before annotation
        sentence = sentence.replace('\'', '').replace('(', ' ') \
            .replace(')', ' ').replace("/", " or ").replace("-", "")

        sentence = TAG_RE.sub('', sentence)
        # work with ASCII strings only
        sentence = "".join(c for c in sentence if 0 < ord(c) < 127)
        # logger.info("Will sense tokenize : %s" % sentence)
        try:
            senna_annotation = annotator.getAnnotations(sentence)
        except Exception as e:
            #logger.error("annontator error")
            #logger.error(e)
            continue

        chunk_parse, pos_tags, words = senna_annotation['chunk'], senna_annotation['pos'], \
                                       senna_annotation['words']

        single_words = [
            stemmer.stem(word) + '|' + normalize_pos(tag)
            for word, tag in pos_tags if word not in stop_words
        ]

        sense_phrases.append(single_words)

        noun_phrases = form_phrases(chunk_parse, NPTags)
        verb_phrases = form_phrases(chunk_parse, VPTags)

        # izip is Python 2's itertools.izip; this listing targets Python 2
        non_phrase_words = [
            stemmer.stem(word) + '|' + normalize_pos(pos_tag)
            for (word, chunk_tag), (_, pos_tag) in izip(chunk_parse, pos_tags)
            if chunk_tag not in phrase_tags and word not in stop_words
        ]

        # form NP/VP entities; a multi-word entity also contributes a sense
        # phrase made of its stemmed member words plus the whole phrase
        noun_entities, verb_entities = [], []
        for np in noun_phrases:
            en = relation_util.form_entity(words, np, chunk_parse, pos_tags,
                                           'NP')
            if not en: continue
            en_words = en.split(" ")
            if len(en_words) > 1:
                en_words_with_pos = [
                    stemmer.stem(w) + '|' + normalize_pos(pos_tag)
                    for (w, pos_tag) in pos_tags if stemmer.stem(w) in en_words
                ]
                en_words_with_pos.append(en + '|NP')
                sense_phrases.append(en_words_with_pos)
            noun_entities.append(en + '|NP')

        for vp in verb_phrases:
            en = relation_util.form_entity(words, vp, chunk_parse, pos_tags,
                                           'VP')
            if not en: continue
            en_words = en.split(" ")
            if len(en_words) > 1:
                en_words_with_pos = [
                    stemmer.stem(w) + '|' + normalize_pos(pos_tag)
                    for (w, pos_tag) in pos_tags if stemmer.stem(w) in en_words
                ]
                en_words_with_pos.append(en + '|VP')
                sense_phrases.append(en_words_with_pos)
            verb_entities.append(en + '|VP')

        # replay the chunk parse to emit non-phrase words and NP/VP entities
        # in their original sentence order
        noun_index, verb_index, non_phrase_index = 0, 0, 0
        sense_words = []
        for (word, chunk_tag) in chunk_parse:
            if chunk_tag not in phrase_tags:
                if non_phrase_index < len(non_phrase_words):
                    sense_words.append(non_phrase_words[non_phrase_index])
                    non_phrase_index += 1

            if chunk_tag in [NPTags.end.value, NPTags.single.value]:
                if noun_index < len(noun_entities):
                    sense_words.append(noun_entities[noun_index])
                    noun_index += 1

            if chunk_tag in [VPTags.end.value, VPTags.single.value]:
                if verb_index < len(verb_entities):
                    sense_words.append(verb_entities[verb_index])
                    verb_index += 1

        if sense_words:
            sense_phrases.append(sense_words)

    return sense_phrases
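The helpers referenced above (SENT_RE, TAG_RE, NPTags, VPTags, form_phrases, normalize_pos, relation_util) are defined elsewhere in the module, so the snippet is not self-contained. A minimal usage sketch, assuming the annotator is practNLPTools' SENNA wrapper and NLTK supplies the stemmer and stopwords (both assumptions), with a hypothetical IOBES phrase_tags set:

# usage sketch -- practnlptools/NLTK are assumptions, not shown in the listing
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from practnlptools.tools import Annotator  # SENNA wrapper; exposes getAnnotations()

annotator = Annotator()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
# hypothetical IOBES chunk tags; the real values come from NPTags/VPTags
phrase_tags = {'B-NP', 'I-NP', 'E-NP', 'S-NP',
               'B-VP', 'I-VP', 'E-VP', 'S-VP'}

phrases = sense_tokenize("SENNA chunks and tags every sentence it is given.",
                         annotator, stemmer, stop_words, phrase_tags)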
Example #2
    def form_relations(self, text, block_id, payload, ff, persist=True):
        """
        form relation(s) on a given text
        :param text: text on which to get the relations on,
        text will be sentence tokenized and relations formed at sentence level
        :param block_id: unique identifier of the block
        :param persist: persist the relations extracted from the text in the sink,
        relation_sink needed to be specified
        :return: list of relations
        """
        text_sentences = pattern.tokenize(text)
        relations = []
        for sentence in text_sentences:

            # work with ASCII strings only
            sentence = "".join(c for c in sentence if 0 < ord(c) < 127)
            try:
                senna_annotation = self.relation_annotator.getAnnotations(sentence)
            except Exception as e:
                logger.error(e)
                continue

            chunk_parse, pos_tags, role_labeling, tokenized_sentence = \
                senna_annotation['chunk'], senna_annotation['pos'], senna_annotation['srl'], \
                senna_annotation['words']

            # nothing to do here: empty SRL
            if not role_labeling: continue

            for semantic_element in role_labeling:
                arguments = RelationExtractor.__populate_arguments(semantic_element)
                modifiers = RelationExtractor.__populate_modifier(semantic_element)
                verb = semantic_element.get('V')
                # order of the arguments matters: A0 --> A1 --> A2 --> A3
                # (vars() iteration order is not guaranteed on Python 2, so
                # __populate_arguments must itself preserve that order)
                arguments = [v for v in vars(arguments).itervalues() if v]
                modifiers = [v for v in vars(modifiers).itervalues() if v]

                if not arguments: continue
                # all ordered pairs (a_i, a_j) with i < j, i.e.
                # itertools.combinations(arguments, 2)
                argument_pairs = [(ai, aj)
                                  for i, ai in enumerate(arguments)
                                  for j, aj in enumerate(arguments) if i < j]

                verb = relation_util.normalize_relation(verb)

                for a0, a1 in argument_pairs:
                    en0 = relation_util.form_entity(tokenized_sentence, a0, chunk_parse, pos_tags)
                    en1 = relation_util.form_entity(tokenized_sentence, a1, chunk_parse, pos_tags)
                    if not en0 or not en1: continue
                    relations.append(RelationTuple(left_entity=en0, right_entity=en1, relation=verb,
                                                   sentence=sentence, text=text, block_id=block_id,
                                                   payload=payload, ff=ff))
                    logger.info("generated a relation for %s", block_id)

                for arg_modifier in modifiers:
                    # attach each modifier to the argument nearest to it in the sentence
                    mod_pos = sentence.find(arg_modifier)
                    linked_arg = min(((a, abs(mod_pos - sentence.find(a))) for a in arguments),
                                     key=lambda e: e[1])[0]
                    en0 = relation_util.form_entity(tokenized_sentence, linked_arg, chunk_parse, pos_tags)
                    en1 = relation_util.form_entity(tokenized_sentence, arg_modifier, chunk_parse, pos_tags)
                    if not en0 or not en1: continue
                    relations.append(RelationTuple(left_entity=en0, right_entity=en1, relation=verb,
                                                   sentence=sentence, text=text, block_id=block_id,
                                                   payload=payload, ff=ff))
                    logger.info("generated a relation for %s", block_id)

        return relations
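RelationTuple, relation_util, pattern, and the private __populate_* helpers are defined elsewhere in the repository. From the constructor calls above, a minimal compatible RelationTuple can be reconstructed as a namedtuple (a sketch inferred from usage, not the repository's actual definition):

from collections import namedtuple

# fields inferred from the keyword arguments passed above; nothing more is implied
RelationTuple = namedtuple('RelationTuple',
                           ['left_entity', 'right_entity', 'relation', 'sentence',
                            'text', 'block_id', 'payload', 'ff'])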