def _entity_sense_words(phrases, phrase_label, words, chunk_parse, pos_tags,
                        stemmer, sense_phrases):
    """
    Form an entity for each chunk phrase and collect its sense token.

    For every multi-word entity, a per-word sense list (stem|POS plus the
    entity itself) is appended to ``sense_phrases`` as a side effect.

    :param phrases: phrase index spans produced by ``form_phrases``
    :param phrase_label: 'NP' or 'VP' suffix used for the sense tokens
    :param words: tokenized sentence from the senna annotation
    :param chunk_parse: senna chunk annotation
    :param pos_tags: senna POS annotation
    :param stemmer: porter stemmer instance
    :param sense_phrases: output list to receive multi-word entity senses
    :return: list of '<entity>|<label>' strings, one per non-empty entity
    """
    entities = []
    for phrase in phrases:
        en = relation_util.form_entity(words, phrase, chunk_parse, pos_tags,
                                       phrase_label)
        if not en:
            continue
        en_words = en.split(" ")
        if len(en_words) > 1:
            # record each stemmed member word alongside the whole entity
            en_words_with_pos = [
                stemmer.stem(w) + '|' + normalize_pos(pos_tag)
                for (w, pos_tag) in pos_tags if stemmer.stem(w) in en_words
            ]
            en_words_with_pos.append(en + '|' + phrase_label)
            sense_phrases.append(en_words_with_pos)
        entities.append(en + '|' + phrase_label)
    return entities


def sense_tokenize(text_block, annotator, stemmer, stop_words):
    """
    Tokenize a block into sentences which are word tokenized, preserving
    the sense of the words (see the original paper for details).

    :param text_block: block of text (string)
    :param annotator: senna annotator
    :param stemmer: porter stemmer instance
    :param stop_words: list of stopwords to use
    :return: list of sentences each tokenized into sense words

    NOTE(review): relies on the module-level ``phrase_tags`` (the old
    docstring documented it as a parameter, but it is not one), as well as
    ``SENT_RE``, ``TAG_RE``, ``NPTags``, ``VPTags``, ``form_phrases``,
    ``normalize_pos`` and ``relation_util``.
    """
    sentences = SENT_RE.findall(text_block)
    sense_phrases = []
    for sentence in sentences:
        # strip punctuation/markup that confuses the annotator
        sentence = sentence.replace('\'', '').replace('(', ' ') \
            .replace(')', ' ').replace("/", " or ").replace("-", "")
        sentence = TAG_RE.sub('', sentence)
        # senna handles ascii input only
        sentence = "".join(c for c in sentence if 0 < ord(c) < 127)
        try:
            senna_annotation = annotator.getAnnotations(sentence)
        except Exception:
            # best effort: skip sentences the annotator cannot process
            continue
        chunk_parse = senna_annotation['chunk']
        pos_tags = senna_annotation['pos']
        words = senna_annotation['words']

        # every non-stopword as an individual stem|POS sense token
        single_words = [
            stemmer.stem(word) + '|' + normalize_pos(tag)
            for word, tag in pos_tags if word not in stop_words
        ]
        sense_phrases.append(single_words)

        noun_phrases = form_phrases(chunk_parse, NPTags)
        verb_phrases = form_phrases(chunk_parse, VPTags)

        # words outside any NP/VP phrase, in sentence order
        non_phrase_words = [
            stemmer.stem(word) + '|' + normalize_pos(pos_tag)
            for ((word, chunk_tag), (_, pos_tag)) in izip(chunk_parse, pos_tags)
            if chunk_tag not in phrase_tags and word not in stop_words
        ]

        noun_entities = _entity_sense_words(noun_phrases, 'NP', words,
                                            chunk_parse, pos_tags, stemmer,
                                            sense_phrases)
        verb_entities = _entity_sense_words(verb_phrases, 'VP', words,
                                            chunk_parse, pos_tags, stemmer,
                                            sense_phrases)

        # interleave non-phrase words and phrase entities back into
        # sentence order by walking the chunk tags
        noun_index, verb_index, non_phrase_index = 0, 0, 0
        sense_words = []
        for _, chunk_tag in chunk_parse:
            if chunk_tag not in phrase_tags:
                if non_phrase_index < len(non_phrase_words):
                    sense_words.append(non_phrase_words[non_phrase_index])
                    non_phrase_index += 1
            if chunk_tag in (NPTags.end.value, NPTags.single.value):
                if noun_index < len(noun_entities):
                    sense_words.append(noun_entities[noun_index])
                    noun_index += 1
            if chunk_tag in (VPTags.end.value, VPTags.single.value):
                if verb_index < len(verb_entities):
                    sense_words.append(verb_entities[verb_index])
                    verb_index += 1
        if sense_words:
            sense_phrases.append(sense_words)
    return sense_phrases
def form_relations(self, text, block_id, payload, ff, persist=True):
    """
    Form relation(s) on a given text.

    :param text: text on which to get the relations on; the text is
        sentence tokenized and relations are formed at sentence level
    :param block_id: unique identifier of the block
    :param payload: opaque payload attached to every emitted relation
    :param ff: extra field forwarded into every RelationTuple
    :param persist: persist the relations extracted from the text in the
        sink, relation_sink needed to be specified
    :return: list of RelationTuple
    """
    text_sentences = pattern.tokenize(text)
    relations = []
    for sentence in text_sentences:
        # work with ascii string only
        sentence = "".join(c for c in sentence if 0 < ord(c) < 127)
        try:
            senna_annotation = self.relation_annotator.getAnnotations(sentence)
        except Exception as e:
            logger.error(e)
            continue
        chunk_parse = senna_annotation['chunk']
        pos_tags = senna_annotation['pos']
        role_labeling = senna_annotation['srl']
        tokenized_sentence = senna_annotation['words']
        # nothing to do here, empty srl
        if not role_labeling:
            continue
        for semantic_element in role_labeling:
            arguments = RelationExtractor.__populate_arguments(semantic_element)
            modifiers = RelationExtractor.__populate_modifier(semantic_element)
            verb = semantic_element.get('V')
            # order of the arguments is important (A0 --> A1 --> A2 --> A3):
            # sort by attribute name instead of relying on the arbitrary
            # __dict__ iteration order of itervalues()
            arguments = [v for _, v in sorted(vars(arguments).items()) if v]
            modifiers = [v for _, v in sorted(vars(modifiers).items()) if v]
            if not arguments:
                continue
            verb = relation_util.normalize_relation(verb)

            def _emit(left, right):
                # build one relation tuple and record it
                relations.append(RelationTuple(
                    left_entity=left, right_entity=right, relation=verb,
                    sentence=sentence, text=text, block_id=block_id,
                    payload=payload, ff=ff))
                logger.info("generated a relation for %s", block_id)

            # all ordered argument pairs (i < j)
            argument_pairs = [(ai, aj)
                              for i, ai in enumerate(arguments)
                              for j, aj in enumerate(arguments) if i < j]
            for a0, a1 in argument_pairs:
                en0 = relation_util.form_entity(tokenized_sentence, a0,
                                                chunk_parse, pos_tags)
                en1 = relation_util.form_entity(tokenized_sentence, a1,
                                                chunk_parse, pos_tags)
                if en0 and en1:
                    _emit(en0, en1)

            for arg_modifier in modifiers:
                # link each modifier to the argument nearest to it in the
                # raw sentence text
                mod_pos = sentence.find(arg_modifier)
                linked_arg = min(((a, abs(mod_pos - sentence.find(a)))
                                  for a in arguments),
                                 key=lambda pair: pair[1])[0]
                en0 = relation_util.form_entity(tokenized_sentence, linked_arg,
                                                chunk_parse, pos_tags)
                en1 = relation_util.form_entity(tokenized_sentence, arg_modifier,
                                                chunk_parse, pos_tags)
                if en0 and en1:
                    _emit(en0, en1)
    return relations
def sense_tokenize(text_block, annotator, stemmer, stop_words):
    """
    Sentence-split a text block and word-tokenize each sentence while
    preserving word sense (see the original paper for details).

    :param text_block: block of text (string)
    :param annotator: senna annotator
    :param stemmer: porter stemmer instance
    :param stop_words: list of stopwords to use
    :param phrase_tags: tags of the phrases to parse
    :return: list of sentences each tokenized into words
    """
    results = []
    for raw in SENT_RE.findall(text_block):
        # normalize punctuation before annotating
        cleaned = (raw.replace('\'', '')
                      .replace('(', ' ')
                      .replace(')', ' ')
                      .replace("/", " or ")
                      .replace("-", ""))
        cleaned = TAG_RE.sub('', cleaned)
        # keep printable ascii only
        cleaned = "".join(ch for ch in cleaned if 0 < ord(ch) < 127)
        try:
            annotation = annotator.getAnnotations(cleaned)
        except Exception:
            continue
        chunks = annotation['chunk']
        pos = annotation['pos']
        tokens = annotation['words']

        # individual stem|POS tokens for every non-stopword
        results.append([
            stemmer.stem(w) + '|' + normalize_pos(t)
            for w, t in pos if w not in stop_words
        ])

        np_spans = form_phrases(chunks, NPTags)
        vp_spans = form_phrases(chunks, VPTags)

        # stem|POS tokens for words that belong to no phrase
        outside_phrase = [
            stemmer.stem(w) + '|' + normalize_pos(p)
            for (w, c_tag), (_, p) in izip(chunks, pos)
            if c_tag not in phrase_tags and w not in stop_words
        ]

        def _collect(spans, label):
            # form entities for the given spans; multi-word entities also
            # contribute a per-word sense list to the results
            found = []
            for span in spans:
                entity = relation_util.form_entity(tokens, span, chunks,
                                                   pos, label)
                if not entity:
                    continue
                parts = entity.split(" ")
                if len(parts) > 1:
                    with_pos = [
                        stemmer.stem(w) + '|' + normalize_pos(p)
                        for w, p in pos if stemmer.stem(w) in parts
                    ]
                    with_pos.append(entity + '|' + label)
                    results.append(with_pos)
                found.append(entity + '|' + label)
            return found

        noun_entities = _collect(np_spans, 'NP')
        verb_entities = _collect(vp_spans, 'VP')

        # rebuild the sentence order from the chunk tags
        ni = vi = oi = 0
        ordered = []
        for _, c_tag in chunks:
            if c_tag not in phrase_tags and oi < len(outside_phrase):
                ordered.append(outside_phrase[oi])
                oi += 1
            if c_tag in (NPTags.end.value, NPTags.single.value) \
                    and ni < len(noun_entities):
                ordered.append(noun_entities[ni])
                ni += 1
            if c_tag in (VPTags.end.value, VPTags.single.value) \
                    and vi < len(verb_entities):
                ordered.append(verb_entities[vi])
                vi += 1
        if ordered:
            results.append(ordered)
    return results
def form_relations(self, text, block_id, payload, ff, persist=True):
    """
    Extract relation tuples from a block of text.

    The text is sentence tokenized; each sentence is annotated with senna
    and relations are formed at sentence level from its semantic role
    frames.

    :param text: text on which to get the relations on
    :param block_id: unique identifier of the block
    :param persist: persist the relations extracted from the text in the
        sink, relation_sink needed to be specified
    :return: list of relations
    """
    extracted = []
    for sent in pattern.tokenize(text):
        # work with ascii string only
        sent = "".join(ch for ch in sent if 0 < ord(ch) < 127)
        try:
            annotation = self.relation_annotator.getAnnotations(sent)
        except Exception as err:
            logger.error(err)
            continue
        chunks = annotation['chunk']
        pos = annotation['pos']
        srl = annotation['srl']
        tokens = annotation['words']
        if not srl:
            # nothing to do here empty srl
            continue
        for frame in srl:
            args = RelationExtractor.__populate_arguments(frame)
            mods = RelationExtractor.__populate_modifier(frame)
            verb = frame.get('V')
            # order of the arguments returned is important, A0 --> A1 --> A2 --> A3
            args = [a for a in vars(args).itervalues() if a]
            mods = [m for m in vars(mods).itervalues() if m]
            if not args:
                continue
            pairs = [(x, y)
                     for i, x in enumerate(args)
                     for j, y in enumerate(args) if i < j]
            verb = relation_util.normalize_relation(verb)
            for left_arg, right_arg in pairs:
                first = relation_util.form_entity(tokens, left_arg, chunks, pos)
                second = relation_util.form_entity(tokens, right_arg, chunks, pos)
                if not first or not second:
                    continue
                extracted.append(RelationTuple(left_entity=first,
                                               right_entity=second,
                                               relation=verb,
                                               sentence=sent,
                                               text=text,
                                               block_id=block_id,
                                               payload=payload,
                                               ff=ff))
                logger.info("generated a relation for ")
                logger.info(block_id)
            for mod in mods:
                # link the modifier to the closest argument in the sentence
                where = sent.find(mod)
                nearest = min([(a, abs(where - sent.find(a))) for a in args],
                              key=lambda pair: pair[1])[0]
                first = relation_util.form_entity(tokens, nearest, chunks, pos)
                second = relation_util.form_entity(tokens, mod, chunks, pos)
                if not first or not second:
                    continue
                extracted.append(RelationTuple(left_entity=first,
                                               right_entity=second,
                                               relation=verb,
                                               sentence=sent,
                                               text=text,
                                               block_id=block_id,
                                               payload=payload,
                                               ff=ff))
                logger.info("generated a relation for ")
                logger.info(block_id)
    return extracted