def __init__(self, me_list=None, embeddings_model=None, nlp=None, relationship_list=None):
    """Store the language-specific resources used for relation extraction.

    :param me_list: first-person tokens that identify the user (e.g. 'ich', 'my')
    :param embeddings_model: word-embedding model used for similarity scoring
    :param nlp: spaCy pipeline used for parsing sentences
    :param relationship_list: known relationship words for this language
    """
    self.relation_types = RelationTypes()
    self.relationship_list = relationship_list
    self.embeddings_model = embeddings_model
    self.nlp = nlp
    self.me_list = me_list
class SocialRelationExtractor(Component):
    """Rasa NLU pipeline component that extracts social relations from a message.

    Delegates the actual analysis to :class:`AnalyticsEngine` and publishes the
    result as two synthetic entities (``relativename`` and ``relationextracted``)
    on the message.
    """
    name = "relationextractor"
    provides = ["entities"]
    #requires = [""]
    defaults = {}
    language_list = ["de_core_news_sm"]

    def __init__(self, component_config=None):
        super(SocialRelationExtractor, self).__init__(component_config)
        # German-only engines for now (matches language_list above)
        self.re = RelationExtractor(LANG.DE)
        self.ae = AnalyticsEngine(LANG.DE)
        self.rt = RelationTypes()

    def process(self, message, **kwargs):
        """Analyze *message*, persist any extracted relations, and attach entities.

        If no relation is found the message is left untouched.
        """
        # Fix: use the module logger (used elsewhere in this file) instead of
        # print(), so output respects the application's logging configuration.
        logger.info('Processing Message %s', message.text)
        # response_message is unused here; analyze_utterance also persists.
        extracted_relations, _response_message = self.ae.analyze_utterance(
            message.text, persist=True)
        logger.info('Extracted relations: %s', extracted_relations)
        if extracted_relations:
            if len(extracted_relations[0]) == 3:
                # triple (person, relation, person): the related person's name
                entity_value = extracted_relations[0][2]
            else:
                # pair without a named person: map the relation type back to
                # its German surface form
                entity_value = self.rt.get_relation_from_relation_type_DE(
                    extracted_relations[0][1])
            entities = [{
                "value": entity_value,
                "confidence": 1,
                "entity": "relativename",
                "extractor": "relationextractor"
            }, {
                "value": True,
                "confidence": 1,
                "entity": "relationextracted",
                "extractor": "relationextractor"
            }]
            message.set("entities", entities, add_to_output=True)
def __init__(self, component_config=None):
    """Initialise the component with its German-language engines.

    :param component_config: optional Rasa component configuration dict
    """
    super(SocialRelationExtractor, self).__init__(component_config)
    # type lookup table, relation extractor and analytics engine (German)
    self.rt = RelationTypes()
    self.ae = AnalyticsEngine(LANG.DE)
    self.re = RelationExtractor(LANG.DE)
def __init__(self, lang):
    """Wire up the persistence graph and the extraction helpers.

    :param lang: language constant passed through to the relation extractor
    """
    self.re_types = RelationTypes()
    self.re = RelationExtractor(lang=lang)
    # Neo4j-backed network graph used for persisting relations
    self.ng = NetworkGraph()
class ShortestPathRE:
    """Relation extractor based on shortest dependency paths.

    Builds an undirected graph from a sentence's dependency parse, finds the
    shortest path between entity pairs, and scores the words on that path
    against a list of known relationship words via embedding similarity.
    """

    def __init__(self, me_list=None, embeddings_model=None, nlp=None, relationship_list=None):
        """Store the language-specific resources.

        :param me_list: first-person tokens that identify the user
        :param embeddings_model: word-embedding model for similarity scoring
        :param nlp: spaCy pipeline used for dependency parsing
        :param relationship_list: known relationship words for this language
        """
        self.me_list = me_list
        self.nlp = nlp
        self.embeddings_model = embeddings_model
        self.relationship_list = relationship_list
        self.relation_types = RelationTypes()

    @classmethod
    def de_lang(cls):
        """Alternate constructor preconfigured for German."""
        me_list = ['ich', 'mein', 'meine']
        embeddings_model = FlairEmbeddingModels().de_lang()
        nlp = de_core_news_sm.load()
        relationship_list = [
            'vater', 'mutter', 'sohn', 'tochter', 'bruder', 'schwester',
            'enkel', 'enkelin', 'großvater', 'großmutter', 'ehemann',
            'ehefrau', 'onkel', 'tante', 'freund'
        ]
        return cls(me_list, embeddings_model, nlp, relationship_list)

    @classmethod
    def en_lang(cls):
        """Alternate constructor preconfigured for English."""
        me_list = ['i', 'my']
        embeddings_model = FlairEmbeddingModels().en_lang()
        nlp = en_core_web_md.load()
        relationship_list = [
            'father', 'mother', 'sister', 'brother', 'son', 'daughter',
            'husband', 'wife', 'grandson', 'granddaughter', 'grandmother',
            'grandfather', 'uncle', 'aunt', 'friend'
        ]
        return cls(me_list, embeddings_model, nlp, relationship_list)

    def __search_shortest_dep_path(self, entities, sentence, plot_graph):
        """Find shortest dependency paths between all ordered entity pairs.

        :param entities: entity strings (multi-word entities joined by '_')
        :param sentence: the raw sentence to parse
        :param plot_graph: when True, plot the dependency graph
        :return: dict mapping 'e1-e2' to the path words between them
                 (entities themselves excluded)
        """
        path_dict = {}
        graph = self.__build_undirected_graph(sentence, plot_graph)
        for i, first_entity in enumerate(entities):
            # use only first name of multi-word entities
            first_entity = first_entity.split('_')[0]
            #for j in range(len(entities)):  # bidirectional relations
            for j in range(i + 1, len(entities)):  # unidirectional relations
                # use only first name of multi-word entities
                second_entity = entities[j].split('_')[0]
                # Fix: dropped the dead `not i == j` check (j always > i here)
                # and used the idiomatic `not in`.
                if first_entity != second_entity and second_entity not in self.me_list:
                    try:
                        shortest_path = nx.shortest_path(graph,
                                                         source=first_entity,
                                                         target=second_entity)
                        key = first_entity + '-' + second_entity
                        # Fix: both original branches stored the path without
                        # its endpoints ([] when the slice was empty), so a
                        # single assignment is equivalent.
                        path_dict[key] = shortest_path[1:-1]  # exclude entities in sp
                    except NodeNotFound as err:
                        logger.warning(f'Node not found: {err}')
                    except NetworkXNoPath as err:
                        logger.warning(f'Path not found: {err}')
        return path_dict

    def __build_undirected_graph(self, sentence, plot=False):
        """Build an undirected graph from the sentence's dependency parse.

        Nodes are lower-cased token texts; edges connect heads to children.
        """
        doc = self.nlp(sentence)
        edges = []
        for token in doc:
            for child in token.children:
                # TODO indicate direction of the relationship - maybe with the
                # help of the child token's dependency label
                # Fix: token.lower_ is already a str; the f-string wrapping
                # was redundant.
                edges.append((token.lower_, child.lower_))
        graph = nx.Graph(edges)
        if plot:
            self.__plot_graph(graph)
        return graph

    @staticmethod
    def __plot_graph(graph):
        """Render the dependency graph with matplotlib (debug helper)."""
        pos = nx.spring_layout(graph)  # positions for all nodes
        nx.draw_networkx_nodes(graph, pos, node_size=200)  # nodes
        nx.draw_networkx_edges(graph, pos, width=1)  # edges
        nx.draw_networkx_labels(graph, pos, font_size=12,
                                font_family='sans-serif')  # labels
        plt.axis('off')  # disable axis plot
        plt.show()

    def __measure_sp_rel_similarity(self, shortest_path):
        """ Measures the cosine similarity between word embeddings
        :param shortest_path: dict of sp values
        :return: relation type with the highest score, or None below threshold
        """
        relation = None
        highest_score = 0
        highest_rel = None
        threshold = 0.6
        for rel in self.relationship_list:
            try:
                # get word embeddings representation of shortest path and relation
                score = self.embeddings_model.n_similarity(shortest_path, [rel])
                logger.debug(f'{rel} {score}')
                if score > highest_score:
                    highest_score = score
                    highest_rel = rel
            except KeyError as err:
                # out-of-vocabulary word: skip this candidate
                logger.debug(err)
        if highest_score > threshold:
            logger.debug(
                f'Highest score for {shortest_path} - {highest_rel}, Score: {highest_score}'
            )
            relation = self.relation_types.get_relation_type(highest_rel)
        return relation

    def extract_sp_relation(self, entities, per_entities, sentence, plot_graph=False):
        """Extract relation triples for the entities found in *sentence*.

        :param entities: all entity strings in the sentence
        :param per_entities: person entities, used for the KNOWS fallback
        :param sentence: the raw sentence
        :param plot_graph: when True, plot the dependency graph
        :return: list of (entity, relation, entity) triples
        """
        sp_dict = self.__search_shortest_dep_path(entities, sentence, plot_graph)
        extracted_relations = []
        for entity_pair, sp_words in sp_dict.items():
            e1 = entity_pair.split('-')[0]
            e2 = entity_pair.split('-')[1]
            if len(sp_words) > 0:
                most_likely_relation = self.__measure_sp_rel_similarity(sp_words)
                if most_likely_relation:
                    if e1 in self.me_list:
                        # normalize first-person references to the USER node
                        e1 = 'USER'
                        extracted_relation = e2, most_likely_relation, e1
                    else:
                        extracted_relation = e1, most_likely_relation, e2
                    extracted_relations.append(extracted_relation)
            elif len(per_entities) > 1:
                # no path words: fall back to a generic KNOWS relation
                extracted_relation = per_entities[0], 'KNOWS', per_entities[1]
                extracted_relations.append(extracted_relation)
        return extracted_relations
class PatternBasedRE:
    """Relation extractor based on POS-tag chunk grammars.

    POS-tags a sentence with spaCy, chunks it with an nltk ``RegexpParser``
    grammar, and scores candidate relation chunks against known relationship
    words via embedding similarity.
    """

    def __init__(self, nlp=None, grammar=None, relationship_list=None, me_list=None, embeddings_model=None):
        """Store the language-specific resources.

        :param nlp: spaCy pipeline used for POS tagging
        :param grammar: nltk chunk grammar describing relation phrases
        :param relationship_list: known relationship words for this language
        :param me_list: first-person tokens that identify the user
        :param embeddings_model: word-embedding model for similarity scoring
        """
        self.nlp = nlp
        self.grammar = grammar
        self.relationship_list = relationship_list
        self.me_list = me_list
        self.relation_types = RelationTypes()
        self.embeddings_model = embeddings_model

    @classmethod
    def de_lang(cls):
        """Alternate constructor preconfigured for German."""
        nlp = de_core_news_sm.load()
        embeddings_model = FlairEmbeddingModels().de_lang()
        # PP: e.g. 'I habe einen Sohn', 'I habe einen kleinen Bruder'
        # NP: e.g. 'Meine kleine Schwester'
        grammar = r"""
            PP: {<PRON><AUX><DET><ADJ>?<NOUN>}
            NP: {<DET><ADJ>?<NOUN>}
            REL: {<PP>|<NP>}"""
        relationship_list = [
            'vater', 'mutter', 'sohn', 'tochter', 'bruder', 'schwester',
            'enkel', 'enkelin', 'großvater', 'großmutter', 'ehemann',
            'ehefrau', 'onkel', 'tante', 'freund'
        ]
        me_list = ['ich', 'mein', 'meine']
        return cls(nlp, grammar, relationship_list, me_list, embeddings_model)

    @classmethod
    def en_lang(cls):
        """Alternate constructor preconfigured for English."""
        nlp = en_core_web_md.load()
        embeddings_model = FlairEmbeddingModels().en_lang()
        # PP: e.g. 'I have a son', 'I have a smaller brother', 'I have a 9 year old son'
        # NP: e.g. 'My (little) sister'
        grammar = r"""
            PP: {<PRON><VERB><NUM>?<DET>?<ADJ>?<NOUN>}
            NP: {<ADJ><ADJ>?<NOUN>}
            REL: {<PP>|<NP>}"""
        relationship_list = [
            'father', 'mother', 'sister', 'brother', 'son', 'daughter',
            'husband', 'wife', 'grandson', 'granddaughter', 'grandmother',
            'grandfather', 'uncle', 'aunt', 'friend'
        ]
        me_list = ['i', 'my', 'me']
        return cls(nlp, grammar, relationship_list, me_list, embeddings_model)

    def search_rel_type(self, sentence):
        """Return the first known relationship word in *sentence* (lower-cased),
        or None when the sentence contains none."""
        for token in word_tokenize(sentence):
            if token.lower() in self.relationship_list:
                return token.lower()
        return None

    def pos_tag_sentence(self, sentence):
        """POS-tag *sentence* after stripping non-word characters.

        :return: list of (text, pos) tuples
        """
        # Fix: raw string for the regex pattern; '\W' in a plain string is an
        # invalid escape sequence (DeprecationWarning, future SyntaxError).
        sentence = re.sub(r'\W+', ' ', sentence)
        doc = self.nlp(sentence)
        pos_tagged_sentence = []
        for token in doc:
            pos_tuple = (token.text, token.pos_)
            pos_tagged_sentence.append(pos_tuple)
        return pos_tagged_sentence

    def chunk_sentence(self, pos_tagged_sentence, draw=False):
        """Chunk a POS-tagged sentence with the configured grammar.

        :param draw: when True, open the nltk tree viewer (debug helper)
        :return: the resulting nltk chunk tree
        """
        cp = nltk.RegexpParser(self.grammar)
        chunk_tree = cp.parse(pos_tagged_sentence)
        if draw:
            chunk_tree.draw()
        return chunk_tree

    def __measure_relation_similarity(self, rel_tree_words):
        """ Measures the cosine similarity between word embeddings
        :param rel_tree_words: dict of sp values
        :return: relation type with the highest score, or None below threshold
        """
        relation = None
        highest_score = 0
        highest_rel = None
        threshold = 0.6
        for rel in self.relationship_list:
            try:
                # get word embeddings representation of extracted relation and relation
                score = self.embeddings_model.n_similarity(rel_tree_words, [rel])
                logger.debug(f'{rel} {score}')
                if score > highest_score:
                    highest_score = score
                    highest_rel = rel
            except KeyError as err:
                # out-of-vocabulary word: skip this candidate
                logger.debug(err)
        if highest_score > threshold:
            logger.debug(
                f'Highest score for {rel_tree_words} - {highest_rel}, Score: {highest_score}'
            )
            relation = self.relation_types.get_relation_type(highest_rel)
        return relation

    def extract_rel(self, sentence, plot_tree=False):
        """Extract relations from *sentence* via grammar chunks.

        :param plot_tree: when True, draw the chunk tree
        :return: list of relation tuples; (person, relation, 'USER') when a
                 proper noun closes the chunk, else (relation, 'USER')
        """
        extracted_relations = []
        # build chunks
        chunk_tree = self.chunk_sentence(self.pos_tag_sentence(sentence),
                                         draw=plot_tree)
        # Fix: the enumerate index was unused; iterate directly.
        for sub_tree in chunk_tree:
            # Fix: isinstance instead of `type(...) is ...`.
            if isinstance(sub_tree, nltk.tree.Tree) and sub_tree.label() == 'REL':
                # first word of the chunk, used to check for first-person phrasing
                me = sub_tree[0][0][0].lower()
                rel_tree_words = []
                for word in sub_tree[0]:
                    if word[0] not in self.me_list:
                        rel_tree_words.append(word[0])
                if me in self.me_list and rel_tree_words:
                    # NOTE(review): relation_type may be None when similarity
                    # stays below threshold; the tuple is appended regardless,
                    # matching the original behavior — confirm downstream.
                    relation_type = self.__measure_relation_similarity(
                        rel_tree_words)
                    if sub_tree[0][-1][1] == 'PROPN':
                        # chunk ends in a proper noun: treat it as the person
                        rel_person = sub_tree[0][-1][0]
                        extracted_relation = rel_person, relation_type, 'USER'
                        extracted_relations.append(extracted_relation)
                    else:
                        extracted_relation = relation_type, 'USER'
                        extracted_relations.append(extracted_relation)
        return extracted_relations