Example #1
def __init__(self,
             me_list=None,
             embeddings_model=None,
             nlp=None,
             relationship_list=None):
    self.me_list = me_list
    self.nlp = nlp
    self.embeddings_model = embeddings_model
    self.relationship_list = relationship_list
    self.relation_types = RelationTypes()
Example #2
# Rasa NLU custom component base class; the exact import path depends on the
# Rasa version in use (older projects import it from rasa_nlu.components).
# RelationExtractor, AnalyticsEngine, RelationTypes and LANG are assumed to
# come from the surrounding project.
from rasa.nlu.components import Component


class SocialRelationExtractor(Component):
    """Custom NLU component that extracts social relations from an incoming
    message and attaches them as entities."""

    name = "relationextractor"
    provides = ["entities"]
    # requires = [""]
    defaults = {}
    language_list = ["de_core_news_sm"]

    def __init__(self, component_config=None):
        super(SocialRelationExtractor, self).__init__(component_config)
        self.re = RelationExtractor(LANG.DE)
        self.ae = AnalyticsEngine(LANG.DE)
        self.rt = RelationTypes()

    def process(self, message, **kwargs):
        print(f'Processing Message {message.text}')
        extracted_relations, response_message = self.ae.analyze_utterance(
            message.text, persist=True)
        print(f'Extracted relations: {extracted_relations}')

        if extracted_relations:
            if len(extracted_relations[0]) == 3:
                entity_value = extracted_relations[0][2]
            else:
                entity_value = self.rt.get_relation_from_relation_type_DE(
                    extracted_relations[0][1])

            entities = [{
                "value": entity_value,
                "confidence": 1,
                "entity": "relativename",
                "extractor": "relationextractor"
            }, {
                "value": True,
                "confidence": 1,
                "entity": "relationextracted",
                "extractor": "relationextractor"
            }]

            message.set("entities", entities, add_to_output=True)
Example #3
def __init__(self, component_config=None):
    super(SocialRelationExtractor, self).__init__(component_config)
    self.re = RelationExtractor(LANG.DE)
    self.ae = AnalyticsEngine(LANG.DE)
    self.rt = RelationTypes()
Example #4
def __init__(self, lang):
    self.ng = NetworkGraph()  # neo4j
    self.re = RelationExtractor(lang=lang)
    self.re_types = RelationTypes()
Example #5
# Third-party dependencies used below; FlairEmbeddingModels and RelationTypes
# are assumed to come from the surrounding project.
import logging

import de_core_news_sm
import en_core_web_md
import matplotlib.pyplot as plt
import networkx as nx
from networkx.exception import NetworkXNoPath, NodeNotFound

logger = logging.getLogger(__name__)


class ShortestPathRE:
    def __init__(self,
                 me_list=None,
                 embeddings_model=None,
                 nlp=None,
                 relationship_list=None):
        self.me_list = me_list
        self.nlp = nlp
        self.embeddings_model = embeddings_model
        self.relationship_list = relationship_list
        self.relation_types = RelationTypes()

    @classmethod
    def de_lang(cls):
        me_list = ['ich', 'mein', 'meine']
        embeddings_model = FlairEmbeddingModels().de_lang()
        nlp = de_core_news_sm.load()
        relationship_list = [
            'vater', 'mutter', 'sohn', 'tochter', 'bruder', 'schwester',
            'enkel', 'enkelin', 'großvater', 'großmutter', 'ehemann',
            'ehefrau', 'onkel', 'tante', 'freund'
        ]

        return cls(me_list, embeddings_model, nlp, relationship_list)

    @classmethod
    def en_lang(cls):
        me_list = ['i', 'my']
        embeddings_model = FlairEmbeddingModels().en_lang()
        nlp = en_core_web_md.load()
        relationship_list = [
            'father', 'mother', 'sister', 'brother', 'son', 'daughter',
            'husband', 'wife', 'grandson', 'granddaughter', 'grandmother',
            'grandfather', 'uncle', 'aunt', 'friend'
        ]

        return cls(me_list, embeddings_model, nlp, relationship_list)

    def __search_shortest_dep_path(self, entities, sentence, plot_graph):
        path_dict = {}
        graph = self.__build_undirected_graph(sentence, plot_graph)

        for i, first_entity in enumerate(entities):
            # use only the first name of multi-word entities
            first_entity = first_entity.split('_')[0]

            # range(i + 1, ...) keeps the relations unidirectional;
            # iterate over the full range for bidirectional relations
            for j in range(i + 1, len(entities)):
                # use only the first name of multi-word entities
                second_entity = entities[j].split('_')[0]

                if first_entity != second_entity and second_entity not in self.me_list:
                    try:
                        shortest_path = nx.shortest_path(graph,
                                                         source=first_entity,
                                                         target=second_entity)
                        key = first_entity + '-' + second_entity
                        # exclude the entity tokens themselves from the path
                        path_dict[key] = shortest_path[1:-1]
                    except NodeNotFound as err:
                        logger.warning(f'Node not found: {err}')
                    except NetworkXNoPath as err:
                        logger.warning(f'Path not found: {err}')

        return path_dict

    def __build_undirected_graph(self, sentence, plot=False):
        doc = self.nlp(sentence)
        edges = []
        for token in doc:
            for child in token.children:
                # TODO indicate the direction of the relationship,
                #  e.g. with the help of the child token
                source = token.lower_
                sink = child.lower_

                edges.append((source, sink))

        graph = nx.Graph(edges)

        if plot:
            self.__plot_graph(graph)

        return graph

    @staticmethod
    def __plot_graph(graph):
        pos = nx.spring_layout(graph)  # positions for all nodes
        nx.draw_networkx_nodes(graph, pos, node_size=200)  # nodes
        nx.draw_networkx_edges(graph, pos, width=1)  # edges
        nx.draw_networkx_labels(graph,
                                pos,
                                font_size=12,
                                font_family='sans-serif')  # labels

        plt.axis('off')  # disable axis plot
        plt.show()

    def __measure_sp_rel_similarity(self, shortest_path):
        """
        Measures the cosine similarity between the word embeddings of the
        shortest-path words and each known relationship word.
        :param shortest_path: list of words on the shortest dependency path
        :return: relation type with the highest score, or None if no score
            exceeds the threshold
        """
        relation = None
        highest_score = 0
        highest_rel = None
        threshold = 0.6

        for rel in self.relationship_list:
            try:
                # get word embeddings representation of shortest path and relation
                score = self.embeddings_model.n_similarity(
                    shortest_path, [rel])
                logger.debug(f'{rel} {score}')
                if score > highest_score:
                    highest_score = score
                    highest_rel = rel
            except KeyError as err:
                logger.debug(err)

        if highest_score > threshold:
            logger.debug(
                f'Highest score for {shortest_path} - {highest_rel}, Score: {highest_score}'
            )
            relation = self.relation_types.get_relation_type(highest_rel)

        return relation

    def extract_sp_relation(self,
                            entities,
                            per_entities,
                            sentence,
                            plot_graph=False):
        sp_dict = self.__search_shortest_dep_path(entities, sentence,
                                                  plot_graph)
        extracted_relations = []

        for entity_pair, sp_words in sp_dict.items():
            e1 = entity_pair.split('-')[0]
            e2 = entity_pair.split('-')[1]

            if len(sp_words) > 0:

                most_likely_relation = self.__measure_sp_rel_similarity(
                    sp_words)
                if most_likely_relation:
                    if e1 in self.me_list:
                        e1 = 'USER'
                        extracted_relation = e2, most_likely_relation, e1
                    else:
                        extracted_relation = e1, most_likely_relation, e2
                    extracted_relations.append(extracted_relation)
                elif len(per_entities) > 1:
                    extracted_relation = per_entities[
                        0], 'KNOWS', per_entities[1]
                    extracted_relations.append(extracted_relation)

        return extracted_relations
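A minimal usage sketch for the German variant (not part of the original example; the sentence and entity lists are made up for illustration, and it assumes de_core_news_sm plus the project's FlairEmbeddingModels and RelationTypes are available):

sp_re = ShortestPathRE.de_lang()
relations = sp_re.extract_sp_relation(
    entities=['meine', 'anna'],   # tokens to connect in the dependency graph
    per_entities=['Anna'],        # PER entities from NER, used for the KNOWS fallback
    sentence='Anna ist meine Schwester.')
print(relations)  # might yield [('anna', <sister relation type>, 'USER')] if the threshold is met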
Example #6
# Standard and third-party dependencies used below; FlairEmbeddingModels and
# RelationTypes are assumed to come from the surrounding project.
import logging
import re

import de_core_news_sm
import en_core_web_md
import nltk
from nltk.tokenize import word_tokenize

logger = logging.getLogger(__name__)


class PatternBasedRE:
    def __init__(self,
                 nlp=None,
                 grammar=None,
                 relationship_list=None,
                 me_list=None,
                 embeddings_model=None):
        self.nlp = nlp
        self.grammar = grammar
        self.relationship_list = relationship_list
        self.me_list = me_list
        self.relation_types = RelationTypes()
        self.embeddings_model = embeddings_model

    @classmethod
    def de_lang(cls):
        nlp = de_core_news_sm.load()
        embeddings_model = FlairEmbeddingModels().de_lang()
        # PP: e.g. 'I habe einen Sohn', 'I habe einen kleinen Bruder'
        # NP: e.g. 'Meine kleine Schwester'
        grammar = r"""
                PP: {<PRON><AUX><DET><ADJ>?<NOUN>}
                NP: {<DET><ADJ>?<NOUN>}            
                REL: {<PP>|<NP>}"""
        relationship_list = [
            'vater', 'mutter', 'sohn', 'tochter', 'bruder', 'schwester',
            'enkel', 'enkelin', 'großvater', 'großmutter', 'ehemann',
            'ehefrau', 'onkel', 'tante', 'freund'
        ]
        me_list = ['ich', 'mein', 'meine']

        return cls(nlp, grammar, relationship_list, me_list, embeddings_model)

    @classmethod
    def en_lang(cls):
        nlp = en_core_web_md.load()
        embeddings_model = FlairEmbeddingModels().en_lang()
        # PP: e.g. 'I have a son', 'I have a smaller brother', 'I have a 9 year old son'
        # NP: e.g. 'My (little) sister'
        grammar = r"""
                    PP: {<PRON><VERB><NUM>?<DET>?<ADJ>?<NOUN>}
                    NP: {<ADJ><ADJ>?<NOUN>}            
                    REL: {<PP>|<NP>}"""
        relationship_list = [
            'father', 'mother', 'sister', 'brother', 'son', 'daughter',
            'husband', 'wife', 'grandson', 'granddaughter', 'grandmother',
            'grandfather', 'uncle', 'aunt', 'friend'
        ]
        me_list = ['i', 'my', 'me']

        return cls(nlp, grammar, relationship_list, me_list, embeddings_model)

    def search_rel_type(self, sentence):
        for token in word_tokenize(sentence):
            if token.lower() in self.relationship_list:
                return token.lower()

        return None

    def pos_tag_sentence(self, sentence):
        # strip non-word characters; the raw string avoids an invalid escape warning
        sentence = re.sub(r'\W+', ' ', sentence)
        doc = self.nlp(sentence)

        pos_tagged_sentence = []
        for token in doc:
            pos_tuple = (token.text, token.pos_)
            pos_tagged_sentence.append(pos_tuple)

        return pos_tagged_sentence

    def chunk_sentence(self, pos_tagged_sentence, draw=False):
        cp = nltk.RegexpParser(self.grammar)
        chunk_tree = cp.parse(pos_tagged_sentence)

        if draw:
            chunk_tree.draw()

        return chunk_tree

    def __measure_relation_similarity(self, rel_tree_words):
        """
        Measures the cosine similarity between the word embeddings of the
        extracted relation words and each known relationship word.
        :param rel_tree_words: list of words from the matched REL chunk
        :return: relation type with the highest score, or None if no score
            exceeds the threshold
        """
        relation = None
        highest_score = 0
        highest_rel = None
        threshold = 0.6

        for rel in self.relationship_list:
            try:
                # get word embeddings representation of extracted relation and relation
                score = self.embeddings_model.n_similarity(
                    rel_tree_words, [rel])
                logger.debug(f'{rel} {score}')
                if score > highest_score:
                    highest_score = score
                    highest_rel = rel
            except KeyError as err:
                logger.debug(err)

        if highest_score > threshold:
            logger.debug(
                f'Highest score for {rel_tree_words} - {highest_rel}, Score: {highest_score}'
            )
            relation = self.relation_types.get_relation_type(highest_rel)

        return relation

    def extract_rel(self, sentence, plot_tree=False):
        extracted_relations = []

        # build chunks
        chunk_tree = self.chunk_sentence(self.pos_tag_sentence(sentence),
                                         draw=plot_tree)

        for sub_tree in chunk_tree:
            if isinstance(sub_tree, nltk.tree.Tree) and sub_tree.label() == 'REL':
                # first word of the matched chunk, e.g. 'I' in 'I have a son'
                me = sub_tree[0][0][0].lower()
                rel_tree_words = []
                for word in sub_tree[0]:
                    # lower-case the token so 'I'/'Ich' etc. are filtered out too
                    if word[0].lower() not in self.me_list:
                        rel_tree_words.append(word[0])

                if me in self.me_list and rel_tree_words:
                    relation_type = self.__measure_relation_similarity(
                        rel_tree_words)

                    if sub_tree[0][-1][1] == 'PROPN':
                        rel_person = sub_tree[0][-1][0]
                        extracted_relation = rel_person, relation_type, 'USER'
                        extracted_relations.append(extracted_relation)
                    else:
                        extracted_relation = relation_type, 'USER'
                        extracted_relations.append(extracted_relation)

        return extracted_relations
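A minimal usage sketch for the English variant (not part of the original example; the sentence is made up and it assumes en_core_web_md plus the project's FlairEmbeddingModels and RelationTypes are available):

pb_re = PatternBasedRE.en_lang()
relations = pb_re.extract_rel('I have a little sister', plot_tree=False)
print(relations)  # might yield [(<sister relation type>, 'USER')] if the threshold is met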