Exemplo n.º 1
0
class KnowledgeGraph(object):
    """Build a nested-dict entity graph from annotated text lines.

    Entities are written in the text as ``[name(class[id])]`` and free
    references as ``[text]``.  The resulting graph maps
    ``class -> id -> metadata`` where metadata holds the entity name, the
    attributes collected from references, the related entity ids grouped by
    class, and the first index of the name in the accumulated context.
    """

    def __init__(self):
        self._preprocess = Preprocessor()

    def _update_line(self, line, graph, context_so_far):
        """Merge the entities and references found in ``line`` into ``graph``.

        Args:
            line: Raw annotated line.
            graph: Dict mutated in place (``class -> id -> metadata``).
            context_so_far: Context text used to look up the positions of
                entity names (``find``) and attributes (``rfind``); a
                position of -1 means the text was not found.
        """
        ent_pattern = r"\[[a-zA-Z0-9' ]+\([a-zA-Z]+\[\d+\]\)\]"
        ref_pattern = r"\[[a-zA-Z0-9' ]+\]"
        words = r"[a-zA-Z0-9' ]+"
        entities = re.findall(ent_pattern, line)

        # Strip the entity annotations so that only plain ``[text]``
        # references are left for the reference pattern to match.
        remainder = line
        for entity in entities:
            remainder = remainder.replace(entity, '')
        refs = re.findall(ref_pattern, remainder)

        # Both lists are the same for every entity on the line, so they are
        # computed once.  Each entity splits into (name, class, id).
        parsed = [re.findall(words, entity) for entity in entities]
        line_attributes = [
            attr for ref in refs for attr in re.findall(words, ref)
        ]

        for index, (name, class_type, identifier) in enumerate(parsed):
            # All known objects of this entity class.
            graph_entity = graph.setdefault(class_type, {})

            # Create the metadata record the first time the id is seen.
            metadata = graph_entity.get(identifier)
            if metadata is None:
                metadata = {
                    'name': name,
                    'attributes': [],
                    'relation': {},
                    'start_index': context_so_far.find(name)
                }
                graph_entity[identifier] = metadata

            # Every other entity occurrence on the line becomes a relation.
            for pos, (_, ent_class_type, ent_id) in enumerate(parsed):
                if pos == index:
                    continue
                related = metadata['relation'].setdefault(ent_class_type, [])
                if ent_id not in related:
                    related.append(ent_id)

            # Attributes are stored as (text, index) tuples, so duplicates
            # have to be detected on the text, not on the raw list members.
            known = {attr for attr, _ in metadata['attributes']}
            for attr in line_attributes:
                if attr in known:
                    continue
                known.add(attr)
                metadata['attributes'].append(
                    (attr, context_so_far.rfind(attr)))

    def prepare(self, path):
        """Read the file at ``path`` and build the graph from its lines.

        Lines containing ``"question_"`` are skipped.  Every other line is
        cleaned with the preprocessor, terminated with ``"."`` and appended
        (space separated) to the running context before being parsed.

        Returns:
            The graph dict, with the full context stored under ``CONTEXT``.
        """
        graph = {}
        context_so_far = ""
        with open(path) as f:
            for line in f:
                if "question_" in line:
                    continue

                separator = " " if context_so_far else ""
                context_so_far += (
                    separator + self._preprocess._line_cleanup(line) + ".")
                self._update_line(line, graph, context_so_far)

        graph[CONTEXT] = context_so_far
        return graph

    def prepare_edges(self, graph):
        """Build an adjacency matrix for the entities stored in ``graph``.

        Nodes are named ``"<class>-<id>"`` and indexed in sorted order.  The
        node index mapping is printed as JSON.

        Returns:
            A tuple ``(nodes, edges, sorted_nodes)`` where ``nodes`` maps node
            name to index, ``edges`` is a square list-of-lists holding 0 on
            the diagonal, 1 for related nodes and -1 otherwise, and
            ``sorted_nodes`` is the ordered list of node names.
        """
        node_names = []
        for key in graph:
            if key == CONTEXT:
                continue
            node_names.extend(key + "-" + elem for elem in graph[key])

        sorted_nodes = sorted(node_names)
        nodes = {name: pos for pos, name in enumerate(sorted_nodes)}
        m_len = len(nodes)

        edges = [[-1] * m_len for _ in range(m_len)]

        print(json.dumps(nodes))
        for entry, entry_index in nodes.items():
            class_type, node_id = entry.split("-")
            neighbors = graph[class_type][node_id]['relation']

            edges[entry_index][entry_index] = 0
            # ``items()`` works on both Python 2 and 3, unlike ``iteritems()``.
            for neighbor_class, neighbor_nodes in neighbors.items():
                for neighbor_node in neighbor_nodes:
                    neighbor = "%s-%s" % (neighbor_class, neighbor_node)
                    edges[entry_index][nodes[neighbor]] = 1
        return nodes, edges, sorted_nodes