示例#1
0
    def query(self,
              topic,
              max_depth=3,
              dont_follow=['enrichment', 'classification'],
              config=None):
        """Return the stored context (subgraph) surrounding the topic nodes.

        Each topic node is looked up in Neo4j by its 'class' label and its
        'key'/'value' properties.  The matches seed the traversal query built
        by ``self.build_query(max_depth)``; every node and relationship that
        query returns is copied into the result graph.

        :param topic: a graph whose nodes carry 'class', 'key' and 'value'
            attributes.  At least one of them must exist in the database for
            any context to be returned.
        :param max_depth: The maximum distance from the topic to search
        :param dont_follow: A list of attribute types to not follow (passed to
            the traversal query as DONT_FOLLOW).  The default list is shared
            between calls; it is only read here, never modified.
        :param config: The neo4j configuration to use if not using the one
            configured with the plugin
        :return: subgraph as a networkx MultiDiGraph.  Nodes are keyed by a
            "class=..&key=..&value=.." URI and annotated with 'topic_distance'.
        """
        if config is None:
            config = self.neo4j_config

        neo_graph = py2neoGraph(config)
        sg = nx.MultiDiGraph()

        # Get IDs of topic nodes in graph (if they exist).  Also add topics to subgraph
        topic_ids = set()
        for _, data in topic.nodes(data=True):
            # NOTE: a label cannot be a Cypher parameter, so the class is
            # interpolated into the query text; key and value are bound.
            cypher = ("MATCH (topic: {0} {1}) "
                      "RETURN collect(topic) as topics").format(
                          data['class'], "{key:{KEY}, value:{VALUE}}")
            props = {"KEY": data['key'], "VALUE": data['value']}
            for record in neo_graph.cypher.execute(cypher, props):
                for tnode in record.topics:
                    node_attr = dict(tnode.properties)
                    uri = u'class={0}&key={1}&value={2}'.format(
                        node_attr['class'], node_attr['key'],
                        node_attr['value'])
                    # NOTE(review): positional attribute dict is the networkx
                    # 1.x add_node form -- confirm the pinned version.
                    sg.add_node(uri, node_attr)
                    topic_ids.add(int(tnode.ref.split("/")[-1]))

        # Expand outwards from the topic nodes.
        # Maps the neo4j node ID (as a string) to the URI used in the subgraph.
        nodes = dict()
        if max_depth > 0 and len(
                topic_ids) > 0:  # no depth or no topicID means no subgraph
            cypher = self.build_query(max_depth)
            params = {"TOPICS": list(topic_ids), "DONT_FOLLOW": dont_follow}
            # TODO: prefer neo_graph.cypher.stream() to execute(), if it works
            for record in neo_graph.cypher.execute(cypher, params):
                for node in record.nodes:
                    node_attr = dict(node.properties)
                    uri = 'class={0}&key={1}&value={2}'.format(
                        node_attr['class'], node_attr['key'],
                        node_attr['value'])
                    sg.add_node(uri, node_attr)
                    nodes[node.ref.split("/")[-1]] = uri

                for rel in record.rels:
                    # Resolve both endpoints through the ID map filled above.
                    src_uri = nodes[rel.start_node.ref.split("/")[-1]]
                    dst_uri = nodes[rel.end_node.ref.split("/")[-1]]

                    # add edge
                    edge_attr = dict(rel.properties)
                    edge_attr['relationship'] = rel.type
                    source_hash = uuid.uuid3(uuid.NAMESPACE_URL, src_uri)
                    dest_hash = uuid.uuid3(uuid.NAMESPACE_URL, dst_uri)
                    edge_uri = u"source={0}&destionation={1}".format(
                        str(source_hash), str(dest_hash))

                    # Follow the chain relationship -> <type> -> <subtype> ...
                    # through the edge properties.  Properties that refer to
                    # each other in a cycle would otherwise loop forever, so
                    # each link is followed at most once.
                    rel_chain = u"relationship"
                    followed = set()
                    while rel_chain in edge_attr and rel_chain not in followed:
                        followed.add(rel_chain)
                        edge_uri += u"&{0}={1}".format(rel_chain,
                                                       edge_attr[rel_chain])
                        rel_chain = edge_attr[rel_chain]
                    if "origin" in edge_attr:
                        edge_uri += u"&{0}={1}".format(u"origin",
                                                       edge_attr["origin"])
                    edge_attr["uri"] = edge_uri
                    sg.add_edge(src_uri, dst_uri, edge_uri, edge_attr)

        # Set the topic distances
        distances = self.get_topic_distance(sg.to_undirected(), topic)
        # NOTE(review): (graph, name, values) is the networkx 1.x argument order.
        nx.set_node_attributes(sg, u'topic_distance', distances)

        # TODO:  Handle duplicate edges (may dedup but leave in for now)
        #          Take edges into dataframe
        #          group by combine on features to be deduplicated.  Return edge_id's in each group.  Combine those edge_ids using a combine algorithm
        #          Could do the same with dedup algo, but just return the dedup edge_ids and delete them from the graph

        return sg
示例#2
0
    def enrich(self, g):  # Neo4j
        """Merge a networkx graph into the configured Neo4j database.

        Nodes are MERGEd on their 'class' label and 'key'/'value' properties in
        one transaction; edges are then CREATEd between the returned node IDs
        in a second transaction.  The input graph is not modified.

        :param g: networkx graph to be merged.  Nodes must carry 'class', 'key'
            and 'value' attributes and be keyed by the matching
            "class=..&key=..&value=.." URI.  An edge's 'relationship' attribute
            becomes the relationship type ('describedBy' when absent).
        :return: Nonetype

        Note: Neo4j operates differently from the current titan import.  The neo4j import does not aggregate edges which
               means they must be handled at query time.  The current titan algorithm aggregates edges based on time on
               merge.
        """
        neo_graph = py2neoGraph(self.neo4j_config)
        node_map = dict()  # node URI -> neo4j node ID

        # Merge all nodes first
        # NOTE: labels and relationship types cannot be Cypher parameters, so
        # they are interpolated into the query text below.
        tx = neo_graph.cypher.begin()
        cypher = ("MERGE (node: {0} {1}) "
                  "ON CREATE SET node = {2} "
                  "RETURN collect(node) as nodes")
        # create transaction for all nodes
        for _, data in g.nodes(data=True):
            query = cypher.format(data['class'], "{key:{KEY}, value:{VALUE}}",
                                  "{MAP}")
            props = {"KEY": data['key'], "VALUE": data['value'], "MAP": data}
            # TODO: set "start_time" and "finish_time" to dummy variables in attr.
            # TODO:  Add nodes to graph, and cyper/gremlin query to compare to node start_time & end_time to dummy
            # TODO:  variable update if node start > dummy start & node finish < dummy finish, and delete dummy
            # TODO:  variables.
            tx.append(query, props)

        # commit transaction and create mapping of returned nodes to URIs for edge creation
        for record_list in tx.commit():
            for record in record_list:
                for n in record.nodes:
                    attr = n.properties
                    uri = "class={0}&key={1}&value={2}".format(
                        attr['class'], attr['key'], attr['value'])
                    # The node ID is the last path segment of the ref.
                    node_map[uri] = int(n.ref.split("/")[-1])

        # Create edges
        cypher = ("MATCH (src: {0}), (dst: {1}) "
                  "WHERE id(src) = {2} AND id(dst) = {3} "
                  "CREATE (src)-[rel: {4} {5}]->(dst) ")
        tx = neo_graph.cypher.begin()
        for edge in g.edges(data=True):
            src, dst, data = edge
            try:
                # Pop from a copy: popping from the graph's own dict would
                # strip 'relationship' from the caller's graph, so a repeated
                # enrich would store every edge as 'describedBy'.
                edge_attr = dict(data)
                relationship = edge_attr.pop('relationship', 'describedBy')

                query = cypher.format(g.node[src]['class'],
                                      g.node[dst]['class'], "{SRC_ID}",
                                      "{DST_ID}", relationship, "{MAP}")
                props = {
                    "SRC_ID": node_map[src],
                    "DST_ID": node_map[dst],
                    "MAP": edge_attr
                }

                # create the edge
                # NOTE: No attempt is made to deduplicate edges between the graph to be merged and the destination graph.
                #        The query scripts should handle this.
                tx.append(query, props)
            except Exception:
                # Dump the offending edge and the ID map for debugging, then
                # let the error propagate.
                print(edge)
                print(node_map)
                raise

        # create edges all at once
        tx.commit()
示例#3
0
    def enrich(self, g):  # Neo4j
        """Merge a networkx graph into the configured Neo4j database.

        Two transactions are used: the first MERGEs every node on its 'class'
        label and 'key'/'value' properties and records the resulting node IDs;
        the second CREATEs one relationship per edge between those IDs.  The
        input graph is left untouched.

        :param g: networkx graph to be merged.  Nodes must carry 'class', 'key'
            and 'value' attributes and be keyed by the matching
            "class=..&key=..&value=.." URI.  An edge's 'relationship' attribute
            becomes the relationship type ('describedBy' when absent).
        :return: Nonetype

        Note: Neo4j operates differently from the current titan import.  The neo4j import does not aggregate edges which
               means they must be handled at query time.  The current titan algorithm aggregates edges based on time on
               merge.
        """
        neo_graph = py2neoGraph(self.neo4j_config)
        node_map = dict()  # node URI -> neo4j node ID

        # Merge all nodes first
        # NOTE: labels and relationship types cannot be Cypher parameters, so
        # they are interpolated into the query text below.
        tx = neo_graph.cypher.begin()
        cypher = ("MERGE (node: {0} {1}) "
                  "ON CREATE SET node = {2} "
                  "RETURN collect(node) as nodes"
                 )
        # create transaction for all nodes
        for _, data in g.nodes(data=True):
            query = cypher.format(data['class'], "{key:{KEY}, value:{VALUE}}", "{MAP}")
            props = {"KEY": data['key'], "VALUE": data['value'], "MAP": data}
            # TODO: set "start_time" and "finish_time" to dummy variables in attr.
            # TODO:  Add nodes to graph, and cyper/gremlin query to compare to node start_time & end_time to dummy
            # TODO:  variable update if node start > dummy start & node finish < dummy finish, and delete dummy
            # TODO:  variables.
            tx.append(query, props)

        # commit transaction and create mapping of returned nodes to URIs for edge creation
        for record_list in tx.commit():
            for record in record_list:
                for n in record.nodes:
                    attr = n.properties
                    uri = "class={0}&key={1}&value={2}".format(attr['class'], attr['key'], attr['value'])
                    # The node ID is the last path segment of the ref.
                    node_map[uri] = int(n.ref.split("/")[-1])

        # Create edges
        cypher = ("MATCH (src: {0}), (dst: {1}) "
                  "WHERE id(src) = {2} AND id(dst) = {3} "
                  "CREATE (src)-[rel: {4} {5}]->(dst) "
                 )
        tx = neo_graph.cypher.begin()
        for edge in g.edges(data=True):
            src, dst, data = edge
            try:
                # Pop from a copy: popping from the graph's own dict would
                # strip 'relationship' from the caller's graph, so a repeated
                # enrich would store every edge as 'describedBy'.
                edge_attr = dict(data)
                relationship = edge_attr.pop('relationship', 'describedBy')

                query = cypher.format(g.node[src]['class'],
                                      g.node[dst]['class'],
                                      "{SRC_ID}",
                                      "{DST_ID}",
                                      relationship,
                                      "{MAP}"
                                     )
                props = {
                    "SRC_ID": node_map[src],
                    "DST_ID": node_map[dst],
                    "MAP": edge_attr
                }

                # create the edge
                # NOTE: No attempt is made to deduplicate edges between the graph to be merged and the destination graph.
                #        The query scripts should handle this.
                tx.append(query, props)
            except Exception:
                # Dump the offending edge and the ID map for debugging, then
                # let the error propagate.
                print(edge)
                print(node_map)
                raise

        # create edges all at once
        tx.commit()
示例#4
0
    def query(self, topic, max_depth=3, dont_follow=['enrichment', 'classification'], config=None):
        """Return the stored context (subgraph) surrounding the topic nodes.

        Each topic node is looked up in Neo4j by its 'class' label and its
        'key'/'value' properties.  The matches seed the traversal query built
        by ``self.build_query(max_depth)``; every node and relationship that
        query returns is copied into the result graph.

        :param topic: a graph whose nodes carry 'class', 'key' and 'value'
            attributes.  At least one of them must exist in the database for
            any context to be returned.
        :param max_depth: The maximum distance from the topic to search
        :param dont_follow: A list of attribute types to not follow (passed to
            the traversal query as DONT_FOLLOW).  The default list is shared
            between calls; it is only read here, never modified.
        :param config: The neo4j configuration to use if not using the one
            configured with the plugin
        :return: subgraph as a networkx MultiDiGraph.  Nodes are keyed by a
            "class=..&key=..&value=.." URI and annotated with 'topic_distance'.
        """
        if config is None:
            config = self.neo4j_config

        neo_graph = py2neoGraph(config)
        sg = nx.MultiDiGraph()

        # Get IDs of topic nodes in graph (if they exist).  Also add topics to subgraph
        topic_ids = set()
        for _, data in topic.nodes(data=True):
            # NOTE: a label cannot be a Cypher parameter, so the class is
            # interpolated into the query text; key and value are bound.
            cypher = ("MATCH (topic: {0} {1}) "
                      "RETURN collect(topic) as topics").format(data['class'], "{key:{KEY}, value:{VALUE}}")
            props = {"KEY": data['key'], "VALUE": data['value']}
            for record in neo_graph.cypher.execute(cypher, props):
                for tnode in record.topics:
                    node_attr = dict(tnode.properties)
                    uri = u'class={0}&key={1}&value={2}'.format(node_attr['class'], node_attr['key'], node_attr['value'])
                    # NOTE(review): positional attribute dict is the networkx
                    # 1.x add_node form -- confirm the pinned version.
                    sg.add_node(uri, node_attr)
                    topic_ids.add(int(tnode.ref.split("/")[-1]))

        # Expand outwards from the topic nodes.
        # Maps the neo4j node ID (as a string) to the URI used in the subgraph.
        nodes = dict()
        if max_depth > 0 and len(topic_ids) > 0:  # no depth or no topicID means no subgraph
            cypher = self.build_query(max_depth)
            params = {"TOPICS": list(topic_ids),
                      "DONT_FOLLOW": dont_follow}
            # TODO: prefer neo_graph.cypher.stream() to execute(), if it works
            for record in neo_graph.cypher.execute(cypher, params):
                for node in record.nodes:
                    node_attr = dict(node.properties)
                    uri = 'class={0}&key={1}&value={2}'.format(node_attr['class'], node_attr['key'], node_attr['value'])
                    sg.add_node(uri, node_attr)
                    nodes[node.ref.split("/")[-1]] = uri

                for rel in record.rels:
                    # Resolve both endpoints through the ID map filled above.
                    src_uri = nodes[rel.start_node.ref.split("/")[-1]]  # src node uri from neo4j ID
                    dst_uri = nodes[rel.end_node.ref.split("/")[-1]]  # dst node uri from neo4j ID

                    # add edge
                    edge_attr = dict(rel.properties)
                    edge_attr['relationship'] = rel.type
                    source_hash = uuid.uuid3(uuid.NAMESPACE_URL, src_uri)
                    dest_hash = uuid.uuid3(uuid.NAMESPACE_URL, dst_uri)
                    edge_uri = u"source={0}&destionation={1}".format(str(source_hash), str(dest_hash))

                    # Follow the chain relationship -> <type> -> <subtype> ...
                    # through the edge properties.  Properties that refer to
                    # each other in a cycle would otherwise loop forever, so
                    # each link is followed at most once.
                    rel_chain = u"relationship"
                    followed = set()
                    while rel_chain in edge_attr and rel_chain not in followed:
                        followed.add(rel_chain)
                        edge_uri += u"&{0}={1}".format(rel_chain, edge_attr[rel_chain])
                        rel_chain = edge_attr[rel_chain]
                    if "origin" in edge_attr:
                        edge_uri += u"&{0}={1}".format(u"origin", edge_attr["origin"])
                    edge_attr["uri"] = edge_uri
                    sg.add_edge(src_uri, dst_uri, edge_uri, edge_attr)

        # Set the topic distances
        distances = self.get_topic_distance(sg.to_undirected(), topic)
        # NOTE(review): (graph, name, values) is the networkx 1.x argument order.
        nx.set_node_attributes(sg, u'topic_distance', distances)

        # TODO:  Handle duplicate edges (may dedup but leave in for now)
        #          Take edges into dataframe
        #          group by combine on features to be deduplicated.  Return edge_id's in each group.  Combine those edge_ids using a combine algorithm
        #          Could do the same with dedup algo, but just return the dedup edge_ids and delete them from the graph

        return sg
示例#5
0
    def minion(self, *args, **xargs):
        """Random-walk the graph, consolidating duplicate outgoing edges.

        Starting from a random node that has a ``describedBy`` relationship,
        the outgoing edges of the current node are grouped by edge URI.  In
        each group one edge is kept and the others are deleted; the kept edge's
        'start_time' is moved back to the earliest start time found in its
        group.  The walk then either follows one of the node's outgoing edges
        or, with probability ``self.jump`` (or when the node has no outgoing
        edges), jumps to a new random node.  It sleeps ``self.sleep_time``
        between nodes and runs until ``self.shutdown`` is set.

        :param args: unused
        :param xargs: unused
        :return: None
        """
        self.shutdown = False
        time_format = "%Y-%m-%dT%H:%M:%SZ"

        def node_uri(n):
            # The stored URI of a node, or one rebuilt from its properties.
            # NOTE(review): the fallback reads 'attribute' while the log lines
            # below read 'class' -- confirm which property the nodes carry.
            if 'uri' in n.properties:
                return n.properties['uri']
            return "class={0}&key={1}&value={2}".format(
                n.properties['attribute'], n.properties['key'],
                n.properties['value'])

        # Get graph
        neo_graph = py2neoGraph(self.neo4j_config)

        random_cypher = ''' MATCH (a)-[:describedBy]->() 
                            RETURN a, rand() as r
                            ORDER BY r
                            LIMIT 1
                        '''

        # pick a random node
        records = neo_graph.cypher.execute(random_cypher)
        node = records[0][0]

        logging.info(
            "first node to consolidate edges for is class: {0}, key: {1}, value: {2}"
            .format(node.properties['class'], node.properties['key'],
                    node.properties['value']))
        print("first node to consolidate edges for is class: {0}, key: {1}, value: {2}".format(
            node.properties['class'], node.properties['key'],
            node.properties['value']))  # DEBUG

        while not self.shutdown:
            edges = defaultdict(set)
            destinations = set()

            # get edges starting with the node
            for rel in node.match_outgoing():
                if 'uri' in rel.properties:
                    edge_uri = rel.properties['uri']
                else:
                    # Remove non-ascii as it gumms up uuid.
                    # NOTE: This shouldn't effect anything as it's just for the key in the edges dictionary
                    source_uri = self.Verum.removeNonAscii(
                        node_uri(rel.start_node))
                    dest_uri = self.Verum.removeNonAscii(
                        node_uri(rel.end_node))

                    # Edge URI
                    source_hash = uuid.uuid3(uuid.NAMESPACE_URL, source_uri)
                    dest_hash = uuid.uuid3(uuid.NAMESPACE_URL, dest_uri)

                    edge_uri = "source={0}&destionation={1}".format(
                        str(source_hash), str(dest_hash))
                    rel_chain = "relationship"
                    while rel_chain in rel.properties:
                        edge_uri = edge_uri + "&{0}={1}".format(
                            rel_chain, rel.properties[rel_chain])
                        rel_chain = rel.properties[rel_chain]
                    if "origin" in rel.properties:
                        edge_uri += "&{0}={1}".format("origin",
                                                      rel.properties["origin"])

                # aggregate edges by dst, and uri
                # WARNING: The use of URI here is vulnerable to values being out of order in the URI and edges not being removed.
                edges[edge_uri].add(rel)

                # collect destinations to pick next node
                destinations.add(rel.end_node)

            source_uri = node_uri(node)

            for group in edges.values():
                edge_list = list(group)
                kept, duplicates = edge_list[0], edge_list[1:]
                dest_uri = node_uri(kept.end_node)

                # Earliest start time seen in THIS group.  It is reset for
                # every group so one group's time cannot leak into the next.
                earliest = datetime.utcnow()

                logging.debug(
                    "Removing {0} edges from node {1} to {2}.".format(
                        len(duplicates), source_uri, dest_uri))

                for edge in duplicates:
                    # keep earliest time as start
                    try:
                        edge_time = datetime.strptime(
                            edge.properties['start_time'], time_format)
                    except (KeyError, TypeError, ValueError):
                        # start_time is missing or wasn't legit: ignore it
                        edge_time = None
                    if edge_time is not None and edge_time < earliest:
                        earliest = edge_time

                    try:  # sometimes the edge is no longer there.  Better to pass than fail.
                        #  remove all but one edge of each group
                        edge.delete()
                    except Exception:
                        logging.debug(
                            "Could not delete duplicate edge; it may already be gone.",
                            exc_info=True)

                # Update time on remaining edge
                try:
                    kept_time = datetime.strptime(
                        kept.properties['start_time'], time_format)
                except Exception:
                    # missing or unparseable start_time counts as "now"
                    kept_time = datetime.utcnow()
                if 'start_time' not in kept.properties or earliest < kept_time:
                    kept.properties['start_time'] = earliest.strftime(
                        time_format)
                    kept.push()

                logging.debug(
                    "Keeping edge {0} from node {1} to node {2}.".format(
                        kept.uri, source_uri, dest_uri))

            #  Sleep to slow it down
            sleep(self.sleep_time)

            jump = random.random()

            # do the random walk
            if len(destinations) == 0 or jump <= self.jump:
                # pick a random node
                records = neo_graph.cypher.execute(random_cypher)
                node = records[0][0]
                logging.debug("Edge consolidation random walk jumped.")
            else:
                # random.choice needs an indexable sequence; a set is not one
                node = random.choice(list(destinations))
                logging.debug("Edge consolidation random walk didn't jumped.")

            logging.info(
                "Next node to consolidate edges for is class: {0}, key: {1}, value: {2}"
                .format(node.properties['class'], node.properties['key'],
                        node.properties['value']))