def insert_attr(graph_conn, attr_val_dict, target_id, node_id,
                vertex_type):
    """Ensure vertex ``node_id`` exists, then ensure an edge from ``target_id``.

    Every traversal is issued through the name ``g`` resolved from the
    surrounding scope; the ``graph_conn`` argument is accepted but never
    read by this body.

    Args:
        graph_conn: Not used.
        attr_val_dict: Value passed to ``g.inject``. Each injected item is
            unfolded into key/value entries that are written as properties
            of the new vertex. Presumably a list wrapping one property
            dict -- confirm against callers.
        target_id: Id of the vertex the ``CATEGORY`` edge starts from.
        node_id: Id of the vertex to create (if missing) and link to.
        vertex_type: Label of the new vertex; also reused as a step label
            inside the insert traversal.
    """
    # NOTE(review): `id` is whatever the enclosing module binds to that
    # name; presumably Gremlin's T.id token rather than the Python builtin.
    already_there = g.V().has(id, node_id).hasNext()
    if already_there:
        logger.debug(
            f'Ignore inserting existing Vertex with id {node_id}')
    else:
        logger.info(f'Insert_Vertex: {node_id}.')
        # For each key/value entry of the injected map, set a
        # single-cardinality property on the vertex labelled 'v'.
        copy_entries = (
            __.select(vertex_type).unfold().as_('kv').select('v')
            .property(Cardinality.single,
                      __.select('kv').by(Column.keys),
                      __.select('kv').by(Column.values)))
        (g.inject(attr_val_dict).unfold().as_(vertex_type)
         .addV(vertex_type).as_('v').property(id, node_id)
         .sideEffect(copy_entries)
         .iterate())

    # Edge target_id --> node_id, identified by '<target_id>-<node_id>'.
    destination = g.V().has(id, node_id).next()
    edge_key = target_id + '-' + node_id
    if g.E().has(id, edge_key).hasNext():
        logger.debug(
            f'Ignore inserting existing edge with id {edge_key}')
    else:
        logger.info(f'Insert_Edge: {target_id} --> {node_id}.')
        (g.V().has(id, target_id)
         .addE('CATEGORY').to(destination)
         .property(id, edge_key)
         .iterate())
예제 #2
0
    def insert_new_transaction_vertex_and_edge(self,
                                               tr_dict,
                                               connectted_node_dict,
                                               target_id,
                                               vertex_type='Transaction'):
        """Insert a transaction vertex and link it to its connected nodes.

        Opens a remote connection through ``self.gremlin_utils``, adds the
        vertex ``target_id`` (labelled ``vertex_type``) unless it already
        exists, then, for every ``key: value`` item of
        ``connectted_node_dict[0]``, makes sure a vertex with id
        ``'<key>-<value>'`` and label ``key`` exists and that a ``CATEGORY``
        edge with id ``'<target_id>-<node_id>'`` runs from the transaction
        vertex to it. Newly created connected vertices receive the
        placeholder properties ``val1`` .. ``val390``, all ``0.0``.

        The connection is closed in a ``finally`` clause so it is released
        even when one of the traversals raises.

        Args:
            tr_dict: Value injected into the traversal that creates the
                transaction vertex; its key/value entries become vertex
                properties. Presumably a one-element list wrapping the
                property dict, like the list passed for connected nodes --
                confirm against callers.
            connectted_node_dict: Indexable whose first element is a mapping
                of node type to node value.
            target_id: Id of the transaction vertex.
            vertex_type: Label of the transaction vertex.

        Example::

            insert_new_transaction_vertex_and_edge(
                tr_dict, connectted_node_dict, target_id,
                vertex_type='Transaction')
        """
        # NOTE(review): `id` is whatever the module binds to that name;
        # presumably Gremlin's T.id token rather than the Python builtin.

        def add_vertex(graph, properties, vertex_id, label):
            """Add vertex ``vertex_id`` and copy ``properties`` onto it."""
            copy_entries = (
                __.select(label).unfold().as_('kv').select('v')
                .property(__.select('kv').by(Column.keys),
                          __.select('kv').by(Column.values)))
            (graph.inject(properties).unfold().as_(label)
             .addV(label).as_('v').property(id, vertex_id)
             .sideEffect(copy_entries)
             .iterate())

        def insert_attr(graph_conn, attr_val_dict, target_id, node_id,
                        vertex_type):
            """Create vertex ``node_id`` if missing, then the CATEGORY edge."""
            if not graph_conn.V().has(id, node_id).hasNext():
                logger.info(f'Insert_Vertex: {node_id}.')
                add_vertex(graph_conn, attr_val_dict, node_id, vertex_type)

            # Insert_edge
            to_node = graph_conn.V().has(id, node_id).next()
            edge_id = target_id + '-' + node_id
            if not graph_conn.E().has(id, edge_id).hasNext():
                logger.info(f'Insert_Edge: {target_id} --> {node_id}.')
                (graph_conn.V().has(id, target_id)
                 .addE('CATEGORY').to(to_node)
                 .property(id, edge_id)
                 .iterate())

        conn = self.gremlin_utils.remote_connection()
        try:
            g = self.gremlin_utils.traversal_source(connection=conn)

            if not g.V().has(id, target_id).hasNext():
                logger.info(f'Insert_Vertex: {target_id}.')
                add_vertex(g, tr_dict, target_id, vertex_type)

            # Placeholder attributes given to every newly created
            # connected vertex.
            empty_node_dict = {f'val{x}': 0.0 for x in range(1, 391)}

            for node_k, node_v in connectted_node_dict[0].items():
                node_id = node_k + '-' + str(node_v)
                insert_attr(g, [empty_node_dict],
                            target_id,
                            node_id,
                            vertex_type=node_k)
        finally:
            # Release the connection on success and on failure alike.
            conn.close()
예제 #3
0
def _user_search_query(graph: GraphTraversalSource,
                       tag_filter: str) -> List[Dict]:
    """Return one search document per user vertex, ordered by email.

    Only vertices labelled ``User.USER_NODE_LABEL`` that carry the
    ``User.USER_NODE_FULL_NAME`` property are considered. When ``tag_filter``
    is truthy, results are further restricted to vertices whose
    ``published_tag`` property equals it.

    Args:
        graph: Traversal source to query.
        tag_filter: Required ``published_tag`` value; falsy disables the
            filter.

    Returns:
        A list of dicts with the keys ``email``, ``first_name``,
        ``last_name``, ``full_name``, ``github_username``, ``team_name``,
        ``employee_type``, ``manager_email``, ``slack_id``, ``is_active``,
        ``role_name``, ``total_read``, ``total_own`` and ``total_follow``.
    """
    traversal = graph.V().hasLabel(User.USER_NODE_LABEL)
    traversal = traversal.has(User.USER_NODE_FULL_NAME)
    if tag_filter:
        # `has(key, value)` is the property-equality filter (and what
        # `_table_search_query` uses); `where` expects a predicate or a
        # traversal, not a plain value.
        traversal = traversal.has('published_tag', tag_filter)
    traversal = traversal.project('email', 'first_name', 'last_name',
                                  'full_name', 'github_username', 'team_name',
                                  'employee_type', 'manager_email', 'slack_id',
                                  'is_active', 'role_name', 'total_read',
                                  'total_own', 'total_follow')
    traversal = traversal.by('email')  # email
    traversal = traversal.by('first_name')  # first_name
    traversal = traversal.by('last_name')  # last_name
    traversal = traversal.by('full_name')  # full_name
    traversal = traversal.by('github_username')  # github_username
    traversal = traversal.by('team_name')  # team_name
    traversal = traversal.by('employee_type')  # employee_type
    traversal = traversal.by(
        __.coalesce(
            __.out(User.USER_MANAGER_RELATION_TYPE).values('email'),
            __.constant('')))  # manager_email
    traversal = traversal.by('slack_id')  # slack_id
    traversal = traversal.by('is_active')  # is_active
    traversal = traversal.by('role_name')  # role_name
    traversal = traversal.by(
        __.coalesce(
            __.outE(READ_RELATION_TYPE).values('read_count'),
            __.constant(0)).sum())  # total_read
    # Count the edges themselves. Folding first collapses them into a single
    # list, so a following global count() would always report 1.
    traversal = traversal.by(
        __.outE(OWNER_OF_OBJECT_RELATION_TYPE).count())  # total_own
    traversal = traversal.by(
        __.outE('FOLLOWED_BY').count())  # total_follow
    traversal = traversal.order().by(__.select('email'), Order.asc)
    return traversal.toList()
예제 #4
0
    async def flush(self,
                    conflicts_query: Optional[GraphTraversal] = None) -> None:
        """
        Save every element waiting in the session's pending queue.

        A fresh UUID is used as the transaction id. Each queued element is
        passed to ``__dirty_element`` with that id and then saved; elements
        for which ``__dirty_element`` returned a truthy value are remembered
        so their ``dirty`` attribute can be reset to ``None`` at the end.

        If nothing was remembered, the method returns right after saving.
        Without ``conflicts_query`` the transaction is committed. With it,
        all edges and vertices whose ``dirty`` property equals the
        transaction id are aggregated and, where ``conflicts_query`` selects
        the true branch of ``choose``, their ``dirty`` properties are
        dropped; ``__rollback_transaction`` is then called (presumably to
        clean up whatever is still marked -- confirm).

        Any exception triggers ``__rollback_transaction`` and is re-raised.
        """
        transaction_id = str(uuid.uuid4())
        tagged = []
        try:
            while self._pending:
                element = self._pending.popleft()
                was_marked = self.__dirty_element(element, id=transaction_id)
                saved = await self.save(element)
                if was_marked:
                    tagged.append(saved)

            if not tagged:
                return
            if conflicts_query:
                # Collect every edge, then every vertex, marked by this
                # transaction into the side-effect collection 'x'.
                marked = self.g.E().has('dirty', transaction_id)
                marked = marked.aggregate('x').fold()
                marked = marked.V().has('dirty', transaction_id)
                marked = marked.aggregate('x')
                unmark_all = __.select('x').unfold().properties('dirty').drop()
                await (marked.choose(conflicts_query,
                                     unmark_all).iterate())  # type: ignore
                await self.__rollback_transaction(transaction_id)
            else:
                await self.__commit_transaction(transaction_id)
        except Exception as e:
            await self.__rollback_transaction(transaction_id)
            raise e
        for saved in tagged:
            saved.dirty = None
예제 #5
0
 def __add_author(self, t, author, post_url):
     """Extend traversal ``t`` to upsert an author and link it to a post.

     The author vertex is looked up by ``author['name']`` and created with
     label ``author`` (id and ``name`` both set to that value) when it is
     missing. A ``written_by`` edge is then added from the vertex
     ``post_url`` to the author vertex. When ``author`` contains a truthy
     ``img_src``, the image properties are written as well, but only if the
     author vertex has no ``img_src`` property yet.

     Returns the extended traversal; nothing is executed here.
     """
     img_src = img_height = img_width = None
     if "img_src" in author.keys():
         img_src, img_height, img_width = (
             author['img_src'],
             author['img_height'],
             author['img_width'],
         )

     author_name = author['name']
     # Fallback branch of the coalesce: create the author vertex.
     create_author = (
         __.addV('author')
         .property(T.id, author_name)
         .property('name', author_name)
     )
     upserted = t.V(author_name).fold().coalesce(__.unfold(), create_author)
     t = upserted.as_('p').addE('written_by').from_(__.V(post_url))

     if not img_src:
         return t

     # Conditionally add the img_src, img_height, and img_width property if they do not exist
     add_image = (
         __.select('p').hasNot('img_src')
         .property('img_src', img_src)
         .property('img_height', img_height)
         .property('img_width', img_width)
     )
     return t.sideEffect(add_image)
예제 #6
0
def invoke(focusObject, rootObject, componentParameters, **kwargs):
    """Write the ten highest-degree vertices of the model graph to a CSV.

    Runs ``elaborate`` on ``focusObject``, builds a graph from it with
    ``make_graph``, ranks vertices by the number of incident edges and
    writes the ``name`` and ``degree`` of the top ten to
    ``centrality.csv`` inside ``componentParameters["output_dir"]``.
    Finally sets ``componentParameters["runCommand"]``.

    Args:
        focusObject: Object to elaborate and turn into a graph.
        rootObject: Not used by this body.
        componentParameters: Mapping that provides ``output_dir`` and
            receives ``runCommand``.
        **kwargs: Not used by this body.
    """
    log("Running elaborator")
    elaborate(focusObject)

    log(pprint.pformat(componentParameters))

    g = make_graph(focusObject)

    # Find the top 10 objects in the graph by degree centrality
    # Derived from Tinkerpop example: http://tinkerpop.apache.org/docs/current/recipes/#degree-centrality
    degrees = (g.V().project("v", "name", "degree").by().by("name").by(
        __.bothE().count()).order().by(__.select("degree"),
                                       Order.desc).limit(10).toList())

    log(pprint.pformat(degrees))

    output_path = os.path.join(componentParameters["output_dir"],
                               "centrality.csv")
    # The csv module writes str, so the file must be opened in text mode;
    # newline="" lets the writer control line endings itself. Binary mode
    # ("wb") raises TypeError on the first write under Python 3.
    with open(output_path, "w", newline="",
              encoding="utf-8") as output_file:
        # Each row also carries the "v" key; extrasaction="ignore" drops it.
        writer = csv.DictWriter(output_file, ["name", "degree"],
                                extrasaction="ignore")

        writer.writeheader()
        writer.writerows(degrees)

    componentParameters["runCommand"] = "cmd.exe /c echo"
예제 #7
0
def _table_search_query(graph: GraphTraversalSource,
                        tag_filter: str) -> List[Dict]:
    """Return one search document per table vertex, ordered by name.

    Vertices labelled ``TableMetadata.TABLE_NODE_LABEL`` are selected and,
    when ``tag_filter`` is truthy, restricted to those whose
    ``published_tag`` property equals it. Each vertex is projected into a
    dict whose keys are the field names listed in ``fields`` below.

    Args:
        graph: Traversal source to query.
        tag_filter: Required ``published_tag`` value; falsy disables the
            filter.

    Returns:
        The projected dicts as a list, sorted ascending by ``name``.
    """
    to_schema = TableMetadata.TABLE_SCHEMA_RELATION_TYPE
    to_column = TableMetadata.TABLE_COL_RELATION_TYPE
    to_description = DescriptionMetadata.DESCRIPTION_RELATION_TYPE

    # (projected key, by-modulator) pairs; order defines the projection.
    fields = [
        ('database',
         __.out(to_schema).out(SCHEMA_REVERSE_RELATION_TYPE).out(
             CLUSTER_REVERSE_RELATION_TYPE).values('name')),
        ('cluster',
         __.out(to_schema).out(SCHEMA_REVERSE_RELATION_TYPE).values('name')),
        ('schema', __.out(to_schema).values('name')),
        ('schema_description',
         __.coalesce(
             __.out(to_schema).out(to_description).values('description'),
             __.constant(''))),
        ('name', 'name'),
        ('key', T.id),
        ('description',
         __.coalesce(__.out(to_description).values('description'),
                     __.constant(''))),
        ('last_updated_timestamp',
         __.coalesce(
             __.out(LASTUPDATED_RELATION_TYPE).values(TIMESTAMP_PROPERTY),
             __.constant(''))),
        ('column_names', __.out(to_column).values('name').fold()),
        ('column_descriptions',
         __.out(to_column).out(to_description).values('description').fold()),
        ('total_usage',
         __.coalesce(
             __.outE(READ_REVERSE_RELATION_TYPE).values('read_count'),
             __.constant(0)).sum()),
        ('unique_usage', __.outE(READ_REVERSE_RELATION_TYPE).count()),
        ('tags',
         __.inE(TableMetadata.TAG_TABLE_RELATION_TYPE).outV().values(
             METADATA_KEY_PROPERTY_NAME).fold()),
        ('badges', __.out('HAS_BADGE').values('keys').dedup().fold()),
        ('programmatic_descriptions',
         __.out(DescriptionMetadata.PROGRAMMATIC_DESCRIPTION_NODE_LABEL
                ).values('description').fold()),
    ]

    query = graph.V().hasLabel(TableMetadata.TABLE_NODE_LABEL)
    if tag_filter:
        query = query.has('published_tag', tag_filter)

    query = query.project(*[key for key, _ in fields])
    for _, modulator in fields:
        query = query.by(modulator)

    query = query.order().by(__.select('name'), Order.asc)
    return query.toList()
예제 #8
0
        def insert_attr(graph_conn, attr_val_dict, target_id, node_id, vertex_type):
            """Create vertex ``node_id`` if missing, then the CATEGORY edge.

            Traversals go through the name ``g`` from the enclosing scope;
            ``graph_conn`` is accepted but not read. The edge from
            ``target_id`` to ``node_id`` gets the id
            ``'<target_id>-<node_id>'`` and is only added when no edge with
            that id exists.
            """
            # NOTE(review): `id` is whatever the enclosing module binds to
            # that name; presumably Gremlin's T.id token.
            vertex_exists = g.V().has(id, node_id).hasNext()
            if not vertex_exists:
                logger.info(f'Insert_Vertex: {node_id}.')
                # Copy each key/value entry of the injected map onto 'v'.
                copy_entries = (
                    __.select(vertex_type).unfold().as_('kv').select('v')
                    .property(__.select('kv').by(Column.keys),
                              __.select('kv').by(Column.values)))
                (g.inject(attr_val_dict).unfold().as_(vertex_type)
                 .addV(vertex_type).as_('v').property(id, node_id)
                 .sideEffect(copy_entries)
                 .iterate())

            # Edge target_id --> node_id
            destination = g.V().has(id, node_id).next()
            edge_key = target_id + '-' + node_id
            edge_exists = g.E().has(id, edge_key).hasNext()
            if not edge_exists:
                logger.info(f'Insert_Edge: {target_id} --> {node_id}.')
                (g.V().has(id, target_id)
                 .addE('CATEGORY').to(destination)
                 .property(id, edge_key)
                 .iterate())
예제 #9
0
    def _depth_search(self,
                      start_vertex,
                      traverser,
                      search_depth=DEFAULT_SEARCH_DEPTH):
        """Walk outward from ``start_vertex`` and list what was reached.

        The step ``traverser`` followed by ``otherV()`` is repeated
        ``search_depth`` times, emitting after every repetition. Each
        emitted element is returned as a dict with ``'e'`` (folded
        ``valueMap(True)`` of the element labelled ``e``) and ``'v'``
        (folded ``valueMap(True)`` of the vertex reached).

        Args:
            start_vertex: Vertex (or id) passed to ``self.g.V``.
            traverser: Traversal applied on every hop; presumably an
                anonymous edge step, since ``otherV()`` follows it --
                confirm against callers.
            search_depth: Number of repetitions; converted with ``int``.
                ``None`` falls back to ``DEFAULT_SEARCH_DEPTH``.
        """
        if search_depth is None:
            hops = DEFAULT_SEARCH_DEPTH
        else:
            hops = int(search_depth)

        origin = self.g.V(start_vertex)
        one_hop = traverser.as_('e').otherV()
        reached = origin.repeat(one_hop).times(hops).emit()

        edge_view = __.select('e').valueMap(True).fold()
        vertex_view = __.valueMap(True).fold()
        return reached.project('e', 'v').by(edge_view).by(
            vertex_view).toList()
예제 #10
0
    def _column_entities(cls, *, _g: GraphTraversalSource,
                         tables_ids: Iterable[str],
                         existing: EXISTING) -> None:
        """Load the description edges of the given tables' columns.

        Starting from the vertices in ``tables_ids``, follows the Column
        edges to the column vertices, collects each column's outgoing
        Description edges and hands them to ``cls._into_existing`` as
        ``[out-vertex id, valueMap(True), in-vertex id]`` lists.

        Args:
            _g: Traversal source to query.
            tables_ids: Ids of the table vertices to start from.
            existing: Passed through to ``cls._into_existing``.
        """
        description = EdgeTypes.Description

        # table -> column edges, landing on the column vertices
        columns = _g.V(tuple(tables_ids))
        columns = columns.outE(EdgeTypes.Column.value.label)
        columns = columns.inV().hasLabel(
            VertexTypes.Column.value.label).as_('columns')

        # fetch column -> links (no Stat)
        description_edges = __.select('columns').outE(
            description.value.label).fold()
        edges = columns.coalesce(description_edges).as_(description.name)
        edges = edges.select(description.name).unfold()

        # One list per edge: source vertex id, edge value map, target id.
        as_triple = __.union(__.outV().id(), __.valueMap(True),
                             __.inV().id()).fold()
        edges = edges.local(as_triple)
        cls._into_existing(edges.toList(), existing)
예제 #11
0
    def _repoint_vertex_edges(
            self, vertex_label: str,
            new_resource_urn: Union[URN, PartialUrn]) -> None:
        """Move edges from placeholder vertices to the real vertex, then drop them.

        A placeholder is a vertex labelled ``vertex_label`` whose
        ``_resource_id_parts`` contain every part of
        ``new_resource_urn.resource_id_parts`` and whose ``_account_id`` or
        ``_region`` is ``"unknown"``. For each placeholder, every outbound
        and inbound edge is recreated against the vertex identified by
        ``self.generate_vertex_id(new_resource_urn)``, the edge properties
        are copied, the old edge is dropped, and finally the placeholder
        itself is dropped.

        Args:
            vertex_label: Label of the vertices to inspect.
            new_resource_urn: URN of the resource the edges should point at.
        """
        # https://tinkerpop.apache.org/docs/current/recipes/#edge-move

        def copy_properties(source_label: str, target_label: str):
            """Side effect copying edge properties between two step labels."""
            return (__.select(source_label).properties().unfold().as_("p")
                    .select(target_label)
                    .property(__.select("p").key(),
                              __.select("p").value()))

        resources_of_the_same_type = self.g.V().hasLabel(vertex_label)
        for id_part in new_resource_urn.resource_id_parts:
            resources_of_the_same_type = resources_of_the_same_type.has(
                "_resource_id_parts", id_part)
        resources_with_same_id_but_unknown = resources_of_the_same_type.or_(
            __.has("_account_id", "unknown"),
            __.has("_region", "unknown")).toList()

        for old_vertex in resources_with_same_id_but_unknown:
            new_vertex_id = self.generate_vertex_id(new_resource_urn)
            # NOTE(review): the recreated edges always get the label "has",
            # whatever the label of the edge being moved. If the new vertex
            # does not exist, nothing is moved and the old edges disappear
            # with the old vertex -- confirm this is intended.

            # Outbound: old_vertex --e1--> b  becomes  new_vertex --e2--> b
            (self.g.V(old_vertex).outE().as_("e1")
             .inV().as_("b")
             .V(new_vertex_id)
             .addE("has").to("b").as_("e2")
             .sideEffect(copy_properties("e1", "e2"))
             .select("e1").drop().iterate())

            # Inbound: c --old--> old_vertex  becomes  c --new--> new_vertex
            # Step labels are local to one traversal, so the new vertex is
            # looked up again here rather than selected from the outbound
            # traversal, and the partner of an inbound edge is its outV().
            (self.g.V(old_vertex).inE().as_("old_inbound_edge")
             .outV().as_("c")
             .V(new_vertex_id)
             .addE("has").from_("c").as_("new_inbound_edge")
             .sideEffect(
                 copy_properties("old_inbound_edge", "new_inbound_edge"))
             .select("old_inbound_edge").drop().iterate())

            # Delete old vertex
            self.g.V(old_vertex).drop().iterate()
예제 #12
0
def _dashboard_search_query(graph: GraphTraversalSource,
                            tag_filter: str) -> List[Dict]:
    """Return one search document per dashboard vertex, ordered by name.

    Only vertices labelled ``DashboardMetadata.DASHBOARD_NODE_LABEL`` that
    carry a ``name`` property are considered. When ``tag_filter`` is truthy,
    results are further restricted to vertices whose ``published_tag``
    property equals it.

    Args:
        graph: Traversal source to query.
        tag_filter: Required ``published_tag`` value; falsy disables the
            filter.

    Returns:
        A list of dicts with the projected keys ``group_name``, ``name``,
        ``cluster``, ``description``, ``group_description``, ``group_url``,
        ``url``, ``uri``, ``last_successful_run_timestamp``,
        ``query_names``, ``chart_names``, ``total_usage``, ``tags`` and
        ``badges``, plus ``product``: the part of ``uri`` before its first
        underscore.
    """
    traversal = graph.V().hasLabel(DashboardMetadata.DASHBOARD_NODE_LABEL)
    traversal = traversal.has('name')
    if tag_filter:
        # `has(key, value)` is the property-equality filter (and what
        # `_table_search_query` uses); `where` expects a predicate or a
        # traversal, not a plain value.
        traversal = traversal.has('published_tag', tag_filter)

    traversal = traversal.project('group_name', 'name', 'cluster',
                                  'description', 'group_description',
                                  'group_url', 'url', 'uri',
                                  'last_successful_run_timestamp',
                                  'query_names', 'chart_names', 'total_usage',
                                  'tags', 'badges')
    traversal = traversal.by(
        __.out(
            DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).values(
                'name'))  # group_name
    traversal = traversal.by('name')  # name
    traversal = traversal.by(
        __.out(DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).out(
            DashboardMetadata.DASHBOARD_GROUP_CLUSTER_RELATION_TYPE).values(
                'name'))  # cluster
    traversal = traversal.by(
        __.coalesce(
            __.out(
                DashboardMetadata.DASHBOARD_DESCRIPTION_RELATION_TYPE).values(
                    'description'), __.constant('')))  # description
    traversal = traversal.by(
        __.coalesce(
            __.out(
                DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).out(
                    DashboardMetadata.DASHBOARD_DESCRIPTION_RELATION_TYPE).
            values('description'), __.constant('')))  # group_description
    traversal = traversal.by(
        __.out(
            DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).values(
                'dashboard_group_url'))  # group_url
    traversal = traversal.by('dashboard_url')  # url
    traversal = traversal.by('key')  # uri

    traversal = traversal.by(
        __.coalesce(
            __.out('EXECUTED').has(
                'key', TextP.endingWith(
                    '_last_successful_execution')).values('timestamp'),
            __.constant('')))  # last_successful_run_timestamp
    traversal = traversal.by(
        __.out(DashboardQuery.DASHBOARD_QUERY_RELATION_TYPE).values(
            'name').dedup().fold())  # query_names
    traversal = traversal.by(
        __.out(DashboardQuery.DASHBOARD_QUERY_RELATION_TYPE).out(
            DashboardChart.CHART_RELATION_TYPE).values(
                'name').dedup().fold())  # chart_names
    traversal = traversal.by(
        __.coalesce(
            __.outE(READ_REVERSE_RELATION_TYPE).values(
                READ_RELATION_COUNT_PROPERTY),
            __.constant(0)).sum())  # total_usage
    traversal = traversal.by(
        __.out('TAGGED_BY').has(
            'tag_type', 'default').values('keys').dedup().fold())  # tags
    traversal = traversal.by(
        __.out('HAS_BADGE').values('keys').dedup().fold())  # badges

    traversal = traversal.order().by(__.select('name'), Order.asc)

    dashboards = traversal.toList()
    for dashboard in dashboards:
        # The product is the prefix of the dashboard key up to the first '_'.
        dashboard['product'] = dashboard['uri'].split('_')[0]

    return dashboards
예제 #13
0
    def table_entities(cls, *, _g: GraphTraversalSource,
                       table_data: List[Table], existing: EXISTING) -> None:
        """Load the graph entities that already exist for ``table_data``.

        Table vertex ids are derived from each table's database, cluster,
        schema and name. For every chunk of 1000 table ids, the edges
        around each table (hierarchy edges above it, inbound links,
        outbound links) are fetched and handed to ``cls._into_existing``,
        followed by the column-level edges via ``cls._column_entities``.
        Finally the User and Application vertices derived from the table
        writers are fetched in chunks of 5000.

        Args:
            _g: Traversal source to query.
            table_data: Tables whose entities should be looked up.
            existing: Passed through to ``cls._into_existing``.
        """
        all_tables_ids = list({
            VertexTypes.Table.value.id(
                key=TableUris.get(database=t.database,
                                  cluster=t.cluster,
                                  schema=t.schema,
                                  table=t.name).table)
            for t in table_data
        })

        writer_ids = [
            t.table_writer.id for t in table_data
            if t.table_writer is not None
        ]
        all_owner_ids = list(
            {VertexTypes.User.value.id(key=key) for key in writer_ids})
        all_application_ids = list(
            set(possible_vertex_ids_for_application_key(*writer_ids)))

        # Table ids are processed in chunks because very large id lists
        # (100,000s) were observed to choke.
        for tables_ids in chunk(all_tables_ids, 1000):
            LOGGER.info(f'fetching for tables: {tables_ids}')

            hierarchy_types = (EdgeTypes.Table, EdgeTypes.Schema,
                               EdgeTypes.Cluster)
            inbound_types = (EdgeTypes.BelongToTable, EdgeTypes.Generates,
                             EdgeTypes.Tag)
            outbound_types = (EdgeTypes.Column, EdgeTypes.Description,
                              EdgeTypes.LastUpdatedAt, EdgeTypes.Source,
                              EdgeTypes.Stat)

            # fetch database -> cluster -> schema -> table links
            query = _g.V(tuple(tables_ids)).as_('tables')
            table_edges = __.inE(EdgeTypes.Table.value.label).dedup().fold()
            query = query.coalesce(table_edges).as_(EdgeTypes.Table.name)
            schema_edges = __.unfold().outV().hasLabel(
                VertexTypes.Schema.value.label).inE(
                    EdgeTypes.Schema.value.label).dedup().fold()
            query = query.coalesce(schema_edges).as_(EdgeTypes.Schema.name)
            cluster_edges = __.unfold().outV().hasLabel(
                VertexTypes.Cluster.value.label).inE(
                    EdgeTypes.Cluster.value.label).dedup().fold()
            query = query.coalesce(cluster_edges).as_(EdgeTypes.Cluster.name)

            # fetch table <- links
            for edge_type in inbound_types:
                inbound = __.select('tables').inE(
                    edge_type.value.label).fold()
                query = query.coalesce(inbound).as_(edge_type.name)

            # fetch table -> column et al links
            for edge_type in outbound_types:
                outbound = __.select('tables').outE(
                    edge_type.value.label).fold()
                query = query.coalesce(outbound).as_(edge_type.name)

            # TODO: add owners, watermarks, last timestamp existing, source
            aliases = {
                edge_type.name
                for edge_type in (hierarchy_types + inbound_types +
                                  outbound_types)
            }
            query = query.select(*aliases).unfold()
            query = query.select(MapColumn.values).unfold()
            # One list per edge: source vertex id, edge value map, target id.
            as_triple = __.union(__.outV().id(), __.valueMap(True),
                                 __.inV().id()).fold()
            query = query.local(as_triple)
            cls._into_existing(query.toList(), existing)

            cls._column_entities(_g=_g,
                                 tables_ids=tables_ids,
                                 existing=existing)

        # fetch Application, User
        vertex_ids = list(set(all_application_ids + all_owner_ids))
        for ids in chunk(vertex_ids, 5000):
            LOGGER.info(f'fetching for application/owners: {ids}')
            found = _g.V(ids).valueMap(True).toList()
            cls._into_existing(found, existing)
    def insert_new_transaction_vertex_and_edge(self,
                                               tr_dict,
                                               connectted_node_dict,
                                               target_id,
                                               vertex_type='Transaction'):
        """Insert a transaction vertex and link it to its connected nodes.

        Opens a remote connection through ``self.gremlin_utils``, adds the
        vertex ``target_id`` (labelled ``vertex_type``) unless it already
        exists, then, for every ``key: value`` item of
        ``connectted_node_dict[0]``, makes sure a vertex with id
        ``'<key>-<value>'`` and label ``key`` exists and that a ``CATEGORY``
        edge with id ``'<target_id>-<node_id>'`` runs from the transaction
        vertex to it. Newly created connected vertices receive a single
        property, named by ``attr_version_key``, holding a JSON object that
        maps ``val1`` .. ``val390`` to the string ``'0.0'``.

        The connection is closed in a ``finally`` clause so it is released
        even when one of the traversals raises.

        Args:
            tr_dict: Value injected into the traversal that creates the
                transaction vertex; its key/value entries become
                single-cardinality vertex properties. Presumably a
                one-element list wrapping the property dict, like the list
                passed for connected nodes -- confirm against callers.
            connectted_node_dict: Indexable whose first element is a mapping
                of node type to node value.
            target_id: Id of the transaction vertex.
            vertex_type: Label of the transaction vertex.

        Example::

            insert_new_transaction_vertex_and_edge(
                tr_dict, connectted_node_dict, target_id,
                vertex_type='Transaction')
        """
        # NOTE(review): `id` is whatever the module binds to that name;
        # presumably Gremlin's T.id token rather than the Python builtin.

        def add_vertex(graph, properties, vertex_id, label):
            """Add vertex ``vertex_id`` and copy ``properties`` onto it."""
            copy_entries = (
                __.select(label).unfold().as_('kv').select('v')
                .property(Cardinality.single,
                          __.select('kv').by(Column.keys),
                          __.select('kv').by(Column.values)))
            (graph.inject(properties).unfold().as_(label)
             .addV(label).as_('v').property(id, vertex_id)
             .sideEffect(copy_entries)
             .iterate())

        def insert_attr(graph_conn, attr_val_dict, target_id, node_id,
                        vertex_type):
            """Create vertex ``node_id`` if missing, then the CATEGORY edge."""
            if not graph_conn.V().has(id, node_id).hasNext():
                logger.info(f'Insert_Vertex: {node_id}.')
                add_vertex(graph_conn, attr_val_dict, node_id, vertex_type)
            else:
                logger.debug(
                    f'Ignore inserting existing Vertex with id {node_id}')

            # Insert_edge
            to_node = graph_conn.V().has(id, node_id).next()
            edgeId = target_id + '-' + node_id
            if not graph_conn.E().has(id, edgeId).hasNext():
                logger.info(f'Insert_Edge: {target_id} --> {node_id}.')
                (graph_conn.V().has(id, target_id)
                 .addE('CATEGORY').to(to_node)
                 .property(id, edgeId)
                 .iterate())
            else:
                logger.debug(
                    f'Ignore inserting existing edge with id {edgeId}')

        conn = self.gremlin_utils.remote_connection()
        try:
            g = self.gremlin_utils.traversal_source(connection=conn)

            if not g.V().has(id, target_id).hasNext():
                logger.info(f'Insert_Vertex: {target_id}.')
                add_vertex(g, tr_dict, target_id, vertex_type)

            # The placeholder payload is identical for every connected
            # node, so serialize it once.
            cols = {'val' + str(i + 1): '0.0' for i in range(390)}
            serialized_cols = json.dumps(cols)
            for node_k, node_v in connectted_node_dict[0].items():
                node_id = node_k + '-' + str(node_v)
                empty_node_dict = [{attr_version_key: serialized_cols}]
                insert_attr(g,
                            empty_node_dict,
                            target_id,
                            node_id,
                            vertex_type=node_k)
        finally:
            # Release the connection on success and on failure alike.
            conn.close()