def insert_new_transaction_vertex_and_edge(self, tr_dict, connectted_node_dict, target_id, vertex_type='Transaction'):
    """Load transaction data, insert the transaction object and its related domain
    objects into the graph database as vertices, with their properties as values,
    and insert their relations as edges.

    Example:
        >>> insert_new_transaction_vertex_and_edge(tr_dict, connectted_node_dict, target_id, vertex_type='Transaction')
    """
    def insert_attr(graph_conn, attr_val_dict, target_id, node_id, vertex_type):
        # Closes over the traversal source `g` created below; graph_conn is passed in but not used.
        if not g.V().has(id, node_id).hasNext():
            logger.info(f'Insert_Vertex: {node_id}.')
            # Inject the property map, create the vertex, and copy every key/value pair onto it.
            (g.inject(attr_val_dict).unfold().as_(vertex_type)
             .addV(vertex_type).as_('v').property(id, node_id)
             .sideEffect(__.select(vertex_type).unfold().as_('kv')
                         .select('v')
                         .property(__.select('kv').by(Column.keys),
                                   __.select('kv').by(Column.values)))
             .iterate())

        # Insert_edge
        to_node = g.V().has(id, node_id).next()
        if not g.E().has(id, target_id + '-' + node_id).hasNext():
            logger.info(f'Insert_Edge: {target_id} --> {node_id}.')
            g.V().has(id, target_id).addE('CATEGORY').to(to_node).property(
                id, target_id + '-' + node_id).iterate()

    conn = self.gremlin_utils.remote_connection()
    g = self.gremlin_utils.traversal_source(connection=conn)
    if not g.V().has(id, target_id).hasNext():
        logger.info(f'Insert_Vertex: {target_id}.')
        (g.inject(tr_dict).unfold().as_(vertex_type)
         .addV(vertex_type).as_('v').property(id, target_id)
         .sideEffect(__.select(vertex_type).unfold().as_('kv')
                     .select('v')
                     .property(__.select('kv').by(Column.keys),
                               __.select('kv').by(Column.values)))
         .iterate())

    # One zero-valued property per attribute column for the connected domain vertices.
    attr_cols = [f'val{x}' for x in range(1, 391)]
    empty_node_dict = {}
    for attr in attr_cols:
        empty_node_dict[attr] = 0.0

    for node_k, node_v in connectted_node_dict[0].items():
        node_id = node_k + '-' + str(node_v)
        insert_attr(g, [empty_node_dict], target_id, node_id, vertex_type=node_k)

    conn.close()
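
# The snippet above relies on a bare `id` token (presumably the module's alias for the
# Gremlin T.id token) and a project-specific gremlin_utils helper. Below is a minimal,
# self-contained sketch of the same conditional-insert idiom, assuming gremlinpython,
# a hypothetical endpoint at ws://localhost:8182/gremlin, and a graph that allows
# user-supplied ids (e.g. TinkerGraph or Neptune). It creates the vertex only if the
# id is absent and copies every key/value pair of the injected map onto it.
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.traversal import T, Column
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection

conn = DriverRemoteConnection('ws://localhost:8182/gremlin', 'g')  # hypothetical endpoint
g = traversal().withRemote(conn)

node_id = 'card-001'                     # hypothetical vertex id
props = [{'val1': 0.0, 'val2': 0.0}]     # hypothetical attribute map

if not g.V(node_id).hasNext():
    (g.inject(props).unfold().as_('m')
     .addV('Card').as_('v').property(T.id, node_id)
     .sideEffect(__.select('m').unfold().as_('kv')
                 .select('v')
                 .property(__.select('kv').by(Column.keys),
                           __.select('kv').by(Column.values)))
     .iterate())

conn.close()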
def _user_search_query(graph: GraphTraversalSource, tag_filter: str) -> List[Dict]:
    traversal = graph.V().hasLabel(User.USER_NODE_LABEL)
    traversal = traversal.has(User.USER_NODE_FULL_NAME)
    if tag_filter:
        traversal = traversal.has('published_tag', tag_filter)
    traversal = traversal.project('email', 'first_name', 'last_name', 'full_name', 'github_username',
                                  'team_name', 'employee_type', 'manager_email', 'slack_id', 'is_active',
                                  'role_name', 'total_read', 'total_own', 'total_follow')
    traversal = traversal.by('email')  # email
    traversal = traversal.by('first_name')  # first_name
    traversal = traversal.by('last_name')  # last_name
    traversal = traversal.by('full_name')  # full_name
    traversal = traversal.by('github_username')  # github_username
    traversal = traversal.by('team_name')  # team_name
    traversal = traversal.by('employee_type')  # employee_type
    traversal = traversal.by(
        __.coalesce(
            __.out(User.USER_MANAGER_RELATION_TYPE).values('email'),
            __.constant('')))  # manager_email
    traversal = traversal.by('slack_id')  # slack_id
    traversal = traversal.by('is_active')  # is_active
    traversal = traversal.by('role_name')  # role_name
    traversal = traversal.by(
        __.coalesce(
            __.outE(READ_RELATION_TYPE).values('read_count'),
            __.constant(0)).sum())  # total_read
    traversal = traversal.by(
        __.outE(OWNER_OF_OBJECT_RELATION_TYPE).count())  # total_own
    traversal = traversal.by(
        __.outE('FOLLOWED_BY').count())  # total_follow
    traversal = traversal.order().by(__.select('email'), Order.asc)
    return traversal.toList()
async def flush(self, conflicts_query: Optional[GraphTraversal] = None) -> None:
    """
    Issue creation/update queries to database for all elements in the
    session pending queue.
    """
    transaction_id = str(uuid.uuid4())
    processed = []
    try:
        while self._pending:
            elem = self._pending.popleft()
            actual_id = self.__dirty_element(elem, id=transaction_id)
            if actual_id:
                processed.append(await self.save(elem))
            else:
                await self.save(elem)
        if not processed:
            return
        if not conflicts_query:
            await self.__commit_transaction(transaction_id)
        else:
            await (
                self.g.E().has('dirty', transaction_id).aggregate('x')
                .fold()
                .V().has('dirty', transaction_id).aggregate('x')
                .choose(conflicts_query,
                        __.select('x').unfold().properties('dirty').drop())
                .iterate())  # type: ignore
            await self.__rollback_transaction(transaction_id)
    except Exception as e:
        await self.__rollback_transaction(transaction_id)
        raise e
    for elem in processed:
        elem.dirty = None
def __add_author(self, t, author, post_url):
    img_src = None
    if "img_src" in author.keys():
        img_src = author['img_src']
        img_height = author['img_height']
        img_width = author['img_width']

    t = (
        t.V(author['name'])
        .fold()
        .coalesce(
            __.unfold(),
            __.addV('author')
            .property(T.id, author['name'])
            .property('name', author['name'])
        ).as_('p').addE('written_by').from_(__.V(post_url))
    )

    # Conditionally add the img_src, img_height, and img_width properties if they do not exist
    if img_src:
        t = (
            t.sideEffect(
                __.select('p').hasNot('img_src')
                .property('img_src', img_src)
                .property('img_height', img_height)
                .property('img_width', img_width)
            )
        )

    return t
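
# The fold()/coalesce(unfold(), addV(...)) chain above is the standard Gremlin
# "get or create" upsert: reuse the vertex if it exists, otherwise create it.
# A minimal, self-contained sketch of just that idiom, assuming gremlinpython,
# a hypothetical ws://localhost:8182/gremlin endpoint, and a graph that allows
# user-supplied string ids:
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.traversal import T
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection

conn = DriverRemoteConnection('ws://localhost:8182/gremlin', 'g')  # hypothetical endpoint
g = traversal().withRemote(conn)

name = 'Ada Lovelace'  # hypothetical author name
author = (g.V(name).fold()
          .coalesce(__.unfold(),                 # vertex already exists: reuse it
                    __.addV('author')            # otherwise create it
                      .property(T.id, name)
                      .property('name', name))
          .next())
conn.close()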
def invoke(focusObject, rootObject, componentParameters, **kwargs):
    log("Running elaborator")
    elaborate(focusObject)
    log(pprint.pformat(componentParameters))
    g = make_graph(focusObject)

    # Find the top 10 objects in the graph by degree centrality
    # Derived from the TinkerPop example: http://tinkerpop.apache.org/docs/current/recipes/#degree-centrality
    degrees = (g.V().project("v", "name", "degree")
               .by()
               .by("name")
               .by(__.bothE().count())
               .order().by(__.select("degree"), Order.desc)
               .limit(10)
               .toList())
    log(pprint.pformat(degrees))

    with open(os.path.join(componentParameters["output_dir"], "centrality.csv"),
              "w", newline="") as output_file:
        writer = csv.DictWriter(output_file, ["name", "degree"], extrasaction="ignore")
        writer.writeheader()
        writer.writerows(degrees)

    componentParameters["runCommand"] = "cmd.exe /c echo"
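
# The same degree-centrality recipe can also be expressed as a single grouped map
# (name -> degree) rather than a sorted projection. A sketch, assuming the same
# traversal source `g` returned by make_graph() above:
from gremlin_python.process.graph_traversal import __

degree_by_name = g.V().group().by("name").by(__.bothE().count()).next()
# e.g. {'objectA': 3, 'objectB': 1, ...}, depending on the graph contents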
def _table_search_query(graph: GraphTraversalSource, tag_filter: str) -> List[Dict]:
    traversal = graph.V().hasLabel(TableMetadata.TABLE_NODE_LABEL)
    if tag_filter:
        traversal = traversal.has('published_tag', tag_filter)
    traversal = traversal.project('database', 'cluster', 'schema', 'schema_description', 'name', 'key',
                                  'description', 'last_updated_timestamp', 'column_names',
                                  'column_descriptions', 'total_usage', 'unique_usage', 'tags', 'badges',
                                  'programmatic_descriptions')
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).out(SCHEMA_REVERSE_RELATION_TYPE).out(
            CLUSTER_REVERSE_RELATION_TYPE).values('name'))  # database
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).out(
            SCHEMA_REVERSE_RELATION_TYPE).values('name'))  # cluster
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).values('name'))  # schema
    traversal = traversal.by(
        __.coalesce(
            __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).out(
                DescriptionMetadata.DESCRIPTION_RELATION_TYPE).values('description'),
            __.constant('')))  # schema_description
    traversal = traversal.by('name')  # name
    traversal = traversal.by(T.id)  # key
    traversal = traversal.by(
        __.coalesce(
            __.out(DescriptionMetadata.DESCRIPTION_RELATION_TYPE).values('description'),
            __.constant('')))  # description
    traversal = traversal.by(
        __.coalesce(
            __.out(LASTUPDATED_RELATION_TYPE).values(TIMESTAMP_PROPERTY),
            __.constant('')))  # last_updated_timestamp
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_COL_RELATION_TYPE).values('name').fold())  # column_names
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_COL_RELATION_TYPE).out(
            DescriptionMetadata.DESCRIPTION_RELATION_TYPE).values('description').fold())  # column_descriptions
    traversal = traversal.by(
        __.coalesce(
            __.outE(READ_REVERSE_RELATION_TYPE).values('read_count'),
            __.constant(0)).sum())  # total_usage
    traversal = traversal.by(
        __.outE(READ_REVERSE_RELATION_TYPE).count())  # unique_usage
    traversal = traversal.by(
        __.inE(TableMetadata.TAG_TABLE_RELATION_TYPE).outV().values(
            METADATA_KEY_PROPERTY_NAME).fold())  # tags
    traversal = traversal.by(
        __.out('HAS_BADGE').values('keys').dedup().fold())  # badges
    traversal = traversal.by(
        __.out(DescriptionMetadata.PROGRAMMATIC_DESCRIPTION_NODE_LABEL).values(
            'description').fold())  # programmatic_descriptions
    traversal = traversal.order().by(__.select('name'), Order.asc)
    return traversal.toList()
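
# Hypothetical invocation sketch for the search helpers above: they only need a
# GraphTraversalSource, so they can be exercised against any Gremlin endpoint
# (the endpoint URL below is an assumption).
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection

conn = DriverRemoteConnection('ws://localhost:8182/gremlin', 'g')  # hypothetical endpoint
graph = traversal().withRemote(conn)

rows = _table_search_query(graph, tag_filter='')  # empty string: no tag filtering
for row in rows:
    print(row['key'], row['name'], row['total_usage'])
conn.close()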
def _depth_search(self, start_vertex, traverser, search_depth=DEFAULT_SEARCH_DEPTH):
    depth = int(search_depth) if search_depth is not None else DEFAULT_SEARCH_DEPTH
    return (self.g.V(start_vertex)
            .repeat(traverser.as_('e').otherV()).times(depth).emit()
            .project('e', 'v')
            .by(__.select('e').valueMap(True).fold())
            .by(__.valueMap(True).fold())
            .toList())
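
# What _depth_search builds, written out as a standalone sketch: walk `depth` hops
# of a caller-supplied edge traversal from a start vertex and return each traversed
# edge alongside the vertex it reached, both as folded valueMap(True) results.
# The endpoint, start vertex id, and 'LINKS_TO' edge label are assumptions.
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import __
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection

conn = DriverRemoteConnection('ws://localhost:8182/gremlin', 'g')  # hypothetical endpoint
g = traversal().withRemote(conn)

depth = 2
rows = (g.V('vertex-123')
        .repeat(__.outE('LINKS_TO').as_('e').otherV()).times(depth).emit()
        .project('e', 'v')
        .by(__.select('e').valueMap(True).fold())
        .by(__.valueMap(True).fold())
        .toList())
conn.close()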
def _column_entities(cls, *, _g: GraphTraversalSource, tables_ids: Iterable[str], existing: EXISTING) -> None:
    # fetch table -> column links
    g = _g.V(tuple(tables_ids))
    g = g.outE(EdgeTypes.Column.value.label)
    g = g.inV().hasLabel(VertexTypes.Column.value.label).as_('columns')
    # fetch column -> links (no Stat)
    for t in [EdgeTypes.Description]:
        g = g.coalesce(__.select('columns').outE(t.value.label).fold()).as_(t.name)
    g = g.select(EdgeTypes.Description.name).unfold()
    g = g.local(__.union(__.outV().id(), __.valueMap(True), __.inV().id()).fold())
    cls._into_existing(g.toList(), existing)
def _repoint_vertex_edges(self, vertex_label: str, new_resource_urn: Union[URN, PartialUrn]) -> None:
    # https://tinkerpop.apache.org/docs/current/recipes/#edge-move
    resources_of_the_same_type = self.g.V().as_("old_vertex").hasLabel(vertex_label)
    for id_part in new_resource_urn.resource_id_parts:
        resources_of_the_same_type.has("_resource_id_parts", id_part)
    resources_with_same_id_but_unknown = (resources_of_the_same_type.or_(
        __.has("_account_id", "unknown"),
        __.has("_region", "unknown"))).toList()

    for old_vertex in resources_with_same_id_but_unknown:
        # Outbound
        old_vertices_outbound_edges = self.g.V(old_vertex).outE().as_("e1")
        old_outbound_edges_partner_vertex = old_vertices_outbound_edges.inV().as_("b")
        new_vertex = old_outbound_edges_partner_vertex.V(
            self.generate_vertex_id(new_resource_urn)).as_("new_vertex")
        add_old_outbound_edges_to_new_vertex = (
            new_vertex.addE("has").to("b").as_("e2").sideEffect(
                __.select("e1").properties().unfold().as_("p")
                .select("e2").property(__.select("p").key(), __.select("p").value())))
        add_old_outbound_edges_to_new_vertex.select("e1").drop().iterate()

        # Inbound
        old_vertices_inbound_edges = self.g.V(old_vertex).select("old_vertex").inE().as_("old_inbound_edge")
        old_inbound_edges_partner_vertex = old_vertices_inbound_edges.inV().as_("c")
        new_vertex = old_inbound_edges_partner_vertex.select("new_vertex")
        add_old_inbound_edges_to_new_vertex = (
            new_vertex.addE("has").from_("c").as_("new_inbound_edge").sideEffect(
                __.select("old_inbound_edge").properties().unfold().as_("p")
                .select("new_inbound_edge").property(__.select("p").key(), __.select("p").value())))
        add_old_inbound_edges_to_new_vertex.select("old_inbound_edge").drop().iterate()

        # Delete old vertex
        self.g.V(old_vertex).drop().iterate()
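
# A compact, self-contained sketch of the edge-move recipe referenced above
# (https://tinkerpop.apache.org/docs/current/recipes/#edge-move), in its generic
# form: copy every outgoing edge, preserving its label and properties, from one
# vertex onto another, then drop the originals. The endpoint and vertex ids are
# assumptions for illustration.
from gremlin_python.process.anonymous_traversal import traversal
from gremlin_python.process.graph_traversal import __
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection

conn = DriverRemoteConnection('ws://localhost:8182/gremlin', 'g')  # hypothetical endpoint
g = traversal().withRemote(conn)

old_id, new_id = 'resource-old', 'resource-new'  # hypothetical vertex ids

(g.V(old_id).outE().as_('e1')
 .inV().as_('b')
 .V(new_id)
 .addE(__.select('e1').label()).to('b').as_('e2')            # recreate edge with its original label
 .sideEffect(__.select('e1').properties().unfold().as_('p')  # copy every edge property
             .select('e2').property(__.select('p').key(), __.select('p').value()))
 .select('e1').drop()                                        # remove the old edges
 .iterate())

conn.close()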
def _dashboard_search_query(graph: GraphTraversalSource, tag_filter: str) -> List[Dict]:
    traversal = graph.V().hasLabel(DashboardMetadata.DASHBOARD_NODE_LABEL)
    traversal = traversal.has('name')
    if tag_filter:
        traversal = traversal.has('published_tag', tag_filter)
    traversal = traversal.project('group_name', 'name', 'cluster', 'description', 'group_description',
                                  'group_url', 'url', 'uri', 'last_successful_run_timestamp', 'query_names',
                                  'chart_names', 'total_usage', 'tags', 'badges')
    traversal = traversal.by(
        __.out(DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).values('name'))  # group_name
    traversal = traversal.by('name')  # name
    traversal = traversal.by(
        __.out(DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).out(
            DashboardMetadata.DASHBOARD_GROUP_CLUSTER_RELATION_TYPE).values('name'))  # cluster
    traversal = traversal.by(
        __.coalesce(
            __.out(DashboardMetadata.DASHBOARD_DESCRIPTION_RELATION_TYPE).values('description'),
            __.constant('')))  # description
    traversal = traversal.by(
        __.coalesce(
            __.out(DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).out(
                DashboardMetadata.DASHBOARD_DESCRIPTION_RELATION_TYPE).values('description'),
            __.constant('')))  # group_description
    traversal = traversal.by(
        __.out(DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).values(
            'dashboard_group_url'))  # group_url
    traversal = traversal.by('dashboard_url')  # url
    traversal = traversal.by('key')  # uri
    traversal = traversal.by(
        __.coalesce(
            __.out('EXECUTED').has(
                'key', TextP.endingWith('_last_successful_execution')).values('timestamp'),
            __.constant('')))  # last_successful_run_timestamp
    traversal = traversal.by(
        __.out(DashboardQuery.DASHBOARD_QUERY_RELATION_TYPE).values('name').dedup().fold())  # query_names
    traversal = traversal.by(
        __.out(DashboardQuery.DASHBOARD_QUERY_RELATION_TYPE).out(
            DashboardChart.CHART_RELATION_TYPE).values('name').dedup().fold())  # chart_names
    traversal = traversal.by(
        __.coalesce(
            __.outE(READ_REVERSE_RELATION_TYPE).values(READ_RELATION_COUNT_PROPERTY),
            __.constant(0)).sum())  # total_usage
    traversal = traversal.by(
        __.out('TAGGED_BY').has('tag_type', 'default').values('keys').dedup().fold())  # tags
    traversal = traversal.by(
        __.out('HAS_BADGE').values('keys').dedup().fold())  # badges
    traversal = traversal.order().by(__.select('name'), Order.asc)

    dashboards = traversal.toList()
    for dashboard in dashboards:
        dashboard['product'] = dashboard['uri'].split('_')[0]
    return dashboards
def table_entities(cls, *, _g: GraphTraversalSource, table_data: List[Table], existing: EXISTING) -> None:
    all_tables_ids = list(set([
        VertexTypes.Table.value.id(key=TableUris.get(
            database=t.database, cluster=t.cluster, schema=t.schema, table=t.name).table)
        for t in table_data
    ]))
    all_owner_ids = list(set([
        VertexTypes.User.value.id(key=key)
        for key in [t.table_writer.id for t in table_data if t.table_writer is not None]
    ]))
    all_application_ids = list(set(list(possible_vertex_ids_for_application_key(
        *[t.table_writer.id for t in table_data if t.table_writer is not None]))))

    # chunk these since 100,000s seems to choke
    for tables_ids in chunk(all_tables_ids, 1000):
        LOGGER.info(f'fetching for tables: {tables_ids}')
        # fetch database -> cluster -> schema -> table links
        g = _g.V(tuple(tables_ids)).as_('tables')
        g = g.coalesce(__.inE(
            EdgeTypes.Table.value.label).dedup().fold()).as_(EdgeTypes.Table.name)
        g = g.coalesce(__.unfold().outV().hasLabel(
            VertexTypes.Schema.value.label).inE(
                EdgeTypes.Schema.value.label).dedup().fold()).as_(EdgeTypes.Schema.name)
        g = g.coalesce(__.unfold().outV().hasLabel(
            VertexTypes.Cluster.value.label).inE(
                EdgeTypes.Cluster.value.label).dedup().fold()).as_(EdgeTypes.Cluster.name)
        # fetch table <- links
        for t in (EdgeTypes.BelongToTable, EdgeTypes.Generates, EdgeTypes.Tag):
            g = g.coalesce(__.select('tables').inE(t.value.label).fold()).as_(t.name)
        # fetch table -> column et al links
        for t in (EdgeTypes.Column, EdgeTypes.Description, EdgeTypes.LastUpdatedAt,
                  EdgeTypes.Source, EdgeTypes.Stat):
            g = g.coalesce(__.select('tables').outE(t.value.label).fold()).as_(t.name)
        # TODO: add owners, watermarks, last timestamp existing, source
        aliases = set([
            t.name for t in (EdgeTypes.Table, EdgeTypes.Schema, EdgeTypes.Cluster,
                             EdgeTypes.BelongToTable, EdgeTypes.Generates, EdgeTypes.Tag,
                             EdgeTypes.Column, EdgeTypes.Description, EdgeTypes.LastUpdatedAt,
                             EdgeTypes.Source, EdgeTypes.Stat)
        ])
        g = g.select(*aliases).unfold().select(MapColumn.values).unfold()
        g = g.local(__.union(__.outV().id(), __.valueMap(True), __.inV().id()).fold())
        cls._into_existing(g.toList(), existing)
        cls._column_entities(_g=_g, tables_ids=tables_ids, existing=existing)

    # fetch Application, User
    for ids in chunk(list(set(all_application_ids + all_owner_ids)), 5000):
        LOGGER.info(f'fetching for application/owners: {ids}')
        g = _g.V(ids).valueMap(True)
        cls._into_existing(g.toList(), existing)
def insert_new_transaction_vertex_and_edge(self, tr_dict, connectted_node_dict, target_id, vertex_type='Transaction'):
    """Load transaction data, insert the transaction object and its related domain
    objects into the graph database as vertices, with their properties as values,
    and insert their relations as edges.

    Example:
        >>> insert_new_transaction_vertex_and_edge(tr_dict, connectted_node_dict, target_id, vertex_type='Transaction')
    """
    def insert_attr(graph_conn, attr_val_dict, target_id, node_id, vertex_type):
        # Closes over the traversal source `g` created below; graph_conn is passed in but not used.
        if not g.V().has(id, node_id).hasNext():
            logger.info(f'Insert_Vertex: {node_id}.')
            # Inject the property map, create the vertex, and copy every key/value pair onto it.
            (g.inject(attr_val_dict).unfold().as_(vertex_type)
             .addV(vertex_type).as_('v').property(id, node_id)
             .sideEffect(__.select(vertex_type).unfold().as_('kv')
                         .select('v')
                         .property(Cardinality.single,
                                   __.select('kv').by(Column.keys),
                                   __.select('kv').by(Column.values)))
             .iterate())
        else:
            logger.debug(f'Ignore inserting existing Vertex with id {node_id}')

        # Insert_edge
        to_node = g.V().has(id, node_id).next()
        edgeId = target_id + '-' + node_id
        if not g.E().has(id, edgeId).hasNext():
            logger.info(f'Insert_Edge: {target_id} --> {node_id}.')
            g.V().has(id, target_id).addE('CATEGORY').to(to_node).property(id, edgeId).iterate()
        else:
            logger.debug(f'Ignore inserting existing edge with id {edgeId}')

    conn = self.gremlin_utils.remote_connection()
    g = self.gremlin_utils.traversal_source(connection=conn)
    if not g.V().has(id, target_id).hasNext():
        logger.info(f'Insert_Vertex: {target_id}.')
        (g.inject(tr_dict).unfold().as_(vertex_type)
         .addV(vertex_type).as_('v').property(id, target_id)
         .sideEffect(__.select(vertex_type).unfold().as_('kv')
                     .select('v')
                     .property(Cardinality.single,
                               __.select('kv').by(Column.keys),
                               __.select('kv').by(Column.values)))
         .iterate())

    # Pack all 390 attribute columns into a single JSON string stored under attr_version_key.
    cols = {'val' + str(i + 1): '0.0' for i in range(390)}
    for node_k, node_v in connectted_node_dict[0].items():
        node_id = node_k + '-' + str(node_v)
        empty_node_dict = {}
        empty_node_dict[attr_version_key] = json.dumps(cols)
        empty_node_dict = [empty_node_dict]
        insert_attr(g, empty_node_dict, target_id, node_id, vertex_type=node_k)

    conn.close()
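
# Read-back sketch for the JSON-packed attributes written by the variant above.
# Illustrative only: it assumes the same module-level attr_version_key constant
# and `id` token, a traversal source `g` built as in the function, and a
# hypothetical node id.
import json

packed = g.V().has(id, 'card-001').values(attr_version_key).next()
attr_values = json.loads(packed)  # e.g. {'val1': '0.0', ..., 'val390': '0.0'}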