def _calculate_outputs(topo_traversal: GraphTraversalSource,
                       source_vertex: Vertex,
                       arrival_rates: ARRIVAL_RATES,
                       output_rates: DefaultDict[int, Dict[str, float]],
                       coefficients: pd.Series,
                       ) -> DefaultDict[int, Dict[str, float]]:

    source_task: int = (topo_traversal.V(source_vertex)
                        .properties("task_id").value().next())

    in_streams: List[Dict[str, str]] = \
        (topo_traversal.V(source_vertex).inE("logically_connected")
         .project("stream_name", "source_component")
         .by(properties("stream").value())
         .by(outV().properties("component").value())
         .dedup()
         .toList())

    out_streams: List[str] = \
        (topo_traversal.V(source_vertex)
         .outE("logically_connected").values("stream")
         .dedup().toList())

    for out_stream in out_streams:
        output_rate: float = 0.0
        for in_stream in in_streams:
            in_stream_name: str = in_stream["stream_name"]
            source_component: str = in_stream["source_component"]

            stream_arrivals: float = \
                arrival_rates[source_task][(in_stream_name, source_component)]

            try:
                coefficient: float = float(
                    coefficients.loc[source_task, out_stream, in_stream_name,
                                     source_component])
            except KeyError:
                LOG.debug("No coefficient available for source task %d, "
                          "out stream %s, in stream %s from component %s",
                          source_task, out_stream, in_stream_name,
                          source_component)
            else:
                output_rate += (stream_arrivals * coefficient)

        # It is possible that some of the IO coefficients may be negative,
        # implying that the more you receive on an input stream the less you
        # output to a given output stream. If we anticipate a large arrival on
        # this negative input stream and low on other positive streams then it
        # is possible that the predicted output rate could be negative (which
        # is obviously meaningless).
        if output_rate < 0.0:
            output_rate = 0.0

        output_rates[source_task][out_stream] = output_rate

    return output_rates
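# Illustrative sketch (not part of the original module): the coefficients
# argument above is assumed to be a pandas Series keyed by a four-level
# MultiIndex of (task, out_stream, in_stream, source_component), so that
# coefficients.loc[task, out_stream, in_stream, component] yields a single
# float. The index names and values below are hypothetical; pd is the
# module's existing pandas import.
_example_coefficients = pd.Series(
    [1.5, 0.8],
    index=pd.MultiIndex.from_tuples(
        [(7, "default", "default", "splitter"),
         (7, "ack", "default", "splitter")],
        names=["task", "out_stream", "in_stream", "source_component"]))

# A lookup analogous to the one performed in _calculate_outputs:
assert float(_example_coefficients.loc[7, "default", "default", "splitter"]) == 1.5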
def _user_search_query(graph: GraphTraversalSource, tag_filter: str) -> List[Dict]:
    traversal = graph.V().hasLabel(User.USER_NODE_LABEL)
    traversal = traversal.has(User.USER_NODE_FULL_NAME)
    if tag_filter:
        traversal = traversal.has('published_tag', tag_filter)
    traversal = traversal.project('email', 'first_name', 'last_name',
                                  'full_name', 'github_username', 'team_name',
                                  'employee_type', 'manager_email', 'slack_id',
                                  'is_active', 'role_name', 'total_read',
                                  'total_own', 'total_follow')
    traversal = traversal.by('email')  # email
    traversal = traversal.by('first_name')  # first_name
    traversal = traversal.by('last_name')  # last_name
    traversal = traversal.by('full_name')  # full_name
    traversal = traversal.by('github_username')  # github_username
    traversal = traversal.by('team_name')  # team_name
    traversal = traversal.by('employee_type')  # employee_type
    traversal = traversal.by(
        __.coalesce(
            __.out(User.USER_MANAGER_RELATION_TYPE).values('email'),
            __.constant('')))  # manager_email
    traversal = traversal.by('slack_id')  # slack_id
    traversal = traversal.by('is_active')  # is_active
    traversal = traversal.by('role_name')  # role_name
    traversal = traversal.by(
        __.coalesce(
            __.outE(READ_RELATION_TYPE).values('read_count'),
            __.constant(0)).sum())  # total_read
    traversal = traversal.by(
        __.outE(OWNER_OF_OBJECT_RELATION_TYPE).fold().count())  # total_own
    traversal = traversal.by(
        __.outE('FOLLOWED_BY').fold().count())  # total_follow
    traversal = traversal.order().by(__.select('email'), Order.asc)
    return traversal.toList()
def get_comp_links_by_grouping(graph_traversal: GraphTraversalSource,
                               grouping: str) -> List[Dict[str, str]]:
    """ Gets a list of component connection dictionaries. These describe all
    source->stream->destination connections with the specified grouping value
    in the topology available via the supplied graph traversal source.

    Arguments:
        graph_traversal (GraphTraversalSource):  A GraphTraversalSource
                                                 instance linked to the
                                                 topology subgraph whose
                                                 connections are to be
                                                 queried.
        grouping (str): The stream grouping of the connections to be returned.

    Returns:
        A list of dictionaries each containing "source", "stream" and
        "destination" keys of the component and stream name respectively.
    """
    component_connections: List[Dict[str, str]] = \
        (graph_traversal.V().hasLabel(P.within("bolt", "spout")).as_("source")
         .outE("logically_connected").has("grouping", grouping).as_("stream")
         .inV().as_("destination").select("source", "stream", "destination")
         .by("component").by("stream").by("component").dedup().toList())

    return component_connections
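# Hypothetical usage sketch (not part of the original module): given a
# GraphTraversalSource bound to a topology subgraph, list every connection
# with a particular grouping. The "SHUFFLE" grouping value and the helper
# name below are illustrative assumptions.
def _print_shuffle_connections(topology_traversal: GraphTraversalSource) -> None:
    for link in get_comp_links_by_grouping(topology_traversal, "SHUFFLE"):
        print(f'{link["source"]} --[{link["stream"]}]--> {link["destination"]}')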
def _table_search_query(graph: GraphTraversalSource, tag_filter: str) -> List[Dict]:
    traversal = graph.V().hasLabel(TableMetadata.TABLE_NODE_LABEL)
    if tag_filter:
        traversal = traversal.has('published_tag', tag_filter)
    traversal = traversal.project('database', 'cluster', 'schema',
                                  'schema_description', 'name', 'key',
                                  'description', 'last_updated_timestamp',
                                  'column_names', 'column_descriptions',
                                  'total_usage', 'unique_usage', 'tags',
                                  'badges', 'programmatic_descriptions')
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).out(
            SCHEMA_REVERSE_RELATION_TYPE).out(
            CLUSTER_REVERSE_RELATION_TYPE).values('name'))  # database
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).out(
            SCHEMA_REVERSE_RELATION_TYPE).values('name'))  # cluster
    traversal = traversal.by(
        __.out(
            TableMetadata.TABLE_SCHEMA_RELATION_TYPE).values('name'))  # schema
    traversal = traversal.by(
        __.coalesce(
            __.out(TableMetadata.TABLE_SCHEMA_RELATION_TYPE).out(
                DescriptionMetadata.DESCRIPTION_RELATION_TYPE).values(
                'description'),
            __.constant('')))  # schema_description
    traversal = traversal.by('name')  # name
    traversal = traversal.by(T.id)  # key
    traversal = traversal.by(
        __.coalesce(
            __.out(DescriptionMetadata.DESCRIPTION_RELATION_TYPE).values(
                'description'),
            __.constant('')))  # description
    traversal = traversal.by(
        __.coalesce(
            __.out(LASTUPDATED_RELATION_TYPE).values(TIMESTAMP_PROPERTY),
            __.constant('')))  # last_updated_timestamp
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_COL_RELATION_TYPE).values(
            'name').fold())  # column_names
    traversal = traversal.by(
        __.out(TableMetadata.TABLE_COL_RELATION_TYPE).out(
            DescriptionMetadata.DESCRIPTION_RELATION_TYPE).values(
            'description').fold())  # column_descriptions
    traversal = traversal.by(
        __.coalesce(
            __.outE(READ_REVERSE_RELATION_TYPE).values('read_count'),
            __.constant(0)).sum())  # total_usage
    traversal = traversal.by(
        __.outE(READ_REVERSE_RELATION_TYPE).count())  # unique_usage
    traversal = traversal.by(
        __.inE(TableMetadata.TAG_TABLE_RELATION_TYPE).outV().values(
            METADATA_KEY_PROPERTY_NAME).fold())  # tags
    traversal = traversal.by(
        __.out('HAS_BADGE').values('keys').dedup().fold())  # badges
    traversal = traversal.by(
        __.out(DescriptionMetadata.PROGRAMMATIC_DESCRIPTION_NODE_LABEL).values(
            'description').fold())  # programmatic_descriptions
    traversal = traversal.order().by(__.select('name'), Order.asc)
    return traversal.toList()
def _build_gremlin_vertices(g: GraphTraversalSource, row: Any) -> GraphTraversalSource:
    g = g.V(str(row["~id"])).fold().coalesce(
        __.unfold(),
        __.addV(row["~label"]).property(T.id, str(row["~id"])))
    g = _build_gremlin_properties(g, row)
    return g
def _build_gremlin_insert_vertices(
        g: GraphTraversalSource, row: Any,
        use_header_cardinality: bool = False) -> GraphTraversalSource:
    g = g.V(str(row["~id"])).fold().coalesce(
        __.unfold(),
        __.addV(row["~label"]).property(T.id, str(row["~id"])))
    g = _set_properties(g, use_header_cardinality, row)
    return g
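# Illustrative sketch (not part of the original module): the
# fold()/coalesce(__.unfold(), __.addV(...)) idiom used by the two builders
# above is the standard Gremlin "get-or-create" (upsert) pattern — the vertex
# with the given id is reused if it exists, otherwise it is created. The id,
# label and property below are hypothetical.
def _upsert_vertex_example(g: GraphTraversalSource) -> None:
    (g.V("user-42").fold()
     .coalesce(__.unfold(),
               __.addV("User").property(T.id, "user-42"))
     .property("name", "example")
     .iterate())  # iterate() submits the traversal without returning results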
def expire_connections_for_other(cls, *, _g: GraphTraversalSource,
                                 vertex_type: VertexType,
                                 keys: FrozenSet[str],
                                 existing: EXISTING) -> None:
    # V().has(label, 'key', P.without(keys)) is more intuitive but doesn't
    # scale, so instead just find all those
    g = _g.V().hasLabel(vertex_type.label).where(__.bothE())
    g = g.values(WellKnownProperties.Key.value.name)
    all_to_expire_keys = set(g.toList()).difference(keys)

    # TODO: when any vertex ids that need something besides key
    all_to_expire = set(
        vertex_type.id(key=key) for key in all_to_expire_keys)

    for to_expire in chunk(all_to_expire, 1000):
        g = _g.V(tuple(to_expire)).bothE()
        g = g.local(
            __.union(__.outV().id(), __.valueMap(True), __.inV().id()).fold())
        cls._into_existing(g.toList(), existing)
def _build_gremlin_edges(g: GraphTraversalSource, row: pd.Series) -> GraphTraversalSource:
    g = (g.V(str(row["~from"])).fold().coalesce(
        __.unfold(),
        _build_gremlin_vertices(__, {
            "~id": row["~from"],
            "~label": "Vertex"
        })).addE(row["~label"]).to(
            __.V(str(row["~to"])).fold().coalesce(
                __.unfold(),
                _build_gremlin_vertices(__, {
                    "~id": row["~to"],
                    "~label": "Vertex"
                }))))
    g = _build_gremlin_properties(g, row)
    return g
def _column_entities(cls, *, _g: GraphTraversalSource,
                     tables_ids: Iterable[str], existing: EXISTING) -> None:
    # fetch table -> column links
    g = _g.V(tuple(tables_ids))
    g = g.outE(EdgeTypes.Column.value.label)
    g = g.inV().hasLabel(VertexTypes.Column.value.label).as_('columns')

    # fetch column -> links (no Stat)
    for t in [EdgeTypes.Description]:
        g = g.coalesce(__.select('columns').outE(
            t.value.label).fold()).as_(t.name)

    g = g.select(EdgeTypes.Description.name).unfold()
    g = g.local(
        __.union(__.outV().id(), __.valueMap(True), __.inV().id()).fold())
    cls._into_existing(g.toList(), existing)
def _build_gremlin_insert_edges(
        g: GraphTraversalSource, row: pd.Series,
        use_header_cardinality: bool) -> GraphTraversalSource:
    g = (g.V(str(row["~from"])).fold().coalesce(
        __.unfold(),
        _build_gremlin_insert_vertices(__, {
            "~id": row["~from"],
            "~label": "Vertex"
        })).addE(row["~label"]).property(T.id, str(row["~id"])).to(
            __.V(str(row["~to"])).fold().coalesce(
                __.unfold(),
                _build_gremlin_insert_vertices(__, {
                    "~id": row["~to"],
                    "~label": "Vertex"
                }))))
    g = _set_properties(g, use_header_cardinality, row)
    return g
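# Hypothetical usage sketch (not part of the original module): the builder
# functions above thread the traversal through each row and leave submission
# to the caller. The DataFrame is assumed to carry the ~id, ~from, ~to and
# ~label columns the builders read; the helper name and the final iterate()
# call are assumptions about how such a batch might be submitted.
def _insert_edges_from_dataframe(g: GraphTraversalSource,
                                 edges: pd.DataFrame) -> None:
    if edges.empty:
        return
    for _, row in edges.iterrows():
        g = _build_gremlin_insert_edges(g, row, use_header_cardinality=False)
    g.iterate()  # submit the accumulated traversal in one round trip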
def get_levels(topo_traversal: GraphTraversalSource) -> List[List[Vertex]]:
    """ Gets the levels of the logical graph. The traversal starts with the
    source spouts and performs a breadth first search through the logically
    connected vertices.

    Arguments:
        topo_traversal (GraphTraversalSource):  A traversal source instance
                                                mapped to the topology
                                                subgraph whose levels are to
                                                be calculated.

    Returns:
        A list where each entry is a list of Vertex instances representing a
        level within the logical graph. The first level will be the spout
        instances.
    """
    # Only load the static enums we need so we don't pollute the globals dict
    keys = statics.staticEnums["keys"]
    values = statics.staticEnums["values"]
    local_scope = statics.staticEnums["local"]

    # Repeatedly traverse the tree defined by the logical connections, grouping
    # each group (or set, because we de-duplicate) of vertices by their depth
    # in the tree. This depth is based on the current number of times the
    # repeat step has run (loops). So you end up with a map of integer depth
    # to list of vertices which is emitted by the cap step. After this we just
    # put the hash map in key order (ascending) and then take only the values
    # (the lists of vertices) and unfold them into a list.
    # The first group by(-1) statement is so that the spout vertices are
    # included at the top of the list
    levels: List[List[Vertex]] = (
        topo_traversal.V().hasLabel("spout")
        .group("m").by(constant(-1))
        .repeat(out("logically_connected").dedup().group("m").by(loops()))
        .until(not_(outE("logically_connected")))
        .cap("m").order(local_scope).by(keys).select(values)
        .unfold().toList())

    return levels
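# Hypothetical usage sketch (not part of the original module): walk the
# levels returned by get_levels from the spouts downwards. The "component"
# property name matches the one used elsewhere in this module; the helper
# name is an assumption.
def _log_levels(topo_traversal: GraphTraversalSource) -> None:
    for depth, level in enumerate(get_levels(topo_traversal)):
        components = (topo_traversal.V(*level)
                      .values("component").dedup().toList())
        LOG.debug("Level %d contains %d vertices from components: %s",
                  depth, len(level), ", ".join(components))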
def _calculate_arrivals(topo_traversal: GraphTraversalSource,
                        source_vertex: Vertex, arrival_rates: ARRIVAL_RATES,
                        output_rates: DefaultDict[int, Dict[str, float]],
                        i2i_rps: pd.DataFrame) -> ARRIVAL_RATES:

    # Get all downstream edges and vertices for this source vertex
    out_edges: List[Dict[str, Union[str, int, float]]] = \
        (topo_traversal.V(source_vertex).outE("logically_connected")
         .project("source_task", "source_component", "stream_name",
                  "destination_task", "destination_component")
         .by(outV().properties("task_id").value())
         .by(outV().properties("component").value())
         .by(properties("stream").value())
         .by(inV().properties("task_id").value())
         .by(inV().properties("component").value())
         .toList())

    if not out_edges:
        return arrival_rates

    source_task: int = cast(int, out_edges[0]["source_task"])
    source_component: str = cast(str, out_edges[0]["source_component"])

    LOG.debug("Processing output from source instance %s_%d",
              source_component, source_task)

    for out_edge in out_edges:
        stream: str = cast(str, out_edge["stream_name"])

        try:
            stream_output: float = cast(float,
                                        output_rates[source_task][stream])
        except KeyError:
            LOG.debug("No output rate information for source task %d on "
                      "stream %s. Skipping the outgoing edge",
                      source_task, stream)
            continue

        destination_task: int = cast(int, out_edge["destination_task"])

        try:
            r_prob: float = float(
                i2i_rps.loc(axis=0)[source_task, destination_task, stream])
        except KeyError:
            LOG.debug("Unable to find routing probability for connection "
                      "from task %d to %d on stream %s",
                      source_task, destination_task, stream)
            edge_output: float = 0.0
        else:
            edge_output = (stream_output * r_prob)
            LOG.debug("Output from %s-%d to %s-%d on stream %s is "
                      "calculated as %f * %f = %f",
                      source_component, source_task,
                      out_edge["destination_component"], destination_task,
                      stream, stream_output, r_prob, edge_output)

        arrival_rates[destination_task][(stream, source_component)] += \
            edge_output

    return arrival_rates
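# Illustrative sketch (not part of the original module): i2i_rps is assumed
# to be a pandas DataFrame of instance-to-instance routing probabilities whose
# rows are indexed by a (source_task, destination_task, stream) MultiIndex,
# which is what the .loc(axis=0)[...] lookup above relies on. The column name
# and values below are hypothetical; pd is the module's existing pandas import.
_example_i2i_rps = pd.DataFrame(
    {"routing_probability": [0.25, 0.75]},
    index=pd.MultiIndex.from_tuples(
        [(7, 12, "default"), (7, 13, "default")],
        names=["source_task", "destination_task", "stream"]))

# A lookup analogous to the one performed in _calculate_arrivals:
assert float(
    _example_i2i_rps.loc(axis=0)[7, 12, "default"]["routing_probability"]) == 0.25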
def _dashboard_search_query(graph: GraphTraversalSource, tag_filter: str) -> List[Dict]:
    traversal = graph.V().hasLabel(DashboardMetadata.DASHBOARD_NODE_LABEL)
    traversal = traversal.has('name')
    if tag_filter:
        traversal = traversal.has('published_tag', tag_filter)
    traversal = traversal.project('group_name', 'name', 'cluster',
                                  'description', 'group_description',
                                  'group_url', 'url', 'uri',
                                  'last_successful_run_timestamp',
                                  'query_names', 'chart_names', 'total_usage',
                                  'tags', 'badges')
    traversal = traversal.by(
        __.out(
            DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).values(
            'name'))  # group_name
    traversal = traversal.by('name')  # name
    traversal = traversal.by(
        __.out(DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).out(
            DashboardMetadata.DASHBOARD_GROUP_CLUSTER_RELATION_TYPE).values(
            'name'))  # cluster
    traversal = traversal.by(
        __.coalesce(
            __.out(
                DashboardMetadata.DASHBOARD_DESCRIPTION_RELATION_TYPE).values(
                'description'),
            __.constant('')))  # description
    traversal = traversal.by(
        __.coalesce(
            __.out(
                DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).out(
                DashboardMetadata.DASHBOARD_DESCRIPTION_RELATION_TYPE).
            values('description'),
            __.constant('')))  # group_description
    traversal = traversal.by(
        __.out(
            DashboardMetadata.DASHBOARD_DASHBOARD_GROUP_RELATION_TYPE).values(
            'dashboard_group_url'))  # group_url
    traversal = traversal.by('dashboard_url')  # url
    traversal = traversal.by('key')  # uri
    traversal = traversal.by(
        __.coalesce(
            __.out('EXECUTED').has(
                'key', TextP.endingWith(
                    '_last_successful_execution')).values('timestamp'),
            __.constant('')))  # last_successful_run_timestamp
    traversal = traversal.by(
        __.out(DashboardQuery.DASHBOARD_QUERY_RELATION_TYPE).values(
            'name').dedup().fold())  # query_names
    traversal = traversal.by(
        __.out(DashboardQuery.DASHBOARD_QUERY_RELATION_TYPE).out(
            DashboardChart.CHART_RELATION_TYPE).values(
            'name').dedup().fold())  # chart_names
    traversal = traversal.by(
        __.coalesce(
            __.outE(READ_REVERSE_RELATION_TYPE).values(
                READ_RELATION_COUNT_PROPERTY),
            __.constant(0)).sum())  # total_usage
    traversal = traversal.by(
        __.out('TAGGED_BY').has(
            'tag_type', 'default').values('keys').dedup().fold())  # tags
    traversal = traversal.by(
        __.out('HAS_BADGE').values('keys').dedup().fold())  # badges
    traversal = traversal.order().by(__.select('name'), Order.asc)

    dashboards = traversal.toList()
    for dashboard in dashboards:
        dashboard['product'] = dashboard['uri'].split('_')[0]
    return dashboards
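# Hypothetical usage sketch (not part of the original module): the search
# helpers above all share the same (graph, tag_filter) signature, so a caller
# can fan out over them. Passing an empty tag filter disables tag filtering;
# the wrapper name and result keys below are assumptions.
def _run_all_search_queries(graph: GraphTraversalSource) -> Dict[str, List[Dict]]:
    return {
        'table': _table_search_query(graph, tag_filter=''),
        'user': _user_search_query(graph, tag_filter=''),
        'dashboard': _dashboard_search_query(graph, tag_filter=''),
    }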
def _build_gremlin_update(g: GraphTraversalSource, row: Any) -> GraphTraversalSource:
    g = g.V(str(row["~id"]))
    g = _build_gremlin_properties(g, row)
    return g
def delete_graph_for_shard_only(g: GraphTraversalSource) -> None:
    shard = get_shard()
    assert shard, 'expected shard to exist! Surely you are only using this in development or test?'
    # TODO: do something better than not using WellKnownProperties.TestShard
    # here (since that makes a circular dependency)
    g.V().has('shard', shard).drop().iterate()
def _build_gremlin_update(
        g: GraphTraversalSource, row: Any,
        use_header_cardinality: bool) -> GraphTraversalSource:
    g = g.V(str(row["~id"]))
    g = _set_properties(g, use_header_cardinality, row)
    return g
def delete_everything(traversal: GraphTraversalSource):
    return traversal.V().drop().toList()
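# Hypothetical usage sketch (not part of the original module): obtain a
# GraphTraversalSource for a local Gremlin Server and wipe the graph with the
# helper above. The endpoint URL, traversal source name and function name are
# assumptions; delete_everything drops every vertex (and therefore every edge).
def _wipe_local_graph() -> None:
    from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
    from gremlin_python.process.anonymous_traversal import traversal as anonymous_traversal

    connection = DriverRemoteConnection("ws://localhost:8182/gremlin", "g")
    try:
        g = anonymous_traversal().withRemote(connection)
        delete_everything(g)
    finally:
        connection.close()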
def table_entities(cls, *, _g: GraphTraversalSource,
                   table_data: List[Table], existing: EXISTING) -> None:
    all_tables_ids = list(set([
        VertexTypes.Table.value.id(
            key=TableUris.get(database=t.database, cluster=t.cluster,
                              schema=t.schema, table=t.name).table)
        for t in table_data
    ]))

    all_owner_ids = list(set([
        VertexTypes.User.value.id(key=key) for key in [
            t.table_writer.id for t in table_data
            if t.table_writer is not None
        ]
    ]))

    all_application_ids = list(set(list(
        possible_vertex_ids_for_application_key(*[
            t.table_writer.id for t in table_data
            if t.table_writer is not None
        ]))))

    # chunk these since 100,000s seems to choke
    for tables_ids in chunk(all_tables_ids, 1000):
        LOGGER.info(f'fetching for tables: {tables_ids}')
        # fetch database -> cluster -> schema -> table links
        g = _g.V(tuple(tables_ids)).as_('tables')
        g = g.coalesce(__.inE(
            EdgeTypes.Table.value.label).dedup().fold()).as_(
            EdgeTypes.Table.name)
        g = g.coalesce(__.unfold().outV().hasLabel(
            VertexTypes.Schema.value.label).inE(
            EdgeTypes.Schema.value.label).dedup().fold()).as_(
            EdgeTypes.Schema.name)
        g = g.coalesce(__.unfold().outV().hasLabel(
            VertexTypes.Cluster.value.label).inE(
            EdgeTypes.Cluster.value.label).dedup().fold()).as_(
            EdgeTypes.Cluster.name)

        # fetch table <- links
        for t in (EdgeTypes.BelongToTable, EdgeTypes.Generates,
                  EdgeTypes.Tag):
            g = g.coalesce(__.select('tables').inE(
                t.value.label).fold()).as_(t.name)

        # fetch table -> column et al links
        for t in (EdgeTypes.Column, EdgeTypes.Description,
                  EdgeTypes.LastUpdatedAt, EdgeTypes.Source, EdgeTypes.Stat):
            g = g.coalesce(__.select('tables').outE(
                t.value.label).fold()).as_(t.name)

        # TODO: add owners, watermarks, last timestamp existing, source
        aliases = set([
            t.name for t in (
                EdgeTypes.Table, EdgeTypes.Schema, EdgeTypes.Cluster,
                EdgeTypes.BelongToTable, EdgeTypes.Generates, EdgeTypes.Tag,
                EdgeTypes.Column, EdgeTypes.Description,
                EdgeTypes.LastUpdatedAt, EdgeTypes.Source, EdgeTypes.Stat)
        ])
        g = g.select(*aliases).unfold().select(MapColumn.values).unfold()
        g = g.local(
            __.union(__.outV().id(), __.valueMap(True), __.inV().id()).fold())
        cls._into_existing(g.toList(), existing)

        cls._column_entities(_g=_g, tables_ids=tables_ids, existing=existing)

    # fetch Application, User
    for ids in chunk(list(set(all_application_ids + all_owner_ids)), 5000):
        LOGGER.info(f'fetching for application/owners: {ids}')
        g = _g.V(ids).valueMap(True)
        cls._into_existing(g.toList(), existing)