def __init__(self, endpoints, job_name=None, to_dict=lambda x: x, pool_size=1, **kwargs):
    self.gremlin_utils = GremlinUtils(endpoints)
    self.conn = None
    self.g = None
    self.region = endpoints.region
    self.job_name = job_name
    self.to_dict = to_dict
    self.pool_size = pool_size
    self.kwargs = kwargs
def remoteConnection(self, neptune_endpoint=None, neptune_port=None, show_endpoint=True):
    connection = GremlinUtils(Endpoints(neptune_endpoint, neptune_port)).remote_connection(show_endpoint)
    self.connections.append(connection)
    return connection
def graphTraversal(self, neptune_endpoint=None, neptune_port=None, show_endpoint=True, connection=None):
    if connection is None:
        # remoteConnection() already registers the connection it creates,
        # so only externally supplied connections are appended here.
        connection = self.remoteConnection(neptune_endpoint, neptune_port, show_endpoint)
    else:
        self.connections.append(connection)
    return GremlinUtils(Endpoints(neptune_endpoint, neptune_port)).traversal_source(show_endpoint, connection)
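
# A minimal usage sketch (assumption: remoteConnection() and graphTraversal() above are
# methods of a small helper object, referred to here as `neptune`, that tracks every
# connection it opens in self.connections so they can all be closed at shutdown; the
# endpoint value is illustrative only):
#
# >>> g = neptune.graphTraversal('my-cluster.cluster-xxxxxxxx.us-east-1.neptune.amazonaws.com', 8182)
# >>> print(g.V().limit(1).valueMap(True).toList())
# >>> for connection in neptune.connections:
# ...     connection.close()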
def __init__(self, neptune_endpoint, elasticache_endpoint):
    GremlinUtils.init_statics(globals())
    gremlin_utils = GremlinUtils(Endpoints(neptune_endpoint=neptune_endpoint))
    self.vertext_metrics = VertexMetrics(elasticache_endpoint)
    self.neptune_connection = gremlin_utils.remote_connection()
    self.g = gremlin_utils.traversal_source(connection=self.neptune_connection)
def handle_records(self, stream_log):
    params = json.loads(os.environ['AdditionalParams'])
    neptune_endpoint = params['neptune_cluster_endpoint']
    neptune_port = params['neptune_port']

    GremlinUtils.init_statics(globals())

    endpoints = Endpoints(neptune_endpoint=neptune_endpoint, neptune_port=neptune_port)
    gremlin_utils = GremlinUtils(endpoints)
    conn = gremlin_utils.remote_connection()
    g = gremlin_utils.traversal_source(connection=conn)

    records = stream_log[RECORDS_STR]

    last_op_num = None
    last_commit_num = None
    count = 0

    try:
        for record in records:
            # Process the change record
            op = record[OPERATION_STR]
            data = record[DATA_STR]
            record_type = data['type']
            record_id = data['id']

            if op == ADD_OPERATION:
                if record_type == 'vl':
                    logger.info(g.V(record_id).valueMap(True).toList())
                if record_type == 'e':
                    logger.info(g.E(record_id).valueMap(True).toList())

            # Update local checkpoint info
            last_op_num = record[EVENT_ID_STR][OP_NUM_STR]
            last_commit_num = record[EVENT_ID_STR][COMMIT_NUM_STR]
            count += 1
    except Exception as e:
        logger.error('Error occurred - {}'.format(str(e)))
        raise e
    finally:
        try:
            yield HandlerResponse(last_op_num, last_commit_num, count)
        except Exception as e:
            logger.error('Error occurred - {}'.format(str(e)))
            raise e
        finally:
            # Close the connection exactly once, after the response has been yielded.
            conn.close()
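
# A minimal sketch of the stream_log payload handle_records() expects, based on the
# Neptune Streams change-log JSON format (field names are assumed to be what the
# RECORDS_STR, OPERATION_STR, DATA_STR, EVENT_ID_STR, OP_NUM_STR, COMMIT_NUM_STR and
# ADD_OPERATION constants resolve to; all values below are illustrative only):
example_stream_log = {
    'records': [
        {
            'op': 'ADD',                                 # ADD_OPERATION
            'eventId': {'commitNum': 12, 'opNum': 1},    # used for checkpointing
            'data': {
                'id': 'person-1',
                'type': 'vl',                            # 'vl' = vertex label, 'e' = edge
                'key': 'label',
                'value': {'value': 'Person', 'dataType': 'String'},
            },
        },
    ],
}

# >>> for response in handler.handle_records(example_stream_log):
# ...     print(response)   # HandlerResponse(last_op_num, last_commit_num, count)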
def __init__(self, endpoints):
    self.gremlin_utils = GremlinUtils(endpoints)
    GremlinUtils.init_statics(globals())
class GlueGremlinClient:

    def __init__(self, endpoints):
        self.gremlin_utils = GremlinUtils(endpoints)
        GremlinUtils.init_statics(globals())

    def add_vertices(self, label):
        """Adds a vertex with the supplied label for each row in a DataFrame partition.
        If the DataFrame contains an '~id' column, the values in this column will be treated as
        user-supplied IDs for the new vertices. If the DataFrame does not have an '~id' column,
        Neptune will autogenerate a UUID for each vertex.

        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.add_vertices('Product'))
        """
        def add_vertices_for_label(rows):
            try:
                conn = self.gremlin_utils.remote_connection()
                g = self.gremlin_utils.traversal_source(connection=conn)
                for row in rows:
                    entries = row.asDict()
                    traversal = g.addV(label)
                    for key, value in entries.items():
                        key = key.split(':')[0]
                        if key == '~id':
                            traversal.property(id, value)
                        elif key == '~label':
                            pass
                        else:
                            traversal.property(key, value)
                    traversal.next()
                conn.close()
            except GremlinServerError as err:
                print("Neptune error: {0}".format(err))
            except:
                print("Unexpected error:", sys.exc_info()[0])
        return add_vertices_for_label

    def upsert_vertices(self, label):
        """Conditionally adds vertices for the rows in a DataFrame partition using the Gremlin
        coalesce() idiom. The DataFrame must contain an '~id' column.

        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.upsert_vertices('Product'))
        """
        def upsert_vertices_for_label(rows):
            try:
                conn = self.gremlin_utils.remote_connection()
                g = self.gremlin_utils.traversal_source(connection=conn)
                for row in rows:
                    entries = row.asDict()
                    create_traversal = __.addV(label)
                    for key, value in entries.items():
                        key = key.split(':')[0]
                        if key == '~id':
                            create_traversal.property(id, value)
                        elif key == '~label':
                            pass
                        else:
                            create_traversal.property(key, value)
                    g.V(entries['~id']).fold().coalesce(__.unfold(), create_traversal).next()
                conn.close()
            except GremlinServerError as err:
                print("Neptune error: {0}".format(err))
            except:
                print("Unexpected error:", sys.exc_info()[0])
        return upsert_vertices_for_label

    def add_edges(self, label):
        """Adds an edge with the supplied label for each row in a DataFrame partition.
        If the DataFrame contains an '~id' column, the values in this column will be treated as
        user-supplied IDs for the new edges. If the DataFrame does not have an '~id' column,
        Neptune will autogenerate a UUID for each edge.

        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.add_edges('ORDER_DETAIL'))
        """
        def add_edges_for_label(rows):
            try:
                conn = self.gremlin_utils.remote_connection()
                g = self.gremlin_utils.traversal_source(connection=conn)
                for row in rows:
                    entries = row.asDict()
                    traversal = g.V(row['~from']).addE(label).to(V(row['~to'])).property(id, row['~id'])
                    for key, value in entries.items():
                        key = key.split(':')[0]
                        if key not in ['~id', '~from', '~to', '~label']:
                            traversal.property(key, value)
                    traversal.next()
                conn.close()
            except GremlinServerError as err:
                print("Neptune error: {0}".format(err))
            except:
                print("Unexpected error:", sys.exc_info()[0])
        return add_edges_for_label

    def upsert_edges(self, label):
        """Conditionally adds edges for the rows in a DataFrame partition using the Gremlin
        coalesce() idiom. The DataFrame must contain '~id', '~from', '~to' and '~label' columns.

        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.upsert_edges('ORDER_DETAIL'))
        """
        def upsert_edges_for_label(rows):
            try:
                conn = self.gremlin_utils.remote_connection()
                g = self.gremlin_utils.traversal_source(connection=conn)
                for row in rows:
                    entries = row.asDict()
                    create_traversal = __.V(row['~from']).addE(label).to(V(row['~to'])).property(id, row['~id'])
                    for key, value in entries.items():
                        key = key.split(':')[0]
                        if key not in ['~id', '~from', '~to', '~label']:
                            create_traversal.property(key, value)
                    g.E(entries['~id']).fold().coalesce(__.unfold(), create_traversal).next()
                conn.close()
            except GremlinServerError as err:
                print("Neptune error: {0}".format(err))
            except:
                print("Unexpected error:", sys.exc_info()[0])
        return upsert_edges_for_label
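
# A minimal end-to-end usage sketch for GlueGremlinClient in a Glue job, mirroring the
# docstring examples above (the connection name, role ARN and DataFrame variables are
# illustrative assumptions, not values from the original code):
#
# >>> endpoints = GlueNeptuneConnectionInfo(region, connect_role_arn).neptune_endpoints('neptune-connection')
# >>> neptune = GlueGremlinClient(endpoints)
# >>> products_df.foreachPartition(neptune.upsert_vertices('Product'))
# >>> order_details_df.foreachPartition(neptune.upsert_edges('ORDER_DETAIL'))
#
# Each partition closure opens and closes its own remote connection, so the returned
# functions can safely be shipped to the Spark executors.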
class BatchUtils:

    def __init__(self, endpoints, job_name=None, to_dict=lambda x: x, pool_size=1, **kwargs):
        self.gremlin_utils = GremlinUtils(endpoints)
        self.conn = None
        self.g = None
        self.region = endpoints.region
        self.job_name = job_name
        self.to_dict = to_dict
        self.pool_size = pool_size
        self.kwargs = kwargs

    def close(self):
        try:
            self.gremlin_utils.close()
        except:
            pass

    def __execute_batch_internal(self, rows, operations, **kwargs):

        @backoff.on_exception(backoff.constant,
                              tuple(retriable_errors),
                              max_tries=5,
                              giveup=is_non_retriable_error,
                              on_backoff=reset_connection_if_connection_issue,
                              on_success=publish_metrics,
                              interval=2,
                              jitter=backoff.full_jitter)
        def execute(self, rows, operations, **kwargs):
            if not self.conn:
                self.conn = self.gremlin_utils.remote_connection(pool_size=self.pool_size, **self.kwargs)
                self.g = self.gremlin_utils.traversal_source(connection=self.conn)
            t = self.g
            for operation in operations:
                for row in rows:
                    t = operation(t, row, **kwargs)
            t.next()

        return execute(self, rows, operations, **kwargs)

    def execute_batch(self, rows, operations=[], batch_size=50, **kwargs):
        if 'mappings' not in kwargs:
            kwargs['mappings'] = Mappings()
        rows_list = []
        for row in rows:
            rows_list.append(self.to_dict(row))
            if len(rows_list) == batch_size:
                self.__execute_batch_internal(rows_list, operations, **kwargs)
                rows_list = []
        if rows_list:
            self.__execute_batch_internal(rows_list, operations, **kwargs)

    def add_vertices(self, batch_size=50, rows=None, **kwargs):
        def batch_op(rows):
            self.execute_batch(rows, operations=[add_vertex], batch_size=batch_size, **kwargs)
        return batch_op(rows) if rows else batch_op

    def upsert_vertices(self, batch_size=50, rows=None, **kwargs):
        def batch_op(rows):
            operations = [upsert_vertex]
            on_upsert = kwargs.get('on_upsert', None)
            if on_upsert and on_upsert == 'replaceAllProperties':
                operations.append(replace_vertex_properties)
            self.execute_batch(rows, operations=operations, batch_size=batch_size, **kwargs)
        return batch_op(rows) if rows else batch_op

    def add_edges(self, batch_size=50, rows=None, **kwargs):
        def batch_op(rows):
            self.execute_batch(rows, operations=[add_edge], batch_size=batch_size, **kwargs)
        return batch_op(rows) if rows else batch_op

    def upsert_edges(self, batch_size=50, rows=None, **kwargs):
        def batch_op(rows):
            operations = [upsert_edge]
            on_upsert = kwargs.get('on_upsert', None)
            if on_upsert and on_upsert == 'replaceAllProperties':
                operations.append(replace_edge_properties)
            self.execute_batch(rows, operations=operations, batch_size=batch_size, **kwargs)
        return batch_op(rows) if rows else batch_op

    def add_edge_properties(self, batch_size=50, rows=None, **kwargs):
        def batch_op(rows):
            self.execute_batch(rows, operations=[add_properties_to_edge], batch_size=batch_size, **kwargs)
        return batch_op(rows) if rows else batch_op
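
# A minimal usage sketch for BatchUtils (the endpoint, DataFrames, label and batch size
# are illustrative assumptions; add_vertex, upsert_vertex and the other operations are
# the module-level functions defined alongside this class). When rows is omitted, each
# method returns a closure suitable for foreachPartition:
#
# >>> endpoints = Endpoints(neptune_endpoint='my-cluster.cluster-xxxxxxxx.us-east-1.neptune.amazonaws.com')
# >>> batch_utils = BatchUtils(endpoints, to_dict=lambda row: row.asDict(), pool_size=2)
# >>> products_df.foreachPartition(batch_utils.upsert_vertices(batch_size=100, label='Product'))
# >>> order_details_df.foreachPartition(batch_utils.upsert_edges(batch_size=100))
# >>> batch_utils.close()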
import logging
import traceback

import backoff
from gremlin_python import statics
from gremlin_python.structure.graph import Graph
from gremlin_python.process.graph_traversal import __
from gremlin_python.process.strategies import *
from gremlin_python.driver.driver_remote_connection import DriverRemoteConnection
from gremlin_python.driver.protocol import GremlinServerError
from gremlin_python.process.traversal import *

from neptune_python_utils.gremlin_utils import GremlinUtils
from neptune_python_utils.endpoints import Endpoints
from neptune_python_utils.mappings import Mappings

logging.getLogger('backoff').addHandler(logging.StreamHandler())
logger = logging.getLogger()

GremlinUtils.init_statics(globals())


def get_cardinality(s):
    return Cardinality.single if s == 'single' else Cardinality.set_


def add_vertex(t, row, **kwargs):
    mappings = kwargs['mappings']
    label = kwargs['label'] if 'label' in kwargs else mappings.get_label(row)
    t = t.addV(label)
    for key, value in row.items():
        mapping = mappings.mapping_for(key)
sc = SparkContext()
glueContext = GlueContext(sc)

job = Job(glueContext)
job.init(args['JOB_NAME'], args)

database = args['DATABASE_NAME']
order_table = '{}salesdb_sales_order'.format(args['TABLE_PREFIX'])
order_detail_table = '{}salesdb_sales_order_detail'.format(args['TABLE_PREFIX'])

gremlin_endpoints = GlueNeptuneConnectionInfo(
    args['AWS_REGION'],
    args['CONNECT_TO_NEPTUNE_ROLE_ARN']).neptune_endpoints(args['NEPTUNE_CONNECTION_NAME'])
gremlin_client = GlueGremlinClient(gremlin_endpoints)
gremlin_utils = GremlinUtils(gremlin_endpoints)


def get_last_checkpoint(client, tablename):
    conn = client.remote_connection()
    g = client.traversal_source(connection=conn)
    checkpoint = (g.V().hasLabel('Checkpoint').has('table', tablename).fold().coalesce(
        __.unfold(),
        __.addV('Checkpoint').property('table', tablename).property(
            'value', datetime.datetime(2015, 1, 1, 0, 0))).values('value').next())
    conn.close()
    return checkpoint
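
# A minimal sketch of how the `args` dictionary used above is typically populated in an
# AWS Glue job via awsglue.utils.getResolvedOptions (the exact parameter list is an
# assumption based on the keys this script references), plus a sample call to the
# checkpoint helper:
#
# >>> import sys
# >>> from awsglue.utils import getResolvedOptions
# >>> args = getResolvedOptions(sys.argv, [
# ...     'JOB_NAME', 'DATABASE_NAME', 'TABLE_PREFIX', 'AWS_REGION',
# ...     'CONNECT_TO_NEPTUNE_ROLE_ARN', 'NEPTUNE_CONNECTION_NAME'])
# >>> checkpoint = get_last_checkpoint(gremlin_utils, order_table)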
class GraphModelClient:

    def __init__(self, endpoint):
        self.gremlin_utils = GremlinUtils(endpoint)
        GremlinUtils.init_statics(globals())

    def insert_new_transaction_vertex_and_edge(self, tr_dict, connectted_node_dict, target_id, vertex_type='Transaction'):
        """Load transaction data: insert the transaction object and related domain objects into
        the graph DB as vertices, with their properties as values, and insert their relations as edges.

        Example:
        >>> insert_new_transaction_vertex_and_edge(tr_dict, connectted_node_dict, target_id, vertex_type='Transaction')
        """
        def insert_attr(graph_conn, attr_val_dict, target_id, node_id, vertex_type):
            if not g.V().has(id, node_id).hasNext():
                logger.info(f'Insert_Vertex: {node_id}.')
                (g.inject(attr_val_dict).unfold().as_(vertex_type)
                    .addV(vertex_type).as_('v').property(id, node_id)
                    .sideEffect(__.select(vertex_type).unfold().as_('kv').select('v')
                        .property(__.select('kv').by(Column.keys),
                                  __.select('kv').by(Column.values)))
                    .iterate())

            # Insert edge
            to_node = g.V().has(id, node_id).next()
            if not g.E().has(id, target_id + '-' + node_id).hasNext():
                logger.info(f'Insert_Edge: {target_id} --> {node_id}.')
                g.V().has(id, target_id).addE('CATEGORY').to(to_node).property(
                    id, target_id + '-' + node_id).iterate()

        conn = self.gremlin_utils.remote_connection()
        g = self.gremlin_utils.traversal_source(connection=conn)

        if not g.V().has(id, target_id).hasNext():
            logger.info(f'Insert_Vertex: {target_id}.')
            (g.inject(tr_dict).unfold().as_(vertex_type)
                .addV(vertex_type).as_('v').property(id, target_id)
                .sideEffect(__.select(vertex_type).unfold().as_('kv').select('v')
                    .property(__.select('kv').by(Column.keys),
                              __.select('kv').by(Column.values)))
                .iterate())

        attr_cols = [f'val{x}' for x in range(1, 391)]
        empty_node_dict = {}
        for attr in attr_cols:
            empty_node_dict[attr] = 0.0

        for node_k, node_v in connectted_node_dict[0].items():
            node_id = node_k + '-' + str(node_v)
            insert_attr(g, [empty_node_dict], target_id, node_id, vertex_type=node_k)
        conn.close()

    def query_target_subgraph(self, target_id, tr_dict, transaction_value_cols, union_id_cols, dummied_col):
        """Extract the 2nd-degree subgraph of the target transaction and dump the data into a
        subgraph dict and an n_feats dict.

        subgraph_dict: related transactions' id lists and values through edges
        n_feats dict: related 1st-degree vertices and transactions' embedded element vectors

        Usually called after inserting the new test sample's vertex and edges into the graph DB.
        Example:
        >>> query_target_subgraph('3661635', load_data_from_event(), 'M2_T,M3_F,M3_T,...')
        """
        subgraph_dict = {}
        neighbor_list = []
        neighbor_dict = {}
        transaction_embed_value_dict = {}

        ii = 0
        s_t = dt.now()

        conn = self.gremlin_utils.remote_connection()
        g = self.gremlin_utils.traversal_source(connection=conn)
        t1 = self.gremlin_utils.traversal_source(connection=conn)

        target_name = target_id[(target_id.find('-') + 1):]
        feature_list = g.V().has(id, target_id).out().id().toList()
        for feat in feature_list:
            ii += 1
            feat_name = feat[:feat.find('-')]
            feat_value = feat[(feat.find('-') + 1):]
            node_list = g.V().has(id, feat).both().limit(MAX_FEATURE_NODE).id().toList()
            target_and_conn_node_list = [int(target_name)] + [
                int(target_conn_node[(target_conn_node.find('-') + 1):])
                for target_conn_node in node_list
            ]
            target_and_conn_node_list = list(set(target_and_conn_node_list))
            neighbor_list += target_and_conn_node_list
            nodes_and_feature_value_array = (target_and_conn_node_list,
                                             [feat_value] * len(target_and_conn_node_list))
            subgraph_dict['target<>' + feat_name] = nodes_and_feature_value_array

        e_t = dt.now()
        logger.info(f'INSIDE query_target_subgraph: subgraph_dict used {(e_t - s_t).total_seconds()} seconds')
        logger.info(f'subgraph_dict len: {len(subgraph_dict.keys())} key: {subgraph_dict.keys()}')
        logger.info(f'subgraph_dict: {subgraph_dict}')
        new_s_t = e_t

        union_li = [
            t1.V().has(id, target_id).both().hasLabel(label).both().limit(MAX_FEATURE_NODE)
            for label in union_id_cols
        ]
        logger.info(f'union_id_cols len: {len(union_id_cols)} key: {union_id_cols}')
        logger.info(f'union_li len: {len(union_li)} key: {union_li}')

        # The first union branch is always the 'card1' neighbourhood; the remaining branches
        # come from union_li (50 branches for the full feature set, 10 otherwise).
        if len(union_id_cols) == 51:
            node_dict = g.V().has(id, target_id).union(
                __.both().hasLabel('card1').both().limit(MAX_FEATURE_NODE),
                *union_li[1:51]).elementMap().toList()
        else:
            node_dict = g.V().has(id, target_id).union(
                __.both().hasLabel('card1').both().limit(MAX_FEATURE_NODE),
                *union_li[1:11]).elementMap().toList()

        e_t = dt.now()
        logger.info(f'INSIDE query_target_subgraph: node_dict used {(e_t - new_s_t).total_seconds()} seconds.')
        new_s_t = e_t
        logger.info(f'node_dict len: {len(node_dict)} key: {node_dict}')

        for item in node_dict:
            node = item.get(list(item)[0])
            node_value = node[(node.find('-') + 1):]
            neighbor_dict[node_value] = [item.get(key) for key in transaction_value_cols]

        target_value = target_id[(target_id.find('-') + 1):]
        neighbor_dict[target_value] = [tr_dict[0].get(key) for key in transaction_value_cols]
        logger.info(f'INSIDE query_target_subgraph: neighbor_dict used {(e_t - new_s_t).total_seconds()} seconds.')
        logger.info(f'neighbor_dict len: {len(neighbor_dict.keys())} key: {neighbor_dict.keys()}')
        logger.info(f'neighbor_dict: {neighbor_dict}')

        attr_cols = ['val' + str(x) for x in range(1, 391)]
        for attr in feature_list:
            attr_name = attr[:attr.find('-')]
            attr_value = attr[(attr.find('-') + 1):]
            attr_dict = g.V().has(id, attr).valueMap().toList()[0]
            attr_dict = [attr_dict.get(key)[-1] for key in attr_cols]
            attr_input_dict = {}
            attr_input_dict[attr_value] = attr_dict
            transaction_embed_value_dict[attr_name] = attr_input_dict

        e_t = dt.now()
        logger.info(f'INSIDE query_target_subgraph: transaction_embed_value_dict used {(e_t - new_s_t).total_seconds()} seconds. Total test cost {(e_t - s_t).total_seconds()} seconds.')
        new_s_t = e_t
        transaction_embed_value_dict['target'] = neighbor_dict
        conn.close()
        logger.info(f'transaction_embed_value_dict len: {len(transaction_embed_value_dict.keys())} key: {transaction_embed_value_dict.keys()}')
        logger.info(f'transaction_embed_value_dict: {transaction_embed_value_dict}')
        return subgraph_dict, transaction_embed_value_dict
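
# A minimal usage sketch for GraphModelClient (the endpoint, transaction payload and
# column lists below are illustrative assumptions; tr_dict and connectted_node_dict follow
# the shapes consumed by the two methods above):
#
# >>> client = GraphModelClient(Endpoints(neptune_endpoint='my-cluster.cluster-xxxxxxxx.us-east-1.neptune.amazonaws.com'))
# >>> tr_dict = [{'TransactionAmt': 49.0, 'card1': 9500}]
# >>> connectted_node_dict = [{'card1': 9500, 'P_emaildomain': 'gmail.com'}]
# >>> client.insert_new_transaction_vertex_and_edge(tr_dict, connectted_node_dict, '3661635')
# >>> subgraph_dict, n_feats = client.query_target_subgraph(
# ...     '3661635', tr_dict, transaction_value_cols, union_id_cols, dummied_col)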
def __init__(self):
    GremlinUtils.init_statics(globals())
    gremlin_utils = GremlinUtils()
    self.neptune_connection = gremlin_utils.remote_connection()
    self.g = gremlin_utils.traversal_source(connection=self.neptune_connection)
print("Connect to neptune...") # Simple way of creating endpoints, requires creating a dummy connection in Glue endpoints = GlueNeptuneConnectionInfo( 'us-east-1', args['neptune_connection_role']).neptune_endpoints( args['neptune_connection_name']) # Complex way of creating endpoints - no connection required, but needs the neptune url # sts = boto3.client('sts', region_name='us-east-1') # role = sts.assume_role(RoleArn=role_arn, RoleSessionName='bananananame', DurationSeconds=3600) # credentials = Credentials( # access_key=role['Credentials']['AccessKeyId'], # secret_key=role['Credentials']['SecretAccessKey'], # token=role['Credentials']['SessionToken']) gremlin_utils = GremlinUtils(endpoints) conn = gremlin_utils.remote_connection(show_endpoint=True) g = gremlin_utils.traversal_source(connection=conn) print("Endpoints created") print(g.V().limit(10).valueMap().toList()) print("Sanity checked") bulkload = BulkLoad(source='s3://co-resource-ingestion-bucket-dev/output-dir/', role=args['neptune_to_s3_role'], region='us-east-1', endpoints=endpoints) bulkload.load()
class GlueGremlinClient:

    def __init__(self, endpoints):
        self.gremlin_utils = GremlinUtils(endpoints)
        GremlinUtils.init_statics(globals())

    def not_cme(e):
        return '"code":"ConcurrentModificationException"' not in str(e)

    @backoff.on_exception(backoff.expo, GremlinServerError, max_tries=5, giveup=not_cme)
    def retry_query(self, query):
        q = query
        q.next()

    def add_vertices(self, label, batch_size=1):
        """Adds a vertex with the supplied label for each row in a DataFrame partition.
        If the DataFrame contains an '~id' column, the values in this column will be treated as
        user-supplied IDs for the new vertices. If the DataFrame does not have an '~id' column,
        Neptune will autogenerate a UUID for each vertex.

        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.add_vertices('Product'))
        """
        def add_vertices_for_label(rows):
            conn = self.gremlin_utils.remote_connection()
            g = self.gremlin_utils.traversal_source(connection=conn)
            t = g
            i = 0
            for row in rows:
                entries = row.asDict()
                t = t.addV(label)
                for key, value in entries.items():
                    key = key.split(':')[0]
                    if key == '~id':
                        t = t.property(id, value)
                    elif key == '~label':
                        pass
                    else:
                        t = t.property(key, value)
                i += 1
                if i == batch_size:
                    self.retry_query(t)
                    t = g
                    i = 0
            if i > 0:
                self.retry_query(t)
            conn.close()
        return add_vertices_for_label

    def upsert_vertices(self, label, batch_size=1):
        """Conditionally adds vertices for the rows in a DataFrame partition using the Gremlin
        coalesce() idiom. The DataFrame must contain an '~id' column.

        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.upsert_vertices('Product'))
        """
        def upsert_vertices_for_label(rows):
            conn = self.gremlin_utils.remote_connection()
            g = self.gremlin_utils.traversal_source(connection=conn)
            t = g
            i = 0
            for row in rows:
                entries = row.asDict()
                create_traversal = __.addV(label)
                for key, value in entries.items():
                    key = key.split(':')[0]
                    if key == '~id':
                        create_traversal = create_traversal.property(id, value)
                    elif key == '~label':
                        pass
                    else:
                        create_traversal = create_traversal.property(key, value)
                t = t.V(entries['~id']).fold().coalesce(__.unfold(), create_traversal)
                i += 1
                if i == batch_size:
                    self.retry_query(t)
                    t = g
                    i = 0
            if i > 0:
                self.retry_query(t)
            conn.close()
        return upsert_vertices_for_label

    def add_edges(self, label, batch_size=1):
        """Adds an edge with the supplied label for each row in a DataFrame partition.
        If the DataFrame contains an '~id' column, the values in this column will be treated as
        user-supplied IDs for the new edges. If the DataFrame does not have an '~id' column,
        Neptune will autogenerate a UUID for each edge.

        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.add_edges('ORDER_DETAIL'))
        """
        def add_edges_for_label(rows):
            conn = self.gremlin_utils.remote_connection()
            g = self.gremlin_utils.traversal_source(connection=conn)
            t = g
            i = 0
            for row in rows:
                entries = row.asDict()
                t = t.V(entries['~from']).addE(label).to(V(entries['~to'])).property(id, entries['~id'])
                for key, value in entries.items():
                    key = key.split(':')[0]
                    if key not in ['~id', '~from', '~to', '~label']:
                        t = t.property(key, value)
                i += 1
                if i == batch_size:
                    self.retry_query(t)
                    t = g
                    i = 0
            if i > 0:
                self.retry_query(t)
            conn.close()
        return add_edges_for_label

    def upsert_edges(self, label, batch_size=1):
        """Conditionally adds edges for the rows in a DataFrame partition using the Gremlin
        coalesce() idiom. The DataFrame must contain '~id', '~from', '~to' and '~label' columns.

        Example:
        >>> dynamicframe.toDF().foreachPartition(neptune.upsert_edges('ORDER_DETAIL'))
        """
        def upsert_edges_for_label(rows):
            conn = self.gremlin_utils.remote_connection()
            g = self.gremlin_utils.traversal_source(connection=conn)
            t = g
            i = 0
            for row in rows:
                entries = row.asDict()
                create_traversal = __.V(entries['~from']).addE(label).to(V(entries['~to'])).property(id, entries['~id'])
                for key, value in entries.items():
                    key = key.split(':')[0]
                    if key not in ['~id', '~from', '~to', '~label']:
                        create_traversal.property(key, value)
                t = t.V(entries['~from']).outE(label).hasId(entries['~id']).fold().coalesce(__.unfold(), create_traversal)
                i += 1
                if i == batch_size:
                    self.retry_query(t)
                    t = g
                    i = 0
            if i > 0:
                self.retry_query(t)
            conn.close()
        return upsert_edges_for_label
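
# A minimal usage sketch for this batched variant of the client (the endpoints object and
# DataFrame variables are illustrative assumptions; endpoints would come from
# GlueNeptuneConnectionInfo as in the job scripts above). Larger batch sizes fold many rows
# into a single traversal, which retry_query() replays with exponential backoff only when
# the failure is a ConcurrentModificationException and gives up on any other error:
#
# >>> neptune = GlueGremlinClient(gremlin_endpoints)
# >>> products_df.foreachPartition(neptune.upsert_vertices('Product', batch_size=50))
# >>> order_details_df.foreachPartition(neptune.upsert_edges('ORDER_DETAIL', batch_size=50))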
class GraphModelClient:

    def __init__(self, endpoint):
        self.gremlin_utils = GremlinUtils(endpoint)
        GremlinUtils.init_statics(globals())

    def insert_new_transaction_vertex_and_edge(self, tr_dict, connectted_node_dict, target_id, vertex_type='Transaction'):
        """Load transaction data: insert the transaction object and related domain objects into
        the graph DB as vertices, with their properties as values, and insert their relations as edges.

        Example:
        >>> insert_new_transaction_vertex_and_edge(tr_dict, connectted_node_dict, target_id, vertex_type='Transaction')
        """
        def insert_attr(graph_conn, attr_val_dict, target_id, node_id, vertex_type):
            if not g.V().has(id, node_id).hasNext():
                logger.info(f'Insert_Vertex: {node_id}.')
                (g.inject(attr_val_dict).unfold().as_(vertex_type)
                    .addV(vertex_type).as_('v').property(id, node_id)
                    .sideEffect(__.select(vertex_type).unfold().as_('kv').select('v')
                        .property(Cardinality.single,
                                  __.select('kv').by(Column.keys),
                                  __.select('kv').by(Column.values)))
                    .iterate())
            else:
                logger.debug(f'Ignore inserting existing Vertex with id {node_id}')

            # Insert edge
            to_node = g.V().has(id, node_id).next()
            edgeId = target_id + '-' + node_id
            if not g.E().has(id, edgeId).hasNext():
                logger.info(f'Insert_Edge: {target_id} --> {node_id}.')
                g.V().has(id, target_id).addE('CATEGORY').to(to_node).property(id, edgeId).iterate()
            else:
                logger.debug(f'Ignore inserting existing edge with id {edgeId}')

        conn = self.gremlin_utils.remote_connection()
        g = self.gremlin_utils.traversal_source(connection=conn)

        if not g.V().has(id, target_id).hasNext():
            logger.info(f'Insert_Vertex: {target_id}.')
            (g.inject(tr_dict).unfold().as_(vertex_type)
                .addV(vertex_type).as_('v').property(id, target_id)
                .sideEffect(__.select(vertex_type).unfold().as_('kv').select('v')
                    .property(Cardinality.single,
                              __.select('kv').by(Column.keys),
                              __.select('kv').by(Column.values)))
                .iterate())

        cols = {'val' + str(i + 1): '0.0' for i in range(390)}
        for node_k, node_v in connectted_node_dict[0].items():
            node_id = node_k + '-' + str(node_v)
            empty_node_dict = {}
            empty_node_dict[attr_version_key] = json.dumps(cols)
            empty_node_dict = [empty_node_dict]
            insert_attr(g, empty_node_dict, target_id, node_id, vertex_type=node_k)
        conn.close()

    def query_target_subgraph(self, target_id, tr_dict, transaction_value_cols, union_id_cols, dummied_col):
        """Extract the 2nd-degree subgraph of the target transaction and dump the data into a
        subgraph dict and an n_feats dict.

        subgraph_dict: related transactions' id lists and values through edges
        n_feats dict: related 1st-degree vertices and transactions' embedded element vectors

        Usually called after inserting the new test sample's vertex and edges into the graph DB.
        Example:
        >>> query_target_subgraph('3661635', load_data_from_event(), 'M2_T,M3_F,M3_T,...')
        """
        subgraph_dict = {}
        neighbor_list = []
        neighbor_dict = {}
        transaction_embed_value_dict = {}

        ii = 0
        s_t = dt.now()

        conn = self.gremlin_utils.remote_connection()
        g = self.gremlin_utils.traversal_source(connection=conn)

        target_name = target_id[(target_id.find('-') + 1):]
        feature_list = g.V().has(id, target_id).out().id().toList()
        for feat in feature_list:
            ii += 1
            feat_name = feat[:feat.find('-')]
            feat_value = feat[(feat.find('-') + 1):]
            node_list = g.V().has(id, feat).both().limit(MAX_FEATURE_NODE).id().toList()
            target_and_conn_node_list = [int(target_name)] + [
                int(target_conn_node[(target_conn_node.find('-') + 1):])
                for target_conn_node in node_list
            ]
            target_and_conn_node_list = list(set(target_and_conn_node_list))
            neighbor_list += target_and_conn_node_list
            nodes_and_feature_value_array = (target_and_conn_node_list,
                                             [feat_value] * len(target_and_conn_node_list))
            subgraph_dict['target<>' + feat_name] = nodes_and_feature_value_array

        e_t = dt.now()
        logger.info(f'INSIDE query_target_subgraph: subgraph_dict used {(e_t - s_t).total_seconds()} seconds')
        new_s_t = e_t

        union_li = [
            __.V().has(id, target_id).both().hasLabel(label).both().limit(MAX_FEATURE_NODE)
            for label in union_id_cols
        ]

        # The first union branch is always the 'card1' neighbourhood; the remaining branches
        # come from union_li (50 branches for the full feature set, 10 otherwise).
        if len(union_id_cols) == 51:
            node_dict = g.V().has(id, target_id).union(
                __.both().hasLabel('card1').both().limit(MAX_FEATURE_NODE),
                *union_li[1:51]).elementMap().toList()
        else:
            node_dict = g.V().has(id, target_id).union(
                __.both().hasLabel('card1').both().limit(MAX_FEATURE_NODE),
                *union_li[1:11]).elementMap().toList()

        e_t = dt.now()
        logger.info(f'INSIDE query_target_subgraph: node_dict used {(e_t - new_s_t).total_seconds()} seconds.')
        new_s_t = e_t

        logger.debug(f'Found {len(node_dict)} nodes from graph dbs...')

        class Item():
            def __init__(self, item):
                self.item = item

            def __hash__(self):
                return hash(self.item.get(list(self.item)[0]))

            def __eq__(self, other):
                if isinstance(other, self.__class__):
                    return self.__hash__() == other.__hash__()
                else:
                    return NotImplemented

            def __repr__(self):
                return "Item(%s)" % (self.item)

        node_dict = list(set([Item(node) for node in node_dict]))
        logger.debug(f'Found {len(node_dict)} nodes without duplication')

        for item in node_dict:
            item = item.item
            node = item.get(list(item)[0])
            node_value = node[(node.find('-') + 1):]
            try:
                logger.debug(f'the props of node {node} is {item.get(attr_version_key)}')
                jsonVal = json.loads(item.get(attr_version_key))
                neighbor_dict[node_value] = [jsonVal[key] for key in transaction_value_cols]
                logger.debug(f'neighbor pair is {node_value}, {neighbor_dict[node_value]}')
            except json.JSONDecodeError:
                logger.warning(
                    f'Malformed node value {node} is {item.get(attr_version_key)}, run the command below to remove it')
                logger.info(f'g.V(\'{node}\').drop()')

        target_value = target_id[(target_id.find('-') + 1):]
        jsonVal = json.loads(tr_dict[0].get(attr_version_key))
        neighbor_dict[target_value] = [jsonVal[key] for key in transaction_value_cols]
        logger.info(f'INSIDE query_target_subgraph: neighbor_dict used {(e_t - new_s_t).total_seconds()} seconds.')

        attr_cols = ['val' + str(x) for x in range(1, 391)]
        for attr in feature_list:
            attr_name = attr[:attr.find('-')]
            attr_value = attr[(attr.find('-') + 1):]
            attr_dict = g.V().has(id, attr).valueMap().toList()[0]
            logger.debug(f'attr is {attr}, dict is {attr_dict}')
            jsonVal = json.loads(attr_dict.get(attr_version_key)[0])
            attr_dict = [float(jsonVal[key]) for key in attr_cols]
            attr_input_dict = {}
            attr_input_dict[attr_value] = attr_dict
            transaction_embed_value_dict[attr_name] = attr_input_dict

        e_t = dt.now()
        logger.info(f'INSIDE query_target_subgraph: transaction_embed_value_dict used {(e_t - new_s_t).total_seconds()} seconds. Total test cost {(e_t - s_t).total_seconds()} seconds.')
        new_s_t = e_t
        transaction_embed_value_dict['target'] = neighbor_dict
        conn.close()
        return subgraph_dict, transaction_embed_value_dict
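
# A minimal sketch of the vertex property layout this version assumes (the property name is
# whatever attr_version_key resolves to; the vertex id and values are illustrative only):
# every vertex stores its 390 embedding elements as a single JSON blob, which
# query_target_subgraph() parses back with json.loads() and converts to floats.
#
# >>> import json
# >>> props = {f'val{i}': '0.0' for i in range(1, 391)}
# >>> g.V().has(id, 'card1-9500').property(Cardinality.single, attr_version_key, json.dumps(props)).iterate()
# >>> json.loads(g.V().has(id, 'card1-9500').valueMap().toList()[0][attr_version_key][0])['val1']
# '0.0'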