def create_category_and_relation(name, dataframe, gremlin_client):
    # Upsert category vertices
    cate_df = dataframe.select(name).distinct()
    dynamic_df = DynamicFrame.fromDF(cate_df, glueContext, f'{name}DF')
    category_df = GlueGremlinCsvTransforms.create_prefixed_columns(
        dynamic_df, [('~id', name, name)])
    logger.info(f"Upserting category '{name}' as vertices of graph...")
    category_df.toDF().foreachPartition(
        gremlin_client.upsert_vertices(name, batch_size=100))

    # Upsert edges between transactions and this category
    logger.info(f'Creating glue dynamic frame from spark dataframe for the relation between transaction and {name}...')
    dynamic_df = DynamicFrame.fromDF(dataframe, glueContext, f'{name}EdgeDF')
    relation = GlueGremlinCsvTransforms.create_prefixed_columns(
        dynamic_df, [('~from', TRANSACTION_ID, 't'), ('~to', name, name)])
    relation = GlueGremlinCsvTransforms.create_edge_id_column(relation, '~from', '~to')
    relation = SelectFields.apply(frame=relation, paths=['~id', '~from', '~to'],
                                  transformation_ctx=f'selection_{name}')
    logger.info(f"Upserting edges between '{name}' and transaction...")
    relation.toDF().foreachPartition(
        gremlin_client.upsert_edges('CATEGORY', batch_size=100))
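# Hypothetical driver for the function above (a sketch, not code from the
# original job): 'cat_cols' and 'transactions_df' stand in for the job's list
# of categorical columns and its transactions Spark DataFrame.
for cat in cat_cols:
    create_category_and_relation(cat, transactions_df.select(TRANSACTION_ID, cat), gremlin_client)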
def dump_edge_as_graph(name, dataframe):
    # Dump edges between transactions and this category as graph data
    logger.info(f'Creating glue dynamic frame from spark dataframe for the relation between transaction and {name}...')
    dynamic_df = DynamicFrame.fromDF(dataframe, glueContext, f'{name}EdgeDF')
    relation = GlueGremlinCsvTransforms.create_prefixed_columns(
        dynamic_df, [('~from', TRANSACTION_ID, 't'), ('~to', name, name)])
    relation = GlueGremlinCsvTransforms.create_edge_id_column(relation, '~from', '~to')
    relation = SelectFields.apply(frame=relation, paths=['~id', '~from', '~to'],
                                  transformation_ctx=f'selection_{name}')
    logger.info(f"Dumping edges between '{name}' and transaction...")
    dump_df_to_s3(relation.toDF(), f'relation_{name}_edgelist', graph=True)
test_ids = transactions.select_fields(TRANSACTION_ID)
get_fraud_frac = lambda series: 100 * sum(series) / len(series)
isfraud_df: DynamicFrame = transactions.select_fields("isFraud")
logger.info("Percent fraud for train transactions: {}".format(
    sum_col(transactions.toDF(), "isFraud")))
dump_df_to_s3(test_ids.toDF(), 'test', header=False)

id_cols = args['id_cols']
cat_cols = args['cat_cols']
features_df, labels_df = get_features_and_labels(transactions.toDF(), id_cols, cat_cols)

# Create glue dynamic frame from spark dataframe
features_dynamic_df = DynamicFrame.fromDF(features_df, glueContext, 'FeaturesDF')
features_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(
    features_dynamic_df, [('~id', TRANSACTION_ID, 't')])
logger.info('Upserting transactions as vertices of graph...')
features_dynamic_df.toDF().foreachPartition(
    gremlin_client.upsert_vertices('Transaction', batch_size=50))

logger.info('Creating glue DF from labels dataframe')
labels_dynamic_df = DynamicFrame.fromDF(labels_df, glueContext, 'LabelsDF')
labels_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(
    labels_dynamic_df, [('~id', TRANSACTION_ID, 't')])
logger.info('Upserting transactions with isFraud property...')
labels_dynamic_df.toDF().foreachPartition(
    gremlin_client.upsert_vertices('Transaction', batch_size=100))

dump_df_to_s3(features_df, 'features')
dump_df_to_s3(labels_df, 'tags')
relational_edges = get_relations_and_edgelist(transactions.toDF(), identities.toDF(), id_cols)
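# For reference: Spark's foreachPartition passes an iterator of rows to a
# one-argument function, so gremlin_client.upsert_vertices(label, batch_size)
# above must act as a factory returning such a closure. A hypothetical,
# standalone sketch of that shape (an assumption about GlueGremlinClient,
# not its actual source):
def make_vertex_upserter(gremlin_client, label, batch_size=50):
    def batch_op(rows):  # rows: iterator over one Spark partition
        batch = []
        for row in rows:
            batch.append(row)
            if len(batch) >= batch_size:
                gremlin_client.submit_batch(label, batch)  # hypothetical call
                batch = []
        if batch:
            gremlin_client.submit_batch(label, batch)
    return batch_op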
get_fraud_frac = lambda series: 100 * sum(series) / len(series)
isfraud_df: DynamicFrame = transactions.select_fields("isFraud")
logger.info("Percent fraud for train transactions: {}".format(
    sum_col(transactions.toDF(), "isFraud")))
dump_df_to_s3(test_ids, 'test', header=False)

id_cols = args['id_cols']
cat_cols = args['cat_cols']
features_df, labels_df = get_features_and_labels(transactions.toDF(), id_cols, cat_cols)
logger.info('Dumping features and labels for training...')
dump_df_to_s3(features_df, 'features')
dump_df_to_s3(labels_df, 'tags')

# Pack all feature columns (except the transaction id) into one JSON property column
features_graph_df = features_df.withColumn(
    'props_values:String',
    to_json(struct(list(filter(lambda x: x != TRANSACTION_ID, features_df.schema.names)))))
features_graph_df = features_graph_df.select('TransactionID', 'props_values:String')
logger.info('Creating glue dynamic frame from spark dataframe...')
features_graph_dynamic_df = DynamicFrame.fromDF(features_graph_df, glueContext, 'FeaturesDF')
features_graph_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(
    features_graph_dynamic_df, [('~id', TRANSACTION_ID, 't')])
features_graph_dynamic_df = GlueGremlinCsvTransforms.addLabel(features_graph_dynamic_df, 'Transaction')
features_graph_dynamic_df = SelectFields.apply(
    frame=features_graph_dynamic_df, paths=['~id', '~label', 'props_values:String'])
logger.info('Dumping transaction data as graph data...')
dump_df_to_s3(features_graph_dynamic_df.toDF(), 'transaction', graph=True)

relational_edges = get_relations_and_edgelist(transactions.toDF(), identities.toDF(), id_cols)
for name, df in relational_edges.items():
    if name != TRANSACTION_ID:
        logger.info(f'Dumping edge {name} for training...')
        dump_df_to_s3(df, f'relation_{name}_edgelist')
        logger.info(f'Dumping edge {name} as graph data...')
        dump_edge_as_graph(name, df)
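# Self-contained illustration of the to_json(struct(...)) packing used above:
# every non-id column is collapsed into a single JSON string property.
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_json, struct

spark = SparkSession.builder.getOrCreate()
demo_df = spark.createDataFrame([(2987000, 0.5, 12)], ['TransactionID', 'f1', 'f2'])
packed = demo_df.withColumn(
    'props_values:String',
    to_json(struct([c for c in demo_df.columns if c != 'TransactionID'])))
packed.select('TransactionID', 'props_values:String').show(truncate=False)
# TransactionID | props_values:String
# 2987000       | {"f1":0.5,"f2":12}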
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database=database, table_name=order_table, transformation_ctx="datasource0")
df0 = datasource0.toDF().filter(col("ORDER_DATE") == checkpoint)
datasource1 = DynamicFrame.fromDF(df0, glueContext, 'datasource1')
print("Total orders : " + str(datasource0.count()))
print("Orders for checkpoint: " + str(datasource1.count()))

applymapping1 = ApplyMapping.apply(
    frame=datasource1,
    mappings=[("ORDER_DATE", "timestamp", "orderDate", "string"),
              ("SHIP_MODE", "string", "shipMode", "string"),
              ("SITE_ID", "double", "siteId", "int"),
              ("ORDER_ID", "int", "orderId", "int")],
    transformation_ctx="applymapping1")
applymapping1 = GlueGremlinCsvTransforms.create_prefixed_columns(
    applymapping1, [('~id', 'orderId', 'o')])
selectfields1 = SelectFields.apply(frame=applymapping1,
                                   paths=["~id", "orderDate", "shipMode"],
                                   transformation_ctx="selectfields1")
selectfields1.toDF().foreachPartition(gremlin_client.add_vertices('Order'))

print("Creating OrderDetail vertices...")
datasource2 = glueContext.create_dynamic_frame.from_catalog(
    database=database, table_name=order_detail_table, transformation_ctx="datasource2")
datasource3 = datasource2.join(["ORDER_ID"], ["ORDER_ID"], datasource1, transformation_ctx="join")
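# DynamicFrame.join(paths1, paths2, frame2) performs an inner equijoin of the
# calling frame with frame2 on the listed keys, so datasource3 keeps only the
# OrderDetail rows whose ORDER_ID matches a checkpointed order. A rough plain
# Spark equivalent (a sketch, assuming both frames convert cleanly):
joined_df = datasource2.toDF().join(datasource1.toDF(), on='ORDER_ID', how='inner')
datasource3_equiv = DynamicFrame.fromDF(joined_df, glueContext, 'datasource3_equiv')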
product_category_table = '{}salesdb_product_category'.format(args['TABLE_PREFIX'])
supplier_table = '{}salesdb_supplier'.format(args['TABLE_PREFIX'])

gremlin_endpoints = GlueNeptuneConnectionInfo(
    args['AWS_REGION'],
    args['CONNECT_TO_NEPTUNE_ROLE_ARN']).neptune_endpoints(args['NEPTUNE_CONNECTION_NAME'])
gremlin_client = GlueGremlinClient(gremlin_endpoints)

# Product vertices
print("Creating Product vertices...")
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database=database, table_name=product_table, transformation_ctx="datasource0")
datasource1 = glueContext.create_dynamic_frame.from_catalog(
    database=database, table_name=product_category_table, transformation_ctx="datasource1")
datasource2 = datasource0.join(["CATEGORY_ID"], ["CATEGORY_ID"], datasource1, transformation_ctx="join")

applymapping1 = ApplyMapping.apply(
    frame=datasource2,
    mappings=[("NAME", "string", "name:String", "string"),
              ("UNIT_PRICE", "decimal(10,2)", "unitPrice", "string"),
              ("PRODUCT_ID", "int", "productId", "int"),
              ("QUANTITY_PER_UNIT", "int", "quantityPerUnit:Int", "int"),
              ("CATEGORY_ID", "int", "category_id", "int"),
              ("SUPPLIER_ID", "int", "supplierId", "int"),
              ("CATEGORY_NAME", "string", "category:String", "string"),
              ("DESCRIPTION", "string", "description:String", "string"),
              ("IMAGE_URL", "string", "imageUrl:String", "string")],
    transformation_ctx="applymapping1")
applymapping1 = GlueGremlinCsvTransforms.create_prefixed_columns(
    applymapping1, [('~id', 'productId', 'p'), ('~to', 'supplierId', 's')])
selectfields1 = SelectFields.apply(
    frame=applymapping1,
    paths=["~id", "name:String", "category:String", "description:String",
           "unitPrice", "quantityPerUnit:Int", "imageUrl:String"],
    transformation_ctx="selectfields1")
selectfields1.toDF().foreachPartition(gremlin_client.upsert_vertices('Product', batch_size=100))

# Supplier vertices
print("Creating Supplier vertices...")
datasource3 = glueContext.create_dynamic_frame.from_catalog(
    database=database, table_name=supplier_table, transformation_ctx="datasource3")
applymapping2 = ApplyMapping.apply(
    frame=datasource3,
    mappings=[("COUNTRY", "string", "country:String", "string"),
              ("ADDRESS", "string", "address:String", "string"),
              ("NAME", "string", "name:String", "string"),
              ("STATE", "string", "state:String", "string"),
              ("SUPPLIER_ID", "int", "supplierId", "int"),
              ("CITY", "string", "city:String", "string"),
              ("PHONE", "string", "phone:String", "string")],
    transformation_ctx="applymapping2")
applymapping2 = GlueGremlinCsvTransforms.create_prefixed_columns(applymapping2, [('~id', 'supplierId', 's')])
selectfields3 = SelectFields.apply(
    frame=applymapping2,
    paths=["~id", "country:String", "address:String", "city:String",
           "phone:String", "name:String", "state:String"],
    transformation_ctx="selectfields3")
selectfields3.toDF().foreachPartition(gremlin_client.upsert_vertices('Supplier', batch_size=100))
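# create_prefixed_columns gives each vertex type its own id namespace so that,
# e.g., Product and Supplier ids cannot collide. A rough, hypothetical
# equivalent on a plain Spark DataFrame, assuming the helper emits
# '<prefix>-<value>' (an assumption about its output format):
from pyspark.sql.functions import concat, lit, col

def prefix_column(df, new_col, src_col, prefix):
    return df.withColumn(new_col, concat(lit(prefix + '-'), col(src_col).cast('string')))

# prefix_column(products_df, '~id', 'productId', 'p')  ->  '~id' == 'p-42' for PRODUCT_ID 42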
datasource2 = datasource0.join(["CATEGORY_ID"], ["CATEGORY_ID"], datasource1, transformation_ctx="join")

applymapping1 = ApplyMapping.apply(
    frame=datasource2,
    mappings=[("NAME", "string", "name:String", "string"),
              ("UNIT_PRICE", "decimal(10,2)", "unitPrice", "decimal(10,2)"),
              ("PRODUCT_ID", "int", "productId", "int"),
              ("QUANTITY_PER_UNIT", "int", "quantityPerUnit:Int", "int"),
              ("CATEGORY_ID", "int", "category_id", "int"),
              ("SUPPLIER_ID", "int", "supplierId", "int"),
              ("CATEGORY_NAME", "string", "category:String", "string"),
              ("DESCRIPTION", "string", "description:String", "string"),
              ("IMAGE_URL", "string", "imageUrl:String", "string")],
    transformation_ctx="applymapping1")
applymapping1 = GlueGremlinCsvTransforms.create_prefixed_columns(
    applymapping1, [('~id', 'productId', 'p'), ('~to', 'supplierId', 's')])
selectfields1 = SelectFields.apply(
    frame=applymapping1,
    paths=["~id", "name:String", "category:String", "description:String",
           "unitPrice", "quantityPerUnit:Int", "imageUrl:String"],
    transformation_ctx="selectfields1")
writeCsvFile(GlueGremlinCsvTransforms.addLabel(selectfields1, 'Product'), nodes_path)

# SUPPLIER edges
print("Creating SUPPLIER edges...")
rdf.show(1)

# 2. Map fields to bulk load CSV column headings format
applymapping1 = ApplyMapping.apply(
    frame=rdf,
    mappings=[("`Reservations.val.Instances.val.InstanceId`", "string", "instanceId:String", "string"),
              ("`Reservations.val.Instances.val.InstanceType`", "string", "instanceType:String", "string")],
    transformation_ctx="applymapping1")
applymapping1.show(1)

# 3. Append prefixes to values in ID columns (ensures vertices for different types have unique IDs across graph)
applymapping2 = GlueGremlinCsvTransforms.create_prefixed_columns(
    applymapping1, [('~id', 'instanceId:String', 'ec2')])

# 4. Select fields for upsert
selectfields1 = SelectFields.apply(
    frame=applymapping2,
    paths=['~id', 'instanceId:String', 'instanceType:String'],
    transformation_ctx="selectfields1")

# 5. Write to S3
writeCsvFile(GlueGremlinCsvTransforms.addLabel(selectfields1, 'EC2'), nodes_folder)

job.commit()
print("Done")
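# The file written in step 5 should follow the Neptune bulk-loader vertex CSV
# format, with '~id', '~label' and typed property headers. Illustrative content
# (the values and the 'ec2-' id prefix format are assumptions):
#
#   ~id,~label,instanceId:String,instanceType:String
#   ec2-i-0123456789abcdef0,EC2,i-0123456789abcdef0,t3.micro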