def create_catagory_and_relation(name, dataframe, gremlin_client):
    # upsert category vertices
    cateDF = dataframe.select(name).distinct()
    dynamic_df = DynamicFrame.fromDF(cateDF, glueContext, f'{name}DF')
    category_df = GlueGremlinCsvTransforms.create_prefixed_columns(
        dynamic_df, [('~id', name, name)])
    logger.info(f'Upserting category \'{name}\' as vertices of graph...')
    category_df.toDF().foreachPartition(
        gremlin_client.upsert_vertices(name, batch_size=100))

    # upsert edge
    logger.info(
        f'Creating glue dynamic frame from spark dataframe for the relation between transaction and {name}...'
    )
    dynamic_df = DynamicFrame.fromDF(dataframe, glueContext, f'{name}EdgeDF')
    relation = GlueGremlinCsvTransforms.create_prefixed_columns(
        dynamic_df, [('~from', TRANSACTION_ID, 't'), ('~to', name, name)])
    relation = GlueGremlinCsvTransforms.create_edge_id_column(
        relation, '~from', '~to')
    relation = SelectFields.apply(frame=relation,
                                  paths=['~id', '~from', '~to'],
                                  transformation_ctx=f'selection_{name}')
    logger.info(f'Upserting edges between \'{name}\' and transaction...')
    relation.toDF().foreachPartition(
        gremlin_client.upsert_edges('CATEGORY', batch_size=100))
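# A minimal usage sketch for create_catagory_and_relation, kept as a comment because
# this excerpt never invokes it. It assumes the relational_edges dict returned by
# get_relations_and_edgelist and the gremlin_client instance that appear elsewhere
# in this job; the names are reused from this script, not new API.
#
#   for name, df in relational_edges.items():
#       if name != TRANSACTION_ID:
#           create_catagory_and_relation(name, df, gremlin_client)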
def dump_edge_as_graph(name, dataframe):
    # upsert edge
    logger.info(f'Creating glue dynamic frame from spark dataframe for the relation between transaction and {name}...')
    dynamic_df = DynamicFrame.fromDF(dataframe, glueContext, f'{name}EdgeDF')
    relation = GlueGremlinCsvTransforms.create_prefixed_columns(dynamic_df, [('~from', TRANSACTION_ID, 't'), ('~to', name, name)])
    relation = GlueGremlinCsvTransforms.create_edge_id_column(relation, '~from', '~to')
    relation = SelectFields.apply(frame=relation,
                                  paths=['~id', '~from', '~to'],
                                  transformation_ctx=f'selection_{name}')
    logger.info(f'Upserting edges between \'{name}\' and transaction...')
    dump_df_to_s3(relation.toDF(), f'relation_{name}_edgelist', graph=True)
job.init(args['JOB_NAME'], args)

## @type: DataSource
## @args: [database = "demo-mysql", table_name = "salesdb_product", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(database="demo-mysql", table_name="salesdb_product", transformation_ctx="datasource0")
datasource1 = glueContext.create_dynamic_frame.from_catalog(database="demo-mysql", table_name="salesdb_product_category", transformation_ctx="datasource1")
datasource3 = datasource0.join(["CATEGORY_ID"], ["CATEGORY_ID"], datasource1, transformation_ctx="join")

print("Joined Rows : " + str(datasource3.count()))
datasource3.printSchema()

## @type: ApplyMapping
## @args: [mapping = [("name", "string", "name", "string"), ("unit_price", "decimal(10,2)", "unit_price", "decimal(10,2)"), ("product_id", "double", "product_id", "int"), ("quantity_per_unit", "double", "quantity_per_unit", "int"), ("category_id", "double", "category_id", "int"), ("supplier_id", "double", "supplier_id", "int")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource3]
applymapping1 = ApplyMapping.apply(frame=datasource3,
                                   mappings=[("NAME", "string", "name", "string"),
                                             ("UNIT_PRICE", "decimal(10,2)", "unit_price", "decimal(10,2)"),
                                             ("PRODUCT_ID", "int", "product_id", "int"),
                                             ("QUANTITY_PER_UNIT", "int", "quantity_per_unit", "int"),
                                             ("CATEGORY_ID", "int", "category_id", "int"),
                                             ("SUPPLIER_ID", "int", "supplier_id", "int"),
                                             ("CATEGORY_NAME", "string", "category_name", "string"),
                                             ("DESCRIPTION", "string", "description", "string"),
                                             ("IMAGE_URL", "string", "image_url", "string")],
                                   transformation_ctx="applymapping1")
#applymapping1.toDF().show()

## @type: SelectFields
## @args: [paths = ["category_name", "category_id", "image_url", "product_id", "name", "description", "quantity_per_unit", "unit_price", "supplier_id"], transformation_ctx = "selectfields2"]
## @return: selectfields2
## @inputs: [frame = applymapping1]
selectfields2 = SelectFields.apply(frame=applymapping1,
                                   paths=["category_name", "category_id", "image_url", "product_id", "name",
                                          "description", "quantity_per_unit", "unit_price", "supplier_id"],
                                   transformation_ctx="selectfields2")

print("selectfields2 Rows : " + str(selectfields2.count()))
selectfields2.toDF().show()

## @type: DataSink
## @args: [database = "demo-redshift", table_name = "redshiftdb_public_product_dim", redshift_tmp_dir = TempDir, transformation_ctx = "datasink3"]
## @return: datasink3
## @inputs: [frame = selectfields2]
datasink3 = glueContext.write_dynamic_frame.from_catalog(frame=selectfields2,
                                                         database="demo-redshift",
                                                         table_name="redshiftdb_public_product_dim",
                                                         redshift_tmp_dir=args["TempDir"],
                                                         transformation_ctx="datasink3")
job.commit()
("PROFIT", "decimal(10,2)", "profit", "decimal(10,2)"), ("EXTENDED_PRICE", "decimal(10,2)", "extended_price", "decimal(10,2)"), ("DATE_KEY", "string", "date_key", "string"), ("ORDER_ID", "int", "order_id", "int")], transformation_ctx="applymapping1") print "After ApplyMapping :" applymapping1.printSchema() applymapping1.toDF().show(5, False) #selectfields2 = SelectFields.apply(frame = applymapping1, paths = ["quantity", "discount", "tax", "unit_price", "line_id", "date_key", "order_date", "extended_price", "ship_mode", "line_number", "product_id", "site_id", "supply_cost", "order_id", "profit"], transformation_ctx = "selectfields2") selectfields2 = SelectFields.apply(frame=applymapping1, paths=[ "order_id", "site_id", "order_date", "date_key", "ship_mode", "line_id", "line_number", "quantity", "product_id", "unit_price", "discount", "supply_cost", "tax", "extended_price", "profit" ], transformation_ctx="selectfields2") print "After SelectFields :" selectfields2.toDF().select("date_key").show(5, False) #selectfields2.toDF().show(5,False) jdbc_url = jdbc_url + '?user='******'&password=' + password postActionStmt = "vacuum SALES_ORDER_FACT" selectfields2.toDF().write.format("com.databricks.spark.redshift") \ .option("url", jdbc_url) \ .option("dbtable", target_table_1) \
get_fraud_frac = lambda series: 100 * sum(series) / len(series)
isfraud_df: DynamicFrame = transactions.select_fields("isFraud")
logger.info("Percent fraud for train transactions: {}".format(sum_col(transactions.toDF(), "isFraud")))
dump_df_to_s3(test_ids, 'test', header=False)

id_cols = args['id_cols']
cat_cols = args['cat_cols']
features_df, labels_df = get_features_and_labels(transactions.toDF(), id_cols, cat_cols)

logger.info('Dumping features and labels for training...')
dump_df_to_s3(features_df, 'features')
dump_df_to_s3(labels_df, 'tags')

features_graph_df = features_df.withColumn('props_values:String', to_json(struct(list(filter(lambda x: x != TRANSACTION_ID, features_df.schema.names)))))
features_graph_df = features_graph_df.select('TransactionID', 'props_values:String')

logger.info('Creating glue dynamic frame from spark dataframe...')
features_graph_dynamic_df = DynamicFrame.fromDF(features_graph_df, glueContext, 'FeaturesDF')
features_graph_dynamic_df = GlueGremlinCsvTransforms.create_prefixed_columns(features_graph_dynamic_df, [('~id', TRANSACTION_ID, 't')])
features_graph_dynamic_df = GlueGremlinCsvTransforms.addLabel(features_graph_dynamic_df, 'Transaction')
features_graph_dynamic_df = SelectFields.apply(frame=features_graph_dynamic_df, paths=['~id', '~label', 'props_values:String'])

logger.info('Dumping transaction data as graph data...')
dump_df_to_s3(features_graph_dynamic_df.toDF(), 'transaction', graph=True)

relational_edges = get_relations_and_edgelist(transactions.toDF(), identities.toDF(), id_cols)
for name, df in relational_edges.items():
    if name != TRANSACTION_ID:
        logger.info(f'Dumping edge {name} for training...')
        dump_df_to_s3(df, f'relation_{name}_edgelist')
        logger.info(f'Dumping edge {name} as graph data...')
        dump_edge_as_graph(name, df)
datasource1 = DynamicFrame.fromDF(df0, glueContext, 'datasource1')

print("Total orders : " + str(datasource0.count()))
print("Orders for checkpoint: " + str(datasource1.count()))

applymapping1 = ApplyMapping.apply(
    frame=datasource1,
    mappings=[("ORDER_DATE", "timestamp", "orderDate", "string"),
              ("SHIP_MODE", "string", "shipMode", "string"),
              ("SITE_ID", "double", "siteId", "int"),
              ("ORDER_ID", "int", "orderId", "int")],
    transformation_ctx="applymapping1")
applymapping1 = GremlinCsvTransforms.create_prefixed_columns(
    applymapping1, [('~id', 'orderId', 'o')])
selectfields1 = SelectFields.apply(frame=applymapping1,
                                   paths=["~id", "orderDate", "shipMode"],
                                   transformation_ctx="selectfields1")
selectfields1.toDF().foreachPartition(neptune.add_vertices('Order'))

print("Creating OrderDetail vertices...")

datasource2 = glueContext.create_dynamic_frame.from_catalog(
    database=database,
    table_name=order_detail_table,
    transformation_ctx="datasource2")
datasource3 = datasource2.join(["ORDER_ID"], ["ORDER_ID"], datasource1,
                               transformation_ctx="join")

print("Total order details : " + str(datasource2.count()))
("ADDRESS", "string", "address", "string"), ("NAME", "string", "name", "string"), ("STATE", "string", "state", "string"), ("SUPPLIER_ID", "int", "supplier_id", "int"), ("CITY", "string", "city", "string"), ("PHONE", "string", "phone", "string")], transformation_ctx="applymapping1") print "Source count datasource0: " + str(applymapping1.count()) applymapping1.toDF().show() ## @type: SelectFields ## @args: [paths = ["country", "address", "city", "phone", "name", "state", "supplier_id"], transformation_ctx = "selectfields2"] ## @return: selectfields2 ## @inputs: [frame = applymapping1] selectfields2 = SelectFields.apply(frame=applymapping1, paths=[ "country", "address", "city", "phone", "name", "state", "supplier_id" ], transformation_ctx="selectfields2") print "Source count datasource0: " + str(selectfields2.count()) selectfields2.toDF().show() ## @type: DataSink ## @args: [database = "demo-redshift", table_name = "redshiftdb_public_supplier_dim", redshift_tmp_dir = TempDir, transformation_ctx = "datasink3"] ## @return: datasink3 ## @inputs: [frame = selectfields2] datasink3 = glueContext.write_dynamic_frame.from_catalog( frame=selectfields2, database="demo-redshift", table_name="redshiftdb_public_supplier_dim", redshift_tmp_dir=args["TempDir"], transformation_ctx="datasink3") print "Redshift Rows inserted : " + str(datasink3.toDF().count())
supplier_table = '{}salesdb_supplier'.format(args['TABLE_PREFIX'])

gremlin_endpoints = GlueNeptuneConnectionInfo(args['AWS_REGION'], args['CONNECT_TO_NEPTUNE_ROLE_ARN']).neptune_endpoints(args['NEPTUNE_CONNECTION_NAME'])
gremlin_client = GlueGremlinClient(gremlin_endpoints)

# Product vertices
print("Creating Product vertices...")

datasource0 = glueContext.create_dynamic_frame.from_catalog(database=database, table_name=product_table, transformation_ctx="datasource0")
datasource1 = glueContext.create_dynamic_frame.from_catalog(database=database, table_name=product_category_table, transformation_ctx="datasource1")
datasource2 = datasource0.join(["CATEGORY_ID"], ["CATEGORY_ID"], datasource1, transformation_ctx="join")

applymapping1 = ApplyMapping.apply(frame=datasource2,
                                   mappings=[("NAME", "string", "name:String", "string"),
                                             ("UNIT_PRICE", "decimal(10,2)", "unitPrice", "string"),
                                             ("PRODUCT_ID", "int", "productId", "int"),
                                             ("QUANTITY_PER_UNIT", "int", "quantityPerUnit:Int", "int"),
                                             ("CATEGORY_ID", "int", "category_id", "int"),
                                             ("SUPPLIER_ID", "int", "supplierId", "int"),
                                             ("CATEGORY_NAME", "string", "category:String", "string"),
                                             ("DESCRIPTION", "string", "description:String", "string"),
                                             ("IMAGE_URL", "string", "imageUrl:String", "string")],
                                   transformation_ctx="applymapping1")
applymapping1 = GlueGremlinCsvTransforms.create_prefixed_columns(applymapping1, [('~id', 'productId', 'p'), ('~to', 'supplierId', 's')])
selectfields1 = SelectFields.apply(frame=applymapping1,
                                   paths=["~id", "name:String", "category:String", "description:String", "unitPrice", "quantityPerUnit:Int", "imageUrl:String"],
                                   transformation_ctx="selectfields1")
selectfields1.toDF().foreachPartition(gremlin_client.upsert_vertices('Product', batch_size=100))

# Supplier vertices
print("Creating Supplier vertices...")

datasource3 = glueContext.create_dynamic_frame.from_catalog(database=database, table_name=supplier_table, transformation_ctx="datasource3")
applymapping2 = ApplyMapping.apply(frame=datasource3,
                                   mappings=[("COUNTRY", "string", "country:String", "string"),
                                             ("ADDRESS", "string", "address:String", "string"),
                                             ("NAME", "string", "name:String", "string"),
                                             ("STATE", "string", "state:String", "string"),
                                             ("SUPPLIER_ID", "int", "supplierId", "int"),
                                             ("CITY", "string", "city:String", "string"),
                                             ("PHONE", "string", "phone:String", "string")],
                                   transformation_ctx="applymapping2")
applymapping2 = GlueGremlinCsvTransforms.create_prefixed_columns(applymapping2, [('~id', 'supplierId', 's')])
selectfields3 = SelectFields.apply(frame=applymapping2,
                                   paths=["~id", "country:String", "address:String", "city:String", "phone:String", "name:String", "state:String"],
                                   transformation_ctx="selectfields3")
selectfields3.toDF().foreachPartition(gremlin_client.upsert_vertices('Supplier', batch_size=100))
rdf.show(1)

# 2. Map fields to bulk load CSV column headings format
applymapping1 = ApplyMapping.apply(
    frame=rdf,
    mappings=[("`Reservations.val.Instances.val.InstanceId`", "string", "instanceId:String", "string"),
              ("`Reservations.val.Instances.val.InstanceType`", "string", "instanceType:String", "string")],
    transformation_ctx="applymapping1")
applymapping1.show(1)

# 3. Append prefixes to values in ID columns (ensures vertices for different types have unique IDs across graph)
applymapping2 = GlueGremlinCsvTransforms.create_prefixed_columns(
    applymapping1, [('~id', 'instanceId:String', 'ec2')])

# 4. Select fields for upsert
selectfields1 = SelectFields.apply(
    frame=applymapping2,
    paths=["~id", "instanceId:String", "instanceType:String"],
    transformation_ctx="selectfields1")

# 5. Write to S3
writeCsvFile(GlueGremlinCsvTransforms.addLabel(selectfields1, 'EC2'), nodes_folder)

job.commit()
print("Done")
def main():
    """This script will load data from the supplied DynamoDB Table to S3 so it can be analyzed with Athena"""
    if JOB_TYPE == "issues":
        DDB_TABLE_NAME = DDB_ISSUES_TABLE_NAME
        GLUE_TABLE_NAME = GLUE_ISSUES_TABLE_NAME
        FIELD_PATHS = [
            "eventid", "acknowledged", "created", "sitename", "issuesource",
            "priority",
            "areaname#status#processname#eventdescription#stationname#devicename#created",
            "version", "devicename", "devicename#eventid", "createdat",
            "areaname", "processname", "createddateutc", "eventdescription",
            "areaname#status#processname#stationname#devicename#created",
            "stationname", "id", "acknowledgedtime", "status", "updatedat",
            "closed", "resolutiontime", "createdby", "acknowledgedby",
            "closedby", "rejectedby", "additionaldetails"
        ]
    elif JOB_TYPE == "hierarchy":
        DDB_TABLE_NAME = DDB_DATA_HIERARCHY_TABLE_NAME
        GLUE_TABLE_NAME = GLUE_DATA_HIERARCHY_TABLE_NAME
        FIELD_PATHS = [
            "createdat", "name", "description", "id", "devicestationid",
            "type", "version", "parentid", "updatedat", "areasiteid",
            "eventprocessid", "eventtype", "priority", "rootcauses", "sms",
            "eventimgkey", "email", "protocol", "endpoint", "filterpolicy",
            "subscriptionarn", "stationareaid", "processareaid", "alias"
        ]
    else:
        raise JobInputException(
            f"JOB_TYPE was invalid ({JOB_TYPE}). Expecting either \"issues\" or \"hierarchy\""
        )

    log_message([
        "Running with the following context:",
        f"DDB_TABLE_NAME: {DDB_TABLE_NAME}",
        f"GLUE_TABLE_NAME: {GLUE_TABLE_NAME}",
        f"GLUE_DB_NAME: {GLUE_DB_NAME}",
        f"GLUE_OUTPUT_BUCKET: {GLUE_OUTPUT_BUCKET}"
    ])

    DDB_TABLE_NAME_FORMATTED = DDB_TABLE_NAME.lower().replace('-', '_')

    log_message("Mapping columns")
    COLUMN_MAPPINGS = list(map(lambda x: get_column_mapping(x), FIELD_PATHS))

    log_message("Creating a Dynamic Frame from the DynamoDB table schema")
    datasource0 = glue_context.create_dynamic_frame.from_catalog(
        database=GLUE_DB_NAME,
        table_name=DDB_TABLE_NAME_FORMATTED,
        transformation_ctx="datasource0")

    log_message("Applying column mappings")
    applymapping1 = ApplyMapping.apply(frame=datasource0,
                                       mappings=COLUMN_MAPPINGS,
                                       transformation_ctx="applymapping1")

    log_message("Selecting fields")
    selectfields2 = SelectFields.apply(frame=applymapping1,
                                       paths=FIELD_PATHS,
                                       transformation_ctx="selectfields2")

    log_message("Resolving")
    resolvechoice3 = ResolveChoice.apply(frame=selectfields2,
                                         choice="MATCH_CATALOG",
                                         database=GLUE_DB_NAME,
                                         table_name=GLUE_TABLE_NAME,
                                         transformation_ctx="resolvechoice3")
    resolvechoice4 = ResolveChoice.apply(frame=resolvechoice3,
                                         choice="make_struct",
                                         transformation_ctx="resolvechoice4")

    log_message("Persisting data in S3")
    glue_context.write_dynamic_frame.from_catalog(
        frame=resolvechoice4,
        database=GLUE_DB_NAME,
        table_name=GLUE_TABLE_NAME,
        transformation_ctx="datasink5")

    job.commit()
    log_message("Done")