Example #1
import sys

from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.ml.feature import StringIndexer

glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session
args = getResolvedOptions(sys.argv, ['s3_bucket'])
s3_bucket = args['s3_bucket']
print(s3_bucket)

input_dir = f's3://{s3_bucket}/2020/'
print(input_dir)
output_dir = f's3://{s3_bucket}/output-dir'
print(output_dir)

df = glueContext.create_dynamic_frame_from_options('s3', {'paths':[input_dir], 'recurse':True, 'groupFiles': 'inPartition', 'groupSize': '1048576'}, format="json")
df.printSchema()
df1 = df.toDF()
indexer = StringIndexer(inputCol="GENDER", outputCol="GENDER_INDEX")
indexed = indexer.fit(df1).transform(df1)
indexed.show()
# Convert the indexed DataFrame back to a DynamicFrame so GENDER_INDEX is available for the mapping
df = DynamicFrame.fromDF(indexed, glueContext, "indexed")
df = df.apply_mapping([
    ('FIRST', 'string', 'FIRST', 'string'),
    ('LAST', 'string', 'LAST', 'string'),
    ('AGE', 'integer', 'AGE', 'integer'),
    ('GENDER_INDEX', 'double', 'GENDER', 'double'),
    ('LATITUDE', 'string', 'LATITUDE', 'string'),
    ('LONGITUDE', 'string', 'LONGITUDE', 'string'),
])
glueContext.write_dynamic_frame.from_options(frame = df, connection_type = "s3", connection_options = {"path": output_dir}, format = "csv")
Example #2
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.functions import unix_timestamp

## @params: [JOB_NAME,S3_CSV_OUTPUT_PATH]
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'S3_CSV_OUTPUT_PATH'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Load JSON files into dynamic frame.
datasource0 = glueContext.create_dynamic_frame_from_options(
    "s3", {
        'paths': ["s3://segment-personalize-workshop/segment-logs"],
        'recurse': True
    },
    format="json")
print("Input file total record count: ", datasource0.count())

# Filters the JSON documents that we want included in the output CSV.
# These are the event types we're interested in for our dataset.
supported_events = ['Product Added', 'Order Completed', 'Product Clicked']


def filter_function(dynamicRecord):
    if ('userId' in dynamicRecord and 'properties' in dynamicRecord
            and 'sku' in dynamicRecord["properties"]
            and 'event' in dynamicRecord
            and dynamicRecord['event'] in supported_events):
        return True
    return False
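
# The remainder of this example is cut off here; a minimal sketch of how the
# predicate above would typically be applied with Glue's Filter transform
# (an assumption, not part of the original listing):
from awsglue.transforms import Filter

interactions = Filter.apply(frame=datasource0, f=filter_function,
                            transformation_ctx="interactions")
print("Filtered record count: ", interactions.count())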
Example #3
(query_df1.ApplicationId == applicationId))  ##query_df_filter2.toDF()

querySql_final = query_df_final.select(col("query"))
querySql = querySql_final.first()["query"]

## read the data from s3

#var bucketname = s3_input_data_folder
paths = "s3://" + s3_input_data_folder + "/"

data_df = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={
        "paths": [paths],
        "useS3ListImplementation": True,
        "recurse": True
    },
    format=file_format,  ## "csv",
    format_options={
        "withHeader": True
    },
    transformation_ctx="data_df").toDF()
data_df.show(10)

## read data from input table to a data frame
#data_df=glueContext.create_dynamic_frame.from_catalog(database=database,table_name=table_name).toDF()

## running sql query on the dataframe created with input dataset
data_df.createOrReplaceTempView('data_df')
data_df = spark.sql('{} from data_df'.format(querySql))

query_columns = ['werk', 'spj', 'knr', 'result', 'probability', 'time']
Example #4
from awsglue.context import GlueContext
from pyspark.context import SparkContext

glueContext = GlueContext(SparkContext.getOrCreate())


dyf = glueContext.create_dynamic_frame_from_catalog(
    database='<database>',
    table_name='<table>'
)


dyf = glueContext.create_dynamic_frame_from_options(
    connection_type='sqlserver',
    connection_options={
        "url": "jdbc:sqlserver://database-2.xxx.eu-west-1.rds.amazonaws.com:1433/test",
        "user": "******",
        "password": "******",
        "dbtable": "<table>"
    }
)

# with recurse, nested prefixes under the path are read too:
"""
in S3:
a/b1/c1/files.csv
a/b1/c2/files.csv
"""
dyf = glueContext.create_dynamic_frame.from_options(
    's3',
    connection_options={"paths": ["s3://a/"], 'recurse': True},
    format='json',
)
Example #5
# FilterPredicateTest
# for JDBC connector only
from awsglue.gluetypes import Field, IntegerType, StructType, TimestampType

options_filterPredicateTest = {
    "query": "select NumberOfEmployees, CreatedDate from Account where",
    "url": "jdbc:some:url:user=${user};Password=${Password};SecurityToken=${SecurityToken};",
    "secretId": "test-partner-driver",
    "className" : "partner.jdbc.some.Driver",

    # test parameter
    "filterPredicate": "BillingState='CA'"
}

##################################### read data from data source ######################################
datasource0 = glueContext.create_dynamic_frame_from_options(
    connection_type = "marketplace.jdbc",
    connection_options = options_filterPredicateTest) # pick up the right test connection options

######################################## validate data reading ########################################

## validate data schema and count
# more data type: https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-crawler-pyspark-extensions-types.html
expected_schema = StructType([Field("NumberOfEmployees", IntegerType()), Field("CreatedDate", TimestampType())])
expected_count = 2

assert datasource0.schema() == expected_schema
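# The record-count check implied by expected_count above (an assumption based
# on the "validate data schema and count" comment):
assert datasource0.count() == expected_count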

print("expected schema: " + str(expected_schema.jsonValue()))
print("result schema: " + str(datasource0.schema().jsonValue()))
print("result schema in tree structure: ")
datasource0.printSchema()
Example #6
spark = glueContext.spark_session

job = Job(glueContext)
job.init(job_name, args)



df = sqlContext.createDataFrame([
         (7,"ishan","kompressor","mbenz"),
         (14,"john","wrangler","jeep"),],
         ["HOUR","NAME","car","brand"])


df.write.parquet("s3://xx-xx-xx/Glue/oge/cars/")

datasource0 = glueContext.create_dynamic_frame_from_options("s3", {'paths': ["s3://xx-xx-xx/Glue/ogenew/"], "recurse":True}, format="parquet",transformation_ctx = "datasource0")


datasink2 = glueContext.write_dynamic_frame.from_options(frame = datasource0, connection_type = "s3", connection_options = {"path": "s3://xx-xx-xx/Glue/ogenew_pp/", "partitionKeys": ["brand","car"]}, format = "parquet",transformation_ctx = "datasink2")

df = sqlContext.createDataFrame([
          (2019,10,7,7,"ishan","chicago"),
          (2018,11,8,9,"james","italy"),
          (2017,12,9,14,"john","plano"),
          (2016,1,10,13,"adam","texas"),
          (2015,2,11,12,"chris","mexico"),
          (2014,3,12,22,"niel","portland"),],
          ["YEAR","MONTH","DAY","HOUR","NAME","CITY"])

df.write.parquet("/aws-xx-logs/Glue/glue_bookmark_issue_non_partitioned/")
Example #7
spark = glue_context.spark_session
job = Job(glue_context)
job.init(args['JOB_NAME'], args)

# Job input parameters
project = args['project']
stage = args['stage']
env = Env.get_config(project, stage)
incoming_path = "s3://{}/{}/food".format(env["s3_data_lake"],
                                         env["incoming_folder"])
primary_path = "s3://{}/{}/food".format(env["s3_data_lake"],
                                        env["primary_folder"])

# Read Data
datasource0 = glue_context.create_dynamic_frame_from_options(connection_type='s3',
                                connection_options = {"paths": [incoming_path]}, format = "csv", \
                                format_options={"withHeader":True}, transformation_ctx = "readctx")

datasource0.printSchema()
datasource0.show(2)

# mask numbers in data with _number_
bpSamplePySpark = BpSamplePySpark(spark, env)
df = bpSamplePySpark.process_data(datasource0.toDF())
dyf = DynamicFrame.fromDF(df, glue_context, 'dyf')
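
# BpSamplePySpark is project-specific and not shown here; a minimal sketch of
# the masking step described above (an assumption, not the project's code):
#
#   from pyspark.sql import functions as F
#   masked = df.select([
#       F.regexp_replace(F.col(c).cast("string"), r"\d+", "_number_").alias(c)
#       for c in df.columns
#   ])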

# Write data
glue_context.write_dynamic_frame_from_options(frame = dyf, connection_type = "s3", \
                                connection_options = {"path": primary_path, \
                                }, format = "parquet", transformation_ctx = "writectx")
job.commit()
Example #8
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

glueContext = GlueContext(SparkContext.getOrCreate())

schema = StructType([
    StructField('category_id', IntegerType(), False),
    StructField('sum_value', IntegerType(), False),
    StructField('userid', IntegerType(), False),
])

dyf = glueContext.create_dynamic_frame_from_options(
    's3', {'paths': [dir_path]},
    format='csv',
    format_options={'withHeader': True})

# Convert the DynamicFrame to a DataFrame
df = dyf.toDF()
df = df.withColumn(target_column, f.col(target_column).cast(IntegerType()))
df = df.withColumn("sum_value", f.col("sum_value").cast(IntegerType()))
df = df.withColumn("userid", f.col("userid").cast(IntegerType()))

print("size of dataframe")
print((df.count(), len(df.columns)))

# Pivot to wide format by category_id
df2 = df.groupby('userid').pivot(target_column).sum('sum_value').fillna(0)
print("pivot")
print((df2.count(), len(df2.columns)))
Example #9
    ddb_split = 8 * (int(num_workers) - 1)
else:
    num_executers = (int(num_workers) - 1) * 2 - 1
    ddb_split = 4 * num_executers

print(str(ddb_split))

args = getResolvedOptions(sys.argv, ["JOB_NAME"])
glue_context = GlueContext(SparkContext.getOrCreate())
job = Job(glue_context)
job.init(args["JOB_NAME"], args)

dyf = glue_context.create_dynamic_frame_from_options(
    connection_type="dynamodb",
    connection_options={
        "dynamodb.region": region,
        "dynamodb.splits": str(ddb_split),
        "dynamodb.throughput.read.percent": "1.2",
        "dynamodb.input.tableName": source_ddb_name
    })
dyf.show()

sts_client = boto3.client('sts')
sts_response = sts_client.assume_role(RoleArn=target_role_arn,
                                      RoleSessionName='assume-role')

ddb_key = sts_response['Credentials']['AccessKeyId']
ddb_secret = sts_response['Credentials']['SecretAccessKey']
ddb_token = sts_response['Credentials']['SessionToken']

glue_context.write_dynamic_frame_from_options(
    frame=dyf,
Example #10
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Tables with important attributes
# ozone = glueContext.create_dynamic_frame.from_catalog(database = "1strategy-insights-dm", table_name = "insights_dm_stage_cached_applications", transformation_ctx = "applications")
ozone = glueContext.create_dynamic_frame_from_options(
    "mysql",
    connection_options={
        "url":
        "jdbc:mysql://air-quality.crwizazpv2rg.us-west-2.rds.amazonaws.com:3306/airquality",
        "user": "******",
        "password": "******",
        "dbtable": "ozone"
    },
    format=None)

# Create Data Frames and Views to query
ozone_df = ozone.toDF()
ozone_df.createOrReplaceTempView("ozone_view")

utah_subset_query = """
select * from ozone_view where state = 'utah'
"""

utah_ozone = spark.sql(utah_subset_query)
Example #11
import sys

import boto3
import numpy as np
import pandas as pd
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

### create temp view of S3 cafe dynamic frame
cafeFLData = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    format="parquet",
    connection_options={"paths": ["s3://cafe/cafe_FL_PoC"]})
cafeFLData_SDF = cafeFLData.toDF()
print(" S3 cafeFLData_SDF Count: ", cafeFLData_SDF.count())
cafeFLData_SDF.createOrReplaceTempView("cafe_temp_view")

cafeRefreshData = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    format="parquet",
    connection_options={"paths": ["s3://cafe_refresh"]})
cafeRefreshData_SDF = cafeRefreshData.toDF()
cafeRefreshData_SDF.createOrReplaceTempView("cafe_refresh_temp_view")

######################  INSERTION  #######################################
#print('Starting to INSERT')
df_insert = spark.sql("""
Example #12
s3_object_exists = False
try:
    s3_resource.Object(bucket, key).load()
    s3_object_exists = True
except Exception as e:
    print("S3 object not found. error message: "+str(e))

if (s3_object_exists):
    try:
        table = glue_client.get_table(DatabaseName=glue_db, Name=glue_table)

        table_columns = table['Table']['StorageDescriptor']['Columns']
        s3_destination = str(table['Table']['StorageDescriptor']['Location'])

        # Create Dynamic Frame from S3 CSV Object
        dynamicFrame = glueContext.create_dynamic_frame_from_options(connection_type = "s3", connection_options = {"paths": [s3_source_path]}, format_options={"withHeader": True,"separator": ","}, format = "csv")

        # Convert to Spark Data Frame
        dataFrame = dynamicFrame.toDF()

        # Cast Column types from Glue Table into Spark Data Frame
        for column in table_columns:
            dataFrame = dataFrame.withColumn(column['Name'], dataFrame[column['Name']].cast(column['Type']))

        # Convert back to Glue Dynamic Frame for S3 upload
        final_dynamicFrame = DynamicFrame.fromDF(dataFrame, glueContext, "final_dynamicFrame")

        # Delete any unnecessary columns
        final_dynamicFrame = final_dynamicFrame.drop_fields(['col4', 'col5', 'col6'])

        # Send dynamic frame to S3 as parquet files. S3 location specified by the given Glue table
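
        # The write itself is cut off in this listing; a minimal sketch of what
        # it would look like, assuming the variables defined above:
        glueContext.write_dynamic_frame.from_options(
            frame=final_dynamicFrame,
            connection_type="s3",
            connection_options={"path": s3_destination},
            format="parquet")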
Example #13
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import datetime
import boto3

args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

datasource0 = glueContext.create_dynamic_frame_from_options(
    "s3", {'paths': ["s3://xxx-xx-logs/Glue/parquet_sample_dataset/"]},
    format="parquet",
    transformation_ctx="datasource0")

datasink3 = glueContext.write_dynamic_frame.from_options(
    frame=datasource0,
    connection_type="s3",
    connection_options={"path": "s3://xx-xx-logs/Glue/glue_bm_issue_11_12/"},
    format="parquet",
    transformation_ctx="datasink3")

job.commit()
Example #14
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.transforms import ApplyMapping
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## @type: DataSource
## @args: [database = "green__tripdata_staging", table_name = "stagingfhv_tripdata_2015_01_csv", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
origin = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={
        "paths": ["s3://taxi-data-etl/fhv_trippdata_test.csv"]
    },
    format="csv",
    format_options={
        "withHeader": True,
        "separator": ","
    })

# origin = glueContext.create_dynamic_frame.from_catalog(database="green__tripdata_staging",
#                                                             table_name="stagingfhv_tripdata_2015_01_csv",
#                                                             transformation_ctx="origin")
## @type: ApplyMapping
## @args: [mapping = [("dispatching_base_num", "string", "dispatching_base_num", "string"), ("pickup_date", "string", "pickup_datetime", "timestamp"), ("locationid", "string", "pickup_location_id", "long")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]
applymapping1 = ApplyMapping.apply(
    frame=origin,
    mappings=[("dispatching_base_num", "string", "dispatching_base_num",
Example #15
path_input = "s3://" + bucket + "/" + key
path_output = "s3://" + bucket + "/pyspark_output/" + output_name

print("bucket: " + bucket)
print("key: " + key)
print("output_name: " + output_name)
print("path_input: " + path_input)
print("path_output: " + path_output)

glueContext = GlueContext(SparkContext.getOrCreate())

df = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={"paths": [path_input]},
    format="csv",
    format_options={
        "withHeader": True,
        "separator": ","
    },
)

glueContext.write_dynamic_frame.from_options(
    frame=df,
    connection_type="s3",
    connection_options={
        "path": path_output,
        "partitionKeys": ["col3"]
    },
    format="parquet",
)
Example #16
prefix = "s3://" + workflow_params['jsonlines_bucket'] + "/" + workflow_params[
    'jsonlines_object']
output = "s3://" + workflow_params['jsonlines_bucket'] + "/abcstations"

logger = glueContext.get_logger()
logger.info("prefix: {}".format(prefix))
logger.info("output: {}".format(output))
## @params: [JOB_NAME]
job.init(args['JOB_NAME'], args)

dynamic_frame = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    format="json",
    connection_options={
        #           "paths" : ["s3://mig-test-bucket/input"]
        "paths": [prefix]
    },
    transformation_ctx="dynamic_frame"
)  #optional - needed if you are using bookmarks

# printSchema() prints to stdout and doesn't return a value, so capture the schema another way
# myschema=dynamic_frame.printSchema()
myschema = dynamic_frame._jdf.schema().treeString()
logger.info("{}".format(myschema))

# Perform transformations here if you need
#
#

# Glue just appends a new parquet file with the data to the output. In this case that leads to duplicate data, so use a Spark DataFrame write with overwrite mode instead.
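
# A minimal sketch of that overwrite-mode write (an assumption; the original
# example stops at the comment above):
df = dynamic_frame.toDF()
df.write.mode("overwrite").parquet(output)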
Example #17
datepath = "/{}/{}/{}".format(year, month, day)
inputpath = inputpath + datepath
## database source
jdbcurl = "jdbc:postgresql://preventech-cn-stage.cmusx9j6p12i.rds.cn-north-1.amazonaws.com.cn:5432/da_nxg_chinadev"
dbusername = "DA_NGCA_DEV_APP"
dbpassword = "******"
grouptable = "master_data.groups"
customertable = "master_data.customer"

print("===================== start ETL step 1 =====================")
print(" ER json data: " + inputpath)
## DataSource: raw JSON data
erjsondata = glueContext.create_dynamic_frame_from_options(
    "s3", {
        'paths': [inputpath],
        'recurse': True
    },
    format="json",
    transformation_ctx="erjsondata")
## group & subgroup data
groupdata = glueContext.create_dynamic_frame_from_options(
    "postgresql", {
        'url': jdbcurl,
        'user': dbusername,
        'password': dbpassword,
        'dbtable': grouptable
    },
    transformation_ctx="groupdata")
## customer data
customerdata = glueContext.create_dynamic_frame_from_options(
    "postgresql", {
Example #18
spark_session = glueContext.spark_session
sqlContext = SQLContext(spark_session.sparkContext, spark_session)

args = getResolvedOptions(
    sys.argv, ['JOB_NAME', 'bucket', 'path', 'outputBucket', 'staticCSVPath'])
bucket = args['bucket']
csvPath = args['path']
outputBucket = args['outputBucket']
staticCSVPath = args['staticCSVPath']

print(bucket, csvPath)

newFile = glueContext.create_dynamic_frame_from_options(
    "s3",
    {'paths': ["s3://" + bucket + "/" + csvPath]},
    format="csv",
    format_options={'withHeader': True}
).drop_fields(['name']).toDF()

staticFile = glueContext.create_dynamic_frame_from_options(
    "s3",
    {'paths': [staticCSVPath]},
    format="csv",
    format_options={'withHeader': True}
).toDF()

newFile.registerTempTable("newCSV")
staticFile.registerTempTable("staticCSV")
Example #19
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

args = getResolvedOptions(sys.argv, ['JOB_NAME', 'input_path', 'output_path', 'number_of_partitions'])

glueContext = GlueContext(SparkContext.getOrCreate())
spark = glueContext.spark_session

job = Job(glueContext)
job.init(args['JOB_NAME'], args)

inputPath = args['input_path']
outputPath = args['output_path']
numberOfPartitions = int(args.get('number_of_partitions', 1))

input_dyf = glueContext.create_dynamic_frame_from_options("s3", {
		"paths": [ inputPath ],
		"recurse": True,
		"groupFiles": "inPartition"
	},
	format = "parquet"
)

repartitionedDYF = input_dyf.repartition(numberOfPartitions)
glueContext.write_dynamic_frame.from_options(
	frame = repartitionedDYF,
	connection_type = "s3",
	connection_options = {"path": outputPath},
	format = "glueparquet"
)

job.commit()
Example #20
# Spark SQL aggregate functions (these intentionally shadow the builtins here)
from pyspark.sql.functions import col, count, sum


def count_null(c):
    """Use conversion between boolean and integer
    - False -> 0
    - True -> 1
    """
    pred = col(c).isNull()
    return sum(pred.cast("integer")).alias(c)


def null_perc(df):
    exprs = [((count_null(c) / count("*"))*100).alias(c) for c in df.columns]
    return df.agg(*exprs).collect()


## Creating glue dynamic frame, map respective columns, drop null value columns and drop fields for feature selection

patients = glueContext.create_dynamic_frame_from_options('s3',connection_options={'paths':[input_dir + '/patients/*/*/*'],},format="parquet",transformation_ctx = "patients_csv")
patients_dropcol = patients.drop_fields(['patient_county','patient_deathdate','patient_drivers','patient_ssn','patient_passport','patient_suffix','patient_birthplace','patient_prefix','patient_maiden','patient_state','patient_zip','patient_first','patient_last','patient_city','patient_address','patient_lon','patient_lat','partition_0','partition_1','partition_2'])
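
# A quick check with the null-percentage helper defined above (a sketch; the
# original listing only shows the hard-coded drop_fields calls):
print(null_perc(patients.toDF()))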

##Dropping fields for feature engineering
conditions = glueContext.create_dynamic_frame_from_options('s3',connection_options={'paths':[input_dir + '/conditions/*/*/*'],},format="parquet",transformation_ctx = "conditions_csv")
conditions_dropcol = conditions.drop_fields(['conditions_stop','conditions_start','conditions_description','partition_0','partition_1','partition_2'])

encounters = glueContext.create_dynamic_frame_from_options('s3',connection_options={'paths':[input_dir + '/encounters/*/*/*'],},format="parquet",transformation_ctx = "encounters_csv")
encounters_dropcol = encounters.drop_fields(['encounters_description','encounters_reasondescription','partition_0','partition_1','partition_2'])

immunizations = glueContext.create_dynamic_frame_from_options('s3',connection_options={'paths':[input_dir + '/immunizations/*/*/*'],},format="parquet",transformation_ctx = "immunizations_csv")
immunizations_dropcol = immunizations.drop_fields(['immunizations_patient','immunizations_date','immunizations_code','immunizations_description','immunizations_base_cost','partition_0','partition_1','partition_2'])

medications = glueContext.create_dynamic_frame_from_options('s3',connection_options={'paths':[input_dir + '/medications/*/*/*'],},format="parquet",transformation_ctx = "medications_csv")
medications_dropcol = medications.drop_fields(['medications_dispenses','medications_start','medications_payer','medications_code','medications_stop','medications_reasoncode','medications_reasondescription','medications_description','partition_0','partition_1','partition_2'])
Example #21
#  Copyright 2016-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#  SPDX-License-Identifier: MIT-0

from pyspark.context import SparkContext
from awsglue.context import GlueContext

glue_context = GlueContext(SparkContext())
logger = glue_context.get_logger()

connection_options = {
    "table": "customer",
    "scale": "1",
    "numPartitions": "1",
    "connectionName": "GlueTPCDSConnection"
}

# read data from data source
datasource0 = glue_context.create_dynamic_frame_from_options(
    connection_type="marketplace.spark", connection_options=connection_options)

# validate data reading and row count
expected_count = 100000
result_count = datasource0.count()
assert result_count == expected_count
logger.info(
    f'Expected record count: {expected_count}, result record count: {result_count}'
)
Example #22
import json
import sys
from urllib.parse import urlparse

import boto3
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql import SQLContext, Window
from pyspark.sql import functions as func
from pyspark.sql.functions import col, lit, to_date, when


class generate_testdata():
    def __init__(self):

        self.glueContext = None
        self.json_obj = None
        self.new_json_obj = None
        self.df_readschema = None
        self.sqlContext = None
        self.df_tgt = None
        self.spark = None
        self.df_customer_table = None
        self.df_order_table = None

    def initialize(self):
        spark_context = SparkContext()
        self.glueContext = GlueContext(spark_context)
        self.sqlContext = SQLContext(spark_context)
        self.spark = self.glueContext.spark_session

    def extract(self):
        args_new = getResolvedOptions(sys.argv, ['cfg_file_path_new'])
        cfg_file_path_new = args_new['cfg_file_path_new']

        file_res = urlparse(cfg_file_path_new)
        s3_resource = boto3.resource('s3')
        file_obj = s3_resource.Object(file_res.netloc,
                                      file_res.path.lstrip('/'))
        content = file_obj.get()['Body'].read()
        content_new = content
        self.new_json_obj = json.loads(content_new)
        print(
            '===================================Second Json======================================='
        )
        print(content_new)
        self.df_order_table = self.glueContext.create_dynamic_frame_from_options(
            connection_type='s3',
            connection_options={
                "paths": [self.new_json_obj['Schema']['src_file_path']]
            },
            format='csv',
            format_options={
                "withHeader": True
            }).toDF()
        self.df_customer_table = self.glueContext.create_dynamic_frame_from_options(
            connection_type='s3',
            connection_options={
                "paths": [self.new_json_obj['Schema']['src_file_path_2']]
            },
            format=self.new_json_obj['Schema']['format'],
            format_options={
                "withHeader": True
            }).toDF()

        self.df_order_table.show()
        self.df_customer_table.show()

    def transform(self):
        self.df_order_table = self.df_order_table.withColumn(
            'order_date', to_date(col('order_datetime'), 'yyyy-MM-dd'))
        self.df_order_table = self.df_order_table.withColumn(
            'order_month', func.month(col('order_datetime')))

        df_filter_cust = self.df_customer_table.where(col('age') > 18)
        ###inner join
        df_order_customer = self.df_order_table.join(
            df_filter_cust,
            on=(self.df_order_table['customer_id'] ==
                df_filter_cust['customer_id']),
            how='inner').select(df_filter_cust['customer_id'],
                                self.df_order_table['order_id'],
                                self.df_order_table['order_month'],
                                self.df_order_table['amount'])

        # total sales amount for each month of each customer who are greater than age 18
        wind = Window.partitionBy('customer_id', 'order_month')

        df_order_customer = df_order_customer.withColumn(
            'total_sale',
            func.sum(col('amount')).over(wind))

        df_order_customer = df_order_customer.distinct()
        df_order_customer.show()

        ### list the customer_id and the second order_id of customers who placed more than 2 orders in the last 20 days
        ########################
        wind = Window.partitionBy('customer_id', 'order_date').orderBy(
            func.col('order_id').asc())
        df_temp = df_temp = self.df_order_table.withColumn('row', func.row_number().over(wind))

        df_temp = df_temp.withColumn(
            'current_date', to_date(func.current_timestamp(), 'yyyy-MM-dd'))

        df_temp = df_temp.withColumn(
            'diff_days', func.datediff('current_date', 'order_date'))

        df_temp = df_temp.withColumn(
            "diff",
            when((col('diff_days') <= lit(20)), lit(1)).otherwise(0))
        df_temp = df_temp.where(col('diff') == 1)
        wind = Window.partitionBy('customer_id')
        df_temp = df_temp.withColumn('count',
                                     func.count('order_id').over(wind))
        df_temp = df_temp.where((col('count') > 2) & (col('row') == 2))

        df_temp.show()

    def load(self):
        # self.df_tgt.coalesce(1)
        # self.df_tgt.write.mode('Overwrite').format('csv').save(
        #     's3 path',
        #     header='true')
        dynf = DynamicFrame.fromDF(self.df_tgt, self.glueContext, "glue_job")
        self.glueContext.write_dynamic_frame_from_options(
            frame=dynf,
            connection_type="s3",
            connection_options={
                "path": 's3://slf-ca-dev-glue/amp/test/testing_schema/output',
            },
            format="parquet",
            format_options={},
            transformation_ctx="")
Example #23
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
import pyspark.sql.functions as f
from functools import reduce

import boto3
import json

from awsglue.context import GlueContext
from pyspark.context import SparkContext

#initiate glue context
glueContext = GlueContext(SparkContext.getOrCreate())

#read the source file---RPA
rpa_dyf = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={
        "paths": ["s3://smart-ingest-bucket/RPA-source-file/"]
    },
    format="csv",
    format_options={'withHeader': True})

#read the lookup file---RPA-NAVIXA-AE RO
rpa_AE_RO_dyf = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    connection_options={
        "paths": [
            "s3://smart-ingest-bucket/Dimension-files/Navixa Employee History_RPA.csv"
        ]
    },
    format="csv",
    format_options={'withHeader': True})
Example #24
def main(args):
    if args.verbose:
        print("Got arguments: %s" % (args))

    glue_client = boto3.client("glue", region_name=args.region)
    s3_client = boto3.client("s3", region_name=args.region)

    # Verify source DB and table exist
    if validate_db(args.athenaDatabase,
                   glue_client=glue_client) and args.verbose:
        print("Validated source database %s exists." % (args.athenaDatabase))
    if (validate_table(
            args.athenaDatabase, args.athenaTable, glue_client=glue_client)
            and args.verbose):
        print("Validated source table %s exists." % (args.athenaTable))

    # Verify input and output buckets exist and are accessible.
    for bucket in [args.outputBucket, args.inputBucket]:
        if validate_bucket(bucket, s3_client=s3_client) and args.verbose:
            print("Verified bucket s3://%s exists and is accessible." %
                  (bucket))

    # Use latest file in bucket that matches prefix string as input.
    input_csv, latest_dt = get_latest_file(args.inputBucket,
                                           prefix=args.inputPrefix,
                                           s3_client=s3_client)

    if not input_csv:
        raise Exception(
            "Found no candidate CSV files in bucket %s with prefix %s." % (
                args.inputBucket,
                "(no prefix)" if not args.inputPrefix else args.inputPrefix,
            ))

    print("Got latest CSV file s3://%s/%s with write time %s." %
          (args.inputBucket, input_csv, latest_dt))

    sc = SparkContext()
    gc = GlueContext(sc)
    sparkSession = gc.spark_session
    job = Job(gc)
    job.init(args.JOB_NAME, vars(args))
    sparkSession.udf.register("json_clean", json_clean)

    # For requester payer
    sparkSession._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader",
                                                "true")
    gc._jsc.hadoopConfiguration().set("fs.s3.useRequesterPaysHeader", "true")

    # Get minimum hour for existing data
    prelim_min_hour = 0
    aligner = DAILY_ALIGNER if args.fullDays else HOURLY_ALIGNER
    # * 1000000 needed since start_time is in microseconds
    prelim_min_hour = ((
        (START_TIME -
         (3600 * args.maxHoursAgo)) // aligner) * aligner) * 1000000

    # Get CSV file contents, filtering out lines that don't fall within --maxHoursAgo.
    input_df = (gc.create_dynamic_frame_from_options(
        "s3",
        {
            "paths": ["s3://%s/%s" % (args.inputBucket, input_csv)]
        },
        "csv",
        {
            "withHeader": True
        },
    ).toDF().select("*"))
    print("Input DF count: %s" % (input_df.count()))
    csv_prelim_df = input_df.filter("cast(start_time as bigint) >= %s" %
                                    (prelim_min_hour))
    print("Date-bounded DF count: %s" % (csv_prelim_df.count()))

    # If --hourly is not set, query for policy hits for current hour
    # and previous 24 hours.
    # When --hourly is set, query for policy hits only for the current
    # and previous hour. (Querying for the previous hour is necessary,
    # as the previous CSV generation job may have run in the middle of the
    # hour, meaning new events for that hour may exist.)
    # (now - (now % 3600)) = start of the current hour
    # ((now - (now % 3600)) - 3600) = start of the previous hour
    hours_ago = 1 if args.hourly else 24
    increm_min_hour_sec = (START_TIME -
                           (START_TIME % 3600)) - (3600 * hours_ago)
    increm_min_hour_micro = increm_min_hour_sec * 1000000

    # When nonStrict is set, query for the latest hourly epoch in the input file
    # as well as all subsequent hourly epochs. This query may be more expensive
    # to complete, but allows for the possibility of filling in gaps if previous
    # runs of the job failed or were paused.
    if args.nonStrict:
        # We may need to re-query for transactions in the latest epoch depending on
        # when the previous query was run. Filter these rows out as well, then
        # set newest_epoch to our minimum hour epoch.
        last_seen_epoch = (
            input_df.selectExpr("cast(start_time as bigint) as ts").agg({
                "ts": "max"
            }).collect().pop()[0])
        last_seen_hour = ((last_seen_epoch // 1000000) // aligner) * aligner
        # Make sure we don't go outside our --maxHoursAgo range.
        # This check is necessary as increm_min_hour_sec is used in the pushdown
        # predicate when querying against the Athena source table/view.
        increm_min_hour_sec = max(last_seen_hour, increm_min_hour_sec)
        increm_min_hour_micro = increm_min_hour_sec * 1000000

    print("Set minimum hour in seconds: %s" % (increm_min_hour_sec))

    csv_hits_in_range = csv_prelim_df.filter(
        "cast(start_time as bigint) < %s" % (increm_min_hour_micro))
    print("Total CSV hits in timeframe of interest: %s" %
          (csv_hits_in_range.count()))

    # Set timeframe for pushdown predicate
    pushdown = "(hour >= %s)" % (increm_min_hour_sec)

    # Get table contents starting from minimum hour settings.
    raw_data = gc.create_dynamic_frame.from_catalog(
        database=args.athenaDatabase,
        table_name=args.athenaTable,
        transformation_ctx="raw_data",
        push_down_predicate=pushdown,
    )

    # Determine what fields we should extract based on table definition.
    # Deriving the fields from the table schema is preferable to inferring
    # it from the underlying parquet because the two may differ.
    tmp_schema = get_table_schema(args.athenaDatabase,
                                  args.athenaTable,
                                  glue_client=glue_client)
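    # get_table_schema is defined elsewhere in this project; a sketch of what it
    # is assumed to return -- (name, type) pairs read from the Glue catalog:
    #   cols = glue_client.get_table(DatabaseName=db, Name=tbl)
    #   return [(c['Name'], c['Type'])
    #           for c in cols['Table']['StorageDescriptor']['Columns']]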
    # Verify we only have one matching column on which to explode.
    if not [col[0] for col in tmp_schema].count(args.policyType) == 1:
        raise Exception(
            "Wrong number of matching %s columns found in schema: %s" %
            (args.policyType, tmp_schema))
    # Sub out the policies/parent_policies field for an explode expression.
    # Lightly modify other fields to promote clean and consistent writes to CSV file.
    schema = []
    for col in tmp_schema:
        if col[0] == args.policyType:
            schema.append("explode(%s) as policy" % (col[0]))
        elif "map" in col[1].lower() or "struct" in col[1].lower():
            schema.append("json_clean(%s) as %s" % (col[0], col[0]))
        else:
            schema.append("cast(%s as string)" % (col[0]))

    # Get targeted policies/parent_policies.
    # Should be passed to SparkSQL as quoted strings.
    pol_arr = ", ".join([
        "'%s'" % (pol) for pol in args.policyStrings.split(args.delimiter)
        if pol
    ])

    # Filter table contents according to policy hits.
    # A non-null policies field implies a non-null parent_policies field
    # (and vice versa), so it's OK to filter on just one.

    new_hits = (raw_data.toDF().filter("policies is not NULL").selectExpr(
        *schema).filter("policy in (%s)" % (pol_arr)))
    print("New policy triggers found since last job: %s" % (new_hits.count()))

    # Combine newly collected policy hits with dataframe of previous CSV contents.
    write_df = new_hits.union(csv_hits_in_range).orderBy(
        "start_time").coalesce(1)

    uniq = hash_key(args.salt, args.ordinal, args.subscriber, args.receiver)
    s3_loc = "s3://%s" % (os.path.join(args.outputBucket, args.outputDir,
                                       uniq))
    if args.verbose:
        print("S3 Results Location: %s" % s3_loc)

    write_df.write.option("quoteAll", True).csv(s3_loc, header=True)

    # Rename output file, if requested.
    if args.outputFilename:
        rename_resp = rename_file(
            args.outputBucket,
            args.outputFilename,
            prefix=args.outputDir,
            dont_preserve_dir=args.dontPreserveOutputDir,
            keep_orig=args.keepOrigOnRename,
        )

        if rename_resp and args.verbose:
            print("Renamed file to %s/%s." %
                  (args.outputDir, args.outputFilename))

    job.commit()
Example #25
from awsglue.transforms import *
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

sc = SparkContext()
glueContext = GlueContext(sc)
job = Job(glueContext)
# Please update the values in the options to connect to your own data source
options = {
    "tableName": "all_log_streams",
    "schemaName": "/aws-glue/jobs/output",
    "connectionName": "my-connection"
}
datasource = glueContext.create_dynamic_frame_from_options(
    connection_type="custom.dynamodb",
    connection_options=options,
    transformation_ctx="datasource")
datasource.show()
job.commit()
Example #26
import datetime
import json
from functools import reduce

import boto3
import pyspark.sql.functions as f
from awsglue.context import GlueContext
from pyspark.context import SparkContext
from pyspark.sql.functions import col, lit, split, substring, to_date, trim, udf
from pyspark.sql.types import *

#initiate glue context
glueContext = GlueContext(SparkContext.getOrCreate())

##Loading the source files from s3 bucket and converting the corresponding dynamic dataframes to apache spark data frames
Jas_Labour_Paid_dyf=glueContext.create_dynamic_frame_from_options(connection_type="s3", connection_options = {"paths":["s3://smart-ingest-bucket/Quantum-source-file/jas_labor_paid_export.csv"]}, format="csv",format_options={'withHeader' : True})
Jas_Labour_Paid_df = Jas_Labour_Paid_dyf.toDF()

Jas_Labour_Production_dyf=glueContext.create_dynamic_frame_from_options(connection_type="s3", connection_options = {"paths":["s3://smart-ingest-bucket/Quantum-source-file/jas_labor_production_export.csv"]}, format="csv",format_options={'withHeader' : True})
Jas_Labour_Production_df = Jas_Labour_Production_dyf.toDF()

##Performing Transformations on Jas_Labour_Paid_df
##Jas_Labour_Paid_df.printSchema()
Jas_Labour_Paid_df=Jas_Labour_Paid_df.select(col('EXTERNAL_ID').alias('EMPLOYEE_NUMBER'),\
col('USER_NAME').alias('EMPLOYEE_NAME'),'ATTENTION',\
trim(split(col('DEPT_NAME'),'-')[0]).alias('PROGRAM_DESC'),\
to_date(substring(col('TIME_START'),1,9),'dd-MMM-yy').alias('TRANSACTION_DATE'),\
col('HOURS_OVER_TIME').cast(DoubleType()).alias('AVAIL_OT_HRS'),\
col('HOURS_TOTAL').cast(DoubleType()).alias('HOURS_TOTAL'),'STATUS',\
col('HOURS_INDIRECT').cast(DoubleType()).alias('HOURS_INDIRECT'),\
col('HOURS_TIMED').cast(DoubleType()).alias('HOURS_TIMED'), 'TAC_CODE')\
Example #27
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.transforms import ApplyMapping
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

## @params: [TempDir, JOB_NAME]
args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME', 'db_name', 'redshift_connection', 'cis_bucket', 'geo_bucket'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

"""
    Copy CIS Demo Data to Redshift
"""
cis_datasource = glueContext.create_dynamic_frame_from_options("s3", \
                                                               {"paths": ["s3://{}".format(args['cis_bucket'])],"recurse": True}, \
                                                               format="csv", \
                                                               format_options={ \
                                                                   "withHeader": False, \
                                                                   "separator": ";" \
                                                                   })

cis_mapping = ApplyMapping.apply(frame = cis_datasource, mappings = [("col0", "string", "customer_id", "string"), \
                                                                     ("col1", "string", "name", "string"), \
                                                                     ("col2", "long", "zip", "long"), \
                                                                     ("col3", "string", "city", "string"), \
                                                                     ("col4", "string", "state", "string"), \
                                                                     ("col5", "string", "street", "string"), \
                                                                     ("col6", "string", "phone", "string"), \
                                                                     ("col7", "string", "meter_id", "string"), \
                                                                     ("col8", "double", "lat", "double"), \
                                                                     ("col9", "double", "long", "double")\
                                                                     ], transformation_ctx ="cis_mapping")
Example #28
import sys

from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions

args = getResolvedOptions(sys.argv, ["JOB_NAME", "table_name", "role_arn"])
table_name = args["table_name"]
role_arn = args["role_arn"]

glue_context = GlueContext(SparkContext.getOrCreate())
job = Job(glue_context)
job.init(args["JOB_NAME"], args)

dyf = glue_context.create_dynamic_frame_from_options(
    connection_type="dynamodb",
    connection_options={
        "dynamodb.region": "us-west-2",
        "dynamodb.input.tableName": table_name,
        "dynamodb.sts.roleArn": role_arn
    })
dyf.show()

glue_context.write_dynamic_frame_from_options(
    frame=dyf,
    connection_type="dynamodb",
    connection_options={
        "dynamodb.output.tableName": table_name,
        "dynamodb.throughput.write.percent": "0.5"
    })
job.commit()
Example #29
#  Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#  SPDX-License-Identifier: MIT-0
from awsglue.transforms import *
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

sc = SparkContext()
glueContext = GlueContext(sc)
job = Job(glueContext)
# Please update the values in the options to connect to your own data source
options = {
    "sfDatabase": "snowflake_sample_data",
    "sfSchema": "PUBLIC",
    "sfWarehouse": "WORKSHOP_123",
    "dbtable": "lineitem",
    "connectionName":
    "my-connection"  # please refer to Glue Studio Create Custom Connector doc to create a connection
}
datasource = glueContext.create_dynamic_frame_from_options(
    connection_type=
    "custom.spark",  # for marketplace workflow, use marketplace.spark
    connection_options=options,
    transformation_ctx="datasource")
datasource.show()
job.commit()
Example #30
import sys

import boto3
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.sql.functions import *

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

### Print the changed records
cafePrintData = glueContext.create_dynamic_frame_from_options(
    connection_type="s3",
    format="parquet",
    connection_options={"paths": ["s3://cafe-poc/cafe_FL_PoC/"]})
cafePrintData_SDF = cafePrintData.toDF()
cafePrintData_SDF.createOrReplaceTempView("cafe_new_view")
cafe_df = spark.sql("""
SELECT 
*
From cafe_new_view as p
WHERE p.cafeKey IN (2642,2645,2647)

""")
#-WHERE p.cafeKey in (2642,2644,2645,2646)

print(" S3 cafe Count: ", dimcafe_df.count())
cafe_df.show(truncate=False)