import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions

import pyspark.sql.functions as F
from pyspark.sql import Row, Window, SparkSession
from pyspark.sql.types import *
from pyspark.conf import SparkConf
from pyspark.context import SparkContext


args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

spark._jsc.hadoopConfiguration().set("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
spark._jsc.hadoopConfiguration().set("parquet.enable.summary-metadata", "false")

## Read in the raw data from S3 with an explicit schema (alternatively, point to its table name in the Glue Data Catalog)
schema = StructType() \
  .add('source', StringType()) \
  .add('type', StringType()) \
  .add('data', StringType()) \
  .add('ts', StringType())

src = spark.read.load('s3://demoanalyticsapp-output/*/*/*/*/', format='parquet', schema=schema)

## Build out some new columns with data extracted from the JSON string
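# The listing is cut off before the step described above; the lines below are a
# minimal sketch of it. The JSON field names ('event_id', 'value') are
# hypothetical placeholders for whatever keys the 'data' payload actually contains.
events = src \
    .withColumn('event_id', F.get_json_object(F.col('data'), '$.event_id')) \
    .withColumn('value', F.get_json_object(F.col('data'), '$.value').cast('double')) \
    .withColumn('event_ts', F.to_timestamp(F.col('ts')))
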
import sys

from awsglue.context import GlueContext
from awsglue.utils import getResolvedOptions
from awsglue.job import Job
import time
import pg8000
import boto3
import re
from decimal import *
import extract_rs_query_logs_functions as functions


## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['TempDir', 'JOB_NAME','REGION','CLUSTER_ENDPOINT'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
job_configs={}
job_configs.update(args)
clusterId = re.search(r'jdbc:redshift://(.+?)\..*', args['CLUSTER_ENDPOINT']).group(1)
job_configs.update(functions.getJobConfigurations(clusterId,job_configs))
job_configs['CLUSTER_ID']=clusterId
tempDir=args['TempDir']
s3Prefix=job_configs['s3_prefix']
credentials=boto3.Session().get_credentials()
job_configs['aws_access_key_id'] = credentials.access_key
job_configs['aws_secret_access_key'] = credentials.secret_key
job_configs['aws_session_token'] = credentials.token
job_configs.update(args)
job_configs['jdbcURL']="{}?user={}&password={}".format(args['CLUSTER_ENDPOINT'],job_configs['user'],job_configs['password'])
job_configs['region_name']=boto3.session.Session().region_name
Example #3
import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.types import Row, StringType, IntegerType, ArrayType, StructType, DoubleType, BooleanType, DateType
from pyspark.sql.functions import input_file_name, concat, col, first, last, udf, struct, explode

sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)


## Get the argument list

args=getResolvedOptions(sys.argv,
                        ['JOB_NAME',
                        'project',
                        'workflow_type',
                        'output_bucket'])

## The GDC endpoint for files and the NCI endpoint to query for the S3 URL

files_endpt = 'https://api.gdc.cancer.gov/files'
indexd_endpt = 'https://nci-crdc.datacommons.io/index/index/'
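
# Sketch (not part of the original listing): query the GDC files endpoint for the
# project supplied as a job parameter. The filter and field names are assumptions;
# adjust them to the metadata you actually need.
import json
import requests

file_filter = {
    'op': 'in',
    'content': {'field': 'cases.project.project_id', 'value': [args['project']]}
}
params = {
    'filters': json.dumps(file_filter),
    'fields': 'file_id,file_name',
    'format': 'JSON',
    'size': '100'
}
file_hits = requests.get(files_endpt, params=params).json()['data']['hits']
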
Example #4
import sys

from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

##### FROM FILES
tedx_dataset_path = "s3://unibg-tedx-data-bucket/tedx_dataset.csv"

###### READ PARAMETERS
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

##### START JOB CONTEXT AND JOB
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session  # the Spark session is obtained from Glue
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

#### READ INPUT FILES TO CREATE AN INPUT DATASET
tedx_dataset = spark.read.option("header",
                                 "true").option("quote", "\"").option(
                                     "escape", "\"").csv(tedx_dataset_path)

# if the input were JSON you would use .json instead of .csv; the quote/escape options tell Spark to treat text inside double quotes as a single value.

tedx_dataset.printSchema()

#### FILTER ITEMS WITH NULL POSTING KEY
count_items = tedx_dataset.count()
count_items_null = tedx_dataset.filter(
    "idx is not null").count()  # filter the data and drop the null values
import sys
import time

import boto3
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

args = getResolvedOptions(sys.argv, ['BucketName'])

s3 = boto3.resource('s3')
s3_client = boto3.client('s3')
athena_client = boto3.client(service_name='athena', region_name='us-west-2')
bucket_name = args['BucketName']
BucketName = args['BucketName']

database = 'c360view_stage'

sc = SparkContext()

glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init('n1_c360_dispositions')


def run_query(client, query):
    response = client.start_query_execution(
        QueryString=query,
        QueryExecutionContext={'Database': database},
        ResultConfiguration={
            'OutputLocation': 's3://{}/athenaoutput/'.format(bucket_name)
        },
    )
    return response


def validate_query(client, query_id):
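    # The original listing cuts off here; the polling loop below is a minimal
    # sketch that waits until Athena reports a terminal state for the query.
    while True:
        state = client.get_query_execution(
            QueryExecutionId=query_id)['QueryExecution']['Status']['State']
        if state in ('SUCCEEDED', 'FAILED', 'CANCELLED'):
            return state
        time.sleep(5)
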
Example #6
import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame

DYNAMODB_INPUT_TABLE_NAME = 'aws-glue-local-test-table'
S3_OUTPUT_BUCKET_NAME = 'aws-glue-local-test-bucket'

args = getResolvedOptions(sys.argv, ['JOB_NAME'])

JOB_NAME = args['JOB_NAME']

sc = SparkContext()
glueContext = GlueContext(sc)
job = Job(glueContext)
job.init(JOB_NAME, args)

datasource = glueContext.create_dynamic_frame.from_options(
    connection_type='dynamodb',
    connection_options={'dynamodb.input.tableName': DYNAMODB_INPUT_TABLE_NAME})

applymapping = ApplyMapping.apply(
    frame=datasource,
    mappings=[('Id', 'string', 'Id', 'string'),
              ('Column1', 'string', 'Column1', 'string'),
              ('Column2', 'string', 'Column2', 'string'),
              ('Column3', 'string', 'Column3', 'string')])

glueContext.write_dynamic_frame.from_options(
    frame=applymapping,
    connection_type='s3',
    # the original listing is cut off here; the sink options below are a
    # minimal sketch writing JSON to the bucket defined above
    connection_options={'path': 's3://{}/'.format(S3_OUTPUT_BUCKET_NAME)},
    format='json')
Example #7
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

# Script generated for node Amazon S3
AmazonS3_node1641389272287 = glueContext.create_dynamic_frame.from_options(
    format_options={"quoteChar": '"', "withHeader": True, "separator": ","},
    connection_type="s3",
    format="csv",
    connection_options={
        "paths": [
            "s3://db-mirgration-2112/migration-0501/NEXTDB/MARKS/LOAD00000001.csv"
        ],
        "recurse": True,
    },
)
Example #8
import sys
from typing import Any, Dict

import boto3
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

import common.utils

# the start of this listing is cut off; the argument list is reconstructed
# from the parameters read out of `args` below
args = getResolvedOptions(sys.argv, [
    'JOB_NAME', 'hca_job_name', 'raw_location', 'datamart_location',
    'delta_location', 'archive_location', 'aws_region'
])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job_name = args["hca_job_name"]
raw_location = args['raw_location']
datamart_location = args['datamart_location']
delta_location = args['delta_location']
archive_location = args['archive_location']
aws_region = args['aws_region']

#
# initialize glue job with glue security config supplied in params
# to encrypt cloudwatch logs
#
job = Job(glueContext)
job.init(job_name, args)

#
# read the catalog
#
catalog: Dict[str, Any] = common.utils.get_catalog()

#
# get the jdbc connection from the glue catalog
#
client = boto3.client('glue', region_name=aws_region)
glue_job = client.get_job(JobName=args['JOB_NAME'])['Job']
db_name = next(iter(glue_job['Connections']
                    ['Connections'])) if 'Connections' in glue_job else None
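
# Sketch (not in the original listing): with the connection name resolved above,
# the connection's JDBC URL can be read from the Glue catalog.
if db_name:
    connection = client.get_connection(Name=db_name)['Connection']
    jdbc_url = connection['ConnectionProperties']['JDBC_CONNECTION_URL']
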
import sys

from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

import logging

logging.getLogger().setLevel(logging.INFO)
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'json_input'])
json_input = args['json_input']
job_name = args['JOB_NAME']

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(job_name, args)


def get_parquet_output(json_input):
    path_parts = json_input.split('/')
    convert_bucket = path_parts[2].replace('-delivery-', '-convert-')
    stream_name = path_parts[3]
    batch_id = '-'.join(path_parts[8][:-3].split('-')[10:])
    partitions = 'landed_year={}/landed_month={}/landed_day={}/landed_hour={}/landed_batch_id={}'.format(
        path_parts[4], path_parts[5], path_parts[6], path_parts[7], batch_id)
    convert_path = '/'.join(
        ['s3:', '', convert_bucket, stream_name, partitions])
    return convert_path
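
# Usage sketch (not in the original listing): map the delivery-bucket key passed
# in as 'json_input' to the partitioned path in the matching '-convert-' bucket.
parquet_output_path = get_parquet_output(json_input)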

Example #10
"""
This script is passed to the export Glue job. The Glue job runs this script to export data from DDB to S3.
"""
import sys
import boto3
import re
import json
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.dynamicframe import DynamicFrame
from datetime import datetime

glueContext = GlueContext(SparkContext.getOrCreate())
job = Job(glueContext)

args = getResolvedOptions(sys.argv, [
    'JOB_NAME', 'jobId', 'exportType', 'transactionTime', 'since',
    'outputFormat', 'ddbTableName', 'workerType', 'numberWorkers',
    's3OutputBucket'
])

# type and tenantId are optional parameters
type = None
if ('--{}'.format('type') in sys.argv):
    type = getResolvedOptions(sys.argv, ['type'])['type']
groupId = None
if ('--{}'.format('groupId') in sys.argv):
    groupId = getResolvedOptions(sys.argv, ['groupId'])['groupId']
tenantId = None
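# Sketch following the same pattern as the optional parameters above (the
# original listing is cut off at this point).
if ('--{}'.format('tenantId') in sys.argv):
    tenantId = getResolvedOptions(sys.argv, ['tenantId'])['tenantId']
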
# the start of this listing is cut off; the argument list is reconstructed
# from the parameters read out of `args` below
args = getResolvedOptions(sys.argv, [
    'JOB_NAME', 'output_database', 'tmp_table', 'output_table', 'output_path'
])

output_database = args['output_database']
tmp_table = args['tmp_table']
output_table = args['output_table']
output_path = args['output_path']

# Retrieve partition key information from tmp-table
partition_keys = []
glue = boto3.client('glue')
res = glue.get_table(DatabaseName=output_database, Name=tmp_table)
for partition_key in res['Table']['PartitionKeys']:
    partition_keys.append(partition_key['Name'])

# getOrCreate allows this to run as a job or in a notebook.
glue_context = GlueContext(SparkContext.getOrCreate())
spark = glue_context.spark_session
job = Job(glue_context)
job.init(args['JOB_NAME'], args)

# Create DynamicFrame from Data Catalog
dyf = glue_context.create_dynamic_frame.from_catalog(database=output_database,
                                                     table_name=tmp_table,
                                                     transformation_ctx='dyf')

# Resolve choice type with make_struct
dyf = ResolveChoice.apply(frame=dyf,
                          choice='make_struct',
                          transformation_ctx='resolvechoice')

# Drop null fields
dyf = DropNullFields.apply(frame=dyf, transformation_ctx='dropnullfields')
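
# The listing stops after the DropNullFields transform; a typical next step
# (a sketch, not from the original) writes the frame to the output location and
# table, partitioned by the keys collected from the temporary table above.
sink = glue_context.getSink(connection_type='s3',
                            path=output_path,
                            enableUpdateCatalog=True,
                            updateBehavior='UPDATE_IN_DATABASE',
                            partitionKeys=partition_keys)
sink.setFormat('glueparquet')
sink.setCatalogInfo(catalogDatabase=output_database,
                    catalogTableName=output_table)
sink.writeFrame(dyf)
job.commit()
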
Example #12
    "glue_db_name", "glue_issues_table_name", "glue_data_hierarchy_table_name",
    "glue_output_bucket"
]
args = getResolvedOptions(sys.argv, args_list)  # NOSONAR: python:S4823
JOB_TYPE = args["job_type"]
DDB_ISSUES_TABLE_NAME = args["ddb_issues_table_name"]
DDB_DATA_HIERARCHY_TABLE_NAME = args["ddb_data_hierarchy_table_name"]
GLUE_ISSUES_TABLE_NAME = args["glue_issues_table_name"]
GLUE_DATA_HIERARCHY_TABLE_NAME = args["glue_data_hierarchy_table_name"]
GLUE_DB_NAME = args["glue_db_name"]
GLUE_OUTPUT_BUCKET = args["glue_output_bucket"]

# Sets Glue context and logging
spark_context = SparkContext()
glue_context = GlueContext(spark_context)
job = Job(glue_context)


class JobInputException(Exception):
    """Raised when input to the job is not valid"""
    pass


def log_message(msg):
    msg_arr = [f'****** LOG_MSG {datetime.now()} ******']

    if not isinstance(msg, list):
        msg = [msg]

    # Add some preceding whitespace to each line for the log message.
    # This makes it easier to read in the Glue logs on Cloudwatch
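    # The original listing is cut off here; a minimal completion is sketched
    # below: indent each line and print the whole message at once.
    for line in msg:
        msg_arr.append('    {}'.format(line))

    print('\n'.join(msg_arr))
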
Example #13
def getResolvedOptions(args, options):
    parser = GlueArgumentParser()

    if Job.continuation_options()[0][2:] in options:
        raise RuntimeError("Using reserved arguments " +
                           Job.continuation_options()[0][2:])

    if Job.job_bookmark_options()[0][2:] in options:
        raise RuntimeError("Using reserved arguments " +
                           Job.job_bookmark_options()[0][2:])

    parser.add_argument(Job.job_bookmark_options()[0],
                        choices=Job.job_bookmark_options()[1:],
                        required=False)
    parser.add_argument(Job.continuation_options()[0],
                        choices=Job.continuation_options()[1:],
                        required=False)

    for option in Job.job_bookmark_range_options():
        if option[2:] in options:
            raise RuntimeError("Using reserved arguments " + option)
        parser.add_argument(option, required=False)

    for option in Job.id_params()[1:]:
        if option in options:
            raise RuntimeError("Using reserved arguments " + option)
        # TODO: Make these mandatory, for now for backward compatibility making these optional, also not including JOB_NAME in the reserved parameters list.
        parser.add_argument(option, required=False)

    if Job.encryption_type_options()[0] in options:
        raise RuntimeError("Using reserved arguments " +
                           Job.encryption_type_options()[0])
    parser.add_argument(Job.encryption_type_options()[0],
                        choices=Job.encryption_type_options()[1:])

    # TODO: Remove special handling for 'RedshiftTempDir' and 'TempDir' after TempDir is made mandatory for all Jobs
    # Remove 'RedshiftTempDir' and 'TempDir' from list of user supplied options
    options = [
        opt for opt in options if opt not in {'RedshiftTempDir', 'TempDir'}
    ]
    parser.add_argument('--RedshiftTempDir', required=False)
    parser.add_argument('--TempDir', required=False)

    for option in options:
        parser.add_argument('--' + option, required=True)

    parsed, extra = parser.parse_known_args(args[1:])

    parsed_dict = vars(parsed)

    # TODO: remove special handling after TempDir is made mandatory for all jobs
    if 'TempDir' in parsed_dict and parsed_dict['TempDir'] is not None:
        # TODO: Remove special handling for 'RedshiftTempDir' and 'TempDir'
        parsed_dict['RedshiftTempDir'] = parsed_dict['TempDir']
    elif 'RedshiftTempDir' in parsed and parsed_dict[
            'RedshiftTempDir'] is not None:
        parsed_dict['TempDir'] = parsed_dict['RedshiftTempDir']

    # Special handling for continuations. If --job-bookmark-option is set we
    # use that, regardless of whether --continuation-option is set. If
    # --job-bookmark-option is not set but --continuation-option is set, fall
    # back to that.

    bookmark_value = parsed_dict.pop("continuation_option", None)
    if 'job_bookmark_option' not in parsed_dict or parsed_dict[
            'job_bookmark_option'] is None:
        if bookmark_value is None:
            bookmark_value = Job.job_bookmark_options()[3]
        else:
            # translate old style continuation options into job-bookmark options
            option_index = Job.continuation_options().index(bookmark_value)
            bookmark_value = Job.job_bookmark_options()[option_index]

        parsed_dict['job_bookmark_option'] = bookmark_value
    absent_range_option = []
    for option in Job.job_bookmark_range_options():
        key = option[2:].replace('-', '_')
        if key not in parsed_dict or parsed_dict[key] is None:
            absent_range_option.append(option)
    if parsed_dict['job_bookmark_option'] == 'job-bookmark-pause':
        if len(absent_range_option) == 1:
            raise RuntimeError("Missing option or value for " +
                               absent_range_option[0])
    else:
        if len(absent_range_option) == 0:
            raise RuntimeError("Invalid option(s)" +
                               ' '.join(Job.job_bookmark_range_options()))

    _global_args.update(parsed_dict)

    return parsed_dict
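
# Usage sketch: for a job invoked with
#   --JOB_NAME my-job --my_param some-value --TempDir s3://my-bucket/tmp/
# the call below returns a dict containing JOB_NAME, my_param, TempDir, a
# mirrored RedshiftTempDir, and a defaulted job_bookmark_option.
resolved = getResolvedOptions(sys.argv, ['JOB_NAME', 'my_param'])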
Example #14
def main():
    ## @params: [JOB_NAME, raw_db_name, clean_db_name, source_entity_name, target_entity_name, datetime_column, date_column, partition_column, output_bucket_name]
    args = getResolvedOptions(sys.argv, [
        'JOB_NAME', 'raw_db_name', 'clean_db_name', 'source_entity_name',
        'target_entity_name', 'datetime_column', 'date_column',
        'partition_column', 'output_bucket_name'
    ])
    job_name = args['JOB_NAME']
    raw_db_name = args['raw_db_name']
    clean_db_name = args['clean_db_name']
    source_entity_name = args['source_entity_name']
    target_entity_name = args['target_entity_name']
    partition_column = args['partition_column']
    datetime_column = args['datetime_column']
    date_column = args['date_column']
    output_bucket_name = args['output_bucket_name']

    # Constants derived from parameters
    raw_table_name = source_entity_name
    clean_table_name = target_entity_name

    processing_start_datetime = datetime.now(timezone.utc)

    # Initialization of contexts and job
    glue_context = GlueContext(SparkContext.getOrCreate())
    job = Job(glue_context)
    job.init(job_name, args)

    ## @type: DataSource
    ## @args: [database = "<db_name>", table_name = "raw_<entity_name>", transformation_ctx = "raw_data"]
    ## @return: raw_data
    ## @inputs: []
    raw_data: DynamicFrame = glue_context.create_dynamic_frame.from_catalog(
        database=raw_db_name,
        table_name=raw_table_name,
        transformation_ctx="raw_data")

    # Terminate early if there is no data to process
    if raw_data.toDF().head() is None:
        job.commit()
        return

    ## @type: CleanDataset
    ## @args: []
    ## @return: cleaned_data
    ## @inputs: [frame = raw_data]
    input_data = raw_data.toDF()
    cleaned_data = input_data.select(*[
        from_unixtime(c).alias(c) if c == 'processing_datetime' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_timestamp(c).alias(c) if c.endswith('_datetime') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        to_date(c).alias(c) if c.endswith('_date') else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('string').alias(c) if c == 'zip' else col(c)
        for c in input_data.columns
    ])
    cleaned_data = cleaned_data.select(*[
        col(c).cast('decimal(15,2)').alias(c) if dict(input_data.dtypes)[c] ==
        'double' else col(c) for c in input_data.columns
    ])

    ## @type: EnrichDataset
    ## @args: []
    ## @return: enriched_data
    ## @inputs: [frame = cleaned_data]
    enriched_data = cleaned_data.withColumn('etl_processing_datetime', unix_timestamp(f.lit(processing_start_datetime), 'yyyy-MM-dd HH:mm:ss').cast("timestamp")) \
        .withColumn(date_column, f.date_format(f.col(datetime_column), "yyyy-MM-dd").cast("date"))

    ## @type: DataSink
    ## @args: [connection_type = "s3", connection_options = {"path": "s3://<output_bucket_name>/clean/<entity_name>", "enableUpdateCatalog": "True", "updateBehavior": "UPDATE_IN_DATABASE", "partitionKeys" : "[<partition_key>]"}, format = "glueparquet"]
    ## @return: sink
    ## @inputs: [frame = enriched_data]

    sink = glue_context.getSink(connection_type="s3",
                                path="s3://" + output_bucket_name + "/" +
                                clean_table_name,
                                enableUpdateCatalog=True,
                                updateBehavior="UPDATE_IN_DATABASE",
                                partitionKeys=[partition_column])
    sink.setFormat("glueparquet")
    sink.setCatalogInfo(catalogDatabase=clean_db_name,
                        catalogTableName=clean_table_name)
    sink.writeFrame(DynamicFrame.fromDF(enriched_data, glue_context, 'result'))

    job.commit()
Example #15
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from boto3 import client
from pyspark.sql.functions import lit

# ENVIRONMENT VARIABLES
#args = getResolvedOptions(sys.argv, ['JOB_NAME', 'DATABASES', 'S3_RAW_PARQUET_PATH'])

# Loading the contexts
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
#job.init(args['JOB_NAME'], args)
job.init("billing_wiseOCS_accounts_csv_to_parquet")

# Where the script will write the files to
#s3_destiny = args['S3_RAW_PARQUET_PATH']

# Sources
# list_databases = args['DATABASES'].split(',')
list_databases = ["billing_wiseocs"]

# Glue object to list the tables within each database
client = client('glue', region_name='us-east-2')

for database in list_databases:
    response = client.get_tables(DatabaseName=database.lower())
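    # The listing ends here; a typical continuation (a sketch, not from the
    # original) reads each table from the catalog and rewrites it as Parquet.
    # The destination bucket below is a placeholder standing in for the
    # commented-out 's3_destiny' parameter above.
    for table in response['TableList']:
        dyf = glueContext.create_dynamic_frame.from_catalog(
            database=database.lower(), table_name=table['Name'])
        glueContext.write_dynamic_frame.from_options(
            frame=dyf,
            connection_type='s3',
            connection_options={
                'path': 's3://my-raw-parquet-bucket/{}/{}/'.format(
                    database.lower(), table['Name'])
            },
            format='parquet')

job.commit()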