Example #1
def create_spark_schema_from_metadata_file(filepath, drop_columns=None, non_nullable_cols=None):
    """
    Creates a spark schema from a json file that is a metadata dictionary. If filepath starts
    with s3:// the function assumes it is an S3 file, otherwise it tries to read the file from
    the local directory.
    """
    if drop_columns is None:
        drop_columns = []
    if non_nullable_cols is None:
        non_nullable_cols = []
    if filepath.startswith('s3://'):
        metadata = read_json_from_s3(filepath)
    else:
        metadata = read_json(filepath)
    return create_spark_schema_from_metadata(metadata, drop_columns=drop_columns, non_nullable_cols=non_nullable_cols)
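
# A minimal usage sketch (the path and column names below are hypothetical):
# the helper dispatches on the "s3://" prefix, so S3 and local metadata files
# resolve through the same call.
schema = create_spark_schema_from_metadata_file(
    "s3://my-bucket/meta/employees.json",
    drop_columns=["internal_notes"],
    non_nullable_cols=["employee_id"])
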
import os
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

from gluejobutils.s3 import read_json_from_s3

args = getResolvedOptions(sys.argv, ["JOB_NAME", "metadata_path", "test_arg"])

print("JOB SPECS...")
print("JOB_NAME: ", args["JOB_NAME"])
print("test argument: ", args["test_arg"])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args["JOB_NAME"], args)

meta_employees = read_json_from_s3(
    os.path.join(args["metadata_path"], "employees.json"))
meta_teams = read_json_from_s3(
    os.path.join(args["metadata_path"], "teams.json"))

spark.read.csv("s3://data_bucket/employees/").createOrReplaceTempView("emp")
spark.read.csv("s3://data_bucket/teams/").createOrReplaceTempView("team")

df = spark.sql(
    "SELECT * FROM emp LEFT JOIN team ON emp.employee_id = team.employee_id")

df.write("s3://data_bucket/join/")

job.commit()
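
# How a run of this job might be started from outside Glue (a sketch; the
# job name and argument values are hypothetical, and Glue expects the "--"
# prefix on argument keys):
import boto3

glue_client = boto3.client("glue")
glue_client.start_job_run(
    JobName="example-job",
    Arguments={
        "--metadata_path": "s3://data_bucket/meta/",  # hypothetical path
        "--test_arg": "some-value",
    })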
Example #3
from gluejobutils import s3

### ### ### ### ### ### ###
### s3_path_to_bucket_key ###
### ### ### ### ### ### ###
b, o = s3.s3_path_to_bucket_key(
    's3://alpha-gluejobutils/testing/data/diamonds_csv/diamonds.csv')
if b != 'alpha-gluejobutils' or o != 'testing/data/diamonds_csv/diamonds.csv':
    raise ValueError('s3_path_to_bucket_key FAILURE')

b, o = s3.s3_path_to_bucket_key('s3://alpha-gluejobutils/testing/data')
if b != 'alpha-gluejobutils' or o != 'testing/data':
    raise ValueError('s3_path_to_bucket_key FAILURE')

b, o = s3.s3_path_to_bucket_key('s3://alpha-gluejobutils/testing/data/')
if b != 'alpha-gluejobutils' or o != 'testing/data/':
    raise ValueError('s3_path_to_bucket_key FAILURE')
print("===> s3_path_to_bucket_key ===> OK")

### ### ### ### ### ### ###
### read_json_from_s3 ###
### ### ### ### ### ### ###
test_json = s3.read_json_from_s3(
    's3://alpha-gluejobutils/testing/meta_data/diamonds.json')
diff = len(
    set([
        '$schema', 'name', 'description', 'data_format', 'columns',
        'partitions', 'location'
    ]).difference(test_json.keys()))
if diff != 0:
    raise ValueError('read_json_from_s3 FAILURE')
print("===> read_json_from_s3 ===> OK")

### ### ### ### ### ###
### write_json_to_s3 ###
### ### ### ### ### ###
json_data = {'a': 'dog', 'b': 14, 'c': [1, 2, 3], 'd': {'cat': 'alpha'}}
s3.write_json_to_s3(json_data,
                    's3://alpha-gluejobutils/testing/data_dump/test1.json')
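
# A round-trip check in the style of the tests above (an addition, assuming
# read access to the path just written): reading the object back should
# reproduce the dictionary exactly.
round_trip = s3.read_json_from_s3(
    's3://alpha-gluejobutils/testing/data_dump/test1.json')
if round_trip != json_data:
    raise ValueError('write_json_to_s3 FAILURE')
print("===> write_json_to_s3 ===> OK")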
Example #4
import os
import sys

from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext

from gluejobutils.s3 import read_json_from_s3

args = getResolvedOptions(
    sys.argv, ["JOB_NAME", "metadata_base_path", "github_tag", "snapshot_date"])

# Good practice to print out arguments for debugging
print("JOB SPECS...")
print("JOB_NAME: ", args["JOB_NAME"])
print("metadata_base_path: ", args["metadata_base_path"])
print("GITHUB_TAG: ", args["github_tag"])
print("SNAPSHOT_DATE: ", args["snapshot_date"])

# Init your spark script
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# Read in meta data
database_meta = read_json_from_s3(os.path.join(args['metadata_base_path'], "curated/database.json"))
random_postcodes_meta = read_json_from_s3(os.path.join(args['metadata_base_path'], "curated/random_postcodes.json"))
calculated_meta = read_json_from_s3(os.path.join(args['metadata_base_path'], "curated/calculated.json"))

# Read in the data
spark.read.json('s3://mojap-raw-hist/open_data/postcodes_example/').createOrReplaceTempView('postcodes')

# Do some spark transforms (not much to do here, so let's just add an extra field)
postcodes = spark.sql("""
SELECT *, '{}' AS dea_version
FROM postcodes
""".format(args['github_tag']))

postcodes.createOrReplaceTempView('postcodes')

print(postcodes.columns)
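
# A closing sketch (an assumption, not in the original snippet): persist the
# transformed table and commit, mirroring Example #1; the output path below
# is hypothetical.
postcodes.write.mode("overwrite").parquet(
    "s3://mojap-raw-hist/open_data/postcodes_example_out/")  # hypothetical path
job.commit()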