# Parameterized Glue job: extract a named TSV from tar.gz archives in S3 and
# write it out as de-duplicated Parquet.
# Relies on names defined elsewhere in this file: args, s3_location_source,
# s3_location_target, extractall_tarfile, parse_log.
lookup_filename = args['lookup_filename']

# Standard Glue job bootstrap: Spark context -> Glue context -> job session.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

print('Starting Parquet Conversion ...')

# Glob over every tar.gz under the source prefix; target prefix is used as-is.
input_file = '%s/*.tar.gz' % (s3_location_source)
output_folder = s3_location_target  # With absolute path
print('input_file= %s' % (input_file))
print('output_folder= %s' % (output_folder))
print('lookup_filename= %s' % (lookup_filename))

# binaryFiles yields (path, bytes) pairs; extractall_tarfile is expected to
# expand each archive into (member_name, content) pairs.
zips = sc.binaryFiles(input_file)
files_data = zips.map(extractall_tarfile)

tsv_filename = lookup_filename

# Flatten archive members, keep only the requested TSV, then split its
# content into individual lines for row-level parsing.
output_rdd = (files_data
              .flatMap(lambda archive: [member for member in archive])
              .filter(lambda member: member[0] == tsv_filename)
              .map(lambda member: member[1])
              .flatMap(lambda content: content.split('\n')))
print(output_rdd.count())

# parse_log maps one text line to a Row; distinct() de-duplicates before write.
df = output_rdd.map(parse_log).toDF()
df.distinct().write.mode('overwrite').parquet(output_folder)

print('Done Parquet Conversion !')
df.printSchema()
job.commit()
return log #---- ## @params: [JOB_NAME] args = getResolvedOptions(sys.argv, ['JOB_NAME']) sc = SparkContext() glueContext = GlueContext(sc) spark = glueContext.spark_session job = Job(glueContext) job.init(args['JOB_NAME'], args) filename = 's3://move-dataeng-dropbox-prod/adobe/omniture/mobilelookup/*.tar.gz' #test_tgz/test.tar.gz' #homerealtor_20151123-000000.tar.gz' #instru_ods.tar.gz' #test_tgz.tar.gz' # mysql-connector-java-5.1.39.tar.gz' # instru_ods.tar.gz zips = sc.binaryFiles(filename) files_data = zips.map(extractall_tarfile) tsv_filename_base = 'mobile_attributes' tsv_filename = 'mobile_attributes.tsv' output_rdd = files_data.flatMap(lambda x: [el for el in x]).filter(lambda x: x[ 0] == tsv_filename).map(lambda x: x[1]).flatMap(lambda x: x.split('\n')) print output_rdd.count() df = output_rdd.map(parse_log).toDF() bucket_name = 's3://move-dataeng-temp-dev/glue-etl/omniture/lookups' out_filename = "%s/%s" % (bucket_name, tsv_filename_base) df.distinct().write.mode('overwrite').parquet(out_filename) job.commit()