def test_capture_illegalargument_exception(self): self.assertRaisesRegexp(IllegalArgumentException, "Setting negative mapred.reduce.tasks", lambda: self.spark.sql("SET mapred.reduce.tasks=-1")) df = self.spark.createDataFrame([(1, 2)], ["a", "b"]) self.assertRaisesRegexp(IllegalArgumentException, "1024 is not in the permitted values", lambda: df.select(sha2(df.a, 1024)).collect()) try: df.select(sha2(df.a, 1024)).collect() except IllegalArgumentException as e: self.assertRegexpMatches(e.desc, "1024 is not in the permitted values") self.assertRegexpMatches(e.stackTrace, "org.apache.spark.sql.functions")
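# The test above relies on sha2() rejecting unsupported bit lengths. A minimal
# sketch (assuming a SparkSession named `spark`) of the bit lengths sha2 does accept:
# 224, 256, 384 and 512.
from pyspark.sql.functions import sha2

demo_df = spark.createDataFrame([("alice",)], ["name"])
for bits in (224, 256, 384, 512):
    demo_df.select(sha2(demo_df.name, bits).alias("sha{}".format(bits))).show(truncate=False)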
def remove_duplicated_schedules(schedules): # Compute the hash of each stop item schedules = schedules.withColumn( "hash", F.sha2( F.concat_ws( "||", schedules.trip_id, schedules.times, schedules.start_time, schedules.end_time, schedules.locations, schedules.headsigns, ), 256, ), ) # Drop duplicated paths unique_schedules = schedules.dropDuplicates(["hash"]) # Remove the hash column unique_schedules = unique_schedules.drop("hash") return unique_schedules
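# Illustrative call for remove_duplicated_schedules above (a sketch; the column
# values are made up, and `spark` and `F` are assumed to be in scope as in the snippet):
schedules = spark.createDataFrame(
    [("t1", "08:00;08:30", "08:00", "09:00", "A;B", "Downtown")] * 2,
    ["trip_id", "times", "start_time", "end_time", "locations", "headsigns"])
unique = remove_duplicated_schedules(schedules)  # the two identical rows collapse into one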
def load_subtable(self, csv_filepath, uid_name, uid_col_list, csv_bq, passenger_bq=None): """ Function to load a supporting table for passengers from GCS and save it in BigQuery. :param csv_filepath: str input filename :param uid_name: str name to give the UID column :param uid_col_list: list of str column names to combine into UID :param csv_bq: str output project.dataset.table where the data will be saved :param passenger_bq: str, optional. BigQuery table to read passenger data from; if omitted, the already loaded self.passengers_df is used """ csv_path = 'gs://{}/{}'.format(self.bucket, csv_filepath) logger.info(f"Loading address info from {csv_path}") csv_df = self.sparkql.read.csv(csv_path, header=True) csv_df = csv_df.withColumn(uid_name, sha2(concat_ws("", *uid_col_list), 256)) if passenger_bq: passengers_df = self.sparkql.read.format('bigquery') \ .option('table', passenger_bq) \ .load() \ .withColumnRenamed('uid', 'passenger_uid') else: passengers_df = self.passengers_df.withColumnRenamed('uid', 'passenger_uid') csv_df = csv_df.join(passengers_df.select('email', 'passenger_uid'), on='email', how='left') logger.info(f"writing card data to {csv_bq}") csv_df.write.format('bigquery') \ .option('table', csv_bq) \ .save()
def create_dimensional_partitions(data, parquet_loc, execution_date): """Creates the dimensional objects (Dimensions, Facts) from a PySpark DataFrame and outputs them to the Parquet format to be processed by Redshift. Args: data (pyspark.sql.DataFrame): the base PySpark DataFrame parquet_loc (str): the output path where the Parquet files are to be saved execution_date (str): the execution date """ broker_staging = data.select(['broker']).distinct() # calculate hash using SHA2 broker_staging = broker_staging.withColumn( "hash", sha2(concat_ws("||", *broker_staging.columns), 256)) # geography dim geography_staging = data.select(['country', 'county', 'parish']).distinct() geography_staging = geography_staging.withColumn( "hash", sha2(concat_ws("||", *geography_staging.columns), 256)) asset_staging = data.select([ 'contract_number', 'country', 'county', 'parish', 'title', 'description', 'price', 'property_type', 'bathrooms', 'bedrooms', 'area_net', 'latitude', 'longitude' ]).distinct() # calculate hash using SHA2 asset_staging = asset_staging.withColumn( "hash", sha2(concat_ws("||", *asset_staging.columns), 256)) # weekly stock base asset_stock = data.select([ 'broker', 'contract_number', 'country', 'county', 'parish', 'price' ]).withColumn("quantity", lit(1)).withColumn("stock_date", lit(execution_date)) # save the data onto parquet to be consumed by Redshift broker_staging_loc = parquet_loc + "broker_staging.parquet" asset_staging_loc = parquet_loc + "asset_staging.parquet" geography_staging_loc = parquet_loc + "geography.parquet" stock_staging_loc = parquet_loc + "asset_stock.parquet" to_parquet(broker_staging, broker_staging_loc) to_parquet(asset_staging, asset_staging_loc) to_parquet(geography_staging, geography_staging_loc) to_parquet(asset_stock, stock_staging_loc)
def benchmark2(): print("===Benchmark 2===") print( "Comparing JDBC writes to InnoDB and API writes to ColumnStore with larger datasets" ) print("") emptyDatabase() print("creating dataframe 1: two randomly generated doubles") randDF = sqlContext.range(0, 7000000).withColumn( 'uniform', rand(seed=23)).withColumn('normal', randn(seed=42)).cache() randDFRows = randDF.count() randDFItems = randDFRows * len(randDF.columns) randDF.printSchema() print("benchmarking dataframe 1") rand_benchmark = benchmark2execution( "rand", randDF, "id BIGINT, uniform DOUBLE, normal DOUBLE") randDF.unpersist() print( "creating dataframe 2: sha1, sha256, sha512 and md5 hashes of integers" ) tmpDF = sqlContext.createDataFrame( sc.parallelize(range( 0, 3000000)).map(lambda i: Row(number=i, string=str(i)))) hashDF = tmpDF.select(tmpDF.number, sha1(tmpDF.string).alias("sha1"), sha2(tmpDF.string, 256).alias("sha256"), sha2(tmpDF.string, 512).alias("sha512"), md5(tmpDF.string).alias("md5")).cache() hashDFRows = hashDF.count() hashDFItems = hashDFRows * len(hashDF.columns) hashDF.printSchema() print("benchmarking dataframe 2") hash_benchmark = benchmark2execution( "hash", hashDF, "number BIGINT, sha1 VARCHAR(40), sha256 VARCHAR(64), sha512 VARCHAR(128), md5 VARCHAR(32)" ) hashDF.unpersist() print("jdbc_innodb\tapi_columnstore\t\trows\t\titems") print("%.3fs\t\t%.3fs\t\t%i\t\t%i" % (rand_benchmark[0], rand_benchmark[1], randDFRows, randDFItems)) print("%.3fs\t\t%.3fs\t\t%i\t\t%i" % (hash_benchmark[0], hash_benchmark[1], hashDFRows, hashDFItems))
def test_capture_illegalargument_exception(self): self.assertRaisesRegexp( IllegalArgumentException, "Setting negative mapred.reduce.tasks", lambda: self.sqlCtx.sql("SET mapred.reduce.tasks=-1")) df = self.sqlCtx.createDataFrame([(1, 2)], ["a", "b"]) self.assertRaisesRegexp(IllegalArgumentException, "1024 is not in the permitted values", lambda: df.select(sha2(df.a, 1024)).collect())
def add_row_hash(df): """Adds a row hash to detect record changes. :param df: :return df: """ hash_columns = [x for x in df.columns if x not in ["snapshot_date"]] df = df \ .withColumn("row_hash", F.sha2(F.concat_ws("||", *hash_columns), 256)) return df
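# Example usage of add_row_hash (a sketch assuming a SparkSession named `spark`
# and `pyspark.sql.functions` imported as `F`, as in the function above):
snapshot = spark.createDataFrame(
    [("c1", "100.00", "2024-01-31")], ["customer_id", "balance", "snapshot_date"])
snapshot = add_row_hash(snapshot)  # row_hash covers every column except snapshot_date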
def benchmarkSHA256(df, jobLogger): jobLogger.info( '****************************************************************') jobLogger.info('Starting benchmark test calculating SHA-512 hashes') start_time = timer() hashed_df = (df.withColumn('hashed_value', F.sha2(F.col('value'), 512))) # now trigger the computations by fetching a count at the RDD level count_value = hashed_df.rdd.count() end_time = timer() return (end_time - start_time), count_value
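# Illustrative call for the benchmark above (a sketch; `source_df` must contain a
# string column named `value`, and `job_logger` can be any standard logger — both
# names are hypothetical):
elapsed_seconds, row_count = benchmarkSHA256(source_df, job_logger)
job_logger.info("hashed %d rows in %.2f seconds", row_count, elapsed_seconds)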
def test_capture_illegalargument_exception(self): self.assertRaisesRegexp( IllegalArgumentException, "Setting negative mapred.reduce.tasks", lambda: self.sqlCtx.sql("SET mapred.reduce.tasks=-1"), ) df = self.sqlCtx.createDataFrame([(1, 2)], ["a", "b"]) self.assertRaisesRegexp( IllegalArgumentException, "1024 is not in the permitted values", lambda: df.select(sha2(df.a, 1024)).collect(), )
def launch(self): self.logger.info("Launching databricks_jobs job") df, repartition = self.prepare_dataframe() image_df = df. \ repartition(repartition, sha2("image_path", 224)).\ rdd.\ flatMap(lambda x: extract_face_emb(x.image_path)). \ map(lambda x: ';'.join(map(str, x))).\ saveAsTextFile(self.output_path) self.logger.info("Sample job finished!")
def pseudonymize(self, df, schema): #: list[list[str]]): """ Performs pseudonymization of the given dataframe based on the provided schema. For example, if the given df is for an entity called person, 2 dataframes will be returned, one called person that has hashed ids and masked fields, and one called person_lookup that contains the original person_id, person_id_pseudo, and the non-masked values for columns marked to be masked.""" df_pseudo = df_lookup = df for col_name, dtype, op in schema: if op == "hash-no-lookup" or op == "hnl": # This means that the lookup can be performed against a different table so no lookup is needed. df_pseudo = df_pseudo.withColumn( col_name, F.sha2(F.concat(F.col(col_name), F.lit(self.salt)), 256)).withColumnRenamed(col_name, col_name + "_pseudonym") df_lookup = df_lookup.drop(col_name) elif op == "hash" or op == 'h': df_pseudo = df_pseudo.withColumn( col_name, F.sha2(F.concat(F.col(col_name), F.lit(self.salt)), 256)).withColumnRenamed(col_name, col_name + "_pseudonym") df_lookup = df_lookup.withColumn( col_name + "_pseudonym", F.sha2(F.concat(F.col(col_name), F.lit(self.salt)), 256)) elif op == "mask" or op == 'm': df_pseudo = df_pseudo.withColumn(col_name, F.lit('*')) elif op == "partition-by": pass # make no changes for this column so that it will be in both dataframes and can be used for partitioning elif op == "no-op" or op == 'x': df_lookup = df_lookup.drop(col_name) df_pseudo = self.fix_column_names(df_pseudo) df_lookup = self.fix_column_names(df_lookup) return (df_pseudo, df_lookup)
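# A hypothetical schema for pseudonymize() above; each entry is (column, dtype, operation),
# and `pseudonymizer` / `person_df` are assumed to exist (an instance of the class holding
# this method, and an input DataFrame):
person_schema = [
    ("person_id", "string", "hash"),        # hashed, kept in the lookup frame
    ("email", "string", "hash-no-lookup"),  # hashed, dropped from the lookup frame
    ("name", "string", "mask"),             # replaced with '*' in the pseudonymized frame
    ("country", "string", "partition-by"),  # left unchanged in both frames
]
person_pseudo, person_lookup = pseudonymizer.pseudonymize(person_df, person_schema)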
def transformation(logger, spark, source_df, processing_dt, initial_spark_schemas, config, collection): try: if config["module_name"] == "vacancy": new_df = source_df.select([ F.col(col).alias( re.sub("[^0-9a-zA-Z$]+", " ", col).strip().replace(" ", "_").lower()) for col in source_df.columns ]) new_df = new_df.withColumn("date_uploaded", F.lit(datetime.strftime(processing_dt, "%Y-%m-%d"))) \ .withColumn("error_desc", F.lit(None).cast(ArrayType(StringType()))) schema = get_old_schema( logger, spark, schema=initial_spark_schemas[collection], database_name=config["published_database_name"], table_name=collection) old_df = spark.createDataFrame([], schema) evolved_df = get_evolved_schema(logger, old_df, new_df) elif config["module_name"] in ("application", "payment"): new_df = source_df.select([ F.col(col).alias(col[0].lower() + re.sub(r'(?!^)[A-Z]', lambda x: '_' + x.group( 0).lower(), col[1:])) for col in source_df.columns ]) new_df = new_df.withColumn("date_uploaded", F.lit(datetime.strftime(processing_dt, "%Y-%m-%d"))) \ .withColumn("error_desc", F.lit(None).cast(ArrayType(StringType())))\ .withColumn("row_hash_id", F.sha2(F.concat_ws("||", *new_df.columns), 256)) old_schema = get_old_schema( logger, spark, schema=initial_spark_schemas[collection], database_name=config["published_database_name"], table_name=collection) old_df = spark.createDataFrame([], old_schema) evolved_df = get_evolved_schema(logger, old_df, new_df) except BaseException as ex: logger.error( "Failed to transform the source dataframe because of error: %s", str(ex)) sys.exit(-1) return evolved_df
def universal_identifier_generator(data_set, key_field, key_name): """ Universal Identifier Generator generates UUIDs based on data fields from the data set. This is the equivalent of a validation hash, based on business key(s). :param data_set: The data set the hash is being built from and added to. :param key_field: Business key field(s) to be hashed. :type key_field: string or list :param key_name: Name of the uuid field :type key_name: String :return uuid_key: """ if type(key_field) is not list: key_field = [key_field] data_set = data_set.withColumn( key_name, F.sha2(F.concat_ws('||', *key_field), 512)) return data_set
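# Illustrative usage of universal_identifier_generator (the business-key columns are
# made up; assumes `spark` and the `F` import used by the function above):
orders = spark.createDataFrame([("ORD-1", "CUST-9")], ["order_id", "customer_id"])
orders = universal_identifier_generator(orders, ["order_id", "customer_id"], "order_uuid")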
def main(): spark = SparkSession.builder.appName("Anonymize PySpark").getOrCreate() args_iter = iter(sys.argv[1:]) args = dict(zip(args_iter, args_iter)) #sample args for interactive testing #args = {'project_bucket': 'project1-lz', 'input_table': 'upload', 'output_table': 'raw', 'database': 'default', 'file_name': 'Tweets.csv'} project_bucket = args['project_bucket'] input_table = args['input_table'] output_table = args['output_table'] database = args['database'] input_s3_uri = 's3://' + project_bucket + '/' + input_table + '/' + args[ 'file_name'] output_s3_uri = 's3://' + project_bucket + '/' + output_table + '/' + args[ 'file_name'].split('.')[0] + '-anon/' # Interactive pyspark from glue development endpoint allows reading from glue crawlers # from awsglue.context import GlueContext #glueContext = GlueContext(SparkContext.getOrCreate()) # Create a dataframe from glue catalog #df = glueContext.create_data_frame.from_catalog(database=database, table_name=input_table) #Print out information about this data #print("Count: ", df.count()) #df.printSchema() df = spark.read.csv(input_s3_uri, header=True) # replace each tweeter's name with a SHA-256 hash dfAnnocrc = df.withColumn("annonym", sha2("name", 256)).select("annonym", "tweet_id", "airline", "airline_sentiment", "text") # write back to s3 as parquet dfAnnocrc.write.mode("append").parquet(output_s3_uri)
def add_hashed_id(df, columns=[], hashed_col='Hashed_ID', hash_type='md5'): """ Adds a hashed ID column to the dataframe, built from the given columns. Returns -------- Dataframe with hashed Id as a column ------ Parameters -------- df : spark dataframe dataframe to create hashed id on columns : list of strings columns to hash; defaults to all columns of df hashed_col : string column name for hashed id hash_type : string 'md5' for MD5, anything else for SHA-256 -------- """ if len(columns) == 0: columns = df.columns else: illegal_columns = [] for column in columns: if column not in df.columns: illegal_columns.append(column) if len(illegal_columns) > 0: raise IllegalArgumentException( 'Column {} does not exist in dataframe'.format(', '.join(illegal_columns))) if hashed_col is None or hashed_col == '': hashed_col = 'Hashed_ID' if hash_type == 'md5': df = df.withColumn(hashed_col, F.md5(F.concat(*columns))) else: df = df.withColumn(hashed_col, F.sha2(F.concat(*columns), 256)) return df
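# Illustrative call for add_hashed_id (a sketch; assumes a SparkSession named `spark`
# and the `F` import used by the function above):
accounts = spark.createDataFrame([("A-1", "2020-01-01")], ["account_id", "open_date"])
# any hash_type other than 'md5' falls through to the SHA-256 branch
accounts = add_hashed_id(accounts, columns=["account_id"], hashed_col="Hashed_ID", hash_type="sha2")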
def load_passengers(self, passenger_filename, passenger_output): """ Function to load the passenger data from csv in GCS, clean, add UID, and upload to BigQuery :param passenger_filename: str input file name :param passenger_output: str of project.dataset.table to save passenger data """ self.passenger_filename = passenger_filename self.passenger_output = passenger_output people_path = 'gs://{}/{}'.format(self.bucket, passenger_filename) logger.info(f"Loading passenger info from {self.bucket}.{passenger_filename}") passengers_df = self.sparkql.read.csv(people_path, header=True) # Use withColumn and initcap to standardize the names passengers_df = passengers_df.withColumn('first_name', initcap(col('first_name')))\ .withColumn('middle_name', initcap(col('middle_name')))\ .withColumn('last_name', initcap(col('last_name'))) # Create full_name column passengers_df = passengers_df.withColumn('full_name', concat_ws(" ", col('first_name'), col('middle_name'), col('last_name'))) passengers_df = passengers_df.withColumn('uid', sha2(col('email'), 256)) # Write to BigQuery logger.info(f"Writing file to {passenger_output}") passengers_df.write.format('bigquery') \ .option('table', passenger_output) \ .save() self.passengers_df = passengers_df
def run(): # Build session sparkql = SparkSession.builder.master('local[1]').getOrCreate() # Load config information if __name__ == '__main__': people_path = config['defaults']['ch3']['ep1']['passenger_input'].get(str) save_path = config['defaults']['ch3']['ep1']['passenger_output'].get(str) bq_table = config['defaults']['ch3']['ep1']['passenger_table'].get(str) logger.info(f"Loading passenger info from {people_path}") # read csv file into spark dataframe passengers_df = sparkql.read.csv(people_path, header=True) logger.info(f"There are {passengers_df.count()} rows") # Load the passenger data and make sure the names have initial capitalization logger.info("Cleaning names and creating full name") passengers_df = passengers_df.withColumn('first_name', initcap(col('first_name')))\ .withColumn('middle_name', initcap(col('middle_name')))\ .withColumn('last_name', initcap(col('last_name'))) # Create full_name column passengers_df = passengers_df.withColumn( 'full_name', concat_ws(" ", col('first_name'), col('middle_name'), col('last_name'))) logger.info("Creating sha2 uid from email") # Create a sha2 uid based on the email passengers_df = passengers_df.withColumn('uid', sha2(col('email'), 256)) logger.info(f"Saving file to {save_path}") # Save dataframe as a parquet file passengers_df.write.parquet(save_path) logger.info("Uploading file to BigQuery") # Upload the file as an external table in BigQuery gbq_load(bq_table, save_path)
def add_hashed_column(dataframe, column_name): dataframe = dataframe.withColumn( column_name + "_hashed", sha2(dataframe[column_name].cast(StringType()), 512)) return dataframe
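# Example call for add_hashed_column (a sketch; assumes `spark`, `sha2` and
# `StringType` are in scope as in the function above):
users = spark.createDataFrame([(42, "a@example.com")], ["id", "email"])
users = add_hashed_column(users, "email")
users.select("email", "email_hashed").show(truncate=False)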
nestedwindowSpec = Window.partitionBy("ROW_ID").orderBy( monotonically_increasing_id()) nested_parquet_filepath = (mount + basepath + zone + "/" + contry_name + "/" + source_name + "/" + object_name + "_" + nested_column + "/" + year + "/" + month + "/" + day + "/" + object_name + "_" + nested_column + "_" + filename_timestamp + ".parquet") if max_records_on_array > 0: result = denormalizer(newDf, nested_column, identity_columns) result = result.withColumn("ITEM_ID", sqlfn.row_number().over(nestedwindowSpec)) # added to remove any structypes result = flattenDataframe(result) result.write.format("parquet").save(nested_parquet_filepath) newDf = newDf.withColumn( nested_column, sha2(newDf[nested_column].cast(StringType()), 512)) else: print("No nested records found, writing empty file") result = newDf.select("contactPoints", *identity_columns).limit(0) result = result.withColumn("ITEM_ID", sqlfn.row_number().over(nestedwindowSpec)) result.write.format("parquet").save(nested_parquet_filepath) newDf = newDf.withColumn(nested_column, lit(None).cast(StringType())) # COMMAND ---------- # MAGIC %md # MAGIC #### Total Count of the Records # COMMAND ----------
def transform_hcp_trans_data(): df_hcp_trans_data = spark \ .read \ .option('mergeSchema', 'true') \ .parquet(config.get(config_set, 'hcp.txns.base.raw.path')) df_hcp_trans_data.createOrReplaceTempView(config.get(config_set, 'hcp.transactions.data.table')) df_hcp_txns_base = spark.sql(""" select BOOKING_STATUS ,HOTELHUB_BOOKING_REF ,CONFIRMATION_REF ,CANCELLATION_REF ,PNR ,PNR_Type ,HOTELHUB_MODE ,MARKET ,CLIENT_CLIENT_TOP_NAME ,CLIENT_SUB_UNIT_CLIENT_NAME ,CUSTOMER_AGENCY_NAME ,HOTEL_NAME ,CITY ,COUNTRY ,STAR_RATING ,cast(concat_ws ('-', concat(cast('20' as string), substr(cast(date_in as string),1,2)), substr(cast(date_in as string),3,2), substr(cast(date_in as string),5,2) ) as timestamp) as DATE_IN ,cast(concat_ws ('-', concat(cast('20' as string), substr(cast(date_out as string),1,2)), substr(cast(date_out as string),3,2), substr(cast(date_out as string),5,2) ) as timestamp) as DATE_OUT ,NIGHTS ,NUM_OF_ROOMS ,NUM_OF_GUEST ,OUT_POLICY_REASON ,BOOKING_SOURCE ,RATE_DESCRIPTION ,CANCELLATION_POLICY ,cast(RATEPERDAY_AMOUNT as double) as RATEPERDAY_AMOUNT ,RATEPERDAY_CURRCODE ,AGENCY_PRIORITY ,CUSTOMER_PRIORITY ,PAYMENT_MODE ,cast(RATEPERDAY_EUR as double) as RATEPERDAY_EUR ,cast(RATEPERDAY_GBP as double) as RATEPERDAY_GBP ,cast(RATEPERDAY_USD as double) as RATEPERDAY_USD ,TOTALAMOUNT_BOOKED_CURRCODE ,TOTALAMOUNT_BOOKED ,LOCAL_CURRENCY_CODE ,cast(RATEPERDAY_LCC as double) as RATEPERDAY_LCC ,cast(TOTALAMOUNT_EUR as double) as TOTALAMOUNT_EUR ,cast(TOTALAMOUNT_GBP as double) as TOTALAMOUNT_GBP ,cast(TOTALAMOUNT_USD as double) as TOTALAMOUNT_USD ,cast(TOTALAMOUNT_LCC as double) as TOTALAMOUNT_LCC ,BOOKED_RATE_TYPE_CODE ,CONTENT_SOURCE ,GDS_CHAIN_NAME ,cast(concat_ws ('-', concat(concat(cast('20' as string)), substr(cast(created_date as string),1,2)), substr(cast(created_date as string),3,2), substr(cast(created_date as string),5,2) ) as timestamp) as CREATED_DATE ,CREATEDBY_USER ,CONFIRMEDBY_USER ,case when length(cancel_datetime)=0 then '' else cast(concat_ws ('-', concat(concat(cast('20' as string)), substr(cancel_datetime,1,2)), substr(cancel_datetime,3,2), substr(cancel_datetime,5,11) ) as timestamp) end as cancel_datetime ,ABANDON_BY_USER ,OBT_PNR ,CLIENT_BOOKING_CHANNEL ,RATE_ACCESS_CODE_BOOKED ,COMMISSION_TYPE ,COMMISSION_CURRENCY ,COMMISSION_AMOUNT ,ESTIMATED_INCOME_DUE ,RATE_ACCESS_CODE_SHOPPED ,HOTELHUB_PROPERTY_ID ,HARP_PROPERTY_ID_NO ,CONTENT_SOURCE_PROPERTY_ID ,AGGREGATOR_BOOKING_COMMISSION ,AGGREGATOR_REVENUE_VALUE ,AGGREGATOR_REVENUE_SHARE ,AGGREGATOR_CURRENCY ,RATE_CHANGE ,RATE_ACCESS_CODE_RETURNED ,BACK_OFFICE_ACCOUNT_NUMBER ,case when traveller_portrait_guid='' then 'UNKNOWN' when traveller_portrait_guid like '%-%' then regexp_replace(traveller_portrait_guid, '-',':') else traveller_portrait_guid end as TRAVELLER_PORTRAIT_GUID ,case when length(booking_start_dttm)=0 then '' else cast(concat_ws ('-', concat(concat(cast('20' as string)), substr(booking_start_dttm,1,2)), substr(booking_start_dttm,3,2), substr(booking_start_dttm,5,11) ) as timestamp) end as booking_st_tm ,case when length(booking_end_dttm)=0 then '' else cast(concat_ws ('-', concat(concat(cast('20' as string)), substr(booking_end_dttm,1,2)), substr(booking_end_dttm,3,2), substr(booking_end_dttm,5,11) ) as timestamp) end as booking_en_tm ,cast(STEP0_TIME as int) as STEP0_TIME ,cast(STEP1_TIME as int) as STEP1_TIME ,cast(STEP2_TIME as int) as STEP2_TIME ,cast(STEP3_TIME as int) as STEP3_TIME ,cast(STEP4_TIME as int) as STEP4_TIME ,GDS_SHOPPED_FOR_RATE ,cast(CHEAP_CLIENT_NEG_RATE as int) as CHEAP_CLIENT_NEG_RATE 
,CHEAP_CLIENT_NEG_RATE_CURRCODE ,CHEAP_CLIENT_NEG_RATE_DESCRIPTION ,cast(CHEAP_CLIENT_OR_CWV_RATE as int) as CHEAP_CLIENT_OR_CWV_RATE ,CHEAP_CLIENT_OR_CWV_NEG_RATE_CURRCODE ,CHEAP_CLIENT_OR_CWV_RATE_DESCRIPTION ,cast(CHEAP_GDS_PUBLISHED_RATE as int) as CHEAP_GDS_PUBLISHED_RATE ,CHEAP_GDS_NEG_RATE_CURRCODE ,CHEAP_GDS_PUBLISHED_RATE_DESCRIPTION ,cast(CHEAP_BOOKINGCOM_RATE as int) as CHEAP_BOOKINGCOM_RATE ,CHEAP_BOOKINGCOM_RATE_CURRCODE ,CHEAP_BOOKINGCOM_RATE_DESCRIPTION ,BRANCH_IATA ,ON_REQUEST_INDICATOR ,case when length(lastmodified_datetime)=0 then '' else cast(concat_ws('-', concat(concat(cast('20' as string)), substr(lastmodified_datetime,1,2)), substr(lastmodified_datetime,3,2),substr(lastmodified_datetime,5,11) ) as timestamp) end as lastmodified_datetime ,modified_count ,booking_time_duration ,rate_bucket ,back_office ,client_sub_unit_client_id ,cast (MISSED_SAVING as int) as missed_saving ,cast(REALISED_SAVING as int) as realised_saving ,POPULAR_HOTEL ,HOTEL_RANK ,cast(HOTEL_TOTAL as int) as hotel_total ,agency_source_name ,client_client_top_id ,err_desc ,aggregator_property_type ,hotel_bucket_simplified ,gds_commission_text ,avlb_htl_count ,offer ,aaa_rate ,error_code ,concat(hotelhub_property_id,rate_access_code_booked,booked_rate_type_code,case when content_source like 'BOOKING%' then 'BC' when content_source like 'EAN%' then 'EH' when content_source like 'DESIYA%' then 'DH' when content_source like 'PREMIER%' then 'PI' when content_source like 'CHL%' then 'CM' when content_source like 'SABR%' then 'S' when content_source like 'AMAD%' then 'A' when content_source like 'GALI%' then 'G' when content_source like 'APO%' then '1V' when content_source like 'HOTELH%' and gds_shopped_for_rate like 'SABR%' then 'S' when content_source like 'HOTELH%' and gds_shopped_for_rate like 'APO%' then '1V' when content_source like 'HOTELH%' and gds_shopped_for_rate like 'AMAD%' then 'A' when content_source like 'HOTELH%' and gds_shopped_for_rate like 'GALI%' then 'G' else 'XX' end ) as rate_id ,case when content_source in('BOOKING.COM','EAN HOTEL COLLECT', 'BOOKING.COM CASHONLY','DESIYA HOTELS','LOCAL AGGREGATOR') then 'AGG RATE' when content_source ='PREMIER INN - PI' then 'PUB - DIRECT CONNECT' when (rate_bucket like '%CLIENT%' or rate_bucket like '%ROOMIT%') and rate_access_code_booked='CWV' then 'ROOMIT (CWV)' when (rate_bucket like '%CLIENT%' or rate_bucket like '%ROOMIT%') and ((instr(lower(rate_description),'client value')!=0) or (instr(lower(rate_description),'cwv')!=0)) then 'ROOMIT (CWV)' when (rate_access_code_booked='CWV' or rate_access_code_booked is null or rate_access_code_booked='') and ((instr(lower(rate_description),'client value')!=0) or (instr(lower(rate_description),'cwv')!=0)) then 'ROOMIT (CWV)' when (instr(lower(rate_description),'client')!=0 and instr(lower(rate_description),'value')!=0)or (instr(lower(rate_description),'carlson')!=0 and instr(lower(rate_description),'value')!=0) or instr(lower(rate_description),'roomit')!=0 or instr(lower(rate_description),'room it')!=0 then 'ROOMIT (CWV)' when rate_description not like'CWV%' and instr(lower(rate_description),'crs')!=0 then 'CLIENT' when (rate_bucket like '%CLIENT%' or rate_bucket like '%ROOMIT%') and rate_access_code_booked ='CLIENT' then 'ROOMIT (CLIENT)' when (rate_access_code_booked='CLIENT' or rate_access_code_booked is null or rate_access_code_booked='') and ((instr(lower(rate_description),'client')!=0) or (instr(lower(rate_description),'carlson')!=0)) then 'ROOMIT (CLIENT)' when 
((instr(lower(rate_description),'client')!=0) and (instr(lower(rate_description),'value')=0)) or ((instr(lower(rate_description),'carlson')!=0) and (instr(lower(rate_description),'value')=0)) or instr(lower(rate_description), 'consortia')>0 then 'ROOMIT (CLIENT)' when rate_bucket like '%PUBLIC%' and instr(lower(rate_description), 'worldwide')>0 then 'PUB' when instr(lower(rate_description), 'room rac')>0 or instr(lower(rate_description), 'room pro')>0 or instr(lower(rate_description), 'bed flexible rate')>0 or instr(lower(rate_description), 'beds flexible rate')>0 then 'PUB' when (rate_bucket like '%CLIENT%' or rate_bucket like '%KUNDEN%') then 'CLIENT' when ((rate_bucket like '%REQUEST%' or rate_bucket like '%ANFRAGE%' or rate_bucket like '%DEMANDE%' or rate_bucket like '%PETICI%' or rate_bucket like 'PUBLIC%' or rate_bucket is null or rate_bucket='') and (gds_commission_text like 'NO%' or gds_commission_text is null or gds_commission_text ='')) then 'CLIENT' when (rate_bucket like '%PUBLIC%' or rate_bucket='U') and (gds_commission_text like '%NON%' or gds_commission_text like '%NOT%' or gds_commission_text like '%NO C%' or gds_commission_text like '% 0.00%') then 'CLIENT' when ((gds_commission_text like '%NON%' or gds_commission_text like '%NOT%' or gds_commission_text like '%NO C%' or gds_commission_text like '% 0.00%' or gds_commission_text like '%UNK%') and (rate_description like '%COR%' or rate_description like '%CLT' or rate_description like '%GOV%' or rate_description like '%NEG%')) and (instr(lower(rate_description),'corn')=0 and instr(lower(rate_description),'decor')=0 and instr(lower(rate_description),'corri')=0) then 'CLIENT' when ((gds_commission_text is null or gds_commission_text ='')and (rate_bucket like '%REQUEST%' or rate_bucket like '%ANFRAGE%' or rate_bucket like '%DEMANDE%' or rate_bucket like '%PETICI%' or rate_bucket like 'PUBLIC%' or rate_bucket is null or rate_bucket='') and (rate_description like '%CLT' or rate_description like'%COR%' or rate_description like'%NEG%')) and (instr(lower(rate_description),'corn')=0 and instr(lower(rate_description),'decor')=0 and instr(lower(rate_description),'corri')=0) then 'CLIENT' when ((gds_commission_text is null or gds_commission_text ='')and rate_description like '%COR') then 'CLIENT' when (gds_commission_text like '%NON%' or gds_commission_text like '%NOT%' or gds_commission_text like '%NO C%' or gds_commission_text like '% 0.00%') then 'CLIENT' when instr(lower(rate_description),'corporate')!=0 or instr(lower(rate_description),'government')!=0 then 'CLIENT' when (rate_access_code_booked is null or rate_access_code_booked='' or rate_access_code_booked='SC' or rate_access_code_booked='COR') and (instr(lower(rate_description),' cor')>0 or instr(lower(rate_description),' clt')>0) then 'CLIENT' when instr(lower(rate_description),lower(client_client_top_name))>0 then 'CLIENT' when instr(lower(rate_description), 'aaa')>0 or instr(lower(rate_description), 'caa')>0 or instr(lower(rate_description), 'aarp')>0 or instr(lower(rate_description), 'spg member')>0 then 'CLIENT' else 'PUB' end as new_rate_bucket, payment_type_used, case when content_source like 'BOOKING%' then 'BC' when content_source like 'EAN%' then 'EH' when content_source like 'DESIYA%' then 'DH' when content_source like 'PREMIER%' then 'PI' when content_source like 'CHL%' then 'CM' when content_source like 'SABR%' then 'S' when content_source like 'AMAD%' then 'A' when content_source like 'GALI%' then 'G' when content_source like 'APO%' then '1V' when content_source 
like 'HOTELH%' and gds_shopped_for_rate like 'SABR%' then 'S' when content_source like 'HOTELH%' and gds_shopped_for_rate like 'APO%' then '1V' when content_source like 'HOTELH%' and gds_shopped_for_rate like 'AMAD%' then 'A' when content_source like 'HOTELH%' and gds_shopped_for_rate like 'GALI%' then 'G' else 'XX' end as channel_type, SESSIONID, date_created_year, date_created_month, date_created_day from {}""".format(config.get(config_set, 'hcp.transactions.data.table'))) df_hcp_txns_base_deduped = df_hcp_txns_base \ .withColumn('rownum', F.row_number().over(Window .partitionBy('HOTELHUB_BOOKING_REF') .orderBy(F.col('LASTMODIFIED_DATETIME').desc()))) \ .filter(F.col('rownum') == 1) \ .drop('rownum') # Export for general analytical use as sbx_dst.hcp_txns_base dump_partitioned_dataframe(df_hcp_txns_base_deduped, ['date_created_year', 'date_created_month', 'date_created_day'], config.get(config_set, 'hcp.txns.base.path'), config.get(config_set, 'hcp.txns.base.table')) # Generate match keys channels = ['CYTRIC', 'TRVDOO', 'KDSS', 'GETTHERE', 'CONCUR', 'BOOK2GO', 'SERKO', 'ZILLIOUS'] regexp_pattern = '[^a-zA-Z0-9]+' df_hcp_txns_new = spark.read.parquet(config.get(config_set, 'hcp.txns.base.path')) \ .filter(F.col('booking_status').isin(['CFD', 'CNX'])) \ .filter('pnr is not null or obt_pnr is not null') \ .withColumn('concat_base_OBT_PNR', F.concat('date_in', 'date_out', 'OBT_PNR')) \ .withColumn('concat_base_PNR', F.concat('date_in', 'date_out', 'PNR')) \ .withColumn('full_mk', F.when(F.col('CLIENT_BOOKING_CHANNEL').isin(channels), F.regexp_replace(F.concat('concat_base_OBT_PNR', 'HARP_PROPERTY_ID_NO', 'TRAVELLER_PORTRAIT_GUID'), regexp_pattern, '')) .otherwise(F.regexp_replace(F.concat('concat_base_PNR', 'HARP_PROPERTY_ID_NO', 'TRAVELLER_PORTRAIT_GUID'), regexp_pattern, ''))) \ .withColumn('prop_mk', F.when(F.col('CLIENT_BOOKING_CHANNEL').isin(channels), F.regexp_replace(F.concat('concat_base_OBT_PNR', 'HARP_PROPERTY_ID_NO'), regexp_pattern, '')) .otherwise(F.regexp_replace(F.concat('concat_base_PNR', 'HARP_PROPERTY_ID_NO'), regexp_pattern, ''))) \ .withColumn('pnr_mk', F.when(F.col('CLIENT_BOOKING_CHANNEL').isin(channels), F.regexp_replace('concat_base_OBT_PNR', regexp_pattern, '')) .otherwise(F.regexp_replace('concat_base_PNR', regexp_pattern, ''))) \ .withColumn('dedupe_key', F.when(F.col('CLIENT_BOOKING_CHANNEL').isin(channels), F.sha2(F.regexp_replace('concat_base_OBT_PNR', regexp_pattern, ''), 256)) .otherwise(F.sha2(F.regexp_replace('concat_base_PNR', regexp_pattern, ''), 256))) \ .withColumn('rk', F.rank().over(Window.partitionBy('dedupe_key').orderBy(F.col('lastmodified_datetime').desc()))) \ .filter('rk = 1') \ .withColumn('row_num', F.lit(9999)) dump_partitioned_dataframe(df_hcp_txns_new, ['date_created_year', 'date_created_month', 'date_created_day'], config.get(config_set, 'hcp.txns.new.path'), config.get(config_set, 'hcp.txns.new.table'))
def hash_and_register_data_tables(data_df, table_name): data_df = data_df.withColumn("hash_value", F.sha2(F.concat_ws("||", *data_df.columns), 512)) data_df.createOrReplaceTempView(table_name) print(f'row count for {table_name}: ' + str(data_df.count()))
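# Hypothetical usage of hash_and_register_data_tables (table and column names are
# illustrative; assumes `spark` and the `F` import used above):
sales_df = spark.createDataFrame([("1", "widget", "9.99")], ["id", "product", "price"])
hash_and_register_data_tables(sales_df, "sales_hashed")
spark.sql("SELECT id, hash_value FROM sales_hashed").show(truncate=False)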
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic") for row in guestTable: tableName = row if tableName == "thirdparty": sql = guestTable[tableName][ 0] + " thirdparty_etl.thirdparty_t_task " + guestTable[ tableName][1] guestPath = "s3://rupiahplus-data-warehouse/etl/banda/guest/" + tableName spark.sql(sql).write.mode("overwrite").orc(guestPath) else: #先给banda加上今天的partitions tempDataBase = " `banda-etl-s3`" guestPath = "s3://rupiahplus-data-warehouse/etl/banda/guest/" + tableName if tableName == "t_customer": sql = "select * from " + tempDataBase + "." + tableName spark.sql(sql).withColumn('mobile', F.sha2( F.col('mobile'), 256)).drop('imei').drop('password').drop( 'etldate').write.mode("overwrite").orc(guestPath) elif tableName == "t_loan_app": sql = "select * from " + tempDataBase + "." + tableName spark.sql(sql).drop('credential_no').drop( 'etldate').write.mode("overwrite").orc(guestPath) elif tableName == "t_personal_info": sql = guestTable[tableName][0] + tempDataBase + "." + tableName spark.sql(sql).drop('credential_no').drop( 'etldate').write.mode("overwrite").orc(guestPath) elif tableName == "t_auto_review_loan": sql = "select * from " + tempDataBase + "." + tableName spark.sql(sql).drop('name').drop('etldate').write.mode( "overwrite").orc(guestPath) elif tableName == "t_lpay_deposit": sql = "select * from " + tempDataBase + "." + tableName
df2_1=df2.select("entitydata").rdd.map(lambda p1:is_json(p1["entitydata"])).map(lambda g2: dict((k.lower(), unicode(v)) if type(v) != "unicode" else ((k.lower(), v)) for k, v in g2.iteritems())) l2=df2_1.collect() #l2=df2.select("entitydata").rdd.map(lambda r: json.loads(r["entitydata"])).map(lambda g2: dict((k.lower(), unicode(v)) if type(v) != "unicode" else ((k.lower(), v)) for k, v in g2.iteritems())).collect() df3_2=spark.createDataFrame(l2) #df3=df2.select("entitydata").rdd.map(lambda s:re.sub(r'(:)(null)',r'\1"NULL"',s["entitydata"])).map(lambda s2:re.sub(r'(:)(true)',r'\1"True"',s2)).map(lambda s3:re.sub(r'(:)(false)',r'\1"False"',s3)).map(lambda k: re.sub(r'(:)([0-9a-zA-Z.:]+)(,)', r'\1"\2"\3', k)).map(lambda k: re.sub(r'(:)([0-9a-zA-Z,:]+)(})', r'\1"\2"\3', k)).map(lambda d21: re.sub('[a-zA-Z"]+:', lambda m: m.group(0).lower(), d21)).map(lambda p: ast.literal_eval(p)) #.map(lambda g1: str(g1).lower()) #df3_2=df3.map(lambda v:Row(**v)).toDF() #df3_2=spark.createDataFrame(df3.collect()) df2=df2.withColumn("columnindex", row_number().over(w)) df3_2=df3_2.withColumn("columnindex", row_number().over(w)) final=df2.join(df3_2, df2.columnindex == df3_2.columnindex, 'inner').drop(df3_2.columnindex) final=final.drop('columnindex') sha_columns=df3_2.columns sha_columns.append("eventtype") sha_columns.remove("columnindex") final=final.withColumn("sha_key", sha2(concat_ws("||", *sha_columns), 256)) if "_corrupt_record_data" in final.columns: print("_corrupt_JSON_data found") bad_record_new=final.filter("_corrupt_record_data is not null").select("entitystring","process__id","gdia_load_date") bad_record_new=bad_record_new.withColumn("reason", lit("JSON parsing error in entity data")) bad_record=bad_record.union(bad_record_new) final=final.filter("_corrupt_record_data is null") final=final.drop("_corrupt_record_data") else: print(" json data is clean") final=final.drop('entitydata') final=final.drop('entitystring') if i344.lower() in tables_in_db: print("Table already existing: "+i344) col_in_new_data=[j28.lower() for j28 in final.columns] col_in_table=sqlContext.table(i344).columns
tokenizedit=tokenized.withColumn('match_deviceid_3_tokens',match_deviceid_3_tokens_udf(col('words'))) # new_expand_match=tokenizedit.join(tokens_to_match, tokenizedit.match_deviceid_3_tokens == tokens_to_match.match_deviceid_3_tokens , 'left_outer').select(tokenizedit.metadata, tokenizedit.logzio_id, tokenizedit.beat, tokenizedit.host, tokenizedit.it, tokenizedit.logzio_codec, tokenizedit.message, tokenizedit.offset, tokenizedit.source, tokenizedit.tags, tokenizedit.type, tokenizedit.messagecut , tokenizedit.words ) tokenized_validated = new_expand_match.orderBy(rand()).limit(95000) tokenized_validated.printSchema() # tokenized_validated.coalesce(1).write.json(output_file2) # Tokenize NON-Fraud-LABEL # hash the message de-duplicate those records notfraud_file=sqlContext.read.json(input_file3).repartition(50) notfraud_file.printSchema() # notfraud_df=notfraud_file\ .filter("message IS NOT NULL").filter("words IS NOT NULL")\ .withColumn('fraud_label',lit(0).cast('int'))\ .withColumn('hash_message',F.sha2(col('message'),512)).groupby(col('hash_message'))\ .agg(F.first(col('fraud_label')).alias('fraud_label'),F.first(col('words')).alias('words'),F.first(col('message')).alias('message'))\ .persist(pyspark.StorageLevel.MEMORY_AND_DISK_2) notfraud_df.printSchema() # Only the Not-Fraud are randomly sorted # from pyspark.sql.functions import rand # df_notfraud_words = notfraud_df.filter("message IS NOT NULL").select(col('fraud_label'),col('hash_message'),col('words'))\ .persist(pyspark.StorageLevel.MEMORY_AND_DISK_2) df_notfraud_words.printSchema() # # FILTER FRAUD AND LABEL # Join with Internal Curation Data in urltopredict staged folder # hash the message de-duplicate those records fraud_file=sqlContext.read.json(input_file1_playback_fraud).repartition(50)
def process_log_data(spark, input_data, output_data): """Process user log data creating the tables user, time and songplays Args: spark (SparkSession): The spark session object input_data (str): The input files path output_data (str): The output files path """ # read log data file LOGGER.info('read log data file') log_df = spark.read.json(input_data) # filter by actions for song plays LOGGER.info('filter by actions for song plays') log_df = log_df.where(F.col('page') == 'NextSong') # extract columns for users table LOGGER.info('extract columns for users table') user_table = log_df.select( ['userId', 'firstName', 'lastName', 'gender', 'level']) # write users table to parquet files LOGGER.info('write users table to parquet files') user_path = os.path.join(output_data, 'user') user_table.coalesce(1).write.mode('overwrite').parquet(user_path) # create datetime column from original timestamp column LOGGER.info('create datetime column from original timestamp column') get_timestamp = F.udf(lambda x: datetime.utcfromtimestamp(int(x) / 1000), TimestampType()) log_df = log_df.withColumn("start_time", get_timestamp("ts")) # extract columns to create time table LOGGER.info('extract columns to create time table') time_table = log_df.select( 'start_time', F.hour('start_time').alias('hour'), F.dayofmonth('start_time').alias('day'), F.weekofyear('start_time').alias('weekofyear'), F.month('start_time').alias('month'), F.year('start_time').alias('year'), F.dayofweek('start_time').alias('weekday')).drop_duplicates( ['start_time']) # write time table to parquet partitioned by year and month LOGGER.info('write time table to parquet partitioned by year and month') time_table.coalesce(1).write.mode('overwrite')\ .partitionBy('year', 'month')\ .parquet(os.path.join(output_data, 'time')) # read in song data to use for songplays table LOGGER.info('read in song data to use for songplays table') song_df = spark.read.parquet(os.path.join(output_data, 'song')) artist_df = spark.read.parquet(os.path.join(output_data, 'artist')) # join artist and song data LOGGER.info('join artist and song data') song_df = artist_df.select(['artist_name', 'artist_id'])\ .join(song_df, on='artist_id', how='inner') # extract columns from joined song and log datasets to create songplays LOGGER.info('extract columns from joined song and log datasets to create ' 'songplays') on_clause = \ (song_df.title == log_df.song) \ & (song_df.artist_name == log_df.artist) \ & (song_df.duration == log_df.length) songplays_table = log_df.join(song_df, on_clause, how='inner') # select columns and create year and month columns LOGGER.info('select columns and create year and month columns') songplays_table = songplays_table.select( 'start_time', F.col('userId').alias('user_id'), 'level', 'song_id', 'artist_id', F.col('itemInSession').alias('session_id'), 'location', F.col('userAgent').alias('user_agent'), F.month('start_time').alias('month'), F.year('start_time').alias('year')) # create songplay_id and drop duplicates by this column LOGGER.info('create songplay_id and drop duplicates by this column') key_columns = [ 'start_time', 'user_id', 'song_id', 'artist_id', 'session_id' ] songplays_table = songplays_table.withColumn( 'songplay_id', F.sha2(F.concat_ws("||", *key_columns), 256)).drop_duplicates(['songplay_id']) # write songplays table to parquet files partitioned by year and month LOGGER.info('write songplays table to parquet partitioned by year/month') songplays_table.coalesce(1).write.mode('overwrite')\ .partitionBy('year', 'month')\ 
.parquet(os.path.join(output_data, 'songplays'))
def test_functions(): df = spark.createDataFrame([{'col': 'foo'}], ['col']) rows = df.select(F.sha2(df.col, 256).alias('hashed')).collect() assert rows[ 0].hashed == '2c26b46b68ffc68ff99b453c1d30413413422d706483bfa0f98a5e886266e7ae'
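# A related sanity check (a sketch using the same `spark` and `F` as the test above):
# sha2(..., 256) returns a 64-character lowercase hex string.
length_row = spark.createDataFrame([("foo",)], ["col"]).select(
    F.length(F.sha2(F.col("col"), 256)).alias("n")).first()
assert length_row.n == 64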
# COMMAND ---------- from pyspark.sql import functions as F df = df.withColumn('swap', F.rand(2586) > 0.45) df = df.withColumn( '_first_name', F.when(F.col('swap'), F.rpad(F.substring(F.col('last_name'), 1, 1), 6, '*')).otherwise( F.rpad(F.substring(F.col('first_name'), 1, 1), 6, '*'))) df = df.withColumn( '_last_name', F.when(F.col('swap'), F.rpad(F.substring(F.col('first_name'), 1, 1), 6, '*')).otherwise( F.rpad(F.substring(F.col('last_name'), 1, 1), 6, '*'))) df = df.withColumn('_address', F.sha2(F.col('address'), 256)) # COMMAND ---------- display(df) # COMMAND ---------- # Create a view or table df.select(df._first_name, df._last_name, df._address, df.date_of_birth) \ .coalesce(1) \ .write \ .format('csv') \ .option("header", first_row_is_header) \ .option("sep", delimiter) \
# |20150722-13-3145-119-81-61-166|3323 | # |20150722-12-1630-52-74-219-71 |2967 | # |20150722-14-0015-52-74-219-71 |2907 | # |20150722-17-0015-119-81-61-166|2841 | # +------------------------------+---------------+ # 4. Find the most engaged users, ie the IPs with the longest session times # I am appending IP and Hash value of User agent, there by assuming each user agent within same ip # corresponds to different user # I am also assuming that we are trying to find most engaged users based on session times # (not across all the sessions_ids, which would become most engaged user of a day) # As I am copying my output here, I am sorting the output by duration_min mostEngaugedBaseDF = baseDF.select("timestamp", "session_id", "ip", "user_agent") mostEngaugedBaseDF = mostEngaugedBaseDF.withColumn( "user", f.concat("ip", f.lit('_'), f.sha2("user_agent", 256))) mostEngaugedDF = mostEngaugedBaseDF.groupby('user', 'session_id')\ .agg((f.max('timestamp').cast('long') - f.min('timestamp').cast('long')) / 60)\ .toDF("user", "session_id", "duration_min")\ .orderBy("duration_min", ascending=False) mostEngaugedDF.show(10, truncate=False) # +--------------------------------------------------------------------------------+--------------------------------+------------------+ # |user |session_id |duration_min | # +--------------------------------------------------------------------------------+--------------------------------+------------------+ # |111.119.199.22_f54af9f03ea52c6a4f3d0873010fa93778a1e387399baf0c331558235b47d37b |20150722-06-3145-111-119-199-22 |13.983333333333333| # |117.220.186.227_3a5a319663e42275d264c0d49636fe3673c4ace35a759d89f400715744532cbd|20150722-06-3145-117-220-186-227|13.4 | # |15.211.153.75_180050cb76309ecd4e9e895a18ed06b490500b93ab309126d91a9719e69097b7 |20150722-06-3145-15-211-153-75 |9.933333333333334 | # |119.235.53.134_3a5a319663e42275d264c0d49636fe3673c4ace35a759d89f400715744532cbd |20150722-06-3145-119-235-53-134 |9.9 | # |116.50.79.74_3a5a319663e42275d264c0d49636fe3673c4ace35a759d89f400715744532cbd |20150722-06-3145-116-50-79-74 |9.65 | # |52.74.219.71_3973e022e93220f9212c18d0d0c543ae7c309e46640da93a4a0314de999f5112 |20150722-06-3145-52-74-219-71 |9.316666666666666 |
# Build session sparkql = SparkSession.builder.master('yarn').getOrCreate() # Load passenger data bucket = <your bucket> sparkql.conf.set('temporaryGcsBucket', bucket) #this gives our job a temporary bucket to use when writing bucket_path = 'gs://{}/'.format(bucket) people_path = bucket_path + 'passengers_1k.csv' passengers_df = sparkql.read.csv(people_path, header=True) # Use withColumn and initcap to standardize the names passengers_df = passengers_df.withColumn('first_name', initcap(col('first_name')))\ .withColumn('middle_name', initcap(col('middle_name')))\ .withColumn('last_name', initcap(col('last_name'))) # Create full_name column passengers_df = passengers_df.withColumn('full_name', concat_ws(" ", col('first_name'), col('middle_name'), col('last_name'))) passengers_df = passengers_df.withColumn('uid', sha2(col('email'), 256)) bq_dataset = <your dataset> bq_table = 'passengers' passengers_df.write.format('bigquery') \ .option('table', '{}.{}'.format(bq_dataset, bq_table)) \ .save()
sparkql.conf.set('temporaryGcsBucket', bucket) bucket_path = 'gs://{}/'.format(bucket) addr_path = bucket_path + 'passengers_addrs_1k.csv' addr_df = sparkql.read.csv(addr_path, header=True) card_path = bucket_path + 'passengers_cards_1k.csv' card_df = sparkql.read.csv(card_path, header=True) # Create uid for each addr_df = addr_df.withColumn('addr_uid', sha2(concat_ws("", col("street_address"), col("city"), col("state_code"), col("from_date"), col("to_date") ), 256 )) card_df = card_df.withColumn('card_uid', sha2(concat_ws("", col("provider"), col("card_number"), col("expiration_date"), col("security_code") ), 256 ))
df = spark.read.format("com.databricks.spark.csv") \ .option("header", "true") \ .option("multiline","true") \ .option("escape", "\"") \ .schema(SCHEMA_INPUT_FILE) \ .load("/user/root/bd-platform/input/rs_aligned_json_metadata.csv") metadata_schema = StructType([ StructField("src_filename", StringType(), True), StructField("tgt_filename", StringType(), True), StructField("label", StringType(), True) ]) df = df.withColumn("metadata_json_parsed", F.from_json(df["metadata_json"], metadata_schema)) df = df.select(F.sha2(F.col("src"), 256).alias("src_hash"), "src", "tgt", "src_lang", "tgt_lang", \ "metadata_json_parsed.src_filename", \ "metadata_json_parsed.tgt_filename", \ "metadata_json_parsed.label") df.write.parquet("/user/root/hive/proto/proto.parquet") ###################################################### # Exposing in spark-sql ###################################################### # > create database test_db; # > use test_db; # > create table sentences_parquet( # src_hash string, # src string,