def get_undirected_features(paired_interactions, paired_interactions_articles):
    # Build an order-independent key for each user pair so that (a, b) and
    # (b, a) collapse onto the same row.
    features_all = (
        paired_interactions
        .withColumn('pair', f.array_sort(f.array(col('event_user_id'), col('event_user_id_r'))))
        .drop_duplicates(subset=['pair'])
        .select('pair', 'num_common_pages'))
    features_articles = (
        paired_interactions_articles
        .withColumn('pair', f.array_sort(f.array(col('event_user_id'), col('event_user_id_r'))))
        .drop_duplicates(subset=['pair'])
        .select('pair', 'num_common_articles'))  # , 'mean_concentration_ratio')
    undirected = features_all.join(features_articles, on='pair')
    return undirected
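# Usage sketch (not from the original source; toy data with the assumed column
# layout, and the module-level `f`/`col` imports used by the function above):
# both directed rows collapse onto the single unordered pair [1, 2].
def _demo_get_undirected_features(spark):
    pi = spark.createDataFrame(
        [(1, 2, 5), (2, 1, 5)],
        ['event_user_id', 'event_user_id_r', 'num_common_pages'])
    pia = spark.createDataFrame(
        [(1, 2, 3), (2, 1, 3)],
        ['event_user_id', 'event_user_id_r', 'num_common_articles'])
    get_undirected_features(pi, pia).show()
    # expected single row: ([1, 2], 5, 3)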
def column_revalue(vcf):
    # NOTE: INFO values still need a proper fix (original comment in Korean).
    name_list = ["ID", "REF", "ALT", "INFO", "FORMAT"]
    for name in name_list:
        if name == "FORMAT":
            # Merge the nested FORMAT arrays, de-duplicate and sort the keys,
            # then rebuild the field as a "GT:..." string.
            vcf = vcf.withColumn(
                name, F.array_sort(F.array_distinct(F.flatten(F.col(name)))))
            vcf = vcf.withColumn(
                name, F.concat(F.lit("GT:"), F.array_join(F.col(name), ":")))
        else:
            # For the scalar-like columns keep only the maximum array element.
            vcf = vcf.withColumn(name, F.array_max(F.col(name)))
    return vcf
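# Usage sketch (hypothetical VCF-like frame, not from the original source):
# FORMAT is flattened, de-duplicated, sorted and joined into a "GT:..."
# string; ID/REF/ALT/INFO keep their array_max.
def _demo_column_revalue(spark):
    from pyspark.sql import functions as F  # alias used by column_revalue
    vcf = spark.createDataFrame(
        [(["rs1"], ["A"], ["T"], ["DP=10"], [["DP", "GQ"], ["DP"]])],
        ["ID", "REF", "ALT", "INFO", "FORMAT"])
    column_revalue(vcf).show(truncate=False)
    # FORMAT becomes "GT:DP:GQ"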
def unique_values(df, cols):
    from functools import reduce

    # Collapse the frame to a single row whose columns hold the sorted set of
    # distinct values for each requested column.
    counts = df.groupBy(F.lit(True).alias("drop_me")).agg(
        *[F.array_sort(F.collect_set(F.col(c))).alias(c) for c in cols]
    ).drop("drop_me").cache()
    # Pivot that single row into (field, unique_vals) pairs.
    result = reduce(
        lambda l, r: l.unionAll(r),
        [counts.select(F.lit(c).alias("field"), F.col(c).alias("unique_vals"))
         for c in counts.columns]).collect()
    counts.unpersist()
    return {r[0]: r[1] for r in result}
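# Usage sketch (toy frame; not from the original source):
def _demo_unique_values(spark):
    df = spark.createDataFrame([("a", 1), ("b", 1), ("a", 2)], ["k", "v"])
    print(unique_values(df, ["k", "v"]))
    # expected: {'k': ['a', 'b'], 'v': [1, 2]}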
import matplotlib.pyplot as plt
import numpy as np
from pyspark.sql import functions as F

df.cache().count()
pairs = df.groupBy(["tract", "patch"]).count()
p = pairs.groupBy("tract").count().withColumnRenamed("count", "npatch")
p.count()

good = p.filter(p['npatch'] == 49).sort("tract")
good.count()
good.show(200)
g = good.select("tract").collect()
a = np.array([gg[0] for gg in g])

bad = p.filter(p['npatch'] != 49).sort("tract")
bad.show(200)
pairs.join(bad, "tract").groupBy("tract").agg(
    F.count("patch"),
    F.array_sort(F.collect_list("patch"))).sort("tract").show(200, truncate=False)

# geometry
geo = df.groupBy("tract").agg(
    F.avg("ra"), F.min("ra"), F.max("ra"),
    F.min("dec"), F.max("dec"), F.avg("dec"))
# join with npatch
dfj = geo.join(p, "tract")

# check bads (pandas view; renamed so it does not shadow the Spark frame `p`)
pdf = dfj.toPandas()
plt.plot(pdf["avg(ra)"], pdf["avg(dec)"], 'o')
bad_mask = (pdf.npatch != 49)
plt.plot(pdf[bad_mask]["avg(ra)"], pdf[bad_mask]["avg(dec)"], 'ro')
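# Note (assumption): the literal 49 above is the expected patches-per-tract
# for this particular skymap (a 7x7 patch grid). A small helper with that
# expectation factored out, so the check can be reused with other skymaps:
def complete_tracts(pairs, npatch_expected=49):
    """Tracts whose distinct (tract, patch) count matches the expected grid."""
    counts = pairs.groupBy("tract").count().withColumnRenamed("count", "npatch")
    return counts.filter(counts["npatch"] == npatch_expected).sort("tract")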
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang", "-l", default="enwiki", type=str,
        help="language to parse (en or enwiki)")
    parser.add_argument(
        "--start", "-t1", default=None, type=str,
        help="start day to parse [inclusive] (YYYY-MM-DD-HH); default: previous day - 7 days")
    parser.add_argument(
        "--end", "-t2", default=None, type=str,
        help="end day to parse [exclusive] (YYYY-MM-DD-HH); default: current day")
    args = parser.parse_args()

    lang = args.lang.replace('wiki', '')
    wiki_db = lang + 'wiki'
    t1 = args.start
    t2 = args.end
    if t1 is not None and t2 is not None:
        try:
            date_start = datetime.datetime.strptime(t1, '%Y-%m-%d-%H')
            date_end = datetime.datetime.strptime(t2, '%Y-%m-%d-%H')
        except ValueError:
            # abort instead of falling through with undefined dates
            raise SystemExit('Provide correct day-format YYYY-MM-DD-HH')
    else:
        date_start = datetime.date.today() - datetime.timedelta(days=8)
        date_end = datetime.date.today()
    date_start_str = date_start.strftime('%Y-%m-%d-%H')
    date_end_str = date_end.strftime('%Y-%m-%d-%H')

    #### other parameters
    ## filter pageviews from actors with more than 500 pageviews;
    ## the aim is to filter automated traffic that is not tagged as spider
    n_p_max = 500  ## maximum number of pageviews/user/day
    n_p_min = 1    ## minimum number of pageviews/user/day
    ## filtering sessions
    dt = 3600      ## cutoff for splitting sessions (interevent time between 2 pageviews)
    nlen_min = 2   ## min length of session
    nlen_max = 30  ## max length of session

    ## sessions will be saved locally in filename_save
    path_save = os.path.abspath('../output/sessions/')
    # filename_save = '%s.reading-sessions-%s--%s' % (lang, date_start_str, date_end_str)
    filename_save = 'reading-sessions-actors_%s_%s_%s' % (
        wiki_db, date_start_str, date_end_str)
    ## tmp-directory for data on hive (will be deleted)
    base_dir_hdfs = '/tmp/reader-embedding/sessions'

    ### start
    spark = SparkSession.builder\
        .master('yarn')\
        .appName('reading-sessions')\
        .enableHiveSupport()\
        .getOrCreate()

    ########
    ## query
    ################################################
    ## time-window
    ts_start = calendar.timegm(date_start.timetuple())
    ts_end = calendar.timegm(date_end.timetuple())
    row_timestamp = F.unix_timestamp(
        F.concat(F.col('year'), F.lit('-'), F.col('month'), F.lit('-'),
                 F.col('day'), F.lit(' '), F.col('hour'), F.lit(':00:00')))

    ## window for counting pageviews per actor per day
    w_p = Window.partitionBy(F.col('actor_signature_per_project_family'),
                             F.col('year'), F.col('month'), F.col('day'))

    ### actor table (filtered webrequests)
    ## https://wikitech.wikimedia.org/wiki/Analytics/Data_Lake/Traffic/Pageview_actor
    df_actor = (
        spark.read.table('wmf.pageview_actor')
        .where(row_timestamp >= ts_start)
        .where(row_timestamp < ts_end)
        .where(F.col('is_pageview') == True)
        ## agent-type user to filter spiders
        ## https://meta.wikimedia.org/wiki/Research:Page_view/Tags#Spider
        .where(F.col('agent_type') == "user")
        ## user: desktop/mobile/mobile app; isaac filters != mobile app
        .where(F.col('access_method') != "mobile app")
        ## only wikis
        .where(F.col('normalized_host.project_family') == 'wikipedia')
        ## only namespace 0
        .where(F.col('namespace_id') == 0)
        .withColumn('wiki_db',
                    F.concat(F.col('normalized_host.project'), F.lit('wiki'))))

    ## filter only a specific wiki (or keep all if wiki_db == 'wikidata')
    if wiki_db != 'wikidata':
        df_actor = df_actor.where(F.col('wiki_db') == wiki_db)

    ## checkpoint for inspecting table
    # df_actor.limit(10).write.mode('overwrite').parquet('/user/mgerlach/sessions/test.parquet')

    ## filter maximum and minimum pageviews per user;
    ## n_p is the number of pageviews per actor per day (across projects)
    df_actor = (df_actor
                .withColumn('n_p', F.sum(F.lit(1)).over(w_p))
                .where(F.col('n_p') >= n_p_min)
                .where(F.col('n_p') <= n_p_max))

    ## join the wikidata-item to each pageview;
    ## we keep only pageviews for which we have a corresponding wikidata-item id
    ## table with mapping wikidata-ids to page-ids,
    ## partitioned by wiki_db and page-id, ordered by snapshot
    w_wd = Window.partitionBy(F.col('wiki_db'), F.col('page_id')).orderBy(
        F.col('snapshot').desc())
    df_wd = (
        spark.read.table('wmf.wikidata_item_page_link')
        ## snapshot: this is a partition!
        .where(F.col('snapshot') >= '2020-07-01')
        ## resolve issues with non-matching wikidata-items;
        ## only wikis (enwiki, ... not: wikisource)
        .where(F.col('wiki_db').endswith('wiki')))
    ## filter only a specific wiki (or keep all if wiki_db == 'wikidata')
    if wiki_db != 'wikidata':
        df_wd = df_wd.where(F.col('wiki_db') == wiki_db)
    ## get the most recent wikidata-item for each pid + wikidb
    df_wd = (df_wd
             .withColumn('item_id_latest', F.first(F.col('item_id')).over(w_wd))
             .select('wiki_db', 'page_id',
                     F.col('item_id_latest').alias('item_id'))
             .drop_duplicates())

    df_actor_wd = df_actor.join(df_wd, on=['page_id', 'wiki_db'], how='inner')

    ## aggregate all pageviews with the same actor-signature across wikis to get sessions
    df_actor_wd_agg = (
        df_actor_wd.groupby('actor_signature_per_project_family').agg(
            # F.first(F.col('access_method')).alias('access_method'),  ## this could change along a session
            # F.first(F.col('geocoded_data')).alias('geocoded_data'),
            # F.first(F.col('n_p_by_user')).alias('session_length'),
            F.array_sort(
                F.collect_list(
                    F.struct(
                        F.col('ts'),
                        F.col('page_id'),
                        F.col('pageview_info.page_title').alias('page_title'),
                        F.col('wiki_db'),
                        F.col('item_id').alias('qid'),
                    ))).alias('session')))

    ## apply filters to the sessions
    os.makedirs(path_save, exist_ok=True)
    PATH_TMP = os.path.join(path_save, 'tmp')
    os.makedirs(PATH_TMP, exist_ok=True)

    ## hdfs storage; temporary files which will be deleted later
    output_hdfs_dir = os.path.join(base_dir_hdfs, filename_save)
    os.system('hadoop fs -rm -r %s' % output_hdfs_dir)
    ## local storage
    base_dir_local = path_save
    output_local_dir_tmp = os.path.join(base_dir_local, 'tmp', filename_save)
    output_local_file = os.path.join(base_dir_local, filename_save)

    ## load data
    # requests = spark.read.load(filename).rdd.map(lambda x: x['session'])
    requests = df_actor_wd_agg.rdd.map(lambda x: x['session'])
    ## keep only pageviews with a resolved page-id
    requests = requests.map(lambda rs: [r for r in rs if r['page_id'] is not None])
    to_str = lambda x: ' '.join([str(e['page_id']) for e in x])
    (requests
     .map(parse_requests)
     .filter(filter_blacklist_qid)          ## remove main_page
     .filter(lambda x: len(x) >= nlen_min)  ## only sessions with at least length nlen_min
     .map(filter_unique_articles)           ## remove repeated occurrences of the same article in a session
     .filter(lambda x: len(x) >= nlen_min)  ## only sessions with at least length nlen_min
     .flatMap(lambda x: sessionize(x, dt=dt))  ## break sessions if the interevent time is too large
     .filter(lambda x: len(x) >= nlen_min)  ## only sessions with at least length nlen_min
     .filter(lambda x: len(x) <= nlen_max)  ## only sessions with at most length nlen_max
     .map(to_str)                           ## concatenate each session into a single string
     ## write to hdfs
     .saveAsTextFile(
         output_hdfs_dir,
         compressionCodecClass="org.apache.hadoop.io.compress.GzipCodec"))

    ## copy to local (set of tmp-dirs)
    os.system('hadoop fs -copyToLocal %s %s' %
              (output_hdfs_dir, output_local_dir_tmp))
    ## concatenate and unzip into a single file
    os.system('cat %s/* | gunzip > %s' %
              (output_local_dir_tmp, output_local_file))
    ## remove set of tmp-dirs
    os.system('rm -rf %s' % output_local_dir_tmp)
    ## remove hadoop data
    os.system('hadoop fs -rm -r %s' % output_hdfs_dir)
    print('Path to reading sessions: %s' % filename_save)
    return filename_save
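# Illustration only: parse_requests, filter_blacklist_qid,
# filter_unique_articles and sessionize are helpers defined elsewhere in this
# module. A minimal sketch of what sessionize(x, dt) is assumed to do -- split
# a time-ordered pageview list whenever the gap between consecutive events
# exceeds dt seconds (field names and types are assumptions; the real helper
# may differ):
def _sessionize_sketch(pageviews, dt=3600):
    """Hypothetical stand-in: split on interevent gaps larger than dt seconds."""
    sessions, current = [], []
    for pv in pageviews:  # assumes pv['ts'] is a datetime and pageviews are sorted
        if current and (pv['ts'] - current[-1]['ts']).total_seconds() > dt:
            sessions.append(current)
            current = []
        current.append(pv)
    if current:
        sessions.append(current)
    return sessions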
def eq(self, left: IndexOpsLike, right: Any) -> SeriesOrIndex:
    if isinstance(right, (list, tuple)):
        from pyspark.pandas.series import first_series, scol_for
        from pyspark.pandas.frame import DataFrame
        from pyspark.pandas.internal import NATURAL_ORDER_COLUMN_NAME, InternalField

        if len(left) != len(right):
            raise ValueError("Lengths must be equal")

        sdf = left._internal.spark_frame
        structed_scol = F.struct(
            sdf[NATURAL_ORDER_COLUMN_NAME],
            *left._internal.index_spark_columns,
            left.spark.column,
        )
        # The size of the list is expected to be small.
        collected_structed_scol = F.collect_list(structed_scol)
        # Sort the array by NATURAL_ORDER_COLUMN so that we can guarantee the order.
        collected_structed_scol = F.array_sort(collected_structed_scol)
        right_values_scol = F.array(*(F.lit(x) for x in right))
        index_scol_names = left._internal.index_spark_column_names
        scol_name = left._internal.spark_column_name_for(left._internal.column_labels[0])
        # Compare the values of left and right by using zip_with function.
        cond = F.zip_with(
            collected_structed_scol,
            right_values_scol,
            lambda x, y: F.struct(
                *[
                    x[index_scol_name].alias(index_scol_name)
                    for index_scol_name in index_scol_names
                ],
                F.when(x[scol_name].isNull() | y.isNull(), False)
                .otherwise(x[scol_name] == y)
                .alias(scol_name),
            ),
        ).alias(scol_name)
        # 1. `sdf_new` here looks like the below (the first field of each set is Index):
        # +----------------------------------------------------------+
        # |0                                                         |
        # +----------------------------------------------------------+
        # |[{0, false}, {1, true}, {2, false}, {3, true}, {4, false}]|
        # +----------------------------------------------------------+
        sdf_new = sdf.select(cond)
        # 2. `sdf_new` after the explode looks like the below:
        # +----------+
        # |       col|
        # +----------+
        # |{0, false}|
        # | {1, true}|
        # |{2, false}|
        # | {3, true}|
        # |{4, false}|
        # +----------+
        sdf_new = sdf_new.select(F.explode(scol_name))
        # 3. Here, the final `sdf_new` looks like the below:
        # +-----------------+-----+
        # |__index_level_0__|    0|
        # +-----------------+-----+
        # |                0|false|
        # |                1| true|
        # |                2|false|
        # |                3| true|
        # |                4|false|
        # +-----------------+-----+
        sdf_new = sdf_new.select("col.*")

        index_spark_columns = [
            scol_for(sdf_new, index_scol_name) for index_scol_name in index_scol_names
        ]
        data_spark_columns = [scol_for(sdf_new, scol_name)]

        internal = left._internal.copy(
            spark_frame=sdf_new,
            index_spark_columns=index_spark_columns,
            data_spark_columns=data_spark_columns,
            index_fields=[
                InternalField.from_struct_field(index_field)
                for index_field in sdf_new.select(index_spark_columns).schema.fields
            ],
            data_fields=[
                InternalField.from_struct_field(
                    sdf_new.select(data_spark_columns).schema.fields[0]
                )
            ],
        )
        return first_series(DataFrame(internal))
    else:
        from pyspark.pandas.base import column_op

        return column_op(Column.__eq__)(left, right)
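# Usage sketch (not part of the original source): pandas-on-Spark dispatches
# elementwise `==` against a plain Python list through eq() above. Assumes a
# running Spark session and pyspark >= 3.2 (where the pandas API on Spark landed).
def _demo_eq():
    import pyspark.pandas as ps
    psser = ps.Series(["a", "b", None])
    # Null entries compare as False by construction of the `when` clause above.
    print((psser == ["a", "x", None]).tolist())  # expected: [True, False, False]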
def main(): """Main function""" # Get args args = get_args() # Azure credentials sas_token = args.sas storage_account_name = args.storage container_in = args.container_in container_out = args.container_out azure_accounts = list() azure_accounts.append({ "storage": storage_account_name, "sas": sas_token, "container": container_in }) azure_accounts.append({ "storage": storage_account_name, "sas": sas_token, "container": container_out }) # VM cores = args.vm_cores ram = args.vm_ram shuffle_partitions = args.shuffle_partitions # Geohash file path geohash_path = args.geohashpath # Date, country, prefix country = args.country date_string = args.date prefix = args.prefix # Set date variables day_time = datetime.strptime(date_string, "%Y-%m-%d") year = day_time.year month = day_time.month day = day_time.day # stop config seconds = 60 accuracy = args.accuracy roam_dist = args.roam_dist min_stay = args.min_stay overlap_hours = args.overlap_hours # Path in - path out blob_in = f"wasbs://{container_in}@{storage_account_name}.blob.core.windows.net/preprocessed/{country}/" path_out = f"stoplocation-v{VERSION}_r{roam_dist}-s{min_stay}-a{accuracy}-h{overlap_hours}/{country}" if prefix: path_out = f"stoplocation-v{VERSION}_prefix_r{roam_dist}-s{min_stay}-a{accuracy}-h{overlap_hours}/{country}" # config spark conf = getSparkConfig(cores, ram, shuffle_partitions, azure_accounts) # Create spark session sc = SparkContext(conf=conf).getOrCreate() sqlContext = SQLContext(sc) spark = sqlContext.sparkSession # Init azure client blob_service_client = BlobServiceClient.from_connection_string( CONN_STRING.format(storage_account_name, sas_token)) # build keys, date is mandatory, prefix opt partition_key = "year={}/month={}/day={}".format(year, month, day) if prefix: partition_key = "year={}/month={}/day={}/prefix={}".format( year, month, day, prefix) blob_base = "{}/{}".format(path_out, partition_key) # # check for skip # TODO # skip = False print("process " + partition_key + " to " + blob_base) start_time = time.time() local_dir = LOCAL_PATH + partition_key print("write temp to " + local_dir) # cleanup local if exists if (os.path.isdir(local_dir)): map(os.unlink, (os.path.join(local_dir, f) for f in os.listdir(local_dir))) # TODO cleanup remote if exists # Output schema schema = ArrayType( StructType([ #StructField('device_type', IntegerType(), False), StructField('serial', IntegerType(), False), StructField('latitude', DoubleType(), False), StructField('longitude', DoubleType(), False), StructField('begin', TimestampType(), False), StructField('end', TimestampType(), False), StructField('personal_area', BooleanType(), False), StructField('distance', DoubleType(), False), StructField('geohash6', StringType(), False), StructField('after_stop_distance', DoubleType(), False) ])) spark_get_stop_location = udf( lambda z: get_stop_location(z, roam_dist, min_stay), schema) # Geohash file print("read geohash parquet") csv_time = time.time() dfs_us_states = spark.read.format("parquet").load(geohash_path) # states = [s.STUSPS for s in dfs_us_states.select( # 'STUSPS').distinct().collect()] dfs_us_states = dfs_us_states.select( col('STUSPS').alias('state'), col('geohash').alias('geohash5')) dfs_us_states = dfs_us_states.drop_duplicates(subset=['geohash5']) # Input dataset print("read dataset table") read_time = time.time() # dfs = spark.read.format("parquet").load(blob_in) # # apply partition filter # dfs_partition = dfs.where( # f"(year = {year} AND month = {month} AND day = {day} AND prefix = '{prefix}')") # read only 
partition to reduce browse time dfs_cur_partition = spark.read.format("parquet").load( f"{blob_in}/{partition_key}") # lit partition filters as data dfs_cur_partition = dfs_cur_partition.withColumn('year', F.lit(year)) dfs_cur_partition = dfs_cur_partition.withColumn('month', F.lit(month)) dfs_cur_partition = dfs_cur_partition.withColumn('day', F.lit(day)) if prefix: dfs_cur_partition = dfs_cur_partition.withColumn( 'prefix', F.lit(prefix)) # read next day for overlap next_day = day_time + timedelta(days=1) next_partition_key = "year={}/month={}/day={}".format( next_day.year, next_day.month, next_day.day) if prefix: next_partition_key = "year={}/month={}/day={}/prefix={}".format( next_day.year, next_day.month, next_day.day, prefix) dfs_next_partition = spark.read.format("parquet").load( f"{blob_in}/{next_partition_key}") dfs_next_partition = dfs_next_partition.where( F.hour("timestamp") <= (overlap_hours - 1)) # lit partition filters as data dfs_next_partition = dfs_next_partition.withColumn('year', F.lit(next_day.year)) dfs_next_partition = dfs_next_partition.withColumn('month', F.lit(next_day.month)) dfs_next_partition = dfs_next_partition.withColumn('day', F.lit(next_day.day)) if prefix: dfs_next_partition = dfs_next_partition.withColumn( 'prefix', F.lit(prefix)) # union with overlap dfs_partition = dfs_cur_partition.unionAll(dfs_next_partition) print("process with spark") spark_time = time.time() # select columns dfs_partition = dfs_partition.select( 'prefix', 'userID', 'timestamp', 'latitude', 'longitude', (F.when(col('opt1') == 'PERSONAL_AREA', True).otherwise(False)).alias('personal_area'), 'accuracy') # keep only data with required accuracy dfs_partition = dfs_partition.where((col('accuracy') <= accuracy) & (col('accuracy') >= 0)) # stats - enable only for debug! # num_inputs = dfs_partition.count() # print(f"read {num_inputs} rows from "+partition_key) # Lowering the granularity to 1 minutes # explicitely convert to timestamp #dfs_partition = dfs_partition.withColumn('timestamp', col('timestamp').cast('timestamp')) seconds_window = F.unix_timestamp( 'timestamp') - F.unix_timestamp('timestamp') % seconds w = Window().partitionBy('userID', seconds_window).orderBy('accuracy') dfs_partition = dfs_partition.withColumn( 'rn', F.row_number().over(w).cast('int')).where(col('rn') == 1).drop('rn') # Radians lat/lon dfs_partition = dfs_partition.withColumn('latitude', F.radians('latitude')).withColumn( 'longitude', F.radians('longitude')) # Groups GPS locations into chucks. 
A chunk is formed by groups of points that are distant no more than roam_dist w = Window.partitionBy(['prefix', 'userID']).orderBy('timestamp') dfs_partition = dfs_partition.withColumn('next_lat', F.lead('latitude', 1).over(w)) dfs_partition = dfs_partition.withColumn('next_lon', F.lead('longitude', 1).over(w)) # Haversine distance dfs_partition = dfs_partition.withColumn( 'distance_next', EARTH_RADIUS * 2 * F.asin( F.sqrt( F.pow(F.sin((col('next_lat') - col('latitude')) / 2.0), 2) + F.cos('latitude') * F.cos('next_lat') * F.pow(F.sin((col('next_lon') - col('longitude')) / 2.0), 2)))) dfs_partition = dfs_partition.withColumn( 'distance_prev', F.lag('distance_next', default=0).over(w)) # Chunks dfs_partition = dfs_partition.withColumn( 'chunk', F.when(col('distance_prev') > roam_dist, 1).otherwise(0)) windowval = (Window.partitionBy( 'prefix', 'userID').orderBy('timestamp').rangeBetween(Window.unboundedPreceding, 0)) dfs_partition = dfs_partition.withColumn( 'chunk', F.sum('chunk').over(windowval).cast('int')) # Remove chunks of the next day w = Window.partitionBy(['prefix', 'userID', 'chunk']) dfs_partition = dfs_partition.withColumn( 'min_timestamp', F.dayofmonth(F.min('timestamp').over(w))) dfs_partition = dfs_partition.where( col('min_timestamp') == day).drop('min_timestamp') # Get the stops result_df = dfs_partition.groupBy('prefix', 'userID', 'chunk').agg( F.array_sort( F.collect_list( F.struct('timestamp', 'latitude', 'longitude', 'distance_prev', 'personal_area'))).alias('gpsdata'), F.sum('distance_prev').alias('dist_sum')) result_df = result_df.withColumn('gpsdata', spark_get_stop_location('gpsdata')) result_df = result_df.select('userID', 'chunk', F.explode_outer('gpsdata').alias('e'), 'dist_sum') result_df = result_df.select( 'userID', 'chunk', col('e.latitude').alias('latitude'), col('e.longitude').alias('longitude'), col('e.begin').alias('begin'), col('e.end').alias('end'), col('e.personal_area').alias('personal_area'), col('e.geohash6').alias('geohash6'), col('e.serial').alias('serial'), col('e.distance').alias('stop_distance'), col('e.after_stop_distance').alias('after_stop_distance'), 'dist_sum') result_df = result_df.fillna(0, subset=['after_stop_distance']) # Remove all those stop that start the next day result_df = result_df.where((col('begin').isNull()) | (F.dayofmonth('begin') != next_day.day)) result_df = result_df.withColumn( 'isStop', F.when(col('serial').isNotNull(), 1).otherwise(0)) result_df = result_df.withColumn( 'dist_sum', F.when(col('isStop') == 1, col('stop_distance')).otherwise(col('dist_sum'))) windowval = (Window.partitionBy('userId').orderBy( 'chunk', 'serial').rowsBetween(Window.currentRow, Window.unboundedFollowing)) result_df = result_df.withColumn('isStop_cum', F.sum('isStop').over(windowval)) result_df = result_df.groupBy('userId', 'isStop_cum').agg( F.first('latitude', ignorenulls=True).alias('latitude'), F.first('longitude', ignorenulls=True).alias('longitude'), F.first('begin', ignorenulls=True).alias('begin'), F.first('end', ignorenulls=True).alias('end'), F.first('personal_area', ignorenulls=True).alias('personal_area'), F.first('geohash6', ignorenulls=True).alias('geohash6'), F.sum('dist_sum').alias('prev_travelled_distance'), F.sum('after_stop_distance').alias('after_stop_distance')) # compute next distance, which is null if it's the last windowval = Window.partitionBy('userId').orderBy(F.desc('isStop_cum')) result_df = result_df.withColumn( 'next_travelled_distance', F.lead('prev_travelled_distance').over(windowval)) result_df = 
result_df.withColumn( 'next_travelled_distance', F.when((col('next_travelled_distance').isNull()) & (col('after_stop_distance') > 0), col('after_stop_distance')).otherwise( col('next_travelled_distance'))) # Drop nulls result_df = result_df.dropna(subset=['latitude']).drop('isStop_cum') # Transform latitude and longitude back to degrees result_df = result_df.withColumn('latitude', F.degrees('latitude')) result_df = result_df.withColumn('longitude', F.degrees('longitude')) # US states result_df = result_df.withColumn( "geohash5", F.expr("substring(geohash6, 1, length(geohash6)-1)")) result_df = result_df.join(F.broadcast(dfs_us_states), on="geohash5", how="inner").drop('geohash5') # lit partition data - enable only if added to partitionBy # result_df = result_df.withColumn('year', F.lit(year)) # result_df = result_df.withColumn('month', F.lit(month)) # result_df = result_df.withColumn('day', F.lit(day)) # write out_partitions = len(US_STATES) result_df.repartition(out_partitions, "state").write.partitionBy( "state").format('parquet').mode("overwrite").save(local_dir + "/") # stats - enable only for debug! # num_records = result_df.count() # print(f"written {num_records} rows to "+local_dir) # if num_records == 0: # raise Exception("Zero rows output") print("upload local data to azure") upload_time = time.time() # upload parts over states for state in US_STATES: print(f"upload files for {state}") state_dir = local_dir + "/state=" + state state_key = f"{partition_key}/state={state}/" if (os.path.isdir(state_dir)): files = [ filename for filename in os.listdir(state_dir) if filename.startswith("part-") ] if len(files) > 0: for file_local in files: file_path = state_dir + "/" + file_local part_num = int(file_local.split('-')[1]) part_key = '{:05d}'.format(part_num) # fix name as static hash to be reproducible filename_hash = hashlib.sha1( str.encode(state_key + part_key)).hexdigest() blob_key = "{}/state={}/part-{}-{}.snappy.parquet".format( blob_base, state, part_key, filename_hash) print("upload " + file_path + " to " + container_out + ":" + blob_key) blob_client = blob_service_client.get_blob_client( container_out, blob_key) with open(file_path, "rb") as data: blob_client.upload_blob(data, overwrite=True) # cleanup os.remove(file_path) else: print(f"no files to upload for {state}") else: print(f"missing partition for {state}") print("--- {} seconds elapsed ---".format(int(time.time() - start_time))) print() stop_time = time.time() spark.stop() end_time = time.time() print("Done in {} seconds (csv:{} read:{} spark:{} upload:{} stop:{})". format(int(end_time - start_time), int(read_time - csv_time), int(spark_time - read_time), int(upload_time - spark_time), int(stop_time - upload_time), int(end_time - stop_time))) print('Done.')
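# Illustration only (not part of the original script): a pure-Python
# equivalent of the Haversine expression above, handy for sanity-checking a
# single pair of points. Inputs are in degrees; the radius must be in the same
# unit as roam_dist (meters are assumed here).
import math

def haversine_sketch(lat1, lon1, lat2, lon2, earth_radius=6371000.0):
    """Great-circle distance between two (lat, lon) points given in degrees."""
    lat1, lon1, lat2, lon2 = map(math.radians, (lat1, lon1, lat2, lon2))
    a = (math.sin((lat2 - lat1) / 2.0) ** 2
         + math.cos(lat1) * math.cos(lat2) * math.sin((lon2 - lon1) / 2.0) ** 2)
    return earth_radius * 2 * math.asin(math.sqrt(a))

# e.g. haversine_sketch(48.8566, 2.3522, 51.5074, -0.1278) -> ~344 km (Paris-London)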
# Extract event sequences and groundtruth
udf_normalize = F.udf(
    lambda x: [[
        (x[i][0] - x[0][0] + (x[-1][0] - x[0][0]) / (len(x) - 1)) / args.time_divisor,
        float(x[i][1]),
    ] for i in range(len(x))],
    psql.types.ArrayType(psql.types.ArrayType(psql.types.FloatType())),
)

with Timer("extract event sequences"):
    event_seqs = (
        df_filtered
        .withColumn("phrase", F.explode("phrases"))
        .withColumn("event", F.array("ts", "type"))
        .groupby("phrase")
        .agg(F.array_sort(F.collect_set("event")).alias("event_seq"))
        .filter(F.size("event_seq").between(args.min_seq_length, args.max_seq_length))
        .withColumn("event_seq", udf_normalize("event_seq"))
    ).persist()

event_seqs.limit(5).toPandas()

# seq_lengths = (
#     event_seqs.select("phrase", F.size("event_seq").alias("size"))
#     .groupby("size")
#     .count()
#     .sort("size")
# )
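# Illustration only: what udf_normalize computes for one event sequence.
# Events are (timestamp, type) pairs; times are shifted so the first event
# sits at one mean inter-event gap rather than at zero, then scaled by
# time_divisor (the value 10.0 below is just an example).
def normalize_sketch(events, time_divisor):
    t0, t_last = events[0][0], events[-1][0]
    offset = (t_last - t0) / (len(events) - 1)  # mean gap, keeps the first time > 0
    return [[(t - t0 + offset) / time_divisor, float(k)] for t, k in events]

# normalize_sketch([(0, 1), (10, 2), (30, 1)], time_divisor=10.0)
# -> [[1.5, 1.0], [2.5, 2.0], [4.5, 1.0]]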