def analysis_6(units_df, primary_person_df, log):
    """Answer Query 6 using two approaches.

    :param units_df: DataFrame Units_use.
    :param primary_person_df: DataFrame Primary_Person_use.
    :param log: Logger.
    :return: None
    """
    joined = units_df.alias("U").join(primary_person_df.alias("P"),
                                      col("P.CRASH_ID") == col("U.CRASH_ID"))

    # APPROACH 1 - BY ALCOHOL TEST RESULT
    crashes_by_zip = (
        joined
        .filter(col("PRSN_ALC_RSLT_ID") == "Positive")
        .filter(~col("VEH_BODY_STYL_ID").contains("MOTORCYCLE"))
        .filter(col("DRVR_ZIP").isNotNull())
        .groupBy("DRVR_ZIP").agg(size(collect_set(col("P.CRASH_ID"))).alias("CRASHES"))
    )
    zip_ordered_by_crashes = crashes_by_zip.withColumn(
        "rnk", dense_rank().over(Window.orderBy(desc("CRASHES"))))
    top5_zip = zip_ordered_by_crashes.filter(col("rnk") < 6).select("DRVR_ZIP", "CRASHES")

    # APPROACH 2 - BY CONTRIBUTING FACTOR
    contributing_factor_alcohol = (
        joined
        .filter(col("CONTRIB_FACTR_1_ID").contains("ALCOHOL")
                | col("CONTRIB_FACTR_2_ID").contains("ALCOHOL")
                | col("CONTRIB_FACTR_P1_ID").contains("ALCOHOL"))
        .filter(col("DRVR_ZIP").isNotNull())
        .groupBy("DRVR_ZIP").agg(size(collect_set(col("P.CRASH_ID"))).alias("CRASHES"))
    )
    zip_by_crashes = contributing_factor_alcohol.withColumn(
        "rnk", dense_rank().over(Window.orderBy(desc("CRASHES"))))
    top5 = zip_by_crashes.filter(col("rnk") < 6).select("DRVR_ZIP", "CRASHES")

    log.warn("Results for Query 6")
    top5_zip.show(10, False)
    top5.show(10, False)
    return None
def prepare_firmware_cve_counts(firmware_cves_df: DataFrame, firmware_hashes_df: DataFrame) -> DataFrame:
    # yapf: disable
    # Ensure that the windows for each of low, med, hi, and crit are over the entire firmware space
    # instead of just those for which a CVE is known to exist
    firmware_cves_full_df = firmware_hashes_df.join(
        firmware_cves_df, 'firmware_hash', 'left'
    ).na.fill(0)

    low_window = Window.orderBy(firmware_cves_full_df['low'])
    med_window = Window.orderBy(firmware_cves_full_df['medium'])
    high_window = Window.orderBy(firmware_cves_full_df['high'])
    crit_window = Window.orderBy(firmware_cves_full_df['critical'])

    # Weighted average of the per-severity percentile ranks, sum(wi*xi)/sum(wi), with weights 1..4
    cve_composite_score = (percent_rank().over(low_window) +
                           (2 * percent_rank().over(med_window)) +
                           (3 * percent_rank().over(high_window)) +
                           (4 * percent_rank().over(crit_window))) / 10

    fwc_with_score_df = firmware_cves_full_df.withColumn(
        'firmware_cve_component_score', cve_composite_score
    ).select(
        'firmware_hash', 'firmware_cve_component_score'
    )
    # yapf: enable
    return fwc_with_score_df
def initialize_edges(self, vertices):
    # Randomly sample source vertices and index them by row number
    src = vertices.select(F.col("id")).orderBy(F.rand()).limit(self.nbr_edges).withColumnRenamed("id", "src") \
        .withColumn("id", F.row_number().over(Window.orderBy(F.monotonically_increasing_id())))
    src.createOrReplaceTempView("src")
    vertices.createOrReplaceTempView("vertices")

    # Candidate destinations are all vertices that were not picked as sources
    query = self.spark.sql("select vertices.id from vertices minus select src.src from src")
    dst = query.orderBy(F.rand()).limit(self.nbr_edges).withColumnRenamed("id", "dst") \
        .withColumn("id", F.row_number().over(Window.orderBy(F.monotonically_increasing_id())))

    # Pair the i-th source with the i-th destination
    self.edges = src.join(dst, src.id == dst.id).select(F.col('src'), F.col('dst')) \
        .persist(StorageLevel.MEMORY_AND_DISK)
    return self.edges
def accomodate_people(df: DataFrame):
    """Calculate accommodation possibilities for the lowest price and highest rating.

    Args:
        df (DataFrame): spark dataframe
    """
    (df.withColumn("rank_ranking", f.rank().over(Window.orderBy(f.desc("review_scores_value"))))
       .withColumn("price_ranking", f.rank().over(Window.orderBy("price")))
       .filter((f.col("rank_ranking") == 1) & (f.col("price_ranking") == 1))
       .select(f.col("accommodates"))
       .coalesce(1)
       .write
       .option("header", "true")
       .format('csv')
       .save('out/out_2_4.txt'))
def main(sc, out_file_name):
    """
    Read GDELT data from S3, count occurrences of news sources,
    determine the top 100 most frequent ones, and write the list to out_file_name
    """
    # Read 'GKG' table from the GDELT S3 bucket and transform it into an RDD
    gkgRDD = sc.textFile('s3a://gdelt-open-data/v2/gkg/2018*.gkg.csv')
    gkgRDD = gkgRDD.map(lambda x: x.encode("utf-8", "ignore"))
    gkgRDD.cache()
    gkgRDD = gkgRDD.map(lambda x: x.split('\t'))
    gkgRDD = gkgRDD.filter(lambda x: len(x) == 27)
    gkgRDD = gkgRDD.filter(lambda x: f.is_not_empty([x[3]]))
    gkgRowRDD = gkgRDD.map(lambda x: Row(src_common_name=x[3]))

    sqlContext = SQLContext(sc)

    # Transform RDDs to dataframes
    gkgDF = sqlContext.createDataFrame(gkgRowRDD)

    # Frequency count for each source
    srcDF = gkgDF.select('src_common_name').groupBy('src_common_name').agg(
        count('*').alias('count'))

    # Select the top 100 most frequent sources and write them to the output file
    window = Window.orderBy(srcDF['count'].desc())
    rankDF = srcDF.select(
        '*', rank().over(window).alias('rank')).filter(col('rank') <= 100).where(
            col('src_common_name') != '')
    pandasDF = rankDF.toPandas()
    pandasDF.to_csv(out_file_name, columns=["src_common_name", "count", "rank"])
def test_window_functions_without_partitionBy(self):
    df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    w = Window.orderBy("key", df.value)
    from pyspark.sql import functions as F
    sel = df.select(
        df.value,
        df.key,
        F.max("key").over(w.rowsBetween(0, 1)),
        F.min("key").over(w.rowsBetween(0, 1)),
        F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
        F.rowNumber().over(w),
        F.rank().over(w),
        F.denseRank().over(w),
        F.ntile(2).over(w),
    )
    rs = sorted(sel.collect())
    expected = [
        ("1", 1, 1, 1, 4, 1, 1, 1, 1),
        ("2", 1, 1, 1, 4, 2, 2, 2, 1),
        ("2", 1, 2, 1, 4, 3, 2, 2, 2),
        ("2", 2, 2, 2, 4, 4, 4, 3, 2),
    ]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[:len(r)])
def main():
    spark, sc = spark_init()

    schemaDF = StructType([
        StructField("empid", StringType(), True),
        StructField("name", StringType(), True),
        StructField("rank", DoubleType(), True),
        StructField("salary", StringType(), True),
        StructField("mgrid", StringType(), True)
    ])

    empDfBr = spark.read \
        .option("header", "true") \
        .schema(schemaDF) \
        .option("sep", ",") \
        .format("csv") \
        .load(r"C:\data\retail_db\order_items\employee2.txt")
    # broadcastEmpDfBr = sc.broadcast(empDfBr)

    # Self-join employees to their managers
    df1 = F.broadcast(empDfBr).alias("a") \
        .join(F.broadcast(empDfBr).alias("b"), F.col("a.mgrid") == F.col("b.empid"), "leftouter") \
        .select(F.col("a.*"), F.col("b.name").alias("manager_name"))

    # Manager whose reports have the highest combined salary
    df1.withColumn("total_salary", F.sum(F.col("salary")).over(Window.partitionBy(F.col("manager_name")))) \
        .withColumn("rownum", F.row_number().over(Window.orderBy(F.col("total_salary").desc()))) \
        .filter(F.col("rownum") == 1).select("mgrid", "manager_name").show()

    # Highest salary among the reports of each manager
    df1.groupBy(F.col("mgrid")).agg(F.max(F.col("salary")).alias("max_sal")).filter(F.col("mgrid").isNotNull()) \
        .select("mgrid", "max_sal").collect()
def city_process(input_data, output_data, spark):
    demo_df = spark.read.format('csv').load(os.path.join(
        input_data, 'demographics/*.csv'), header=True, inferSchema=True, sep=';')

    # cut down table and rename columns
    demo_df = demo_df.select('City', 'State Code') \
        .withColumnRenamed('City', 'city') \
        .withColumnRenamed('State Code', 'state_code')

    # read in airport data to get US cities
    df_air = spark.read.format('csv').load(os.path.join(
        input_data, 'airports/*.csv'), header=True, inferSchema=True)

    # filter down to only US cities
    us_air = df_air.filter(df_air.iso_country == 'US')

    # apply function and rename municipality
    us_air = us_air.withColumn('state_code', region_state(col('iso_region'))) \
        .withColumnRenamed('municipality', 'city') \
        .select('city', 'state_code')

    # combine the two dfs to create the final city table;
    # monotonically_increasing_id fixes an ordering, row_number assigns the contiguous city_id key
    city = us_air.union(demo_df) \
        .drop_duplicates() \
        .withColumn('row_order', F.monotonically_increasing_id()) \
        .withColumn('city_id', F.row_number().over(W.orderBy('row_order'))) \
        .select('city_id', 'city', 'state_code')

    # write final df to s3 processed path
    city.write.mode('overwrite').parquet(os.path.join(output_data, 'city/'))
def als_model():
    user_inventory = spark.sql("SELECT * FROM userinfo").filter(
        'playtime_forever > 0')

    # Map the string userid onto a dense integer id as required by ALS
    ratings = user_inventory.withColumn(
        "user", f.dense_rank().over(Window.orderBy("userid")))
    correspond = ratings.select('userid', 'user').dropDuplicates()

    als = ALS(userCol="user", itemCol="appid", ratingCol="playtime_forever")
    model = als.fit(ratings)
    model.save("als_model")

    top20 = model.recommendForAllUsers(20)
    recommend = top20.join(correspond, top20.user == correspond.user).select(
        'userid', 'recommendations')
    recommendList = recommend.rdd.map(
        lambda x: (x[0], [appid[0] for appid in x[1]])).collect()

    for r in recommendList:
        userid = r[0]
        idList = r[1]
        spark.sql("INSERT INTO als_top20 VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', \
            '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" %
                  (userid, idList[0], idList[1], idList[2], idList[3], idList[4], idList[5],
                   idList[6], idList[7], idList[8], idList[9], idList[10], idList[11],
                   idList[12], idList[13], idList[14], idList[15], idList[16],
                   idList[17], idList[18], idList[19]))
def generate_aggregate_player_data(shots_fixed):
    """
    Generates aggregate information and a unique numerical id for NBA players based on shots taken

    Parameters
    ----------
    shots_fixed: Dataframe containing NBA player shot data and the following columns:
        'GAME_ID', 'GAME_EVENT_ID', 'PLAYER_ID', 'PLAYER_NAME', 'EVENT_TYPE',
        'LOC_X', 'LOC_Y', and 'SHOT_DISTANCE'

    Returns
    -------
    Aggregated player data
    """
    # filter to relevant columns
    result = shots_fixed.select('GAME_ID', 'GAME_EVENT_ID', 'PLAYER_ID', 'PLAYER_NAME',
                                'EVENT_TYPE', 'LOC_X', 'LOC_Y', 'SHOT_DISTANCE')

    # categorize shots: 0 = missed, 1 = made
    result = result.withColumn('EVENT_TYPE', F.when(F.col('EVENT_TYPE') == 'Missed Shot', 0).otherwise(1))

    # aggregate all player data
    result = result.groupBy('PLAYER_ID', 'PLAYER_NAME').pivot('EVENT_TYPE').count()
    result = result.withColumnRenamed('0', 'missed_shot').withColumnRenamed('1', 'made_shot')

    # sort and add a unique, zero-based numerical id for training in TensorFlow
    w1 = Window.orderBy("PLAYER_ID")
    result = result.withColumn('rank', F.rank().over(w1))
    result = result.withColumn('rank', F.col('rank') - 1)

    return result
def train_test_split(full_data, country, feature_list, time_horizon):
    target_label = country + "_cases"
    prev = country + "_1lag"
    w = Window.orderBy(col("Date"))

    # Assemble the feature vector; the selected country's case counts are the target
    vectorAssembler = VectorAssembler(inputCols=feature_list, outputCol='features')
    vdata = vectorAssembler.transform(full_data) \
        .withColumnRenamed(target_label, "actual_orig") \
        .withColumn("actual", lead(col("actual_orig"), time_horizon).over(w)) \
        .withColumn("diff", col("actual_orig") - col(prev)) \
        .withColumn("actual_diff", lead(col("diff"), time_horizon).over(w)) \
        .filter(col("actual_diff").isNotNull())

    train = vdata.select(
        ['features', "Date", "actual_orig", "diff", "actual", "actual_diff"]).filter(col("Date") < SPLIT_DT)
    test = vdata.select(
        ['features', "Date", "actual_orig", "diff", "actual", "actual_diff"]).filter(col("Date") >= SPLIT_DT)

    print("Total days: " + str(vdata.count()))
    print("Total days for train dataset: " + str(train.count()))
    print("Total days for test dataset: " + str(test.count()))
    return train, test
def data_reduction(table, limite_superior, limite_inferior, KPI):
    # Flag samples where the KPI is within the acceptable range
    data_alg = table.select('date_time', 'sector_id', KPI).withColumn(
        'condition', table[KPI].between(limite_inferior, limite_superior))

    # Store a numeric version of True/False
    h = data_alg.withColumn('numero', F.when(data_alg['condition'] == True, 1).otherwise(-1))

    # Keep the previous observation's value so we know the prior state
    df_lag = h.withColumn('estado_anterior', F.lag(h['numero']).over(Window.orderBy('sector_id')))

    # The "derivative" is the difference between the current value and the prior state
    result = df_lag.withColumn('derivada', (df_lag['numero'] - df_lag['estado_anterior']))

    # A degradation starts where the derivative is -2; keep that date_time
    g = result.withColumn('Start', F.when(result['derivada'] == -2, result.date_time))

    # A degradation ends where the derivative is +2; keep that date_time
    s = g.withColumn('End', F.when(g['derivada'] == 2, result.date_time))

    PERIODO = s.select(s.Start, s.End).dropna(how='all')
    Start = PERIODO.select(PERIODO.Start).dropna(how='any')
    End = PERIODO.select(PERIODO.End).dropna(how='any')

    # Pair the i-th start with the i-th end (columns from two different dataframes
    # cannot be combined with withColumn, so join them on a row number instead)
    w_row = Window.orderBy(F.monotonically_increasing_id())
    Start = Start.withColumn('rn', F.row_number().over(w_row))
    End = End.withColumn('rn', F.row_number().over(w_row))
    final = Start.join(End, 'rn').drop('rn')

    display(final)
def getClusterData(amenities):
    n_clust = 5

    # Fit a KMeans model on the amenity coordinates and label each point
    x = amenities.select('lat', 'lon').collect()
    model = KMeans(n_clusters=n_clust, random_state=353).fit(x)
    clusters = model.predict(x)
    cluster = clusters.tolist()
    centres = model.cluster_centers_

    # convert the cluster list to a dataframe with a zero-based row index
    df = sqlContext.createDataFrame([(l, ) for l in cluster], ['cluster'])
    df = df.withColumn(
        "index",
        f.row_number().over(Window.orderBy(f.monotonically_increasing_id())) - 1)

    # attach the cluster label back onto each amenity
    amnt = amenities.join(df, amenities.amnt_id == df.index).drop("index", 'amnt_id')
    # amnt.show()

    lat = amnt.select('lat').collect()
    lon = amnt.select('lon').collect()
    cluster = amnt.select('cluster').collect()
    return lat, lon, cluster, centres, amnt
def get_recordings_df(mapped_listens_df, metadata, save_path):
    """ Prepare recordings dataframe.

        Args:
            mapped_listens_df (dataframe): listens mapped with msid_mbid_mapping.
            metadata (dict): metadata dict, updated with the recordings count.
            save_path (str): path where recordings_df should be saved.

        Returns:
            recordings_df: Dataframe containing distinct recordings and corresponding
                mbids and names.
    """
    recording_window = Window.orderBy('mb_recording_mbid')

    recordings_df = mapped_listens_df.select('mb_artist_credit_id',
                                             'mb_artist_credit_mbids',
                                             'mb_recording_mbid',
                                             'mb_release_mbid',
                                             'msb_artist_credit_name_matchable',
                                             'msb_recording_name_matchable') \
        .distinct() \
        .withColumn('recording_id', rank().over(recording_window))

    metadata['recordings_count'] = recordings_df.count()
    save_dataframe(recordings_df, save_path)
    return recordings_df
def calculate_distance(enhanced_checkin_data):
    """
    Create a distance (km) column in a PySpark dataframe

    Parameters
    ----------
    enhanced_checkin_data: Dataframe that has gone through the parse_and_clean_swarm_venue_responses
        function or has the following equivalent columns: createdAt, lat, lng

    Returns
    -------
    Same dataframe with an additional column called distance_in_km
    """
    # remove records that do not have geocoordinates
    result = enhanced_checkin_data.filter(F.col('lat').isNotNull())

    # add the prior latitude and longitude as new columns for every record
    w1 = Window.orderBy(result.createdAt.asc())
    result = result.withColumn('prior_latitude', F.lag(F.col('lat'), 1).over(w1)) \
                   .withColumn('prior_longitude', F.lag(F.col('lng'), 1).over(w1)) \
                   .withColumn('prior_name', F.lag(F.col('name'), 1).over(w1)) \
                   .withColumn('prior_country', F.lag(F.col('country'), 1).over(w1))

    # remove the first data point, since it has no prior point to measure a distance from
    result = result.filter(F.col('prior_latitude').isNotNull())

    # calculate the distance in km using the udf defined below
    result = result.withColumn('distance_in_km',
                               calculate_distance_udf('lat', 'lng', 'prior_latitude', 'prior_longitude'))
    return result
def read_glove_vecs(glove_file, output_path):
    rdd = sc.textFile(glove_file)
    row = Row("glovevec")
    df = rdd.map(row).toDF()

    # Each line is "<word> <v1> <v2> ..."; split off the word and parse the vector
    split_col = F.split(F.col('glovevec'), " ")
    df = df.withColumn('word', split_col.getItem(0))
    df = df.withColumn('splitted', split_col)
    vec_udf = F.udf(lambda row: [float(i) for i in row[1:]], ArrayType(FloatType()))
    df = df.withColumn('vec', vec_udf(F.col('splitted')))
    df = df.drop('splitted', "glovevec")

    # Assign an integer id to each word, ordered alphabetically
    w = Window.orderBy(["word"])
    qdf = df.withColumn('vec', F.concat_ws(',', 'vec')).withColumn("id", F.row_number().over(w))

    path = '{}/words'.format(output_path)
    qdf.coalesce(1).write.format('csv').option("sep", "\t").option('header', 'true').save(path)
    print('Words saved to: "{}"'.format(path))

    list_words = list(map(lambda row: row.asDict(), qdf.collect()))
    word_to_vec_map = {item['word']: item['vec'] for item in list_words}
    words_to_index = {item['word']: item["id"] for item in list_words}
    index_to_words = {item["id"]: item['word'] for item in list_words}
    return words_to_index, index_to_words, word_to_vec_map
def get_top_charts_globally(raw_df, chart_length):
    """
    :param raw_df: Raw data to be processed
    :param chart_length: the number of records to be displayed
    :return: clean data frame containing the top-ranked combinations of artist and track globally
    """
    df_raw = raw_df.select(
        'tagid',
        col('match.track.id').alias('track_id'),
        col('match.track.metadata.tracktitle').alias('track_title'),
        col('match.track.metadata.artistname').alias('artist_name'))

    # Get distinct counts of tagids for each track
    df_agg = df_raw.groupBy("track_id").agg(
        countDistinct("tagid").alias('tag_id_count')).orderBy(
            desc("tag_id_count"))

    # Join back to the raw dataframe, as we need the artist name and track name for display
    df_final = df_agg.join(df_raw, df_raw['track_id'] == df_agg['track_id'], how="inner")

    # Use the dense_rank function to rank each record based on tag_id_count
    df_final = df_final.select(
        'artist_name', 'track_title', 'tag_id_count').withColumn(
            "chart_position",
            dense_rank().over(Window.orderBy(desc("tag_id_count")))).orderBy(
                desc("tag_id_count")).dropDuplicates()

    df_final = df_final.select('chart_position', 'track_title', 'artist_name')
    return df_final
def score_recommender(spark, model, df_movie, k, user_id):
    from pyspark.sql.window import Window
    from pyspark.sql.functions import row_number, col

    # rec_items = model.recommendForAllUsers(k)
    df_scoring = create_scoring_data(user_id, df_movie)
    dfs_scoring = spark.createDataFrame(df_scoring)
    dfs_predictions = model.transform(dfs_scoring)
    dfs_predictions = dfs_predictions.dropDuplicates(['user', 'item', 'prediction'])

    # Keep the k items with the highest predicted rating
    window = Window.orderBy(dfs_predictions['prediction'].desc())
    df_topk = dfs_predictions \
        .select('*', row_number().over(window).alias('row_number')) \
        .filter(col('row_number') <= k) \
        .drop('row_number', 'prediction', 'rating') \
        .toPandas()

    recs = list(df_topk.item)
    print(recs)
    return recs
def find_most_common_trees(self, df):
    """Find the top 5 most commonly occurring tree types in the San Francisco area

    :param df: Input DataFrame containing all details of trees
    :return: Dataframe of the top 5 common tree types
    """
    # create split rule to find the tree sub type
    split_col = split(df['species'], '::')

    # a dataframe of trees with only the required fields
    comm_trees = df.select('species', 'tree_id').withColumn('tree_type', split_col.getItem(1))

    # filter out trees with no or unknown sub types and get the count of valid sub types
    comm_tree_df = (comm_trees.select('tree_type').filter(
        col('tree_type') != '').groupBy('tree_type').count().orderBy(
            col('count').desc()))

    # find the top 5 most commonly occurring tree types by ranking
    most_comm_trees = comm_tree_df.withColumn(
        "rank", rank().over(Window.orderBy(col("count").desc()))).filter(
            col("rank") <= 5).select('tree_type', 'count')

    self.log.warn('Found the top 5 most common trees in San Francisco')
    return most_comm_trees
def create_train_data():
    w1 = Window.orderBy("uid")
    w2 = Window.partitionBy("seg").orderBy("uid")

    # Number every row, group rows into 150,000-sample segments, and name each segment
    df_train = spark.read.csv(
        os.path.join("datasets", "train.csv"), header=True, schema=schema) \
        .withColumn("uid", monotonically_increasing_id()) \
        .withColumn("idx", row_number().over(w1).cast(IntegerType())) \
        .withColumn("seg", fn.floor(((fn.col("idx") - 1) / 150000)).cast(IntegerType())) \
        .withColumn("no", row_number().over(w2).cast(IntegerType())) \
        .withColumn("name", fn.concat(lit("raw_"), fn.lpad(fn.col("seg"), 4, "0").cast(StringType()))) \
        .withColumn("set", lit(0))

    df_train.createOrReplaceTempView("data")
    df_train_f = spark.sql("""
        SELECT uid, set, seg, no, name, x, y
        FROM data
        ORDER BY set, seg, no, uid
    """)
    df_train_f = df_train_f.repartition(1)
    df_train_f.write.mode("overwrite").parquet(
        os.path.join("datasets", "train.parquet"))
def save_dataset(df_pos, df_neg, path):
    df = df_pos.union(df_neg)

    # Assign a review_id ordered by the stemmed tokens and flatten the token list to a string
    w = Window.orderBy(["words_stemmed"])
    df = df.withColumn("review_id", F.row_number().over(w)) \
           .withColumn('int_seq', F.concat_ws(',', 'words_stemmed'))

    qdf = df.select(['review_id', 'int_seq', 'class'])
    qdf.coalesce(1).write.format('csv').option('header', 'true').save(path)
def top_zip_crashes_alcohol(df_unit, df_person, top_n):
    """
    Find the top N driver zip codes by number of crashes where alcohol was a contributing factor.

    :param df_unit: units DataFrame
    :type df_unit: DataFrame
    :param df_person: primary person DataFrame
    :type df_person: DataFrame
    :param top_n: number of zip codes to return
    :type top_n: int
    :return: ranked (rank, driver zip) pairs
    :rtype: list
    """
    df_joined = df_person.join(df_unit, ['CRASH_ID'], 'left') \
        .select(['CRASH_ID', 'DRVR_ZIP', 'CONTRIB_FACTR_1_ID', 'CONTRIB_FACTR_2_ID', 'CONTRIB_FACTR_P1_ID']) \
        .drop_duplicates()

    wspec = Window.orderBy(desc('crash_count'))

    # Keep crashes where drinking or alcohol appears in any contributing-factor column
    intd_df = df_joined.filter((df_joined['CONTRIB_FACTR_1_ID'].like('%DRINKING%')) |
                               (df_joined['CONTRIB_FACTR_1_ID'].like('%ALCOHOL%')) |
                               (df_joined['CONTRIB_FACTR_2_ID'].like('%DRINKING%')) |
                               (df_joined['CONTRIB_FACTR_2_ID'].like('%ALCOHOL%')) |
                               (df_joined['CONTRIB_FACTR_P1_ID'].like('%DRINKING%')) |
                               (df_joined['CONTRIB_FACTR_P1_ID'].like('%ALCOHOL%')))

    intd_df1 = intd_df.groupBy('DRVR_ZIP').agg(countDistinct('CRASH_ID').alias('crash_count')) \
        .orderBy(desc('crash_count')).dropna()
    intd_df2 = intd_df1.withColumn('rank', dense_rank().over(wspec))

    driver_zip_obj = intd_df2.filter(intd_df2['rank'] <= top_n).collect()
    list_driver_zip = [row['DRVR_ZIP'] for row in driver_zip_obj]
    return list(enumerate(list_driver_zip, start=1))
def convert_annoy_index(item_factors):
    # Assign each item a sequential integer id to use as its Annoy index
    window = Window.orderBy('id')
    item_factors = item_factors.withColumn('annoy_id', row_number().over(window))
    annoy_index_map = item_factors.select('id', 'annoy_id')
    item_factors = item_factors.select('annoy_id', 'features')
    return item_factors, annoy_index_map
def convert_idf_score_to_buckets(tenant_idf):
    global IDF_VALUE
    global BUCKETS

    # Replace the raw IDF value with its ntile bucket over all rows
    over_all = Window.orderBy(IDF_VALUE)
    tenant_idf_with_bucket = tenant_idf.withColumn(
        IDF_VALUE, ntile(BUCKETS).over(over_all))
    print("SystemLog: Done converting to bucketized score")
    return tenant_idf_with_bucket
def get_cdf(df, variable, col_name):
    # Empirical CDF of `variable`, stored in a column named `col_name`
    cdf = df.select(variable).na.drop().withColumn(
        col_name,
        funcs.cume_dist().over(Window.orderBy(variable))
    ).distinct()
    return cdf
def readFromTables(column, row_name='temp_rownum'):
    # Re-read the per-partition sample tables for `column`, union them,
    # and keep the first `threshold` values numbered by row
    df = spark.read.table("temp.sample_{}_{}".format(table, 0))
    df = df.select(column)
    for i in range(len(partitions) - 1):
        df_temp = spark.read.table("temp.sample_{}_{}".format(table, i + 1))
        df = df.union(df_temp.select(column))
    df = df \
        .orderBy(column) \
        .withColumn(row_name, F.row_number().over(W.orderBy(column))) \
        .limit(threshold)
    return df

## Set parameters
source = self.source
db, table = tablename.split(sep='.')
print("Counting ...")
columns = [
    column for column in source.columns
    if source.select(column).distinct().count() < threshold
]
print("only enumerating: ", columns)

## Does the table have partitions? If yes, process it partition by partition.
try:
    partitions = spark.sql("""SHOW PARTITIONS {}""".format(tablename)).collect()
    print("ALERT!! Partitioned table, this will take longer")

    ### iterate over all partitions and save the distinct elements of each part
    for i, partition_str in enumerate(partitions):
        filter_condition = partition_str[0].split(sep='/')
        filter_condition = ' AND '.join(filter_condition)
        print("Executing {}".format(filter_condition))
        df_part = distinctElement(source.filter(filter_condition), columns)
        print("Saving into temp.sample_{}_{}".format(table, i))
        df_part.write \
            .format("parquet") \
            .mode("overwrite") \
            .saveAsTable("temp.sample_{}_{}".format(table, i))

    ### re-read the sample tables and collect the global distinct elements of all partitions
    print("Re-reading")
    listoftuple = [(x + 1,) for x in range(threshold)]
    df_column = spark.createDataFrame(listoftuple, schema=['index'])
    for column in columns:
        df_temp = readFromTables(column)
        # join each column's numbered values onto the index frame
        df_column = df_column.join(df_temp, df_temp['temp_rownum'] == df_column['index'], 'left') \
            .drop('temp_rownum')
    df_all = df_column.orderBy('index').drop('index')

    ### Drop all sample tables
    print("Drop all sample table")
    for i, partition_str in enumerate(partitions):
        spark.sql("""DROP TABLE temp.sample_{}_{}""".format(table, i))
    return df_all
## if the table is not partitioned
except Exception:
    return distinctElement(self.source, columns)
def transform_data_with_udf(clickstream_data, purchase_data):
    window1 = Window.partitionBy('userId').orderBy('eventTime')
    window2 = Window.orderBy('sessionId')

    clickstream_data = (
        clickstream_data
        # flag app-open events and turn the running sum of flags into a per-user session id
        .withColumn('appOpenFlag', app_open_flag_udf(clickstream_data['eventType']))
        .withColumn('sessionId', sum(col('appOpenFlag')).over(window1))
        # normalise the raw attributes payload, then pull out the individual ids
        .withColumn('attr', attributes_udf(clickstream_data['eventType'], clickstream_data['attributes']))
        .withColumn('campaign_id',
                    when(get_json_object('attr', '$.campaign_id').isNotNull(),
                         get_json_object('attr', '$.campaign_id')).otherwise(None))
        .withColumn('channel_id',
                    when(get_json_object('attr', '$.channel_id').isNotNull(),
                         get_json_object('attr', '$.channel_id')).otherwise(None))
        .withColumn('purchase_id',
                    when(get_json_object('attr', '$.purchase_id').isNotNull(),
                         get_json_object('attr', '$.purchase_id')).otherwise(None))
        # carry the last non-null campaign and channel ids forward over the session ordering
        .withColumn('campaignId',
                    last(col('campaign_id'), ignorenulls=True).over(
                        window2.rowsBetween(Window.unboundedPreceding, 0)))
        .withColumn('channelId',
                    last(col('channel_id'), ignorenulls=True).over(
                        window2.rowsBetween(Window.unboundedPreceding, 0))))

    target_df = clickstream_data.join(
        purchase_data,
        clickstream_data['purchase_id'] == purchase_data['purchaseId'],
        JOIN_TYPE.LEFT)

    return target_df.select(col('purchaseId'), col('purchaseTime'), col('billingCost'),
                            col('isConfirmed'), col('sessionId'), col('campaignId'),
                            col('channelId'))
def data_range(self, verbose=True):
    """
    Ensures variables within the dataframe well_df are within range, as set by the attribute
    thresholds. Out of range values are replaced by the previous in-range value.

    Parameters
    ----------
    verbose : bool (optional)
        whether to allow verbose output (default is True)
    """
    window = Window.orderBy("ts")  # Spark Window ordering the data frame by time
    lag_names = []  # Empty list to store column names

    for well_columns in self.well_df.schema.names:  # loop through all components (columns) of data
        if well_columns != "ts":  # no thresholding for timestamp
            if well_columns in self.thresholds.keys():
                tresh = self.thresholds[well_columns]  # set threshold values for parameter from dictionary
            else:
                tresh = [-1000, 1000]  # if feature not in thresholds attribute, set large thresholds
            if verbose:
                print(well_columns, "threshold is", tresh)

            for i in range(1, 10):  # Naive approach, creating a large number of lagged feature columns
                lag_col = well_columns + "_lag_" + str(i)
                lag_names.append(lag_col)
                self.well_df = self.well_df.withColumn(lag_col, F.lag(well_columns, i, 0).over(window))

            for i in range(8, 0, -1):
                lag_col = well_columns + "_lag_" + str(i)
                prev_lag = well_columns + "_lag_" + str(i + 1)
                # apply minimum and maximum threshold to column, and replace out of range values with previous value
                self.well_df = self.well_df.withColumn(lag_col, F.when(F.col(lag_col) < tresh[0], F.col(prev_lag))
                                                       .otherwise(F.col(lag_col)))
                self.well_df = self.well_df.withColumn(lag_col, F.when(F.col(lag_col) > tresh[1], F.col(prev_lag))
                                                       .otherwise(F.col(lag_col)))

            # apply minimum and maximum threshold to the original column, and replace out of range values with previous value
            lag_col = well_columns + "_lag_1"
            self.well_df = self.well_df.withColumn(well_columns, F.when(F.col(well_columns) < tresh[0], F.col(lag_col))
                                                   .otherwise(F.col(well_columns)))
            self.well_df = self.well_df.withColumn(well_columns, F.when(F.col(well_columns) > tresh[1], F.col(lag_col))
                                                   .otherwise(F.col(well_columns)))

    self.well_df = self.well_df.drop(*lag_names)
    return
def running_total():
    input_data_path = os.path.join(folder_path, 'orders_data', 'orders.csv')
    df = sqlContext.read \
        .option("multiline", "true") \
        .option("header", "true") \
        .csv(input_data_path)

    # Running total of order quantity, ordered by id
    wind = Window.orderBy("id")
    windCol = functions.sum("orderQty").over(wind)

    # cast before aliasing so the result column keeps the name "totalQuantity"
    df.select("*", windCol.cast(IntegerType()).alias("totalQuantity")) \
        .show()
def split_by_row_index(df, num_partitions=2):
    # Let's assume you don't have a row_id column that has the row order
    t = df.withColumn('_row_id', monotonically_increasing_id())
    # Using ntile() because monotonically_increasing_id is discontinuous across partitions
    t = t.withColumn('_partition', ntile(num_partitions).over(Window.orderBy(t._row_id)))
    return [
        t.filter(t._partition == i + 1).drop('_row_id', '_partition')
        for i in range(num_partitions)
    ]
def lag_generater(crimeWeek):
    # Add four weekly lag features of the crime count, dropping rows without a full lag history
    crimeWeek = crimeWeek.select(
        "*", lag("count").over(
            Window.orderBy("yearweek")).alias("count_lag1")).na.drop()
    crimeWeek = crimeWeek.select(
        "*", lag("count_lag1").over(
            Window.orderBy("yearweek")).alias("count_lag2")).na.drop()
    crimeWeek = crimeWeek.select(
        "*", lag("count_lag2").over(
            Window.orderBy("yearweek")).alias("count_lag3")).na.drop()
    crimeWeek = crimeWeek.select(
        "*", lag("count_lag3").over(
            Window.orderBy("yearweek")).alias("count_lag4")).na.drop()
    crimeWeek = crimeWeek.withColumnRenamed("count", "label")
    return crimeWeek
def runOtherFunctions(spark, personDf):
    df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"])

    # array
    df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc, asc
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()
    # pyspark 2.1.0 does not support desc_nulls_first, desc_nulls_last, asc_nulls_first, asc_nulls_last

    # split, length (in pyspark a column can be referenced as df["col"] or df.col)
    df2 = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False)

    # rownum, rank
    f1 = StructField("date", StringType(), True)
    f2 = StructField("product", StringType(), True)
    f3 = StructField("amount", IntegerType(), True)
    schema = StructType([f1, f2, f3])

    p1 = ("2017-12-25 12:01:00", "note", 1000)
    p2 = ("2017-12-25 12:01:10", "pencil", 3500)
    p3 = ("2017-12-25 12:03:20", "pencil", 23000)
    p4 = ("2017-12-25 12:05:00", "note", 1500)
    p5 = ("2017-12-25 12:05:07", "note", 2000)
    p6 = ("2017-12-25 12:06:25", "note", 1000)
    p7 = ("2017-12-25 12:08:00", "pencil", 500)
    p8 = ("2017-12-25 12:09:45", "note", 30000)

    dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema)
    w1 = Window.partitionBy("product").orderBy("amount")
    w2 = Window.orderBy("amount")
    dd.select(dd.product, dd.amount,
              functions.row_number().over(w1).alias("rownum"),
              functions.rank().over(w2).alias("rank")).show()
def collect_numeric_metric(metric, df, population):
    cdf = df.select(df[metric['src']])
    cdf = cdf.dropna(subset=metric['src'])
    cdf = cdf.select(cdf[metric['src']].cast('float').alias('bucket'))

    total_count = cdf.count()
    num_partitions = total_count // 500
    ws = Window.orderBy('bucket')
    cdf = cdf.select(
        cdf['bucket'],
        cume_dist().over(ws).alias('c'),
        row_number().over(ws).alias('i'))
    cdf = cdf.filter("i = 1 OR i %% %d = 0" % num_partitions)
    cdf = cdf.collect()

    # Collapse rows with duplicate buckets.
    collapsed_data = []
    prev = None
    for d in cdf:
        if not collapsed_data:
            collapsed_data.append(d)  # Always keep first record.
            continue
        if prev and prev['bucket'] == d['bucket']:
            collapsed_data.pop()
        collapsed_data.append(d)
        prev = d

    # Calculate `p` from `c`.
    data = []
    prev = None
    for i, d in enumerate(collapsed_data):
        p = d['c'] - prev['c'] if prev else d['c']
        data.append({
            'bucket': d['bucket'],
            'c': d['c'],
            'p': p,
        })
        prev = d
    """
    Example of what `data` looks like now::

        [{'bucket': 0.0, 'c': 0.00126056, 'p': 0.00126056},
         {'bucket': 3.0, 'c': 0.00372313, 'p': 0.00246256},
         {'bucket': 4.0, 'c': 0.00430616, 'p': 0.0005830290622683026},
         {'bucket': 6.13319683, 'c': 0.00599801, 'p': 0.00169184},
         {'bucket': 8.0, 'c': 0.08114486, 'p': 0.07514685},
         {'bucket': 8.23087882, 'c': 0.08197282, 'p': 0.00082795},
         ...]
    """
    # Push data to database.
    sql = ("INSERT INTO api_numericcollection "
           "(num_observations, population, metric_id, dataset_id) "
           "VALUES (%s, %s, %s, %s) "
           "RETURNING id")
    params = [total_count, population, metric['id'], dataset_id]
    if DEBUG_SQL:
        collection_id = 0
        print(sql, params)
    else:
        cursor.execute(sql, params)
        conn.commit()
        collection_id = cursor.fetchone()[0]

    for d in data:
        sql = ("INSERT INTO api_numericpoint "
               "(bucket, proportion, collection_id) "
               "VALUES (%s, %s, %s)")
        params = [d['bucket'], d['p'], collection_id]
        if DEBUG_SQL:
            print(sql, params)
        else:
            cursor.execute(sql, params)

    if not DEBUG_SQL:
        conn.commit()