def filter_traces_contained(df, dt1, dt2, parameters=None):
    """
    Keep the events of cases that lie strictly inside the interval (dt1, dt2).

    A case is kept when its first timestamp is greater than ``dt1`` and its
    last timestamp is smaller than ``dt2``.

    Parameters
    ----------
    df
        Spark dataframe holding the events
    dt1
        Lower bound of the interval (converted with ``get_dt_from_string``)
    dt2
        Upper bound of the interval (converted with ``get_dt_from_string``)
    parameters
        Optional dictionary; ``PARAMETER_CONSTANT_TIMESTAMP_KEY`` and
        ``PARAMETER_CONSTANT_CASEID_KEY`` override the default column names

    Returns
    -------
    Spark dataframe with the events of the retained cases; the helper
    columns added during the computation are dropped again.
    """
    params = {} if parameters is None else parameters
    ts_col = params.get(PARAMETER_CONSTANT_TIMESTAMP_KEY,
                        DEFAULT_TIMESTAMP_KEY)
    case_col = params.get(PARAMETER_CONSTANT_CASEID_KEY, CASE_CONCEPT_NAME)
    first_col = ts_col + "_first"
    last_col = ts_col + "_last"

    lower_bound = get_dt_from_string(dt1)
    upper_bound = get_dt_from_string(dt2)

    events = importer.convert_timestamp_to_utc_in_df(
        df, timest_columns={ts_col})
    events = events.orderBy(case_col, ts_col)

    # With an ordered window the frame runs from the partition start to the
    # current row, so max over the descending order / min over the ascending
    # order give the case-wide last / first timestamp on every row.
    ascending = Window().partitionBy(case_col).orderBy(ts_col)
    descending = Window().partitionBy(case_col).orderBy(F.desc(ts_col))

    events = events.withColumn(last_col,
                               F.max(events[ts_col]).over(descending))
    events = events.withColumn(first_col,
                               F.min(events[ts_col]).over(ascending))

    events = events.filter(events[first_col] > lower_bound)
    events = events.filter(events[last_col] < upper_bound)

    return events.drop(last_col, first_col)
def profiler(table):
    """Print a quick per-column profile of a Spark dataframe.

    For every column this prints the longest and shortest string length of
    its values, the ten most frequent values, and one sample row per
    distinct value. The dataframe is cached for the duration of the run and
    unpersisted at the end.

    Args:
        table: Spark dataframe to profile.

    Returns:
        None. All results are printed.
    """
    print('PROFILING TABLE: ' + str(table))
    table.unpersist()
    table.cache()
    for coll in table.columns:
        try:
            print('CHECKING MAX LENGTH')
            max_length = table.rdd.map(lambda x: len(str(x[coll]))).reduce(
                lambda x, y: x if x > y else y)
            print('CHECKING MIN LENGTH')
            min_length = table.rdd.map(lambda x: len(str(x[coll]))).reduce(
                lambda x, y: x if x < y else y)
        except Exception as exc:
            # Best effort: a column that cannot be measured is skipped, but
            # say so instead of dropping it silently.
            print('SKIPPING COLUMN: ' + str(coll) + ' (' + str(exc) + ')')
            continue
        print('MAX LENGTH: ' + str(max_length) + ' MIN LENGTH: ' +
              str(min_length))

        print('GROUP BY ON COLUMN: ' + str(coll))
        value_counts = table.groupBy(coll).agg(
            count(coll).alias('c')).orderBy(col('c').desc())
        value_counts.show(10, 1000)
        if value_counts.count() > 1:
            print('THE TABLE HAS NOT UNIQUE VALUES')
        else:
            print('THE TABLE HAS DISTINCT VALUES')

        print('ROW COUNT ON COLUMN: ' + str(coll))
        # Keep the first row of every distinct value as a sample.
        first_per_value = row_number().over(
            Window().partitionBy(coll).orderBy(coll))
        table.withColumn('row_num', first_per_value).filter(
            col('row_num') == 1).show(10, 100)
    table.unpersist()
    print('FINISHED PROCESSING TABLE: ' + str(table))
def main():
    """Compute yearly average house prices per region and write them to BigQuery.

    Reads the source table named in ``config['GCPVariables']``, averages the
    price columns per (year of ``Date``, ``regionname``), rounds them, and
    overwrites the target table ``yearlyAveragePricesAllTable``. Start and
    finish timestamps are printed.
    """
    print("Getting average yearly prices per region for all")
    appName = config['common']['appName']
    spark = s.spark_session(appName)
    # Return value is not used; the call is kept in case it initialises the
    # context as a side effect.
    s.sparkcontext()
    spark = s.setSparkConfBQ(spark)

    lst = spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
    ).collect()
    print("\nStarted at")
    uf.println(lst)

    wSpecY = Window().partitionBy(F.date_format('Date', "yyyy"), 'regionname')

    house_df = s.loadTableFromBQ(
        spark,
        config['GCPVariables']['sourceDataset'],
        config['GCPVariables']['sourceTable'])
    house_df.printSchema()
    house_df.show(2, False)

    print("\nAnnual House prices per regions in GBP")

    def yearly_avg(source_col, alias):
        # Rounded average of one price column over the (year, region) window.
        return F.round(F.avg(source_col).over(wSpecY)).alias(alias)

    # Work out yearly average prices. Every row of a (year, region) group
    # carries the same averages, so distinct() leaves one row per group.
    # The result is sorted on `year`: `Date` is not part of the projection
    # any more, and the keyword is `ascending`.
    df2 = house_df.select(
        F.date_format('Date', 'yyyy').cast("Integer").alias('year'),
        'regionname',
        yearly_avg('averageprice', 'AVGPricePerYear'),
        yearly_avg('flatprice', 'AVGFlatPricePerYear'),
        yearly_avg('TerracedPrice', 'AVGTerracedPricePerYear'),
        yearly_avg('SemiDetachedPrice', 'AVGSemiDetachedPricePerYear'),
        yearly_avg('DetachedPrice', 'AVGDetachedPricePerYear'),
    ).distinct().orderBy('year', ascending=True)
    df2.show(20, False)

    s.writeTableToBQ(
        df2,
        "overwrite",
        config['GCPVariables']['targetDataset'],
        config['GCPVariables']['yearlyAveragePricesAllTable'])
    print(f"""created {config['GCPVariables']['yearlyAveragePricesAllTable']}""")

    lst = spark.sql(
        "SELECT FROM_unixtime(unix_timestamp(), 'dd/MM/yyyy HH:mm:ss.ss') "
    ).collect()
    print("\nFinished at")
    uf.println(lst)
def load_powietrze(keys_space_name="json", table_name="powietrze",
                   time_frame=None, spark=None, agg=None,
                   time_frames="5 minutes", time_update="1 minute"):
    """Load the `powietrze` table and attach the prediction target.

    The table is loaded through ``load_table.load_and_get_table_df`` and
    passed to ``powietrze_preprocessing``; the first 200 preprocessed rows
    (sorted by name and timestamp) are printed. The ``target`` column is the
    value of ``target_temp`` four rows ahead within the same ``name``,
    ordered by ``timestamp``. Rows containing nulls are dropped.

    Returns:
        Tuple ``(dataframe, sc)`` where ``sc`` is the second value returned
        by the loader.
    """
    # Load the data
    raw_df, sc = load_table.load_and_get_table_df(
        keys_space_name, table_name, time_frame, spark)

    # Add the time-describing variables and remove the columns that are not
    # used for prediction
    air_df = powietrze_preprocessing(raw_df, agg, time_frames, time_update)
    air_df.sort("name", "timestamp").show(200)

    # Build the target variable
    per_name = Window().partitionBy("name").orderBy("timestamp")
    future_value = lead("target_temp", 4).over(per_name)
    labelled = air_df.withColumn("target", future_value).na.drop()

    return labelled, sc
def overall_prediction_grouping(csv, prediction_dataset=None):
    """
    Group the dataset by date and count the rows per date.

    :param csv: -- dataframe: containing all the data
    :param prediction_dataset: -- dataframe: containing previous prediction;
        when given, its ``prediction`` column is renamed to ``count(Date)``
        and its rows are appended to the grouped result
    :return: -- dataframe: grouped, with a row-number based ``id`` column
        passed through ``to_vector``
    """
    daily_counts = csv.groupby('Date').agg({'Date': 'count'})
    daily_counts = daily_counts.withColumn(
        'Date', change_to_date_func(col('Date')))

    # Consecutive ids in date order.
    indexed = daily_counts.withColumn(
        'id', row_number().over(Window().orderBy('Date')))

    # Explicit None check: whether a dataframe object is "truthy" says
    # nothing about whether the argument was supplied.
    if prediction_dataset is None:
        return indexed.withColumn('id', to_vector(col('id')))

    current = indexed.select('Date', 'id', 'count(Date)').withColumn(
        'id', to_vector(col('id')))
    # union() matches columns by position, so both sides are projected to
    # the same three columns in the same order.
    previous = prediction_dataset.select(
        'Date', 'id', 'prediction').withColumnRenamed(
            'prediction', 'count(Date)')
    return current.union(previous)
def _add_outliers(dataframe, **kwargs):
    """
    Flag data points whose distance exceeds a per-cluster boundary.

    The boundary is the mean distance within a cluster plus ``no_stddev``
    times the population standard deviation of the distance within that
    cluster. Clusters are the partitions of the prediction column.

    :param dataframe: dataframe that already holds the distance column
    :param kwargs:
        prediction_col: name of the cluster column, default 'prediction'
        distance_col: name of the distance column, default 'distance'
        no_stddev: number of standard deviations, default 2.0
    :return: dataframe with the added columns 'computed_boundary' and
        'is_outlier' (1 when the distance is above the boundary, else 0)
    """
    distance_name = kwargs.get('distance_col', 'distance')
    assert distance_name in dataframe.columns, 'Distances have not been computed!'

    prediction = F.col(kwargs.get('prediction_col', 'prediction'))
    distance = F.col(distance_name)
    no_stddev = kwargs.get('no_stddev', 2.0)

    per_cluster = Window().partitionBy(prediction)
    cluster_mean = F.mean(distance).over(per_cluster)
    cluster_spread = F.stddev_pop(distance).over(per_cluster)
    boundary = cluster_mean + no_stddev * cluster_spread

    outlier_flag = F.when(distance > boundary, 1).otherwise(0)

    with_boundary = dataframe.withColumn(colName='computed_boundary',
                                         col=boundary)
    return with_boundary.withColumn(colName='is_outlier', col=outlier_flag)
def merge_tracks_sources(tracks1, tracks2):
    """Merge two track dataframes into one row set keyed by ``id``.

    The columns present in both inputs are unioned and de-duplicated per
    ``id`` (the record with the smallest ``name`` wins); the columns that
    exist in only one input are then attached with left joins on ``id``.
    Row counts and a sample of the union are printed along the way.
    """
    # 1. Union the columns the two sources have in common
    overlapping_columns = [
        name for name in tracks1.columns if name in tracks2.columns
    ]
    print("overlapping_columns", overlapping_columns)

    tracks = tracks1.select(overlapping_columns).union(
        tracks2.select(overlapping_columns))
    print("tracks", tracks.count(), tracks1.count(), tracks2.count())
    tracks.show(10, truncate=10)
    show_distinct(tracks, "id", count_only=True)

    # 2. Remove duplicate tracks: some tracks appear in both sources with
    #    non-identical records, so keep the first record of every id
    by_id = Window().partitionBy("id").orderBy("name")
    ranked = tracks.select('*', row_number().over(by_id).alias("rank"))
    tracks = ranked.where("rank = 1").select(overlapping_columns)

    # 3. Append the source-specific columns of both inputs
    for source in (tracks1, tracks2):
        extra_columns = [
            name for name in source.columns
            if name not in overlapping_columns
        ]
        tracks = tracks.join(source.select('id', *extra_columns), 'id',
                             'left')

    # TODO(etl): remove duplicates due to redundant records in the right-side table of left outer join
    return tracks
def load_velib(keys_space_name="json", table_name="velib", time_frame=None,
               spark=None, agg=None, time_frames="5 minutes",
               time_update="1 minute"):
    """Load the `velib` table and attach the prediction target.

    The table is loaded through ``load_table.load_and_get_table_df`` and
    passed to ``velib_preprocessing``; the first 300 preprocessed rows
    (sorted by station and timestamp) are printed. The ``target`` column is
    the value of ``num_bikes_available`` 240 rows ahead within the same
    ``station_id``, ordered by ``timestamp``. Rows containing nulls are
    dropped.

    Returns:
        Tuple ``(dataframe, sc)`` where ``sc`` is the second value returned
        by the loader.
    """
    # Load the data
    raw_df, sc = load_table.load_and_get_table_df(
        keys_space_name, table_name, time_frame, spark)

    # Add the time-describing variables and remove the columns that are not
    # used for prediction
    velib_df = velib_preprocessing(raw_df, agg, time_frames, time_update)
    velib_df.sort("station_id", "timestamp").show(300)

    # Build the target variable
    per_station = Window().partitionBy("station_id").orderBy("timestamp")
    future_value = lead("num_bikes_available", 240).over(per_station)
    labelled = velib_df.withColumn("target", future_value).na.drop()

    return labelled, sc
def chain_pings(self):
    """Split ``self.df`` into pings that have a nearby neighbour and pings that do not.

    Within each (device_id, study_dt) partition ordered by utc_timestamp a
    ping is "chained" when its distance in feet to the next ping is at most
    ``chain_dist``, or its distance to the previous ping is at most the
    previous ping's ``chain_dist``. ``chain_dist`` is
    ``((accuracy + next accuracy) - 10) * (230 / 120) + 200``.

    Side effects:
        ``self.df`` is replaced by the chained pings and ``self.unchain_df``
        by the unchained ones (both cached); a summary table is printed.
    """
    print(
        "\n_______________________________________________\nCHAINING PINGS\n\n"
    )

    w = Window().partitionBy('device_id', 'study_dt').orderBy('utc_timestamp')
    init_cnt = self.df.count()

    pings = self.df
    chain_dist = (
        ((pings['accuracy'] + lead(pings['accuracy'], 1).over(w)) - 10)
        * (230 / 120) + 200
    )
    dist_to_next = distance(
        pings['latitude'], pings['longitude'],
        lead(pings['latitude'], 1).over(w),
        lead(pings['longitude'], 1).over(w), 'feet')
    dist_to_prev = distance(
        pings['latitude'], pings['longitude'],
        lag(pings['latitude'], 1).over(w),
        lag(pings['longitude'], 1).over(w), 'feet')
    # A comparison against a missing neighbour is null and falls through to
    # the next branch, ending in otherwise(0).
    chain_flag = (
        when(dist_to_next <= col('chain_dist'), 1)
        .when(dist_to_prev <= lag(col('chain_dist'), 1).over(w), 1)
        .otherwise(0)
    )
    self.df = pings.withColumn('chain_dist', chain_dist) \
        .withColumn('chain', chain_flag)

    self.unchain_df = self.df.filter(self.df['chain'] == 0) \
        .drop('chain_dist', 'chain')
    self.df = self.df.filter(self.df['chain'] == 1) \
        .drop('chain_dist', 'chain')

    # count() materialises the caches; only the chained count is reported.
    self.unchain_df.cache().count()
    chain_cnt = self.df.cache().count()

    removed_cnt = init_cnt - chain_cnt
    # An empty input would otherwise divide by zero.
    percent_removed = (removed_cnt / float(init_cnt)) * 100 if init_cnt else 0.0

    tbl_data = [
        ['Initial count', init_cnt, 0, 0,
         'Count of pings prior to analyzing spatial relationships'],
        ['Chained count', chain_cnt, removed_cnt, percent_removed,
         'Count of pings that have spatially proximate neighbors to consider for clustering'],
    ]

    # Display filter table
    print(tabulate(tbl_data, floatfmt=".2f",
                   headers=['Phase', 'Ping Count', 'Removed Pings',
                            'Percent Reduction', 'Description']))
def load_urzedy(keys_space_name="json", table_name="urzedy", time_frame=None,
                spark=None, agg=None, time_frames="5 minutes",
                time_update="1 minute"):
    """Load the `urzedy` table and attach the prediction target.

    The table is loaded through ``load_table.load_and_get_table_df`` and
    passed to ``urzedy_preprocessing``; the first 300 preprocessed rows
    (sorted by group and timestamp) are printed. The ``target`` column is
    the value of ``liczbaklwkolejce`` 240 rows ahead within the same
    ``idgrupy``, ordered by ``timestamp``. Rows containing nulls are
    dropped.

    Returns:
        Tuple ``(dataframe, sc)`` where ``sc`` is the second value returned
        by the loader.
    """
    # Load the data
    raw_df, sc = load_table.load_and_get_table_df(
        keys_space_name, table_name, time_frame, spark)

    offices_df = urzedy_preprocessing(raw_df, agg, time_frames, time_update)
    offices_df.sort("idgrupy", "timestamp").show(300)

    # Build the target variable
    per_group = Window().partitionBy("idgrupy").orderBy("timestamp")
    future_value = lead("liczbaklwkolejce", 240).over(per_group)
    labelled = offices_df.withColumn("target", future_value).na.drop()

    return labelled, sc
def findTopMovies(self, userDataDF, itemDF):
    """Rank movies by the sum of their ratings.

    Ratings with a null timestamp are ignored. The rating rows are joined to
    ``itemDF`` on ``item_id == movieid``, the ratings are summed per
    ``item_id``, and the distinct (item_id, movietitle, sumOfRating) rows
    are numbered in descending order of the sum.

    Returns:
        Dataframe with columns item_id, movietitle, sumOfRating and rnk.
    """
    per_movie = Window.partitionBy("item_id")
    # No partitioning: one global ranking over all movies.
    by_total_rating = Window().orderBy(F.desc("sumOfRating"))

    rated = userDataDF.filter("timestamp is not null")
    with_titles = rated.join(
        itemDF, userDataDF.item_id == itemDF.movieid, "inner")
    totals = with_titles.select(
        "item_id",
        "movietitle",
        F.sum("rating").over(per_movie).alias("sumOfRating"),
    ).distinct()

    topMoviesDF = totals.select(
        "*", F.row_number().over(by_total_rating).alias("rnk"))
    return topMoviesDF
def _create_index(self, index, make_index):
    '''Handles index creation logic based on user input.

    Sets ``self.index`` to the name of the index column and, when that
    column has to be created, adds it to ``self.df`` as a zero-based row
    number ordered by the first column.

    Args:
        index: name of the index column, or None to use the first column.
        make_index: whether the caller asked for a new index column.

    Raises:
        AssertionError: make_index is true but no index name was given.
        RuntimeError: make_index is true but the column already exists.
    '''
    columns = self.df.columns
    if index is None:
        # Case 1: user wanted to make index but did not specify column name
        assert not make_index, "Must specify an index name if make_index is True"
        # Case 2: make_index not specified but no index supplied, use first column
        # (one message string; the two literals are concatenated, not a tuple)
        logger.warning("Using first column as index. "
                       "To change this, specify the index parameter")
        index = columns[0]
    elif make_index and index in columns:
        # Case 3: user wanted to make index but column already exists
        raise RuntimeError(
            "Cannot make index: index column already present")
    elif index not in columns:
        if not make_index:
            # Case 4: user names index, it is not in df. does not specify
            # make_index. Make new index column and warn
            logger.warning(
                "index %s not found in dataframe, creating new "
                "integer column", index)
        # Case 5: make_index with no errors or warnings
        # (Case 4 also uses this code path)
        rank_window = Window().orderBy(col(columns[0]))
        # row_number() starts at 1; shift so the index starts at 0
        self.df = self.df.withColumn(index,
                                     row_number().over(rank_window) - 1)
    # Case 6: user specified index, which is already in df. No action needed.
    self.index = index
def get_top_10_coinstalls(addons_expanded_day):
    """Return, per add-on, the add-ons most often installed alongside it.

    The input is self-joined on ``client_id`` (the right side restricted to
    non-system add-ons), the (addon_id, coaddon) pairs are counted and
    ranked per ``addon_id`` by descending count. Ranks 1 to 10 are kept;
    rank 0 is skipped because it is expected to be the add-on paired with
    itself.

    Returns:
        Dataframe with ``addon_id`` and ``top_10_coinstalls``, a mapping
        from rank (as a string) to co-installed add-on id.
    """

    def str_map_to_dict(m):
        # Entries look like "<rank>=<coaddon>". Split on the first "=" only,
        # so an add-on id that itself contains "=" does not break unpacking.
        return dict(entry.split("=", 1) for entry in m)

    def format_row(row):
        return Row(
            addon_id=row.addon_id,
            top_10_coinstalls=str_map_to_dict(row.top_10_coinstalls),
        )

    w = Window().partitionBy("addon_id").orderBy(F.col("count").desc())

    coaddons = addons_expanded_day.filter(
        "is_system=false").withColumnRenamed("addon_id", "coaddon")
    pair_counts = (
        addons_expanded_day.join(coaddons, on="client_id")
        .groupby("client_id", "addon_id", "coaddon")
        .count()
    )
    ranked = (
        pair_counts
        .withColumn("rn", (F.row_number().over(w) - F.lit(1)))  # start at 0
        .filter("rn BETWEEN 1 and 10")  # ignore 0th addon (where coaddon==addon_id)
    )
    d = (
        ranked.groupby("addon_id")
        .agg(
            F.collect_list(
                F.concat(F.col("rn"), F.lit("="), "coaddon")
            ).alias("top_10_coinstalls"))
        .rdd.map(format_row)
        .toDF()
    )
    return d
def build_dictionary_table(df, *column, index_column="id"):
    """Build a lookup table of the distinct values of the given columns.

    Args:
        df: source dataframe.
        *column: names of the columns whose distinct combinations are kept.
        index_column: name of the added numbering column.

    Returns:
        Dataframe with the distinct combinations, sorted by the given
        columns, plus ``index_column`` holding a 1-based row number in that
        same order.
    """
    distinct_values = df.select(*column).distinct().sort(*column)
    numbering = row_number().over(Window().orderBy(*column))
    return distinct_values.withColumn(index_column, numbering)
def calculate_average_distance(vehicles_evts_df, op_prd_evts_df):
    """
    Add the distance between consecutive events of each vehicle within each
    operating period.

    Every vehicle event is paired with every operating period (join on a
    constant key); no averaging happens in this function.

    :param vehicles_evts_df: vehicle events DF.
    :param op_prd_evts_df: operating periods DF.
    :return: DF with col `distance`; the first event of every
        (op_prd_id, vehicle_id) partition gets distance 0. `lng` and `lat`
        are converted to radians in the result.
    """
    # Constant key so that the two data frames can be joined row-by-row.
    vehicles_evts_df = vehicles_evts_df.withColumn('key', F.lit(1))
    op_prd_evts_df = op_prd_evts_df.withColumn('key', F.lit(1))

    df_merge = vehicles_evts_df.join(op_prd_evts_df, on='key',
                                     how='left').drop('key')

    # F.radians replaces F.toRadians, deprecated since Spark 2.1.
    df_merge = df_merge \
        .withColumn('lng', F.radians('lng')) \
        .withColumn('lat', F.radians('lat'))

    w = Window().partitionBy('op_prd_id', 'vehicle_id').orderBy("at")

    # Distance from the previous event of the same vehicle and period.
    df = df_merge.withColumn(
        'distance',
        calculate_distance('lng', 'lat',
                           F.lag('lng', 1).over(w),
                           F.lag('lat', 1).over(w))).alias('distance')

    # No previous event means a null distance; report it as 0.
    df = df.withColumn(
        'distance',
        F.when(F.isnull(df['distance']),
               0).otherwise(df['distance'])).alias('distance')
    return df
def shift_1(df, shift_count=1):
    """Add column ``shift_1`` holding ``sales`` from ``shift_count`` rows earlier.

    Rows are ordered by ``date`` over the whole dataframe (no partitioning).

    Args:
        df: dataframe with ``date`` and ``sales`` columns.
        shift_count: number of rows to look back.

    Returns:
        The dataframe with the extra ``shift_1`` column (null where no
        earlier row exists).
    """
    from pyspark.sql.functions import col, lag
    from pyspark.sql.window import Window

    w_1 = Window().partitionBy().orderBy(col('date'))
    # The offset is passed positionally: the keyword was `count` in
    # PySpark 2.x and `offset` from 3.0 on.
    df = df.withColumn('shift_1', lag('sales', shift_count).over(w_1))
    return df
def Breiman(df, label_column, column_names, drop_original=False):
    '''Add, for every given column, the average of the label column
    conditional on that column's value.

    For each name in ``column_names`` a new column ``<name>B`` is added that
    holds the mean of ``label_column`` over all rows sharing the same value
    of ``<name>``. Each column name is printed as it is processed.

    Args:
        df: source dataframe.
        label_column: column whose average is computed.
        column_names: columns to encode.
        drop_original: when True, drop each source column after encoding.
            Defaults to False, which keeps the source columns (dataframes
            are immutable, so a bare ``df.drop(...)`` has no effect).

    Returns:
        Dataframe with the added ``<name>B`` columns.
    '''
    for name in column_names:
        print(name)
        per_value = Window().partitionBy(name)
        df = df.withColumn(name + "B", avg(label_column).over(per_value))
        if drop_original:
            df = df.drop(name)
    return df
def markDistinct(self, dataFrame):
    """Add column ``IsDistinctKey`` marking the first row of every ``Key``.

    Rows are numbered within each ``Key`` (ordered by a constant, so the
    order inside a key is arbitrary). The row numbered 1 gets the string
    '0', every other row of that key gets '1'.
    """
    per_key = Window().partitionBy('Key').orderBy(Functions.lit('A'))
    numbered = dataFrame.withColumn('IsDistinctKey',
                                    Functions.row_number().over(per_key))
    is_first_of_key = numbered.IsDistinctKey == 1
    return numbered.withColumn(
        'IsDistinctKey',
        Functions.when(is_first_of_key, '0').otherwise('1'))
def add_lead_lag(df, variable):
    """Add forward- and backward-shifted copies of ``variable``.

    For every offset in 1, 4, 7, ..., 34 two columns are added:
    ``forward_<offset>_month_<variable>`` (value ``offset`` rows ahead) and
    ``past_<offset>_month_<variable>`` (value ``offset`` rows back), both
    within the same ``GVKEY`` ordered by ``GVKEY_year_mth``.
    """
    per_gvkey = Window().partitionBy(col("GVKEY")).orderBy(
        col("GVKEY_year_mth"))
    for offset in range(1, 37, 3):
        forward_name = "forward_{}_month_{}".format(offset, variable)
        past_name = "past_{}_month_{}".format(offset, variable)
        # A negative lag offset looks ahead instead of back.
        df = df.withColumn(
            forward_name, lag(col(variable), -offset, None).over(per_gvkey))
        df = df.withColumn(
            past_name, lag(col(variable), offset, None).over(per_gvkey))
    return df
def extract_nested_json(source_dataframe,table,col_table_name,col_nested_data,cols_to_add_hash):
    """Expand a nested-JSON column into flat columns and add a SHA-256 key.

    Rows whose lower-cased ``col_table_name`` equals ``table`` are kept. The
    values of ``col_nested_data`` are passed through ``is_json`` (presumably
    a JSON parser returning a dict -- confirm), their keys are lower-cased,
    and every row is given the union of all keys, with missing keys filled
    by an empty string. The flattened rows are attached to the filtered
    source rows by row number, and ``sha_key`` is the SHA-256 of the
    flattened columns joined with "||" together with ``cols_to_add_hash``
    (when it is a list) or "eventtype" (otherwise).

    NOTE(review): ``unicode`` and ``iteritems`` are Python 2 names unless
    they are defined elsewhere in the file.
    """
    # Filter expression is assembled from strings; `table` is compared
    # against the lower-cased column, so it is expected in lower case.
    df2=source_dataframe.filter('lower('+col_table_name+') =='+'\"'+table+'\"')
    # Union of all lower-cased keys seen in the nested column.
    df2_1_col=df2.select(col_nested_data).rdd.map(lambda p1:is_json(p1[col_nested_data])).map(lambda g2: [k.lower() for k in g2.keys()]).reduce(lambda h77,h78: list(set(h77+h78)))
    # One dict per row with every key of df2_1_col present.
    # NOTE(review): `type(v) != "unicode"` compares a type object with a
    # string and is therefore always true, so every value goes through
    # unicode(v).
    df2_1=df2.select(col_nested_data).rdd.map(lambda p1:is_json(p1[col_nested_data])).map(lambda g2: dict((k.lower(), unicode(v)) if type(v) != "unicode" else ((k.lower(), v)) for k, v in g2.iteritems())).map(lambda g4: dict((k29,unicode("")) if k29 not in g4.keys() else (k29,g4[k29]) for k29 in df2_1_col))
    df3_2=df2_1.map(lambda v:Row(**v)).toDF()
    # Number the rows of both frames so they can be joined side by side.
    # NOTE(review): the window orders by a constant, so this assumes both
    # frames keep the same row order -- confirm.
    df2=df2.withColumn("columnindex", row_number().over(Window().partitionBy(lit("A")).orderBy(lit('A'))))
    df3_2=df3_2.withColumn("columnindex", row_number().over(Window().partitionBy(lit("A")).orderBy(lit('A'))))
    final=df2.join(df3_2, df2.columnindex == df3_2.columnindex, 'inner').drop(df3_2.columnindex)
    final=final.drop('columnindex')
    # Columns that feed the hash: the flattened JSON columns ...
    sha_columns=df3_2.columns
    sha_columns.remove("columnindex")
    # ... plus the caller-supplied columns, or "eventtype" when no list is given.
    if type(cols_to_add_hash)==list:
        sha_columns.extend(cols_to_add_hash)
    else:
        sha_columns.append("eventtype")
    final=final.withColumn("sha_key", sha2(concat_ws("||", *sha_columns), 256))
    return(final)
def get_top_addon_names(addons_expanded):
    """Pick one name per add-on: the name used by the most distinct clients.

    Returns:
        Dataframe with columns ``addon_id`` and ``name``, one row per
        ``addon_id``.
    """
    name_counts = addons_expanded.groupby("addon_id", "name").agg(
        F.countDistinct("client_id").alias("n"))

    most_used_first = Window().partitionBy("addon_id").orderBy(
        F.col("n").desc())
    ranked = name_counts.withColumn("rn",
                                    F.row_number().over(most_used_first))

    return ranked.where(F.col("rn") == 1).select("addon_id", "name")
def __baseWindow(self):
    """Build the window spec shared by the windowed operations.

    The window is ordered by ``self.ts_col`` and then ``self.sequence_col``
    (both cast to long; an empty-string column name is skipped) and, when
    ``self.partitionCols`` is non-empty, partitioned by those columns.
    """
    # Time is the first sort key; the sequence number breaks ties.
    sort_keys = []
    for name in (self.ts_col, self.sequence_col):
        if name != '':
            sort_keys.append(f.col(name).cast("long"))

    window = Window().orderBy(sort_keys)
    if not self.partitionCols:
        return window
    return window.partitionBy([f.col(name) for name in self.partitionCols])
def cbind(df1, df2):
    """Bind the columns of two dataframes side by side, row by row.

    Both dataframes get a running row number and are joined on it, so row
    i of ``df1`` is paired with row i of ``df2``.

    NOTE(review): the numbering orders by a constant, so the pairing
    follows whatever row order Spark produces -- confirm that is acceptable
    for the callers.

    Args:
        df1: left dataframe.
        df2: right dataframe.

    Returns:
        Dataframe with the columns of both inputs; the helper columns
        ``row_id`` and ``const`` are removed.
    """
    df1 = df1.withColumn('const', F.lit(1))
    df2 = df2.withColumn('const', F.lit(1))

    w = Window().partitionBy().orderBy('const')
    # row_number() on both sides: rank() over a constant ordering returns 1
    # for every row (all rows tie), which would pair every df1 row with the
    # first df2 row only.
    df1 = df1.withColumn("row_id", F.row_number().over(w))
    df2 = df2.withColumn("row_id", F.row_number().over(w))

    cbind_df = df1.join(df2, on=["row_id"]).sort("row_id").drop("row_id")
    return cbind_df.drop('const')
def inner(df):
    """Print the day gaps between consecutive values of ``colName``.

    ``colName`` comes from the enclosing scope. The values are ordered over
    the whole dataframe, each is paired with its predecessor, and the
    differences in days are shown largest first.

    Returns:
        The input dataframe, unchanged.
    """
    from pyspark.sql.functions import lag, col, datediff

    ordered = Window().partitionBy().orderBy(col(colName))
    previous_value = lag(colName).over(ordered).alias("lagged_col")

    # The first row has no predecessor; na.drop() removes it.
    pairs = df.select(colName, previous_value).na.drop()
    gaps = pairs.withColumn(
        'diff_col', datediff(pairs[colName], pairs['lagged_col']))
    gaps.sort('diff_col', ascending=False).show()
    return df
def process_log_data(spark, input_data, output_data):
    """Build the users, time and songplays tables from the log data.

    Reads the log JSON, keeps the ``NextSong`` events, and writes three
    parquet datasets: ``Users_Table``, ``time_table`` (partitioned by year
    and month) and ``songplays_table`` (partitioned by year and month).

    NOTE(review): the log and song paths are blank placeholders and
    ``input_data`` / ``output_data`` are not used -- fill in the real
    locations before running.

    Args:
        spark: active Spark session.
        input_data: currently unused.
        output_data: currently unused.
    """
    # get filepath to log data file
    log_data = ' '

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter((f.col('page') == 'NextSong'))

    # extract columns for users table
    users_table = df.selectExpr("userId as user_id",
                                "firstName as first_name",
                                " lastName as last_name", "gender", "level")

    # write users table to parquet files
    users_table.write.parquet("Users_Table")

    # create timestamp column from original timestamp column
    # (`ts` is divided by 1000 before conversion, i.e. treated as milliseconds)
    get_timestamp = udf(lambda x: datetime.fromtimestamp(x / 1000).strftime(
        '%Y-%m-%d %H:%M:%S'))
    df = df.withColumn("timestamp", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime('%Y-%m-%d'))
    df = df.withColumn("datetime", get_datetime(df.ts))

    # create weekday column from original timestamp column
    # (bound to its own name; it used to overwrite get_datetime and leave
    # get_weekday undefined)
    get_weekday = udf(
        lambda x: datetime.fromtimestamp(x / 1000).strftime('%A'))
    df = df.withColumn("weekday", get_weekday(df.ts))

    # extract columns to create time table
    time_table = df.select("timestamp",
                           hour("timestamp").alias('hour'),
                           dayofmonth("timestamp").alias('day'),
                           weekofyear("timestamp").alias('weekofyear'),
                           month("timestamp").alias('month'),
                           year("timestamp").alias('year'), "weekday")

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy(['year', 'month']).parquet("time_table")

    # read in song data to use for songplays table
    song_df = spark.read.json(" ")

    # extract columns from joined song and log datasets to create songplays
    # table; year and month are derived from the log timestamp because the
    # table is partitioned by them when written
    songplays_table = song_df.alias('a').join(
        df.alias('b'),
        (col('b.song') == col('a.title')) &
        (col('b.artist') == col('a.artist_name'))
    ).selectExpr("ts as start_time", "userId as user_id", "level", "song_id",
                 "artist_id", "sessionId as session_id", "location",
                 "useragent as user_agent",
                 "year(b.`timestamp`) as year",
                 "month(b.`timestamp`) as month").distinct()

    # consecutive id; ordering by a constant leaves the numbering order
    # up to Spark
    w = Window().orderBy(lit('A'))
    songplays_table = songplays_table.withColumn("songplay_id",
                                                 row_number().over(w))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy(['year',
                                       'month']).parquet("songplays_table")
def initParameters():
    """Create the local Spark session and the yearly per-region window.

    Returns:
        List ``[spark_session, window_spec]``; the window partitions by the
        year of ``datetaken`` and by ``regionname``.
    """
    # The return values are not used here; the calls are kept as they are.
    readSourceData()
    transformData()

    builder = SparkSession.builder.master('local[1]')
    builder = builder.appName(ctest['common']['appName'])
    session = builder.getOrCreate()

    yearly_region_window = Window().partitionBy(
        F.date_format('datetaken', "yyyy"), 'regionname')
    return [session, yearly_region_window]
def process_stream(self, rdd):
    """Count anomalous sensor readings per id in one batch and store the result.

    For every ``id`` the readings, a 3-row rolling average, the first and
    last ``ts`` and the standard deviation are collected. A reading counts
    as anomalous when it lies more than three standard deviations away from
    a rolling-average value; ids with more than one anomalous reading are
    flagged. The summary is appended to ``anomaly_window_tbl``.

    Args
        rdd: rdd
    :rtype: None
    """

    def detect_anomaly(sensor_readings, running_avg, std_dev):
        """
        Args:
            sensor_readings: List(float)
            running_avg: float
            std_dev: float
        :rtype: int
        """
        # A null standard deviation would make the arithmetic below raise
        # and fail the whole batch; treat it as "no anomalies".
        if std_dev is None or not sensor_readings or not running_avg:
            return 0
        band = 3 * std_dev
        anomaly_count = 0
        for i, (reading, _) in enumerate(zip(sensor_readings, running_avg)):
            # NOTE(review): the reading is compared with the rolling average
            # at i - 1, which for i == 0 wraps to the last element --
            # confirm this is intended.
            baseline = running_avg[i - 1]
            if reading > baseline + band or reading < baseline - band:
                anomaly_count += 1
        return anomaly_count

    if rdd.isEmpty():
        print("RDD is empty")
        return

    df = rdd.toDF().cache()
    # One row before to one row after the current row, per id.
    w = (Window().partitionBy(col("id")).rowsBetween(-1, 1))
    df = df.withColumn('rolling_average', F.avg("val").over(w))

    agg_df = df.groupBy(['id']).agg(
        F.collect_list("val").alias("sensor_reading"),
        first("ts").cast('timestamp').alias("start_ts"),
        last("ts").cast('timestamp').alias("end_ts"),
        F.round(F.stddev("val"), 3).alias("std_temp"),
        F.collect_list("rolling_average").alias("rol_avg"))
    agg_df.show()

    anomaly_udf = udf(detect_anomaly, IntegerType())
    processed_df = agg_df.withColumn(
        "num_anomaly",
        anomaly_udf("sensor_reading", "rol_avg",
                    "std_temp")).sort(desc("num_anomaly"))

    final_df = processed_df.withColumn(
        "anomaly",
        F.when(F.col("num_anomaly") > 1, True).otherwise(False))
    final_df = final_df.select("id", "start_ts", "end_ts", "std_temp",
                               "num_anomaly", "anomaly")

    try:
        # SECURITY(review): host and credentials are hard-coded here; they
        # should come from configuration or a secret store.
        connector = pgConnector.PostgresConnector(
            "ec2-3-94-71-208.compute-1.amazonaws.com", "datanodedb",
            "datanode", "password")
        connector.write(final_df, "anomaly_window_tbl", "append")
    except Exception as e:
        # Best effort: a failed write is reported, the stream keeps running.
        print(e)
def generate_window(order_col, rowrange=None, partitions=None):
    """Create a window spec ordered by ``order_col``.

    Args:
        order_col: column (or columns) to order by.
        rowrange: optional ``(start, end)`` pair passed to ``rowsBetween``.
        partitions: currently unused; reserved for partitioning the window.

    Returns:
        The window spec.
    """
    ordered = Window().orderBy(order_col)
    # for future reference: if we want to use partitions
    # if partitions is not None:
    #     ordered = ordered.partitionBy(partitions)
    if rowrange is None:
        return ordered
    return ordered.rowsBetween(*rowrange)
def cvm_formatting_cdw(
        recs,
        model_master_name,
        model_name: str,
        model_type: str,
        start_date: str,
        time_prd_val: int = -7,
        env: str = 'TST',
):
    """Format recommendation rows for loading into the CDW.

    Reads the highest ``MASTER_ID`` from ``model_master_name``, uses the
    next id for this run, numbers the rows ordered by (sku_X, cvm_rank),
    adds the run metadata columns and puts the columns into load order.

    Args:
        recs: dataframe with the recommendation rows.
        model_master_name: name of the Oracle table that holds the master
            ids. It is interpolated into the SQL text, so it must come from
            trusted configuration.
        model_name: value of the MODEL_NAME column.
        model_type: value of the MODEL_TYPE column.
        start_date: start date handed to ``date_period``.
        time_prd_val: period length handed to ``date_period``.
        env: environment passed to ``oracle_cdw_read``.

    Returns:
        Tuple ``(master_id, output)``.
    """
    # Oracle applies ROWNUM before ORDER BY, so the ordering has to happen
    # in a sub-query to really get the highest id. The pieces are separated
    # by spaces so adjacent literals cannot run together.
    latest_master_query = (
        'SELECT MASTER_ID FROM '
        f'(SELECT MASTER_ID FROM {model_master_name} ORDER BY MASTER_ID DESC) '
        'WHERE ROWNUM = 1'
    )
    model_master = oracle_cdw_read(latest_master_query,
                                   database='CDWCMMO',
                                   env=env,
                                   db_type='Oracle').toPandas()
    master_id = model_master['MASTER_ID'][0] + 1

    start_date, end_date = date_period(time_prd_val, start_date)

    w = Window().orderBy('sku_X', 'cvm_rank')

    output = recs. \
        withColumn('MASTER_ID', lit(master_id)). \
        withColumn('RECORD_ID', row_number().over(w)). \
        withColumn('TIME_PRD_VAL', lit(time_prd_val)). \
        withColumn('MODEL_TYPE', lit(model_type)). \
        withColumn('MODEL_NAME', lit(model_name)). \
        withColumn('START_DATE', lit(start_date)). \
        withColumn('END_DATE', lit(end_date))

    # Order the columns
    output = output. \
        select('MASTER_ID',
               'RECORD_ID',
               'TIME_PRD_VAL',
               'MODEL_TYPE',
               'MODEL_NAME',
               'SKU_X',
               'COUPON_X',
               'SKU_Y',
               'COUPON_Y',
               'BASKET_COUNT_XY',
               'BASKET_COUNT_X',
               'BASKET_COUNT_Y',
               'SKU_BASKET_COUNT_X',
               'SKU_BASKET_COUNT_Y',
               'SKU_SALES_X',
               'SKU_SALES_Y',
               'CONFIDENCE',
               'CVM_RANK',
               'START_DATE',
               'END_DATE',
               )

    return master_id, output
def main(bike_inputs, taxi_inputs, year):
    """Extract the slow, short taxi trips and save them as CSV.

    The median (approximate 50th percentile) bike trip distance of the
    given year is converted from km to miles; taxi trips shorter than that
    and with complete coordinates are kept, and of those the slowest 20%
    (by ``trip_distance / travel_time``) are written to
    ``data/q3_data/taxi_traffic.csv``.

    NOTE(review): the input files are hard-coded ('tripsl.csv',
    'taxi.csv'); ``bike_inputs`` and ``taxi_inputs`` are not used.

    Args:
        bike_inputs: currently unused.
        taxi_inputs: currently unused.
        year: year of the bike trips to consider; ``None`` falls back to
            2016, the year this was previously fixed to.
    """
    target_year = 2016 if year is None else year

    # read df from csv
    bike_trips_df = spark.read.csv('tripsl.csv', header=True)
    taxi_trips_df = spark.read.csv('taxi.csv', header=True)

    # filter on given year
    bike_trips_df = bike_trips_df.withColumn(
        'date', bike_trips_df['starttime'].cast('date'))
    bike_trips_df = bike_trips_df.withColumn(
        'year', functions.year(bike_trips_df['date']))
    bike_trips_df = bike_trips_df.filter(bike_trips_df['year'] == target_year)

    # one row per distinct station pair (with its coordinates)
    bike_dist_df = bike_trips_df.groupby([
        'start_station_name', 'end_station_name', 'start_station_latitude',
        'start_station_longitude', 'end_station_latitude',
        'end_station_longitude'
    ]).count().withColumnRenamed(
        'start_station_name', 'startStationName').withColumnRenamed(
            'end_station_name', 'endStationName')

    # calculate distance
    bike_dist_df = bike_dist_df.withColumn(
        "dist",
        dist("start_station_longitude", "start_station_latitude",
             'end_station_longitude',
             'end_station_latitude').cast('decimal'))

    # calculate 50 percentile distance
    median_distance_km = bike_dist_df.orderBy('dist').selectExpr(
        'percentile_approx(dist, 0.5)').collect()[0][0]

    # convert km to miles
    average_distance = float(median_distance_km) * 0.621371

    # filter out any taxi trips larger than this distance
    new_taxi_trips_df = taxi_trips_df.filter(
        taxi_trips_df['trip_distance'] < average_distance).filter(
            taxi_trips_df['pickup_latitude'].isNotNull()).filter(
                taxi_trips_df['pickup_longitude'].isNotNull()).filter(
                    taxi_trips_df['dropoff_latitude'].isNotNull()).filter(
                        taxi_trips_df['dropoff_longitude'].isNotNull())

    # calculate the velocity of each trip (distance per unit of travel_time)
    new_taxi_trips_df = new_taxi_trips_df.withColumn(
        'velocity', new_taxi_trips_df['trip_distance'] /
        new_taxi_trips_df['travel_time']).orderBy('velocity')

    # keep the trips at or below the 20th percentile of velocity
    traffic_velocity = new_taxi_trips_df.selectExpr(
        'percentile_approx(velocity, 0.2)').collect()[0][0]
    new_taxi_trips_df = new_taxi_trips_df.filter(
        new_taxi_trips_df['velocity'] <= traffic_velocity)

    new_taxi_trips_df.repartition(1).toPandas().to_csv(
        'data/q3_data/taxi_traffic.csv', header=True)