def test_optimize_zorder_by_w_partition_filter(self) -> None:
    # write an unoptimized delta table
    df = self.spark.createDataFrame([i for i in range(0, 100)], IntegerType()) \
        .withColumn("col1", floor(col("value") % 7)) \
        .withColumn("col2", floor(col("value") % 27)) \
        .withColumn("p", floor(col("value") % 10)) \
        .repartition(4).write.partitionBy("p")
    df.format("delta").save(self.tempFile)

    # create DeltaTable
    dt = DeltaTable.forPath(self.spark, self.tempFile)

    # execute Z-OrderBy
    optimizer = dt.optimize().where("p = 2")
    result = optimizer.executeZOrderBy(["col1", "col2"])
    metrics = result.select("metrics.*").head()

    # assertions (partition 'p = 2' has four files)
    self.assertTrue(metrics.numFilesAdded == 1)
    self.assertTrue(metrics.numFilesRemoved == 4)
    self.assertTrue(metrics.totalFilesSkipped == 0)
    self.assertTrue(metrics.totalConsideredFiles == 4)
    self.assertTrue(metrics.zOrderStats.strategyName == 'all')
    self.assertTrue(metrics.zOrderStats.numOutputCubes == 1)
def test_optimize_zorder_by(self) -> None:
    # write an unoptimized delta table
    self.spark.createDataFrame([i for i in range(0, 100)], IntegerType()) \
        .withColumn("col1", floor(col("value") % 7)) \
        .withColumn("col2", floor(col("value") % 27)) \
        .withColumn("p", floor(col("value") % 10)) \
        .repartition(4).write.partitionBy("p").format("delta").save(self.tempFile)

    # create DeltaTable
    dt = DeltaTable.forPath(self.spark, self.tempFile)

    # execute Z-Order optimization
    optimizer = dt.optimize()
    result = optimizer.executeZOrderBy(["col1", "col2"])
    metrics = result.select("metrics.*").head()

    self.assertTrue(metrics.numFilesAdded == 10)
    self.assertTrue(metrics.numFilesRemoved == 37)
    self.assertTrue(metrics.totalFilesSkipped == 0)
    self.assertTrue(metrics.totalConsideredFiles == 37)
    self.assertTrue(metrics.zOrderStats.strategyName == 'all')
    self.assertTrue(metrics.zOrderStats.numOutputCubes == 10)

    # negative test: Z-Order on partition column
    def optimize() -> None:
        dt.optimize().where("p = 1").executeZOrderBy(["p"])

    self.__intercept(
        optimize,
        "p is a partition column. "
        "Z-Ordering can only be performed on data columns")
def skipp_attributes(df):
    song = udf(lambda x: int(x == 'NextSong'), IntegerType())
    skipped = udf(lambda x: int(x != 0), IntegerType())
    session = Window.partitionBy("userId", "sessionId").orderBy(desc("ts"))
    return df.select(
        'userId', 'page', 'ts', 'length', 'sessionId', 'itemInSession'
    ).where((df.page != 'Thumbs Up') & (df.page != 'Thumbs Down')).withColumn(
        'song', song('page')).orderBy(
        'userId', 'sessionId', 'itemInSession').withColumn(
        'nextActSong', lag(col('song')).over(session)).withColumn(
        'tsDiff', (lag('ts').over(session) - col('ts')) / 1000).withColumn(
        'timeSkipped', (floor('length') - col('tsDiff'))).withColumn(
        'roundedLength', floor('length')).where((col('song') == 1) & (
            (col('nextActSong') != 0) & (col('timeSkipped') >= 0))).withColumn(
        'skipped', skipped('timeSkipped')).select(
        'userId', 'timeSkipped', 'skipped', 'length', 'ts',
        'tsDiff').groupBy('userId').agg({
            'skipped': 'avg',
            'timeSkipped': 'avg'
        }).withColumnRenamed(
        'avg(skipped)', 'skipRate').withColumnRenamed(
        'avg(timeSkipped)', 'avgTimeSkipped')
def convert_time(data_df, min_time):
    data_df = data_df.withColumn(
        'day',
        f.floor((f.col('time') - min_time) / (3600 * 24)).cast('integer'))
    data_df = data_df.withColumn('week', f.col('day') % 7)
    data_df = data_df.withColumn(
        'hour',
        f.floor((f.col('time') - min_time) / 3600).cast('integer') % 24)
    # data_df = data_df.withColumn('hour', f.round(f.col('hour')).cast('integer'))
    return data_df
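# Usage sketch (not from the source) for convert_time above: the 'events' name and
# the epoch-second values are illustrative assumptions; the function only needs an
# epoch-seconds 'time' column and a reference start time.
from pyspark.sql import SparkSession, functions as f

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame([(0,), (90000,), (200000,)], ['time'])
min_time = events.agg(f.min('time')).first()[0]
# 'day' counts whole days since min_time, 'week' is a 0-6 day-of-week index,
# 'hour' is the hour-of-day bucket derived from the same offset.
convert_time(events, min_time).show()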
def calculation(criteria):
    """
    Applies the buy/sell calculations to every row of the dataframe through
    column operations.
    @type criteria: dataframe
    @param criteria: Joined table of user information and stock information
                     which needs to be calculated
    """
    # Buy: shares
    criteria = criteria.withColumn(
        'numb_share',
        when(col('previous_price') - col('price') > col('buy'),
             col('numb_share') + floor(col('cash') / col('price')))
        .otherwise(col('numb_share')))
    # Buy: total value adjustment
    criteria = criteria.withColumn(
        'total_value',
        when(col('previous_price') - col('price') > col('buy'),
             col('total_value') + floor(col('cash') / col('price')) * col('price'))
        .otherwise(col('total_value')))
    # Buy: cash adjustment
    criteria = criteria.withColumn(
        'cash',
        when(col('previous_price') - col('price') > col('buy'),
             col('cash') - floor(col('cash') / col('price')) * col('price'))
        .otherwise(col('cash')))
    # Sell: profit calculation
    criteria = criteria.withColumn(
        'profit',
        when(col('price') - col('previous_price') > col('sell'),
             col('profit') + (col('numb_share') * col('price')) - col('total_value'))
        .otherwise(col('profit')))
    # Sell: cash adjustment
    criteria = criteria.withColumn(
        'cash',
        when(col('price') - col('previous_price') > col('sell'),
             col('cash') + col('total_value')).otherwise(col('cash')))
    # Sell: total value adjustment
    criteria = criteria.withColumn(
        'total_value',
        when(col('price') - col('previous_price') > col('sell'),
             0).otherwise(col('total_value')))
    # Sell: shares adjustment
    criteria = criteria.withColumn(
        'numb_share',
        when(col('price') - col('previous_price') > col('sell'),
             0).otherwise(col('numb_share')))
    # Time adjustment
    criteria = criteria.withColumn(
        'time',
        when((col('previous_price') - col('price') > col('buy')) |
             (col('price') - col('previous_price') > col('sell')),
             col('time_new')).otherwise(col('time')))
    # Previous price adjustment
    criteria = criteria.withColumn(
        'previous_price',
        when((col('previous_price') - col('price') > col('buy')) |
             (col('price') - col('previous_price') > col('sell')),
             col('price')).otherwise(col('previous_price')))
    criteria = criteria.drop('time_new', 'volume', 'price')
    combine(criteria, cass_data)
def percentiles(df, c, by=None, p=[10, 25, 50, 75, 90], index='_idx', result='_res'):
    _gcols = [by] if isinstance(by, str) and by else by or []
    ptile = f'{c}##p'

    # percentiles per row
    w = Window.partitionBy(*_gcols).orderBy(c)
    d = df.select(c, *_gcols,
                  F.floor(100 * (F.percent_rank().over(w))).alias(ptile))

    # aggregate
    agg_keys = F.array(*[F.lit(x) for x in p])
    agg_values = F.array(
        *[F.max(F.when(F.col(ptile) < x, F.col(c))) for x in p])
    r = d.groupby(*_gcols).agg(
        F.map_from_arrays(agg_keys, agg_values).alias(result))

    # add colname
    r = r.withColumn(index, F.lit(c))
    return r
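# Usage sketch (not from the source) for percentiles above: the 'sales' data and
# the 'store'/'amount' column names are illustrative assumptions. The result has
# one row per group with a map from percentile (10, 25, 50, 75, 90) to value.
from pyspark.sql import SparkSession, Window, functions as F

spark = SparkSession.builder.getOrCreate()
sales = spark.createDataFrame(
    [("a", float(v)) for v in range(1, 101)] +
    [("b", float(v)) for v in range(1, 51)],
    ["store", "amount"])
percentiles(sales, "amount", by="store").show(truncate=False)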
def create_train_data():
    w1 = Window.orderBy("uid")
    w2 = Window.partitionBy("seg").orderBy("uid")

    df_train = spark.read.csv(
        os.path.join("datasets", "train.csv"), header=True,
        schema=schema).withColumn(
            "uid", monotonically_increasing_id()).withColumn(
            "idx", row_number().over(w1).cast(IntegerType())).withColumn(
            "seg", fn.floor((fn.col("idx") - 1) / 150000).cast(
                IntegerType())).withColumn(
            "no", row_number().over(w2).cast(IntegerType())).withColumn(
            "name", fn.concat(
                lit("raw_"),
                fn.lpad(fn.col("seg"), 4, "0").cast(StringType()))).withColumn(
            "set", lit(0))

    df_train.createOrReplaceTempView("data")

    df_train_f = spark.sql("""
        SELECT uid, set, seg, no, name, x, y
        FROM data
        ORDER BY set, seg, no, uid
    """)

    df_train_f = df_train_f.repartition(1)
    df_train_f.write.mode("overwrite").parquet(
        os.path.join("datasets", "train.parquet"))
def granularityPartition(dataDF, N, aggMode='mean'):
    '''
    Partition a purely numeric table into blocks of N records and aggregate
    each block with the given aggregation method. Only purely numeric tables
    are supported.
    :param dataDF: DataFrame to process
    :param N: block size
    :param aggMode: aggregation method, one of mean/min/max/count
    :return: the aggregated DataFrame
    '''
    mode2index = {"mean": 1, "max": 3, "min": 2, "count": 0}
    if aggMode not in mode2index:
        raise ValueError("aggMode must be one of mean/min/max/count")
    dataDF = addIdCol(dataDF, idFieldName="id_temp_bob")
    dataDF = dataDF.withColumn("group_id_temp_bob", floor(dataDF.id_temp_bob / N))
    # dispatch to GroupedData.mean()/min()/max()/count() by name
    dataDF = getattr(dataDF.groupby('group_id_temp_bob'), aggMode)()
    dataDF = dataDF.drop("id_temp_bob").drop("group_id_temp_bob")
    dataDF = changeFieldName(dataDF, "count", "countN")
    for each in dataDF.columns:
        dataDF = changeFieldName(dataDF, each,
                                 each.replace('(', '_').replace(')', '_'))
    if aggMode == 'mean':
        return dataDF.drop('avg_id_temp_bob_').drop("avg_group_id_temp_bob_")
    else:
        return dataDF.drop(aggMode + "_id_temp_bob_").drop(
            aggMode + "_group_id_temp_bob_")
def floordiv(left, right):
    return F.when(F.lit(right is np.nan), np.nan).otherwise(
        F.when(
            F.lit(right != 0) | F.lit(right).isNull(),
            F.floor(left.__div__(right))).otherwise(
                F.when(
                    F.lit(left == np.inf) | F.lit(left == -np.inf),
                    left).otherwise(F.lit(np.inf).__div__(left))))
def floordiv(left: Column, right: Any) -> Column:
    return F.when(SF.lit(right is np.nan), np.nan).otherwise(
        F.when(
            SF.lit(right != 0) | SF.lit(right).isNull(),
            F.floor(left.__div__(right))).otherwise(
                F.when(
                    SF.lit(left == np.inf) | SF.lit(left == -np.inf),
                    left).otherwise(SF.lit(np.inf).__div__(left))))
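# Usage sketch (not from the source) for the floordiv helpers above, which appear
# to emulate pandas-style floor-division semantics (NaN/inf and divide-by-zero
# handling) on Spark columns. It assumes the first, F-only variant is in scope;
# the sample DataFrame and the 'x' column name are illustrative assumptions.
import numpy as np
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(7.0,), (9.0,), (-3.0,)], ["x"])
# a non-zero, non-NaN divisor takes the ordinary branch: floor(x / 2)
df.select(floordiv(F.col("x"), 2).alias("x_floordiv_2")).show()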
def _discretize_time(self, column: sf.Column) -> sf.Column:
    days_since_study_start = sf.datediff(column, sf.lit(self.study_start))
    bucket = sf.floor(days_since_study_start / self.bucket_size).cast("int")
    if self.bucket_rounding == "floor":
        bucket = (sf.when(
            (bucket < self.n_buckets) | bucket.isNull(),
            bucket).otherwise(self.n_buckets - 1).cast("int"))
    return bucket
def main(conf):
    spark_session = SparkSession.builder.appName("TopMoviesPerDecade")\
        .getOrCreate()

    movies_df, ratings_df = load_data(spark_session)

    ratings_decade_wise = ratings_df.withColumn(
        'decade',
        func.floor(func.year(func.from_unixtime('time_stamp')
                             .cast(DateType())) / 10) * 10)\
        .drop('time_stamp')

    movie_data_tmp = movies_df.drop('movie_name')
    ratings_w_movies = ratings_decade_wise.join(
        func.broadcast(movie_data_tmp),
        ratings_decade_wise.movie_id == movie_data_tmp.movie_id,
        how='left').drop(movie_data_tmp.movie_id)

    ratings_w_movies = ratings_w_movies.withColumn(
        'categories',
        func.explode(func.split(ratings_w_movies["genre"], "\\|")))\
        .drop('genre', 'rating')

    ratings_agg = ratings_w_movies.groupBy("decade", "categories", "movie_id").agg({
        'categories': 'count'
    }).withColumnRenamed('count(categories)', 'freq')

    window_spec = Window.partitionBy("decade", "categories").orderBy(func.desc("freq"))
    ratings_agg = ratings_agg.withColumn("rank", func.rank().over(window_spec))

    top10 = ratings_agg.where(ratings_agg["rank"] <= 10)
    top10.show(100)

    categories = [('Crime', 1), ('Romance', 2), ('Thriller', 3), ('Adventure', 4),
                  ('Drama', 5), ('War', 6), ('Documentary', 7), ('Fantasy', 8),
                  ('Mystery', 9), ('Musical', 10), ('Animation', 11),
                  ('Film-Noir', 12), ('(no genres listed)', 13), ('IMAX', 14),
                  ('Horror', 15), ('Western', 16), ('Comedy', 17), ('Children', 18),
                  ('Action', 19), ('Sci-Fi', 20)]
    category_df = spark_session.createDataFrame(categories,
                                                ['categories', 'category_id'])

    top10 = top10.join(func.broadcast(category_df), ['categories'])

    movie_data = movies_df.drop('genre')
    top10 = top10.join(func.broadcast(movie_data), ['movie_id'],
                       how='left').drop('categories', 'freq')
    top10 = top10.withColumnRenamed('r', 'rank')
    top10.show(1000)

    pg_cred = conf.pg_db['pg_data_lake']
def gts_from_impute(infile):
    # Get the main data and put a unique index on each variant
    maindata = infile.filter(infile.data[0:1] != "#")
    splitdata = maindata.select(
        "filename",
        f.split(maindata.data, "[\t ]+").alias("split_data"),
        maindata.lineid.alias("VAR_IDX"))
    gtdata1 = splitdata.select(
        "filename", "VAR_IDX",
        f.posexplode(splitdata.split_data)).toDF(
            "filename", "VAR_IDX", "COLUMN_IDX", "GTPROB").filter("COLUMN_IDX > 4")

    # Now, get subject ID and which GT
    gtdata2 = gtdata1.select(
        "filename", "VAR_IDX", "GTPROB", "COLUMN_IDX",
        f.floor((gtdata1.COLUMN_IDX - 5) / 3).alias("SAMPLE_IDX"),
        ((gtdata1.COLUMN_IDX - 5) % 3).cast(StringType()).alias("GT_IDX"))
    gtdata3 = rkutil.withColumnsRenamed(
        gtdata2.groupBy("filename", "VAR_IDX", "SAMPLE_IDX")
        .pivot("GT_IDX", ["0", "1", "2"])
        .agg(f.collect_list("GTPROB")),
        ["0", "1", "2"], ["c0", "c1", "c2"])
    gtdata4 = gtdata3.select(
        "filename", "VAR_IDX", "SAMPLE_IDX",
        f.element_at(gtdata3.c0, 1).cast(FloatType()).alias("P11"),
        f.element_at(gtdata3.c1, 1).cast(FloatType()).alias("P12"),
        f.element_at(gtdata3.c2, 1).cast(FloatType()).alias("P22"))
    return gtdata4
def __init__(self):
    super(FeatureResponse5xxTotal, self).__init__()
    self.group_by_aggs = {
        '5xx': F.count(F.when(F.col('5xx') == True, F.col('5xx')))  # noqa
    }
    self.pre_group_by_calcs = {
        'response_code_category': F.floor(F.col('http_response_code') / 100.),
        '5xx': F.col('response_code_category') == 5,
    }
def __init__(self):
    super(FeatureResponse4xxToRequestRatio, self).__init__()
    self.group_by_aggs = {
        '4xx': F.count(F.when(F.col('4xx') == True, F.col('4xx'))),  # noqa
        'num_requests': F.count(F.col('@timestamp')).cast('float'),
    }
    self.pre_group_by_calcs = {
        'response_code_category': F.floor(F.col('http_response_code') / 100.),
        '4xx': F.col('response_code_category') == 4,
    }
def GetFirstDate(df, _unitoftime):
    # find the minimum date
    df_grouped = df.groupby([df['STUDYID'], df["CODE"]]).agg(min(df['DAYS_INDEX']))
    # convert to first date
    df_grouped = df_grouped.withColumn(
        "DAYS_INDEX",
        floor(df_grouped["min(DAYS_INDEX)"].cast(FloatType()) / _unitoftime))
    # drop the minimum days
    df_grouped = df_grouped.drop(df_grouped["min(DAYS_INDEX)"])
    # Filter out all diagnoses that occurred at time 0 AND also filter out the
    # diagnosis we are looking for
    return df_grouped
def __calc_stats(self, df, resolution):
    """
    Calculates statistics for every column in the Spark DF and returns a
    separate DF with the results.
    Statistics: sum, min, max, count, mean, kurtosis, skewness, stddev, variance.

    :param df: DF containing the columns that you want to run your statistics
        calculations on
    :param resolution: int resolution in milli or microseconds OR string
        '5m'/'1h'/'1d'
    :return: aggregation dataframe containing statistics
    """
    if type(resolution) is str:
        # resolution to microseconds
        res_dict = {'5m': 300000000, '1h': 3600000000, '1d': 86400000000}
        agg_interval = res_dict[resolution]
    elif type(resolution) is int:
        if len(str(resolution)) < 16:
            resolution = int(str(resolution).ljust(16, '0'))
        agg_interval = resolution

    ts_col = F.col('timestamp')
    df_ori_cols = list(set(df.columns) - set(['timestamp']))
    df = df.withColumn('interval_start',
                       (F.floor(ts_col / agg_interval) * agg_interval))
    # .withColumn('interval_stop', F.ceil(ts_col / agg_interval) * agg_interval)
    # .orderBy(F.col('interval_start'))

    agg_df = df.groupBy('interval_start').agg(F.max(ts_col).alias('max_ts'))

    # TODO Column type checking: string columns are automatically ignored and
    # TODO parse as NaN, so drop NaN columns?
    # TODO: interval_stop ignore, as well as drop max_ts
    # TODO: filter out NaN columns
    # TODO: question: run the statistics job as a separate job without having
    # TODO  to make a udf script
    stat_cols = df_ori_cols
    # [c for c in df_ori_cols if c not in ['interval_start', 'interval_stop', 'timestamp', 'max_ts']]
    for column in stat_cols:
        grouped_df = df.groupBy('interval_start')\
            .agg(F.sum(column).alias('sum_%s' % column),
                 F.min(column).alias('min_%s' % column),
                 F.max(column).alias('max_%s' % column),
                 F.count(column).alias('count_%s' % column),
                 F.kurtosis(column).alias('kurtosis_%s' % column),
                 F.mean(column).alias('mean_%s' % column),
                 F.skewness(column).alias('skewness_%s' % column),
                 F.stddev(column).alias('stddev_%s' % column),
                 F.variance(column).alias('var_%s' % column))
        agg_df = grouped_df.join(agg_df, on='interval_start')

    # agg_df = agg_df.drop('max_ts').drop(F.when(F.col('*').isna())).dropna(how='all').drop_duplicates()
    return agg_df
def __init__(self):
    super(FeatureResponse4xxRate, self).__init__()
    self.group_by_aggs.update({
        '4xx': F.count(F.when(F.col('4xx') == True, F.col('4xx'))),  # noqa
    })
    self.pre_group_by_calcs.update({
        'response_code_category': F.floor(F.col('http_response_code') / 100.),
        '4xx': F.col('response_code_category') == 4,
    })
def add_decade_column(df: DataFrame, date_col: str = 'date') -> DataFrame:
    """
    Add year and decade columns from date column.

    :param df: dataframe including date column
    :param date_col: column name of date
    :return: dataframe
    """
    df = df.withColumn('year', F.year(date_col))
    df = df.withColumn('decade', (F.floor(F.col('year') / 10) * 10).cast('string'))
    df = df.withColumn('decade', F.concat('decade', F.lit('s')))
    logging.info("Decade and year columns are generated from date column")
    return df
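# Usage sketch (not from the source) for add_decade_column above: the 'films'
# rows and the 'release_date' column name are illustrative assumptions.
import logging
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
films = spark.createDataFrame(
    [("1994-06-23",), ("2003-11-05",)], ["release_date"]
).withColumn("release_date", F.to_date("release_date"))
# yields year=1994, decade='1990s' and year=2003, decade='2000s'
add_decade_column(films, date_col="release_date").show()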
def get_distribucion_de_clientes_por_facturas_emitidas(data):
    df = None
    if data is not None:
        df = (data
              # InvoiceID is equivalent to InvoiceNo
              .select('CustomerID', 'InvoiceNo', 'Total')
              .groupBy('CustomerID')
              .agg(count('InvoiceNo').alias('facturas'),
                   sum('Total').alias('Total'))
              .withColumn('facturas', floor(col('facturas') / 10) * 10)
              .groupBy('facturas')
              .agg(count('CustomerID').alias('Clientes'))
              .sort('facturas'))
    return df
def add_features(df):
    df = df.withColumn("hour", hour(df["pickupDatetime"]).cast("int"))
    df = df.withColumn("year", year(df["pickupDatetime"]).cast("int"))
    df = df.withColumn("month", month(df["pickupDatetime"]).cast("int"))
    df = df.withColumn("day", dayofmonth(df["pickupDatetime"]).cast("int"))
    df = df.withColumn("day_of_week", dayofweek(df["pickupDatetime"]).cast("int"))
    df = df.withColumn(
        "diff", datediff(df["dropoffDatetime"], df["pickupDatetime"]).cast("int")
    )
    df = df.withColumn(
        "startLatr", (F.floor(df["startLat"] / (0.01)) * 0.01).cast("double")
    )
    df = df.withColumn(
        "startLonr", (F.floor(df["startLon"] / (0.01)) * 0.01).cast("double")
    )
    df = df.withColumn(
        "endLatr", (F.floor(df["endLat"] / (0.01)) * 0.01).cast("double")
    )
    df = df.withColumn(
        "endLonr", (F.floor(df["endLon"] / (0.01)) * 0.01).cast("double")
    )
    # df = df.drop('pickup_datetime', axis=1)
    # df = df.drop('dropoff_datetime', axis=1)
    import numpy
    # df.withColumn("h_distance", haversine_distance(
    #     df.select("startLat"),
    #     df.select("startLon"),
    #     df.select("endLat"),
    #     df.select("endLon"),
    # ).cast('double'))
    df = df.withColumn("is_weekend", (df["day_of_week"] > 5).cast("int"))
    return df
def get_distribucion_de_beneficios_por_facturas_emitidas(data):
    df = None
    if data is not None:
        df = (data
              .select('CustomerID', 'InvoiceNo', 'Total')
              .groupBy('CustomerID')
              .agg(count('InvoiceNo').alias('facturas'),
                   sum('Total').alias('Total'))
              .withColumn('facturas', floor(col('facturas') / 10) * 10)
              .groupBy('facturas')
              .agg(sum('Total').alias('Importe Total'))
              .sort('facturas'))
    return df
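# Usage sketch (not from the source) for the two distribution helpers above: the
# sample invoice rows are illustrative assumptions. 'facturas' buckets each
# customer's invoice count into steps of 10 before the second group-by.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum, floor

spark = SparkSession.builder.getOrCreate()
invoices = spark.createDataFrame(
    [(1, "A001", 10.0), (1, "A002", 5.0), (2, "B001", 20.0)],
    ["CustomerID", "InvoiceNo", "Total"])
get_distribucion_de_clientes_por_facturas_emitidas(invoices).show()
get_distribucion_de_beneficios_por_facturas_emitidas(invoices).show()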
def cut(infile, QE_info, sorted_res=True):
    spark = SparkSession.builder.master('local').appName("slice").getOrCreate()
    dataschema = StructType([
        StructField("H", FloatType(), False),
        StructField("K", FloatType(), False),
        StructField("L", FloatType(), False),
        StructField("E", FloatType(), False),
        StructField("I", FloatType(), False)])
    df = spark.read.csv(infile, sep=",", schema=dataschema)

    starts, ends, steps = convert_to_ses(QE_info)
    heads = ['H', 'K', 'L', 'E']
    res_heads, res_shape = [], []

    df_in_range = df.filter((df.H >= starts[0]) & (df.H < ends[0]) &
                            (df.K >= starts[1]) & (df.K < ends[1]) &
                            (df.L >= starts[2]) & (df.L < ends[2]) &
                            (df.E >= starts[3]) & (df.E < ends[3]))

    for col_ix, col_name in enumerate(heads):
        if steps[col_ix] != 0 and steps[col_ix] != ends[col_ix] - starts[col_ix]:
            res_heads.append(col_name + '_bin_ix')
            res_shape.append(
                ceil((ends[col_ix] - starts[col_ix]) / steps[col_ix]))
            # find_ix = UserDefinedFunction(lambda x: floor((x - starts[col_ix]) / steps[col_ix]), IntegerType())
            df_in_range = df_in_range.withColumn(
                col_name + '_bin_ix',
                func.floor((col(col_name) - starts[col_ix]) / steps[col_ix]))

    if not res_heads:  # means 0-dimension
        spark.stop()
        return np.array(df.groupBy().avg('I').collect())

    raw_res = np.array(
        df_in_range.groupBy(*res_heads).agg({'I': 'mean'}).collect())
    spark.stop()

    if sorted_res:
        res = np.full(res_shape, np.nan)
        if len(res_shape) == 1:  # means 1-dimension
            for row in raw_res:
                res[int(row[0])] = row[1]
        else:
            for row in raw_res:
                res[tuple(row[:-1].astype(int))] = row[-1]
        return res
    else:
        return raw_res
def compute_precision_recall_graph(predictions, n_points):
    inf_cumulative_window = \
        (Window
         .partitionBy('label')
         .orderBy('id_bucket')
         .rowsBetween(Window.unboundedPreceding, Window.currentRow))
    sup_cumulative_window = \
        (Window
         .partitionBy('label')
         .orderBy('id_bucket')
         .rowsBetween(1, Window.unboundedFollowing))

    def prob_positive(v):
        try:
            return float(v[1])
        except ValueError:
            return None

    prob_positive = udf(prob_positive, DoubleType())

    return \
        (predictions
         .select('label',
                 floor(prob_positive('probability') * n_points)
                 .alias('id_bucket'))
         .groupBy('label', 'id_bucket').count()
         .withColumn('count_negatives',
                     sum('count').over(inf_cumulative_window))
         .withColumn('count_positives',
                     sum('count').over(sup_cumulative_window))
         .groupBy('id_bucket').pivot('label', [0, 1])
         .sum('count_negatives', 'count_positives')
         .select(((col('id_bucket') + 1) / n_points).alias('threshold'),
                 col('0_sum(count_negatives)').alias('true_negative'),
                 col('0_sum(count_positives)').alias('false_positive'),
                 col('1_sum(count_negatives)').alias('false_negative'),
                 col('1_sum(count_positives)').alias('true_positive'))
         .select(col('threshold').alias('Threshold'),
                 (col('true_positive') /
                  (col('true_positive') + col('false_positive')))
                 .alias('Precision'),
                 (col('true_positive') /
                  (col('true_positive') + col('false_negative')))
                 .alias('Recall'),
                 (col('false_positive') /
                  (col('false_positive') + col('true_negative')))
                 .alias('FPR'))
         .orderBy('Threshold')
         .toPandas())
def windowing(df, batch_size):
    """
    Args:
        df: dataframe to perform windowing on
        batch_size: number of rows per batch
    """
    if "timestamp" not in df.columns:
        raise ValueError("timestamp column not found!")
    df = df.withColumn("timestamp_1", F.unix_timestamp(F.col("timestamp")))
    window_spec = Window.orderBy("timestamp_1")
    return df.withColumn(
        "batch_id",
        F.floor(
            (F.row_number().over(window_spec) - F.lit(1)) / int(batch_size)),
    )
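# Usage sketch (not from the source) for windowing above: the sample events and
# the batch size of 2 are illustrative assumptions.
from pyspark.sql import SparkSession, Window, functions as F

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [("2021-01-01 00:00:00",), ("2021-01-01 00:01:00",),
     ("2021-01-01 00:02:00",), ("2021-01-01 00:03:00",)],
    ["timestamp"],
).withColumn("timestamp", F.to_timestamp("timestamp"))
# rows are numbered by time and grouped into batches of 2: batch_id = 0, 0, 1, 1
windowing(events, batch_size=2).show()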
def load(self, df):
    # if df.rdd.isEmpty():
    #     return

    column_timestamp = col('_time_updated').cast('bigint')
    column_period = floor(column_timestamp / self.period_seconds)

    df = df \
        .withColumn('_time_updated', current_timestamp()) \
        .withColumn('_time_updated_period', column_period)

    function.write_delta(df, self.path_target,
                         name_column_partition='_time_updated_period')
def dist_buss_toronto_stars(self, output):
    df_business = self.df_business
    df_business.createOrReplaceTempView('business')
    df_business_clean = spark.sql(
        "select *,regexp_replace(PostalCode,' ','') as ZipCode from business")
    df_toronto_data = df_business_clean.select(
        df_business_clean['BusinessID'],
        df_business_clean['Name'],
        df_business_clean['ZipCode'],
        df_business_clean['Latitude'],
        df_business_clean['Longitude'],
        functions.floor(df_business_clean['BusinessStars']).alias('Stars'))\
        .where(df_business_clean['City'] == 'Toronto')
    # write data to output
    df_toronto_data.write.csv(output, header=True)
def customer_meta(df):
    SENIOR_CUTOFF = 65
    ADULT_CUTOFF = 18
    DAYS_IN_YEAR = 365.25
    EXPONENTIAL_DIST_SCALE = 6.3

    augmented_original = replicate_df(df, options["dup_times"] or 1)

    customerMetaRaw = augmented_original.select(
        "customerID",
        F.lit(now).alias("now"),
        (F.abs(F.hash(augmented_original.customerID)) % 4096 / 4096).alias("choice"),
        "SeniorCitizen",
        "gender",
        "Partner",
        "Dependents",
        F.col("MonthlyCharges").cast(get_currency_type()).alias("MonthlyCharges"),
    )

    customerMetaRaw = customerMetaRaw.withColumn(
        "ageInDays",
        F.floor(
            F.when(
                customerMetaRaw.SeniorCitizen == 0,
                (customerMetaRaw.choice *
                 ((SENIOR_CUTOFF - ADULT_CUTOFF - 1) * DAYS_IN_YEAR)) +
                (ADULT_CUTOFF * DAYS_IN_YEAR),
            ).otherwise(
                (SENIOR_CUTOFF * DAYS_IN_YEAR) +
                (DAYS_IN_YEAR *
                 (-F.log1p(-customerMetaRaw.choice) * EXPONENTIAL_DIST_SCALE)))
        ).cast("int"),
    )

    customerMetaRaw = customerMetaRaw.withColumn(
        "dateOfBirth", F.expr("date_sub(now, ageInDays)"))

    return customerMetaRaw.select(
        "customerID",
        "dateOfBirth",
        "gender",
        "SeniorCitizen",
        "Partner",
        "Dependents",
        "MonthlyCharges",
        "now",
    ).orderBy("customerID")
def gen_star_counts(self, output):
    df_users = self.df_users
    # get users stars base number
    df_stars_range = df_users.select(
        df_users['UserID'],
        functions.floor(df_users['AverageStars']).alias('Stars'))
    # group by user stars
    df_stars_groups = df_stars_range.groupBy(df_stars_range['Stars'])
    # get count of each group
    df_stars_count = df_stars_groups.agg(
        functions.count(df_stars_range['UserID']).alias('UsersCount'))
    # sort data
    df_sorted = df_stars_count.orderBy(df_stars_count['Stars'])
    # write data to output
    df_sorted.write.csv(output, header=True)
def process_data(raw_data_sdf, bert_layer):
    """
    Performs the bulk of the work of tokenization and other cleanups.
    Returns a reduced spark data frame including ids, masks, and segments,
    and other helpful elements.

    :param raw_data_sdf: spark dataframe, the news stories to be processed
    :param bert_layer: tensorflow Keras layer for the BERT model being used.
    """
    global stop_words_bc, tokenizer, domains_bc

    # add weeks column
    clean_data_sdf = raw_data_sdf.withColumn(
        'weeks',
        f.floor(f.datediff(f.col('published'), f.lit('2010-01-01')) / 7))

    log_time("Begin regex")
    clean_data_sdf = clean_data_sdf.withColumn('regex',
                                               udf_add_regex('source_domain'))

    # remove all the identifying text from stories
    clean_data_sdf = clean_data_sdf.withColumn(
        'clean_text', udf_clean_text(f.array('text_or_desc', 'regex')))
    clean_data_sdf.take(100)

    log_time("Begin tokenizer")
    tokenizer = get_tokenizer(bert_layer)
    clean_data_sdf = clean_data_sdf.withColumn('tokens',
                                               udf_get_tokens('clean_text'))

    log_time("Begin masks, etc.")
    clean_data_sdf = clean_data_sdf.withColumn('masks', udf_get_masks('tokens'))
    clean_data_sdf = clean_data_sdf.withColumn('segments',
                                               udf_get_segments('tokens'))
    clean_data_sdf = clean_data_sdf.withColumn('ids', udf_get_ids('tokens'))
    clean_data_sdf = clean_data_sdf.withColumn(
        'source_index', udf_source_index('source_domain').cast('int'))

    # let's slim down the dataframe before we save it to disk.
    clean_data_sdf = clean_data_sdf[[
        'source_domain', 'text_or_desc', 'clean_text', 'published', 'year',
        'title', 'url', 'weeks', 'tokens', 'masks', 'segments', 'ids',
        'source_index'
    ]]
    return clean_data_sdf