def test_mixed_sql_and_udf(self):
    df = self.data
    w = self.unbounded_window
    ow = self.ordered_window
    max_udf = self.pandas_agg_max_udf
    min_udf = self.pandas_agg_min_udf

    result1 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min_udf(df['v']).over(w))
    expected1 = df.withColumn('v_diff', max(df['v']).over(w) - min(df['v']).over(w))

    # Test mixing sql window function and window udf in the same expression
    result2 = df.withColumn('v_diff', max_udf(df['v']).over(w) - min(df['v']).over(w))
    expected2 = expected1

    # Test chaining sql aggregate function and udf
    result3 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
        .withColumn('min_v', min(df['v']).over(w)) \
        .withColumn('v_diff', col('max_v') - col('min_v')) \
        .drop('max_v', 'min_v')
    expected3 = expected1

    # Test mixing sql window function and udf
    result4 = df.withColumn('max_v', max_udf(df['v']).over(w)) \
        .withColumn('rank', rank().over(ow))
    expected4 = df.withColumn('max_v', max(df['v']).over(w)) \
        .withColumn('rank', rank().over(ow))

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
    self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
    self.assertPandasEqual(expected3.toPandas(), result3.toPandas())
    self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
def gen_report_table(hc, curUnixDay):
    rows_indoor = sc.textFile("/data/indoor/*/*").map(lambda r: r.split(",")).map(
        lambda p: Row(clientmac=p[0], entityid=int(p[1]), etime=int(p[2]), ltime=int(p[3]),
                      seconds=int(p[4]), utoday=int(p[5]), ufirstday=int(p[6])))
    HiveContext.createDataFrame(hc, rows_indoor).registerTempTable("df_indoor")
    # ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY
    sql = "select entityid,clientmac,utoday,UFIRSTDAY,seconds,"
    sql = sql + "count(1) over(partition by entityid,clientmac) as total_cnt,"
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 2505600 preceding) as day_30,"  # 2505600 is 29 days
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 518400 preceding) as day_7,"  # 518400 is 6 days
    sql = sql + "count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY range 1 preceding) as pre_mon "
    sql = sql + "from df_indoor order by entityid,clientmac,utoday"
    df_id_stat = hc.sql(sql)
    # df_id_mm is the min/max frame, used to calculate first arrival and last arrival
    df_id_mm = df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid", "clientmac"))) \
                         .withColumn("max", func.max("utoday").over(Window.partitionBy("entityid", "clientmac")))
    # distinct df is needed for the lag function to work
    df_id_stat_distinct = df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
    df_id_prepremon = df_id_stat_distinct.withColumn(
        "prepre_mon",
        func.lag("pre_mon").over(Window.partitionBy("entityid", "clientmac").orderBy("entityid", "clientmac", "UFIRSTDAY"))
    ).drop("pre_mon").na.fill(0)
    cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac,
               df_id_mm.entityid == df_id_prepremon.entityid,
               df_id_mm.UFIRSTDAY == df_id_prepremon.UFIRSTDAY]
    df_indoor_fin_tmp = df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(
        df_id_mm.entityid, df_id_mm.clientmac, df_id_mm.utoday, df_id_mm.UFIRSTDAY, df_id_mm.seconds,
        df_id_mm.day_30, df_id_mm.day_7, df_id_mm.min, df_id_mm.max, df_id_mm.total_cnt,
        df_id_prepremon.prepre_mon)
    df_indoor_fin_tmp = df_indoor_fin_tmp.selectExpr(
        "entityid as entityid", "clientmac as clientmac", "utoday as utoday", "UFIRSTDAY as ufirstday",
        "seconds as secondsbyday", "day_30 as indoors30", "day_7 as indoors7", "min as FirstIndoor",
        "max as LastIndoor", "total_cnt as indoors", "prepre_mon as indoorsPrevMonth")

    # newly added part for indoors7 and indoors30 based on current date
    df_indoor_fin_tmp1 = df_indoor_fin_tmp.withColumn(
        "r_day_7", func.when((curUnixDay - df_indoor_fin_tmp.utoday) / 86400 < 7, 1).otherwise(0))
    df_indoor_fin_tmp2 = df_indoor_fin_tmp1.withColumn(
        "r_day_30", func.when((curUnixDay - df_indoor_fin_tmp1.utoday) / 86400 < 30, 1).otherwise(0))
    df_indoor_fin_tmp3 = df_indoor_fin_tmp2.withColumn(
        "r_indoors7", func.sum("r_day_7").over(Window.partitionBy("entityid", "clientmac")))
    df_indoor_fin_tmp4 = df_indoor_fin_tmp3.withColumn(
        "r_indoors30", func.sum("r_day_30").over(Window.partitionBy("entityid", "clientmac")))
    df_indoor_fin = df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30")
    hc.sql("drop table if exists df_indoor_fin")
    df_indoor_fin.write.saveAsTable("df_indoor_fin")

    rows_flow = sc.textFile("/data/flow/*/*").map(lambda r: r.split(",")).map(
        lambda p: Row(clientmac=p[0], entityid=int(p[1]), etime=int(p[2]), ltime=int(p[3]),
                      utoday=int(p[4]), ufirstday=int(p[5])))
    HiveContext.createDataFrame(hc, rows_flow).registerTempTable("df_flow")
    # ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday
    sql = "select entityid,clientmac,utoday,UFIRSTDAY,"
    sql = sql + "count(1) over(partition by entityid,clientmac) as total_cnt,"
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 2505600 preceding) as day_30,"  # 2505600 is 29 days
    sql = sql + "count(1) over (partition by entityid,clientmac order by utoday range 518400 preceding) as day_7,"  # 518400 is 6 days
    sql = sql + "count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY range 1 preceding) as pre_mon "
    sql = sql + "from df_flow order by entityid,clientmac,utoday"
    df_fl_stat = hc.sql(sql)
    # df_fl_mm is the min/max frame, used to calculate first arrival and last arrival
    df_fl_mm = df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid", "clientmac"))) \
                         .withColumn("max", func.max("utoday").over(Window.partitionBy("entityid", "clientmac")))
    # distinct df is needed for the lag function to work
    df_fl_stat_distinct = df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
    df_fl_prepremon = df_fl_stat_distinct.withColumn(
        "prepre_mon",
        func.lag("pre_mon").over(Window.partitionBy("entityid", "clientmac").orderBy("entityid", "clientmac", "UFIRSTDAY"))
    ).drop("pre_mon").na.fill(0)
    cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac,
               df_fl_mm.entityid == df_fl_prepremon.entityid,
               df_fl_mm.UFIRSTDAY == df_fl_prepremon.UFIRSTDAY]
    df_flow_fin = df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(
        df_fl_mm.entityid, df_fl_mm.clientmac, df_fl_mm.utoday, df_fl_mm.UFIRSTDAY,
        df_fl_mm.day_30, df_fl_mm.day_7, df_fl_mm.min, df_fl_mm.max, df_fl_mm.total_cnt,
        df_fl_prepremon.prepre_mon)
    df_flow_fin = df_flow_fin.selectExpr(
        "entityid as entityid", "clientmac as clientmac", "utoday as utoday", "UFIRSTDAY as ufirstday",
        "day_30 as visits30", "day_7 as visits7", "min as FirstVisit", "max as LastVisit",
        "total_cnt as visits", "prepre_mon as visitsPrevMonth")
    hc.sql("drop table if exists df_flow_fin")
    df_flow_fin.write.saveAsTable("df_flow_fin")
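# The report SQL above leans on range-based window frames measured in epoch seconds
# (2505600 s = 29 days, 518400 s = 6 days). A minimal sketch of the same rolling count
# expressed with the DataFrame window API instead of a SQL string; the column names match
# the query above, but the snippet itself is an illustration, not part of the original job.
from pyspark.sql import Window
from pyspark.sql import functions as func

w_30d = (Window.partitionBy("entityid", "clientmac")
         .orderBy("utoday")                  # utoday is an epoch-seconds day stamp
         .rangeBetween(-2505600, 0))         # 29 days back through the current row

df_day30_api = df_id_stat.withColumn("day_30_api", func.count(func.lit(1)).over(w_30d))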
def test_window_functions(self):
    df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    w = Window.partitionBy("value").orderBy("key")
    from pyspark.sql import functions as F
    sel = df.select(
        df.value, df.key,
        F.max("key").over(w.rowsBetween(0, 1)),
        F.min("key").over(w.rowsBetween(0, 1)),
        F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
        F.rowNumber().over(w),
        F.rank().over(w),
        F.denseRank().over(w),
        F.ntile(2).over(w),
    )
    rs = sorted(sel.collect())
    expected = [
        ("1", 1, 1, 1, 1, 1, 1, 1, 1),
        ("2", 1, 1, 1, 3, 1, 1, 1, 1),
        ("2", 1, 2, 1, 3, 2, 1, 1, 1),
        ("2", 2, 2, 2, 3, 3, 3, 2, 2),
    ]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[: len(r)])
def getValueFieldValueLists(self, handlerId, keyFields, valueFields):
    df = self.entity.groupBy(keyFields)
    agg = self.options.get("aggregation", self.getDefaultAggregation(handlerId))
    maxRows = int(self.options.get("rowCount", "100"))
    numRows = min(maxRows, df.count())
    valueLists = []
    for valueField in valueFields:
        valueDf = None
        if agg == "SUM":
            valueDf = df.agg(F.sum(valueField).alias("agg"))
        elif agg == "AVG":
            valueDf = df.agg(F.avg(valueField).alias("agg"))
        elif agg == "MIN":
            valueDf = df.agg(F.min(valueField).alias("agg"))
        elif agg == "MAX":
            valueDf = df.agg(F.max(valueField).alias("agg"))
        else:
            valueDf = df.agg(F.count(valueField).alias("agg"))
        for keyField in keyFields:
            valueDf = valueDf.sort(F.col(keyField).asc())
        valueDf = valueDf.dropna()
        rows = valueDf.select("agg").take(numRows)
        valueList = []
        for row in rows:
            valueList.append(row["agg"])
        valueLists.append(valueList)
    return valueLists
def reduce_to_ohlc(time, rdd): row_rdd = rdd.map(lambda row: row.split(',')) \ .filter(lambda row: len(row) == 3) \ .map(lambda row: Row( symbol=row[0], tx_time=datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S.%f'), price=float(row[1]) )) sql_context = get_sql_context_instance(rdd.context) data = sql_context.createDataFrame(row_rdd) data.cache() data.write.format('org.apache.spark.sql.cassandra') \ .options(table='transactions2', keyspace='stock', cluster='Test Cluster') \ .mode('append') \ .save() ohlc = data.select('symbol', truncate_min(data.tx_time).alias('batch_time'), 'price', 'tx_time') \ .orderBy('tx_time') \ .groupBy('symbol', 'batch_time') \ .agg( F.first(data.price).alias('open'), F.max(data.price).alias('high'), F.min(data.price).alias('low'), F.last(data.price).alias('close'), F.first(data.tx_time).alias('open_time'), F.last(data.tx_time).alias('close_time') ) existing_ohlc = sql_context.read.format('org.apache.spark.sql.cassandra') \ .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \ .load() \ .select('symbol', 'batch_time', 'open', 'open_time', 'high', 'low', 'close', 'close_time') merged_ohlc = ohlc.join(existing_ohlc, (ohlc.symbol == existing_ohlc.symbol) & (ohlc.batch_time == existing_ohlc.batch_time), 'left' ) merged_ohlc = merged_ohlc.select( ohlc.symbol.alias('symbol'), ohlc.batch_time.alias('batch_time'), F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open).otherwise(ohlc.open).alias('open'), F.when(existing_ohlc.open_time < ohlc.open_time, existing_ohlc.open_time).otherwise(ohlc.open_time).alias('open_time'), F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close).otherwise(ohlc.close).alias('close'), F.when(existing_ohlc.close_time > ohlc.close_time, existing_ohlc.close_time).otherwise(ohlc.close_time).alias('close_time'), F.when(existing_ohlc.low < ohlc.low, existing_ohlc.low).otherwise(ohlc.low).alias('low'), F.when(existing_ohlc.high > ohlc.high, existing_ohlc.high).otherwise(ohlc.high).alias('high') ) merged_ohlc.write.format('org.apache.spark.sql.cassandra') \ .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \ .mode('append') \ .save()
def test_multiple_udfs(self):
    df = self.data
    w = self.unbounded_window

    result1 = df.withColumn('mean_v', self.pandas_agg_mean_udf(df['v']).over(w)) \
        .withColumn('max_v', self.pandas_agg_max_udf(df['v']).over(w)) \
        .withColumn('min_w', self.pandas_agg_min_udf(df['w']).over(w))

    expected1 = df.withColumn('mean_v', mean(df['v']).over(w)) \
        .withColumn('max_v', max(df['v']).over(w)) \
        .withColumn('min_w', min(df['w']).over(w))

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
def test_timestamp_splitter(test_specs, spark_dataset):
    dfs_rating = spark_dataset.withColumn(DEFAULT_TIMESTAMP_COL, col(DEFAULT_TIMESTAMP_COL).cast("float"))

    splits = spark_timestamp_split(
        dfs_rating, ratio=test_specs["ratio"], col_timestamp=DEFAULT_TIMESTAMP_COL
    )

    assert splits[0].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratio"], test_specs["tolerance"])
    assert splits[1].count() / test_specs["number_of_rows"] == pytest.approx(
        1 - test_specs["ratio"], test_specs["tolerance"])

    max_split0 = splits[0].agg(F.max(DEFAULT_TIMESTAMP_COL)).first()[0]
    min_split1 = splits[1].agg(F.min(DEFAULT_TIMESTAMP_COL)).first()[0]
    assert(max_split0 <= min_split1)

    # Test multi split
    splits = spark_timestamp_split(dfs_rating, ratio=test_specs["ratios"])

    assert splits[0].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][0], test_specs["tolerance"])
    assert splits[1].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][1], test_specs["tolerance"])
    assert splits[2].count() / test_specs["number_of_rows"] == pytest.approx(
        test_specs["ratios"][2], test_specs["tolerance"])

    max_split0 = splits[0].agg(F.max(DEFAULT_TIMESTAMP_COL)).first()[0]
    min_split1 = splits[1].agg(F.min(DEFAULT_TIMESTAMP_COL)).first()[0]
    assert(max_split0 <= min_split1)

    max_split1 = splits[1].agg(F.max(DEFAULT_TIMESTAMP_COL)).first()[0]
    min_split2 = splits[2].agg(F.min(DEFAULT_TIMESTAMP_COL)).first()[0]
    assert(max_split1 <= min_split2)
def handleUIOptions(self, displayColName):
    agg = self.options.get("aggregation")
    valFields = self.options.get("valueFields")

    if agg == 'COUNT':
        return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
    elif agg == 'SUM':
        return self.entity.groupBy(displayColName).agg(F.sum(valFields).alias("agg")).toPandas()
    elif agg == 'AVG':
        return self.entity.groupBy(displayColName).agg(F.avg(valFields).alias("agg")).toPandas()
    elif agg == 'MIN':
        return self.entity.groupBy(displayColName).agg(F.min(valFields).alias("agg")).toPandas()
    elif agg == 'MAX':
        return self.entity.groupBy(displayColName).agg(F.max(valFields).alias("agg")).toPandas()
    elif agg == 'MEAN':
        return self.entity.groupBy(displayColName).agg(F.mean(valFields).alias("agg")).toPandas()
    else:
        return self.entity.groupBy(displayColName).agg(F.count(displayColName).alias("agg")).toPandas()
def do_something_only_once(): # the command I use to run this script: #~/spark-1.6.1/bin/spark-submit --packages=com.databricks:spark-avro_2.10:2.0.1,com.databricks:spark-csv_2.10:1.4.0 server.py global topdis, meta, dic, towo, cluto, doctopdat, maxdate, mindate, lda ## Loading of data sc = SparkContext(appName='Simple App') #"local" sqlContext = SQLContext(sc) # Load metadata avro reader = sqlContext.read.format('com.databricks.spark.avro') meta = reader.load('data/spark_metadata.avro') # # Loading topic distributions topdisFile = 'data/spark_output.tuples' csvLoader = sqlContext.read.format('com.databricks.spark.csv') topdis = csvLoader.options(delimiter=',',header='false', inferschema='true').load(topdisFile) strip_first_col_int = udf(lambda row: int(row[1:]), IntegerType()) topdis = topdis.withColumn('C0',strip_first_col_int(topdis['C0'])) strip_first_col_float = udf(lambda row: float(row[1:]), FloatType()) topdis = topdis.withColumn('C1',strip_first_col_float(topdis['C1'])) strip_last_col = udf(lambda row: float(row[:-2]), FloatType()) topdis = topdis.withColumn('C20',strip_last_col(topdis['C20'])) # # Load dictionary CSV dicFile = 'data/spark_dic.csv' csvLoader = sqlContext.read.format('com.databricks.spark.csv') dic = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(dicFile) dic = dic.select(dic['C0'].alias('id'), dic['C1'].alias('word'), dic['C2'].alias('count')) ldaFile = 'data/spark_lda.csv' csvLoader = sqlContext.read.format('com.databricks.spark.csv') lda = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(ldaFile) lda = lda.select(rowNumber().alias('id'), lda.columns).join(dic, dic.id == lda.id, 'inner').cache() # dic = dic.select(dic['C0'].alias('id'), dic['C1'].alias('word'), dic['C2'].alias('count')) # # # Load clustertopics CSV # clutoFile = 'enron_small_clustertopics.csv' # csvLoader = sqlContext.read.format('com.databricks.spark.csv') # cluto = csvLoader.options(delimiter=',', header='false', inferschema='true').load(clutoFile) # # # Load topicswords CSV # towoFile = 'enron_small_lda_transposed.csv' # csvLoader = sqlContext.read.format('com.databricks.spark.csv') # towo = csvLoader.options(delimiter=',', header='false', inferschema='true').load(towoFile) # # Merge topdis which has document id and with metadata, based on document id metasmall = meta.select('id',unix_timestamp(meta['date'],"yyyy-MM-dd'T'HH:mm:ssX").alias("timestamp")) doctopdat = topdis.join(metasmall, metasmall.id == topdis.C0,'inner').cache() maxdate = doctopdat.select(max('timestamp').alias('maxtimestamp')).collect()[0]['maxtimestamp'] mindate = doctopdat.select(min('timestamp').alias('mintimestamp')).collect()[0]['mintimestamp']
def _if_later(data1, data2):
    """Helper function to test if records in data1 are earlier than those in data2.

    Returns:
        bool: True or False indicating if data1 is earlier than data2.
    """
    x = (data1.select(DEFAULT_USER_COL, DEFAULT_TIMESTAMP_COL)
         .groupBy(DEFAULT_USER_COL)
         .agg(F.max(DEFAULT_TIMESTAMP_COL).cast('long').alias('max'))
         .collect())
    max_times = {row[DEFAULT_USER_COL]: row['max'] for row in x}

    y = (data2.select(DEFAULT_USER_COL, DEFAULT_TIMESTAMP_COL)
         .groupBy(DEFAULT_USER_COL)
         .agg(F.min(DEFAULT_TIMESTAMP_COL).cast('long').alias('min'))
         .collect())
    min_times = {row[DEFAULT_USER_COL]: row['min'] for row in y}

    result = True
    for user, max_time in max_times.items():
        result = result and min_times[user] >= max_time

    return result
def test_bounded_simple(self):
    from pyspark.sql.functions import mean, max, min, count

    df = self.data
    w1 = self.sliding_row_window
    w2 = self.shrinking_range_window

    plus_one = self.python_plus_one
    count_udf = self.pandas_agg_count_udf
    mean_udf = self.pandas_agg_mean_udf
    max_udf = self.pandas_agg_max_udf
    min_udf = self.pandas_agg_min_udf

    result1 = df.withColumn('mean_v', mean_udf(plus_one(df['v'])).over(w1)) \
        .withColumn('count_v', count_udf(df['v']).over(w2)) \
        .withColumn('max_v', max_udf(df['v']).over(w2)) \
        .withColumn('min_v', min_udf(df['v']).over(w1))

    expected1 = df.withColumn('mean_v', mean(plus_one(df['v'])).over(w1)) \
        .withColumn('count_v', count(df['v']).over(w2)) \
        .withColumn('max_v', max(df['v']).over(w2)) \
        .withColumn('min_v', min(df['v']).over(w1))

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
#                                   sqlfuncs.avg('age').alias('age'),
#                                   sqlfuncs.max('gender').alias('gender'),
#                                   sqlfuncs.max('date').alias('last_consume'),
#                                   sqlfuncs.min('date').alias('first_consume'))
# konsum_user_agg.registerTempTable('user_agg')
#
# print(konsum_user_agg.first())
# konsum_user_agg.write.save('/home/erlenda/data/konsum/a_users_parquet')
#
# reg_late=konsum_user.filter(konsum_user.reg_date<datetime.datetime(2015,11,16,0,0))

pvs = konsum_user.groupBy('a_virtual', 'a_user_key', 'timegroup', 'device').agg(
    sqlfuncs.sum(konsum_user.pv).alias("pvs"),
    sqlfuncs.sum(konsum_user.pv_bet).alias("pvs_bet"),
    sqlfuncs.max('date').alias('last_consume'),
    sqlfuncs.min('date').alias('first_consume'),
    sqlfuncs.sum(konsum_user.visits).alias("visits"))

pprint(pvs.take(10))
print()
# print(pvs.take(100)[55])

pvs_tot1 = pvs.agg(sqlfuncs.sum(pvs.pvs)).first()
print('Total after basic aggregation', pvs_tot1)

pvs_mapped = pvs.rdd.map(lambda x: ((x.a_user_key, x.a_virtual),
                                    (Counter({literal_eval(x.timegroup): x.pvs}),
                                     Counter({literal_eval(x.timegroup): 1}),
                                     x.pvs,
                                     x.pvs_bet,
                                     Counter({x.device: x.pvs}))))

pvs_reduced = pvs_mapped.reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2], a[3] + b[3], a[4] + b[4]))
content_size_summary_df = logs_df.describe(['content_size'])
content_size_summary_df.show()

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC Alternatively, we can use SQL to directly calculate these statistics. You can explore the many useful functions within the `pyspark.sql.functions` module in the [documentation](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions).
# MAGIC
# MAGIC After we apply the `.agg()` function, we call `.first()` to extract the first value, which is equivalent to `.take(1)[0]`.

# COMMAND ----------

from pyspark.sql import functions as sqlFunctions

content_size_stats = (logs_df
                      .agg(sqlFunctions.min(logs_df['content_size']),
                           sqlFunctions.avg(logs_df['content_size']),
                           sqlFunctions.max(logs_df['content_size']))
                      .first())

print 'Using SQL functions:'
print 'Content Size Avg: {1:,.2f}; Min: {0:.2f}; Max: {2:,.0f}'.format(*content_size_stats)

# COMMAND ----------

# MAGIC %md
# MAGIC ### (3b) Example: HTTP Status Analysis
# MAGIC
# MAGIC Next, let's look at the status values that appear in the log. We want to know which status values appear in the data and how many times. We again start with `logs_df`, then group by the `status` column, apply the `.count()` aggregation function, and sort by the `status` column.

# COMMAND ----------
def all(self, axis: Union[int, str] = 0) -> bool: """ Return whether all elements are True. Returns True unless there at least one element within a series that is False or equivalent (e.g. zero or empty) Parameters ---------- axis : {0 or 'index'}, default 0 Indicate which axis or axes should be reduced. * 0 / 'index' : reduce the index, return a Series whose index is the original column labels. Examples -------- >>> ks.Series([True, True]).all() True >>> ks.Series([True, False]).all() False >>> ks.Series([0, 1]).all() False >>> ks.Series([1, 2, 3]).all() True >>> ks.Series([True, True, None]).all() True >>> ks.Series([True, False, None]).all() False >>> ks.Series([]).all() True >>> ks.Series([np.nan]).all() True >>> df = ks.Series([True, False, None]).rename("a").to_frame() >>> df.set_index("a").index.all() False """ axis = validate_axis(axis) if axis != 0: raise ValueError('axis should be either 0 or "index" currently.') sdf = self._internal._sdf.select(self._scol) col = scol_for(sdf, sdf.columns[0]) # Note that we're ignoring `None`s here for now. # any and every was added as of Spark 3.0 # ret = sdf.select(F.expr("every(CAST(`%s` AS BOOLEAN))" % sdf.columns[0])).collect()[0][0] # Here we use min as its alternative: ret = sdf.select(F.min(F.coalesce(col.cast('boolean'), F.lit(True)))).collect()[0][0] if ret is None: return True else: return ret
def min(scol):
    return F.when(
        F.row_number().over(self._window) >= self._min_periods,
        F.min(scol).over(self._window)).otherwise(F.lit(None))
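# For context, a self-contained sketch of the same min_periods gating idea used by the
# rolling `min` above, with hypothetical data and window names (the real class wires in
# self._window and self._min_periods rather than these locals):
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 5.0), (2, 3.0), (3, 4.0), (4, 1.0)], ["t", "v"])

w = Window.orderBy("t").rowsBetween(-2, 0)   # rolling frame: 2 preceding rows + current row
min_periods = 2                              # emit null until at least 2 rows are available

rolling_min = F.when(
    F.row_number().over(Window.orderBy("t")) >= min_periods,
    F.min("v").over(w),
).otherwise(F.lit(None))

df.withColumn("rolling_min_v", rolling_min).show()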
def test_end_to_end(spark, tmp_path, root, num_partitions): server_a_keys = json.loads((root / "server_a_keys.json").read_text()) server_b_keys = json.loads((root / "server_b_keys.json").read_text()) shared_seed = json.loads((root / "shared_seed.json").read_text())["shared_seed"] batch_id = "test_batch" n_data = 7 n_rows = 20 params_a = [ batch_id.encode(), n_data, "A", server_a_keys["private_key"].encode(), b64decode(shared_seed), server_a_keys["public_key"].encode(), server_b_keys["public_key"].encode(), ] params_b = [ batch_id.encode(), n_data, "B", server_b_keys["private_key"].encode(), b64decode(shared_seed), server_b_keys["public_key"].encode(), server_a_keys["public_key"].encode(), ] def show(df): df.show() return df def explain(df): df.explain() return df shares = ( spark.createDataFrame( [Row(payload=[int(i % 3 == 0 or i % 5 == 0) for i in range(n_data)])] * n_rows ) .select( F.pandas_udf( partial( udf.encode, batch_id.encode(), n_data, server_a_keys["public_key"].encode(), server_b_keys["public_key"].encode(), ), returnType="a: binary, b: binary", )("payload").alias("shares") ) .withColumn("id", F.udf(lambda: str(uuid4()), "string")()) ) shares.cache() shares.show() verify1_a = shares.select( "id", F.pandas_udf(partial(udf.verify1, *params_a), returnType="binary")( "shares.a" ).alias("verify1_a"), ) verify1_b = shares.select( "id", F.pandas_udf(partial(udf.verify1, *params_b), returnType="binary")( "shares.b" ).alias("verify1_b"), ) verify2_a = ( shares.join(verify1_a, on="id") .join(verify1_b, on="id") .select( "id", F.pandas_udf(partial(udf.verify2, *params_a), returnType="binary")( "shares.a", "verify1_a", "verify1_b" ).alias("verify2_a"), ) ) verify2_b = ( shares.join(verify1_a, on="id") .join(verify1_b, on="id") .select( "id", F.pandas_udf(partial(udf.verify2, *params_b), returnType="binary")( "shares.b", "verify1_b", "verify1_a" ).alias("verify2_b"), ) ) aggregate_a = ( shares.join(verify2_a, on="id") .join(verify2_b, on="id") .select( F.col("shares.a").alias("shares"), F.col("verify2_a").alias("internal"), F.col("verify2_b").alias("external"), ) # this only works if partition < 4GB .groupBy() .applyInPandas( lambda pdf: udf.aggregate(*params_a, pdf), schema="payload binary, error int, total int", ) .groupBy() .applyInPandas( lambda pdf: udf.total_share(*params_a, pdf), schema="payload binary, error int, total int", ) ) aggregate_b = explain( show( shares.join(verify2_a, on="id") .join(verify2_b, on="id") .repartitionByRange(num_partitions, "id") .select( F.col("shares.b").alias("shares"), F.col("verify2_b").alias("internal"), F.col("verify2_a").alias("external"), ) .withColumn("pid", F.spark_partition_id()) ) .groupBy("pid") .applyInPandas( lambda pdf: udf.aggregate(*params_b, pdf), schema="payload binary, error int, total int", ) .groupBy() .applyInPandas( lambda pdf: udf.total_share(*params_b, pdf), schema="payload binary, error int, total int", ) ) def test_total_shares(aggregate): print(aggregate) assert len(aggregate.payload) > 0 assert aggregate.error == 0 assert aggregate.total == n_rows return True assert test_total_shares(aggregate_a.first()) assert test_total_shares(aggregate_b.first()) published = show( aggregate_a.withColumn("server", F.lit("internal")) .union(aggregate_b.withColumn("server", F.lit("external"))) .groupBy() .pivot("server", ["internal", "external"]) .agg(*[F.min(x).alias(x) for x in aggregate_a.columns]) ).select( F.pandas_udf(partial(udf.publish, *params_a), returnType="array<int>")( "internal_payload", "external_payload" ).alias("payload"), 
F.col("internal_error").alias("error"), F.col("internal_total").alias("total"), ) published.cache() assert published.count() == 1 row = published.first() assert row.error == 0 assert row.total == n_rows assert row.payload == [ int(i % 3 == 0 or i % 5 == 0) * n_rows for i in range(n_data) ]
# load stores ... I'm lazy and use string ids
stores_rdd = sc.textFile(base_path + "stores.txt").map(lambda x: x.split('\t'))  # id | name
stores_fields = [
    StructField('id', StringType(), False),    # name, type, nullable
    StructField('name', StringType(), False),
]
stores = sqlCtx.createDataFrame(stores_rdd, StructType(stores_fields))

products_rdd = sc.textFile(base_path + "products.txt").map(lambda x: x.split('\t'))  # id | name | category
products_fields = [
    StructField('id', StringType(), False),    # name, type, nullable
    StructField('name', StringType(), False),
    StructField('category', StringType(), True),
]
products = sqlCtx.createDataFrame(products_rdd, StructType(products_fields))

sqlCtx.registerDataFrameAsTable(sales, "sales")
sqlCtx.registerDataFrameAsTable(stores, "stores")
sqlCtx.registerDataFrameAsTable(products, "products")

# can do SQL, including joins and GroupBy !!
sqlCtx.sql("SELECT * FROM sales sa join stores st on sa.store=st.id").show()
sqlCtx.sql("SELECT * FROM sales s join products p on s.product=p.id").show()
# .explain

# .agg
from pyspark.sql import functions as funcs
sales.groupBy('day').agg(funcs.min('store').alias('MinStore'), funcs.max('quantity').alias('MaxQty')).show()
# A slightly different way to generate the two random columns
df = sqlContext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))

# df.describe().show()
display(df.describe())

# COMMAND ----------

# df.describe('uniform', 'normal').show()
display(df.describe('uniform', 'normal'))

# COMMAND ----------

from pyspark.sql.functions import mean, min, max

# df.select([mean('uniform'), min('uniform'), max('uniform')]).show()
display(df.select([mean('uniform'), min('uniform'), max('uniform')]))

# COMMAND ----------

# MAGIC %md ### Sample covariance and correlation
# MAGIC
# MAGIC Covariance is a measure of how two variables change with respect to each other. A positive number would mean that there is a tendency that as one variable increases, the other increases as well. A negative number would mean that as one variable increases, the other variable has a tendency to decrease. The sample covariance of two columns of a DataFrame can be calculated as follows:

# COMMAND ----------

from pyspark.sql.functions import rand
df = sqlContext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))

# COMMAND ----------
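# The covariance call that the markdown cell announces is not part of this excerpt; a
# plausible next cell, matching the df.stat API used elsewhere in this collection:
df.stat.cov('rand1', 'rand2')    # sample covariance of the two random columns
df.stat.corr('rand1', 'rand2')   # correlation, the normalized counterpart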
    .show()

''' Query d) The users with the oldest and the most recent creation dates, respectively '''
''' Needed in order to use the to_date function '''
from pyspark.sql.functions import *

df.select("*")\
  .where((to_date(df.CreationDate) == df.select(min(to_date("CreationDate"))\
                                                .alias("min"))\
                                        .collect()[0].min) |
         (to_date(df.CreationDate) == df.select(max(to_date("CreationDate"))\
                                                .alias("max"))\
                                        .collect()[0].max))\
  .orderBy(to_date("CreationDate"))\
  .show()

''' Comparing dates down to the millisecond '''
''' Oldest user '''
df.sort("CreationDate", ascending=False)\
  .limit(1)\
def getAndInsertStationLimits(pStationId): stationLimits = daily[daily.station_id == pStationId].groupBy('station_id').agg( max('max_temp'), avg('max_temp'), min('max_temp'), max('max_pressure'), avg('max_pressure'), min('max_pressure'), max('min_pressure'), avg('min_pressure'), min('min_pressure'), max('med_temp'), avg('med_temp'), min('med_temp'), max('min_temp'), avg('min_temp'), min('min_temp'), max('precip'), avg('precip'), min('precip'), max('wind_med_vel'), avg('wind_med_vel'), min('wind_med_vel'), max('wind_streak'), avg('wind_streak'), min('wind_streak')) stationLimitsRenamed = stationLimits.select("max(max_temp)", "avg(max_temp)", "min(max_temp)", "max(max_pressure)", "avg(max_pressure)" , "min(max_pressure)", "max(med_temp)", "avg(med_temp)", "min(med_temp)", "max(min_temp)", "avg(min_temp)", "min(min_temp)", "max(precip)", "avg(precip)", "min(precip)", "max(wind_med_vel)", "avg(wind_med_vel)", "min(wind_med_vel)", "max(wind_streak)", "avg(wind_streak)", "min(wind_streak)", "max(min_pressure)", "avg(min_pressure)", "min(min_pressure)").withColumnRenamed( "max(max_temp)", "value1").withColumnRenamed( "avg(max_temp)", "value2").withColumnRenamed( "min(max_temp)", "value3").withColumnRenamed( "max(max_pressure)", "value4").withColumnRenamed( "avg(max_pressure)", "value5").withColumnRenamed( "min(max_pressure)", "value6").withColumnRenamed( "max(med_temp)", "value7").withColumnRenamed( "avg(med_temp)", "value8").withColumnRenamed( "min(med_temp)", "value9").withColumnRenamed( "max(min_temp)", "value10").withColumnRenamed( "avg(min_temp)", "value11").withColumnRenamed( "min(min_temp)", "value12").withColumnRenamed( "max(precip)", "value13").withColumnRenamed( "avg(precip)", "value14").withColumnRenamed( "min(precip)", "value15").withColumnRenamed( "max(wind_med_vel)", "value16").withColumnRenamed( "avg(wind_med_vel)", "value17").withColumnRenamed( "min(wind_med_vel)", "value18").withColumnRenamed( "max(wind_streak)", "value19").withColumnRenamed( "avg(wind_streak)", "value20").withColumnRenamed( "min(wind_streak)", "value21").withColumnRenamed( "max(min_pressure)", "value22").withColumnRenamed( "avg(min_pressure)", "value23").withColumnRenamed( "min(min_pressure)", "value24").collect() maxMaxTemp = stationLimitsRenamed[0].value1 avgMaxTemp = stationLimitsRenamed[0].value2 minMaxTemp = stationLimitsRenamed[0].value3 maxMaxPressure = stationLimitsRenamed[0].value4 avgMaxPressure = stationLimitsRenamed[0].value5 minMaxPressure = stationLimitsRenamed[0].value6 maxMedTemp = stationLimitsRenamed[0].value7 avgMedTemp = stationLimitsRenamed[0].value8 minMedTemp = stationLimitsRenamed[0].value9 maxMinTemp = stationLimitsRenamed[0].value10 avgMinTemp = stationLimitsRenamed[0].value11 minMinTemp = stationLimitsRenamed[0].value12 maxPrecip = stationLimitsRenamed[0].value13 avgPrecip = stationLimitsRenamed[0].value14 minPrecip = stationLimitsRenamed[0].value15 maxWindMedVel = stationLimitsRenamed[0].value16 avgWindMedVel = stationLimitsRenamed[0].value17 minWindMedVel = stationLimitsRenamed[0].value18 maxWindStreak = stationLimitsRenamed[0].value19 avgWindStreak = stationLimitsRenamed[0].value20 minWindStreak = stationLimitsRenamed[0].value21 maxMinPressure = stationLimitsRenamed[0].value22 avgMinPressure = stationLimitsRenamed[0].value23 minMinPressure = stationLimitsRenamed[0].value24 session.execute("INSERT INTO Station_limits (station_id,\"maxMaxTemp\",\"avgMaxTemp\",\"minMaxTemp\",\"maxMaxPressure\",\ 
\"avgMaxPressure\",\"minMaxPressure\",\"maxMedTemp\",\"avgMedTemp\",\"minMedTemp\",\"maxMinTemp\",\"avgMinTemp\",\"minMinTemp\",\"maxPrecip\",\ \"avgPrecip\",\"minPrecip\",\"maxWindMedVel\",\"avgWindMedVel\",\"minWindMedVel\",\"maxWindStreak\",\"avgWindStreak\",\"minWindStreak\",\"maxMinPressure\",\"avgMinPressure\",\"minMinPressure\") \ VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" , [str(pStationId), maxMaxTemp, avgMaxTemp, minMaxTemp, maxMaxPressure, avgMaxPressure, minMaxPressure, maxMedTemp, avgMedTemp, minMedTemp, maxMinTemp, avgMinTemp, minMinTemp, maxPrecip, avgPrecip, minPrecip, maxWindMedVel, avgWindMedVel, minWindMedVel, maxWindStreak, avgWindStreak, minWindStreak, maxMinPressure, avgMinPressure, minMinPressure])
# MAGIC
# MAGIC **agg(\*exprs)**
# MAGIC
# MAGIC Aggregate on the entire DataFrame without groups.
# MAGIC
# MAGIC Execute the following two cells for a couple examples of the `agg` operation:

# COMMAND ----------

products.agg({"BasePrice": "max"}).show(1)

# COMMAND ----------

from pyspark.sql import functions as F
products.agg(F.min(products.BasePrice)).show(1)

# COMMAND ----------

# MAGIC %md
# MAGIC #### Where
# MAGIC
# MAGIC **where(**condition**)**
# MAGIC
# MAGIC Filters rows using the given condition.
# MAGIC
# MAGIC `where()` is an alias for `filter()`.
# MAGIC
# MAGIC Execute the following cell:

# COMMAND ----------
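# The cell that the `where` markdown above refers to is not included in this excerpt; a
# minimal sketch against the same products DataFrame, with the price threshold being an
# assumption rather than the notebook's actual condition:
products.where(products.BasePrice > 100).show(1)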
# In[67]:

sqlContext.sql("SELECT stringLengthString('test')").collect()


# In[68]:

df.agg({"age": "max"}).collect()


# In[69]:

from pyspark.sql import functions as F


# In[70]:

df.agg(F.min(df.age)).collect()


# In[71]:

from pyspark.sql.functions import *
df_as1 = df.alias("df_as1")
df_as2 = df.alias("df_as2")
joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner')
joined_df.select("df_as1.name", "df_as2.name", "df_as2.age").collect()


# In[72]:

df.coalesce(1).rdd.getNumPartitions()


# In[73]:
def process_data(self): ############################################################################## # DECLARE VARIABLES ############################################################################## dt_range = self.study_dates("2020-07-30") dt = dt_range s1_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata' s1_initial_bucket_depth = 'cuebiq/daily-feed/US/' s1_bucket_output = 'cuebiq/daily-feed-reduced/US/' s2_bucket_name = 'b6-8f-fc-09-0f-db-50-3f-gpsdata' s2_initial_bucket_depth = 'cuebiq/daily-feed-reduced/US/' s2_bucket_output = 'cuebiq/processed-data/US/micro-clusters/' anchor_dist = 430 time_thresh = 28800 part_num = 9 gps_schema = StructType([ StructField("utc_timestamp", IntegerType(), True), StructField("device_id", StringType(), True), StructField("os", IntegerType(), True), StructField("latitude", FloatType(), True), StructField("longitude", FloatType(), True), StructField("accuracy", IntegerType(), True), StructField("tz_offset", IntegerType(), True) ]) s2_gps_schema = StructType([ StructField("utc_timestamp", IntegerType(), True), StructField("device_id", StringType(), True), StructField("os", IntegerType(), True), StructField("latitude", FloatType(), True), StructField("longitude", FloatType(), True), StructField("accuracy", IntegerType(), True), StructField("tz_offset", IntegerType(), True), StructField("row_number", IntegerType(), True) ]) ############################################################################## # WINDOWS ############################################################################## w = Window().partitionBy('device_id').orderBy('utc_timestamp') l = Window().partitionBy('device_id', 'lin_grp').orderBy('utc_timestamp') w2 = Window().partitionBy('device_id').orderBy('row_number') ############################################################################## # BEGIN DAILY ITERATION ############################################################################## print("Reading in files for {}".format(str(dt['study_dt'])[:10])) print("s3://{}/{}[{}|{}|{}]/*.gz".format(s1_bucket_name, s1_initial_bucket_depth, dt['s3_before'], dt['s3_study_dt'], dt['s3_after'])) print("") ################################################################################################# # START STEP 1 ################################################################################################# df1 = dataFrameReader \ .options(header = 'false', delimiter = '\t', codec = 'gzip') \ .schema(gps_schema) \ .format("csv") \ .load("/opt/spark/sample_data/daily-feed/US/2020729*/*.csv.gz") #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth + dt['s3_before'] +"/*.gz") # the day before df2 = dataFrameReader \ .options(header = 'false', delimiter = '\t', codec = 'gzip') \ .schema(gps_schema) \ .format("csv") \ .load("/opt/spark/sample_data/daily-feed/US/2020730*/*.csv.gz") #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth + dt['s3_study_dt'] +"/*.gz") # actual study date df3 = dataFrameReader \ .options(header = 'false', delimiter = '\t', codec = 'gzip') \ .schema(gps_schema) \ .format("csv") \ .load("/opt/spark/sample_data/daily-feed/US/2020731*/*.csv.gz") #.load("s3://" + s1_bucket_name + "/" + s1_initial_bucket_depth + dt['s3_after'] +"/*.gz") # the day after # Union data from three inputs into 1 dataframe df = df1.union(df2).union(df3) \ .repartition(part_num, 'device_id') del df1 del df2 del df3 ############################################################################## # FILTER INITIAL JUNK RECORDS # Removes duplicated records (based on time and id), poor 
accuracy, bad coordinates, and timestamps outside of study range ############################################################################## df = df.na.drop(subset=['latitude','longitude','tz_offset','accuracy']) \ .filter(((df['accuracy'] >= 5) & (df['accuracy'] <= 65)) \ & ((~(df['latitude'] == 0)) | ~(df['longitude'] == 0)) \ & (df['utc_timestamp'] + df['tz_offset']) \ .between(dt['utc_study_dt'], dt['utc_after'])) \ .dropDuplicates(['utc_timestamp','device_id']) ############################################################################## # EXCESSIVE SPEED REMOVAL ############################################################################## df = df.withColumn('dist_to',distance(df['latitude'], df['longitude'], lead(df['latitude'],1).over(w), \ lead(df['longitude'],1).over(w))) \ .withColumn('sec_to', (lead(df['utc_timestamp'], 1).over(w) - df['utc_timestamp'])) \ .withColumn('speed_to', rate_of_speed(col('dist_to'), col('sec_to'),'hour')) \ .withColumn('dist_from', lag(col('dist_to'), 1).over(w)) \ .withColumn('sec_from', lag(col('sec_to'), 1).over(w)) \ .withColumn('speed_from', lag(col('speed_to'), 1).over(w)) \ .filter(((col('dist_to').isNull()) | (col('dist_from').isNull())) \ | ((((col('speed_from') + col('speed_to')) / 2) <= 90) | ((col('dist_to') >= 150) | (col('dist_from') >= 150))) \ & ((col('speed_from') < 600) & (col('speed_to') < 600)) \ & ((col('speed_from') < 20) | (col('speed_to') < 20))) \ .select('utc_timestamp', 'device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset') ############################################################################## # LINEAR TRAVEL PING REMOVAL # Break pings out into groups of 4 to measure the linear distance ############################################################################## #Assign a record number and linear grouping and lead distance df = df.withColumn('RecordNum',row_number().over(w)) \ .withColumn('lin_grp', py.ceil(row_number().over(w) / 4)) \ .withColumn('dist_to', distance(df['latitude'], df['longitude'], \ lead(df['latitude'],1).over(l), lead(df['longitude'],1).over(l),'meters')) # Create aggregated table for linear groupings expr = [py.min(col('utc_timestamp')).alias('min_utc_timestamp'),py.max(col('utc_timestamp')).alias('max_utc_timestamp'), \ py.count(col('utc_timestamp')).alias('cnt'),py.sum(col('dist_to')).alias('sum_dist'),py.min(col('dist_to')).alias('min_dist')] dfl_grp = df.groupBy('device_id', 'lin_grp').agg(*expr) dfl_grp.createOrReplaceTempView('dfl_grp') df.createOrReplaceTempView('dfl') # Grab just the first and last records in each linear grouping and append aggregated info dfls = spark.sql( "SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \ a.lin_grp, b.sum_dist, b.min_dist, b.cnt \ FROM dfl as a INNER JOIN dfl_grp as b \ ON a.device_id = b.device_id \ AND a.lin_grp = b.lin_grp \ AND a.utc_timestamp = b.min_utc_timestamp \ UNION ALL \ SELECT a.utc_timestamp, a.device_id, a.os, a.latitude, a.longitude, a.accuracy, a.tz_offset, \ a.lin_grp, b.sum_dist, b.min_dist, b.cnt \ FROM dfl as a INNER JOIN dfl_grp as b \ ON a.device_id = b.device_id \ AND a.lin_grp = b.lin_grp \ AND a.utc_timestamp = b.max_utc_timestamp") # Measure the distance between first and last in each linear grouping and compare to sum distance of all points # Only keep groups that meet criteria for being straight-line df_j = dfls.withColumn('strt_dist', distance(dfls['latitude'],dfls['longitude'], \ lead(dfls['latitude'],1).over(l), \ lead(dfls['longitude'],1).over(l), 
'meters')) \ .withColumn('lin',col('strt_dist') / dfls['sum_dist']) \ .na.drop(subset=['strt_dist']) \ .filter((dfls['min_dist'] > 0) \ & (col('strt_dist').between(150, 2000)) \ & (dfls['cnt'] == 4) \ & (col('lin') >= .99825)) \ .select('device_id','lin_grp', 'lin') # Outer join main dataframe to linears groups to filter non-linear pings df = df.join(df_j, ['device_id','lin_grp'], how='left_outer') \ .filter(col('lin').isNull()) \ .select('utc_timestamp','device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset') del dfl_grp del dfls del df_j ####################################### # CHAIN # Calculating the dynamic chain threshold to find proximate ping relationships ####################################### df = df.withColumn('chain_dist', ((((df['accuracy'] + lead(df['accuracy'],1).over(w)) - 10) * (230 / 120) + 200))) \ .withColumn('chain', when((distance(df['latitude'], df['longitude'], \ lead(df['latitude'],1).over(w), lead(df['longitude'], 1).over(w),'feet')) <= col('chain_dist'), 1) .when((distance(df['latitude'], df['longitude'], \ lag(df['latitude'],1).over(w), lag(df['longitude'], 1).over(w),'feet')) <= lag(col('chain_dist'), 1).over(w), 1)) \ .filter(col('chain') == 1) \ .withColumn('row_number', row_number().over(w)) \ .select('utc_timestamp','device_id', 'os', 'latitude', 'longitude', 'accuracy', 'tz_offset','row_number') \ .persist() df \ .repartition(100,'device_id').sortWithinPartitions('device_id','row_number') \ .write \ .csv(path="/opt/spark/sample_data/daily-feed-reduced/"+dt['s3_study_dt'], mode="append", compression="gzip", sep=",") #.csv(path="s3://" + s1_bucket_name + '/' + s1_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",") ############################################################################################## # START STEP 2 ############################################################################################## print('Begin micro-clustering') # INITIALIZE ANCHOR TABLE - Create initial anchor start points based on row number = 1 and distance threshold self.df_dist = df.withColumn('tz_timestamp', df['utc_timestamp'] + df['tz_offset']) \ .withColumn('anchor', when(df['row_number'] == 1, col('tz_timestamp')) \ .when(distance(df['latitude'], df['longitude'], \ lag(df['latitude'],1).over(w2),lag(df['longitude'],1).over(w2),'feet') \ >= anchor_dist, col('tz_timestamp')) \ .when(col('tz_timestamp') - lag(col('tz_timestamp'),1).over(w2) >= time_thresh, col('tz_timestamp'))) \ .select('tz_timestamp','device_id','os','latitude','longitude','accuracy','row_number','anchor') \ .repartition(part_num, 'device_id') \ .persist() print('df_dist starting count = {}'.format( self.df_dist.count())) # Materialize table for caching df.unpersist() del df ##################################################################################################### # ITERATE THROUGH DATAFRAME ANCHOR PROCESS - iterations are broken out to speed up checkpointing # Checkpointing is used to chop off the physical plans of the dataframes that grow with each iteration ###################################################################################################### df_anchor1 = self.anchor_func(3, 3) df_anchor2 = self.anchor_func(5, 5) df_anchor3 = self.anchor_func(12, 6) df_anchor4 = self.anchor_func(20, 5) df_anchor5 = self.anchor_func(30, 5) df_anchor6 = self.anchor_func(50, 5) df_anchor7 = self.anchor_func(80, 5, 1000000) df_anchor8 = self.anchor_func(1000, 5, 1000000) 
################################################################################################## # Collect remaining pings to driver for Python analysis print('collect remaining pings') anchor_list = self.df_dist.rdd.map(lambda row: {'timestamp':row[0], 'device_id':row[1], 'latitude':row[3], \ 'longitude':row[4], 'anchor':row[7]}).collect() # Sort elements in list by device_id and timestamp anchor_list.sort(key=operator.itemgetter('device_id', 'timestamp')) # Python analysis on driver of final remaining pings print('iterate through remaining pings on driver') anchor_dr = [] for r in anchor_list: if r['anchor'] is not None: anchor_dr.append(r) else: if anchor_dr[-1]['device_id'] == r['device_id']: if distance_dr(r['latitude'],r['longitude'], \ anchor_dr[-1]['latitude'], \ anchor_dr[-1]['longitude'], 'feet') <= anchor_dist \ & r['timestamp'] - anchor_dr[-1]['timestamp'] < time_thresh: anchor_dr.append({'timestamp':r['timestamp'], 'device_id':r['device_id'], \ 'latitude':anchor_dr[-1]['latitude'], 'longitude':anchor_dr[-1]['longitude'], \ 'anchor':anchor_dr[-1]['anchor']}) else: r['anchor'] = r['timestamp'] anchor_dr.append(r) # Condense result table for dataframe distribution print('generate driver anchor table') new_anchor = [] for r in anchor_dr: new_anchor.append([r['timestamp'], r['device_id'], r['anchor']]) # Bring driver results back into a distributed dataframe and join results print('disperse driver anchor table back to cluster') new_anchor_schema = StructType([ StructField('tz_timestamp', IntegerType(), True), StructField('device_id', StringType(), True), StructField('anchor', IntegerType(), True) ]) df_anchor_dr = spark.createDataFrame(new_anchor,new_anchor_schema) \ .repartition(part_num, 'device_id') # Join remaining anchors to main analysis table self.df_dist = self.df_dist.select('tz_timestamp','device_id','os','latitude','longitude', \ 'accuracy','row_number') \ .join(df_anchor_dr,['tz_timestamp','device_id']) \ # Union all anchor tables together and sort print('finalizing anchor results into central table') df_anchors_fnl = df_anchor1.union(df_anchor2).union(df_anchor3).union(df_anchor4).union(df_anchor5) \ .union(df_anchor6).union(df_anchor7).union(df_anchor8).union(self.df_dist) \ .repartition(part_num,'device_id') \ .persist() self.df_dist.unpersist() ####################################################################################### # Calculate centroids ####################################################################################### print('start calculating centroids') # Get max accuracy value for each micro-cluster and filter clusters with fewer than 2 pings df_anchor_grp = df_anchors_fnl.groupBy('device_id','anchor').agg(*[py.max(col('accuracy')).alias('max_accuracy'), \ py.count(col('tz_timestamp')).alias('cnt')]) \ .withColumn('max_acc_1', col('max_accuracy') + 1) \ .filter(col('cnt') > 1) \ .select('device_id','anchor','max_acc_1','cnt') # Calculate the nominator for each micro-cluster df_anchors_fnl = df_anchors_fnl.join(df_anchor_grp, ['device_id','anchor']) \ .withColumn('nom',col('max_acc_1') - col('accuracy')) df_denom = df_anchors_fnl.groupBy( 'device_id', 'anchor').agg(*[py.sum(col('nom')).alias('denom')]) df_anchors_fnl = df_anchors_fnl.join(df_denom, ['device_id','anchor']) \ .withColumn('weight', df_anchors_fnl['nom'] / df_denom['denom']) \ .withColumn('lat', df_anchors_fnl['latitude'] * col('weight')) \ .withColumn('lon', df_anchors_fnl['longitude'] * col('weight')) expr = [py.sum(col('lat')).alias('new_latitude'), 
py.sum(col('lon')).alias('new_longitude'), \ py.avg(col('latitude')).alias('avg_latitude'), py.avg(col('longitude')).alias('avg_longitude'), \ py.count(col('tz_timestamp')).alias('cluster_png_cnt'), py.first(col('os')).alias('os'), \ py.min(col('tz_timestamp')).alias('start_timestamp'), py.max(col('tz_timestamp')).alias('end_timestamp'), \ py.avg(col('accuracy')).alias('avg_accuracy')] df_micro = df_anchors_fnl.groupBy('device_id','anchor').agg(*expr) \ .withColumn('fnl_lat', (col('new_latitude') * (3/4)) + (col('avg_latitude') * (1/4))) \ .withColumn('fnl_lon', (col('new_longitude') * (3/4)) + (col('avg_longitude') * (1/4))) \ .withColumn('geohash9', geohash_udf_9(col('fnl_lat'), col('fnl_lon'))) \ .withColumn('dwell_seconds', col('end_timestamp') - col('start_timestamp')) \ .withColumn('start_tm', py.from_unixtime(col('start_timestamp'))) \ .withColumn('end_tm', py.from_unixtime(col('end_timestamp'))) \ .filter(col('dwell_seconds') > 1) \ .select('device_id','os','start_tm','end_tm', \ 'dwell_seconds','cluster_png_cnt', col('fnl_lat').alias('latitude'), \ col('fnl_lon').alias('longitude'), 'geohash9', 'avg_accuracy') df_micro \ .repartition(100,'device_id').sortWithinPartitions('device_id','start_tm') \ .write \ .csv(path="/opt/spark/sample_data/processed-data/" + dt['s3_study_dt'], mode="append", compression="gzip", sep=",") #.csv(path="s3://" + s2_bucket_name + '/' + s2_bucket_output + dt['s3_study_dt'], mode="append", compression="gzip", sep=",") df_anchors_fnl.unpersist() return
## Question 7 - What is the mean of the Close column?

### Python
print("The mean of the Close column is :")
data.select(col("Close"))\
    .agg(avg(col("Close")).alias("Mean"))\
    .show()

### SQL
print("The mean of the Close column is :")
spark.sql("""select mean(Close) as Mean from data_SQL""").show()


## Question 8 - What is the max and min of the Volume column?

### Python
data.select([min("Volume").alias("Minimum_volume"), max("Volume").alias("Maximum_volume")])\
    .show()

### SQL
spark.sql(
    """select min(Volume) as `Minimum_volume`,
              max(Volume) as `Maximum_volume`
       from data_SQL"""
).show()


## Question 9 - How many days was the Close lower than 60 dollars?

### Python
data.filter(col("Close") < 60)\
    .agg(count(col("Date")).alias("DaysNumber"))\
    .show()

### SQL
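# The SQL answer to Question 9 is cut off above; a plausible completion against the same
# data_SQL temporary view, mirroring the Python version:
spark.sql("""select count(Date) as DaysNumber from data_SQL where Close < 60""").show()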
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

if __name__ == "__main__":
    spark = SparkSession.builder.master("local").appName("pyspark homework").getOrCreate()

    file_path = "hdfs:///dataset/bank-data.csv"
    df = spark.read.csv(path=file_path, header=True, inferSchema=True)

    df.groupBy("sex").agg(F.min("income"), F.max("income"), F.mean("income")).show()
    df.groupBy("region").agg({"income": "mean"}).show()
def doRenderMpld3(self, handlerId, figure, axes, keyFields, keyFieldValues, keyFieldLabels, valueFields, valueFieldValues): allNumericCols = self.getNumericalFieldNames() if len(allNumericCols) == 0: self._addHTML("Unable to find a numerical column in the dataframe") return keyFields = self.options.get("keyFields") valueField = self.options.get("valueFields") if(keyFields==None and valueField==None): keyFields=self.getFirstStringColInfo() valueField=self.getFirstNumericalColInfo() else: keyFields = keyFields.split(',') valueField = valueField.split(',') if(len(valueField) > 1): self._addHTML("You can enter only have one value field for Bar Charts (2-D)"+str(len(valueField))) return keyFields = keyFields[0] valueField=valueField[0] #if(len(valueFields>)): #init fig=figure ax=axes #fig, ax = plt.subplots() #fig = plt.figure() params = plt.gcf() plSize = params.get_size_inches() params.set_size_inches( (plSize[0]*2, plSize[1]*2) ) agg=self.options.get("aggregation") groupByCol=self.options.get("groupByCol") if (agg=="None" or agg==None): colLabel = keyFields y = self.entity.select(valueField).toPandas()[valueField].dropna().tolist() x_intv = np.arange(len(y)) labels = self.entity.select(keyFields).toPandas()[keyFields].dropna().tolist() plt.xticks(x_intv,labels) plt.xlabel(keyFields, fontsize=18) plt.ylabel(valueField, fontsize=18) elif(agg=='AVG'): y1=self.entity.groupBy(keyFields).agg(F.avg(valueField).alias("avg")).toPandas().sort_values(by=keyFields) y=y1["avg"].dropna().tolist() x_intv = np.arange(len(y)) labels=y1[keyFields].dropna().tolist() plt.xticks(x_intv,labels) plt.xlabel(keyFields, fontsize=18) plt.ylabel("Average "+valueField, fontsize=18) elif(agg=='SUM'): y1=self.entity.groupBy(keyFields).agg(F.sum(valueField).alias("sum")).toPandas().sort_values(by=keyFields) y=y1["sum"].dropna().tolist() x_intv = np.arange(len(y)) labels=y1[keyFields].dropna().tolist() plt.xticks(x_intv,labels) plt.xlabel(keyFields, fontsize=18) plt.ylabel("sum "+valueField, fontsize=18) elif(agg=='MAX'): y1=self.entity.groupBy(keyFields).agg(F.max(valueField).alias("max")).toPandas().sort_values(by=keyFields) y=y1["max"].dropna().tolist() x_intv = np.arange(len(y)) labels=y1[keyFields].dropna().tolist() plt.xticks(x_intv,labels) plt.xlabel(keyFields, fontsize=18) plt.ylabel("max "+valueField, fontsize=18) elif(agg=='MIN'): y1=self.entity.groupBy(keyFields).agg(F.min(valueField).alias("min")).toPandas().sort_values(by=keyFields) y=y1["min"].dropna().tolist() x_intv = np.arange(len(y)) labels=y1[keyFields].dropna().tolist() plt.xticks(x_intv,labels) plt.xlabel(keyFields, fontsize=18) plt.ylabel("min "+valueField, fontsize=18) elif(agg=='COUNT'): y1=self.entity.groupBy(keyFields).agg(F.count(valueField).alias("count")).toPandas().sort_values(by=keyFields) y=y1["count"].dropna().tolist() x_intv = np.arange(len(y)) labels=y1[keyFields].dropna().tolist() plt.xticks(x_intv,labels) plt.xlabel(keyFields, fontsize=18) plt.ylabel("count "+valueField, fontsize=18) mpld3.enable_notebook() plt.bar(x_intv,y,color="blue",alpha=0.5) ax_fmt = BarChart(labels) mpld3.plugins.connect(fig, ax_fmt)
iotmsgsRDD = sqlContext.read.json(js)
iotmsgsRDD.registerTempTable("iotmsgsTable")

print("JSON converted to DataFrame of casted floating point numbers")
sqlContext.sql("select distinct cast(payload.data.temperature as float) \
  from iotmsgsTable order by temperature desc").show()

print("DataFrame showing automated 'describe' summary of floating points")
sqlContext.sql("select distinct cast(payload.data.temperature as float) \
  from iotmsgsTable order by temperature desc").describe().show()

print("DataFrame of selected SQL dataframe functions")
temperatureDF = sqlContext.sql("select distinct cast(payload.data.temperature \
  as float) from iotmsgsTable order by temperature desc")
functionsDF = temperatureDF.select([mean('temperature'), min('temperature'), \
  max('temperature')])
print(type(functionsDF))
print(functionsDF)
functionsDF.printSchema()
functionsDF.show()

# Collect a List of Rows of data from the DataFrame
print("Extracted List of Rows of selected SQL dataframe function")
functionsList = temperatureDF.select([mean('temperature'), min('temperature'), \
  max('temperature')]).collect()
print(type(functionsList))
print(functionsList)
print()

# Collect max temperature from Row #1 of the DataFrame
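# The snippet ends at the comment above; a plausible next step, assuming Spark's default
# aggregate column naming ('max(temperature)') for the selected functions:
maxTemperature = functionsList[0]['max(temperature)']
print(maxTemperature)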
sc = SparkContext(conf=conf)
sqlcontext = SQLContext(sc)

# 1. Create a DataFrame with one int column and 10 rows.
df = sqlcontext.range(0, 10)
df.show()

# Generate two other columns using uniform distribution and normal distribution.
df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal"))
df.show()

# 2. Summary and Descriptive Statistics
df = sqlcontext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))
df.describe('uniform', 'normal').show()

df.select([mean('uniform'), min('uniform'), max('uniform')]).show()

# 3. Sample covariance and correlation
# Covariance is a measure of how two variables change with respect to each other.
# A positive number would mean that there is a tendency that as one variable increases,
# the other increases as well. A negative number would mean that as one variable
# increases, the other variable has a tendency to decrease.
df = sqlcontext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))
df.stat.cov('rand1', 'rand2')
df.stat.cov('id', 'id')

# Correlation is a normalized measure of covariance that is easier to understand,
# as it provides quantitative measurements of the statistical dependence between two random variables.
df.stat.corr('rand1', 'rand2')
df.stat.corr('id', 'id')
def _downsample(self, f: str) -> DataFrame: """ Downsample the defined function. Parameters ---------- how : string / mapped function **kwargs : kw args passed to how function """ # a simple example to illustrate the computation: # dates = [ # datetime.datetime(2012, 1, 2), # datetime.datetime(2012, 5, 3), # datetime.datetime(2022, 5, 3), # ] # index = pd.DatetimeIndex(dates) # pdf = pd.DataFrame(np.array([1,2,3]), index=index, columns=['A']) # pdf.resample('3Y').max() # A # 2012-12-31 2.0 # 2015-12-31 NaN # 2018-12-31 NaN # 2021-12-31 NaN # 2024-12-31 3.0 # # in this case: # 1, obtain one origin point to bin all timestamps, we can get one (2009-12-31) # from the minimum timestamp (2012-01-02); # 2, the default intervals for 'Y' are right-closed, so intervals are: # (2009-12-31, 2012-12-31], (2012-12-31, 2015-12-31], (2015-12-31, 2018-12-31], ... # 3, bin all timestamps, for example, 2022-05-03 belongs to interval # (2021-12-31, 2024-12-31], since the default label is 'right', label it with the right # edge 2024-12-31; # 4, some intervals maybe too large for this down sampling, so we need to pad the dataframe # to avoid missing some results, like: 2015-12-31, 2018-12-31 and 2021-12-31; # 5, union the binned dataframe and padded dataframe, and apply aggregation 'max' to get # the final results; # one action to obtain the range, in the future we may cache it in the index. ts_min, ts_max = (self._psdf._internal.spark_frame.select( F.min(self._resamplekey_scol), F.max(self._resamplekey_scol)).toPandas().iloc[0]) # the logic to obtain an origin point to bin the timestamps is too complex to follow, # here just use Pandas' resample on a 1-length series to get it. ts_origin = (pd.Series([0], index=[ts_min ]).resample(rule=self._offset.freqstr, closed=self._closed, label="left").sum().index[0]) assert ts_origin <= ts_min bin_col_name = "__tmp_resample_bin_col__" bin_col_label = verify_temp_column_name(self._psdf, bin_col_name) bin_col_field = InternalField( dtype=np.dtype("datetime64[ns]"), struct_field=StructField(bin_col_name, TimestampType(), True), ) bin_scol = self._bin_time_stamp( ts_origin, self._resamplekey_scol, ) agg_columns = [ psser for psser in self._agg_columns if (isinstance(psser.spark.data_type, NumericType)) ] assert len(agg_columns) > 0 # in the binning side, label the timestamps according to the origin and the freq(rule) bin_sdf = self._psdf._internal.spark_frame.select( F.col(SPARK_DEFAULT_INDEX_NAME), bin_scol.alias(bin_col_name), *[psser.spark.column for psser in agg_columns], ) # in the padding side, insert necessary points # again, directly apply Pandas' resample on a 2-length series to obtain the indices pad_sdf = (ps.from_pandas( pd.Series([0, 0], index=[ts_min, ts_max]).resample( rule=self._offset.freqstr, closed=self._closed, label=self._label).sum().index)._internal.spark_frame.select( F.col(SPARK_DEFAULT_INDEX_NAME).alias(bin_col_name)).where( (ts_min <= F.col(bin_col_name)) & (F.col(bin_col_name) <= ts_max))) # union the above two spark dataframes. 
sdf = bin_sdf.unionByName( pad_sdf, allowMissingColumns=True).where(~F.isnull(F.col(bin_col_name))) internal = InternalFrame( spark_frame=sdf, index_spark_columns=[scol_for(sdf, SPARK_DEFAULT_INDEX_NAME)], data_spark_columns=[F.col(bin_col_name)] + [ scol_for(sdf, psser._internal.data_spark_column_names[0]) for psser in agg_columns ], column_labels=[bin_col_label] + [psser._column_label for psser in agg_columns], data_fields=[bin_col_field] + [ psser._internal.data_fields[0].copy(nullable=True) for psser in agg_columns ], column_label_names=self._psdf._internal.column_label_names, ) psdf: DataFrame = DataFrame(internal) groupby = psdf.groupby(psdf._psser_for(bin_col_label), dropna=False) downsampled = getattr(groupby, f)() downsampled.index.name = None return downsampled
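# A short usage sketch for the downsampling path implemented above, assuming
# Spark >= 3.3 where pandas-on-Spark exposes DataFrame.resample. It mirrors the
# pandas example in the comment: three timestamps binned into right-closed
# 3-year intervals, empty bins padded in and labelled by their right edge.
import datetime
import numpy as np
import pandas as pd
import pyspark.pandas as ps

dates = [
    datetime.datetime(2012, 1, 2),
    datetime.datetime(2012, 5, 3),
    datetime.datetime(2022, 5, 3),
]
pdf = pd.DataFrame(np.array([1, 2, 3]), index=pd.DatetimeIndex(dates), columns=['A'])

# Both should yield: 2012-12-31 -> 2.0, three all-NaN bins, 2024-12-31 -> 3.0
print(pdf.resample('3Y').max())
print(ps.from_pandas(pdf).resample('3Y').max().sort_index().to_pandas())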
def get_bins(self, column_name, num_bins=10, split_points=None):
    """
    Finds the number of items in each bin. Only one of the params num_bins or
    split_points needs to be supplied.

    :param column_name: column to be binned
    :param num_bins: number of bins to create
    :param split_points: sorted list of split values [a, b, c, ...] such that all
                         values in the range [a, b) are assigned to the first bucket,
                         and so on
    :return: Histogram
    """
    if column_name not in self._numeric_columns:
        raise BIException.column_does_not_exist(column_name)

    # min/max are needed both for widening the automatic splits and for the
    # constant-column check further below
    min_max = self._data_frame.agg(
        FN.min(column_name).alias('min'),
        FN.max(column_name).alias('max')).collect()
    min_value = min_max[0]['min']
    max_value = min_max[0]['max']

    if split_points is None:
        quantile_discretizer = QuantileDiscretizer(numBuckets=num_bins,
                                                   inputCol=column_name,
                                                   outputCol='buckets',
                                                   relativeError=0.01)
        bucketizer_model = quantile_discretizer.fit(self._data_frame)
        # splits look like [-Inf, Q1, Median, Q3, Inf]; replace the infinite
        # endpoints with values just outside the observed range
        splits = bucketizer_model.getSplits()
        splits[0] = min_value - 0.1
        splits[-1] = max_value + 0.1
    else:
        splits = split_points

    # cast column_name to double type if needed, otherwise Bucketizer does not work
    if self._column_data_types.get(column_name) != DoubleType:
        column_df = self._data_frame.select(
            FN.col(column_name).cast('double').alias('values'))
    else:
        column_df = self._data_frame.select(
            FN.col(column_name).alias('values'))

    bucketizer = Bucketizer(inputCol='values', outputCol='bins')
    bucketizer.setSplits(splits)

    if min_value == max_value:
        histogram = Histogram(column_name, self._num_rows)
        bin_number = 0
        start_value = min_value - 0.5
        end_value = max_value + 0.5
        histogram.add_bin(bin_number, start_value, end_value, self._num_rows)
    else:
        buckets_and_counts = bucketizer.transform(column_df).groupBy(
            'bins').agg({'*': 'count'}).collect()
        histogram = Histogram(column_name, self._num_rows)
        for row in buckets_and_counts:
            bin_number = int(row[0])
            start_value = splits[bin_number]
            end_value = splits[bin_number + 1]
            histogram.add_bin(
                bin_number, start_value, end_value,
                float(row[1]) * 100.0 / (end_value - start_value))

    return histogram
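# A standalone sketch of the Bucketizer API used by get_bins() above (an assumption
# that a SparkSession named `spark` is available): explicit splits bin a double
# column, and grouping on the bucket index yields per-bin counts.
from pyspark.ml.feature import Bucketizer

values_df = spark.createDataFrame([(1.0,), (3.5,), (7.2,), (9.9,)], ['values'])
splits = [0.0, 2.5, 5.0, 7.5, 10.0]  # four bins; the last bucket includes the upper bound

bucketizer = Bucketizer(inputCol='values', outputCol='bins', splits=splits)
bucketizer.transform(values_df).groupBy('bins').count().orderBy('bins').show()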
# the new column objects in a `dict` of the form {'new_col_name': # new_col_object}, and add all the columns to `df_agg` in a loop at the end. # # We name the new columns with a suffix like `_2d` indicating that the # column is an aggregation of days up to 2 days back in time # # These summary columns are generally some aggregation function of the # corresponding `_0d` summary, over the new sliding window: # `f.some_func(f.col(`feat_some_func_0d`)).over(window). # We show a few simple examples, feel free to add your own! feat = settings['feature'] + '_' # convenience dd = '_{}d'.format(days) # column name suffix indicating window size new_cols = { # min = min of daily mins feat + 'min' + dd: f.min(f.col(feat + 'min_0d')).over(window), # max = max of daily maxes feat + 'max' + dd: f.max(f.col(feat + 'max_0d')).over(window), # sum = sum of daily sums feat + 'sum' + dd: f.sum(f.col(feat + 'sum_0d')).over(window), # count = sum of daily counts feat + 'count' + dd: f.sum(f.col(feat + 'count_0d')).over(window), # A few more complicated examples:
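# A hedged sketch of the "add all the columns in a loop at the end" step described
# above, assuming (from the surrounding context) that `df_agg` is the aggregated
# frame being built and `new_cols` is the completed {'new_col_name': new_col_object}
# dict of window expressions.
for col_name, col_expr in new_cols.items():
    df_agg = df_agg.withColumn(col_name, col_expr)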
'float'), 2).alias('Close'), result_desc['Volume'].cast('int').alias('Volume') ).show() hv_ratio = walmartdf.withColumn( "HV ratio", walmartdf['High'] / walmartdf['Volume']) hv_ratio.select('HV Ratio').show() # finding highest value date walmartdf.orderBy(walmartdf['High'].desc()).head(1)[0][0] walmartdf.agg(mean(walmartdf['Close'])).show() walmartdf.select(max(walmartdf['Volume']), min('Volume')).show() walmartdf.filter(walmartdf['Close'] < 60).count() (walmartdf.filter(walmartdf['High'] > 80).count( ) / walmartdf.agg(count(walmartdf['Date'])).head(1)[0][0]) * 100 newdf = walmartdf.withColumn("year", year(walmartdf['Date'])) newdf.groupby("year").max().select('year', 'max(High)').show() newdf2 = walmartdf.withColumn("month", month(walmartdf['Date'])) newdf2.groupBy("month").mean().select( 'month', 'avg(Close)').orderBy('month').show()
def group_by_day(df, *, spark_session, group_keys, from_date_field, to_date_field,
                 join_partitions=100, count_partitions=3,
                 min_cutoff_date=None, max_cutoff_date=None,
                 drop_date_fields=True):
    """Count entries per day grouped by the keys in group_keys.

    Args:
        df (DataFrame): Incoming spark dataframe.

    Kwargs:
        spark_session (SparkSession): Spark session used to build date dataframe.
        group_keys (list): List of strings with keys to group by.
        from_date_field (str): From date field name.
        to_date_field (str): To date field name.
        join_partitions (int): Number of partitions for the join with dates.
        count_partitions (int): Number of partitions after the count.
        min_cutoff_date (datetime.date): Drop all data before this date.
        max_cutoff_date (datetime.date): Drop all data after this date, can often be
            today; defaults to today when not given.
        drop_date_fields (bool): If True drop original from_date and to_date fields
            from result.

    Returns:
        Spark DataFrame with grouped counts per key combination per day.

    Example input: DataFrame
        +-----------+---------+---------+-------+---------+--------+
        |customer_id|prod_code|shop_code|channel|from_date| to_date|
        +-----------+---------+---------+-------+---------+--------+
        |       1234|      ABC|    SHOP1|      A| 20160606|20160610|
        |       3456|      ABC|    SHOP1|      A| 20160607|20160611|
        +-----------+---------+---------+-------+---------+--------+

        group_keys=['prod_code', 'shop_code', 'channel']
        from_date_field='from_date'
        to_date_field='to_date'
        drop_date_fields=True

    Example output: DataFrame
        +--------+-------+---------+---------+-----+
        | datenum|channel|shop_code|prod_code|count|
        +--------+-------+---------+---------+-----+
        |20160606|      A|    SHOP1|      ABC|    1|
        |20160607|      A|    SHOP1|      ABC|    2|
        +--------+-------+---------+---------+-----+
    """
    # Evaluate the "today" default at call time rather than at import time.
    if max_cutoff_date is None:
        max_cutoff_date = datetime.date.today()

    df_with_dates = df\
        .withColumn('date_from', F.col(from_date_field))\
        .withColumn('date_to', F.col(to_date_field))

    dates_range = df_with_dates.agg(F.min(F.col("date_from")),
                                    F.max(F.col("date_to"))).collect()
    all_dates = all_dates_between(dates_range[0][0], dates_range[0][1],
                                  min_cutoff_date, max_cutoff_date)
    dates_df = spark_session.createDataFrame(all_dates, T.DateType())

    between = dates_df['value'].between(df_with_dates['date_from'],
                                        df_with_dates['date_to'])
    result = df_with_dates.repartition(join_partitions)\
        .join(dates_df, on=between)

    # 'yyyy' is the calendar year; 'YYYY' is the week-based year and gives wrong
    # values around year boundaries.
    datenum_type = F.date_format(F.col('datenum'), 'yyyyMMdd').cast("int")
    result = result\
        .withColumn('datenum', result['value'])\
        .withColumn('datenum', datenum_type)\
        .drop('value')\
        .drop('date_from')\
        .drop('date_to')

    if drop_date_fields:
        result = result.drop(from_date_field).drop(to_date_field)

    new_group_keys = group_keys[:]
    new_group_keys.append('datenum')
    return result.groupBy(new_group_keys)\
        .count().repartition(count_partitions)
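# The helper all_dates_between() used above is not shown; the following is a plausible
# sketch (an assumption, not the original implementation), which returns every
# datetime.date between the two bounds, clipped to the optional cutoffs. It assumes
# the from/to columns already hold date values.
import datetime

def all_dates_between(start_date, end_date, min_cutoff_date=None, max_cutoff_date=None):
    # Clip the range to the cutoffs when they are provided.
    if min_cutoff_date is not None:
        start_date = max(start_date, min_cutoff_date)
    if max_cutoff_date is not None:
        end_date = min(end_date, max_cutoff_date)
    n_days = (end_date - start_date).days
    return [start_date + datetime.timedelta(days=i) for i in range(n_days + 1)]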
def lift_splitted(sqc,
                  query,
                  target='true_target',
                  proba='target_proba',
                  split_by=('model_name', 'business_dt'),
                  cost=None,
                  n_buckets=100):
    """ Calculate lift function over splits. Requires sqlContext

        Input:
        sqc - sqlContext of spark session
        query - query returning a DataFrame with true_target, target_proba and split columns
        split_by - list of columns to calculate lift independently
        target - binary actual values to predict
        cost - optional, charges
        proba - probabilities of target = 1
        n_buckets - number of buckets to bin lift function

        Output:
        pdf - pandas DataFrame with cumulative lift and coverage
    """
    import pandas as pd
    import pyspark.sql.functions as F

    split_by = list(split_by)
    # comma-separated column list used inside the SQL text
    sb = ', '.join(split_by)

    sqc.sql(query).registerTempTable("tmp_lift_splitted")

    # generate ntiles
    if cost is not None:
        sql = """select {sb}, CAST({target} AS INT) {target}, {proba}, {cost},
                 NTILE({nb}) OVER (PARTITION BY {sb} ORDER BY {proba} DESC) as tile
                 FROM tmp_lift_splitted t
              """.format(sb=sb, target=target, proba=proba, cost=cost, nb=n_buckets)
    else:
        sql = """select {sb}, CAST({target} AS INT) {target}, {proba},
                 NTILE({nb}) OVER (PARTITION BY {sb} ORDER BY {proba} DESC) as tile
                 FROM tmp_lift_splitted t
              """.format(sb=sb, target=target, proba=proba, nb=n_buckets)

    sdf = sqc.sql(sql)

    if cost is not None:
        pdf = sdf.groupby(split_by + ['tile']).agg(
            F.sum(F.col(target)).alias("target_sum"),
            F.count(F.col(target)).alias("target_cnt"),
            F.min(F.col(proba)).alias("target_proba_min"),
            F.max(F.col(proba)).alias("target_proba_max"),
            F.sum(F.col(cost)).alias("cost_sum"),
            F.sum(F.col(cost).isNotNull().cast('integer')).alias("cost_cnt")).toPandas()
        pdf['cost_sum'] = pdf['cost_sum'].astype(float)
    else:
        pdf = sdf.groupby(split_by + ['tile']).agg(
            F.sum(F.col(target)).alias("target_sum"),
            F.count(F.col(target)).alias("target_cnt"),
            F.min(F.col(proba)).alias("target_proba_min"),
            F.max(F.col(proba)).alias("target_proba_max")).toPandas()

    if 'business_dt' in split_by:
        pdf['business_dt'] = pd.to_datetime(pdf['business_dt'], errors='coerce')

    pdf = pdf.sort_values('tile')
    grouped = pdf.groupby(split_by, as_index=False)
    pdf['target_cum_sum'] = grouped.target_sum.cumsum()
    pdf['target_cum_cnt'] = grouped.target_cnt.cumsum()
    if cost is not None:
        pdf['charge_cum_sum'] = grouped.cost_sum.cumsum()
        pdf['charge_cum_cnt'] = grouped.cost_cnt.cumsum()

    pdf = pdf.sort_values(split_by).set_index(split_by)
    pdf['response_cum'] = pdf.target_cum_sum / pdf.target_cum_cnt
    pdf['target_sum_base'] = pdf.loc[pdf.tile == n_buckets, 'target_cum_sum']
    pdf['target_cnt_base'] = pdf.loc[pdf.tile == n_buckets, 'target_cum_cnt']
    pdf['lift'] = pdf['response_cum'] / (pdf['target_sum_base'] / pdf['target_cnt_base'])
    pdf['coverage'] = pdf['target_cum_sum'] / pdf['target_sum_base']
    if cost is not None:
        pdf['charge_average'] = pdf.cost_sum / pdf.cost_cnt
        pdf['charge_gain'] = pdf.charge_cum_sum / pdf.loc[pdf.tile == n_buckets, 'charge_cum_sum'] \
            .loc[pdf.index]

    return pdf
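# A hedged usage sketch for lift_splitted(); the table and column names
# (scoring_results, model_name, business_dt, true_target, target_proba) are
# illustrative assumptions rather than part of the original code.
lift_pdf = lift_splitted(
    sqc=sqlContext,
    query="SELECT model_name, business_dt, true_target, target_proba FROM scoring_results",
    target='true_target',
    proba='target_proba',
    split_by=['model_name', 'business_dt'],
    n_buckets=100)
# Each (model_name, business_dt) pair now has 100 tiles with cumulative lift and coverage.
print(lift_pdf[['tile', 'lift', 'coverage']].head())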
# |     ZJV03Y00|
# |     ZDEB33GK|
# |     Z302T6CW|
# +-------------+
# only showing top 5 rows

spark.sql("""SELECT
    model,
    min(capacity_bytes / pow(1024, 3)) min_GB,
    max(capacity_bytes / pow(1024, 3)) max_GB
FROM backblaze_stats_2019
GROUP BY 1
ORDER BY 3 DESC""").show(5)

backblaze_2019.groupby(F.col("model")).agg(
    F.min(F.col("capacity_bytes") / F.pow(F.lit(1024), 3)).alias("min_GB"),
    F.max(F.col("capacity_bytes") / F.pow(F.lit(1024), 3)).alias("max_GB"),
).orderBy(F.col("max_GB"), ascending=False).show(5)

# +--------------------+--------------------+-------+
# |               model|              min_GB| max_GB|
# +--------------------+--------------------+-------+
# | TOSHIBA MG07ACA14TA|             13039.0|13039.0|
# |       ST12000NM0007|-9.31322574615478...|11176.0|
# |HGST HUH721212ALN604|             11176.0|11176.0|
# |       ST10000NM0086|              9314.0| 9314.0|
# |HGST HUH721010ALE600|              9314.0| 9314.0|
# +--------------------+--------------------+-------+
# only showing top 5 rows

spark.sql("""SELECT
def create_bin():
    lat_long_rdd = reading_file()
    # agg() returns a one-row DataFrame; pull out the scalar minimum longitude
    min_longi = lat_long_rdd.agg(F.min(lat_long_rdd.Longi)).collect()[0][0]
    f.write(str(min_longi))
hitsCount = diffDF.select("URL", "session_id")\ .groupBy("session_id")\ .agg(F.count("URL"),F.countDistinct("URL")) # Saving output of hitsCount dataframe to disk hitsCount.coalesce(1).write.mode("overwrite").csv( "hdfs:///root/pydev/dataset/hit_count", header='true') # Solution for Question 2 # Generating Avg session time as "avg(SESSION_TIME) group by client_ip", where # SESSION_TIME is calculated as difference between max and min of event time (rTS) grouped by client_ip, SESSION_ID sessionTimeBySessionID = diffDF.select("rTS", "client_ip", "session_id")\ .groupBy("client_ip", "session_id")\ .agg(F.max("rTS") - F.min("rTS"))\ .withColumnRenamed("(max(rTS) - min(rTS))", "SESSION_TIME") avgSessionTimeByUser = sessionTimeBySessionID.select("client_ip", "SESSION_TIME")\ .groupBy("client_ip")\ .agg(F.avg("SESSION_TIME")) # Saving output of avgSessionTimeByUser dataframe to disk avgSessionTimeByUser.coalesce(1).write.mode("overwrite").csv( "hdfs:///root/pydev/dataset/AvgSession_Time", header='true') # Question 4 # Most engaged users, i.e. the IP with the longest session time # Fetching IP with max(SESSION_TIME) within entire WebLog dataset windowSpec = Window\
def fn(col): if 'window' in kwargs: window = kwargs['window'] return F.min(col).over(window) else: return F.min(col)
StructField("TransID", IntegerType()), StructField("CustID", IntegerType()), StructField("TransTotal", FloatType()), StructField("TransNumItems", IntegerType()), StructField("TransDesc", StringType()) ]) purchases = spark.read.csv(path="hdfs://localhost:9000/user/Purchases.txt", schema=p_schema) purchases.createOrReplaceTempView("Purchases") T1 = spark.sql("SELECT * FROM Purchases WHERE TransTotal <= 600") T1.write.json(path="hdfs://localhost:9000/user/T1", mode="overwrite") T2 = T1.groupby("TransNumItems").agg( F.min("TransTotal"), F.max("TransTotal"), F.expr("percentile_approx(TransTotal,0.5)").alias("median(TransTotal)")) T2.write.json(path="hdfs://localhost:9000/user/T2", mode="overwrite") T3 = T1.join(customers, T1.CustID == customers.ID, "inner") T3 = T3.filter((T3.Age <= 25) & (T3.Age >= 18)).groupby("CustID") T3 = T3.agg( F.first("Age").alias("Age"), F.sum("TransNumItems").alias("TotalItems"), F.sum("TransTotal").alias("TotalSpent")) T3.write.json(path="hdfs://localhost:9000/user/T3", mode="overwrite") rows = T3.collect() hits = [] n = len(rows) for i in range(n):
print "schema :", auctionDf.schema print "Describe() :\n", auctionDf.describe().show() print "explain() :\n", auctionDf.explain(extended=True) print "\nAction Data Analysis..............." print "\nauctionid Distinct Count :", auctionDf.select( "auctionid").distinct().count() print "itemtype Distinct Count :", auctionDf.select( "itemtype").distinct().count() print "itemtype Distinct Values :", auctionDf.select( "itemtype").distinct().show() print "\nGroupby :", auctionDf.groupBy("itemtype", "auctionid").count().agg( func.max("count"), func.min("count"), func.avg("count")).show() print "Groupby :", auctionDf.groupBy("itemtype", "auctionid").agg(func.max("bid"), func.min("bid"), func.avg("bid")).show() print "\nFilter :", auctionDf.filter(auctionDf.price > 200).count() xboxes = sqlc.sql( "SELECT itemtype,auctionid,bid,bidtime,bidder,bidderrate,openbid,price FROM auctions WHERE itemtype = 'xbox'" ) print "SQL xboxes:\n", xboxes.head(5) print "SQL xboxes describe:\n", xboxes.describe().show() sc.stop() tnow = datetime.datetime.now() print("\n\nEnd of Program...Spark : %s" % tnow)
def reconstruct(self, df_trajectory_bucketed, df_type='pandas'): # Assert Implemented Methods assert df_type in { 'pandas', 'spark' }, 'reconstruct@<TrajectoryReconstructorLinear>: df_type = "{}" is not implemented!'.format( df_type) # Reconstruct if (df_type == 'pandas'): # Sort and Copy df_result = df_trajectory_bucketed.copy().sort_values( ['id_user', 'id_timestamp']) df_result.reset_index(drop=True, inplace=True) # ID TimeStamp Bounds id_timestamp_min = df_result['id_timestamp'].min() id_timestamp_max = df_result['id_timestamp'].max() # New Data data = [] _n = len(df_result) for i, (id_user, id_timestamp, lat, lng) in enumerate( zip(df_result['id_user'], df_result['id_timestamp'], df_result['lat'], df_result['lng'])): # First Row and Missing TimeStamps if (i == 0 and id_timestamp > id_timestamp_min): data.extend([[id_user, t, lat, lng] for t in range(id_timestamp_min, id_timestamp) ]) # Get Next Row if Possible if (i < (_n - 1)): # Next id_user_next = df_result['id_user'][i + 1] id_timestamp_next = df_result['id_timestamp'][i + 1] lat_next = df_result['lat'][i + 1] lng_next = df_result['lng'][i + 1] # Linear Interpolation if (id_user == id_user_next and (id_timestamp + 1) < id_timestamp_next): # Line Slopes slope_lat = (lat_next - lat) / (id_timestamp_next - id_timestamp) slope_lng = (lng_next - lng) / (id_timestamp_next - id_timestamp) # Update for t in range(id_timestamp + 1, id_timestamp_next): dt = (t - id_timestamp) data.append([ id_user, t, lat + dt * slope_lat, lng + dt * slope_lng ]) elif (id_user != id_user_next): # New User Encountered data.extend([[ id_user, t, lat, lng ] for t in range(id_timestamp, id_timestamp_max + 1)]) data.extend([[ id_user_next, t, lat, lng ] for t in range(id_timestamp_min, id_timestamp_next)]) else: # Last Row and Missing Following TimeStamps if (id_timestamp < id_timestamp_max): data.extend( [[id_user, t, lat, lng] for t in range((id_timestamp + 1), (id_timestamp_max + 1))]) # Append New Data df_result = pd.concat([ df_result, pd.DataFrame(data, columns=['id_user', 'id_timestamp', 'lat', 'lng']) ]).sort_values(['id_user', 'id_timestamp']).reset_index(drop=True) elif (df_type == 'spark'): # ID TimeStamp Bounds row = df_trajectory_bucketed.agg( sql_functions.min(sql_functions.col("id_timestamp")).alias( "id_timestamp_min"), sql_functions.max(sql_functions.col("id_timestamp")).alias( "id_timestamp_max")).head() id_timestamp_min, id_timestamp_max = row['id_timestamp_min'], row[ 'id_timestamp_max'] # Define Imputer Function def linear_interpolator(x_arr): # Result result = [] # Sort by TimeStamp x_arr = sorted(x_arr, key=lambda x: x[0]) # First TimeStamp id_timestamp_first, lat_first, lng_first = x_arr[0] for id_timestamp in range(id_timestamp_min, id_timestamp_first): result.append((id_timestamp, lat_first, lng_first)) # Linear Reconstruction for data_now, data_next in zip(x_arr[:-1], x_arr[1:]): # Extract Info id_timestamp_now, lat_now, lng_now = data_now id_timestamp_next, lat_next, lng_next = data_next # Add Now result.append((id_timestamp_now, lat_now, lng_now)) # Skip Linear Interpolation assert id_timestamp_next > id_timestamp_now, 'Sorting has gone wrong!' 
if ((id_timestamp_next - id_timestamp_now) == 1): continue # Linear Interpolation ## Slopes dt = id_timestamp_next - id_timestamp_now slope_lat = (lat_next - lat_now) / dt slope_lng = (lng_next - lng_now) / dt ## Add Inner Points for i in range(1, id_timestamp_next - id_timestamp_now): id_timestamp = id_timestamp_now + i result.append((id_timestamp, lat_now + i * slope_lat, lng_now + i * slope_lng)) # Last TimeStamp id_timestamp_last, lat_last, lng_last = x_arr[-1] for id_timestamp in range(id_timestamp_last, id_timestamp_max + 1): result.append((id_timestamp, lat_last, lng_last)) # Return return (result) udf_linear_interpolator = sql_functions.udf( linear_interpolator, sql_types.ArrayType(SCHEMA_DATA_POINT, False)) # Aggregate, Apply UDF & Explode ## Aggregate Data Points df_result = df_trajectory_bucketed.withColumn( "data_point", udf_gather_data_point( sql_functions.col("id_timestamp"), sql_functions.col("lat"), sql_functions.col("lng"), )).groupby("id_user").agg( sql_functions.collect_list(sql_functions.col( "data_point")).alias("data_point_list")) ## Apply Linear Interpolation df_result = df_result.select( "id_user", udf_linear_interpolator(sql_functions.col( "data_point_list")).alias("data_point_list")) ## Explode to Comply with DataCatalogue df_result = df_result.withColumn( "data_point", sql_functions.explode( sql_functions.col("data_point_list"))).select( "id_user", "data_point.id_timestamp", "data_point.lat", "data_point.lng") # Return return (df_result)
logs_df.count() # count by different code type logs_df.groupBy("code").count().show() # rank by counts from pyspark.sql.functions import asc, desc logs_df.groupBy('code').count().orderBy(desc('count')).show() # calculate average size of different code logs_df.groupBy("code").avg("bytes").show() # more calculation by code - average, min, max import pyspark.sql.functions as F logs_df.groupBy("code").agg( logs_df.code, F.avg(logs_df.bytes), F.min(logs_df.bytes), F.max(logs_df.bytes) ).show() # homework # 1 yelp_df.select("cool").agg({"cool" : "mean"}).collect() # 2 import pyspark.sql.functions as F yelp_df.filter('review_count >= 10').groupBy("stars").agg(yelp_df.stars, F.avg(yelp_df.cool)).show() # 3 yelp_df.filter((yelp_df.review_count >= 10) & (yelp_df.open == 'True')).groupBy("stars").agg(yelp_df.stars, F.avg(yelp_df.cool)).show() # 4 from pyspark.sql.functions import asc, desc yelp_df.filter((yelp_df.review_count >= 10) & (yelp_df.open == 'True')).groupBy('state').count().orderBy(desc('count')).show()
sqlCtx = SQLContext(sc) lines = sc.parallelize(["m1,d1,1", "m1,d2,2", "m2,d1,1", "m2,d2,2"]) record = lines.map(lambda line: line.split(",")).map( lambda columns: Row(machine=columns[0], domain=columns[1], request=columns[2])) recordSchema = sqlCtx.createDataFrame(record) recordSchema.groupBy().agg({"*": "count"}).show() recordSchema.groupBy("machine", recordSchema["domain"]).agg( {"domain": "max", "request": "min"}).show() recordSchema.groupBy("machine", recordSchema.domain).agg(functions.count("*"), functions.max( recordSchema.request), functions.min(recordSchema["request"]), functions.sum(recordSchema["request"]), functions.avg(recordSchema["request"])).show() recordSchema.select(recordSchema.machine, recordSchema.request.cast( "int")).groupBy("machine").count().show() recordSchema.select(recordSchema.machine, recordSchema.request.cast( "int").alias("request")).groupBy("machine").max("request").show() recordSchema.select(recordSchema.machine, recordSchema.request.cast( "int").alias("request")).groupBy("machine").min("request").show() recordSchema.select(recordSchema.machine, recordSchema.request.cast( "int").alias("request")).groupBy("machine").sum("request").show() recordSchema.select(recordSchema.machine, recordSchema.request.cast( "int").alias("request")).groupBy("machine").avg("request").show()
trunc_df = yelp_df.filter("review_count>=10 and open = 'True'").groupBy("state").count()
trunc_df.orderBy(desc("count")).collect()

################### /usr/lib/hue/apps/search/examples/collections/solr_configs_log_analytics_demo/index_data.csv
logs_df = sqlCtx.load(source="com.databricks.spark.csv", header='true', inferSchema='true', path='index_data_http.csv')
sc._jsc.hadoopConfiguration().set('textinputformat.record.delimiter', '\r\n')

from pyspark.sql.functions import asc, desc
logs_df.groupBy("code").count().orderBy(desc("count")).show()
logs_df.groupBy("code").avg("bytes").show()

import pyspark.sql.functions as F
logs_df.groupBy("code").agg(logs_df.code, F.avg(logs_df.bytes), F.min(logs_df.bytes), F.max(logs_df.bytes)).show()

###########################################
yelp_df = sqlCtx.load(source='com.databricks.spark.csv', header='true', inferSchema='true', path='index_data.csv')
yelp_df.registerTempTable("yelp")
filtered_yelp = sqlCtx.sql("SELECT * FROM yelp WHERE useful >= 1")
filtered_yelp.count()
sqlCtx.sql("SELECT MAX(useful) AS max_useful FROM yelp").collect()

useful_perc_data.join(yelp_df, yelp_df.id == useful_perc_data.uid, "inner").select(useful_perc_data.uid, "useful_perc", "review_count")
useful_perc_data.registerTempTable("useful_perc_data")
sqlCtx.sql(
    """SELECT useful_perc_data.uid, useful_perc, review_count
    FROM useful_perc_data
#set time variables for date filtering time = datetime.datetime.now() epochtime = int(time.strftime("%s")) start_time = epochtime - 86400 compare_time = datetime.datetime.fromtimestamp(start_time) #create a dataframe from the raw metrics rawmetrics = sqlContext.read.format("org.apache.spark.sql.cassandra").options(table="raw_metrics", keyspace="metrics").load() #filter metrics to those in last 24 hours last_day = rawmetrics.where(rawmetrics.metric_time > compare_time) #aggregates averages = last_day.groupby('device_id').agg(func.avg('metric_value').alias('metric_avg')) maximums = last_day.groupby('device_id').agg(func.max('metric_value').alias('metric_max')) minimums = last_day.groupby('device_id').agg(func.min('metric_value').alias('metric_min')) #rename id columns for uniqueness averages_a = averages.withColumnRenamed("device_id", "id") maximums_a = maximums.withColumnRenamed("device_id", "maxid") minimums_a = minimums.withColumnRenamed("device_id", "minid") #join the tables above temp = averages_a.join(maximums_a, averages_a.id == maximums_a.maxid) aggs = temp.join(minimums, temp.id == minimums.device_id).select('id','metric_min','metric_max','metric_avg') #add columns to format for cassandra addday = aggs.withColumn("metric_day", lit(time)) addname = addday.withColumn("metric_name",lit("KWH")) inserts = addname.withColumnRenamed("id","device_id")
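# A hedged alternative to the three separate aggregations and joins above: a single
# groupBy().agg() over the same last_day DataFrame computes min, max and avg in one
# pass, so no renaming or joining of intermediate frames is needed.
aggs_single_pass = last_day.groupby('device_id').agg(
    func.min('metric_value').alias('metric_min'),
    func.max('metric_value').alias('metric_max'),
    func.avg('metric_value').alias('metric_avg'))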
# COMMAND ---------- from pyspark.sql.functions import approx_count_distinct df.select(approx_count_distinct("StockCode", 0.1)).show() # 3364 # COMMAND ---------- from pyspark.sql.functions import first, last df.select(first("StockCode"), last("StockCode")).show() # COMMAND ---------- from pyspark.sql.functions import min, max df.select(min("Quantity"), max("Quantity")).show() # COMMAND ---------- from pyspark.sql.functions import sum df.select(sum("Quantity")).show() # 5176450 # COMMAND ---------- from pyspark.sql.functions import sumDistinct df.select(sumDistinct("Quantity")).show() # 29310 # COMMAND ----------
FROM %s WHERE dt >= '%s' AND dt <= '%s' ''' % (spa_utils.rename('app.app_pa_festival_features', params), update_start, update_end))
#df_time = sql('''select * from app.app_pa_festival_features where dt between '2018-01-20' and '2018-04-20' ''')
df_time = df_time.withColumnRenamed('dt', 'date')
df_time.cache()

# find the start date and end date of the records for each cid3
df_sales_dtcid3_start_end_date = df_sales_dtcid3\
    .groupby('item_third_cate_cd')\
    .agg(F.min('date').alias('start_date'), F.max('date').alias('end_date'))

# fill in the date features over each cid3's date span
df_cid3_duration_time = df_sales_dtcid3_start_end_date\
    .join(df_time,
          (df_time['date'] >= df_sales_dtcid3_start_end_date['start_date'])
          & (df_time['date'] <= df_sales_dtcid3_start_end_date['end_date']),
          'left')\
    .drop('start_date', 'end_date')
df_cid3_duration_time.cache()

df_complete = df_cid3_duration_time\
    .join(df_sales_dtcid3, ['date', 'item_third_cate_cd'], 'left')\
    .fillna(0)
Query d) The users with the oldest and the most recent creation dates, respectively
'''

''' Required in order to use the to_date function '''
from pyspark.sql.functions import *

df.select("*")\
  .where((to_date(df.CreationDate) ==
          df.select(min(to_date("CreationDate")).alias("min")).collect()[0].min)
         | (to_date(df.CreationDate) ==
            df.select(max(to_date("CreationDate")).alias("max")).collect()[0].max))\
  .orderBy(to_date("CreationDate"))\
  .show()

''' Comparing dates down to the millisecond '''
''' Oldest user '''
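''' An alternative sketch for query d): collect both date bounds with a single
    select() call and filter once, instead of issuing two separate collect()s inside
    the filter (same df and star import as above). '''
bounds = df.select(min(to_date("CreationDate")).alias("min_d"),
                   max(to_date("CreationDate")).alias("max_d")).collect()[0]
df.where((to_date(df.CreationDate) == bounds.min_d) |
         (to_date(df.CreationDate) == bounds.max_d))\
  .orderBy(to_date("CreationDate"))\
  .show()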
test_df = test_df.select(*(all_cols + ['Id', 'Date'])).cache() # Build vocabulary of categorical columns. vocab = build_vocabulary(train_df.select(*categorical_cols) .unionAll(test_df.select(*categorical_cols)).cache(), categorical_cols) # Cast continuous columns to float & lookup categorical columns. train_df = cast_columns(train_df, continuous_cols + ['Sales']) train_df = lookup_columns(train_df, vocab) test_df = cast_columns(test_df, continuous_cols) test_df = lookup_columns(test_df, vocab) # Split into training & validation. # Test set is in 2015, use the same period in 2014 from the training set as a validation set. test_min_date = test_df.agg(F.min(test_df.Date)).collect()[0][0] test_max_date = test_df.agg(F.max(test_df.Date)).collect()[0][0] a_year = datetime.timedelta(365) val_df = train_df.filter((test_min_date - a_year <= train_df.Date) & (train_df.Date < test_max_date - a_year)) train_df = train_df.filter((train_df.Date < test_min_date - a_year) | (train_df.Date >= test_max_date - a_year)) # Determine max Sales number. max_sales = train_df.agg(F.max(train_df.Sales)).collect()[0][0] print('===================================') print('Data frame with transformed columns') print('===================================') train_df.show() print('================') print('Data frame sizes')
from pyspark.sql import functions as func from pyspark.sql.types import StructType, StructField, IntegerType, StringType spark = SparkSession.builder.appName("MostObscureSuperheroes").getOrCreate() schema = StructType([ \ StructField("id", IntegerType(), True), \ StructField("name", StringType(), True)]) names = spark.read.schema(schema).option( "sep", " ").csv("file:///SparkCourse/Marvel-names.txt") lines = spark.read.text("file:///SparkCourse/Marvel-graph.txt") # Small tweak vs. what's shown in the video: we trim whitespace from each line as this # could throw the counts off by one. connections = lines.withColumn("id", func.split(func.trim(func.col("value")), " ")[0]) \ .withColumn("connections", func.size(func.split(func.trim(func.col("value")), " ")) - 1) \ .groupBy("id").agg(func.sum("connections").alias("connections")) minConnectionCount = connections.agg(func.min("connections")).first()[0] minConnections = connections.filter( func.col("connections") == minConnectionCount) minConnectionsWithNames = minConnections.join(names, "id") print("The following characters have only " + str(minConnectionCount) + " connection(s):") minConnectionsWithNames.select("name").show()
def fn(col): return ~(F.min(col))
# import SQLContext and pyspark SQL functions
from pyspark.sql import SQLContext, Row
import pyspark.sql.functions as func

sqlContext = SQLContext(sc)

inputRDD = sc.textFile("/user/pravat/auctiondata.csv").map(lambda l: l.split(","))
auctions = inputRDD.map(lambda p: Row(auctionid=p[0], bid=float(p[1]),
                                      bidtime=float(p[2]), bidder=p[3],
                                      bidrate=int(p[4]), openbid=float(p[5]),
                                      price=float(p[6]), itemtype=p[7],
                                      dtl=int(p[8])))

# Infer the schema, and register the DataFrame as a table.
auctiondf = sqlContext.createDataFrame(auctions)
auctiondf.registerTempTable("auctions")
auctiondf.show()
auctiondf.printSchema()

totbids = auctiondf.count()
print totbids

totalauctions = auctiondf.select("auctionid").distinct().count()
print totalauctions

itemtypes = auctiondf.select("itemtype").distinct().count()
print itemtypes

auctiondf.groupBy("itemtype", "auctionid").count().show()
auctiondf.groupBy("itemtype", "auctionid").count().agg(func.min("count"), func.max("count"), func.avg("count")).show()
auctiondf.groupBy("itemtype", "auctionid").agg(func.min("bid"), func.max("bid"), func.avg("bid")).show()
auctiondf.filter(auctiondf.price > 200).count()

xboxes = sqlContext.sql("SELECT auctionid, itemtype,bid,price,openbid FROM auctions WHERE itemtype = 'xbox'")
xboxes.show()
(2, "Michael Armbrust", 1, [250, 100])])\ .toDF("id", "name", "graduate_program", "spark_status") person.stat.crosstab("id", "name").show() # COMMAND ---------- #Get an overview of the summary statistics df.describe().show() #Get specific summary statistics of numeric variables df.select("quantity").summary("min", "max").show() #Alternative from pyspark.sql.functions import min, max df.select(min("quantity"), max("quantity")).show() #Get a specific statistic spark.sql("select max(quantity) from df").show() from pyspark.sql.functions import max df.select(max("quantity")).show() # COMMAND ---------- #Calculate the mean quantity sold and the number of customers spark.sql("select avg(quantity), count(distinct(customerid)) from df").show() from pyspark.sql.functions import avg, expr df.select(avg("quantity"), expr("count(distinct(customerid))")).show() df.selectExpr("avg(quantity)", "count(distinct(customerid))").show()
from pyspark.sql import functions as F #Creating data frame from list data = [('John', 'Smith', 47),('Jane', 'Smith', 22), ('Frank', 'Jones', 28)] schema = ['fname', 'lname', 'age'] df = sqlContext.createDataFrame(data, schema) df #Retrieving contents of data frame df.printSchema() df.show() df.first() df.count() #Adding columns df = df.withColumn('salary', F.lit(0)) df.show() df.withColumn('salary2', df['age'] * 100).show() #Filtering and subsetting df.filter(df['age'] > 30).select('fname','age').show() df.select(F.max('age').alias('max-age')).show() #Grouped aggregations df.groupBy('lname').max('age').show() df.groupBy('lname').agg(F.avg('age').alias('avg-age'), F.min('age'), F.max('age')).show()
from pyspark.sql import SparkSession from pyspark.sql import functions as func from pyspark.sql.types import StructType, StructField, IntegerType, StringType spark = SparkSession.builder.appName("MostPopularSuperhero").getOrCreate() schema = StructType([ \ StructField("id", IntegerType(), True), \ StructField("name", StringType(), True)]) names = spark.read.schema(schema).option("sep", " ").csv("file:///sparkcourse/Marvel+names") lines = spark.read.text("file:///sparkcourse/Marvel+graph") connections = lines.withColumn("id", func.split(func.col("value"), " ")[0]) \ .withColumn("connections", func.size(func.split(func.col("value"), " ")) - 1) \ .groupBy("id").agg(func.sum("connections").alias("connections")) minconnect = connections.agg(func.min("connections")).first()[0] minconnections = connections.filter(func.col("connections") == minconnect) minconn_names = minconnections.join(names, "id") minconn_names.select("name").show(minconn_names.count())
content_size_summary_df = logs_df.describe(['content_size']) content_size_summary_df.show() # COMMAND ---------- # MAGIC %md # MAGIC # MAGIC Alternatively, we can use SQL to directly calculate these statistics. You can explore the many useful functions within the `pyspark.sql.functions` module in the [documentation](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions). # MAGIC # MAGIC After we apply the `.agg()` function, we call `.first()` to extract the first value, which is equivalent to `.take(1)[0]`. # COMMAND ---------- from pyspark.sql import functions as sqlFunctions contentSizeStats = (logs_df .agg(sqlFunctions.min(logs_df['content_size']), sqlFunctions.avg(logs_df['content_size']), sqlFunctions.max(logs_df['content_size'])) .first()) print 'Using SQL functions:' print 'Content Size Avg: {1:,.2f}; Min: {0:.2f}; Max: {2:,.0f}'.format(*contentSizeStats) # COMMAND ---------- # MAGIC %md # MAGIC ### (3b) Example: HTTP Status Analysis # MAGIC # MAGIC Next, let's look at the status values that appear in the log. We want to know which status values appear in the data and how many times. We again start with `logs_df`, then group by the `status` column, apply the `.count()` aggregation function, and sort by the `status` column. # COMMAND ----------
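# COMMAND ----------

# A quick hedged check of the `.agg(...).first()` pattern described above, using the
# same logs_df: the aggregate produces exactly one row, so .first() and .take(1)[0]
# return the same Row.
row_via_first = logs_df.agg(sqlFunctions.min(logs_df['content_size'])).first()
row_via_take = logs_df.agg(sqlFunctions.min(logs_df['content_size'])).take(1)[0]
assert row_via_first == row_via_take

# COMMAND ----------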
StructField("longitude", FloatType(), True), StructField("time", StringType(), True), StructField("summary", StringType(), True), StructField("precipIntensity", FloatType(), True), StructField("precipProbability", FloatType(), True), StructField("temperature", FloatType(), True), StructField("apparentTemperature", FloatType(), True), StructField("humidity", FloatType(), True), StructField("windSpeed", FloatType(), True), StructField("visibility", FloatType(), True), StructField("pressure", FloatType(), True), ] schema = StructType(fields) # Apply the schema to the RDD. weather_df = sqlContext.createDataFrame(weather_csv, schema) print "#######" print weather_df.printSchema() print weather_df.show() print "#######" # Do some simple aggregations weather_agg = weather_df.groupBy("date", "city").agg( F.min(weather_df.temperature), F.max(weather_df.temperature), F.min(weather_df.pressure), F.max(weather_df.pressure)) weather_agg.show() # Save the new dataframes with schemas weather_df.write.save( "hdfs://hadoop:9000/weather_data_schema", mode="overwrite")