def test_mixed_sql_and_udf(self):
    """Pandas UDAF window results must match the builtin SQL aggregates,
    including when both kinds are mixed in one expression or one query.
    """
    frame = self.data
    unbounded = self.unbounded_window
    ordered = self.ordered_window
    udf_max = self.pandas_agg_max_udf
    udf_min = self.pandas_agg_min_udf

    # Pure-UDF difference vs. the pure-SQL equivalent.
    result1 = frame.withColumn(
        'v_diff', udf_max(frame['v']).over(unbounded) - udf_min(frame['v']).over(unbounded))
    expected1 = frame.withColumn(
        'v_diff', max(frame['v']).over(unbounded) - min(frame['v']).over(unbounded))

    # Test mixing sql window function and window udf in the same expression
    result2 = frame.withColumn(
        'v_diff', udf_max(frame['v']).over(unbounded) - min(frame['v']).over(unbounded))
    expected2 = expected1

    # Test chaining sql aggregate function and udf
    result3 = (frame
               .withColumn('max_v', udf_max(frame['v']).over(unbounded))
               .withColumn('min_v', min(frame['v']).over(unbounded))
               .withColumn('v_diff', col('max_v') - col('min_v'))
               .drop('max_v', 'min_v'))
    expected3 = expected1

    # Test mixing sql window function and udf
    result4 = (frame
               .withColumn('max_v', udf_max(frame['v']).over(unbounded))
               .withColumn('rank', rank().over(ordered)))
    expected4 = (frame
                 .withColumn('max_v', max(frame['v']).over(unbounded))
                 .withColumn('rank', rank().over(ordered)))

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
    self.assertPandasEqual(expected2.toPandas(), result2.toPandas())
    self.assertPandasEqual(expected3.toPandas(), result3.toPandas())
    self.assertPandasEqual(expected4.toPandas(), result4.toPandas())
def test_window_functions(self):
    """Exercise the core SQL window functions (max/min/count, row number,
    rank, dense rank, ntile) over a partitioned, ordered window and compare
    against hand-computed expected tuples.
    """
    df = self.sqlCtx.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    # Partition rows by `value`; order rows within each partition by `key`.
    w = Window.partitionBy("value").orderBy("key")
    from pyspark.sql import functions as F
    sel = df.select(
        df.value, df.key,
        # Frame covering the current row and the one following it.
        F.max("key").over(w.rowsBetween(0, 1)),
        F.min("key").over(w.rowsBetween(0, 1)),
        # Unbounded frame: counts every row in the partition.
        F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
        # NOTE(review): rowNumber/denseRank are the pre-Spark-1.6 camelCase
        # names (later renamed row_number/dense_rank); this test appears to
        # target an old Spark release -- confirm before upgrading.
        F.rowNumber().over(w),
        F.rank().over(w),
        F.denseRank().over(w),
        F.ntile(2).over(w),
    )
    rs = sorted(sel.collect())
    # One tuple per input row:
    # (value, key, max, min, count, rowNumber, rank, denseRank, ntile)
    expected = [
        ("1", 1, 1, 1, 1, 1, 1, 1, 1),
        ("2", 1, 1, 1, 3, 1, 1, 1, 1),
        ("2", 1, 2, 1, 3, 2, 1, 1, 1),
        ("2", 2, 2, 2, 3, 3, 3, 2, 2),
    ]
    for r, ex in zip(rs, expected):
        self.assertEqual(tuple(r), ex[: len(r)])
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file.

    Parses every downloaded ``medline*.xml.gz`` into three parquet outputs
    under ``save_dir``: the raw records, the latest non-deleted view of each
    PMID, and the grant data.

    Args:
        date_update: date whose ``YYYY_MM_DD`` form is stamped into the
            output file names.
    """
    print("Process MEDLINE file to parquet")
    # remove if folder still exist
    # BUG FIX: the glob pattern was previously passed literally to `rm`
    # (argv lists get no shell expansion), so stale output was never deleted;
    # remove each matched path explicitly instead.
    for stale_path in glob(os.path.join(save_dir, 'medline_*.parquet')):
        subprocess.call(['rm', '-rf', stale_path])

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    # One Row per publication, tagged with its source file name.
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')

    # Keep only the newest record per PMID (file names sorted descending)
    # that is not flagged as deleted.
    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite')
def getValueFieldValueLists(self, handlerId, keyFields, valueFields):
    """For each value field, group the entity by *keyFields*, apply the
    UI-selected aggregation and return the first ``numRows`` aggregated
    values as one list per value field.
    """
    grouped = self.entity.groupBy(keyFields)
    agg = self.options.get("aggregation", self.getDefaultAggregation(handlerId))
    maxRows = int(self.options.get("rowCount", "100"))
    # NOTE(review): groupBy().count() returns a DataFrame in PySpark, so this
    # min() relies on legacy (Py2-style) mixed-type comparison -- confirm.
    numRows = min(maxRows, grouped.count())

    # Aggregation dispatch; anything unrecognized falls back to COUNT.
    aggregators = {"SUM": F.sum, "AVG": F.avg, "MIN": F.min, "MAX": F.max}
    valueLists = []
    for valueField in valueFields:
        aggFn = aggregators.get(agg, F.count)
        valueDf = grouped.agg(aggFn(valueField).alias("agg"))
        # Stable ordering over every key column before sampling rows.
        for keyField in keyFields:
            valueDf = valueDf.sort(F.col(keyField).asc())
        valueDf = valueDf.dropna()
        rows = valueDf.select("agg").take(numRows)
        valueLists.append([row["agg"] for row in rows])
    return valueLists
def reduce_to_ohlc(time, rdd):
    """Reduce a micro-batch of raw "symbol,price,timestamp" CSV lines to
    one-minute OHLC (open/high/low/close) bars and persist both the raw
    transactions and the merged bars to Cassandra.

    Args:
        time: batch time supplied by Spark Streaming (unused in the body).
        rdd: RDD of raw CSV strings, one transaction per line.
    """
    # Parse the CSV lines; silently drop malformed rows (wrong field count).
    row_rdd = rdd.map(lambda row: row.split(',')) \
        .filter(lambda row: len(row) == 3) \
        .map(lambda row: Row(
            symbol=row[0],
            tx_time=datetime.strptime(row[2], '%Y-%m-%d %H:%M:%S.%f'),
            price=float(row[1])
        ))
    sql_context = get_sql_context_instance(rdd.context)
    data = sql_context.createDataFrame(row_rdd)
    data.cache()
    # Persist the raw transactions as-is.
    data.write.format('org.apache.spark.sql.cassandra') \
        .options(table='transactions2', keyspace='stock', cluster='Test Cluster') \
        .mode('append') \
        .save()
    # One bar per (symbol, minute); rows are ordered by tx_time first so
    # first/last pick the opening/closing trade of each minute.
    ohlc = data.select('symbol', truncate_min(data.tx_time).alias('batch_time'), 'price', 'tx_time') \
        .orderBy('tx_time') \
        .groupBy('symbol', 'batch_time') \
        .agg(
            F.first(data.price).alias('open'),
            F.max(data.price).alias('high'),
            F.min(data.price).alias('low'),
            F.last(data.price).alias('close'),
            F.first(data.tx_time).alias('open_time'),
            F.last(data.tx_time).alias('close_time')
        )
    # Pull any bar already stored for the same (symbol, minute) so the new
    # data can be merged with it.
    existing_ohlc = sql_context.read.format('org.apache.spark.sql.cassandra') \
        .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
        .load() \
        .select('symbol', 'batch_time', 'open', 'open_time', 'high', 'low', 'close', 'close_time')
    merged_ohlc = ohlc.join(existing_ohlc,
                            (ohlc.symbol == existing_ohlc.symbol) &
                            (ohlc.batch_time == existing_ohlc.batch_time),
                            'left'
                            )
    # Merge rule: keep the earlier open, the later close, the lower low and
    # the higher high between the stored bar and the freshly computed bar.
    merged_ohlc = merged_ohlc.select(
        ohlc.symbol.alias('symbol'),
        ohlc.batch_time.alias('batch_time'),
        F.when(existing_ohlc.open_time < ohlc.open_time,
               existing_ohlc.open).otherwise(ohlc.open).alias('open'),
        F.when(existing_ohlc.open_time < ohlc.open_time,
               existing_ohlc.open_time).otherwise(ohlc.open_time).alias('open_time'),
        F.when(existing_ohlc.close_time > ohlc.close_time,
               existing_ohlc.close).otherwise(ohlc.close).alias('close'),
        F.when(existing_ohlc.close_time > ohlc.close_time,
               existing_ohlc.close_time).otherwise(ohlc.close_time).alias('close_time'),
        F.when(existing_ohlc.low < ohlc.low,
               existing_ohlc.low).otherwise(ohlc.low).alias('low'),
        F.when(existing_ohlc.high > ohlc.high,
               existing_ohlc.high).otherwise(ohlc.high).alias('high')
    )
    merged_ohlc.write.format('org.apache.spark.sql.cassandra') \
        .options(table='ohlc_1_min2', keyspace='stock', cluster='Test Cluster') \
        .mode('append') \
        .save()
def test_multiple_udfs(self):
    """Several pandas UDAF window columns in one query agree with the
    builtin SQL aggregates over the same unbounded window.
    """
    frame = self.data
    window = self.unbounded_window

    via_udf = (frame
               .withColumn('mean_v', self.pandas_agg_mean_udf(frame['v']).over(window))
               .withColumn('max_v', self.pandas_agg_max_udf(frame['v']).over(window))
               .withColumn('min_w', self.pandas_agg_min_udf(frame['w']).over(window)))
    via_sql = (frame
               .withColumn('mean_v', mean(frame['v']).over(window))
               .withColumn('max_v', max(frame['v']).over(window))
               .withColumn('min_w', min(frame['w']).over(window)))

    self.assertPandasEqual(via_sql.toPandas(), via_udf.toPandas())
def test_timestamp_splitter(test_specs, spark_dataset):
    """Timestamp-based splits honor the requested ratios and are
    chronologically ordered (every row of split i precedes split i+1).
    """
    dfs_rating = spark_dataset.withColumn(
        DEFAULT_TIMESTAMP_COL, col(DEFAULT_TIMESTAMP_COL).cast("float"))

    total = test_specs["number_of_rows"]
    tol = test_specs["tolerance"]

    def _chronological(earlier, later):
        # Latest timestamp of the earlier split must not exceed the earliest
        # timestamp of the later split.
        latest = earlier.agg(F.max(DEFAULT_TIMESTAMP_COL)).first()[0]
        earliest = later.agg(F.min(DEFAULT_TIMESTAMP_COL)).first()[0]
        return latest <= earliest

    # Two-way split.
    splits = spark_timestamp_split(
        dfs_rating, ratio=test_specs["ratio"], col_timestamp=DEFAULT_TIMESTAMP_COL
    )
    assert splits[0].count() / total == pytest.approx(test_specs["ratio"], tol)
    assert splits[1].count() / total == pytest.approx(1 - test_specs["ratio"], tol)
    assert _chronological(splits[0], splits[1])

    # Test multi split
    splits = spark_timestamp_split(dfs_rating, ratio=test_specs["ratios"])
    for idx in range(3):
        assert splits[idx].count() / total == pytest.approx(test_specs["ratios"][idx], tol)
    assert _chronological(splits[0], splits[1])
    assert _chronological(splits[1], splits[2])
def handleUIOptions(self, displayColName):
    """Group the entity by *displayColName*, apply the UI-selected
    aggregation over the configured value fields, and return the result as
    a pandas DataFrame with the aggregate in column "agg".
    """
    agg = self.options.get("aggregation")
    valFields = self.options.get("valueFields")
    grouped = self.entity.groupBy(displayColName)

    # Aggregations that operate on the value fields.
    aggregators = {
        'SUM': F.sum,
        'AVG': F.avg,
        'MIN': F.min,
        'MAX': F.max,
        'MEAN': F.mean,
    }
    if agg in aggregators:
        result = grouped.agg(aggregators[agg](valFields).alias("agg"))
    else:
        # COUNT -- and any unrecognized aggregation -- counts the display
        # column itself rather than the value fields.
        result = grouped.agg(F.count(displayColName).alias("agg"))
    return result.toPandas()
def do_something_only_once():
    """One-time bootstrap: start Spark, load the metadata avro, the topic
    distributions, the dictionary and the LDA CSVs, join them, and record
    the overall document date range in module globals.
    """
    # the command I use to run this script:
    #~/spark-1.6.1/bin/spark-submit --packages=com.databricks:spark-avro_2.10:2.0.1,com.databricks:spark-csv_2.10:1.4.0 server.py
    global topdis, meta, dic, towo, cluto, doctopdat, maxdate, mindate, lda
    ## Loading of data
    sc = SparkContext(appName='Simple App')  #"local"
    sqlContext = SQLContext(sc)
    # Load metadata avro
    reader = sqlContext.read.format('com.databricks.spark.avro')
    meta = reader.load('data/spark_metadata.avro')
    # # Loading topic distributions
    topdisFile = 'data/spark_output.tuples'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    topdis = csvLoader.options(delimiter=',', header='false', inferschema='true').load(topdisFile)
    # Strip the tuple punctuation left in the first/last columns.
    # NOTE(review): assumes C0 looks like "(123", C1 like "(1.23" and C20
    # ends with two trailing characters (e.g. ")" plus one more) -- confirm
    # against the actual .tuples format.
    strip_first_col_int = udf(lambda row: int(row[1:]), IntegerType())
    topdis = topdis.withColumn('C0', strip_first_col_int(topdis['C0']))
    strip_first_col_float = udf(lambda row: float(row[1:]), FloatType())
    topdis = topdis.withColumn('C1', strip_first_col_float(topdis['C1']))
    strip_last_col = udf(lambda row: float(row[:-2]), FloatType())
    topdis = topdis.withColumn('C20', strip_last_col(topdis['C20']))
    # # Load dictionary CSV
    dicFile = 'data/spark_dic.csv'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    dic = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(dicFile)
    dic = dic.select(dic['C0'].alias('id'), dic['C1'].alias('word'), dic['C2'].alias('count'))
    ldaFile = 'data/spark_lda.csv'
    csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    lda = csvLoader.options(delimiter='\t', header='false', inferschema='true').load(ldaFile)
    # NOTE(review): `lda.columns` (a list) is passed as a single argument to
    # select alongside the row-number column -- verify this projects the
    # intended columns; rowNumber() is the pre-Spark-1.6 name of row_number().
    lda = lda.select(rowNumber().alias('id'), lda.columns).join(dic, dic.id == lda.id, 'inner').cache()
    # dic = dic.select(dic['C0'].alias('id'), dic['C1'].alias('word'), dic['C2'].alias('count'))
    # # # Load clustertopics CSV
    # clutoFile = 'enron_small_clustertopics.csv'
    # csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    # cluto = csvLoader.options(delimiter=',', header='false', inferschema='true').load(clutoFile)
    # # # Load topicswords CSV
    # towoFile = 'enron_small_lda_transposed.csv'
    # csvLoader = sqlContext.read.format('com.databricks.spark.csv')
    # towo = csvLoader.options(delimiter=',', header='false', inferschema='true').load(towoFile)
    # # Merge topdis which has document id and with metadata, based on document id
    metasmall = meta.select('id', unix_timestamp(meta['date'], "yyyy-MM-dd'T'HH:mm:ssX").alias("timestamp"))
    doctopdat = topdis.join(metasmall, metasmall.id == topdis.C0, 'inner').cache()
    # Overall document date range, kept in globals for later queries.
    maxdate = doctopdat.select(max('timestamp').alias('maxtimestamp')).collect()[0]['maxtimestamp']
    mindate = doctopdat.select(min('timestamp').alias('mintimestamp')).collect()[0]['mintimestamp']
def test_bounded_mixed(self):
    """Bounded (sliding) and unbounded windows can be mixed between pandas
    UDAF columns and builtin aggregates inside a single query.
    """
    from pyspark.sql.functions import mean, max
    frame = self.data
    sliding = self.sliding_row_window
    unbounded = self.unbounded_window
    udf_mean = self.pandas_agg_mean_udf
    udf_max = self.pandas_agg_max_udf

    # NOTE(review): 'mean_unbounded_v' uses the *sliding* window on both
    # sides (the comparison stays symmetric); the name suggests the
    # unbounded window was intended -- confirm upstream.
    via_udf = (frame
               .withColumn('mean_v', udf_mean(frame['v']).over(sliding))
               .withColumn('max_v', udf_max(frame['v']).over(unbounded))
               .withColumn('mean_unbounded_v', udf_mean(frame['v']).over(sliding)))
    via_sql = (frame
               .withColumn('mean_v', mean(frame['v']).over(sliding))
               .withColumn('max_v', max(frame['v']).over(unbounded))
               .withColumn('mean_unbounded_v', mean(frame['v']).over(sliding)))

    self.assertPandasEqual(via_sql.toPandas(), via_udf.toPandas())
def _if_later(data1, data2):
    """Helper function to test if records in data1 are earlier than that in data2.

    Returns:
        bool: True or False indicating if data1 is earlier than data2.
    """
    # Latest timestamp per user in data1.
    latest_rows = (
        data1.select(DEFAULT_USER_COL, DEFAULT_TIMESTAMP_COL)
        .groupBy(DEFAULT_USER_COL)
        .agg(F.max(DEFAULT_TIMESTAMP_COL).cast('long').alias('max'))
        .collect()
    )
    latest = {row[DEFAULT_USER_COL]: row['max'] for row in latest_rows}

    # Earliest timestamp per user in data2.
    earliest_rows = (
        data2.select(DEFAULT_USER_COL, DEFAULT_TIMESTAMP_COL)
        .groupBy(DEFAULT_USER_COL)
        .agg(F.min(DEFAULT_TIMESTAMP_COL).cast('long').alias('min'))
        .collect()
    )
    earliest = {row[DEFAULT_USER_COL]: row['min'] for row in earliest_rows}

    # Every user's earliest data2 record must not precede their latest
    # data1 record (short-circuits on the first violation, as before).
    return all(earliest[user] >= latest_time for user, latest_time in latest.items())
def test_bounded_simple(self):
    """Pandas UDAFs over bounded (sliding and shrinking) windows agree with
    the builtin SQL aggregates over the same windows.
    """
    from pyspark.sql.functions import mean, max, min, count
    frame = self.data
    sliding = self.sliding_row_window
    shrinking = self.shrinking_range_window
    plus_one = self.python_plus_one

    via_udf = (frame
               .withColumn('mean_v', self.pandas_agg_mean_udf(plus_one(frame['v'])).over(sliding))
               .withColumn('count_v', self.pandas_agg_count_udf(frame['v']).over(shrinking))
               .withColumn('max_v', self.pandas_agg_max_udf(frame['v']).over(shrinking))
               .withColumn('min_v', self.pandas_agg_min_udf(frame['v']).over(sliding)))
    via_sql = (frame
               .withColumn('mean_v', mean(plus_one(frame['v'])).over(sliding))
               .withColumn('count_v', count(frame['v']).over(shrinking))
               .withColumn('max_v', max(frame['v']).over(shrinking))
               .withColumn('min_v', min(frame['v']).over(sliding)))

    self.assertPandasEqual(via_sql.toPandas(), via_udf.toPandas())
# Demo of DataFrame grouping/aggregation over a small in-memory
# machine/domain/request dataset.
sqlCtx = SQLContext(sc)
lines = sc.parallelize(["m1,d1,1", "m1,d2,2", "m2,d1,1", "m2,d2,2"])
# Split each CSV line into a Row(machine, domain, request).
record = lines.map(lambda line: line.split(",")).map(
    lambda columns: Row(machine=columns[0], domain=columns[1], request=columns[2]))
recordSchema = sqlCtx.createDataFrame(record)
# Dict-style aggregations.
recordSchema.groupBy().agg({"*": "count"}).show()
recordSchema.groupBy("machine", recordSchema["domain"]).agg(
    {"domain": "max", "request": "min"}).show()
# Function-style aggregations.
recordSchema.groupBy("machine", recordSchema.domain).agg(
    functions.count("*"),
    functions.max(recordSchema.request),
    functions.min(recordSchema["request"]),
    functions.sum(recordSchema["request"]),
    functions.avg(recordSchema["request"])).show()
# Shorthand aggregations; `request` is created as a string above, hence the
# cast to int before numeric aggregation.
recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int")).groupBy("machine").count().show()
recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").max("request").show()
recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").min("request").show()
recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").sum("request").show()
recordSchema.select(recordSchema.machine, recordSchema.request.cast(
    "int").alias("request")).groupBy("machine").avg("request").show()
from pyspark.shell import sqlContext
from pyspark.sql.functions import rand, randn
from pyspark.sql import *
from pyspark.sql.functions import mean, min, max

# Demo: random columns and summary statistics over a small range DataFrame.
df = sqlContext.range(0, 7)
df.show()
df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")).show()
df.describe("uniform", "normal").show()
# BUG FIX: `.show()` returns None, so the original
# `dfNew = df.describe(...).show()` bound None and the following select
# raised AttributeError. Keep the DataFrame and show it separately.
dfNew = df.describe("uniform", "normal")
dfNew.show()
dfNew.select([mean("uniform"), min("uniform"), max("uniform")]).show()
# Exploratory queries over the stock DataFrame `df_new`.
print('OR')
# Day with the highest High.
df_new.orderBy(df_new['High'].desc()).head(1)[0][0]
print('What about day preseted the 2nd highest peak in High?'.upper())
# Note that I need the second row, i.e., head(2)
df_new.orderBy(df_new['High'].desc()).head(2)[1][0]
print('What is the mean of the Close column?'.upper())
from pyspark.sql.functions import mean
df_new.select(format_number(mean('Close'), 2).alias('avg')).show()
print('What is the min and max values of the Volume column?'.upper())
from pyspark.sql.functions import min, max
df_new.select(
    format_number(min('Volume'), 2).alias('min_volume'),
    format_number(max('Volume'), 2).alias('max_volume')).show()
print('How many days had the Close lower than 60 dollars'.upper())
from pyspark.sql.functions import count
df_filt = df_new.filter(df_new['Close'] < 60)
df_filt.select(count(df_filt['Close'])).show()
print('Percentage of days where High>80'.upper())
# Count of High>80 days divided by total days, times 100.
df_filt = df_new.filter(df_new['High'] > 80)
df_filt = df_filt.select(count(df_filt['Date']).alias('days_80'))
df_filt2 = df_new.select(count(df_new['Date']).alias('total_days'))
percentage = 100 * df_filt.head(1)[0][0] / df_filt2.head(1)[0][0]
print('The percentage is {}'.format(percentage))
def min_and_max_year(df: DataFrame):
    """Return a single-row DataFrame holding the smallest ('min') and
    largest ('max') value of the Year column of *df*."""
    bounds = [f.min("Year").alias("min"), f.max("Year").alias("max")]
    return df.select("Year").agg(*bounds)
def tree_json(self, tree, df):
    """Convert a decision-tree debug string into a nested dict and, as a
    side effect, compute the measure-column bin boundaries (self._splits),
    the bin labels (self._label_code) and the prediction-to-range mapping
    (self._map).

    Args:
        tree: the model's debug string, one node per line.
        df: passed through to self.parse.

    Returns:
        dict: {'name': 'Root', 'children': ...} parsed tree structure.
    """
    data = []
    # Collect the leading run of non-blank lines from the debug string.
    for line in tree.splitlines():
        if line.strip():
            line = line.strip()
            data.append(line)
        else:
            break
        if not line:
            break
    res = []
    # Skip the header line; the rest describes the tree nodes.
    res.append({'name': 'Root', 'children': self.parse(data[1:], df)})
    measure_column_name = self._target_column
    self._splits = []
    # The first bucket starts at the minimum of the (non-null) measure column.
    start = self._data_frame.filter(col(measure_column_name).isNotNull()).select(FN.min(measure_column_name)).collect()[0][0]
    self._splits.append(start)
    self._label_code = {}
    label_code = 0.0
    self._coding = []
    for idx in range(len(self._predicts)):
        if idx == len(self._predicts) - 1:
            # Last bucket extends to the column maximum.
            end = self._data_frame.filter(col(measure_column_name).isNotNull()).select(FN.max(measure_column_name)).collect()[0][0]
        else:
            # Boundary halfway between adjacent predicted values
            # (old_div keeps Python-2 division semantics).
            end = old_div((self._predicts[idx]+self._predicts[idx+1]), 2)
        group_name = NarrativesUtils.round_number(start, 2) + ' to ' + NarrativesUtils.round_number(end, 2)
        self._map[self._predicts[idx]] = {'start': start, 'end': end, 'group': group_name}
        self._label_code[label_code] = group_name
        start = end
        label_code = label_code+1
        self._splits.append(start)
    return res[0]
def main():
    """Daily MRQOS summary job.

    For each mapmon partition of yesterday, join nameserver demand data
    (mapper.nsjoin) with map-monitoring stats (mrqos.mapmon_sum), aggregate
    per (nameserver, mpg) key, write the result to HDFS as CSV and register
    it as a new partition of the Hive table mrqos.mrqos_mapmon_stats.
    """
    # set up the logger
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'ra_summary.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # table nsjoin (day, uuid)
    # table mapmon (day, uuid)
    # Yesterday as YYYYMMDD; used to pick which partitions to process.
    datenow = str(datetime.date.today()-datetime.timedelta(1))
    day_idx = datenow[0:4]+datenow[5:7]+datenow[8:10]
    uuid_list = [x.split('=')[-1] for x in beeline.show_partitions('mrqos.mapmon_sum').split('\n') if day_idx in x]

    sc = SparkContext()
    hiveCtx = HiveContext(sc)

    post_partition_n = 1000

    for uuid_idx in uuid_list:
        # ns_ip, demand, asnum ns_asnum, ns_country, ns_continent, ns_lat, ns_lon, ns_mpgid, mpgload
        nsjoin_query = """ select ns_ip, demand, asnum ns_asnum, country_code ns_country, continent ns_continent, round(latitude,3) ns_lat, round(longitude,3) ns_lon, mpgid ns_mpgid, mpgload from mapper.nsjoin where day={} and mpd_uuid='{}' and longitude is not NULL and latitude is not NULL and demand > 1""".format(day_idx, uuid_idx)
        # mpgid, mrid, mpg_type, region, link, min_s, max_s, min_r, max_r, ping, local, cont_fb, mpd_dftime, ecor, continent, country, latitude, longitude, prp
        mapmon_query = """ select mpgid, mrid, mpg_type, region, link, min_s, max_s, min_r, max_r, ping, local, cont_fb, mpd_dftime, ecor, continent, country, latitude, longitude, prp from mrqos.mapmon_sum where day={} and mpd_uuid='{}' and longitude is not NULL and latitude is not NULL""".format(day_idx, uuid_idx)

        logger.info('Processing data in day=%s, uuid=%s' % (day_idx, uuid_idx))
        nsjoin = hiveCtx.sql(nsjoin_query)
        nsjoin_rows = nsjoin.repartition(post_partition_n).cache()
        data = hiveCtx.sql(mapmon_query)
        data_rows = data.repartition(post_partition_n).cache()

        # Column layout of the joined frame (positions drive the index-based
        # lambdas below -- do not reorder).
        col = ['mpgid', 'mrid', 'mpg_type', 'region', 'link', 'min_s', 'max_s',
               'min_r', 'max_r', 'ping', 'local', 'cont_fb', 'mpd_dftime',
               'ecor', 'continent', 'country', 'latitude', 'longitude', 'prp',
               'ns_ip', 'demand', 'ns_asnum',
               'ns_country', 'ns_continent', 'ns_lat', 'ns_lon', 'mpgload']
        # Output schema of the aggregation pipeline below.
        cols_appended = ['nsip', 'mrid', 'ns_demand', 'ns_asnum', 'ns_country', 'ns_continent',
                         'ns_lat', 'ns_lon', 'mpgid', 'mpg_type', 'mpg_load', 'regions',
                         'region_links', 'dftime_ratio', 'ecors', 'list_min_s', 'list_max_s',
                         'list_min_r', 'list_max_r', 'region_lats', 'region_lons',
                         'min_s', 'max_s', 'min_r', 'max_r', 'ping_ratio', 'local_ratio',
                         'cont_fb_ratio', 'in_cont_ratio', 'in_country_ratio', 'private_ratio',
                         'avg_distance', 'num_region_mapped', 'mapping_entropy', 'sum_dftime']

        df = nsjoin_rows.join(data_rows, data_rows.mpgid == nsjoin_rows.ns_mpgid, 'inner')[col].cache()
        # Normalization constant: the largest covered dftime in this partition.
        row1 = data_rows.agg(F.max(data_rows.mpd_dftime)).collect()[0]
        max_dftime = row1[0]

        # Pipeline: append weighted distance -> key by nameserver/mpg ->
        # sum the per-region vectors -> denormalize -> sort region lists by
        # coverage -> join list columns into ':'-separated strings.
        df2 = df.map(lambda x: x + Row(geodesic_distance_weighted(x.ns_lat, x.ns_lon, x.latitude, x.longitude, x.mpd_dftime)))\
            .map(lambda x: ((x[19],  # nsip
                             x[20],  # demand
                             x[21],  # ns_asnum
                             x[22],  # ns_country
                             x[23],  # ns_continent
                             round(x[24], 3),  # ns_lat & ns_lon
                             round(x[25], 3),
                             x[0],  # mpgid
                             x[1],  # mrid
                             x[2],  # mpg type
                             x[26],  # mpg load
                             ),
                            [[int(x[3])],  # region
                             [str(int(x[3])) + "_" + str(int(x[4]))],  # region_link
                             x[5]/max_dftime,  # min_s
                             x[6]/max_dftime,  # max_s
                             x[7]/max_dftime,  # min_r
                             x[8]/max_dftime,  # max_r
                             x[9]/max_dftime,  # ping ratio
                             x[10]/max_dftime,  # local ratio
                             x[11]/max_dftime,  # cont_fb ratio
                             [round(x[12]/max_dftime, 3)],  # mpd_dftime/max_dftime (time ratio)
                             [int(x[13])],  # ecor
                             x[12]/max_dftime * [0, 1][x[14] == x[23]],  # mapping in-continent ratio
                             x[12]/max_dftime * [0, 1][x[15] == x[22]],  # mapping in-country ratio
                             [round(x[16], 3)],  # lat
                             [round(x[17], 3)],  # lon
                             x[18]/max_dftime,  # prp
                             x[27]/max_dftime,  # w_distance
                             x[12],
                             [round(x[5]/x[12], 2)],  # min_s list
                             [round(x[6]/x[12], 2)],  # max_s list
                             [round(x[7]/x[12], 2)],  # min_r list
                             [round(x[8]/x[12], 2)],  # max_r list
                             ]))\
            .reduceByKey(lambda a, b: [x+y for x, y in zip(a, b)])\
            .map(lambda x: [x[0][0],  # nsip
                            x[0][8],  # mrid
                            x[0][1],  # demand
                            x[0][2],  # ns_asnum
                            x[0][3],  # ns_country
                            x[0][4],  # ns_continent
                            x[0][5],  # ns_lat
                            x[0][6],  # ns_lon
                            x[0][7],  # mpgid
                            x[0][9],  # mpg type
                            x[0][10],  # mpg load
                            x[1][0],  # list of region
                            x[1][1],  # list of region_link
                            [round(100 * float(y), 2) for y in x[1][9]],  # list of covered_record ratio
                            x[1][10],  # list of ecor
                            x[1][13],  # list of region lat
                            x[1][14],  # list of region lon
                            round(x[1][2] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1,  # min_s
                            round(x[1][3] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1,  # max_s
                            round(x[1][4] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1,  # min_r
                            round(x[1][5] * max_dftime / x[1][17], 3) if x[1][17] > 0 else -1,  # max_r
                            round(100 * x[1][6] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1,  # ping ratio
                            round(100 * x[1][7] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1,  # local ratio
                            round(100 * x[1][8] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1,  # cont_fb ratio
                            round(100 * x[1][11] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1,  # mapping in-continent ratio
                            round(100 * x[1][12] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1,  # mapping in-country ratio
                            round(100 * x[1][15] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1,  # private ratio
                            round(x[1][16] * max_dftime / x[1][17], 2) if x[1][17] > 0 else -1,  # w_distance
                            round(x[1][17], 3),  # summation of covered dftime
                            x[1][18],  # list of min_s
                            x[1][19],  # list of max_s
                            x[1][20],  # list of min_r
                            x[1][21],  # list of max_r
                            len(x[1][9]),  # number of different regions mapped
                            round(computeEntropyPMF(x[1][9]), 6),  # entropy of the region assignments
                            ])\
            .map(lambda x: x + [[i[0] for i in sorted(enumerate([float(y) for y in x[13]]), key=lambda z: z[1], reverse=True)]])\
            .map(lambda x: x[:11] + [':'.join([str(x[11][i]) for i in x[35]]),  # list of region
                                     ':'.join([str(x[12][i]) for i in x[35]]),  # list of region_link
                                     ':'.join([str(x[13][i]) for i in x[35]]),  # list of covered_record ratio
                                     ':'.join([str(x[14][i]) for i in x[35]]),  # list of ecor
                                     ':'.join([str(x[29][i]) for i in x[35]]),  # list of min_s
                                     ':'.join([str(x[30][i]) for i in x[35]]),  # list of max_s
                                     ':'.join([str(x[31][i]) for i in x[35]]),  # list of min_r
                                     ':'.join([str(x[32][i]) for i in x[35]]),  # list of max_r
                                     ':'.join([str(x[15][i]) for i in x[35]]),  # list of region lat
                                     ':'.join([str(x[16][i]) for i in x[35]]),  # list of region lon
                                     ] + x[17:28] + x[33:35] + [x[28]])\
            .toDF(cols_appended).cache()

        df_all = df2.map(lambda x: toCSVLine(x))
        logger.info('writing into HDFS')
        df_all.saveAsTextFile('/ghostcache/hadoop/data/MRQOS/mrqos_mapmon_stats/datestamp={}/uuid={}'.format(day_idx, uuid_idx))
        logger.info('updating Hive table: mrqos_mapmon_stats')
        beeline.add_partitions("mrqos.mrqos_mapmon_stats","datestamp='{}',uuid='{}'".format(day_idx, uuid_idx))
.format('com.databricks.spark.csv') \ .options(header='false') \ .load(args.file, schema=StructType(fields)) # calculate the totals summed across all dates countDF = df.groupBy('name').agg({"count": "sum"}).withColumnRenamed('sum(count)', 'total') # read from the column dates dates = sorted(df.select("date") .distinct() .map(lambda row: row[0]) .collect()) # find the counts for each date cols = [when(col("date") == m, col("percentage")).otherwise(None).alias(m) for m in dates] maxs = [max(col(m)).alias(m) for m in dates] # reformat dataframe series = (df .select(col("name"), *cols) .groupBy("name") .agg(*maxs) .na.fill(0)) compressedTimeseries = series.select("name", concat_ws(",", *dates).alias("timeseries")) # add totals to timeseries table resultDF = compressedTimeseries.join(countDF, 'name', 'inner') resultDF.write.format('com.databricks.spark.csv').save('converted.csv.files')
"user_st_et_record.ser_id", "user_st_et_record.st", "user_st_et_record.et") else: LOGGER.info('For non first runs of SPARK_PROCESSOR') #Get Hive Partitions from pyspark.sql.functions import max from pyspark.sql.functions import first spark.sql("MSCK REPAIR TABLE " + settings.hive_db + "." + settings.hive_table) hive_partitions = spark.sql("SHOW PARTITIONS " + settings.hive_db + "." + settings.hive_table) latest_partition = hive_partitions.select( regexp_extract(max(hive_partitions.result), '(et_hr)=(\d+)', 2).alias('part_hr')) LOGGER.info('Hive Partitions:- {}'.format( hive_partitions.collect())) LOGGER.info('Latest Hive Partition:- {}'.format( latest_partition.collect())) import pandas as pd part_df = latest_partition.toPandas() old_users = spark.read.csv(settings.hive_dir + str(part_df.iloc[0]['part_hr'])) old_users_hive = old_users.select( (old_users._c0).alias("user_id"), (old_users._c1).alias("st")).distinct() join_users = all_users.join(old_users_hive, "user_id",
def chisquare_trend(self, column_name, base_dir):
    """Build month-over-month trend narratives for *column_name* against the
    result column.

    Numeric chi-square columns are first bucketed into five equal-width bins.
    For each of the two most frequent result-column levels a merged
    time-series (overall counts plus per-level crosstab counts) is built,
    growth statistics are computed, and a narrative card is appended to
    ``output``.
    """
    if self._date_columns is not None:  # was `!= None`; identity test is the correct idiom
        if self._dateFormatDetected:
            output = []
            date_column = self._date_column_suggested
            chisquare_column = column_name
            result_column = self._result_column
            if chisquare_column in self._dataframe_helper.get_numeric_columns():
                # Bucket the numeric column into 5 equal-width bins.
                min_max = self._data_frame.select([FN.min(chisquare_column), FN.max(chisquare_column)]).collect()
                maxval = min_max[0][1]
                minval = min_max[0][0]
                step = (maxval - minval) / 5.0
                splits = [math.floor(minval), minval + step, minval + (step * 2),
                          minval + (step * 3), minval + (step * 4), math.ceil(maxval)]
                bucketizer = Bucketizer(splits=splits, inputCol=chisquare_column, outputCol="BINNED_COL")
                self._data_frame = self._data_frame.withColumn(chisquare_column, self._data_frame[chisquare_column].cast(DoubleType()))
                bucketedData = bucketizer.transform(self._data_frame)
                df = bucketedData.select([col for col in bucketedData.columns if col != chisquare_column])
                df = df.withColumnRenamed("BINNED_COL", chisquare_column)
                # Human-readable "<lo> to <hi>" label for each bin index.
                ranges = []
                for idx in range(len(splits)-1):
                    text = str(splits[idx])+" to "+str(splits[idx+1])
                    ranges.append(text)
                bin_dict = dict(list(zip(list(range(len(ranges))), ranges)))
            else:
                df = self._data_frame
            df = df.select([date_column, chisquare_column, result_column]).toPandas()
            df["suggestedDate"] = df[date_column].apply(lambda x: datetime.strptime(x, self._existingDateFormat))
            df["year_month"] = df["suggestedDate"].apply(lambda x: x.strftime("%b-%y"))
            # Narratives are produced for the two most frequent result levels.
            result_column_count = df[result_column].value_counts()
            top2levels = result_column_count[:2].index
            for level in top2levels:
                filtered_df = df.loc[df[result_column] == level]
                # Overall count per date for this level.
                grouped_result = pd.DataFrame(filtered_df[date_column].value_counts()).reset_index()
                grouped_result.columns = [date_column, "value"]
                grouped_result["year_month"] = grouped_result[date_column].apply(lambda x: datetime.strptime(x, self._existingDateFormat).strftime("%b-%y"))
                # Per-chisquare-level count per date.
                crosstab_df = pd.DataFrame(pd.crosstab(filtered_df["suggestedDate"], filtered_df[chisquare_column])).reset_index()
                if chisquare_column in self._dataframe_helper.get_numeric_columns():
                    crosstab_columns = crosstab_df.columns
                    chisquare_levels = crosstab_columns[1:]
                    chisquare_levels = [bin_dict[x] for x in chisquare_levels]
                    crosstab_df.columns = [crosstab_columns[0]]+chisquare_levels
                else:
                    chisquare_levels = crosstab_df.columns[1:]
                crosstab_df["year_month"] = crosstab_df["suggestedDate"].apply(lambda x: x.strftime("%b-%y"))
                final_df = pd.merge(grouped_result, crosstab_df, how='outer', on=['year_month'])
                final_df.sort_values(by="suggestedDate", ascending=True, inplace=True)
                final_df.reset_index(drop=True, inplace=True)
                # Month-over-month percent change of the overall counts.
                final_df["overallPerChange"] = [0]+[round((x-y)*100/float(y), self._num_significant_digits) for x, y in zip(final_df["value"].iloc[1:], final_df["value"])]
                growth_dict = {}
                for val in chisquare_levels:
                    growth_dict[val] = {}
                    # First-to-last growth in percent for this level.
                    growth_dict[val]["growth"] = round(((final_df[val].iloc[-1]-final_df[val].iloc[0])*100/float(final_df[val].iloc[0])), self._num_significant_digits)
                    if growth_dict[val]["growth"] > 3 or final_df[val].iloc[0] == 0:
                        growth_dict[val]["growthType"] = "positive"
                        print(growth_dict[val]["growth"])
                    elif growth_dict[val]["growth"] < -3:
                        growth_dict[val]["growthType"] = "negative"
                    else:
                        growth_dict[val]["growthType"] = "stable"
                    growth_dict[val]["total"] = sum(final_df[val])
                growth_dict["overall"] = {}
                # BUG FIX: the original expression was
                # round((last - first/float(first)), ...) -- the missing
                # parentheses (and missing *100) made it last - 1 instead of
                # the percent growth used for every individual level above.
                growth_dict["overall"]["growth"] = round(((final_df["value"].iloc[-1]-final_df["value"].iloc[0])*100/float(final_df["value"].iloc[0])), self._num_significant_digits)
                data_dict = {}
                # Pick the chisquare level with the highest total as the
                # "top dimension" of the narrative.
                total_tuple = []
                for k, v in list(growth_dict.items()):
                    if k != "overall":
                        total_tuple.append((k, v["total"]))
                sorted_total_tuple = sorted(total_tuple, key=lambda x: x[1], reverse=True)
                top_dimension = sorted_total_tuple[0][0]
                final_df["topDimensionPerChange"] = [0]+[round((x-y)*100/float(y), self._num_significant_digits) for x, y in zip(final_df[top_dimension].iloc[1:], final_df[top_dimension])]
                data_dict["dimension"] = chisquare_column
                data_dict["correlation"] = final_df["value"].corr(final_df[top_dimension])
                data_dict["subset_increase_percent"] = growth_dict[top_dimension]["growth"]
                data_dict["overall_increase_percent"] = growth_dict["overall"]["growth"]
                data_dict["target"] = level
                data_dict["top_dimension"] = top_dimension
                # Peak/trough months for overall and top-dimension series.
                overall_peak_index = np.argmax(final_df["value"])
                overall_low_index = np.argmin(final_df["value"])
                top_dimension_peak_index = np.argmax(final_df[top_dimension])
                top_dimension_low_index = np.argmin(final_df[top_dimension])
                data_dict["overallPeakValue"] = final_df["value"][overall_peak_index]
                data_dict["overallLowestValue"] = final_df["value"][overall_low_index]
                data_dict["overallPeakTime"] = final_df["year_month"][overall_peak_index]
                data_dict["overallLowestTime"] = final_df["year_month"][overall_low_index]
                data_dict["overallPeakIncrease"] = final_df["overallPerChange"][overall_peak_index]
                data_dict["topDimensionPeakValue"] = final_df[top_dimension][top_dimension_peak_index]
                data_dict["topDimensionLowestValue"] = final_df[top_dimension][top_dimension_low_index]
                data_dict["topDimensionPeakTime"] = final_df["year_month"][top_dimension_peak_index]
                data_dict["topDimensionLowestTime"] = final_df["year_month"][top_dimension_low_index]
                data_dict["topDimensionPeakIncrease"] = final_df["topDimensionPerChange"][top_dimension_peak_index]
                data_dict["overall_streak"] = NarrativesUtils.streak_data(final_df, overall_peak_index, overall_low_index,
                                                                          "overallPerChange", "value")
                data_dict["top_dimension_streak"] = NarrativesUtils.streak_data(final_df, top_dimension_peak_index, top_dimension_low_index,
                                                                                "topDimensionPerChange", top_dimension)
                # Tally positive/negative/stable growth levels for the template.
                data_dict["num_positive_growth_dimensions"] = 0
                data_dict["positive_growth_dimensions"] = []
                data_dict["positive_growth_values"] = []
                data_dict["num_negative_growth_dimensions"] = 0
                data_dict["negative_growth_dimensions"] = []
                data_dict["negative_growth_values"] = []
                data_dict["num_stable_growth_dimensions"] = 0
                data_dict["stable_growth_dimensions"] = []
                data_dict["stable_growth_values"] = []
                data_dict["overall_growth_rate"] = growth_dict["overall"]["growth"]
                data_dict["total_levels"] = len(chisquare_levels)
                for val in chisquare_levels:
                    if growth_dict[val]["growthType"] == "positive":
                        data_dict["num_positive_growth_dimensions"] += 1
                        data_dict["positive_growth_dimensions"].append(val)
                        data_dict["positive_growth_values"].append(growth_dict[val]["growth"])
                    elif growth_dict[val]["growthType"] == "negative":
                        data_dict["num_negative_growth_dimensions"] += 1
                        data_dict["negative_growth_dimensions"].append(val)
                        data_dict["negative_growth_values"].append(growth_dict[val]["growth"])
                    else:
                        data_dict["num_stable_growth_dimensions"] += 1
                        data_dict["stable_growth_dimensions"].append(val)
                        data_dict["stable_growth_values"].append(growth_dict[val]["growth"])
                summary1 = NarrativesUtils.get_template_output(self._base_dir,
                                                               'chisquare_trend.html', data_dict)
                # Chart payload: time axis, overall series, top-dimension series.
                chart_data = {"data": [], "header": []}
                chart_data["header"] = ["time", result_column, top_dimension]
                chart_data["data"] = [["time"], [result_column], [top_dimension]]
                for idx in range(final_df.shape[0]):
                    chart_data["data"][0].append(final_df["year_month"].iloc[idx])
                    chart_data["data"][1].append(final_df["value"].iloc[idx])
                    chart_data["data"][2].append(final_df[top_dimension].iloc[idx])
                paragraphs = NarrativesUtils.paragraph_splitter(summary1)
                card_data = {"paragraphs": paragraphs, "chart": chart_data}
                output.append([card_data])
            print(json.dumps(output, indent=2))
#Groupby and list form , collect_list df.groupBy("username").agg(F.collect_list("friend").alias("friends_grouped")).show(10) def top_ss(ss_list): tsk = str(Counter(ss_list).most_common(50)) return tsk from pyspark.sql.functions import collect_list udf_top = udf(top_ss, StringType()) final_data = useful_data.groupBy("single_col_l").agg(udf_top(collect_list(col('single_col_2'))).alias('ss_frequencies')) #Select max or maximum from a column train.select(max("datetime")).show(truncate=False) #Get Item : Extract item from a specific postion of a column consisting of lists #Previously id was [ab,fg,fe] out of which new_id [ab] is to be selected ans=df_tmp.withColumn('new_id',split(df_tmp.id,',').getItem(0)) #Add row number column to a dataframe: Useful as pyspark dataframes cannot be accessed by index, no command like tail and join reshuffles them df.withColumn("id", monotonically_increasing_id()).show() #Relacing null values, missing values train_test=train_test.na.fill({'siteid':3696590,'browserid_merged':2, 'devid_encode':1}) #siteid, browserid_merged are column names #Sellect not null values of a column df1.filter(df1.ColumnName_to_check.isNotNull()).show()
# Incidents whose response was delayed more than five minutes
new_fire_df.select("ResponseDelayedinMins")\
    .where(F.col("ResponseDelayedinMins") > 5)\
    .show(5, False)

# Date and time columns: parse the string date columns into proper timestamps.
# NOTE(review): the final .drop("WatchDate") is a no-op ("WatchDate" was already
# dropped above); it may have been meant as .drop("AvailableDtTm"), although
# withColumn already replaces that column in place -- confirm intent.
fire_ts_df = new_fire_df\
    .withColumn("IncidentDate", to_timestamp(F.col("CallDate"), "MM/dd/yyyy"))\
    .drop("CallDate")\
    .withColumn("OnWatchDate", to_timestamp(F.col("WatchDate"), "MM/dd/yyyy"))\
    .drop("WatchDate") \
    .withColumn("AvailableDtTm", to_timestamp(F.col("AvailableDtTm"), "MM/dd/yyyy hh:mm:ss a")) \
    .drop("WatchDate")

fire_ts_df.select("IncidentDate", "OnWatchDate", "AvailableDtTm")\
    .show(5, False)

# Distinct years present in the data, ascending
fire_ts_df.select(year("IncidentDate"))\
    .distinct()\
    .orderBy(year("IncidentDate"))\
    .show()

# AGGREGATIONS
# Ten most frequent non-null call types
fire_ts_df.select("CallType")\
    .where(F.col("CallType").isNotNull())\
    .groupBy("CallType")\
    .count()\
    .orderBy("count", ascending=False)\
    .show(n=10, truncate=False)

# Summary statistics: total alarms and response-delay spread
fire_ts_df.select(F.sum("NumAlarms"), F.avg("ResponseDelayedinMins"),
                  F.min("ResponseDelayedinMins"), F.max("ResponseDelayedinMins"))\
    .show()
def execute(spark, logger, s3_bucket, run_id, aoi_name, complete_catalog, probability_images, seed, config_filename):
    """The primary script: train, validate, and apply a land-cover classifier.

    Args:
        spark (``pyspark.sql.SparkSession``)
        logger (``py4j.JavaObject``)
        s3_bucket (str): Name of the S3 bucket to search for configuration objects
            and save results to
        run_id (str): The identifier of the current run
        aoi_name (str): The identifier for the current area of interest
        complete_catalog (bool): If true, write probability images for the full
            catalog instead of a sample
        probability_images (int): The number of tiles to save the generated
            probability images for
        seed (int): A random seed used to sample the probability images, for
            reproducability
        config_filename (str): Name of the YAML configuration object in s3_bucket

    Required external inputs:
        <s3_bucket>/cvmapper_config.yaml
            under ``learner`` key:
                prefix: The S3 prefix under which CSVs can be read and written
                pool: Name of CSV file under s3_bucket/prefix giving the
                    comprehensive list of active grid cells
                incoming_names: Name of CSV file under s3_bucket/prefix giving
                    list of cells used for training/validation
                image_catalog: Name of CSV file under s3_bucket giving catalog
                    of imagery
                image_output_pattern: URI pattern used for output of probability
                    images.  Must contain two '{}' tokens to be replaced by the
                    column and row for the relevant cell
                outgoing: S3 URI to save the CSV of worst-performing cells to

        location pool: A CSV of ``name``, ``col``, ``row`` for each grid cell
            under consideration.  Identified by ``pool`` parameter above.
        incoming names: CSV containing (at least) ``name``, ``iteration``, and
            ``usage`` columns.  Every name in this file must also be contained
            in the image pool.  Location of this file given in YAML file.
        image catalog: A CSV minimally containing ``col``, ``row``, ``season``,
            and ``uri`` columns.  Season is either 'GS' or 'OS'.  Every grid
            cell in the location pool must be contained here, and must have an
            entry for both seasons.  URI points to TIFF that completely covers
            listed cell with valid image data (no NODATA values).

    Note:
        Grid cells are defined according to the master_layout object, which
        specifies a rectangular extent in long/lat coords.  This extent is
        subdivided into cells, and each cell is given a pixel resolution (here
        200x200, which must match the resolution of the label images provided
        in the ``s3://<s3_bucket>/<prefix>/<name>_<col>_<row>.tif`` files
        identified by the incoming names CSV).  When we refer to tiles, we mean
        image chips of the stated resolution, indexed by ``gps.SpatialKey``
        objects.  The key is a col/row pair where row=0, col=0 corresponds to
        the chip in the upper left corner of the bounding extent.
        (NOTE(review): the prose elsewhere mentions 13792 columns x 14477 rows
        but master_layout below uses 13800 x 14600 -- confirm which is current.)

    Note:
        Grid cell names for the output probability images
        (`image_output_pattern`) are relative to a different, coarser layout.
        These grid cell ids need not be clearly defined, since the output of
        this process is simply a bucket of COGs for display using another tool.
        However, see the `coarse_layout` definition below for specific details
        of the layout.
    """
    params = parse_yaml_from_s3(s3_bucket, config_filename)['learner']
    # consensus_directory is stored with surrounding slashes; strip them
    label_path = parse_yaml_from_s3(
        s3_bucket, config_filename)['labeller']['consensus_directory'][1:-1]
    s3_prefix = params['prefix']
    s3_prefix = s3_prefix[0:-1] if s3_prefix.endswith('/') else s3_prefix
    catalog_prefix = params['image_catalog']
    catalog_prefix_fix = params['image_catalog_fix']
    # Feature columns: raw/avg/std for bands 1-4 in both seasons (GS/OS)
    feature_names = functools.reduce(lambda a, b: a + b, [[
        "{}_raw_{}".format(season, n), "{}_avg_{}".format(season, n),
        "{}_std_{}".format(season, n)
    ] for season in ["GS", "OS"] for n in range(1, 5)])

    master_layout = gps.LayoutDefinition(
        gps.Extent(-17.541, -35.46, 51.459, 37.54),
        gps.TileLayout(13800, 14600, 200, 200))
    master_metadata = gps.Metadata(
        gps.Bounds(gps.SpatialKey(0, 0), gps.SpatialKey(13800, 14600)),
        "+proj=longlat +datum=WGS84 +no_defs ", gps.CellType.INT8,
        master_layout.extent, master_layout)

    ####################################
    logger.warn("Reading source tables")
    checkpoint = time.time()

    # NOTE(review): 'inferScheme' looks like a typo for 'inferSchema'; as
    # written the option is unrecognized and silently ignored, so all columns
    # are read as strings -- confirm before changing, downstream comparisons
    # may rely on string semantics.
    f_pool = spark\
        .read\
        .option('inferScheme', True)\
        .option('header', True)\
        .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['pool']))\
        .repartition('col', 'row')

    qs_in = spark \
        .read \
        .option('inferScheme', True) \
        .option('header', True) \
        .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['qs'])) \
        .repartition('col', 'row')

    incoming = spark.read\
        .option('header', True)\
        .schema(StructType([
            StructField('name', StringType()),
            StructField('run', IntegerType()),
            StructField('iteration', IntegerType()),
            StructField('processed', BooleanType()),
            StructField('usage', StringType()),
            StructField('label', StringType())
        ]))\
        .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['incoming_names']))

    # merge incoming_names and incoming_names_static
    incoming = incoming.union(spark.read \
        .option('header', True) \
        .schema(StructType([
            StructField('name', StringType()),
            StructField('run', IntegerType()),
            StructField('iteration', IntegerType()),
            StructField('processed', BooleanType()),
            StructField('usage', StringType()),
            StructField('label', StringType())
        ])) \
        .csv('s3n://{}/{}/{}'.format(s3_bucket, s3_prefix, params['incoming_names_static'])))

    # NOTE(review): 'label' is declared StringType above, so the == True
    # comparison relies on implicit casting of boolean-like strings -- confirm.
    incoming = incoming.filter(incoming['run'] == params['runid']).filter(
        incoming['label'] == True)

    # cells never used for training/validation form the test set
    test_names = f_pool.join(incoming.select('name'), 'name',
                             'left_anti').withColumn("usage", lit("test"))
    all_names = f_pool.join(incoming.select('name', 'usage'), f_pool.name == incoming.name, how='left')\
        .select(f_pool.name.alias('name'), 'col', 'row', 'usage')
    num_test_images = test_names.count()

    image_catalog = spark.read\
        .option('inferScheme', True)\
        .option('header', True)\
        .csv('s3n://{}/{}'.format(s3_bucket, catalog_prefix))\
        .repartition('col', 'row')
    # pair the growing-season and off-season image URI for every cell
    all_image_uris = image_catalog\
        .filter(image_catalog['season'] == 'GS')\
        .alias('gs')\
        .join(image_catalog.filter(image_catalog['season'] == 'OS').alias('os'),
              (col('gs.col') == col('os.col')) & (col('gs.row') == col('os.row')))\
        .select(col('gs.col'), col('gs.row'), col('gs.uri').alias('GS'), col('os.uri').alias('OS'))

    logger.warn(
        "Elapsed time for reading source tables: {}s".format(time.time() -
                                                             checkpoint))

    ####################################
    logger.warn("Reading training labels & building training features")
    checkpoint = time.time()

    training_data = gather_data(all_image_uris,
                                all_names.filter(all_names.usage == 'train'),
                                master_metadata,
                                feature_names,
                                s3_bucket,
                                label_path,
                                include_masks=True)
    training_data.show()
    logger.warn(
        "Elapsed time for reading training labels and feature building: {}s".
        format(time.time() - checkpoint))

    ####################################
    logger.warn("Balancing data")
    checkpoint = time.time()
    balanced_data = balance_samples(spark, training_data, 'mask')
    balanced_data.show()
    logger.warn("Elapsed time for balancing data: {}s".format(time.time() -
                                                              checkpoint))

    ####################################
    logger.warn("Training model")
    checkpoint = time.time()
    pipeline = ml_pipeline(feature_names, 'mask')
    model = pipeline.fit(balanced_data)
    print(model)
    logger.warn("Elapsed time for training the model: {}s".format(time.time() -
                                                                  checkpoint))

    ####################################
    logger.warn("Validating model results")
    checkpoint = time.time()
    validation_data = gather_data(
        all_image_uris,
        all_names.filter(all_names.usage == 'validate'),
        master_metadata,
        feature_names,
        s3_bucket,
        label_path,
        include_masks=True)
    valid_fit = model.transform(validation_data).select(
        'prediction', 'probability', 'mask')
    metrics = MulticlassMetrics(
        valid_fit.rdd.map(lambda r: (r.prediction, r.mask)))
    confusion_matrix = metrics.confusionMatrix().toArray().flatten().tolist(
    )  #left to right, top to bottom
    # True Skill Statistic = sensitivity + specificity - 1, computed from the
    # flattened 2x2 confusion matrix [TN, FP, FN, TP]
    tss = 1.0 * confusion_matrix[3] / (confusion_matrix[3] + confusion_matrix[2]) + \
          1.0 * confusion_matrix[0] / (confusion_matrix[0] + confusion_matrix[1]) - 1
    binmetrics = BinaryClassificationMetrics(
        valid_fit.rdd.map(lambda r: (float(r['probability'][1]), r['mask'])))

    last_iteration = incoming.agg(F.max('iteration')).collect()[0][0]
    report = pd.DataFrame({
        'run': [run_id],
        'iteration': [last_iteration + 1],
        'tss': [tss],
        'accuracy': [metrics.accuracy],
        'precision': [metrics.precision(1.0)],
        'recall': [metrics.recall(1.0)],
        'fpr': [metrics.falsePositiveRate(1.0)],
        'tpr': [metrics.truePositiveRate(1.0)],
        'AUC': [binmetrics.areaUnderROC],
        'aoi': [aoi_name],
        'iteration_time': [datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S.%f')]
    })
    # TODO: allow target location to be derived from params (local or s3)
    # added because of an error where incoming_metrics.csv contained a
    # different iteration number (10) than expected by DB (4). Ryan's guess is
    # that this is due to multiple test clusters overwriting the csv.
    pd_df_to_s3_csv(report, s3_bucket, os.path.join(s3_prefix,
                                                    params['metrics']))
    logger.warn(
        "Elapsed time for validating and saving metrics to s3: {}s".format(
            time.time() - checkpoint))

    ####################################
    logger.warn("Classifying test data")
    checkpoint = time.time()
    filtered_names = test_names.filter(test_names.usage == "test")
    test_features = gather_data(all_image_uris, filtered_names,
                                master_metadata, feature_names, s3_bucket)
    # sample with replacement, 10% fraction, to bound classification cost
    test_features_sample = test_features.sample(True, 0.1)
    fitted = model.transform(test_features_sample).select(
        'spatial_key', 'column_index', 'row_index', 'probability',
        'prediction')
    grouped = fitted.groupBy('spatial_key')
    # don't want to use following UDF, but indication is that there is a bug in
    # pyspark preventing vector accesses:
    # https://stackoverflow.com/questions/44425159/access-element-of-a-vector-in-a-spark-dataframe-logistic-regression-probability
    # (This did not work without the UDF!)
    firstelement = F.udf(lambda v: float(v[0]), FloatType())
    # added this UDF to select the probability of field rather than no field to
    # write to probability images
    secondelement = F.udf(lambda v: float(v[1]), FloatType())
    logger.warn(
        "Elapsed time for classifying test grids: {}s".format(time.time() -
                                                              checkpoint))

    ####################################
    if probability_images > 0 or complete_catalog:
        logger.warn("Write catalog of {} probability images".format(
            probability_images))
        checkpoint = time.time()

        if complete_catalog:
            # new catalog
            image_catalog_fix = spark.read \
                .option('inferScheme', True) \
                .option('header', True) \
                .csv('s3n://{}/{}'.format(s3_bucket, catalog_prefix_fix)) \
                .repartition('col', 'row')
            all_image_uris_fix = image_catalog_fix \
                .filter(image_catalog_fix['season'] == 'GS') \
                .alias('gs') \
                .join(image_catalog_fix.filter(image_catalog_fix['season'] == 'OS').alias('os'),
                      (col('gs.col') == col('os.col')) & (col('gs.row') == col('os.row'))) \
                .select(col('gs.col'), col('gs.row'), col('gs.uri').alias('GS'), col('os.uri').alias('OS'))

            #recollect all pixels for all testing images
            compreh_names = f_pool.join(qs_in,
                                        ['name', 'col', 'row', 'name_col_row'],
                                        'outer')
            features_compreh = gather_data(all_image_uris_fix, compreh_names,
                                           master_metadata, feature_names,
                                           s3_bucket)
            fitted_compreh = model.transform(features_compreh)\
                .select('spatial_key', 'column_index', 'row_index', 'probability', 'prediction')
            grouped_compreh = fitted_compreh.groupBy('spatial_key')
            # added to test sampling
            assembled = grouped_compreh.agg(
                assembleTile('column_index', 'row_index',
                             secondelement('probability'),
                             master_layout.tileLayout.tileCols,
                             master_layout.tileLayout.tileRows,
                             'float32').alias('probability'))
            layer = gps.TiledRasterLayer.from_rasterframe(assembled.asRF())
        else:
            ####################################
            logger.warn("Identify worst performing cells")
            checkpoint = time.time()
            # certainty per cell: mean squared distance of p(no-field) from 0.5
            certainty = grouped \
                .agg(F.avg(F.pow(firstelement(fitted.probability) - lit(0.5), 2.0)).alias('certainty')).cache()
            certainty.show()
            # worst 5% of cells, sampled down to number_outgoing_names
            worst_keys_rdd = certainty \
                .sort('certainty') \
                .select('spatial_key') \
                .limit(round(certainty.count() * 0.05)) \
                .rdd.takeSample(False, (params['number_outgoing_names']))
            worst_keys = spark.createDataFrame(worst_keys_rdd)
            outgoing_names = worst_keys \
                .join(f_pool, (col('spatial_key.col') == col('col')) & (col('spatial_key.row') == col('row'))) \
                .select('name') \
                .withColumn('run', lit(run_id)) \
                .withColumn('iteration', lit(last_iteration + 1)) \
                .withColumn('processed', lit(False)) \
                .withColumn('usage', lit('train')) \
                .toPandas()
            uri = urlparse.urlparse(params['outgoing'])
            pd_df_to_s3_csv(outgoing_names, uri.netloc, uri.path[1:])
            logger.warn(
                "Elapsed time for sorting certainty, converting to Pandas Dataframe, and saving to s3: {}s"
                .format(time.time() - checkpoint))

            ###########################################
            checkpoint = time.time()
            # sampling testing images (num = probability_images): pick scenes,
            # then recollect every cell belonging to the sampled scenes
            filtered_names_sample = filtered_names\
                .sample(False, min(1.0, float(probability_images) / float(num_test_images)), seed=seed)\
                .join(image_catalog.filter(image_catalog['season'] == 'GS'), ['col', 'row'])\
                .select('scene_id')\
                .dropDuplicates()\
                .join(image_catalog.filter(image_catalog['season'] == 'GS'), 'scene_id')\
                .join(f_pool.join(qs_in, ['name', 'col', 'row', 'name_col_row'], 'outer'), ['col', 'row'])\
                .select('name', 'col', 'row', 'name_col_row')

            #re-collect all pixels within sampled images
            features_images = gather_data(all_image_uris,
                                          filtered_names_sample,
                                          master_metadata, feature_names,
                                          s3_bucket)
            #reclassify sampled testing images
            fitted_images = model.transform(features_images)\
                .select('spatial_key', 'column_index', 'row_index', 'probability', 'prediction')
            grouped_sample = fitted_images.join(
                filtered_names_sample,
                (col('spatial_key.col') == col('col')) &
                (col('spatial_key.row') == col('row'))).groupby('spatial_key')
            assembled = grouped_sample.agg(
                assembleTile('column_index', 'row_index',
                             secondelement('probability'),
                             master_layout.tileLayout.tileCols,
                             master_layout.tileLayout.tileRows,
                             'float32').alias('probability'))
            layer = gps.TiledRasterLayer.from_rasterframe(assembled.asRF())

        coarse_layout = gps.LayoutDefinition(
            gps.Extent(-17.541, -35.46, 51.459, 37.54),
            gps.TileLayout(1380, 1460, 2000, 2000))
        # we multiply by 100 to select digits that will be kept after
        # converting from float to int. range of int8 is to 128, so we can only
        # preserve 2 sig figs
        output_tiles = (layer*100).convert_data_type(gps.CellType.INT8)\
            .tile_to_layout(coarse_layout)\
            .to_geotiff_rdd(storage_method=gps.StorageMethod.TILED)
        # NOTE(review): the fallback pattern '/tmp/image_{}_{}.tif' has only two
        # '{}' slots but format() below is given five arguments (extras are
        # ignored by str.format) -- confirm image_output_pattern's arity.
        cog_location = '/tmp/image_{}_{}.tif' if 'image_output_pattern' not in params else params[
            'image_output_pattern']
        output_tiles.foreach(lambda pair: write_bytes_to_s3(
            cog_location.format(pair[0].col, pair[0].row, aoi_name, run_id,
                                str(last_iteration + 1)), pair[1]))
        logger.warn(
            "Elapsed time for writing catalog of probability images: {}s".
            format(time.time() - checkpoint))
# Samuel Tribe - 201318996 - [email protected]
# Reads covid19.csv into a DataFrame and reports per-country death/case extremes.
from pyspark import SparkContext, SparkConf
from pyspark.sql import functions as F
from pyspark.sql.session import SparkSession
from pyspark.sql.types import DateType

conf = SparkConf().setAppName("covid19").setMaster("local")
spark = SparkSession(SparkContext(conf=conf))

# BUGFIX: use a raw string for the Windows path. In a plain string literal the
# backslashes form invalid escape sequences ("\s", "\C", "\d"), which Python
# flags as a SyntaxWarning/DeprecationWarning and which would silently corrupt
# the path if any sequence (e.g. "\t", "\n") were ever a recognized escape.
csvPath = r"C:\spark\COMP336-Coursework-1\data\covid19.csv"
covidDF = spark.read.csv(csvPath, header=True, inferSchema=True)
# normalize the date column to a proper DateType
covidDF = covidDF.withColumn("date", F.col("date").cast(DateType()))
print("covid19.csv read as Dataframe with header=True")
covidDF.show()

print("Schema for dataframe")
covidDF.printSchema()

# Drop rows missing any of the columns used below
print("Filtering out NULL values from dataframe")
covidDF = covidDF.filter(covidDF.continent.isNotNull() &
                         covidDF.location.isNotNull() &
                         covidDF.date.isNotNull() &
                         covidDF.total_cases.isNotNull() &
                         covidDF.new_cases.isNotNull() &
                         covidDF.total_deaths.isNotNull() &
                         covidDF.new_deaths.isNotNull())
covidDF.show()

print("Highest deaths per country")
covidDF.groupBy(['location']).agg(F.max(covidDF.total_deaths)).show()

print("max and min function results on total_cases")
covidDF.groupBy(['location']).agg(F.max(covidDF.total_cases).alias('total_cases_max'),
                                  F.min(covidDF.total_cases).alias('total_cases_min')).show()
# count by different code type logs_df.groupBy("code").count().show() # rank by counts from pyspark.sql.functions import asc, desc logs_df.groupBy('code').count().orderBy(desc('count')).show() # calculate average size of different code logs_df.groupBy("code").avg("bytes").show() # more calculation by code - average, min, max import pyspark.sql.functions as F logs_df.groupBy("code").agg( logs_df.code, F.avg(logs_df.bytes), F.min(logs_df.bytes), F.max(logs_df.bytes) ).show() # homework # 1 yelp_df.select("cool").agg({"cool" : "mean"}).collect() # 2 import pyspark.sql.functions as F yelp_df.filter('review_count >= 10').groupBy("stars").agg(yelp_df.stars, F.avg(yelp_df.cool)).show() # 3 yelp_df.filter((yelp_df.review_count >= 10) & (yelp_df.open == 'True')).groupBy("stars").agg(yelp_df.stars, F.avg(yelp_df.cool)).show() # 4 from pyspark.sql.functions import asc, desc yelp_df.filter((yelp_df.review_count >= 10) & (yelp_df.open == 'True')).groupBy('state').count().orderBy(desc('count')).show() # 5
def calculate_and_plot_ar(jnd_path, sdk_path, dest):
    """Compute answer-rate statistics from the joined (jnd) conversation data
    and the SDK click/add-to-cart events, writing CSVs and PNG plots to *dest*.

    Args:
        jnd_path (str): Parquet path of the enriched joined conversation data.
        sdk_path (str): CSV path of the raw SDK event dump.
        dest (str): Directory to write count_codes.csv and the PNG plots into.

    Relies on module-level helpers/objects: sqlContext, binary_ci,
    binary_ci_pd, np, pd, plt, F, Window, IntegerType, StringType.
    """
    jnd = sqlContext.read.parquet(
        jnd_path)  #'/Users/amirdavidoff/Desktop/data/enriched_data/jnd'

    # per-conversation ordering window, used by the lag features below
    window = Window.partitionBy("jnd_sender_id").orderBy(["jnd_ts"])
    ''' plot response code by answer '''
    rates = jnd.select(['nbr_response_code', 'is_answered',
                        'jnd_retailer']).toPandas()

    # counts per (response code, retailer), persisted for reference
    count_codes = rates.groupby(['nbr_response_code', 'jnd_retailer'],
                                as_index=False).agg(["count"]).reset_index()
    count_codes.columns = ['code', 'retailer', 'count']
    count_codes.to_csv(dest + '/count_codes.csv')

    # answer rate (with binomial CI) per (response code, retailer)
    rates = rates.groupby(['nbr_response_code', 'jnd_retailer'],
                          as_index=False).agg(
                              {"is_answered": ["count", "mean", binary_ci]})
    rates.columns = [
        'codes', 'retailer', 'is_answered_count', 'is_answered_mean',
        'is_answered_ci'
    ]
    rates = rates.sort_values('is_answered_mean',
                              ascending=True).reset_index(drop=True)
    # keep only codes with enough volume for a meaningful rate
    rates = rates[rates["is_answered_count"] >= 50]
    rates["codes"] = np.where(rates["codes"] == "", "empty", rates["codes"])

    fig = plt.figure(figsize=(10, 10))
    for retailer in rates["retailer"].unique().tolist():
        temp = rates[rates["retailer"] == retailer]
        plt.errorbar(temp["is_answered_mean"],
                     temp["codes"],
                     xerr=temp['is_answered_ci'].values,
                     fmt='o',
                     label=retailer,
                     alpha=0.7)
    plt.xlabel('answer rate')
    plt.ylabel('nbr response code')
    plt.title('nbr codes answer rate')
    plt.legend()
    plt.savefig(
        '{}/nbr_code_ctr2.png'.format(dest)
    )  # /Users/amirdavidoff/mmuze-research/Improve_conversation_quality/tasks/task5
    plt.show(block=False)
    plt.close(fig)

    ''' sdk '''
    #sdk_path = '/Users/amirdavidoff/Desktop/data/yaron_sql_dumps/sdk_reports.csv'
    spark_sdk = sqlContext.read.options(header=True).csv(sdk_path)  #,sep='\t'

    # epoch millis -> date / datetime columns
    spark_sdk = spark_sdk.withColumn(
        'date', F.to_date(F.from_unixtime(F.col('timestamp') / F.lit(1000.0))))
    spark_sdk = spark_sdk.withColumn(
        'date_time', F.from_unixtime(F.col('timestamp') / F.lit(1000.0)))
    spark_sdk = spark_sdk.withColumn("timestamp_int",
                                     spark_sdk.timestamp.cast(IntegerType()))
    spark_sdk = spark_sdk.where((F.to_date('date') >= F.lit("2019-07-01")))

    sdk_window = Window.partitionBy("user_id").orderBy(["date_time"])
    sdk_window_no_order = Window.partitionBy("user_id")

    # only conversion-relevant events
    actions = ['click', 'add to cart']
    spark_sdk = spark_sdk.where(spark_sdk.action.isin(actions))

    sdk_cols = [
        'retailer_id', 'user_id', 'timestamp', 'date', 'date_time', 'action',
        'value'
    ]
    spark_sdk = spark_sdk.select(sdk_cols)
    # accumulate each user's full event history, then keep only the last row
    # (rank == max_rank) so 'dates'/'actions' carry the complete lists
    spark_sdk = spark_sdk.withColumn('rank', F.rank().over(sdk_window))
    spark_sdk = spark_sdk.withColumn('max_rank',
                                     F.max('rank').over(sdk_window_no_order))
    spark_sdk = spark_sdk.withColumn(
        'dates',
        F.collect_list(F.col('date_time').cast(StringType())).over(sdk_window))
    spark_sdk = spark_sdk.withColumn('actions',
                                     F.collect_list('action').over(sdk_window))
    spark_sdk2 = spark_sdk.where(spark_sdk.rank == spark_sdk.max_rank)

    ''' join jnd and sdk '''
    # NOTE(review): the join condition references spark_sdk.user_id (pre-filter
    # lineage) while the joined frame is spark_sdk2 -- works because the column
    # lineage is shared, but confirm this is intentional.
    jnd2 = jnd.join(spark_sdk2.select(
        ['user_id', 'dates', 'actions', 'retailer_id']),
                    spark_sdk.user_id == jnd.jnd_sender_id,
                    how="left")

    def len_clicks(ls):
        # count of 'click' events; None when the list is missing (no SDK match)
        try:
            return len([c for c in ls if c == 'click'])
        except:
            return None

    len_clicks_udf = F.udf(len_clicks, IntegerType())

    def len_adds(ls):
        # count of 'add to cart' events; None when the list is missing
        try:
            return len([c for c in ls if c == 'add to cart'])
        except:
            return None

    len_adds_udf = F.udf(len_adds, IntegerType())

    jnd2 = jnd2.withColumn('clicks', len_clicks_udf(F.col('actions')))
    jnd2 = jnd2.withColumn('adds', len_adds_udf(F.col('actions')))

    ''' add nbr count '''
    jnd_window = Window.partitionBy("jnd_sender_id")
    jnd2 = jnd2.withColumn(
        "nbr_count",
        F.sum(F.when(F.col('nbr_date').isNotNull(),
                     1).otherwise(0)).over(jnd_window))

    ''' grp convs '''
    # one row per conversation: question count, answers, clicks, adds, retailer
    convs_grp = jnd2.groupBy('jnd_sender_id').agg(
        F.count('nbr_date').alias('nbr_count'),
        F.sum('is_answered').alias('sum_is_answered'),
        F.first('clicks').alias('clicks'),
        F.first('adds').alias('adds'),
        F.first('jnd_retailer').alias('retailer')).toPandas()

    convs_grp[['clicks', 'adds', 'sum_is_answered'
               ]] = convs_grp[['clicks', 'adds', 'sum_is_answered']].fillna(0)
    # bucket answer counts: [0,1), [1,2), [2,inf)
    convs_grp["answers_cut"] = pd.cut(convs_grp.sum_is_answered,
                                      [0, 1, 2, np.Inf],
                                      include_lowest=True,
                                      right=False)
    convs_grp["answers_cut"].value_counts()

    ''' plot answer on click and add '''
    grp = convs_grp.groupby(['answers_cut', 'retailer'], as_index=False).agg({
        "clicks": ["count", "mean", binary_ci],
        "adds": ["count", "mean", binary_ci]
    })
    grp.columns = [
        'cut', 'retailer', 'click_count', 'click_rate', 'click_ci',
        'add_to_cart_count', 'add_to_cart_rate', 'add_to_cart_ci'
    ]

    for retailer in ['429']:  #convs_grp['retailer'].unique().tolist()
        for c in ['click', 'add_to_cart']:
            temp = grp[grp["retailer"] == retailer]
            plt.errorbar([i for i in range(len(temp[c + "_rate"]))],
                         temp[c + "_rate"].values,
                         yerr=temp[c + "_ci"].values,
                         fmt='o',
                         label=c + "_" + retailer)
            plt.xticks([i for i in range(len(temp[c + "_rate"]))], temp["cut"])
    plt.legend()
    plt.title("user answer count effect on click / add to cart")
    plt.ylabel('rate')
    plt.xlabel("answer count")
    plt.savefig(
        '{}/answer_click.png'.format(dest)
    )  #/Users/amirdavidoff/mmuze-research/Improve_conversation_quality/tasks/task5
    plt.show(block=False)
    # NOTE(review): no new figure was created for this plot, so plt.close(fig)
    # closes the (already closed) first figure -- confirm a fresh
    # plt.figure() was intended here.
    plt.close(fig)

    ''' plot questions for each type'''
    # per (conversation, previous product type) ordering window
    window_type2 = Window.partitionBy(["jnd_sender_id",
                                       'lag_type']).orderBy(["jnd_ts"])
    jnd2 = jnd2.withColumn('lag_type',
                           F.lag(F.col('nlu_positive_product_type')).over(
                               window))
    # running count of questions asked within each (conversation, type) group
    jnd2 = jnd2.withColumn(
        'type_q_rank2',
        F.sum(F.when(F.col('nbr_response_code').isNotNull(),
                     1).otherwise(0)).over(window_type2))
    # question order + response code, e.g. "2_color"
    jnd2 = jnd2.withColumn(
        'q_rank',
        F.concat(F.col('type_q_rank2'), F.lit('_'),
                 F.col('nbr_response_code')))
    jnd2 = jnd2.withColumn('lag_nlu_date', F.lag('nlu_date').over(window))

    cols = [
        'nlu_positive_product_type', 'lag_type', 'nlu_text', 'nlu_date',
        'is_answered', 'nbr_date', 'nbr_response_code', 'type_q_rank2',
        'q_rank', 'jnd_sender_id', 'jnd_ts'
    ]
    temp = jnd2.where((jnd2.nbr_response_code.isNotNull())
                      & (jnd2.lag_type.isNotNull())).select(cols)

    grp = temp.groupBy(['lag_type', 'q_rank']).agg(
        F.count('is_answered').alias('count'),
        F.mean('is_answered').alias('answer_rate')).toPandas()
    grp["ci"] = grp[["answer_rate", "count"]].apply(binary_ci_pd, axis=1)
    # keep combinations with enough support
    grp = grp[grp["count"] >= 100]
    grp = grp.sort_values(["lag_type", "count"], ascending=False)

    ''' plot types bar plot '''
    for typ in grp["lag_type"].unique().tolist():
        fig = plt.figure(figsize=(10, 10))
        temp = grp[grp["lag_type"] == typ]
        temp = temp.sort_values('q_rank', ascending=False)
        plt.errorbar(temp["answer_rate"],
                     temp["q_rank"],
                     xerr=temp['ci'].values,
                     fmt='o',
                     label=typ,
                     alpha=0.7)
        plt.xlabel('answer rate')
        plt.ylabel('order_question')
        plt.title('answer rate per question and order for type {}'.format(typ))
        plt.legend()
        plt.savefig('{}/{}.png'.format(dest, typ))
        plt.show(block=False)
        plt.close(fig)
    ''' original plots - there is a bug check it sometime bug came after filling answered na with 0 12/11/19'''
# Lab 2, exercise 3: monthly average temperature per station, 1960-2014.
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql import functions as F

sc = SparkContext(appName="Lab2ex3")
sqlContext = SQLContext(sc)


def to_reading(fields):
    """Build a Row from one ';'-separated temperature reading line."""
    date = fields[1]
    y, m, d = date.split("-")
    return Row(station=fields[0], date=date, year=y, month=m, day=d,
               time=fields[2], value=float(fields[3]), quality=fields[4])


readings_rdd = sc.textFile("BDA/input/temperature-readings.csv") \
    .map(lambda line: line.split(";")) \
    .map(to_reading)

readings_df = sqlContext.createDataFrame(readings_rdd)
readings_df.registerTempTable("tempReadingsTable")

# Daily temperature extremes per station, restricted to 1960-2014.
in_period = (readings_df["year"] >= 1960) & (readings_df["year"] <= 2014)
daily_extremes = readings_df.select("year", "month", "day", "station", "value") \
    .filter(in_period) \
    .groupBy("year", "month", "day", "station") \
    .agg(F.max(readings_df["value"]).alias("maxVal"),
         F.min(readings_df["value"]).alias("minVal"))

# Monthly average of the daily midpoint temperature, highest first.
daily_mid = ((daily_extremes["maxVal"] + daily_extremes["minVal"]) / 2).alias("dailyAvg")
monthly_avg = daily_extremes.select("year", "month", "station", daily_mid) \
    .groupBy("year", "month", "station") \
    .agg(F.avg("dailyAvg").alias("monthlyAvg")) \
    .orderBy("monthlyAvg", ascending=False)

# Single output file.
monthly_avg.rdd.coalesce(1, shuffle=True).saveAsTextFile("BDA/output")
# Demonstrates random columns, descriptive statistics, covariance and
# correlation on small DataFrames.
sc = SparkContext(conf=conf)
sqlcontext = SQLContext(sc)

# 1. Create a DataFrame with one int column and 10 rows.
df = sqlcontext.range(0, 10)
df.show()

# Generate two other columns using uniform distribution and normal distribution.
# BUGFIX: DataFrames are immutable, so the previous bare `df.select(...)`
# discarded its result and `df.show()` only displayed the id column. Show the
# selected frame directly so the generated columns actually appear.
df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")).show()

# 2. Summary and Descriptive Statistics
df = sqlcontext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))
df.describe('uniform', 'normal').show()
df.select([mean('uniform'), min('uniform'), max('uniform')]).show()

# 3. Sample covariance and correlation
# Covariance is a measure of how two variables change with respect to each other.
# A positive number would mean that there is a tendency that as one variable increases,
# the other increases as well.
# A negative number would mean that as one variable increases,
# the other variable has a tendency to decrease.
df = sqlcontext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))
df.stat.cov('rand1', 'rand2')
df.stat.cov('id', 'id')  # cov of a column with itself is its variance

# Correlation is a normalized measure of covariance that is easier to understand,
# as it provides quantitative measurements of the statistical dependence between two random variables.
df.stat.corr('rand1', 'rand2')
df.stat.corr('id', 'id')  # self-correlation is exactly 1.0
def calc_whiskers(colname, outliers):
    """Compute the min and max of the non-outlier rows (the box-plot whiskers).

    Args:
        colname (str): Name of the value column. A boolean companion column
            named ``__<colname>_outlier`` must flag the outlier rows.
        outliers: Spark DataFrame containing both columns.

    Returns:
        Array-like ``[min, max]`` over the rows whose outlier flag is false.
    """
    # BUGFIX: quote the column name with backticks in BOTH aggregates. The
    # original quoted only F.min, so F.max(colname) broke for column names
    # containing dots, spaces, or other characters requiring quoting.
    quoted = "`%s`" % colname
    minmax = (outliers
              .filter("not `__{}_outlier`".format(colname))
              .agg(F.min(quoted).alias("min"),
                   F.max(quoted).alias("max"))
              .toPandas())
    return minmax.iloc[0][["min", "max"]].values
# Build vocabulary of categorical columns. vocab = build_vocabulary( train_df.select(*categorical_cols).unionAll( test_df.select(*categorical_cols)).cache(), categorical_cols) # Cast continuous columns to float & lookup categorical columns. train_df = cast_columns(train_df, continuous_cols + ['Sales']) train_df = lookup_columns(train_df, vocab) test_df = cast_columns(test_df, continuous_cols) test_df = lookup_columns(test_df, vocab) # Split into training & validation. # Test set is in 2015, use the same period in 2014 from the training set as a validation set. test_min_date = test_df.agg(F.min(test_df.Date)).collect()[0][0] test_max_date = test_df.agg(F.max(test_df.Date)).collect()[0][0] one_year = datetime.timedelta(365) train_df = train_df.withColumn('Validation', (train_df.Date > test_min_date - one_year) & (train_df.Date <= test_max_date - one_year)) # Determine max Sales number. max_sales = train_df.agg(F.max(train_df.Sales)).collect()[0][0] # Convert Sales to log domain train_df = train_df.withColumn('Sales', F.log(train_df.Sales)) print('===================================') print('Data frame with transformed columns') print('===================================') train_df.show()
def fn(col):
    """Build a column expression equivalent to ``NOT max(col)``."""
    aggregated = F.max(col)
    return ~aggregated
# Find the listing with the lowest price AND the highest review score, and
# append its 'accommodates' value to the task output file.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F


def write_file(accommodates):
    """Append the accommodates value to the task's output file."""
    with open("../output/out_2_4.txt", "a") as f:
        f.write(str(accommodates))


spark: SparkSession = SparkSession.builder.master("local[*]").appName(
    "task1").getOrCreate()

hotelDF = spark.read.parquet(
    "../input/part-00000-tid-4320459746949313749-5c3d407c-c844-4016-97ad-2edec446aa62-6688-1-c000.snappy.parquet"
)

# Cheapest price and best review score, computed in one pass.
min_max = hotelDF.agg(F.min(hotelDF.price),
                      F.max(hotelDF.review_scores_value)).head()
col_min_price = min_max[0]
col_max_rating = min_max[1]

data = hotelDF.filter((F.col("price") == col_min_price) &
                      (F.col("review_scores_value") == col_max_rating)).collect()

# BUGFIX: collect() returns a *list* of Rows, so the original
# data['accommodates'] raised "TypeError: list indices must be integers".
# Read the field from the first matching row, guarding against no match
# (possible since min price and max rating may come from different rows).
if data:
    write_file(data[0]['accommodates'])
def market_x_seg(config, **dict):
    """Build the per-person marketing segmentation frame.

    Joins header/detail offer tables, derives a slot segment from the offer
    rank and loyalty flag via a UDF, then enriches with lifestyle, loyalty,
    price-sensitivity, age-band and supplier/offer-amount attributes.

    config: dict with at least 'identity_type_code' (join-key column name)
            and 'seg_week' (fiscal week used to anchor the age calculation).
    **dict: named input DataFrames (table1, table2, card_lifestyle_seg,
            card_loyalty_seg, card_pricesence_seg, date_dim, card_dim,
            offer_dim).
    Returns the enriched slot_segment_df DataFrame.

    NOTE(review): the many print/count/show calls are debug instrumentation;
    each count() forces a full evaluation of the frame.
    """
    # Allocation-stage header rows only; the stage code is then redundant.
    header_df = dict['table1'].filter(col('contact_stage_code') == 'ALC')
    header_df = header_df.drop('contact_stage_code')
    print '1'
    header_df.cache()
    print header_df.count()
    print header_df.distinct().count()
    header_df.show()
    detail_df = dict['table2']
    # details control customers required
    detail_df = detail_df.withColumn("offer_rank_num",
                                     col("offer_rank_num").cast(DoubleType()))
    detail_df.cache()
    print '2'
    print detail_df.count()
    print detail_df.distinct().count()
    detail_df.show()
    detail_header_df = detail_df.join(header_df, 'prsn_code', 'left_outer')
    detail_header_df.cache()
    print '3'
    print detail_header_df.count()
    print detail_header_df.distinct().count()
    detail_header_df.show(truncate=False)
    # Map (offer rank, less-loyal flag) to a named slot segment; ranges are
    # business-defined buckets, everything else falls through to 'others'.
    volume_udf = udf(
        lambda col1, col2: 'baulcm' if (col1 >= 1 and col1 <= 8 and col2 != 'Y' and col2 is not None)
        else 'lessloyal' if (col1 >= 1 and col1 <= 3 and col2 == 'Y')
        else 'baby' if (col1 >= 9 and col1 <= 12)
        else 'npm' if (col1 >= 13 and col1 <= 20)
        else 'extralcm' if (col1 >= 21 and col1 <= 24)
        else 'babyextra' if (col1 >= 25 and col1 <= 28)
        else 'nfm' if (col1 >= 29 and col1 <= 32)
        else 'additional' if (col1 >= 33 and col1 <= 40)
        else 'others')
    slot_df = detail_header_df.withColumn(
        'slot_segment',
        (volume_udf(col('offer_rank_num'), col('less_loyal_flag'))))
    slot_df.cache()
    print '4'
    print slot_df.count()
    print slot_df.distinct().count()
    slot_df.filter(col('contact_stage_code') == 'RDM').show(200)
    # Inputs to the UDF are no longer needed downstream.
    slot_df = slot_df.drop('offer_rank_num')
    slot_df = slot_df.drop('less_loyal_flag')
    print 'count for slot_df'
    slot_df.cache()
    print slot_df.count()
    print slot_df.distinct().count()
    slot_df.show()
    # Enrich with lifestyle segment (blank identity keys excluded first).
    card_lifestyle_seg_df = dict['card_lifestyle_seg'].filter(
        trim(col(config['identity_type_code'])) != '')
    slot_segment_df = slot_df.join(card_lifestyle_seg_df,
                                   config['identity_type_code'], 'left_outer')
    print 'count for slot_segment_df'
    slot_segment_df.cache()
    print slot_segment_df.count()
    print slot_segment_df.distinct().count()
    slot_segment_df.show()
    # Enrich with loyalty segment.
    card_loyalty_seg_df = dict['card_loyalty_seg'].filter(
        trim(col(config['identity_type_code'])) != '')
    slot_segment_df = slot_segment_df.join(card_loyalty_seg_df,
                                           config['identity_type_code'],
                                           'left_outer')
    print 'count for slot_segment_df'
    slot_segment_df.cache()
    print slot_segment_df.count()
    print slot_segment_df.distinct().count()
    slot_segment_df.show()
    # Enrich with price-sensitivity segment.
    # NOTE(review): this filter omits trim(), unlike the two above — confirm
    # whether that asymmetry is intentional.
    card_pricesence_seg_df = dict['card_pricesence_seg'].filter(
        col(config['identity_type_code']) != '')
    slot_segment_df = slot_segment_df.join(card_pricesence_seg_df,
                                           config['identity_type_code'],
                                           'left_outer')
    print 'count for slot_segment_df'
    slot_segment_df.cache()
    print slot_segment_df.count()
    print slot_segment_df.distinct().count()
    slot_segment_df.show()
    date_df = dict['date_dim']
    # Anchor date: day 7 of the configured fiscal segmentation week.
    targ_date = date_df.filter(
        col("fis_week_id") == str(config['seg_week'])).filter(
            col('fis_day_of_week_num') == '7').select(
                col('date').cast(StringType())).collect()[0][0]
    # age
    # Derive an age band from the birth date relative to the anchor date;
    # band labels are Norwegian ('67 eller mer' = 67 or more,
    # 'Uklassifiserte' = unclassified).
    card_dim_df = dict['card_dim'].withColumn(
        'age_1',
        F.floor(F.datediff(F.lit(targ_date), F.col('card_birth_date'))) /
        365).withColumn(
            'Age',
            F.when(F.col('age_1') > 66, '67 eller mer').otherwise(
                F.when(F.col('age_1') > 55, '56-66').otherwise(
                    F.when(F.col('age_1') > 45, '46-55').otherwise(
                        F.when(F.col('age_1') > 35, '36-45').otherwise(
                            F.when(F.col('age_1') > 25, '26-35').otherwise(
                                F.when(F.col('age_1') > 0, '0-25').otherwise(
                                    'Uklassifiserte'))))))).drop(
                                        F.col('age_1'))
    slot_segment_df = slot_segment_df.join(card_dim_df,
                                           config['identity_type_code'],
                                           'left_outer')
    slot_segment_df.cache()
    print '6'
    print slot_segment_df.count()
    print slot_segment_df.distinct().count()
    slot_segment_df.show()
    # supplier_name
    print dict
    offer_dim_df = dict['offer_dim']
    # .select("offer_code", "supplier_name").distinct()
    # CHECK FOR NULL EMPTY OFFER_CODE and empty
    # One row per offer_code, keeping the max discount/amount per supplier.
    offer_dim_df = offer_dim_df.filter(trim(col("offer_code")) != "").groupBy(
        'offer_code', 'supplier_name').agg(
            F.max('offer_discount_amt').alias("offer_discount_amt"),
            F.max('offer_amount').alias("offer_amount")).dropDuplicates(
                ['offer_code'])
    # removed ACT after discussion with Sharang
    # For ALC/DLV/EXP stages take the dimension's discount amount, otherwise
    # keep the one already on the segment row.
    slot_segment_df = slot_segment_df.join(
        offer_dim_df, 'offer_code', 'left_outer').select(
            slot_segment_df["*"], offer_dim_df.supplier_name,
            offer_dim_df.offer_amount,
            F.when(
                (slot_segment_df.contact_stage_code.isin("ALC", "DLV", "EXP")),
                offer_dim_df.offer_discount_amt).otherwise(
                    slot_segment_df.offer_discount_amt).alias(
                        "offer_discount_amt")).drop(
                            slot_segment_df.offer_discount_amt)
    slot_segment_df.cache()
    print '7'
    print slot_segment_df.count()
    print slot_segment_df.distinct().count()
    slot_segment_df.show()
    # private label and prod_hier_l20_code
    # prod_dim_df = dict['prod_dim'].withColumn('supplier_private_label', F.when(
    #     F.upper(F.col('prod_desc')).like('%X-TRA%'),
    #     F.lit('private')
    # ).when(
    #     F.upper(F.col('prod_desc')).like('%NGLAMARK%'),
    #     F.lit('private')
    # ).when(
    #     F.upper(F.col('prod_desc')).like('%MARKET%'),
    #     F.lit('private')
    # ).otherwise(F.lit('non-private')))
    # slot_segment_df = slot_segment_df.join(prod_dim_df, 'prod_code', 'left_outer')
    # banner_name
    # store_dim_df = dict['store_dim']
    # slot_segment_df = slot_segment_df.join(store_dim_df, 'store_code', 'left_outer')
    return slot_segment_df
# read the precipitation data rdd = sc.textFile("data/precipitation-readings.csv") # create DataFrame from RDD parts = rdd.map(lambda a: a.split(';')) precReadingsRow = parts.map(lambda x: (x[0], x[1], int(x[1].split("-")[ 0]), int(x[1].split("-")[1]), x[2], float(x[3]), x[4])) precReadingsString = [ "station", "date", "year", "month", "time", "value", "quality" ] schemaPrecReadings = sqlContext.createDataFrame(precReadingsRow, precReadingsString) # find the max temp per station maxTemps = schemaTempReadings.groupBy('station').agg( F.max('value').alias('maxTemp')) # filter maxTemps = maxTemps.filter((maxTemps['maxTemp'] >= 25) & (maxTemps['maxTemp'] <= 30)) # calculate the daily precipitation and find the max maxPrecs = schemaPrecReadings.groupBy('station', 'date').agg( F.sum('value')).groupBy('station').agg( F.max('sum(value)').alias('maxDailyPrecipitation')) # filter maxPrecs = maxPrecs.filter((maxPrecs['maxDailyPrecipitation'] >= 100) & (maxPrecs['maxDailyPrecipitation'] <= 200)) # join and output the max temp and max precipitation joined = maxTemps.join(maxPrecs, 'station', 'inner').orderBy('station', ascending=False).show()
# Bootstrap a local Spark context + SQL context (legacy pre-2.0 API).
sc = SparkContext("local", "Test")
print(sc)
sqlContext = SQLContext(sc)
sqlContext

#Creating data frame from list
data = [('John', 'Smith', 47),('Jane', 'Smith', 22), ('Frank', 'Jones', 28)]
schema = ['fname', 'lname', 'age']
df = sqlContext.createDataFrame(data, schema)
df

#Retrieving contents of data frame
df.printSchema()
df.show()
df.first()
df.count()

#Adding columns
# NOTE(review): F is presumably pyspark.sql.functions imported earlier in
# this file — confirm.
df = df.withColumn('salary', F.lit(0))
df.show()
df.withColumn('salary2', df['age'] * 100).show()

#Filtering and subsetting
df.filter(df['age'] > 30).select('fname','age').show()
df.select(F.max('age').alias('max-age')).show()

#Grouped aggregations
df.groupBy('lname').max('age').show()
###HadoopLink = "hdfs://10.82.187.10:8020/hadoop/hdfs/INPUTPARQUET/" CashLoanForRandomSample = hq.read.parquet( HadoopLink + "var/CashLoanForRandomSample_parquet").persist() CashLoanForRandomSample.registerTempTable("CashLoanForRandomSample") ClientContractDateMapping = hq.read.parquet( HadoopLink + "dict/ClientContractDateMapping_parquet").persist() ClientContractDateMapping.registerTempTable("ClientContractDateMapping") SaleOfCredits = hq.read.parquet(HadoopLink + "contr/SaleOfCredits_parquet").persist() SaleOfCredits.registerTempTable("SaleOfCredits") max_date = str( CashLoanForRandomSample.agg(psf.max("ReportingDate")).take(1)[0][0]) tag = str(sys.argv[1]) if len(sys.argv) > 1 else 'Last3MWindowsChurnIn3M' #tag='Last3MWindowsChurnIn3M' soc = hq.sql("\ SELECT \ clfrs.ContractID,clfrs.ReportingDate \ ,COUNT(case when soc.ContractID is not null then 1 else null end) AS ExistFlag \ FROM CashLoanForRandomSample clfrs \ LEFT JOIN SaleOfCredits soc \ ON soc.ContractID = clfrs.ContractID AND soc.SalesDate < clfrs.ReportingDate \ GROUP BY clfrs.ContractID,clfrs.ReportingDate \ ") soc.registerTempTable("soc") step1 = hq.sql("\
#print('Best maxIter: ' + str(cvModel_gdbt.bestModel._java_obj.getMaxIter())) bestModel_gdbt = cvModel_gdbt.bestModel #Get the best model with best hyper-parameter # According to the AUC result on test samples, GDBT with maxDepth=4, maxBins=20, and maxIter=10, is the best model. best_model = bestModel_gdbt #Apply the best model # 2 Classify all the users # Predict over all comments predictions_over_comments = best_model.transform(dataset_noEmpty) # Predict over all users. If a user has more than one comments, he or she has more than one prediction. # We assume that we want to find the potential buyer so we don't want to miss any candidates. # As a result, we apply max-win algorithm, which mean unless all prediction is 0, the user is marked as 1. from pyspark.sql import functions as F predictions_over_users = predictions_over_comments.groupBy('userid').agg(F.max('prediction').alias('predictions_over_users')) predictions_over_users.show(5) # Display the percetage of cat or dog owner. #print('%.2f% of users are cat or dog owner.' % (predictions_over_users.filter(F.col('predictions_over_users') == 1).count()/predictions_over_users.count()*100)) print(predictions_over_users.filter(F.col('predictions_over_users') == 1).count()/predictions_over_users.count()*100) #investigate the reasons from the text # 3 get insight of users # First, select cat or dog owners from the dataset cat_dog_owner = ((predictions_over_users.filter(F.col('predictions_over_users') == 1)).join(predictions_over_comments, ['userid'])).select('userid', 'comment', 'words','predictions_over_users','creator_name') # Second, find top 10 popular words in cat and dot owners' comments. # In particular, common words, such as 'and', 'I', 'you', and 'we', have been kicked out. 
common_words = ['i', 'the', 'and', 'a', 'to', 'you', 'is', 'it', 'of', 'my', 'that', 'in', 'so', 'for', 'have', 'this', 'your', 'are', 'was', 'on', 'with', 'but', 'he', 'they', 'be', 'me', 'just', 'do', 'all', 'one', 'not', 'what', 'im', 'if',
def any(self, axis: Union[int, str] = 0) -> bool:
    """
    Return whether any element is True.

    Returns False unless there at least one element within a series that is
    True or equivalent (e.g. non-zero or non-empty).

    Parameters
    ----------
    axis : {0 or 'index'}, default 0
        Indicate which axis or axes should be reduced.

        * 0 / 'index' : reduce the index, return a Series whose index is the
          original column labels.

    Examples
    --------
    >>> ks.Series([False, False]).any()
    False

    >>> ks.Series([True, False]).any()
    True

    >>> ks.Series([0, 0]).any()
    False

    >>> ks.Series([0, 1, 2]).any()
    True

    >>> ks.Series([False, False, None]).any()
    False

    >>> ks.Series([True, False, None]).any()
    True

    >>> ks.Series([]).any()
    False

    >>> ks.Series([np.nan]).any()
    False

    >>> df = ks.Series([True, False, None]).rename("a").to_frame()
    >>> df.set_index("a").index.any()
    True
    """
    axis = validate_axis(axis)
    if axis != 0:
        raise NotImplementedError(
            'axis should be either 0 or "index" currently.')

    # Work on a single-column Spark frame holding just this series.
    frame = self._internal._sdf.select(self.spark_column)
    target = scol_for(frame, frame.columns[0])

    # Note that we're ignoring `None`s here for now.
    # The `any` aggregate only exists as of Spark 3.0
    # (F.expr("any(CAST(col AS BOOLEAN))")), so emulate it: cast to boolean,
    # map nulls to False, and take the max — True > False.
    value = frame.select(
        F.max(F.coalesce(target.cast("boolean"), F.lit(False)))
    ).collect()[0][0]

    # An empty series yields a null aggregate, which counts as False.
    if value is None:
        return False
    return value
# Note also that the alias function is a way of specifying the name of the column # in the output avg_cost_by_animal = ( recent_rescue.filter( recent_rescue.AnimalGroup.isin( "Horse", "Goat", "Cat", "Bird" )) .groupBy("AnimalGroup") .agg( f.min('TotalCost').alias('Min'), f.avg('TotalCost').alias('Mean'), f.max('TotalCost').alias('Max'), f.count('TotalCost').alias('Count')) .sort("Mean", ascending=False) .toPandas() ) avg_cost_by_animal #------------------------ ## Joining Data #------------------------ # Lets load in another data source to indicate population based on postcode, and join that # onto the rescue data filepath = "/tmp/training/population_by_postcode.csv"
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

if __name__ == "__main__":
    # Local Spark session for the homework job.
    session = SparkSession.builder.master("local").appName("pyspark homework").getOrCreate()

    file_path = "hdfs:///dataset/bank-data.csv"
    bank = session.read.csv(path=file_path, header=True, inferSchema=True)

    # Income spread per sex: min / max / mean in one grouped pass.
    income_by_sex = bank.groupBy("sex").agg(
        F.min("income"), F.max("income"), F.mean("income"))
    income_by_sex.show()

    # Mean income per region, expressed via the dict-style agg API.
    bank.groupBy("region").agg({"income": "mean"}).show()
#import SQLContext and pyspark SQL functions from pyspark.sql import SQLContext, Row import pyspark.sql.functions as func sqlContext = SQLContext(sc) inputRDD = sc.textFile("/user/pravat/auctiondata.csv").map(lambda l: l.split(",")) auctions = inputRDD.map(lambda p:Row(auctionid=p[0], bid=float(p[1]), bidtime=float(p[2]), bidder=p[3], bidrate=int(p[4]), openbid=float(p[5]), price=float(p[6]), itemtype=p[7], dtl=int(p[8]))) # Infer the schema, and register the DataFrame as a table. auctiondf = sqlContext.createDataFrame(auctions) auctiondf.registerTempTable("auctions") auctiondf.show() auctiondf.printSchema() totbids = auctiondf.count() print totbids totalauctions = auctiondf.select("auctionid").distinct().count() print total auctions itemtypes = auctiondf.select("itemtype").distinct().count() print itemtypes auctiondf.groupBy("itemtype","auctionid").count().show() auctiondf.groupBy("itemtype","auctionid").count().agg(func.min("count"), func.max("count"), func.avg("count")).show() auctiondf.groupBy("itemtype", "auctionid").agg(func.min("bid"), func.max("bid"), func.avg("bid")).show() auctiondf.filter(auctiondf.price>200).count() xboxes = sqlContext.sql("SELECT auctionid, itemtype,bid,price,openbid FROM auctions WHERE itemtype = 'xbox'").show()
# konsum_user_agg=konsum_user.groupBy('a_user_key').agg(sqlfuncs.max('reg_date').alias('reg_date'),\ # sqlfuncs.avg('age').alias('age'), sqlfuncs.max('gender').alias('gender'),sqlfuncs.max('date').alias('last_consume'),\ # sqlfuncs.min('date').alias('first_consume') ) # konsum_user_agg.registerTempTable('user_agg') # # # print(konsum_user_agg.first()) # konsum_user_agg.write.save('/home/erlenda/data/konsum/a_users_parquet') # # # # #reg_late=konsum_user.filter(konsum_user.reg_date<datetime.datetime(2015,11,16,0,0)) # pvs=konsum_user.groupBy('a_virtual','a_user_key','timegroup','device').agg(sqlfuncs.sum(konsum_user.pv).alias("pvs"),\ sqlfuncs.sum(konsum_user.pv_bet).alias("pvs_bet"),\ sqlfuncs.max('date').alias('last_consume'),\ sqlfuncs.min('date').alias('first_consume'),\ sqlfuncs.sum(konsum_user.visits).alias("visits")) pprint(pvs.take(10)) print() #print(pvs.take(100)[55]) pvs_tot1=pvs.agg(sqlfuncs.sum(pvs.pvs)).first() print('Total after basic aggregation',pvs_tot1) pvs_mapped=pvs.rdd.map(lambda x:((x.a_user_key,x.a_virtual), (Counter({literal_eval(x.timegroup):x.pvs}),\ Counter({literal_eval(x.timegroup):1}),\ x.pvs,\ x.pvs_bet,\ Counter({x.device:x.pvs}) ) ) )
# MAGIC %md ### Question: What is the difference between the revenue of a product and the revenue of the best selling product in the same category as this product? # COMMAND ---------- import sys from pyspark.sql.window import Window import pyspark.sql.functions as func # Window function partioned by Category and ordered by Revenue windowSpec = \ Window \ .partitionBy(df['category']) \ .orderBy(df['revenue'].desc()) \ .rangeBetween(-sys.maxsize, sys.maxsize) # Create dataframe based on the productRevenue table dataFrame = sqlContext.table("productRevenue") # Calculate the Revenue difference revenue_difference = \ (func.max(dataFrame['revenue']).over(windowSpec) - dataFrame['revenue']) # Generate a new dataframe (original dataframe and the revenue difference) revenue_diff = dataFrame.select( dataFrame['product'], dataFrame['category'], dataFrame['revenue'], revenue_difference.alias("revenue_difference")) # Display revenue_diff display(revenue_diff)
# A slightly different way to generate the two random columns df = sqlContext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27)) #df.describe().show() display(df.describe()) # COMMAND ---------- #df.describe('uniform', 'normal').show() display(df.describe('uniform', 'normal')) # COMMAND ---------- from pyspark.sql.functions import mean, min, max #df.select([mean('uniform'), min('uniform'), max('uniform')]).show() display(df.select([mean('uniform'), min('uniform'), max('uniform')])) # COMMAND ---------- # MAGIC %md ### Sample covariance and correlation # MAGIC # MAGIC Covariance is a measure of how two variables change with respect to each other. A positive number would mean that there is a tendency that as one variable increases, the other increases as well. A negative number would mean that as one variable increases, the other variable has a tendency to decrease. The sample covariance of two columns of a DataFrame can be calculated as follows: # COMMAND ---------- from pyspark.sql.functions import rand df = sqlContext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27)) # COMMAND ----------
def gen_report_table(hc,curUnixDay):
    """Build and persist the indoor and flow report Hive tables.

    Reads raw CSVs from /data/indoor and /data/flow, computes per
    (entityid, clientmac) windowed visit statistics (30-day, 7-day,
    lifetime, previous-month counts, first/last arrival) and saves the
    results as Hive tables df_indoor_fin and df_flow_fin.

    hc: HiveContext used for SQL and table writes.
    curUnixDay: current day as a Unix timestamp; anchors the rolling
                7/30-day recency counters.
    """
    # ---- indoor data ----
    rows_indoor=sc.textFile("/data/indoor/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4]),utoday=int(p[5]),ufirstday=int(p[6])))
    HiveContext.createDataFrame(hc,rows_indoor).registerTempTable("df_indoor")
    #ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY
    # Windowed counts per device: lifetime, trailing 30 days, trailing
    # 7 days (ranges are expressed in seconds of utoday), and a
    # previous-month bucket keyed on UFIRSTDAY.
    sql="select entityid,clientmac,utoday,UFIRSTDAY,seconds,"
    sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
    sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range 2505600 preceding) as day_30," # 2505600 is 29 days
    sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range 518400 preceding) as day_7," #518400 is 6 days
    sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY range 1 preceding) as pre_mon "
    sql=sql+"from df_indoor order by entityid,clientmac,utoday"
    df_id_stat=hc.sql(sql)
    df_id_mm=df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
    #df_id_mm df_min_max ,to caculate firtarrival and last arrival
    df_id_stat_distinct=df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
    #distinct df is for lag function to work
    df_id_prepremon=df_id_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
    # Join the min/max frame with the lagged previous-month frame on the
    # full (clientmac, entityid, UFIRSTDAY) key.
    cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac, df_id_mm.entityid == df_id_prepremon.entityid, df_id_mm.UFIRSTDAY==df_id_prepremon.UFIRSTDAY]
    df_indoor_fin_tmp=df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(df_id_mm.entityid,df_id_mm.clientmac,df_id_mm.utoday,df_id_mm.UFIRSTDAY,df_id_mm.seconds,df_id_mm.day_30,df_id_mm.day_7,df_id_mm.min,df_id_mm.max,df_id_mm.total_cnt,df_id_prepremon.prepre_mon)
    df_indoor_fin_tmp=df_indoor_fin_tmp.selectExpr("entityid as entityid","clientmac as clientmac","utoday as utoday","UFIRSTDAY as ufirstday","seconds as secondsbyday","day_30 as indoors30","day_7 as indoors7","min as FirstIndoor","max as LastIndoor","total_cnt as indoors","prepre_mon as indoorsPrevMonth")
    #newly added part for indoors7 and indoors30 based on current date
    # Recency flags relative to curUnixDay (86400 s per day), then summed
    # per device to get rolling 7/30-day indoor counts.
    df_indoor_fin_tmp1= df_indoor_fin_tmp.withColumn("r_day_7", func.when((curUnixDay- df_indoor_fin_tmp.utoday)/86400<7 , 1).otherwise(0))
    df_indoor_fin_tmp2=df_indoor_fin_tmp1.withColumn("r_day_30", func.when((curUnixDay- df_indoor_fin_tmp1.utoday)/86400<30 , 1).otherwise(0))
    df_indoor_fin_tmp3=df_indoor_fin_tmp2.withColumn("r_indoors7",func.sum("r_day_7").over(Window.partitionBy("entityid","clientmac")))
    df_indoor_fin_tmp4=df_indoor_fin_tmp3.withColumn("r_indoors30",func.sum("r_day_30").over(Window.partitionBy("entityid","clientmac")))
    df_indoor_fin=df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30")
    hc.sql("drop table if exists df_indoor_fin")
    df_indoor_fin.write.saveAsTable("df_indoor_fin")

    # ---- flow data (same pipeline, without the seconds column) ----
    rows_flow=sc.textFile("/data/flow/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),utoday=int(p[4]),ufirstday=int(p[5])))
    HiveContext.createDataFrame(hc,rows_flow).registerTempTable("df_flow")
    # ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday
    sql="select entityid,clientmac,utoday,UFIRSTDAY,"
    sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
    sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range 2505600 preceding) as day_30," # 2505600 is 29 days
    sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range 518400 preceding) as day_7," #518400 is 6 days
    sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY range 1 preceding) as pre_mon "
    sql=sql+"from df_flow order by entityid,clientmac,utoday"
    df_fl_stat=hc.sql(sql)
    df_fl_mm=df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
    #df_fl_mm df_min_max ,to caculate firtarrival and last arrival
    df_fl_stat_distinct=df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
    #distinct df is for lag function to work
    df_fl_prepremon=df_fl_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
    cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac, df_fl_mm.entityid == df_fl_prepremon.entityid, df_fl_mm.UFIRSTDAY==df_fl_prepremon.UFIRSTDAY]
    df_flow_fin=df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(df_fl_mm.entityid,df_fl_mm.clientmac,df_fl_mm.utoday,df_fl_mm.UFIRSTDAY,df_fl_mm.day_30,df_fl_mm.day_7,df_fl_mm.min,df_fl_mm.max,df_fl_mm.total_cnt,df_fl_prepremon.prepre_mon)
    df_flow_fin=df_flow_fin.selectExpr("entityid as entityid","clientmac as clientmac","utoday as utoday","UFIRSTDAY as ufirstday","day_30 as visits30","day_7 as visits7","min as FirstVisit","max as LastVisit","total_cnt as visits","prepre_mon as visitsPrevMonth")
    hc.sql("drop table if exists df_flow_fin")
    df_flow_fin.write.saveAsTable("df_flow_fin")
# COMMAND ---------- from pyspark.sql.functions import approx_count_distinct df.select(approx_count_distinct("StockCode", 0.1)).show() # 3364 # COMMAND ---------- from pyspark.sql.functions import first, last df.select(first("StockCode"), last("StockCode")).show() # COMMAND ---------- from pyspark.sql.functions import min, max df.select(min("Quantity"), max("Quantity")).show() # COMMAND ---------- from pyspark.sql.functions import sum df.select(sum("Quantity")).show() # 5176450 # COMMAND ---------- from pyspark.sql.functions import sumDistinct df.select(sumDistinct("Quantity")).show() # 29310 # COMMAND ----------
def doRenderMpld3(self, handlerId, figure, axes, keyFields, keyFieldValues, keyFieldLabels, valueFields, valueFieldValues):
    """Render a 2-D bar chart of the entity via matplotlib + mpld3.

    Key/value columns come from self.options (falling back to the first
    string / first numerical column); the optional "aggregation" option
    (AVG/SUM/MAX/MIN/COUNT) selects how values are grouped per key.
    NOTE(review): the keyFields/valueFields parameters are immediately
    overwritten from self.options — confirm the parameters are only part
    of the renderer interface.
    """
    allNumericCols = self.getNumericalFieldNames()
    if len(allNumericCols) == 0:
        self._addHTML("Unable to find a numerical column in the dataframe")
        return

    keyFields = self.options.get("keyFields")
    valueField = self.options.get("valueFields")
    if(keyFields==None and valueField==None):
        # No explicit configuration: chart the first string column against
        # the first numerical column.
        keyFields=self.getFirstStringColInfo()
        valueField=self.getFirstNumericalColInfo()
    else:
        keyFields = keyFields.split(',')
        valueField = valueField.split(',')
        if(len(valueField) > 1):
            self._addHTML("You can enter only have one value field for Bar Charts (2-D)"+str(len(valueField)))
            return
        keyFields = keyFields[0]
        valueField=valueField[0]

    #if(len(valueFields>)):

    #init
    fig=figure
    ax=axes

    #fig, ax = plt.subplots()
    #fig = plt.figure()

    # Double the current figure size for readability.
    params = plt.gcf()
    plSize = params.get_size_inches()
    params.set_size_inches( (plSize[0]*2, plSize[1]*2) )

    agg=self.options.get("aggregation")
    groupByCol=self.options.get("groupByCol")

    if (agg=="None" or agg==None):
        # No aggregation: plot the raw column values in row order.
        colLabel = keyFields
        y = self.entity.select(valueField).toPandas()[valueField].dropna().tolist()
        x_intv = np.arange(len(y))
        labels = self.entity.select(keyFields).toPandas()[keyFields].dropna().tolist()
        plt.xticks(x_intv,labels)
        plt.xlabel(keyFields, fontsize=18)
        plt.ylabel(valueField, fontsize=18)
    elif(agg=='AVG'):
        y1=self.entity.groupBy(keyFields).agg(F.avg(valueField).alias("avg")).toPandas().sort_values(by=keyFields)
        y=y1["avg"].dropna().tolist()
        x_intv = np.arange(len(y))
        labels=y1[keyFields].dropna().tolist()
        plt.xticks(x_intv,labels)
        plt.xlabel(keyFields, fontsize=18)
        plt.ylabel("Average "+valueField, fontsize=18)
    elif(agg=='SUM'):
        y1=self.entity.groupBy(keyFields).agg(F.sum(valueField).alias("sum")).toPandas().sort_values(by=keyFields)
        y=y1["sum"].dropna().tolist()
        x_intv = np.arange(len(y))
        labels=y1[keyFields].dropna().tolist()
        plt.xticks(x_intv,labels)
        plt.xlabel(keyFields, fontsize=18)
        plt.ylabel("sum "+valueField, fontsize=18)
    elif(agg=='MAX'):
        y1=self.entity.groupBy(keyFields).agg(F.max(valueField).alias("max")).toPandas().sort_values(by=keyFields)
        y=y1["max"].dropna().tolist()
        x_intv = np.arange(len(y))
        labels=y1[keyFields].dropna().tolist()
        plt.xticks(x_intv,labels)
        plt.xlabel(keyFields, fontsize=18)
        plt.ylabel("max "+valueField, fontsize=18)
    elif(agg=='MIN'):
        y1=self.entity.groupBy(keyFields).agg(F.min(valueField).alias("min")).toPandas().sort_values(by=keyFields)
        y=y1["min"].dropna().tolist()
        x_intv = np.arange(len(y))
        labels=y1[keyFields].dropna().tolist()
        plt.xticks(x_intv,labels)
        plt.xlabel(keyFields, fontsize=18)
        plt.ylabel("min "+valueField, fontsize=18)
    elif(agg=='COUNT'):
        y1=self.entity.groupBy(keyFields).agg(F.count(valueField).alias("count")).toPandas().sort_values(by=keyFields)
        y=y1["count"].dropna().tolist()
        x_intv = np.arange(len(y))
        labels=y1[keyFields].dropna().tolist()
        plt.xticks(x_intv,labels)
        plt.xlabel(keyFields, fontsize=18)
        plt.ylabel("count "+valueField, fontsize=18)

    # Draw the bars and attach the mpld3 notebook renderer/plugin.
    mpld3.enable_notebook()
    plt.bar(x_intv,y,color="blue",alpha=0.5)
    ax_fmt = BarChart(labels)
    mpld3.plugins.connect(fig, ax_fmt)
# Needed in order to use the to_date function (original notes below are in
# Spanish and kept verbatim).
''' Necesario para utilizar la función to_date '''
from pyspark.sql.functions import *

# Rows whose CreationDate equals either the earliest or the latest date in
# the frame; the min and max are each fetched with a driver round-trip.
df.select("*")\
  .where((to_date(df.CreationDate) == df.select(
              min(
                  to_date("CreationDate"))\
              .alias("min"))\
          .collect()[0].min) | (
          to_date(df.CreationDate) == df.select(
              max(to_date("CreationDate"))\
              .alias("max"))\
          .collect()[0].max))\
  .orderBy(to_date("CreationDate"))\
  .show()

''' Comparando fechas hasta los milisegundos'''

''' Usuario más antiguo '''
# (Oldest user: last row when sorted descending by CreationDate.)
df.sort("CreationDate", ascending=False)\
  .limit(1)\
  .show()

''' Usuario más reciente '''
StructField("Likes", StringType(), True), StructField("RTs", StringType(), True), StructField("Hashtags", StringType(), True), StructField("UserMentionNames", StringType(), True), StructField("UserMentionID", StringType(), True), StructField("name", StringType(), True), StructField("Place", StringType(), True), StructField("Followers", IntegerType(), True), StructField("Friends", IntegerType(), True) ]) csvDF =( spark \ .readStream \ .schema(userSchema) \ .option("delimiter", ";") \ .option("maxFilesPerTrigger",1)\ .csv(inputpath)) query2 = csvDF.select("name", "Followers", "Friends") aria = query2.withColumn("FRRatio", query2.Followers / query2.Friends).select( "name", "FRRatio").groupBy("name").agg(max("FRRatio").alias("FRRatio")) spar = aria.orderBy('FRRatio', ascending=False) spar.writeStream\ .outputMode("complete")\ .format("console")\ .option("numrows" ,1)\ .queryName("counts1")\ .start()\ .awaitTermination(60) spar.stop()
# Two-column schema for the department lookup file.
department_schema = StructType([
    StructField("dept_id", StringType(), True),
    StructField("dept_name", StringType(), True)
])

department_data = spark.sparkContext.textFile("C:\\data\\department.txt").map(
    lambda x: x.split(','))
department = spark.createDataFrame(department_data, department_schema)
# Normalise the id to int; keep the name as-is.
department = department.select(
    department.dept_id.cast("int").alias('dept_id'), department.dept_name)
print(department.count())
# NOTE(review): dropDuplicates returns a NEW DataFrame and the result is
# discarded here, so both counts print the same value — confirm intent.
department.dropDuplicates()
print(department.count())

# People in depts 10/20/30 joined to their department, one row per
# (name, age, dept_name) with the max dept_id kept.
department_people = people.where('dept_id in (10,20,30)').join(department, people.dept_id == department.dept_id) \
    .groupBy(people.name, people.age, department.dept_name).agg(F.max(department.dept_id).alias('dept_id'))
department_peopleSorted = department_people.orderBy(
    department_people.name, department_people.dept_id.desc())
# NOTE(review): result of dropDuplicates is discarded here as well.
department_peopleSorted.dropDuplicates()
#print(department_peopleSorted.collect())
# department_peopleSorted.coalesce(1).write.mode("overwrite").save("C:\\data\\out_put")
#department_peopleSorted.coalesce(1).rdd.saveAsTextFile("file:///C:/data/out_put/")

department_peopleSorted.registerTempTable("test_table")
people.registerTempTable("people_table")

query = '''
select people.name,people.age,people_dept.dept_id from people_table people
join test_table people_dept
on people.dept_id = people_dept.dept_id
# COMMAND ---------- flightData2015 = spark\ .read\ .option("inferSchema", "true")\ .option("header", "true")\ .csv("/mnt/enterprise/flightdata/2015-summary.csv") # COMMAND ---------- flightData2015.sort("count").explain() # COMMAND ---------- from pyspark.sql.functions import max flightData2015.select(max("count")).take(1) # COMMAND ---------- from pyspark.sql.functions import desc flightData2015\ .groupBy("DEST_COUNTRY_NAME")\ .sum("count")\ .withColumnRenamed("sum(count)", "destination_total")\ .sort(desc("destination_total"))\ .limit(5)\ .show()
from pyspark import SparkConf,SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, SparkSession, Window
from pyspark.sql.types import StringType, IntegerType, TimestampType, StructType, StructField
from pyspark.sql.functions import explode, split, max, rank, min, current_timestamp, expr

spark = SparkSession.builder.appName("Task1b").getOrCreate()

# Tweet schema for the ';'-separated CSV files arriving on the stream.
userSchema = StructType().add("id", "integer").add("lang", "string").add("date", "string").add("source", "string").add("len", "integer").add("likes", "integer").add("RTs", "string").add("hashtags", "string").add("umn", "string").add("umid", "string").add("name", "string").add("place", "string").add("followers", "integer").add("friends", "integer")

csvDF = spark.readStream.option("sep", ";").schema(userSchema).csv("hdfs://localhost:9000/stream")
#csvDF = spark.readStream.option("sep", ";").schema(userSchema).csv("/home/chaitra/data")

# Followers-to-friends ratio; rows with a zero on either side are dropped
# (also avoids division-by-zero ratios).
csvDF = csvDF.withColumn("ratio", csvDF.followers/csvDF.friends).where("followers != 0 and friends != 0")
# Best ratio per user, highest first; console sink prints only the top row.
q = csvDF.groupBy('name').agg(max('ratio').alias("FRRatio")).sort("FRRatio", ascending=False)
query = q.writeStream.outputMode("complete").format("console").option("numRows", 1).start()
query.awaitTermination(60)
query.stop()
def create_monthly_yearly_award(pargs, params):
    """
    Read in denormalized_awards and cntry_award_no data and create
    combined_yr_mo_awards data.

    Monthly award ranks come from the highest-priority award flag set on a
    row; the yearly rank is the maximum monthly rank within a performance
    year. Both are joined back to the country award reference for their
    localized descriptions.

    :inputs: denormalized_awards, cntry_award_no
    :outputs: combined_yr_mo_awards
    """
    df = spark.read.option("delimiter", "|").csv(data_paths['denormalized_awards'], inferSchema=True)
    # Country award reference; its localized description column is
    # normalized to 'awd_desc_cntry'.
    award_ref = spark.read.csv(data_paths[configs['cntry_award_no']], header=True, inferSchema=True) \
        .withColumnRenamed("awd_desc_{cntry}".format(cntry=configs["cntry"]), "awd_desc_cntry")

    # Rename the positional CSV columns to their configured names.
    for old_name, new_name in zip(df.schema.names, configs["de_awd_col_names"]):
        df = df.withColumnRenamed(old_name, new_name)

    # Award flags in priority order: the FIRST flag set wins, matching the
    # evaluation order of the original chained when() expression.
    rnk_scores = configs["rnk_scores"]
    flag_priority = [
        "CROWN_PLUS_FLG", "F_TRIPLE_DIA_FLG", "TRIPLE_DIA_FLG",
        "F_DOUBLE_DIA_FLG", "DOUBLE_DIA_FLG", "F_EXEC_DIA_FLG",
        "EXEC_DIA_FLG", "F_DIA_FLG", "DIA_FLG", "F_EMRLD_FLG", "EMRLD_FLG",
        "F_SAPPHIRE_FLG", "SAPPHIRE_FLG", "PEARL_FLG", "F_PLAT_FLG",
        "RUBY_FLG", "PLAT_FLG", "GOLD_FLG", "SILVER_FLG",
    ]
    # Build the chain from the inside out so earlier flags take precedence;
    # rows with no flag set fall through to NULL.
    awd_rnk_expr = F.lit(None)
    for flag in reversed(flag_priority):
        awd_rnk_expr = F.when(df[flag] == 1, rnk_scores[flag]).otherwise(awd_rnk_expr)

    # Filter by country and attach the award rank column.
    monthly_awards1 = df.filter(F.col("cntry_key_no").isin(run['cntry_key_no'])) \
        .withColumn("awd_rnk_no", awd_rnk_expr) \
        .select('imc_key_no', 'mo_yr_key_no', 'cntry_key_no', 'awd_rnk_no')

    # Split mo_yr_key_no (yyyyMM) into month and year components.
    monthly_awards2 = monthly_awards1.withColumn("month", expr("substring(mo_yr_key_no, length(mo_yr_key_no)-1, length(mo_yr_key_no))").cast('int')) \
        .withColumn("year", expr("substring(mo_yr_key_no, 1, 4)").cast('int'))
    # Performance year rolls over at configs["first_month_of_perf_yr"]:
    # months at/after it belong to the NEXT calendar year's perf year.
    monthly_awards3 = monthly_awards2.withColumn(
        "perf_yr",
        F.when(monthly_awards2.month >= configs["first_month_of_perf_yr"],
               monthly_awards2.year + 1).otherwise(monthly_awards2.year)) \
        .select('imc_key_no', 'mo_yr_key_no', 'cntry_key_no', 'perf_yr', 'awd_rnk_no')

    # Attach the localized monthly award description.
    monthly_awards = monthly_awards3.join(award_ref, monthly_awards3.awd_rnk_no == award_ref.cur_awd_awd_rnk_no, 'left') \
        .select('imc_key_no', 'mo_yr_key_no', 'cntry_key_no', 'perf_yr', 'awd_rnk_no', 'awd_desc_cntry') \
        .withColumnRenamed("awd_desc_cntry", "i_mthly_awd_cd") \
        .withColumnRenamed("awd_rnk_no", "i_mthly_awd_rnk_no")

    # Yearly rank = best (max) monthly rank within each performance year.
    yearly_awards1 = monthly_awards.groupBy(
        'imc_key_no', 'cntry_key_no', 'perf_yr').agg(F.max("i_mthly_awd_rnk_no").alias("i_yrly_awd_rnk_no"))
    # NOTE: the original chained a duplicate, no-op
    # withColumnRenamed("awd_desc_cntry", "i_yrly_awd_cd") here; removed.
    yearly_awards = yearly_awards1.join(award_ref, yearly_awards1.i_yrly_awd_rnk_no == award_ref.cur_awd_awd_rnk_no, 'left') \
        .select('imc_key_no', 'perf_yr', 'awd_desc_cntry', 'i_yrly_awd_rnk_no') \
        .withColumnRenamed("awd_desc_cntry", "i_yrly_awd_cd") \
        .withColumnRenamed("imc_key_no", "imc_key_no_yr") \
        .withColumnRenamed("perf_yr", "perf_yr_yr")

    # Merge monthly and yearly views on (imc, perf_yr).
    combined_awards = monthly_awards.join(
        yearly_awards,
        (monthly_awards.imc_key_no == yearly_awards.imc_key_no_yr) &
        (monthly_awards.perf_yr == yearly_awards.perf_yr_yr), 'left') \
        .select('imc_key_no', 'mo_yr_key_no', 'cntry_key_no', 'perf_yr',
                'i_mthly_awd_cd', 'i_yrly_awd_cd', 'i_mthly_awd_rnk_no', 'i_yrly_awd_rnk_no')

    # Normalize key types and render mo_yr_key_no as 'yyyy-MM-dd'.
    combined_awards = combined_awards.withColumn(
        'mo_yr_key_no', combined_awards.mo_yr_key_no.cast('string')).withColumn(
        'imc_key_no', combined_awards.imc_key_no.cast('string'))
    combined_awards = combined_awards.withColumn(
        'mo_yr_key_no', to_timestamp(combined_awards.mo_yr_key_no, 'yyyyMM')).withColumn(
        'mo_yr_key_no', date_format('mo_yr_key_no', 'yyyy-MM-dd'))

    # Write final result.
    combined_awards.write.parquet(data_paths['combined_yr_mo_awards'].format(
        run_mode=run['run_mode'], run_id=run['run_id']), mode='overwrite')
# In[ ]:

# Average age per program via Spark SQL on the registered "st" temp table.
sqlCtx.sql("select program,avg(age) AS AverageAge FROM st GROUP BY program").show()

# In[ ]:

# In[ ]:

from pyspark.sql import functions as funcs

# Same aggregation through the DataFrame API, plus the maximum age.
# FIX: the average-age alias previously contained a trailing space
# ('AverageAge '), which yields an awkward column name to reference later.
AvgMin=students.groupBy('program').agg(funcs.avg('age').alias('AverageAge'),funcs.max('age').alias('MaximumAge'))
AvgMin.show()

# In[ ]:

# #How the queries are optimized

# In[ ]:

# Inspect the optimized logical/physical plan for a simple projection.
sqlCtx.sql("select name, program FROM st").explain()
def _monthly_activity_counts(df, name, user_col="USERID", strip_prefix=False):
    """Count events per (ADJ_USERID, MONTH) for one classroom activity log.

    Derives a yyyyMM MONTH key from CRTIME, builds ADJ_USERID from
    ``user_col`` by dropping the last two characters (and, when
    ``strip_prefix`` is set, the first three characters as well -- the
    favourites log stores ids with an extra prefix in USERNAME) and
    stripping leading zeros, then counts rows per user-month into a column
    named ``num_<name>``.
    """
    out = df.withColumn("date", to_date(df.CRTIME, 'yyyy/MM/dd HH:mm:ss'))
    out = out.withColumn(
        "MONTH_tmp", F.from_unixtime(F.unix_timestamp(out.date, "yyyyMM")))
    out = out.withColumn(
        "MONTH",
        F.concat(expr("substring(MONTH_tmp, 1, 4)"),
                 expr("substring(MONTH_tmp, 6, 2)")))
    out = out.withColumn(
        "ADJ_USERID",
        expr("substring({c}, 1, length({c})-2)".format(c=user_col)))
    if strip_prefix:
        out = out.withColumn(
            "ADJ_USERID", expr("substring(ADJ_USERID, 4, length(ADJ_USERID))"))
    out = out.withColumn(
        "ADJ_USERID", regexp_replace(F.col("ADJ_USERID"), "^0*", ""))
    counts = out.groupby(['ADJ_USERID', 'MONTH']).count()
    return counts.withColumnRenamed("count", "num_" + name)


def _add_lag_sums(df, name, all_abo_month):
    """Left-join counts onto the user-month spine and add rolling sums.

    Builds 1..11 month lags of ``num_<name>`` over each user's month
    sequence and accumulates them into
    ``n_lag_currentyr_<name>_sum_{3,6,9,12}m``; the intermediate lag
    columns are dropped before returning.
    """
    df = all_abo_month.join(df, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0)
    base = 'num_' + name
    win = Window.partitionBy("ADJ_USERID").orderBy("MONTH")
    for n in range(1, 12):
        df = df.withColumn(base + str(n), F.lag(df[base], n, 0).over(win))
    prefix = "n_lag_currentyr_" + name
    df = df.withColumn(prefix + "_sum_3m",
                       df[base] + df[base + "1"] + df[base + "2"])
    df = df.withColumn(prefix + "_sum_6m",
                       df[prefix + "_sum_3m"] +
                       df[base + "3"] + df[base + "4"] + df[base + "5"])
    df = df.withColumn(prefix + "_sum_9m",
                       df[prefix + "_sum_6m"] +
                       df[base + "6"] + df[base + "7"] + df[base + "8"])
    df = df.withColumn(prefix + "_sum_12m",
                       df[prefix + "_sum_9m"] +
                       df[base + "9"] + df[base + "10"] + df[base + "11"])
    return df.drop(*[base + str(n) for n in range(1, 12)])


def create_classroom_feature(pargs, params):
    """
    Build monthly classroom-activity features per user.

    The original implementation repeated the same count/lag logic five
    times (download/browse/share/search/fav); that copy-paste is factored
    into _monthly_activity_counts and _add_lag_sums with identical output.

    :inputs: download_df, browse_df, share_df, search_df, fav_df
    :outputs: classroom_data
    """
    download_df = spark.read.parquet(data_paths['download_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    browse_df = spark.read.parquet(data_paths['browse_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    share_df = spark.read.parquet(data_paths['share_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    search_df = spark.read.parquet(data_paths['search_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))
    fav_df = spark.read.parquet(data_paths['fav_df'].format(
        run_mode=run['run_mode'], run_id=run['run_id']))

    # Per-activity monthly event counts. The favourites log keys on
    # USERNAME, which carries an extra 3-character prefix to strip.
    fav = _monthly_activity_counts(fav_df, "fav", user_col="USERNAME",
                                   strip_prefix=True)
    download = _monthly_activity_counts(download_df, "download")
    browse = _monthly_activity_counts(browse_df, "browse")
    share = _monthly_activity_counts(share_df, "share")
    search = _monthly_activity_counts(search_df, "search")

    # Month spine: one row per month from 2013-01-01 through today,
    # rendered as a yyyyMM string.
    data = [("2013-01-01", str(datetime.date.today()))]
    df = spark.createDataFrame(data, ["minDate", "maxDate"])
    df = df.withColumn("monthsDiff", F.months_between("maxDate", "minDate")) \
        .withColumn("repeat", F.expr("split(repeat(',', monthsDiff), ',')")) \
        .select("*", F.posexplode("repeat").alias("date", "val")) \
        .withColumn("date", F.expr("add_months(minDate, date)")) \
        .select('date')
    df = df.withColumn(
        "MONTH",
        F.from_unixtime(F.unix_timestamp(F.col("date")), "yyyyMM")).select('MONTH')

    # Every user seen in any activity, crossed with every month...
    unique_id = download.select('ADJ_USERID').distinct() \
        .union(browse.select('ADJ_USERID').distinct()) \
        .union(share.select('ADJ_USERID').distinct()) \
        .union(search.select('ADJ_USERID').distinct()) \
        .union(fav.select('ADJ_USERID').distinct())
    unique_id = unique_id.distinct()
    all_abo_month = unique_id.crossJoin(df)

    # ...then restricted to each user's observed first..last active month.
    combine = download.select(['ADJ_USERID', 'MONTH']).union(browse.select(['ADJ_USERID', 'MONTH'])) \
        .union(share.select(['ADJ_USERID', 'MONTH'])) \
        .union(search.select(['ADJ_USERID', 'MONTH'])) \
        .union(fav.select(['ADJ_USERID', 'MONTH']))
    min_max_date = combine.groupby("ADJ_USERID").agg(F.min("MONTH"), F.max("MONTH"))
    all_abo_month = all_abo_month.join(
        min_max_date, all_abo_month.ADJ_USERID == min_max_date.ADJ_USERID,
        how='left').drop(min_max_date.ADJ_USERID)
    all_abo_month = all_abo_month.filter(F.col("MONTH") >= F.col("min(MONTH)"))
    all_abo_month = all_abo_month.filter(F.col("MONTH") <= F.col("max(MONTH)"))
    all_abo_month = all_abo_month.select(["ADJ_USERID", "MONTH"])

    # Rolling 3/6/9/12-month lag sums for each activity.
    download = _add_lag_sums(download, "download", all_abo_month)
    browse = _add_lag_sums(browse, "browse", all_abo_month)
    share = _add_lag_sums(share, "share", all_abo_month)
    search = _add_lag_sums(search, "search", all_abo_month)
    fav = _add_lag_sums(fav, "fav", all_abo_month)

    # Assemble the final feature table; missing activity means zero.
    classroom_data = all_abo_month.join(
        download, ['ADJ_USERID', 'MONTH'], 'left').join(
        browse, ['ADJ_USERID', 'MONTH'], 'left').join(
        share, ['ADJ_USERID', 'MONTH'], 'left').join(
        search, ['ADJ_USERID', 'MONTH'], 'left').join(
        fav, ['ADJ_USERID', 'MONTH'], 'left').na.fill(0)
    classroom_data = classroom_data.withColumnRenamed("ADJ_USERID", "imc_no")
    classroom_data = classroom_data.withColumnRenamed("MONTH", "mo_yr_key_no")

    # Normalize mo_yr_key_no from yyyyMM to a 'yyyy-MM-dd' string.
    df = classroom_data
    df = df.withColumn('mo_yr_key_no', df.mo_yr_key_no.cast('string'))
    df = df.withColumn('mo_yr_key_no', to_timestamp(df.mo_yr_key_no, 'yyyyMM'))
    df = df.withColumn('mo_yr_key_no', date_format('mo_yr_key_no', 'yyyy-MM-dd'))
    classroom_data = df

    print("now saving the data")
    classroom_data.write.parquet(data_paths['classroom_data'].format(
        run_mode=run['run_mode'], run_id=run['run_id']), mode='overwrite')
from pyspark.sql import functions as F #Creating data frame from list data = [('John', 'Smith', 47),('Jane', 'Smith', 22), ('Frank', 'Jones', 28)] schema = ['fname', 'lname', 'age'] df = sqlContext.createDataFrame(data, schema) df #Retrieving contents of data frame df.printSchema() df.show() df.first() df.count() #Adding columns df = df.withColumn('salary', F.lit(0)) df.show() df.withColumn('salary2', df['age'] * 100).show() #Filtering and subsetting df.filter(df['age'] > 30).select('fname','age').show() df.select(F.max('age').alias('max-age')).show() #Grouped aggregations df.groupBy('lname').max('age').show() df.groupBy('lname').agg(F.avg('age').alias('avg-age'), F.min('age'), F.max('age')).show()
def wechat_cloudcommerce(pargs, params):
    """
    WeChat cloud commerce (yungou) feature engineering: per-user monthly
    counts of search, browse and order-placement events, plus the average
    number of products added per purchase cart, extended with rolling
    lag-sum features over the windows listed in ``configs["lag_features"]``.

    :inputs: wechat_miniprogram_input
    :outputs: wechat_cloudcommerce
    """
    # Raw mini-program event log: tab-delimited CSV with Chinese headers.
    wechat_mini = spark.read.option("delimiter", "\t").option(
        "header", "true").option("encoding", "UTF-8").csv(data_paths['wechat_miniprogram_input'])
    # '时间戳' is the event timestamp; derive a day-level time and a
    # yyyyMM month key from it.
    wechat_mini = wechat_mini.withColumn('time', to_timestamp('时间戳', 'yyyy-MM-dd'))
    wechat_mini = wechat_mini.withColumn('month', to_timestamp('时间戳', 'yyyy-MM'))
    wechat_mini = wechat_mini.withColumn('month', date_format('month', 'yyyyMM'))
    # Rename the Chinese columns to their English equivalents (event type,
    # timestamp, trip id, event id/name, product id/name, search term).
    wechat_mini2 = wechat_mini.withColumnRenamed('事件类型', 'event_type') \
        .withColumnRenamed('时间戳', 'timestamp') \
        .withColumnRenamed('诸葛id', 'trip_id') \
        .withColumnRenamed('事件id', 'event_id') \
        .withColumnRenamed('事件名', 'event_name') \
        .withColumnRenamed('商品id', 'product_id') \
        .withColumnRenamed('商品名称', 'search_word') if False else wechat_mini.withColumnRenamed('事件类型', 'event_type') \
        .withColumnRenamed('时间戳', 'timestamp') \
        .withColumnRenamed('诸葛id', 'trip_id') \
        .withColumnRenamed('事件id', 'event_id') \
        .withColumnRenamed('事件名', 'event_name') \
        .withColumnRenamed('商品id', 'product_id') \
        .withColumnRenamed('商品名称', 'product_name') \
        .withColumnRenamed('搜索词', 'search_word')
    # clean up imc_no: ids starting with "360" carry that prefix plus two
    # trailing characters -- strip the prefix, then leading zeros, then
    # (for the "360" ids) the final two characters.
    wechat_mini3 = wechat_mini2.withColumn("leading360", expr("substring(amwayid, 1, 3)"))
    wechat_mini4 = wechat_mini3.withColumn(
        "ADJ_USERID",
        when(
            F.col("leading360") == "360",
            expr("substring(amwayid, 4, length(amwayid)-2)")).otherwise(
                F.col("amwayid")))
    wechat_mini5 = wechat_mini4.withColumn(
        "imc_no", regexp_replace(F.col("ADJ_USERID"), "^0*", ""))
    wechat_mini_all = wechat_mini5.withColumn(
        "imc_no",
        when(
            F.col("leading360") == "360",
            expr("substring(imc_no, 1, length(imc_no)-2)")).otherwise(
                F.col("imc_no")))

    # browse: '页面浏览' = page-view events, counted per user-month
    wechat_mini_browse = wechat_mini_all.where((F.col("event_type") == '页面浏览'))
    wechat_mini_browse2 = wechat_mini_browse.groupBy('imc_no', 'month').agg(
        F.count("event_id").alias("n_num_cloudcommerce_browse"))
    # search: '站内搜索' = in-app search events, counted per user-month
    wechat_mini_search = wechat_mini_all.where((F.col("event_type") == '站内搜索'))
    wechat_mini_search2 = wechat_mini_search.groupBy('imc_no', 'month').agg(
        F.count("event_id").alias("n_num_cloudcommerce_search"))
    # order: '小程序_订单确认' = order-confirmation events, per user-month
    wechat_mini_order = wechat_mini_all.where(
        (F.col("event_name") == '小程序_订单确认'))
    wechat_mini_order2 = wechat_mini_order.groupBy('imc_no', 'month').agg(
        F.count("event_id").alias("n_num_cloudcommerce_order"))
    # cart: '商品加购' = add-to-cart events, restricted to trips that ended
    # in an order; count products per trip, then average per user-month.
    purchase_trip = wechat_mini_order.select('trip_id').distinct()
    wechat_mini_cart = wechat_mini_all.join(
        purchase_trip, 'trip_id', 'inner').where(
            (F.col("event_type") == '商品加购'))
    wechat_mini_cart2 = wechat_mini_cart.groupBy(
        'imc_no', 'month', 'trip_id').agg(
            F.count("product_id").alias(
                "n_num_cloudcommerce_product_per_cart"))
    wechat_mini_cart3 = wechat_mini_cart2.groupBy('imc_no', 'month').agg(
        F.avg("n_num_cloudcommerce_product_per_cart").alias(
            "n_num_cloudcommerce_product_per_cart"))

    # All (abo, month) combinations, restricted to each user's observed
    # first..last active month.
    unique_id = wechat_mini_all.select('imc_no').distinct()
    month = wechat_mini_all.select('month').distinct()
    all_abo_month = unique_id.crossJoin(month)
    min_max_date = wechat_mini_all.groupby("imc_no").agg(
        F.min("month"), F.max("month"))
    all_abo_month = all_abo_month.join(
        min_max_date, all_abo_month.imc_no == min_max_date.imc_no,
        how='left').drop(min_max_date.imc_no)
    all_abo_month = all_abo_month.filter(F.col("month") >= F.col("min(month)"))
    all_abo_month = all_abo_month.filter(F.col("month") <= F.col("max(month)"))

    # join everything together onto the spine; missing months become zeros.
    combine1 = all_abo_month.join(wechat_mini_browse2, ['imc_no', 'month'], 'left').na.fill(0)
    combine2 = combine1.join(wechat_mini_search2, ['imc_no', 'month'], 'left').na.fill(0)
    combine3 = combine2.join(wechat_mini_order2, ['imc_no', 'month'], 'left').na.fill(0)
    combine4 = combine3.join(wechat_mini_cart3, ['imc_no', 'month'], 'left').na.fill(0)

    # create lag features: for each base feature and window length lag_mo,
    # accumulate the current month plus the previous lag_mo-1 months into
    # <feature>_sum_<lag_mo>m.
    combine = combine4.withColumnRenamed("month", "mo_yr_key_no")
    feature_list = [
        'n_num_cloudcommerce_browse', 'n_num_cloudcommerce_search',
        'n_num_cloudcommerce_order', 'n_num_cloudcommerce_product_per_cart'
    ]
    lag_features = configs["lag_features"]
    for feature in feature_list:
        for lag_mo in lag_features:
            for lag in range(0, lag_mo):
                colname = feature + "_" + str(lag)
                feature_col = feature + "_sum_" + str(lag_mo) + "m"
                combine = combine.withColumn(
                    colname,
                    F.lag(combine[feature], lag).over(
                        Window.partitionBy("imc_no").orderBy("mo_yr_key_no")))
                if lag == 0:
                    # first term initializes the running sum
                    combine = combine.withColumn(feature_col, combine[colname])
                else:
                    combine = combine.withColumn(
                        feature_col, combine[feature_col] + combine[colname])

    # Keep ids, the base features and the derived lag-sum features only
    # (the intermediate per-lag columns are dropped by this select).
    main_col = ['imc_no', 'mo_yr_key_no']
    selected_feature = []
    for feature in feature_list:
        for lag_mo in lag_features:
            feature_col = feature + "_sum_" + str(lag_mo) + "m"
            selected_feature.append(feature_col)
    selected_feature = main_col + feature_list + selected_feature
    wechat_cloudcommerce = combine.select(selected_feature)

    # Normalize mo_yr_key_no from yyyyMM to a 'yyyy-MM-dd' string.
    wechat_formatting = wechat_cloudcommerce
    wechat_formatting = wechat_formatting.withColumn(
        'mo_yr_key_no', wechat_formatting.mo_yr_key_no.cast('string'))
    # wechat_formatting = wechat_formatting.withColumn('imc_no',wechat_formatting.imc_no.cast('string'))
    wechat_formatting = wechat_formatting.withColumn(
        'mo_yr_key_no', to_timestamp(wechat_formatting.mo_yr_key_no, 'yyyyMM'))
    wechat_formatting = wechat_formatting.withColumn(
        'mo_yr_key_no', date_format('mo_yr_key_no', 'yyyy-MM-dd'))
    wechat_cloudcommerce = wechat_formatting
    wechat_cloudcommerce.write.parquet(
        data_paths['wechat_cloudcommerce'].format(run_mode=run['run_mode'],
                                                  run_id=run['run_id']),
        mode='overwrite')
# COMMAND ---------- # MAGIC %md # MAGIC # MAGIC Alternatively, we can use SQL to directly calculate these statistics. You can explore the many useful functions within the `pyspark.sql.functions` module in the [documentation](https://spark.apache.org/docs/latest/api/python/pyspark.sql.html#module-pyspark.sql.functions). # MAGIC # MAGIC After we apply the `.agg()` function, we call `.first()` to extract the first value, which is equivalent to `.take(1)[0]`. # COMMAND ---------- from pyspark.sql import functions as sqlFunctions contentSizeStats = (logs_df .agg(sqlFunctions.min(logs_df['content_size']), sqlFunctions.avg(logs_df['content_size']), sqlFunctions.max(logs_df['content_size'])) .first()) print 'Using SQL functions:' print 'Content Size Avg: {1:,.2f}; Min: {0:.2f}; Max: {2:,.0f}'.format(*contentSizeStats) # COMMAND ---------- # MAGIC %md # MAGIC ### (3b) Example: HTTP Status Analysis # MAGIC # MAGIC Next, let's look at the status values that appear in the log. We want to know which status values appear in the data and how many times. We again start with `logs_df`, then group by the `status` column, apply the `.count()` aggregation function, and sort by the `status` column. # COMMAND ---------- status_to_count_df =(logs_df
GROUP BY DEST_COUNTRY_NAME
""")

# Equivalent aggregation expressed with the DataFrame API.
dataFrameWay = flightData2015\
  .groupBy("DEST_COUNTRY_NAME")\
  .count()

# Both formulations should compile to the same physical plan.
sqlWay.explain()
dataFrameWay.explain()

# COMMAND ----------

from pyspark.sql.functions import max

# Maximum single "count" value across all rows.
flightData2015.select(max("count")).take(1)

# COMMAND ----------

# Top five destinations by total flights, expressed in SQL.
maxSql = spark.sql("""
SELECT DEST_COUNTRY_NAME, sum(count) as destination_total
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
ORDER BY sum(count) DESC
LIMIT 5
""")

maxSql.show()