def test_drop_columns(self, spark):
    data = [('Alice', 20, 'London'), ('Alice', 33, 'Paris'),
            ('Alice', 20, 'Paris'), ('Nina', 40, None)]
    columns = ['name', 'age', 'city']
    df = spark.createDataFrame(data, columns)
    res_df = Count(['name', 'age'], 'count').transform(df).sort(
        f.asc('name'), f.asc('age'))
    rows = res_df.collect()
    assert rows[0]['count'] == 2
    assert rows[1]['count'] == 1
    assert rows[2]['count'] == 1
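# The Count transformer exercised above is not shown in this snippet. A minimal,
# hypothetical sketch of what it is assumed to do (group by the given columns and add a
# count column with the given name) could look like this:
class Count:
    def __init__(self, group_cols, count_col):
        self.group_cols = group_cols
        self.count_col = count_col

    def transform(self, df):
        # one row per distinct combination of group_cols, carrying its row count
        return df.groupBy(*self.group_cols).count().withColumnRenamed('count', self.count_col)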
def transform(df: DataFrame) -> DataFrame:
    """Weekly top five visitors."""
    grouped = df.where(df.event == "view").groupby("week", "visitorid").count().select(
        "week", "visitorid", col('count').alias('n'))
    grouped_ranked = grouped.withColumn(
        "rank", dense_rank().over(Window.partitionBy("week").orderBy(desc("n"))))
    top_five_customers = grouped_ranked.where(col("rank") < 6).orderBy(
        asc("week"), asc("rank"))
    return top_five_customers
def get_df_change_level_init(df_student_level):
    df_student_level_new = df_student_level.select(
        'contact_id',
        df_student_level.level_current.alias('level'),
        f.lit(MIN_DATE).alias('time_level_created'))
    df_student_level_new = df_student_level_new.orderBy(
        f.asc('contact_id'), f.asc('time_level_created'))
    df_student_level_first = df_student_level_new.groupBy('contact_id').agg(
        f.first('level').alias('level'),
        f.first('time_level_created').alias('time_level_created'))
    return df_student_level_first
def get_df_change_advisor_init(df_student_advisor):
    df_student_advisor_new = df_student_advisor.select(
        'contact_id', df_student_advisor.advisor_id_old.alias('advisor_id'))
    df_student_advisor_new = df_student_advisor_new.orderBy(
        f.asc('contact_id'), f.asc('created_at'))
    df_student_advisor_first = df_student_advisor_new.groupBy('contact_id').agg(
        f.first('advisor_id').alias('advisor_id'),
        f.lit(MIN_DATE).alias('created_at'))
    df_student_advisor_first = df_student_advisor_first \
        .filter(df_student_advisor_first.advisor_id.isNotNull())
    return df_student_advisor_first
def run_whole_dataset():
    """ Run analyses over the entire dataset """
    import pyspark.sql.functions as sqlf
    from datetime import datetime as dt

    stations = mkstations('data/stations.csv')

    # Hottest and coldest day and corresponding weather stations in the
    # entire dataset
    print("\nEntire dataset (2000-2016)\n==========================\n")
    print(' * Loading all datasets into a single DataFrame...')
    df = mkdf('data/20??.csv')

    print(' * Computing coldest station for entire dataset...\n')
    coldest = df.filter(df.meas == 'TMIN').groupBy('sta', 'date').min('degc') \
        .sort(sqlf.asc('min(degc)')).first()
    date = dt.strptime(coldest.date, '%Y%m%d').strftime('%d %b %Y')
    city = getcity(stations, coldest.sta)
    print('Coldest station was %s (%s) on %s: %0.1f deg C' %
          (coldest.sta, city, date, float(coldest['min(degc)']) / 10.0))

    # and now the hottest
    print('\n * Computing hottest station for entire dataset...\n')
    hottest = df.filter(df.meas == 'TMAX').groupBy('sta', 'date').max('degc') \
        .sort(sqlf.desc('max(degc)')).first()
    date = dt.strptime(hottest.date, '%Y%m%d').strftime('%d %b %Y')
    city = getcity(stations, hottest.sta)
    print('Hottest station was %s (%s) on %s: %0.1f deg C' %
          (hottest.sta, city, date, float(hottest['max(degc)']) / 10.0))
def passed_temperature_analyse(filename):
    print("begin to analyse passed temperature")
    spark = SparkSession.builder.master("local").appName("passed_temperature_analyse").getOrCreate()
    df = spark.read.csv(filename, header=True)
    df_temperature = df.select(  # keep only the columns we need
        df['province'],
        df['city_name'],
        df['city_code'],
        df['temperature'].cast(DecimalType(scale=1)),
        F.date_format(df['time'], "yyyy-MM-dd").alias("date"),  # extract the date
        F.hour(df['time']).alias("hour")  # extract the hour
    )
    # keep only the four daily observation hours
    df_4point_temperature = df_temperature.filter(df_temperature['hour'].isin([2, 8, 12, 20]))
    # df_4point_temperature.printSchema()
    df_avg_temperature = df_4point_temperature.groupBy("province", "city_name", "city_code", "date") \
        .agg(F.count("temperature"), F.avg("temperature").alias("avg_temperature")) \
        .filter("count(temperature) = 4") \
        .sort(F.asc("avg_temperature")) \
        .select("province", "city_name", "city_code", "date",
                F.format_number('avg_temperature', 1).alias("avg_temperature"))
    df_avg_temperature.cache()
    avg_temperature_list = df_avg_temperature.collect()
    df_avg_temperature.coalesce(1).write.json(
        "file:///F:/Code_All/Jupyter_Code/spark_test/result_data/bigData/passed_rain_temperature.json")
    print("end analysing passed temperature")
    return avg_temperature_list[0:10]
def test_sorting_functions_with_column(self):
    from pyspark.sql import functions
    from pyspark.sql.column import Column

    funs = [
        functions.asc_nulls_first, functions.asc_nulls_last,
        functions.desc_nulls_first, functions.desc_nulls_last
    ]
    exprs = [col("x"), "x"]

    for fun in funs:
        for expr in exprs:
            res = fun(expr)
            self.assertIsInstance(res, Column)
            self.assertIn(
                f"""'x {fun.__name__.replace("_", " ").upper()}'""", str(res))

    for expr in exprs:
        res = functions.asc(expr)
        self.assertIsInstance(res, Column)
        self.assertIn("""'x ASC NULLS FIRST'""", str(res))

    for expr in exprs:
        res = functions.desc(expr)
        self.assertIsInstance(res, Column)
        self.assertIn("""'x DESC NULLS LAST'""", str(res))
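# As the assertions above show, asc() defaults to NULLS FIRST and desc() to NULLS LAST.
# A small, hypothetical illustration of the difference (assumes an active SparkSession
# named `spark` and `from pyspark.sql import functions`):
nulls_df = spark.createDataFrame([(2,), (None,), (1,)], ["x"])
nulls_df.orderBy(functions.asc("x")).show()             # rows ordered: null, 1, 2
nulls_df.orderBy(functions.asc_nulls_last("x")).show()  # rows ordered: 1, 2, null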
def analyse_entire_dataset():
    """ Analyse the entire dataset (2000-2019) """
    import pyspark.sql.functions as sqlfunc
    # from datetime import datetime as dt

    # Hottest and coldest day and corresponding weather stations in the entire dataset
    # Loading all datasets into a single DataFrame.
    print("\n-------------------------\n")
    df = mkdataframe('/user/tatavag/weather/20??.csv')

    # Coldest station
    coldest = df.filter(df.minormax == 'TMIN').groupBy('station', 'date').min('degrees') \
        .sort(sqlfunc.asc('min(degrees)')).first()
    # date = dt.strptime(coldest.date, '%Y%m%d').strftime('%d %b %Y')
    print('Coldest station was %s on %s: %f' %
          (coldest.station, coldest.date, float(coldest['min(degrees)'])))

    # Hottest station
    hottest = df.filter(df.minormax == 'TMAX').groupBy('station', 'date').max('degrees') \
        .sort(sqlfunc.desc('max(degrees)')).first()
    # date = dt.strptime(hottest.date, '%Y%m%d').strftime('%d %b %Y')
    print('Hottest station was %s on %s: %f' %
          (hottest.station, hottest.date, float(hottest['max(degrees)'])))

    # Median TMIN
    TMINmed = df.filter(df.minormax == 'TMIN').approxQuantile('degrees', [0.5], 0.25)
    print('Median TMIN for the entire dataset: %f' % (TMINmed[0]))

    # Median TMAX
    TMAXmed = df.filter(df.minormax == 'TMAX').approxQuantile('degrees', [0.5], 0.25)
    print('Median TMAX for the entire dataset: %f' % (TMAXmed[0]))
def main(inputs, output):
    # main logic starts here
    wiki_schema = types.StructType([
        types.StructField('language', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('views', types.IntegerType()),
        types.StructField('size', types.LongType()),
    ])

    # reading data
    wikiData = spark.read.csv(inputs, schema=wiki_schema, sep=" ").withColumn(
        'hour', path_to_hour(functions.input_file_name()))

    # filtering data
    filteredWikiData = wikiData[(wikiData['language'] == 'en') &
                                (wikiData['title'] != 'Main_Page') &
                                (wikiData['title'] != 'Special:Page')].cache()

    # finding max views per hour
    maxCount = filteredWikiData.groupBy('hour').agg(
        functions.max(filteredWikiData['views']).alias('max'))

    # joining data to obtain hour and title
    joinData = filteredWikiData.join(
        maxCount, filteredWikiData.views == maxCount.max).select(
            filteredWikiData["hour"], filteredWikiData["title"],
            filteredWikiData["views"])

    # sorting data based on hour and storing it as JSON
    joinData.sort(functions.asc('hour')).write.json(output, mode='overwrite')
def with_row_number(output_col: str, order_by: list, df: DataFrame,
                    sort="asc", zero_indexed=True) -> DataFrame:
    """Assign a sequential row number to each member of a dataframe"""
    is_desc = sort.lower() in ["desc", "descending"]

    if isinstance(order_by, (str, Column)):
        order_by = [order_by]
    elif not isinstance(order_by, list):
        msg = "Ordering criteria must be a column name, a Column, or a list of them"
        raise Exception(msg)

    # create a window function depending on the sort order
    if is_desc:
        window = Window.orderBy(*[F.desc(i) for i in order_by])
    else:
        window = Window.orderBy(*[F.asc(i) for i in order_by])

    # if the client wants to start from row 1 then that's fine
    if not zero_indexed:
        return df.withColumn(output_col, F.row_number().over(window))

    # otherwise start from row number 0
    return df.withColumn(output_col, F.row_number().over(window) - 1)
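# A minimal, hypothetical usage of with_row_number above (assumes an active SparkSession
# `spark` and the imports the function relies on: pyspark.sql.functions as F, Window,
# Column and DataFrame from pyspark.sql):
people = spark.createDataFrame([("b", 2), ("a", 1), ("c", 3)], ["letter", "num"])
numbered = with_row_number("row_idx", ["letter"], people)       # 0-based, ascending by letter
ranked = with_row_number("pos", "num", people, sort="desc",
                         zero_indexed=False)                    # 1-based, descending by num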
def hist(columns, min_value, max_value, buckets=10):
    """
    Get the histogram column in json format
    :param columns: Columns to be processed
    :param min_value: Min value used to calculate the buckets
    :param max_value: Max value used to calculate the buckets
    :param buckets: Number of buckets
    :return:
    """
    columns = parse_columns(self, columns)

    for col_name in columns:
        # Create splits
        splits = create_buckets(min_value, max_value, buckets)

        # Create buckets in the dataFrame
        df = bucketizer(self, col_name, splits=splits)

        counts = (df.groupBy(col_name + "_buckets").agg(
            F.count(col_name + "_buckets").alias("count")).cols.rename(
                col_name + "_buckets", "value").sort(F.asc("value")).to_json())

        hist = []
        for x, y in zip(counts, splits):
            # if x["value"] is not None and x["count"] != 0:
            hist.append({
                "lower": y["lower"],
                "upper": y["upper"],
                "count": x["count"]
            })

    return hist
def main(in_dir, out_dir):
    # data = spark.read.text(data1)
    # data = data.filter(data['value'] != '')
    # data.show()
    # wordbreak = r'[%s\s]+' % (re.escape(string.punctuation),)
    # data = data.withColumn('words', functions.explode(functions.split(functions.col('value'), wordbreak)))
    # data = data.withColumn('words', functions.lower(data['words']))
    # data = data.filter(data['words'] != '')
    # data = data.groupBy('words').agg(functions.count(data['words']))
    # data = data.sort(functions.col('words').asc())
    # data = data.sort(functions.col('count(words)').desc())
    # # data = data[data['words'] != '']
    # data.write.csv(data2, mode='overwrite')

    data = spark.read.text(in_dir)

    wordbreak = r'[%s\s]+' % (re.escape(string.punctuation), )
    data = data.withColumn(
        'words', functions.explode(functions.split('value', wordbreak)))
    data = data.withColumn('words', functions.lower(data['words']))
    data = data.filter(data['words'] != '')
    data = data.select('words')
    data = data.groupBy('words').agg(
        functions.count(data['words']).alias('count'))
    # sort by count (descending), breaking ties by word (ascending); a single sort is
    # needed because a second sort() would simply replace the first one
    data = data.sort(desc('count'), asc('words'))
    data.write.csv(out_dir, mode='overwrite')
def main(inputs, output):
    # main logic starts here
    comments_schema = types.StructType([  # commented-out fields won't be read
        types.StructField('archived', types.BooleanType(), True),
        types.StructField('author', types.StringType(), True),
        types.StructField('author_flair_css_class', types.StringType(), True),
        types.StructField('author_flair_text', types.StringType(), True),
        types.StructField('body', types.StringType(), True),
        types.StructField('controversiality', types.LongType(), True),
        types.StructField('created_utc', types.StringType(), True),
        types.StructField('distinguished', types.StringType(), True),
        types.StructField('downs', types.LongType(), True),
        types.StructField('edited', types.StringType(), True),
        types.StructField('gilded', types.LongType(), True),
        types.StructField('id', types.StringType(), True),
        types.StructField('link_id', types.StringType(), True),
        types.StructField('name', types.StringType(), True),
        types.StructField('parent_id', types.StringType(), True),
        types.StructField('retrieved_on', types.LongType(), True),
        types.StructField('score', types.LongType(), True),
        types.StructField('score_hidden', types.BooleanType(), True),
        types.StructField('subreddit', types.StringType(), True),
        types.StructField('subreddit_id', types.StringType(), True),
        types.StructField('ups', types.LongType(), True),
        # types.StructField('year', types.IntegerType(), False),
        # types.StructField('month', types.IntegerType(), False),
    ])

    comments = spark.read.json(inputs, schema=comments_schema)
    find_avg = comments.groupBy((comments.subreddit).alias("Subreddit")).agg(
        avg(comments.score).alias("Average"))
    averages = find_avg.orderBy(asc("Subreddit")).coalesce(1)
    averages.write.csv(output, mode='overwrite')
def occCalc(self, channelID, testing=False):
    """
    Calculates occupancy for the user defined month
    """
    if not isinstance(channelID, list):
        raise TypeError('ChannelID is required to be a list')

    conf = SparkConf() \
        .setAppName("Occupancy Calc") \
        .set("spark.master", "local[*]") \
        .set("spark.driver.maxResultSize", "15G")
    sc = SparkContext(conf=conf)
    sql = SQLContext(sc)

    path = 'AZURE PATH' + self.month + '/*/*/' + self.sensor + '*'
    data = sql.read.parquet(path)

    timeCount = data.select('scan_time').distinct().count()
    timeCount = sc.broadcast(timeCount)

    subData = data.select('scan_time', 'channel_id', 'power_dbm').filter(
        data.channel_id.isin(channelID))
    subData = subData.groupBy('channel_id').agg(
        (count(column('power_dbm')) / timeCount.value).alias('freq'),
        stddev(column('power_dbm')).alias('sd')).sort(
            asc('freq'), desc('sd'))

    if testing:
        subData.toPandas().to_csv('C:/path/freq.csv', sep='\t')
        sc.stop()
    else:
        # collect to pandas before stopping the context; toPandas() needs a live SparkContext
        result = subData.toPandas()
        sc.stop()
        return result
def main():
    logs = read_data()
    df = create_dataframe(logs)
    answer = ''

    # 1. Total number of unique hosts
    unique_hosts = df.select('host').drop_duplicates().count()
    answer += '1. Número total de hosts únicos\n'
    answer += 'Answer: {0} hosts únicos\n'.format(unique_hosts)

    # 2. Total number of 404 errors
    total_404_errors = df.where(df.status_code == 404).count()
    answer += '2. O total de erros 404\n'
    answer += 'Answer: {0} erros\n'.format(total_404_errors)

    # 3. The 5 URLs that caused the most 404 errors
    urls_with_most_404_errors = df.where(df.status_code == 404)\
        .groupBy('host')\
        .agg(F.count('status_code').alias('count_errors_404'))\
        .orderBy(F.desc('count_errors_404'))\
        .limit(5)\
        .select('host')\
        .collect()
    urls_with_most_404_errors = [row['host'] for row in urls_with_most_404_errors]
    answer += '3. As 5 URLs que mais causaram erro 404\n'
    answer += 'Answer: {0}\n'.format(', '.join(urls_with_most_404_errors))

    # 4. Number of 404 errors per day
    errors_per_day = df.where(df.status_code == 404)\
        .groupBy(F.dayofmonth('timestamp').alias('dia'))\
        .agg(F.count('status_code').alias('count_errors_404'))\
        .orderBy(F.asc('dia'))\
        .collect()
    errors_per_day = ['dia: {0}: {1} erros 404'.format(row['dia'], row['count_errors_404'])
                      for row in errors_per_day]
    answer += '4. Quantidade de erros 404 por dia\n'
    answer += 'Answer: {0}\n'.format('\n'.join(errors_per_day))

    # 5. Total number of bytes returned
    total_bytes = df.select(F.sum(df.total_bytes).alias('total_bytes')).collect()
    total_bytes = total_bytes[0]['total_bytes']
    answer += '5. O total de bytes retornados\n'
    answer += 'Answer: {0} bytes'.format(total_bytes)

    export_answer(answer)
def countUserRegTime(accountdf, df):
    """Count how long users had been registered as of a given day.

    From all registered accounts, keep the users who were active on that day,
    compute the difference between the two dates, then count the number of users
    for each date difference and sort by it.
    """
    return df.join(accountdf, df.uid == accountdf.uid, 'inner').select(
        F.datediff(df.day, accountdf.regtime).alias('daydiff')).groupBy(
            'daydiff').count().sort(F.asc('daydiff'))
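# A minimal, hypothetical invocation of countUserRegTime (assumes an active SparkSession
# `spark` and `import pyspark.sql.functions as F`; column names follow the function body):
accounts = spark.createDataFrame([(1, "2024-01-01")], ["uid", "regtime"])
activity = spark.createDataFrame([(1, "2024-01-10")], ["uid", "day"])
countUserRegTime(accounts, activity).show()  # one row: daydiff = 9, count = 1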
def counting_of_404_by_day():
    result = logs.filter("http_code like '%404%'")
    result = result.withColumn("date", F.regexp_extract("timestamp", date_regex, 1))
    result = result.groupby(["date", "http_code"]).count()
    result = result.sort(F.asc("date"))
    for row in result.collect():
        print("DATE: {} - ERRORS: {}".format(row["date"], row["count"]))
def main(society_1, society_2, society_3, society_4, society_5, tag):
    df_society_1 = spark.read.format("csv").option("header", "true").load(society_1 + '_tags.csv')
    df_society_1.show()
    df_society_2 = spark.read.format("csv").option("header", "true").load(society_2 + '_tags.csv')
    df_society_2.show()
    df_society_3 = spark.read.format("csv").option("header", "true").load(society_3 + '_tags.csv')
    df_society_3.show()
    df_society_4 = spark.read.format("csv").option("header", "true").load(society_4 + '_tags.csv')
    df_society_4.show()
    df_society_5 = spark.read.format("csv").option("header", "true").load(society_5 + '_tags.csv')
    df_society_5.show()

    df_result_1 = relation(society_1, society_2, tag, df_society_1, df_society_2)
    df_result_2 = relation(society_1, society_3, tag, df_society_1, df_society_3)
    df_result_3 = relation(society_1, society_4, tag, df_society_1, df_society_4)
    df_result_4 = relation(society_1, society_5, tag, df_society_1, df_society_5)

    # df_result_1 = spark.read.format("csv").option("header", "true").load(society_2 + '_' + tag + '.csv').sort(functions.desc(str(society_2 + "_count")))
    # df_result_1.show()
    # df_result_2 = spark.read.format("csv").option("header", "true").load(society_3 + '_' + tag + '.csv').sort(functions.desc(str(society_3 + "_count")))
    # df_result_3 = spark.read.format("csv").option("header", "true").load(society_4 + '_' + tag + '.csv').sort(functions.desc(str(society_4 + "_count")))
    # df_result_4 = spark.read.format("csv").option("header", "true").load(society_5 + '_' + tag + '.csv').sort(functions.desc(str(society_5 + "_count")))

    window = Window.orderBy(functions.col(str(society_2 + "_count")).desc())
    df_result_1 = df_result_1.withColumn('id', functions.row_number().over(window))
    df_result_1.show()
    window = Window.orderBy(functions.col(str(society_3 + "_count")).desc())
    df_result_2 = df_result_2.withColumn('id', functions.row_number().over(window))
    df_result_2.show()
    window = Window.orderBy(functions.col(str(society_4 + "_count")).desc())
    df_result_3 = df_result_3.withColumn('id', functions.row_number().over(window))
    df_result_3.show()
    window = Window.orderBy(functions.col(str(society_5 + "_count")).desc())
    df_result_4 = df_result_4.withColumn('id', functions.row_number().over(window))
    df_result_4.show()

    df_join = df_result_1.join(df_result_2, on=['id'], how='outer').sort(functions.asc("id"))
    df_join.show()
    df_join = df_join.join(df_result_3, on=['id'], how='outer').sort(functions.asc("id"))
    df_join.show()
    df_join = df_join.join(df_result_4, on=['id'], how='outer').sort(functions.asc("id"))
    df_join.show()

    df_join.write.csv('tag_' + tag + '.csv', header='true')
def hist(columns, min_value, max_value, buckets=10):
    """
    Get the histogram column in json format
    :param columns: Columns to be processed
    :param min_value: Min value used to calculate the buckets
    :param max_value: Max value used to calculate the buckets
    :param buckets: Number of buckets
    :return:
    """
    columns = parse_columns(self, columns)

    for col_name in columns:
        # Create splits
        splits = create_buckets(min_value, max_value, buckets)

        # Create buckets in the dataFrame
        df = bucketizer(self, col_name, splits=splits)

        col_bucket = col_name + "_buckets"

        counts = (df.h_repartition(
            col_name=col_bucket).groupBy(col_bucket).agg(
                F.count(col_bucket).alias("count")).cols.rename(
                    col_bucket, "value").sort(F.asc("value")).to_json())

        # Fill the gaps in dict values. For example if we have 1,5,7,8,9 we get 1,2,3,4,5,6,7,8,9
        new_array = []
        for i in builtins.range(buckets):
            flag = False
            for c in counts:
                value = c["value"]
                count = c["count"]
                if value == i:
                    new_array.append({"value": value, "count": count})
                    flag = True
            if flag is False:
                new_array.append({"value": i, "count": 0})
        counts = new_array

        hist_data = []
        for i in list(itertools.zip_longest(counts, splits)):
            if i[0] is None:
                hist_data.append({
                    "count": 0,
                    "lower": i[1]["lower"],
                    "upper": i[1]["upper"]
                })
            elif "count" in i[0]:
                hist_data.append({
                    "count": i[0]["count"],
                    "lower": i[1]["lower"],
                    "upper": i[1]["upper"]
                })

    return hist_data
def Predict(i, df1, df2, timeSeriesCol, predictionCol, joinCol):
    # this converts differenced predictions to raw predictions
    dZCol = 'DeltaZ' + str(i)
    f_strCol = 'forecast_' + str(i) + 'day'
    df = df1.join(df2, [joinCol], how="inner") \
        .orderBy(asc("Date"))
    df = df.withColumnRenamed(predictionCol, dZCol)
    df = df.withColumn(f_strCol, col(dZCol) + col(timeSeriesCol))
    return df
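# A minimal, hypothetical example of Predict (assumes an active SparkSession `spark` and
# `from pyspark.sql.functions import asc, col`): the differenced prediction is added back
# to the current value of the series to recover the raw forecast.
preds = spark.createDataFrame([("2024-01-02", 0.5)], ["Date", "prediction"])
prices = spark.createDataFrame([("2024-01-02", 10.0)], ["Date", "close"])
Predict(1, preds, prices, "close", "prediction", "Date").show()  # forecast_1day = 10.5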
def writeToS3(dataFrameFinal, targetDate, destinationPath):
    dataFrameFinal \
        .withColumn("event_date", lit(targetDate).cast("date")) \
        .sort(asc("htl_city_code")) \
        .write \
        .partitionBy("browsing_date", "meta_fnnl_step") \
        .option("mapreduce.fileoutputcommitter.algorithm.version", "2") \
        .mode('append') \
        .parquet(destinationPath)
def get_most_popular_hashtag_by_time():
    # parse the raw timestamp and derive a time-of-day column
    df_with_time = df.withColumn(
        "date",
        to_timestamp(unix_timestamp('date', "EEE MMM dd HH:mm:ss +0000 yyyy").cast("timestamp"))) \
        .withColumn('time', date_format('date', "HH:mm:ss"))

    count_hashtags = df.groupBy('place', 'hashtag').agg(
        functions.count('hashtag').alias('hashtag_count'))
    most_popular_hashtag = count_hashtags.groupBy('place').agg(
        functions.max('hashtag_count').alias('max'))

    count_hashtags.join(
        most_popular_hashtag,
        ((count_hashtags.hashtag_count == most_popular_hashtag.max) &
         (count_hashtags.place == most_popular_hashtag.place))) \
        .select(count_hashtags.place, count_hashtags.hashtag) \
        .orderBy('max', ascending=False).show(10)

    w = Window.partitionBy("place", "hashtag", "date", "hour")
    per_hour_frequency = most_popular_hashtag.withColumn("date", to_date("created_at")) \
        .withColumn("tag_count", functions.count('id').over(w)) \
        .select('place', 'date', 'hour', 'hashtag', 'tag_count') \
        .distinct() \
        .sort(functions.asc('place'), functions.asc('hashtag'), functions.asc('date'),
              functions.asc('hour'), functions.asc('tag_count'))
def run(years=available_years):
    """ Run analyses on individual years in sequence """
    import pyspark.sql.functions as sqlf

    stations = mkstations('data/stations.csv')

    # allow passing a single year or a list of them
    if not isinstance(years, list):
        years = [years]

    for year in years:
        if year not in available_years:
            raise RuntimeError('Sorry, %s is not available in the dataset.' % year)

        df = mkdf('data/%s.csv' % year)
        print("\n%s\n====\n" % year)

        # Average minimum temperature
        r = df.filter(df.meas == 'TMIN').groupBy().avg('degc').first()
        print('Avg min temp = %0.1f deg C' % (r['avg(degc)'] / 10.0))

        # Average maximum temperature
        r = df.filter(df.meas == 'TMAX').groupBy().avg('degc').first()
        print('Avg max temp = %0.1f deg C' % (r['avg(degc)'] / 10.0))

        # Five hottest stations (on average)
        fivehot = df.filter(df.meas == 'TMAX') \
            .groupBy(df.sta) \
            .agg(sqlf.avg('degc')) \
            .sort(sqlf.desc('avg(degc)')) \
            .limit(5).collect()
        print()
        i = 1
        for s in fivehot:
            t = float(s['avg(degc)']) / 10.0
            print('Hottest station #%s: %s (%s) - %0.1f deg C' %
                  (i, s.sta, getcity(stations, s.sta), t))
            i = i + 1

        # Five coldest stations (on average)
        fivecold = df.filter(df.meas == 'TMIN') \
            .groupBy(df.sta) \
            .agg(sqlf.avg('degc')) \
            .sort(sqlf.asc('avg(degc)')) \
            .limit(5).collect()
        print()
        i = 1
        for s in fivecold:
            t = float(s['avg(degc)']) / 10.0
            print('Coldest station #%s: %s (%s) - %0.1f deg C' %
                  (i, s.sta, getcity(stations, s.sta), t))
            i = i + 1
def get_history_product(self, old_dataframe: DataFrame, new_dataframe: DataFrame):
    inserted = self.__join_safe_null(new_dataframe, old_dataframe, how='anti') \
        .withColumn('meta', lit('inserted')).withColumn('priority', lit(1))
    deleted = self.__join_safe_null(old_dataframe, new_dataframe, how='anti') \
        .withColumn('meta', lit('deleted')).withColumn('priority', lit(0))
    not_changed = self.__join_safe_null(new_dataframe, old_dataframe, how='semi',
                                        keys=['id', 'name', 'score']) \
        .withColumn('meta', lit('not_changed')).withColumn('priority', lit(1))
    pre_changed = self.__join_safe_null(new_dataframe, inserted, how='anti')
    changed = self.__join_safe_null(pre_changed, not_changed, how='anti') \
        .withColumn('meta', lit('changed')).withColumn('priority', lit(1))
    return inserted.union(deleted).union(not_changed).union(changed) \
        .sort(asc('id'), asc('priority')) \
        .drop('priority')
def calcula_promedio(tabla_referencia, tabla_datos):
    '''Compute the average salary, grouped by year.'''
    # query the database and read the result with pandas
    query = 'SELECT ' + tabla_referencia + '.' + col_join + ',' + tabla_datos + '.salary,' + tabla_datos + '.yearID FROM ' + \
        tabla_referencia + ' INNER JOIN ' + tabla_datos + ' ON ' + tabla_referencia + '.' + col_join + '=' + tabla_datos + '.' + col_join
    salarios = pd.read_sql(query, mydb)

    # create the Spark DataFrame from the SQL context
    data_frame = sqlContext.createDataFrame(salarios)

    # compute the average salary, group by year and sort
    media = data_frame.distinct().groupBy('yearID').mean('salary')
    media = media.sort(asc("yearID"))
    return media
def recommend_n_movies_for_users(self, n, users, implicit=False):
    model = self.model_implicit if implicit else self.model_explicit
    users = self.ratings.where(self.ratings.userId.isin(users)).distinct()
    subset = model.recommendForUserSubset(users, n)
    formatted_subset = subset.withColumn('recs_exp', explode('recommendations')) \
        .select('userId', col('recs_exp.movieId'), col('recs_exp.rating').alias('rating')) \
        .join(self.movies, 'movieId') \
        .select('userId', 'title', 'rating') \
        .orderBy(asc('userId'), desc('rating')) \
        .select('userId', 'title')
    return formatted_subset
def read(self, iDF):
    plain_df_idx = iDF.rdd \
        .zipWithIndex().toDF(["row", "idx"]) \
        .orderBy(asc("idx")) \
        .coalesce(10)

    Windowspec = Window.orderBy("idx")

    # a FASTQ record spans four consecutive lines: sequence id, sequence, '+', quality
    oDF = plain_df_idx \
        .withColumn("seqID", F.lead("row", 0).over(Windowspec)) \
        .withColumn("seq", F.lead("row", 1).over(Windowspec)) \
        .withColumn("+", F.lead("row", 2).over(Windowspec)) \
        .withColumn("quality", F.lead("row", 3).over(Windowspec))

    # keep only the rows that start a record
    parsedDF = oDF.filter(F.col("idx") % 4 == 0).select(
        "seqID", "seq", "+", "quality")
    return parsedDF
def _sort_by(ds_table, sortby):
    """ Sort-by clause: parses a sort-by clause and applies it over the dataset """
    if sortby:
        sortBy_columns = []
        for sb in sortby:
            [(k, v)] = sb.items()
            if v == 'desc':
                sortBy_columns.append(func.desc(k))
            else:
                sortBy_columns.append(func.asc(k))
        ds_table = ds_table.sort(*sortBy_columns)
    return ds_table
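# A hypothetical illustration of the sort-by clause format _sort_by expects: a list of
# single-entry dicts mapping a column name to 'asc' or 'desc' (assumes an active
# SparkSession `spark` and `from pyspark.sql import functions as func`):
countries = spark.createDataFrame([("FR", 68), ("DE", 84), ("ES", 48)],
                                  ["country", "population"])
_sort_by(countries, [{"population": "desc"}, {"country": "asc"}]).show()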
def compute_avg_temperature():
    df_temperature = df.select(
        df["province"], df["city_name"], df["city_code"],
        df["temperature"].cast(DecimalType(scale=2)),
        F.date_format(df["time"], "yyyy-MM-dd").alias("date"),
        F.hour(df["time"]).alias("hour"))

    # keep only the four daily observation hours
    df_4_point_temperature = df_temperature.filter(df_temperature["hour"].isin(2, 8, 12, 20))

    df_avg_temperature = df_4_point_temperature.groupby("province", "city_name", "city_code", "date") \
        .agg(F.count("temperature"), F.avg("temperature").alias("avg_temperature")) \
        .filter("count(temperature)=4") \
        .sort(F.asc("avg_temperature")) \
        .select("province", "city_name", "city_code", "date",
                F.format_number('avg_temperature', 2).alias("avg_temperature"))
    df_avg_temperature.show()
def _discrete_read_data(
    self, custom_reward_expression=None, gamma=None, multi_steps=None
):
    ts = TableSpec(table_name=self.table_name)
    dataset: Dataset = query_data(
        input_table_spec=ts,
        discrete_action=True,
        actions=["L", "R", "U", "D"],
        custom_reward_expression=custom_reward_expression,
        multi_steps=multi_steps,
        gamma=gamma,
    )
    df = self.sqlCtx.read.parquet(dataset.parquet_url)
    df = df.orderBy(asc("sequence_number"))
    logger.info("Read parquet dataframe: ")
    df.show()
    return df
def runOtherFunctions(spark, personDf):
    df = spark.createDataFrame([("v1", "v2", "v3")], ["c1", "c2", "c3"])

    # array
    df.select(df.c1, df.c2, df.c3, array("c1", "c2", "c3").alias("newCol")).show(truncate=False)

    # desc, asc
    personDf.show()
    personDf.sort(functions.desc("age"), functions.asc("name")).show()

    # PySpark 2.1.0 does not support desc_nulls_first, desc_nulls_last,
    # asc_nulls_first or asc_nulls_last

    # split, length (in PySpark a column can be referenced as df["col"] or df.col)
    df2 = spark.createDataFrame([("Splits str around pattern",)], ['value'])
    df2.select(df2.value, split(df2.value, " "), length(df2.value)).show(truncate=False)

    # rownum, rank
    f1 = StructField("date", StringType(), True)
    f2 = StructField("product", StringType(), True)
    f3 = StructField("amount", IntegerType(), True)
    schema = StructType([f1, f2, f3])

    p1 = ("2017-12-25 12:01:00", "note", 1000)
    p2 = ("2017-12-25 12:01:10", "pencil", 3500)
    p3 = ("2017-12-25 12:03:20", "pencil", 23000)
    p4 = ("2017-12-25 12:05:00", "note", 1500)
    p5 = ("2017-12-25 12:05:07", "note", 2000)
    p6 = ("2017-12-25 12:06:25", "note", 1000)
    p7 = ("2017-12-25 12:08:00", "pencil", 500)
    p8 = ("2017-12-25 12:09:45", "note", 30000)

    dd = spark.createDataFrame([p1, p2, p3, p4, p5, p6, p7, p8], schema)
    w1 = Window.partitionBy("product").orderBy("amount")
    w2 = Window.orderBy("amount")
    dd.select(dd.product, dd.amount,
              functions.row_number().over(w1).alias("rownum"),
              functions.rank().over(w2).alias("rank")).show()
# Process Data using pyspark.sql
# Set the Hadoop configuration.

# In[8]:

# Python expressions in a code cell will be outputted after computation
expenditures_df.printSchema()

# In[9]:

# Sorting the data using Spark SQL
from pyspark.sql.functions import desc, asc

factor = expenditures_df.sort(desc('(% OF GDP)')).limit(10).toPandas()
factor_re = expenditures_df.sort(asc('(% OF GDP)')).limit(10).toPandas()

# In[10]:

print(factor)

# In[11]:

life = life_expectancy_df.sort(desc('(YEARS)')).limit(10).toPandas()
life_re = life_expectancy_df.sort(asc('(YEARS)')).limit(10).toPandas()

# In[12]:
#!/usr/bin/python
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.functions import asc, desc

if __name__ == "__main__":
    sc = SparkContext(appName='resort data')
    sqlContext = SQLContext(sc)
    df = sqlContext.read.load('hdfs://discovery3:9000/tmp/dasmith/c19-20160919-a50-o08/pretty.parquet')
    # df = sqlContext.read.load('hdfs://discovery3:9000/tmp/dasmith/c19-20160402-a50-o08/out.parquet')
    df.registerTempTable("newspaper")
    df2 = sqlContext.sql(
        "select series, date, count(*) as cnt from newspaper group by series, date order by cnt desc")
    df3 = df.join(df2, ['series', 'date'])
    df3.sort(desc("cnt"), asc("begin"), asc("end")) \
        .write.json('/gss_gpfs_scratch/xu.shao/network/resorted-pretty.json')
# COMMAND ----------

# MAGIC %md **Use ``filter()`` to return only the rows that match the given predicate.**

# COMMAND ----------

from pyspark.sql.functions import col, asc

filterDF = explodeDF.filter(col("firstName") == "chris").sort(col("lastName"))
display(filterDF)

# COMMAND ----------

from pyspark.sql.functions import col, asc

filterDF = explodeDF.filter((col("firstName") == "chris") | (col("firstName") == "michael")).sort(asc("lastName"))
display(filterDF)

# COMMAND ----------

# MAGIC %md
# MAGIC **The ``where()`` clause is equivalent to ``filter()``.**

# COMMAND ----------

whereDF = explodeDF.where((col("firstName") == "chris") | (col("firstName") == "michael")).sort(asc("lastName"))
display(whereDF)

# COMMAND ----------

# MAGIC %md
def _calculate_rate(instance_usage_df):
    instance_usage_data_json_list = []

    try:
        sorted_oldest_ascending_df = instance_usage_df.sort(
            functions.asc("processing_meta.oldest_timestamp_string"))
        sorted_latest_descending_df = instance_usage_df.sort(
            functions.desc("processing_meta.latest_timestamp_string"))

        # Calculate the rate change by percentage
        oldest_dict = sorted_oldest_ascending_df.collect()[0].asDict()
        oldest_quantity = float(oldest_dict["processing_meta"]["oldest_quantity"])

        latest_dict = sorted_latest_descending_df.collect()[0].asDict()
        latest_quantity = float(latest_dict["processing_meta"]["latest_quantity"])

        rate_percentage = 100 * ((oldest_quantity - latest_quantity) / oldest_quantity)

        # get any extra data
        extra_data_map = getattr(sorted_oldest_ascending_df.collect()[0],
                                 "extra_data_map", {})
    except Exception as e:
        raise PreHourlyCalculateRateException(
            "Exception occurred in pre-hourly rate calculation. Error: %s" % str(e))

    # create a new instance usage dict
    instance_usage_dict = {
        "tenant_id": latest_dict.get("tenant_id", "all"),
        "user_id": latest_dict.get("user_id", "all"),
        "resource_uuid": latest_dict.get("resource_uuid", "all"),
        "geolocation": latest_dict.get("geolocation", "all"),
        "region": latest_dict.get("region", "all"),
        "zone": latest_dict.get("zone", "all"),
        "host": latest_dict.get("host", "all"),
        "project_id": latest_dict.get("project_id", "all"),
        "aggregated_metric_name": latest_dict["aggregated_metric_name"],
        "quantity": rate_percentage,
        "firstrecord_timestamp_unix": oldest_dict["firstrecord_timestamp_unix"],
        "firstrecord_timestamp_string": oldest_dict["firstrecord_timestamp_string"],
        "lastrecord_timestamp_unix": latest_dict["lastrecord_timestamp_unix"],
        "lastrecord_timestamp_string": latest_dict["lastrecord_timestamp_string"],
        "record_count": oldest_dict["record_count"] + latest_dict["record_count"],
        "usage_date": latest_dict["usage_date"],
        "usage_hour": latest_dict["usage_hour"],
        "usage_minute": latest_dict["usage_minute"],
        "aggregation_period": latest_dict["aggregation_period"],
        "extra_data_map": extra_data_map
    }
    instance_usage_data_json = json.dumps(instance_usage_dict)
    instance_usage_data_json_list.append(instance_usage_data_json)

    # convert to rdd
    spark_context = instance_usage_df.rdd.context
    return spark_context.parallelize(instance_usage_data_json_list)
# featuresOut = df.select(df.command, df.date, df.exec_as, df.source, df.srcip, df.username, df.features)

# Create a DF with training data
# kmtraindata = featuresOut.sample(False, 0.5, 42)

# Create KM model and fit using up to date data
kmeans = KMeans(k=650, seed=42, featuresCol="features", predictionCol="prediction",
                maxIter=10, initSteps=3)
kmodel = kmeans.fit(df)
# test = kmodel.transform(featuresOut)

'''
########## DEMO #########
'''
df.groupBy(df.prediction).count().orderBy(asc('count')).show(50)

groups = df.groupBy(df.prediction.alias("prediction2")).count().orderBy(asc('count')).filter('count < 40')
df.join(groups, groups.prediction2 == df.prediction).select('command', 'prediction').distinct().show()
df.join(groups, groups.prediction2 == df.prediction).select('command').distinct().show(500, truncate=False)

groups = df.groupBy(df.prediction.alias("prediction2")).count().orderBy(desc('count')).filter('count > 100000')
df.join(groups, groups.prediction2 == df.prediction).select('command').distinct().show(500, truncate=False)

groups = sc.parallelize(df.groupBy(df.prediction.alias("prediction2")).count().orderBy(desc('count')).head(10)).toDF()
df.join(groups, groups.prediction2 == df.prediction).select('command').distinct().show(50, truncate=False)

# Create a new DF with some weird commands
test1 = ctx.createDataFrame([
], ["command"])