def yichche_app_day(data_path, from_i=1, to_i=31): sqlContext = HiveContext(sc) sqlContext.sql("use usercenter_dw") for i in range(from_i, to_i): date = time.strftime( "%Y-%m-%d", time.localtime(int(time.time() - i * 60 * 60 * 24))) df = sqlContext.sql( "select user_id as userid,appname,etl_dt from yyh_app_tmp_4 where etl_dt = '" + date + "'") tag_list = [ '娱乐', '科技', '购物', '生活', '企业办公', '时尚', '旅游', '游戏', '财经', '女性', '体育', '摄影', '汽车', '美容', '母婴育儿', '电影', '搞笑', '健康', '教育', '音乐', '资讯' ] appname2tag_dict = load_appname2tag(data_path) exprs = [ max(when(col("tag") == x, lit(1)).otherwise(lit(0))).alias(x) for x in tag_list ] df = df.select("userid", "appname").where( col("appname").isin(list(appname2tag_dict.keys()))).distinct() df = df.withColumn( "tag", map_tag(appname2tag_dict)('appname')).select( "userid", 'tag').filter((col('tag') != '') & (col('tag').isNotNull()) & (col('tag') != 'null')).distinct() cur_date = time.strftime("%Y-%m-%d", time.localtime(int(time.time()))) df = df.groupby("userid").agg(*exprs) df = df.withColumn('source', lit('app')).withColumn('update_dt', lit(cur_date)) df.registerTempTable('tab_name') sqlContext.sql( "insert into table app_usertag_day partition(etl_dt = '" + date + "' ) select * from tab_name ") print('finished', date)
def match_all(df, data_path):
    '''
    Match every Youku viewing record against the iQiyi tag scores and
    store the result in the yk2iqytag table.
    :param df:
    :param data_path:
    :return:
    '''
    sqlContext = HiveContext(sc)
    sqlContext.sql("use usercenter_dw")
    # df = sqlContext.sql("select * from youku_mediadata")
    df = df.withColumn(
        "yk_movie", translate('title')("mediadata")).filter(col('yk_movie') != '')
    iqiyi_tags = [
        'entertainment', 'technology', 'shopping', 'lifestyle', 'business',
        'fashion', 'tourism', 'game', 'finance', 'female', 'sports',
        'photography', 'car'
    ]
    iqiyi2tag_df, iqy = iqiyi2tag(data_path, iqiyi_tags)
    addcols = [
        'beauty', 'childcare', 'movie', 'funny', 'health', 'education',
        'music', 'news'
    ]
    for i in addcols:
        iqiyi2tag_df = iqiyi2tag_df.withColumn(i, lit(0)).cache()
    yk_rdd = df.select('mediadata').rdd.map(list)
    yk_rdd = yk_rdd.map(lambda xs: [xs[0], find(xs[0], iqy)])
    # The resulting columns are 'yktag' and 'ykmovie'; filter on 'ykmovie'.
    df = yk_rdd.toDF(['yktag', 'ykmovie']).filter(col('ykmovie') != '')
    df = df.join(iqiyi2tag_df, df.ykmovie == iqiyi2tag_df.iqytag).drop('ykmovie')
    df.registerTempTable('tab_name')
    sqlContext.sql("insert into table yk2iqytag select * from tab_name")
def process_rdd(time, rdd): print("----------- %s -----------" % str(time)) try: sql_context = HiveContext(rdd.context) # Convert the RDD to Row RDD row_rdd = rdd.map(lambda w: Row(tweet=w, score=analyzeSentiment(w))) schema = StructType([ StructField("tweet", StringType(), True), StructField("score", FloatType(), True) ]) # Create a DF with the specified schema new_tweets_df = sql_context.createDataFrame(row_rdd, schema=schema) # Register the dataframe as table new_tweets_df.registerTempTable("new_tweets") # Insert new tweets,scores into table tweets sql_context.sql("INSERT INTO TABLE tweets SELECT * FROM new_tweets") # Get all the tweets from the table using SQL tweets_sentiment_df = sql_context.sql("SELECT * FROM tweets") tweets_sentiment_df.show() # Sends the tweets and their sentiment score to the dashboard send_df_to_dashboard(tweets_sentiment_df) except: e = sys.exc_info()[0] print("Error: %s" % e)
def momo_user_tag(data_path, month=None, date=None): sqlContext = HiveContext(sc) sqlContext.sql("use usercenter_dw") df = sqlContext.sql("select userid,mediadata from yyh_momo_tmp") df = df.withColumn("momotag", translate('usertag')("mediadata")).select("userid", 'momotag').filter(col('momotag') != '')\ .withColumn('momotag', explode(split('momotag', ',')))\ .filter((length('momotag') == 5))\ .withColumn('tag',lit('')) for item in momo_tag_list: df = df.withColumn( 'tag', when(col('momotag') == item[0], item[1]).otherwise(col('tag'))).cache() tag_list = [ '娱乐', '科技', '购物', '生活', '企业办公', '时尚', '旅游', '游戏', '财经', '女性', '体育', '摄影', '汽车', '美容', '母婴育儿', '电影', '搞笑', '健康', '教育', '音乐', '资讯' ] exprs = [ max(when(col("tag") == x, lit(1)).otherwise(lit(0))).alias(x) for x in tag_list ] cur_date = time.strftime("%Y-%m-%d", time.localtime(int(time.time()))) df = df.groupby("userid").agg(*exprs)\ .withColumn('source', lit('momo'))\ .withColumn('update_dt', lit(cur_date)) #if month != None: # df = df.withColumn('etl_month', lit(month)) #if date != None: # df = df.withColumn('etl_dt', lit(date)) df.registerTempTable('tab_name') sqlContext.sql( "insert into table momo_usertag_month partition(etl_month = '" + month + "' ) select * from tab_name")
def momo_day(data_path, from_i=1, to_i=31):
    sqlContext = HiveContext(sc)
    sqlContext.sql("use usercenter_dw")
    for i in range(from_i, to_i):
        date = time.strftime(
            "%Y-%m-%d", time.localtime(int(time.time() - i * 60 * 60 * 24)))
        get_label(date)
def do_ets_task(sc, ets_dburl_env, wfc):
    # customer identifier
    cust_no = '1'
    isvalid = '1'
    etsTempTable = wfc
    ets_url = ets_dburl_env[wfc[:-2]]['dst']
    slave_url = ets_dburl_env[wfc[:-2]]['src']
    dbinfo = load_source('getdbinfo', os.path.join(os.path.dirname(__file__), 'Utils.py')).getdbinfo(slave_url)
    tabledict = load_source('query_sql_slave', os.path.join(os.path.dirname(__file__), 'Utils.py')).query_sql_slave(dbinfo)
    slaveTempTable = tabledict.get(wfc[:-2])
    driver = "com.mysql.jdbc.Driver"
    sqlContext = HiveContext(sc)
    dff = sqlContext.read.format("jdbc").options(url=slave_url, dbtable=slaveTempTable, driver=driver).load()
    dff.registerTempTable(slaveTempTable)
    dft = sqlContext.read.format("jdbc").options(url=ets_url, dbtable=etsTempTable, driver=driver).load()
    dft.registerTempTable(etsTempTable)
    ds_ets = sqlContext.sql(" select max(updatets) as max from %s " % (etsTempTable))
    pp = ds_ets.collect()[0]
    max_updates = pp.max
    slave_sql = ''
    try:
        if max_updates is not None:
            print(u"Max updatets in the ets table: " + str(max_updates))
            slave_sql = " select id, examineeid, examid, roomid, stationid, examinerid, totalscore, begintime ,endtime,scoresheetcode,status, updatetime" \
                        " from %s where `updatetime` > '%s' " % (slaveTempTable, max_updates)
        else:
            print(u"This is the initial extraction")
            slave_sql = " select id, examineeid, examid, roomid, stationid, examinerid, totalscore, begintime ,endtime,scoresheetcode,status, updatetime" \
                        " from %s " % (slaveTempTable)
        ds_slave = sqlContext.sql(slave_sql)
        print(u'Number of matching records in slave: %s' % (ds_slave.count()))
        now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(u'Assembling data...')
        src_fields = json.dumps({'osce_score': ['id', 'examineeid', 'examid', 'roomid', 'stationid', 'examinerid', 'totalscore', 'begintime', 'endtime', 'scoresheetcode', 'status', 'updatetime']})
        # field values
        filedvlue = ds_slave.map(lambda row: (row.id, row.examineeid, row.examid, row.roomid, row.stationid,
                                              row.examinerid, row.totalscore, str(row.begintime), str(row.endtime),
                                              row.scoresheetcode, row.status, cust_no, isvalid, md5(row), now_time,
                                              str(row.updatetime)))
        # build the schema columns
        schemaString = "id,examineeid,examid,roomid,stationid,examinerid,totalscore,begintime," \
                       "endtime,scoresheetcode,status,cust_no,isvalid,src_fields_md5,createts,updatets"
        fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split(",")]
        schema = StructType(fields)
        # create a DataFrame from the column names and field values
        schemaObj = sqlContext.createDataFrame(filedvlue, schema)
        print(u'Data assembly finished...')
        # print schemaPeople
        # for row in schemaPeople:
        #     print row.id
        print(u'Writing data...')
        # write into the target table
        schemaObj.write.insertInto(etsTempTable, overwrite=False)
        print(u'Write finished')
    except Exception as e:
        # e.message is not supported on Python 2.6
        print(str(e))
        raise Exception(str(e))
def main(): sys.stdout = open("out_main", "w") # direct output to file print('main starting: {0}'.format(timestamp()), flush=True) ''' create the base_rdd ''' spark = ps.sql.SparkSession.builder \ .enableHiveSupport() \ .appName('pyspark') \ .getOrCreate() spark.sparkContext.setLogLevel("WARN") sc = spark.sparkContext sqlContext = HiveContext(sc) sqlContext.sql("use default") print('creating base_rdd, t = {0}'.format(timestamp()), flush=True) ''' BaseETL.create_base_rdd() :param source: 'file', 's3' :param type: 'mini', 'train', 'valid' (mini is local only) :param n: count of files to load :returns: base_rdd ''' base_rdd = BaseETL.create_base_rdd(sc=sc, source='s3', type='train', n=349) \ .map(lambda base: (base[1], base[2][0], base[2][1])) \ .cache() print('finished, t = {0}'.format(timestamp()), flush=True) ''' pick pipeline to run ''' # pipeline_spender(sc, base_rdd) # grid search randfor classifier # pipeline_spend_amt(sc, base_rdd) # grid search randfor regressor print('end of main: {0}'.format(timestamp())) sc.stop()
def match_all_df(data_path): sqlContext = HiveContext(sc) sqlContext.sql("use usercenter_dw") df = sqlContext.sql("select * from youku_mediadata") df = df.withColumn("yk_movie", translate('title')("mediadata")).filter( col('yk_movie') != '').withColumn( 'movie', lit('')).persist(StorageLevel.DISK_ONLY) iqiyi_tags = [ '娱乐', '科技', '购物', '生活', '企业办公', '时尚', '旅游', '游戏', '财经', '女性', '体育', '摄影', '汽车' ] iqiyi2tag_df, iqy = iqiyi2tag(data_path, iqiyi_tags) for ele in iqy: df1 = df.withColumn( 'movie', when(col('yk_movie').like('%' + ele + '%'), ele).otherwise(col('movie'))).filter( col('movie') != '').select('mediadata', 'movie') df = df.withColumn( 'movie', when(col('yk_movie').like('%' + ele + '%'), ele).otherwise(col('movie'))).filter( col('movie') == '').persist(StorageLevel.DISK_ONLY) df1.registerTempTable('tab_name') sqlContext.sql("insert into table youku_iqy select * from tab_name ")
def yichche_app(data_path, month): sqlContext = HiveContext(sc) sqlContext.sql("use usercenter_dw") # df = sqlContext.sql("select distinct user_id as userid, appname,etl_dt from t01_sdk_device_app_info where etl_dt between '2018-10-18' and '2018-10-31'") df = sqlContext.sql( "select user_id as userid,appname,etl_dt from yyh_app_tmp_4") tag_list = [ '娱乐', '科技', '购物', '生活', '企业办公', '时尚', '旅游', '游戏', '财经', '女性', '体育', '摄影', '汽车', '美容', '母婴育儿', '电影', '搞笑', '健康', '教育', '音乐', '资讯' ] appname2tag_dict = load_appname2tag(data_path) exprs = [ max(when(col("tag") == x, lit(1)).otherwise(lit(0))).alias(x) for x in tag_list ] df = df.select("userid", "appname").where( col("appname").isin(list(appname2tag_dict.keys()))) df = df.withColumn("tag", map_tag(appname2tag_dict)('appname')).select( "userid", 'tag').filter((col('tag') != '') & (col('tag').isNotNull()) & (col('tag') != 'null')) cur_date = time.strftime("%Y-%m-%d", time.localtime(int(time.time()))) df = df.groupby("userid").agg(*exprs) df = df.withColumn('source', lit('app')).withColumn('update_dt', lit(cur_date)) df.registerTempTable('tab_name') sqlContext.sql( "insert into table app_usertag_month partition(etl_month = '" + month + "' ) select * from tab_name ")
def get_context_test():
    conf = SparkConf()
    sc = SparkContext('local[1]', conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex_test""")
    sql_context.setConf("spark.sql.shuffle.partitions", "1")
    return sc, sql_context
def CreateTable(table_mode='external', table_name=None, hdfs_host=None, hdfs_port=None, path=None, other_options=''):
    '''
    Create a Hive table based on the schema inferred from SchemaExtractor
    :param table_mode: 'external' or '' (an empty string creates an internal/managed table)
    :param table_name: String
    :param hdfs_host:
    :param hdfs_port:
    :param path: path to the parquet file
    :param other_options: including serdes, etc.
    :return:
    '''
    schema = SchemaExtractor(hdfs_host=hdfs_host, hdfs_port=hdfs_port, path=path)
    table_location = 'hdfs://' + hdfs_host + path
    hiveContext = HiveContext(sc)
    hive_query = """create {table_mode} table {table_name}({schema}) {options} stored as parquet location '{location_path}'""".format(
        table_mode=table_mode, table_name=table_name, schema=schema,
        location_path=table_location, options=other_options)
    print(hive_query)
    hiveContext.sql(hive_query)
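# Hypothetical usage sketch for CreateTable above: the HDFS host, port, and parquet path are
# placeholders, and SchemaExtractor is assumed to return a "col type, col type, ..." string
# suitable for the CREATE TABLE statement.
CreateTable(table_mode='external',
            table_name='events_parquet',
            hdfs_host='namenode.example.com',
            hdfs_port=8020,
            path='/data/events/parquet',
            other_options='')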
def create_yk2iqytag():
    sqlContext = HiveContext(sc)
    sqlContext.sql("use usercenter_dw")
    yk_mediadata = sqlContext.sql(
        "select distinct mediadata from t2pdm_data.t05_chehui_dsp_log_v2 where etl_dt between '2018-11-01' and '2018-11-30' and channelid=3 "
    )
    match_all(yk_mediadata, data_path)
def cal_null_percent(self):
    from pyspark import SparkConf, SparkContext
    from pyspark.sql import HiveContext
    conf = SparkConf().setAppName(str(time.time())[-6:])
    try:
        sc.stop()
    except:
        pass
    sc = SparkContext(conf=conf)
    hive_context = HiveContext(sc)
    final_d = {}
    for yesterday in self.date_list:
        sql_0 = '''
        SELECT COUNT(1)
        FROM {1}
        WHERE pt=\'{0}\'
        AND model_type=13001
        '''.format(yesterday, self.table)
        s = hive_context.sql(sql_0).collect()[0][0]
        d = {}
        for column_name in self.column_name_list:
            sql_1 = '''
            SELECT COUNT(1)
            FROM {2}
            WHERE pt=\'{0}\'
            AND model_type=13001
            AND {1} IS NOT NULL
            AND {1} != ''
            '''.format(yesterday, column_name, self.table)
            t = hive_context.sql(sql_1).collect()[0][0]
            # use float division so the percentage is not truncated under Python 2
            d[column_name] = "%.5f%%" % ((1 - float(t) / s) * 100)
        final_d[yesterday] = d
    sc.stop()
    return final_d
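# Sketch (not part of the original class): the loop above issues one COUNT query per column,
# i.e. one full scan each; conditional aggregation can compute every null percentage in a
# single pass. The table name, partition value, and column names below are placeholders.
cols = ['col_a', 'col_b']
exprs = ", ".join(
    "SUM(CASE WHEN {c} IS NULL OR {c} = '' THEN 1 ELSE 0 END) / COUNT(1) * 100 AS {c}_null_pct".format(c=c)
    for c in cols)
one_pass_sql = "SELECT {exprs} FROM my_table WHERE pt = '2020-01-01' AND model_type = 13001".format(exprs=exprs)
null_pcts = hive_context.sql(one_pass_sql).collect()[0]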
def clean_logs(cfg, df_persona, df_keywords, log_table_names): sc = SparkContext.getOrCreate() sc.setLogLevel(cfg['log']['level']) hive_context = HiveContext(sc) cfg_clean = cfg['pipeline']['main_clean'] conditions = cfg_clean['conditions'] start_date, end_date, load_minutes = load_batch_config(cfg) timer_start = timeit.default_timer() showlog_table, showlog_output_table, clicklog_table, clicklog_output_table = log_table_names starting_time = datetime.strptime(start_date, "%Y-%m-%d") ending_time = datetime.strptime(end_date, "%Y-%m-%d") batched_round = 1 while starting_time < ending_time: time_start = starting_time.strftime("%Y-%m-%d %H:%M:%S") batch_time_end = starting_time + timedelta(minutes=load_minutes) batch_time_end = min(batch_time_end, ending_time) time_end = batch_time_end.strftime("%Y-%m-%d %H:%M:%S") print_batching_info("Main clean", batched_round, time_start, time_end) command = """select did, adv_id, adv_type as media, slot_id, spread_app_id, device_name, net_type, adv_bill_mode_cd as price_model, {time} as action_time from {table} where {time} >= '{time_start}' and {time} < '{time_end}'""" df_clicklog_batched = hive_context.sql( command.format(time='click_time', table=clicklog_table, time_start=time_start, time_end=time_end)) df_showlog_batched = hive_context.sql( command.format(time='show_time', table=showlog_table, time_start=time_start, time_end=time_end)) mode = 'overwrite' if batched_round == 1 else 'append' is_empty_showlog_batched = df_showlog_batched.rdd.isEmpty() if not is_empty_showlog_batched: df_showlog_batched = clean_batched_log(df_showlog_batched, df_persona, conditions, df_keywords) write_to_table(df_showlog_batched, showlog_output_table, mode=mode) is_empty_clicklog_batched = df_clicklog_batched.rdd.isEmpty() if not is_empty_clicklog_batched: df_clicklog_batched = clean_batched_log(df_clicklog_batched, df_persona, conditions, df_keywords) write_to_table(df_clicklog_batched, clicklog_output_table, mode=mode) batched_round += 1 starting_time = batch_time_end timer_end = timeit.default_timer() print('Total batching seconds: ' + str(timer_end - timer_start))
def main(sc, SQLContext):
    sqlContext = HiveContext(sc)
    sc.setLogLevel("ERROR")
    query = "USE {0}"
    query = query.format(db_name)
    sqlContext.sql(query)
    tables = sqlContext.sql("SHOW TABLES")
    tableNames = tables.select("tableName").rdd.map(lambda r: r)
    tableNames = tableNames.map(lambda x: x.tableName).collect()
    tableNames = [str(i) for i in tableNames]
    schema_empty_df = StructType([StructField("table_ddl", StringType(), True)])
    empty_df = sqlContext.createDataFrame(sc.emptyRDD(), schema_empty_df)
    df1 = empty_df
    for i in tableNames:
        show_query = "show create table " + i
        drop_query = "drop table " + i + ";\n"
        describe_query = "describe formatted " + i
        separator = ";\n"
        try:
            rdd = sc.parallelize([drop_query])
            newRDD = rdd.map(lambda x: {"table_ddl": x})
            newDF = rdd.map(lambda p: Row(table_ddl=p)).toDF()
            df = df1.unionAll(newDF)
            desc = sqlContext.sql(describe_query)
            desc_1 = desc.select(['data_type']).where("col_name='Location'")
            desc_2 = desc_1.rdd.map(lambda x: x.data_type).collect()
            desc_3 = [str(i) for i in desc_2]
            desc_4 = ''.join(desc_3)
            df0 = sqlContext.sql(show_query)
            show_1 = df0.rdd.map(lambda x: x.createtab_stmt).collect()
            show_2 = [str(i) for i in show_1]
            show_3 = ''.join(show_2)
            if show_3.find("LOCATION '") < 0:
                loc_query = "LOCATION '" + desc_4 + "'" + "\n TBLPROPERTIES ("
                final_create_table = show_3.replace("TBLPROPERTIES (", loc_query)
            else:
                final_create_table = show_3
            list_final = [final_create_table]
            rdd_create_table = sc.parallelize(list_final)
            df_create_table = rdd_create_table.map(lambda p: Row(create_table_ddl=p)).toDF()
            df1 = df.unionAll(df_create_table)
            rdd1 = sc.parallelize([separator])
            newRDD1 = rdd1.map(lambda x: {"delim": x})
            newDF1 = sqlContext.createDataFrame(newRDD1, ["delim"])
            df1 = df1.unionAll(newDF1)
        except Exception:
            # skip tables whose DDL could not be generated
            pass
def hiveSaveNews(dfNewsContents, table_name):
    from pyspark.sql import HiveContext
    hiveContext = HiveContext(sc)
    tmpDf = hiveContext.createDataFrame(
        dfNewsContents[['news_code', 'title', 'site', 'writing_time',
                        'preproc_content', 'img', 'content', 'company']])
    tmpDf.registerTempTable("tmpDf")
    hiveContext.sql("insert into table {table_name} select * from tmpDf".format(table_name=table_name))
def process(time, rdd): print("========= %s =========" % str(time)) try: sqlContext = HiveContext(sc) # FIX: memory error Spark 2.0 bug ( < 2.0 ) sqlContext.setConf("spark.sql.tungsten.enabled","false") # v2.01 spark = SparkSession.builder \ #.master("local") \ #.appName("Word Count") \ #.config("spark.some.config.option", "some-value") \ #.getOrCreate() # Get the singleton instance of SparkSession #nzs v1.0 spark = getSparkSessionInstance(rdd.context.getConf()) if rdd.count() < 1: return; # Convert RDD[String] to RDD[Row] to DataFrame sqlRdd = rdd.map( lambda x: json.loads(x)).map(lambda r: Row( metrics=r["metrics"], name=r["name"], value=r["value"] ) ) wordsDataFrame = sqlContext.createDataFrame(sqlRdd) wordsDataFrame.show() # Creates a temporary view using the DataFrame. wordsDataFrame.registerTempTable("starwarstemp") # Creates a query and get the alam dataset using the temp table wordCountsDataFrame = sqlContext.sql("select * from starwarstemp") wordCountsDataFrame.printSchema() with open(SparkFiles.get('webinar_streaming.sql')) as test_file: alertsql=test_file.read() #logging.info(alertsql) alertDataFrame = sqlContext.sql(alertsql) alertDataFrame.show() alertDataFrame.printSchema() # save all values to HBASE # IF NEED FILTER LATER .filter(lambda x: str(x["metrics"])=='action-credit-limit') \ # create HBASE mapper rowRdd = rdd.map( lambda x: json.loads(x))\ .map(lambda r: ( str(r["metrics"]) ,[ str(r["name"])+"-"+datetime.datetime.now().strftime("%Y%m%d%H%M%S"), "action" if str(r["metrics"])=="action-credit-limit" else "healt", str(r["metrics"]), str(r["value"])] )) table = 'starwarsinbox' host = 'node-master2-KcVkz' keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter" valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter" conf = {"hbase.zookeeper.quorum": host, "hbase.mapred.outputtable": table, "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat", "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable", "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"} rowRdd.saveAsNewAPIHadoopDataset(conf=conf,keyConverter=keyConv,valueConverter=valueConv) except Exception as merror: print (merror) raise
def main(): #Setting up spark configuration conf = SparkConf().setAppName('label') sc = SparkContext(conf=conf) assert sc.version >= '1.5.1' sqlContext = HiveContext(sc) #Reading Input file path and Output file path input1 = sys.argv[1] input2 = sys.argv[2] output = sys.argv[3] #Setting the schema for the data frame customSchema = StructType([ StructField("id", LongType(), False), StructField("timeSet", StringType(), False), StructField("country", StringType(), False), StructField("province", StringType(), False), StructField('city', StringType(), False), StructField('latitude', FloatType(), False), StructField('longtitude', FloatType(), False) ]) poi_schema = StructType([ StructField("poi_id", StringType(), False), StructField('poi_latitude', FloatType(), False), StructField('poi_longtitude', FloatType(), False) ]) #Reading data into data frame df_input1 = sqlContext.read.format('com.databricks.spark.csv').options( header='false').load(input1, schema=customSchema) df_input2 = sqlContext.read.format('com.databricks.spark.csv').options( header='true').load(input2, schema=poi_schema) #Performing join after Broadcast distance_calc = df_input1.join(broadcast(df_input2)).withColumn( 'distance_in_km', cal_distance(df_input1.latitude, df_input1.longtitude, df_input2.poi_latitude, df_input2.poi_longtitude)) distance_calc.registerTempTable("table1") #Cluster the data based on the column to group to reduce shuffling distance_calc = sqlContext.sql("SELECT * FROM table1 CLUSTER BY id") #Minimum distance is calculated poi_min_distance = distance_calc.groupBy('id').agg( min('distance_in_km').alias('distance_in_km')) #assigning the labels label_data = distance_calc.join(poi_min_distance, ['id', 'distance_in_km']).select( 'id', 'timeSet', 'country', 'province', 'city', 'latitude', 'longtitude', 'poi_id', 'poi_latitude', 'poi_longtitude', 'distance_in_km') label_data.registerTempTable("table2") #assigning the rank to remove duplicate within subgroup df_result = sqlContext.sql( "select id,timeSet,country,province,city,latitude,longtitude,poi_id,poi_latitude,poi_longtitude,distance_in_km,rank() over ( partition by id order by poi_id) as rank from table2" ) df_result = df_result.where(df_result['rank'] == 1) #Saving the result df_result.coalesce(1).write.format('com.databricks.spark.csv').save(output)
def get_data(from_i, to_i):
    sqlContext = HiveContext(sc)
    for i in range(from_i, to_i):
        date = time.strftime(
            "%Y-%m-%d", time.localtime(int(time.time() - i * 60 * 60 * 24)))
        sqlContext.sql(
            "insert into table usercenter_dw.yyh_momo_tmp select distinct imeimd5 as userid, mediadata,etl_dt from t2pdm_data.t05_chehui_dsp_log_v2 "
            "where etl_dt ='" + date + "' and channelid=4 and length(imeimd5) = 32")
        print("finished", date)
def get_context():
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName="__file__", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex""")
    sql_context.setConf("spark.sql.shuffle.partitions", "32")
    return sc, sql_context
class HiveOperate():
    def __init__(self, sc):
        """
        Create the Hive context.
        :return:
        """
        # self.sc = config.CreateSparkContext()
        # __init__ must not return a value; callers use self.hive_context directly.
        self.hive_context = HiveContext(sc)

    def df_insert_to_hive(self, df, table_name='channel_result', database='sparktest'):
        """
        Insert a DataFrame into Hive.
        :param df:
        :param table_name:
        :param database:
        :return:
        """
        df.registerTempTable("result_tmp")
        result1 = self.hive_context.sql("select * from result_tmp limit 10")
        result1.show()
        self.hive_context.sql("use {}".format(database))
        self.hive_context.sql("drop table if EXISTS {} ".format(table_name))
        self.hive_context.sql(
            "create table {} as select * from result_tmp where 1 = 2 ".format(
                table_name))
        self.hive_context.sql(
            " insert overwrite table {} select * from result_tmp".format(
                table_name))
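# Minimal usage sketch for HiveOperate above; the SparkContext setup and the sample
# DataFrame are assumptions, not part of the original module.
from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext(appName="hive_operate_demo")
ops = HiveOperate(sc)
sample_df = ops.hive_context.createDataFrame(
    [(1, 'a'), (2, 'b')], ['id', 'channel'])
# Recreates sparktest.channel_result and overwrites it with sample_df.
ops.df_insert_to_hive(sample_df, table_name='channel_result', database='sparktest')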
def load_factdata(sc, cfg, starting_day, ending_day, attributes_condition): hive_context = HiveContext(sc) bucket_id_max = cfg['bucket_id_max'] # step 1: load the original fact data. # step 2: load the distribution e.g. 133904 uckeys. # step 3: inner join the original fact data with the distribution. e.g. 133904 uckeys. # step 4: filter the new fact data with date range e.g. 2020-01-30 - 2020-02-08, 10 days. # step 5: filter the new fact data with conditions. # step 1: load the original fact data command = """select uckey, day, hour, count_array from {} where bucket_id <= {} """.format(cfg['factdata'], bucket_id_max) df = hive_context.sql(command) df = add_count_map(df) # [Row(count_array=['3:4'], day='2019-11-02', hour=19, uckey='native,72bcd2720e5011e79bc8fa163e05184e,WIFI,g_m,5,CPM,15,76', count_map={'3': '4'})] # Explode count_map to have pcat and count on separate columns df = df.select('uckey', 'day', 'hour', explode(df.count_map)).withColumnRenamed( "key", "price_cat").withColumnRenamed("value", "count") # [Row(uckey='native,72bcd2720e5011e79bc8fa163e05184e,WIFI,g_m,5,CPM,15,76', day='2019-11-02', hour=19, price_cat='3', count='4')] # This is to have the fact data uckey-price_cat pair based on daily count to join the distribution. df = df.groupBy('uckey', 'day', 'price_cat').agg({ "count": "sum" }).withColumnRenamed("sum(count)", "count") # [Row(uckey='splash,5cd1c663263511e6af7500163e291137,WIFI,g_m,4,CPT,3,', day='2019-11-02', price_cat='1', count=56.0)] # step 2: load the distribution e.g. 133904 uckeys command = 'select uckey, price_cat from {} where ratio > 0'.format( cfg['distribution']) dfd = hive_context.sql(command) # step 3: inner join the original fact data with the distribution #distinct uckeys in joined fact data: e.g. 133904 df = df.join(dfd, [df.uckey == dfd.uckey, df.price_cat == dfd.price_cat], how="inner").drop(dfd.uckey).drop(dfd.price_cat) # e.g. df.select(df.uckey).distinct().count(): 133904 # step 4: filter the new fact data with date range e.g. 2020-01-30 - 2020-02-08, 10 days df = df.filter((df.day <= ending_day) & (df.day >= starting_day)) # e.g. df.count(): 15152287, df.select(df.uckey).distinct().count(): 92612 # step 5: filter the new fact data with conditions. uckey_attrs = cfg['uckey_attrs'] for attr_index in range(len(uckey_attrs)): df = df.withColumn( uckey_attrs[attr_index], udf(lambda x: x.split(',')[attr_index], StringType())(df.uckey)) # e.g. [Row(uckey=u'magazinelock,01,2G,,,CPM,13', day=u'2020-01-19', hour=8, count_array=[u'1:10'], # m=u'magazinelock', si=u'01', t=u'2G', g=u'', a=u'', pm=u'CPM', r=u'13')] if attributes_condition: for attr, attr_value in attributes_condition.items(): df = df.filter(df[attr] == str(attr_value)) return df
def tear_down():
    for table in data.keys():
        hiveContext = HiveContext(sc)
        df = hiveContext.createDataFrame(data[table], fields[table])
        hiveContext.sql('use test_db')
        try:
            df.registerTempTable("demo")
            hiveContext.sql("insert into {table} partition(ds='{date}') select * from demo".format(table=table, date=date))
            # hiveContext.sql("insert into {table} partition(ds="") select * from demo".format(table=table))
        except Exception as e:
            df.saveAsTable("{table}".format(table=table))
def read_csv(sc, file_name, sep=",", storage="hive://", header=True, names=None, table_name=None, infer_limit=10000): table_name = table_name if table_name is not None else "df" + str(uuid.uuid4()) hc = HiveContext(sc) df = pd.read_csv(file_name, sep=sep, nrows=infer_limit) names = df.columns if not names else names types = [] for i in range(len(names)): tp = names[i] + " " if df.dtypes[i] == "O": tp += "STRING" elif df.dtypes[i] == "int64": tp += "INT" else: tp += "DOUBLE" types.append(tp) hc.sql('drop table if exists %s' %table_name) qw = """CREATE TABLE IF NOT EXISTS %s (%s) row format delimited fields terminated by '%s' LINES TERMINATED BY '\n'""" %(table_name, ','.join(types), sep) if header: qw += " tblproperties ('skip.header.line.count'='1')" hc.sql(qw) hc.sql("LOAD DATA LOCAL INPATH '%s' OVERWRITE INTO TABLE %s" %(file_name, table_name)) rdd = hc.sql("SELECT * FROM %s" %table_name) ctx = hc if storage.startswith("parquet://"): path = storage.replace("parquet://", "") rdd.saveAsParquetFile("%s/%s" %(path, table_name)) sq = HiveContext(sc) rdd = sq.parquetFile("%s/%s" %(path, table_name)) rdd.registerTempTable(table_name) rdd = sq.sql("select * from %s" %table_name) ctx = sq return DataFrame(ctx, table_name, data=rdd, columns=names, dtype=types)
def save_ucdocs_to_hive_tables(ucdocs_list, factdata_table_name): sc = SparkContext.getOrCreate() hive_context = HiveContext(sc) sc.setLogLevel('WARN') #drop the table if exists, then create a new table and insert data to this table. command = """ DROP TABLE IF EXISTS {} """.format(factdata_table_name) hive_context.sql(command) command = """ CREATE TABLE IF NOT EXISTS {} ( uckey string, bucket_id int, count_array array<string>, hour int, day string ) """.format(factdata_table_name) hive_context.sql(command) #save the factdata of ucdocs to the hive table. sqlContext = SQLContext(sc) df = sqlContext.read.json(sc.parallelize(ucdocs_list)) df.select('uckey', 'bucket_id', 'count_array', 'hour', 'day').write.option( "header", "true").option("encoding", "UTF-8").mode('append').format( 'hive').saveAsTable(factdata_table_name) #agg df from hourly counts to daily counts by suming up hourly counts. _udf = udf( lambda x: sum( [int(list(i.split(':'))[1]) for i in x if i and ':' in i]) if x else 0, IntegerType()) df = df.withColumn('hourly_counts', _udf(df.count_array)) df_dailycounts = df.groupBy('uckey', 'day').agg({ 'hourly_counts': 'sum' }).orderBy('uckey').orderBy('day').withColumnRenamed( 'sum(hourly_counts)', 'daily_counts') #save the transformed factdata with daily counts. factdata_table_name_dailycounts = factdata_table_name + '_dailycounts' command = """ DROP TABLE IF EXISTS {} """.format(factdata_table_name_dailycounts) hive_context.sql(command) command = """ CREATE TABLE IF NOT EXISTS {} ( uckey string, day string, daily_counts int ) """.format(factdata_table_name_dailycounts) hive_context.sql(command) df_dailycounts.select('uckey', 'day', 'daily_counts').write.option( "header", "true").option("encoding", "UTF-8").mode('append').format( 'hive').saveAsTable(factdata_table_name_dailycounts)
def main(): conf = (SparkConf().setMaster("yarn-client").setAppName('').set('spark.executorEnv.PYTHONHASHSEED','0')) conf.set("spark.mongodb.input.uri", uri) conf.set("spark.mongodb.input.database", database) conf.set("spark.mongodb.input.collection", collection) spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate() sc = spark.sparkContext sc.setLogLevel(logLevel='ERROR') hive_context = HiveContext(sc) origin_data = hive_context.read.format("com.mongodb.spark.sql").options(uri=uri, database=database, collection=collection).option("pipeline",'[{"$project":{"origin_user_id":1,"origin_post_id":1,"_id":0}}]').load() map_post_id = hive_context.sql('select post_id,origin_post_id as map_origin_post_id,user_id from prod_bd_mysql_syn.dim_post_upload where post_state = 1') post_id_status = hive_context.sql('select post_id as map_post_id,rec_status from prod_bd_mysql_syn.dim_post_info') map_post = origin_data.join(map_post_id,origin_data.origin_post_id==map_post_id.map_origin_post_id,'left_outer') map_data = map_post.join(post_id_status,map_post.post_id==post_id_status.map_post_id,'left_outer').select('origin_user_id','origin_post_id','post_id','user_id','rec_status').filter('rec_status is null') print(map_data.take(500),map_data.count())
def main():
    #Setting up spark configuration
    conf = SparkConf().setAppName('analysis')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'
    sqlContext = HiveContext(sc)
    #Setting input file path and output file path
    inputs = sys.argv[1]
    output = sys.argv[2]
    #Declaring schema for the data frame
    customSchema = StructType([
        StructField("id", LongType(), False),
        StructField("timeSet", StringType(), False),
        StructField("country", StringType(), False),
        StructField("province", StringType(), False),
        StructField('city', StringType(), False),
        StructField('latitude', FloatType(), False),
        StructField('longtitude', FloatType(), False),
        StructField('poi_id', StringType(), False),
        StructField('poi_latitude', FloatType(), False),
        StructField('poi_longtitude', FloatType(), False),
        StructField('distance_in_km', FloatType(), False)
    ])
    #Read data into a data frame
    df1 = sqlContext.read.format('com.databricks.spark.csv').options(
        header='false').load(inputs, schema=customSchema)
    #Register the data frame as a table
    df1.registerTempTable("temp_table")
    #Cluster the data on the grouping column to reduce shuffling
    df1 = sqlContext.sql("SELECT * FROM temp_table CLUSTER BY poi_id")
    df1.registerTempTable("table1")
    #Calculating POI average
    df_avg = sqlContext.sql(
        "SELECT poi_id,avg(distance_in_km) as avg_distance_in_km,stddev(distance_in_km) FROM table1 GROUP BY poi_id"
    )
    df_avg.coalesce(1).write.format('com.databricks.spark.csv').save(
        output + '/average_stddeviation', header='true')
    #Calculating POI density
    df_density = sqlContext.sql(
        "SELECT poi_id,count(poi_id) as density FROM table1 GROUP BY poi_id")
    df_density.coalesce(1).write.format('com.databricks.spark.csv').save(
        output + '/density', header='true')
    #Calculating POI circle area
    df_circle_area = sqlContext.sql(
        "SELECT poi_id,3.14*(max(distance_in_km))*(max(distance_in_km))as circle_area FROM table1 GROUP BY poi_id"
    )
    df_circle_area.coalesce(1).write.format('com.databricks.spark.csv').save(
        output + '/circle_area', header='true')
def main(): conf = SparkConf() dateStr = conf.get('spark.date') sc = SparkContext(conf=conf, appName='Loc City Data Prepare, ' + dateStr) hc = HiveContext(sc) sqlDict = prepareSql(dateStr) #mergedRdd = sc.emptyRDD() mergedRdd = sc.parallelize([]) for prod, sql in sqlDict.items(): print sql df = hc.sql(sql) #print 'df count:', df.count() rdd = df.map(lambda x: toCityLoc(x, prod)) rdd = rdd.filter(lambda x: x[0] is not None) rdd = rdd.map(lambda x: x[0]) mergedRdd = mergedRdd.union(rdd) #break mergedRdd.cache() print 'mergedRdd count:', mergedRdd.count() fromRdd = mergedRdd.map(lambda cityLoc: ( (cityLoc.area, cityLoc.fromPoi.displayName), (cityLoc.fromPoi, 1L))) toRdd = mergedRdd.map(lambda cityLoc: ( (cityLoc.area, cityLoc.toPoi.displayName), (cityLoc.toPoi, 1L))) count(fromRdd, dateStr, 'from') count(toRdd, dateStr, 'to') print 'success' sc.stop()
def data_anlysis(inputFile):
    # inputFile = 'tesTraffic.json'
    conf = SparkConf().setAppName("SparkSQLTraffic")
    sc = SparkContext(conf=conf)
    hiveCtx = HiveContext(sc)
    print("Loading traffic from " + inputFile)
    while True:
        input = hiveCtx.read.json(inputFile)
        input.registerTempTable("traffic")
        topTraffics = hiveCtx.sql(
            "SELECT placeid,size, color, direction, speed FROM traffic ORDER BY time LIMIT 10"
        )
        print("#" * 20, "\n 1. Ordered by latest time:", topTraffics.collect(),
              ' record count:', len(topTraffics.collect()))
        # https://stackoverflow.com/questions/39535447/attributeerror-dataframe-object-has-no-attribute-map
        topTrafficText = topTraffics.rdd.map(lambda row: row.speed)
        isum = 0
        for speed in topTrafficText.collect():
            print("#" * 20, "\n 2. Just speed", speed)
            isum += float(speed)
        average_speed = isum / len(topTraffics.collect())
        show = colored("3. total flow is:", "red", attrs=['reverse', 'blink'])
        print(show, isum, "average speed is:", average_speed)
        time.sleep(3)
    sc.stop()
def youku_user_tag(): sqlContext = HiveContext(sc) sqlContext.sql("use usercenter_dw") df = sqlContext.sql("select * from youku_matched") drop_list = ['mediadata', 'yktag', 'iqytag'] df = df.select( [column for column in df.columns if column not in drop_list]) exprs = [avg(x).alias(x) for x in df.drop('userid').columns] cur_date = time.strftime("%Y-%m-%d", time.localtime(int(time.time()))) df = df.groupby("userid").agg(*exprs).withColumn('source', lit('youku')).withColumn( 'update_dt', lit(cur_date)) df.registerTempTable('tab_name') sqlContext.sql( "insert overwrite table youku_user_tag select * from tab_name")
def query12_input(query_name, conf=None, output_persist=False): sc = SparkContext(conf=conf) sqlContext = HiveContext(sc) # SQL statements can be run by using the sql methods provided by sqlContext sql = "use tpcds_text_db_1_50" _ = sqlContext.sql(sql) # web_sales_sql = "select * from web_sales" # web_sales = sqlContext.sql(web_sales_sql) # web_sales.persist() # web_sales.registerAsTable("web_sales") # item_sql = "select * from item" # item = sqlContext.sql(item_sql) # item.persist() # item.registerAsTable("item") # date_dim_sql = "select * from date_dim" # date_dim = sqlContext.sql(date_dim_sql) # date_dim.persist() # date_dim.registerAsTable("date_dim") sqlContext.cacheTable("web_sales") sqlContext.cacheTable("item") sqlContext.cacheTable("date_dim") # discard the first query output = execute_sql(query_name, sqlContext, output_persist) # check the re-run statistics output = execute_sql(query_name, sqlContext) output['describe'] = output['output'].describe().show() sc.stop() return output
def run(fout, yarn=None, verbose=None, patterns=None, antipatterns=None, inst='GLOBAL'): """ Main function to run pyspark job. It requires a schema file, an HDFS directory with data and optional script with mapper/reducer functions. """ # define spark context, it's main object which allow to communicate with spark ctx = spark_context('cms', yarn, verbose) sqlContext = HiveContext(ctx) # read DBS and Phedex tables tables = {} tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose)) bdf = tables['bdf'] fdf = tables['fdf'] flf = tables['flf'] # join tables cols = ['*'] # to select all fields from table cols = [ 'b_block_id', 'b_block_name', 'f_block_id', 'f_file_id', 'fl_file_id', 'fl_lumi_section_num' ] # join tables stmt = 'SELECT %s FROM bdf JOIN fdf on bdf.b_block_id = fdf.f_block_id JOIN flf on fdf.f_file_id=flf.fl_file_id' % ','.join( cols) print(stmt) joins = sqlContext.sql(stmt) # keep table around joins.persist(StorageLevel.MEMORY_AND_DISK) # construct conditions # adler = ['ad8f6ad2','9c441343','f68d5dca','81c90e2a','471d2524','a3c1f077','6f0018a0','8bb03b60','d504882c','5ede357f','b05303c3','716d1776','7e9cf258','1945804b','ec7bc1d7','12c87747','94f2aa32'] # cond = 'f_adler32 in %s' % adler # cond = cond.replace('[', '(').replace(']', ')') # fjoin = joins.where(cond).distinct().select(cols) # print_rows(fjoin, stmt, verbose) fjoin = joins\ .groupBy(['b_block_name'])\ .agg({'fl_lumi_section_num':'count'})\ .withColumnRenamed('count(fl_lumi_section_num)', 'nlumis')\ # keep table around fjoin.persist(StorageLevel.MEMORY_AND_DISK) # write out results back to HDFS, the fout parameter defines area on HDFS # it is either absolute path or area under /user/USERNAME if fout: fjoin.write.format("com.databricks.spark.csv")\ .option("header", "true").save(fout) ctx.stop()
def main():
    sc = SparkContext()
    hc = HiveContext(sc)
    df = hc.sql("""{{sql}}""")
    df_writer = DataFrameWriter(df)
    df_writer.saveAsTable(name='{{tableName}}', format='json', mode='overwrite',
                          path='s3://data/{{tableName}}')
def query12_no(query_name, conf=None):
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
    # SQL statements can be run by using the sql methods provided by sqlContext
    sql = "use tpcds_text_db_1_50"
    _ = sqlContext.sql(sql)
    output = execute_sql(query_name, sqlContext)
    output['describe'] = output['output'].describe().show()
    sc.stop()
    return output
def run(self): sc = SparkContext("local", "Course Activity") #sqlHC is the SQLHiveContext sqlHC = HiveContext(sc) lines=sqlHC.sql(""" select courseName,lmsUserId,createDateTime, eventType,eventName,eventNo from logdata where eventType not in ('enrollment','instructor','admin') and lmsUserId is not NULL and courseName is not NULL and eventNo is not NULL limit 10""") maplvl1=lines.flatMap(lambda p: mapp(p[0],str(p[1]),p[2].strftime('%Y-%m-%d'),p[4])) reduceRDD=maplvl1.reduceByKey(lambda a,b : a+b) with self.output().open('w') as out_file: for line in reduceRDD.collect(): out_file.write(line[0][0]+"\x01"+line[0][1]+"\x01"+line[0][2]+"\x01"+line[0][3]+"\x01"+str(line[1])+"\n")
def ch9_sql(): # Import Spark SQL from pyspark.sql import HiveContext, Row # Or if you can't include the hive requirements from pyspark.sql import SQLContext, Row hiveCtx = HiveContext(sc) input_file = hiveCtx.read.json("testweet.json") # Register the input_file schema RDD input_file.registerTempTable("tweets") # Select tweets based on the retweetCount topTweets = hiveCtx.sql("""SELECT text, retweetCount FROM tweets ORDER BY retweetCount LIMIT 10""") topTweetText = topTweets.map(lambda row: row.text) topTweetText.collect() topTweets.schema hiveCtx.cacheTable("tweets")
df_final = df_add_year.withColumn('Load_date', F.current_date())
# repartition returns a new DataFrame, so the result must be reassigned
df_final = df_final.repartition(10)

# Registering data frame as a temp table for SparkSQL
hive_ctx.registerDataFrameAsTable(df_final, "EMP_TEMP")

# Target Type: APACHE HIVE
# Database   : EMPLOYEES
# Table Name : EMPLOYEE_DIM
# + --------------------------------- +
# | COLUMN NAME | TYPE   | PARTITION  |
# + --------------------------------- +
# | EMP_NO      | INT    |            |
# | BIRTH_DATE  | DATE   |            |
# | FIRST_NAME  | STRING |            |
# | LAST_NAME   | STRING |            |
# | GENDER      | STRING |            |
# | HIRE_DATE   | DATE   |            |
# | SALARY      | INT    |            |
# | FROM_DATE   | DATE   |            |
# | TO_DATE     | DATE   |            |
# | YEAR        | INT    | PRIMARY    |
# | LOAD_DATE   | DATE   | SUB        |
# + --------------------------------- +
# Storage Format: ORC

# Inserting data into the target table
hive_ctx.sql("INSERT OVERWRITE TABLE EMPLOYEES.EMPLOYEE_DIM PARTITION (year, Load_date) \
            SELECT EMP_NO, BIRTH_DATE, FIRST_NAME, LAST_NAME, GENDER, HIRE_DATE, \
            SALARY, FROM_DATE, TO_DATE, year, Load_date FROM EMP_TEMP")
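# For reference, a hypothetical DDL matching the table layout described in the comments above.
# The original job only shows the insert, so the exact types and the dynamic-partition
# settings are assumptions.
hive_ctx.sql("""
    CREATE TABLE IF NOT EXISTS EMPLOYEES.EMPLOYEE_DIM (
        EMP_NO INT, BIRTH_DATE DATE, FIRST_NAME STRING, LAST_NAME STRING,
        GENDER STRING, HIRE_DATE DATE, SALARY INT, FROM_DATE DATE, TO_DATE DATE)
    PARTITIONED BY (year INT, Load_date DATE)
    STORED AS ORC""")
# Dynamic-partition inserts like the one above usually also need:
# hive_ctx.sql("SET hive.exec.dynamic.partition=true")
# hive_ctx.sql("SET hive.exec.dynamic.partition.mode=nonstrict")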
would be stored in the Hive tables.
Step 4. Then do a mapreduce to find the total access count and timeAccess for the video and
        the videoFrame, and write it to the MySQL Analytics summary table.
"""

"""
Step 1. Getting the video data from Hive.
"""
sqlVideo = (
    "SELECT orgname,coursename, videosysname, videolength, videoTitle FROM coursevideos where videosysname is not null"
)
videoslist = dict(sqlContext.sql(sqlVideo).map(lambda v: ((v[0], v[1], v[2]), (v[-2], v[-1]))).collect())

# Unless you actually collect() the data from the RDD, you can't operate on it locally.
# So, to actually use the data, call take(x) or collect() on the RDD first; otherwise the
# RDD transformations stay lazy and everything is only computed when the data is finally collected.
# 1. To iterate over the data like a list, do a collect() or take() first.
# 2. To access the elements of a Row object (the result of collect()), use normal subscripts.
#    But here, to make it easier, we decided to change the Rows to tuples in a dictionary so
#    that they can be accessed via moduleSysName or videoSysName.
# for video in videoslist:
#     print video, videoslist[video][0], videoslist[video][1]

"""
Step 2. Getting the event data from Hive.
"""
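# A small illustration of the collect() pattern described above, using a hypothetical events
# query; the table and column names are placeholders, not the original schema.
sqlEvents = "SELECT orgname, coursename, videosysname, eventtime FROM coursevideoevents"
events_rdd = sqlContext.sql(sqlEvents).map(lambda e: ((e[0], e[1], e[2]), 1))
# Transformations such as reduceByKey stay lazy...
event_counts_rdd = events_rdd.reduceByKey(lambda a, b: a + b)
# ...and nothing is computed until collect() pulls the result to the driver, where it can be
# turned into a plain dict keyed by (orgname, coursename, videosysname).
event_counts = dict(event_counts_rdd.collect())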
if s is None: return None try: f = float(s) return f except ValueError: return None ########## ############################## # Combine readmissions and effective_care # Transform readmissions df_readmissions = sqlContext.sql('select providerid, state, measureid, score from readmissions') readmissions = df_readmissions.map(lambda r: (r.providerid, r.state, r.measureid, getFloat(r.score) )) # Transform effective_care df_effective_care = sqlContext.sql('select providerid, state, measureid, score from effective_care') def getEffectiveCareScore(measureid, s): if measureid == "EDV": # The score of EDV measure has the following distinct values s = str(s).strip() if s == "Low (0 - 19,999 patients annually)": return 25. elif s == "Medium (20,000 - 39,999 patients annually)": return 50. elif s == "High (40,000 - 59,999 patients annually)":
# coding: utf-8 from pyspark import SparkConf, SparkContext from pyspark.sql import HiveContext from pyspark.sql import StructType, StructField, IntegerType, FloatType, StringType conf = SparkConf().setAppName("spark_sql_datatype_struct") sc = SparkContext(conf=conf) hc = HiveContext(sc) source = sc.parallelize([((1, 2.0, "3.0"),)]) schema = StructType([StructField("struct", StructType([StructField("first", IntegerType(), False), StructField( "second", FloatType(), False), StructField("third", StringType(), False)]), False)]) table = hc.applySchema(source, schema) table.registerAsTable("temp_table") rows = hc.sql( "select struct.first, struct.second, struct.third from temp_table").collect() sc.stop() for row in rows: print row
from pyspark import SparkContext
sc = SparkContext("local", "best_hospitals")

from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

# Select the top 10 hospitals by average avgscore
# Please note that we filter out those hospitals not qualified for evaluation
df_top10_hospitals = sqlContext.sql("select Q.providerid as id, AVG(Q.normalizedscore) as avgscore \
    from total_quality Q join hospitals_qualified H on Q.providerid = H.providerid \
    where Q.normalizedscore is not null and H.qualified = true \
    group by Q.providerid \
    order by avgscore DESC").limit(10)

# Join with hospitals_qualified to get the hospital name and state
# Note: couldn't figure out how to do it in the above select statement (together with Group By) in one shot! :-(
df_hospitals = sqlContext.table("hospitals_qualified")
df_top10_hospitals_full = df_top10_hospitals.join(df_hospitals, df_top10_hospitals.id == df_hospitals.providerid).\
    select(df_hospitals.providerid, df_hospitals.hospitalname, df_hospitals.state, df_top10_hospitals.avgscore)
df_top10_hospitals_full = df_top10_hospitals_full.orderBy(df_top10_hospitals_full.avgscore.desc())

# Save it as a table
df_top10_hospitals_full.registerTempTable("df")
sqlContext.sql("drop table if exists top_10_hospitals")
sqlContext.sql("CREATE TABLE top_10_hospitals AS SELECT * FROM df")

print
print "Top 10 hospitals"
print

rank = 1
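# Sketch of the single-statement variant the note above wishes for: grouping on the extra
# columns lets the name and state come back with the average in one query. It assumes
# hospitalname and state are unique per providerid in hospitals_qualified.
df_top10_oneshot = sqlContext.sql("""
    select H.providerid, H.hospitalname, H.state, AVG(Q.normalizedscore) as avgscore
    from total_quality Q
    join hospitals_qualified H on Q.providerid = H.providerid
    where Q.normalizedscore is not null and H.qualified = true
    group by H.providerid, H.hospitalname, H.state
    order by avgscore DESC
""").limit(10)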
self.saveWeather(updates) print "updates=", updates if __name__ == '__main__': print "Starting.", datetime.now() sc = SparkContext() #sqlContext = SQLContext(sc) sqlContext = HiveContext(sc) games = sqlContext.read.parquet(CreateStatsRDD.rddDir + "/" + Games.table_name + ".parquet") games.registerTempTable("games") games.cache() print "games=", games print games.take(2) stadium = sqlContext.load(source="com.databricks.spark.csv", header="true", path = SRCDIR + "/stadium.csv") stadium.registerTempTable("stadium") stadium.cache() print "stadium=", stadium.take(2) weather = UpdateWeather(sc, sqlContext, games, stadium) weather.update() badIds = sqlContext.sql("select game_id, count(*) from weather group by game_id having count(*) >1").collect() if len(badIds) > 0: print "BAD weather game_ids=", badIds else: print "no bad weather ids." sc.stop()
""" # TODO: refactor this into a utility function and update jobs # to always UTF8 encode mapper keys. if len(values) > 1: return tuple([value.encode('utf8') for value in values]) else: return values[0].encode('utf8') sc = SparkContext("local", "Course Activity") #sqlHC is the SQLHiveContext sqlHC = HiveContext(sc) lines=sqlHC.sql(""" select courseName,lmsUserId,createDateTime, eventType,eventName,eventNo from logdata where eventType not in ('enrollment','instructor','admin') and lmsUserId is not NULL and courseName is not NULL and eventNo is not NULL limit 100""") maplvl1=lines.flatMap(lambda p: mapp(p[0],str(p[1]),p[2].strftime('%Y-%m-%d'),p[4])) for linet in maplvl1.collect(): print linet reduceRDD = maplvl1.reduceByKey(lambda a, b : a + b) fo = open("tester","w") for line in reduceRDD.collect(): fo.write(line[0][0]+"\x01"+line[0][1]+"\x01"+line[0][2]+"\x01"+line[0][3]+"\x01"+str(line[1])+"\n")
date, close FROM eod_rel """) rdd_sum = df.rdd.groupBy(lambda x: x.symbol)\ .map(lambda x: (x[0], list(x[1])))\ .flatMapValues(lambda x: cal_per(x))\ .map(lambda x: x[1])\ .groupBy(lambda x: x["date"])\ .map(lambda x: (x[0], list(x[1])))\ .mapValues(lambda x: cal_sum_per(x))\ .map(lambda x: x[1]) for each in rdd_sum.collect(): print each if __name__ == "__main__": conf = SparkConf() conf.set("spark.executor.instances", "4") conf.set("spark.executor.cores", "4") conf.set("spark.executor.memory", "8g") sc = SparkContext(appName="bintrade.post.index", conf=conf) sql_context = HiveContext(sc) sql_context.sql("use fex") main(sc, sql_context) sc.stop()
from pyspark.storagelevel import StorageLevel if __name__ == "__main__": sc = SparkContext(appName="TweetsConversationHierarchyBuilder") sqlContext = HiveContext(sc) toplevel = sqlContext.sql(""" select distinct tt.id,tt.in_reply_to_status_id from (select t.id,t.in_reply_to_status_id from wrd10_socialmedia.fact_orc_raw_tweets t join wrd10_socialmedia.fact_orc_raw_tweets t1 on t.id = t1.in_reply_to_status_id where t.in_reply_to_status_id is null and t.country='Nigeria' and t1.country='Nigeria' union all select t2.id,t2.in_reply_to_status_id from wrd10_socialmedia.fact_orc_raw_twitter_userstream t2, wrd10_socialmedia.fact_orc_raw_twitter_userstream t3, wrd10_socialmedia.fact_orc_raw_tweets t4 where t2.in_reply_to_status_id is null and ((t2.id = t3.in_reply_to_status_id) or (t2.id=t4.in_reply_to_status_id)) and t2.country='Nigeria' and t3.country='Nigeria' and t4.country='Nigeria' ) tt """).persist(StorageLevel.MEMORY_AND_DISK) replies = sqlContext.sql(""" select distinct tt.id,tt.in_reply_to_status_id,tt.text,tt.user_name,tt.created_at,tt.country from (select t.id,t.in_reply_to_status_id,t.text,t.user_name,t.created_at,t.country from wrd10_socialmedia.fact_orc_raw_tweets t where t.in_reply_to_status_id is not null and t.country='Nigeria'
# Test reading from S3
print("Prepping hadoop for reading S3 file")
# hadoopConf = SparkContext.hadoopConfiguration
# sc.hadoopConfiguration.set("fs.s3n.awsAccessKeyId", "AKIAIJCHQUJC4PMAILDA")
# sc.hadoopConfiguration.set("fs.s3n.awsSecretAccessKey", "/home/pshvets/Spark_SQL/ricaws.pem")
# can contain "/"
myRDD = sc.textFile("s3n://hive-qs-data/employee.csv")
s3file = myRDD.count()
print("File count on S3: ", str(s3file))
for rec in myRDD.collect():
    print("Reading S3 file: ", rec)
print("Done with reading file from S3")

###############################################################
# Create customer-specific Hive database.
sqlContext.sql("CREATE DATABASE IF NOT EXISTS peter_hive_db")

# Create pool for parallel processing
pool = Pool()

# Get environment variables
app_variables = get_app_variables()

# Compile a list of all property files under $SPARK_ETL_CONF_DIR folder
path = app_variables.get('SPARK_ETL_CONF_DIR')
prop_files = [os.path.join(path, fileiter) for fileiter in os.listdir(path) if fileiter.endswith('.json')]
print(prop_files)

# Data Extract
if __name__ == "__main__":
    # Execute core functionality. Iterate over all property files in the Spark-ETL config directory
    for prop_fileiter in prop_files:
convertRDD = hc.sql( "select col1, col2, col3 from temp_source").map(convert) mytable = hc.inferSchema(convertRDD) mytable.registerTempTable("temp_mytable") """ def convert(val): return val.upper() hc.registerFunction("temp_convert", convert) convertRDD = hc.sql( "select temp_convert(col1) as col1, col2, col3 from temp_source") convertRDD.registerAsTable("temp_mytable") hc.cacheTable("temp_mytable") def printRows(rows): for row in rows: print row datas = hc.sql("select * from temp_mytable").collect() printRows(datas)
# Creates a hive table and loads an input file into it
# For input you can use examples/src/main/resources/kv1.txt from the spark
# distribution
from pyspark import SparkContext
from pyspark.sql import HiveContext
import json
import sys

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Error usage: LoadHive [sparkmaster] [inputFile] [inputtable]"
        sys.exit(-1)
    master = sys.argv[1]
    inputFile = sys.argv[2]
    inputTable = sys.argv[3]
    sc = SparkContext(master, "LoadHive")
    hiveCtx = HiveContext(sc)
    # Load some data into hive
    hiveCtx.sql(
        "CREATE TABLE IF NOT EXISTS " + inputTable + " (key INT, value STRING)")
    hiveCtx.sql(
        "LOAD DATA LOCAL INPATH '" + inputFile + "' INTO TABLE " + inputTable)
def main(): # set up the logger logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'mpg_cluster.log'), level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S') logger = logging.getLogger(__name__) # NSJOIN dayidx # only partitioned by DAY day_idx = beeline.get_last_partitions('mapper.nsjoin').split('=')[1] # BAREBONES dayidx # only partitioned by DAY day_bb = [x for x in beeline.show_partitions('mapper.barebones').split('\n') if '=%s' % (day_idx) in x] # MAPPOINTS dayidx # partitioned by DAY and UUID (pick the last uuid) mappoints_data = sorted([x for x in beeline.show_partitions('mapper.mappoints').split('\n') if '=%s' % (day_idx) in x])[-1].split('/') [day_mps, uuid_idx] = [x.split('=')[1] for x in mappoints_data] if day_idx != day_mps: logger.error('mapper.mappoints and mapper.nsjoin different day, possible data missing in the source.') return if len(day_bb) == 0: logger.warning('mapper.barebone data missing for this particular day.') #return logger.info('Processing data in day=%s, uuid=%s' % (day_idx, uuid_idx)) logger.info('begin spark process.') getting_mappoint_data = ''' select b1.mpgid mpgid, b1.lat lat, b1.lon lon, b1.country country, b1.mpgload mpgload, b1.allowed_private_regions allowed_private_regions, b2.asnum asnum, b2.ip ip from (select mpgid, lat, lon, country, mpgload, allowed_private_regions from mapper.mappoints where day=%s and uuid="%s" and lat is not NULL and lon is not NULL and ghostonly=0 ) b1 left outer join (select collect_set(ns_ip) ip, collect_set(asnum) asnum, mpgid from (select ns_ip, mpd_uuid, mpgid, asnum, demand, day from mapper.nsjoin where day=%s and mpd_uuid="%s" and demand>0.01 order by demand desc) a group by mpgid) b2 on b2.mpgid=b1.mpgid ''' % (day_idx, uuid_idx, day_idx, uuid_idx) geo_total_cap_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='private' group by country, network) a ''' % day_idx geo_total_cap_public_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='public' group by country, network) a ''' % day_idx sc = SparkContext() hiveCtx = HiveContext(sc) rows = hiveCtx.sql(getting_mappoint_data) regInfoRows = hiveCtx.sql('select * from mapper.regioncapday where day=%s and peak_bitcap_mbps is not null and peak_flitcap_mfps is not null' % (day_idx)) geo_total_cap = hiveCtx.sql(geo_total_cap_query) geo_total_cap_p = hiveCtx.sql(geo_total_cap_public_query) # rdd format: [regionid, [mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load, mpg-asnum, mpg-nsip]] region_mpginfo_pair = rows.map(lambda x: [[x.mpgid, x.lat, x.lon, x.country, x.mpgload, x.asnum, x.ip], x.allowed_private_regions])\ .flatMapValues(lambda x: x).map(lambda x: [x[1], x[0]]) #region_mpginfo_pair.first() # rdd format: [regionid, [reg-lat, reg-lon, reg-capacity(bit mbps), reg-capacity(bit mfps), reg-country, reg-numvips, reg-service, reg-prp]] # ps. 
prp=1: private, prp=0: public region_latlon = regInfoRows.map(lambda x: [x.region, [x.latitude, x.longitude, x.peak_bitcap_mbps, x.peak_flitcap_mfps, x.country, x.numvips, 'W' if x.network=='freeflow' else ('S' if x.network=='essl' else 'O'), 1 if x.prp=='private' else 0]])\ .filter(lambda x: x[1][6]=='W' or x[1][6]=='S') region_public_list = region_latlon\ .filter(lambda x: x[1][7] == 0)\ .map(lambda x: ('all', [[x[0]]]))\ .reduceByKey(lambda a, b: [a[0]+b[0]])\ .map(lambda x: x[1][0]).collect() region_public_list = [0] + sorted(region_public_list[0]) # dummy region rdd2 = sc.parallelize([([0, [0, 0, 0.0, 0.0, 'US', 0, 'W', 1]])]) region_latlon = region_latlon.union(rdd2) # perform the join into tuple of (K, (V1, V2): # (regionid, ([mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load], [reg-lat, reg-lon, reg-cap, reg-country, reg-numvips, reg-service])) # rdd = (mpgid, regionid, [lat1, lon1, lat2, lon2, distance], # reg-cap-bit(gbps), reg-cap-flit(gbps), reg-country, reg-numvips, reg-services, # mpg-country, mpg-load, mpg-asnum, mpg-nsip, # mpg-lat, mpg-lon) mpgid_reg_geo = region_mpginfo_pair.join(region_latlon).map(lambda x: [x[1][0][0], x[0], geodesic_distance(x[1][0][1], x[1][0][2], x[1][1][0], x[1][1][1]), round(float(x[1][1][2])/1000.0, 3), round(float(x[1][1][3])/1000.0, 3), x[1][1][4], # reg-country x[1][1][5], # reg-numvips x[1][1][6], # reg-services x[1][0][3], x[1][0][4], x[1][0][5], x[1][0][6], x[1][0][1], x[1][0][2]]) # filtering on mapping distance < 500 miles # filtering on reg-country = mpg-country # filtering on region capacity fbps > 1Gbps # rdd format = (mpgid, [[regionid], distance, [capacity-w, capacity-s], numvips, 1, mpg-country, mpg-load, mpg-asnum, mpg-nsip, # mpg-lat, mpg-lon]) #mpgid_reg_distance = mpgid_reg_geo.filter(lambda x: x[2][4] < 500)\ # .filter(lambda x: x[5] == x[8])\ # .filter(lambda x: x[3] > 1)\ # .map(lambda x: (x[0], [[x[1]], x[2][4], [x[3], 0] if x[7]=='W' else [0, x[3]], x[6], 1, x[8], x[9], x[10], x[11], x[12], x[13]])) # or this one, no-same-country constraint: mpgid_reg_distance = mpgid_reg_geo.filter(lambda x: (x[2][4] < 500) or (x[5]==x[8] and x[2][4] < 1000))\ .filter(lambda x: x[3] > 1)\ .map(lambda x: (x[0], [[x[1]], x[2][4], [x[3], 0] if x[7]=='W' else [0, x[3]], x[6], 1, x[8], x[9], x[10], x[11], x[12], x[13]])) #mpgid_reg_distance.first() # group by mpgid # rdd format = (mpgid, [[reg-list], # avg_distance, # total_cap freeflow, # total_cap essl, # total num vips, # rg_count, # mpg-country, # mpg-load, # [mpg-asnum], # [mpg-nsip]) mpgid_reglist_avgDistance_capacity_nReg = mpgid_reg_distance\ .reduceByKey(lambda a, b: [a[0]+b[0], a[1]+b[1], [a[2][0]+b[2][0], a[2][1]+b[2][1]], a[3]+b[3], a[4]+b[4], a[5], a[6], a[7], a[8], a[9], a[10]])\ .map(lambda x: (x[0], [sorted(x[1][0]), # region_list round(x[1][1]/x[1][4], 2), # avg distance round(x[1][2][0], 2), # total capacity - w round(x[1][2][1], 2), # total capacity - s x[1][3], # numvips x[1][4], # total region count x[1][5], # mpg country x[1][6], # mpg load x[1][7], # mpg asnum x[1][8], # mpg nsip x[1][9], # mpg lat x[1][10]])) # mpg lon # disable the count #total_mpg_with_region = mpgid_reglist_avgDistance_capacity_nReg.count() # rdd format = (reg, [(reg-list), [[mpg-list], avg_distance, total_cap_w, total_cap_s, total_numvips # reg-count, cluster_country, mpg-load, mpg-count, mpg-lat, mpg-lon]]) reg_reglist_mpgid_avgDistance_capacity_nReg_country = mpgid_reglist_avgDistance_capacity_nReg\ .map(lambda x: (tuple(x[1][0]), [[x[0]], # mpgid list x[1][1], # avg_distance x[1][2], # region total 
                         x[1][3],       # region total capacity essl
                         x[1][4],       # total num vips
                         x[1][5],       # total region count
                         [x[1][6]],     # mpg country list
                         x[1][7],       # mpg load
                         1,             # mpg-count
                         x[1][8] if x[1][8] else [],   # [mpg-asnum]
                         x[1][9] if x[1][9] else [],   # [mpg-nsip]
                         [x[1][10]],    # [mpg-lat]  # single element array
                         [x[1][11]],    # [mpg-lon]  # single element array
                         [x[1][7]]      # [mpg-load] # single element array
                         ]))\
        .reduceByKey(lambda a, b: [a[0]+b[0], a[1], a[2], a[3], a[4], a[5],
                                   a[6]+b[6], a[7]+b[7], a[8]+b[8], a[9]+b[9],
                                   a[10]+b[10], a[11]+b[11], a[12]+b[12], a[13]+b[13]])\
        .filter(lambda x: sum(x[1][13]) > 0.0001)\
        .map(lambda x: (x[0], [sorted(x[1][0]),   # mpgid list
                               x[1][1],           # avg_distance
                               x[1][2],           # reg-cap-w
                               x[1][3],           # reg-cap-s
                               x[1][4],           # numvips
                               x[1][5],           # reg-count
                               [str(y) for y in sorted(list(set(x[1][6])))],    # mpg-country list
                               x[1][7],           # mpg-load
                               x[1][8],           # mpg-count
                               [str(y) for y in sorted(list(set(x[1][9])))],    # [mpg-asnum]
                               [str(y) for y in sorted(list(set(x[1][10])))],   # [mpg-nsip]
                               geo_centroid(x[1][11], x[1][12], x[1][13])       # [mpg: lat, lon, por, porsigma]
                               ]))\
        .map(lambda x: ([':'.join([str(y) for y in list(x[1][6])]),   # [mpg-country list]
                         x[1][1],   # avg_distance
                         x[1][2],   # reg-cap-w
                         x[1][3],   # reg-cap-s
                         x[1][4],   # numvips
                         x[1][5],   # reg-count
                         x[1][7],   # mpg-load
                         x[1][8],   # mpg-count
                         ':'.join([str(y) for y in x[0]]),            # [region-list]
                         ':'.join([str(y) for y in list(x[1][0])]),   # [mpg-list]
                         ':'.join([str(y) for y in x[1][9]]) if len(x[1][9]) > 0 else 'NULL',     # [mpg-asnum]
                         ':'.join([str(y) for y in x[1][10]]) if len(x[1][10]) > 0 else 'NULL',   # [mpg-nsip]
                         x[1][11]   # [mpg-lat, mpg-lon, mpg-por, mpg-porsigma]
                         ], region_public_list))\
        .flatMapValues(lambda x: x)\
        .map(lambda x: [x[1], x[0]])

    reglist_mpgid_avgDistance_capacity_nReg_country = reg_reglist_mpgid_avgDistance_capacity_nReg_country\
        .join(region_latlon)\
        .map(lambda x: [x[1][0]]+[x[1][1]]+[geodesic_distance(x[1][0][12][0],
                                                              x[1][0][12][1],
                                                              x[1][1][0],
                                                              x[1][1][1])]+[x[0]] if x[0] > 0
             else [x[1][0]]+[x[1][1]]+[[x[1][0][12][0], x[1][0][12][1], x[1][1][0], x[1][1][1], 0.0]]+[x[0]])\
        .filter(lambda x: x[2][4] < 500)\
        .map(lambda x: (tuple([x[0][0], x[0][1], x[0][2], x[0][3], x[0][4], x[0][5], x[0][6], x[0][7],
                               x[0][8], x[0][9], x[0][10], x[0][11],
                               x[0][12][0], x[0][12][1], x[0][12][2], x[0][12][3]]),   # mpg-information
                        [x[1][2],   # pub.region.cap.ff
                         x[1][3],   # pub.region.cap.essl
                         x[1][5],   # pub.region.vip
                         [x[3]]     # single element region id
                         ]))\
        .reduceByKey(lambda a, b: [a[0]+b[0],   # sum( pub.region.cap.ff )
                                   a[1]+b[1],   # sum( pub.region.cap.essl )
                                   a[2]+b[2],   # sum( pub.region.cap.vip )
                                   a[3]+b[3]    # [pub.regions]
                                   ])\
        .map(lambda x: [x[0][0],    # [mpg-country-list]
                        x[0][1],    # avg-distance
                        x[0][12],   # mpg-lat
                        x[0][13],   # mpg-lon
                        x[0][14],   # mpg-por
                        x[0][15],   # mpg-porsigma
                        x[0][2],    # pri.region.cap.ff (gbps)
                        x[0][3],    # pri.region.cap.essl (gbps)
                        x[0][4],    # pri.vips
                        x[0][5],    # pri.region.count
                        round(float(x[1][0])/1000.0, 3),   # pub.region.cap.ff (gbps)
                        round(float(x[1][1])/1000.0, 3),   # pub.region.cap.essl (gbps)
                        x[1][2],    # pub.vips
                        len(x[1][3])-1,                    # pub.region.count
                        x[0][6],    # mpg-load
                        round(x[0][7], 6),                 # mpg-count
                        x[0][8],    # [pri reg-list]
                        ':'.join([str(y) for y in sorted(x[1][3])][1:]) if len(x[1][3]) > 1 else 'NULL',   # [pub reg-list]
                        x[0][9],    # [mpg-list]
                        x[0][10],   # [mpg-asnum]
                        x[0][11]    # [mpg-nsip]
                        ])

    # data exporting to local
    country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist = pd.DataFrame(
        columns=['cl_geoname', 'cl_avgDistance', 'cl_lat', 'cl_lon', 'cl_por', 'cl_porsigma',
                 'pri_cap_ff_gbps', 'pri_cap_essl_gbps', 'pri_nvips',
                 'pri_nReg', 'pub_cap_ff_gbps', 'pub_cap_essl_gbps', 'pub_nvips', 'pub_nReg',
                 'cl_mpgLoad', 'cl_nMpg', 'pri_regList', 'pub_regList',
                 'mpgList', 'mpgASList', 'mpgNSIPList'])

    geo_cluster_full_info = reglist_mpgid_avgDistance_capacity_nReg_country.collect()
    logger.info('begin write to local disk.')
    for item in range(len(geo_cluster_full_info)):
        temp = geo_cluster_full_info[item]
        country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist.loc[item] = temp
        # the above should be temp[1][0] for the mpglist

    data_folder = '/home/testgrp/MRQOS/project_mpd_clustering/data'
    filename = 'geo_full_cluster_info.%s.%s.csv' % (day_idx, uuid_idx)
    fileDestination = os.path.join(data_folder, filename)
    country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist.to_csv(fileDestination,
                                                                          sep=',',
                                                                          index=False,
                                                                          header=False)

    logger.info('begin to upload to hdfs.')
    tablename = 'mrqos.mpg_cluster'
    hdfs_d = os.path.join(config.hdfs_table,
                          'mpg_cluster',
                          'datestamp=%s' % day_idx,
                          'uuid=%s' % uuid_idx)
    partition = '''datestamp=%s, uuid='%s' ''' % (day_idx, uuid_idx)
    processed_filename = '.'.join(filename.split('.')[0:-1]) + '.processed.csv'
    cmd_str = ''' cat %s | awk -F, '{n=split($21,a,":"); if(n>5){$21=a[1]":"a[2]":"a[3]":"a[4]":"a[5];} m=split($20,b,":"); if(m>5){$20=b[1]":"b[2]":"b[3]":"b[4]":"b[5];}print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$20,$21;}' > %s ''' % (os.path.join(data_folder, filename), os.path.join(data_folder, processed_filename))
    sp.check_call(cmd_str, shell=True)

    try:
        beeline.upload_to_hive(fileDestination, hdfs_d, partition, tablename, logger)
        # os.remove(fileDestination)
    except sp.CalledProcessError as e:
        logger.info('upload to HDFS + update Hive table failed.')
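# The helper functions geodesic_distance() and geo_centroid() used in main() above are defined
# elsewhere and are not included in this document. As a point of reference only, here is a
# minimal haversine-based sketch of geodesic_distance(), written to match how its result is
# consumed above (element [4] is a distance compared against 500, assumed to be miles, and the
# fallback branch builds [lat1, lon1, lat2, lon2, 0.0] in the same shape). This is an assumption,
# not the original implementation.
from math import radians, sin, cos, asin, sqrt

def geodesic_distance(lat1, lon1, lat2, lon2):
    # convert degrees to radians
    rlat1, rlon1, rlat2, rlon2 = [radians(float(v)) for v in (lat1, lon1, lat2, lon2)]
    # haversine formula; 3958.8 is an approximate Earth radius in miles
    a = sin((rlat2 - rlat1) / 2.0) ** 2 + cos(rlat1) * cos(rlat2) * sin((rlon2 - rlon1) / 2.0) ** 2
    distance = 2.0 * 3958.8 * asin(sqrt(a))
    return [lat1, lon1, lat2, lon2, round(distance, 2)]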
from pyspark.sql import HiveContext
from pyspark import SparkConf
from pyspark import SparkContext
from operator import add

conf = SparkConf().setAppName("EDF").setMaster("local")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)

hiveContext.sql("use db_edf")
df1 = hiveContext.sql("select site_id as id, timestp as tp, consumption as cp from consumption")
df2 = hiveContext.sql("select site_id as id, industry as indus from sites")

df3 = df1.join(df2, df1.id == df2.id).groupBy(["tp", "indus"]).mean("cp")
df3.show()

df4 = df3.select(df3.tp, df3.indus, df3["AVG(cp)"].alias("mean_cp")).orderBy("tp")
df4.show()

df4.registerAsTable("tmp_table")
hiveContext.sql("create table final_cdc_mean_by_inustry_each_5min_with_date as "
                "select from_unixtime(tp) as date_time, tp, indus, mean_cp from tmp_table")
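# Note on the aggregation above: addressing the generated "AVG(cp)" column by name is fragile
# across Spark versions. A sketch of an equivalent formulation with explicit DataFrame functions
# (assuming Spark 1.5+, where avg() and from_unixtime() are available in pyspark.sql.functions):
from pyspark.sql.functions import avg, from_unixtime

df3_alt = (df1.join(df2, df1.id == df2.id)
              .groupBy("tp", "indus")
              .agg(avg("cp").alias("mean_cp"))
              .orderBy("tp"))
df3_alt.select(from_unixtime("tp").alias("date_time"), "tp", "indus", "mean_cp").show()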
print "Usage: spark-submit <Python Code File>" sys.exit(1) #App name which shows up in the Spark UI sc = SparkContext(appName='User Recommendation') #Context provides connection to Hive metastore sqlContext = HiveContext(sc) ''' Pulling data out of Hive. I created a relication of 'watson_bisum_purchases' table locally to test. ''' rdd = sqlContext.sql("SELECT person_id,deal_id,aasm_state FROM watson_bisum_purchases") ''' Creating datasets. Formating the data and also creating sample datasets in order to create and test the model. ''' #Formating all the data using the 'parse_rating' method above all_data = rdd.map(parse_rating) rec_list = sc.parallelize(all_data.collect()) #Grabbing all Unique Users(used for building recommendation list) users = rdd.groupBy(lambda x: x.person_id).map(lambda x: x[0]).collect() #Grabbing all Unique Deals/Products(used for building recommendation list)
# import dependencies
import pyspark
from pyspark.sql import HiveContext
import sys
import os


# zip rows (lists)
def parse_list(p):
    if p.Line is not None:
        return zip(p.Line, p.Latitude, p.Longitude, p.RecordedAtTime, p.vehicleID, p.Trip,
                   p.TripDate, p.TripPattern, p.MonitoredCallRef, p.DistFromCall,
                   p.CallDistAlongRoute, p.PresentableDistance)
    else:
        return []


if __name__ == '__main__':
    sc = pyspark.SparkContext()
    sqlContext = HiveContext(sc)

    # read multiple JSON files
    bus_file = 'BusTime/2015_*.jsons'
    bus = sqlContext.read.json(bus_file)
    # register as a table in order to use SQL
    bus.registerTempTable("bus")

    # read the SQL query from the second-to-last command-line argument
    with open(sys.argv[-2]) as fr:
        query = fr.read()

    # run the query, flatten each row into tuples, and save as comma-separated text
    sqlContext.sql(query).flatMap(parse_list).map(lambda x: ",".join(map(str, x))).saveAsTextFile(sys.argv[-1])
def main(args):
    """ Main code for relevance computation """
    start_time = time.time()

    # iq (code snippets that set the properties below have been removed;
    # empty strings are used here as placeholders)
    driver = ''
    url = ''
    username = ''
    password = ''
    inputs = [driver, url, username, password]

    filename = str(args[0])
    if os.path.exists(filename):
        pass
    else:
        sys.exit("Input file %s not found" % filename)

    file = open(filename, 'r')
    for line in file:
        key, val = line.split(",")
        if str(key).strip() == "dbalias":
            dbalias = str(val).strip()
        elif str(key).strip() == "numpartitions":
            numpartitions = int(val)
        elif str(key).strip() == "datadir":
            datadir = str(val).strip()
        else:
            print("Invalid key not set: %s" % str(key))

    # Need to make sure that the datadir variable is set.
    try:
        print("datadir = '%s' " % datadir)
    except NameError:
        sys.exit("'datadir' variable not set. Check input file '%s'" % filename)

    # Spark and Hive contexts
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    df = utils.returnSparkDF(SQLContext(sc), inputs, "traffic")
    if df is None:
        sys.exit("'traffic' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "uniquedata")
    df = None

    df = utils.returnSparkDF(SQLContext(sc), inputs, "fbtraffic")
    if df is None:
        sys.exit("'fbtraffic' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "uniqueFBdata")
    df = None

    statement = "Select ud.loginid, ud.adid, ud.Type, ufd.Type as FBType "\
                "from uniquedata ud left outer join uniqueFBdata ufd "\
                "on ud.loginid = ufd.loginid and ud.adid = ufd.adid"
    adswithFBjoined = sqlContext.sql(statement)
    adswithFBjoined_cleaned = adswithFBjoined[adswithFBjoined['FBType'].isNull()]
    adswithFBjoined_cleaned = adswithFBjoined_cleaned.drop('FBType')
    sqlContext.registerDataFrameAsTable(adswithFBjoined_cleaned, "data")

    statement = "Select loginid, count(loginid) as viewcount from data group by loginid"
    temp = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(temp, "viewdata")

    statement = "Select d.* from data d, viewdata vd where d.loginid = vd.loginid and vd.viewcount > 1"
    temp2 = sqlContext.sql(statement)
    sqlContext.sql("drop table data")
    sqlContext.registerDataFrameAsTable(temp2, "data")
    temp, temp2 = (None, None)

    df = utils.returnSparkDF(SQLContext(sc), inputs, "agent")
    if df is None:
        sys.exit("'agent' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "agentdata")

    statement = "select loginid, adid, Type, count(adid) as counter from agentdata group by loginid, adid, Type"
    unique_adid_per_loginid = sqlContext.sql(statement)
    unique_adid_per_loginid = unique_adid_per_loginid.drop('counter')
    sqlContext.registerDataFrameAsTable(unique_adid_per_loginid, "agentdata")

    df = utils.returnSparkDF(SQLContext(sc), inputs, "favorite")
    if df is None:
        sys.exit("'favorite' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "favdata")
    df = None

    statement = "select * from data union all select * from agentdata union all select * from favdata"
    df2 = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(df2, "uniondata")
    df2 = None

    statement = "select loginid, max(Type) as UserMaxConversion from uniondata group by loginid"
    maxtype = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(maxtype, "maxconversiondata")

    statement = "select uniondata.loginid, uniondata.adid, uniondata.Type "\
                "from uniondata, maxconversiondata where uniondata.loginid = maxconversiondata.loginid "\
                "and uniondata.Type = maxconversiondata.UserMaxConversion"
    data = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(data, "data")

    # Delete tables
    tables = ["uniquedata", "FBdata", "uniqueFBdata", "agentdata", "favdata", "uniondata", "maxconversiondata"]
    for table in tables:
        sqlContext.sql("drop table if exists %s" % str(table))

    df = utils.returnSparkDF(SQLContext(sc), inputs, "adclassified")
    if df is None:
        sys.exit("'adclassified' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "addata")
    df = None

    df = utils.returnSparkDF(SQLContext(sc), inputs, "geo")
    if df is None:
        sys.exit("'geo' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "geodata")
    df = None

    statement = "select addata.adid, addata.AskingPrice, addata.CollectiveDebt, "\
                "addata.PageViewCount, geodata.Municipal, geodata.CityPart "\
                "from addata, geodata where addata.locationkey = geodata.locationkey"
    addata_for_join = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(addata_for_join, "adtemp")

    statement = "select * from adtemp where PageViewCount < 10000"
    addata_for_join = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(addata_for_join, "addata_for_join")
    data, addata_for_join = (None, None)

    sqlContext.sql("drop table if exists addata")

    statement = "select a.*, b.AskingPrice, b.CollectiveDebt, b.PageViewCount, b.Municipal, b.CityPart "\
                "from data a, addata_for_join b where a.adid = b.adid"
    data = sqlContext.sql(statement)
    data = data.fillna(0)
    data = data.repartition(numpartitions)

    # Save the files as csv using spark-csv from databricks
    try:
        st = time.time()
        data.write.format("com.databricks.spark.csv").save(datadir, mode="overwrite", codec="bzip2")
        et = time.time()
        print("File save time was: %.2f mins." % ((et-st)/60.))
    except:
        sys.exit("Could not save files to dir '%s'. \n\nError = %s" % (datadir, sys.exc_info()[1]))
    finally:
        end_time = time.time()
        print("Spark ETL execution time = %.2f mins." % ((end_time-start_time)/60.))

    # Stop spark and continue using in-memory computation (another script)
    sc.stop()
    return
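# The utils.returnSparkDF() helper used throughout main() is not shown in this document.
# Below is a hypothetical sketch of such a helper, assuming each query name maps to a SQL
# statement run against the JDBC source described by `inputs`; the query texts and table
# names are placeholders, not the real ones.
def returnSparkDF(sql_context, inputs, queryname):
    driver, url, username, password = inputs
    queries = {
        "traffic": "(select loginid, adid, Type from traffic_log) t",       # placeholder
        "fbtraffic": "(select loginid, adid, Type from fb_traffic_log) t",  # placeholder
    }
    if queryname not in queries:
        return None
    try:
        # read the mapped query through the JDBC data source
        return (sql_context.read.format("jdbc")
                .options(url=url, driver=driver, dbtable=queries[queryname],
                         user=username, password=password)
                .load())
    except Exception:
        return None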
# Output of values: Row(_c0=u'1', _c1=u'2', _c2=u'3.0'); the data types are all inferred as
# "int" -- in other words, the array elements must share one data type, otherwise an
# exception may be raised.
"""

"""
source = sc.parallelize(['{"key" : [1, 2 , 3.0]}'])
jsonRDD = hc.jsonRDD(source)
jsonRDD.registerTempTable("temp_table")
values = hc.sql("select key[0], key[1], key[2] from temp_table").collect()
# Output of values: Row(_c0=1.0, _c1=2.0, _c2=3.0); the data types are all inferred as "float".
"""

source = sc.parallelize(
    ['{"key" : [{"key1" : "value1", "key2" : [1, 2, 3], "key3" : [{"key4" : "value4", "key5" : [4, 5.0, 6]}]}]}'])
jsonRDD = hc.jsonRDD(source)
jsonRDD.registerTempTable("temp_table")
values = hc.sql(
    "select key[0].key1, key[0].key2[0], key[0].key3[0].key4, key[0].key3[0].key5[1] from temp_table").collect()

sc.stop()

for value in values:
    print value
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import col, sum
from pyspark.mllib.evaluation import MulticlassMetrics
from copy import deepcopy

sc = SparkContext()
sqlContext = HiveContext(sc)

qry = """SELECT *,
                white/population as white_percent,
                black/population as black_percent,
                asian/population as asian_percent,
                pacific_islander/population as pi_percent,
                other_race/population as other_race_percent,
                multiple_race/population as multiple_percent,
                hispanic/population as hispanic_percent
         FROM census_rest_success
         where days_open > 365"""
df = sqlContext.sql(qry)

## Let's train a Support Vector Classifier on this data

# CITATION:
# http://stackoverflow.com/questions/33900726/count-number-of-non-nan-entries-in-each-column-of-spark-dataframe-with-pyspark
def count_not_null(c):
    return sum(col(c).isNotNull().cast("integer")).alias(c)

exprs = [count_not_null(c) for c in df.columns]
df.agg(*exprs).show()

df = df.dropna()

features = df.select(df['goodforkids'], df['goodforgroup'], df['goodfordessert'],
                     df['goodforlatenight'], df['goodforlunch'], df['goodfordinner'],
                     df['goodforbrunch'], df['goodforbreakfast'], df['romantic'],
# (the feature list is truncated here in the source)
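# The feature selection above is truncated in the source and the training step itself is not
# shown. A hypothetical continuation with MLlib, assuming the selected columns are 0/1 flags
# collected in a `feature_cols` list and that a binary 'success' column (an assumption) is the
# label:
from pyspark.mllib.classification import SVMWithSGD
from pyspark.mllib.regression import LabeledPoint

feature_cols = ['goodforkids', 'goodforgroup', 'goodfordessert']  # placeholder subset
labeled = df.rdd.map(lambda r: LabeledPoint(float(r['success']),
                                            [float(r[c]) for c in feature_cols]))
train, test = labeled.randomSplit([0.8, 0.2], seed=42)
model = SVMWithSGD.train(train, iterations=100)
predictionAndLabels = test.map(lambda p: (float(model.predict(p.features)), p.label))
metrics = MulticlassMetrics(predictionAndLabels)
print(metrics.precision(), metrics.recall())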
def parse(line):
    matcher = pattern.match(line)
    if matcher:
        return matcher.groups()
    else:
        return None


columns = source.map(parse).filter(
    lambda columns: columns and len(columns) == 3)
rows = columns.map(
    lambda columns: (columns[0], columns[1], columns[2]))
schema = StructType([StructField("col1", StringType(), False),
                     StructField("col2", StringType(), False),
                     StructField("col3", StringType(), False)])
table = hc.applySchema(rows, schema)
table.registerAsTable("temp_mytable")
datas = hc.sql("select * from temp_mytable").collect()

sc.stop()

if datas:
    for data in datas:
        print data
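# The snippet above relies on `sc`, `hc`, `pattern`, and `source` being defined earlier in the
# original file. A hypothetical setup consistent with the three-column parse (the regex, app
# name, and input path are all assumptions):
import re
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import StructType, StructField, StringType

sc = SparkContext(appName='regex_parse_example')
hc = HiveContext(sc)
pattern = re.compile(r'^(\S+)\s+(\S+)\s+(\S+)$')   # three whitespace-separated fields per line
source = sc.textFile('/tmp/example_input.txt')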
StructField("timestamp", TimestampType(), False), StructField("date", DateType(), False), StructField("array", ArrayType(IntegerType(), False), False), StructField("col_map", MapType(StringType(), StringType(), False), False), StructField( "struct", StructType( [ StructField("first", IntegerType(), False), StructField("second", FloatType(), False), StructField("third", StringType(), False), ] ), False, ), ] ) table = hc.applySchema(source, schema) table.registerAsTable("temp_table") rows = hc.sql( "select byte, short, int, long, float, double, decimal, string, boolean, timestamp, date, array[0], array[1], array[2], col_map['key'], struct.first, struct.second, struct.third from temp_table" ).collect() sc.stop() for row in rows: print row
from pyspark.sql import HiveContext
from pyspark.mllib.stat import Statistics
from pyspark import SparkContext

sc = SparkContext()
sqlContext = HiveContext(sc)

initialquery = sqlContext.sql(
    "SELECT A.avg_procedure_score, B.patientsurveyscore "
    "FROM (SELECT p.hospitalid, avg(p.score) as avg_procedure_score "
    "      FROM procedures p GROUP BY p.hospitalid) A "
    "JOIN survey_results B ON B.hospitalid = A.hospitalid")

survey_score = initialquery.map(lambda x: x.patientsurveyscore)
avg_procedure_scores = initialquery.map(lambda x: x.avg_procedure_score)

print Statistics.corr(avg_procedure_scores, survey_score, method="pearson")
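# Alternative sketch: since initialquery is a DataFrame, the same Pearson correlation can be
# computed directly with the DataFrame stat functions (available since Spark 1.4), without
# mapping out two separate RDDs:
print(initialquery.stat.corr("avg_procedure_score", "patientsurveyscore"))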