def main(sc):
    spark = HiveContext(sc)
    sqlContext = HiveContext(sc)

    # busiest city
    rows = sc.textFile('../lmf445/Flight_Project/Data/864625436_T_ONTIME_2*.csv').mapPartitionsWithIndex(parseCSV3)
    df = sqlContext.createDataFrame(rows)
    flight_origin = df.select('YEAR', 'MONTH', 'ORIGIN_CITY_NAME', 'ORIGIN_AIRPORT_ID') \
        .groupBy('YEAR', 'MONTH', 'ORIGIN_CITY_NAME').count() \
        .withColumnRenamed('count', 'origin_count')
    flight_origin = flight_origin.withColumnRenamed('ORIGIN_CITY_NAME', 'City_of_Departure')
    flight_dest = df.select('YEAR', 'MONTH', 'DEST_CITY_NAME', 'DEST_AIRPORT_ID') \
        .groupBy('YEAR', 'MONTH', 'DEST_CITY_NAME').count() \
        .withColumnRenamed('count', 'dest_count')
    flight_dest = flight_dest.withColumnRenamed('DEST_CITY_NAME', 'City_of_Arrival')
    flight_dest = flight_dest.withColumnRenamed('YEAR', 'YEAR_dest')
    flight_dest = flight_dest.withColumnRenamed('MONTH', 'MONTH_dest')
    total_counts = flight_origin.join(
        flight_dest,
        ((flight_origin.City_of_Departure == flight_dest.City_of_Arrival) &
         (flight_origin.YEAR == flight_dest.YEAR_dest) &
         (flight_origin.MONTH == flight_dest.MONTH_dest)))
    total_counts = total_counts.select(
        total_counts.City_of_Departure.alias('City'),
        (total_counts.origin_count + total_counts.dest_count).alias('sum_counts'),
        'YEAR', 'MONTH')
    total_counts_city_pivot = total_counts.groupBy('City').pivot('MONTH').sum('sum_counts')
    total_counts_city_pivot.toPandas().to_csv('Output/busiest_city.csv')

    # grouped by day
    rows = sc.textFile('../lmf445/Flight_Project/Data/864625436_T_ONTIME_2*.csv').mapPartitionsWithIndex(parseCSV4)
    df = sqlContext.createDataFrame(rows)
    grouped_by_day = df.groupBy('FL_DATE').count()
    grouped_by_day.toPandas().to_csv('Output/grouped_by_day.csv')

    # most common departure time
    rows = sc.textFile('../lmf445/Flight_Project/Data/864625436_T_ONTIME_*.csv').mapPartitionsWithIndex(parseCSV5)
    df = sqlContext.createDataFrame(rows)
    departure_time_pivot = df.groupBy('DEP_TIME_BLK').pivot('MONTH').count()
    departure_time_pivot.toPandas().to_csv('Output/most_common_departure_time.csv')
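# The flight examples in this collection call parseCSV/parseCSV3/parseCSV4/parseCSV5 helpers that are
# not shown. Below is a minimal, assumed sketch of what such a partition parser could look like
# (skip the header in partition 0, turn each CSV line into a Row); the column names and positions
# are illustrative only and are not taken from the original project.
import csv
from pyspark.sql import Row

def parseCSV(index, part):
    if index == 0:
        next(part)  # skip the header row that sits in the first partition
    for fields in csv.reader(part):
        # yield one Row per record; only a few of the on-time columns are shown here
        yield Row(YEAR=fields[0], MONTH=fields[1], FL_DATE=fields[2],
                  ORIGIN=fields[3], DEST=fields[4])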
def main(sc, SQLContext): sqlContext = HiveContext(sc) sc.setLogLevel("ERROR") query = "USE {0}" query = query.format(db_name) sqlContext.sql(query) tables = sqlContext.sql("SHOW TABLES") tableNames = tables.select("tableName").rdd.map(lambda r: r) tableNames = tableNames.map(lambda x: x.tableName).collect() tableNames = [str(i) for i in tableNames] schema_empty_df = StructType([StructField("table_ddl",StringType(),True)]) empty_df = sqlContext.createDataFrame(sc.emptyRDD(), schema_empty_df) df1 = empty_df for i in tableNames: show_query = "show create table "+i drop_query = "drop table "+i+";\n" describe_query = "describe formatted "+i seperator = ";\n" try: rdd = sc.parallelize([drop_query]) newRDD = rdd.map(lambda x:{"table_ddl":x}) newDF = rdd.map(lambda p: Row(table_ddl=p)).toDF() df = df1.unionAll(newDF) desc = sqlContext.sql(describe_query) desc_1 = desc.select(['data_type']).where("col_name='Location'") desc_2 = desc_1.rdd.map(lambda x:x.data_type).collect() desc_3 = [str(i) for i in desc_2] desc_4 = ''.join(desc_3) df0 = sqlContext.sql(show_query) show_1 = df0.rdd.map(lambda x:x.createtab_stmt).collect() show_2 = [str(i) for i in show_1] show_3 = ''.join(show_2) if show_3.find("LOCATION '") < 0: loc_query = "LOCATION '"+desc_4+"'"+"\n TBLPROPERTIES (" final_create_table=show_3.replace("TBLPROPERTIES (", loc_query) else: final_create_table = show_3 list_final = [final_create_table] rdd_create_table = sc.parallelize(list_final) df_create_table = rdd_create_table.map(lambda p: Row(create_table_ddl=p)).toDF() df1 = df.unionAll(df_create_table) rdd1 = sc.parallelize([seperator]) newRDD1 = rdd1.map(lambda x:{"delim":x}) newDF1 = sqlContext.createDataFrame(newRDD1, ["delim"]) df1 = df1.unionAll(newDF1) except:
class Testing_Resources_Generation_2(unittest.TestCase):

    def setUp(self):
        fpath = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
        with open(fpath + '/data_source/bookings_fully_overlapped.json') as bookings_source:
            self.bookings = json.load(bookings_source)
        with open(fpath + '/data_source/cfg.json') as cfg_source:
            self.cfg = json.load(cfg_source)
        today = '20180402'
        self.days = optimizer.util.get_days_from_bookings(today, self.bookings)
        self.sc = SparkContext.getOrCreate()
        self.hive_context = HiveContext(self.sc)
        self.schema = optimizer.util.get_common_pyspark_schema()
        self.bookings_map = optimizer.util.get_bookings_map(self.bookings)

    def test_resources_count(self):
        df = self.hive_context.createDataFrame(self.sc.emptyRDD(), self.schema)
        df = optimizer.main.generate_resources(self.cfg, df, self.bookings_map, self.days, self.bookings, self.hive_context)
        self.assertTrue(df.count() == 7)

    def test_resource_1(self):
        df = self.hive_context.createDataFrame(self.sc.emptyRDD(), self.schema)
        df = optimizer.main.generate_resources(self.cfg, df, self.bookings_map, self.days, self.bookings, self.hive_context)
        pandas_df_expected = pandas.DataFrame(columns=['day', 'ands', 'minus', 'allocated', 'amount'])
        pandas_df_expected.loc[0] = ['20180402', ['b1', 'b3', 'b2'], [], {}, 733]
        pandas_df_expected.loc[1] = ['20180402', ['b1', 'b3'], ['b2'], {}, 11181]
        pandas_df_expected.loc[2] = ['20180402', ['b1', 'b2'], ['b3'], {}, 3575]
        pandas_df_expected.loc[3] = ['20180402', ['b1'], ['b3', 'b2'], {}, 6047]
        pandas_df_expected.loc[4] = ['20180402', ['b3', 'b2'], ['b1'], {}, 1002]
        pandas_df_expected.loc[5] = ['20180402', ['b3'], ['b1', 'b2'], {}, 12241]
        pandas_df_expected.loc[6] = ['20180402', ['b2'], ['b1', 'b3'], {}, 1410]
        pandas_df_generated = df.select("*").toPandas()
        self.assertTrue(assert_frame_equal(pandas_df_expected, pandas_df_generated, check_dtype=False) == None)
def gen_report_table(hc,curUnixDay): rows_indoor=sc.textFile("/data/indoor/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4]),utoday=int(p[5]),ufirstday=int(p[6]))) HiveContext.createDataFrame(hc,rows_indoor).registerTempTable("df_indoor") #ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY sql="select entityid,clientmac,utoday,UFIRSTDAY,seconds," sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt," sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range 2505600 preceding) as day_30," # 2505600 is 29 days sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range 518400 preceding) as day_7," #518400 is 6 days sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY range 1 preceding) as pre_mon " sql=sql+"from df_indoor order by entityid,clientmac,utoday" df_id_stat=hc.sql(sql) df_id_mm=df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac"))) #df_id_mm df_min_max ,to caculate firtarrival and last arrival df_id_stat_distinct=df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct() #distinct df is for lag function to work df_id_prepremon=df_id_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0) cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac, df_id_mm.entityid == df_id_prepremon.entityid, df_id_mm.UFIRSTDAY==df_id_prepremon.UFIRSTDAY] df_indoor_fin_tmp=df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(df_id_mm.entityid,df_id_mm.clientmac,df_id_mm.utoday,df_id_mm.UFIRSTDAY,df_id_mm.seconds,df_id_mm.day_30,df_id_mm.day_7,df_id_mm.min,df_id_mm.max,df_id_mm.total_cnt,df_id_prepremon.prepre_mon) df_indoor_fin_tmp=df_indoor_fin_tmp.selectExpr("entityid as entityid","clientmac as clientmac","utoday as utoday","UFIRSTDAY as ufirstday","seconds as secondsbyday","day_30 as indoors30","day_7 as indoors7","min as FirstIndoor","max as LastIndoor","total_cnt as indoors","prepre_mon as indoorsPrevMonth") #newly added part for indoors7 and indoors30 based on current date df_indoor_fin_tmp1= df_indoor_fin_tmp.withColumn("r_day_7", func.when((curUnixDay- df_indoor_fin_tmp.utoday)/86400<7 , 1).otherwise(0)) df_indoor_fin_tmp2=df_indoor_fin_tmp1.withColumn("r_day_30", func.when((curUnixDay- df_indoor_fin_tmp1.utoday)/86400<30 , 1).otherwise(0)) df_indoor_fin_tmp3=df_indoor_fin_tmp2.withColumn("r_indoors7",func.sum("r_day_7").over(Window.partitionBy("entityid","clientmac"))) df_indoor_fin_tmp4=df_indoor_fin_tmp3.withColumn("r_indoors30",func.sum("r_day_30").over(Window.partitionBy("entityid","clientmac"))) df_indoor_fin=df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30") hc.sql("drop table if exists df_indoor_fin") df_indoor_fin.write.saveAsTable("df_indoor_fin") rows_flow=sc.textFile("/data/flow/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),utoday=int(p[4]),ufirstday=int(p[5]))) HiveContext.createDataFrame(hc,rows_flow).registerTempTable("df_flow") # ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday sql="select entityid,clientmac,utoday,UFIRSTDAY," sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt," sql=sql+"count(1) over (partition by 
entityid,clientmac order by utoday range 2505600 preceding) as day_30," # 2505600 is 29 days sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range 518400 preceding) as day_7," #518400 is 6 days sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY range 1 preceding) as pre_mon " sql=sql+"from df_flow order by entityid,clientmac,utoday" df_fl_stat=hc.sql(sql) df_fl_mm=df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac"))) #df_fl_mm df_min_max ,to caculate firtarrival and last arrival df_fl_stat_distinct=df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct() #distinct df is for lag function to work df_fl_prepremon=df_fl_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0) cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac, df_fl_mm.entityid == df_fl_prepremon.entityid, df_fl_mm.UFIRSTDAY==df_fl_prepremon.UFIRSTDAY] df_flow_fin=df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(df_fl_mm.entityid,df_fl_mm.clientmac,df_fl_mm.utoday,df_fl_mm.UFIRSTDAY,df_fl_mm.day_30,df_fl_mm.day_7,df_fl_mm.min,df_fl_mm.max,df_fl_mm.total_cnt,df_fl_prepremon.prepre_mon) df_flow_fin=df_flow_fin.selectExpr("entityid as entityid","clientmac as clientmac","utoday as utoday","UFIRSTDAY as ufirstday","day_30 as visits30","day_7 as visits7","min as FirstVisit","max as LastVisit","total_cnt as visits","prepre_mon as visitsPrevMonth") hc.sql("drop table if exists df_flow_fin") df_flow_fin.write.saveAsTable("df_flow_fin")
def main(sc):
    logger = getlogger('carga.serverlogs')
    spark = SparkSession(sc)
    hqlContext = HiveContext(sc)
    logger.debug('Process started')

    # Read the text file and build an in-memory RDD with Spark
    line = sc.textFile("/home/cloudera/Projeto6/serverlogs")
    dataset = line.map(lambda line: get_row(line))

    # Convert to a pandas dataframe
    serverlogs_df = dataset.toDF()
    serverlogs_pd = serverlogs_df.toPandas()

    # Convert the bytes column to a numeric type
    serverlogs_pd['bytes'] = pd.to_numeric(serverlogs_pd['bytes'], errors='coerce')

    # Create the date column
    serverlogs_pd['data'] = pd.to_datetime(serverlogs_pd.timestamp.str[:11])

    # Create a Hive dataframe and load it into the target table
    serverlogs_hdf = hqlContext.createDataFrame(serverlogs_pd)
    serverlogs_hdf.registerTempTable('serverlogs_tmp')
    executar_insert_tabela('default.t_serverlogs', 'serverlogs_tmp', logger, hqlContext)
    logger.debug('All pipeline tasks completed successfully!')
def sentiment_score(sc):
    # spark = HiveContext(sc)
    sqlContext = HiveContext(sc)
    df = sqlContext.read.json("/user/skk456/project/twitterData.json")
    text_list = df.select(df['text'], df['id'])
    text = df.select("text").rdd.flatMap(lambda x: x).collect()
    tweet_id = df.select("id").rdd.flatMap(lambda x: x).collect()
    tweets = map(lambda tweet: tokenize.sent_tokenize(tweet), text)
    sia = SentimentIntensityAnalyzer()
    score = map(lambda tweet: sia.polarity_scores(str(tweet)), tweets)
    comp = map(lambda i: i['compound'], score)
    pos = map(lambda i: i['pos'], score)
    neu = map(lambda i: i['neu'], score)
    neg = map(lambda i: i['neg'], score)
    score = pd.DataFrame()
    score['id'] = tweet_id
    score['positive'] = pos
    score['negative'] = neg
    score['neutral'] = neu
    score['compound'] = comp
    score_spark = sqlContext.createDataFrame(score)
    score_spark.rdd.saveAsTextFile('project/output1')
def setup():
    ''' table setup '''
    if not os.environ.has_key('SPARK_HOME'):
        raise Exception(
            "Environment variable SPARK_HOME must be set "
            "to the root directory of the SPARK installation")
    spark_home_py = os.path.expandvars("$SPARK_HOME/python")
    sys.path.append(spark_home_py)
    file_list = glob.glob(spark_home_py + "/lib/py4j*.zip")
    if file_list is None:
        raise Exception(
            "py4j*.zip not found - this needs to be on the PYTHONPATH")
    sys.path.append(file_list[0])
    try:
        from pyspark import SparkContext, SparkConf
        from pyspark.sql import HiveContext
    except ImportError:
        raise Exception("Required pyspark modules cannot be found")

    # Configure Spark
    conf = SparkConf().setAppName('SQLTODF_UT')
    conf = conf.setMaster(cfg.SPARK_MODE)
    sparkctx = SparkContext(conf=conf)
    pandasdf = pd.DataFrame({'name': ['Martin', 'Gemma'], 'age': [16, 52]})
    sqlctx = HiveContext(sparkctx)
    sqldf = sqlctx.createDataFrame(pandasdf)
    sqldf.write.format('orc').mode('overwrite').saveAsTable('sqltodf_test')
    sparkctx.stop()
def do_ets_task(sc, ets_dburl_env, wfc): # 定义客户标识 cust_no = '1' isvalid = '1' etsTempTable = wfc ets_url = ets_dburl_env[wfc[:-2]]['dst'] slave_url = ets_dburl_env[wfc[:-2]]['src'] dbinfo = load_source('getdbinfo', os.path.join(os.path.dirname(__file__), 'Utils.py')).getdbinfo(slave_url) tabledict = load_source('query_sql_slave', os.path.join(os.path.dirname(__file__), 'Utils.py')).query_sql_slave(dbinfo) slaveTempTable = tabledict.get(wfc[:-2]) driver = "com.mysql.jdbc.Driver" sqlContext = HiveContext(sc) # driver = "com.mysql.jdbc.Driver" dff = sqlContext.read.format("jdbc").options(url=slave_url, dbtable=slaveTempTable, driver=driver).load() dff.registerTempTable(slaveTempTable) dft = sqlContext.read.format("jdbc").options(url=ets_url, dbtable=etsTempTable, driver=driver).load() dft.registerTempTable(etsTempTable) ds_ets = sqlContext.sql(" select max(updatets) as max from %s " % (etsTempTable)) pp = ds_ets.collect()[0] max_updates = pp.max slave_sql = '' try: if max_updates is not None: print(u"ets库中的最大时间是:" + str(max_updates)) slave_sql = " select id, examineeid, examid, roomid, stationid, examinerid, totalscore, begintime ,endtime,scoresheetcode,status, updatetime" \ " from %s where `updatetime` > '%s' " % (slaveTempTable, max_updates) else: print(u"本次为初次抽取") slave_sql = " select id, examineeid, examid, roomid, stationid, examinerid, totalscore, begintime ,endtime,scoresheetcode,status, updatetime" \ " from %s " % (slaveTempTable) ds_slave = sqlContext.sql(slave_sql) print(u'slave 中 符合条件的记录数为:%s' % (ds_slave.count())) now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') print(u'开始组装数据...') src_fields = json.dumps({'osce_score': ['id', 'examineeid', 'examid', 'roomid', 'stationid', 'examinerid', 'totalscore', 'begintime', 'endtime', 'scoresheetcode', 'status', 'updatetime']}) # 字段值 filedvlue = ds_slave.map(lambda row: (row.id, row.examineeid, row.examid, row.roomid, row.stationid, row.examinerid, row.totalscore, str(row.begintime), str(row.endtime), row.scoresheetcode, row.status, cust_no, isvalid, md5(row), now_time, str(row.updatetime))) # 创建列 schemaString = "id,examineeid,examid,roomid,stationid,examinerid,totalscore,begintime," \ "endtime,scoresheetcode,status,cust_no,isvalid,src_fields_md5,createts,updatets" fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split(",")] schema = StructType(fields) # 使用列名和字段值创建datafrom schemaObj = sqlContext.createDataFrame(filedvlue, schema) print(u'组装数据完成...') # print schemaPeople # for row in schemaPeople: # print row.id print(u'开始执写入数据...') # 写入数据库 schemaObj.write.insertInto(etsTempTable, overwrite=False) print(u'写入完成') except Exception, e: # e.message 2.6 不支持 print (str(e)) raise Exception(str(e))
def main(sc):
    spark = HiveContext(sc)
    sqlContext = HiveContext(sc)
    print "holaaaaa"
    rows = sc.textFile('../lmf445/Flight_Project/Data/864625436_T_ONTIME_2*.csv').mapPartitionsWithIndex(parseCSV)
    df = sqlContext.createDataFrame(rows)
    df = df.withColumn('DEP_DEL15', df['DEP_DEL15'].cast('int'))
    df = df.na.drop()
    delay_counts = df.select('ORIGIN', 'DEP_DEL15').groupby('ORIGIN').sum() \
        .withColumnRenamed('sum(DEP_DEL15)', 'origin_delay_count')
    delay_counts = delay_counts.join(flight_origin, delay_counts.ORIGIN == flight_origin.Airport_origin)
    delays_origin = delay_counts.select(
        'Airport_origin',
        (delay_counts.origin_delay_count / delay_counts.origin_count).alias('%_flights_departing_15+_minutes_late'))
    delays_origin = delays_origin.sort(desc('%_flights_departing_15+_minutes_late'))
    delays_origin.toPandas().to_csv('Output/MostDepartureDelays.csv')
def process_rdd(time, rdd):
    print("----------- %s -----------" % str(time))
    try:
        sql_context = HiveContext(rdd.context)
        # Convert the RDD to a Row RDD
        row_rdd = rdd.map(lambda w: Row(tweet=w, score=analyzeSentiment(w)))
        schema = StructType([
            StructField("tweet", StringType(), True),
            StructField("score", FloatType(), True)
        ])
        # Create a DF with the specified schema
        new_tweets_df = sql_context.createDataFrame(row_rdd, schema=schema)
        # Register the dataframe as a table
        new_tweets_df.registerTempTable("new_tweets")
        # Insert the new tweets and scores into the tweets table
        sql_context.sql("INSERT INTO TABLE tweets SELECT * FROM new_tweets")
        # Get all the tweets from the table using SQL
        tweets_sentiment_df = sql_context.sql("SELECT * FROM tweets")
        tweets_sentiment_df.show()
        # Send the tweets and their sentiment scores to the dashboard
        send_df_to_dashboard(tweets_sentiment_df)
    except:
        e = sys.exc_info()[0]
        print("Error: %s" % e)
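# process_rdd above is written as a foreachRDD callback. A minimal usage sketch, assuming a
# StreamingContext named ssc and a DStream of tweet strings named tweets_dstream (both hypothetical,
# not part of the snippet above):
tweets_dstream.foreachRDD(process_rdd)
ssc.start()
ssc.awaitTermination()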
def write_to_hive(time, rdd): def process_row(x): row_dict = dict() row_dict["timestamp"] = 0 if "timestamp" not in x else x["timestamp"] row_dict["source_type"] = "" if "source.type" not in x else x["source.type"] row_dict["user_name"] = "" if "src_user_name" not in x else x["src_user_name"] row_dict["entity_name"] = "" if "ip_src_addr" not in x else x["ip_src_addr"] row_dict["guid"] = "" if "guid" not in x else x["guid"] row_dict["alert_score"] = 0.0 if "alert_score" not in x else x["alert_score"] row_dict["alerts"] = "" if "alerts" not in x else x["alerts"] row_dict["y"] = 0 if "y" not in x else x["y"] row_dict["m"] = None if "m" not in x else x["m"] row_dict["d"] = None if "d" not in x else x["d"] for numerical_colname in EVENT_MODEL_NUMERICAL_COLUMNS: row_dict[numerical_colname] = 0.0 if numerical_colname not in x else float(x[numerical_colname]) for categorical_colname in EVENT_MODEL_CATEGORICAL_COLUMNS: row_dict[categorical_colname] = "" if categorical_colname not in x else str(x[categorical_colname]) row = Row(**row_dict) return row try: spark = SparkSession \ .builder \ .appName("event-anomaly-online-score") \ .enableHiveSupport() \ .getOrCreate() hive_context = HiveContext(spark.sparkContext) hive_context.setConf("hive.exec.dynamic.partition", "true") hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict") row_rdd = rdd.map(process_row) sdf = hive_context.createDataFrame(row_rdd) sdf = sdf.drop_duplicates(subset=["guid"]) sdf.cache() source_type_list = [TENANT_NAME + "_" + data_source for data_source in DATA_SOURCE_LIST] model_dict = dict() for data_source in DATA_SOURCE_LIST: model_dict[data_source] = load_event_anomaly_model(spark=spark, data_source=data_source) for source_type in source_type_list: sdf_source = sdf.filter(sdf.source_type == source_type) if not sdf_source.rdd.isEmpty(): sdf_source.cache() database = source_type.split("_")[0] data_source = source_type.split("_")[1] table = data_source + "_event_alert_score" sdf_source.show(3) eas_sdf = get_event_anomaly_score(data_source=data_source, model_dict=model_dict, input_df=sdf_source) result_sdf = sdf_source.join(eas_sdf.select(["guid", "EAS"]), on="guid", how="left") result_sdf = result_sdf.na.fill(0.0, subset=["EAS"]) result_sdf.show(3) result_sdf.select("guid", "timestamp", "user_name", "entity_name", "source_type", "alerts", "alert_score", "EAS", "y", "m", "d").write.insertInto(database + "." + table) except Exception as e: pass
def hiveSaveNews(dfNewsContents, table_name):
    from pyspark.sql import HiveContext
    hiveContext = HiveContext(sc)
    tmpDf = hiveContext.createDataFrame(
        dfNewsContents[['news_code', 'title', 'site', 'writing_time', 'preproc_content', 'img', 'content', 'company']])
    tmpDf.registerTempTable("tmpDf")
    hiveContext.sql("insert into table {table_name} select * from tmpDf".format(table_name=table_name))
def process(time, rdd): print("========= %s =========" % str(time)) try: sqlContext = HiveContext(sc) # FIX: memory error Spark 2.0 bug ( < 2.0 ) sqlContext.setConf("spark.sql.tungsten.enabled","false") # v2.01 spark = SparkSession.builder \ #.master("local") \ #.appName("Word Count") \ #.config("spark.some.config.option", "some-value") \ #.getOrCreate() # Get the singleton instance of SparkSession #nzs v1.0 spark = getSparkSessionInstance(rdd.context.getConf()) if rdd.count() < 1: return; # Convert RDD[String] to RDD[Row] to DataFrame sqlRdd = rdd.map( lambda x: json.loads(x)).map(lambda r: Row( metrics=r["metrics"], name=r["name"], value=r["value"] ) ) wordsDataFrame = sqlContext.createDataFrame(sqlRdd) wordsDataFrame.show() # Creates a temporary view using the DataFrame. wordsDataFrame.registerTempTable("starwarstemp") # Creates a query and get the alam dataset using the temp table wordCountsDataFrame = sqlContext.sql("select * from starwarstemp") wordCountsDataFrame.printSchema() with open(SparkFiles.get('webinar_streaming.sql')) as test_file: alertsql=test_file.read() #logging.info(alertsql) alertDataFrame = sqlContext.sql(alertsql) alertDataFrame.show() alertDataFrame.printSchema() # save all values to HBASE # IF NEED FILTER LATER .filter(lambda x: str(x["metrics"])=='action-credit-limit') \ # create HBASE mapper rowRdd = rdd.map( lambda x: json.loads(x))\ .map(lambda r: ( str(r["metrics"]) ,[ str(r["name"])+"-"+datetime.datetime.now().strftime("%Y%m%d%H%M%S"), "action" if str(r["metrics"])=="action-credit-limit" else "healt", str(r["metrics"]), str(r["value"])] )) table = 'starwarsinbox' host = 'node-master2-KcVkz' keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter" valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter" conf = {"hbase.zookeeper.quorum": host, "hbase.mapred.outputtable": table, "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat", "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable", "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"} rowRdd.saveAsNewAPIHadoopDataset(conf=conf,keyConverter=keyConv,valueConverter=valueConv) except Exception as merror: print (merror) raise
def tear_down():
    for table in data.keys():
        hiveContext = HiveContext(sc)
        df = hiveContext.createDataFrame(data[table], fields[table])
        hiveContext.sql('use test_db')
        try:
            df.registerTempTable("demo")
            hiveContext.sql("insert into {table} partition(ds='{date}') select * from demo".format(table=table, date=date))
            # hiveContext.sql("insert into {table} partition(ds="") select * from demo".format(table=table))
        except Exception as e:
            df.saveAsTable("{table}".format(table=table))
def main(sc):
    spark = HiveContext(sc)
    sqlContext = HiveContext(sc)
    print "holaaaaa"
    rows = sc.textFile('../lmf445/Flight_Project/Data/864625436_T_ONTIME_2*.csv').mapPartitionsWithIndex(parseCSV)
    df = sqlContext.createDataFrame(rows)
    busiest_route_month_pivot = df.select('ORIGIN_AIRPORT_ID', 'ROUTE', 'MONTH') \
        .groupBy('ROUTE').pivot('MONTH').count()
    busiest_route_month_pivot.toPandas().to_csv('Output/MonthlyRoutes.csv')
def do_ets_task(sc, ets_dburl_env, wfc): # 定义客户标识 cust_no = '1' isvalid = '1' etsTempTable = wfc ets_url = ets_dburl_env[wfc[:-2]]['dst'] slave_url = ets_dburl_env[wfc[:-2]]['src'] dbinfo = load_source('getdbinfo', os.path.join(os.path.dirname(__file__), 'Utils.py')).getdbinfo(slave_url) tabledict = load_source('query_sql_slave', os.path.join(os.path.dirname(__file__), 'Utils.py')).query_sql_slave(dbinfo) slaveTempTable = tabledict.get(wfc[:-2]) driver = "com.mysql.jdbc.Driver" sqlContext = HiveContext(sc) dff = sqlContext.read.format("jdbc").options(url=slave_url, dbtable=slaveTempTable, driver=driver).load() dff.registerTempTable(slaveTempTable) dft = sqlContext.read.format("jdbc").options(url=ets_url, dbtable=etsTempTable, driver=driver).load() dft.registerTempTable(etsTempTable) try: slave_sql = " select id, learn_id, learn_type, scoresheetcode " \ " from %s " % (slaveTempTable) ds_slave = sqlContext.sql(slave_sql) print(u"覆盖式插入:共%s条数据" % ds_slave.count()) # sqlContext.sql(" delete from %s " % etsTempTable) ddlsql = " truncate table %s " % etsTempTable # 删除表中数据 使用 jdbc方式 dbinfo = load_source('getdbinfo', os.path.join(os.path.dirname(__file__), 'Utils.py')).getdbinfo(ets_url) load_source('execute_sql_ets', os.path.join(os.path.dirname(__file__), 'Utils.py')).execute_sql_ets(ddlsql, dbinfo) now_time = datetime.datetime.now() print(u'开始组装数据...') src_fields = json.dumps({'GradeItem': ['id', 'learn_id', 'learn_type', 'scoresheetcode']}) # 字段值 filedvlue = ds_slave.map( lambda row: (row.id, row.learn_id, row.learn_type, row.scoresheetcode, cust_no, isvalid, md5(row), now_time, now_time)) # 创建列 schemaString = "id,learn_id,learn_type,scoresheetcode,cust_no,isvalid,src_fields_md5,createts,updatets" fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split(",")] schema = StructType(fields) # 使用列名和字段值创建datafrom schemaPeople = sqlContext.createDataFrame(filedvlue, schema) print(u'组装数据完成') # print schemaPeople # for row in schemaPeople: # print row.id print(u'开始执写入数据...') # 写入数据库 schemaPeople.write.insertInto(etsTempTable) print(u'写入完成') except Exception, e: # e.message 2.6 不支持 print(traceback.print_exc()) print(str(e)) raise Exception(str(e))
def select_func_6(signalDataDf, techList, sliceDict):
    numOfDailySignal = sliceDict.get('numOfDailySignal')
    conf = SparkConf().setAppName("spark_infer_schema")
    sc = SparkContext(conf=conf)
    hc = HiveContext(sc)

    # Read in the data
    signalDataDf = signalDataDf.reset_index(drop=False)
    signalDataDf.rename(columns={'level_0': 'TradingDay', 'level_1': 'WindCode'}, inplace=True)
    signalDataDf['TradingDay'] = signalDataDf['TradingDay'].astype('str')

    # Convert to a Spark DataFrame
    signalDataSpDf = hc.createDataFrame(signalDataDf)
    signalDataSpDf.registerTempTable("signalData")

    # Rank within each trading day and keep the top N signals
    sql = "select TradingDay,WindCode,Mom from ( " + \
          "select TradingDay,WindCode,Mom,row_number() OVER (PARTITION BY TradingDay ORDER BY Mom DESC) rank from signalData" + \
          ") tmp where rank<=" + str(numOfDailySignal)
    allSelectStockSpDf = hc.sql(sql)

    # Convert back to a pandas DataFrame
    allSelectStockDf = allSelectStockSpDf.toPandas()
    allSelectStockDf['TradingDay'] = allSelectStockDf['TradingDay'].astype('datetime64')
    allSelectStockDf = allSelectStockDf.set_index([StockConst.TRADINGDAY, StockConst.INNERCODE])

    # allSelectStockDf = SelectUtil.getTopNAndInsertVolWeight(numOfDailySignal, allSelectStockDf)
    volWeight = 1 / numOfDailySignal
    allSelectStockDf.insert(len(allSelectStockDf.columns), 'volWeight', volWeight)
    print(allSelectStockDf)
    return allSelectStockDf
def main():
    d = date.today()
    t = timedelta(days=50 * 365)
    start = d - t
    end = d + t
    total = []
    for k in daterange(start, end):
        total.append(k)
    print sys.getsizeof(total)
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    hc = HiveContext(sc)
    rd = sc.parallelize(total).map(getCalendarDetails)
    df = hc.createDataFrame(rd)
    df.printSchema()
    print rd.take(10)
    df.saveAsTable("DIM_DATE")
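# daterange and getCalendarDetails are not shown in the example above. A minimal, assumed sketch of
# daterange, based only on how it is used (one date per day from start to end), could be:
from datetime import timedelta

def daterange(start, end):
    cur = start
    while cur < end:
        yield cur
        cur += timedelta(days=1)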
def main(sc):
    spark = HiveContext(sc)
    sqlContext = HiveContext(sc)
    rows = sc.textFile('../lmf445/Flight_Project/Data/864625436_T_ONTIME_2*.csv').mapPartitionsWithIndex(parseCSV)
    df = sqlContext.createDataFrame(rows)
    df_unique = df.select(df.ROUTE.alias('ROUTE_UNIQUE'), 'DEST', 'ORIGIN').distinct()
    busyest_route_single = df.select('ROUTE').groupBy('ROUTE').count()
    busyest_route_single = busyest_route_single.join(df_unique, busyest_route_single.ROUTE == df_unique.ROUTE_UNIQUE)
    busyest_route_single = busyest_route_single.drop('ROUTE_UNIQUE')
    busyest_route_single = busyest_route_single.sort(desc('count'))
    busyest_route_single.show()
    busyest_route_single.toPandas().to_csv('Output/MostBussyRoute.csv')
def main(sc):
    spark = HiveContext(sc)
    sqlContext = HiveContext(sc)
    # NOTE: the sibling jobs in this project parse these CSVs with mapPartitionsWithIndex(parseCSV)
    # before calling createDataFrame; that step appears to be elided here.
    rows = sc.textFile('../lmf445/Flight_Project/Data/864625436_T_ONTIME_*.csv')
    df = sqlContext.createDataFrame(rows)
    first = df.withColumn('ARR_DEL15', df['ARR_DEL15'].cast('int'))
    sec = first.na.drop()
    third = sec.select('ROUTE', 'ARR_DEL15', 'DEST', 'ORIGIN') \
        .filter(sec.ARR_DEL15 == 1).groupby('ROUTE').count() \
        .withColumnRenamed('count', 'delay_count')
    fourth = third.sort(desc('delay_count'))  # sort by the renamed count column
    fifth = sec.select('ROUTE', 'ARR_DEL15', 'DEST', 'ORIGIN').groupby('ROUTE').count()
    sixth = fifth.sort(desc('count'))
    eighth = fourth.join(sixth, 'ROUTE')
    ninth = eighth.sort(desc('count'))
    tenth = ninth.select('ROUTE', ninth['delay_count'] / ninth['count'])
    eleventh = tenth.sort(desc('(delay_count / count)'))
    twelveth = eleventh.join(eighth, 'ROUTE')
    thirteen = twelveth.sort(desc('(delay_count / count)'))
    thirteen.toPandas().to_csv('Output/route_delayone.csv')
def main(sc, load_id):
    sqlContext = HiveContext(sc)
    dept_df = sqlContext.sql("select name,id,email from dept_table")
    dept_df.createOrReplaceTempView('dept_df')
    dept_df_pd = dept_df.toPandas()
    dept_df_pd_copy = dept_df.toPandas()
    print('Data Shape', dept_df_pd.shape[0])

    # ######## Method Call ########################
    def valid_email_algorithm(email):
        if len(email) > 9:
            if re.match(".+@[a-zA-Z0-9]+.[a-zA-Z0-9]$", email) is not None:
                return 'Y'
            else:
                return 'N'
        return 'N'  # strings this short cannot be valid addresses

    dept_df_pd['email_valid_flag'] = dept_df_pd.apply(lambda x: valid_email_algorithm(x['email']), axis=1)
    dept_df_pd.describe()
    # Convert back into a Spark dataframe if needed
    dept_spark_df = sqlContext.createDataFrame(dept_df_pd)
    dept_spark_df.write.option("compression", "zlib").mode("overwrite").format("orc").save('/apps/hive/warehouse/dept.db/dept_table')
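# A quick illustrative check of valid_email_algorithm on hypothetical strings (not project data).
# Because the pattern ends in ".[a-zA-Z0-9]$", only a single character may follow the last dot,
# so simple multi-character TLDs are rejected by this particular regex.
print(valid_email_algorithm('someone@mail.x'))       # 'Y'
print(valid_email_algorithm('not-an-email-string'))  # 'N'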
def main(sc):
    spark = HiveContext(sc)
    sqlContext = HiveContext(sc)
    rows = sc.textFile('../lmf445/Flight_Project/Data/864625436_T_ONTIME_2*.csv').mapPartitionsWithIndex(parseCSV)
    df = sqlContext.createDataFrame(rows)
    flight_origin = df.select('FL_DATE', 'ORIGIN', 'ORIGIN_AIRPORT_ID') \
        .groupBy('FL_DATE', 'ORIGIN') \
        .count() \
        .withColumnRenamed('count', 'origin_count')
    flight_origin = flight_origin.withColumnRenamed('ORIGIN', 'Airport_origin')
    flight_dest = df.select('FL_DATE', 'DEST', 'DEST_AIRPORT_ID') \
        .groupBy('FL_DATE', 'DEST') \
        .count() \
        .withColumnRenamed('count', 'dest_count')
    flight_dest = flight_dest.withColumnRenamed('DEST', 'Airport_dest')
    flight_dest = flight_dest.withColumnRenamed('FL_DATE', 'FL_DATE_dest')
    total_counts = flight_origin.join(
        flight_dest,
        ((flight_origin.Airport_origin == flight_dest.Airport_dest) &
         (flight_origin.FL_DATE == flight_dest.FL_DATE_dest)))
    total_counts = total_counts.select(
        total_counts.Airport_origin.alias('Airport'),
        (total_counts.origin_count + total_counts.dest_count).alias('sum_counts'),
        'FL_DATE')
    total_counts_airport_pivot = total_counts.groupBy('Airport').pivot('FL_DATE').sum('sum_counts')
    total_counts_airport_pivot.toPandas().to_csv('Output/busiest_airport_by_day.csv')
def main(): args = parser_arguments() start_date = args.start_date[0] end_date = args.end_date[0] sc = SparkContext() hc = HiveContext(sc) select = """ SELECT * FROM cluster_metrics_prod_2.container_fact where date between '{0}' and '{1}' """.format(start_date, end_date) df = hc.sql(select) header = { "jobid" : "string", "containerid" : "string", "start" : "bigint", "stop" : "bigint", "duration" : "bigint", "event" : "string", "size" : "double", "priority" : "int", "hostname" : "string", "system" : "string", "date" : "string" } all_rows = df.flatMap(split_data) schema_split_containers = hc.createDataFrame(all_rows) schema_split_containers.registerTempTable("split_containers") create_string = """ create table if not exists cluster_metrics_prod_2.container_fact_event_flattened ( jobid string, containerid string, start bigint, stop bigint, duration bigint, event string, size double, priority int, hostname string ) partitioned by ( system string, date string ) stored as orc """ set_dyn = "set hive.exec.dynamic.partition=true" set_nstat = "set hive.exec.dynamic.partition.mode=nonstrict" load_string = """ insert overwrite table cluster_metrics_prod_2.container_fact_event_flattened partition (system, date) select jobid, containerid, start, stop, duration, event, size, priority, hostname, system, date from split_containers """ print("Setting dynamic partition...") hc.sql(set_dyn) hc.sql(set_nstat) print("Creating Table...") hc.sql(create_string) print("Loading data into table...") hc.sql(load_string) print("DONE")
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName("sparkmoviekeywords")
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

inrdd = sc.textFile("/usr/azure/twitterdata/Tweets*")
inrddsplt = inrdd.map(lambda line: line.split("|"))
inrddspltflt = inrddsplt.filter(lambda spltln: spltln[5] == "None")
inrddspltfltwrds = inrddsplt.filter(lambda spltln: spltln[8] != "neutral")
inrdddf = inrddspltfltwrds.map(lambda spltln: (spltln[0], spltln[9].replace('_', ' ').split(",")))
inrddfmapv = inrdddf.flatMapValues(lambda x: x)
keyworddf = inrddfmapv.map(lambda keyword: Row(movie=keyword[0], keywords=keyword[1]))
keywordds = sqlContext.createDataFrame(keyworddf)
keywordds.printSchema()
keyworddsout = keywordds.groupBy("movie", "keywords").count()
keyworddsout.printSchema()
keyworddsordered = keyworddsout.orderBy("movie", "count", ascending=False)
keyworddsordered.show()
keyworddsordered.write.format("orc").option("path", "/usr/azure/moviekeywords").mode("overwrite").saveAsTable("moviekeywords")
from pyspark import SparkContext
sc = SparkContext()
from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

tbl_Effective_Care = sqlContext.sql("Select Provider_ID, Measure_ID, Score From tbl_Effective_Care_RAW where Score <> 'Not Available'").rdd
tbl_Effective_Care.saveAsTextFile("/user/w205/hospital_compare_txt_TRANSFORMED/txt_effective_care/")
sqlContext.createDataFrame(tbl_Effective_Care).write.parquet("/user/w205/hospital_compare_tbl_TRANSFORMED/tbl_effective_care/")
exit()
try:
    logSet = sc.textFile("hdfs://hdfs.hadoop.yourdomain.com:9000/hdfs/aatlogstore/YYYY/MM/*").map(loadData)
    aatlog = logSet.map(getPairKey)

    # make vidsession_log & vidsession data
    aatlogSet = aatlog.reduceByKey(lambda x, y: x)
    vidlogSet = aatlogSet.sortByKey().zipWithIndex().map(lambda x: ((x[0][0][0], x[1]), x[0][1]))
    vidlogSet.persist()
    vidlogLag = vidlogSet.map(lambda x: ((x[0][0], x[0][1] + 1), x[1]))
    vidLogRST = vidlogSet.leftOuterJoin(vidlogLag).map(mkVidLog)
    vidSet = vidlogSet.map(lambda x: (x[0][0], (x[0][1], x[1])))
    vidProcSet = vidSet.combineByKey(vidCreate, vidMerge, vidMerge2)
    print "filter processing done "

    # save RDDs to Hive tables
    hx = HiveContext(sc)
    vidTable = hx.createDataFrame(vidProcSet.map(lambda x: x[1]))
    vidLogTable = hx.createDataFrame(vidLogRST, samplingRatio=0.5)
    print "Dataframe Done"
    vidLogTable.saveAsTable('vidsession_log', mode='append')
    vidTable.saveAsTable('vidsession', mode='append')
    print "AATLOG Process Done"
except Exception, e:
    print "MAIN Error %s" % e
sc = SparkContext() from pyspark.sql import HiveContext sqlContext = HiveContext(sc) import pandas as pd tbl_Survey_Responses = sqlContext.sql("Select Provider_ID, Communication_with_Nurses_Achievement_Points,Communication_with_Nurses_Improvement_Points, Communication_with_Nurses_Dimension_Score, Communication_with_Doctors_Achievement_Points, Communication_with_Doctors_Improvement_Points, Communication_with_Doctors_Dimension_Score, Responsiveness_of_Hospital_Staff_Achievement_Points, Responsiveness_of_Hospital_Staff_Improvement_Points, Responsiveness_of_Hospital_Staff_Dimension_Score, Pain_Management_Achievement_Points, Pain_Management_Improvement_Points, Pain_Management_Dimension_Score, Communication_about_Medicines_Achievement_Points, Communication_about_Medicines_Improvement_Points, Communication_about_Medicines_Dimension_Score, Cleanliness_and_Quietness_of_Hospital_Environment_Achievement_Points, Cleanliness_and_Quietness_of_Hospital_Environment_Improvement_Points, Cleanliness_and_Quietness_of_Hospital_Environment_Dimension_Score, Discharge_Information_Achievement_Points, Discharge_Information_Improvement_Points, Discharge_Information_Dimension_Score, Overall_Rating_of_Hospital_Achievement_Points, Overall_Rating_of_Hospital_Improvement_Points, Overall_Rating_of_Hospital_Dimension_Score, HCAHPS_Base_Score, HCAHPS_Consistency_Score From tbl_Survey_Responses_RAW") df_Survey_Responses = tbl_Survey_Responses.toPandas() df_Survey_Responses = df_Survey_Responses.iloc[1:] def Calculate_Points(x): try: if len(x.split(' ')) > 2: return float(x.split(' ')[0])/float(x.split(' ')[-1]) else: return float(x.split(' ')[0]) except ValueError: return '' for i in range(1,len(df_Survey_Responses.columns)): df_Survey_Responses.iloc[0:,i] = df_Survey_Responses.iloc[0:,i].map(Calculate_Points) SparkDF_Survey_Responses = sqlContext.createDataFrame(df_Survey_Responses).rdd SparkDF_Survey_Responses.saveAsTextFile("/user/w205/hospital_compare_txt_TRANSFORMED/txt_survey_responses/") sqlContext.createDataFrame(SparkDF_Survey_Responses).write.parquet("/user/w205/hospital_compare_tbl_TRANSFORMED/tbl_survey_responses/") exit()
def processData(sc,hc,fs,con,incFileName,inThresh,outThresh,progTag): #incFileName="hdfs://namenode.navroomhdp.com:8020/data/t_cf_inc/100033/t_cf_20161028.txt" #inThresh=10 #outThresh=300 #************************************** # #this procedure will use incfile to caculate #flow(everyday one record) store as file #indoor(everyday one record) store as file #indoor_for_delete(every indoor records) store in hbase #indoor detail(every indoor records) store in hbase and as file # #destIndoorFile : /data/indoor/entityid/year/id_date.json used to generate report #destFlowFile : /data/flow/entityid/year/fl_date.json used to generate report #rec_destIndoorfile : /data/rec_indoor/entityid/year/id_date.json this folder is mirror of hbase records # # #************************************** destIndoorFile=get_str_indoorFileName(incFileName) #hdfs://namenode.navroomhdp.com:8020/data/indoor/100033/2016/id_20161028.txt rec_destIndoorfile=destIndoorFile.replace("/indoor/","/rec_indoor/") #hdfs://namenode.navroomhdp.com:8020/data/rec_indoor/101762/2016/id_20161011.txt destFlowFile =destIndoorFile.replace("/indoor/","/flow/").replace("id_","fl_") #hdfs://namenode.navroomhdp.com:8020/data/flow/101762/2016/fl_20161011.txt tmp_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/indoor"+str(progTag) tmp_destFlowFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/flow"+str(progTag) tmp_rec_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/rec_indoor"+str(progTag) EntityID=int(get_str_entityID(incFileName)) #101762 histFileName=get_str_histFileName(incFileName) #processed file will be place here #hdfs://namenode.navroomhdp.com:8020/data/t_cf/101762/t_cf_20161011.txt if fs.exists(sc._jvm.Path(histFileName)): tmpFileName=get_str_tmpFileName(histFileName) #tmpFileName = hdfs://namenode.navroomhdp.com:8020/data/tmp/101762/t_cf_20161011.txt tmpFolderName=tmpFileName.rsplit('/',1)[0]+"tmp" #tmpFolderName=hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp #copy hist file to temp folder and name it as hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp/hist and distroy the hist file sc._jvm.FileUtil.copy(fs,sc._jvm.Path(histFileName),fs,sc._jvm.Path(tmpFolderName+"/hist"),True,True,con) #copy inc file to temp folder and name it as hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp/inc and destroy the inc file sc._jvm.FileUtil.copy(fs,sc._jvm.Path(incFileName),fs,sc._jvm.Path(tmpFolderName+"/inc"),True,True,con) #copymerge the 2 files (inc and hist) into one file sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmpFolderName),fs,sc._jvm.Path(tmpFileName),True,con,None) sc._jvm.FileUtil.copy(fs,sc._jvm.Path(tmpFileName),fs,sc._jvm.Path(incFileName),True,True,con) unixFirtDayofMonth = get_int_firstDayUnixDate(incFileName) # firtDayofMonth= 1475251200 it is 20161001 unixdate startUnixTime=get_int_fileNameUnixDate(incFileName) #1456808400 this is today's unix datetime rows_t_cf=sc.textFile(incFileName).map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], stime=p[1],flag=p[2])) HiveContext.createDataFrame(hc,rows_t_cf).registerTempTable("t_cf_inc_tmp") hc.sql("select distinct clientmac,stime,flag from t_cf_inc_tmp").registerTempTable("t_cf") df=hc.sql("select distinct ClientMac,stime ,lag(stime) over (partition by ClientMac order by stime) as lag_time ,lead(stime) over (partition by ClientMac order by stime) as lead_time from t_cf where flag=1") df1=df.withColumn("diff" , df["stime"]-df["lag_time"]).na.fill(-1) df1.filter((df1.diff>=outThresh)|(df1.lag_time ==-1)|( 
df1.lead_time==-1)).registerTempTable("df2") df2=hc.sql("select ClientMac,stime,lag_time,lead_time,case when (diff < "+ str(outThresh) +" and diff>0) then diff ELSE 0 end as diff from df2") df3=df2.withColumn("lag_time1",df2.lag_time+df2.diff).drop( "lag_time") df3.withColumn("lag_time2",func.lead("lag_time1").over(Window.partitionBy("clientMac"))).registerTempTable("df3") df4=hc.sql("select ClientMac,cast(stime as int) as ETime ,cast(lag_time2 as int) as LTime,cast((lag_time2- stime) as int) as Seconds from df3").na.fill(-1) df5=df4.filter((df4.LTime>0)&(df4.Seconds>=inThresh)&(df4.ETime>startUnixTime)&(df4.ETime<(startUnixTime+86400))).withColumn("ENTITYID",lit(EntityID)) #86400 is seonds in one day df5.registerTempTable("df5") #DF5 will be save to hbase as indoor details(rec_destIndoorfile) ,df6 and df7 will be used for stats caculation df6=hc.sql("select ClientMac,ETime, LTime, Seconds ,unix_timestamp(date_sub(from_unixtime(etime),0),'yyyy-MM-dd') as utoday from df5") df6.registerTempTable("df6_indoor") df7=hc.sql("select ClientMac,min(etime) as etime,max(ltime) as ltime,sum(Seconds) as seconds,utoday from df6_indoor group by ClientMac,utoday") df_current_result=df7.withColumn("ENTITYID",lit(EntityID)).withColumn('UFIRSTDAY',lit(unixFirtDayofMonth)) flow_sql= "select ClientMac,min(stime) as etime,max(stime) as ltime from t_cf where stime >"+str(startUnixTime) + " and stime <"+str(startUnixTime+86400)+" group by clientmac" hc.sql(flow_sql).registerTempTable("df_flow_tmp") df_flow=hc.sql("select ClientMac,etime,ltime,unix_timestamp(date_sub(from_unixtime(etime),0),'yyyy-MM-dd') as utoday from df_flow_tmp").withColumn("ENTITYID",lit(EntityID)).withColumn('UFIRSTDAY',lit(unixFirtDayofMonth)) #df_flow.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_FLOW_TODAY") .option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save() #df_flow.saveAsTable("T_FLOW") if len(df5.head(1))==1: #df5 is not empty better than df5.rdd.isEmpty tmp_rec_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/rec_indoor"+str(progTag) df5.select('clientmac','entityid','etime','ltime','seconds').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_rec_destIndoorFolder) #df5.write.mode('overwrite').json(tmp_rec_destIndoorFolder) df5.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_INDOOR") .option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save() if fs.exists(sc._jvm.Path(rec_destIndoorfile)): #the old indoor folder exists,will generate df_delete_pk for phoenix to delete invalid rows rows_rec_indoor=sc.textFile(rec_destIndoorfile).map(lambda r: r.split(",")).map(lambda p: Row(clientmac=str(p[0]), entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4]))) HiveContext.createDataFrame(hc,rows_rec_indoor).registerTempTable("df_old_indoor") df_old_indoor_pk=hc.sql("select ClientMac,ENTITYID,ETime from df_old_indoor") df_current_result_pk=hc.sql("select ClientMac,ENTITYID,ETime from df5") df_delete_pk = df_old_indoor_pk.subtract(df_current_result_pk) if len(df_delete_pk.head(1))==1: df_delete_pk.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_INDOOR_FOR_DELETE").option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save() else: tmp_rec_destIndoorFolder="NONE" if len(df_flow.head(1))==1: tmp_destFlowFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/flow"+str(progTag) 
df_flow.select('clientmac','entityid','etime','ltime','utoday','ufirstday').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_destFlowFolder) #df_flow.write.mode('overwrite').json(tmp_destFlowFolder) else: tmp_destFlowFolder="NONE" if len(df_current_result.head(1))==1: tmp_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/indoor"+str(progTag) df_current_result.select('clientmac','entityid','etime','ltime','seconds','utoday','ufirstday').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_destIndoorFolder) #df_current_result.write.mode('overwrite').json(tmp_destIndoorFolder) else: tmp_destIndoorFolder="NONE" sc._jvm.FileUtil.copy(fs,sc._jvm.Path(incFileName),fs,sc._jvm.Path(histFileName),True,True,con) if fs.exists(sc._jvm.Path(destIndoorFile)): fs.delete(sc._jvm.Path(destIndoorFile)) if fs.exists(sc._jvm.Path(destFlowFile)): fs.delete(sc._jvm.Path(destFlowFile)) if fs.exists(sc._jvm.Path(rec_destIndoorfile)): fs.delete(sc._jvm.Path(rec_destIndoorfile)) #delete is a must if file already exists otherwise copymerge will fail if tmp_destIndoorFolder!="NONE": sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_destIndoorFolder),fs,sc._jvm.Path(destIndoorFile),True,con,None) #destIndoorFile=get_str_indoorFileName(incFileName) if tmp_destFlowFolder!="NONE": sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_destFlowFolder),fs,sc._jvm.Path(destFlowFile),True,con,None) if tmp_rec_destIndoorFolder!="NONE": sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_rec_destIndoorFolder),fs,sc._jvm.Path(rec_destIndoorfile),True,con,None)
datalines = datalines.map(lambda x: cleans(x))

# 1 imports
from pyspark.sql.types import *

# 2 create metadata
fields = [StructField(field_name, StringType(), True) for field_name in firstline]
schema = StructType(fields)

# 3 create a dataframe
schemaLoans = sqlContext.createDataFrame(datalines, schema)

# 4 register it as a table called loans
schemaLoans.registerTempTable("loans")

# 1 drop table, summarize and store in hive
sqlContext.sql("drop table if exists LoansByTitle")
sql = '''create table LoansByTitle stored as parquet as select title, count(1) as number from loans group by title order by number desc'''
sqlContext.sql(sql)
sqlContext.sql('drop table if exists raw')
sql = '''create table raw stored as parquet as select title, emp_title,grade,home_ownership,int_rate,recoveries,collection_recovery_fee,loan_amnt,term from loans'''
sqlContext.sql(sql)  # execute the second CTAS (assumed completion, mirroring the LoansByTitle step above)
from pyspark import SparkContext
sc = SparkContext()
from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

tbl_Readmissions = sqlContext.sql("Select Provider_ID, Measure_ID, Compared_to_National, Denominator, Score, Lower_Estimate, Higher_Estimate From tbl_Readmissions_RAW where Score <> 'Not Available' and Measure_ID = 'READM_30_HOSP_WIDE'").rdd
tbl_Readmissions.saveAsTextFile("/user/w205/hospital_compare_txt_TRANSFORMED/txt_readmissions/")
sqlContext.createDataFrame(tbl_Readmissions).write.parquet("/user/w205/hospital_compare_tbl_TRANSFORMED/tbl_readmissions/")
exit()
dailyDateIntervalDictionaryToCalculateFor = DateIntervalManager.createDailyIntervalDictionaryForPastYear(yesterday_date)
number_of_days_in_dictionary = dailyDateIntervalDictionaryToCalculateFor.getNumberOfDaysInDictionary()
minimum_number_of_days = int((4.0 / 7.0) * float(number_of_days_in_dictionary))

mapStockCsvToKeyValueClosure = StockRdd.getMapStockCsvToKeyValueForDatesInDictionaryClosure(dailyDateIntervalDictionaryToCalculateFor)
symbol_creation_function_closure = StockRdd.getSymbolDataInstanceForDateDictionaryDataPointsClosure(dailyDateIntervalDictionaryToCalculateFor, yesterday_date)

symbol_down_stocks_data_filtered = sample_data_rdd.map(mapStockCsvToKeyValueClosure)\
    .filter(lambda line: not(line is None))\
    .reduceByKey(lambda a, b: a + b)\
    .map(lambda tuple: (tuple[0], StockRdd.sort_and_compute_deltas(list(tuple[1]))))\
    .filter(lambda tuple: len(list(tuple[1])) > minimum_number_of_days)\
    .map(symbol_creation_function_closure)\
    .filter(lambda symbol_and_instance_tuple: not(symbol_and_instance_tuple[1].getTodayPrice() is None))\
    .map(StockRdd.getDownStocksDataTuple)\
    .filter(lambda data_tuple: not(data_tuple[1] is None))\
    .filter(lambda data_tuple: not(data_tuple[1] == float("inf")))

symbol_down_stocks_data_filtered_rows = symbol_down_stocks_data_filtered\
    .map(lambda tuple: Row(symbol=tuple[0],
                           span_unit_delta_percentage_ratio=tuple[1],
                           today_price=tuple[2],
                           today_unit_delta_percentage=tuple[3]))

schemaDownStocks = sqlContext.createDataFrame(symbol_down_stocks_data_filtered_rows)
down_stocks_table_name = 'down_stocks'
schemaDownStocks.write.jdbc(url=mysql_url, table=down_stocks_table_name, mode="overwrite")
print 'Number of rows in the table {0}'.format(df_raw.count())

# remove all rows not containing numbers for the score variable
# function to test whether a string can be parsed as an integer
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# create an RDD by filtering out invalid scores
df_clean_rdd = df_raw.rdd.filter(lambda row: CheckValidScore(row.score))

# create a dataframe from the RDD
df_clean = sqlCtx.createDataFrame(df_clean_rdd)
print 'Number of rows in table after cleaning {0}'.format(df_clean.count())

# convert the data type of the score column
df_clean = df_clean.selectExpr('provider_id', 'measure_id', 'cast(score as int) as score')

# get the maximum score per measure_id and broadcast it to the workers
df_max_scores = df_clean.groupBy('measure_id').max().collect()
df_max_scores = sc.broadcast(df_max_scores)

# function to extract max_score for each measure_id
def get_max_score(id):
    return [score[1] for score in df_max_scores.value if score[0] == id][0]

# create a new RDD containing an extra column for the normalized score,
# i.e. the ratio of the current score to the maximum score for the measure_id
from pyspark import SparkContext
sc = SparkContext()
from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

tbl_Measure_Dates = sqlContext.sql("Select Measure_ID, Measure_Name, Measure_Start_Date, Measure_End_Date From tbl_Measure_Dates_RAW").rdd
tbl_Measure_Dates.saveAsTextFile("/user/w205/hospital_compare_txt_TRANSFORMED/txt_measure_dates/")
sqlContext.createDataFrame(tbl_Measure_Dates).write.parquet("/user/w205/hospital_compare_tbl_TRANSFORMED/tbl_measure_dates/")
exit()
##############################
# Find the top 10 procedures with the largest variability among hospitals

# Get our data, but skip null normalizedscore
df_total_quality = sqlContext.sql("select measureid, normalizedscore from total_quality where normalizedscore is not null")
dataByMeasure = df_total_quality.map(lambda r: (r.measureid, r.normalizedscore)).groupByKey()  # Group by measure id
varByMeasure = dataByMeasure.map(lambda r: (r[0], float(np.var(list(r[1])))))  # Calculate variance
sortedVarByMeasure = varByMeasure.sortBy(lambda r: r[1], ascending=False)  # Sort by variance
top_10_variance = sortedVarByMeasure.take(10)

schema = StructType([
    StructField("measureid", StringType(), True),
    StructField("variance", FloatType(), True)])
df_variance = sqlContext.createDataFrame(top_10_variance, schema)
df_top_10_variance = df_variance.orderBy(df_variance.variance.desc()).limit(10)
saveAsHiveTable(df_top_10_variance, "top_10_variating_procedure")

print
print "Top 10 variating procedures"
print
rank = 1
for i in df_top_10_variance.collect():
    print "Rank:%d - MeasureId=%s, Variance=%f" % (rank, i.measureid, i.variance)
    rank += 1
def queryHive(self, timestart=0, timeend=0, tp=1, pt1= (0,0), pt2=(0,0), mmsi=['0'], output='csv', taskid=''): ''' :param timestart: :param timeend: :param tp: :param pt1: :param pt2: :param mmsi: :param output: :param taskid: :return: hivetableName ''' print "in queryHive" year = time.localtime(timestart).tm_year sqlContext = HiveContext(self.sc) # time filter table = None print 'mmsi',mmsi if mmsi[0] != '0': #for x mmsi: table = sqlContext.sql("select * from aisdynamiclog_{0} where drterminalcode={1}".format(year, mmsi[0])).rdd else: table = sqlContext.sql("select * from aisdynamiclog_{0}".format(year)).rdd #table.cache() #print table.count() # space filter rltrdd = None if tp == 0: pt2 = getlowerrightpoint(pt1[0], pt1[1], pt2[0], pt2[1]) pt_times1 = (revertdegree_f(pt1[0]), revertdegree_f(pt1[1])) pt_times2 = (revertdegree_f(pt2[0]), revertdegree_f(pt2[1])) if tp == 1: pt_times1 = (revertdegree_f(pt1[0]), revertdegree_f(pt1[1])) pt_times2 = (revertdegree_f(pt2[0]), revertdegree_f(pt2[1])) if tp == 2: pt_times1 = (revertdegree_f(pt1[0]), revertdegree_f(pt1[1])) if tp == 0 or tp == 1: rltrdd = table.filter(lambda row: callnglat.inside0_1(row, pt_times1, pt_times2)) elif tp == 2: rltrdd = table.filter(lambda row: callnglat.inside2(row, pt_times1, pt2[0])) #empty rdd if rltrdd.count() == 0: return newdf = sqlContext.createDataFrame(rltrdd) # regist with a unique name tablename = "tmptable" newdf.registerAsTable(tablename) #hivetablename = "testtable3" hivetablename = "query_{0}_{1}".format(taskid, year) sql_create = "CREATE TABLE IF NOT EXISTS {0} \ ROW FORMAT DELIMITED FIELDS TERMINATED BY \',\' LINES TERMINATED BY \'\\n\' \ AS SELECT * from ais_model where 1=0".format(hivetablename) sql_insert = "INSERT INTO TABLE {0} select * from {1}".format(hivetablename, tablename) sqlContext.sql(sql_create) sqlContext.sql(sql_insert) return hivetablename
        predictPlayers.append(pp)
    else:
        # print "p not found in mlb?", p
        newname = alternateNames(p.upper(), pids)
        if newname is not None:
            # print "got it: ", newname
            pp['lookup_name'] = newname
            pp['player_id'] = encodedPlayerIds[str(pids[newname])]
            predictPlayers.append(pp)
        else:
            print "REALLY NOT FOUND.", p.upper()

print "predictHitters=", predictHitters
phRDD = sc.parallelize(predictHitters)
phDF = sqlContext.createDataFrame(phRDD, samplingRatio=0.5)
phDF.registerTempTable("fd_hitters")
print "phDF=", phDF.take(2)

print "predictPitchers=", predictPitchers
ppRDD = sc.parallelize(predictPitchers)
ppDF = sqlContext.createDataFrame(ppRDD, samplingRatio=0.5)
ppDF.registerTempTable("fd_pitchers")
print "ppDF=", ppDF.take(22)

encodedHitterFeatures = sqlContext.parquetFile(rddDir + "/batting_features.enc.parquet")
encodedHitterFeatures.registerTempTable("bfe")
hfDF = sqlContext.sql("""select bfe.* from fd_hitters, bfe
                         where fd_hitters.player_id = bfe.player_id
                         and fd_hitters.game_date = bfe.game_date""")
        if progTag == cnt:
            folderLists = x
        cnt = cnt + 1

for x in folderLists:
    entityid = int(get_str_entityID_byFolder(x))
    df_id = df_mysql.filter(df_mysql.ID == entityid)
    inThresh = 10 if (df_id.head(1)[0].IndoorSecondsThrehold == 0) else df_id.head(1)[0].IndoorSecondsThrehold
    outThresh = 300 if (df_id.head(1)[0].LeaveMinutesThrehold == 0) else df_id.head(1)[0].LeaveMinutesThrehold * 60
    incFiles = fs.listStatus(sc._jvm.Path(x))
    for incFile in incFiles:
        if incFile.isFile():
            curTime = str(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time())))
            currrentFile = str(incFile.getPath())
            processData(sc, hc, fs, con, currrentFile, inThresh, outThresh, progTag)
            pd_rowLog = pandas.Series([operType, curTime, currrentFile], index=pd_column)
            pd_dataFrameLog = pd_dataFrameLog.append(pd_rowLog, ignore_index=True)

curDay = str(time.strftime("%Y-%m-%d", time.localtime(time.time())))
df_log = hc.createDataFrame(pd_dataFrameLog)
df_log.sort(df_log.operType, df_log.processDate).repartition(1).write.mode('overwrite').json("/data/log/incData" + str(progTag) + "/" + curDay)
sc.stop()
print 'Number of rows in the table {0}'.format(df_raw.count())

# remove all rows not containing numbers for the score variable
# function to test whether a string can be parsed as an integer
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# create an RDD by filtering out invalid scores
df_clean_rdd = df_raw.rdd.filter(lambda row: CheckValidScore(row.score))

# create a dataframe from the RDD
df_clean = sqlCtx.createDataFrame(df_clean_rdd)
print 'Number of rows in table after cleaning {0}'.format(df_clean.count())

# convert the data type of the score column
df_clean = df_clean.selectExpr('provider_id', 'measure_id', 'cast(score as int) as score')

# create a dataframe for the hospitals table
df_hospital = sqlCtx.table("hospital")

# join the hospitals table to pick up the hospital state
df_clean = df_clean.join(df_hospital, df_clean.provider_id == df_hospital.provider_id, 'left_outer') \
    .select(df_clean.provider_id, df_clean.measure_id, df_hospital.state, df_clean.score)

# get the maximum score per measure_id and name the column as 'max_score'
df_max_scores = df_clean.groupBy('measure_id').max().collect()
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row, HiveContext

#sc = SparkContext(appName="readHDFS")
conf = SparkConf().setMaster("local").setAppName("fromHdfsToHive")
sc = SparkContext(conf=conf)
hvcontext = HiveContext(sc)

ofiles = sc.wholeTextFiles(
    "hdfs:///user/cloudera/out/out4/result-*/part-00000")  # returns (filename, content)

ofiles_format = ofiles.map(lambda x: x[1]) \
    .map(lambda x: [line for line in x.splitlines()]) \
    .map(lambda x: x[0]) \
    .map(lambda x: tuple(x[1:-1].split(','))) \
    .map(lambda r: Row(dttime=r[0], symbol=r[1], txtype=r[2], totalvol=int(r[3])))

ofiles_df = hvcontext.createDataFrame(ofiles_format)
ofiles_df.saveAsTable("stocks_Ordered")  # save dataframe as a persistent Hive table
from pyspark.sql import HiveContext
from pyspark import SparkConf
from pyspark import SparkContext
from operator import add
import datetime

conf = SparkConf().setAppName("EDF").setMaster("local")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)
hiveContext.sql("use db_edf")

firstday = 1325375940  # 31/12/2011 23:59:00
ONE_DAY = 86400  # seconds in a day (was 84600, which is 30 minutes short)

df = hiveContext.sql("select timestp as tp, consumption as cp, site_id as id from consumption")

#while (firstday < 1356998400):
firstday += ONE_DAY
daydate = datetime.datetime.fromtimestamp(float(firstday)).strftime('%Y-%m-%d')
print(daydate)

test = df.filter(df.tp < firstday).groupBy("id").sum("cp")
test.show()

vect_date = [daydate] * len(test.collect())
print(vect_date)
# createDataFrame cannot infer a schema from bare strings, so wrap each date
# in a one-element tuple and name the column
df_date = hiveContext.createDataFrame([(d,) for d in vect_date], ['day'])
df_date.show()
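A possible simplification (a sketch, not the original approach): the day label can be attached as a literal column on the aggregated DataFrame itself, which avoids building and aligning a separate single-column DataFrame of date strings:

from pyspark.sql.functions import lit

# Hypothetical alternative: tag each aggregated row with the current day string.
test_with_date = test.withColumn("day", lit(daydate))
test_with_date.show()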
from pyspark import SparkContext, SparkConf from pyspark.sql import HiveContext, Row conf = SparkConf().setAppName("sparkmoviehashtags") sc = SparkContext(conf=conf) sqlContext = HiveContext(sc) inrdd = sc.textFile("/usr/azure/twitterdata/Tweets*") inrddsplt = inrdd.map(lambda line: line.split("|")) inrddspltflt = inrddsplt.filter(lambda spltln: spltln[5] == "None") inrddspltfltwrds = inrddsplt.filter(lambda spltln: spltln[7] != '') inrdddf = inrddspltfltwrds.map(lambda spltln: (spltln[0],spltln[7].split(","))) inrddfmapv = inrdddf.flatMapValues(lambda x: x) hashtgdf = inrddfmapv.map(lambda hashtag: Row(movie=hashtag[0],hashtags=hashtag[1])) hashtgds = sqlContext.createDataFrame(hashtgdf) hashtgds.printSchema() hashtgdsout = hashtgds.groupBy("movie", "hashtags").count() hashtgdsout.printSchema() hashtgdsordered = hashtgdsout.orderBy("movie","count",ascending=False) hashtgdsordered.show() hashtgdsordered.write.format("orc").option("path","/usr/azure/moviehashtags").mode("overwrite").saveAsTable("moviehashtags")
return None else: info = measureInfo[measureid] minScore = info[0] scoreRange = info[1] return float((score - minScore)/scoreRange) total_quality_normal = total_quality.map(lambda r: (r[0], r[1], r[2], normalizeScore(r[3], r[2]))) schema = StructType([ StructField("providerid", StringType(), True), StructField("state", StringType(), True), StructField("measureid", StringType(), True), StructField("normalizedscore", FloatType(), True)]) df_total_quality = sqlContext.createDataFrame(total_quality_normal, schema) saveAsHiveTable(df_total_quality, "total_quality") # Some hospitals have too few non-NA measure. To have a fair ranking, we want to set a min. bar # on the # of non-NA measure for our hospitals to participate in our evaluation. # For each hospital, find out the # of non-NA measure it has nonNAMeasureCount = dict(df_total_quality.map(lambda r: (r.providerid, r.normalizedscore)). combineByKey( # Use combineByKey to count the # of non-NA Measure lambda value: 0 if value is None else 1, lambda x, value: x if value is None else x + 1, lambda x, y: x + y).collect()) # Find the 25th percentile of non-NA measure, and this will be the min-bar of # of non-NA measure. minMeasureCount = np.percentile(nonNAMeasureCount.values(), 25.)
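A hedged follow-up sketch (qualified_providers and df_qualified are illustrative names, not from the original): once minMeasureCount is known, hospitals below the bar could be filtered out before ranking:

# Keep only providers whose non-NA measure count meets the 25th-percentile bar.
qualified_providers = [pid for pid, cnt in nonNAMeasureCount.items()
                       if cnt >= minMeasureCount]
df_qualified = df_total_quality.filter(
    df_total_quality.providerid.isin(qualified_providers))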
sys.setdefaultencoding('utf-8') bfile = open( '/Users/zimoli/Downloads/RBDA-MCINTOSH/Project/RBDAProject/spark_data/checkin_toronto.txt' ) lines = [] for line in bfile: lines.append(line[:-1].split('\t')) sc = SparkContext("local", "checkins") sqlContext = HiveContext(sc) rdd = sc.parallelize(lines) df = sqlContext.createDataFrame( rdd, ['business_id', 'user_id', 'dates', 'main_cate']) df_d = df.na.drop() timestamps = sc.broadcast([ "2018_10", "2018_09", "2018_08", "2018_07", "2018_06", "2018_05", "2018_04", "2018_03", "2018_02", "2018_01", "2018_00", "2017_11", "2017_10", "2017_09", "2017_08", "2017_07", "2017_06", "2017_05", "2017_04", "2017_03", "2017_02", "2017_01", "2017_00", "2016_11", "2016_10", "2016_09", "2016_08", "2016_07", "2016_06", "2016_05", "2016_04", "2016_03", "2016_02", "2016_01", "2016_00", "2015_11", "2015_10", "2015_09", "2015_08", "2015_07", "2015_06", "2015_05", "2015_04", "2015_03", "2015_02", "2015_01", "2015_00", "2014_11", "2014_10", "2014_09", "2014_08", "2014_07", "2014_06", "2014_05", "2014_04", "2014_03", "2014_02", "2014_01", "2014_00" ]) cates = sc.broadcast([
cluster_center_overall_deltas = map(get_overall_delta_percentage_closure, centers) # Convert the list to a list of tuples mapping cluster number to the performance percentage converted_center_delta_list = DunaganListUtility.convert_list_to_value_and_index_tuple_list(cluster_center_overall_deltas) # Sort the list of tuples by performance (in ascending order) converted_center_delta_list.sort(lambda tuple_1, tuple_2: cmp(tuple_1[1], tuple_2[1])) # Convert the list mapping cluster_number to performance percentage into a list of Row() object for insertion into a database table # (ClusterId, Delta-Percentage) Row list construction converted_center_delta_list_rows = map(lambda delta_tuple: Row(cluster_id=int(delta_tuple[0]), delta_percentage=float(delta_tuple[1])), converted_center_delta_list) print "\n\n\n\nAbout to sqlContext.createDataFrame(converted_center_delta_list_rows)\n\n\n\n" # Create a data frame from the list of Rows schemaCenterDeltas = sqlContext.createDataFrame(converted_center_delta_list_rows) print "\n\n\n\nAbout to schemaCenterDeltas.write.jdbc(url=mysql_url, table=cluster_total_delta_percentages)\n\n\n\n" # Write the data frame to the database in table cluster_total_delta_percentages schemaCenterDeltas.write.jdbc(url=mysql_url, table='cluster_total_delta_percentages', mode="overwrite") # Produce a list which maps cluster numbers to symbols to produce an xref database table # (ClusterId, Symbol) XRef Row List construction cluster_id_symbol_xref_rows_list = [] for cluster_id, list_of_symbols in clusterGroupsDictionaryRdd.items(): for symbol in list_of_symbols: print "cluster_id: " + str(cluster_id) + "\t\tsymbol: " + symbol xrefRow = Row(cluster_id=int(cluster_id), symbol=str(symbol)) cluster_id_symbol_xref_rows_list.append(xrefRow)
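A hedged sketch of the likely next step (the table name cluster_symbol_xref is an assumption, not taken from the original): the xref rows would be written out the same way as the delta percentages above:

# Hypothetical continuation: persist the (cluster_id, symbol) xref rows via JDBC.
schemaClusterSymbolXref = sqlContext.createDataFrame(cluster_id_symbol_xref_rows_list)
schemaClusterSymbolXref.write.jdbc(url=mysql_url, table='cluster_symbol_xref', mode="overwrite")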
class Testing_Resources_Split_1(unittest.TestCase): def setUp(self): warnings.simplefilter("ignore", ResourceWarning) fpath = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) with open(fpath + '/data_source/bookings_overall.json') as bookings_source: self.bookings = json.load(bookings_source) with open(fpath + '/data_source/cfg.json') as cfg_source: self.cfg = json.load(cfg_source) today = '20180402' self.days = optimizer.util.get_days_from_bookings(today, self.bookings) self.sc = SparkContext.getOrCreate() self.hive_context = HiveContext(self.sc) self.schema = optimizer.util.get_common_pyspark_schema() self.bookings_map = optimizer.util.get_bookings_map(self.bookings) def compare_splitted_dfs(self, pandas_df_expected, rows, new_bk_id): booking_spark_df = self.hive_context.createDataFrame(rows, self.schema) spark_df_splitted_rdd = booking_spark_df.rdd.flatMap( optimizer.main.bb_split_method(self.cfg, self.bookings_map, new_bk_id)) spark_df_splitted = self.hive_context.createDataFrame( spark_df_splitted_rdd, self.schema) pandas_df_splitted = spark_df_splitted.select("*").toPandas() print(pandas_df_expected) print(pandas_df_splitted) self.assertTrue( assert_frame_equal(pandas_df_expected, pandas_df_splitted, check_dtype=False) == None) # es: elasticsearch def test_es_bookings_search(self): self.assertTrue(len(self.bookings) >= 3) def test_es_predictions_search(self): es_client_predictions = ESClient(self.cfg['es_host'], self.cfg['es_port'], self.cfg['es_predictions_index'], self.cfg['es_predictions_type']) predictions = es_client_predictions.search({"size": 100}) self.assertTrue(len(predictions) > 0) self.assertTrue(len(predictions) >= 40) def test_get_tbr_ratio(self): es_client_tbr = ESClient(self.cfg['es_host'], self.cfg['es_port'], self.cfg['es_tbr_index'], self.cfg['es_tbr_type']) ands = ['b6', 'b7'] get_tbr_ratio = optimizer.dao.query_builder.get_tbr_ratio( ands, self.bookings_map, es_client_tbr) print('get_tbr_ratio=' + str(get_tbr_ratio)) self.assertTrue(get_tbr_ratio == 1.0) def test_bb_split_method_case1(self): # Testcase type: 1 booking is splitted by another different booking # testcase 1: booking b8 is splitted with a new booking b6. # bk_id: b8, days: ['20180405'], a: ['1'], g: ['g_X'], si: ['1'] # bk_id: b6, days: ['20180405'], a: ['4'], g: ['g_f'], si: ['2'] # original dataframe: ['20180405', ['b8'], [], {}, 0] # splitted to : d1: ands=['b8', 'b6'], minus=[] && d2: ands=['b8'], minus=['b6'] # in this testecase by hand, d2 has the get_bb_count() value 3239, so d2 is valid, but d1 is invalid. 
pandas_df_expected = pandas.DataFrame( columns=['day', 'ands', 'minus', 'allocated', 'amount']) pandas_df_expected.loc[0] = ['20180405', ['b8'], ['b6'], {}, 3239] day, booking_id, new_bk_id = '20180405', 'b8', 'b6' rows = [(day, [booking_id], [], {}, 0)] return self.compare_splitted_dfs(pandas_df_expected, rows, new_bk_id) def test_bb_split_method_case2(self): pandas_df_expected = pandas.DataFrame( columns=['day', 'ands', 'minus', 'allocated', 'amount']) pandas_df_expected.loc[0] = ['20180405', ['b6', 'b7'], [], {}, 8900] day, booking_id, new_bk_id = '20180405', 'b6', 'b7' rows = [(day, [booking_id], [], {}, 0)] return self.compare_splitted_dfs(pandas_df_expected, rows, new_bk_id) def test_bb_split_method_case3(self): pandas_df_expected = pandas.DataFrame( columns=['day', 'ands', 'minus', 'allocated', 'amount']) pandas_df_expected.loc[0] = [ '20180405', ['b8'], ['b6', 'b7'], {}, 3239 ] pandas_df_expected.loc[1] = [ '20180405', ['b6', 'b7'], ['b8'], {}, 8900 ] new_bk_id = 'b7' rows = [('20180405', ['b8'], ['b6'], {}, 3239), ('20180405', ['b6'], ['b8'], {}, 0)] return self.compare_splitted_dfs(pandas_df_expected, rows, new_bk_id) def test_bb_split_method_case4(self): pandas_df_expected = pandas.DataFrame( columns=['day', 'ands', 'minus', 'allocated', 'amount']) pandas_df_expected.loc[0] = [ '20180405', ['b6', 'b7', 'b10'], [], {}, 8900 ] new_bk_id = 'b10' rows = [('20180405', ['b6', 'b7'], [], {}, 8900)] return self.compare_splitted_dfs(pandas_df_expected, rows, new_bk_id) def test_bb_split_method_case5(self): pandas_df_expected = pandas.DataFrame( columns=['day', 'ands', 'minus', 'allocated', 'amount']) pandas_df_expected.loc[0] = [ '20180405', ['b8'], ['b6', 'b7', 'b9'], {}, 3239 ] pandas_df_expected.loc[1] = [ '20180405', ['b6', 'b7'], ['b8', 'b9'], {}, 8900 ] new_bk_id = 'b9' rows = [('20180405', ['b8'], ['b6', 'b7'], {}, 3239), ('20180405', ['b6', 'b7'], ['b8'], {}, 8900)] return self.compare_splitted_dfs(pandas_df_expected, rows, new_bk_id)
i = 0 for index, row in pd_df_reviews.iterrows(): tot = 0 totwords = 0 print(row) if pd.notnull(row['_2']): cleantext = cleanText(row['_2']) totwords = len(cleantext) #return the 5 n-grams with the highest PMI unigrams = top5_words(cleantext) bigrams = top5_bigram_collocations(cleantext) trigrams = top5_trigram_collocations(cleantext) df.ix[i, 'BusinessID']= row['_1'] df.ix[i, 'CleanText']= " ".join(cleantext).encode('utf-8') df.ix[i, 'TotWords']= totwords df.ix[i, 'unigrams']= unigrams df.ix[i, 'bigrams']= bigrams df.ix[i, 'trigrams']= trigrams i += 1 spark_df = sqlContext.createDataFrame(df, columns) # Save it as a table spark_df.registerTempTable("dfBusiness500000") sqlContext.sql("drop table if exists result2_bybusiness") sqlContext.sql("CREATE TABLE result2_bybusiness AS SELECT * FROM dfBusiness500000")
class StravaLoader(object): def __init__(self, data_source='local', activity_directory='strava-activities-subset', s3bucket='larsbk', athletes=None, activity_types=[ 'Ride', 'Run', 'NordicSki' ], sc=None, hiveContext=None, conf=(SparkConf().setAppName('Strava analysis')), filter_bug_inducing_rows=True ): ''' Initialize Strava Analysis object''' # INPUT PARAMETERS self.athletes = athletes # Athletes to analyze (optional) self.activity_types = activity_types # Activity_types to consider (default) self.filter_bug_inducing_rows = filter_bug_inducing_rows # CONFIGURE SPARK if sc != None and hiveContext != None: # Both contexts were supplied by user print 'Info: Using supplied SparkContext and HiveContext' self.sc = sc self.hiveContext = hiveContext else: # Initialize new contexts print 'Info: Intitializing SparkContext and hiveContext from (default) conf' self.sc = SparkContext(conf=conf) self.hiveContext = HiveContext(self.sc) self.schema = pickle.load(open('./schema.p', 'rb')) # The pre-defined schema self.df = None # Empry DataFrame to be populated later # CONFIGURE DATA SOURCE data_root_path = { 's3': 's3n://%s/%s/' % (s3bucket, activity_directory), 'local': './%s/' % activity_directory } if data_source not in data_root_path.keys(): # Check if data source is valid raise Exception(('Unrecognized data source %s. ' 'Supported sources: "%s".') \ % '", "'.join(data_root_path.keys())) self.data_source = data_source # This is a valid data source self.path = data_root_path[data_source] # This is the path to the data # (S3 SPECIFIC STUFF) if data_source == 's3': # Get a list of files in he activity_directorys bucket = boto3.resource('s3').Bucket(s3bucket) objects = bucket.objects.filter(Prefix='%s/gpx/' % activity_directory) files = [obj.key for obj in objects] # Make set of observed combinations of athlete and activity_type athlete_and_type = set([]) # Empty set to populate fpattern = '\/([\w]+)\/(?:[\w-]+)-([\w]+)\.gpx' # File name pattern for fname in files: match = re.match(activity_directory+'/gpx'+fpattern, fname) if match: athlete_and_type.add((match.group(1), match.group(2))) self.s3_athlete_and_type = athlete_and_type # Save set for later use pass def _get_athlete_directories(self): ''' Look for athlete directories in data_root_path \ and update self.athletes ''' if self.data_source in ['local']: self.athletes = [ directory for directory in os.listdir(self.path+'gpx/') if re.match('^[\w-]+$', directory) ] else: print ('Warning: Automatic directory/athlete detection not yet supported for ' 'data source %s. 
Using: "akrogvig", "lkrogvig", "brustad"') \ % self.data_source self.athletes = ['akrogvig', 'lkrogvig', 'brustad'] pass def _activities_exist(self, athlete, activity_type): ''' Checks if there exists activities of type <activity_type> for athlete <athlete>, returns a boolean value ''' # Check local directory with glob if self.data_source == 'local': return glob.glob(self.path+'gpx/%s/*%s.gpx' % (athlete, activity_type)) # Check if combination exists by using previously compiled sets elif self.data_source == 's3': return ((athlete, activity_type) in self.s3_athlete_and_type) def _load_dataset(self): ''' Loads strava activities from source to DataFrame self.df ''' # Get athlete list if not already set if not self.athletes: self._get_athlete_directories() # Initialize empty dataset self.df = self.hiveContext.createDataFrame( self.sc.emptyRDD(), self.schema ) for athlete in self.athletes: for activity_type in self.activity_types: # Check that there are files of that type (or else .load fails) if self._activities_exist(athlete, activity_type): # Read data dfadd = self.hiveContext.read.format('com.databricks.spark.xml') \ .options(rowTag='trkpt', treatEmptyValuesAsNulls=False) \ .schema(self.schema) \ .load(self.path+'gpx/%s/*%s.gpx' % (athlete, activity_type)) dfadd = dfadd.withColumn('athlete', lit(athlete)) \ .withColumn('activity_type', lit(activity_type)) self.df = self.df.unionAll(dfadd) if self.filter_bug_inducing_rows: self.df = self.df.filter(self.df['extensions.gpxtpx:TrackPointExtension.#VALUE'].isNull()) pass def derive_schema(self): ''' Loads all data in self.path and derives the schema, saves with pickle to "schema.p" ''' df = self.hiveContext.read.format('com.databricks.spark.xml') \ .options(rowTag='trkpt') \ .load(self.path+'gpx/*') df = df.withColumn('athlete',lit(None).cast(StringType())) \ .withColumn('activity_type',lit(None).cast(StringType())) df.printSchema() pickle.dump(df.schema, open("schema.p", "wb")) pass def get_dataset(self): ''' Returns strava activity dataset ''' if not self.df: self._load_dataset() return self.df
from pyspark import SparkContext
sc = SparkContext()

from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

tbl_Hospitals = sqlContext.sql("select Provider_ID, Hospital_Name, State from tbl_Hospitals_RAW").rdd
tbl_Hospitals.saveAsTextFile("/user/w205/hospital_compare_txt_TRANSFORMED/txt_hospitals/")
sqlContext.createDataFrame(tbl_Hospitals).write.parquet("/user/w205/hospital_compare_tbl_TRANSFORMED/tbl_hospitals/")

exit()
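A small hedged check (not in the original script): the parquet output can be read back to confirm it round-trips:

# Hypothetical verification step: reload the parquet table and count rows.
tbl_check = sqlContext.read.parquet("/user/w205/hospital_compare_tbl_TRANSFORMED/tbl_hospitals/")
print tbl_check.count()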
return df.write.jdbc(url="jdbc:mysql://localhost:3306/rdbms", table=table, mode="overwrite", properties={"user": "******"}) # Initializing Spark sc = SparkContext() sc.setLogLevel("WARN") rdd = sc.sequenceFile("hdfs:///flume/events/*/*/*").map( lambda x: Row(*x[1].split(","))) sqlContext = HiveContext(sc) df = sqlContext.createDataFrame(rdd, [ "purchaseDate2", "productName", "productPrice", "productCategory", "clientIPAddress" ]).cache() #sparkTopCategories topCategories = df.groupBy("productCategory")\ .count()\ .select(col("productCategory"), col("count").alias("cnt"))\ .orderBy(col("count").desc())\ .limit(10) topCategories.show() writeMYSQL(topCategories, "sparkTopCategories") #sparkTopProducts topProducts = df.groupBy("productCategory", "productName")\ .count()\ .orderBy(col("productName").asc(), col("productCategory").asc(), col("count").desc())\
# remove all rows not containing numbers in the score variable
# helper to test whether a string can be parsed as an integer
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# create RDDs by filtering out invalid scores
df_survey_clean_rdd = df_survey_raw.rdd.filter(lambda row: CheckValidScore(row.hcahps_base_score))
df_care_clean_rdd = df_care_raw.rdd.filter(lambda row: CheckValidScore(row.score))

# create DataFrames from the RDDs
df_survey_clean = sqlCtx.createDataFrame(df_survey_clean_rdd)
df_care_clean = sqlCtx.createDataFrame(df_care_clean_rdd)
print 'Number of rows in survey table after cleaning {0}'.format(df_survey_clean.count())
print 'Number of rows in effective_care table after cleaning {0}'.format(df_care_clean.count())

# compute the combined survey score as an integer
df_survey = df_survey_clean.selectExpr('provider_id',
                                       '(cast(hcahps_base_score as int) + cast(hcahps_consistency_score as int)) as survey_score')

# convert the data type of the score column
df_care = df_care_clean.selectExpr('provider_id', 'measure_id', 'cast(score as int) as score')

# get the maximum score per measure_id and broadcast it
df_care_max_scores = df_care.groupBy('measure_id').max().collect()
df_care_max_scores = sc.broadcast(df_care_max_scores)
features = [ 'area', 'down_oscillation', 'down_stroke', 'down_stroke_ratio', 'down_stroke_zaihe', 'down_up_oscillation_ratio', 'down_up_stroke_zaihe_ratio', 'down_up_zaihe_ratio', 'down_zaihe', 'left_upper_area', 'left_upper_area_ratio', 'max_weiyi', 'max_weiyi_zaihe', 'max_zaihe', 'min_max_zaihe_ratio', 'min_weiyi', 'min_weiyi_zaihe', 'min_zaihe', 'up_oscillation', 'up_stroke', 'up_stroke_ratio', 'up_stroke_zaihe', 'up_zaihe' ] print(current_timestamp(), '-' * 30 + 'starting') abnormal_sql = """ select * from industry.dagang_abnormal where dwdm like '03%' """ fault_sql = """ select * from industry.fault_segment_2c """ abnormal_data = sqlCtx.sql(abnormal_sql) \ .repartition(num_partitions, 'dxmc').cache() fault_data = sc.broadcast(sqlCtx.sql(fault_sql).toPandas()) print(current_timestamp(), '-' * 30 + 'spark learning_data rows count', abnormal_data.count()) res_rdd = abnormal_data.rdd.mapPartitions(partition_func).cache() print(current_timestamp(), '-' * 30 + 'res_rdd rows count: ', res_rdd.count()) res_df = sqlCtx.createDataFrame(res_rdd, schema=['res']).toPandas() res_df.to_csv('abnormal_feature.csv', index=False, header=None) print(current_timestamp(), '-' * 30 + 'finished')
#dates = forecast_data_v3.get_dates()
dates = forecast_data_v4.get_dates()
# print(dates)
# print(type(dates))

# Note: np.reshape returns a new array and its result was being discarded here,
# so these calls were no-ops; np.vstack below works directly on the 1-D arrays.
# np.reshape(prediction_results, (4, 1))
# np.reshape(outage_probabilities, (4, 1))
# np.reshape(dates, (4, 1))
# print(prediction_results.shape)
# print(dates.shape)

#test_results = [1,0,0,0]
#t = np.asarray(test_results)
#np.reshape(t,(4,1))

state = ['Rhode Island', 'Rhode Island', 'Rhode Island', 'Rhode Island']
city = ['Providence', 'Providence', 'Providence', 'Providence']
s = np.asarray(state)
c = np.asarray(city)
# np.reshape(s, (4, 1))
# np.reshape(c, (4, 1))

# stack the five length-4 arrays into a (5, 4) array and transpose to (4, 5)
combined = np.vstack((s, c, dates, outage_probabilities, prediction_results)).T
#combined = np.vstack((s, c, dates, outage_probabilities, t)).T

final_df = pd.DataFrame(
    combined, columns=['state', 'city', 'date', 'probability', 'outage'])
final_df = hive_context.createDataFrame(final_df)
final_df.write.mode("overwrite").saveAsTable('RI_Outage_Table')
final_df.show()
final_df.printSchema()

#result = sqlContext.sql('SELECT outage, date, "Providence, RI" AS location FROM RI_Outage_Table')
#result.show()
class HqlSpark(): # init Spark sql def __init__(self, language='cn'): self.conf = SparkConf().setAppName("HqlSparkAPP").setSparkHome( "/home/spark").set("spark.driver.allowMultipleContexts", "true") self.sc = SparkContext(conf=self.conf) self.sql = HiveContext(self.sc) self.sql.sql('use anserchapp') self.curlan = language self.curDataWord = None # self.loadWord_state = False #get current language word def getAllWord(self, language='cn'): if self.curlan == language and self.curDataWord: return self.curlan = language sql_s = 'select * from searchapp_%s limit 100' % self.curlan self.curDataWord = self.sql.sql(sql_s) def refreshTable(self, tableName): self.sql.refreshTable(tableName) #create table def createTable(self, sql_sentence, tableName): # table_name = sql_sentence.split(' ')[2] self.sql.sql(sql_sentence) self.sql.refreshTable(tableName) print 'create table success' #insert data into table def insertData(self, sql_sentence): self.sql.sql(sql_sentence) # insert data into table from data_struct def insertDataFromStruct(self, data, tableName='searchapp_', d_type='cn', state=False): #data tuple or list list data, # rdd = self.sc.parallelize(data) if d_type == '': in_data = self.sql.createDataFrame(data, als._categry_shame) elif d_type == 'hint': in_data = self.sql.createDataFrame(data, als.hintWord_shame) d_type = '' else: in_data = self.sql.createDataFrame(data, als.searchApp_shame) # final_data = in_data if state: in_data.saveAsTable(tableName=tableName + d_type, Source='metastore_db', mode='append') # append overwrite else: in_data.saveAsTable(tableName=tableName + d_type, Source='metastore_db', mode='overwrite') # delete table def deleteDataFromTable(self, table='searchapp_', d_type='ch'): sql_sentence = 'delete from ' + table + d_type self.sql.dropTempTable(table + d_type) self.sql.refreshTable(table + d_type) def showTales(self): table_list = [] tables = self.sql.sql('show tables').collect() for table in tables: table_list.append(table['tableName']) return table_list def getData(self, sql_hive): datas = self.sql.sql(sql_hive).collect() return datas #according input words find hintword from table hintword def selectHintWord(self, base_wordFr): hintWord = self.sql.sql('select word,hintWord from hintword') word = hintWord.join(base_wordFr, hintWord.hintWord == base_wordFr.word, 'outer').select(hintWord.word).distinct() word_news = self.curDataWord.join( word, word.word == self.curDataWord.word, 'outer').select(self.curDataWord.word, 'priority', 'searchApp', 'searchCount', 'genre').distinct() word_news = word_news.dropna(how='any') return word_news #according to appId find word from searchapp def selectAppIdWord(self, appIds): result = None for appId in appIds: if result == None: result = self.curDataWord.filter( functions.array_contains(self.curDataWord.searchapp, appId)).select( 'word', 'priority', 'searchApp', 'searchCount', 'genre').distinct() res = self.curDataWord.filter( functions.array_contains(self.curDataWord.searchapp, appId)).select( 'word', 'priority', 'searchApp', 'searchCount', 'genre').distinct() result = result.unionAll(res) word = result.select('word') result = result.dropna(how='any') return result, word #according to genre id find word from searchApp def selectGenreWord(self, genreIds): result = None for gId in genreIds: if result == None: result = self.curDataWord.filter( functions.array_contains(self.curDataWord.genre, gId)).select( 'word', 'priority', 'searchApp', 'searchCount', 'genre').distinct() res = self.curDataWord.filter( 
functions.array_contains(self.curDataWord.genre, gId)).select('word', 'priority', 'searchApp', 'searchCount', 'genre').distinct() result = result.unionAll(res) return result # get all word for analysis def getAnalysisWords(self, appIds, genreIds): if appIds == None or len(appIds) <= 0: return None appWord, word = self.selectAppIdWord(appIds) genreWord = None thinkWord = None if genreIds != None and len(genreIds) > 0: genreWord = self.selectGenreWord(genreIds) if word and word.count() > 0: thinkWord = self.selectHintWord(word) if appWord and genreWord and thinkWord: appWord = appWord.unionAll(genreWord) appWord = appWord.unionAll(thinkWord) return appWord.distinct() # return appWord.unionAll(genreWord).unionAll(thinkWord).distinct() elif appWord and genreWord: return appWord.unionAll(genreWord).distinct() elif appWord and thinkWord: appWord = appWord.unionAll(thinkWord) return appWord.distinct() elif genreWord and thinkWord: genreWord = genreWord.unionAll(thinkWord) return genreWord.distinct() elif appWord: return appWord.distinct() elif genreWord: return genreWord.distinct() else: return thinkWord.distinct() #build Matrix def buildMatrix(self, words): class_all = self.sql.sql( "select genreID from category order by genreID desc") c_genres = class_all.collect() genres = {} i = 0 for c in c_genres: genres.setdefault(c.genreID, i) i += 1 datas = words.select('genre').collect() mlength = len(c_genres) nlength = len(datas) Matrix = numpy.zeros((nlength, mlength)) num = 0 print len(Matrix) for data in datas: for ge in data.genre: Matrix[num][genres.get(ge)] = 1 num += 1 return Matrix #get Input data def getInPut(self, appIds, genreIds): words = self.getAnalysisWords(appIds, genreIds) return self.buildMatrix(words) # k_means analysis def spark_means(self, Matrix, Kcluster=2, MaxIterations=10, runs=10): cluster_data = self.sc.parallelize(Matrix) trains = KMeans().train(cluster_data, Kcluster, MaxIterations, runs) results = trains.predict(cluster_data).collect() return results #combine word def combine_data(self, words=None, result=None): len_re = len(result) len_w = words.count() if len_re != len_w: print 'word num :', len_w, ' is not equal result num:', len_re if len_re < len_w: words = self.sql.createDataFrame(words.take(len_re)) else: result = result[0:len_w] print words.count(), len(result) result = map(lambda x: str(x), result) cluster_re = self.sc.parallelize(result, 1) # print cluster_re.collect(),words.map(list).count() re = words.map(list).repartition(1).zip(cluster_re).map(lambda p: Row(word=p[0][0], priority=int(p[0][1]),\ searchcount=int(p[0][3]),cluster=p[1])) cluster_sha = self.sql.createDataFrame(re) # cluster_sha.show() return cluster_sha # select Class Word def selectWord(self, cluster_sha, top_K=2): df = cluster_sha select_par = df.groupBy('cluster').agg({ 'searchcount': 'avg', 'priority': 'avg' }).collect() ClusterNum = len(select_par) clusterWord = [] for line in select_par: cluster_df = df.filter(df.cluster == line[0]).select( 'word', 'priority', 'searchcount') ClassWord = cluster_df.filter( cluster_df.searchcount > line[1]).select('word', 'priority') ClassWord = ClassWord.filter( ClassWord.priority >= line[2]).select('*').limit(top_K) KeyWords = cluster_df.filter( cluster_df.searchcount < line[1]).select('word', 'priority') KeyWords = KeyWords.filter( KeyWords.priority >= line[2]).select("*").limit(top_K) cluster = { 'cluster_id': line[0], 'classWord': ClassWord.toJSON().collect(), 'keyWord': KeyWords.toJSON().collect() } clusterWord.append(cluster) result = {'ClusterNum': 
ClusterNum, 'AllCluster': clusterWord} return result
class SparkEngine(object): def __init__(self, sc, debug=False): self.export_path = os.environ['COOPERHEWITT_ROOT'] + "/export/" self.sc = sc # hive requires writable permissions: ~/ephemeral-hdfs/bin/hadoop fs -chmod 777 /tmp/hive self.hive_cxt = HiveContext(sc) self.sql_cxt = SQLContext(sc) if debug: print "{0}\n{1}\n{2}\n".format(sc.master, self.hive_cxt, self.sql_cxt) print sc._conf.getAll() #TBD destructor Unpersist memory ### functionality to query and create tables def _create_df_table(self, schema, frame, name): if schema: df = self.hive_cxt.createDataFrame(frame, schema=schema) else: df = self.hive_cxt.createDataFrame(frame) df.printSchema() df.registerTempTable(name) self.hive_cxt.cacheTable(name) return df def _query_temporal_data(self): # step 1. create main temporal table # n_obs => first join causes for each pen entry * num location entries existent (dependent on time period) samples_temporal_tb = self.hive_cxt.sql(""" SELECT s.refers_to_object_id, created, visit_raw, room_floor, room_id, room_name, spot_id, spot_name, spot_description, room_count_objects, room_count_spots, spot_count_objects, abs(datediff( from_utc_timestamp(from_unixtime(created, "yyyy-MM-dd"), 'US/Eastern'), from_utc_timestamp(from_unixtime(visit_raw, "yyyy-MM-dd"), 'US/Eastern') )) as delta FROM samples s JOIN temporal t ON s.refers_to_object_id = t.refers_to_object_id ORDER by s.refers_to_object_id, created, delta """) samples_temporal_tb.registerTempTable('samplestemporal') self.hive_cxt.cacheTable('samplestemporal') return samples_temporal_tb def _minimize_query(self): # From the temporal table, we need minimize the location (multiple locations) to the appropriate sample timestamp tb_samples = self.hive_cxt.sql(""" SELECT * FROM ( SELECT *, MIN(delta) OVER ( PARTITION BY refers_to_object_id, created) AS min_delta, row_number() OVER ( PARTITION BY refers_to_object_id, created) AS ranks FROM samplestemporal st ORDER BY refers_to_object_id ) query where query.ranks = 1 """) tb_samples = tb_samples.withColumn("meta_store", lit(1)) tb_samples.registerTempTable('minimizedsamples') self.hive_cxt.cacheTable('minimizedsamples') return tb_samples def execute_query(self, (samples_schema, samples_frame, samples_name), (temporal_schema, temporal_frame, temporal_name), cols): self.df_samples = self._create_df_table(samples_schema, samples_frame, samples_name) self.df_temporal = self._create_df_table(temporal_schema, temporal_frame, temporal_name) self.tb_meta = self._query_temporal_data() self.tb_meta_min = self._minimize_query() # combine to the original pen data (meta_store indicates if we had object data to integrate) self.df_samplesmeta = self.df_samples.join(self.tb_meta_min, ['refers_to_object_id', 'created'], "left_outer") self.df_samplesmeta = self.df_samplesmeta.fillna({'meta_store': 0}) self.df_samplesmeta.printSchema() # pickle file to pandas: alternatively we can store as a json or parquet columnar format dropped_cols = ['delta', 'min_delta', 'ranks'] + cols samplesmeta_pd = self.df_samplesmeta.toPandas() samplesmeta_pd = samplesmeta_pd.drop(dropped_cols, axis=1) samplesmeta_pd.to_pickle(self.export_path + "penmeta_spark.pkl")
# Configure Spark settings
conf = SparkConf()
conf.set("spark.executor.memory", "1g")
conf.set("spark.cores.max", "2")
conf.setAppName("Spark")

# Initialize SparkContext.
sc = SparkContext('local', conf=conf)

# Test with a data file, I used an auto data file
sqlContext = HiveContext(sc)
sqlContext.setConf("hive.exec.dynamic.partition", "true")
sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
sqlContext.sql("CREATE DATABASE IF NOT EXISTS test")

pandas_df = pd.read_csv('D:\\value.csv')  # assuming the file contains a header
# pandas_df = pd.read_csv('file.csv', names = ['column 1','column 2'])  # if no header

s_df = sqlContext.createDataFrame(pandas_df)
s_df.count()
#print (s_df.count())
s_df.write.mode("overwrite").saveAsTable("test.value")

#sqlContext.sql("CREATE DATABASE IF NOT EXISTS test")
tab = sqlContext.sql("select * from test.value")
tab.printSchema()
tab.show(5, False)
def parsePoint(d): ## wont be able to use line.split here? d_copy = deepcopy(d) # I hate using deepcopy so much pred = d_copy['success_metric'] d.pop('success_metric', None) values = [float(x) for x in d.values()] ##this block is unusable until we have our Hive Data return (pred, Vectors.dense(values)) # training set trainParsed = sc.parallelize(map(parsePoint, train_dict)) # test set testParsed = sc.parallelize(map(parsePoint, test_dict)) ## create validation set trainDf = sqlContext.createDataFrame(trainParsed, ["label", "features"]) testDf = sqlContext.createDataFrame(testParsed, ["label", "features"]) lm_model = LinearRegression(featuresCol="features", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6) lm_model_fit = lm_model.fit(trainDf) lm_transform = lm_model_fit.transform(trainDf) results = lm_transform.select(lm_transform['prediction'], lm_transform['label']) MSE = results.map(lambda (p,l):(p-l)**2).reduce(lambda x,y:x+y)/results.count() print("Linear Regression training Mean Squared Error = " + str(MSE)) lm_transform = lm_model_fit.transform(testDf) results = lm_transform.select(lm_transform['prediction'], lm_transform['label']) MSE = results.map(lambda (p,l):(p-l)**2).reduce(lambda x,y:x+y)/results.count() print("Linear Regression testing Mean Squared Error = " + str(MSE)) res = results.collect() predsAndLabels = sc.parallelize([i.asDict().values() for i in res])
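As a hedged alternative sketch, the same MSE can be obtained from pyspark.ml's built-in evaluator instead of the hand-rolled map/reduce (the "mse" metric name is assumed to be available in the Spark version in use):

from pyspark.ml.evaluation import RegressionEvaluator

# Hypothetical variant: evaluate test-set MSE with RegressionEvaluator.
evaluator = RegressionEvaluator(metricName="mse", labelCol="label",
                                predictionCol="prediction")
test_mse = evaluator.evaluate(lm_model_fit.transform(testDf))
print("Linear Regression testing Mean Squared Error = " + str(test_mse))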
|attr|obj|
+----+---+
|   1|  a|
|   2|  a|
|   3|  a|
|   1|  b|
|   2|  b|
|   3|  b|
|   1|  c|
|   3|  c|
+----+---+
"""
schema = StructType([StructField("attr", StringType(), True),
                     StructField("obj", StringType(), True)])
aoDF = sqlCtx.createDataFrame(aoPair, schema)

# Window that moves over rows of the same obj, sorted by attr
window = Window.orderBy("attr").partitionBy("obj")

## Prev column contains the previous attr of the same object
"""
Transformed Table
+----+---+----+
|attr|obj|prev|
+----+---+----+
|   1|  a|null|
|   2|  a|   1|
|   3|  a|   2|
|   1|  b|null|
class Unittest_HWM_Allocation_SortedOrder1(unittest.TestCase): def setUp(self): warnings.simplefilter("ignore", ResourceWarning) fpath = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) with open(fpath + '/data_source/bookings_overall.json') as bookings_source: self.bookings = json.load(bookings_source) with open(fpath + '/data_source/cfg.json') as cfg_source: self.cfg = json.load(cfg_source) today = '20180402' self.days = optimizer.util.get_days_from_bookings(today, self.bookings) self.sc = SparkContext.getOrCreate() self.hive_context = HiveContext(self.sc) self.schema = optimizer.util.get_common_pyspark_schema() def compare_two_dfs(self, pandas_df_expected, df_to_test_rows): df = self.hive_context.createDataFrame(df_to_test_rows, self.schema) df_allocated = optimizer.algo.hwm.hwm_allocation( df, self.bookings, self.days) pandas_df_allocated = df_allocated.select("*").toPandas() print(pandas_df_expected) print(pandas_df_allocated) return self.assertTrue( assert_frame_equal(pandas_df_expected, pandas_df_allocated, check_dtype=False) == None) def test_hwm_allocation_case1(self): pandas_df_expected = pandas.DataFrame( columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = [ '20180405', ['b8'], ['b6'], 3239, { 'b8': 88 } ] df_to_test_rows = [(['20180405', ['b8'], ['b6'], {}, 3239])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case2(self): pandas_df_expected = pandas.DataFrame( columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = [ '20180405', ['b11', 'b12'], [], 8900, { 'b11': 11, 'b12': 12 } ] df_to_test_rows = [(['20180405', ['b11', 'b12'], [], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case3(self): pandas_df_expected = pandas.DataFrame( columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = [ '20180405', ['b6', 'b7', 'b10'], [], 8900, { 'b6': 66, 'b7': 77, 'b10': 100 } ] df_to_test_rows = [(['20180405', ['b6', 'b7', 'b10'], [], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case4(self): pandas_df_expected = pandas.DataFrame( columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = [ '20180405', ['b8'], ['b6', 'b7', 'b9'], 3239, { 'b8': 88 } ] pandas_df_expected.loc[1] = [ '20180405', ['b6', 'b7'], ['b8', 'b9'], 8900, { 'b6': 66, 'b7': 77 } ] df_to_test_rows = [(['20180405', ['b8'], ['b6', 'b7', 'b9'], {}, 3239]), (['20180405', ['b6', 'b7'], ['b8', 'b9'], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case5(self): pandas_df_expected = pandas.DataFrame( columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = [ '20180405', ['b6', 'b7', 'b10', 'b11', 'b12'], ['b8', 'b9'], 8900, { 'b6': 66, 'b7': 77, 'b10': 100, 'b11': 11, 'b12': 12 } ] df_to_test_rows = [([ '20180405', ['b6', 'b7', 'b10', 'b11', 'b12'], ['b8', 'b9'], {}, 8900 ])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case6(self): pandas_df_expected = pandas.DataFrame( columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = [ '20180405', ['b13', 'b12'], [], 8900, { 'b13': 8900 } ] df_to_test_rows = [(['20180405', ['b13', 'b12'], [], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case7(self): pandas_df_expected = pandas.DataFrame( columns=['day', 
'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = [ '20180405', ['b15', 'b14'], [], 8900, { 'b15': 8900 } ] df_to_test_rows = [(['20180405', ['b15', 'b14'], [], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case8(self): pandas_df_expected = pandas.DataFrame( columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = [ '20180405', ['b17', 'b16'], [], 8900, { 'b17': 4450, 'b16': 4450 } ] df_to_test_rows = [(['20180405', ['b17', 'b16'], [], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case9(self): pandas_df_expected = pandas.DataFrame( columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = [ '20180405', ['b18', 'b17'], [], 8900, { 'b18': 4451, 'b17': 4449 } ] df_to_test_rows = [(['20180405', ['b18', 'b17'], [], {}, 8900])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows) def test_hwm_allocation_case10(self): pandas_df_expected = pandas.DataFrame( columns=['day', 'ands', 'minus', 'amount', 'allocated']) pandas_df_expected.loc[0] = [ '20180405', ['b6', 'b7', 'b10', 'b12', 'b16', 'b17', 'b18'], ['b8', 'b9'], 8900, { 'b18': 4451, 'b17': 4449 } ] # b6, b7, b10, b12, b16, b17, b18 have the same attributes. df_to_test_rows = [([ '20180405', ['b6', 'b7', 'b10', 'b12', 'b16', 'b17', 'b18'], ['b8', 'b9'], {}, 8900 ])] return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)
encounters = parts.map(lambda p: (p[0], datetime.strptime(p[1], "%Y-%m-%d %H:%M:%S"), p[2], p[3].strip().replace(',','_'))) fields = [StructField("PATIENT_NUM", StringType(), True), StructField("START_DATE", DateType(), True), StructField("ENCOUNTER_NUM", StringType(), True), StructField("ICD9S", StringType(), True)] schema_encounters = StructType(fields) # fields = [StructField("PATIENT_NUM", StringType(), True), # StructField("ENCOUNTER_NUM", StringType(), True), # StructField("START_DATE", StringType(), True), # Seq(StructField("ICD9S", ArrayType(StringType(), True), True) ] # schema_encounters = StructType(fields) # Apply the schema to the RDD. schemaEncounters = sqlContext.createDataFrame(encounters, schema_encounters) schemaEncounters.printSchema() schemaEncounters.registerTempTable("encounters") # order data by patient, start date, _then_ encounter encounteres_ordered = sqlContext.sql("select PATIENT_NUM, START_DATE, ENCOUNTER_NUM, ICD9S from encounters order by PATIENT_NUM, START_DATE, ENCOUNTER_NUM") encounteres_ordered.registerTempTable("encounteres_ordered") #sqlContext.sql("select collect_list(ICD9S) as icd9s from encounteres_ordered group by PATIENT_NUM").show(5) #sqlContext.sql("select PATIENT_NUM, collect_list(ICD9S) as icd9s from encounteres_ordered group by PATIENT_NUM").show(20, truncate=False) rdd = sqlContext.sql("select collect_list(ICD9S) as icd9s from encounteres_ordered group by PATIENT_NUM").rdd def splitter(p):