def main(sc):
	spark = HiveContext(sc)
	sqlContext = HiveContext(sc)
	#busiestcity
	rows = sc.textFile('../lmf445/Flight_Project/Data/864625436_T_ONTIME_2*.csv').mapPartitionsWithIndex(parseCSV3)
	df = sqlContext.createDataFrame(rows)
	flight_origin = df.select('YEAR', 'MONTH', 'ORIGIN_CITY_NAME', 'ORIGIN_AIRPORT_ID').groupBy('YEAR', 'MONTH', 'ORIGIN_CITY_NAME').count().withColumnRenamed('count', 'origin_count')
	flight_origin = flight_origin.withColumnRenamed('ORIGIN_CITY_NAME', 'City_of_Departure')

	flight_dest = df.select('YEAR', 'MONTH', 'DEST_CITY_NAME', 'DEST_AIRPORT_ID').groupBy('YEAR', 'MONTH', 'DEST_CITY_NAME').count().withColumnRenamed('count', 'dest_count')
	flight_dest = flight_dest.withColumnRenamed('DEST_CITY_NAME', 'City_of_Arrival')
	flight_dest = flight_dest.withColumnRenamed('YEAR', 'YEAR_dest')
	flight_dest = flight_dest.withColumnRenamed('MONTH', 'MONTH_dest')
	total_counts = flight_origin.join(flight_dest,((flight_origin.City_of_Departure == flight_dest.City_of_Arrival) & (flight_origin.YEAR == flight_dest.YEAR_dest) & (flight_origin.MONTH == flight_dest.MONTH_dest)))
	total_counts = total_counts.select(total_counts.City_of_Departure.alias('City'),(total_counts.origin_count + total_counts.dest_count).alias('sum_counts'), 'YEAR', 'MONTH')
	total_counts_city_pivot = total_counts.groupBy('City').pivot('MONTH').sum('sum_counts')
	total_counts_city_pivot.toPandas().to_csv('Output/busiest_city.csv')
	# grouped by day
	rows = sc.textFile('../lmf445/Flight_Project/Data/864625436_T_ONTIME_2*.csv').mapPartitionsWithIndex(parseCSV4)
	df = sqlContext.createDataFrame(rows)
	grouped_by_day = df.groupBy('FL_DATE').count()
	grouped_by_day.toPandas().to_csv('Output/grouped_by_day.csv')
	#mostcommondeparturetime
	rows = sc.textFile('../lmf445/Flight_Project/Data/864625436_T_ONTIME_*.csv').mapPartitionsWithIndex(parseCSV5)
	df = sqlContext.createDataFrame(rows)
	departure_time_pivot = df.groupBy('DEP_TIME_BLK').pivot('MONTH').count()
	departure_time_pivot.toPandas().to_csv('Output/most_common_departure_time.csv')
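The parseCSV3 helper used above (and its parseCSV4/parseCSV5 siblings) is not part of the snippet; a minimal sketch of what such a mapPartitionsWithIndex parser could look like, assuming hypothetical column positions in the on-time CSV and skipping the header row of the first partition:

import csv
from pyspark.sql import Row

def parseCSV3(index, part):
    # Hypothetical sketch: the real column positions depend on the source CSV.
    reader = csv.reader(part)
    for line_no, fields in enumerate(reader):
        if index == 0 and line_no == 0:
            continue  # skip the header line of the first partition
        yield Row(YEAR=fields[0], MONTH=fields[1],
                  ORIGIN_AIRPORT_ID=fields[2], ORIGIN_CITY_NAME=fields[3],
                  DEST_AIRPORT_ID=fields[4], DEST_CITY_NAME=fields[5])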
Example #2
def main(sc, SQLContext):

    sqlContext = HiveContext(sc)

    sc.setLogLevel("ERROR")

    query = "USE {0}"

    query = query.format(db_name)

    sqlContext.sql(query)

    tables = sqlContext.sql("SHOW TABLES")

    tableNames = tables.select("tableName").rdd.map(lambda r: r)

    tableNames = tableNames.map(lambda x: x.tableName).collect()

    tableNames = [str(i) for i in tableNames]

    schema_empty_df = StructType([StructField("table_ddl",StringType(),True)])

    empty_df = sqlContext.createDataFrame(sc.emptyRDD(), schema_empty_df)

    df1 = empty_df

    for i in tableNames:
        show_query = "show create table "+i
        drop_query = "drop table "+i+";\n"
        describe_query = "describe formatted "+i
        seperator = ";\n"
        try:
            rdd = sc.parallelize([drop_query])
            newRDD = rdd.map(lambda x:{"table_ddl":x})
            newDF = rdd.map(lambda p: Row(table_ddl=p)).toDF()
            df = df1.unionAll(newDF)
            desc = sqlContext.sql(describe_query)
            desc_1 = desc.select(['data_type']).where("col_name='Location'")
            desc_2 = desc_1.rdd.map(lambda x:x.data_type).collect()
            desc_3 = [str(i) for i in desc_2]
            desc_4 = ''.join(desc_3)
            df0 = sqlContext.sql(show_query)
            show_1 = df0.rdd.map(lambda x:x.createtab_stmt).collect()
            show_2 = [str(i) for i in show_1]
            show_3 = ''.join(show_2)
            if show_3.find("LOCATION '") < 0:
                loc_query = "LOCATION '"+desc_4+"'"+"\n TBLPROPERTIES ("
                final_create_table=show_3.replace("TBLPROPERTIES (", loc_query)
            else:
                final_create_table = show_3
            list_final = [final_create_table]
            rdd_create_table = sc.parallelize(list_final)
            df_create_table = rdd_create_table.map(lambda p: Row(create_table_ddl=p)).toDF()
            df1 = df.unionAll(df_create_table)
            rdd1 = sc.parallelize([seperator])
            newRDD1 = rdd1.map(lambda x:{"delim":x})
            newDF1 = sqlContext.createDataFrame(newRDD1, ["delim"])
            df1 = df1.unionAll(newDF1)
        except Exception:
            pass
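The loop body above is cut off at the except clause; a plausible (assumed) continuation after the loop is to dump the accumulated DDL rows to HDFS as plain text:

    # Hypothetical continuation after the loop: the output path is an assumption.
    df1.rdd.map(lambda r: r[0]).coalesce(1).saveAsTextFile("/tmp/" + db_name + "_ddl")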
Example #3
class Testing_Resources_Generation_2(unittest.TestCase):
    def setUp(self):
        fpath = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
        with open(fpath + '/data_source/bookings_fully_overlapped.json'
                  ) as bookings_source:
            self.bookings = json.load(bookings_source)
        with open(fpath + '/data_source/cfg.json') as cfg_source:
            self.cfg = json.load(cfg_source)
        today = '20180402'
        self.days = optimizer.util.get_days_from_bookings(today, self.bookings)
        self.sc = SparkContext.getOrCreate()
        self.hive_context = HiveContext(self.sc)
        self.schema = optimizer.util.get_common_pyspark_schema()
        self.bookings_map = optimizer.util.get_bookings_map(self.bookings)

    def test_resources_count(self):
        df = self.hive_context.createDataFrame(self.sc.emptyRDD(), self.schema)
        df = optimizer.main.generate_resources(self.cfg, df, self.bookings_map,
                                               self.days, self.bookings,
                                               self.hive_context)
        self.assertTrue(df.count() == 7)

    def test_resource_1(self):
        df = self.hive_context.createDataFrame(self.sc.emptyRDD(), self.schema)
        df = optimizer.main.generate_resources(self.cfg, df, self.bookings_map,
                                               self.days, self.bookings,
                                               self.hive_context)

        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'allocated', 'amount'])
        pandas_df_expected.loc[0] = [
            '20180402', ['b1', 'b3', 'b2'], [], {}, 733
        ]
        pandas_df_expected.loc[1] = [
            '20180402', ['b1', 'b3'], ['b2'], {}, 11181
        ]
        pandas_df_expected.loc[2] = [
            '20180402', ['b1', 'b2'], ['b3'], {}, 3575
        ]
        pandas_df_expected.loc[3] = [
            '20180402', ['b1'], ['b3', 'b2'], {}, 6047
        ]
        pandas_df_expected.loc[4] = [
            '20180402', ['b3', 'b2'], ['b1'], {}, 1002
        ]
        pandas_df_expected.loc[5] = [
            '20180402', ['b3'], ['b1', 'b2'], {}, 12241
        ]
        pandas_df_expected.loc[6] = [
            '20180402', ['b2'], ['b1', 'b3'], {}, 1410
        ]
        pandas_df_generated = df.select("*").toPandas()

        self.assertTrue(
            assert_frame_equal(pandas_df_expected,
                               pandas_df_generated,
                               check_dtype=False) is None)
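These test cases would normally be driven by the standard unittest runner; nothing project-specific is assumed in this entry point:

# Standard unittest entry point for this test module.
if __name__ == "__main__":
    unittest.main()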
Example #4
 def gen_report_table(hc,curUnixDay):
     rows_indoor=sc.textFile("/data/indoor/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4]),utoday=int(p[5]),ufirstday=int(p[6])))
     HiveContext.createDataFrame(hc,rows_indoor).registerTempTable("df_indoor")
     #ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY 
     sql="select entityid,clientmac,utoday,UFIRSTDAY,seconds,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_indoor order by entityid,clientmac,utoday" 
     df_id_stat=hc.sql(sql)
     df_id_mm=df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     # df_id_mm holds the per-client min/max, used to calculate first and last arrival
     df_id_stat_distinct=df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     #distinct df is for lag function to work
     df_id_prepremon=df_id_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac, df_id_mm.entityid == df_id_prepremon.entityid, df_id_mm.UFIRSTDAY==df_id_prepremon.UFIRSTDAY]
     df_indoor_fin_tmp=df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(df_id_mm.entityid,df_id_mm.clientmac,df_id_mm.utoday,df_id_mm.UFIRSTDAY,df_id_mm.seconds,df_id_mm.day_30,df_id_mm.day_7,df_id_mm.min,df_id_mm.max,df_id_mm.total_cnt,df_id_prepremon.prepre_mon)
     df_indoor_fin_tmp=df_indoor_fin_tmp.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","seconds as secondsbyday","day_30 as indoors30","day_7 as indoors7","min as FirstIndoor","max as LastIndoor","total_cnt as indoors","prepre_mon as indoorsPrevMonth")
     
     #newly added part for indoors7 and indoors30 based on current date
     df_indoor_fin_tmp1= df_indoor_fin_tmp.withColumn("r_day_7", func.when((curUnixDay- df_indoor_fin_tmp.utoday)/86400<7 , 1).otherwise(0))
     df_indoor_fin_tmp2=df_indoor_fin_tmp1.withColumn("r_day_30", func.when((curUnixDay- df_indoor_fin_tmp1.utoday)/86400<30 , 1).otherwise(0))
     df_indoor_fin_tmp3=df_indoor_fin_tmp2.withColumn("r_indoors7",func.sum("r_day_7").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin_tmp4=df_indoor_fin_tmp3.withColumn("r_indoors30",func.sum("r_day_30").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin=df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30")
     hc.sql("drop table if exists df_indoor_fin")
     df_indoor_fin.write.saveAsTable("df_indoor_fin")
     
     rows_flow=sc.textFile("/data/flow/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),utoday=int(p[4]),ufirstday=int(p[5])))
     HiveContext.createDataFrame(hc,rows_flow).registerTempTable("df_flow")
     
     # ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday
     sql="select entityid,clientmac,utoday,UFIRSTDAY,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_flow order by entityid,clientmac,utoday" 
     df_fl_stat=hc.sql(sql)
     df_fl_mm=df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     # df_fl_mm holds the per-client min/max, used to calculate first and last visit
     df_fl_stat_distinct=df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     #distinct df is for lag function to work
     df_fl_prepremon=df_fl_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac, df_fl_mm.entityid == df_fl_prepremon.entityid, df_fl_mm.UFIRSTDAY==df_fl_prepremon.UFIRSTDAY]
     df_flow_fin=df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(df_fl_mm.entityid,df_fl_mm.clientmac,df_fl_mm.utoday,df_fl_mm.UFIRSTDAY,df_fl_mm.day_30,df_fl_mm.day_7,df_fl_mm.min,df_fl_mm.max,df_fl_mm.total_cnt,df_fl_prepremon.prepre_mon)
     df_flow_fin=df_flow_fin.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","day_30 as visits30","day_7 as visits7","min as FirstVisit","max as LastVisit","total_cnt as visits","prepre_mon as visitsPrevMonth")
     hc.sql("drop table if exists df_flow_fin")
     df_flow_fin.write.saveAsTable("df_flow_fin") 
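gen_report_table reads the global sc when loading /data/indoor and /data/flow, and expects a HiveContext plus the current day as a unix timestamp; a minimal driver sketch under those assumptions:

# Hypothetical driver for gen_report_table (Python 2 integer division).
import time
from pyspark import SparkContext
from pyspark.sql import HiveContext

sc = SparkContext(appName="indoor_flow_report")
hc = HiveContext(sc)
curUnixDay = int(time.time()) / 86400 * 86400  # today truncated to midnight UTC
gen_report_table(hc, curUnixDay)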
Example #5
def main(sc):

    logger = getlogger('carga.serverlogs')
    spark = SparkSession(sc)
    hqlContext = HiveContext(sc)

    logger.debug('Process start')

    # Read the text file and create an in-memory RDD with Spark
    line = sc.textFile("/home/cloudera/Projeto6/serverlogs")

    dataset = line.map(lambda line: get_row(line))

    # convert to a pandas dataframe
    serverlogs_df = dataset.toDF()
    serverlogs_pd = serverlogs_df.toPandas()

    # convert the bytes column to a numeric type
    serverlogs_pd['bytes'] = pd.to_numeric(serverlogs_pd['bytes'],
                                           errors='coerce')

    # create the date column
    serverlogs_pd['data'] = pd.to_datetime(serverlogs_pd.timestamp.str[:11])

    # create a Hive dataframe
    serverlogs_hdf = hqlContext.createDataFrame(serverlogs_pd)

    serverlogs_hdf.registerTempTable('serverlogs_tmp')

    executar_insert_tabela('default.t_serverlogs', 'serverlogs_tmp', logger,
                           hqlContext)

    logger.debug('All pipeline tasks executed successfully!')
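The get_row helper is not shown; judging from how the bytes and timestamp fields are used afterwards, the input looks like an Apache access log, so a hypothetical parser could be:

# Hypothetical get_row for an Apache/common-log line; the regex and field
# names are assumptions based on how 'bytes' and 'timestamp' are used above.
import re
from pyspark.sql import Row

LOG_PATTERN = re.compile(r'^(\S+) \S+ \S+ \[([^\]]+)\] "([^"]*)" (\d{3}) (\S+)')

def get_row(line):
    m = LOG_PATTERN.match(line)
    if m is None:
        return Row(host='', timestamp='', request='', status='', bytes='')
    return Row(host=m.group(1), timestamp=m.group(2), request=m.group(3),
               status=m.group(4), bytes=m.group(5))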
def sentiment_score(sc):

    #spark = HiveContext(sc)
    sqlContext = HiveContext(sc)
    df = sqlContext.read.json("/user/skk456/project/twitterData.json")
    text_list = df.select(df['text'], df['id'])
    text = df.select("text").rdd.flatMap(lambda x: x).collect()
    tweet_id = df.select("id").rdd.flatMap(lambda x: x).collect()
    tweets = map(lambda tweet: tokenize.sent_tokenize(tweet), text)
    sia = SentimentIntensityAnalyzer()
    score = map(lambda tweet: sia.polarity_scores(str(tweet)), tweets)
    comp = map(lambda i: i['compound'], score)
    pos = map(lambda i: i['pos'], score)
    neu = map(lambda i: i['neu'], score)
    neg = map(lambda i: i['neg'], score)

    score = pd.DataFrame()
    score['id'] = tweet_id
    score['positive'] = pos
    score['negative'] = neg
    score['neutral'] = neu
    score['compound'] = comp

    score_spark = sqlContext.createDataFrame(score)
    score_spark.rdd.saveAsTextFile('project/output1')
Example #7
        def setup():
            ''' table setup '''
            if not os.environ.has_key('SPARK_HOME'):
                raise Exception(
                    "Environment variable SPARK_HOME must be set " +
                    "to the root directory of the SPARK installation")
            spark_home_py = os.path.expandvars("$SPARK_HOME/python")
            sys.path.append(spark_home_py)
            file_list = glob.glob(spark_home_py + "/lib/py4j*.zip")
            if not file_list:
                raise Exception(
                    "p4j*.zip not found - this needs to be on the PYTHONPATH")
            sys.path.append(file_list[0])
            try:
                from pyspark import SparkContext, SparkConf
                from pyspark.sql import HiveContext
            except ImportError:
                raise Exception("Required pyspark modules cannot be found")

            # Configure Spark
            conf = SparkConf().setAppName('SQLTODF_UT')
            conf = conf.setMaster(cfg.SPARK_MODE)
            sparkctx = SparkContext(conf=conf)

            pandasdf = pd.DataFrame({
                'name': ['Martin', 'Gemma'],
                'age': [16, 52]
            })
            sqlctx = HiveContext(sparkctx)
            sqldf = sqlctx.createDataFrame(pandasdf)
            sqldf.write.format('orc').mode('overwrite').saveAsTable(
                'sqltodf_test')
            sparkctx.stop()
Example #8
def do_ets_task(sc, ets_dburl_env, wfc):
    # define the customer identifier
    cust_no = '1'
    isvalid = '1'
    etsTempTable = wfc
    ets_url = ets_dburl_env[wfc[:-2]]['dst']
    slave_url = ets_dburl_env[wfc[:-2]]['src']
    dbinfo = load_source('getdbinfo', os.path.join(os.path.dirname(__file__), 'Utils.py')).getdbinfo(slave_url)
    tabledict = load_source('query_sql_slave', os.path.join(os.path.dirname(__file__), 'Utils.py')).query_sql_slave(dbinfo)
    slaveTempTable = tabledict.get(wfc[:-2])
    driver = "com.mysql.jdbc.Driver"
    sqlContext = HiveContext(sc)
    # driver = "com.mysql.jdbc.Driver"
    dff = sqlContext.read.format("jdbc").options(url=slave_url, dbtable=slaveTempTable, driver=driver).load()
    dff.registerTempTable(slaveTempTable)
    dft = sqlContext.read.format("jdbc").options(url=ets_url, dbtable=etsTempTable, driver=driver).load()
    dft.registerTempTable(etsTempTable)
    ds_ets = sqlContext.sql(" select max(updatets) as max from %s " % (etsTempTable))
    pp = ds_ets.collect()[0]
    max_updates = pp.max
    slave_sql = ''
    try:
        if max_updates is not None:
            print(u"ets库中的最大时间是:" + str(max_updates))
            slave_sql = " select id, examineeid, examid, roomid, stationid, examinerid, totalscore, begintime ,endtime,scoresheetcode,status, updatetime" \
                        "  from %s where `updatetime` > '%s' " % (slaveTempTable, max_updates)
        else:
            print(u"本次为初次抽取")
            slave_sql = " select id, examineeid, examid, roomid, stationid, examinerid, totalscore, begintime ,endtime,scoresheetcode,status, updatetime" \
                        " from %s " % (slaveTempTable)
        ds_slave = sqlContext.sql(slave_sql)
        print(u'Number of matching records in slave: %s' % (ds_slave.count()))
        now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(u'Assembling data...')
        src_fields = json.dumps({'osce_score': ['id', 'examineeid', 'examid', 'roomid', 'stationid', 'examinerid',
                                         'totalscore', 'begintime', 'endtime', 'scoresheetcode', 'status', 'updatetime']})
        # field values
        filedvlue = ds_slave.map(lambda row: (row.id, row.examineeid, row.examid, row.roomid, row.stationid,
                                              row.examinerid, row.totalscore, str(row.begintime), str(row.endtime),
                                              row.scoresheetcode, row.status, cust_no, isvalid,
                                              md5(row), now_time, str(row.updatetime)))
        # build the schema columns
        schemaString = "id,examineeid,examid,roomid,stationid,examinerid,totalscore,begintime," \
                       "endtime,scoresheetcode,status,cust_no,isvalid,src_fields_md5,createts,updatets"
        fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split(",")]
        schema = StructType(fields)
        # create the DataFrame from the column names and field values
        schemaObj = sqlContext.createDataFrame(filedvlue, schema)
        print(u'Data assembly complete...')
        # print schemaPeople
        # for row in schemaPeople:
        #     print row.id
        print(u'Writing data...')
        # write to the database
        schemaObj.write.insertInto(etsTempTable, overwrite=False)
        print(u'Write complete')
    except Exception, e:
        # e.message is not supported in Python 2.6
        print (str(e))
        raise Exception(str(e))
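The md5(row) call that fills src_fields_md5 is not defined in this snippet; a plausible sketch (an assumption about the original) hashes the row's field values:

# Hypothetical md5 helper: join the row's values and hash them (Python 2).
import hashlib

def md5(row):
    joined = u'|'.join([unicode(v) for v in row])
    return hashlib.md5(joined.encode('utf-8')).hexdigest()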
def main(sc):
    spark = HiveContext(sc)
    sqlContext = HiveContext(sc)
    print "holaaaaa"
    rows = sc.textFile(
        '../lmf445/Flight_Project/Data/864625436_T_ONTIME_2*.csv'
    ).mapPartitionsWithIndex(parseCSV)
    df = sqlContext.createDataFrame(rows)

    df = df.withColumn('DEP_DEL15', df['DEP_DEL15'].cast('int'))
    df = df.na.drop()

    delay_counts = df.select(
        'ORIGIN', 'DEP_DEL15').groupby('ORIGIN').sum().withColumnRenamed(
            'sum(DEP_DEL15)', 'origin_delay_count')
    delay_counts = delay_counts.join(
        flight_origin, delay_counts.ORIGIN == flight_origin.Airport_origin)
    delays_origin = delay_counts.select(
        'Airport_origin',
        (delay_counts.origin_delay_count / delay_counts.origin_count
         ).alias('%_flights_departing_15+_minutes_late'))

    delays_origin = delays_origin.sort(
        desc('%_flights_departing_15+_minutes_late'))

    delays_origin.toPandas().to_csv('Output/MostDepartureDelays.csv')
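flight_origin is joined against above but never built in this snippet; a plausible reconstruction, mirroring the per-airport counting used in the busiest-airport example later on, would be:

    # Hypothetical reconstruction of the missing flight_origin frame:
    # departure counts per origin airport, renamed to match the join above.
    flight_origin = df.select('ORIGIN').groupby('ORIGIN').count() \
        .withColumnRenamed('count', 'origin_count') \
        .withColumnRenamed('ORIGIN', 'Airport_origin')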
Example #10
def process_rdd(time, rdd):
    print("----------- %s -----------" % str(time))
    try:
        sql_context = HiveContext(rdd.context)
        # Convert the RDD to Row RDD
        row_rdd = rdd.map(lambda w: Row(tweet=w, score=analyzeSentiment(w)))
        schema = StructType([
            StructField("tweet", StringType(), True),
            StructField("score", FloatType(), True)
        ])
        # Create a DF with the specified schema
        new_tweets_df = sql_context.createDataFrame(row_rdd, schema=schema)
        # Register the dataframe as table
        new_tweets_df.registerTempTable("new_tweets")
        # Insert new tweets,scores into table tweets
        sql_context.sql("INSERT INTO TABLE tweets SELECT * FROM new_tweets")
        # Get all the tweets from the table using SQL
        tweets_sentiment_df = sql_context.sql("SELECT * FROM tweets")
        tweets_sentiment_df.show()

        # Sends the tweets and their sentiment score to the dashboard
        send_df_to_dashboard(tweets_sentiment_df)
    except:
        e = sys.exc_info()[0]
        print("Error: %s" % e)
def write_to_hive(time, rdd):
    def process_row(x):
        row_dict = dict()
        row_dict["timestamp"] = 0 if "timestamp" not in x else x["timestamp"]
        row_dict["source_type"] = "" if "source.type" not in x else x["source.type"]
        row_dict["user_name"] = "" if "src_user_name" not in x else x["src_user_name"]
        row_dict["entity_name"] = "" if "ip_src_addr" not in x else x["ip_src_addr"]
        row_dict["guid"] = "" if "guid" not in x else x["guid"]
        row_dict["alert_score"] = 0.0 if "alert_score" not in x else x["alert_score"]
        row_dict["alerts"] = "" if "alerts" not in x else x["alerts"]
        row_dict["y"] = 0 if "y" not in x else x["y"]
        row_dict["m"] = None if "m" not in x else x["m"]
        row_dict["d"] = None if "d" not in x else x["d"]
        for numerical_colname in EVENT_MODEL_NUMERICAL_COLUMNS:
            row_dict[numerical_colname] = 0.0 if numerical_colname not in x else float(x[numerical_colname])
        for categorical_colname in EVENT_MODEL_CATEGORICAL_COLUMNS:
            row_dict[categorical_colname] = "" if categorical_colname not in x else str(x[categorical_colname])

        row = Row(**row_dict)

        return row

    try:
        spark = SparkSession \
            .builder \
            .appName("event-anomaly-online-score") \
            .enableHiveSupport() \
            .getOrCreate()
        hive_context = HiveContext(spark.sparkContext)
        hive_context.setConf("hive.exec.dynamic.partition", "true")
        hive_context.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
        row_rdd = rdd.map(process_row)
        sdf = hive_context.createDataFrame(row_rdd)
        sdf = sdf.drop_duplicates(subset=["guid"])
        sdf.cache()
        source_type_list = [TENANT_NAME + "_" + data_source for data_source in DATA_SOURCE_LIST]
        model_dict = dict()
        for data_source in DATA_SOURCE_LIST:
            model_dict[data_source] = load_event_anomaly_model(spark=spark, data_source=data_source)

        for source_type in source_type_list:
            sdf_source = sdf.filter(sdf.source_type == source_type)
            if not sdf_source.rdd.isEmpty():
                sdf_source.cache()
                database = source_type.split("_")[0]
                data_source = source_type.split("_")[1]
                table = data_source + "_event_alert_score"
                sdf_source.show(3)
                eas_sdf = get_event_anomaly_score(data_source=data_source, model_dict=model_dict,
                                                  input_df=sdf_source)
                result_sdf = sdf_source.join(eas_sdf.select(["guid", "EAS"]), on="guid", how="left")
                result_sdf = result_sdf.na.fill(0.0, subset=["EAS"])
                result_sdf.show(3)
                result_sdf.select("guid", "timestamp", "user_name", "entity_name", "source_type", "alerts",
                                  "alert_score",
                                  "EAS", "y", "m", "d").write.insertInto(database + "." + table)

    except Exception as e:
        pass
Example #12
def hiveSaveNews(dfNewsContents, table_name):
    from pyspark.sql import HiveContext

    hiveContext = HiveContext(sc)
    tmpDf = hiveContext.createDataFrame(
        dfNewsContents[['news_code', 'title', 'site', 'writing_time', 'preproc_content', 'img', 'content', 'company']])
    tmpDf.registerTempTable("tmpDf")
    hiveContext.sql("insert into table {table_name} select * from tmpDf".format(table_name=table_name))
    def process(time, rdd):
        print("========= %s =========" % str(time))

        try:
            sqlContext = HiveContext(sc)
            # FIX: memory error Spark 2.0 bug ( < 2.0 )
            sqlContext.setConf("spark.sql.tungsten.enabled","false")

            # v2.01 spark = SparkSession.builder \
            #.master("local") \
            #.appName("Word Count") \
            #.config("spark.some.config.option", "some-value") \
            #.getOrCreate()
            # Get the singleton instance of SparkSession
            #nzs v1.0 spark = getSparkSessionInstance(rdd.context.getConf())

            if rdd.count() < 1:
                return;

            # Convert RDD[String] to RDD[Row] to DataFrame
            sqlRdd = rdd.map( lambda x: json.loads(x)).map(lambda r: Row( metrics=r["metrics"], name=r["name"], value=r["value"] ) )
            wordsDataFrame = sqlContext.createDataFrame(sqlRdd)
            wordsDataFrame.show()
            # Creates a temporary view using the DataFrame.			
            wordsDataFrame.registerTempTable("starwarstemp")
            # Create a query and get the alarm dataset using the temp table
            wordCountsDataFrame = sqlContext.sql("select * from  starwarstemp")
            wordCountsDataFrame.printSchema()


            with open(SparkFiles.get('webinar_streaming.sql')) as test_file:
                alertsql=test_file.read()
                #logging.info(alertsql)

            alertDataFrame = sqlContext.sql(alertsql)			
            alertDataFrame.show()
            alertDataFrame.printSchema()			

            # save all values to HBASE 
            # IF NEED FILTER LATER .filter(lambda x: str(x["metrics"])=='action-credit-limit') \
            # create HBASE mapper 
            rowRdd = rdd.map( lambda x: json.loads(x))\
                .map(lambda r: ( str(r["metrics"]) ,[ str(r["name"])+"-"+datetime.datetime.now().strftime("%Y%m%d%H%M%S"), "action" if str(r["metrics"])=="action-credit-limit" else  "healt", str(r["metrics"]), str(r["value"])] ))
            
            table = 'starwarsinbox'
            host = 'node-master2-KcVkz'
            keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
            valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
            conf = {"hbase.zookeeper.quorum": host,
            "hbase.mapred.outputtable": table,
            "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
            "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
            "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}
            rowRdd.saveAsNewAPIHadoopDataset(conf=conf,keyConverter=keyConv,valueConverter=valueConv)
        except Exception as merror:
            print (merror)
            raise
Example #14
def tear_down():
    for table in data.keys():
        hiveContext = HiveContext(sc)
        df = hiveContext.createDataFrame(data[table], fields[table])
        hiveContext.sql('use test_db')
        try:
            df.registerTempTable("demo")
            hiveContext.sql("insert into {table} partition(ds='{date}') select * from demo".format(table=table,date=date))
            # hiveContext.sql("insert into {table} partition(ds="")  select * from demo".format(table=table))
        except Exception as e:
            df.saveAsTable("{table}".format(table=table))
def main(sc):
    spark = HiveContext(sc)
    sqlContext = HiveContext(sc)
    print "holaaaaa"
    rows = sc.textFile('../lmf445/Flight_Project/Data/864625436_T_ONTIME_2*.csv').mapPartitionsWithIndex(parseCSV)
    df = sqlContext.createDataFrame(rows)
    busiest_route_month_pivot = \
        df.select('ORIGIN_AIRPORT_ID', 'ROUTE', 'MONTH') \
            .groupBy('ROUTE').pivot('MONTH').count()

    busiest_route_month_pivot.toPandas().to_csv('Output/MonthlyRoutes.csv')
Example #16
def do_ets_task(sc, ets_dburl_env, wfc):
    # define the customer identifier
    cust_no = '1'
    isvalid = '1'
    etsTempTable = wfc
    ets_url = ets_dburl_env[wfc[:-2]]['dst']
    slave_url = ets_dburl_env[wfc[:-2]]['src']
    dbinfo = load_source('getdbinfo', os.path.join(os.path.dirname(__file__), 'Utils.py')).getdbinfo(slave_url)
    tabledict = load_source('query_sql_slave', os.path.join(os.path.dirname(__file__), 'Utils.py')).query_sql_slave(dbinfo)
    slaveTempTable = tabledict.get(wfc[:-2])
    driver = "com.mysql.jdbc.Driver"
    sqlContext = HiveContext(sc)
    dff = sqlContext.read.format("jdbc").options(url=slave_url, dbtable=slaveTempTable, driver=driver).load()
    dff.registerTempTable(slaveTempTable)

    dft = sqlContext.read.format("jdbc").options(url=ets_url, dbtable=etsTempTable, driver=driver).load()
    dft.registerTempTable(etsTempTable)
    try:
        slave_sql = " select id, learn_id, learn_type, scoresheetcode " \
                    " from  %s  " % (slaveTempTable)
        ds_slave = sqlContext.sql(slave_sql)
        print(u"覆盖式插入:共%s条数据" % ds_slave.count())
        # sqlContext.sql(" delete from %s " % etsTempTable)
        ddlsql = " truncate table %s " % etsTempTable
        # delete the existing rows in the table via JDBC
        dbinfo = load_source('getdbinfo', os.path.join(os.path.dirname(__file__), 'Utils.py')).getdbinfo(ets_url)
        load_source('execute_sql_ets', os.path.join(os.path.dirname(__file__), 'Utils.py')).execute_sql_ets(ddlsql, dbinfo)
        now_time = datetime.datetime.now()
        print(u'Assembling data...')
        src_fields = json.dumps({'GradeItem': ['id', 'learn_id', 'learn_type', 'scoresheetcode']})
        # field values
        filedvlue = ds_slave.map(
            lambda row: (row.id, row.learn_id, row.learn_type, row.scoresheetcode, cust_no, isvalid,
                         md5(row), now_time, now_time))
        # build the schema columns
        schemaString = "id,learn_id,learn_type,scoresheetcode,cust_no,isvalid,src_fields_md5,createts,updatets"
        fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split(",")]
        schema = StructType(fields)
        # create the DataFrame from the column names and field values
        schemaPeople = sqlContext.createDataFrame(filedvlue, schema)
        print(u'Data assembly complete')
        # print schemaPeople
        # for row in schemaPeople:
        #     print row.id
        print(u'Writing data...')
        # write to the database
        schemaPeople.write.insertInto(etsTempTable)
        print(u'Write complete')
    except Exception, e:
        # e.message is not supported in Python 2.6
        print(traceback.print_exc())
        print(str(e))
        raise Exception(str(e))
Example #17
def select_func_6(signalDataDf,techList,sliceDict):

    numOfDailySignal = sliceDict.get('numOfDailySignal')

    conf = SparkConf().setAppName("spark_infer_schema")

    sc = SparkContext(conf=conf)

    hc = HiveContext(sc)

    # read the input data

    signalDataDf = signalDataDf.reset_index(drop=False)
    signalDataDf.rename(columns={'level_0':'TradingDay', 'level_1':'WindCode'},inplace=True)
    signalDataDf['TradingDay'] = signalDataDf['TradingDay'].astype('str')
    # print(signalDataDf)

    # convert to a Spark DataFrame
    signalDataSpDf = hc.createDataFrame(signalDataDf)
    # print('signalDataSpDf')
    # signalDataSpDf.printSchema()
    # print(signalDataSpDf)

    signalDataSpDf.registerTempTable("signalData")

    # rank within each trading day and keep the top signals
    sql="select TradingDay,WindCode,Mom from ( " + \
	    "select TradingDay,WindCode,Mom,row_number() OVER (PARTITION BY TradingDay ORDER BY Mom DESC) rank from signalData" + \
	    ") tmp where rank<=" + str(numOfDailySignal)
    allSelectStockSpDf = hc.sql(sql)
    # print('allSelectStockSpDf')
    # allSelectStockSpDf.printSchema()
    # allSelectStockSpDf.show()

    # convert back to a pandas DataFrame
    allSelectStockDf = allSelectStockSpDf.toPandas()
    allSelectStockDf['TradingDay'] = allSelectStockDf['TradingDay'].astype('datetime64')
    allSelectStockDf = allSelectStockDf.set_index([StockConst.TRADINGDAY,StockConst.INNERCODE])
    # print('allSelectStockDf')
    # print(allSelectStockDf)

    # allSelectStockDf = SelectUtil.getTopNAndInsertVolWeight(numOfDailySignal, allSelectStockDf)
    volWeight = 1/numOfDailySignal
    allSelectStockDf.insert(len(allSelectStockDf.columns), 'volWeight', volWeight)
    print(allSelectStockDf)

    return allSelectStockDf
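For illustration, select_func_6 expects a pandas frame indexed by (trading day, wind code) with a Mom column, plus a sliceDict carrying numOfDailySignal; the values below are made up:

# Hypothetical call: two trading days, three codes, keep the top 2 per day.
import pandas as pd

signalDataDf = pd.DataFrame(
    {'Mom': [0.5, 0.2, 0.9, 0.1, 0.4, 0.3]},
    index=pd.MultiIndex.from_product([['20180401', '20180402'],
                                      ['000001.SZ', '000002.SZ', '600000.SH']]))
selected = select_func_6(signalDataDf, techList=[], sliceDict={'numOfDailySignal': 2})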
def main():
    d = date.today()
    t = timedelta(days=50 * 365)
    start = d - t
    end = d + t
    total = []
    for k in daterange(start, end):
        total.append(k)
    print sys.getsizeof(total)
    conf = SparkConf()
    sc = SparkContext(conf=conf)
    hc = HiveContext(sc)
    rd = sc.parallelize(total).map(getCalendarDetails)
    df = hc.createDataFrame(rd)
    df.printSchema()
    print rd.take(10)
    df.saveAsTable("DIM_DATE")
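daterange and getCalendarDetails are referenced but not included; sketches of what they might look like for populating a DIM_DATE dimension (the Row field names are assumptions):

# Hypothetical helpers: iterate calendar days and expand each into a Row.
from datetime import timedelta
from pyspark.sql import Row

def daterange(start, end):
    for n in range((end - start).days + 1):
        yield start + timedelta(days=n)

def getCalendarDetails(d):
    return Row(date_key=d.strftime('%Y%m%d'), year=d.year, month=d.month,
               day=d.day, day_of_week=d.isoweekday(), quarter=(d.month - 1) // 3 + 1)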
Example #19
def main(sc):
    spark = HiveContext(sc)
    sqlContext = HiveContext(sc)
    rows = sc.textFile(
        '../lmf445/Flight_Project/Data/864625436_T_ONTIME_2*.csv'
    ).mapPartitionsWithIndex(parseCSV)
    df = sqlContext.createDataFrame(rows)

    df_unique = df.select(df.ROUTE.alias('ROUTE_UNIQUE'), 'DEST',
                          'ORIGIN').distinct()

    busyest_route_single = df.select('ROUTE').groupBy('ROUTE').count()
    busyest_route_single = busyest_route_single.join(
        df_unique, busyest_route_single.ROUTE == df_unique.ROUTE_UNIQUE)
    busyest_route_single = busyest_route_single.drop('ROUTE_UNIQUE')
    busyest_route_single = busyest_route_single.sort(desc('count'))

    busyest_route_single.show()

    busyest_route_single.toPandas().to_csv('Output/MostBussyRoute.csv')
Example #20
def main(sc):
    spark = HiveContext(sc)
    sqlContext = HiveContext(sc)
    rows = sc.textFile(
        '../lmf445/Flight_Project/Data/864625436_T_ONTIME_*.csv'
    ).mapPartitionsWithIndex(parseCSV)
    df = sqlContext.createDataFrame(rows)
    first = df.withColumn('ARR_DEL15', df['ARR_DEL15'].cast('int'))
    sec = first.na.drop()
    third = sec.select('ROUTE', 'ARR_DEL15', 'DEST', 'ORIGIN').filter(
        sec.ARR_DEL15 == 1).groupby('ROUTE').count().withColumnRenamed(
            'count', 'delay_count')
    fourth = third.sort(desc('delay_count'))
    fifth = sec.select('ROUTE', 'ARR_DEL15', 'DEST',
                       'ORIGIN').groupby('ROUTE').count()
    sixth = fifth.sort(desc('count'))
    eighth = fourth.join(sixth, 'ROUTE')
    ninth = eighth.sort(desc('count'))
    tenth = ninth.select('ROUTE', ninth['delay_count'] / ninth['count'])
    eleventh = tenth.sort(desc('(delay_count / count)'))
    twelveth = eleventh.join(eighth, 'ROUTE')
    thirteen = twelveth.sort(desc('(delay_count / count)'))
    thirteen.toPandas().to_csv('Output/route_delayone.csv')
Example #21
def main(sc,load_id):
	
	sqlContext = HiveContext(sc)
	dept_df = sqlContext.sql("select name,id,email from dept_table")
	dept_df.createOrReplaceTempView('dept_df')
	dept_df_pd = dept_df.toPandas()
	dept_df_pd_copy = dept_df.toPandas()
	print('Data Shape',dept_df_pd.shape[0])
	
	######## Method Call ########################
	def valid_email_algorithm(email):
		if len(email) > 9:
			if re.match(".+@[a-zA-Z0-9]+.[a-zA-Z0-9]$",email) !=None:
				return Y
			
		else:
			return N
	
	dept_df_pd['email_valid_flag'] = dept_df_pd.apply(lambda x:valid_email_algorithm(x['email'],dept_df_pd_copy),axis =1)
	dept_df_pd.describe()
	dept_spark_df = sqlContext.createDataFrame(dept_df_pd)  ## in case it needs to be converted back into a Spark data frame
	dept_spark_df.write.option("compression","zlib").mode("overwrite").format("orc").save('/apps/hive/warehouse/dept.db/dept_table')
def main(sc):
    spark = HiveContext(sc)
    sqlContext = HiveContext(sc)
    rows = sc.textFile(
        '../lmf445/Flight_Project/Data/864625436_T_ONTIME_2*.csv'
    ).mapPartitionsWithIndex(parseCSV)
    df = sqlContext.createDataFrame(rows)

    flight_origin = df.select('FL_DATE', 'ORIGIN', 'ORIGIN_AIRPORT_ID')\
     .groupBy('FL_DATE', 'ORIGIN') \
        .count() \
        .withColumnRenamed('count', 'origin_count')
    flight_origin = flight_origin.withColumnRenamed('ORIGIN', 'Airport_origin')

    flight_dest = df.select('FL_DATE', 'DEST', 'DEST_AIRPORT_ID') \
        .groupBy('FL_DATE', 'DEST') \
        .count() \
        .withColumnRenamed('count', 'dest_count')

    flight_dest = flight_dest.withColumnRenamed('DEST', 'Airport_dest')
    flight_dest = flight_dest.withColumnRenamed('FL_DATE', 'FL_DATE_dest')

    total_counts = \
        flight_origin.join(flight_dest,
        ((flight_origin.Airport_origin==flight_dest.Airport_dest) &
        (flight_origin.FL_DATE==flight_dest.FL_DATE_dest)))

    total_counts = \
        total_counts.select(total_counts.Airport_origin.alias('Airport'),
        (total_counts.origin_count+total_counts.dest_count).alias('sum_counts'),
        'FL_DATE')

    total_counts_airport_pivot = \
        total_counts.groupBy('Airport').pivot('FL_DATE').sum('sum_counts')

    total_counts_airport_pivot.toPandas().to_csv(
        'Output/busiest_airport_by_day.csv')
def main():
    args = parser_arguments()
    start_date = args.start_date[0]
    end_date = args.end_date[0]

    sc = SparkContext()
    hc = HiveContext(sc)


    select = """
        SELECT 
            * 
        FROM 
            cluster_metrics_prod_2.container_fact
        where 
            date between '{0}' and '{1}'
        """.format(start_date, end_date)

    df = hc.sql(select)

    header = {
            "jobid"         : "string",
            "containerid"   : "string", 
            "start"         : "bigint", 
            "stop"          : "bigint",
            "duration"      : "bigint",
            "event"         : "string", 
            "size"          : "double", 
            "priority"      : "int", 
            "hostname"      : "string", 
            "system"        : "string", 
            "date"          : "string"
            }
            

    all_rows = df.flatMap(split_data)
    schema_split_containers = hc.createDataFrame(all_rows)
    schema_split_containers.registerTempTable("split_containers")


    create_string = """
        create table if not exists cluster_metrics_prod_2.container_fact_event_flattened
            (
            jobid       string,
            containerid string,
            start       bigint,
            stop        bigint,
            duration    bigint,
            event       string,
            size        double,
            priority    int,
            hostname    string
            )
        partitioned by
            (
            system      string,
            date        string
            )
        stored as orc
    """

    set_dyn = "set hive.exec.dynamic.partition=true"
    set_nstat = "set hive.exec.dynamic.partition.mode=nonstrict"

    load_string = """
        insert overwrite table 
            cluster_metrics_prod_2.container_fact_event_flattened 
        partition
            (system, date) 
        select 
            jobid, 
            containerid, 
            start, 
            stop, 
            duration, 
            event, 
            size, 
            priority, 
            hostname, 
            system, 
            date 
        from 
            split_containers
    """

    print("Setting dynamic partition...")
    hc.sql(set_dyn)
    hc.sql(set_nstat)

    print("Creating Table...")
    hc.sql(create_string)
    print("Loading data into table...")
    hc.sql(load_string)
    print("DONE")
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName("sparkmoviekeywords")
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

inrdd = sc.textFile("/usr/azure/twitterdata/Tweets*")
inrddsplt = inrdd.map(lambda line: line.split("|"))
inrddspltflt = inrddsplt.filter(lambda spltln: spltln[5] == "None")
inrddspltfltwrds = inrddsplt.filter(lambda spltln: spltln[8] != "neutral")
inrdddf = inrddspltfltwrds.map(lambda spltln: (spltln[0],spltln[9].replace('_',' ').split(",")))

inrddfmapv = inrdddf.flatMapValues(lambda x: x)
keyworddf = inrddfmapv.map(lambda keyword: Row(movie=keyword[0],keywords=keyword[1]))

keywordds = sqlContext.createDataFrame(keyworddf)
keywordds.printSchema()

keyworddsout = keywordds.groupBy("movie", "keywords").count()
keyworddsout.printSchema()
keyworddsordered = keyworddsout.orderBy("movie","count",ascending=False)
keyworddsordered.show()

keyworddsordered.write.format("orc").option("path","/usr/azure/moviekeywords").mode("overwrite").saveAsTable("moviekeywords")
from pyspark import SparkContext
sc = SparkContext()

from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

tbl_Effective_Care = sqlContext.sql("Select Provider_ID, Measure_ID, Score From tbl_Effective_Care_RAW where Score <> 'Not Available'").rdd

tbl_Effective_Care.saveAsTextFile("/user/w205/hospital_compare_txt_TRANSFORMED/txt_effective_care/")
sqlContext.createDataFrame(tbl_Effective_Care).write.parquet("/user/w205/hospital_compare_tbl_TRANSFORMED/tbl_effective_care/")
exit()
Example #26
		logSet = sc.textFile("hdfs://hdfs.hadoop.yourdomain.com:9000/hdfs/aatlogstore/YYYY/MM/*").map(loadData)

		aatlog = logSet.map(getPairKey)

		#make vidsession_log & vidsession data
		aatlogSet = aatlog.reduceByKey(lambda x, y: x)
		vidlogSet = aatlogSet.sortByKey().zipWithIndex().map(lambda x: ((x[0][0][0], x[1]), x[0][1]))
		vidlogSet.persist()
		vidlogLag = vidlogSet.map(lambda x:((x[0][0], x[0][1]+1), x[1]))
		vidLogRST = vidlogSet.leftOuterJoin(vidlogLag).map(mkVidLog)

		vidSet = vidlogSet.map(lambda x: (x[0][0], (x[0][1], x[1])))
		vidProcSet = vidSet.combineByKey(vidCreate, vidMerge, vidMerge2)

		print "filter processing done "
		#save RDD to HIVE Tables
		hx = HiveContext(sc)

		vidTable = hx.createDataFrame(vidProcSet.map(lambda x:x[1]))
		vidLogTable = hx.createDataFrame(vidLogRST, samplingRatio=0.5)

		print "Dataframe Done"
		vidLogTable.saveAsTable('vidsession_log', mode='append')
		vidTable.saveAsTable('vidsession', mode='append')

		print "AATLOG Process Done"

	except Exception, e:
		print "MAIN Error %s" % e

sc = SparkContext()

from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

import pandas as pd

tbl_Survey_Responses = sqlContext.sql("Select Provider_ID, Communication_with_Nurses_Achievement_Points,Communication_with_Nurses_Improvement_Points, Communication_with_Nurses_Dimension_Score, Communication_with_Doctors_Achievement_Points, Communication_with_Doctors_Improvement_Points, Communication_with_Doctors_Dimension_Score, Responsiveness_of_Hospital_Staff_Achievement_Points, Responsiveness_of_Hospital_Staff_Improvement_Points, Responsiveness_of_Hospital_Staff_Dimension_Score, Pain_Management_Achievement_Points, Pain_Management_Improvement_Points, Pain_Management_Dimension_Score, Communication_about_Medicines_Achievement_Points, Communication_about_Medicines_Improvement_Points, Communication_about_Medicines_Dimension_Score, Cleanliness_and_Quietness_of_Hospital_Environment_Achievement_Points, Cleanliness_and_Quietness_of_Hospital_Environment_Improvement_Points, Cleanliness_and_Quietness_of_Hospital_Environment_Dimension_Score, Discharge_Information_Achievement_Points, Discharge_Information_Improvement_Points, Discharge_Information_Dimension_Score, Overall_Rating_of_Hospital_Achievement_Points, Overall_Rating_of_Hospital_Improvement_Points, Overall_Rating_of_Hospital_Dimension_Score, HCAHPS_Base_Score, HCAHPS_Consistency_Score From tbl_Survey_Responses_RAW")

df_Survey_Responses = tbl_Survey_Responses.toPandas()

df_Survey_Responses = df_Survey_Responses.iloc[1:]
def Calculate_Points(x):
	try:
		if len(x.split(' ')) > 2:
			return float(x.split(' ')[0])/float(x.split(' ')[-1])
		else:
			return float(x.split(' ')[0])
	except ValueError:
		return ''

for i in range(1,len(df_Survey_Responses.columns)):
	df_Survey_Responses.iloc[0:,i] = df_Survey_Responses.iloc[0:,i].map(Calculate_Points) 

SparkDF_Survey_Responses = sqlContext.createDataFrame(df_Survey_Responses).rdd

SparkDF_Survey_Responses.saveAsTextFile("/user/w205/hospital_compare_txt_TRANSFORMED/txt_survey_responses/")
sqlContext.createDataFrame(SparkDF_Survey_Responses).write.parquet("/user/w205/hospital_compare_tbl_TRANSFORMED/tbl_survey_responses/")

exit()
Example #28
File: main_inc.py  Project: dalinqin/src
    def processData(sc,hc,fs,con,incFileName,inThresh,outThresh,progTag):
        #incFileName="hdfs://namenode.navroomhdp.com:8020/data/t_cf_inc/100033/t_cf_20161028.txt"
        
        #inThresh=10
        #outThresh=300
        
        #**************************************
        #
        #this procedure will use the inc file to calculate:
        #  flow (one record per day), stored as a file
        #  indoor (one record per day), stored as a file
        #  indoor_for_delete (every indoor record), stored in HBase
        #  indoor detail (every indoor record), stored in HBase and as a file
        #
        #destIndoorFile     : /data/indoor/entityid/year/id_date.json      used to generate report
        #destFlowFile       : /data/flow/entityid/year/fl_date.json        used to generate report
        #rec_destIndoorfile : /data/rec_indoor/entityid/year/id_date.json  this folder is mirror of hbase records
        #
        #
        #**************************************

        destIndoorFile=get_str_indoorFileName(incFileName)
        #hdfs://namenode.navroomhdp.com:8020/data/indoor/100033/2016/id_20161028.txt
        rec_destIndoorfile=destIndoorFile.replace("/indoor/","/rec_indoor/")
        #hdfs://namenode.navroomhdp.com:8020/data/rec_indoor/101762/2016/id_20161011.txt
        destFlowFile  =destIndoorFile.replace("/indoor/","/flow/").replace("id_","fl_")
        #hdfs://namenode.navroomhdp.com:8020/data/flow/101762/2016/fl_20161011.txt
        tmp_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/indoor"+str(progTag)
        tmp_destFlowFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/flow"+str(progTag)
        tmp_rec_destIndoorFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/rec_indoor"+str(progTag)
        EntityID=int(get_str_entityID(incFileName))
        #101762
        histFileName=get_str_histFileName(incFileName) #processed file will be place here
        #hdfs://namenode.navroomhdp.com:8020/data/t_cf/101762/t_cf_20161011.txt
        if fs.exists(sc._jvm.Path(histFileName)):
            tmpFileName=get_str_tmpFileName(histFileName)
            #tmpFileName = hdfs://namenode.navroomhdp.com:8020/data/tmp/101762/t_cf_20161011.txt
            tmpFolderName=tmpFileName.rsplit('/',1)[0]+"tmp"
            #tmpFolderName=hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp
            #copy the hist file to the temp folder, name it hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp/hist and destroy the hist file
            sc._jvm.FileUtil.copy(fs,sc._jvm.Path(histFileName),fs,sc._jvm.Path(tmpFolderName+"/hist"),True,True,con) 
            #copy inc file to temp folder and name it as hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp/inc and  destroy the inc file
            sc._jvm.FileUtil.copy(fs,sc._jvm.Path(incFileName),fs,sc._jvm.Path(tmpFolderName+"/inc"),True,True,con)
            #copymerge the 2 files (inc and hist) into one file 
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmpFolderName),fs,sc._jvm.Path(tmpFileName),True,con,None)
            sc._jvm.FileUtil.copy(fs,sc._jvm.Path(tmpFileName),fs,sc._jvm.Path(incFileName),True,True,con)

        unixFirtDayofMonth = get_int_firstDayUnixDate(incFileName)
        # unixFirtDayofMonth = 1475251200, i.e. 20161001 as a unix date
        startUnixTime=get_int_fileNameUnixDate(incFileName) #1456808400 this is today's unix datetime
 
        rows_t_cf=sc.textFile(incFileName).map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], stime=p[1],flag=p[2]))
        HiveContext.createDataFrame(hc,rows_t_cf).registerTempTable("t_cf_inc_tmp")
        hc.sql("select distinct clientmac,stime,flag from t_cf_inc_tmp").registerTempTable("t_cf")
        
        df=hc.sql("select distinct ClientMac,stime ,lag(stime) over (partition by ClientMac order by stime) as lag_time ,lead(stime) over (partition by ClientMac order by stime) as lead_time from t_cf where flag=1")
        df1=df.withColumn("diff" , df["stime"]-df["lag_time"]).na.fill(-1)
        df1.filter((df1.diff>=outThresh)|(df1.lag_time ==-1)|( df1.lead_time==-1)).registerTempTable("df2")
        df2=hc.sql("select ClientMac,stime,lag_time,lead_time,case when (diff < "+ str(outThresh) +" and diff>0) then diff ELSE 0 end as diff from df2")
        df3=df2.withColumn("lag_time1",df2.lag_time+df2.diff).drop( "lag_time")
        df3.withColumn("lag_time2",func.lead("lag_time1").over(Window.partitionBy("clientMac"))).registerTempTable("df3")
        
        df4=hc.sql("select ClientMac,cast(stime as int) as ETime ,cast(lag_time2 as int) as LTime,cast((lag_time2- stime) as int) as Seconds from df3").na.fill(-1)
        df5=df4.filter((df4.LTime>0)&(df4.Seconds>=inThresh)&(df4.ETime>startUnixTime)&(df4.ETime<(startUnixTime+86400))).withColumn("ENTITYID",lit(EntityID)) #86400 is the number of seconds in one day
        df5.registerTempTable("df5")
        #df5 will be saved to HBase as indoor details (rec_destIndoorfile); df6 and df7 will be used for stats calculation
        
        df6=hc.sql("select ClientMac,ETime, LTime, Seconds ,unix_timestamp(date_sub(from_unixtime(etime),0),'yyyy-MM-dd') as utoday from df5")
        df6.registerTempTable("df6_indoor")
        df7=hc.sql("select ClientMac,min(etime) as etime,max(ltime) as ltime,sum(Seconds) as seconds,utoday from df6_indoor group by ClientMac,utoday")
        df_current_result=df7.withColumn("ENTITYID",lit(EntityID)).withColumn('UFIRSTDAY',lit(unixFirtDayofMonth))

        flow_sql=  "select ClientMac,min(stime) as etime,max(stime) as ltime from t_cf where stime >"+str(startUnixTime) + " and stime <"+str(startUnixTime+86400)+" group by clientmac"
        hc.sql(flow_sql).registerTempTable("df_flow_tmp")
        df_flow=hc.sql("select ClientMac,etime,ltime,unix_timestamp(date_sub(from_unixtime(etime),0),'yyyy-MM-dd') as utoday from df_flow_tmp").withColumn("ENTITYID",lit(EntityID)).withColumn('UFIRSTDAY',lit(unixFirtDayofMonth))
       
        #df_flow.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_FLOW_TODAY") .option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
        #df_flow.saveAsTable("T_FLOW")
        if len(df5.head(1))==1:  #df5 is not empty better than df5.rdd.isEmpty
            tmp_rec_destIndoorFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/rec_indoor"+str(progTag)
            df5.select('clientmac','entityid','etime','ltime','seconds').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_rec_destIndoorFolder)            
            #df5.write.mode('overwrite').json(tmp_rec_destIndoorFolder)
            df5.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_INDOOR") .option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
            if fs.exists(sc._jvm.Path(rec_destIndoorfile)):  #the old indoor folder exists,will generate df_delete_pk for phoenix to delete invalid rows
                rows_rec_indoor=sc.textFile(rec_destIndoorfile).map(lambda r: r.split(",")).map(lambda p: Row(clientmac=str(p[0]), entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4])))
                HiveContext.createDataFrame(hc,rows_rec_indoor).registerTempTable("df_old_indoor")
                df_old_indoor_pk=hc.sql("select ClientMac,ENTITYID,ETime from df_old_indoor") 
                df_current_result_pk=hc.sql("select ClientMac,ENTITYID,ETime from df5") 
                df_delete_pk = df_old_indoor_pk.subtract(df_current_result_pk)
                if len(df_delete_pk.head(1))==1:
                    df_delete_pk.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_INDOOR_FOR_DELETE").option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
        else:
            tmp_rec_destIndoorFolder="NONE"
            
        if len(df_flow.head(1))==1:
            tmp_destFlowFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/flow"+str(progTag)
            df_flow.select('clientmac','entityid','etime','ltime','utoday','ufirstday').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_destFlowFolder)
            #df_flow.write.mode('overwrite').json(tmp_destFlowFolder)
        else:
            tmp_destFlowFolder="NONE"
            
        if len(df_current_result.head(1))==1:
            tmp_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/indoor"+str(progTag)
            df_current_result.select('clientmac','entityid','etime','ltime','seconds','utoday','ufirstday').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_destIndoorFolder)
            #df_current_result.write.mode('overwrite').json(tmp_destIndoorFolder)
        else:
            tmp_destIndoorFolder="NONE"
        
        sc._jvm.FileUtil.copy(fs,sc._jvm.Path(incFileName),fs,sc._jvm.Path(histFileName),True,True,con) 

        if fs.exists(sc._jvm.Path(destIndoorFile)):
            fs.delete(sc._jvm.Path(destIndoorFile))
        if fs.exists(sc._jvm.Path(destFlowFile)):
            fs.delete(sc._jvm.Path(destFlowFile))
        if fs.exists(sc._jvm.Path(rec_destIndoorfile)):
            fs.delete(sc._jvm.Path(rec_destIndoorfile))        
        #deletion is required if the file already exists, otherwise copyMerge will fail
        
        if tmp_destIndoorFolder!="NONE":
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_destIndoorFolder),fs,sc._jvm.Path(destIndoorFile),True,con,None)
            #destIndoorFile=get_str_indoorFileName(incFileName)
        if tmp_destFlowFolder!="NONE":
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_destFlowFolder),fs,sc._jvm.Path(destFlowFile),True,con,None)
        if tmp_rec_destIndoorFolder!="NONE":
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_rec_destIndoorFolder),fs,sc._jvm.Path(rec_destIndoorfile),True,con,None)
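The get_str_* path helpers are not part of the snippet; based on the sample paths in the comments above, get_str_indoorFileName could be sketched as:

# Hypothetical helper, matching the sample paths in the comments:
# .../data/t_cf_inc/<entity>/t_cf_YYYYMMDD.txt -> .../data/indoor/<entity>/<YYYY>/id_YYYYMMDD.txt
def get_str_indoorFileName(incFileName):
    parts = incFileName.split('/')
    entity_id = parts[-2]
    date_str = parts[-1].replace('t_cf_', '').replace('.txt', '')  # YYYYMMDD
    data_root = '/'.join(parts[:-3])                               # .../data
    return "%s/indoor/%s/%s/id_%s.txt" % (data_root, entity_id, date_str[:4], date_str)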
Example #29
datalines = datalines.map(lambda x: cleans(x))

#1 imports

from pyspark.sql.types import *

#2 create metadata

fields = [StructField(field_name,StringType(),True) for field_name in firstline]

schema = StructType(fields)

#3 create a dataframe

schemaLoans = sqlContext.createDataFrame(datalines, schema)

#4 register it as a table called loans

schemaLoans.registerTempTable("loans")

#1 drop table, summarize and store in hive

sqlContext.sql("drop table if exists LoansByTitle")

sql = '''create table LoansByTitle stored as parquet as select title, count(1) as number from loans group by title order by number desc'''

sqlContext.sql(sql)
sqlContext.sql('drop table if exists raw')

sql = '''create table raw stored as parquet as select title, emp_title,grade,home_ownership,int_rate,recoveries,collection_recovery_fee,loan_amnt,term from loans'''
from pyspark import SparkContext
sc = SparkContext()

from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

tbl_Readmissions = sqlContext.sql("Select Provider_ID, Measure_ID, Compared_to_National, Denominator, Score, Lower_Estimate, Higher_Estimate From tbl_Readmissions_RAW where Score <> 'Not Available' and Measure_ID = 'READM_30_HOSP_WIDE'").rdd

tbl_Readmissions.saveAsTextFile("/user/w205/hospital_compare_txt_TRANSFORMED/txt_readmissions/")
sqlContext.createDataFrame(tbl_Readmissions).write.parquet("/user/w205/hospital_compare_tbl_TRANSFORMED/tbl_readmissions/")

exit()
dailyDateIntervalDictionaryToCalculateFor = DateIntervalManager.createDailyIntervalDictionaryForPastYear(yesterday_date)

number_of_days_in_dictionary = dailyDateIntervalDictionaryToCalculateFor.getNumberOfDaysInDictionary()

minimum_number_of_days = int((4.0 / 7.0) * float(number_of_days_in_dictionary))

mapStockCsvToKeyValueClosure = StockRdd.getMapStockCsvToKeyValueForDatesInDictionaryClosure(dailyDateIntervalDictionaryToCalculateFor)
symbol_creation_function_closure = StockRdd.getSymbolDataInstanceForDateDictionaryDataPointsClosure(dailyDateIntervalDictionaryToCalculateFor, yesterday_date)

symbol_down_stocks_data_filtered = sample_data_rdd.map(mapStockCsvToKeyValueClosure)\
                                           .filter(lambda line: not(line is None))\
                                           .reduceByKey(lambda a,b : a + b)\
                                           .map(lambda tuple : ( tuple[0], StockRdd.sort_and_compute_deltas( list(tuple[1]) ) ) )\
                                           .filter(lambda tuple : len(list(tuple[1])) > minimum_number_of_days)\
                                           .map(symbol_creation_function_closure)\
                                           .filter(lambda symbol_and_instance_tuple: not(symbol_and_instance_tuple[1].getTodayPrice() is None))\
                                           .map(StockRdd.getDownStocksDataTuple)\
                                           .filter(lambda data_tuple: not(data_tuple[1] is None))\
                                           .filter(lambda data_tuple: not(data_tuple[1] == float("inf")))

symbol_down_stocks_data_filtered_rows = symbol_down_stocks_data_filtered\
                                            .map(lambda tuple : Row(symbol = tuple[0], span_unit_delta_percentage_ratio = tuple[1], today_price = tuple[2], today_unit_delta_percentage = tuple[3]))


schemaDownStocks = sqlContext.createDataFrame(symbol_down_stocks_data_filtered_rows)
down_stocks_table_name='down_stocks'
schemaDownStocks.write.jdbc(url=mysql_url, table=down_stocks_table_name, mode="overwrite")


print 'Number of rows in the table {0}'.format(df_raw.count())

# removing all rows not containing numbers for the score variable

# function to test whether a string can be parsed as an integer
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# creating an RDD by filtering out invalid scores
df_clean_rdd = df_raw.rdd.filter(lambda row: CheckValidScore(row.score))
# creating a dataframe from the RDD
df_clean = sqlCtx.createDataFrame(df_clean_rdd)
print 'Number of rows in table after cleaning {0}'.format(df_clean.count())

# converting the data types for score column
df_clean = df_clean.selectExpr('provider_id','measure_id','cast(score as int) as score')

# get the maximum score per measure_id and name the column as 'max_score'
df_max_scores = df_clean.groupBy('measure_id').max().collect()
df_max_scores = sc.broadcast(df_max_scores)

# function to extract max_score for each measure_id
def get_max_score(id):
    return [score[1] for score in df_max_scores.value if score[0] == id][0]

# creating a new RDD containing an extra column for the normalized score,
# i.e. the ratio of the current score to the maximum score for the measure_id
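# A hedged sketch of that step (df_normalized_rdd / df_normalized are illustrative names;
# column names follow df_clean above):
df_normalized_rdd = df_clean.rdd.map(
    lambda row: (row.provider_id, row.measure_id, row.score,
                 float(row.score) / get_max_score(row.measure_id)))
df_normalized = sqlCtx.createDataFrame(
    df_normalized_rdd, ['provider_id', 'measure_id', 'score', 'normalized_score'])
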
from pyspark import SparkContext

sc = SparkContext()

from pyspark.sql import HiveContext

sqlContext = HiveContext(sc)

tbl_Measure_Dates = sqlContext.sql(
    "Select Measure_ID, Measure_Name, Measure_Start_Date, Measure_End_Date From tbl_Measure_Dates_RAW"
).rdd

tbl_Measure_Dates.saveAsTextFile("/user/w205/hospital_compare_txt_TRANSFORMED/txt_measure_dates/")
sqlContext.createDataFrame(tbl_Measure_Dates).write.parquet(
    "/user/w205/hospital_compare_tbl_TRANSFORMED/tbl_measure_dates/"
)


exit()
##############################
# Find the top 10 procedures with largest variability among hospitals

# Get our data, but skip null normalizedscore
df_total_quality = sqlContext.sql("select measureid, normalizedscore from total_quality where normalizedscore is not null")

dataByMeasure = df_total_quality.map(lambda r: (r.measureid, r.normalizedscore)).groupByKey() # Group by measure id
varByMeasure = dataByMeasure.map(lambda r: (r[0], float(np.var(list(r[1]))))) # Calculate variance
sortedVarByMeasure = varByMeasure.sortBy(lambda r: r[1], ascending=False) # Sort by variance

top_10_variance = sortedVarByMeasure.take(10)

schema = StructType([
    StructField("measureid", StringType(), True),
    StructField("variance", FloatType(), True)])
df_variance = sqlContext.createDataFrame(top_10_variance, schema)
df_top_10_variance = df_variance.orderBy(df_variance.variance.desc()).limit(10)

saveAsHiveTable(df_top_10_variance, "top_10_variating_procedure")

print
print "Top 10 variating procedures"
print
rank = 1
for i in df_top_10_variance.collect():
    print "Rank:%d - MeasureId=%s, Variance=%f" % \
          (rank, i.measureid, i.variance)
    rank += 1

Example #35
	def queryHive(self, timestart=0, timeend=0, tp=1, pt1= (0,0), pt2=(0,0), mmsi=['0'], output='csv', taskid=''):
		'''

		:param timestart:
		:param timeend:
		:param tp:
		:param pt1:
		:param pt2:
		:param mmsi:
		:param output:
		:param taskid:

		:return:
			hivetableName
		'''
		print "in queryHive"		
		year = time.localtime(timestart).tm_year
		sqlContext = HiveContext(self.sc)
		# time filter
		table = None
		print 'mmsi',mmsi
		if mmsi[0] != '0':
			#for x mmsi:
			table = sqlContext.sql("select * from aisdynamiclog_{0} where drterminalcode={1}".format(year, mmsi[0])).rdd
		else:
			table = sqlContext.sql("select * from aisdynamiclog_{0}".format(year)).rdd
		#table.cache()
		#print table.count()
		# space filter
		rltrdd = None

		if tp == 0:
			pt2 = getlowerrightpoint(pt1[0], pt1[1], pt2[0], pt2[1])
			pt_times1 = (revertdegree_f(pt1[0]), revertdegree_f(pt1[1]))
			pt_times2 = (revertdegree_f(pt2[0]), revertdegree_f(pt2[1]))
		if tp == 1:
			pt_times1 = (revertdegree_f(pt1[0]), revertdegree_f(pt1[1]))
			pt_times2 = (revertdegree_f(pt2[0]), revertdegree_f(pt2[1]))
		if tp == 2:
			pt_times1 = (revertdegree_f(pt1[0]), revertdegree_f(pt1[1]))

		if tp == 0 or tp == 1:
			rltrdd = table.filter(lambda row: callnglat.inside0_1(row, pt_times1, pt_times2))
		elif tp == 2:
			rltrdd = table.filter(lambda row: callnglat.inside2(row, pt_times1, pt2[0]))

		#empty rdd
		if rltrdd.count() == 0:
			return
		newdf = sqlContext.createDataFrame(rltrdd)

		# register it with a unique name
		tablename = "tmptable"
		newdf.registerAsTable(tablename)

		#hivetablename = "testtable3"
		hivetablename = "query_{0}_{1}".format(taskid, year)

		sql_create = "CREATE TABLE IF NOT EXISTS {0} \
				ROW FORMAT DELIMITED FIELDS TERMINATED BY \',\' LINES TERMINATED BY \'\\n\' \
				AS SELECT * from ais_model where 1=0".format(hivetablename)

		sql_insert = "INSERT INTO TABLE {0} select * from {1}".format(hivetablename, tablename)

		sqlContext.sql(sql_create)
		sqlContext.sql(sql_insert)

		return hivetablename
Example #36
            predictPlayers.append(pp)
        else:
            #print "p not found in mlb?", p
            newname = alternateNames(p.upper(), pids)
            if newname is not None:
                #print "got it: ", newname
                pp['lookup_name'] = newname
                pp['player_id'] = encodedPlayerIds[str(pids[newname])]
                predictPlayers.append(pp)
            else:
                print "REALLY NOT FOUND.", p.upper()
        

    print "predictHitters=", predictHitters
    phRDD = sc.parallelize(predictHitters)
    phDF = sqlContext.createDataFrame(phRDD, samplingRatio=0.5)
    phDF.registerTempTable("fd_hitters")
    print "phDF=", phDF.take(2)
    
    print "predictPitchers=", predictPitchers
    ppRDD = sc.parallelize(predictPitchers)
    ppDF = sqlContext.createDataFrame(ppRDD, samplingRatio=0.5)
    ppDF.registerTempTable("fd_pitchers")
    print "ppDF=", ppDF.take(22)

    encodedHitterFeatures = sqlContext.parquetFile(rddDir + "/batting_features.enc.parquet")
    encodedHitterFeatures.registerTempTable("bfe")

    hfDF = sqlContext.sql("""select bfe.* from fd_hitters, bfe where
                            fd_hitters.player_id = bfe.player_id
                            and fd_hitters.game_date = bfe.game_date""")
Example #37
File: main_inc.py  Project: dalinqin/src
        if progTag==cnt:
            folderLists=x
        cnt=cnt+1
        
    for x in folderLists:
        entityid=int(get_str_entityID_byFolder(x))
        df_id=df_mysql.filter(df_mysql.ID==entityid)
        inThresh  =10  if (df_id.head(1)[0].IndoorSecondsThrehold==0) else df_id.head(1)[0].IndoorSecondsThrehold
        outThresh =300 if (df_id.head(1)[0].LeaveMinutesThrehold ==0) else df_id.head(1)[0].LeaveMinutesThrehold*60
        incFiles=fs.listStatus(sc._jvm.Path(x))
        for incFile in incFiles:
            if incFile.isFile():
                curTime=str(time.strftime("%Y-%m-%d %H:%M:%S",time.localtime(time.time())))
                currrentFile=str(incFile.getPath())
                processData(sc,hc,fs,con,currrentFile,inThresh,outThresh,progTag)
                pd_rowLog = pandas.Series([operType,curTime,currrentFile],index=pd_column)
                pd_dataFrameLog=pd_dataFrameLog.append(pd_rowLog, ignore_index=True)
    
    
    curDay=str(time.strftime("%Y-%m-%d",time.localtime(time.time())))

    
    df_log=hc.createDataFrame(pd_dataFrameLog)
    df_log.sort(df_log.operType,df_log.processDate).repartition(1).write.mode('overwrite').json("/data/log/incData"+str(progTag)+"/"+curDay) 
 
        
    sc.stop()
    
print 'Number of rows in the table {0}'.format(df_raw.count())

# removing all rows not containing numbers for the score variable

# function to test whether a string can be parsed as an integer
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# creating an RDD by filtering out invalid scores
df_clean_rdd = df_raw.rdd.filter(lambda row: CheckValidScore(row.score))
# creating a dataframe from the RDD
df_clean = sqlCtx.createDataFrame(df_clean_rdd)
print 'Number of rows in table after cleaning {0}'.format(df_clean.count())

# converting the data types for score column
df_clean = df_clean.selectExpr('provider_id','measure_id','cast(score as int) as score')
# creating dataframe for hospitals table
df_hospital = sqlCtx.table("hospital")

# joining hospitals table to hospital_state
df_clean = df_clean.join(df_hospital, df_clean.provider_id == df_hospital.provider_id,'left_outer').select(df_clean.provider_id, df_clean.measure_id, df_hospital.state, df_clean.score)




# get the maximum score per measure_id and name the column as 'max_score'
df_max_scores = df_clean.groupBy('measure_id').max().collect()
from pyspark import SparkContext, SparkConf
from pyspark.sql import Row, HiveContext

#sc = SparkContext(appName="readHDFS")
conf = SparkConf().setMaster("local").setAppName("fromHdfsToHive")
sc = SparkContext(conf=conf)
hvcontext = HiveContext(sc)

ofiles = sc.wholeTextFiles(
    "hdfs:///user/cloudera/out/out4/result-*/part-00000")
#returns (filename,content)

ofiles_format = ofiles.map(lambda x: x[1]).map(
    lambda x: [line for line in x.splitlines()]).map(lambda x: x[0]).map(
        lambda x: tuple(x[1:-1].split(','))).map(lambda r: Row(
            dttime=r[0], symbol=r[1], txtype=r[2], totalvol=int(r[3])))

ofiles_df = hvcontext.createDataFrame(ofiles_format)
ofiles_df.saveAsTable("stocks_Ordered")
#save dataframe as a persistent Hive table
Example #40

from pyspark.sql import HiveContext
from pyspark import SparkConf
from pyspark import SparkContext
from operator import add
import datetime

conf = SparkConf().setAppName("EDF").setMaster("local")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)
hiveContext.sql("use db_edf")

firstday = 1325375940 #31/12/2011 23:59:00
ONE_DAY = 86400 #seconds in one day

df = hiveContext.sql("select timestp as tp, consumption as cp, site_id as id from consumption")

#while (firstday < 1356998400):
	
firstday += ONE_DAY
daydate = datetime.datetime.fromtimestamp(float(firstday)).strftime('%Y-%m-%d')
print(daydate)
test = df.filter(df.tp < firstday).groupBy("id").sum("cp")
test.show()

vect_date = [(daydate,)] * len(test.collect())
print(vect_date)
# wrap each date in a one-element tuple so createDataFrame can infer a single-column schema
df_date = hiveContext.createDataFrame(vect_date, ['daydate'])
df_date.show()
from pyspark import SparkContext, SparkConf
from pyspark.sql import HiveContext, Row

conf = SparkConf().setAppName("sparkmoviehashtags")
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

inrdd = sc.textFile("/usr/azure/twitterdata/Tweets*")
inrddsplt = inrdd.map(lambda line: line.split("|"))
inrddspltflt = inrddsplt.filter(lambda spltln: spltln[5] == "None")
inrddspltfltwrds = inrddsplt.filter(lambda spltln: spltln[7] != '')
inrdddf = inrddspltfltwrds.map(lambda spltln: (spltln[0],spltln[7].split(",")))

inrddfmapv = inrdddf.flatMapValues(lambda x: x)
hashtgdf = inrddfmapv.map(lambda hashtag: Row(movie=hashtag[0],hashtags=hashtag[1]))

hashtgds = sqlContext.createDataFrame(hashtgdf)
hashtgds.printSchema()

hashtgdsout = hashtgds.groupBy("movie", "hashtags").count()
hashtgdsout.printSchema()
hashtgdsordered = hashtgdsout.orderBy("movie","count",ascending=False)
hashtgdsordered.show()

hashtgdsordered.write.format("orc").option("path","/usr/azure/moviehashtags").mode("overwrite").saveAsTable("moviehashtags")
Example #42
        return None
    else:
        info = measureInfo[measureid]
        minScore = info[0]
        scoreRange = info[1]
        return float((score - minScore)/scoreRange)

total_quality_normal = total_quality.map(lambda r: (r[0], r[1], r[2], normalizeScore(r[3], r[2])))

schema = StructType([
    StructField("providerid", StringType(), True),
    StructField("state", StringType(), True),
    StructField("measureid", StringType(), True),
    StructField("normalizedscore", FloatType(), True)])

df_total_quality = sqlContext.createDataFrame(total_quality_normal, schema)
saveAsHiveTable(df_total_quality, "total_quality")


# Some hospitals have too few non-NA measure.  To have a fair ranking, we want to set a min. bar
# on the # of non-NA measure for our hospitals to participate in our evaluation.

# For each hospital, find out the # of non-NA measure it has
nonNAMeasureCount = dict(df_total_quality.map(lambda r: (r.providerid, r.normalizedscore)).
                         combineByKey( # Use combineByKey to count the # of non-NA Measure
                            lambda value: 0 if value is None else 1,
                            lambda x, value: x if value is None else x + 1,
                            lambda x, y: x + y).collect())

# Find the 25th percentile of non-NA measure, and this will be the min-bar of # of non-NA measure.
minMeasureCount = np.percentile(nonNAMeasureCount.values(), 25.)
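
# A hedged sketch of applying the min-bar (qualified_providers / quality_qualified_rdd are
# illustrative names): keep only hospitals whose non-NA measure count reaches the bar.
qualified_providers = set(pid for pid, cnt in nonNAMeasureCount.items() if cnt >= minMeasureCount)
quality_qualified_rdd = df_total_quality.rdd.filter(lambda r: r.providerid in qualified_providers)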
Example #43
sys.setdefaultencoding('utf-8')

bfile = open(
    '/Users/zimoli/Downloads/RBDA-MCINTOSH/Project/RBDAProject/spark_data/checkin_toronto.txt'
)

lines = []

for line in bfile:
    lines.append(line[:-1].split('\t'))

sc = SparkContext("local", "checkins")
sqlContext = HiveContext(sc)

rdd = sc.parallelize(lines)
df = sqlContext.createDataFrame(
    rdd, ['business_id', 'user_id', 'dates', 'main_cate'])
df_d = df.na.drop()

timestamps = sc.broadcast([
    "2018_10", "2018_09", "2018_08", "2018_07", "2018_06", "2018_05",
    "2018_04", "2018_03", "2018_02", "2018_01", "2018_00", "2017_11",
    "2017_10", "2017_09", "2017_08", "2017_07", "2017_06", "2017_05",
    "2017_04", "2017_03", "2017_02", "2017_01", "2017_00", "2016_11",
    "2016_10", "2016_09", "2016_08", "2016_07", "2016_06", "2016_05",
    "2016_04", "2016_03", "2016_02", "2016_01", "2016_00", "2015_11",
    "2015_10", "2015_09", "2015_08", "2015_07", "2015_06", "2015_05",
    "2015_04", "2015_03", "2015_02", "2015_01", "2015_00", "2014_11",
    "2014_10", "2014_09", "2014_08", "2014_07", "2014_06", "2014_05",
    "2014_04", "2014_03", "2014_02", "2014_01", "2014_00"
])
cates = sc.broadcast([
cluster_center_overall_deltas = map(get_overall_delta_percentage_closure, centers)

# Convert the list to a list of tuples mapping cluster number to the performance percentage
converted_center_delta_list = DunaganListUtility.convert_list_to_value_and_index_tuple_list(cluster_center_overall_deltas)

# Sort the list of tuples by performance (in ascending order)
converted_center_delta_list.sort(lambda tuple_1, tuple_2: cmp(tuple_1[1], tuple_2[1]))

# Convert the list mapping cluster_number to performance percentage into a list of Row() object for insertion into a database table
# (ClusterId, Delta-Percentage) Row list construction
converted_center_delta_list_rows = map(lambda delta_tuple: Row(cluster_id=int(delta_tuple[0]), delta_percentage=float(delta_tuple[1])), converted_center_delta_list)

print "\n\n\n\nAbout to sqlContext.createDataFrame(converted_center_delta_list_rows)\n\n\n\n"

# Create a data frame from the list of Rows
schemaCenterDeltas = sqlContext.createDataFrame(converted_center_delta_list_rows)

print "\n\n\n\nAbout to schemaCenterDeltas.write.jdbc(url=mysql_url, table=cluster_total_delta_percentages)\n\n\n\n"

# Write the data frame to the database in table cluster_total_delta_percentages
schemaCenterDeltas.write.jdbc(url=mysql_url, table='cluster_total_delta_percentages', mode="overwrite")

# Produce a list which maps cluster numbers to symbols to produce an xref database table
# (ClusterId,  Symbol) XRef Row List construction
cluster_id_symbol_xref_rows_list = []
for cluster_id, list_of_symbols in clusterGroupsDictionaryRdd.items():
    for symbol in list_of_symbols:
        print "cluster_id: " + str(cluster_id) + "\t\tsymbol: " + symbol
        xrefRow = Row(cluster_id=int(cluster_id), symbol=str(symbol))
        cluster_id_symbol_xref_rows_list.append(xrefRow)
Example #45
class Testing_Resources_Split_1(unittest.TestCase):
    def setUp(self):
        warnings.simplefilter("ignore", ResourceWarning)
        fpath = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
        with open(fpath +
                  '/data_source/bookings_overall.json') as bookings_source:
            self.bookings = json.load(bookings_source)
        with open(fpath + '/data_source/cfg.json') as cfg_source:
            self.cfg = json.load(cfg_source)
        today = '20180402'
        self.days = optimizer.util.get_days_from_bookings(today, self.bookings)
        self.sc = SparkContext.getOrCreate()
        self.hive_context = HiveContext(self.sc)
        self.schema = optimizer.util.get_common_pyspark_schema()
        self.bookings_map = optimizer.util.get_bookings_map(self.bookings)

    def compare_splitted_dfs(self, pandas_df_expected, rows, new_bk_id):
        booking_spark_df = self.hive_context.createDataFrame(rows, self.schema)
        spark_df_splitted_rdd = booking_spark_df.rdd.flatMap(
            optimizer.main.bb_split_method(self.cfg, self.bookings_map,
                                           new_bk_id))
        spark_df_splitted = self.hive_context.createDataFrame(
            spark_df_splitted_rdd, self.schema)
        pandas_df_splitted = spark_df_splitted.select("*").toPandas()
        print(pandas_df_expected)
        print(pandas_df_splitted)

        self.assertTrue(
            assert_frame_equal(pandas_df_expected,
                               pandas_df_splitted,
                               check_dtype=False) == None)

    # es: elasticsearch
    def test_es_bookings_search(self):
        self.assertTrue(len(self.bookings) >= 3)

    def test_es_predictions_search(self):
        es_client_predictions = ESClient(self.cfg['es_host'],
                                         self.cfg['es_port'],
                                         self.cfg['es_predictions_index'],
                                         self.cfg['es_predictions_type'])
        predictions = es_client_predictions.search({"size": 100})

        self.assertTrue(len(predictions) > 0)
        self.assertTrue(len(predictions) >= 40)

    def test_get_tbr_ratio(self):
        es_client_tbr = ESClient(self.cfg['es_host'], self.cfg['es_port'],
                                 self.cfg['es_tbr_index'],
                                 self.cfg['es_tbr_type'])
        ands = ['b6', 'b7']
        get_tbr_ratio = optimizer.dao.query_builder.get_tbr_ratio(
            ands, self.bookings_map, es_client_tbr)

        print('get_tbr_ratio=' + str(get_tbr_ratio))
        self.assertTrue(get_tbr_ratio == 1.0)

    def test_bb_split_method_case1(self):
        # Testcase type: one booking is split by another, different booking
        # testcase 1: booking b8 is split by a new booking b6.
        # bk_id: b8, days: ['20180405'], a: ['1'], g: ['g_X'], si: ['1']
        # bk_id: b6, days: ['20180405'], a: ['4'], g: ['g_f'], si: ['2']
        # original dataframe: ['20180405', ['b8'], [], {}, 0]
        # split into: d1: ands=['b8', 'b6'], minus=[]  &&  d2: ands=['b8'], minus=['b6']
        # in this hand-worked testcase, d2 has the get_bb_count() value 3239, so d2 is valid, but d1 is invalid.
        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'allocated', 'amount'])
        pandas_df_expected.loc[0] = ['20180405', ['b8'], ['b6'], {}, 3239]

        day, booking_id, new_bk_id = '20180405', 'b8', 'b6'

        rows = [(day, [booking_id], [], {}, 0)]
        return self.compare_splitted_dfs(pandas_df_expected, rows, new_bk_id)

    def test_bb_split_method_case2(self):
        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'allocated', 'amount'])
        pandas_df_expected.loc[0] = ['20180405', ['b6', 'b7'], [], {}, 8900]

        day, booking_id, new_bk_id = '20180405', 'b6', 'b7'
        rows = [(day, [booking_id], [], {}, 0)]
        return self.compare_splitted_dfs(pandas_df_expected, rows, new_bk_id)

    def test_bb_split_method_case3(self):
        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'allocated', 'amount'])
        pandas_df_expected.loc[0] = [
            '20180405', ['b8'], ['b6', 'b7'], {}, 3239
        ]
        pandas_df_expected.loc[1] = [
            '20180405', ['b6', 'b7'], ['b8'], {}, 8900
        ]

        new_bk_id = 'b7'
        rows = [('20180405', ['b8'], ['b6'], {}, 3239),
                ('20180405', ['b6'], ['b8'], {}, 0)]
        return self.compare_splitted_dfs(pandas_df_expected, rows, new_bk_id)

    def test_bb_split_method_case4(self):
        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'allocated', 'amount'])
        pandas_df_expected.loc[0] = [
            '20180405', ['b6', 'b7', 'b10'], [], {}, 8900
        ]

        new_bk_id = 'b10'
        rows = [('20180405', ['b6', 'b7'], [], {}, 8900)]
        return self.compare_splitted_dfs(pandas_df_expected, rows, new_bk_id)

    def test_bb_split_method_case5(self):
        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'allocated', 'amount'])
        pandas_df_expected.loc[0] = [
            '20180405', ['b8'], ['b6', 'b7', 'b9'], {}, 3239
        ]
        pandas_df_expected.loc[1] = [
            '20180405', ['b6', 'b7'], ['b8', 'b9'], {}, 8900
        ]

        new_bk_id = 'b9'
        rows = [('20180405', ['b8'], ['b6', 'b7'], {}, 3239),
                ('20180405', ['b6', 'b7'], ['b8'], {}, 8900)]
        return self.compare_splitted_dfs(pandas_df_expected, rows, new_bk_id)
Example #46
i = 0
for index, row in pd_df_reviews.iterrows():
    tot = 0
    totwords = 0
    # defaults so the column assignments below are defined even when the review text is null
    cleantext, unigrams, bigrams, trigrams = [], None, None, None
    print(row)
    if pd.notnull(row['_2']):
        cleantext = cleanText(row['_2'])
        totwords = len(cleantext)
        #return the 5 n-grams with the highest PMI
        unigrams = top5_words(cleantext)
        bigrams = top5_bigram_collocations(cleantext)
        trigrams = top5_trigram_collocations(cleantext)

    df.ix[i, 'BusinessID']= row['_1']
    df.ix[i, 'CleanText']= " ".join(cleantext).encode('utf-8')
    df.ix[i, 'TotWords']= totwords
    df.ix[i, 'unigrams']= unigrams
    df.ix[i, 'bigrams']= bigrams
    df.ix[i, 'trigrams']= trigrams
    i += 1

spark_df = sqlContext.createDataFrame(df, columns)

# Save it as a table
spark_df.registerTempTable("dfBusiness500000")

sqlContext.sql("drop table if exists result2_bybusiness")
sqlContext.sql("CREATE TABLE result2_bybusiness AS SELECT * FROM dfBusiness500000")

Example #47
class StravaLoader(object):

    def __init__(self, 
                 data_source='local', 
                 activity_directory='strava-activities-subset',
                 s3bucket='larsbk',
                 athletes=None,
                 activity_types=[
                    'Ride',
                    'Run',
                    'NordicSki'
                 ],
                 sc=None,
                 hiveContext=None,
                 conf=(SparkConf().setAppName('Strava analysis')),
                 filter_bug_inducing_rows=True
                 ):

        ''' Initialize Strava Analysis object'''


        # INPUT PARAMETERS

        self.athletes = athletes # Athletes to analyze (optional)
        self.activity_types = activity_types # Activity_types to consider (default)
        self.filter_bug_inducing_rows = filter_bug_inducing_rows


        # CONFIGURE SPARK

        if sc != None and hiveContext != None: # Both contexts were supplied by user
            print 'Info: Using supplied SparkContext and HiveContext'
            self.sc = sc
            self.hiveContext = hiveContext

        else: # Initialize new contexts
            print 'Info: Initializing SparkContext and HiveContext from (default) conf'
            self.sc = SparkContext(conf=conf)
            self.hiveContext = HiveContext(self.sc)

        self.schema = pickle.load(open('./schema.p', 'rb')) # The pre-defined schema
        self.df = None # Empty DataFrame to be populated later


        # CONFIGURE DATA SOURCE

        data_root_path = {
                's3': 's3n://%s/%s/' % (s3bucket, activity_directory), 
                'local': './%s/' % activity_directory
        }
        
        if data_source not in data_root_path.keys(): # Check if data source is valid
            raise Exception(('Unrecognized data source "%s". '
                             'Supported sources: "%s".')
                            % (data_source, '", "'.join(data_root_path.keys())))
        
        self.data_source = data_source # This is a valid data source
        self.path = data_root_path[data_source] # This is the path to the data


        # (S3 SPECIFIC STUFF)

        if data_source == 's3':

            # Get a list of files in the activity directory
            bucket = boto3.resource('s3').Bucket(s3bucket) 
            objects = bucket.objects.filter(Prefix='%s/gpx/' % activity_directory)
            files = [obj.key for obj in objects] 

            # Make set of observed combinations of athlete and activity_type
            athlete_and_type = set([]) # Empty set to populate
            fpattern = '\/([\w]+)\/(?:[\w-]+)-([\w]+)\.gpx' # File name pattern
            for fname in files:
                match = re.match(activity_directory+'/gpx'+fpattern, fname)
                if match:
                    athlete_and_type.add((match.group(1), match.group(2)))

            self.s3_athlete_and_type = athlete_and_type # Save set for later use

        pass


    def _get_athlete_directories(self):
        '''
        Look for athlete directories in data_root_path \
        and update self.athletes
        '''

        if self.data_source in ['local']:

            self.athletes = [
                directory for directory in os.listdir(self.path+'gpx/')
                if re.match('^[\w-]+$', directory)
            ]

        else:
            print ('Warning: Automatic directory/athlete detection not yet supported for '
                   'data source %s. Using: "akrogvig", "lkrogvig", "brustad"') \
                   % self.data_source

            self.athletes = ['akrogvig', 'lkrogvig', 'brustad']

        pass


    def _activities_exist(self, athlete, activity_type):
        '''
        Checks whether activities of type <activity_type> exist for athlete <athlete>;
        returns a truthy value (a file list locally, a boolean on S3)
        '''

        # Check local directory with glob
        if self.data_source == 'local':
            return glob.glob(self.path+'gpx/%s/*%s.gpx' % (athlete, activity_type))

        # Check if combination exists by using previously compiled sets
        elif self.data_source == 's3':
            return ((athlete, activity_type) in self.s3_athlete_and_type)

    def _load_dataset(self):
        '''
        Loads strava activities from source to DataFrame self.df
        '''

        # Get athlete list if not already set
        if not self.athletes:
            self._get_athlete_directories()

        # Initialize empty dataset
        self.df = self.hiveContext.createDataFrame(
            self.sc.emptyRDD(),
            self.schema
        )

        for athlete in self.athletes:
            for activity_type in self.activity_types:
        
                # Check that there are files of that type (or else .load fails)
                if self._activities_exist(athlete, activity_type):

                    # Read data
                    dfadd = self.hiveContext.read.format('com.databricks.spark.xml') \
                                    .options(rowTag='trkpt', treatEmptyValuesAsNulls=False) \
                                    .schema(self.schema) \
                                    .load(self.path+'gpx/%s/*%s.gpx' % (athlete, activity_type))
                
                    dfadd = dfadd.withColumn('athlete', lit(athlete)) \
                                 .withColumn('activity_type', lit(activity_type))
                
                    self.df = self.df.unionAll(dfadd)

        if self.filter_bug_inducing_rows:
            self.df = self.df.filter(self.df['extensions.gpxtpx:TrackPointExtension.#VALUE'].isNull())

        pass


    def derive_schema(self):
        '''
        Loads all data in self.path and derives the schema, saves with pickle to "schema.p"
        '''

        df = self.hiveContext.read.format('com.databricks.spark.xml') \
                    .options(rowTag='trkpt') \
                    .load(self.path+'gpx/*')

        df = df.withColumn('athlete',lit(None).cast(StringType())) \
               .withColumn('activity_type',lit(None).cast(StringType()))

        df.printSchema()
        pickle.dump(df.schema, open("schema.p", "wb"))

        pass


    def get_dataset(self):
        '''
        Returns strava activity dataset
        '''
        if not self.df:
            self._load_dataset()
        
        return self.df
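
# Hedged usage sketch for the class above (athlete and column choices are illustrative):
# loader = StravaLoader(data_source='local', athletes=['akrogvig'])
# activities = loader.get_dataset()
# activities.groupBy('athlete', 'activity_type').count().show()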
from pyspark import SparkContext
sc = SparkContext()

from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

tbl_Hospitals = sqlContext.sql("select Provider_ID, Hospital_Name, State from tbl_Hospitals_RAW").rdd

tbl_Hospitals.saveAsTextFile("/user/w205/hospital_compare_txt_TRANSFORMED/txt_hospitals/")
sqlContext.createDataFrame(tbl_Hospitals).write.parquet("/user/w205/hospital_compare_tbl_TRANSFORMED/tbl_hospitals/")


exit()
    return df.write.jdbc(url="jdbc:mysql://localhost:3306/rdbms",
                         table=table,
                         mode="overwrite",
                         properties={"user": "******"})


# Initializing Spark
sc = SparkContext()
sc.setLogLevel("WARN")

rdd = sc.sequenceFile("hdfs:///flume/events/*/*/*").map(
    lambda x: Row(*x[1].split(",")))
sqlContext = HiveContext(sc)

df = sqlContext.createDataFrame(rdd, [
    "purchaseDate2", "productName", "productPrice", "productCategory",
    "clientIPAddress"
]).cache()

#sparkTopCategories
topCategories = df.groupBy("productCategory")\
    .count()\
    .select(col("productCategory"), col("count").alias("cnt"))\
    .orderBy(col("count").desc())\
    .limit(10)
topCategories.show()
writeMYSQL(topCategories, "sparkTopCategories")

#sparkTopProducts
topProducts = df.groupBy("productCategory", "productName")\
    .count()\
    .orderBy(col("productName").asc(), col("productCategory").asc(), col("count").desc())\
# removing all rows not containing numbers for the score variable

# function to test whether a string can be parsed as an integer
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# creating an RDD by filtering out invalid scores
df_survey_clean_rdd = df_survey_raw.rdd.filter(lambda row: CheckValidScore(row.hcahps_base_score))
df_care_clean_rdd = df_care_raw.rdd.filter(lambda row: CheckValidScore(row.score))

# creating dataframes from the RDDs
df_survey_clean = sqlCtx.createDataFrame(df_survey_clean_rdd)
df_care_clean = sqlCtx.createDataFrame(df_care_clean_rdd)
print 'Number of rows in survey table after cleaning {0}'.format(df_survey_clean.count())
print 'Number of rows in effective_care table after cleaning {0}'.format(df_care_clean.count())


# converting the data types for score column
df_survey = df_survey_clean.selectExpr('provider_id','(cast(hcahps_base_score as int) + cast(hcahps_consistency_score as int)) as survey_score')

# converting the data types for score column
df_care = df_care_clean.selectExpr('provider_id','measure_id','cast(score as int) as score')

# get the maximum score per measure_id and name the column as 'max_score'
df_care_max_scores = df_care.groupBy('measure_id').max().collect()
df_care_max_scores = sc.broadcast(df_care_max_scores)
Example #51
    features = [
        'area', 'down_oscillation', 'down_stroke', 'down_stroke_ratio',
        'down_stroke_zaihe', 'down_up_oscillation_ratio',
        'down_up_stroke_zaihe_ratio', 'down_up_zaihe_ratio', 'down_zaihe',
        'left_upper_area', 'left_upper_area_ratio', 'max_weiyi',
        'max_weiyi_zaihe', 'max_zaihe', 'min_max_zaihe_ratio', 'min_weiyi',
        'min_weiyi_zaihe', 'min_zaihe', 'up_oscillation', 'up_stroke',
        'up_stroke_ratio', 'up_stroke_zaihe', 'up_zaihe'
    ]

    print(current_timestamp(), '-' * 30 + 'starting')
    abnormal_sql = """
        select * from industry.dagang_abnormal
        where dwdm like '03%'
    """
    fault_sql = """
        select * from industry.fault_segment_2c
    """
    abnormal_data = sqlCtx.sql(abnormal_sql) \
        .repartition(num_partitions, 'dxmc').cache()
    fault_data = sc.broadcast(sqlCtx.sql(fault_sql).toPandas())

    print(current_timestamp(), '-' * 30 + 'spark learning_data rows count',
          abnormal_data.count())
    res_rdd = abnormal_data.rdd.mapPartitions(partition_func).cache()
    print(current_timestamp(), '-' * 30 + 'res_rdd rows count: ',
          res_rdd.count())
    res_df = sqlCtx.createDataFrame(res_rdd, schema=['res']).toPandas()
    res_df.to_csv('abnormal_feature.csv', index=False, header=None)
    print(current_timestamp(), '-' * 30 + 'finished')
#dates = forecast_data_v3.get_dates()

dates = forecast_data_v4.get_dates()
# print(dates)
# print(type(dates))
# note: np.reshape returns a new array, so these calls (and the reshapes of s and c below)
# have no effect unless the result is reassigned; the vstack below uses the 1-D arrays
np.reshape(prediction_results, (4, 1))
np.reshape(outage_probabilities, (4, 1))
np.reshape(dates, (4, 1))
# print(prediction_results.shape)
# print(dates.shape)
#test_results = [1,0,0,0]
#t = np.asarray(test_results)
#np.reshape(t,(4,1))
state = ['Rhode Island', 'Rhode Island', 'Rhode Island', 'Rhode Island']
city = ['Providence', 'Providence', 'Providence', 'Providence']
s = np.asarray(state)
c = np.asarray(city)
np.reshape(s, (4, 1))
np.reshape(c, (4, 1))

combined = np.vstack((s, c, dates, outage_probabilities, prediction_results)).T
#combined = np.vstack((s, c, dates, outage_probabilities, t)).T
final_df = pd.DataFrame(
    combined, columns=['state', 'city', 'date', 'probability', 'outage'])
final_df = hive_context.createDataFrame(final_df)
final_df.write.mode("overwrite").saveAsTable('RI_Outage_Table')
final_df.show()
final_df.printSchema()
#result = sqlContext.sql('SELECT outage, date, "Providence, RI" AS location FROM RI_Outage_Table')
#result.show()
Example #53
class HqlSpark():
    # init Spark sql
    def __init__(self, language='cn'):
        self.conf = SparkConf().setAppName("HqlSparkAPP").setSparkHome(
            "/home/spark").set("spark.driver.allowMultipleContexts", "true")
        self.sc = SparkContext(conf=self.conf)
        self.sql = HiveContext(self.sc)
        self.sql.sql('use anserchapp')
        self.curlan = language
        self.curDataWord = None
        # self.loadWord_state = False

    #get current language word
    def getAllWord(self, language='cn'):
        if self.curlan == language and self.curDataWord:
            return
        self.curlan = language
        sql_s = 'select * from searchapp_%s limit 100' % self.curlan
        self.curDataWord = self.sql.sql(sql_s)

    def refreshTable(self, tableName):
        self.sql.refreshTable(tableName)

    #create table
    def createTable(self, sql_sentence, tableName):
        # table_name = sql_sentence.split(' ')[2]
        self.sql.sql(sql_sentence)
        self.sql.refreshTable(tableName)
        print 'create table success'

    #insert data into table
    def insertData(self, sql_sentence):
        self.sql.sql(sql_sentence)

    # insert data into table from data_struct
    def insertDataFromStruct(self,
                             data,
                             tableName='searchapp_',
                             d_type='cn',
                             state=False):  #data tuple or list list   data,
        # rdd = self.sc.parallelize(data)
        if d_type == '':
            in_data = self.sql.createDataFrame(data, als._categry_shame)
        elif d_type == 'hint':
            in_data = self.sql.createDataFrame(data, als.hintWord_shame)
            d_type = ''
        else:
            in_data = self.sql.createDataFrame(data, als.searchApp_shame)
        # final_data = in_data
        if state:
            in_data.saveAsTable(tableName=tableName + d_type,
                                Source='metastore_db',
                                mode='append')  #   append  overwrite
        else:
            in_data.saveAsTable(tableName=tableName + d_type,
                                Source='metastore_db',
                                mode='overwrite')

    # delete table
    def deleteDataFromTable(self, table='searchapp_', d_type='ch'):
        sql_sentence = 'delete from ' + table + d_type
        self.sql.dropTempTable(table + d_type)
        self.sql.refreshTable(table + d_type)

    def showTales(self):
        table_list = []
        tables = self.sql.sql('show tables').collect()
        for table in tables:
            table_list.append(table['tableName'])
        return table_list

    def getData(self, sql_hive):
        datas = self.sql.sql(sql_hive).collect()
        return datas

    #according to the input words, find hint words from table hintword
    def selectHintWord(self, base_wordFr):
        hintWord = self.sql.sql('select word,hintWord from hintword')
        word = hintWord.join(base_wordFr,
                             hintWord.hintWord == base_wordFr.word,
                             'outer').select(hintWord.word).distinct()
        word_news = self.curDataWord.join(
            word, word.word == self.curDataWord.word,
            'outer').select(self.curDataWord.word, 'priority', 'searchApp',
                            'searchCount', 'genre').distinct()
        word_news = word_news.dropna(how='any')
        return word_news

    #according to appId find word from searchapp
    def selectAppIdWord(self, appIds):
        result = None
        for appId in appIds:
            if result == None:
                result = self.curDataWord.filter(
                    functions.array_contains(self.curDataWord.searchapp,
                                             appId)).select(
                                                 'word', 'priority',
                                                 'searchApp', 'searchCount',
                                                 'genre').distinct()
            res = self.curDataWord.filter(
                functions.array_contains(self.curDataWord.searchapp,
                                         appId)).select(
                                             'word', 'priority', 'searchApp',
                                             'searchCount',
                                             'genre').distinct()
            result = result.unionAll(res)
            word = result.select('word')
            result = result.dropna(how='any')
        return result, word

    #according to genre id find word from searchApp
    def selectGenreWord(self, genreIds):
        result = None
        for gId in genreIds:
            if result == None:
                result = self.curDataWord.filter(
                    functions.array_contains(self.curDataWord.genre,
                                             gId)).select(
                                                 'word', 'priority',
                                                 'searchApp', 'searchCount',
                                                 'genre').distinct()
            res = self.curDataWord.filter(
                functions.array_contains(self.curDataWord.genre,
                                         gId)).select('word', 'priority',
                                                      'searchApp',
                                                      'searchCount',
                                                      'genre').distinct()
            result = result.unionAll(res)
        return result

    # get all word for analysis
    def getAnalysisWords(self, appIds, genreIds):
        if appIds == None or len(appIds) <= 0:
            return None
        appWord, word = self.selectAppIdWord(appIds)
        genreWord = None
        thinkWord = None
        if genreIds != None and len(genreIds) > 0:
            genreWord = self.selectGenreWord(genreIds)
        if word and word.count() > 0:
            thinkWord = self.selectHintWord(word)

        if appWord and genreWord and thinkWord:
            appWord = appWord.unionAll(genreWord)
            appWord = appWord.unionAll(thinkWord)
            return appWord.distinct()
            # return appWord.unionAll(genreWord).unionAll(thinkWord).distinct()
        elif appWord and genreWord:
            return appWord.unionAll(genreWord).distinct()
        elif appWord and thinkWord:
            appWord = appWord.unionAll(thinkWord)
            return appWord.distinct()
        elif genreWord and thinkWord:
            genreWord = genreWord.unionAll(thinkWord)
            return genreWord.distinct()
        elif appWord:
            return appWord.distinct()
        elif genreWord:
            return genreWord.distinct()
        else:
            return thinkWord.distinct()

    #build Matrix
    def buildMatrix(self, words):
        class_all = self.sql.sql(
            "select genreID from category order by genreID desc")
        c_genres = class_all.collect()
        genres = {}
        i = 0
        for c in c_genres:
            genres.setdefault(c.genreID, i)
            i += 1

        datas = words.select('genre').collect()
        mlength = len(c_genres)
        nlength = len(datas)
        Matrix = numpy.zeros((nlength, mlength))
        num = 0
        print len(Matrix)
        for data in datas:
            for ge in data.genre:
                Matrix[num][genres.get(ge)] = 1
            num += 1
        return Matrix

    #get Input data
    def getInPut(self, appIds, genreIds):
        words = self.getAnalysisWords(appIds, genreIds)
        return self.buildMatrix(words)

    # k_means analysis
    def spark_means(self, Matrix, Kcluster=2, MaxIterations=10, runs=10):
        cluster_data = self.sc.parallelize(Matrix)
        trains = KMeans().train(cluster_data, Kcluster, MaxIterations, runs)
        results = trains.predict(cluster_data).collect()
        return results

    #combine word
    def combine_data(self, words=None, result=None):
        len_re = len(result)
        len_w = words.count()
        if len_re != len_w:
            print 'word num :', len_w, ' is not equal result num:', len_re
        if len_re < len_w:
            words = self.sql.createDataFrame(words.take(len_re))

        else:
            result = result[0:len_w]
            print words.count(), len(result)
        result = map(lambda x: str(x), result)
        cluster_re = self.sc.parallelize(result, 1)
        # print cluster_re.collect(),words.map(list).count()
        re = words.map(list).repartition(1).zip(cluster_re).map(lambda p: Row(word=p[0][0], priority=int(p[0][1]),\
         searchcount=int(p[0][3]),cluster=p[1]))
        cluster_sha = self.sql.createDataFrame(re)
        # cluster_sha.show()
        return cluster_sha

    # select Class Word
    def selectWord(self, cluster_sha, top_K=2):
        df = cluster_sha
        select_par = df.groupBy('cluster').agg({
            'searchcount': 'avg',
            'priority': 'avg'
        }).collect()
        ClusterNum = len(select_par)
        clusterWord = []
        for line in select_par:
            cluster_df = df.filter(df.cluster == line[0]).select(
                'word', 'priority', 'searchcount')
            ClassWord = cluster_df.filter(
                cluster_df.searchcount > line[1]).select('word', 'priority')
            ClassWord = ClassWord.filter(
                ClassWord.priority >= line[2]).select('*').limit(top_K)

            KeyWords = cluster_df.filter(
                cluster_df.searchcount < line[1]).select('word', 'priority')
            KeyWords = KeyWords.filter(
                KeyWords.priority >= line[2]).select("*").limit(top_K)
            cluster = {
                'cluster_id': line[0],
                'classWord': ClassWord.toJSON().collect(),
                'keyWord': KeyWords.toJSON().collect()
            }
            clusterWord.append(cluster)
        result = {'ClusterNum': ClusterNum, 'AllCluster': clusterWord}
        return result
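
# Hedged usage sketch for the class above (the appId and genre values are illustrative):
# hql = HqlSpark(language='cn')
# hql.getAllWord('cn')
# matrix = hql.getInPut(appIds=['12345678'], genreIds=['6014'])
# clusters = hql.spark_means(matrix, Kcluster=3)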
Example #54
class SparkEngine(object):
    def __init__(self, sc, debug=False):
        self.export_path = os.environ['COOPERHEWITT_ROOT'] + "/export/"
        self.sc = sc
        # hive requires writable permissions: ~/ephemeral-hdfs/bin/hadoop fs -chmod 777 /tmp/hive
        self.hive_cxt = HiveContext(sc)
        self.sql_cxt  = SQLContext(sc)
        if debug:
            print "{0}\n{1}\n{2}\n".format(sc.master, self.hive_cxt, self.sql_cxt)
            print sc._conf.getAll()
        #TBD destructor Unpersist memory

### functionality to query and create tables
    def _create_df_table(self, schema, frame, name):
        if schema: df = self.hive_cxt.createDataFrame(frame, schema=schema)
        else: df = self.hive_cxt.createDataFrame(frame)
        df.printSchema()
        df.registerTempTable(name)
        self.hive_cxt.cacheTable(name)
        return df

    def _query_temporal_data(self):
        # step 1. create main temporal table
        # n_obs => the first join yields, for each pen entry, one row per existing location entry (dependent on time period)
        samples_temporal_tb = self.hive_cxt.sql("""
            SELECT  s.refers_to_object_id, created, visit_raw,
                    room_floor, room_id, room_name,
                    spot_id, spot_name, spot_description,
                    room_count_objects, room_count_spots, spot_count_objects,
                    abs(datediff(
                        from_utc_timestamp(from_unixtime(created,   "yyyy-MM-dd"), 'US/Eastern'),
                        from_utc_timestamp(from_unixtime(visit_raw, "yyyy-MM-dd"), 'US/Eastern')
                    )) as delta
            FROM samples s
            JOIN temporal t
            ON s.refers_to_object_id = t.refers_to_object_id
            ORDER by s.refers_to_object_id, created, delta
        """)
        samples_temporal_tb.registerTempTable('samplestemporal')
        self.hive_cxt.cacheTable('samplestemporal')
        return samples_temporal_tb

    def _minimize_query(self):
        # From the temporal table, reduce the multiple location rows to the one closest to each sample timestamp
        tb_samples = self.hive_cxt.sql("""
            SELECT *
            FROM (
                SELECT *,
                MIN(delta)   OVER ( PARTITION BY refers_to_object_id, created) AS min_delta,
                row_number() OVER ( PARTITION BY refers_to_object_id, created) AS ranks
                FROM samplestemporal st
                ORDER BY refers_to_object_id
            ) query
            where query.ranks = 1
        """)
        tb_samples = tb_samples.withColumn("meta_store", lit(1))
        tb_samples.registerTempTable('minimizedsamples')
        self.hive_cxt.cacheTable('minimizedsamples')
        return tb_samples

    def execute_query(self, (samples_schema,  samples_frame, samples_name),
                                        (temporal_schema, temporal_frame, temporal_name),
                                        cols):
        self.df_samples       = self._create_df_table(samples_schema,  samples_frame,  samples_name)
        self.df_temporal      = self._create_df_table(temporal_schema, temporal_frame, temporal_name)
        self.tb_meta          = self._query_temporal_data()
        self.tb_meta_min      = self._minimize_query()
        # combine to the original pen data (meta_store indicates if we had object data to integrate)
        self.df_samplesmeta   = self.df_samples.join(self.tb_meta_min, ['refers_to_object_id', 'created'], "left_outer")
        self.df_samplesmeta   = self.df_samplesmeta.fillna({'meta_store': 0})
        self.df_samplesmeta.printSchema()
        # pickle the pandas frame; alternatively we could store it as json or in a parquet columnar format
        dropped_cols = ['delta', 'min_delta', 'ranks'] + cols
        samplesmeta_pd  = self.df_samplesmeta.toPandas()
        samplesmeta_pd  = samplesmeta_pd.drop(dropped_cols, axis=1)
        samplesmeta_pd.to_pickle(self.export_path + "penmeta_spark.pkl")
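        # A hedged alternative to the pickle export above, per the comment: write the
        # joined frame straight to a columnar format (the file name is illustrative).
        # self.df_samplesmeta.write.parquet(self.export_path + "penmeta_spark.parquet")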
Example #55
# Configure Spark Settings

conf = SparkConf()
conf.set("spark.executor.memory", "1g")
conf.set("spark.cores.max", "2")
conf.setAppName("Spark")

# # Initialize SparkContext.

sc = SparkContext('local', conf=conf)
# Test with a data file, I used an auto data file
sqlContext = HiveContext(sc)

sqlContext.setConf("hive.exec.dynamic.partition", "true")
sqlContext.setConf("hive.exec.dynamic.partition.mode", "nonstrict")
sqlContext.sql("CREATE DATABASE IF NOT EXISTS test")
pandas_df = pd.read_csv('D:\\value.csv')
# assuming the file contains a header
# pandas_df = pd.read_csv('file.csv', names = ['column 1','column 2']) # if no header
s_df = sqlContext.createDataFrame(pandas_df)
s_df.count()

#print (s_df.count())

s_df.write.mode("overwrite").saveAsTable("test.value")
#sqlContext.sql("CREATE DATABASE IF NOT EXISTS test")

tab = sqlContext.sql("select * from test.value")
tab.printSchema()
tab.show(5, False)
Example #56
def parsePoint(d): ## wont be able to use line.split here?
    d_copy = deepcopy(d) # I hate using deepcopy so much
    pred = d_copy['success_metric']
    d.pop('success_metric', None)
    values = [float(x) for x in d.values()] ##this block is unusable until we have our Hive Data
    return (pred, Vectors.dense(values))

# training set
trainParsed = sc.parallelize(map(parsePoint, train_dict))
# test set
testParsed = sc.parallelize(map(parsePoint, test_dict))


## create validation set

trainDf = sqlContext.createDataFrame(trainParsed, ["label", "features"])
testDf = sqlContext.createDataFrame(testParsed, ["label", "features"])
lm_model = LinearRegression(featuresCol="features", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6)
lm_model_fit = lm_model.fit(trainDf)
lm_transform = lm_model_fit.transform(trainDf)
results = lm_transform.select(lm_transform['prediction'], lm_transform['label'])
MSE = results.map(lambda (p,l):(p-l)**2).reduce(lambda x,y:x+y)/results.count()
print("Linear Regression training Mean Squared Error = " + str(MSE))

lm_transform = lm_model_fit.transform(testDf)
results = lm_transform.select(lm_transform['prediction'], lm_transform['label'])
MSE = results.map(lambda (p,l):(p-l)**2).reduce(lambda x,y:x+y)/results.count()
print("Linear Regression testing Mean Squared Error = " + str(MSE))

res = results.collect()
predsAndLabels = sc.parallelize([i.asDict().values() for i in res])
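
# A hedged sketch of evaluating predsAndLabels with MLlib's RegressionMetrics,
# assuming each pair comes out ordered as (prediction, label):
# from pyspark.mllib.evaluation import RegressionMetrics
# metrics = RegressionMetrics(predsAndLabels)
# print("Test RMSE = " + str(metrics.rootMeanSquaredError))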
Example #57
	|attr|obj|
	+----+---+
	|   1|  a|
	|   2|  a|
	|   3|  a|
	|   1|  b|
	|   2|  b|
	|   3|  b|
	|   1|  c|
	|   3|  c|
	+----+---+
"""

schema = StructType([StructField("attr", StringType(), True), StructField("obj", StringType(), True)])

aoDF = sqlCtx.createDataFrame(aoPair, schema)

#Window that moves over rows of same obj and sorted by attr

window = Window.orderBy("attr").partitionBy("obj")

## Prev column contains previous attr of the same object
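## A hedged sketch of how that prev column could be derived, assuming
## pyspark.sql.functions is imported as F (aoDF and window are defined above):
## aoDF_prev = aoDF.withColumn("prev", F.lag("attr").over(window))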
"""
	Transformed Table	
	+----+---+----+
	|attr|obj|prev|
	+----+---+----+
	|   1|  a|null|
	|   2|  a|   1|
	|   3|  a|   2|
	|   1|  b|null|
print 'Number of rows in the table {0}'.format(df_raw.count())

# removing all rows not containing numbers for the score variable

# function to test whether a string can be parsed as an integer
def CheckValidScore(s):
    try:
        int(s)
        return True
    except ValueError:
        return False

# creating an RDD by filtering out invalid scores
df_clean_rdd = df_raw.rdd.filter(lambda row: CheckValidScore(row.score))
# creating a dataframe from the RDD
df_clean = sqlCtx.createDataFrame(df_clean_rdd)
print 'Number of rows in table after cleaning {0}'.format(df_clean.count())

# converting the data types for score column
df_clean = df_clean.selectExpr('provider_id','measure_id','cast(score as int) as score')

# get the maximum score per measure_id and name the column as 'max_score'
df_max_scores = df_clean.groupBy('measure_id').max().collect()
df_max_scores = sc.broadcast(df_max_scores)

# function to extract max_score for each measure_id
def get_max_score(id):
    return [score[1] for score in df_max_scores.value if score[0] == id][0]

# creating a new RDD containing an extra column for the normalized score,
# i.e. the ratio of the current score to the maximum score for the measure_id
Example #59
class Unittest_HWM_Allocation_SortedOrder1(unittest.TestCase):
    def setUp(self):
        warnings.simplefilter("ignore", ResourceWarning)
        fpath = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
        with open(fpath +
                  '/data_source/bookings_overall.json') as bookings_source:
            self.bookings = json.load(bookings_source)
        with open(fpath + '/data_source/cfg.json') as cfg_source:
            self.cfg = json.load(cfg_source)
        today = '20180402'
        self.days = optimizer.util.get_days_from_bookings(today, self.bookings)
        self.sc = SparkContext.getOrCreate()
        self.hive_context = HiveContext(self.sc)
        self.schema = optimizer.util.get_common_pyspark_schema()

    def compare_two_dfs(self, pandas_df_expected, df_to_test_rows):
        df = self.hive_context.createDataFrame(df_to_test_rows, self.schema)
        df_allocated = optimizer.algo.hwm.hwm_allocation(
            df, self.bookings, self.days)
        pandas_df_allocated = df_allocated.select("*").toPandas()
        print(pandas_df_expected)
        print(pandas_df_allocated)

        return self.assertTrue(
            assert_frame_equal(pandas_df_expected,
                               pandas_df_allocated,
                               check_dtype=False) == None)

    def test_hwm_allocation_case1(self):
        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = [
            '20180405', ['b8'], ['b6'], 3239, {
                'b8': 88
            }
        ]

        df_to_test_rows = [(['20180405', ['b8'], ['b6'], {}, 3239])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case2(self):
        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = [
            '20180405', ['b11', 'b12'], [], 8900, {
                'b11': 11,
                'b12': 12
            }
        ]

        df_to_test_rows = [(['20180405', ['b11', 'b12'], [], {}, 8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case3(self):
        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = [
            '20180405', ['b6', 'b7', 'b10'], [], 8900, {
                'b6': 66,
                'b7': 77,
                'b10': 100
            }
        ]

        df_to_test_rows = [(['20180405', ['b6', 'b7', 'b10'], [], {}, 8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case4(self):
        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = [
            '20180405', ['b8'], ['b6', 'b7', 'b9'], 3239, {
                'b8': 88
            }
        ]
        pandas_df_expected.loc[1] = [
            '20180405', ['b6', 'b7'], ['b8', 'b9'], 8900, {
                'b6': 66,
                'b7': 77
            }
        ]

        df_to_test_rows = [(['20180405', ['b8'], ['b6', 'b7', 'b9'], {},
                             3239]),
                           (['20180405', ['b6', 'b7'], ['b8', 'b9'], {},
                             8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case5(self):
        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = [
            '20180405', ['b6', 'b7', 'b10', 'b11', 'b12'], ['b8', 'b9'], 8900,
            {
                'b6': 66,
                'b7': 77,
                'b10': 100,
                'b11': 11,
                'b12': 12
            }
        ]

        df_to_test_rows = [([
            '20180405', ['b6', 'b7', 'b10', 'b11', 'b12'], ['b8', 'b9'], {},
            8900
        ])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case6(self):
        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = [
            '20180405', ['b13', 'b12'], [], 8900, {
                'b13': 8900
            }
        ]

        df_to_test_rows = [(['20180405', ['b13', 'b12'], [], {}, 8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case7(self):
        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = [
            '20180405', ['b15', 'b14'], [], 8900, {
                'b15': 8900
            }
        ]

        df_to_test_rows = [(['20180405', ['b15', 'b14'], [], {}, 8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case8(self):
        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = [
            '20180405', ['b17', 'b16'], [], 8900, {
                'b17': 4450,
                'b16': 4450
            }
        ]

        df_to_test_rows = [(['20180405', ['b17', 'b16'], [], {}, 8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case9(self):
        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = [
            '20180405', ['b18', 'b17'], [], 8900, {
                'b18': 4451,
                'b17': 4449
            }
        ]

        df_to_test_rows = [(['20180405', ['b18', 'b17'], [], {}, 8900])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)

    def test_hwm_allocation_case10(self):
        pandas_df_expected = pandas.DataFrame(
            columns=['day', 'ands', 'minus', 'amount', 'allocated'])
        pandas_df_expected.loc[0] = [
            '20180405', ['b6', 'b7', 'b10', 'b12', 'b16', 'b17', 'b18'],
            ['b8', 'b9'], 8900, {
                'b18': 4451,
                'b17': 4449
            }
        ]  # b6, b7, b10, b12, b16, b17, b18 have the same attributes.

        df_to_test_rows = [([
            '20180405', ['b6', 'b7', 'b10', 'b12', 'b16', 'b17', 'b18'],
            ['b8', 'b9'], {}, 8900
        ])]
        return self.compare_two_dfs(pandas_df_expected, df_to_test_rows)
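
# Not part of the original test module: a conventional entry point so the cases above
# can be run directly against a local Spark installation; invoke it as whatever file
# name this test module is saved under.
if __name__ == "__main__":
    unittest.main(verbosity=2)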
encounters = parts.map(lambda p: (p[0], datetime.strptime(p[1], "%Y-%m-%d %H:%M:%S"), p[2], p[3].strip().replace(',','_')))

fields = [StructField("PATIENT_NUM", StringType(), True),
	# START_DATE is parsed above with strptime into a full datetime, so TimestampType
	# (imported from pyspark.sql.types) keeps the time-of-day component; DateType would
	# silently truncate it and weaken the within-day ordering used later
	StructField("START_DATE", TimestampType(), True),
	StructField("ENCOUNTER_NUM", StringType(), True),
	StructField("ICD9S", StringType(), True)]
schema_encounters = StructType(fields)

# fields = [StructField("PATIENT_NUM", StringType(), True),
# 	StructField("ENCOUNTER_NUM", StringType(), True),
# 	StructField("START_DATE", StringType(), True),
# 	Seq(StructField("ICD9S", ArrayType(StringType(), True), True) ]
# schema_encounters = StructType(fields)

# Apply the schema to the RDD.
schemaEncounters = sqlContext.createDataFrame(encounters, schema_encounters)
schemaEncounters.printSchema()
schemaEncounters.registerTempTable("encounters")

# order data by patient, start date, _then_ encounter

encounteres_ordered = sqlContext.sql("select PATIENT_NUM, START_DATE, ENCOUNTER_NUM, ICD9S from encounters order by PATIENT_NUM, START_DATE, ENCOUNTER_NUM")
encounteres_ordered.registerTempTable("encounteres_ordered")

#sqlContext.sql("select collect_list(ICD9S) as icd9s from encounteres_ordered group by PATIENT_NUM").show(5)

#sqlContext.sql("select PATIENT_NUM, collect_list(ICD9S) as icd9s from encounteres_ordered group by PATIENT_NUM").show(20, truncate=False)

rdd = sqlContext.sql("select collect_list(ICD9S) as icd9s from encounteres_ordered group by PATIENT_NUM").rdd

def splitter(p):