Example #1
def yichche_app_day(data_path, from_i=1, to_i=31):
    sqlContext = HiveContext(sc)
    sqlContext.sql("use usercenter_dw")
    for i in range(from_i, to_i):
        date = time.strftime(
            "%Y-%m-%d", time.localtime(int(time.time() - i * 60 * 60 * 24)))
        df = sqlContext.sql(
            "select user_id as userid,appname,etl_dt from yyh_app_tmp_4 where etl_dt = '"
            + date + "'")
        tag_list = [
            '娱乐', '科技', '购物', '生活', '企业办公', '时尚', '旅游', '游戏', '财经', '女性', '体育',
            '摄影', '汽车', '美容', '母婴育儿', '电影', '搞笑', '健康', '教育', '音乐', '资讯'
        ]
        appname2tag_dict = load_appname2tag(data_path)
        exprs = [
            max(when(col("tag") == x, lit(1)).otherwise(lit(0))).alias(x)
            for x in tag_list
        ]
        df = df.select("userid", "appname").where(
            col("appname").isin(list(appname2tag_dict.keys()))).distinct()
        df = df.withColumn(
            "tag",
            map_tag(appname2tag_dict)('appname')).select(
                "userid",
                'tag').filter((col('tag') != '') & (col('tag').isNotNull())
                              & (col('tag') != 'null')).distinct()
        cur_date = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
        df = df.groupby("userid").agg(*exprs)
        df = df.withColumn('source',
                           lit('app')).withColumn('update_dt', lit(cur_date))
        df.registerTempTable('tab_name')
        sqlContext.sql(
            "insert into table app_usertag_day partition(etl_dt = '" + date +
            "' ) select * from tab_name ")
        print('finished', date)
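
The helpers load_appname2tag and map_tag are defined elsewhere in the source module. A minimal, hypothetical sketch of what they might look like is shown below, assuming the mapping file is a tab-separated appname/tag list and that map_tag is a UDF factory wrapping a dictionary lookup; the original implementation may differ.

from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def load_appname2tag(data_path):
    # assumed file format: one "appname<TAB>tag" pair per line
    appname2tag = {}
    with open(data_path) as f:
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if len(parts) == 2:
                appname2tag[parts[0]] = parts[1]
    return appname2tag

def map_tag(appname2tag_dict):
    # UDF factory: look the app name up in the dict, return '' when unknown
    return udf(lambda appname: appname2tag_dict.get(appname, ''), StringType())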
Example #2
def match_all(df, data_path):
    '''
    Match all Youku viewing records against the iQIYI tag scores
    and store the result in the yk2iqytag table.
    :param df:
    :param data_path:
    :return:
    '''
    sqlContext = HiveContext(sc)
    sqlContext.sql("use usercenter_dw")
    # df = sqlContext.sql("select * from youku_mediadata")
    df = df.withColumn(
        "yk_movie",
        translate('title')("mediadata")).filter(col('yk_movie') != '')
    iqiyi_tags = [
        'entertainment', 'technology', 'shopping', 'lifestyle', 'business',
        'fashion', 'tourism', 'game', 'finance', 'female', 'sports',
        'photography', 'car'
    ]
    iqiyi2tag_df, iqy = iqiyi2tag(data_path, iqiyi_tags)
    addcols = [
        'beauty', 'childcare', 'movie', 'funny', 'health', 'education',
        'music', 'news'
    ]
    for i in addcols:
        iqiyi2tag_df = iqiyi2tag_df.withColumn(i, lit(0)).cache()
    yk_rdd = df.select('mediadata').rdd.map(list)
    yk_rdd = yk_rdd.map(lambda xs: [xs[0], find(xs[0], iqy)])
    df = yk_rdd.toDF(['yktag', 'ykmovie']).filter(col('ykmovie') != '')
    df = df.join(iqiyi2tag_df,
                 df.ykmovie == iqiyi2tag_df.iqytag).drop('ykmovie')
    df.registerTempTable('tab_name')
    sqlContext.sql("insert into table yk2iqytag select * from tab_name")
Example #3
def process_rdd(time, rdd):
    print("----------- %s -----------" % str(time))
    try:
        sql_context = HiveContext(rdd.context)
        # Convert the RDD to Row RDD
        row_rdd = rdd.map(lambda w: Row(tweet=w, score=analyzeSentiment(w)))
        schema = StructType([
            StructField("tweet", StringType(), True),
            StructField("score", FloatType(), True)
        ])
        # Create a DF with the specified schema
        new_tweets_df = sql_context.createDataFrame(row_rdd, schema=schema)
        # Register the dataframe as table
        new_tweets_df.registerTempTable("new_tweets")
        # Insert new tweets,scores into table tweets
        sql_context.sql("INSERT INTO TABLE tweets SELECT * FROM new_tweets")
        # Get all the tweets from the table using SQL
        tweets_sentiment_df = sql_context.sql("SELECT * FROM tweets")
        tweets_sentiment_df.show()

        # Sends the tweets and their sentiment score to the dashboard
        send_df_to_dashboard(tweets_sentiment_df)
    except Exception as e:
        print("Error: %s" % e)
Example #4
def momo_user_tag(data_path, month=None, date=None):
    sqlContext = HiveContext(sc)
    sqlContext.sql("use usercenter_dw")
    df = sqlContext.sql("select userid,mediadata from yyh_momo_tmp")
    df = df.withColumn("momotag", translate('usertag')("mediadata")).select("userid", 'momotag').filter(col('momotag') != '')\
        .withColumn('momotag', explode(split('momotag', ',')))\
        .filter((length('momotag') == 5))\
        .withColumn('tag',lit(''))
    for item in momo_tag_list:
        df = df.withColumn(
            'tag',
            when(col('momotag') == item[0],
                 item[1]).otherwise(col('tag'))).cache()
    tag_list = [
        '娱乐', '科技', '购物', '生活', '企业办公', '时尚', '旅游', '游戏', '财经', '女性', '体育',
        '摄影', '汽车', '美容', '母婴育儿', '电影', '搞笑', '健康', '教育', '音乐', '资讯'
    ]
    exprs = [
        max(when(col("tag") == x, lit(1)).otherwise(lit(0))).alias(x)
        for x in tag_list
    ]
    cur_date = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
    df = df.groupby("userid").agg(*exprs)\
        .withColumn('source', lit('momo'))\
        .withColumn('update_dt', lit(cur_date))
    #if month != None:
    #   df = df.withColumn('etl_month', lit(month))
    #if date != None:
    #   df = df.withColumn('etl_dt', lit(date))
    df.registerTempTable('tab_name')
    sqlContext.sql(
        "insert into table momo_usertag_month partition(etl_month = '" +
        month + "' ) select * from tab_name")
Example #5
def momo_day(data_path, from_i=1, to_i=31):
    sqlContext = HiveContext(sc)
    sqlContext.sql("use usercenter_dw")
    for i in range(from_i, to_i):
        date = time.strftime(
            "%Y-%m-%d", time.localtime(int(time.time() - i * 60 * 60 * 24)))
        get_label(date)
Example #6
def do_ets_task(sc, ets_dburl_env, wfc):
    # define the customer identifiers
    cust_no = '1'
    isvalid = '1'
    etsTempTable = wfc
    ets_url = ets_dburl_env[wfc[:-2]]['dst']
    slave_url = ets_dburl_env[wfc[:-2]]['src']
    dbinfo = load_source('getdbinfo', os.path.join(os.path.dirname(__file__), 'Utils.py')).getdbinfo(slave_url)
    tabledict = load_source('query_sql_slave', os.path.join(os.path.dirname(__file__), 'Utils.py')).query_sql_slave(dbinfo)
    slaveTempTable = tabledict.get(wfc[:-2])
    driver = "com.mysql.jdbc.Driver"
    sqlContext = HiveContext(sc)
    # driver = "com.mysql.jdbc.Driver"
    dff = sqlContext.read.format("jdbc").options(url=slave_url, dbtable=slaveTempTable, driver=driver).load()
    dff.registerTempTable(slaveTempTable)
    dft = sqlContext.read.format("jdbc").options(url=ets_url, dbtable=etsTempTable, driver=driver).load()
    dft.registerTempTable(etsTempTable)
    ds_ets = sqlContext.sql(" select max(updatets) as max from %s " % (etsTempTable))
    pp = ds_ets.collect()[0]
    max_updates = pp.max
    slave_sql = ''
    try:
        if max_updates is not None:
            print(u"ets库中的最大时间是:" + str(max_updates))
            slave_sql = " select id, examineeid, examid, roomid, stationid, examinerid, totalscore, begintime ,endtime,scoresheetcode,status, updatetime" \
                        "  from %s where `updatetime` > '%s' " % (slaveTempTable, max_updates)
        else:
            print(u"本次为初次抽取")
            slave_sql = " select id, examineeid, examid, roomid, stationid, examinerid, totalscore, begintime ,endtime,scoresheetcode,status, updatetime" \
                        " from %s " % (slaveTempTable)
        ds_slave = sqlContext.sql(slave_sql)
        print(u'Number of matching records in slave: %s' % (ds_slave.count()))
        now_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(u'Assembling the data...')
        src_fields = json.dumps({'osce_score': ['id', 'examineeid', 'examid', 'roomid', 'stationid', 'examinerid',
                                         'totalscore', 'begintime', 'endtime', 'scoresheetcode', 'status', 'updatetime']})
        # field values
        filedvlue = ds_slave.map(lambda row: (row.id, row.examineeid, row.examid, row.roomid, row.stationid,
                                              row.examinerid, row.totalscore, str(row.begintime), str(row.endtime),
                                              row.scoresheetcode, row.status, cust_no, isvalid,
                                              md5(row), now_time, str(row.updatetime)))
        # build the columns
        schemaString = "id,examineeid,examid,roomid,stationid,examinerid,totalscore,begintime," \
                       "endtime,scoresheetcode,status,cust_no,isvalid,src_fields_md5,createts,updatets"
        fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split(",")]
        schema = StructType(fields)
        # create a DataFrame from the column names and field values
        schemaObj = sqlContext.createDataFrame(filedvlue, schema)
        print(u'Data assembly finished...')
        # print schemaPeople
        # for row in schemaPeople:
        #     print row.id
        print(u'Writing the data...')
        # write into the database
        schemaObj.write.insertInto(etsTempTable, overwrite=False)
        print(u'Write finished')
    except Exception as e:
        # e.message is not supported on Python 2.6
        print(str(e))
        raise Exception(str(e))
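
The md5(row) helper that fills src_fields_md5 is not shown. A hedged sketch (an assumption about the original) hashes the concatenated source field values:

import hashlib

def md5(row):
    # hash the source columns in a fixed order to build src_fields_md5
    src_values = (row.id, row.examineeid, row.examid, row.roomid, row.stationid,
                  row.examinerid, row.totalscore, row.begintime, row.endtime,
                  row.scoresheetcode, row.status, row.updatetime)
    joined = '|'.join(str(v) for v in src_values)
    return hashlib.md5(joined.encode('utf-8')).hexdigest()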
Example #7
def main():
    sys.stdout = open("out_main", "w")  # direct output to file
    print('main starting: {0}'.format(timestamp()), flush=True)
    ''' create the base_rdd '''
    spark = ps.sql.SparkSession.builder \
                  .enableHiveSupport() \
                  .appName('pyspark') \
                  .getOrCreate()
    spark.sparkContext.setLogLevel("WARN")
    sc = spark.sparkContext
    sqlContext = HiveContext(sc)
    sqlContext.sql("use default")

    print('creating base_rdd, t = {0}'.format(timestamp()), flush=True)
    ''' BaseETL.create_base_rdd()
        :param source:  'file', 's3'
        :param type:    'mini', 'train', 'valid'  (mini is local only)
        :param n:   count  of files to load
        :returns:  base_rdd '''
    base_rdd = BaseETL.create_base_rdd(sc=sc, source='s3', type='train', n=349) \
                      .map(lambda base: (base[1], base[2][0], base[2][1])) \
                      .cache()
    print('finished, t = {0}'.format(timestamp()), flush=True)
    ''' pick pipeline to run '''
    # pipeline_spender(sc, base_rdd)   # grid search randfor classifier
    # pipeline_spend_amt(sc, base_rdd)   # grid search randfor regressor

    print('end of main: {0}'.format(timestamp()))
    sc.stop()
Example #8
def match_all_df(data_path):
    sqlContext = HiveContext(sc)
    sqlContext.sql("use usercenter_dw")
    df = sqlContext.sql("select * from youku_mediadata")
    df = df.withColumn("yk_movie",
                       translate('title')("mediadata")).filter(
                           col('yk_movie') != '').withColumn(
                               'movie',
                               lit('')).persist(StorageLevel.DISK_ONLY)
    iqiyi_tags = [
        '娱乐', '科技', '购物', '生活', '企业办公', '时尚', '旅游', '游戏', '财经', '女性', '体育',
        '摄影', '汽车'
    ]
    iqiyi2tag_df, iqy = iqiyi2tag(data_path, iqiyi_tags)
    for ele in iqy:
        df1 = df.withColumn(
            'movie',
            when(col('yk_movie').like('%' + ele + '%'),
                 ele).otherwise(col('movie'))).filter(
                     col('movie') != '').select('mediadata', 'movie')
        df = df.withColumn(
            'movie',
            when(col('yk_movie').like('%' + ele + '%'),
                 ele).otherwise(col('movie'))).filter(
                     col('movie') == '').persist(StorageLevel.DISK_ONLY)
        df1.registerTempTable('tab_name')
        sqlContext.sql("insert into table youku_iqy select * from tab_name ")
Example #9
def yichche_app(data_path, month):
    sqlContext = HiveContext(sc)
    sqlContext.sql("use usercenter_dw")
    # df = sqlContext.sql("select distinct user_id as userid, appname,etl_dt from t01_sdk_device_app_info where etl_dt between '2018-10-18' and '2018-10-31'")
    df = sqlContext.sql(
        "select user_id as userid,appname,etl_dt from yyh_app_tmp_4")
    tag_list = [
        '娱乐', '科技', '购物', '生活', '企业办公', '时尚', '旅游', '游戏', '财经', '女性', '体育',
        '摄影', '汽车', '美容', '母婴育儿', '电影', '搞笑', '健康', '教育', '音乐', '资讯'
    ]
    appname2tag_dict = load_appname2tag(data_path)
    exprs = [
        max(when(col("tag") == x, lit(1)).otherwise(lit(0))).alias(x)
        for x in tag_list
    ]
    df = df.select("userid", "appname").where(
        col("appname").isin(list(appname2tag_dict.keys())))
    df = df.withColumn("tag",
                       map_tag(appname2tag_dict)('appname')).select(
                           "userid", 'tag').filter((col('tag') != '')
                                                   & (col('tag').isNotNull())
                                                   & (col('tag') != 'null'))
    cur_date = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
    df = df.groupby("userid").agg(*exprs)
    df = df.withColumn('source',
                       lit('app')).withColumn('update_dt', lit(cur_date))
    df.registerTempTable('tab_name')
    sqlContext.sql(
        "insert into table app_usertag_month partition(etl_month = '" + month +
        "' ) select * from tab_name ")
Example #10
def get_context_test():
    conf = SparkConf()
    sc = SparkContext('local[1]', conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex_test""")
    sql_context.setConf("spark.sql.shuffle.partitions", "1")
    return sc, sql_context
Example #11
def CreateTable(table_mode='external',
                table_name=None,
                hdfs_host=None,
                hdfs_port=None,
                path=None,
                other_options=''):
    ''' Create a hive table based on the schema inferred from SchemaExtractor

    :param table_mode: 'external' or '' (an empty string creates an internal, i.e. managed, table)
    :param table_name: String
    :param hdfs_host:
    :param hdfs_port:
    :param path: path to the parquet file
    :param other_options: including serds, etc.
    :return:
    '''
    schema = SchemaExtractor(hdfs_host=hdfs_host,
                             hdfs_port=hdfs_port,
                             path=path)
    table_location = 'hdfs://' + hdfs_host + path
    hiveContext = HiveContext(sc)
    hive_query = """create {table_mode} table {table_name}({schema}) {options} stored as parquet location '{location_path}'""".format(
        table_mode=table_mode,
        table_name=table_name,
        schema=schema,
        location_path=table_location,
        options=other_options)
    print(hive_query)
    hiveContext.sql(hive_query)
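
A hedged usage sketch of CreateTable; the host, port, path and table name below are placeholders:

CreateTable(table_mode='external',
            table_name='events_parquet',
            hdfs_host='namenode.example.com',
            hdfs_port=8020,
            path='/data/events/',
            other_options='')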
Example #12
def create_yk2iqytag():
    sqlContext = HiveContext(sc)
    sqlContext.sql("use usercenter_dw")
    yk_mediadata = sqlContext.sql(
        "select distinct mediadata from t2pdm_data.t05_chehui_dsp_log_v2 where etl_dt between '2018-11-01' and '2018-11-30' and channelid=3 "
    )
    match_all(yk_mediadata, data_path)
Example #13
def get_context_test():
    conf = SparkConf()
    sc = SparkContext('local[1]', conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex_test""")
    sql_context.setConf("spark.sql.shuffle.partitions", "1")
    return sc, sql_context
Example #14
		def cal_null_percent(self):
			from pyspark import SparkConf,SparkContext
			from pyspark.sql import HiveContext
			conf = SparkConf().setAppName(str(time.time())[-6:])
			try:
				sc.stop()
			except:
				pass
			sc = SparkContext(conf=conf)
			hive_context = HiveContext(sc)
			final_d = {}
			for yesterday in self.date_list:
				sql_0 = '''
				SELECT COUNT(1)
				FROM {1}
				WHERE pt=\'{0}\'
				AND model_type=13001
				'''.format(yesterday, self.table)
				s = hive_context.sql(sql_0).collect()[0][0]
				d = {}
				for column_name in self.column_name_list:
					sql_1 = '''
					SELECT COUNT(1)
					FROM {2}
					WHERE pt=\'{0}\'
					AND model_type=13001
					AND {1} IS NOT NULL
					AND {1} != ''
					'''.format(yesterday, column_name, self.table)
					t = hive_context.sql(sql_1).collect()[0][0]
					d[column_name] = "%.5f%%" % ((1 - float(t) / s) * 100)
				final_d[yesterday] = d
			sc.stop()  # stop the context only after all dates have been processed
			return final_d
Example #15
def clean_logs(cfg, df_persona, df_keywords, log_table_names):
    sc = SparkContext.getOrCreate()
    sc.setLogLevel(cfg['log']['level'])
    hive_context = HiveContext(sc)
    cfg_clean = cfg['pipeline']['main_clean']
    conditions = cfg_clean['conditions']
    start_date, end_date, load_minutes = load_batch_config(cfg)

    timer_start = timeit.default_timer()
    showlog_table, showlog_output_table, clicklog_table, clicklog_output_table = log_table_names
    starting_time = datetime.strptime(start_date, "%Y-%m-%d")
    ending_time = datetime.strptime(end_date, "%Y-%m-%d")

    batched_round = 1
    while starting_time < ending_time:
        time_start = starting_time.strftime("%Y-%m-%d %H:%M:%S")
        batch_time_end = starting_time + timedelta(minutes=load_minutes)
        batch_time_end = min(batch_time_end, ending_time)
        time_end = batch_time_end.strftime("%Y-%m-%d %H:%M:%S")
        print_batching_info("Main clean", batched_round, time_start, time_end)

        command = """select did, adv_id, adv_type as media, slot_id, 
                    spread_app_id, device_name, net_type, 
                    adv_bill_mode_cd as price_model, {time} as action_time 
                    from {table} where {time} >= '{time_start}' and {time} < '{time_end}'"""

        df_clicklog_batched = hive_context.sql(
            command.format(time='click_time',
                           table=clicklog_table,
                           time_start=time_start,
                           time_end=time_end))

        df_showlog_batched = hive_context.sql(
            command.format(time='show_time',
                           table=showlog_table,
                           time_start=time_start,
                           time_end=time_end))

        mode = 'overwrite' if batched_round == 1 else 'append'
        is_empty_showlog_batched = df_showlog_batched.rdd.isEmpty()
        if not is_empty_showlog_batched:
            df_showlog_batched = clean_batched_log(df_showlog_batched,
                                                   df_persona, conditions,
                                                   df_keywords)
            write_to_table(df_showlog_batched, showlog_output_table, mode=mode)
        is_empty_clicklog_batched = df_clicklog_batched.rdd.isEmpty()
        if not is_empty_clicklog_batched:
            df_clicklog_batched = clean_batched_log(df_clicklog_batched,
                                                    df_persona, conditions,
                                                    df_keywords)
            write_to_table(df_clicklog_batched,
                           clicklog_output_table,
                           mode=mode)

        batched_round += 1
        starting_time = batch_time_end

    timer_end = timeit.default_timer()
    print('Total batching seconds: ' + str(timer_end - timer_start))
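
The cfg dictionary is built elsewhere; based only on the keys read in clean_logs, it might look roughly like the fragment below (all values are placeholders, and load_batch_config is assumed to read the batch window from the same dictionary):

cfg = {
    'log': {'level': 'WARN'},
    'pipeline': {
        'main_clean': {
            # passed straight through to clean_batched_log()
            'conditions': {},
        },
    },
}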
Example #16
def main(sc, SQLContext):

    sqlContext = HiveContext(sc)

    sc.setLogLevel("ERROR")

    query = "USE {0}"

    query = query.format(db_name)

    sqlContext.sql(query)

    tables = sqlContext.sql("SHOW TABLES")

    tableNames = tables.select("tableName").rdd.map(lambda r: r)

    tableNames = tableNames.map(lambda x: x.tableName).collect()

    tableNames = [str(i) for i in tableNames]

    schema_empty_df = StructType([StructField("table_ddl",StringType(),True)])

    empty_df = sqlContext.createDataFrame(sc.emptyRDD(), schema_empty_df)

    df1 = empty_df

    for i in tableNames:
        show_query = "show create table "+i
        drop_query = "drop table "+i+";\n"
        describe_query = "describe formatted "+i
        seperator = ";\n"
        try:
            rdd = sc.parallelize([drop_query])
            newRDD = rdd.map(lambda x:{"table_ddl":x})
            newDF = rdd.map(lambda p: Row(table_ddl=p)).toDF()
            df = df1.unionAll(newDF)
            desc = sqlContext.sql(describe_query)
            desc_1 = desc.select(['data_type']).where("col_name='Location'")
            desc_2 = desc_1.rdd.map(lambda x:x.data_type).collect()
            desc_3 = [str(i) for i in desc_2]
            desc_4 = ''.join(desc_3)
            df0 = sqlContext.sql(show_query)
            show_1 = df0.rdd.map(lambda x:x.createtab_stmt).collect()
            show_2 = [str(i) for i in show_1]
            show_3 = ''.join(show_2)
            if show_3.find("LOCATION '") < 0:
                loc_query = "LOCATION '"+desc_4+"'"+"\n TBLPROPERTIES ("
                final_create_table=show_3.replace("TBLPROPERTIES (", loc_query)
            else:
                final_create_table = show_3
            list_final = [final_create_table]
            rdd_create_table = sc.parallelize(list_final)
            df_create_table = rdd_create_table.map(lambda p: Row(create_table_ddl=p)).toDF()
            df1 = df.unionAll(df_create_table)
            rdd1 = sc.parallelize([seperator])
            newRDD1 = rdd1.map(lambda x:{"delim":x})
            newDF1 = sqlContext.createDataFrame(newRDD1, ["delim"])
            df1 = df1.unionAll(newDF1)
        except Exception as e:
            # minimal handler so the loop keeps going when a table cannot be processed
            print("Failed to build DDL for table %s: %s" % (i, e))
Example #17
def hiveSaveNews(dfNewsContents, table_name):
    from pyspark.sql import HiveContext

    hiveContext = HiveContext(sc)
    tmpDf = hiveContext.createDataFrame(
        dfNewsContents[['news_code', 'title', 'site', 'writing_time', 'preproc_content', 'img', 'content', 'company']])
    tmpDf.registerTempTable("tmpDf")
    hiveContext.sql("insert into table {table_name} select * from tmpDf".format(table_name=table_name))
Example #18
    def process(time, rdd):
        print("========= %s =========" % str(time))

        try:
            sqlContext = HiveContext(sc)
            # FIX: memory error Spark 2.0 bug ( < 2.0 )
            sqlContext.setConf("spark.sql.tungsten.enabled","false")

            # v2.01 spark = SparkSession.builder \
            #.master("local") \
            #.appName("Word Count") \
            #.config("spark.some.config.option", "some-value") \
            #.getOrCreate()
            # Get the singleton instance of SparkSession
            #nzs v1.0 spark = getSparkSessionInstance(rdd.context.getConf())

            if rdd.count() < 1:
                return;

            # Convert RDD[String] to RDD[Row] to DataFrame
            sqlRdd = rdd.map( lambda x: json.loads(x)).map(lambda r: Row( metrics=r["metrics"], name=r["name"], value=r["value"] ) )
            wordsDataFrame = sqlContext.createDataFrame(sqlRdd)
            wordsDataFrame.show()
            # Creates a temporary view using the DataFrame.			
            wordsDataFrame.registerTempTable("starwarstemp")
            # Creates a query and get the alam dataset using the temp table 
            wordCountsDataFrame = sqlContext.sql("select * from  starwarstemp")
            wordCountsDataFrame.printSchema()


            with open(SparkFiles.get('webinar_streaming.sql')) as test_file:
                alertsql=test_file.read()
                #logging.info(alertsql)

            alertDataFrame = sqlContext.sql(alertsql)			
            alertDataFrame.show()
            alertDataFrame.printSchema()			

            # save all values to HBASE 
            # IF NEED FILTER LATER .filter(lambda x: str(x["metrics"])=='action-credit-limit') \
            # create HBASE mapper 
            rowRdd = rdd.map( lambda x: json.loads(x))\
                .map(lambda r: ( str(r["metrics"]) ,[ str(r["name"])+"-"+datetime.datetime.now().strftime("%Y%m%d%H%M%S"), "action" if str(r["metrics"])=="action-credit-limit" else  "healt", str(r["metrics"]), str(r["value"])] ))
            
            table = 'starwarsinbox'
            host = 'node-master2-KcVkz'
            keyConv = "org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter"
            valueConv = "org.apache.spark.examples.pythonconverters.StringListToPutConverter"
            conf = {"hbase.zookeeper.quorum": host,
            "hbase.mapred.outputtable": table,
            "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
            "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
            "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}
            rowRdd.saveAsNewAPIHadoopDataset(conf=conf,keyConverter=keyConv,valueConverter=valueConv)
        except Exception as merror:
            print (merror)
            raise
Example #19
def main():
    #Setting up spark configuration
    conf = SparkConf().setAppName('label')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'
    sqlContext = HiveContext(sc)
    #Reading Input file path and Output file path
    input1 = sys.argv[1]
    input2 = sys.argv[2]
    output = sys.argv[3]
    #Setting the schema for the data frame
    customSchema = StructType([
        StructField("id", LongType(), False),
        StructField("timeSet", StringType(), False),
        StructField("country", StringType(), False),
        StructField("province", StringType(), False),
        StructField('city', StringType(), False),
        StructField('latitude', FloatType(), False),
        StructField('longtitude', FloatType(), False)
    ])
    poi_schema = StructType([
        StructField("poi_id", StringType(), False),
        StructField('poi_latitude', FloatType(), False),
        StructField('poi_longtitude', FloatType(), False)
    ])
    #Reading data into data frame
    df_input1 = sqlContext.read.format('com.databricks.spark.csv').options(
        header='false').load(input1, schema=customSchema)
    df_input2 = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true').load(input2, schema=poi_schema)
    #Performing join after Broadcast
    distance_calc = df_input1.join(broadcast(df_input2)).withColumn(
        'distance_in_km',
        cal_distance(df_input1.latitude, df_input1.longtitude,
                     df_input2.poi_latitude, df_input2.poi_longtitude))
    distance_calc.registerTempTable("table1")
    #Cluster the data  based on the column to group to reduce shuffling
    distance_calc = sqlContext.sql("SELECT * FROM table1 CLUSTER BY id")
    #Minimum distance is calculated
    poi_min_distance = distance_calc.groupBy('id').agg(
        min('distance_in_km').alias('distance_in_km'))
    #assigning the labels
    label_data = distance_calc.join(poi_min_distance,
                                    ['id', 'distance_in_km']).select(
                                        'id', 'timeSet', 'country', 'province',
                                        'city', 'latitude', 'longtitude',
                                        'poi_id', 'poi_latitude',
                                        'poi_longtitude', 'distance_in_km')
    label_data.registerTempTable("table2")
    #assigning the rank to remove duplicate within subgroup
    df_result = sqlContext.sql(
        "select id,timeSet,country,province,city,latitude,longtitude,poi_id,poi_latitude,poi_longtitude,distance_in_km,rank() over ( partition by id order by poi_id) as rank from table2"
    )
    df_result = df_result.where(df_result['rank'] == 1)
    #Saving the result
    df_result.coalesce(1).write.format('com.databricks.spark.csv').save(output)
Example #20
def get_data(from_i, to_i):
    sqlContext = HiveContext(sc)
    for i in range(from_i, to_i):
        date = time.strftime(
            "%Y-%m-%d", time.localtime(int(time.time() - i * 60 * 60 * 24)))
        sqlContext.sql(
            "insert into table usercenter_dw.yyh_momo_tmp select distinct imeimd5 as userid, mediadata,etl_dt from t2pdm_data.t05_chehui_dsp_log_v2 "
            "where etl_dt ='" + date +
            "' and channelid=4 and length(imeimd5) = 32")
        print("finished", date)
Example #21
def get_context():
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName="__file__", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex""")
    sql_context.setConf("spark.sql.shuffle.partitions", "32")
    return sc, sql_context
Example #22
def get_context():
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName="__file__", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("""use fex""")
    sql_context.setConf("spark.sql.shuffle.partitions", "32")
    return sc, sql_context
Example #23
class HiveOperate():
    def __init__(self, sc):
        """
        创建Hive session
        :return:
        """
        #self.sc = config.CreateSparkContext()
        self.hive_context = HiveContext(sc)

    def df_insert_to_hive(self,
                          df,
                          table_name='channel_result',
                          database='sparktest'):
        """
        将数据插入到hive中
        :param df:
        :param table_name:
        :param database:
        :return:
        """
        df.registerTempTable("result_tmp")
        result1 = self.hive_context.sql("select * from result_tmp limit 10")
        result1.show()
        self.hive_context.sql("use {}".format(database))
        self.hive_context.sql("drop table if EXISTS  {} ".format(table_name))
        self.hive_context.sql(
            "create table {} as select * from result_tmp where 1 = 2 ".format(
                table_name))
        self.hive_context.sql(
            " insert overwrite table {} select * from result_tmp".format(
                table_name))
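
A short usage sketch, assuming an existing SparkContext sc and a DataFrame df:

hive_op = HiveOperate(sc)
hive_op.df_insert_to_hive(df, table_name='channel_result', database='sparktest')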
Example #24
def load_factdata(sc, cfg, starting_day, ending_day, attributes_condition):
    hive_context = HiveContext(sc)
    bucket_id_max = cfg['bucket_id_max']
    # step 1: load the original fact data.
    # step 2: load the distribution e.g. 133904 uckeys.
    # step 3: inner join the original fact data with the distribution. e.g. 133904 uckeys.
    # step 4: filter the new fact data with date range e.g. 2020-01-30 - 2020-02-08, 10 days.
    # step 5: filter the new fact data with conditions.

    # step 1: load the original fact data
    command = """select uckey, day, hour, count_array from {} where bucket_id <= {} 
              """.format(cfg['factdata'], bucket_id_max)
    df = hive_context.sql(command)
    df = add_count_map(df)
    # [Row(count_array=['3:4'], day='2019-11-02', hour=19, uckey='native,72bcd2720e5011e79bc8fa163e05184e,WIFI,g_m,5,CPM,15,76', count_map={'3': '4'})]

    # Explode count_map to have pcat and count on separate columns
    df = df.select('uckey', 'day', 'hour',
                   explode(df.count_map)).withColumnRenamed(
                       "key", "price_cat").withColumnRenamed("value", "count")
    # [Row(uckey='native,72bcd2720e5011e79bc8fa163e05184e,WIFI,g_m,5,CPM,15,76', day='2019-11-02', hour=19, price_cat='3', count='4')]

    # This is to have the fact data uckey-price_cat pair based on daily count to join the distribution.
    df = df.groupBy('uckey', 'day', 'price_cat').agg({
        "count": "sum"
    }).withColumnRenamed("sum(count)", "count")
    # [Row(uckey='splash,5cd1c663263511e6af7500163e291137,WIFI,g_m,4,CPT,3,', day='2019-11-02', price_cat='1', count=56.0)]

    # step 2: load the distribution e.g. 133904 uckeys
    command = 'select uckey, price_cat from {} where ratio > 0'.format(
        cfg['distribution'])
    dfd = hive_context.sql(command)

    # step 3: inner join the original fact data with the distribution #distinct uckeys in joined fact data: e.g. 133904
    df = df.join(dfd, [df.uckey == dfd.uckey, df.price_cat == dfd.price_cat],
                 how="inner").drop(dfd.uckey).drop(dfd.price_cat)
    # e.g. df.select(df.uckey).distinct().count(): 133904

    # step 4: filter the new fact data with date range e.g. 2020-01-30 - 2020-02-08, 10 days
    df = df.filter((df.day <= ending_day) & (df.day >= starting_day))
    # e.g. df.count(): 15152287, df.select(df.uckey).distinct().count(): 92612

    # step 5: filter the new fact data with conditions.
    uckey_attrs = cfg['uckey_attrs']
    for attr_index in range(len(uckey_attrs)):
        df = df.withColumn(
            uckey_attrs[attr_index],
            udf(lambda x: x.split(',')[attr_index], StringType())(df.uckey))
    # e.g. [Row(uckey=u'magazinelock,01,2G,,,CPM,13', day=u'2020-01-19', hour=8, count_array=[u'1:10'],
    # m=u'magazinelock', si=u'01', t=u'2G', g=u'', a=u'', pm=u'CPM', r=u'13')]
    if attributes_condition:
        for attr, attr_value in attributes_condition.items():
            df = df.filter(df[attr] == str(attr_value))

    return df
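
A hedged usage sketch of load_factdata; the table names, date range and attribute filter are placeholders, and the uckey attribute names follow the comment above (m, si, t, g, a, pm, r):

cfg = {
    'factdata': 'factdata_table',
    'distribution': 'distribution_table',
    'bucket_id_max': 10,
    'uckey_attrs': ['m', 'si', 't', 'g', 'a', 'pm', 'r'],
}
df = load_factdata(sc, cfg,
                   starting_day='2020-01-30',
                   ending_day='2020-02-08',
                   attributes_condition={'t': 'WIFI'})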
Example #25
def tear_down():
    for table in data.keys():
        hiveContext = HiveContext(sc)
        df = hiveContext.createDataFrame(data[table], fields[table])
        hiveContext.sql('use test_db')
        try:
            df.registerTempTable("demo")
            hiveContext.sql("insert into {table} partition(ds='{date}') select * from demo".format(table=table,date=date))
            # hiveContext.sql("insert into {table} partition(ds="")  select * from demo".format(table=table))
        except Exception as e:
            df.saveAsTable("{table}".format(table=table))
Example #26
def read_csv(sc, file_name, sep=",", storage="hive://", header=True,
             names=None, table_name=None, infer_limit=10000):
    table_name = table_name if table_name is not None else "df" + str(uuid.uuid4())
    hc = HiveContext(sc)
    df = pd.read_csv(file_name, sep=sep, nrows=infer_limit)
    names = df.columns if not names else names
    types = []
    for i in range(len(names)):
        tp = names[i] + " "
        if df.dtypes[i] == "O":
            tp += "STRING"
        elif df.dtypes[i] == "int64":
            tp += "INT"
        else:
            tp += "DOUBLE"
        types.append(tp)
    hc.sql('drop table if exists %s' %table_name)
    qw = """CREATE TABLE IF NOT EXISTS %s (%s) row format delimited fields terminated by '%s'
LINES TERMINATED BY '\n'""" %(table_name, ','.join(types), sep)
    if header:
        qw += " tblproperties ('skip.header.line.count'='1')"
    hc.sql(qw)
    hc.sql("LOAD DATA LOCAL INPATH '%s' OVERWRITE INTO TABLE %s" %(file_name, table_name))
    rdd = hc.sql("SELECT * FROM %s" %table_name)
    ctx = hc
    if storage.startswith("parquet://"):
        path = storage.replace("parquet://", "")
        rdd.saveAsParquetFile("%s/%s" %(path, table_name))
        sq = HiveContext(sc)
        rdd = sq.parquetFile("%s/%s" %(path, table_name))
        rdd.registerTempTable(table_name)
        rdd = sq.sql("select * from %s" %table_name)
        ctx = sq
    return DataFrame(ctx, table_name, data=rdd, columns=names, dtype=types)
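
A hedged usage sketch; the CSV path and table name are placeholders:

df = read_csv(sc, '/tmp/example.csv', sep=',', header=True,
              table_name='example_table', storage='hive://')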
Example #27
def save_ucdocs_to_hive_tables(ucdocs_list, factdata_table_name):
    sc = SparkContext.getOrCreate()
    hive_context = HiveContext(sc)
    sc.setLogLevel('WARN')

    #drop the table if exists, then create a new table and insert data to this table.
    command = """
    DROP TABLE IF EXISTS  {}
    """.format(factdata_table_name)
    hive_context.sql(command)

    command = """
    CREATE TABLE IF NOT EXISTS {}
    (
    uckey string,
    bucket_id int,
    count_array array<string>,
    hour int,
    day string
    )
    """.format(factdata_table_name)
    hive_context.sql(command)

    #save the factdata of ucdocs to the hive table.
    sqlContext = SQLContext(sc)
    df = sqlContext.read.json(sc.parallelize(ucdocs_list))
    df.select('uckey', 'bucket_id', 'count_array', 'hour', 'day').write.option(
        "header", "true").option("encoding", "UTF-8").mode('append').format(
            'hive').saveAsTable(factdata_table_name)

    #agg df from hourly counts to daily counts by summing up hourly counts.
    _udf = udf(
        lambda x: sum(
            [int(list(i.split(':'))[1]) for i in x if i and ':' in i])
        if x else 0, IntegerType())
    df = df.withColumn('hourly_counts', _udf(df.count_array))
    df_dailycounts = df.groupBy('uckey', 'day').agg({
        'hourly_counts': 'sum'
    }).orderBy('uckey').orderBy('day').withColumnRenamed(
        'sum(hourly_counts)', 'daily_counts')

    #save the transformed factdata with daily counts.
    factdata_table_name_dailycounts = factdata_table_name + '_dailycounts'
    command = """
    DROP TABLE IF EXISTS  {}
    """.format(factdata_table_name_dailycounts)
    hive_context.sql(command)

    command = """
    CREATE TABLE IF NOT EXISTS {}
    (
    uckey string,
    day string,
    daily_counts int
    )
    """.format(factdata_table_name_dailycounts)
    hive_context.sql(command)
    df_dailycounts.select('uckey', 'day', 'daily_counts').write.option(
        "header", "true").option("encoding", "UTF-8").mode('append').format(
            'hive').saveAsTable(factdata_table_name_dailycounts)
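
A hedged sketch of what one element of ucdocs_list might look like, matching the table schema created above (count_array entries are 'price_cat:count' strings); the concrete values are placeholders:

import json

ucdocs_list = [
    json.dumps({
        'uckey': 'magazinelock,01,2G,,,CPM,13',
        'bucket_id': 1,
        'count_array': ['1:10', '3:4'],
        'hour': 8,
        'day': '2020-01-19',
    }),
]
save_ucdocs_to_hive_tables(ucdocs_list, 'factdata_test')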
Example #28
def main():
    conf = (SparkConf().setMaster("yarn-client").setAppName('').set('spark.executorEnv.PYTHONHASHSEED','0'))
    conf.set("spark.mongodb.input.uri", uri)
    conf.set("spark.mongodb.input.database", database)
    conf.set("spark.mongodb.input.collection", collection)
    spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()
    sc = spark.sparkContext
    sc.setLogLevel(logLevel='ERROR')
    hive_context = HiveContext(sc)
    origin_data = hive_context.read.format("com.mongodb.spark.sql").options(uri=uri, database=database, collection=collection).option("pipeline",'[{"$project":{"origin_user_id":1,"origin_post_id":1,"_id":0}}]').load()
    map_post_id = hive_context.sql('select post_id,origin_post_id as map_origin_post_id,user_id from prod_bd_mysql_syn.dim_post_upload where post_state = 1')
    post_id_status = hive_context.sql('select post_id as map_post_id,rec_status from prod_bd_mysql_syn.dim_post_info')
    map_post = origin_data.join(map_post_id,origin_data.origin_post_id==map_post_id.map_origin_post_id,'left_outer')
    map_data = map_post.join(post_id_status,map_post.post_id==post_id_status.map_post_id,'left_outer').select('origin_user_id','origin_post_id','post_id','user_id','rec_status').filter('rec_status is null')
    print(map_data.take(500),map_data.count())
Example #29
def main():
    #Setting up spark configuration
    conf = SparkConf().setAppName('analysis')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'
    sqlContext = HiveContext(sc)
    #Setting Input file path and output file path
    inputs = sys.argv[1]
    output = sys.argv[2]
    #Declaring the schema for the data frame
    customSchema = StructType([
        StructField("id", LongType(), False),
        StructField("timeSet", StringType(), False),
        StructField("country", StringType(), False),
        StructField("province", StringType(), False),
        StructField('city', StringType(), False),
        StructField('latitude', FloatType(), False),
        StructField('longtitude', FloatType(), False),
        StructField('poi_id', StringType(), False),
        StructField('poi_latitude', FloatType(), False),
        StructField('poi_longtitude', FloatType(), False),
        StructField('distance_in_km', FloatType(), False)
    ])
    #read data into data frame
    df1 = sqlContext.read.format('com.databricks.spark.csv').options(
        header='false').load(inputs, schema=customSchema)
    #Register data frame into table
    df1.registerTempTable("temp_table")
    #Cluster the data based on the column to group to reduce shuffling
    df1 = sqlContext.sql("SELECT * FROM temp_table CLUSTER BY poi_id")
    df1.registerTempTable("table1")
    #Calculating POI average
    df_avg = sqlContext.sql(
        "SELECT poi_id,avg(distance_in_km) as avg_distane_in_km,stddev(distance_in_km) FROM table1 GROUP BY poi_id"
    )
    df_avg.coalesce(1).write.format('com.databricks.spark.csv').save(
        output + '/average_stddeviation', header='true')
    #Calculating POI density
    df_density = sqlContext.sql(
        "SELECT poi_id,count(poi_id) as density FROM table1 GROUP BY poi_id")
    df_density.coalesce(1).write.format('com.databricks.spark.csv').save(
        output + '/density', header='true')
    #Calculating POI circle area
    df_circle_area = sqlContext.sql(
        "SELECT poi_id,3.14*(max(distance_in_km))*(max(distance_in_km))as circle_area FROM table1 GROUP BY poi_id"
    )
    df_circle_area.coalesce(1).write.format('com.databricks.spark.csv').save(
        output + '/circle_area', header='true')
Example #30
def main():
    conf = SparkConf()
    dateStr = conf.get('spark.date')
    sc = SparkContext(conf=conf, appName='Loc City Data Prepare, ' + dateStr)
    hc = HiveContext(sc)

    sqlDict = prepareSql(dateStr)
    #mergedRdd = sc.emptyRDD()
    mergedRdd = sc.parallelize([])
    for prod, sql in sqlDict.items():
        print sql
        df = hc.sql(sql)
        #print 'df count:', df.count()
        rdd = df.map(lambda x: toCityLoc(x, prod))
        rdd = rdd.filter(lambda x: x[0] is not None)
        rdd = rdd.map(lambda x: x[0])
        mergedRdd = mergedRdd.union(rdd)
        #break

    mergedRdd.cache()
    print 'mergedRdd count:', mergedRdd.count()
    fromRdd = mergedRdd.map(lambda cityLoc: (
        (cityLoc.area, cityLoc.fromPoi.displayName), (cityLoc.fromPoi, 1L)))
    toRdd = mergedRdd.map(lambda cityLoc: (
        (cityLoc.area, cityLoc.toPoi.displayName), (cityLoc.toPoi, 1L)))
    count(fromRdd, dateStr, 'from')
    count(toRdd, dateStr, 'to')
    print 'success'
    sc.stop()
Example #31
def data_anlysis(inputFile):
    # inputFile = 'tesTraffic.json'
    conf = SparkConf().setAppName("SparkSQLTraffic")
    sc = SparkContext()
    hiveCtx = HiveContext(sc)
    print("Loading traffic from " + inputFile)
    while True:
        input = hiveCtx.read.json(inputFile)
        input.registerTempTable("traffic")
        topTraffics = hiveCtx.sql(
            "SELECT placeid,size, color, direction, speed FROM traffic ORDER BY time LIMIT 10"
        )
        print("#" * 20, "\n 1. According to lastest time order:", topTraffics.collect(),' record count:', len( topTraffics.collect()))

        # https://stackoverflow.com/questions/39535447/attributeerror-dataframe-object-has-no-attribute-map
        topTrafficText = topTraffics.rdd.map(lambda row: row.speed)
        isum = 0
        for speed in topTrafficText.collect():
            print("#" * 20, "\n 2. Just speed", speed)
            
            # for speed in singlelist:
            #     print('\nspeed=', speed)
            isum += float(speed)
        average_speed = isum / len( topTraffics.collect())
        show = colored("3. total flow is:", "red", attrs=['reverse', 'blink'])
        print(show, isum, "average speed is:", average_speed)
        time.sleep(3)

    sc.stop()
Example #32
def youku_user_tag():
    sqlContext = HiveContext(sc)
    sqlContext.sql("use usercenter_dw")
    df = sqlContext.sql("select * from youku_matched")
    drop_list = ['mediadata', 'yktag', 'iqytag']
    df = df.select(
        [column for column in df.columns if column not in drop_list])
    exprs = [avg(x).alias(x) for x in df.drop('userid').columns]
    cur_date = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
    df = df.groupby("userid").agg(*exprs).withColumn('source',
                                                     lit('youku')).withColumn(
                                                         'update_dt',
                                                         lit(cur_date))
    df.registerTempTable('tab_name')
    sqlContext.sql(
        "insert overwrite table youku_user_tag select * from tab_name")
Example #33
def query12_input(query_name, conf=None, output_persist=False):
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # SQL statements can be run by using the sql methods provided by sqlContext
    sql = "use tpcds_text_db_1_50"
    _ = sqlContext.sql(sql)

#    web_sales_sql = "select * from web_sales"
#    web_sales = sqlContext.sql(web_sales_sql)
#    web_sales.persist()
#    web_sales.registerAsTable("web_sales")
#    item_sql = "select * from item"
#    item = sqlContext.sql(item_sql)
#    item.persist()
#    item.registerAsTable("item")
#    date_dim_sql = "select * from date_dim"
#    date_dim = sqlContext.sql(date_dim_sql)
#    date_dim.persist()
#    date_dim.registerAsTable("date_dim")
    sqlContext.cacheTable("web_sales")
    sqlContext.cacheTable("item")
    sqlContext.cacheTable("date_dim")

    # discard the first query
    output = execute_sql(query_name, sqlContext, output_persist)
    # check the re-run statistics
    output = execute_sql(query_name, sqlContext)
    output['describe'] = output['output'].describe().show()

    sc.stop()
    return output
Example #34
def run(fout,
        yarn=None,
        verbose=None,
        patterns=None,
        antipatterns=None,
        inst='GLOBAL'):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    # define spark context, it's main object which allow to communicate with spark
    ctx = spark_context('cms', yarn, verbose)
    sqlContext = HiveContext(ctx)

    # read DBS and Phedex tables
    tables = {}
    tables.update(dbs_tables(sqlContext, inst=inst, verbose=verbose))
    bdf = tables['bdf']
    fdf = tables['fdf']
    flf = tables['flf']

    # join tables
    cols = ['*']  # to select all fields from table
    cols = [
        'b_block_id', 'b_block_name', 'f_block_id', 'f_file_id', 'fl_file_id',
        'fl_lumi_section_num'
    ]

    # join tables
    stmt = 'SELECT %s FROM bdf JOIN fdf on bdf.b_block_id = fdf.f_block_id JOIN flf on fdf.f_file_id=flf.fl_file_id' % ','.join(
        cols)
    print(stmt)
    joins = sqlContext.sql(stmt)

    # keep table around
    joins.persist(StorageLevel.MEMORY_AND_DISK)

    # construct conditions
    #    adler = ['ad8f6ad2','9c441343','f68d5dca','81c90e2a','471d2524','a3c1f077','6f0018a0','8bb03b60','d504882c','5ede357f','b05303c3','716d1776','7e9cf258','1945804b','ec7bc1d7','12c87747','94f2aa32']
    #    cond = 'f_adler32 in %s' % adler
    #    cond = cond.replace('[', '(').replace(']', ')')
    #    fjoin = joins.where(cond).distinct().select(cols)

    #    print_rows(fjoin, stmt, verbose)
    fjoin = joins\
            .groupBy(['b_block_name'])\
            .agg({'fl_lumi_section_num':'count'})\
            .withColumnRenamed('count(fl_lumi_section_num)', 'nlumis')\

    # keep table around
    fjoin.persist(StorageLevel.MEMORY_AND_DISK)

    # write out results back to HDFS, the fout parameter defines area on HDFS
    # it is either absolute path or area under /user/USERNAME
    if fout:
        fjoin.write.format("com.databricks.spark.csv")\
                .option("header", "true").save(fout)

    ctx.stop()
Example #35
def main():
    sc = SparkContext()
    hc = HiveContext(sc)

    df = hc.sql("""{{sql}}""")
    df_writer = DataFrameWriter(df)
    df_writer.saveAsTable(name='{{tableName}}',
                          format='json',
                          mode='overwrite',
                          path='s3://data/{{tableName}}')
Example #36
def query12_no(query_name, conf=None):
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)

    # SQL statements can be run by using the sql methods provided by sqlContext
    sql = "use tpcds_text_db_1_50"
    _ = sqlContext.sql(sql)

    output = execute_sql(query_name, sqlContext)
    output['describe'] = output['output'].describe().show()

    sc.stop()
    return output
Example #37
    def run(self):
        sc = SparkContext("local", "Course Activity")
        # sqlHC is the SQL HiveContext
        sqlHC = HiveContext(sc)

        lines = sqlHC.sql(""" select courseName,lmsUserId,createDateTime,
                    eventType,eventName,eventNo from logdata where
                    eventType not in ('enrollment','instructor','admin')
                    and lmsUserId is not NULL
                    and courseName is not NULL
                    and eventNo is not NULL limit 10""")

        maplvl1 = lines.flatMap(lambda p: mapp(p[0], str(p[1]), p[2].strftime('%Y-%m-%d'), p[4]))
        reduceRDD = maplvl1.reduceByKey(lambda a, b: a + b)
        with self.output().open('w') as out_file:
            for line in reduceRDD.collect():
                out_file.write(line[0][0] + "\x01" + line[0][1] + "\x01" + line[0][2] + "\x01" + line[0][3] + "\x01" + str(line[1]) + "\n")
Example #38
def ch9_sql():
    # Import Spark SQL
    from pyspark.sql import HiveContext, Row
    # Or if you can't include the hive requirements 
    from pyspark.sql import SQLContext, Row

    hiveCtx = HiveContext(sc)

    input_file = hiveCtx.read.json("testweet.json")
    # Register the input_file schema RDD 
    input_file.registerTempTable("tweets")
    # Select tweets based on the retweetCount
    topTweets = hiveCtx.sql("""SELECT text, retweetCount FROM
      tweets ORDER BY retweetCount LIMIT 10""")

    topTweetText = topTweets.map(lambda row: row.text)  
    topTweetText.collect()

    topTweets.schema
    hiveCtx.cacheTable("tweets")
Example #39
df_final = df_add_year.withColumn('Load_date', F.current_date())

df_final.repartition(10)

# Registering data frame as a temp table for SparkSQL
hive_ctx.registerDataFrameAsTable(df_final, "EMP_TEMP")

# Target Type: APACHE HIVE
# Database   : EMPLOYEES
# Table Name : EMPLOYEE_DIM
# + ------------------------------- +
# | COlUMN NAME| TYPE   | PARTITION |
# + ------------------------------- +
# | EMP_NO     | INT    |           |
# | BIRTH_DATE | DATE   |           |
# | FIRST_NAME | STRING |           |
# | LAST_NAME  | STRING |           |
# | GENDER     | STRING |           |
# | HIRE_DATE  | DATE   |           |
# | SALARY     | INT    |           |
# | FROM_DATE  | DATE   |           |
# | TO_DATE    | DATE   |           |
# | YEAR       | INT    | PRIMARY   |
# | LOAD_DATE  | DATE   | SUB       |
# + ------------------------------- +
# Storage Format: ORC
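
# The target table described above could be created once with a DDL along these lines
# (a sketch derived from the comment block, not necessarily the project's actual DDL):
hive_ctx.sql("""
    CREATE TABLE IF NOT EXISTS EMPLOYEES.EMPLOYEE_DIM (
        EMP_NO     INT,
        BIRTH_DATE DATE,
        FIRST_NAME STRING,
        LAST_NAME  STRING,
        GENDER     STRING,
        HIRE_DATE  DATE,
        SALARY     INT,
        FROM_DATE  DATE,
        TO_DATE    DATE
    )
    PARTITIONED BY (YEAR INT, LOAD_DATE DATE)
    STORED AS ORC
""")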

# Inserting data into the Target table
hive_ctx.sql("INSERT OVERWRITE TABLE EMPLOYEES.EMPLOYEE_DIM PARTITION (year, Load_date) \
            SELECT EMP_NO, BIRTH_DATE, FIRST_NAME, LAST_NAME, GENDER, HIRE_DATE, \
            SALARY, FROM_DATE, TO_DATE, year, Load_date FROM EMP_TEMP")
Example #40
		would be stored in the Hive tables.
		
Step 4.	Then do a mapreduce to find the total number of count
		and timeAccess to the video and the videoFrame, and
		write it to the MySQL Analytics summary table.
"""


"""
Step 1. Getting the video data from Hive.
"""

sqlVideo = (
    "SELECT orgname,coursename, videosysname, videolength, videoTitle FROM coursevideos where videosysname is not null"
)
videoslist = dict(sqlContext.sql(sqlVideo).map(lambda v: ((v[0], v[1], v[2]), (v[-2], v[-1]))).collect())
# Spark evaluation is lazy: you cannot iterate over the result of an RDD/DataFrame
# transformation until you materialize it with an action such as collect() or take(x).
# 1. To iterate over the data like a list, call collect() or take(x) first.
# 2. Elements of a Row object (as returned by collect()) can be accessed with normal subscripts.
#    Here the Rows are converted to tuples in a dictionary so they can be looked up by key
#    (e.g. videoSysName or moduleSysName); a short illustration follows the commented example below.

# for video in videoslist:
# 	print video, videoslist[video][0], videoslist[video][1]
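
# A short illustration of the point above (a hedged example, not part of the original job):
# take(2) is an action that materializes two Rows, which can then be indexed by position.
sample_rows = sqlContext.sql(sqlVideo).take(2)
for row in sample_rows:
	print row[0], row[1], row[2]   # orgname, coursename, videosysname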


"""
Step 2. Getting the event data from Hive.
"""
Example #41
def getFloat(s):
    if s is None:
        return None
    try:
        f = float(s)
        return f
    except ValueError:
        return None

##########


##############################
# Combine readmissions and effective_care

# Transform readmissions
df_readmissions = sqlContext.sql('select providerid, state, measureid, score from readmissions')

readmissions = df_readmissions.map(lambda r: (r.providerid, r.state, r.measureid, getFloat(r.score) ))

# Transform effective_care
df_effective_care = sqlContext.sql('select providerid, state, measureid, score from effective_care')

def getEffectiveCareScore(measureid, s):
    if measureid == "EDV":
        # The score of EDV measure has the following distinct values
        s = str(s).strip()
        if s == "Low (0 - 19,999 patients annually)":
            return 25.
        elif s == "Medium (20,000 - 39,999 patients annually)":
            return 50.
        elif s == "High (40,000 - 59,999 patients annually)":
Example #42
# coding: utf-8

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, IntegerType, FloatType, StringType

conf = SparkConf().setAppName("spark_sql_datatype_struct")

sc = SparkContext(conf=conf)

hc = HiveContext(sc)

source = sc.parallelize([((1, 2.0, "3.0"),)])

schema = StructType([StructField("struct", StructType([StructField("first", IntegerType(), False), StructField(
    "second", FloatType(), False), StructField("third", StringType(), False)]), False)])

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql(
    "select struct.first, struct.second, struct.third from temp_table").collect()

sc.stop()

for row in rows:
    print row
Example #43
from pyspark import SparkContext
sc = SparkContext("local", "best_hospitals")

from pyspark.sql import HiveContext
sqlContext = HiveContext(sc)

# Select the top 10 hospital by average avgscore
# Please note that we filter out those hospital not qualified for evaluation
df_top10_hospitals = sqlContext.sql("select Q.providerid as id, AVG(Q.normalizedscore) as avgscore \
from total_quality Q join hospitals_qualified H on Q.providerid = H.providerid \
where Q.normalizedscore is not null and H.qualified = true \
group by Q.providerid \
order by avgscore DESC").limit(10)

# Join with hospitals_qualified to get the hospital name and state
# Note: couldn't figure out how to do it in the above select statement (together with Group By) in one-shot! :-(
df_hospitals = sqlContext.table("hospitals_qualified")
df_top10_hospitals_full = df_top10_hospitals.join(df_hospitals, df_top10_hospitals.id == df_hospitals.providerid).\
    select(df_hospitals.providerid, df_hospitals.hospitalname, df_hospitals.state, df_top10_hospitals.avgscore)

df_top10_hospitals_full = df_top10_hospitals_full.orderBy(df_top10_hospitals_full.avgscore.desc())

# Save it as a table
df_top10_hospitals_full.registerTempTable("df")
sqlContext.sql("drop table if exists top_10_hospitals")
sqlContext.sql("CREATE TABLE top_10_hospitals AS SELECT * FROM df")

print
print "Top 10 hospitals"
print
rank = 1
Example #44
        self.saveWeather(updates)
        print "updates=", updates

if __name__ == '__main__':
    print "Starting.", datetime.now()
    sc = SparkContext()
    #sqlContext = SQLContext(sc)
    sqlContext = HiveContext(sc)

    games = sqlContext.read.parquet(CreateStatsRDD.rddDir + "/" + Games.table_name + ".parquet")
    games.registerTempTable("games")
    games.cache()
    print "games=", games
    print games.take(2)

    stadium = sqlContext.load(source="com.databricks.spark.csv", header="true", path = SRCDIR + "/stadium.csv")
    stadium.registerTempTable("stadium")
    stadium.cache()
    print "stadium=", stadium.take(2)
    
    weather = UpdateWeather(sc, sqlContext, games, stadium)
    weather.update()

    badIds = sqlContext.sql("select game_id, count(*) from weather group by game_id having count(*) >1").collect()
    if len(badIds) > 0:
        print "BAD weather game_ids=", badIds
    else:
        print "no bad weather ids."
    sc.stop()
Example #45
        """
        # TODO: refactor this into a utility function and update jobs
        # to always UTF8 encode mapper keys.
        if len(values) > 1:
            return tuple([value.encode('utf8') for value in values])
        else:
            return values[0].encode('utf8')

    
sc = SparkContext("local", "Course Activity")
	#sqlHC is the SQLHiveContext        
sqlHC = HiveContext(sc)

lines=sqlHC.sql(""" select courseName,lmsUserId,createDateTime,
		            eventType,eventName,eventNo from logdata where 
			    eventType not in ('enrollment','instructor','admin') 
			    and lmsUserId is not NULL 
   			    and courseName is not NULL 
			    and eventNo is not NULL limit 100""")


maplvl1=lines.flatMap(lambda p: mapp(p[0],str(p[1]),p[2].strftime('%Y-%m-%d'),p[4]))
for linet in maplvl1.collect():
	print linet

reduceRDD = maplvl1.reduceByKey(lambda a, b : a + b)

fo = open("tester","w")

for line in reduceRDD.collect():
	fo.write(line[0][0]+"\x01"+line[0][1]+"\x01"+line[0][2]+"\x01"+line[0][3]+"\x01"+str(line[1])+"\n")
Example #46
        date,
        close
    FROM
        eod_rel 
    """)

    rdd_sum = df.rdd.groupBy(lambda x: x.symbol)\
      .map(lambda x: (x[0], list(x[1])))\
      .flatMapValues(lambda x: cal_per(x))\
      .map(lambda x: x[1])\
      .groupBy(lambda x: x["date"])\
      .map(lambda x: (x[0], list(x[1])))\
      .mapValues(lambda x: cal_sum_per(x))\
      .map(lambda x: x[1])
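    # A rough reading of the chain above (cal_per and cal_sum_per are defined elsewhere and
    # not shown here): rows are grouped per symbol, cal_per presumably derives a per-date
    # value for each symbol, the results are regrouped by date, and cal_sum_per aggregates
    # them into one record per date before collecting.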

    for each in rdd_sum.collect():
        print each



if __name__ == "__main__":
    conf = SparkConf()
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.cores", "4")
    conf.set("spark.executor.memory", "8g")
    sc = SparkContext(appName="bintrade.post.index", conf=conf)
    sql_context = HiveContext(sc)
    sql_context.sql("use fex")
    main(sc, sql_context)
    sc.stop()
from pyspark.storagelevel import StorageLevel

if __name__ == "__main__":
	sc = SparkContext(appName="TweetsConversationHierarchyBuilder")
	sqlContext = HiveContext(sc)

	toplevel = sqlContext.sql("""
		select distinct tt.id,tt.in_reply_to_status_id from 
		(select t.id,t.in_reply_to_status_id
		  from wrd10_socialmedia.fact_orc_raw_tweets t 
		  join wrd10_socialmedia.fact_orc_raw_tweets t1 on t.id = t1.in_reply_to_status_id 
		 where t.in_reply_to_status_id is null
		   and t.country='Nigeria'
		   and t1.country='Nigeria'
		 union all
		 select t2.id,t2.in_reply_to_status_id
		  from wrd10_socialmedia.fact_orc_raw_twitter_userstream t2,
		  wrd10_socialmedia.fact_orc_raw_twitter_userstream t3,
		  wrd10_socialmedia.fact_orc_raw_tweets t4
		 where t2.in_reply_to_status_id is null and ((t2.id = t3.in_reply_to_status_id) or (t2.id=t4.in_reply_to_status_id))
		   and t2.country='Nigeria'
		   and t3.country='Nigeria'
		   and t4.country='Nigeria'
		) tt
	""").persist(StorageLevel.MEMORY_AND_DISK)
	replies = sqlContext.sql("""
		select distinct tt.id,tt.in_reply_to_status_id,tt.text,tt.user_name,tt.created_at,tt.country from
		(select t.id,t.in_reply_to_status_id,t.text,t.user_name,t.created_at,t.country
		  from wrd10_socialmedia.fact_orc_raw_tweets t 
		 where t.in_reply_to_status_id is not null
		   and t.country='Nigeria'
Exemplo n.º 48
0
# Test reading from S3
print("Prepping hadoop for reading S3 file")
#hadoopConf = SparkContext.hadoopConfiguration
#sc.hadoopConfiguration.set("fs.s3n.awsAccessKeyId", "AKIAIJCHQUJC4PMAILDA")
#sc.hadoopConfiguration.set("fs.s3n.awsSecretAccessKey", "/home/pshvets/Spark_SQL/ricaws.pem") # can contain "/"
myRDD = sc.textFile("s3n://hive-qs-data/employee.csv")
s3file = myRDD.count()
print("File count on S3: ", str(s3file))
for rec in myRDD.collect():
	print("Reading S3 file: ", rec)
print("Done with reading file from S3")
###############################################################

# Create a customer-specific Hive database.
sqlContext.sql("CREATE DATABASE IF NOT EXISTS peter_hive_db")

# Create pool for parallel processing
pool = Pool()

# Get environment variables
app_variables = get_app_variables()
# Compile a list of all property files under $SPARK_ETL_CONF_DIR folder
path = app_variables.get('SPARK_ETL_CONF_DIR')
prop_files = [os.path.join(path,fileiter) for fileiter in os.listdir(path) if fileiter.endswith('.json')]
print (prop_files)

# Data Extract
if __name__ == "__main__":
	# Execute core functionality. Iterate over all propertry files in Spark-ETL config directory
	for prop_fileiter in prop_files:
Exemplo n.º 49
0
convertRDD = hc.sql(
    "select col1, col2, col3 from temp_source").map(convert)

mytable = hc.inferSchema(convertRDD)

mytable.registerTempTable("temp_mytable")
"""


def convert(val):
    return val.upper()

hc.registerFunction("temp_convert", convert)

convertRDD = hc.sql(
    "select temp_convert(col1) as col1, col2, col3 from temp_source")

convertRDD.registerAsTable("temp_mytable")


hc.cacheTable("temp_mytable")


def printRows(rows):
    for row in rows:
        print row

datas = hc.sql("select * from temp_mytable").collect()

printRows(datas)
Exemplo n.º 50
0
# Creates a Hive table and loads an input file into it.
# For input you can use examples/src/main/resources/kv1.txt from the Spark
# distribution.
from pyspark import SparkContext
from pyspark.sql import HiveContext
import json
import sys

if __name__ == "__main__":
    if len(sys.argv) != 4:
        print "Error usage: LoadHive [sparkmaster] [inputFile] [inputtable]"
        sys.exit(-1)
    master = sys.argv[1]
    inputFile = sys.argv[2]
    inputTable = sys.argv[3]
    sc = SparkContext(master, "LoadHive")
    hiveCtx = HiveContext(sc)
    # Load some data into hive
    hiveCtx.sql(
        "CREATE TABLE IF NOT EXISTS " +
        inputTable +
        " (key INT, value STRING)")
    hiveCtx.sql(
        "LOAD DATA LOCAL INPATH '" + inputFile + "' INTO TABLE " + inputTable)
Exemplo n.º 51
0
def main():
    # set up the logger
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'mpg_cluster.log'),
                            level=logging.INFO,
                            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                            datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # NSJOIN dayidx # only partitioned by DAY
    day_idx = beeline.get_last_partitions('mapper.nsjoin').split('=')[1]
    # BAREBONES dayidx # only partitioned by DAY
    day_bb = [x for x in beeline.show_partitions('mapper.barebones').split('\n') if '=%s' % (day_idx) in x]
    # MAPPOINTS dayidx # partitioned by DAY and UUID (pick the last uuid)
    mappoints_data = sorted([x for x in beeline.show_partitions('mapper.mappoints').split('\n') if '=%s' % (day_idx) in x])[-1].split('/')
    [day_mps, uuid_idx] = [x.split('=')[1] for x in mappoints_data]

    if day_idx != day_mps:
        logger.error('mapper.mappoints and mapper.nsjoin different day, possible data missing in the source.')
        return

    if len(day_bb) == 0:
        logger.warning('mapper.barebones data missing for this particular day.')
        #return

    logger.info('Processing data in day=%s, uuid=%s' % (day_idx, uuid_idx))

    logger.info('begin spark process.')
    getting_mappoint_data = ''' select b1.mpgid mpgid, b1.lat lat, b1.lon lon, b1.country country, b1.mpgload mpgload, b1.allowed_private_regions allowed_private_regions, b2.asnum asnum, b2.ip ip from (select mpgid, lat, lon, country, mpgload, allowed_private_regions from mapper.mappoints where day=%s and uuid="%s" and lat is not NULL and lon is not NULL and ghostonly=0 ) b1 left outer join (select collect_set(ns_ip) ip, collect_set(asnum) asnum, mpgid from (select ns_ip, mpd_uuid, mpgid, asnum, demand, day from mapper.nsjoin where day=%s and mpd_uuid="%s" and demand>0.01 order by demand desc) a group by mpgid) b2 on b2.mpgid=b1.mpgid ''' % (day_idx, uuid_idx, day_idx, uuid_idx)
    geo_total_cap_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='private' group by country, network) a ''' % day_idx
    geo_total_cap_public_query = ''' select * from (select country, network, sum(peak_bitcap_mbps) peak_bitcap_mbps, sum(peak_flitcap_mfps) peak_flitcap_mfps, sum(numvips) numvips from mapper.regioncapday where day=%s and network in ('freeflow', 'essl') and prp='public' group by country, network) a ''' % day_idx
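    # geodesic_distance and geo_centroid are imported elsewhere in the original module and
    # are not part of this snippet. The filters below (e.g. x[2][4] < 500) suggest the
    # distance helper returns [lat1, lon1, lat2, lon2, distance_in_miles]; a haversine-based
    # sketch under that assumption (named *_sketch so it does not shadow the real helper):
    def geodesic_distance_sketch(lat1, lon1, lat2, lon2):
        from math import radians, sin, cos, asin, sqrt
        rlat1, rlon1, rlat2, rlon2 = [radians(float(v)) for v in (lat1, lon1, lat2, lon2)]
        a = sin((rlat2 - rlat1) / 2) ** 2 + cos(rlat1) * cos(rlat2) * sin((rlon2 - rlon1) / 2) ** 2
        miles = 2 * 3958.8 * asin(sqrt(a))  # 3958.8 = mean Earth radius in miles
        return [lat1, lon1, lat2, lon2, round(miles, 2)]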

    sc = SparkContext()
    hiveCtx = HiveContext(sc)

    rows = hiveCtx.sql(getting_mappoint_data)

    regInfoRows = hiveCtx.sql('select * from mapper.regioncapday where day=%s and peak_bitcap_mbps is not null and peak_flitcap_mfps is not null' % (day_idx))
    geo_total_cap = hiveCtx.sql(geo_total_cap_query)
    geo_total_cap_p = hiveCtx.sql(geo_total_cap_public_query)


    # rdd format: [regionid, [mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load, mpg-asnum, mpg-nsip]]
    region_mpginfo_pair = rows.map(lambda x: [[x.mpgid,
                                               x.lat,
                                               x.lon,
                                               x.country,
                                               x.mpgload,
                                               x.asnum,
                                               x.ip], x.allowed_private_regions])\
                                .flatMapValues(lambda x: x).map(lambda x: [x[1], x[0]])

    #region_mpginfo_pair.first()

    # rdd format: [regionid, [reg-lat, reg-lon, reg-capacity(bit mbps), reg-capacity(bit mfps), reg-country, reg-numvips, reg-service, reg-prp]]
    # ps. prp=1: private, prp=0: public
    region_latlon = regInfoRows.map(lambda x: [x.region, [x.latitude,
                                                          x.longitude,
                                                          x.peak_bitcap_mbps,
                                                          x.peak_flitcap_mfps,
                                                          x.country,
                                                          x.numvips,
                                                          'W' if x.network=='freeflow' else ('S' if x.network=='essl' else 'O'),
                                                          1 if x.prp=='private' else 0]])\
                                .filter(lambda x: x[1][6]=='W' or x[1][6]=='S')

    region_public_list = region_latlon\
        .filter(lambda x: x[1][7] == 0)\
        .map(lambda x: ('all', [[x[0]]]))\
        .reduceByKey(lambda a, b: [a[0]+b[0]])\
        .map(lambda x: x[1][0]).collect()

    region_public_list = [0] + sorted(region_public_list[0])

    # dummy region
    rdd2 = sc.parallelize([([0, [0, 0, 0.0, 0.0, 'US', 0, 'W', 1]])])
    region_latlon = region_latlon.union(rdd2)
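    # Region id 0 acts as a placeholder so every cluster keeps at least one public-region
    # row through the join below; it is discounted again later (len(x[1][3]) - 1 and the
    # [1:] slice when the public region list is serialized).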

    # perform the join into tuple of (K, (V1, V2):
    # (regionid, ([mpgid, mpg-lat, mpg-lon, mpg-country, mpg-load], [reg-lat, reg-lon, reg-cap, reg-country, reg-numvips, reg-service]))
    # rdd  = (mpgid, regionid, [lat1, lon1, lat2, lon2, distance],
    #               reg-cap-bit(gbps), reg-cap-flit(gbps), reg-country, reg-numvips, reg-services,
    #               mpg-country, mpg-load, mpg-asnum, mpg-nsip,
    #               mpg-lat, mpg-lon)
    mpgid_reg_geo = region_mpginfo_pair.join(region_latlon).map(lambda x: [x[1][0][0],
                                                                           x[0],
                                                                           geodesic_distance(x[1][0][1],
                                                                                             x[1][0][2],
                                                                                             x[1][1][0],
                                                                                             x[1][1][1]),
                                                                           round(float(x[1][1][2])/1000.0, 3),
                                                                           round(float(x[1][1][3])/1000.0, 3),
                                                                           x[1][1][4], # reg-country
                                                                           x[1][1][5], # reg-numvips
                                                                           x[1][1][6], # reg-services
                                                                           x[1][0][3],
                                                                           x[1][0][4],
                                                                           x[1][0][5],
                                                                           x[1][0][6],
                                                                           x[1][0][1],
                                                                           x[1][0][2]])

    # filtering on mapping distance < 500 miles
    # filtering on reg-country = mpg-country
    # filtering on region capacity fbps > 1Gbps
    # rdd format = (mpgid, [[regionid], distance, [capacity-w, capacity-s], numvips, 1, mpg-country, mpg-load, mpg-asnum, mpg-nsip,
    #                        mpg-lat, mpg-lon])
    #mpgid_reg_distance = mpgid_reg_geo.filter(lambda x: x[2][4] < 500)\
    #    .filter(lambda x: x[5] == x[8])\
    #    .filter(lambda x: x[3] > 1)\
    #    .map(lambda x: (x[0], [[x[1]], x[2][4], [x[3], 0] if x[7]=='W' else [0, x[3]], x[6], 1, x[8], x[9], x[10], x[11], x[12], x[13]]))

    # or this one, no-same-country constraint:
    mpgid_reg_distance = mpgid_reg_geo.filter(lambda x: (x[2][4] < 500) or (x[5]==x[8] and x[2][4] < 1000))\
        .filter(lambda x: x[3] > 1)\
        .map(lambda x: (x[0], [[x[1]], x[2][4], [x[3], 0] if x[7]=='W' else [0, x[3]], x[6], 1, x[8], x[9], x[10], x[11], x[12], x[13]]))

    #mpgid_reg_distance.first()

    # group by mpgid
    # rdd format = (mpgid, [[reg-list],
    #                       avg_distance,
    #                       total_cap freeflow,
    #                       total_cap essl,
    #                       total num vips,
    #                       rg_count,
    #                       mpg-country,
    #                       mpg-load,
    #                       [mpg-asnum],
    #                       [mpg-nsip])
    mpgid_reglist_avgDistance_capacity_nReg = mpgid_reg_distance\
        .reduceByKey(lambda a, b: [a[0]+b[0], a[1]+b[1], [a[2][0]+b[2][0], a[2][1]+b[2][1]], a[3]+b[3], a[4]+b[4],
                                   a[5], a[6], a[7], a[8], a[9], a[10]])\
        .map(lambda x: (x[0], [sorted(x[1][0]), # region_list
                               round(x[1][1]/x[1][4], 2), # avg distance
                               round(x[1][2][0], 2), # total capacity - w
                               round(x[1][2][1], 2), # total capacity - s
                               x[1][3], # numvips
                               x[1][4], # total region count
                               x[1][5], # mpg country
                               x[1][6], # mpg load
                               x[1][7], # mpg asnum
                               x[1][8], # mpg nsip
                               x[1][9], # mpg lat
                               x[1][10]])) # mpg lon

    # disable the count
    #total_mpg_with_region = mpgid_reglist_avgDistance_capacity_nReg.count()

    # rdd format = (reg, [(reg-list), [[mpg-list], avg_distance, total_cap_w, total_cap_s, total_numvips
    #                           reg-count, cluster_country, mpg-load, mpg-count, mpg-lat, mpg-lon]])
    reg_reglist_mpgid_avgDistance_capacity_nReg_country = mpgid_reglist_avgDistance_capacity_nReg\
        .map(lambda x: (tuple(x[1][0]), [[x[0]], # mpgid list
                                          x[1][1], # avg_distance
                                          x[1][2], # region total capacity freeflow
                                          x[1][3], # region total capacity essl
                                          x[1][4], # total num vips
                                          x[1][5], # total region count
                                          [x[1][6]], # mpg country list
                                          x[1][7], # mpg load
                                          1, # mpg-count
                                          x[1][8] if x[1][8] else [], # [mpg-asnum]
                                          x[1][9] if x[1][9] else [], # [mpg-nsip]
                                          [x[1][10]], # [mpg-lat] # single element array
                                          [x[1][11]], # [mpg-lon] # single element array
                                          [x[1][7]] # [mpg-load] # single element array
                                         ]))\
        .reduceByKey(lambda a, b: [a[0]+b[0],
                                   a[1],
                                   a[2],
                                   a[3],
                                   a[4],
                                   a[5],
                                   a[6]+b[6],
                                   a[7]+b[7],
                                   a[8]+b[8],
                                   a[9]+b[9],
                                   a[10]+b[10],
                                   a[11]+b[11],
                                   a[12]+b[12],
                                   a[13]+b[13]])\
        .filter(lambda x: sum(x[1][13]) > 0.0001)\
        .map(lambda x: (x[0], [sorted(x[1][0]), # mpgid list
                               x[1][1], # avg_distance
                               x[1][2], # reg-cap-w
                               x[1][3], # reg-cap-s
                               x[1][4], # numvips
                               x[1][5], # reg-count
                               [str(y) for y in sorted(list(set(x[1][6])))], # mpg-country list
                               x[1][7], # mpg-load
                               x[1][8], # mpg-count
                               [str(y) for y in sorted(list(set(x[1][9])))], # [mpg-asnum]
                               [str(y) for y in sorted(list(set(x[1][10])))], # [mpg-nsip]
                               geo_centroid(x[1][11], x[1][12], x[1][13]) # [mpg: lat, lon, por, porsigma]
                               ]))\
        .map(lambda x: ([':'.join([str(y) for y in list(x[1][6])]), # [mpg-country list]
                        x[1][1], # avg_distance
                        x[1][2], # reg-cap-w
                        x[1][3], # reg-cap-s
                        x[1][4], # numvips
                        x[1][5], # reg-count
                        x[1][7], # mpg-load
                        x[1][8], # mpg-count
                        ':'.join([str(y) for y in x[0]]), # [region-list]
                        ':'.join([str(y) for y in list(x[1][0])]), # [mpg-list]
                        ':'.join([str(y) for y in x[1][9]]) if len(x[1][9])>0 else 'NULL', # [mpg-asnum]
                        ':'.join([str(y) for y in x[1][10]]) if len(x[1][10])>0 else 'NULL', # [mpg-nsip]
                        x[1][11] # [mpg-lat, mpg-lon, mpg-por, mpg-porsigma]
                        ],
                        region_public_list
                        ))\
        .flatMapValues(lambda x: x)\
        .map(lambda x: [x[1], x[0]])

    reglist_mpgid_avgDistance_capacity_nReg_country = reg_reglist_mpgid_avgDistance_capacity_nReg_country\
        .join(region_latlon)\
        .map(lambda x: [x[1][0]]+[x[1][1]]+[geodesic_distance(x[1][0][12][0],
                                                             x[1][0][12][1],
                                                             x[1][1][0],
                                                             x[1][1][1])] + [x[0]] if x[0] > 0\
             else [x[1][0]]+[x[1][1]]+[[x[1][0][12][0],
                                       x[1][0][12][1],
                                       x[1][1][0],
                                       x[1][1][1],
                                       0.0]] + [x[0]])\
        .filter(lambda x: x[2][4] < 500)\
        .map(lambda x: (tuple([x[0][0],
                              x[0][1],
                              x[0][2],
                              x[0][3],
                              x[0][4],
                              x[0][5],
                              x[0][6],
                              x[0][7],
                              x[0][8],
                              x[0][9],
                              x[0][10],
                              x[0][11],
                              x[0][12][0],
                              x[0][12][1],
                              x[0][12][2],
                              x[0][12][3]]), # mpg-information
                        [x[1][2], # pub.region.cap.ff
                         x[1][3], # pub.region.cap.essl
                         x[1][5], # pub.region.vip
                         [x[3]] # single element region id
                         ]))\
        .reduceByKey(lambda a, b: [a[0]+b[0], # sum( pub.region.cap.ff )
                                   a[1]+b[1], # sum( pub.region.cap.essl )
                                   a[2]+b[2], # sum( pub.region.cap.vip )
                                   a[3]+b[3] # [pub.regions]
                                   ])\
        .map(lambda x: [x[0][0], # [mpg-country-list]
                        x[0][1], # avg-distance
                        x[0][12], # mpg-lat
                        x[0][13], # mpg-lon
                        x[0][14], # mpg-por
                        x[0][15], # mpg-porsigma
                        x[0][2], # pri.region.cap.ff (gbps)
                        x[0][3], # pri.region.cap.essl (gbps)
                        x[0][4], # pri.vips
                        x[0][5], # pri.region.count
                        round(float(x[1][0])/1000.0, 3), # pub.region.cap.ff (gbps)
                        round(float(x[1][1])/1000.0, 3), # pub.region.cap.essl (gbps)
                        x[1][2], # pub.vips
                        len(x[1][3])-1, # pub.region.count
                        x[0][6], # mpg-load
                        round(x[0][7], 6), # mpg-count
                        x[0][8], # [pri reg-list]
                        ':'.join([str(y) for y in sorted(x[1][3])][1:]) if len(x[1][3])>1 else 'NULL', # [pub reg-list])
                        x[0][9], # [mpg-list]
                        x[0][10], # [mpg-assum]
                        x[0][11] # [mpg-nsip]
                        ])

    # data exporting to local
    country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist = pd.DataFrame(columns=['cl_geoname',
                                                                                           'cl_avgDistance',
                                                                                           'cl_lat',
                                                                                           'cl_lon',
                                                                                           'cl_por',
                                                                                           'cl_porsigma',
                                                                                           'pri_cap_ff_gbps',
                                                                                           'pri_cap_essl_gbps',
                                                                                           'pri_nvips',
                                                                                           'pri_nReg',
                                                                                           'pub_cap_ff_gbps',
                                                                                           'pub_cap_essl_gbps',
                                                                                           'pub_nvips',
                                                                                           'pub_nReg',
                                                                                           'cl_mpgLoad',
                                                                                           'cl_nMpg',
                                                                                           'pri_regList',
                                                                                           'pub_regList',
                                                                                           'mpgList',
                                                                                           'mpgASList',
                                                                                           'mpgNSIPList'])

    geo_cluster_full_info = reglist_mpgid_avgDistance_capacity_nReg_country.collect()

    logger.info('begin write to local disk.')
    for item in range(len(geo_cluster_full_info)):
        temp = geo_cluster_full_info[item]
        country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist.loc[item] = temp # the above should be temp[1][0] for the mpglist

    data_folder = '/home/testgrp/MRQOS/project_mpd_clustering/data'
    filename = 'geo_full_cluster_info.%s.%s.csv' % (day_idx, uuid_idx)
    fileDestination = os.path.join(data_folder, filename)
    country_avgDistance_capacity_nReg_mpgLoad_nMpg_reglist_mpglist.to_csv(fileDestination,
                                                                          sep=',', index=False, header=False)

    logger.info('begin to upload to hdfs.')
    tablename = 'mrqos.mpg_cluster'
    hdfs_d = os.path.join(config.hdfs_table,
                          'mpg_cluster',
                          'datestamp=%s' % day_idx,
                          'uuid=%s' % uuid_idx)
    partition = '''datestamp=%s, uuid='%s' ''' % (day_idx, uuid_idx)
    processed_filename = '.'.join(filename.split('.')[0:-1])+'.processed.csv'
    cmd_str = ''' cat %s | awk -F, '{n=split($21,a,":"); if(n>5){$21=a[1]":"a[2]":"a[3]":"a[4]":"a[5];} m=split($20,b,":"); if(m>5){$20=b[1]":"b[2]":"b[3]":"b[4]":"b[5];}print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12,$13,$14,$15,$16,$20,$21;}' > %s ''' % (os.path.join(data_folder, filename),
                                                                                                                                                                                                                                                              os.path.join(data_folder, processed_filename))
    sp.check_call(cmd_str, shell=True)
    try:
        beeline.upload_to_hive(fileDestination, hdfs_d, partition, tablename, logger)
        # os.remove(fileDestination)
    except sp.CalledProcessError as e:
        logger.error('upload to HDFS + update Hive table failed: %s' % e)
from pyspark.sql import HiveContext
from pyspark import SparkConf
from pyspark import SparkContext
from operator import add

conf = SparkConf().setAppName("EDF").setMaster("local")
sc = SparkContext(conf=conf)
hiveContext = HiveContext(sc)
hiveContext.sql("use db_edf")

df1 = hiveContext.sql("select site_id as id, timestp as tp, consumption as cp from consumption")
df2 = hiveContext.sql("select site_id as id, industry as indus from sites")

df3 = df1.join(df2, df1.id == df2.id).groupBy(["tp","indus"]).mean("cp")

df3.show()

df4 = df3.select(df3.tp, df3.indus, df3["AVG(cp)"].alias("mean_cp")).orderBy("tp")

df4.show()

df4.registerAsTable("tmp_table")
hiveContext.sql("create table final_cdc_mean_by_inustry_each_5min_with_date as select from_unixtime(tp) as date_time, tp, indus, mean_cp from tmp_table")
        print "Usage: spark-submit <Python Code File>"
        sys.exit(1)

    #App name which shows up in the Spark UI
    sc = SparkContext(appName='User Recommendation')


    #Context provides connection to Hive metastore
    sqlContext = HiveContext(sc)
    

    '''
    Pulling data out of Hive.  I created a replication of the 'watson_bisum_purchases' table locally to test.
    '''
    
    rdd = sqlContext.sql("SELECT person_id,deal_id,aasm_state FROM watson_bisum_purchases")


    '''
    Creating datasets.  Formatting the data and also creating sample datasets in order to create and test the model.
    '''
    
    #Formatting all the data using the 'parse_rating' method above
    all_data = rdd.map(parse_rating)
    rec_list = sc.parallelize(all_data.collect())
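    # Note: collect() followed by parallelize() round-trips every record through the
    # driver; all_data is already an RDD and could be used directly.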


    #Grabbing all Unique Users(used for building recommendation list)
    users = rdd.groupBy(lambda x: x.person_id).map(lambda x: x[0]).collect()

    #Grabbing all Unique Deals/Products(used for building recommendation list)
Exemplo n.º 54
0
# import dependencies
import pyspark
from pyspark.sql import HiveContext
import sys
import os

# zip rows(lists)
def parse_list(p):
    if p.Line!=None:
        return zip(p.Line,p.Latitude,p.Longitude,p.RecordedAtTime,p.vehicleID,p.Trip,p.TripDate,p.TripPattern,p.MonitoredCallRef,p.DistFromCall,p.CallDistAlongRoute,p.PresentableDistance)
    else:
        return []

if __name__=='__main__':
    sc = pyspark.SparkContext()
    sqlContext = HiveContext(sc)
    bus_file='BusTime/2015_*.jsons' # read multiple JSON files
    bus = sqlContext.read.json(bus_file)
    bus.registerTempTable("bus") # register as a table so it can be queried with SQL
    with open(sys.argv[-2]) as fr: # read the SQL query from a file
        query = fr.read()
    sqlContext.sql(query).flatMap(parse_list).map(lambda x: ",".join(map(str, x))).saveAsTextFile(sys.argv[-1])
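    # Presumably invoked as: spark-submit <this script> <query.sql> <output_dir>,
    # where sys.argv[-2] is the SQL file and sys.argv[-1] is the output directory.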
def main(args):
    """ Main code for relevance computation """
    
    start_time = time.time()
    
    # iq: the code snippets that set the database connection properties below have been
    # removed from the original; None placeholders keep this example syntactically valid.
    driver   = None
    url      = None
    username = None
    password = None
    inputs = [driver, url, username, password]

    
    filename = str(args[0])
    if not os.path.exists(filename):
        sys.exit("Input file %s not found" % filename)
    file = open(filename, 'r')
    for line in file:
        key, val = line.split(",")
        if str(key).strip() == "dbalias":
            dbalias = str(val).strip()
        elif str(key).strip() == "numpartitions":
            numpartitions = int(val)
        elif str(key).strip() == "datadir":
            datadir = str(val).strip()
        else:
            print("Invalid key not set: %s" % str(key))
    # Need to make sure that the datadir variable is set.
    try:
        print("datadir = '%s' " % datadir)
    except NameError:
        sys.exit("'datadir' variable not set. Check inputfile '%s'" 
                 % (datadir, filename))
            
    # Spark and Hive contexts
    conf = SparkConf()
    sc = SparkContext(conf = conf)
    sqlContext = HiveContext(sc)
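    # utils.returnSparkDF is not included in this snippet. Given the connection inputs
    # (driver, url, username, password) and a query name, a plausible JDBC-based sketch is
    # below; the function name and the query mapping are assumptions, not the real helper:
    def returnSparkDF_sketch(sql_ctx, inputs, query_name):
        queries = {"traffic": "(select * from traffic) t"}  # hypothetical pushed-down subquery
        driver, url, username, password = inputs
        try:
            return sql_ctx.read.format("jdbc").options(
                driver=driver, url=url, user=username, password=password,
                dbtable=queries[query_name]).load()
        except Exception:
            return None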

    
    df = utils.returnSparkDF(SQLContext(sc), inputs, "traffic")
    if df is None: sys.exit("'traffic' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "uniquedata")
    df = None
    
    df = utils.returnSparkDF(SQLContext(sc), inputs, "fbtraffic")
    if df is None: sys.exit("'fbtraffic' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "uniqueFBdata")
    df = None    

    statement = "Select ud.loginid, ud.adid, ud.Type, ufd.Type as FBType "\
                "from uniquedata ud left outer join uniqueFBdata ufd "\
                "on ud.loginid = ufd.loginid and ud.adid = ufd.adid"
    adswithFBjoined = sqlContext.sql(statement)
    adswithFBjoined_cleaned = adswithFBjoined[adswithFBjoined['FBType'].isNull()]
    adswithFBjoined_cleaned = adswithFBjoined_cleaned.drop('FBType')

    sqlContext.registerDataFrameAsTable(adswithFBjoined_cleaned, "data")

    statement = "Select loginid, count(loginid) as viewcount from data group by loginid"
    temp = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(temp, "viewdata")
    
    statement = "Select d.* from data d, viewdata vd where d.loginid = vd.loginid and vd.viewcount > 1"
    temp2 = sqlContext.sql(statement)
    
    sqlContext.sql("drop table data")
    sqlContext.registerDataFrameAsTable(temp2, "data")
        
    temp, temp2  = (None, None)

    df = utils.returnSparkDF(SQLContext(sc), inputs, "agent")
    if df is None: sys.exit("'agent' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "agentdata")

    statement = "select loginid, adid, Type, count(adid) as counter from agentdata group by loginid, adid, Type"
    unique_adid_per_loginid = sqlContext.sql(statement)
    unique_adid_per_loginid = unique_adid_per_loginid.drop('counter')
    sqlContext.registerDataFrameAsTable(unique_adid_per_loginid, "agentdata")
    
    df = utils.returnSparkDF(SQLContext(sc), inputs, "favorite")
    if df is None: sys.exit("'favorite' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "favdata")
    df = None
    
    statement = "select * from data union all select * from agentdata union all select * from favdata"
    df2 = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(df2, "uniondata")
    df2 = None
    
    statement = "select loginid, max(Type) as UserMaxConversion from uniondata group by loginid"
    maxtype = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(maxtype, "maxconversiondata")

    statement = "select uniondata.loginid, uniondata.adid, uniondata.Type "\
                "from uniondata, maxconversiondata where uniondata.loginid = maxconversiondata.loginid "\
                "and uniondata.Type = maxconversiondata.UserMaxConversion"
    data = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(data, "data")
       
    # Delete tables
    tables = ["uniquedata", "FBdata", "uniqueFBdata", "agentdata", 
              "favdata", "uniondata", "maxconversiondata"]
    for table in tables:
        sqlContext.sql("drop table if exists %s" % str(table))

    df = utils.returnSparkDF(SQLContext(sc), inputs, "adclassified")
    if df is None: sys.exit("'adclassified' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "addata")
    df = None
    
    df = utils.returnSparkDF(SQLContext(sc), inputs, "geo")
    if df is None: sys.exit("'geo' query failed: SystemExit.")
    sqlContext.registerDataFrameAsTable(df, "geodata")
    df = None
    
    statement = "select addata.adid, addata.AskingPrice, addata.CollectiveDebt, "\
                "addata.PageViewCount, geodata.Municipal, geodata.CityPart "\
                "from addata, geodata where addata.locationkey = geodata.locationkey"
    addata_for_join = sqlContext.sql(statement)

    statement = "select addata.adid, addata.AskingPrice, addata.CollectiveDebt, "\
                "addata.PageViewCount, geodata.Municipal, geodata.CityPart "\
                "from addata, geodata where addata.locationkey = geodata.locationkey"
    addata_for_join = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(addata_for_join, "adtemp")

    statement = "select * from adtemp where PageViewCount < 10000"
    addata_for_join = sqlContext.sql(statement)
    sqlContext.registerDataFrameAsTable(addata_for_join, "addata_for_join")                   
    
    data, addata_for_join = (None, None)
    sqlContext.sql("drop table if exists addata")

    statement = "select a.*, b.AskingPrice, b.CollectiveDebt, b.PageViewCount, b.Municipal, b.CityPart "\
                "from data a, addata_for_join b where a.adid = b.adid"
    data = sqlContext.sql(statement)
    data = data.fillna(0)
    data = data.repartition(numpartitions)

    
    # Save the files as csv using spark-csv from databricks
    try:
        st = time.time()
        data.write.format("com.databricks.spark.csv").save(datadir, mode="overwrite", codec="bzip2")
        et = time.time()
        print("File save time was: %.2f mins." % ((et-st)/60.))
    except:
        sys.exit("Could not save files to dir '%s'. \n\nError = %s" % (datadir, sys.exc_info()[1]))
    finally:            
        end_time = time.time()    
        print("Spark ETL execution time = %.2f mins." % ((end_time-start_time)/60.))
    
    
    # Stop spark and continue using in-memory computation (another script)
    sc.stop()
        
    return
Exemplo n.º 56
0
# Output of values: Row(_c0=u'1', _c1=u'2', _c2=u'3.0'); the data types are all inferred as
# one common type, i.e. the array's element types must be consistent, otherwise an exception
# can be raised.
"""

"""
source = sc.parallelize(['{"key" : [1, 2 , 3.0]}'])

jsonRDD = hc.jsonRDD(source)

jsonRDD.registerTempTable("temp_table")

values = hc.sql("select key[0], key[1], key[2] from temp_table").collect()

# Output of values: Row(_c0=1.0, _c1=2.0, _c2=3.0); the data types are all inferred as "float"
"""

source = sc.parallelize(
    ['{"key" : [{"key1" : "value1", "key2" : [1, 2, 3], "key3" : [{"key4" : "value4", "key5" : [4, 5.0, 6]}]}]}'])

jsonRDD = hc.jsonRDD(source)

jsonRDD.registerTempTable("temp_table")

values = hc.sql(
    "select key[0].key1, key[0].key2[0], key[0].key3[0].key4, key[0].key3[0].key5[1] from temp_table").collect()

sc.stop()

for value in values:
    print value
Exemplo n.º 57
0
from pyspark.mllib.evaluation import MulticlassMetrics

from copy import deepcopy

sc = SparkContext()
sqlContext = HiveContext(sc)
qry = """SELECT *,white/population as white_percent,
         black/population as black_percent,
         asian/population as asian_percent,
         pacific_islander/population as pi_percent,
         other_race/population as other_race_percent,
         multiple_race/population as multiple_percent,
         hispanic/population as hispanic_percent
         FROM census_rest_success where days_open > 365"""

df = sqlContext.sql(qry)

## Let's train a Support Vector Classifier on this data
#CITATION:
#http://stackoverflow.com/questions/33900726/count-number-of-non-nan-entries-in-each-column-of-spark-dataframe-with-pyspark
def count_not_null(c):
    return sum(col(c).isNotNull().cast("integer")).alias(c)

exprs = [count_not_null(c) for c in df.columns]
df.agg(*exprs).show()

df = df.dropna()

features = df.select(df['goodforkids'], df['goodforgroup'], df['goodfordessert'],
           df['goodforlatenight'], df['goodforlunch'], df['goodfordinner'],
           df['goodforbrunch'], df['goodforbreakfast'], df['romantic'],
def parse(line):
    matcher = pattern.match(line)

    if matcher:
        return matcher.groups()
    else:
        return None

columns = source.map(parse).filter(
    lambda columns: columns and len(columns) == 3)

rows = columns.map(
    lambda columns: (columns[0], columns[1], columns[2]))

schema = StructType([StructField("col1", StringType(), False), StructField(
    "col2", StringType(), False), StructField("col3", StringType(), False)])


table = hc.applySchema(rows, schema)

table.registerAsTable("temp_mytable")

datas = hc.sql("select * from temp_mytable").collect()

sc.stop()

if datas:
    for data in datas:
        print data
Exemplo n.º 59
0
        StructField("timestamp", TimestampType(), False),
        StructField("date", DateType(), False),
        StructField("array", ArrayType(IntegerType(), False), False),
        StructField("col_map", MapType(StringType(), StringType(), False), False),
        StructField(
            "struct",
            StructType(
                [
                    StructField("first", IntegerType(), False),
                    StructField("second", FloatType(), False),
                    StructField("third", StringType(), False),
                ]
            ),
            False,
        ),
    ]
)

table = hc.applySchema(source, schema)

table.registerAsTable("temp_table")

rows = hc.sql(
    "select byte, short, int, long, float, double, decimal, string, boolean, timestamp, date, array[0], array[1], array[2], col_map['key'], struct.first, struct.second, struct.third from temp_table"
).collect()

sc.stop()

for row in rows:
    print row
Exemplo n.º 60
0
from pyspark.sql import HiveContext
from pyspark.mllib.stat import Statistics
from pyspark import SparkContext

sc = SparkContext()

sqlContext = HiveContext(sc)

initialquery = sqlContext.sql("SELECT         A.avg_procedure_score,         B.patientsurveyscore FROM         (SELECT                 p.hospitalid,                 avg(p.score) as avg_procedure_score         FROM                 procedures p         GROUP BY                 p.hospitalid) A JOIN         survey_results B ON B.hospitalid = A.hospitalid")

survey_score = initialquery.map(lambda x: x.patientsurveyscore)
avg_procedure_scores = initialquery.map(lambda x: x.avg_procedure_score)

print Statistics.corr(avg_procedure_scores, survey_score, method="pearson")
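# Statistics.corr on two RDDs of doubles returns a single float: the Pearson correlation
# coefficient between average procedure score and patient survey score, in [-1, 1].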