Example #1
import uuid

import pandas as pd
from pyspark.sql import HiveContext
# NOTE: DataFrame below refers to the project's own wrapper class (taking
# ctx, table_name, data, columns and dtype), not pyspark.sql.DataFrame.


def read_csv(sc, file_name, sep=",", storage="hive://", header=True,
             names=None, table_name=None, infer_limit=10000):
    table_name = table_name if table_name is not None else "df" + str(uuid.uuid4())
    hc = HiveContext(sc)
    df = pd.read_csv(file_name, sep=sep, nrows=infer_limit)
    names = df.columns if not names else names
    types = []
    for i in range(len(names)):
        tp = names[i] + " "
        if df.dtypes[i] == "O":
            tp += "STRING"
        elif df.dtypes[i] == "int64":
            tp += "INT"
        else:
            tp += "DOUBLE"
        types.append(tp)
    hc.sql('drop table if exists %s' %table_name)
    qw = """CREATE TABLE IF NOT EXISTS %s (%s) row format delimited fields terminated by '%s'
LINES TERMINATED BY '\n'""" %(table_name, ','.join(types), sep)
    if header:
        qw += " tblproperties ('skip.header.line.count'='1')"
    hc.sql(qw)
    hc.sql("LOAD DATA LOCAL INPATH '%s' OVERWRITE INTO TABLE %s" %(file_name, table_name))
    rdd = hc.sql("SELECT * FROM %s" %table_name)
    ctx = hc
    if storage.startswith("parquet://"):
        path = storage.replace("parquet://", "")
        rdd.saveAsParquetFile("%s/%s" %(path, table_name))
        sq = HiveContext(sc)
        rdd = sq.parquetFile("%s/%s" %(path, table_name))
        rdd.registerTempTable(table_name)
        rdd = sq.sql("select * from %s" %table_name)
        ctx = sq
    return DataFrame(ctx, table_name, data=rdd, columns=names, dtype=types)
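
A minimal usage sketch for the helper above; the SparkContext setup, app name and CSV path are illustrative assumptions, not part of the original.

from pyspark import SparkContext

sc = SparkContext(appName="read_csv_demo")
# Load a local CSV into a Hive-backed table, inferring column types from the
# first 10000 rows; returns the project's DataFrame wrapper.
events = read_csv(sc, "/tmp/events.csv", sep=",", header=True)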
Example #2
# Assumes a HiveContext named hiveCtx and a registered "tweets" table.
topTweets = hiveCtx.sql("""SELECT text, retweetCount FROM
tweets ORDER BY retweetCount LIMIT 10""")


# Access the text column of the topTweets SchemaRDD in Python
topTweetText = topTweets.map(lambda row: row.text)

# Reading from Hive with Python
from pyspark.sql import HiveContext

hiveCtx = HiveContext(sc)
rows = hiveCtx.sql("SELECT key,value FROM mytable")
keys = rows.map(lambda row: row[0])

# Reading Parquet data in Python
# Read data from a Parquet file that has name and favouriteAnimal fields
# (parquetFile holds the input path)
rows = hiveCtx.parquetFile(parquetFile)
names = rows.map(lambda row:row.name)
print "Everyone"
print names.collect()

# Querying Parquet data in Python
# Find the panda lovers
tbl = rows.registerTempTable("people")
pandaFriends = hiveCtx.sql("SELECT name FROM people WHERE favouriteAnimal = \"panda\"")
print "Panda friends"
print pandaFriends.map(lambda row: row.name).collect()

# Saving a Parquet file
pandaFriends.saveAsParquetFile("hdfs://...")
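
On newer Spark releases (1.4+), where the query result is a DataFrame, the same save can also be expressed through the writer API; a hedged equivalent of the line above:

# Same save via the DataFrame writer API; the HDFS path is the same
# placeholder used above.
pandaFriends.write.parquet("hdfs://...")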

Example #3
    print "Starting.", datetime.now()
    sconf = SparkConf().set("spark.buffer.pageSize", 1024*1024).setAppName("FanDuelGame")
    sc = SparkContext(conf=sconf)
    sqlContext = HiveContext(sc)

    rddDir = CreateStatsRDD.rddDir
    (filename, dataDirectory, gameDescription, actualModel) = getCommandLine()
    print "start: ", datetime.now()
    game = FanDuelGame(sqlContext, filename, dataDirectory, gameDescription)
    eligiblePlayers = game.getEligiblePlayers()
    print "eligiblePlayers=", eligiblePlayers

    print "gameDate=", game.gameDate

    # get MLB.com players
    gamePlayers = sqlContext.parquetFile(rddDir + "/" + "game_players.parquet")
    gamePlayers.registerTempTable("game_players")
    gamePlayers.cache()

    ldf = sqlContext.sql("select distinct lookup_name, player_id from game_players where '" + str(game.gameDate) + "' >= effective_start_dt and '" + str(game.gameDate) + "' < effective_stop_dt").collect()
    print "ldf=", ldf
    pids = {}
    for row in ldf:
        x = row.asDict()
        pids[x['lookup_name'].upper()] = x['player_id']
    
    print "pids=", pids
    with open(rddDir + "/batting_encoded.json", 'r') as f:
        encoded = json.load(f)
    encodedPlayerIds = encoded['player_id']
    decodedHitterPlayerIds = dict(zip(encodedPlayerIds.values(), encodedPlayerIds.keys()))
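
The last line above inverts the encoded player-id mapping; a tiny standalone illustration of the same dict(zip(...)) idiom, with made-up values:

encoded_ids = {'player_a': 0, 'player_b': 1}   # hypothetical encoding
decoded_ids = dict(zip(encoded_ids.values(), encoded_ids.keys()))
# decoded_ids == {0: 'player_a', 1: 'player_b'}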
Example #4
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType

if __name__ == "__main__":
    sc = SparkContext(appName="SparkSQL:[com.mvad.spark.demo][pysparkdemo]")
    sqlContext = HiveContext(sc)

    # Load the session log from a Parquet directory as a DataFrame
    df = sqlContext.parquetFile(
        "/mvad/warehouse/session/dspan/date=2015-05-01/")
    df.registerTempTable("sessionlog")
    for table in sqlContext.tableNames():
        print table
    df.printSchema()

    sqlContext.udf.register("intarr2str",
                            lambda array: "".join(map(str, array)))
    sql1 = """ select intarr2str(cookie) as cookiestr,eventTime,eventType,geoInfo.country as country,
      geoInfo.province as province from sessionlog limit 10 """.replace(
        '\n', ' ')
    sample = sqlContext.sql(sql1)
    sample.show()

    sql2 = """select eventType, count(cookie) as count from sessionlog
      group by eventType """.replace('\n', ' ')
    result = sqlContext.sql(sql2)
    result.cache()

    # only show 20 records
    result.show()
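
For reference, a plain-Python sketch of what the registered intarr2str UDF computes; the sample cookie values are made up.

cookie = [1, 9, 2, 0, 3]           # hypothetical int-array cookie
print "".join(map(str, cookie))    # prints "19203"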
appName = "NetflowReplication:QA"
conf = SparkConf().setAppName(appName)

conf.setExecutorEnv(
    'PYTHONPATH',
    '/opt/spark/python:/opt/spark/python/lib/py4j-0.8.2.1-src.zip')

conf.set("spark.driver.maxResultSize", "2g")
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

if len(sys.argv) < 4:
    print "Usage: /opt/spark/bin/spark-submit " + sys.argv[
        0] + " <netflow input path> <file with list of IP addresses to filter> <output filtered netflow text directory>"
    sys.exit()
path = sys.argv[1]
input_ip = sys.argv[2]
output = sys.argv[3]

ip_list = []
for line in open(input_ip):
    line = line.strip('\n')
    # Convert the dotted-quad address to its 32-bit integer form so it can be
    # compared against the integer IPV4_SRC_ADDR column.
    line = sum(
        [int(i) * 2**(8 * j) for i, j in zip(line.split('.'), [3, 2, 1, 0])])
    ip_list.append(line)
print ip_list
df = sqlContext.parquetFile(path)
df.count()
df_filtered = df.where(col("IPV4_SRC_ADDR").isin(ip_list))
df_filtered.rdd.map(lambda row: [str(c) for c in row]).saveAsTextFile(output)
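
The dotted-quad conversion in the loop above can be factored into a small helper; a sketch that assumes IPV4_SRC_ADDR is stored as an unsigned 32-bit integer:

def ip_to_int(dotted):
    # "10.0.0.1" -> 167772161; same arithmetic as the loop above.
    a, b, c, d = (int(octet) for octet in dotted.strip().split('.'))
    return (a << 24) | (b << 16) | (c << 8) | d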
Example #6

# -*- coding: utf-8 -*-

from __future__ import absolute_import, print_function, division, unicode_literals

import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, IntegerType

if __name__ == '__main__':
    conf = SparkConf().setAppName('Restaurants Parquet')
    sc = SparkContext(conf=conf)
    hive_ctx = HiveContext(sc)

    inputs = hive_ctx.parquetFile(sys.argv[1])
    inputs.registerTempTable('restaurants')

    hive_ctx.registerFunction("LEN", lambda s: len(s), IntegerType())

    print('### Schema ###')
    inputs.printSchema()
    print()

    print('### Restaurants in Tokyo ###')
    restaurants_in_tokyo = hive_ctx.sql("""
        SELECT
            r.id,
            r.alphabet
        FROM
            restaurants r
        WHERE
Example #7
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

sc = SparkContext('yarn-cluster')
hc = HiveContext(sc)
parquetFile = hc.parquetFile("/czdataset/weather/weather_data/000000_0")
parquetFile.registerTempTable("weatherStation")
stations = hc.sql("SELECT wban_number, yearmonthday, cz_year, cz_month, dayofmonth FROM weatherStation")
stations.write.parquet("/user/sachin/output/hive-spark")
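
A quick sanity check that the write above produced readable output; assumes Spark 1.4+, where the DataFrameReader API coexists with the legacy parquetFile call used above.

# Read the freshly written Parquet data back and count the rows.
written = hc.read.parquet("/user/sachin/output/hive-spark")
print written.count()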
Example #8

from functools import reduce

from pyspark.sql import DataFrame
# Assumes sc and sqlContext (a HiveContext) have already been created, as in
# the examples above.

path = '/data/collector/xyz/2016/01/01/00/'
binInterval = 5  # minutes per input bin
startEpoch = 1481220000
sourceEpoch = 1451606400
outpath = '/data/replicated_data/'
no_of_hours = 3


def unionAll(dfs):
    return reduce(DataFrame.unionAll, dfs)


dfs_per_bin = []
for i in range(0, 60, binInterval):
    i = "{:0>2}".format(i)
    df = sqlContext.parquetFile(path + str(i) + '/*')
    dfs_per_bin.append(df)

while no_of_hours > 0:
    OffSet = int(startEpoch) - sourceEpoch
    count = 0
    for i in dfs_per_bin:
        count = "{:0>2}".format(count)
        df = i.withColumn('FIRST_SWITCHED', i.FIRST_SWITCHED + OffSet)
        dfs = [[df]]
        dfs = [y for x in dfs for y in x]
        df_final = unionAll(dfs)
        df_final.coalesce(8).write.parquet(outpath + str(startEpoch) + '/' +
                                           str(count))
        count = int(count) + binInterval
    startEpoch = int(startEpoch) + 3600
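
Spelled out, the offset arithmetic driving the loop above, using the constants already defined in this snippet:

# Flows recorded relative to sourceEpoch (2016-01-01 00:00 UTC) are shifted
# forward so each replicated copy lands in a later target hour:
#   hour 1: OffSet = 1481220000 - 1451606400          = 29613600 s
#   hour 2: OffSet = (1481220000 + 3600) - 1451606400 = 29617200 s
#   hour 3: OffSet = (1481220000 + 7200) - 1451606400 = 29620800 s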
Example #9
# Assumes sc, a HiveContext named hiveCtx, and a temp table "rows" registered
# in earlier steps.
result = hiveCtx.sql("select * from rows")
result.first()
result_data = result.map(lambda x: x.data)  # get the data field
result_data.collect()
result.printSchema()  # print the schema

# Cache the table
hiveCtx.cacheTable('rows')

# Read data from a Hive database
score_data = hiveCtx.sql('select name,score from testdb.score')
score = score_data.map(lambda x: x[1])
score.collect()

# Read a Parquet file
parquet_data = hiveCtx.parquetFile('hdfs://192.168.0.104:9000/users')
parquet_data.first()
gender = parquet_data.map(lambda x: x.gender)
gender.collect()
parquet_data.registerTempTable('users')
male_data = hiveCtx.sql("select * from users where gender='male'")
male_data.collect()

# Convert an RDD into a SchemaRDD (Row comes from pyspark.sql)
happyPeopleRDD = sc.parallelize([Row(name='lin', age=25)])
happyPeopleSchemaRDD = hiveCtx.inferSchema(happyPeopleRDD)
happyPeopleSchemaRDD.registerTempTable('happyPeople')
result = hiveCtx.sql('select name from happyPeople')
result.collect()

# User-defined functions
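
The listing breaks off at the user-defined function heading; a minimal sketch of how a UDF is registered and called through this API. The function name strLen is illustrative, and on Spark 1.3+ IntegerType is imported from pyspark.sql.types.

from pyspark.sql.types import IntegerType

# Register a Python lambda as a SQL function and call it in a query against
# the happyPeople table registered above.
hiveCtx.registerFunction('strLen', lambda s: len(s), IntegerType())
name_lengths = hiveCtx.sql("select strLen(name) from happyPeople")
name_lengths.collect()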
Example #10
from pyspark import SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.types import Row, StructField, StructType, StringType, IntegerType


if __name__ == "__main__":
    sc = SparkContext(appName="SparkSQL:[demo][pysparkdemo]")
    sqlContext = HiveContext(sc)

    # Load the session log from a Parquet directory as a DataFrame
    df = sqlContext.parquetFile("/mvad/warehouse/session/dspan/date=2015-05-01/")
    df.registerTempTable("sessionlog")
    for table in sqlContext.tableNames():
        print table
    df.printSchema()

    sqlContext.udf.register("intarr2str",lambda array:"".join(map(str,array)) )
    sql1 = """ select intarr2str(cookie) as cookiestr,eventTime,eventType,geoInfo.country as country,
      geoInfo.province as province from sessionlog limit 10 """.replace('\n',' ')
    sample = sqlContext.sql(sql1)
    sample.show()


    sql2 = """select eventType, count(cookie) as count from sessionlog
      group by eventType """.replace('\n',' ')
    result = sqlContext.sql(sql2)
    result.cache()

    # only show 20 records
    result.show()
    result.show(100)
Example #11

import sys

from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql.functions import col

appName = "ParquetPyspark::Filter"
conf = SparkConf().setAppName(appName)

conf.setExecutorEnv(
    'PYTHONPATH',
    '/opt/spark/python:/opt/spark/python/lib/py4j-0.8.2.1-src.zip')

conf.set("spark.driver.maxResultSize", "2g")
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)

if len(sys.argv) < 4:
    print "Usage: /opt/spark/bin/spark-submit --master yarn  --deploy-mode client   --executor-memory 2G --num-executors 2 --total-executor-cores 2 " + sys.argv[
        0] + " <input parquet files directory> <file with list of IP addresses to filter> <output filtered text directory>"
    sys.exit()
input_path = sys.argv[1]
input_ip = sys.argv[2]
output_path = sys.argv[3]

ip_list = []
for line in open(input_ip):
    ip_list.append(line.strip('\n'))
df = sqlContext.parquetFile(input_path)
df_filtered = df.where(col("IPV4_SRC_ADDR").isin(ip_list))
df_filtered.rdd.map(lambda row: [str(field) for field in row]).saveAsTextFile(
    output_path)
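
One caveat about the final map above: emitting a Python list per row makes saveAsTextFile write list literals. If plain delimited text is the goal, joining the fields first is a small change (a sketch, not part of the original):

df_filtered.rdd.map(lambda row: ",".join(str(field) for field in row)) \
    .saveAsTextFile(output_path)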