Example #1
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext


def name_place(name, place, price, evaluation):
    # NOTE: only the final `return` of this UDF survived in the snippet; the
    # signature is taken from the SQL call below, and the branch condition is a
    # guess based on the good_list/general_list split further down.
    if evaluation == "good":
        return name + "," + "good"
    return name + "," + "general"


if __name__ == "__main__":
    conf = SparkConf().setMaster("local[2]").setAppName("sql_udf")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    json_path = os.path.abspath("../doc/book.json")

    # read the JSON file and register it as a temp table
    json_df = sqlContext.read.json(json_path)
    json_df.registerTempTable("json_book")

    # register the custom UDF
    sqlContext.registerFunction("name_place", name_place)

    evalRDD = sqlContext.sql("SELECT name_place(name, place, price, evaluation) AS book_eval FROM json_book")

    #bookMap = lengthRDD.map(lambda books: (books.name, books.author, books.price, books.publish, books.place))

    evalRDD.show()

    # map the query result
    bookMap = evalRDD.map(lambda books: (books.book_eval))

    general_list = []
    good_list = []

    for book in bookMap.collect():
        book = book.encode("utf-8").split(',')
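        # --- hedged continuation (not in the original snippet) ---
        # assuming name_place() appends either "good" or "general" after the
        # book name, the split pieces can be routed into the two lists:
        name, flag = book[0], book[-1]
        if flag == "good":
            good_list.append(name)
        else:
            general_list.append(name)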
Example #2

import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext


def analysis_email(email):
    """
    Split an email address and return its provider (the part after '@'
    and before the first '.').
    """
    return email.split("@")[1].split(".")[0]


if __name__ == "__main__":
    conf = SparkConf().setAppName("analysis_demo").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # register the custom UDF
    sqlContext.registerFunction("analysis_email", analysis_email)

    file_path = os.path.abspath("../doc/analysis.txt")
    lines = sc.textFile(file_path)

    info = lines.map(lambda lines: lines.split("----")). \
        map(lambda info: Row(email=info[0], username=info[1], realname=info[2],
                             idcard=info[3], password=info[4], phone=info[5]))

    schemaInfo = sqlContext.createDataFrame(info)
    schemaInfo.registerTempTable("information")
    # cache the table
    #sqlContext.cacheTable("information")
    #sqlContext.uncacheTable("information")
    """
    :email analysis and statistics
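    # --- hedged sketch (not in the original snippet) ---
    # the docstring above announces an email analysis/statistics step; with the
    # registered UDF it would typically look something like:
    #
    #   sqlContext.sql("SELECT analysis_email(email) AS provider, COUNT(*) AS cnt "
    #                  "FROM information GROUP BY analysis_email(email)").show()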
Example #3
    master = "spark://hadoop:7077"
    appName = "spark_loginflowlog"
    #input = "/impala/parquet/back/back-portal-loginflowlog/dat=%s*" % ym
    input = '/input/loginfowlog/*'

    spark_home = '/opt/cloud/spark'
    os.environ['SPARK_HOME'] = spark_home
    conf = (SparkConf()
            .setMaster(master)
            .setAppName(appName)
            .set("spark.sql.parquet.binaryAsString","true")
            )
    sc = SparkContext(conf = conf)
    sql_context = SQLContext(sc)
    sql_context.registerFunction("to_mac", lambda x: normal_mac(x), StringType())

    parquet_df = sql_context.read.parquet(input)
    sql_context.registerDataFrameAsTable(parquet_df, "loginflowlog")
    #_sql = "select to_mac(upper(usermac)),count(distinct dat) days from loginflowlog group by to_mac(upper(usermac))"
    _sql = "select to_mac(upper(usermac)),count(distinct logtime) days from loginflowlog group by to_mac(upper(usermac))"
    rs_df = sql_context.sql(_sql)
    rs = rs_df.collect()
    logger.info("---->" + str(len(rs)))

    lists = []
    for r in rs:
        usermac = r[0]
        days = r[1]
        t = (usermac,days)
        lists.append(t)
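Example #4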
    master = "local[*]"

    spark_home = '/opt/cloud/spark'
    os.environ['SPARK_HOME'] = spark_home

    # logFile = 'hdfs://master:8020/impala/parquet/back/back-portal-loginflowlog/dat=' + day
    logFile = "/input/loginfowlog/02*"
    conf = (SparkConf()
            .setMaster(master)
            .setAppName("loginflowlog2mysql")
            # .set("spark.kryoserializer.buffer.mb", "256")
            .set("spark.sql.parquet.binaryAsString", "true"))
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    sqlContext.registerFunction("to_datestr", lambda x: longTime2str(x), StringType())

    df = sqlContext.read.parquet(logFile)

    rdd = df.select('logintype', 'logtype', 'hosid', 'suppid', 'logtime', 'usermac')

    fields = [
        StructField('logintype', StringType(), True),
        StructField('logtype', StringType(), True),
        StructField('hosid', StringType(), True),
        StructField('suppid', StringType(), True),
        StructField('logtime', LongType(), True),
        StructField('usermac', StringType(), True)
    ]
    schema = StructType(fields)
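    # --- hedged continuation (not in the original snippet) ---
    # the snippet stops after building the schema; a typical next step is to
    # re-apply it and expose the result to SQL (the variable and table names
    # below are guesses):
    log_df = sqlContext.createDataFrame(rdd.rdd, schema)
    sqlContext.registerDataFrameAsTable(log_df, "loginflowlog")
    sqlContext.sql("select to_datestr(logtime) day, usermac "
                   "from loginflowlog").show()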
Example #5
                        p[5].strip(), p[6].strip(), p[7].strip(), p[8].strip(), p[9].strip(), \
                        p[10].strip(), p[11].strip(), p[12].strip(), p[13].strip(), p[14].strip(), \
                        p[15].strip(), p[16].strip(), gwid_hosid_dict.get(p[1].strip(), "")))
    logger.debug('-->users:' + str(users.count()))
    schema_string = "id gw_id supp_id user_id user_type " \
                    "user_name login_time logout_time mac ip " \
                    "user_agent download_flow upload_flow os browser " \
                    "ratio batch_no hos_id"

    fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(' ')]
    schema = StructType(fields)
    schema_users = sql_context.applySchema(users, schema)
    schema_users.registerTempTable("wxcity_userlogin_info")

    # register udf
    sql_context.registerFunction("get_date", lambda x: DateUtil.str_to_date(x).date(), DateType())
    sql_context.registerFunction("date_diff", lambda x, k: DateUtil.date_diff(x, k), IntegerType())
    sql_context.registerFunction("get_hour", lambda x: DateUtil.str_to_date(x).hour(), IntegerType())
    sql_context.registerFunction("to_int", lambda x: int(x), IntegerType())
    sql_context.registerFunction("timestamp_diff", lambda x, k: DateUtil.timestamp_diff(x, k), IntegerType())

    lines_list = UserLoginRepeatService().exec_file(sql_context, time_begin, time_end)

    # group by day,hosid,(mac),2, 5, 10, 30, 60
    #repeat_list = sc.textFile(ConfigSparkPath.userlogin_repeat_path % time_begin).map(lambda line:line.split('\t')).filter(lambda x:len(x)==8)
    repeat_list = sc.parallelize(lines_list).map(lambda line:line.split('\t'))
    schema_string = "day hos_id mac t2 t5 " \
                    "t10 t30 t60"
    fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(' ')]
    schema = StructType(fields)
    schema_repeat_list = sql_context.applySchema(repeat_list, schema)
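    # --- hedged continuation (not in the original snippet) ---
    # the repeat-login frame would normally be exposed to SQL as well; the table
    # name and query below are guesses that only reuse the UDFs registered above:
    schema_repeat_list.registerTempTable("userlogin_repeat")
    sql_context.sql("select day, hos_id, to_int(t2) t2, to_int(t5) t5, "
                    "to_int(t10) t10, to_int(t30) t30, to_int(t60) t60 "
                    "from userlogin_repeat").show()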
Example #6
    # -- set the processing date (yesterday)
    DAY_OFFSET = 1
    now = datetime.datetime.now()
    pro_time = now - datetime.timedelta(days=DAY_OFFSET)
    day = pro_time.strftime("%Y%m%d")

    master = "spark://hadoop:7077"
    appName = "spark_pageflow_outflow"
    input = "/impala/parquet/site/site-pageflowv1/dat=%s" % day

    spark_home = '/opt/cloud/spark'
    os.environ['SPARK_HOME'] = spark_home

    sc = SparkContext(master, appName)
    sql_context = SQLContext(sc)
    sql_context.registerFunction("to_day", lambda x: mill_date_str(x), StringType())
    sql_context.registerFunction("to_str", lambda x: bytearray_str(x), StringType())

    parquet_df = sql_context.read.parquet(input)
    sql_context.registerDataFrameAsTable(parquet_df, "site_pageflowv1")

    _sql = "select to_str(url),to_day(createtime) day,count(1) pv,count(distinct to_str(guuid)) uv " \
           "from site_pageflowv1 where dat= %s and to_str(name)='outflow' " \
           "group by to_str(url),to_day(createtime)" % day

    rs_df = sql_context.sql(_sql)
    rs = rs_df.collect()
    logger.info("---->" + str(len(rs)))

    list = []
    for r in rs:
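        # --- hedged continuation (not in the original snippet) ---
        # mirroring the collect loop in Example #3, each row is flattened into a
        # (url, day, pv, uv) tuple for later persistence:
        url = r[0]
        day_str = r[1]
        pv = r[2]
        uv = r[3]
        list.append((url, day_str, pv, uv))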
Example #7
import os

from pyspark import SparkConf, SparkContext
from pyspark.sql import Row, SQLContext


def analysis_email(email):
    """
    Split an email address and return its provider (the part after '@'
    and before the first '.').
    """
    return email.split("@")[1].split(".")[0]

if __name__ == "__main__":
    conf = SparkConf().setAppName("analysis_demo").setMaster("local[2]")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # register the custom UDF
    sqlContext.registerFunction("analysis_email", analysis_email)

    file_path = os.path.abspath("../doc/analysis.txt")
    lines = sc.textFile(file_path)

    info = lines.map(lambda lines: lines.split("----")). \
        map(lambda info: Row(email=info[0], username=info[1], realname=info[2],
                             idcard=info[3], password=info[4], phone=info[5]))

    schemaInfo = sqlContext.createDataFrame(info)
    schemaInfo.registerTempTable("information")
    # cache the table
    #sqlContext.cacheTable("information")
    #sqlContext.uncacheTable("information")

    """
Example #8
    _adLoadDF=sqlContext.createDataFrame([
        {'uid': '1', 'adid': 'a','guuid':'aa','guuidctime':1,'url':'','referer':'','hosid':'132','gwid':'','ua':'','ip':'','createtime':1450823568766},
        {'uid': '2', 'adid': 'b','guuid':'aa','guuidctime':1,'url':'','referer':'','hosid':'132','gwid':'','ua':'','ip':'','createtime':1450823569766},
        {'uid': '3', 'adid': 'c','guuid':'aa','guuidctime':1,'url':'','referer':'','hosid':'132','gwid':'','ua':'','ip':'','createtime':1450823550766},
        {'uid': '4', 'adid': 'd','guuid':'bb','guuidctime':1,'url':'','referer':'','hosid':'133','gwid':'','ua':'','ip':'','createtime':1450823268766},
    ]).registerAsTable("adload")
    _adPlayDF=sqlContext.createDataFrame([
        {'uid': '1', 'adid': 'a','guuid':'aa','createtime':1450823568766},
        {'uid': '2', 'adid': 'b','guuid':'aa','createtime':1450823569766},
        {'uid': '4', 'adid': 'd','guuid':'bb','createtime':1450823268766},
    ]).registerAsTable("adplay")
    _adClickDF =sqlContext.createDataFrame([
        {'uid': '1', 'adid': 'a','guuid':'aa','createtime':1450823580766},
    ]).registerAsTable("adclick")
    '''
    sqlContext.registerFunction("dateformat", lambda x:longTime2str(x),StringType())

    adLoadDf=sqlContext.sql('select hosid,dateformat(createtime) day,adid,count(guuid) pv,count(distinct guuid) uv '
                            'from adload where createtime is not null and dateformat(createtime)=%s '
                            'group by adid,hosid,dateformat(createtime)' % (lastdate)).registerAsTable("radload")

    adPlayDf=sqlContext.sql('select gh.hosid,dateformat(ap.createtime) day,adid,count(ap.guuid) pv,count(distinct ap.guuid) uv '
                            'from adplay ap left join ghid gh on ap.guuid=gh.guuid where dateformat(ap.createtime)=%s '
                            'group by ap.adid,gh.hosid,dateformat(ap.createtime)' % (lastdate)).registerAsTable("radplay")

    # sqlContext.sql('select sum(pv) from radplay').foreach(printx)
    adClick=sqlContext.sql('select gh.hosid,dateformat(ac.createtime) day,ac.adid,count(ac.guuid) pv,count(distinct ac.guuid) uv '
                            'from adclick ac left join ghid gh on ac.guuid=gh.guuid where dateformat(ac.createtime)=%s '
                            'group by ac.adid,gh.hosid,dateformat(ac.createtime)' % (lastdate)).registerAsTable("radclick")
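    # --- hedged sketch (not in the original snippet) ---
    # longTime2str() is used above but never shown; a plausible implementation
    # that turns a millisecond epoch into the day string compared against
    # lastdate could be (the "%Y%m%d" format is a guess):
    #
    #   import time
    #
    #   def longTime2str(millis):
    #       return time.strftime("%Y%m%d", time.localtime(millis / 1000))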

Example #9
# assumed setup (not shown in the snippet): the code below uses both a
# SparkSession (`spark`) and the legacy SQLContext wrapper (`sqlContext`)
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName("udf_examples").getOrCreate()
sqlContext = SQLContext(spark.sparkContext)

data = [[2, 3, 4], [1, 2, 3], [7, 6, 5]]
data_df = spark.createDataFrame(data,
                                list('abc'))  # create a DF with column names a, b, c
data_df2 = spark.createDataFrame(data)  # create a DF

data = [[2, 3, 4], [1, 2, 3], [7, 6, 5]]

sqlContext.registerDataFrameAsTable(data_df2,
                                    "test_table")  # register a Tmp Table
test_data = spark.sql('select * from test_table')
# sqlContext.dropTempTable("test_table")

sqlContext.udf.register("stringLengthInt", lambda x: len(str(x)),
                        IntegerType())  # register a Function for SQL
sqlContext.registerFunction("stringLengthInt", lambda x: len(str(x)),
                            IntegerType())
sqlContext.sql("SELECT stringLengthInt('test') as len").show()
sqlContext.sql("SELECT stringLengthInt(a) as len from test_table ").show()

df_as1 = data_df.alias("df_as1")  # alias
df_as2 = data_df.alias("df_as2")
joined_df = df_as1.join(df_as2,
                        col("df_as1.a") == col("df_as2.a"), 'inner')  # 保留了全部列名
joined_df.select("df_as1.a", "df_as2.a", "df_as2.b", "df_as2.c").show()

print(data_df.columns)

# ---------------------------------------------------------------------------------
data1 = [[2, u'Alice'], [5, u'Bob']]
data2 = [[u'Tom', 80], [u'Bob', 85]]
data3 = [[2, 2, u'Alice'], [5, 5, u'Bob'], [5, 53, u'Bob'], [7, 1, u'Alice']]
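# --- hedged continuation (not in the original snippet) ---
# data1/data2 look like the usual join fixtures from the PySpark docs; the
# column names below are guesses, added so the lists can actually be joined:
df1 = spark.createDataFrame(data1, ['age', 'name'])
df2 = spark.createDataFrame(data2, ['name', 'height'])
df1.join(df2, 'name', 'inner').show()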
Example #10
    sqlContext = SQLContext(sc)

    # path to hillary/enron avro
    enr = sqlContext.read.format(
        "com.databricks.spark.avro").load(
            "s3n://datasets-396316040607/enron_data/*.avro").repartition(16)
    hil = sqlContext.read.format(
        "com.databricks.spark.avro").load(
            "s3n://datasets-396316040607/hillary/*.avro").repartition(16)

    # register tables
    sqlContext.registerDataFrameAsTable(hil, "hillary")
    sqlContext.registerDataFrameAsTable(enr, "enron")

    # register udf
    sqlContext.registerFunction(
        "getCos", lambda x, y: get_cosine(text_to_vector(x), text_to_vector(y))
    )

    # do the cosine similarity on the text, get the top 1000 matches
    out = sqlContext.sql("SELECT h.author h_auth, e.author e_auth, "
                         "e.contents e_mail, h.contents h_mail, "
                         "getCos(e.contents, h.contents) as cos_sim "
                         "from hillary as h join enron as e order by cos_sim "
                         "desc limit 1000")

    # write back out to s3
    out.save("s3n://datasets-396316040607/cos_sim/", format="json")
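    # --- hedged footnote (not in the original snippets) ---
    # the examples above mostly use the Spark 1.x SQLContext API; on Spark 2.x+
    # the same UDF registration is normally written against a SparkSession:
    #
    #   from pyspark.sql import SparkSession
    #   from pyspark.sql.types import IntegerType
    #
    #   spark = SparkSession.builder.appName("udf_demo").getOrCreate()
    #   spark.udf.register("strLen", lambda x: len(str(x)), IntegerType())
    #   spark.sql("SELECT strLen('test') AS len").show()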