gwid_hosid_dict[gw_id] = hos_id
logger.debug('-->gwid_hosid:' + str(len(gwid_hosid_dict)))

# parse user login records (17 fields) and append the hos_id looked up by gw_id
users = lines.map(lambda x: x[1].split(',')).filter(lambda x: len(x) == 17) \
    .map(lambda p: (p[0].strip(), p[1].strip(), p[2].strip(), p[3].strip(), p[4].strip(),
                    p[5].strip(), p[6].strip(), p[7].strip(), p[8].strip(), p[9].strip(),
                    p[10].strip(), p[11].strip(), p[12].strip(), p[13].strip(), p[14].strip(),
                    p[15].strip(), p[16].strip(), gwid_hosid_dict.get(p[1].strip(), "")))
logger.debug('-->users:' + str(users.count()))

schema_string = "id gw_id supp_id user_id user_type " \
                "user_name login_time logout_time mac ip " \
                "user_agent download_flow upload_flow os browser " \
                "ratio batch_no hos_id"
fields = [StructField(field_name, StringType(), True) for field_name in schema_string.split(' ')]
schema = StructType(fields)
schema_users = sql_context.applySchema(users, schema)
schema_users.registerTempTable("wxcity_userlogin_info")

# register UDFs
sql_context.registerFunction("get_date", lambda x: DateUtil.str_to_date(x).date(), DateType())
sql_context.registerFunction("date_diff", lambda x, k: DateUtil.date_diff(x, k), IntegerType())
# datetime.hour is a property, not a method (assuming DateUtil.str_to_date returns a datetime)
sql_context.registerFunction("get_hour", lambda x: DateUtil.str_to_date(x).hour, IntegerType())
sql_context.registerFunction("to_int", lambda x: int(x), IntegerType())
sql_context.registerFunction("timestamp_diff", lambda x, k: DateUtil.timestamp_diff(x, k), IntegerType())

lines_list = UserLoginRepeatService().exec_file(sql_context, time_begin, time_end)

# grouped by day, hos_id, (mac) with repeat windows 2, 5, 10, 30, 60
# repeat_list = sc.textFile(ConfigSparkPath.userlogin_repeat_path % time_begin) \
#     .map(lambda line: line.split('\t')).filter(lambda x: len(x) == 8)
repeat_list = sc.parallelize(lines_list).map(lambda line: line.split('\t'))
# trailing field names inferred from the len == 8 filter and the window comment above
schema_string = "day hos_id mac t2 t5 " \
                "t10 t30 t60"
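# Illustrative only (an assumption, not in the original script): the UDFs
# registered above can be used directly in SQL against wxcity_userlogin_info,
# whose column names come from the first schema_string, e.g.:
#   sample = sql_context.sql(
#       "SELECT user_id, hos_id, get_hour(login_time) AS login_hour "
#       "FROM wxcity_userlogin_info WHERE to_int(download_flow) > 0")
#   logger.debug('-->sample:' + str(sample.count()))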
# hosid_count.foreach(print_str)
user_top_hosid = hosid_count.groupByKey().mapValues(list).sortByKey() \
    .map(topcount)
# e.g. (u'00:66:4B:9B:0F:C9', u'')
# user_top_hosid.foreach(print_str)

# user, days, count
days_count = parts.map(convert_day).groupByKey().mapValues(set).map(compute_mark)
# e.g. (u'95:15:DF:EE:41:E9', u'\u5b558\u6708|\u5b55\u524d\u671f', u'2014-04-06')
# (the middle field reads "month 8 of pregnancy | early pregnancy")
# days_count.foreach(print_str)

# join: mac, mark, hosid
mac_mark_hosid = days_count.join(user_top_hosid).map(generate_ret)
# mac_mark_hosid.take(10)
# mac_mark_hosid.foreach(print_str)

fields = [
    StructField('user', StringType(), True),
    StructField('stage', StringType(), True),
    StructField('conceive', StringType(), True),
    StructField('area', StringType(), True)
]
schema = StructType(fields)
dest = sqlContext.applySchema(mac_mark_hosid, schema)
dest.registerTempTable("user_profile")

# combine partitions before writing out
dest.coalesce(10).write.parquet(output, 'overwrite')
sc.stop()
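# Illustrative read-back (an assumption, not part of the original job): the
# parquet written above can be sanity-checked from a fresh session, e.g.:
#   sc = SparkContext(conf=SparkConf().setAppName("user_profile check"))
#   sqlContext = SQLContext(sc)
#   sqlContext.read.parquet(output).show(10)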
kpi_min, kpi_max, input_l3, input_l5, output_l5, start_date, end_date = \
    args.kpi_min, args.kpi_max, args.input_l3, args.input_l5, \
    args.output_l5, args.start_date, args.end_date

# validate all required arguments (input_l3 and output_l5 were missing from the original check
# even though both are used by this job)
if kpi_min and kpi_max and input_l3 and input_l5 and output_l5 and start_date and end_date:
    conf = SparkConf().setAppName("SparkSQL Evaluation Level5")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    l3 = sc.textFile(input_l3).coalesce(4)
    d3 = l3.map(lambda z: z.replace('"', '')) \
           .map(lambda z: z.split(',')) \
           .map(lambda p: (str(p[0]), float(p[1]), float(p[2])))
    field3 = [
        StructField("CONSUMER_ID", StringType(), False),
        StructField("KPI_1", FloatType(), True),
        StructField("KPI_2", FloatType(), True)
    ]
    s3 = StructType(field3)
    schema3 = sqlContext.applySchema(d3, s3)
    schema3.registerTempTable("Level3")

    l5 = sc.textFile(input_l5).coalesce(24)
    d5 = l5.map(lambda z: z.replace('"', '')) \
           .map(lambda z: z.split(',')) \
           .map(lambda p: (p[0], p[1],
                           datetime.strptime(p[2], "%Y-%m-%d %H:%M:%S").date(),
                           int(p[3])))
    field5 = [
        StructField("CONSUMER_ID", StringType(), False),
        StructField("CAMPAIGN_NAME", StringType(), True),
        StructField("EVENT_DATE", DateType(), True),
        StructField("EVENT_TYPE_ID", IntegerType(), True)
    ]
    s5 = StructType(field5)
    schema5 = sqlContext.applySchema(d5, s5)
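    # The fragment is truncated here; a plausible continuation (an assumption,
    # not shown in the original) registers Level5 and joins it to Level3 on
    # CONSUMER_ID, e.g.:
    #   schema5.registerTempTable("Level5")
    #   joined = sqlContext.sql(
    #       "SELECT Level5.CONSUMER_ID, Level5.CAMPAIGN_NAME, Level3.KPI_1 "
    #       "FROM Level5 JOIN Level3 ON Level5.CONSUMER_ID = Level3.CONSUMER_ID")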
args = parser.parse_args()
kpi_min, kpi_max, input_l3, output_l3, start_date, end_date = \
    args.kpi_min, args.kpi_max, args.input_l3, args.output_l3, \
    args.start_date, args.end_date

if kpi_min and kpi_max and input_l3 and output_l3 and start_date and end_date:
    conf = SparkConf().setAppName("SparkSQL Evaluation Level3")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    l3 = sc.textFile(input_l3).coalesce(4)
    d3 = l3.map(lambda z: z.replace('"', '')) \
           .map(lambda z: z.split(',')) \
           .map(lambda p: (str(p[0]), float(p[1]), float(p[2])))
    field3 = [
        StructField("CONSUMER_ID", StringType(), False),
        StructField("KPI_1", FloatType(), True),
        StructField("KPI_2", FloatType(), True)
    ]
    s3 = StructType(field3)
    schema3 = sqlContext.applySchema(d3, s3)
    schema3.registerTempTable("Level3")
    sqlContext.cacheTable("Level3")

    # bucket KPI_1 into the [kpi_min, kpi_max] slab, then aggregate per slab
    query3 = sqlContext.sql(
        "SELECT SEGMENT_ID, SLAB, COUNT(*), SUM(KPI_1), MIN(KPI_1), "
        "MAX(KPI_1), AVG(KPI_1) FROM "
        "(SELECT KPI_1, "
        "CASE WHEN KPI_1 >= " + str(kpi_min) + " AND KPI_1 <= " + str(kpi_max) +
        " THEN 1 ELSE 0 END SEGMENT_ID, "
        "CASE WHEN KPI_1 >= " + str(kpi_min) + " AND KPI_1 <= " + str(kpi_max) +
        " THEN '" + str(kpi_min) + " - " + str(kpi_max) + "' ELSE NULL END SLAB "
        "FROM Level3) DUMP "
        "WHERE SLAB IS NOT NULL "
        "GROUP BY SLAB, SEGMENT_ID "
        "ORDER BY SEGMENT_ID")
    query3.coalesce(1).saveAsTextFile(output_l3)
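    # Readability alternative (an assumption, behavior-equivalent rewrite, not
    # the original code): the concatenated SQL above can be built from a single
    # template instead, e.g.:
    #   in_range = "KPI_1 >= {0} AND KPI_1 <= {1}".format(kpi_min, kpi_max)
    #   slab = "{0} - {1}".format(kpi_min, kpi_max)
    #   query3 = sqlContext.sql(
    #       "SELECT SEGMENT_ID, SLAB, COUNT(*), SUM(KPI_1), MIN(KPI_1), "
    #       "MAX(KPI_1), AVG(KPI_1) FROM "
    #       "(SELECT KPI_1, CASE WHEN {0} THEN 1 ELSE 0 END SEGMENT_ID, "
    #       "CASE WHEN {0} THEN '{1}' ELSE NULL END SLAB FROM Level3) DUMP "
    #       "WHERE SLAB IS NOT NULL GROUP BY SLAB, SEGMENT_ID "
    #       "ORDER BY SEGMENT_ID".format(in_range, slab))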