Python SQLContext.range示例

def _spark_type_for(sql_type):
    """Map a SQL column type string to a Spark type class and a base name.

    Args:
        sql_type: Column type as written in the config, e.g. ``VARCHAR(10)``.
            One pair of surrounding literal double quotes is stripped first.
            Matching is case-insensitive and prefix-based.

    Returns:
        A ``(type_class, base_name)`` tuple. ``type_class`` is the (not yet
        instantiated) Spark type used as the UDF return type; ``base_name``
        is one of ``"String"``, ``"Float"``, ``"Integer"`` or ``"Date"``.
        Unrecognised types fall back to ``(StringType, "String")``.
    """
    # Config values may carry literal surrounding double quotes; drop them.
    if sql_type.startswith("\"") and sql_type.endswith("\""):
        sql_type = sql_type[1:-1]
    upper_type = sql_type.upper()

    if upper_type.startswith(("VARCHAR", "CHAR")):
        return StringType, "String"
    if upper_type.startswith("DECIMAL"):
        return FloatType, "Float"
    if upper_type.startswith("SMALLINT"):
        return IntegerType, "Integer"
    if upper_type.startswith("DATE"):
        # Must be DateType: DataType is the abstract base class of all Spark
        # SQL types and is not usable as a UDF return type.
        from pyspark.sql.types import DateType
        return DateType, "Date"
    return StringType, "String"


def run(t1, context_string, configPath='./jsonFormat/ACTCLR.json'):
    """Build generated DataFrames from a JSON table description.

    For every entry under ``"tables"`` in the config, a DataFrame with
    ``property.lines`` rows is created via ``range`` and one column per entry
    of ``"field"`` is appended. Each column is produced by a UDF applied to
    the ``id`` column; the UDF wraps the callable returned by a factory
    method looked up on ``excuteFunc``:

    * ``<Base>Method`` when the field's ``createMod`` is empty, where
      ``<Base>`` is derived from the field's ``type``;
    * ``<CREATEMOD>_Method`` (upper-cased ``createMod``) otherwise.

    The factory is always called with ``constraint=<field's constraint>``.
    Every generated table is printed with ``show(n=100, truncate=False)``.

    Args:
        t1: Not used by this function.
        context_string: Not used by this function.
        configPath: Path of the JSON config file to load.

    Returns:
        A single-element list holding the DataFrame built for the table
        named ``"ACTCLR"``.

    Raises:
        KeyError: If the config lacks an expected key or defines no table
            named ``"ACTCLR"``.
    """
    spark = SQLContext(SparkContext.getOrCreate())

    # The resulting context is not referenced below. The call is kept because
    # constructing it may configure the shared session -- TODO confirm whether
    # it can be dropped.
    HiveContext(SparkContext.getOrCreate())

    # Load the table description from file.
    with open(configPath) as cfgPath:
        jsonobj = json.load(cfgPath)

    print("jsonobj", jsonobj)
    tables = {}

    for name, prof in jsonobj["tables"].items():
        print("prof", name, prof)

        # One row per requested line; the generated "id" column feeds the UDFs.
        df = spark.range(prof["property"]["lines"])

        for fieldName, fieldProf in prof["field"].items():
            t_type, baseType = _spark_type_for(fieldProf["type"])

            # An explicit createMod overrides the type-derived generator.
            if fieldProf["createMod"] == "":
                method_name = baseType + "Method"
            else:
                method_name = fieldProf["createMod"].upper() + "_Method"

            generator = getattr(excuteFunc, method_name)(
                constraint=fieldProf["constraint"])
            udf_func = udf(generator, t_type())

            df = df.withColumn(fieldName, udf_func("id"))

        tables[name] = df

    for table_df in tables.values():
        table_df.show(n=100, truncate=False)

    return [tables["ACTCLR"]]

示例#2

显示文件

文件： introducing_spark.py 项目： kmandawe/spark2-demo

# Three rows, each carrying a list, a dict, a nested Row and a datetime,
# distributed as an RDD through the existing SparkContext `sc`.
complex_data = sc.parallelize(
    [
        Row(
            col_list=[1, 2, 3],
            col_dict={"k1": 0},
            col_row=Row(a=10, b=20, c=30),
            col_time=datetime(2014, 8, 1, 14, 1, 5),
        ),
        Row(
            col_list=[1, 2, 3, 4, 5],
            col_dict={"k1": 0, "k2": 1},
            col_row=Row(a=40, b=50, c=60),
            col_time=datetime(2014, 8, 2, 14, 1, 6),
        ),
        Row(
            col_list=[1, 2, 3, 4, 5, 6, 7],
            col_dict={"k1": 0, "k2": 1, "k3": 2},
            col_row=Row(a=70, b=80, c=90),
            col_time=datetime(2014, 8, 3, 14, 1, 7),
        ),
    ]
)

# Convert the RDD of Rows to a DataFrame and print it.
complex_data_df = complex_data.toDF()
complex_data_df.show()

# Wrap the same SparkContext in a SQLContext.
sqlContext = SQLContext(sc)
print(sqlContext)

# range(5): print the DataFrame object, its rows, and its row count.
df = sqlContext.range(5)
print(df)
df.show()
print(df.count())

# The same tuples turned into a DataFrame twice: first without column
# names, then with explicit ones.
data = [
    ("Alice", 50),
    ("Bob", 80),
    ("Charlee", 75),
]
sqlContext.createDataFrame(data).show()

sqlContext.createDataFrame(data, ["Name", "Score"]).show()

# Rebinds `complex_data` (previously the RDD above) to a plain list of
# tuples; each tuple holds float, int, str, bool, list, dict, Row, datetime.
complex_data = [
    (
        1.0, 10, "Alice", True,
        [1, 2, 3],
        {"k1": 0},
        Row(a=1, b=2, c=3),
        datetime(2014, 8, 1, 14, 1, 5),
    ),
    (
        2.0, 20, "Bob", True,
        [1, 2, 3, 4, 5],
        {"k1": 0, "k2": 1},
        Row(a=1, b=2, c=3),
        datetime(2014, 8, 1, 14, 1, 5),
    ),
    (
        3.0, 30, "Charlee", False,
        [1, 2, 3, 4, 5, 6],
        {"k1": 0, "k2": 1, "k3": 2},
        Row(a=1, b=2, c=3),
        datetime(2014, 8, 1, 14, 1, 5),
    ),
]