def _resolve_field_type(raw_type):
    """Map a column type string from the JSON profile to a Spark type.

    Args:
        raw_type: The ``"type"`` value of a field profile, e.g.
            ``VARCHAR(10)`` or ``"DATE"``. One pair of surrounding double
            quotes is stripped before matching; matching is
            case-insensitive and prefix-based.

    Returns:
        A ``(spark_type_class, base_name)`` tuple. ``base_name`` is one of
        ``"String"``, ``"Float"``, ``"Integer"`` or ``"Date"``. Unrecognised
        types fall back to ``(StringType, "String")``.
    """
    # Function-scope import: the original code referenced ``DataType`` (the
    # abstract base class) for DATE columns, so ``DateType`` may not be
    # imported at module level.
    from pyspark.sql.types import DateType

    if raw_type.startswith("\"") and raw_type.endswith("\""):
        raw_type = raw_type[1:-1]
    normalized = raw_type.upper()

    if normalized.startswith(("VARCHAR", "CHAR")):
        return StringType, "String"
    if normalized.startswith("DECIMAL"):
        return FloatType, "Float"
    if normalized.startswith("SMALLINT"):
        return IntegerType, "Integer"
    if normalized.startswith("DATE"):
        # Fixed: was ``DataType``, which is not a concrete column type.
        return DateType, "Date"
    return StringType, "String"


def run(t1, context_string, configPath='./jsonFormat/ACTCLR.json'):
    """Build generated Spark DataFrames from a JSON table profile.

    The JSON file must contain a ``"tables"`` mapping. For every table,
    ``property.lines`` is passed to ``spark.range`` to create the base
    DataFrame, and every entry of ``field`` adds one column computed by a
    UDF over the ``id`` column. The UDF body is looked up on ``excuteFunc``:
    ``<BaseType>Method`` when the field's ``createMod`` is empty, otherwise
    ``<CREATEMOD>_Method``; the looked-up attribute is called with the
    field's ``constraint`` and its result is wrapped with ``udf``.

    Each generated table is printed with ``show(n=100, truncate=False)``.

    Args:
        t1: Not used by this function; kept for caller compatibility.
        context_string: Not used by this function; kept for caller
            compatibility.
        configPath: Path of the JSON profile to load.

    Returns:
        A one-element list holding the DataFrame built for the table named
        ``"ACTCLR"``.

    Raises:
        KeyError: If the profile lacks a required key or defines no
            ``"ACTCLR"`` table.
    """
    spark_context = SparkContext.getOrCreate()
    spark = SQLContext(spark_context)
    # NOTE(review): the HiveContext is never used below. The construction is
    # kept in case callers rely on its side effects -- confirm and remove.
    HiveContext(spark_context)

    # load json obj from file
    with open(configPath) as cfgPath:
        jsonobj = json.load(cfgPath)
    print("jsonobj", jsonobj)

    tables = {}
    for name, prof in jsonobj["tables"].items():
        # parse table
        print("prof", name, prof)
        df = spark.range(prof["property"]["lines"])

        for fieldName, fieldProf in prof["field"].items():
            t_type, baseType = _resolve_field_type(fieldProf["type"])

            if fieldProf["createMod"] == "":
                method_name = baseType + "Method"
            else:
                method_name = fieldProf["createMod"].upper() + "_Method"

            generator = getattr(excuteFunc, method_name)(
                constraint=fieldProf["constraint"])
            udf_func = udf(generator, t_type())
            df = df.withColumn(fieldName, udf_func("id"))

        tables[name] = df

    for table_df in tables.values():
        table_df.show(n=100, truncate=False)

    return [tables["ACTCLR"]]
# --- RDD of Rows with nested values (list, dict, nested Row, timestamp) ---
# NOTE(review): ``toDF()`` is called before ``SQLContext(sc)`` is created in
# this span; this presumably relies on a SQL context/session already
# existing (e.g. created earlier in the file or by the shell) -- confirm.
complex_data = sc.parallelize([
    Row(
        col_list=[1, 2, 3],
        col_dict={"k1": 0},
        col_row=Row(a=10, b=20, c=30),
        col_time=datetime(2014, 8, 1, 14, 1, 5),
    ),
    Row(
        col_list=[1, 2, 3, 4, 5],
        col_dict={"k1": 0, "k2": 1},
        col_row=Row(a=40, b=50, c=60),
        col_time=datetime(2014, 8, 2, 14, 1, 6),
    ),
    Row(
        col_list=[1, 2, 3, 4, 5, 6, 7],
        col_dict={"k1": 0, "k2": 1, "k3": 2},
        col_row=Row(a=70, b=80, c=90),
        col_time=datetime(2014, 8, 3, 14, 1, 7),
    ),
])
complex_data_df = complex_data.toDF()
complex_data_df.show()

# --- SQLContext basics: a 5-row range DataFrame ---
sqlContext = SQLContext(sc)
print(sqlContext)

df = sqlContext.range(5)
print(df)
df.show()
print(df.count())

# --- DataFrame from plain tuples, without and with column names ---
data = [
    ("Alice", 50),
    ("Bob", 80),
    ("Charlee", 75),
]
sqlContext.createDataFrame(data).show()
sqlContext.createDataFrame(data, ['Name', 'Score']).show()

# --- Tuples mixing scalar and nested values; rebinds ``complex_data`` from
# the RDD above to a plain Python list ---
complex_data = [
    (
        1.0, 10, "Alice", True,
        [1, 2, 3],
        {"k1": 0},
        Row(a=1, b=2, c=3),
        datetime(2014, 8, 1, 14, 1, 5),
    ),
    (
        2.0, 20, "Bob", True,
        [1, 2, 3, 4, 5],
        {"k1": 0, "k2": 1},
        Row(a=1, b=2, c=3),
        datetime(2014, 8, 1, 14, 1, 5),
    ),
    (
        3.0, 30, "Charlee", False,
        [1, 2, 3, 4, 5, 6],
        {"k1": 0, "k2": 1, "k3": 2},
        Row(a=1, b=2, c=3),
        datetime(2014, 8, 1, 14, 1, 5),
    ),
]