def sqlType(cls):
    return StructType([
        StructField("type", ByteType(), False),
        StructField("size", IntegerType(), True),
        StructField("indices", ArrayType(IntegerType(), False), True),
        StructField("values", ArrayType(DoubleType(), False), True)
    ])
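# The sqlType() above is the Catalyst struct used by a user-defined type (UDT).
# A minimal sketch, assuming the UserDefinedType base class from
# pyspark.sql.types, of where such a method sits inside a UDT subclass; the
# class name, module string and the serialize/deserialize bodies are
# placeholders, not taken from the original code.
from pyspark.sql.types import (UserDefinedType, StructType, StructField,
                               ByteType, IntegerType, ArrayType, DoubleType)


class ExampleVectorUDT(UserDefinedType):

    @classmethod
    def sqlType(cls):
        # Same struct as above: a type tag plus sparse/dense payload fields.
        return StructType([
            StructField("type", ByteType(), False),
            StructField("size", IntegerType(), True),
            StructField("indices", ArrayType(IntegerType(), False), True),
            StructField("values", ArrayType(DoubleType(), False), True)
        ])

    @classmethod
    def module(cls):
        return "example.vector_udt"  # hypothetical module name

    def serialize(self, obj):
        raise NotImplementedError("placeholder only")

    def deserialize(self, datum):
        raise NotImplementedError("placeholder only")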
def table2():
    #####################################################
    # Import/Create Second Schema
    #
    # Another RDD is created from a list of tuples
    another_rdd = sc.parallelize([("John", "England", 120),
                                  ("Jenny", "Spain", 45),
                                  ("Sarah", "Japan", 55)])

    # Schema with three fields - first name, country and average spend
    schema = StructType([
        StructField("Person_First_Name", StringType(), False),
        StructField("Person_Location_Country", StringType(), False),
        StructField("Person_Avg_Spend", IntegerType(), False)
    ])

    # Create a SchemaRDD by applying the schema to the RDD and print the schema
    another_schemardd = sqlCtx.applySchema(another_rdd, schema)

    # Print schema on screen
    print('Print the Second Schema - People_Details\n')
    another_schemardd.printSchema()

    #####################################################
    # Save the data above as a Parquet file
    #
    # SchemaRDDs can be saved as Parquet files, maintaining the schema
    # information.
    another_schemardd.saveAsParquetFile(
        "/home/dan/Desktop/People_Details.parquet")

    # Register this SchemaRDD as a table.
    return another_schemardd.registerAsTable("People_Details")
def run(self):
    sc = SparkContext("local", "gender")
    sqlContext = SQLContext(sc)
    #StringType = (str, unicode)
    _out = self.output().open('w')
    #lines = sc.textFile("myUser.csv")
    #fobj = self.input().open("r")
    #lines = sc.textFile(fobj.name)
    print(type(self.required_tasks['insert_source'].output()))
    print(self.required_tasks['insert_source'])
    #print(self.input()['insert_source'].input())

    lines = sc.textFile("myUser.csv")
    parts = lines.map(lambda l: l.split(","))
    users = parts.map(lambda p: (p[0], p[1], p[2], p[3], p[4], p[5], p[6],
                                 p[7], p[8], p[9], p[10], p[11], p[12], p[13],
                                 p[14], p[15], p[16], p[17], p[18], p[19]))

    schemaString = ("userId lmsUserId lmsName orgName name gender "
                    "registrationDate emailId mothertounge highestEduDegree "
                    "goals city state active firstAccesDate lastAccessDate "
                    "allowCert yearOfBirth pincode aadharId")
    print(schemaString)
    _out.write(schemaString)

    fields = [StructField(field_name, StringType(), True)
              for field_name in schemaString.split()]
    schema = StructType(fields)

    #schemaUser = sqlContext.createDataFrame(users, schema)
    schemaUser = sqlContext.applySchema(users, schema)
    schemaUser.registerTempTable("users")

    results = sqlContext.sql("SELECT gender FROM users")
    genders = results.map(lambda p: (p, 1))
    counts = genders.reduceByKey(lambda a, b: a + b)
    #.map(lambda t: ("Gender " + t(0) + " No " + t(1))).collect()

    for name in counts.collect():
        _out.write(str(name))
    _out.close()
def get_channel_mapping(spark: pyspark.sql.SparkSession) -> pyspark.sql.DataFrame:
    """
    Creates the channel mapping dataframe from hard-coded values.

    Parameters
    ----------
    spark : pyspark.sql.SparkSession
        Spark session used to create the dataframe

    Returns
    -------
    pyspark.sql.DataFrame
        PySpark dataframe with channel mapping data
    """
    channel_mapping = spark.createDataFrame(
        [
            ("01", "Distribution Channel 01"),
            ("10", "Other"),
            ("11", "DSD Bis Intercompany"),
            ("12", "DSD Pizza Intercomp"),
            ("20", "Warehouse/Exports"),
            ("30", "Foodservice"),
            ("40", "DSD Pizza"),
            ("45", "DSD"),
            ("50", "KFI"),
            ("55", "Plant Ingredient"),
            ("60", "Imports"),
            ("65", "Bulk FS - Specialty"),
        ],
        StructType([
            StructField("bic_zdistr_ch", StringType(), True),
            StructField("channel_desc", StringType(), True),
        ]),
    )
    return channel_mapping
def dshape_to_schema(ds):
    """Convert datashape to SparkSQL type system.

    Examples
    --------
    >>> print(dshape_to_schema('int32'))  # doctest: +SKIP
    IntegerType
    >>> print(dshape_to_schema('5 * int32'))  # doctest: +SKIP
    ArrayType(IntegerType,false)
    >>> print(dshape_to_schema('5 * ?int32'))  # doctest: +SKIP
    ArrayType(IntegerType,true)
    >>> print(dshape_to_schema('{name: string, amount: int32}'))  # doctest: +SKIP
    StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,false)))
    >>> print(dshape_to_schema('10 * {name: string, amount: ?int32}'))  # doctest: +SKIP
    ArrayType(StructType(List(StructField(name,StringType,false),StructField(amount,IntegerType,true))),false)
    """
    if isinstance(ds, str):
        return dshape_to_schema(dshape(ds))
    if isinstance(ds, Tuple):
        raise TypeError('Please provide a Record dshape for these column '
                        'types: %s' % (ds.dshapes,))
    if isinstance(ds, Record):
        return StructType([
            StructField(name,
                        dshape_to_schema(deoption(typ)),
                        isinstance(typ, datashape.Option))
            for name, typ in ds.fields
        ])
    if isinstance(ds, DataShape):
        if isdimension(ds[0]):
            elem = ds.subshape[0]
            if isinstance(elem, DataShape) and len(elem) == 1:
                elem = elem[0]
            return ArrayType(dshape_to_schema(deoption(elem)),
                             isinstance(elem, Option))
        else:
            return dshape_to_schema(ds[0])
    if ds in dshape_to_sparksql:
        return dshape_to_sparksql[ds]
    raise NotImplementedError()
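# Brief usage sketch for dshape_to_schema (assumes the datashape package and
# the dshape_to_sparksql mapping from this module are available; the record
# shape below is illustrative only).
schema = dshape_to_schema('{name: string, amount: ?int32}')
# StructField nullability follows the Option marker (?): name is non-nullable,
# amount is nullable.
print(schema)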
# coding: utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, ArrayType, IntegerType

conf = SparkConf().setAppName("spark_sql_datatype_array")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize([([1, 2, 3], )])

schema = StructType(
    [StructField("array", ArrayType(IntegerType(), False), False)])

table = hc.applySchema(source, schema)
table.registerAsTable("temp_table")

rows = hc.sql("select array[0], array[1], array[2] from temp_table").collect()

sc.stop()

for row in rows:
    print row
# Imports and conf were missing from this fragment; reconstructed below
# (the app name is assumed, not present in the original snippet).
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, StringType

conf = SparkConf().setAppName("spark_sql_debug_string")  # app name assumed
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize([
    "row1_col1 row1_col2 row1_col3",
    "row2_col1 row2_col2 row3_col3",
    "row3_col1 row3_col2 row3_col3"
])

columns = source.map(lambda line: line.split(" ")).filter(
    lambda columns: columns and len(columns) == 3)
rows = columns.map(lambda columns: (columns[0], columns[1], columns[2]))

schema = StructType([
    StructField("col1", StringType(), False),
    StructField("col2", StringType(), False),
    StructField("col3", StringType(), False)
])

table = hc.applySchema(rows, schema)
table.registerAsTable("temp_mytable")

rdd = hc.sql("select count(*) from temp_mytable where col1 = '' group by col2")

print rdd.toDebugString()

sc.stop()
# coding: utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, ArrayType, MapType, IntegerType, FloatType, StringType

conf = SparkConf().setAppName("spark_sql_datatype_complex")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize([([1, 2, 3], {"key1": 1, "key2": 2}, (1, 2.0, "3.0"))])

schema = StructType([
    StructField("array", ArrayType(IntegerType(), False), False),
    StructField("col_map", MapType(StringType(), IntegerType(), False), False),
    StructField(
        "struct",
        StructType([
            StructField("first", IntegerType(), False),
            StructField("second", FloatType(), False),
            StructField("third", StringType(), False)
        ]), False)
])

table = hc.applySchema(source, schema)
table.registerAsTable("temp_table")

rows = hc.sql(
if __name__ == "__main__": sc = SparkContext(appName="PythonSQL") sqlContext = SQLContext(sc) # RDD is created from a list of rows some_rdd = sc.parallelize([Row(name="John", age=19), Row(name="Smith", age=23), Row(name="Sarah", age=18)]) # Infer schema from the first row, create a SchemaRDD and print the schema some_schemardd = sqlContext.inferSchema(some_rdd) some_schemardd.printSchema() # Another RDD is created from a list of tuples another_rdd = sc.parallelize([("John", 19), ("Smith", 23), ("Sarah", 18)]) # Schema with two fields - person_name and person_age schema = StructType([StructField("person_name", StringType(), False), StructField("person_age", IntegerType(), False)]) # Create a SchemaRDD by applying the schema to the RDD and print the schema another_schemardd = sqlContext.applySchema(another_rdd, schema) another_schemardd.printSchema() # root # |-- age: integer (nullable = true) # |-- name: string (nullable = true) # A JSON dataset is pointed to by path. # The path can be either a single text file or a directory storing text files. path = os.path.join(os.environ['SPARK_HOME'], "examples/src/main/resources/people.json") # Create a SchemaRDD from the file(s) pointed to by path people = sqlContext.jsonFile(path) # root # |-- person_name: string (nullable = false)
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from datetime import datetime, date
from pyspark.sql import StructType, StructField, DateType, TimestampType

conf = SparkConf().setAppName("spark_sql_datatype_date_or_datetime")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize([(date(2015, 9, 22), datetime(2015, 9, 22, 9, 39, 45))])

schema = StructType([
    StructField("date", DateType(), False),
    StructField("timestamp", TimestampType(), False)
])

table = hc.applySchema(source, schema)
table.registerAsTable("temp_table")

rows = hc.sql("select date, timestamp from temp_table").collect()

sc.stop()

for row in rows:
    print row
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, LongType

conf = SparkConf().setAppName("spark_sql_datatype_extend2")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize([(85070591730234615847396907784232501249,
                          85070591730234615847396907784232501249)])

schema = StructType([
    StructField("col1", LongType(), False),
    StructField("col2", LongType(), False)
])

table = hc.applySchema(source, schema)
table.registerAsTable("temp_table")

rows = hc.sql("select * from temp_table").collect()

sc.stop()

for row in rows:
    print row

"""
# java.lang.ClassCastException: java.math.BigInteger cannot be cast to
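# The ClassCastException noted above is expected: both literals are 38-digit
# integers, far outside LongType's signed 64-bit range. A hedged sketch of one
# workaround, assuming a Spark version whose DecimalType accepts precision and
# scale (38 is the maximum precision, so the value just fits); this is not part
# of the original script.
from decimal import Decimal
from pyspark.sql.types import StructType, StructField, DecimalType

big = Decimal(85070591730234615847396907784232501249)
dec_source = sc.parallelize([(big, big)])
dec_schema = StructType([
    StructField("col1", DecimalType(38, 0), False),
    StructField("col2", DecimalType(38, 0), False)
])
dec_table = hc.createDataFrame(dec_source, dec_schema)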
# coding: utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, StringType, BooleanType, BinaryType, NullType

conf = SparkConf().setAppName("spark_sql_datatype_str_bool_binary_none")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize([("str", False, bytearray(range(0, 256)), None)])

schema = StructType([
    StructField("str", StringType(), False),
    StructField("bool", BooleanType(), False),
    StructField("bytes", BinaryType(), False),
    StructField("none", NullType())
])

table = hc.applySchema(source, schema)
table.registerAsTable("temp_table")

rows = hc.sql("select str, bool, bytes, none from temp_table").collect()

sc.stop()

for row in rows:
    print row
# Reconstructed header: the function name comes from the .filter(lineFilter)
# call below; successLines/errorLines are accumulators assumed to be defined
# earlier in the original script.
def lineFilter(columns):
    if columns and len(columns) == 3:
        successLines.add(1)
        return True
    else:
        errorLines.add(1)
        return False


columns = source.map(lambda line: line.split(" ")).filter(lineFilter)
rows = columns.map(lambda columns: (columns[0], columns[1], columns[2]))

schema = StructType([
    StructField("col1", StringType()),
    StructField("col2", StringType()),
    StructField("col3", StringType())
])

table = hc.applySchema(rows, schema)
table.registerAsTable("temp_mytable")

datas = hc.sql("select * from temp_mytable").collect()

sc.stop()

if datas:
    for data in datas:
        print data
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, IntegerType, FloatType, StringType

conf = SparkConf().setAppName("spark_sql_datatype_struct")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize([((1, 2.0, "3.0"), )])

schema = StructType([
    StructField(
        "struct",
        StructType([
            StructField("first", IntegerType(), False),
            StructField("second", FloatType(), False),
            StructField("third", StringType(), False)
        ]), False)
])

table = hc.applySchema(source, schema)
table.registerAsTable("temp_table")

rows = hc.sql(
    "select struct.first, struct.second, struct.third from temp_table"
).collect()

sc.stop()
# Imports reconstructed from the names used below (not present in the fragment).
from decimal import Decimal
from datetime import datetime, date
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import (StructType, StructField, ByteType, ShortType,
                         IntegerType, LongType, FloatType, DoubleType,
                         DecimalType, StringType, BooleanType, TimestampType,
                         DateType, ArrayType, MapType)

conf = SparkConf().setAppName("spark_sql_datatype")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize([
    (int("127"), int("32767"), int("2147483647"), long("9223372036854775807"),
     float("1.1"), float("2.2"), Decimal("3.3"), "str", bool(0),
     datetime(2015, 9, 22, 9, 39, 45), date(2015, 9, 22), [1, 2, 3],
     {"key": "value"}, (1, 2.0, "3.0"))
])

schema = StructType([
    StructField("byte", ByteType(), False),
    StructField("short", ShortType(), False),
    StructField("int", IntegerType(), False),
    StructField("long", LongType(), False),
    StructField("float", FloatType(), False),
    StructField("double", DoubleType(), False),
    StructField("decimal", DecimalType(), False),
    StructField("string", StringType(), False),
    StructField("boolean", BooleanType(), False),
    StructField("timestamp", TimestampType(), False),
    StructField("date", DateType(), False),
    StructField("array", ArrayType(IntegerType(), False), False),
    StructField("col_map", MapType(StringType(), StringType(), False), False),
    StructField(
        "struct",
        StructType([
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, StructType, StructField, StringType, IntegerType, ArrayType, FloatType, MapType

conf = SparkConf().setAppName("spark_sql_udf")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize([("value", )])

schema = StructType([StructField("col", StringType(), False)])

table = hc.applySchema(source, schema)
table.registerTempTable("temp_table")


def func_string():
    return "abc"


hc.registerFunction("func_string", func_string)

rows = hc.sql("select func_string() from temp_table").collect()


def func_int():
    return 123
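# registerFunction treats the UDF's return value as a string by default; a
# short sketch, continuing the script above, of registering func_int with an
# explicit IntegerType return type via the optional third argument of the
# 1.x registerFunction API.
hc.registerFunction("func_int", func_int, IntegerType())
rows = hc.sql("select func_int() from temp_table").collect()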
# coding: utf-8
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
from pyspark.sql import StructType, StructField, MapType, StringType, IntegerType

conf = SparkConf().setAppName("spark_sql_datatype_map")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize([({"key1": 1, "key2": 2}, )])

schema = StructType([
    StructField("col_map", MapType(StringType(), IntegerType(), False), False)
])

table = hc.applySchema(source, schema)
table.registerAsTable("temp_table")

rows = hc.sql(
    "select col_map['key1'], col_map['key2'] from temp_table").collect()

sc.stop()

for row in rows:
    print row
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext
import decimal
from datetime import datetime, date
from pyspark.sql import StructType, StructField, LongType

conf = SparkConf().setAppName("spark_sql_datatype_long")
sc = SparkContext(conf=conf)
hc = HiveContext(sc)

source = sc.parallelize(
    [(9223372036854775807, 9223372036854775807)])

schema = StructType([StructField("col1", LongType(), False),
                     StructField("col2", LongType(), False)])

table = hc.applySchema(source, schema)
table.registerAsTable("temp_table")

"""
rows = hc.sql("select col1 + col2 from temp_table").collect()
"""

"""
rows = hc.sql(
    "select cast(col1 as bigint) + cast(col2 as bigint) from temp_table").collect()
"""