def test_nested_array():
    """A struct with an array-of-longs field parses back equal from every spelling."""
    expected = StructType(
        [
            StructField("id", IntegerType()),
            StructField("scores", ArrayType(LongType())),
        ]
    )
    # The schema's own simpleString() output must round-trip.
    assert parse_schema(expected.simpleString()) == expected
    # Both DDL spellings of the 64-bit integer type are accepted.
    for ddl in (
        "STRUCT<id:int,scores:ARRAY<bigint>>",
        "STRUCT<id:int,scores:ARRAY<long>>",
    ):
        assert parse_schema(ddl) == expected
def build_schema(self, metadata, fileconfig):
    """Build a StructType of generic nullable string columns (_c0, _c1, ...)
    sized to match the raw input file's physical column count.

    The width is reconstructed from configuration: derived columns are
    computed later and are NOT present in the raw file (subtracted), while
    columns configured for dropping ARE still physically present (added back).

    Args:
        metadata: object exposing ``data_types``, one entry per final column.
        fileconfig: mapping with optional ``derived_columns`` (sized
            collection) and ``columns_to_drop`` (comma-separated string).

    Returns:
        StructType whose fields are ``_c<i>`` StringType columns, nullable.
    """
    data_types = metadata.data_types
    col_num = len(data_types)
    # Derived columns do not exist in the raw file yet; exclude them.
    derived_columns = fileconfig.get('derived_columns')
    if derived_columns:
        col_num -= len(derived_columns)
    # Dropped columns are still present in the raw file; count them back in.
    columns_to_drop = fileconfig.get('columns_to_drop')
    if columns_to_drop:
        col_num += len(columns_to_drop.split(','))
    schema = StructType()
    for i in range(col_num):
        schema.add(StructField("_c" + str(i), StringType(), True))
    # Fix: lazy %-args so the message is only formatted when INFO is enabled,
    # instead of eagerly building the string with the % operator.
    self.logger.info('schema=%s', schema.simpleString())
    return schema
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType

# Simple structured-streaming job: read temperature CSV records and echo each
# micro-batch to the console sink.
spark = SparkSession \
    .builder \
    .appName("KConsumer") \
    .getOrCreate()

# Both columns are ingested as raw strings; any casting happens downstream.
temperatureSchema = StructType().add("day", "string").add("tempInCelsius", "string")

temperature_streaming_df = spark \
    .readStream \
    .option("sep", ",") \
    .schema(temperatureSchema) \
    .csv("E://PycharmProjects//pythonProject//data//tempratureData.csv")

# NOTE(review): removed two no-op expression statements whose results were
# discarded (`temperatureSchema.simpleString()` and `.isStreaming`).
temperature_streaming_df.printSchema()

query = temperature_streaming_df \
    .writeStream \
    .format("console") \
    .outputMode("append") \
    .start()

# Fix: block until the query terminates. Without this the driver exits
# immediately after start() and the streaming query is torn down before it
# ever emits a micro-batch.
query.awaitTermination()