Example 1
def test_nested_array():
    """A struct with an array field round-trips through parse_schema.

    Both its own simpleString() form and equivalent hand-written DDL
    spellings ("bigint" and its alias "long") must parse back to the
    same StructType.
    """
    expected = StructType([
        StructField("id", IntegerType()),
        StructField("scores", ArrayType(LongType())),
    ])

    for ddl in (
        expected.simpleString(),
        "STRUCT<id:int,scores:ARRAY<bigint>>",
        "STRUCT<id:int,scores:ARRAY<long>>",
    ):
        assert expected == parse_schema(ddl)
Example 2
    def build_schema(self, metadata, fileconfig):
        """Build an all-string StructType ("_c0".."_cN-1") for the raw file.

        The raw-file column count is derived from metadata.data_types:
        derived columns are subtracted (they are computed later, not read
        from the file) and dropped columns are added back (they exist in
        the file even though they are absent from data_types).

        :param metadata: object exposing a ``data_types`` sequence
        :param fileconfig: dict-like config; may contain
            ``derived_columns`` (sequence) and ``columns_to_drop``
            (comma-separated string)
        :return: StructType of nullable StringType fields
        """
        num_cols = len(metadata.data_types)

        derived = fileconfig.get('derived_columns')
        if derived:
            num_cols -= len(derived)

        dropped = fileconfig.get('columns_to_drop')
        if dropped:
            # columns_to_drop is a comma-separated string, not a list.
            num_cols += len(dropped.split(','))

        schema = StructType()
        for idx in range(num_cols):
            schema.add(StructField("_c" + str(idx), StringType(), True))

        self.logger.info('schema=%s' % schema.simpleString())
        return schema
Example 3
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType


spark = SparkSession \
    .builder \
    .appName("KConsumer") \
    .getOrCreate()
temperatureSchema = StructType().add("day",
                                     "string").add("tempInCelsius", "string")
temperatureSchema.simpleString()
temperature_streaming_df = spark \
    .readStream \
    .option("sep", ",") \
    .schema(temperatureSchema) \
    .csv("E://PycharmProjects//pythonProject//data//tempratureData.csv")
temperature_streaming_df.isStreaming
temperature_streaming_df.printSchema()
query = temperature_streaming_df \
    .writeStream \
    .format("console") \
    .outputMode("append") \
    .start()