Example #1
import io

import speech_recognition as sr
from pyspark.sql.functions import udf, explode, collect_list, concat_ws
from pyspark.sql.types import StringType, ArrayType, BinaryType


# splitWav() and convertToWav() are helper functions defined elsewhere in the original script.
def recognize(binary):
    s = io.BytesIO(binary)
    r = sr.Recognizer()
    with sr.AudioFile(s) as source:
        audio = r.record(source)
    try:
        print("Transcribing...")
        text = r.recognize_sphinx(audio)
        print("Done!")
        return text
    except Exception:
        msg = "no_transcription_available"
        print("Darn! Could not transcribe audio.")
        return msg

sttudf = udf(lambda z: recognize(z), StringType())
splitudf = udf(lambda x: splitWav(x), ArrayType(BinaryType()))
convertudf = udf(lambda x: convertToWav(x), BinaryType())

df = spark.read.format("binaryFile").option("pathGlobFilter", "DTNS*.mp3").option("recursiveFileLookup", "true").load("s3a://jordan-podcast-s3/")
df = df.withColumn("WAVAudio", convertudf(df.content)).drop("modificationTime","length","content")
df = df.withColumn("splitwavs", splitudf(df.WAVAudio)).drop("WAVAudio")
df = df.withColumn("splitwavs", explode(df.splitwavs))
df = df.repartition(36)
df = df.withColumn("transcriptions", sttudf(df.splitwavs)).drop("splitwavs")
df = df.groupby("path").agg(collect_list('transcriptions').alias("transcriptions"))
df = df.withColumn("transcriptions", concat_ws(" ", "transcriptions"))
df.write.format('org.elasticsearch.spark.sql')\
        .option('es.nodes', '10.0.0.6:9200, 10.0.0.14:9200, 10.0.0.10:9200')\
        .option('es.port', 9200)\
        .option('es.resource', "podcast2/test")\
        .save()
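As a quick local sanity check of recognize() outside Spark, something along these lines should work; "sample.wav" is a placeholder path, not a file from the original project:

# Hypothetical smoke test for recognize(); requires a small WAV file on disk.
with open("sample.wav", "rb") as f:
    print(recognize(f.read()))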
Example #2
    def sqlType(cls):
        return StructType([StructField("wkb", BinaryType(), True)])
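A sqlType() fragment like this only makes sense inside a UserDefinedType subclass; below is a minimal hedged sketch of such a UDT, with class, attribute, and module names that are illustrative rather than taken from the original project:

from pyspark.sql.types import UserDefinedType, StructType, StructField, BinaryType


class WKBGeometryUDT(UserDefinedType):
    """Illustrative UDT that stores a geometry as WKB bytes."""

    @classmethod
    def sqlType(cls):
        # Catalyst representation: a single binary column holding the WKB payload.
        return StructType([StructField("wkb", BinaryType(), True)])

    @classmethod
    def module(cls):
        return "__main__"  # placeholder; a real UDT points at its defining module

    def serialize(self, obj):
        # `obj.wkb` is assumed to expose the geometry's WKB bytes.
        return (bytearray(obj.wkb),)

    def deserialize(self, datum):
        # Hand the raw WKB bytes back; a real UDT would rebuild a geometry object here.
        return datum[0]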
Example #3
def prepare_vad_udf(num_padding_frames, threshold, aggressiveness, frame_duration_ms):
    # Each audio file returns multiple voiced fragments. I need an Array, don't I?
    return_type = StructType(
        [
            StructField("start_ms", ArrayType(IntegerType())),
            StructField("end_ms", ArrayType(IntegerType())),
            StructField("voiced_buffer", ArrayType(BinaryType())),
        ]
    )
    # Try using ArrayType(BinaryType()). Need to convert numpy array to bytearray
    # Need a java UDF to reinterpret bytes, it seems https://stackoverflow.com/a/57848517
    # Or I could just use np.ndarray.view(np.int8) right here.
    AUDIO_FORMAT = AudioFormat(sample_rate=16_000, channels=1, sample_byte_width=2)
    FRAME_DURATION_SAMPLES = (AUDIO_FORMAT.sample_rate * frame_duration_ms) // 1000
    FRAME_DURATION_BYTES = (
        FRAME_DURATION_SAMPLES * AUDIO_FORMAT.channels * AUDIO_FORMAT.sample_byte_width
    )

    @pandas_udf(return_type)
    def vad(
        audio_series: pd.Series,
        audio_types_series: pd.Series,
        audio_document_id_series: pd.Series,
    ) -> pd.DataFrame:
        df_rows = []
        for audio_buffer, audio_type, audio_document_id in zip(
            audio_series, audio_types_series, audio_document_id_series
        ):
            wav_bytes_buffer = BytesIO(DecodeToWavPipe(audio_buffer, audio_type))
            with wave.open(wav_bytes_buffer, "rb") as fh:
                num_frames = fh.getnframes()
                assert fh.getframerate() == AUDIO_FORMAT.sample_rate
                assert fh.getnchannels() == AUDIO_FORMAT.channels
                assert fh.getsampwidth() == AUDIO_FORMAT.sample_byte_width
                pcm_buffer = fh.readframes(num_frames)
                del wav_bytes_buffer
                num_frames = len(pcm_buffer) // FRAME_DURATION_BYTES
                buffers = [
                    pcm_buffer[
                        FRAME_DURATION_BYTES * i : FRAME_DURATION_BYTES * (i + 1)
                    ]
                    for i in range(num_frames)
                ]
                del pcm_buffer
                generator = vad_split(
                    buffers, AUDIO_FORMAT, num_padding_frames, threshold, aggressiveness
                )

                voiced_buffer_list, start_ms_list, end_ms_list = [], [], []
                total_serialized_bytes = 0
                for voiced_buffer, start_ms, end_ms in generator:
                    total_serialized_bytes += 2 * len(voiced_buffer)
                    if (
                        total_serialized_bytes
                        > 2 * 1024 * 1024 * 1024 - 1024 * 1024 * 1024
                    ):
                        two_sum = lambda x, y: (sum(x), sum(y))
                        ignored_bytes = 0
                        ignored_ms = 0.0
                        for voiced_buffer, start_ms, end_ms in generator:
                            ignored_bytes += len(voiced_buffer)
                            ignored_ms += end_ms - start_ms
                        ignored_gigabytes = ((ignored_bytes / 1024) / 1024) / 1024
                        ignored_hours = ((ignored_ms / 1000) / 60) / 60
                        print(
                            f"WARNING: truncating voice-activity-detected audio to less than 2GB for {audio_document_id}. Wasted {ignored_gigabytes}GB of data. Wasted {ignored_hours} hours of data."
                        )
                        break
                    voiced_buffer_list.append(voiced_buffer)
                    start_ms_list.append(start_ms)
                    end_ms_list.append(end_ms)
                del buffers
                # mb_total = sum(voiced_buffer.nbytes / 1024 / 1024 for voiced_buffer in voiced_buffer_list)
                # print("GALVEZ: Chunk size in MB: ", mb_total)
                df_rows.append(
                    {
                        "start_ms": start_ms_list,
                        "end_ms": end_ms_list,
                        "voiced_buffer": voiced_buffer_list,
                    }
                )
        return pd.DataFrame(df_rows)

    return vad
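A hedged sketch of how this factory might be wired into a DataFrame; the column names ("content", "format", "id") and the parameter values are assumptions, not taken from the original pipeline:

vad_udf = prepare_vad_udf(
    num_padding_frames=10, threshold=0.5, aggressiveness=1, frame_duration_ms=30
)
# Each row yields arrays of segment boundaries plus the voiced PCM buffers.
df = df.withColumn("vad", vad_udf("content", "format", "id"))
df = df.select("id", "vad.start_ms", "vad.end_ms", "vad.voiced_buffer")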
Example #4
    def test_as_spark_type_koalas_dtype(self):
        type_mapper = {
            # binary
            np.character: (np.character, BinaryType()),
            np.bytes_: (np.bytes_, BinaryType()),
            np.string_: (np.bytes_, BinaryType()),
            bytes: (np.bytes_, BinaryType()),
            # integer
            np.int8: (np.int8, ByteType()),
            np.byte: (np.int8, ByteType()),
            np.int16: (np.int16, ShortType()),
            np.int32: (np.int32, IntegerType()),
            np.int64: (np.int64, LongType()),
            np.int: (np.int64, LongType()),
            int: (np.int64, LongType()),
            # floating
            np.float32: (np.float32, FloatType()),
            np.float: (np.float64, DoubleType()),
            np.float64: (np.float64, DoubleType()),
            float: (np.float64, DoubleType()),
            # string
            np.str: (np.unicode_, StringType()),
            np.unicode_: (np.unicode_, StringType()),
            str: (np.unicode_, StringType()),
            # bool
            np.bool: (np.bool, BooleanType()),
            bool: (np.bool, BooleanType()),
            # datetime
            np.datetime64: (np.datetime64, TimestampType()),
            datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
            # DateType
            datetime.date: (np.dtype("object"), DateType()),
            # DecimalType
            decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
            # ArrayType
            np.ndarray: (np.dtype("object"), ArrayType(StringType())),
            List[bytes]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.character]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.string_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[np.bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[datetime.date]: (np.dtype("object"), ArrayType(DateType())),
            List[np.int8]: (np.dtype("object"), ArrayType(ByteType())),
            List[np.byte]: (np.dtype("object"), ArrayType(ByteType())),
            List[decimal.Decimal]:
            (np.dtype("object"), ArrayType(DecimalType(38, 18))),
            List[float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float32]: (np.dtype("object"), ArrayType(FloatType())),
            List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())),
            List[int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int64]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int16]: (np.dtype("object"), ArrayType(ShortType())),
            List[str]: (np.dtype("object"), ArrayType(StringType())),
            List[np.unicode_]: (np.dtype("object"), ArrayType(StringType())),
            List[datetime.datetime]:
            (np.dtype("object"), ArrayType(TimestampType())),
            List[np.datetime64]:
            (np.dtype("object"), ArrayType(TimestampType())),
            # CategoricalDtype
            CategoricalDtype(categories=["a", "b", "c"]): (
                CategoricalDtype(categories=["a", "b", "c"]),
                LongType(),
            ),
        }

        for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
            self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
            self.assertEqual(koalas_dtype(numpy_or_python_type),
                             (dtype, spark_type))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            as_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            as_spark_type(np.dtype("object"))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            koalas_dtype(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            koalas_dtype(np.dtype("object"))
Example #5
lz4_clevel = 1


# this is a UDF that takes care of summing histograms across
# various spark results where the outputs are histogram blobs
def agg_histos_raw(series, processor_instance, lz4_clevel):
    goodlines = series[series.str.len() > 0]
    if goodlines.size == 1:  # short-circuit trivial aggregations
        return goodlines[0]
    outhist = processor_instance.accumulator.identity()
    for line in goodlines:
        outhist.add(pkl.loads(lz4f.decompress(line)))
    return lz4f.compress(pkl.dumps(outhist), compression_level=lz4_clevel)


@fn.pandas_udf(BinaryType(), fn.PandasUDFType.GROUPED_AGG)
def agg_histos(series):
    global processor_instance, lz4_clevel
    return agg_histos_raw(series, processor_instance, lz4_clevel)


def reduce_histos_raw(df, processor_instance, lz4_clevel):
    histos = df['histos']
    mask = (histos.str.len() > 0)
    outhist = processor_instance.accumulator.identity()
    for line in histos[mask]:
        outhist.add(pkl.loads(lz4f.decompress(line)))
    return pd.DataFrame(data={'histos': np.array([lz4f.compress(pkl.dumps(outhist), compression_level=lz4_clevel)], dtype='O')})


@fn.pandas_udf(StructType([StructField('histos', BinaryType(), True)]), fn.PandasUDFType.GROUPED_MAP)
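The decorated function itself is cut off in this excerpt; assuming it mirrors agg_histos and simply wraps reduce_histos_raw with the same module-level globals, a plausible continuation would be:

def reduce_histos(df):
    global processor_instance, lz4_clevel
    return reduce_histos_raw(df, processor_instance, lz4_clevel)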
Example #6
    def test_verify_type_not_nullable(self):
        import array
        import datetime
        import decimal

        schema = StructType([
            StructField('s', StringType(), nullable=False),
            StructField('i', IntegerType(), nullable=True)])

        class MyObj:
            def __init__(self, **kwargs):
                for k, v in kwargs.items():
                    setattr(self, k, v)

        # obj, data_type
        success_spec = [
            # String
            ("", StringType()),
            (u"", StringType()),
            (1, StringType()),
            (1.0, StringType()),
            ([], StringType()),
            ({}, StringType()),

            # UDT
            (ExamplePoint(1.0, 2.0), ExamplePointUDT()),

            # Boolean
            (True, BooleanType()),

            # Byte
            (-(2**7), ByteType()),
            (2**7 - 1, ByteType()),

            # Short
            (-(2**15), ShortType()),
            (2**15 - 1, ShortType()),

            # Integer
            (-(2**31), IntegerType()),
            (2**31 - 1, IntegerType()),

            # Long
            (-(2**63), LongType()),
            (2**63 - 1, LongType()),

            # Float & Double
            (1.0, FloatType()),
            (1.0, DoubleType()),

            # Decimal
            (decimal.Decimal("1.0"), DecimalType()),

            # Binary
            (bytearray([1, 2]), BinaryType()),

            # Date/Timestamp
            (datetime.date(2000, 1, 2), DateType()),
            (datetime.datetime(2000, 1, 2, 3, 4), DateType()),
            (datetime.datetime(2000, 1, 2, 3, 4), TimestampType()),

            # Array
            ([], ArrayType(IntegerType())),
            (["1", None], ArrayType(StringType(), containsNull=True)),
            ([1, 2], ArrayType(IntegerType())),
            ((1, 2), ArrayType(IntegerType())),
            (array.array('h', [1, 2]), ArrayType(IntegerType())),

            # Map
            ({}, MapType(StringType(), IntegerType())),
            ({"a": 1}, MapType(StringType(), IntegerType())),
            ({"a": None}, MapType(StringType(), IntegerType(), valueContainsNull=True)),

            # Struct
            ({"s": "a", "i": 1}, schema),
            ({"s": "a", "i": None}, schema),
            ({"s": "a"}, schema),
            ({"s": "a", "f": 1.0}, schema),
            (Row(s="a", i=1), schema),
            (Row(s="a", i=None), schema),
            (["a", 1], schema),
            (["a", None], schema),
            (("a", 1), schema),
            (MyObj(s="a", i=1), schema),
            (MyObj(s="a", i=None), schema),
            (MyObj(s="a"), schema),
        ]

        # obj, data_type, exception class
        failure_spec = [
            # String (match anything but None)
            (None, StringType(), ValueError),

            # UDT
            (ExamplePoint(1.0, 2.0), PythonOnlyUDT(), ValueError),

            # Boolean
            (1, BooleanType(), TypeError),
            ("True", BooleanType(), TypeError),
            ([1], BooleanType(), TypeError),

            # Byte
            (-(2**7) - 1, ByteType(), ValueError),
            (2**7, ByteType(), ValueError),
            ("1", ByteType(), TypeError),
            (1.0, ByteType(), TypeError),

            # Short
            (-(2**15) - 1, ShortType(), ValueError),
            (2**15, ShortType(), ValueError),

            # Integer
            (-(2**31) - 1, IntegerType(), ValueError),
            (2**31, IntegerType(), ValueError),

            # Float & Double
            (1, FloatType(), TypeError),
            (1, DoubleType(), TypeError),

            # Decimal
            (1.0, DecimalType(), TypeError),
            (1, DecimalType(), TypeError),
            ("1.0", DecimalType(), TypeError),

            # Binary
            (1, BinaryType(), TypeError),

            # Date/Timestamp
            ("2000-01-02", DateType(), TypeError),
            (946811040, TimestampType(), TypeError),

            # Array
            (["1", None], ArrayType(StringType(), containsNull=False), ValueError),
            ([1, "2"], ArrayType(IntegerType()), TypeError),

            # Map
            ({"a": 1}, MapType(IntegerType(), IntegerType()), TypeError),
            ({"a": "1"}, MapType(StringType(), IntegerType()), TypeError),
            ({"a": None}, MapType(StringType(), IntegerType(), valueContainsNull=False),
             ValueError),

            # Struct
            ({"s": "a", "i": "1"}, schema, TypeError),
            (Row(s="a"), schema, ValueError),     # Row can't have missing field
            (Row(s="a", i="1"), schema, TypeError),
            (["a"], schema, ValueError),
            (["a", "1"], schema, TypeError),
            (MyObj(s="a", i="1"), schema, TypeError),
            (MyObj(s=None, i="1"), schema, ValueError),
        ]

        # Check success cases
        for obj, data_type in success_spec:
            try:
                _make_type_verifier(data_type, nullable=False)(obj)
            except Exception:
                self.fail("verify_type(%s, %s, nullable=False)" % (obj, data_type))

        # Check failure cases
        for obj, data_type, exp in failure_spec:
            msg = "verify_type(%s, %s, nullable=False) == %s" % (obj, data_type, exp)
            with self.assertRaises(exp, msg=msg):
                _make_type_verifier(data_type, nullable=False)(obj)
Example #7
def weighted_pointmap(vega, df):
    if df.rdd.isEmpty():
        return None

    if len(df.schema.names) == 1:
        col_point = df.schema.names[0]
        render_mode = 0
    elif len(df.schema.names) == 2:
        col_point = df.schema.names[0]
        col_count = df.schema.names[1]
        render_mode = 1
    elif len(df.schema.names) == 3:
        col_point = df.schema.names[0]
        col_color = df.schema.names[1]
        col_stroke = df.schema.names[2]
        render_mode = 2
    else:
        return None

    from pyspark.sql.functions import pandas_udf, PandasUDFType, col, lit
    from pyspark.sql.types import (StructType, StructField, BinaryType,
                                   IntegerType)
    from ._wrapper_func import TransformAndProjection, Projection

    bounding_box = vega.bounding_box()
    top_left = 'POINT (' + str(bounding_box[0]) + ' ' + str(
        bounding_box[3]) + ')'
    bottom_right = 'POINT (' + str(bounding_box[2]) + ' ' + str(
        bounding_box[1]) + ')'

    height = vega.height()
    width = vega.width()
    coor = vega.coor()
    aggregation_type = vega.aggregation_type()

    if coor == 'EPSG:3857':
        if render_mode == 2:
            df = df.select(
                Projection(col(col_point), lit(bottom_right), lit(top_left),
                           lit(int(height)), lit(int(width))).alias(col_point),
                col(col_color), col(col_stroke))
            agg_schema = StructType([
                StructField(col_point, BinaryType(), True),
                StructField(col_color, IntegerType(), True),
                StructField(col_stroke, IntegerType(), True)
            ])

            @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
            def render_agg_UDF_3857_2(batch_iter):
                for pdf in batch_iter:
                    dd = pdf.groupby([col_point])
                    ll = [col_color, col_stroke]
                    dd = dd[ll].agg([aggregation_type]).reset_index()
                    dd.columns = [col_point, col_color, col_stroke]
                    yield dd

            @pandas_udf("string", PandasUDFType.GROUPED_AGG)
            def weighted_pointmap_wkb_3857_2(point, c, s, conf=vega):
                from arctern import weighted_point_map_layer
                return weighted_point_map_layer(conf,
                                                point,
                                                False,
                                                color_weights=c,
                                                size_weights=s)

            agg_df = df.mapInPandas(render_agg_UDF_3857_2)
            agg_df = agg_df.rdd.coalesce(1, shuffle=True).toDF()
            hex_data = agg_df.agg(
                weighted_pointmap_wkb_3857_2(
                    agg_df[col_point], agg_df[col_color],
                    agg_df[col_stroke])).collect()[0][0]
        elif render_mode == 1:
            df = df.select(
                Projection(col(col_point), lit(bottom_right), lit(top_left),
                           lit(int(height)), lit(int(width))).alias(col_point),
                col(col_count))
            agg_schema = StructType([
                StructField(col_point, BinaryType(), True),
                StructField(col_count, IntegerType(), True)
            ])

            @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
            def render_agg_UDF_3857_1(batch_iter):
                for pdf in batch_iter:
                    dd = pdf.groupby([col_point])
                    dd = dd[col_count].agg([aggregation_type]).reset_index()
                    dd.columns = [col_point, col_count]
                    yield dd

            @pandas_udf("string", PandasUDFType.GROUPED_AGG)
            def weighted_pointmap_wkb_3857_1(point, c, conf=vega):
                from arctern import weighted_point_map_layer
                return weighted_point_map_layer(conf,
                                                point,
                                                False,
                                                color_weights=c)

            agg_df = df.mapInPandas(render_agg_UDF_3857_1)
            agg_df = agg_df.rdd.coalesce(1, shuffle=True).toDF()
            hex_data = agg_df.agg(
                weighted_pointmap_wkb_3857_1(
                    agg_df[col_point], agg_df[col_count])).collect()[0][0]
        else:
            df = df.select(
                Projection(col(col_point), lit(bottom_right), lit(top_left),
                           lit(int(height)), lit(int(width))).alias(col_point))

            @pandas_udf("string", PandasUDFType.GROUPED_AGG)
            def weighted_pointmap_wkb(point, conf=vega):
                from arctern import weighted_point_map_layer
                return weighted_point_map_layer(conf, point, False)

            df = df.rdd.coalesce(1, shuffle=True).toDF()
            hex_data = df.agg(weighted_pointmap_wkb(
                df[col_point])).collect()[0][0]
        return hex_data

    if render_mode == 2:
        df = df.select(
            TransformAndProjection(col(col_point), lit(str(coor)),
                                   lit('EPSG:3857'), lit(bottom_right),
                                   lit(top_left), lit(int(height)),
                                   lit(int(width))).alias(col_point),
            col(col_color), col(col_stroke))
        agg_schema = StructType([
            StructField(col_point, BinaryType(), True),
            StructField(col_color, IntegerType(), True),
            StructField(col_stroke, IntegerType(), True)
        ])

        @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
        def render_agg_UDF_2(batch_iter):
            for pdf in batch_iter:
                dd = pdf.groupby([col_point])
                ll = [col_color, col_stroke]
                dd = dd[ll].agg([aggregation_type]).reset_index()
                dd.columns = [col_point, col_color, col_stroke]
                yield dd

        @pandas_udf("string", PandasUDFType.GROUPED_AGG)
        def weighted_pointmap_wkb_2(point, c, s, conf=vega):
            from arctern import weighted_point_map_layer
            return weighted_point_map_layer(conf,
                                            point,
                                            False,
                                            color_weights=c,
                                            size_weights=s)

        agg_df = df.mapInPandas(render_agg_UDF_2)
        agg_df = agg_df.rdd.coalesce(1, shuffle=True).toDF()
        hex_data = agg_df.agg(
            weighted_pointmap_wkb_2(agg_df[col_point], agg_df[col_color],
                                    agg_df[col_stroke])).collect()[0][0]
    elif render_mode == 1:
        df = df.select(
            TransformAndProjection(col(col_point), lit(str(coor)),
                                   lit('EPSG:3857'), lit(bottom_right),
                                   lit(top_left), lit(int(height)),
                                   lit(int(width))).alias(col_point),
            col(col_count))
        agg_schema = StructType([
            StructField(col_point, BinaryType(), True),
            StructField(col_count, IntegerType(), True)
        ])

        @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
        def render_agg_UDF_1(batch_iter):
            for pdf in batch_iter:
                dd = pdf.groupby([col_point])
                dd = dd[col_count].agg([aggregation_type]).reset_index()
                dd.columns = [col_point, col_count]
                yield dd

        @pandas_udf("string", PandasUDFType.GROUPED_AGG)
        def weighted_pointmap_wkb_1(point, c, conf=vega):
            from arctern import weighted_point_map_layer
            return weighted_point_map_layer(conf,
                                            point,
                                            False,
                                            color_weights=c)

        agg_df = df.mapInPandas(render_agg_UDF_1)
        agg_df = agg_df.rdd.coalesce(1, shuffle=True).toDF()
        hex_data = agg_df.agg(
            weighted_pointmap_wkb_1(agg_df[col_point],
                                    agg_df[col_count])).collect()[0][0]
    else:
        df = df.select(
            TransformAndProjection(col(col_point), lit(str(coor)),
                                   lit('EPSG:3857'), lit(bottom_right),
                                   lit(top_left), lit(int(height)),
                                   lit(int(width))).alias(col_point))

        @pandas_udf("string", PandasUDFType.GROUPED_AGG)
        def weighted_pointmap_wkb_0(point, conf=vega):
            from arctern import weighted_point_map_layer
            return weighted_point_map_layer(conf, point, False)

        df = df.rdd.coalesce(1, shuffle=True).toDF()
        hex_data = df.agg(weighted_pointmap_wkb_0(
            df[col_point])).collect()[0][0]
    return hex_data
Example #8
    def spark_dtype(self):
        return BinaryType()
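On its own this is just a type hook; a minimal hedged sketch of how such a hook could feed a schema, with class and field names that are illustrative rather than from the original project:

from pyspark.sql.types import StructField, BinaryType


class RawBytesColumn:
    name = "payload"

    def spark_dtype(self):
        return BinaryType()


def to_struct_field(column):
    # Build a nullable StructField from the column's declared Spark dtype.
    return StructField(column.name, column.spark_dtype(), True)


field = to_struct_field(RawBytesColumn())  # StructField for a binary "payload" column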
Example #9
                    pdf['width'] = 128
                    pdf['nChannels'] = 4
                    pdf['mode'] = 24
                    pdf['data'] = np.asarray(b'this is binary data')

            yield pdf


schema = StructType([
    StructField('path', StringType(), True),  # input
    StructField('slice', IntegerType(), True),  # intermediate
    StructField('xtile', IntegerType(), True),  # intermediate
    StructField('ytile', IntegerType(), True),  # intermediate
    StructField('origin', StringType(), True),  # output
    StructField('height', IntegerType(), False),
    StructField('width', IntegerType(), False),
    StructField('nChannels', IntegerType(), False),
    StructField('mode', IntegerType(), False),
    StructField('data', BinaryType(), False)
])

# Integration test
df = spark.range(4, numPartitions=10).withColumn(
    'path',
    expr("concat('s3a://this/is/my/object/path_czi', string(id), '.czi')"))

# DataFrame.mapInPandas returns zero, one or more rows for every input row.

dfx = df.mapInPandas(pandas_czi_splitter, schema=schema)
dfx.count()
display(dfx)
Example #10
def choroplethmap(vega, df):
    if df.rdd.isEmpty():
        return None

    if len(df.schema.names) != 2:
        return None

    col_polygon = df.schema.names[0]
    col_count = df.schema.names[1]

    from pyspark.sql.functions import pandas_udf, PandasUDFType, col, lit
    from pyspark.sql.types import (StructType, StructField, BinaryType,
                                   IntegerType)
    from ._wrapper_func import TransformAndProjection, Projection

    bounding_box = vega.bounding_box()
    top_left = 'POINT (' + str(bounding_box[0]) + ' ' + str(
        bounding_box[3]) + ')'
    bottom_right = 'POINT (' + str(bounding_box[2]) + ' ' + str(
        bounding_box[1]) + ')'

    height = vega.height()
    width = vega.width()
    coor = vega.coor()
    aggregation_type = vega.aggregation_type()

    if coor != 'EPSG:3857':
        df = df.select(
            TransformAndProjection(col(col_polygon), lit(str(coor)),
                                   lit('EPSG:3857'), lit(bottom_right),
                                   lit(top_left), lit(int(height)),
                                   lit(int(width))).alias(col_polygon),
            col(col_count))
    else:
        df = df.select(
            Projection(col(col_polygon), lit(bottom_right), lit(top_left),
                       lit(int(height)), lit(int(width))).alias(col_polygon),
            col(col_count))

    agg_schema = StructType([
        StructField(col_polygon, BinaryType(), True),
        StructField(col_count, IntegerType(), True)
    ])

    @pandas_udf(agg_schema, PandasUDFType.MAP_ITER)
    def render_agg_UDF(batch_iter):
        for pdf in batch_iter:
            dd = pdf.groupby([col_polygon])
            dd = dd[col_count].agg([aggregation_type]).reset_index()
            dd.columns = [col_polygon, col_count]
            yield dd

    @pandas_udf("string", PandasUDFType.GROUPED_AGG)
    def choroplethmap_wkb(wkb, w, conf=vega):
        from arctern import choropleth_map_layer
        return choropleth_map_layer(conf, wkb, w, False)

    agg_df = df.mapInPandas(render_agg_UDF)
    agg_df = agg_df.rdd.coalesce(1, shuffle=True).toDF()
    hex_data = agg_df.agg(
        choroplethmap_wkb(agg_df[col_polygon],
                          agg_df[col_count])).collect()[0][0]
    return hex_data
Example #11
#arr = rdd.take(1)[0]
#
#Image.open(BytesIO(arr))

# COMMAND ----------

from PIL import Image
from io import BytesIO
from pyspark.sql.types import BinaryType, StructType, StructField
from functools import partial

rdd = fin.flatMap(
    partial(msg_map, func=lambda r: r.data, conn=conn_d['/center_camera/image_color/compressed'])
)

rddTuple = rdd.map(lambda x: (bytearray(x),))
schema = StructType([StructField('rawdata', BinaryType(), False)])
df = rddTuple.toDF(schema)
df.cache()

# COMMAND ----------

from sparkdl.image.imageIO import PIL_decode, imageArrayToStruct
from pyspark.sql.functions import col, udf
from pyspark.ml.image import ImageSchema

imageUdf = udf(lambda b: imageArrayToStruct(PIL_decode(b)), ImageSchema.imageSchema['image'].dataType)


img = df.withColumn('image', imageUdf(col('rawdata')))
display(img.select('image'))
Example #12

# COMMAND ----------

# Start Timing
start_download = timer()

# COMMAND ----------

# DBTITLE 1,Whole Download Pipeline
from pyspark.sql.types import StructField, BinaryType, StructType, StringType, IntegerType

# Define Schema for output table
schema = StructType(fields=[
    StructField('id', StringType(), True),
    StructField('img_binary', BinaryType(), True),
    StructField('img_size', IntegerType(), True),
    StructField('img_width', IntegerType(), True),
    StructField('img_height', IntegerType(), True),
    StructField('error_code', StringType(), True),
])

df_dl_links = spark.read.parquet(INPUT_FILE)

print("About to process {0} rows".format(df_dl_links.count()))
# Sort the dataframe along id
df_dl_links = df_dl_links.sort("id")
# Load the previously downloaded images (if job restarted)
if os.path.isdir("/dbfs" + OUTPUT_FILE):
    download_history_df = spark.read.parquet(OUTPUT_FILE)
else:
Example #13
_SPARK_TYPE_MAPPING = {
    "bool": BooleanType(),
    "boolean": BooleanType(),
    "byte": ByteType(),
    "tinyint": ByteType(),
    "short": ShortType(),
    "smallint": ShortType(),
    "int": IntegerType(),
    "long": LongType(),
    "bigint": LongType(),
    "float": FloatType(),
    "double": DoubleType(),
    "str": StringType(),
    "string": StringType(),
    "binary": BinaryType(),
}


class SchemaError(Exception):
    def __init__(self, message: str):
        self.message = message


class SchemaBuilder(RikaiModelSchemaVisitor):
    def visitStructType(
            self, ctx: RikaiModelSchemaParser.StructTypeContext) -> StructType:
        return StructType(
            [self.visitStructField(field) for field in ctx.field()])

    def visitStructField(
Example #14
def from_arrow_type(at: "pa.DataType", prefer_timestamp_ntz: bool = False) -> DataType:
    """Convert pyarrow type to Spark data type."""
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types

    spark_type: DataType
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at) and prefer_timestamp_ntz and at.tz is None:
        spark_type = TimestampNTZType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_duration(at):
        spark_type = DayTimeIntervalType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError("MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
        spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at))
        return StructType(
            [
                StructField(field.name, from_arrow_type(field.type), nullable=field.nullable)
                for field in at
            ]
        )
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " + str(at))
    return spark_type
Example #15
def spark_streaming_to_pubsublite(
    project_number: int, location: str, topic_id: str
) -> None:
    # [START pubsublite_spark_streaming_to_pubsublite]
    from pyspark.sql import SparkSession
    from pyspark.sql.functions import array, create_map, col, lit, when
    from pyspark.sql.types import BinaryType, StringType
    import uuid

    # TODO(developer):
    # project_number = 11223344556677
    # location = "us-central1-a"
    # topic_id = "your-topic-id"

    spark = SparkSession.builder.appName("write-app").getOrCreate()

    # Create a RateStreamSource that generates consecutive numbers with timestamps:
    # |-- timestamp: timestamp (nullable = true)
    # |-- value: long (nullable = true)
    sdf = spark.readStream.format("rate").option("rowsPerSecond", 1).load()

    # Transform the dataframe to match the required data fields and data types:
    # https://github.com/googleapis/java-pubsublite-spark#data-schema
    sdf = (
        sdf.withColumn("key", lit("example").cast(BinaryType()))
        .withColumn("data", col("value").cast(StringType()).cast(BinaryType()))
        .withColumnRenamed("timestamp", "event_timestamp")
        # Populate the attributes field. For example, an even value will
        # have {"key1", [b"even"]}.
        .withColumn(
            "attributes",
            create_map(
                lit("key1"),
                array(when(col("value") % 2 == 0, b"even").otherwise(b"odd")),
            ),
        )
        .drop("value")
    )

    # After the transformation, the schema of the dataframe should look like:
    # |-- key: binary (nullable = false)
    # |-- data: binary (nullable = true)
    # |-- event_timestamp: timestamp (nullable = true)
    # |-- attributes: map (nullable = false)
    # |    |-- key: string
    # |    |-- value: array (valueContainsNull = false)
    # |    |    |-- element: binary (containsNull = false)
    sdf.printSchema()

    query = (
        sdf.writeStream.format("pubsublite")
        .option(
            "pubsublite.topic",
            f"projects/{project_number}/locations/{location}/topics/{topic_id}",
        )
        # Required. Use a unique checkpoint location for each job.
        .option("checkpointLocation", "/tmp/app" + uuid.uuid4().hex)
        .outputMode("append")
        .trigger(processingTime="1 second")
        .start()
    )

    # Wait 60 seconds to terminate the query.
    query.awaitTermination(60)
    query.stop()
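Filling in the TODO block with the placeholder values from the comments, an invocation looks like this (all three arguments are placeholders):

spark_streaming_to_pubsublite(
    project_number=11223344556677,
    location="us-central1-a",
    topic_id="your-topic-id",
)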
Example #16
    def sqlType(cls):
        return StructType(
            [StructField("raster_source_kryo", BinaryType(), False)])
Example #17
    def test_as_spark_type(self):
        type_mapper = {
            # binary
            np.character: BinaryType(),
            np.bytes_: BinaryType(),
            np.string_: BinaryType(),
            bytes: BinaryType(),
            # integer
            np.int8: ByteType(),
            np.byte: ByteType(),
            np.int16: ShortType(),
            np.int32: IntegerType(),
            np.int64: LongType(),
            np.int: LongType(),
            int: LongType(),
            # floating
            np.float32: FloatType(),
            np.float: DoubleType(),
            np.float64: DoubleType(),
            float: DoubleType(),
            # string
            np.str: StringType(),
            np.unicode_: StringType(),
            str: StringType(),
            # bool
            np.bool: BooleanType(),
            bool: BooleanType(),
            # datetime
            np.datetime64: TimestampType(),
            datetime.datetime: TimestampType(),
            # DateType
            datetime.date: DateType(),
            # DecimalType
            decimal.Decimal: DecimalType(38, 18),
            # ArrayType
            np.ndarray: ArrayType(StringType()),
            List[bytes]: ArrayType(BinaryType()),
            List[np.character]: ArrayType(BinaryType()),
            List[np.bytes_]: ArrayType(BinaryType()),
            List[np.string_]: ArrayType(BinaryType()),
            List[bool]: ArrayType(BooleanType()),
            List[np.bool]: ArrayType(BooleanType()),
            List[datetime.date]: ArrayType(DateType()),
            List[np.int8]: ArrayType(ByteType()),
            List[np.byte]: ArrayType(ByteType()),
            List[decimal.Decimal]: ArrayType(DecimalType(38, 18)),
            List[float]: ArrayType(DoubleType()),
            List[np.float]: ArrayType(DoubleType()),
            List[np.float64]: ArrayType(DoubleType()),
            List[np.float32]: ArrayType(FloatType()),
            List[np.int32]: ArrayType(IntegerType()),
            List[int]: ArrayType(LongType()),
            List[np.int]: ArrayType(LongType()),
            List[np.int64]: ArrayType(LongType()),
            List[np.int16]: ArrayType(ShortType()),
            List[str]: ArrayType(StringType()),
            List[np.unicode_]: ArrayType(StringType()),
            List[datetime.datetime]: ArrayType(TimestampType()),
            List[np.datetime64]: ArrayType(TimestampType()),
        }

        for numpy_or_python_type, spark_type in type_mapper.items():
            self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)

        with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."):
            as_spark_type(np.dtype("uint64"))
Example #18
__null_type: NullType = NullType()
_NULL_TYPE: str = __null_type.simpleString()
assert _NULL_TYPE == __null_type.typeName()


__bool_type: BooleanType = BooleanType()
_BOOL_TYPE: str = __bool_type.simpleString()
assert _BOOL_TYPE == __bool_type.typeName()


__str_type: StringType = StringType()
_STR_TYPE: str = __str_type.simpleString()
assert _STR_TYPE == __str_type.typeName()


__binary_type: BinaryType = BinaryType()
_BINARY_TYPE: str = __binary_type.simpleString()
assert _BINARY_TYPE == __binary_type.typeName()


__byte_type: ByteType = ByteType()
_TINYINT_TYPE: str = __byte_type.simpleString()

__short_type: ShortType = ShortType()
_SMALLINT_TYPE: str = __short_type.simpleString()

__int_type: IntegerType = IntegerType()
_INT_TYPE: str = __int_type.simpleString()
assert _INT_TYPE == int.__name__
assert __int_type.typeName().startswith(_INT_TYPE)
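For reference, the byte and short constants above have no accompanying asserts because their simpleString() and typeName() values differ, unlike the binary case; the concrete values in stock PySpark are:

from pyspark.sql.types import BinaryType, ByteType, ShortType

assert BinaryType().simpleString() == BinaryType().typeName() == "binary"
assert ByteType().simpleString() == "tinyint" and ByteType().typeName() == "byte"
assert ShortType().simpleString() == "smallint" and ShortType().typeName() == "short"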
Example #19
    def sqlType(self):
        return StructField("wkb", BinaryType(), False)
Example #20
    "array": ArrayType,
    "bigint": LongType,
    "date": DateType,
    "byte": ByteType,
    "short": ShortType,
    "datetime": TimestampType,
    "binary": BinaryType,
    "null": NullType,
    "vector": VectorUDT
}

SPARK_DTYPES_DICT_OBJECTS = \
    {"string": StringType(), "int": IntegerType(), "float": FloatType(),
     "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()),
     "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(),
     "datetime": TimestampType(), "binary": BinaryType(), "null": NullType()
     }

# Profiler
PROFILER_COLUMN_TYPES = {
    "categorical", "numeric", "date", "null", "array", "binary"
}

SPARK_DTYPES_TO_PROFILER = {
    "int": ["smallint", "tinyint", "bigint", "int"],
    "decimal": ["float", "double"],
    "string": "string",
    "date": {"date", "timestamp"},
    "boolean": "boolean",
    "binary": "binary",
    "array": "array",
Example #21
    _array_type_mappings,
    _acceptable_types)

__null_type = NullType()
_NULL_TYPE = __null_type.simpleString()
assert _NULL_TYPE == __null_type.typeName()

__bool_type = BooleanType()
_BOOL_TYPE = __bool_type.simpleString()
assert _BOOL_TYPE == __bool_type.typeName()

__str_type = StringType()
_STR_TYPE = __str_type.simpleString()
assert _STR_TYPE == __str_type.typeName()

__binary_type = BinaryType()
_BINARY_TYPE = __binary_type.simpleString()
assert _BINARY_TYPE == __binary_type.typeName()

__byte_type = ByteType()
_TINYINT_TYPE = __byte_type.simpleString()

__short_type = ShortType()
_SMALLINT_TYPE = __short_type.simpleString()

__int_type = IntegerType()
_INT_TYPE = __int_type.simpleString()
assert _INT_TYPE == int.__name__
assert __int_type.typeName().startswith(_INT_TYPE)

__long_type = LongType()
Example #22
                    # WARNING: Assumes that your return type default constructor returns a "reasonable" value.
                    # May return None instead?
                    duration = python_return_type()
                except subprocess.TimeoutExpired:
                    print(f"Restarting on {audio_file}")
                    # Call again. Sometimes gcsfuse just stalls, so we need restartability
                    return get_soxi_info_udf(audio_file_series)
                durations.append(duration)
            return pd.Series(durations)

    return get_soxi_info_udf


get_audio_seconds_udf = _prepare_soxi_udf("-D", DoubleType(), float)
get_audio_sample_rate_udf = _prepare_soxi_udf("-r", StringType(), str)
get_audio_annotations_udf = _prepare_soxi_udf("-a", BinaryType(), bytes)

# Can I return an array type of struct types?
AUDIO_SEGMENTS_RETURN_TYPE = T.StructType(
    [
        T.StructField("audio_name", T.ArrayType(T.StringType())),
        T.StructField("audio", T.ArrayType(T.BinaryType())),
    ]
)


@F.pandas_udf(AUDIO_SEGMENTS_RETURN_TYPE)
def create_audio_segments_udf(
    audio_bytes_series: pd.Series,
    audio_type_series: pd.Series,
    audio_name_series: pd.Series,
Example #23
    def test_as_spark_type_pandas_on_spark_dtype(self):
        type_mapper = {
            # binary
            np.character: (np.character, BinaryType()),
            np.bytes_: (np.bytes_, BinaryType()),
            np.string_: (np.bytes_, BinaryType()),
            bytes: (np.bytes_, BinaryType()),
            # integer
            np.int8: (np.int8, ByteType()),
            np.byte: (np.int8, ByteType()),
            np.int16: (np.int16, ShortType()),
            np.int32: (np.int32, IntegerType()),
            np.int64: (np.int64, LongType()),
            np.int: (np.int64, LongType()),
            int: (np.int64, LongType()),
            # floating
            np.float32: (np.float32, FloatType()),
            np.float: (np.float64, DoubleType()),
            np.float64: (np.float64, DoubleType()),
            float: (np.float64, DoubleType()),
            # string
            np.str: (np.unicode_, StringType()),
            np.unicode_: (np.unicode_, StringType()),
            str: (np.unicode_, StringType()),
            # bool
            np.bool: (np.bool, BooleanType()),
            bool: (np.bool, BooleanType()),
            # datetime
            np.datetime64: (np.datetime64, TimestampType()),
            datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
            # DateType
            datetime.date: (np.dtype("object"), DateType()),
            # DecimalType
            decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
            # ArrayType
            np.ndarray: (np.dtype("object"), ArrayType(StringType())),
            # CategoricalDtype
            CategoricalDtype(categories=["a", "b", "c"]): (
                CategoricalDtype(categories=["a", "b", "c"]),
                LongType(),
            ),
        }

        for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
            self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
            self.assertEqual(pandas_on_spark_type(numpy_or_python_type),
                             (dtype, spark_type))

            if isinstance(numpy_or_python_type, CategoricalDtype):
                # Nested CategoricalDtype is not yet supported.
                continue

            self.assertEqual(as_spark_type(List[numpy_or_python_type]),
                             ArrayType(spark_type))
            self.assertEqual(
                pandas_on_spark_type(List[numpy_or_python_type]),
                (np.dtype("object"), ArrayType(spark_type)),
            )

            # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+
            if sys.version_info >= (3, 8) and LooseVersion(
                    np.__version__) >= LooseVersion("1.21"):
                import numpy.typing as ntp

                self.assertEqual(
                    as_spark_type(ntp.NDArray[numpy_or_python_type]),
                    ArrayType(spark_type))
                self.assertEqual(
                    pandas_on_spark_type(ntp.NDArray[numpy_or_python_type]),
                    (np.dtype("object"), ArrayType(spark_type)),
                )

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            as_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            as_spark_type(np.dtype("object"))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            pandas_on_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            pandas_on_spark_type(np.dtype("object"))
Example #24
    def setUpClass(cls):
        from datetime import date, datetime
        from decimal import Decimal

        super(ArrowTests, cls).setUpClass()
        cls.warnings_lock = threading.Lock()

        # Synchronize default timezone between Python and Java
        cls.tz_prev = os.environ.get("TZ", None)  # save current tz if set
        tz = "America/Los_Angeles"
        os.environ["TZ"] = tz
        time.tzset()

        cls.spark.conf.set("spark.sql.session.timeZone", tz)

        # Test fallback
        cls.spark.conf.set("spark.sql.execution.arrow.enabled", "false")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.enabled") == "false"
        cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.enabled") == "true"

        cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled",
                           "true")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.fallback.enabled") == "true"
        cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled",
                           "false")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.fallback.enabled") == "false"

        # Enable Arrow optimization in this tests.
        cls.spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
        # Disable fallback by default to easily detect the failures.
        cls.spark.conf.set(
            "spark.sql.execution.arrow.pyspark.fallback.enabled", "false")

        cls.schema_wo_null = StructType([
            StructField("1_str_t", StringType(), True),
            StructField("2_int_t", IntegerType(), True),
            StructField("3_long_t", LongType(), True),
            StructField("4_float_t", FloatType(), True),
            StructField("5_double_t", DoubleType(), True),
            StructField("6_decimal_t", DecimalType(38, 18), True),
            StructField("7_date_t", DateType(), True),
            StructField("8_timestamp_t", TimestampType(), True),
            StructField("9_binary_t", BinaryType(), True),
        ])
        cls.schema = cls.schema_wo_null.add("10_null_t", NullType(), True)
        cls.data_wo_null = [
            (
                "a",
                1,
                10,
                0.2,
                2.0,
                Decimal("2.0"),
                date(1969, 1, 1),
                datetime(1969, 1, 1, 1, 1, 1),
                bytearray(b"a"),
            ),
            (
                "b",
                2,
                20,
                0.4,
                4.0,
                Decimal("4.0"),
                date(2012, 2, 2),
                datetime(2012, 2, 2, 2, 2, 2),
                bytearray(b"bb"),
            ),
            (
                "c",
                3,
                30,
                0.8,
                6.0,
                Decimal("6.0"),
                date(2100, 3, 3),
                datetime(2100, 3, 3, 3, 3, 3),
                bytearray(b"ccc"),
            ),
            (
                "d",
                4,
                40,
                1.0,
                8.0,
                Decimal("8.0"),
                date(2262, 4, 12),
                datetime(2262, 3, 3, 3, 3, 3),
                bytearray(b"dddd"),
            ),
        ]
        cls.data = [tuple(list(d) + [None]) for d in cls.data_wo_null]
Example #25
from pyspark.sql.types import LongType, IntegerType, StringType, StructType, StructField, BinaryType

field = [
    StructField("record_id", LongType(), True),
    StructField("op", IntegerType(), True),
    StructField("conn", IntegerType(), True),
    StructField("time", LongType(), True),
    StructField("topic", StringType(), True),
    StructField("dtype", StringType(), True),
    StructField("header", StringType(), True),
    StructField(
        "data",
        StructType([
            StructField('message_definition', StringType(), True),
            StructField('md5sum', StringType(), True),
            StructField('msg_raw', BinaryType(), True),
        ]))
]

dfSchema = StructType(field)
sorted_fields = sorted(dfSchema.fields, key=lambda x: x.name)
sorted_schema = StructType(fields=sorted_fields)

df_records = sqlContext.createDataFrame(sc.emptyRDD(), sorted_schema)

# COMMAND ----------

from pyspark.sql import Row


def convert_to_row(rid, opid, connid, dheader, ddata):
Example #26
    def test_supported_types(self):

        values = [
            1, 2, 3, 4, 5, 1.1, 2.2,
            Decimal(1.123), [1, 2, 2], True, 'hello',
            bytearray([0x01, 0x02])
        ]
        output_fields = [('id', IntegerType()), ('byte', ByteType()),
                         ('short', ShortType()), ('int', IntegerType()),
                         ('long', LongType()), ('float', FloatType()),
                         ('double', DoubleType()),
                         ('decim', DecimalType(10, 3)),
                         ('array', ArrayType(IntegerType())),
                         ('bool', BooleanType()), ('str', StringType()),
                         ('bin', BinaryType())]

        output_schema = StructType([StructField(*x) for x in output_fields])
        df = self.spark.createDataFrame([values], schema=output_schema)

        # Different forms of group map pandas UDF, results of these are the same
        udf1 = pandas_udf(
            lambda pdf: pdf.assign(byte=pdf.byte * 2,
                                   short=pdf.short * 2,
                                   int=pdf.int * 2,
                                   long=pdf.long * 2,
                                   float=pdf.float * 2,
                                   double=pdf.double * 2,
                                   decim=pdf.decim * 2,
                                   bool=False if pdf.bool else True,
                                   str=pdf.str + 'there',
                                   array=pdf.array,
                                   bin=pdf.bin), output_schema,
            PandasUDFType.GROUPED_MAP)

        udf2 = pandas_udf(
            lambda _, pdf: pdf.assign(byte=pdf.byte * 2,
                                      short=pdf.short * 2,
                                      int=pdf.int * 2,
                                      long=pdf.long * 2,
                                      float=pdf.float * 2,
                                      double=pdf.double * 2,
                                      decim=pdf.decim * 2,
                                      bool=False if pdf.bool else True,
                                      str=pdf.str + 'there',
                                      array=pdf.array,
                                      bin=pdf.bin), output_schema,
            PandasUDFType.GROUPED_MAP)

        udf3 = pandas_udf(
            lambda key, pdf: pdf.assign(id=key[0],
                                        byte=pdf.byte * 2,
                                        short=pdf.short * 2,
                                        int=pdf.int * 2,
                                        long=pdf.long * 2,
                                        float=pdf.float * 2,
                                        double=pdf.double * 2,
                                        decim=pdf.decim * 2,
                                        bool=False if pdf.bool else True,
                                        str=pdf.str + 'there',
                                        array=pdf.array,
                                        bin=pdf.bin), output_schema,
            PandasUDFType.GROUPED_MAP)

        result1 = df.groupby('id').apply(udf1).sort('id').toPandas()
        expected1 = df.toPandas().groupby('id').apply(
            udf1.func).reset_index(drop=True)

        result2 = df.groupby('id').apply(udf2).sort('id').toPandas()
        expected2 = expected1

        result3 = df.groupby('id').apply(udf3).sort('id').toPandas()
        expected3 = expected1

        assert_frame_equal(expected1, result1)
        assert_frame_equal(expected2, result2)
        assert_frame_equal(expected3, result3)
Example #27
    train_df.show()
    test_df.show()

    # Under the hood, each of the partitions is fully loaded in memory, which may be expensive.
    # This ensures that each of the partitions has a small size.
    train_df = train_df.repartition(100)
    test_df = test_df.repartition(100)

    imageSchema = StructType([
        StructField("origin", StringType(), True),
        StructField("height", IntegerType(), False),
        StructField("width", IntegerType(), False),
        StructField("nChannels", IntegerType(), False),
        StructField("mode", IntegerType(), False),
        StructField("data", BinaryType(), False)
    ])

    schema = StructType([
        StructField("image", imageSchema),
        StructField("label", IntegerType(), False)
    ])

    image_df = (train_df.rdd.map(create_image_dataframe).toDF(schema))

    image_df.show()
    image_df.printSchema()
    image_df.select("image.*").show()

    image_df.select("image.data").show()