Example No. 1
def infer_spark_type(typeclass) -> t.DataType:
    if typeclass in (None, NoneType):
        return t.NullType()
    elif typeclass is str:
        return t.StringType()
    elif typeclass in {bytes, bytearray}:
        return t.BinaryType()
    elif typeclass is bool:
        return t.BooleanType()
    elif typeclass is date:
        return t.DateType()
    elif typeclass is datetime:
        return t.TimestampType()
    elif typeclass is Decimal:
        return t.DecimalType(precision=36, scale=6)
    elif isinstance(typeclass, type) and issubclass(typeclass, BoundDecimal):
        (precision, scale) = typeclass.__constraints__
        return t.DecimalType(precision=precision, scale=scale)
    elif typeclass is float:
        return t.DoubleType()
    elif typeclass is int:
        return t.IntegerType()
    elif typeclass is long:
        return t.LongType()
    elif typeclass is short:
        return t.ShortType()
    elif typeclass is byte:
        return t.ByteType()
    elif getattr(typeclass, "__origin__", None) is not None:
        return infer_complex_spark_type(typeclass)
    elif is_pyspark_class(typeclass):
        return transform(typeclass)
    else:
        raise TypeError(f"Don't know how to represent {typeclass} in Spark")
Example No. 2
def infer_pd_series_spark_type(
        pser: pd.Series,
        dtype: Dtype,
        prefer_timestamp_ntz: bool = False) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param pser: :class:`pandas.Series` to be inferred
    :param dtype: the Series' dtype
    :param prefer_timestamp_ntz: if true, infers datetime without timezone as
        TimestampNTZType type. If false, infers it as TimestampType.
    :return: the inferred Spark data type
    """
    if dtype == np.dtype("object"):
        if len(pser) == 0 or pser.isnull().all():
            return types.NullType()
        elif hasattr(pser.iloc[0], "__UDT__"):
            return pser.iloc[0].__UDT__
        else:
            return from_arrow_type(
                pa.Array.from_pandas(pser).type, prefer_timestamp_ntz)
    elif isinstance(dtype, CategoricalDtype):
        if isinstance(pser.dtype, CategoricalDtype):
            return as_spark_type(pser.cat.codes.dtype,
                                 prefer_timestamp_ntz=prefer_timestamp_ntz)
        else:
            # `pser` must already be converted to codes.
            return as_spark_type(pser.dtype,
                                 prefer_timestamp_ntz=prefer_timestamp_ntz)
    else:
        return as_spark_type(dtype, prefer_timestamp_ntz=prefer_timestamp_ntz)
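Assuming the function above is in scope (it is internal to pyspark.pandas, so the exact module path is omitted here), the NullType branch can be exercised like this:

import pandas as pd
from pyspark.sql import types

# An object-dtype Series that is empty or entirely null lands in the NullType branch.
pser = pd.Series([None, None], dtype="object")
assert infer_pd_series_spark_type(pser, pser.dtype) == types.NullType()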
Example No. 3
def get_spark_data_type(input_value):
    return {
        "str": T.StringType(),
        "int": T.LongType(),
        "bool": T.BooleanType(),
        "float": T.DoubleType(),
        "NoneType": T.NullType(),
    }[type(input_value).__name__]
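A short usage sketch: the lookup keys on type(input_value).__name__, so None dispatches through the string "NoneType":

from pyspark.sql import types as T

assert get_spark_data_type(None) == T.NullType()     # type(None).__name__ == "NoneType"
assert get_spark_data_type("abc") == T.StringType()
assert get_spark_data_type(1.5) == T.DoubleType()
assert get_spark_data_type(True) == T.BooleanType()  # bool is matched by name, not as an int subclass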
Example No. 4
def get_type(obj):
    if obj is None:
        return T.NullType()

    if isinstance(obj, type):
        return python_type_mappings.get(obj)()

    if isinstance(obj, str):
        return string_type_mapping.get(obj)()

    raise TypeError(f"type {type(obj)} cannot be mapped")
Example No. 5
def main(args):
    spark = sql.SparkSession.builder.appName('update-analyzer').getOrCreate()

    msg_struct = types.StructType([
        types.StructField('text', types.StringType(), True),
        types.StructField('user_id', types.StringType(), True),
        types.StructField('update_id', types.StringType(), True)
    ])

    analyzer = vader.SentimentIntensityAnalyzer()
    analyzer_bcast = spark.sparkContext.broadcast(analyzer)
    vhost_bcast = args.vhost
    vport_bcast = args.vport

    def sentiment_generator_impl(text, user_id, update_id):
        va = analyzer_bcast.value
        english = SpacyMagic.get('en_core_web_sm')
        result = english(text)
        sents = [str(sent) for sent in result.sents]
        sentiments = [va.polarity_scores(str(s)) for s in sents]
        obj = dict(user_id=user_id,
                   update_id=update_id,
                   text=text,
                   sentiments=sentiments)
        try:
            con = httplib.HTTPConnection(host=vhost_bcast, port=vport_bcast)
            con.request('POST', '/', body=json.dumps(obj))
            con.close()
        except Exception as e:
            logging.warning('unable to POST to visualizer, error:')
            logging.warning(str(e))

    sentiment_generator = functions.udf(sentiment_generator_impl,
                                        types.NullType())

    records = (
        spark.readStream
        .format('kafka')
        .option('kafka.bootstrap.servers', args.brokers)
        .option('subscribe', args.topic)
        .load()
        .select(functions.column('value').cast(types.StringType()).alias('value'))
        .select(functions.from_json(functions.column('value'), msg_struct).alias('json'))
        .select(
            functions.column('json.user_id'),
            functions.column('json.update_id'),
            functions.column('json.text'),
            sentiment_generator(
                functions.column('json.text'),
                functions.column('json.user_id'),
                functions.column('json.update_id')))
        .writeStream.format("console").start())

    records.awaitTermination()
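A hypothetical entry point for the job above; the flag names are assumptions that mirror the attributes read from args (brokers, topic, vhost, vport):

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='update-analyzer')
    parser.add_argument('--brokers', required=True)          # Kafka bootstrap servers
    parser.add_argument('--topic', required=True)            # Kafka topic to subscribe to
    parser.add_argument('--vhost', required=True)            # visualizer host for the POST
    parser.add_argument('--vport', type=int, required=True)  # visualizer port
    main(parser.parse_args())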
Example No. 6
def infer_pd_series_spark_type(s: pd.Series) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param s: :class:`pandas.Series` to be inferred
    :return: the inferred Spark data type
    """
    dt = s.dtype
    if dt == np.dtype("object"):
        if len(s) == 0 or s.isnull().all():
            return types.NullType()
        elif hasattr(s.iloc[0], "__UDT__"):
            return s.iloc[0].__UDT__
        else:
            return from_arrow_type(pa.Array.from_pandas(s).type)
    else:
        return as_spark_type(dt)
Example No. 7
def infer_pd_series_spark_type(pser: pd.Series, dtype: Dtype) -> types.DataType:
    """Infer Spark DataType from pandas Series dtype.

    :param pser: :class:`pandas.Series` to be inferred
    :param dtype: the Series' dtype
    :return: the inferred Spark data type
    """
    if dtype == np.dtype("object"):
        if len(pser) == 0 or pser.isnull().all():
            return types.NullType()
        elif hasattr(pser.iloc[0], "__UDT__"):
            return pser.iloc[0].__UDT__
        else:
            return from_arrow_type(pa.Array.from_pandas(pser).type)
    elif isinstance(dtype, CategoricalDtype):
        # `pser` must already be converted to codes.
        return as_spark_type(pser.dtype)
    else:
        return as_spark_type(dtype)
Example No. 8
def infer_schema(rec):
    """infers dataframe schema for a record. Assumes every dict is a Struct, not a Map"""
    if isinstance(rec, dict):
        return pst.StructType([
            pst.StructField(key, DataWriter.infer_schema(value), True)
            for key, value in sorted(rec.items())
        ])
    elif isinstance(rec, list):
        if len(rec) == 0:
            #raise ValueError("can't infer type of an empty list")
            return pst.ArrayType(pst.NullType())
        elem_type = DataWriter.infer_schema(rec[0])
        for elem in rec:
            this_type = DataWriter.infer_schema(elem)
            if elem_type != this_type:
                raise ValueError(
                    "can't infer type of a list with inconsistent elem types"
                )
        return pst.ArrayType(elem_type)
    else:
        return pst._infer_type(rec)
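A small, hypothetical record to show the recursion; DataWriter is assumed to be the class that encloses this method:

import pyspark.sql.types as pst

rec = {"name": "x", "scores": [1.0, 2.0], "tags": []}
schema = DataWriter.infer_schema(rec)
# dicts become StructType, lists become ArrayType, and an empty list falls back
# to ArrayType(NullType()); scalars are delegated to pst._infer_type, so "name"
# maps to StringType and "scores" to ArrayType(DoubleType).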
Example No. 9
import re

from pyspark.sql import SparkSession, Row
from pyspark.sql import functions as F
from pyspark.sql import types as T

# Define UDFs

null_negative_int = F.udf(
    lambda val: T.NullType()
    if val is None or val < 0 else val, T.IntegerType())

null_negative_double = F.udf(
    lambda val: T.NullType()
    if val is None or val < 0 else val, T.DoubleType())

to_float_list = F.udf(
    lambda lst: [float(x_) if is_float(x_) else None for x_ in lst],
    T.ArrayType(T.DoubleType()),
)

is_float = lambda val: re.match(r"^-?\d+(?:\.\d+)?$", val) is not None


def clean(spark, rows):
    # Load Data
    df = spark.createDataFrame(Row(**row) for row in rows)

    # Clean column country
    re_country = r"[a-zA-Z][a-zA-Z\s\-]*"
Example No. 10
    ]), True),
t.StructField(
    "h",
    t.ArrayType(
        t.MapType(
            t.StringType(),
            t.StructType([
                t.StructField("d", t.DoubleType(), False),
                t.StructField(
                    "e",
                    t.StructType([
                        t.StructField("a", t.StringType(), False),
                        t.StructField("b", t.IntegerType(), False),
                        t.StructField("c", t.BooleanType(), False)
                    ]), False),
                t.StructField("f", t.NullType(), False)
            ]), True), False), False),
t.StructField(
    "i",
    t.MapType(
        t.StructType([
            t.StructField("d", t.DoubleType(), False),
            t.StructField(
                "e",
                t.StructType([
                    t.StructField("a", t.StringType(), False),
                    t.StructField("b", t.IntegerType(), False),
                    t.StructField("c", t.BooleanType(), False)
                ]), False),
            t.StructField("f", t.NullType(), False)
        ]),
Example No. 11
from typing import NamedTuple

import pytest
from pyspark.sql import types as t

from tinsel.lib import infer_spark_type, transform_field, maybe_unlift_optional, struct
from tinsel.types import NoneType, long, short, byte, decimal

DEFAULT_NAME = "some_field"


@struct
class Dummy(NamedTuple):
    pass


PRIMITIVES = [
    (NoneType, t.NullType()),
    (int, t.IntegerType()),
    (float, t.DoubleType()),
    (str, t.StringType()),
    (bytes, t.BinaryType()),
    (bytearray, t.BinaryType()),
    (bool, t.BooleanType()),
]

SYNTHETIC_PRIMITIVES = [
    (long, t.LongType()),
    (short, t.ShortType()),
    (byte, t.ByteType()),
]

DATE_TYPES = [