Example #1
    def test_as_spark_type_koalas_dtype(self):
        type_mapper = {
            # binary
            np.character: (np.character, BinaryType()),
            np.bytes_: (np.bytes_, BinaryType()),
            np.string_: (np.bytes_, BinaryType()),
            bytes: (np.bytes_, BinaryType()),
            # integer
            np.int8: (np.int8, ByteType()),
            np.byte: (np.int8, ByteType()),
            np.int16: (np.int16, ShortType()),
            np.int32: (np.int32, IntegerType()),
            np.int64: (np.int64, LongType()),
            np.int: (np.int64, LongType()),
            int: (np.int64, LongType()),
            # floating
            np.float32: (np.float32, FloatType()),
            np.float: (np.float64, DoubleType()),
            np.float64: (np.float64, DoubleType()),
            float: (np.float64, DoubleType()),
            # string
            np.str: (np.unicode_, StringType()),
            np.unicode_: (np.unicode_, StringType()),
            str: (np.unicode_, StringType()),
            # bool
            np.bool: (np.bool, BooleanType()),
            bool: (np.bool, BooleanType()),
            # datetime
            np.datetime64: (np.datetime64, TimestampType()),
            datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()),
            # DateType
            datetime.date: (np.dtype("object"), DateType()),
            # DecimalType
            decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)),
            # ArrayType
            np.ndarray: (np.dtype("object"), ArrayType(StringType())),
            List[bytes]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.character]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[np.string_]: (np.dtype("object"), ArrayType(BinaryType())),
            List[bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[np.bool]: (np.dtype("object"), ArrayType(BooleanType())),
            List[datetime.date]: (np.dtype("object"), ArrayType(DateType())),
            List[np.int8]: (np.dtype("object"), ArrayType(ByteType())),
            List[np.byte]: (np.dtype("object"), ArrayType(ByteType())),
            List[decimal.Decimal]:
            (np.dtype("object"), ArrayType(DecimalType(38, 18))),
            List[float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())),
            List[np.float32]: (np.dtype("object"), ArrayType(FloatType())),
            List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())),
            List[int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int64]: (np.dtype("object"), ArrayType(LongType())),
            List[np.int16]: (np.dtype("object"), ArrayType(ShortType())),
            List[str]: (np.dtype("object"), ArrayType(StringType())),
            List[np.unicode_]: (np.dtype("object"), ArrayType(StringType())),
            List[datetime.datetime]:
            (np.dtype("object"), ArrayType(TimestampType())),
            List[np.datetime64]:
            (np.dtype("object"), ArrayType(TimestampType())),
            # CategoricalDtype
            CategoricalDtype(categories=["a", "b", "c"]): (
                CategoricalDtype(categories=["a", "b", "c"]),
                LongType(),
            ),
        }

        for numpy_or_python_type, (dtype, spark_type) in type_mapper.items():
            self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
            self.assertEqual(pandas_on_spark_type(numpy_or_python_type),
                             (dtype, spark_type))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            as_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            as_spark_type(np.dtype("object"))

        with self.assertRaisesRegex(TypeError,
                                    "Type uint64 was not understood."):
            pandas_on_spark_type(np.dtype("uint64"))

        with self.assertRaisesRegex(TypeError,
                                    "Type object was not understood."):
            pandas_on_spark_type(np.dtype("object"))
Example #2
        ]
    },
    format="csv",
    format_options={'withHeader': True})
#convert to dataframes
rpa_AE_RO_df = rpa_AE_RO_dyf.toDF()
rpa_summ_hr_df = rpa_summ_hr_dyf.toDF()
rpa_df = rpa_dyf.toDF()

#-----------------------------------Creation of LOOKUP Dataframes----------------------------------------------#

#1. Transform the date column and create the composite join key---RPA-AE RO
#1.1. Convert CalendarDate to date format from string
#1.1.1. Create UDF function to perform the casting operation(m/dd/yyyy)
func_str_to_date_lookup = udf(lambda x: datetime.strptime(x, '%m/%d/%Y'),
                              DateType())
#1.1.2. Add the transformed value to rpa_AE_RO_transformed_df
rpa_AE_RO_transformed_df = rpa_AE_RO_df.withColumn(
    'new_date', func_str_to_date_lookup(col('CalendarDate')))
#1.2. Concatenate the new_date column and Employeenumber
rpa_AE_RO_transformed_df = rpa_AE_RO_transformed_df.withColumn(
    'join_key', concat('new_date', lit('_'), 'EmployeeNumber'))
#1.3. Drop the field calendar date
rpa_AE_RO_transformed_df = rpa_AE_RO_transformed_df.drop('CalendarDate')

#2. Transform the date column and create the composite join key---RPA-SUMM HR
#2.1. Convert CalendarDate to date format from string
#2.1.1. Create UDF function to perform the casting operation(m/dd/yyyy)
func_str_to_date_lookup_hr = udf(lambda x: datetime.strptime(x, '%m/%d/%Y'),
                                 DateType())
#2.1.2. Add the transformed value to the SUMM HR dataframe (sketched below)
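#A hedged sketch of the step described above, mirroring 1.1.2; the name
#rpa_summ_hr_transformed_df is an assumption and not part of the original snippet.
rpa_summ_hr_transformed_df = rpa_summ_hr_df.withColumn(
    'new_date', func_str_to_date_lookup_hr(col('CalendarDate')))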
Example #3
    union_result_df = union_result_df.union(water_equivalent_snow_fall_df)
    union_result_df = union_result_df.union(sunsine_df)
    print "End union processing"

    #union_result_df.show()
    union_result_df.printSchema()

    #perform pivoting on weather_category to convert the summarized rows into columns
    curation_result_df = union_result_df.groupBy(
        union_result_df.station_identifier,
        union_result_df.observation_date).pivot("weather_category").agg(
            round(sum(union_result_df.calculation_result),
                  2)).sort(union_result_df.observation_date)

    #udf to convert observation_date to the proper date format
    func = udf(lambda x: datetime.strptime(x, '%Y%m%d'), DateType())

    curation_result_df = curation_result_df.withColumn(
        "observation_date_format", func(col('observation_date')))
    curation_result_df.printSchema()

    #write final result in hdfs location
    print 'started writing weather curated data to hdfs location'
    curation_result_df.na.fill(0.0).select(
        "station_identifier", "observation_date_format", "Precipitation",
        "MaxTemparature", "Snowfall", "SnowDepth", "Evaporation",
        "WaterEquivalentSnowDepth", "WaterEquivalentSnowFall",
        "Sunshine").write.format("csv").save(
            path="hdfs:///tmp/weathercurated_result", mode='overwrite')
    print 'End processing to writing'
Example #4
# | value      |
# +------------+
# |{"custom"...|
# +------------+
#
# and create separated fields like this:
# +------------+-----+-----------+
# |    customer|score| riskDate  |
# +------------+-----+-----------+
# |"sam@tes"...| -1.4| 2020-09...|
# +------------+-----+-----------+

eventSchema = StructType([
    StructField('customer', StringType()),
    StructField('score', FloatType()),
    StructField('riskDate', DateType()),
])

df = df.withColumn('value', from_json('value', eventSchema))\
.select(col('value.*'))

# Storing them in a temporary view called CustomerRisk
df.createOrReplaceTempView('CustomerRisk')

# Execute a sql statement against a temporary view, selecting the customer and the score from the temporary view, creating a dataframe called customerRiskStreamingDF
customerRiskStreamingDF = spark.sql('''
SELECT customer, score
FROM CustomerRisk
''')

# Sink the customerRiskStreamingDF dataframe to the console in append mode
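# A minimal sketch of the console sink described in the comment above; the original
# snippet ends at that comment, so the exact writeStream settings here are assumptions.
customerRiskStreamingDF.writeStream \
    .outputMode('append') \
    .format('console') \
    .start() \
    .awaitTermination()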
Example #5
# Exercise01: Length of the movie title
# Students: Carla Alvarez, Daniel Garcia, Juan Carlos Lopez, Carlos Mellado
# Run with: spark-submit lp.py > lp.txt
# View the result: cat lp.txt
# Notes: the file 'peliculas.txt' must already have been uploaded to the HDFS root

import re
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Define the context for the script (not needed in the pyspark shell) and load the data
sc = SparkContext()
spark = SparkSession(sc)
schema = StructType([StructField('Id', IntegerType(), True), StructField('NombrePelicula', StringType(), False), StructField('Fecha', DateType(), True)])
data = spark.read.format('csv').option('header', 'False').option('sep', '\t').option('mode', 'DROPMALFORMED').load('peliculas.txt', schema=schema)

# Find the title with the largest number of characters
titulos = data.rdd.map(lambda fila: re.sub(r'\([0-9]{4}\)', '', fila.NombrePelicula).lower().strip())
titulos = titulos.map(lambda x: (str(x), len(str(x))))
resultado = titulos.takeOrdered(1, key = lambda x: -x[1])

# Show the results
print('The movie with the longest title is:')
print('Movie: {0}'.format(str(resultado[0][0])))
print('Length: {0}'.format(str(resultado[0][1])))
Example #6
def print_each_line(eachLine):
    print eachLine
    return


sparkcontext = SparkContext(conf=sparkconfig)
sqlContext = SQLContext(sparkcontext)

toIntegerfunc = UserDefinedFunction(lambda eachElement: int(eachElement),
                                    IntegerType())
toBooleanfunc = UserDefinedFunction(
    lambda eachElement: True if eachElement == 'P' else False, BooleanType())

toDateFunc = UserDefinedFunction(
    lambda eachElement: datetime.strptime(eachElement, '%m/%d/%Y'), DateType())


csvDF = sqlContext.read \
                  .option("header", "true") \
                  .csv("/home/dharshekthvel/Downloads/stop.csv")

schema_modified_df = csvDF.withColumn("VEHICLE_ID", toIntegerfunc(csvDF["VEHICLE_ID"]))\
                          .withColumn("PLAN_STATUS", toBooleanfunc(csvDF["PLAN_STATUS"]))\
                          .withColumn("OPD_DATE", toDateFunc(csvDF["OPD_DATE"]))

#csvDF.printSchema()
#csvDF.show(2)

schema_modified_df.printSchema()
schema_modified_df.show(20)
Example #7
spark = SparkSession.builder.config("jar1", "jar2").appName("").getOrCreate()

# for reading a file without the schema known, we can use inferSchema while creating the dataframe

df1 = spark.read.csv("file_path.csv",
                     inferSchema=True,
                     header=True,
                     sep=";",
                     mode="DROPMALFORMED")

# for reading a file with pre-defined Schema

schema = StructType([
    StructField("col_1", IntegerType()),
    StructField("col_2", DoubleType()),
    StructField("col_3", DateType())
])

df2 = spark.read.csv("file_path.csv",
                     schema=schema,
                     header=True,
                     sep=";",
                     mode="DROPMALFORMED")

# =========================== #

# spark can read data from several places - local, s3, hdfs

df3 = spark.read.csv(
    "s3a://....")  # s3a for EC2, s3/s3a for EMR (s3 protocol is faster)
df4 = spark.read.csv("hdfs:///.....")
Example #8
 def _get_target_schema():
     return StructType([
         StructField("row", IntegerType(), False),
         StructField("ID", IntegerType(), False),
         StructField("Name", StringType(), True),
         StructField("Age", IntegerType(), True),
         StructField("Photo", StringType(), True),
         StructField("Nationality", StringType(), True),
         StructField("Flag", StringType(), True),
         StructField("Overall", IntegerType(), True),
         StructField("Potential", IntegerType(), False),
         StructField("Club", StringType(), True),
         StructField("Club_logo", StringType(), True),
         StructField("Value", StringType(), True),
         StructField("Wage", StringType(), True),
         StructField("Special", IntegerType(), True),
         StructField("Preferred_foot", StringType(), True),
         StructField("International_reputation", IntegerType(), False),
         StructField("Weak_foot", IntegerType(), True),
         StructField("Skill_moves", IntegerType(), True),
         StructField("Work_rate", StringType(), True),
         StructField("Body_type", StringType(), True),
         StructField("Real_face", StringType(), True),
         StructField("Position", StringType(), True),
         StructField("Jersey_number", StringType(), False),
         StructField("Joined", DateType(), True),
         StructField("Loaned_from", StringType(), True),
         StructField("Contract_valid_until", StringType(), True),
         StructField("Height", StringType(), True),
         StructField("Weight", StringType(), True),
         StructField("LS", StringType(), True),
         StructField("ST", StringType(), False),
         StructField("RS", StringType(), True),
         StructField("LW", StringType(), True),
         StructField("LF", StringType(), True),
         StructField("CF", StringType(), False),
         StructField("RF", StringType(), True),
         StructField("RW", StringType(), True),
         StructField("LAM", StringType(), True),
         StructField("CAM", StringType(), False),
         StructField("RAM", StringType(), True),
         StructField("LM", StringType(), True),
         StructField("LCM", StringType(), True),
         StructField("CM", StringType(), True),
         StructField("RCM", StringType(), False),
         StructField("RM", StringType(), True),
         StructField("LWB", StringType(), True),
         StructField("LDM", StringType(), True),
         StructField("CDM", StringType(), True),
         StructField("RDM", StringType(), True),
         StructField("RWB", StringType(), False),
         StructField("LB", StringType(), True),
         StructField("LCB", StringType(), True),
         StructField("CB", StringType(), False),
         StructField("RCB", StringType(), True),
         StructField("RB", StringType(), True),
         StructField("Crossing", IntegerType(), True),
         StructField("Finishing", IntegerType(), True),
         StructField("HeadingAccuracy", IntegerType(), False),
         StructField("ShortPassing", IntegerType(), True),
         StructField("Volleys", IntegerType(), True),
         StructField("Dribbling", IntegerType(), True),
         StructField("Curve", IntegerType(), True),
         StructField("FKAccuracy", IntegerType(), False),
         StructField("Long_Passing", IntegerType(), True),
         StructField("BallControl", IntegerType(), True),
         StructField("Acceleration", IntegerType(), True),
         StructField("SprintSpeed", IntegerType(), True),
         StructField("Agility", IntegerType(), True),
         StructField("Reactions", IntegerType(), False),
         StructField("Balance", IntegerType(), True),
         StructField("ShotPower", IntegerType(), True),
         StructField("Jumping", IntegerType(), True),
         StructField("Stamina", IntegerType(), True),
         StructField("Strength", IntegerType(), True),
         StructField("LongShots", IntegerType(), True),
         StructField("Aggression", IntegerType(), False),
         StructField("Interceptions", IntegerType(), True),
         StructField("Positioning", IntegerType(), True),
         StructField("Vision", IntegerType(), True),
         StructField("Penalties", IntegerType(), True),
         StructField("Composure", IntegerType(), False),
         StructField("Marking", IntegerType(), True),
         StructField("StandingTackle", IntegerType(), True),
         StructField("SlidingTackle", IntegerType(), True),
         StructField("GKDiving", IntegerType(), True),
         StructField("GKHandling", IntegerType(), True),
         StructField("GKKicking", IntegerType(), False),
         StructField("GKPositioning", IntegerType(), True),
         StructField("GKReflexes", IntegerType(), True),
         StructField("Release_clause", StringType(), True)
     ])
)
Example #9
# TO-DO: create a StructType for the Customer JSON that comes from Redis- before Spark 3.0.0, schema inference is not automatic
customerJSONSchema = StructType(
    [
        StructField("customerName", StringType()),
        StructField("email", StringType()),
        StructField("phone", StringType()),
        StructField("birthDay", StringType()),
    ]
)
# TO-DO: create a StructType for the Kafka stedi-events topic which has the Customer Risk JSON that comes from Redis- before Spark 3.0.0, schema inference is not automatic
eventRiskCustomerSchema = StructType(
    [
        StructField("customer", StringType()),
        StructField("score", DecimalType(precision=3, scale=1)),
        StructField("riskDate", DateType()),
    ]
)
#TO-DO: create a spark application object
spark = SparkSession.builder.appName("stedi-app").getOrCreate()
#TO-DO: set the spark log level to WARN
spark.sparkContext.setLogLevel("WARN")
# TO-DO: using the spark application object, read a streaming dataframe from the Kafka topic redis-server as the source
# Be sure to specify the option that reads all the events from the topic including those that were published before you started the spark stream
redisServerRawStreamingDF = spark.readStream.format("kafka")\
                                            .option("kafka.bootstrap.servers", "kafka:19092")\
                                            .option("subscribe", "redis-server")\
                                            .option("startingOffsets", "earliest")\
                                            .load()
# TO-DO: cast the value column in the streaming dataframe as a STRING 
redisServerStreamingDF = redisServerRawStreamingDF.selectExpr("cast(key as string) key", "cast(value as string) value")
Example #10
    my_log(log, f"min date : {l_args.last_date}")
    # retrieve new covid cases per population group
    cases_per_date = client.get(
        l_args.dataset_identifier,
        group="cdc_case_earliest_dt, sex, age_group, race_ethnicity_combined",
        select=
        "cdc_case_earliest_dt, sex, age_group, race_ethnicity_combined, count(*)",
        where=f"cdc_case_earliest_dt > '{l_args.last_date}'",
        limit=200000,
        content_type="json")
    my_log(log, f"nb new records : {len(cases_per_date)}")
    # transform to data frame
    df_cases = spark.read.json(spark.sparkContext.parallelize(cases_per_date))
    my_log(log, f"nb rows in dataframe : {df_cases.count()}")
    df_cases = df_cases.withColumn(
        "race_ethnicity_combined",
        parse_race_ethnicity("race_ethnicity_combined"))
    df_cases = df_cases.withColumn(
        "cdc_case_earliest_dt",
        col("cdc_case_earliest_dt").cast(DateType()))
    # write to postgres

    df_cases.write\
            .format("jdbc")\
            .option("url", "jdbc:postgresql:capstone")\
            .option("dbtable", l_args.table)\
            .option("user","postgres")\
            .option("password", "postgres")\
            .mode("overwrite")\
            .save()
Example #11
# if you ask explicitly, spark will try to infer the schema automatically
infer_schema = spark.read.csv(path='../data/covid19.csv',header=True,inferSchema=True)
infer_schema.printSchema()
# in this case it gets the integers right, but just treats the date as a string

# or you can specify the schema explicitly

from pyspark.sql.types import (StructField, 
                               StringType, 
                               IntegerType,
                               DateType,
                               StructType)

data_schema = [StructField('continent',StringType(),True),
              StructField('location',StringType(),True),
              StructField('date',DateType(),True),
              StructField('total_cases',IntegerType(),True),
              StructField('new_cases',IntegerType(),True),
              StructField('total_deaths',IntegerType(),True),
              StructField('new_deaths',IntegerType(),True)]

correct_struc = StructType(fields=data_schema)

dataframe = spark.read.csv(path='../data/covid19.csv', header=True, schema=correct_struc)

# and we can confirm that this time the types are correct
dataframe.printSchema()

# if we wanted to convert to the older-style RDD we easily could
rdd = dataframe.rdd
print(f'Created `rdd` {type(rdd)} from `dataframe` {type(dataframe)}.')
Example #12
    def setUpClass(cls):
        from datetime import date, datetime
        from decimal import Decimal

        super(ArrowTests, cls).setUpClass()
        cls.warnings_lock = threading.Lock()

        # Synchronize default timezone between Python and Java
        cls.tz_prev = os.environ.get("TZ", None)  # save current tz if set
        tz = "America/Los_Angeles"
        os.environ["TZ"] = tz
        time.tzset()

        cls.spark.conf.set("spark.sql.session.timeZone", tz)

        # Test fallback
        cls.spark.conf.set("spark.sql.execution.arrow.enabled", "false")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.enabled") == "false"
        cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.enabled") == "true"

        cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled",
                           "true")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.fallback.enabled") == "true"
        cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled",
                           "false")
        assert cls.spark.conf.get(
            "spark.sql.execution.arrow.pyspark.fallback.enabled") == "false"

        # Enable Arrow optimization in these tests.
        cls.spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
        # Disable fallback by default to easily detect the failures.
        cls.spark.conf.set(
            "spark.sql.execution.arrow.pyspark.fallback.enabled", "false")

        cls.schema_wo_null = StructType([
            StructField("1_str_t", StringType(), True),
            StructField("2_int_t", IntegerType(), True),
            StructField("3_long_t", LongType(), True),
            StructField("4_float_t", FloatType(), True),
            StructField("5_double_t", DoubleType(), True),
            StructField("6_decimal_t", DecimalType(38, 18), True),
            StructField("7_date_t", DateType(), True),
            StructField("8_timestamp_t", TimestampType(), True),
            StructField("9_binary_t", BinaryType(), True),
        ])
        cls.schema = cls.schema_wo_null.add("10_null_t", NullType(), True)
        cls.data_wo_null = [
            (
                "a",
                1,
                10,
                0.2,
                2.0,
                Decimal("2.0"),
                date(1969, 1, 1),
                datetime(1969, 1, 1, 1, 1, 1),
                bytearray(b"a"),
            ),
            (
                "b",
                2,
                20,
                0.4,
                4.0,
                Decimal("4.0"),
                date(2012, 2, 2),
                datetime(2012, 2, 2, 2, 2, 2),
                bytearray(b"bb"),
            ),
            (
                "c",
                3,
                30,
                0.8,
                6.0,
                Decimal("6.0"),
                date(2100, 3, 3),
                datetime(2100, 3, 3, 3, 3, 3),
                bytearray(b"ccc"),
            ),
            (
                "d",
                4,
                40,
                1.0,
                8.0,
                Decimal("8.0"),
                date(2262, 4, 12),
                datetime(2262, 3, 3, 3, 3, 3),
                bytearray(b"dddd"),
            ),
        ]
        cls.data = [tuple(list(d) + [None]) for d in cls.data_wo_null]
Example #13
def from_arrow_type(at: "pa.DataType",
                    prefer_timestamp_ntz: bool = False) -> DataType:
    """Convert pyarrow type to Spark data type."""
    from distutils.version import LooseVersion
    import pyarrow as pa
    import pyarrow.types as types

    spark_type: DataType
    if types.is_boolean(at):
        spark_type = BooleanType()
    elif types.is_int8(at):
        spark_type = ByteType()
    elif types.is_int16(at):
        spark_type = ShortType()
    elif types.is_int32(at):
        spark_type = IntegerType()
    elif types.is_int64(at):
        spark_type = LongType()
    elif types.is_float32(at):
        spark_type = FloatType()
    elif types.is_float64(at):
        spark_type = DoubleType()
    elif types.is_decimal(at):
        spark_type = DecimalType(precision=at.precision, scale=at.scale)
    elif types.is_string(at):
        spark_type = StringType()
    elif types.is_binary(at):
        spark_type = BinaryType()
    elif types.is_date32(at):
        spark_type = DateType()
    elif types.is_timestamp(at) and prefer_timestamp_ntz and at.tz is None:
        spark_type = TimestampNTZType()
    elif types.is_timestamp(at):
        spark_type = TimestampType()
    elif types.is_duration(at):
        spark_type = DayTimeIntervalType()
    elif types.is_list(at):
        if types.is_timestamp(at.value_type):
            raise TypeError("Unsupported type in conversion from Arrow: " +
                            str(at))
        spark_type = ArrayType(from_arrow_type(at.value_type))
    elif types.is_map(at):
        if LooseVersion(pa.__version__) < LooseVersion("2.0.0"):
            raise TypeError(
                "MapType is only supported with pyarrow 2.0.0 and above")
        if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type):
            raise TypeError("Unsupported type in conversion from Arrow: " +
                            str(at))
        spark_type = MapType(from_arrow_type(at.key_type),
                             from_arrow_type(at.item_type))
    elif types.is_struct(at):
        if any(types.is_struct(field.type) for field in at):
            raise TypeError(
                "Nested StructType not supported in conversion from Arrow: " +
                str(at))
        return StructType([
            StructField(field.name,
                        from_arrow_type(field.type),
                        nullable=field.nullable) for field in at
        ])
    elif types.is_dictionary(at):
        spark_type = from_arrow_type(at.value_type)
    elif types.is_null(at):
        spark_type = NullType()
    else:
        raise TypeError("Unsupported type in conversion from Arrow: " +
                        str(at))
    return spark_type
Example #14
def equivalent_type(f):
    if f == 'datetime64[ns]': return DateType()
    elif f == 'int64': return LongType()
    elif f == 'int32': return IntegerType()
    elif f == 'float64': return FloatType()
    else: return StringType()
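# A sketch of how a dtype mapper like equivalent_type is typically used to build a Spark
# schema from a pandas DataFrame; the helper names define_structure and pandas_to_spark
# are assumptions, not part of the original snippet.
from pyspark.sql.types import StructType, StructField


def define_structure(column_name, pandas_dtype):
    # wrap the mapped Spark type in a nullable StructField
    return StructField(column_name, equivalent_type(str(pandas_dtype)), True)


def pandas_to_spark(spark, pandas_df):
    # map every pandas column/dtype to a StructField and build the Spark DataFrame
    fields = [
        define_structure(column, dtype)
        for column, dtype in zip(pandas_df.columns, pandas_df.dtypes)
    ]
    return spark.createDataFrame(pandas_df, schema=StructType(fields))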
Example #15
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType, TimestampType
schema = StructType([
    StructField('FIPS', IntegerType(), True),
    StructField('Admin2', StringType(), True),
    StructField('Province_State', StringType(), True),
    StructField('Country_Region', StringType(), True),
    StructField('Last_Update', TimestampType(), True),
    StructField('Lat', DoubleType(), True),
    StructField('Long_', DoubleType(), True),
    StructField('Confirmed', IntegerType(), True),
    StructField('Deaths', IntegerType(), True),
    StructField('Recovered', IntegerType(), True),
    StructField('Active', IntegerType(), True),
    StructField('Combined_Key', StringType(), True),
    StructField('process_date', DateType(), True),
])

# Create initial empty Spark DataFrame based on preceding schema
jhu_daily = spark.createDataFrame([], schema)

# COMMAND ----------

# MAGIC %md ## Loops Through Each File
# MAGIC The following code snippet processes each file to:
# MAGIC * Extract out the filename which is needed to know which date the data is referring
# MAGIC * The schema of the files change over time so we need slightly different logic to insert data for each different schema

# COMMAND ----------

import os
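# A hedged sketch of the loop described in the markdown cell above; the storage path,
# the file-name-to-date convention, and the schema-alignment logic are all assumptions,
# since the original notebook branches on each file's actual schema.
from pyspark.sql.functions import col, lit, to_date

raw_path = '/tmp/jhu_daily_reports/'  # assumed location of the daily CSV files

for file_name in [f for f in os.listdir('/dbfs' + raw_path) if f.endswith('.csv')]:
    # the file name encodes the reporting date, e.g. '03-22-2020.csv'
    report_date = file_name.replace('.csv', '')
    daily = (spark.read.option('header', True)
             .csv('dbfs:' + raw_path + file_name)
             .withColumn('process_date', to_date(lit(report_date), 'MM-dd-yyyy')))
    # align each file to the target schema, casting columns and filling any that
    # older file layouts do not provide
    daily = daily.select([
        (col(f.name).cast(f.dataType) if f.name in daily.columns
         else lit(None).cast(f.dataType)).alias(f.name)
        for f in schema.fields
    ])
    jhu_daily = jhu_daily.union(daily)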
Example #16
               when(F.col("naics_code").isin(722511), "full_service_restaurants").\
               when(F.col("naics_code").isin(722513), "limited_service_restaurants").\
               when(F.col("naics_code").isin(446110, 446191), "pharmacies_and_drug_stores").\
               when(F.col("naics_code").isin(311811,722515), "snack_and_bakeries").\
               when(F.col("naics_code").isin(445210,445220,445230,445291,445292,445299), "specialty_food_stores").\
               when(F.col("naics_code").isin(445110), "supermarkets_except_convenience_stores")).\
    select("placekey","safegraph_place_id","naics_code","file_name")

    def explodeVisits(date_range_start, visit_by_day):
        start = datetime.datetime(*map(int, date_range_start[:10].split('-')))
        return {(start + datetime.timedelta(days=days)): visits
                for days, visits in enumerate(json.loads(visit_by_day))}

    #Credit to the professor, I leverage this piece of code from class

    udfExpand = F.udf(explodeVisits, T.MapType(DateType(), T.IntegerType()))
    df = spark.read.csv("hdfs:///data/share/bdm/weekly-patterns-nyc-2019-2020/*", header=True) \
           .select("placekey","safegraph_place_id",
              F.explode(udfExpand('date_range_start', 'visits_by_day')) \
                 .alias('date', "visits"))

    # .where(f"date=='{date}'")
    #Credit to the professor, I leverage this piece of code from class


    def find_median(values_list):
        try:
            median = np.median(values_list)
            return round(float(median), 2)
        except Exception:
            return None
Example #17
from pyspark.sql.types import StructType, StructField, StringType, DateType

from src.database.contracts import wash_contract as c

SCHEMA = StructType([
    StructField(c.ID, StringType()),
    StructField(c.DATE, DateType()),
    StructField(c.AGE, StringType()),
    StructField(c.RACE, StringType()),
    StructField(c.SEX, StringType())
])

COLUMNS = [c.ID, c.DATE, c.AGE, c.RACE, c.SEX]
Example #18
    "struct": StructType,
    "array": ArrayType,
    "bigint": LongType,
    "date": DateType,
    "byte": ByteType,
    "short": ShortType,
    "datetime": TimestampType,
    "binary": BinaryType,
    "null": NullType,
    "vector": VectorUDT
}

SPARK_DTYPES_DICT_OBJECTS = \
    {"string": StringType(), "int": IntegerType(), "float": FloatType(),
     "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()),
     "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(),
     "datetime": TimestampType(), "binary": BinaryType(), "null": NullType()
     }

# Profiler
PROFILER_COLUMN_TYPES = {
    "categorical", "numeric", "date", "null", "array", "binary"
}
PYTHON_TO_PROFILER = {
    "string": "categorical",
    "boolean": "categorical",
    "int": "numeric",
    "decimal": "numeric",
    "date": "date",
    "array": "array",
    "binaty": "binary",
Example #19
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType
import random
"""DO NOT ADD IN FINAL"""

spark = SparkSession.builder.appName("splitter").master(
    "local[*]").getOrCreate()

schema = StructType([
    StructField("created_at", DateType(), True),  # 2020-10-15 00:00:01
    StructField("tweet_id", StringType(), True),  # 1.31652922155725E+018
    StructField("likes", FloatType(), True),  # like count
    StructField("retweet_count", FloatType(), True),  # retweet count
    StructField("source", StringType(), True),  # twitter source
    StructField("user_id", StringType(), True),
    StructField("user_name", StringType(), True),
    StructField("user_screen_name", StringType(), True),
    StructField("user_description", StringType(), True),
    StructField("user_join_date", DateType(), True), \
    StructField("user_followers_count", FloatType(), True),
    StructField("user_location", StringType(), True),
    StructField("lat", FloatType(), True),
    StructField("long", FloatType(), True),
    StructField("city", StringType(), True),
    StructField("country", StringType(), True),
    StructField("continent", StringType(), True),
    StructField("state", StringType(), True),
    StructField("state_code", StringType(), True),
    StructField("collected_at", DateType(), True),
    StructField("tweet", StringType(), True),  # tweet body
    StructField("sentiment", FloatType(), True)
Example #20
    'WEATHER_DELAY': 0,
    'NAS_DELAY': 0,
    'SECURITY_DELAY': 0,
    'LATE_AIRCRAFT_DELAY': 0
})

#adding status and delay column to dataframe
from pyspark.sql.functions import col, when
df1 = df1.withColumn('delay',
                     when(df1.ARR_DELAY > 0, df1.ARR_DELAY).otherwise(0))
df1 = df1.withColumn('status', when(df1.ARR_DELAY > 0, 1).otherwise(0))

#converting datatype of columns
from pyspark.sql.types import IntegerType, DoubleType, DateType

df1 = df1.withColumn("FL_DATE", df1["FL_DATE"].cast(DateType()))
df1 = df1.withColumn("year", df1["year"].cast(IntegerType()))
df1 = df1.withColumn("month", df1["month"].cast(IntegerType()))
df1 = df1.withColumn("day", df1["day"].cast(IntegerType()))

#changing column names
df2 = df1.withColumnRenamed("FL_DATE", "fl_date").withColumnRenamed(
    "OP_CARRIER", "op_carrier").withColumnRenamed(
        "OP_CARRIER_FL_NUM", "op_carrier_fl_num").withColumnRenamed(
            "OP_CARRIER_FL_NUM", "op_carrier_fl_num"
        ).withColumnRenamed("ORIGIN", "origin").withColumnRenamed(
            "DEST", "dest"
        ).withColumnRenamed("CRS_DEP_TIME", "crs_dep_time").withColumnRenamed(
            "DEP_TIME", "dep_time"
        ).withColumnRenamed("DEP_DELAY", "dep_delay").withColumnRenamed(
            "TAXI_OUT", "taxi_out"
Example #21
def get_stock_dataframe():
    prices_DF = sqlContext.read.format('com.databricks.spark.csv')\
         .schema(prices_schema())\
         .options(header='true')\
         .load(prices)\
         .dropDuplicates().cache()

    text = sc.textFile(symbols).map(lambda l: l.split('\t'))
    symbols_dict = text.collectAsMap()
    for k, v in symbols_dict.iteritems():
        symbols_dict[k] = v.replace("&#39;", "\'")

    def integrate_company(symbol_prices):
        for key, value in symbols_dict.iteritems():
            if key == symbol_prices:
                company = value
        return company

    udf_company = udf(integrate_company, StringType())
    company_DF = prices_DF.withColumn('Company',
                                      udf_company(prices_DF.Symbol)).cache()

    sectors_DF = sqlContext.read.format('com.databricks.spark.csv')\
         .schema(sectors_schema())\
         .options(header='true')\
         .load(sectors)\
         .dropDuplicates().cache()

    sectors_dict = map(lambda row: row.asDict(), sectors_DF.collect())

    def integrate_sector(symbol_prices):
        sector_value = ""
        for item in sectors_dict:
            if item['Symbol'] == symbol_prices:
                sector_value = item['Sector']
        return sector_value

    udf_sector = udf(integrate_sector, StringType())
    stock_sec_DF = company_DF.withColumn('Sector',
                                         udf_sector(company_DF.Symbol))

    def integrate_industry(symbol_prices):
        industry_value = ""
        for item in sectors_dict:
            if item['Symbol'] == symbol_prices:
                industry_value = item['Industry']
        return industry_value

    udf_industry = udf(integrate_industry, StringType())
    stock_DF = stock_sec_DF.withColumn('Industry',
                                       udf_industry(stock_sec_DF.Symbol))

    stock_new_DF = stock_DF.withColumn("Date", stock_DF["Date"].cast(
        DateType())).orderBy("Date", "Symbol")

    def percent_change(open, close):
        change = (open - close) / open
        return change

    udf_percentchange = udf(percent_change, FloatType())
    stock_final_DF = stock_new_DF.withColumn(
        'Percentage_Change',
        udf_percentchange(stock_new_DF.Open, stock_new_DF.Close))

    stock_final_DF.saveAsParquetFile("stocks_full.parquet")

    return stock_final_DF
Example #22
df7 = pd.DataFrame.from_dict(json.loads(response.text))


fig = px.line(df7, x="day", y="deaths", color='month')
fig=plot(fig,output_type='div')
displayHTML(fig)

# COMMAND ----------

# MAGIC %md ## Clustering data To Get Hotspots

# COMMAND ----------

from pyspark.sql.types import  StructField,StringType,IntegerType,StructType,FloatType,DateType,DateConverter

data_schema=[StructField("Date",DateType(),True),
             StructField("State_or_UT",StringType(),True),
             StructField("Indian_cases",IntegerType(),True),
             StructField("Foreign_cases" ,IntegerType(),True),
             StructField("Cured",IntegerType(),True),
             StructField("Latitude",FloatType(),True),
             StructField("Longitude",FloatType(),True),
             StructField("Death",IntegerType(),True),
             StructField("Total_cases",IntegerType(),True)
            ]

dfschema=StructType(data_schema)

# COMMAND ----------

df=spark.read.csv("dbfs:/FileStore/tables/complete1.csv",mode='FAILFAST',header=True,schema=dfschema)
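# A hedged sketch of the clustering step announced in the markdown cell above: KMeans on
# Latitude/Longitude to group case locations into hotspots. The feature choice and k=5
# are assumptions, not taken from the original notebook.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

assembler = VectorAssembler(inputCols=['Latitude', 'Longitude'], outputCol='features')
features_df = assembler.transform(df.na.drop(subset=['Latitude', 'Longitude']))

kmeans = KMeans(k=5, seed=1, featuresCol='features', predictionCol='hotspot')
model = kmeans.fit(features_df)
hotspots_df = model.transform(features_df)
hotspots_df.groupBy('hotspot').count().show()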
Example #23
spark = SparkSession.builder.appName('ReviewsOdsSession').getOrCreate()

#establishing source files
AmzRevjson = "/edw/Reviews/SourceCode/data/asin_reviews.json"

#read data into a frame
Reviews_df = spark.read.json(AmzRevjson)

#transformed frame
Cl_Reviews_df = Reviews_df.select(
    'asin',
    col('overall').alias("reviewRating"), 'reviewerID', 'reviewerName',
    (when(col('unixReviewTime').isNull(),
          '1990-01-01').otherwise(from_unixtime(
              'unixReviewTime',
              'yyyy-MM-dd'))).cast(DateType()).alias("reviewDate")).filter(
                  col("reviewDate") >= '2011-01-01')

#Write to DB
mode = "append"
table = os.environ['SCHEMA_STG'] + ".reviews"
url = "jdbc:postgresql://" + os.environ['PGHOST'] + "/" + os.environ['DB_DWH']
properties = {
    "user": os.environ['PGUSER'],
    "password": os.environ['PGPASSWD'],
    "driver": 'org.postgresql.Driver'
}
Cl_Reviews_df.write.jdbc(url=url,
                         table=table,
                         mode=mode,
                         properties=properties)
Example #24
def main(context):
    """Main function takes a Spark SQL context."""
    # YOUR CODE HERE
    # YOU MAY ADD OTHER FUNCTIONS AS NEEDED

    start = time.time()
    # task 1
    if(read_raw):
        comments = sqlContext.read.json('comments-minimal.json.bz2')
        submissions = sqlContext.read.json('submissions.json.bz2')
        label = sqlContext.read.load('labeled_data.csv', format = 'csv', sep = ',',header="true")
        print("load done")
        comments.write.parquet('comments')
        submissions.write.parquet('submissions')
        label.write.parquet('label')
    else:
        comments = context.read.load('comments')
        submissions = context.read.load('submissions')
        label = context.read.load('label')
    print("task 1 complete: read data")
    #result.show()

    if(training):
        # task 2
        associate = associated(comments, label).select(col('id'), col('body'), col('labeldjt'))
        print("task 2 complete: associate data")

        # task 4, 5
        newColumn = associate.withColumn('ngrams', sanitize_udf(associate['body']))
        print("task 4, 5 complete: generate unigrams")

        # task 6A
        cv = CountVectorizer(inputCol = 'ngrams', outputCol = "features", binary = True)
        model = cv.fit(newColumn)
        tmp = model.transform(newColumn)
        print("task 6A complete: cv model")

        # task 6B
        result = tmp.withColumn('poslabel', F.when(col('labeldjt') == 1, 1).otherwise(0))
        result = result.withColumn('neglabel', F.when(col('labeldjt') == -1, 1).otherwise(0))
        pos = result.select(col('poslabel').alias('label'), col('features'))
        neg = result.select(col('neglabel').alias('label'), col('features'))
        print("task 6B complete: relabel data")

        # task 7
        # Initialize two logistic regression models.
        # Replace labelCol with the column containing the label, and featuresCol with the column containing the features.
        poslr = LogisticRegression(labelCol = "label", featuresCol = "features", maxIter = 10)
        neglr = LogisticRegression(labelCol = "label", featuresCol = "features", maxIter = 10)
        # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers.
        posEvaluator = BinaryClassificationEvaluator()
        negEvaluator = BinaryClassificationEvaluator()
        # There are a few parameters associated with logistic regression. We do not know what they are a priori.
        # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try.
        # We will assume the parameter is 1.0. Grid search takes forever.
        posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build()
        negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build()
        # We initialize a 5 fold cross-validation pipeline.
        posCrossval = CrossValidator(
            estimator = poslr,
            evaluator = posEvaluator,
            estimatorParamMaps = posParamGrid,
            numFolds = 5)
        negCrossval = CrossValidator(
            estimator = neglr,
            evaluator = negEvaluator,
            estimatorParamMaps = negParamGrid,
            numFolds = 5)
        # Although crossvalidation creates its own train/test sets for
        # tuning, we still need a labeled test set, because it is not
        # accessible from the crossvalidator (argh!)
        # Split the data 50/50
        posTrain, posTest = pos.randomSplit([0.5, 0.5])
        negTrain, negTest = neg.randomSplit([0.5, 0.5])

        # Train the models
        print("Training positive classifier...")
        posModel = posCrossval.fit(posTrain)
        print("Training negative classifier...")
        negModel = negCrossval.fit(negTrain)
        # Once we train the models, we don't want to do it again. We can save the models and load them again later.
        posModel.save("pos.model")
        negModel.save("neg.model")
        model.save("cv.model")
        print("task 7 complete: training")

        # posModel = CrossValidatorModel.load('pos.model')
        # negModel = CrossValidatorModel.load('neg.model')

        # point 7
        pos_trans = posModel.transform(posTest)
        neg_trans = negModel.transform(negTest)

        pos_results = pos_trans.select(['probability', 'label'])
        pos_trans_collect = pos_results.collect()
        pos_trans_results_list = [(float(i[0][0]), 1.0-float(i[1])) for i in pos_trans_collect]
        pos_scoreAndLabels = sc.parallelize(pos_trans_results_list)

        pos_metrics = metric(pos_scoreAndLabels)
        print("The ROC score of positive results is: ", pos_metrics.areaUnderROC)

        neg_results = neg_trans.select(['probability', 'label'])
        neg_trans_collect = neg_results.collect()
        neg_trans_results_list = [(float(i[0][0]), 1.0-float(i[1])) for i in neg_trans_collect]
        neg_scoreAndLabels = sc.parallelize(neg_trans_results_list)

        neg_metrics = metric(neg_scoreAndLabels)
        print("The ROC score of negative results is: ", neg_metrics.areaUnderROC)

        plot_ROC(pos_trans_results_list, 'positive_results')
        plot_ROC(neg_trans_results_list, 'negative_results')
        print("point 7 complete: ROC")

    else:
        model = CountVectorizerModel.load('cv.model')
        posModel = CrossValidatorModel.load('pos.model')
        negModel = CrossValidatorModel.load('neg.model')
        print("model loaded")

        # task 8
        comments_tmp = comments.select(col('id'), col('link_id'), col('created_utc'), col('body'), col('author_flair_text'), col('score').alias('com_score'))
        comments_full = comments_tmp.withColumn('link_id', process_id_udf(comments_tmp['link_id']))
        submissions_full = submissions.select(col('id').alias('sub_id'), col('title'), col('score').alias('sub_score'))

        if(joinFull):
            com_sub = comments_full.join(submissions_full, comments_full.link_id == submissions_full.sub_id, 'inner')
            com_sub = com_sub.select(col('id'), col('title'), col('link_id'), col('created_utc'), col('body'), col('author_flair_text'), col('com_score'), col('sub_score'))
            com_sub.write.parquet('com_sub')
        else:
            com_sub = context.read.load('com_sub')# .sample(False, 0.01, None)
        print('task 8 complete: comment with submission')

        # task 9
        filtered = com_sub.filter("body NOT LIKE '%/s%' and body NOT LIKE '&gt;%'")
        filtered_result = filtered.withColumn('ngrams', sanitize_udf(filtered['body']))
        feaResult = model.transform(filtered_result).select(col('id'), col('link_id'), col('created_utc'), \
                                    col('features'), col('author_flair_text'), col('com_score'), col('sub_score'), col('title'))
        posResult = posModel.transform(feaResult)
        negResult = negModel.transform(feaResult)
        print("transformed")

        pos = posResult.withColumn('pos', threshold_pos_udf(posResult['probability'])).select('id', 'created_utc', 'author_flair_text', 'pos', 'com_score', 'sub_score', 'title')
        neg = negResult.withColumn('neg', threshold_neg_udf(negResult['probability'])).select('id', 'created_utc', 'author_flair_text', 'neg', 'com_score', 'sub_score', 'title')
        #final_probs = pos.join(neg, pos.id == neg.id_neg, 'inner').select('id', 'created_utc', 'author_flair_text', 'title', 'pos', 'neg')
        #final_probs.show()
        #pos.write.parquet('pos')
        #neg.write.parquet('neg')
        print('task 9 complete: predict')

        # task 10
        # compute 1
        num_rows = pos.count()
        pos_filtered = pos.filter(pos.pos == 1)
        neg_filtered = neg.filter(neg.neg == 1)
        num_pos = pos_filtered.count()
        num_neg = neg_filtered.count()

        print('Percentage of positive comments: {}'.format(num_pos / num_rows))
        print('Percentage of negative comments: {}'.format(num_neg / num_rows))
        print('finish compute 1')

        # compute 2
        pos_time = pos.withColumn('time', F.from_unixtime(col('created_utc')).cast(DateType()))
        neg_time = neg.withColumn('time', F.from_unixtime(col('created_utc')).cast(DateType()))

        num_pos_time = pos_time.groupBy('time').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy('time')
        num_neg_time = neg_time.groupBy('time').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy('time')

        num_pos_time.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('num_pos_time')
        num_neg_time.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('num_neg_time')
        print('finish compute 2')

        # compute 3
        state = sqlContext.createDataFrame(states, StringType())
        pos_state = pos.groupBy('author_flair_text').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive'))
        neg_state = neg.groupBy('author_flair_text').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative'))

        pos_state = pos_state.join(state, pos_state.author_flair_text == state.value, 'inner')
        pos_state = pos_state.na.drop(subset=['value'])
        pos_state = pos_state.select(col('author_flair_text').alias('state'), col('Percentage of positive').alias('Positive'))

        neg_state = neg_state.join(state, neg_state.author_flair_text == state.value, 'inner')
        neg_state = neg_state.na.drop(subset=['value'])
        neg_state = neg_state.select(col('author_flair_text').alias('state'), col('Percentage of negative').alias('Negative'))

        pos_state.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_state')
        neg_state.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_state')
        print('finish compute 3')

        # compute 4
        pos_com_score = pos.groupBy('com_score').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy('com_score')
        pos_sub_score = pos.groupBy('sub_score').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy('sub_score')
        neg_com_score = neg.groupBy('com_score').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy('com_score')
        neg_sub_score = neg.groupBy('sub_score').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy('sub_score')

        pos_com_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_com_score')
        pos_sub_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_sub_score')
        neg_com_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_com_score')
        neg_sub_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_sub_score')
        print('finish compute 4')

        # compute 5
        pos_story = pos.groupBy('title').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy(F.desc('Percentage of positive')).limit(10)
        neg_story = neg.groupBy('title').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy(F.desc('Percentage of negative')).limit(10)

        pos_story.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_story')
        neg_story.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_story')
        print('finish compute 5')

        end = time.time()
        print('time consumed: {}'.format(end - start))
Example #25
def process_log_data(spark, input_data, output_data):
    """
    Spark pipeline to process and save JSON formatted log data
    Saves to parquet file type on S3
    Args:
        spark (object): Spark session
        input_data (str): S3 bucket input name
        output_data (str): S3 bucket output name
    Returns:
        None
    """
    # get filepath to log data file
    log_data = input_data + 'log-data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df[['userId', 'firstName', 'lastName', 'gender',
                      'level']].drop_duplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + 'user_data', mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(int(x) / 1000.),
                        returnType=TimestampType())
    df = df.withColumn('start_time', get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(int(x) / 1000.),
                       returnType=DateType())
    df = df.withColumn('datetime', get_datetime(df.ts))

    # add year and month -- needed for songplays_table partitioning
    df = df.withColumn("year", year(df.datetime).alias('year'))
    df = df.withColumn("month", month(df.datetime).alias('month'))

    # extract columns to create time table
    time_table = df.select(
        'start_time', 'year', 'month',
        hour(df.datetime).alias('hour'),
        dayofmonth(df.datetime).alias('dayofmonth'),
        weekofyear(df.datetime).alias('weekofyear'),
        dayofweek(df.datetime).alias('weekday')).drop_duplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + 'time_data',
                             mode='overwrite',
                             partitionBy=('year', 'month'))

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + 'song_data')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df[['start_time', 'userId', 'level', 'sessionId', 'location', 'userAgent',
                                                                  'song', 'length', 'year', 'month']] \
        .join(song_df[['song_id', 'artist_id', 'title', 'duration']],
            on = (df.song == song_df.title) & (df.length == song_df.duration), how = 'inner') \
        .withColumn('songplay_id', monotonically_increasing_id()) \
        .select('songplay_id', 'start_time', 'userId', 'level', 'song_id', 'artist_id', 'sessionId',
                                                            'location', 'userAgent', 'year', 'month') \
        .drop_duplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + 'songplays_data',
                                  mode='overwrite',
                                  partitionBy=('year', 'month'))
Example #26
def insert_time_dim(start_date_id, end_date_id):
    time_begin = datetime.strptime(str(start_date_id), "%Y%m%d").date()
    time_end = datetime.strptime(str(end_date_id), "%Y%m%d").date()

    print('time_begin')
    print(time_begin)

    print('time_end')
    print(time_end)

    # create a dataframe from time_begin and time_end
    data = [(time_begin, time_end)]
    df = spark.createDataFrame(data, ["minDate", "maxDate"])
    # convert the data types and rename the fields
    df = df.select(
        df.minDate.cast(DateType()).alias("minDate"),
        df.maxDate.cast(DateType()).alias("maxDate"))

    # loop to generate every date between minDate and maxDate
    df = df.withColumn("daysDiff", f.datediff("maxDate", "minDate")) \
        .withColumn("repeat", f.expr("split(repeat(',', daysDiff), ',')")) \
        .select("*", f.posexplode("repeat").alias("date", "val")) \
        .withColumn("date", f.expr("to_date(date_add(minDate, date))")) \
        .select('date')

    # derive the day/month/year attributes from the date
    df = df.withColumn('id', f.date_format(df.date, "yyyyMMdd")) \
        .withColumn('ngay_trong_thang', f.dayofmonth(df.date)) \
        .withColumn('ngay_trong_tuan', f.from_unixtime(f.unix_timestamp(df.date, "yyyy-MM-dd"), "EEEEE")) \
        .withColumn('tuan_trong_nam', f.weekofyear(df.date)) \
        .withColumn('thang', f.month(df.date)) \
        .withColumn('quy', f.quarter(df.date)) \
        .withColumn('nam', f.year(df.date))
    df = df.withColumn('tuan_trong_thang', (df.ngay_trong_thang - 1) / 7 + 1)

    data_time = DynamicFrame.fromDF(df, glueContext, 'data_time')

    # convert data
    data_time = data_time.resolveChoice(specs=[('tuan_trong_thang',
                                                'cast:int')])

    # select the fields and data types to push into the database
    applymapping1 = ApplyMapping.apply(
        frame=data_time,
        mappings=[("id", "string", "id", "bigint"),
                  ("ngay_trong_thang", 'int', 'ngay_trong_thang', 'int'),
                  ("ngay_trong_tuan", "string", "ngay_trong_tuan", "string"),
                  ("tuan_trong_thang", "int", "tuan_trong_thang", "int"),
                  ("tuan_trong_nam", "int", "tuan_trong_nam", "int"),
                  ("thang", "int", "thang", "int"),
                  ("quy", "int", "quy", "int"), ("nam", "int", "nam", "int"),
                  ("date", "date", "ngay", "timestamp")])

    resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dropnullfields3 = DropNullFields.apply(
        frame=resolvechoice2, transformation_ctx="dropnullfields3")

    # write the data to the database
    preactions = 'delete student.time_dim where id >= ' + str(start_date_id)
    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dropnullfields3,
        catalog_connection="glue_redshift",
        connection_options={
            "preactions": preactions,
            "dbtable": "student.time_dim",
            "database": "student_native_report"
        },
        redshift_tmp_dir=
        "s3n://dts-odin/temp/tu-student_native_report/student/time_dim",
        transformation_ctx="datasink4")
    .add("evntloc_key",IntegerType(), True) \
    .add("cntrycd",StringType(),True) \
    .add("continent",StringType(),True) \
    .add("region",StringType(),True) \
    .add("cntry",StringType(),True) \

evntcatschema = StructType() \
    .add("evntcat_key",IntegerType(),True) \
    .add("relcd",StringType(),True) \
    .add("reldesc",StringType(),True) \
    .add("relcat",StringType(),True)


dateschema = StructType() \
    .add("datekey",IntegerType(),True) \
    .add("date",DateType(),True) \
    .add("dayofweekname",StringType(),True) \
    .add("dayofweek",IntegerType(),True) \
    .add("dayofmonth",IntegerType(),True) \
    .add("dayofyear",IntegerType(),True) \
    .add("calendarweek",IntegerType(),True) \
    .add("calendarmonthname",StringType(),True) \
    .add("calendarmonth",IntegerType(),True) \
    .add("calendaryear",IntegerType(),True) \
    .add("lastdayinmonth",StringType(),True)


gbldatafactschema = StructType() \
    .add("globaleventid",IntegerType(),True) \
    .add("event_date",DateType(),True) \
    .add("actcd",StringType(),True) \
Example #28
# coding=utf-8

from datetime import datetime
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
from pyspark import SparkContext
from pyspark.sql import SQLContext

# create the SparkContext
sc = SparkContext('local')

# create the SQLContext
sqlContext = SQLContext(sc)

# create the DataFrame
df = sqlContext.createDataFrame([("11/25/1991", "11/24/1991", "11/30/1991"),
                                 ("11/25/1391", "11/24/1992", "11/30/1992")],
                                schema=['first', 'second', 'third'])

# call withColumn to cast the original 'first' column to a date and generate the new column 'test'
func = udf(lambda x: datetime.strptime(x, '%m/%d/%Y'), DateType())
df = df.withColumn('test', func(col('first')))

# print the data
df.show()

# print the schema metadata
df.printSchema()
Example #29
def main():
    # create spark session
    spark = SparkSession.builder.master("local[*]").getOrCreate()
    spark.catalog.clearCache()

    # Connect to MariaDB Platform
    try:
        connection = mariadb.connect(
            user="******",
            password="******",  # pragma: allowlist secret
            host="localhost",
            port=3306,
            database="baseball",
        )
    except mariadb.Error as e:
        print(f"Error connecting to MariaDB Platform: {e}")
        sys.exit(1)

    printout("success connecting...")

    # setup schema
    schema = StructType([
        StructField(name="game_id", dataType=IntegerType(), nullable=True),
        StructField(name="batter", dataType=IntegerType(), nullable=True),
        StructField(name="hit", dataType=IntegerType(), nullable=True),
        StructField(name="atbat", dataType=IntegerType(), nullable=True),
        StructField(name="local_date", dataType=DateType(), nullable=True),
    ])

    # create empty spark dataframe using schema
    df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema)

    # import batter counts table and game table
    cursor = connection.cursor()
    count = 0
    printout("creating table...")
    cursor.execute(
        f"SELECT bc.game_id, bc.batter, bc.Hit, bc.atbat, gt.local_date \
        FROM batter_counts bc INNER JOIN game_temp gt on bc.game_id = gt.game_id ORDER BY game_id"
    )
    printout("importing table...")
    for (game_id, batter, hit, atbat, local_date) in cursor:
        to_insert = spark.createDataFrame([
            (game_id, batter, hit, atbat, local_date),
        ])
        df = df.union(to_insert)
        count += 1
        if count % 500 == 0:
            print(f"\timporting row {count}...")
    print(df.show(n=200))
    df.createOrReplaceTempView("rolling_avg_temp")
    df.persist(StorageLevel.MEMORY_AND_DISK)

    # solve for rolling batting averages
    printout("solving for rolling batting averages...")
    rolling_df = spark.sql(
        f"""SELECT rat1.batter, SUM(rat2.Hit) AS sum_hits , SUM(rat2.atbat) AS sum_bats \
        FROM rolling_avg_temp rat1 JOIN rolling_avg_temp rat2 ON rat2.local_date \
        BETWEEN DATE_ADD(rat1.local_date, - 100) AND rat1.local_date AND \
        rat1.batter = rat2.batter GROUP BY rat1.batter""")

    print(rolling_df.show(n=20))
    rolling_df.createOrReplaceTempView("rolling_df")
    rolling_df.persist(StorageLevel.MEMORY_AND_DISK)

    # create array column of all necessary data
    printout("converting data to array...")
    rolling_df = spark.sql(
        """SELECT * , SPLIT(CONCAT(CASE WHEN batter IS NULL THEN "" \
        ELSE batter END, " ", CASE WHEN sum_hits IS NULL OR sum_bats IS NULL THEN "" \
        ELSE ROUND(sum_hits/sum_bats, 3) END), " ") \
        AS array_with_rolling_averages FROM rolling_df""")
    print(rolling_df.show(n=20))

    # fit array column to count vectorizer
    printout("running vectorizer and transformer...")
    count_vectorizer = CountVectorizer(inputCol="array_with_rolling_averages",
                                       outputCol="array_vector")
    count_vectorizer_fitted = count_vectorizer.fit(rolling_df)

    # transform the fitted count vectorizer
    rolling_df = count_vectorizer_fitted.transform(rolling_df)
    print(rolling_df.show(n=20, truncate=False))

    return
Example #30
import datetime
from decimal import Decimal
import uuid

from typing import Counter, Iterable, List, Tuple
from pyspark.sql import SparkSession
from pyspark.sql.types import DateType, DecimalType, Row, StringType, StructField, StructType

invoice_schema = StructType([
    StructField("invoice_id", StringType(), False),
    StructField("invoice_date", DateType(), False),
    StructField("due_date", DateType(), False),
    StructField("period_start_date", DateType(), False),
    StructField("period_end_date", DateType(), False),
    StructField("total_amount", DecimalType(scale=2), False),
    StructField("canonical_vendor_id", StringType(), False),
])
line_item_schema = StructType([
    StructField("invoice_id", StringType(), False),
    StructField("line_item_id", StringType(), False),
    StructField("period_start_date", DateType(), False),
    StructField("period_end_date", DateType(), False),
    StructField("total_amount", DecimalType(scale=2), False),
    StructField("canonical_line_item_id", StringType(), False),
])


def map_vendor_not_seen_in_a_while(p: Tuple[str, Iterable[Row]]):
    vendor_id, ins = p
    ins = sorted(ins, key=lambda i: i.invoice_date)
    for i, invoice in enumerate(ins):