def test_as_spark_type_koalas_dtype(self): type_mapper = { # binary np.character: (np.character, BinaryType()), np.bytes_: (np.bytes_, BinaryType()), np.string_: (np.bytes_, BinaryType()), bytes: (np.bytes_, BinaryType()), # integer np.int8: (np.int8, ByteType()), np.byte: (np.int8, ByteType()), np.int16: (np.int16, ShortType()), np.int32: (np.int32, IntegerType()), np.int64: (np.int64, LongType()), np.int: (np.int64, LongType()), int: (np.int64, LongType()), # floating np.float32: (np.float32, FloatType()), np.float: (np.float64, DoubleType()), np.float64: (np.float64, DoubleType()), float: (np.float64, DoubleType()), # string np.str: (np.unicode_, StringType()), np.unicode_: (np.unicode_, StringType()), str: (np.unicode_, StringType()), # bool np.bool: (np.bool, BooleanType()), bool: (np.bool, BooleanType()), # datetime np.datetime64: (np.datetime64, TimestampType()), datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()), # DateType datetime.date: (np.dtype("object"), DateType()), # DecimalType decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)), # ArrayType np.ndarray: (np.dtype("object"), ArrayType(StringType())), List[bytes]: (np.dtype("object"), ArrayType(BinaryType())), List[np.character]: (np.dtype("object"), ArrayType(BinaryType())), List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())), List[np.string_]: (np.dtype("object"), ArrayType(BinaryType())), List[bool]: (np.dtype("object"), ArrayType(BooleanType())), List[np.bool]: (np.dtype("object"), ArrayType(BooleanType())), List[datetime.date]: (np.dtype("object"), ArrayType(DateType())), List[np.int8]: (np.dtype("object"), ArrayType(ByteType())), List[np.byte]: (np.dtype("object"), ArrayType(ByteType())), List[decimal.Decimal]: (np.dtype("object"), ArrayType(DecimalType(38, 18))), List[float]: (np.dtype("object"), ArrayType(DoubleType())), List[np.float]: (np.dtype("object"), ArrayType(DoubleType())), List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())), List[np.float32]: (np.dtype("object"), ArrayType(FloatType())), List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())), List[int]: (np.dtype("object"), ArrayType(LongType())), List[np.int]: (np.dtype("object"), ArrayType(LongType())), List[np.int64]: (np.dtype("object"), ArrayType(LongType())), List[np.int16]: (np.dtype("object"), ArrayType(ShortType())), List[str]: (np.dtype("object"), ArrayType(StringType())), List[np.unicode_]: (np.dtype("object"), ArrayType(StringType())), List[datetime.datetime]: (np.dtype("object"), ArrayType(TimestampType())), List[np.datetime64]: (np.dtype("object"), ArrayType(TimestampType())), # CategoricalDtype CategoricalDtype(categories=["a", "b", "c"]): ( CategoricalDtype(categories=["a", "b", "c"]), LongType(), ), } for numpy_or_python_type, (dtype, spark_type) in type_mapper.items(): self.assertEqual(as_spark_type(numpy_or_python_type), spark_type) self.assertEqual(pandas_on_spark_type(numpy_or_python_type), (dtype, spark_type)) with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."): as_spark_type(np.dtype("uint64")) with self.assertRaisesRegex(TypeError, "Type object was not understood."): as_spark_type(np.dtype("object")) with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."): pandas_on_spark_type(np.dtype("uint64")) with self.assertRaisesRegex(TypeError, "Type object was not understood."): pandas_on_spark_type(np.dtype("object"))
] }, format="csv", format_options={'withHeader': True}) #convert to dataframes rpa_AE_RO_df = rpa_AE_RO_dyf.toDF() rpa_summ_hr_df = rpa_summ_hr_dyf.toDF() rpa_df = rpa_dyf.toDF() #-----------------------------------Creation of LOOKUP Dataframes----------------------------------------------# #1. Transform the date column and create the composite join key---RPA-AE RO #1.1. Convert CalendarDate to date format from string #1.1.1. Create UDF function to perform the casting operation(m/dd/yyyy) func_str_to_date_lookup = udf(lambda x: datetime.strptime(x, '%m/%d/%Y'), DateType()) #1.2.2. Augment the transformed value in rpa_AE_RO_transformed_df rpa_AE_RO_transformed_df = rpa_AE_RO_df.withColumn( 'new_date', func_str_to_date_lookup(col('CalendarDate'))) #1.2. Concatenate the new_date column and Employeenumber rpa_AE_RO_transformed_df = rpa_AE_RO_transformed_df.withColumn( 'join_key', concat('new_date', lit('_'), 'EmployeeNumber')) #1.3. Drop the field calendar date rpa_AE_RO_transformed_df = rpa_AE_RO_transformed_df.drop('CalendarDate') #2. Transform the date column and create the composite join key---RPA-SUMM HR #2.1. Convert CalendarDate to date format from string #2.1.1. Create UDF function to perform the casting operation(m/dd/yyyy) func_str_to_date_lookup_hr = udf(lambda x: datetime.strptime(x, '%m/%d/%Y'), DateType()) #2.2.2. Augment the transformed value in rpa_AE_RO_transformed_df
union_result_df = union_result_df.union(water_equivalent_snow_fall_df)
union_result_df = union_result_df.union(sunsine_df)
print "End union processing"
#union_result_df.show()
union_result_df.printSchema()

# pivot on weather category to summarize rows into columns
curation_result_df = union_result_df.groupBy(
    union_result_df.station_identifier,
    union_result_df.observation_date).pivot("weather_category").agg(
        round(sum(union_result_df.calculation_result), 2)).sort(union_result_df.observation_date)

# udf function to convert observation_date into a proper date format
func = udf(lambda x: datetime.strptime(x, '%Y%m%d'), DateType())
curation_result_df = curation_result_df.withColumn(
    "observation_date_format", func(col('observation_date')))
curation_result_df.printSchema()

# write the final result to the hdfs location
print 'started writing weather curated data to hdfs location'
curation_result_df.na.fill(0.0).select(
    "station_identifier", "observation_date_format", "Precipitation",
    "MaxTemparature", "Snowfall", "SnowDepth", "Evaporation",
    "WaterEquivalentSnowDepth", "WaterEquivalentSnowFall",
    "Sunshine").write.format("csv").save(
        path="hdfs:///tmp/weathercurated_result", mode='overwrite')
print 'End processing to writing'
# |    value   |
# +------------+
# |{"custom"...|
# +------------+
#
# and create separated fields like this:
# +------------+-----+-----------+
# |  customer  |score| riskDate  |
# +------------+-----+-----------+
# |"sam@tes"...| -1.4| 2020-09...|
# +------------+-----+-----------+
eventSchema = StructType([
    StructField('customer', StringType()),
    StructField('score', FloatType()),
    StructField('riskDate', DateType()),
])

df = df.withColumn('value', from_json('value', eventSchema))\
    .select(col('value.*'))

# Storing them in a temporary view called CustomerRisk
df.createOrReplaceTempView('CustomerRisk')

# Execute a sql statement against the temporary view, selecting the customer and the score, creating a dataframe called customerRiskStreamingDF
customerRiskStreamingDF = spark.sql('''
    SELECT customer, score
    FROM CustomerRisk
''')

# Sink the customerRiskStreamingDF dataframe to the console in append mode
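# A minimal sketch of the console sink described in the comment above, assuming it runs in the
# same Structured Streaming job; writeStream/outputMode/format/start are standard Spark calls.
customerRiskStreamingDF.writeStream \
    .outputMode("append") \
    .format("console") \
    .start() \
    .awaitTermination()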
# Exercise01: Length of the movie title
# Students: Carla Alvarez, Daniel Garcia, Juan Carlos Lopez, Carlos Mellado
# Run with: spark-submit lp.py > lp.txt
# View the result: cat lp.txt
# Note: the file 'peliculas.txt' must already have been loaded into the HDFS root
import re
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

# Define the context for the script (not needed in the pyspark shell) and load the data
sc = SparkContext()
spark = SparkSession(sc)
schema = StructType([StructField('Id', IntegerType(), True),
                     StructField('NombrePelicula', StringType(), False),
                     StructField('Fecha', DateType(), True)])
data = spark.read.format('csv').option('header', 'False').option('sep', '\t').option('mode', 'DROPMALFORMED').load('peliculas.txt', schema=schema)

# Find the title with the largest number of characters (strip the "(yyyy)" year suffix first)
titulos = data.rdd.map(lambda fila: re.sub(r'\([0-9]{4}\)', '', fila.NombrePelicula).lower().strip())
titulos = titulos.map(lambda x: (str(x), len(str(x))))
resultado = titulos.takeOrdered(1, key=lambda x: -x[1])

# Show the results
print('The movie with the longest title is:')
print('Movie: {0}'.format(str(resultado[0][0])))
print('Length: {0}'.format(str(resultado[0][1])))
def print_each_line(eachLine):
    print eachLine
    return

sparkcontext = SparkContext(conf=sparkconfig)
sqlContext = SQLContext(sparkcontext)

toIntegerfunc = UserDefinedFunction(lambda eachElement: int(eachElement), IntegerType())
toBooleanfunc = UserDefinedFunction(
    lambda eachElement: True if eachElement == 'P' else False, BooleanType())
toDateFunc = UserDefinedFunction(
    lambda eachElement: datetime.strptime(eachElement, '%m/%d/%Y'), DateType())

csvDF = sqlContext.read \
    .option("header", "true") \
    .csv("/home/dharshekthvel/Downloads/stop.csv")

schema_modified_df = csvDF.withColumn("VEHICLE_ID", toIntegerfunc(csvDF["VEHICLE_ID"]))\
    .withColumn("PLAN_STATUS", toBooleanfunc(csvDF["PLAN_STATUS"]))\
    .withColumn("OPD_DATE", toDateFunc(csvDF["OPD_DATE"]))

#csvDF.printSchema()
#csvDF.show(2)
schema_modified_df.printSchema()
schema_modified_df.show(20)
spark = SparkSession.builder.config("jar1", "jar2").appName("").getOrCreate()

# for reading a file without the schema known, we can use inferSchema while creating the dataframe
df1 = spark.read.csv("file_path.csv", inferSchema=True, header=True, sep=";", mode="DROPMALFORMED")

# for reading a file with a pre-defined schema
schema = StructType([
    StructField("col_1", IntegerType()),
    StructField("col_2", DoubleType()),
    StructField("col_3", DateType())
])
df2 = spark.read.csv("file_path.csv", schema=schema, header=True, sep=";", mode="DROPMALFORMED")

# =========================== #
# spark can read data from several places - local, s3, hdfs
df3 = spark.read.csv("s3a://....")  # s3a for EC2, s3/s3a for EMR (s3 protocol is faster)
df4 = spark.read.csv("hdfs:///.....")
def _get_target_schema(): return StructType([ StructField("row", IntegerType(), False), StructField("ID", IntegerType(), False), StructField("Name", StringType(), True), StructField("Age", IntegerType(), True), StructField("Photo", StringType(), True), StructField("Nationality", StringType(), True), StructField("Flag", StringType(), True), StructField("Overall", IntegerType(), True), StructField("Potential", IntegerType(), False), StructField("Club", StringType(), True), StructField("Club_logo", StringType(), True), StructField("Value", StringType(), True), StructField("Wage", StringType(), True), StructField("Special", IntegerType(), True), StructField("Preferred_foot", StringType(), True), StructField("International_reputation", IntegerType(), False), StructField("Weak_foot", IntegerType(), True), StructField("Skill_moves", IntegerType(), True), StructField("Work_rate", StringType(), True), StructField("Body_type", StringType(), True), StructField("Real_face", StringType(), True), StructField("Position", StringType(), True), StructField("Jersey_number", StringType(), False), StructField("Joined", DateType(), True), StructField("Loaned_from", StringType(), True), StructField("Contract_valid_until", StringType(), True), StructField("Height", StringType(), True), StructField("Weight", StringType(), True), StructField("LS", StringType(), True), StructField("ST", StringType(), False), StructField("RS", StringType(), True), StructField("LW", StringType(), True), StructField("LF", StringType(), True), StructField("CF", StringType(), False), StructField("RF", StringType(), True), StructField("RW", StringType(), True), StructField("LAM", StringType(), True), StructField("CAM", StringType(), False), StructField("RAM", StringType(), True), StructField("LM", StringType(), True), StructField("LCM", StringType(), True), StructField("CM", StringType(), True), StructField("RCM", StringType(), False), StructField("RM", StringType(), True), StructField("LWB", StringType(), True), StructField("LDM", StringType(), True), StructField("CDM", StringType(), True), StructField("RDM", StringType(), True), StructField("RWB", StringType(), False), StructField("LB", StringType(), True), StructField("LCB", StringType(), True), StructField("CB", StringType(), False), StructField("RCB", StringType(), True), StructField("RB", StringType(), True), StructField("Crossing", IntegerType(), True), StructField("Finishing", IntegerType(), True), StructField("HeadingAccuracy", IntegerType(), False), StructField("ShortPassing", IntegerType(), True), StructField("Volleys", IntegerType(), True), StructField("Dribbling", IntegerType(), True), StructField("Curve", IntegerType(), True), StructField("FKAccuracy", IntegerType(), False), StructField("Long_Passing", IntegerType(), True), StructField("BallControl", IntegerType(), True), StructField("Acceleration", IntegerType(), True), StructField("SprintSpeed", IntegerType(), True), StructField("Agility", IntegerType(), True), StructField("Reactions", IntegerType(), False), StructField("Balance", IntegerType(), True), StructField("ShotPower", IntegerType(), True), StructField("Jumping", IntegerType(), True), StructField("Stamina", IntegerType(), True), StructField("Strength", IntegerType(), True), StructField("LongShots", IntegerType(), True), StructField("Aggression", IntegerType(), False), StructField("Interceptions", IntegerType(), True), StructField("Positioning", IntegerType(), True), StructField("Vision", IntegerType(), True), StructField("Penalties", IntegerType(), True), 
StructField("Composure", IntegerType(), False), StructField("Marking", IntegerType(), True), StructField("StandingTackle", IntegerType(), True), StructField("SlidingTackle", IntegerType(), True), StructField("GKDiving", IntegerType(), True), StructField("GKHandling", IntegerType(), True), StructField("GKKicking", IntegerType(), False), StructField("GKPositioning", IntegerType(), True), StructField("GKReflexes", IntegerType(), True), StructField("Release_clause", StringType(), True) ])
)

# TO-DO: create a StructType for the Customer JSON that comes from Redis- before Spark 3.0.0, schema inference is not automatic
customerJSONSchema = StructType(
    [
        StructField("customerName", StringType()),
        StructField("email", StringType()),
        StructField("phone", StringType()),
        StructField("birthDay", StringType()),
    ]
)

# TO-DO: create a StructType for the Kafka stedi-events topic which has the Customer Risk JSON that comes from Redis- before Spark 3.0.0, schema inference is not automatic
eventRiskCustomerSchema = StructType(
    [
        StructField("customer", StringType()),
        StructField("score", DecimalType(precision=3, scale=1)),
        StructField("riskDate", DateType()),
    ]
)

# TO-DO: create a spark application object
spark = SparkSession.builder.appName("stedi-app").getOrCreate()

# TO-DO: set the spark log level to WARN
spark.sparkContext.setLogLevel("WARN")

# TO-DO: using the spark application object, read a streaming dataframe from the Kafka topic redis-server as the source
# Be sure to specify the option that reads all the events from the topic including those that were published before you started the spark stream
redisServerRawStreamingDF = spark.readStream.format("kafka")\
    .option("kafka.bootstrap.servers", "kafka:19092")\
    .option("subscribe", "redis-server")\
    .option("startingOffsets", "earliest")\
    .load()

# TO-DO: cast the value column in the streaming dataframe as a STRING
redisServerStreamingDF = redisServerRawStreamingDF.selectExpr("cast(key as string) key", "cast(value as string) value")
my_log(log, f"min date : {l_args.last_date}") # retrieve new covid cases per population group cases_per_date = client.get( l_args.dataset_identifier, group="cdc_case_earliest_dt, sex, age_group, race_ethnicity_combined", select= "cdc_case_earliest_dt, sex, age_group, race_ethnicity_combined, count(*)", where=f"cdc_case_earliest_dt > '{l_args.last_date}'", limit=200000, content_type="json") my_log(log, f"nb new records : {len(cases_per_date)}") # transform to data frame df_cases = spark.read.json(spark.sparkContext.parallelize(cases_per_date)) my_log(log, f"nb rows in dataframe : {df_cases.count()}") df_cases = df_cases.withColumn( "race_ethnicity_combined", parse_race_ethnicity("race_ethnicity_combined")) df_cases = df_cases.withColumn( "cdc_case_earliest_dt", col("cdc_case_earliest_dt").cast(DateType())) # write to postgres df_cases.write\ .format("jdbc")\ .option("url", "jdbc:postgresql:capstone")\ .option("dbtable", l_args.table)\ .option("user","postgres")\ .option("password", "postgres")\ .mode("overwrite")\ .save()
# if you ask explicitly, spark will try to infer the schema automatically
infer_schema = spark.read.csv(path='../data/covid19.csv', header=True, inferSchema=True)
infer_schema.printSchema()
# in this case it gets the integers right, but just treats the date as a string

# or you can specify the schema explicitly
from pyspark.sql.types import (StructField, StringType, IntegerType, DateType, StructType)

data_schema = [StructField('continent', StringType(), True),
               StructField('location', StringType(), True),
               StructField('date', DateType(), True),
               StructField('total_cases', IntegerType(), True),
               StructField('new_cases', IntegerType(), True),
               StructField('total_deaths', IntegerType(), True),
               StructField('new_deaths', IntegerType(), True)]
correct_struc = StructType(fields=data_schema)
dataframe = spark.read.csv(path='../data/covid19.csv', header=True, schema=correct_struc)

# and we can confirm that this time the types are correct
print(dataframe.printSchema())

# if we wanted to convert to the older-style RDD we easily could
rdd = dataframe.rdd
print(f'Created `rdd` {type(rdd)} from `dataframe` {type(dataframe)}.')
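# A brief alternative sketch (not in the original notebook): staying with inferSchema and casting
# the string date afterwards with the built-in to_date. The column name 'date' and the
# 'yyyy-MM-dd' pattern are assumptions about the file.
from pyspark.sql import functions as F

casted = infer_schema.withColumn('date', F.to_date(F.col('date'), 'yyyy-MM-dd'))
casted.printSchema()  # 'date' now shows as date instead of string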
def setUpClass(cls): from datetime import date, datetime from decimal import Decimal super(ArrowTests, cls).setUpClass() cls.warnings_lock = threading.Lock() # Synchronize default timezone between Python and Java cls.tz_prev = os.environ.get("TZ", None) # save current tz if set tz = "America/Los_Angeles" os.environ["TZ"] = tz time.tzset() cls.spark.conf.set("spark.sql.session.timeZone", tz) # Test fallback cls.spark.conf.set("spark.sql.execution.arrow.enabled", "false") assert cls.spark.conf.get( "spark.sql.execution.arrow.pyspark.enabled") == "false" cls.spark.conf.set("spark.sql.execution.arrow.enabled", "true") assert cls.spark.conf.get( "spark.sql.execution.arrow.pyspark.enabled") == "true" cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "true") assert cls.spark.conf.get( "spark.sql.execution.arrow.pyspark.fallback.enabled") == "true" cls.spark.conf.set("spark.sql.execution.arrow.fallback.enabled", "false") assert cls.spark.conf.get( "spark.sql.execution.arrow.pyspark.fallback.enabled") == "false" # Enable Arrow optimization in this tests. cls.spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true") # Disable fallback by default to easily detect the failures. cls.spark.conf.set( "spark.sql.execution.arrow.pyspark.fallback.enabled", "false") cls.schema_wo_null = StructType([ StructField("1_str_t", StringType(), True), StructField("2_int_t", IntegerType(), True), StructField("3_long_t", LongType(), True), StructField("4_float_t", FloatType(), True), StructField("5_double_t", DoubleType(), True), StructField("6_decimal_t", DecimalType(38, 18), True), StructField("7_date_t", DateType(), True), StructField("8_timestamp_t", TimestampType(), True), StructField("9_binary_t", BinaryType(), True), ]) cls.schema = cls.schema_wo_null.add("10_null_t", NullType(), True) cls.data_wo_null = [ ( "a", 1, 10, 0.2, 2.0, Decimal("2.0"), date(1969, 1, 1), datetime(1969, 1, 1, 1, 1, 1), bytearray(b"a"), ), ( "b", 2, 20, 0.4, 4.0, Decimal("4.0"), date(2012, 2, 2), datetime(2012, 2, 2, 2, 2, 2), bytearray(b"bb"), ), ( "c", 3, 30, 0.8, 6.0, Decimal("6.0"), date(2100, 3, 3), datetime(2100, 3, 3, 3, 3, 3), bytearray(b"ccc"), ), ( "d", 4, 40, 1.0, 8.0, Decimal("8.0"), date(2262, 4, 12), datetime(2262, 3, 3, 3, 3, 3), bytearray(b"dddd"), ), ] cls.data = [tuple(list(d) + [None]) for d in cls.data_wo_null]
def from_arrow_type(at: "pa.DataType", prefer_timestamp_ntz: bool = False) -> DataType: """Convert pyarrow type to Spark data type.""" from distutils.version import LooseVersion import pyarrow as pa import pyarrow.types as types spark_type: DataType if types.is_boolean(at): spark_type = BooleanType() elif types.is_int8(at): spark_type = ByteType() elif types.is_int16(at): spark_type = ShortType() elif types.is_int32(at): spark_type = IntegerType() elif types.is_int64(at): spark_type = LongType() elif types.is_float32(at): spark_type = FloatType() elif types.is_float64(at): spark_type = DoubleType() elif types.is_decimal(at): spark_type = DecimalType(precision=at.precision, scale=at.scale) elif types.is_string(at): spark_type = StringType() elif types.is_binary(at): spark_type = BinaryType() elif types.is_date32(at): spark_type = DateType() elif types.is_timestamp(at) and prefer_timestamp_ntz and at.tz is None: spark_type = TimestampNTZType() elif types.is_timestamp(at): spark_type = TimestampType() elif types.is_duration(at): spark_type = DayTimeIntervalType() elif types.is_list(at): if types.is_timestamp(at.value_type): raise TypeError("Unsupported type in conversion from Arrow: " + str(at)) spark_type = ArrayType(from_arrow_type(at.value_type)) elif types.is_map(at): if LooseVersion(pa.__version__) < LooseVersion("2.0.0"): raise TypeError( "MapType is only supported with pyarrow 2.0.0 and above") if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type): raise TypeError("Unsupported type in conversion from Arrow: " + str(at)) spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type)) elif types.is_struct(at): if any(types.is_struct(field.type) for field in at): raise TypeError( "Nested StructType not supported in conversion from Arrow: " + str(at)) return StructType([ StructField(field.name, from_arrow_type(field.type), nullable=field.nullable) for field in at ]) elif types.is_dictionary(at): spark_type = from_arrow_type(at.value_type) elif types.is_null(at): spark_type = NullType() else: raise TypeError("Unsupported type in conversion from Arrow: " + str(at)) return spark_type
def equivalent_type(f):
    if f == 'datetime64[ns]':
        return DateType()
    elif f == 'int64':
        return LongType()
    elif f == 'int32':
        return IntegerType()
    elif f == 'float64':
        return FloatType()
    else:
        return StringType()
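# A minimal usage sketch (assumed, not from the original source): building a Spark schema from a
# pandas DataFrame's dtypes with the mapper above. define_structure and pandas_to_spark are
# hypothetical helper names.
import pandas as pd
from pyspark.sql.types import StructField, StructType

def define_structure(column_name, pandas_dtype):
    # unknown dtypes fall back to StringType via equivalent_type
    return StructField(column_name, equivalent_type(str(pandas_dtype)), True)

def pandas_to_spark(spark, pandas_df: pd.DataFrame):
    fields = [define_structure(c, t) for c, t in zip(pandas_df.columns, pandas_df.dtypes)]
    return spark.createDataFrame(pandas_df, schema=StructType(fields))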
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType, TimestampType

schema = StructType([
    StructField('FIPS', IntegerType(), True),
    StructField('Admin2', StringType(), True),
    StructField('Province_State', StringType(), True),
    StructField('Country_Region', StringType(), True),
    StructField('Last_Update', TimestampType(), True),
    StructField('Lat', DoubleType(), True),
    StructField('Long_', DoubleType(), True),
    StructField('Confirmed', IntegerType(), True),
    StructField('Deaths', IntegerType(), True),
    StructField('Recovered', IntegerType(), True),
    StructField('Active', IntegerType(), True),
    StructField('Combined_Key', StringType(), True),
    StructField('process_date', DateType(), True),
])

# Create initial empty Spark DataFrame based on preceding schema
jhu_daily = spark.createDataFrame([], schema)

# COMMAND ----------

# MAGIC %md ## Loops Through Each File
# MAGIC The following code snippet processes each file to:
# MAGIC * Extract the filename, which is needed to know which date the data refers to
# MAGIC * Handle the fact that the schema of the files changes over time, so we need slightly different logic to insert data for each schema

# COMMAND ----------

import os
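# A hedged sketch of the per-file loop described in the markdown cell above. The folder path,
# the MM-dd-yyyy filename convention, and the union step are assumptions; the real notebook
# branches on several historical schemas before inserting.
from pyspark.sql import functions as F

daily_files_path = "/dbfs/FileStore/jhu_daily/"  # hypothetical location of the daily CSV files
for file_name in sorted(os.listdir(daily_files_path)):
    # e.g. "03-22-2020.csv" -> process_date 2020-03-22
    date_part = os.path.splitext(file_name)[0]
    daily_df = (spark.read.csv("dbfs:/FileStore/jhu_daily/" + file_name, header=True, inferSchema=True)
                .withColumn("process_date", F.to_date(F.lit(date_part), "MM-dd-yyyy")))
    # align columns to the target schema here before appending (schemas differ across dates)
    # jhu_daily = jhu_daily.unionByName(daily_df, allowMissingColumns=True)  # Spark 3.1+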
when(F.col("naics_code").isin(722511), "full_service_restaurants").\ when(F.col("naics_code").isin(722513), "limited_service_restaurants").\ when(F.col("naics_code").isin(446110, 446191), "pharmacies_and_drug_stores").\ when(F.col("naics_code").isin(311811,722515), "snack_and_bakeries").\ when(F.col("naics_code").isin(445210,445220,445230,445291,445292,445299), "specialty_food_stores").\ when(F.col("naics_code").isin(445110), "supermarkets_except_convenience_stores")).\ select("placekey","safegraph_place_id","naics_code","file_name") def explodeVisits(date_range_start, visit_by_day): start = datetime.datetime(*map(int, date_range_start[:10].split('-'))) return {(start + datetime.timedelta(days=days)): visits for days, visits in enumerate(json.loads(visit_by_day))} #Credit to the professor, I levarage this piece of code from class udfExpand = F.udf(explodeVisits, T.MapType(DateType(), T.IntegerType())) df = spark.read.csv("hdfs:///data/share/bdm/weekly-patterns-nyc-2019-2020/*", header=True) \ .select("placekey","safegraph_place_id", F.explode(udfExpand('date_range_start', 'visits_by_day')) \ .alias('date', "visits")) # .where(f"date=='{date}'") #Credit to the professor, I leverage this piece of code from class def find_median(values_list): try: median = np.median(values_list) return round(float(median), 2) except Exception: return None
from pyspark.sql.types import StructType, StructField, StringType, DateType

from src.database.contracts import wash_contract as c

SCHEMA = StructType([
    StructField(c.ID, StringType()),
    StructField(c.DATE, DateType()),
    StructField(c.AGE, StringType()),
    StructField(c.RACE, StringType()),
    StructField(c.SEX, StringType())
])

COLUMNS = [c.ID, c.DATE, c.AGE, c.RACE, c.SEX]
"struct": StructType, "array": ArrayType, "bigint": LongType, "date": DateType, "byte": ByteType, "short": ShortType, "datetime": TimestampType, "binary": BinaryType, "null": NullType, "vector": VectorUDT } SPARK_DTYPES_DICT_OBJECTS = \ {"string": StringType(), "int": IntegerType(), "float": FloatType(), "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()), "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(), "datetime": TimestampType(), "binary": BinaryType(), "null": NullType() } # Profiler PROFILER_COLUMN_TYPES = { "categorical", "numeric", "date", "null", "array", "binary" } PYTHON_TO_PROFILER = { "string": "categorical", "boolean": "categorical", "int": "numeric", "decimal": "numeric", "date": "date", "array": "array", "binaty": "binary",
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, DateType
import random

"""DO NOT ADD IN FINAL"""
spark = SparkSession.builder.appName("splitter").master("local[*]").getOrCreate()

schema = StructType([
    StructField("created_at", DateType(), True),        # 2020-10-15 00:00:01
    StructField("tweet_id", StringType(), True),        # 1.31652922155725E+018
    StructField("likes", FloatType(), True),            # like count
    StructField("retweet_count", FloatType(), True),    # retweet count
    StructField("source", StringType(), True),          # twitter source
    StructField("user_id", StringType(), True),
    StructField("user_name", StringType(), True),
    StructField("user_screen_name", StringType(), True),
    StructField("user_description", StringType(), True),
    StructField("user_join_date", DateType(), True),
    StructField("user_followers_count", FloatType(), True),
    StructField("user_location", StringType(), True),
    StructField("lat", FloatType(), True),
    StructField("long", FloatType(), True),
    StructField("city", StringType(), True),
    StructField("country", StringType(), True),
    StructField("continent", StringType(), True),
    StructField("state", StringType(), True),
    StructField("state_code", StringType(), True),
    StructField("collected_at", DateType(), True),
    StructField("tweet", StringType(), True),            # tweet body
    StructField("sentiment", FloatType(), True)
    'WEATHER_DELAY': 0,
    'NAS_DELAY': 0,
    'SECURITY_DELAY': 0,
    'LATE_AIRCRAFT_DELAY': 0
})

# adding status and delay columns to dataframe
from pyspark.sql.functions import col, when
df1 = df1.withColumn('delay', when(df1.ARR_DELAY > 0, df1.ARR_DELAY).otherwise(0))
df1 = df1.withColumn('status', when(df1.ARR_DELAY > 0, 1).otherwise(0))

# converting datatype of columns
from pyspark.sql.types import IntegerType, DoubleType, DateType
df1 = df1.withColumn("FL_DATE", df1["FL_DATE"].cast(DateType()))
df1 = df1.withColumn("year", df1["year"].cast(IntegerType()))
df1 = df1.withColumn("month", df1["month"].cast(IntegerType()))
df1 = df1.withColumn("day", df1["day"].cast(IntegerType()))

# renaming columns
df2 = df1.withColumnRenamed("FL_DATE", "fl_date").withColumnRenamed(
    "OP_CARRIER", "op_carrier").withColumnRenamed(
    "OP_CARRIER_FL_NUM", "op_carrier_fl_num").withColumnRenamed(
    "ORIGIN", "origin").withColumnRenamed(
    "DEST", "dest").withColumnRenamed(
    "CRS_DEP_TIME", "crs_dep_time").withColumnRenamed(
    "DEP_TIME", "dep_time").withColumnRenamed(
    "DEP_DELAY", "dep_delay").withColumnRenamed(
    "TAXI_OUT", "taxi_out"
def get_stock_dataframe(): prices_DF = sqlContext.read.format('com.databricks.spark.csv')\ .schema(prices_schema())\ .options(header='true')\ .load(prices)\ .dropDuplicates().cache() text = sc.textFile(symbols).map(lambda l: l.split('\t')) symbols_dict = text.collectAsMap() for k, v in symbols_dict.iteritems(): symbols_dict[k] = v.replace("'", "\'") def integrate_company(symbol_prices): for key, value in symbols_dict.iteritems(): if key == symbol_prices: company = value return company udf_company = udf(integrate_company, StringType()) company_DF = prices_DF.withColumn('Company', udf_company(prices_DF.Symbol)).cache() sectors_DF = sqlContext.read.format('com.databricks.spark.csv')\ .schema(sectors_schema())\ .options(header='true')\ .load(sectors)\ .dropDuplicates().cache() sectors_dict = map(lambda row: row.asDict(), sectors_DF.collect()) def integrate_sector(symbol_prices): sector_value = "" for item in sectors_dict: if item['Symbol'] == symbol_prices: sector_value = item['Sector'] return sector_value udf_sector = udf(integrate_sector, StringType()) stock_sec_DF = company_DF.withColumn('Sector', udf_sector(company_DF.Symbol)) def integrate_industry(symbol_prices): industry_value = "" for item in sectors_dict: if item['Symbol'] == symbol_prices: industry_value = item['Industry'] return industry_value udf_industry = udf(integrate_industry, StringType()) stock_DF = stock_sec_DF.withColumn('Industry', udf_industry(stock_sec_DF.Symbol)) stock_new_DF = stock_DF.withColumn("Date", stock_DF["Date"].cast( DateType())).orderBy("Date", "Symbol") def percent_change(open, close): change = (open - close) / open return change udf_percentchange = udf(percent_change, FloatType()) stock_final_DF = stock_new_DF.withColumn( 'Percentage_Change', udf_percentchange(stock_new_DF.Open, stock_new_DF.Close)) stock_final_DF.saveAsParquetFile("stocks_full.parquet") return stock_final_DF
df7 = pd.DataFrame.from_dict(json.loads(response.text))
fig = px.line(df7, x="day", y="deaths", color='month')
fig = plot(fig, output_type='div')
displayHTML(fig)

# COMMAND ----------

# MAGIC %md ## Clustering data To Get Hotspots

# COMMAND ----------

from pyspark.sql.types import StructField, StringType, IntegerType, StructType, FloatType, DateType, DateConverter

data_schema = [StructField("Date", DateType(), True),
               StructField("State_or_UT", StringType(), True),
               StructField("Indian_cases", IntegerType(), True),
               StructField("Foreign_cases", IntegerType(), True),
               StructField("Cured", IntegerType(), True),
               StructField("Latitude", FloatType(), True),
               StructField("Longitude", FloatType(), True),
               StructField("Death", IntegerType(), True),
               StructField("Total_cases", IntegerType(), True)]
dfschema = StructType(data_schema)

# COMMAND ----------

df = spark.read.csv("dbfs:/FileStore/tables/complete1.csv", mode='FAILFAST', header=True, schema=dfschema)
spark = SparkSession.builder.appName('ReviewsOdsSession').getOrCreate()

# establishing source files
AmzRevjson = "/edw/Reviews/SourceCode/data/asin_reviews.json"

# read data into a frame
Reviews_df = spark.read.json(AmzRevjson)

# transformed frame
Cl_Reviews_df = Reviews_df.select(
    'asin',
    col('overall').alias("reviewRating"),
    'reviewerID',
    'reviewerName',
    (when(col('unixReviewTime').isNull(), '1990-01-01')
     .otherwise(from_unixtime('unixReviewTime', 'yyyy-MM-dd')))
    .cast(DateType()).alias("reviewDate")).filter(
        col("reviewDate") >= '2011-01-01')

# Write to DB
mode = "append"
table = os.environ['SCHEMA_STG'] + ".reviews"
url = "jdbc:postgresql://" + os.environ['PGHOST'] + "/" + os.environ['DB_DWH']
properties = {
    "user": os.environ['PGUSER'],
    "password": os.environ['PGPASSWD'],
    "driver": 'org.postgresql.Driver'
}
Cl_Reviews_df.write.jdbc(url=url, table=table, mode=mode, properties=properties)
def main(context): """Main function takes a Spark SQL context.""" # YOUR CODE HERE # YOU MAY ADD OTHER FUNCTIONS AS NEEDED start = time.time() # task 1 if(read_raw): comments = sqlContext.read.json('comments-minimal.json.bz2') submissions = sqlContext.read.json('submissions.json.bz2') label = sqlContext.read.load('labeled_data.csv', format = 'csv', sep = ',',header="true") print("load done") comments.write.parquet('comments') submissions.write.parquet('submissions') label.write.parquet('label') else: comments = context.read.load('comments') submissions = context.read.load('submissions') label = context.read.load('label') print("task 1 complete: read data") #result.show() if(training): # task 2 associate = associated(comments, label).select(col('id'), col('body'), col('labeldjt')) print("task 2 complete: associate data") # task 4, 5 newColumn = associate.withColumn('ngrams', sanitize_udf(associate['body'])) print("task 4, 5 complete: generate unigrams") # task 6A cv = CountVectorizer(inputCol = 'ngrams', outputCol = "features", binary = True) model = cv.fit(newColumn) tmp = model.transform(newColumn) print("task 6A complete: cv model") # task 6B result = tmp.withColumn('poslabel', F.when(col('labeldjt') == 1, 1).otherwise(0)) result = result.withColumn('neglabel', F.when(col('labeldjt') == -1, 1).otherwise(0)) pos = result.select(col('poslabel').alias('label'), col('features')) neg = result.select(col('neglabel').alias('label'), col('features')) print("task 6B complete: relabel data") # task 7 # Initialize two logistic regression models. # Replace labelCol with the column containing the label, and featuresCol with the column containing the features. poslr = LogisticRegression(labelCol = "label", featuresCol = "features", maxIter = 10) neglr = LogisticRegression(labelCol = "label", featuresCol = "features", maxIter = 10) # This is a binary classifier so we need an evaluator that knows how to deal with binary classifiers. posEvaluator = BinaryClassificationEvaluator() negEvaluator = BinaryClassificationEvaluator() # There are a few parameters associated with logistic regression. We do not know what they are a priori. # We do a grid search to find the best parameters. We can replace [1.0] with a list of values to try. # We will assume the parameter is 1.0. Grid search takes forever. posParamGrid = ParamGridBuilder().addGrid(poslr.regParam, [1.0]).build() negParamGrid = ParamGridBuilder().addGrid(neglr.regParam, [1.0]).build() # We initialize a 5 fold cross-validation pipeline. posCrossval = CrossValidator( estimator = poslr, evaluator = posEvaluator, estimatorParamMaps = posParamGrid, numFolds = 5) negCrossval = CrossValidator( estimator = neglr, evaluator = negEvaluator, estimatorParamMaps = negParamGrid, numFolds = 5) # Although crossvalidation creates its own train/test sets for # tuning, we still need a labeled test set, because it is not # accessible from the crossvalidator (argh!) # Split the data 50/50 posTrain, posTest = pos.randomSplit([0.5, 0.5]) negTrain, negTest = neg.randomSplit([0.5, 0.5]) # Train the models print("Training positive classifier...") posModel = posCrossval.fit(posTrain) print("Training negative classifier...") negModel = negCrossval.fit(negTrain) # Once we train the models, we don't want to do it again. We can save the models and load them again later. 
posModel.save("pos.model") negModel.save("neg.model") model.save("cv.model") print("task 7 complete: training") # posModel = CrossValidatorModel.load('pos.model') # negModel = CrossValidatorModel.load('neg.model') # point 7 pos_trans = posModel.transform(posTest) neg_trans = negModel.transform(negTest) pos_results = pos_trans.select(['probability', 'label']) pos_trans_collect = pos_results.collect() pos_trans_results_list = [(float(i[0][0]), 1.0-float(i[1])) for i in pos_trans_collect] pos_scoreAndLabels = sc.parallelize(pos_trans_results_list) pos_metrics = metric(pos_scoreAndLabels) print("The ROC score of positive results is: ", pos_metrics.areaUnderROC) neg_results = neg_trans.select(['probability', 'label']) neg_trans_collect = neg_results.collect() neg_trans_results_list = [(float(i[0][0]), 1.0-float(i[1])) for i in neg_trans_collect] neg_scoreAndLabels = sc.parallelize(neg_trans_results_list) neg_metrics = metric(neg_scoreAndLabels) print("The ROC score of negative results is: ", neg_metrics.areaUnderROC) plot_ROC(pos_trans_results_list, 'positive_results') plot_ROC(neg_trans_results_list, 'negative_results') print("point 7 complete: ROC") else: model = CountVectorizerModel.load('cv.model') posModel = CrossValidatorModel.load('pos.model') negModel = CrossValidatorModel.load('neg.model') print("model loaded") # task 8 comments_tmp = comments.select(col('id'), col('link_id'), col('created_utc'), col('body'), col('author_flair_text'), col('score').alias('com_score')) comments_full = comments_tmp.withColumn('link_id', process_id_udf(comments_tmp['link_id'])) submissions_full = submissions.select(col('id').alias('sub_id'), col('title'), col('score').alias('sub_score')) if(joinFull): com_sub = comments_full.join(submissions_full, comments_full.link_id == submissions_full.sub_id, 'inner') com_sub = com_sub.select(col('id'), col('title'), col('link_id'), col('created_utc'), col('body'), col('author_flair_text'), col('com_score'), col('sub_score')) com_sub.write.parquet('com_sub') else: com_sub = context.read.load('com_sub')# .sample(False, 0.01, None) print('task 8 complete: comment with submission') # task 9 filtered = com_sub.filter("body NOT LIKE '%/s%' and body NOT LIKE '>%'") filtered_result = filtered.withColumn('ngrams', sanitize_udf(filtered['body'])) feaResult = model.transform(filtered_result).select(col('id'), col('link_id'), col('created_utc'), \ col('features'), col('author_flair_text'), col('com_score'), col('sub_score'), col('title')) posResult = posModel.transform(feaResult) negResult = negModel.transform(feaResult) print("transformed") pos = posResult.withColumn('pos', threshold_pos_udf(posResult['probability'])).select('id', 'created_utc', 'author_flair_text', 'pos', 'com_score', 'sub_score', 'title') neg = negResult.withColumn('neg', threshold_neg_udf(negResult['probability'])).select('id', 'created_utc', 'author_flair_text', 'neg', 'com_score', 'sub_score', 'title') #final_probs = pos.join(neg, pos.id == neg.id_neg, 'inner').select('id', 'created_utc', 'author_flair_text', 'title', 'pos', 'neg') #final_probs.show() #pos.write.parquet('pos') #neg.write.parquet('neg') print('task 9 complete: predict') # task 10 # compute 1 num_rows = pos.count() pos_filtered = pos.filter(pos.pos == 1) neg_filtered = neg.filter(neg.neg == 1) num_pos = pos_filtered.count() num_neg = neg_filtered.count() print('Percentage of positive comments: {}'.format(num_pos / num_rows)) print('Percentage of negative comments: {}'.format(num_neg / num_rows)) print('finish compute 1') # compute 2 pos_time 
= pos.withColumn('time', F.from_unixtime(col('created_utc')).cast(DateType())) neg_time = neg.withColumn('time', F.from_unixtime(col('created_utc')).cast(DateType())) num_pos_time = pos_time.groupBy('time').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy('time') num_neg_time = neg_time.groupBy('time').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy('time') num_pos_time.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('num_pos_time') num_neg_time.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('num_neg_time') print('finish compute 2') # compute 3 state = sqlContext.createDataFrame(states, StringType()) pos_state = pos.groupBy('author_flair_text').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')) neg_state = neg.groupBy('author_flair_text').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')) pos_state = pos_state.join(state, pos_state.author_flair_text == state.value, 'inner') pos_state = pos_state.na.drop(subset=['value']) pos_state = pos_state.select(col('author_flair_text').alias('state'), col('Percentage of positive').alias('Positive')) neg_state = neg_state.join(state, neg_state.author_flair_text == state.value, 'inner') neg_state = neg_state.na.drop(subset=['value']) neg_state = neg_state.select(col('author_flair_text').alias('state'), col('Percentage of negative').alias('Negative')) pos_state.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_state') neg_state.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_state') print('finish compute 3') # compute 4 pos_com_score = pos.groupBy('com_score').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy('com_score') pos_sub_score = pos.groupBy('sub_score').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy('sub_score') neg_com_score = neg.groupBy('com_score').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy('com_score') neg_sub_score = neg.groupBy('sub_score').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy('sub_score') pos_com_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_com_score') pos_sub_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_sub_score') neg_com_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_com_score') neg_sub_score.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_sub_score') print('finish compute 4') # compute 5 pos_story = pos.groupBy('title').agg((F.sum('pos') / F.count('pos')).alias('Percentage of positive')).orderBy(F.desc('Percentage of positive')).limit(10) neg_story = neg.groupBy('title').agg((F.sum('neg') / F.count('neg')).alias('Percentage of negative')).orderBy(F.desc('Percentage of negative')).limit(10) pos_story.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('pos_story') neg_story.coalesce(1).write.mode("overwrite").format("com.databricks.spark.csv").option("header", "true").csv('neg_story') print('finish compute 5') end = time.time() print('time consumed: {}'.format(end - start))
def process_log_data(spark, input_data, output_data):
    """
    Spark pipeline to process and save csv formatted log data
    Saves to parquet file type on S3

    Args:
        spark (object): Spark session
        input_data (str): S3 bucket input name
        output_data (str): S3 bucket output name

    Returns:
        None
    """
    # get filepath to log data file
    log_data = input_data + 'log-data/*/*/*.json'

    # read log data file
    df = spark.read.json(log_data)

    # filter by actions for song plays
    df = df.filter(df.page == 'NextSong')

    # extract columns for users table
    users_table = df[['userId', 'firstName', 'lastName', 'gender', 'level']].drop_duplicates()

    # write users table to parquet files
    users_table.write.parquet(output_data + 'user_data', mode='overwrite')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: datetime.fromtimestamp(int(x) / 1000.), returnType=TimestampType())
    df = df.withColumn('start_time', get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: datetime.fromtimestamp(int(x) / 1000.), returnType=DateType())
    df = df.withColumn('datetime', get_datetime(df.ts))

    # add year and month -- needed for songplays_table partitioning
    df = df.withColumn("year", year(df.datetime).alias('year'))
    df = df.withColumn("month", month(df.datetime).alias('month'))

    # extract columns to create time table
    time_table = df.select(
        'start_time',
        'year',
        'month',
        hour(df.datetime).alias('hour'),
        dayofmonth(df.datetime).alias('dayofmonth'),
        weekofyear(df.datetime).alias('weekofyear'),
        dayofweek(df.datetime).alias('weekday')).drop_duplicates()

    # write time table to parquet files partitioned by year and month
    time_table.write.parquet(output_data + 'time_data', mode='overwrite', partitionBy=('year', 'month'))

    # read in song data to use for songplays table
    song_df = spark.read.parquet(output_data + 'song_data')

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df[['start_time', 'userId', 'level', 'sessionId', 'location', 'userAgent', 'song', 'length', 'year', 'month']] \
        .join(song_df[['song_id', 'artist_id', 'title', 'duration']],
              on=(df.song == song_df.title) & (df.length == song_df.duration),
              how='inner') \
        .withColumn('songplay_id', monotonically_increasing_id()) \
        .select('songplay_id', 'start_time', 'userId', 'level', 'song_id', 'artist_id',
                'sessionId', 'location', 'userAgent', 'year', 'month') \
        .drop_duplicates()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.parquet(output_data + 'songplays_data', mode='overwrite', partitionBy=('year', 'month'))
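# A hedged aside (an assumption, not part of the original pipeline): inside process_log_data the
# two UDFs above could be replaced with built-in column expressions, which avoids Python
# serialization. 'ts' is the epoch-millisecond column used by the snippet above.
from pyspark.sql import functions as F

df = df.withColumn('start_time', (F.col('ts') / 1000).cast('timestamp'))
df = df.withColumn('datetime', F.to_date(F.col('start_time')))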
def insert_time_dim(start_date_id, end_date_id):
    time_begin = datetime.strptime(str(start_date_id), "%Y%m%d").date()
    time_end = datetime.strptime(str(end_date_id), "%Y%m%d").date()
    print('time_begin')
    print(time_begin)
    print('time_end')
    print(time_end)

    # create a dataframe from time_begin and time_end
    data = [(time_begin, time_end)]
    df = spark.createDataFrame(data, ["minDate", "maxDate"])
    # convert the data types and field names
    df = df.select(
        df.minDate.cast(DateType()).alias("minDate"),
        df.maxDate.cast(DateType()).alias("maxDate"))

    # expand to every date between minDate and maxDate
    df = df.withColumn("daysDiff", f.datediff("maxDate", "minDate")) \
        .withColumn("repeat", f.expr("split(repeat(',', daysDiff), ',')")) \
        .select("*", f.posexplode("repeat").alias("date", "val")) \
        .withColumn("date", f.expr("to_date(date_add(minDate, date))")) \
        .select('date')

    # derive the day/month/year attributes from the date
    df = df.withColumn('id', f.date_format(df.date, "yyyyMMdd")) \
        .withColumn('ngay_trong_thang', f.dayofmonth(df.date)) \
        .withColumn('ngay_trong_tuan', f.from_unixtime(f.unix_timestamp(df.date, "yyyy-MM-dd"), "EEEEE")) \
        .withColumn('tuan_trong_nam', f.weekofyear(df.date)) \
        .withColumn('thang', f.month(df.date)) \
        .withColumn('quy', f.quarter(df.date)) \
        .withColumn('nam', f.year(df.date))
    df = df.withColumn('tuan_trong_thang', (df.ngay_trong_thang - 1) / 7 + 1)

    data_time = DynamicFrame.fromDF(df, glueContext, 'data_time')

    # resolve the column type
    data_time = data_time.resolveChoice(specs=[('tuan_trong_thang', 'cast:int')])

    # choose the fields and data types to load into the DB
    applymapping1 = ApplyMapping.apply(
        frame=data_time,
        mappings=[("id", "string", "id", "bigint"),
                  ("ngay_trong_thang", 'int', 'ngay_trong_thang', 'int'),
                  ("ngay_trong_tuan", "string", "ngay_trong_tuan", "string"),
                  ("tuan_trong_thang", "int", "tuan_trong_thang", "int"),
                  ("tuan_trong_nam", "int", "tuan_trong_nam", "int"),
                  ("thang", "int", "thang", "int"),
                  ("quy", "int", "quy", "int"),
                  ("nam", "int", "nam", "int"),
                  ("date", "date", "ngay", "timestamp")])

    resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dropnullfields3 = DropNullFields.apply(
        frame=resolvechoice2, transformation_ctx="dropnullfields3")

    # write the data to the DB
    preactions = 'delete student.time_dim where id >= ' + str(start_date_id)
    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dropnullfields3,
        catalog_connection="glue_redshift",
        connection_options={
            "preactions": preactions,
            "dbtable": "student.time_dim",
            "database": "student_native_report"
        },
        redshift_tmp_dir="s3n://dts-odin/temp/tu-student_native_report/student/time_dim",
        transformation_ctx="datasink4")
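# A hedged alternative sketch (not from the original Glue job): on Spark 2.4+ the date range can
# be produced directly with sequence() and explode(), replacing the repeat/posexplode trick above.
# time_begin and time_end stand for the same values computed inside insert_time_dim.
from pyspark.sql import functions as f

date_range_df = spark.createDataFrame([(time_begin, time_end)], ["minDate", "maxDate"]) \
    .select(f.explode(f.sequence(f.col("minDate"), f.col("maxDate"))).alias("date"))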
.add("evntloc_key",IntegerType(), True) \ .add("cntrycd",StringType(),True) \ .add("continent",StringType(),True) \ .add("region",StringType(),True) \ .add("cntry",StringType(),True) \ evntcatschema = StructType() \ .add("evntcat_key",IntegerType(),True) \ .add("relcd",StringType(),True) \ .add("reldesc",StringType(),True) \ .add("relcat",StringType(),True) dateschema = StructType() \ .add("datekey",IntegerType(),True) \ .add("date",DateType(),True) \ .add("dayofweekname",StringType(),True) \ .add("dayofweek",IntegerType(),True) \ .add("dayofmonth",IntegerType(),True) \ .add("dayofyear",IntegerType(),True) \ .add("calendarweek",IntegerType(),True) \ .add("calendarmonthname",StringType(),True) \ .add("calendarmonth",IntegerType(),True) \ .add("calendaryear",IntegerType(),True) \ .add("lastdayinmonth",StringType(),True) gbldatafactschema = StructType() \ .add("globaleventid",IntegerType(),True) \ .add("event_date",DateType(),True) \ .add("actcd",StringType(),True) \
# coding=utf-8
from datetime import datetime

from pyspark.sql.functions import col, udf
from pyspark.sql.types import DateType
from pyspark import SparkContext
from pyspark.sql import SQLContext

# create the SparkContext
sc = SparkContext('local')
# create the SQLContext
sqlContext = SQLContext(sc)

# create a DataFrame
df = sqlContext.createDataFrame([("11/25/1991", "11/24/1991", "11/30/1991"),
                                 ("11/25/1391", "11/24/1992", "11/30/1992")],
                                schema=['first', 'second', 'third'])

# call withColumn to cast the original 'first' column and produce a new column 'test'
func = udf(lambda x: datetime.strptime(x, '%m/%d/%Y'), DateType())
df = df.withColumn('test', func(col('first')))

# print the data
df.show()
# print the schema
df.printSchema()
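# A hedged aside (not part of the original example): since Spark 2.2 the same conversion can be
# done without a Python UDF by using the built-in to_date with an explicit format pattern.
from pyspark.sql.functions import to_date

df = df.withColumn('test_builtin', to_date(col('first'), 'MM/dd/yyyy'))
df.printSchema()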
def main(): # create spark session spark = SparkSession.builder.master("local[*]").getOrCreate() spark.catalog.clearCache() # Connect to MariaDB Platform try: connection = mariadb.connect( user="******", password="******", # pragma: allowlist secret host="localhost", port=3306, database="baseball", ) except mariadb.Error as e: print(f"Error connecting to MariaDB Platform: {e}") sys.exit(1) printout("success connecting...") # setup schema schema = StructType([ StructField(name="game_id", dataType=IntegerType(), nullable=True), StructField(name="batter", dataType=IntegerType(), nullable=True), StructField(name="hit", dataType=IntegerType(), nullable=True), StructField(name="atbat", dataType=IntegerType(), nullable=True), StructField(name="local_date", dataType=DateType(), nullable=True), ]) # create empty spark dataframe using schema df = spark.createDataFrame(spark.sparkContext.emptyRDD(), schema) # import batter counts table and game table cursor = connection.cursor() count = 0 printout("creating table...") cursor.execute( f"SELECT bc.game_id, bc.batter, bc.Hit, bc.atbat, gt.local_date \ FROM batter_counts bc INNER JOIN game_temp gt on bc.game_id = gt.game_id ORDER BY game_id" ) printout("importing table...") for (game_id, batter, hit, atbat, local_date) in cursor: to_insert = spark.createDataFrame([ (game_id, batter, hit, atbat, local_date), ]) df = df.union(to_insert) count += 1 if count % 500 == 0: print(f"\timporting row {count}...") print(df.show(n=200)) df.createOrReplaceTempView("rolling_avg_temp") df.persist(StorageLevel.MEMORY_AND_DISK) # solve for rolling batting averages printout("solving for rolling batting averages...") rolling_df = spark.sql( f"""SELECT rat1.batter, SUM(rat2.Hit) AS sum_hits , SUM(rat2.atbat) AS sum_bats \ FROM rolling_avg_temp rat1 JOIN rolling_avg_temp rat2 ON rat2.local_date \ BETWEEN DATE_ADD(rat1.local_date, - 100) AND rat1.local_date AND \ rat1.batter = rat2.batter GROUP BY rat1.batter""") print(rolling_df.show(n=20)) rolling_df.createOrReplaceTempView("rolling_df") rolling_df.persist(StorageLevel.MEMORY_AND_DISK) # create array column of all necessary data printout("converting data to array...") rolling_df = spark.sql( """SELECT * , SPLIT(CONCAT(CASE WHEN batter IS NULL THEN "" \ ELSE batter END, " ", CASE WHEN sum_hits IS NULL OR sum_bats IS NULL THEN "" \ ELSE ROUND(sum_hits/sum_bats, 3) END), " ") \ AS array_with_rolling_averages FROM rolling_df""") print(rolling_df.show(n=20)) # fit array column to count vectorizer printout("running vectorizer and transformer...") count_vectorizer = CountVectorizer(inputCol="array_with_rolling_averages", outputCol="array_vector") count_vectorizer_fitted = count_vectorizer.fit(rolling_df) # transform the fitted count vectorizer rolling_df = count_vectorizer_fitted.transform(rolling_df) print(rolling_df.show(n=20, truncate=False)) return
import datetime
from decimal import Decimal
import uuid
from typing import Counter, Iterable, List, Tuple

from pyspark.sql import SparkSession
from pyspark.sql.types import DateType, DecimalType, Row, StringType, StructField, StructType

invoice_schema = StructType([
    StructField("invoice_id", StringType(), False),
    StructField("invoice_date", DateType(), False),
    StructField("due_date", DateType(), False),
    StructField("period_start_date", DateType(), False),
    StructField("period_end_date", DateType(), False),
    StructField("total_amount", DecimalType(scale=2), False),
    StructField("canonical_vendor_id", StringType(), False),
])

line_item_schema = StructType([
    StructField("invoice_id", StringType(), False),
    StructField("line_item_id", StringType(), False),
    StructField("period_start_date", DateType(), False),
    StructField("period_end_date", DateType(), False),
    StructField("total_amount", DecimalType(scale=2), False),
    StructField("canonical_line_item_id", StringType(), False),
])


def map_vendor_not_seen_in_a_while(p: Tuple[str, Iterable[Row]]):
    vendor_id, ins = p
    ins = sorted(ins, key=lambda i: i.invoice_date)
    for i, invoice in enumerate(ins):