def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, returns the equivalent Spark type. Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    if tpe in (str, "str", "string"):
        return types.StringType()
    elif tpe in (bytes,):
        return types.BinaryType()
    elif tpe in (np.int8, "int8", "byte"):
        return types.ByteType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    elif tpe in (int, "int", np.int, np.int32):
        return types.IntegerType()
    elif tpe in (np.int64, "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (float, "float", np.float):
        return types.FloatType()
    elif tpe in (np.float64, "float64", "double"):
        return types.DoubleType()
    elif tpe in (datetime.datetime, np.datetime64):
        return types.TimestampType()
    elif tpe in (datetime.date,):
        return types.DateType()
    elif tpe in (bool, "boolean", "bool", np.bool):
        return types.BooleanType()
    elif tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    else:
        raise TypeError("Type %s was not understood." % tpe)
def main():
    df = spark \
        .read \
        .format('kafka') \
        .option('kafka.bootstrap.servers', BOOTSTRAP_SERVERS) \
        .option('subscribe', TOPIC_NAME) \
        .option('group.id', GROUP_ID) \
        .option('startingOffsets', utils.get_starting_offsets(TOPIC_NAME)) \
        .load() \
        .cache()

    ads_data = df.select(
        F.from_json(F.col('value').cast('string'), kafka_schema).alias('json')
    ) \
        .select('json.*') \
        .withColumn('announcementid', F.col('announcementid').cast('long')) \
        .withColumn('floorNumber', F.col('floorNumber').cast('int')) \
        .withColumn('floorsCount', F.col('floorsCount').cast('int')) \
        .withColumn('roomsCount', F.col('roomsCount').cast('int')) \
        .withColumn('ptn_dadd', F.col('dateInserted').cast(T.DateType()))

    ads_data \
        .write \
        .format('orc') \
        .mode('append') \
        .partitionBy('ptn_dadd') \
        .saveAsTable(RESULT_TABLE)

    partition_offsets_mapping = {
        str(partition): offset + 1
        for partition, offset in df.groupBy('partition').agg({'offset': 'max'}).collect()
    }
    utils.dump_offsets(TOPIC_NAME, partition_offsets_mapping)
def main(inputs, output):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    sqlTrans = SQLTransformer(statement='SELECT *, dayofyear(date) AS day FROM __THIS__')
    sqlTrans1 = SQLTransformer(
        statement='SELECT today.station, today.date, today.latitude, today.longitude, '
                  'today.elevation, today.tmax, yesterday.tmax AS yesterday_tmax '
                  'FROM __THIS__ AS today '
                  'INNER JOIN __THIS__ AS yesterday '
                  'ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station')
    assemble_features = VectorAssembler(
        inputCols=['latitude', 'longitude', 'elevation', 'day', 'yesterday_tmax'],
        outputCol='features')
    gbt = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipeline = Pipeline(stages=[sqlTrans1, sqlTrans, assemble_features, gbt])

    weather_model = pipeline.fit(train)
    predictions = weather_model.transform(validation)
    # predictions.show()
    evaluator = RegressionEvaluator(labelCol='tmax', predictionCol='prediction', metricName='rmse')
    score = evaluator.evaluate(predictions)
    print("Root Mean Squared Error (RMSE) on validation data = %g" % score)
    weather_model.write().overwrite().save(output)
def expand_date(df):
    df = df.withColumn('Date', df.Date.cast(T.DateType()))
    return df \
        .withColumn('Year', F.year(df.Date)) \
        .withColumn('Month', F.month(df.Date)) \
        .withColumn('Week', F.weekofyear(df.Date)) \
        .withColumn('Day', F.dayofmonth(df.Date))
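# Minimal usage sketch for expand_date, assuming a SparkSession named `spark`
# and the same F/T imports as the snippet above; the sample data is illustrative.
sample = spark.createDataFrame([('2019-10-17',), ('2020-01-01',)], ['Date'])
expanded = expand_date(sample)
expanded.show()
# Expected columns: Date, Year, Month, Week, Day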
def main(input_dir, output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('created_utc_iso', types.DateType()),
        types.StructField('polarity_subjectivity', types.ArrayType(types.FloatType()))
    ])
    headlines_df = spark.read.json(input_dir, encoding='utf-8', schema=df_schema).repartition(80)

    split_sentiment_df = headlines_df.withColumn(
        'polarity', functions.element_at(headlines_df['polarity_subjectivity'], 1)
    ).withColumn(
        'subjectivity', functions.element_at(headlines_df['polarity_subjectivity'], 2)
    ).cache()

    for year_int in range(2008, 2020):
        print('Plotting for ' + str(year_int))
        headlines_year = split_sentiment_df.where(
            functions.year(split_sentiment_df['created_utc_iso']) == year_int
        ).withColumn('year', functions.year(split_sentiment_df['created_utc_iso']))

        headlines_grouped = headlines_year.groupBy(headlines_year['year']).agg(
            functions.collect_set(headlines_year['title_clean']).alias('titles_group')
        )
        headlines_joined = headlines_grouped.select(
            functions.array_join(headlines_grouped['titles_group'], ' ').alias('joined')
        )

        # only one row remaining of concatenated headlines
        string_to_plot = headlines_joined.collect()[0]['joined']

        wordcloud = WordCloud(background_color='white', stopwords=stopwords,
                              width=1000, height=500).generate(string_to_plot)
        wordcloud.to_file(output_dir + '/' + str(year_int) + '_words.png')
def main(inputs, output):
    '''define the schema'''
    tweets_schema = types.StructType([
        types.StructField('username', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('retweets', types.StringType()),
        types.StructField('favorites', types.StringType()),
        types.StructField('text', types.StringType()),
        types.StructField('geo', types.StringType()),
        types.StructField('mentions', types.StringType()),
        types.StructField('hashtags', types.StringType()),
        types.StructField('id', types.StringType()),
        types.StructField('permalink', types.StringType())
    ])

    # pass the schema when reading the input file to prevent Spark from
    # inferring the schema directly from the input
    df = spark.read.format("csv") \
        .option('header', 'true') \
        .option('delimiter', '\u0001') \
        .schema(tweets_schema) \
        .load(inputs)
    df = df.select('date', 'text', 'hashtags')

    # start preprocessing
    df = df.filter(df['text'].isNotNull())
    df = df.filter(df['hashtags'].isNotNull())
    df = df.withColumn('hashtags', lower(df['hashtags']))
    df = process_tweet_text(df)
    df = get_sentiment(df)
    df = get_party(df)
    df.show()
def infer_spark_type(typeclass) -> t.DataType:
    if typeclass in (None, NoneType):
        return t.NullType()
    elif typeclass is str:
        return t.StringType()
    elif typeclass in {bytes, bytearray}:
        return t.BinaryType()
    elif typeclass is bool:
        return t.BooleanType()
    elif typeclass is date:
        return t.DateType()
    elif typeclass is datetime:
        return t.TimestampType()
    elif typeclass is Decimal:
        return t.DecimalType(precision=36, scale=6)
    elif isinstance(typeclass, type) and issubclass(typeclass, BoundDecimal):
        (precision, scale) = typeclass.__constraints__
        return t.DecimalType(precision=precision, scale=scale)
    elif typeclass is float:
        return t.DoubleType()
    elif typeclass is int:
        return t.IntegerType()
    elif typeclass is long:
        return t.LongType()
    elif typeclass is short:
        return t.ShortType()
    elif typeclass is byte:
        return t.ByteType()
    elif getattr(typeclass, "__origin__", None) is not None:
        return infer_complex_spark_type(typeclass)
    elif is_pyspark_class(typeclass):
        return transform(typeclass)
    else:
        raise TypeError(f"Don't know how to represent {typeclass} in Spark")
def main(inputs, output, start_year, end_year):
    # Specifying the schema for the dataframe
    amazon_schema = types.StructType([
        types.StructField('marketplace', types.StringType()),
        types.StructField('customer_id', types.IntegerType()),
        types.StructField('review_id', types.StringType()),
        types.StructField('product_id', types.StringType()),
        types.StructField('product_parent', types.LongType()),
        types.StructField('product_title', types.StringType()),
        types.StructField('product_category', types.StringType()),
        types.StructField('star_rating', types.IntegerType()),
        types.StructField('helpful_votes', types.IntegerType()),
        types.StructField('total_votes', types.IntegerType()),
        types.StructField('vine', types.StringType()),
        types.StructField('verified_purchase', types.StringType()),
        types.StructField('review_headline', types.StringType()),
        types.StructField('review_body', types.StringType()),
        types.StructField('review_date', types.DateType())
    ])

    # Loading the data into a dataframe
    raw_dataset = spark.read.option('sep', '\t').csv(inputs, schema=amazon_schema, header='true')
    raw_dataset = raw_dataset.repartition(96)
    # print("No of rows in raw_dataset:", raw_dataset.count())

    # Keeping only those rows which are verified purchases
    verified_purchases_df = raw_dataset.filter(col('verified_purchase') == "Y").cache()
    # print("No of rows in verified_purchases_df:", verified_purchases_df.count())

    # 10-core products only - keeping only the products which have more than 10 reviews
    product_count = verified_purchases_df.groupby('product_id').count().filter(col('count') > 10)
    ten_core_dataset = verified_purchases_df.join(
        broadcast(product_count.select('product_id')), on='product_id')
    ten_core_dataset.registerTempTable('ten_core_dataset')
    # print("No of rows in ten_core_dataset:", ten_core_dataset.count())

    # Selecting data in the given time range
    sliced_data = spark.sql(
        "SELECT * from ten_core_dataset WHERE year(review_date) BETWEEN " +
        start_year + " AND " + end_year)
    # sliced_data = spark.sql("SELECT * from ten_core_dataset WHERE year(review_date) BETWEEN "+str(2010)+" AND "+str(2015))
    # sliced_data.registerTempTable("sliced_data")
    print("No of rows in sliced_dataset:", sliced_data.count())

    # splitting the datasets year-wise
    # years_year = [2010, 2011, 2012, 2013, 2014, 2015]
    # for i in years_year:
    #     split_to_years = spark.sql("SELECT * from sliced_data WHERE year(review_date)="+str(i))
    #     print("No of rows in", i, split_to_years.count())
    #     split_to_years.write.partitionBy('product_category').parquet(output+"_"+str(i))

    # Storing the data partitioned on product categories for easy access later on
    sliced_data.write.partitionBy('product_category').parquet(output)
def import_data(request):
    if request.method == 'POST':
        new_file = next(iter(request.FILES.values()))
        # new_file = request.FILES['myfile']
        path = 'data/' + str(new_file.name)

        project_id = request.user.project.id
        company_name = request.user.project.company

        data = Spark.sc.textFile(path)
        header = data.first()

        # Build the schema: any column whose name contains 'Date' becomes a
        # DateType field, everything else is read as a string.
        fields = []
        date_column = None
        drop_column = None
        for index, field_name in enumerate(header.split(',')):
            if 'Date' in str(field_name):
                date_column = index
                drop_column = field_name
                fields.append(typ.StructField(field_name, typ.DateType(), True))
            else:
                fields.append(typ.StructField(field_name, typ.StringType(), True))
        schema = typ.StructType(fields)

        data = data.filter(lambda row: row != header) \
            .map(lambda row: [dt.strptime(elem, '%d/%m/%Y') if index == date_column else str(elem)
                              for index, elem in enumerate(row.split(','))])

        data_df = Spark.sqlContext.createDataFrame(data, schema)
        table_name = str(company_name) + '_Test'

        if drop_column:
            # drop() returns a new DataFrame; keep the result instead of discarding it
            data_df = data_df.drop(drop_column)

        if date_column is not None:
            try:
                unique_value = CustomFields.objects.get(project_id=project_id)
                unique_value.date_column = date_column
                unique_value.save()
            except CustomFields.DoesNotExist:
                pass

        data_df.write.format('jdbc').options(
            url='jdbc:mysql://localhost:3306/disease',
            dbtable=table_name,
            user='******',
            password='******').mode('append').save()

    return render(request, "import_data.html")
def main(inputs, keyspace, table):
    if table == "yelp_business":
        business_schema = StructType([
            types.StructField('business_id', types.StringType(), True),
            types.StructField('name', types.StringType(), True),
            types.StructField('neighborhood', types.StringType(), True),
            types.StructField('address', types.StringType(), True),
            types.StructField('city', types.StringType(), True),
            types.StructField('state', types.StringType(), True),
            types.StructField('postal_code', types.StringType(), True),
            types.StructField('latitude', types.FloatType(), True),
            types.StructField('longitude', types.FloatType(), True),
            types.StructField('stars', types.FloatType(), True),
            types.StructField('review_count', types.LongType(), True),
            types.StructField('is_open', types.IntegerType(), True)
        ])
        business = spark.read.json(inputs, schema=business_schema)
        df = business.drop('neighborhood').filter(business.is_open == 1)
        df.cache()

        business_data = sc.textFile(inputs).map(json_key_value_1).map(
            lambda x: Row(x[0], x[1], x[2], x[3]))
        df_1 = business_data.toDF()
        df_2 = df_1.withColumnRenamed("_1", "bus_id") \
            .withColumnRenamed("_2", "attributes") \
            .withColumnRenamed("_3", "categories") \
            .withColumnRenamed("_4", "hours")
        df_2.cache()
        result = df.join(df_2, df.business_id == df_2.bus_id, how='inner').drop(df_2.bus_id)

    elif table == "yelp_checkin":
        checkin_data = sc.textFile(inputs).map(json_key_value_2).map(
            lambda x: Row(str(uuid.uuid1()), x[0], x[1]))
        df = checkin_data.toDF().cache()
        df_1 = df.withColumnRenamed("_1", "id") \
            .withColumnRenamed("_2", "time") \
            .withColumnRenamed("_3", "business_id")
        result = df_1

    elif table == "yelp_review":
        reviews_schema = types.StructType([
            types.StructField('business_id', types.StringType(), True),
            types.StructField('cool', types.LongType(), True),
            types.StructField('date', types.DateType(), True),
            types.StructField('funny', types.LongType(), True),
            types.StructField('review_id', types.StringType(), True),
            types.StructField('stars', types.LongType(), True),
            types.StructField('text', types.StringType(), True),
            types.StructField('useful', types.LongType(), True),
            types.StructField('user_id', types.StringType(), True)
        ])
        reviews = spark.read.json(inputs, schema=reviews_schema)
        uuidUdf = udf(lambda: str(uuid.uuid1()), types.StringType())
        result = reviews.withColumn("id", uuidUdf())

    result.repartition(300).write.format("org.apache.spark.sql.cassandra") \
        .options(table=table, keyspace=keyspace).save()
def schema():
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    return tmax_schema
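# Minimal usage sketch for schema() (assumes a SparkSession named `spark`;
# the input path is illustrative).
tmax = spark.read.csv('weather-data/', schema=schema())
tmax.printSchema()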
def __get_basic_schema(self):
    return TableSchema(
        [
            t.StructField("name", t.StringType(), False),
            t.StructField("date", t.DateType(), False),
            t.StructField("visits", t.IntegerType(), True),
        ],
        primary_key=["name", "date"],
        partition_by=["name"],
    )
def test_repartition_by_druid_segment_size(spark):
    add_dataframe_druid_extension()

    schema = t.StructType([
        t.StructField('date', t.DateType()),
        t.StructField('country', t.StringType()),
        t.StructField('dau', t.IntegerType()),
        t.StructField('revenue', t.DoubleType()),
    ])
    rows = [
        row(date=to_date("2019-10-17"), country="US", dau=50, revenue=100.0),
        row(date=to_date("2019-10-17"), country="GB", dau=20, revenue=20.0),
        row(date=to_date("2019-10-17"), country="DE", dau=20, revenue=20.0),
        row(date=to_date("2019-10-16"), country="US", dau=50, revenue=100.0),
        row(date=to_date("2019-10-16"), country="FI", dau=20, revenue=20.0),
        row(date=to_date("2019-10-16"), country="GB", dau=20, revenue=20.0),
        row(date=to_date("2019-10-16"), country="DE", dau=20, revenue=20.0)
    ]
    df: DataFrame = get_df(spark, rows, schema)

    # note how we can call .repartitionByDruidSegmentSize directly on Dataset[Row];
    # the nice thing is this allows continuous method chaining on Dataset without breaking the chain
    df = df.repartition_by_druid_segment_size('date', segment_granularity='DAY', rows_per_segment=2)

    # group & count: we can't know which exact rows end up in each partition within
    # the same date, but we do know how many partitions there should be for each date
    df = df.groupBy('__PARTITION_TIME__', '__PARTITION_NUM__').count()

    expected: DataFrame = get_df(
        spark,
        [
            row(__PARTITION_TIME__=to_date("2019-10-17"), __PARTITION_NUM__=0, count=2),
            row(__PARTITION_TIME__=to_date("2019-10-16"), __PARTITION_NUM__=1, count=2),
            row(__PARTITION_TIME__=to_date("2019-10-17"), __PARTITION_NUM__=1, count=1),
            row(__PARTITION_TIME__=to_date("2019-10-16"), __PARTITION_NUM__=0, count=2),
        ],
        t.StructType([
            t.StructField('__PARTITION_TIME__', t.TimestampType()),
            t.StructField('__PARTITION_NUM__', t.IntegerType()),
            t.StructField('count', t.LongType()),
        ]))

    assert_df(df, expected)
def invoice_dataframe(self, invoice_source):
    """
    Fact Invoice records and attributes from dataA Sources
    """
    ri = (
        self.read_source(source=invoice_source)
        .where("business_unit_id == '10'")
        .where("sale_type in ('I', 'E')")
        .where("system_id not in ('SA', '30')")
        # TODO Add year/period filter into config
        .where("concat(year,period) > '201609'"))

    ri = ri.withColumn('iptmeta_source_system', F.lit('dataA'))
    ri = ri.withColumn(
        'ship1_material_id_int',
        F.when(ri.ship_mat1_id.rlike('[^0-9]+'), F.lit(None)).otherwise(
            ri.ship_mat1_id.cast(T.IntegerType())))

    lstrip_0_udf = lstrip_0()
    ri = ri.withColumn('sold_customer_id_lstrip_0', lstrip_0_udf(ri.sold_customer_id))
    ri = ri.withColumn('ship_customer_id_lstrip_0', lstrip_0_udf(ri.ship_customer_id))

    # Strip leading zeros from numeric material_id's
    ri = ri.withColumn(
        'mmf_material',
        F.concat(ri.system_id, F.lit('/'),
                 F.coalesce(ri.ship1_material_id_int, ri.ship_mat1_id)))
    ri = ri.withColumn(
        'commercial_print_customer_key',
        F.concat(ri.system_id, F.lit('/'), ri.sold_customer_id_lstrip_0,
                 ri.system_id, F.lit('/'), ri.ship_customer_id_lstrip_0))

    ri = ri.withColumn("inv_date", F.col('inv_date').cast(T.DateType()))
    ri = ri.withColumn("inv_month", F.month(F.col('inv_date')))
    ri = ri.withColumn("inv_year", F.year(F.col('inv_date')))

    ri = ri.withColumn('invoice_volume', F.coalesce(ri.line_qty, F.lit(MISSING_NUMBER)))
    ri = ri.withColumn('invoice_uom_id',
                       F.coalesce(ri.invoice_uom_id, F.lit(MISSING_STRING_ID)))
    ri = ri.withColumnRenamed('sales_rep_id', 'ri_sales_rep_id')

    # some lines have multiple quality class values so if any are prime we treat the whole line as GOOD
    ri = ri.withColumn(
        'prime_flag',
        F.max(
            F.when(F.isnull(ri.quality_class), 1)
            .when(ri.quality_class == 'GOOD', 1)
            .otherwise(0)).over(
                W.partitionBy(ri.system_id, ri.invoice_id, ri.line_number)))
    ri = ri.withColumn(
        'quality_class',
        F.when(ri.prime_flag == 1, F.lit('GOOD')).otherwise(ri.quality_class))

    return ri
def process_user_json(input_json_user):
    user_schema = types.StructType([
        types.StructField('user_id', types.StringType(), True),
        types.StructField('average_stars', types.DoubleType(), True),
        types.StructField('review_count', types.LongType(), True),
        types.StructField('yelping_since', types.DateType(), True)
    ])
    users_df = spark.read.json(input_json_user, schema=user_schema).repartition(100)
    write_to_cassandra(users_df, TABLE_USER)
def myschema():
    comments_schema = types.StructType([
        types.StructField('index', types.IntegerType()),
        types.StructField('listing_id', types.IntegerType()),
        types.StructField('id', types.IntegerType()),
        types.StructField('date', types.DateType()),
        types.StructField('reviewer_id', types.IntegerType()),
        types.StructField('reviewer_name', types.StringType()),
        types.StructField('comments', types.StringType())
    ])
    return comments_schema
def process_etl_immig(spark):
    """ Function to load data from the dataset, process it and write to parquet files """
    # Reading Immigration data
    immig_df = spark.read.format('com.github.saurfang.sas.spark').load(Inpath + 'i94_apr16_sub.sas7bdat')
    # immig_df = spark.read.options(header='True', inferSchema='True', delimiter=',').csv(Inpath + "immigration_data_apr16.csv")

    # Reading supporting documents for immigration data
    i94res_df = spark.read.options(header='True', inferSchema='True', delimiter=',') \
        .csv(Inpath + "i94res_country_codes_immigration.csv")
    i94res_df = i94res_df.drop("_c3")
    i94res_df = i94res_df.dropna(how='any', subset=['i94res', 'Country']).drop_duplicates()

    i94port_df = spark.read.options(header='True', inferSchema='True', delimiter=';') \
        .csv(Inpath + "i94port_city_codes_immigration.csv")
    i94port_df = i94port_df.drop("_c3")
    i94port_df = i94port_df.dropna(how='any', subset=['i94port', 'City', 'State_CD']).drop_duplicates()

    # Cleaning the data
    immig_df = immig_df.dropna(how='any', subset=['cicid', 'i94res', 'i94port', 'arrdate',
                                                  'i94addr', 'i94bir', 'gender', 'visatype'])
    immig_df = immig_df.drop_duplicates()

    # SAS dates are stored as days since 1960-01-01; convert them to real dates
    get_date_sas = udf(lambda x: (datetime(1960, 1, 1) + timedelta(days=int(x))), T.DateType())
    immig_df = immig_df.withColumn("arrival_date", get_date_sas(immig_df.arrdate))

    # create views of immigration and supporting data to extract using SQL queries
    immig_df.createOrReplaceTempView("immigration_data")
    i94res_df.createOrReplaceTempView("country_data")
    i94port_df.createOrReplaceTempView("port_data")

    # extract columns to create the staging immigration table
    stg_immig = spark.sql("""SELECT DISTINCT CAST(id.cicid AS INT) AS ID,
                                    INITCAP(cd.Country)    AS origin_country,
                                    INITCAP(pd.City)       AS city,
                                    id.i94addr             AS state_cd,
                                    id.arrival_date,
                                    CAST(id.i94bir AS INT) AS age,
                                    id.gender              AS gender,
                                    id.visatype            AS visa_type
                             FROM immigration_data id
                             JOIN country_data cd ON id.i94res = cd.i94res
                             JOIN port_data pd ON id.i94port = pd.i94port AND id.i94addr = pd.State_CD
                             WHERE city IS NOT NULL """)
    # stg_immig.show(25)

    stg_immig.write.parquet(Outpath + "/immigration", mode='overwrite')
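# Sketch of a UDF-free alternative to get_date_sas above: SAS dates are days
# since 1960-01-01, so the built-in date_add can do the conversion natively
# (assumes a SparkSession named `spark`; the sample arrdate values are illustrative).
from pyspark.sql import functions as F

demo = spark.createDataFrame([(20574.0,), (20551.0,)], ['arrdate'])
demo = demo.withColumn(
    'arrival_date',
    F.expr("date_add(to_date('1960-01-01'), cast(arrdate as int))"))
demo.show()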
def main(inputs, keyspace, table):
    schema = types.StructType([
        types.StructField('id', types.StringType(), True),
        types.StructField('host', types.StringType(), True),
        types.StructField('datetime', types.DateType(), True),
        types.StructField('path', types.StringType(), True),
        types.StructField('bytes', types.IntegerType(), True)
    ])
    fields = spark.sparkContext.textFile(inputs).flatMap(getFields)
    data = spark.createDataFrame(fields, schema)
    data.write.format("org.apache.spark.sql.cassandra").options(
        table=table, keyspace=keyspace).save()
def transform_tealium_event_dataframe_for_date(sql_context, target_date, df_events):
    df_events = (
        df_events
        .withColumn(
            'date',
            F.udf(lambda x: datetime.datetime.strptime(x, '%Y%m%d'), T.DateType())(
                F.col('data.udo.yyyymmdd')))
        .select(
            F.col('post_time').alias("microtime"),
            F.col('data.firstparty_tealium_cookies.device_id').alias("device_id"),
            F.col('data.udo.user_agent').alias('user_agent'),
            'date',
            F.col('data.dom.referrer').alias('referrer'))
        .filter(F.col('date') == target_date)
        .distinct())
    return df_events
def get_dtypes_spark(type):
    # Map a dtype name to the corresponding Spark SQL type instance
    switcher = {
        'int32': st.IntegerType(),
        'int64': st.LongType(),
        'float32': st.FloatType(),
        'float64': st.DoubleType(),
        'date64': st.DateType(),  # TimestampType
        'str': st.StringType(),
        'boolean': st.BooleanType()
    }
    # Return the Spark type for the given dtype name ("nothing" if it is unknown)
    func = switcher.get(type, "nothing")
    return func
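# Minimal usage sketch for get_dtypes_spark, assuming pyspark.sql.types is
# imported as `st` as in the snippet above; the column/dtype mapping is illustrative.
from pyspark.sql import types as st

columns = {'order_id': 'int64', 'order_date': 'date64', 'amount': 'float64'}
schema = st.StructType([
    st.StructField(name, get_dtypes_spark(dtype), True)
    for name, dtype in columns.items()
])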
def as_spark_type(tpe) -> types.DataType:
    """
    Given a Python type, returns the equivalent spark type.
    Accepts:
    - the built-in types in Python
    - the built-in types in numpy
    - list of pairs of (field_name, type)
    - dictionaries of field_name -> type
    - Python3's typing system
    """
    # TODO: Add "boolean" and "string" types.
    # ArrayType
    if tpe in (np.ndarray,):
        return types.ArrayType(types.StringType())
    elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):
        return types.ArrayType(as_spark_type(tpe.__args__[0]))
    # BinaryType
    elif tpe in (bytes, np.character, np.bytes_, np.string_):
        return types.BinaryType()
    # BooleanType
    elif tpe in (bool, np.bool, "bool", "?"):
        return types.BooleanType()
    # DateType
    elif tpe in (datetime.date,):
        return types.DateType()
    # NumericType
    elif tpe in (np.int8, np.byte, "int8", "byte", "b"):
        return types.ByteType()
    elif tpe in (decimal.Decimal,):
        # TODO: considering about the precision & scale for decimal type.
        return types.DecimalType(38, 18)
    elif tpe in (float, np.float, np.float64, "float", "float64", "double"):
        return types.DoubleType()
    elif tpe in (np.float32, "float32", "f"):
        return types.FloatType()
    elif tpe in (np.int32, "int32", "i"):
        return types.IntegerType()
    elif tpe in (int, np.int, np.int64, "int", "int64", "long", "bigint"):
        return types.LongType()
    elif tpe in (np.int16, "int16", "short"):
        return types.ShortType()
    # StringType
    elif tpe in (str, np.unicode_, "str", "U"):
        return types.StringType()
    # TimestampType
    elif tpe in (datetime.datetime, np.datetime64, "datetime64[ns]", "M"):
        return types.TimestampType()
    else:
        raise TypeError("Type %s was not understood." % tpe)
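# Minimal usage sketch for as_spark_type (assumes the same imports as above:
# datetime, numpy as np, and pyspark.sql.types as types; Python 3.7+ so that
# List[str].__origin__ is the built-in list).
from typing import List

assert as_spark_type(int) == types.LongType()
assert as_spark_type(datetime.date) == types.DateType()
assert as_spark_type("float64") == types.DoubleType()
assert as_spark_type(List[str]) == types.ArrayType(types.StringType())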
def main(inputs, output):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(inputs, schema=tmax_schema)

    query = (
        "SELECT t.station AS station, t.date AS date, t.day AS day, t.latitude AS latitude, "
        "t.longitude AS longitude, t.elevation AS elevation, t.tmax AS tmax, "
        "y.tmax AS tmax_yesterday "
        "FROM (SELECT station, date, latitude, longitude, elevation, tmax, "
        "DAYOFYEAR(date) AS day, date_sub(date,1) AS date_yesterday FROM __THIS__) t, "
        "(SELECT station, date, latitude, longitude, elevation, tmax, "
        "DAYOFYEAR(date) AS day, date_sub(date,1) AS date_yesterday FROM __THIS__) y "
        "WHERE t.date = y.date_yesterday AND t.station = y.station")
    sqlTrans = SQLTransformer(statement=query)

    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    # train.show()
    validation = validation.cache()

    assembler = VectorAssembler(
        inputCols=["latitude", "longitude", "elevation", "day", "tmax_yesterday"],
        outputCol="features")
    classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipeline = Pipeline(stages=[sqlTrans, assembler, classifier])

    model = pipeline.fit(train)
    predictions = model.transform(validation)
    predictions.show()

    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax', metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    print("R-square for the validation data is: " + str(r2))

    model.write().overwrite().save(output)

    r2 = r2_evaluator.evaluate(model.transform(train))
    print("R-square for the training data is: " + str(r2))
    print(model.stages[-1].featureImportances)

    sfu_predict = [("sfu", datetime.date(2018, 11, 12), 49.2771, -122.9146, 330.0, 12.0),
                   ("sfu", datetime.date(2018, 11, 13), 49.2771, -122.9146, 330.0, 12.0)]
    sfu_predict_df = spark.createDataFrame(sfu_predict, schema=tmax_schema)
    sfu_predict_df.show()
    sfu_predictions = model.transform(sfu_predict_df).select('station', 'date', 'prediction')
    sfu_predictions.show()
def invoice_dataframe(self, invoice_source):
    inv = self.read_source(source=invoice_source)

    convert_date = F.udf(
        lambda xdate: datetime.datetime.strptime(xdate, '%Y%m%d')
        if len(xdate) == 8 else datetime.datetime(1, 1, 1, 0, 0),
        T.DateType())

    inv = inv.withColumnRenamed('invoice_date', 'invoice_date_original')
    # Invoice_date must be 10 or more
    inv = inv.withColumn(
        'invoice_date',
        convert_date(inv.invoice_date_original.cast(T.StringType())))

    # Filter the invoice dataframe to simplify later processing
    inv = self.dataB_filter_plantvals(inv)
    inv = self.dataB_filter_report_date(inv)

    return inv
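# Sketch of a UDF-free alternative to the conversion above: to_date with an
# explicit format parses 'yyyyMMdd' strings directly (assumes a SparkSession
# named `spark`; the sample data is illustrative, and unparseable values become
# NULL instead of the 0001-01-01 sentinel used by the UDF).
from pyspark.sql import functions as F

demo = spark.createDataFrame([('20191017',), ('bad',)], ['invoice_date_original'])
demo = demo.withColumn('invoice_date', F.to_date('invoice_date_original', 'yyyyMMdd'))
demo.show()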
def load_to_df(self, json_file):
    temp_df = self.sqlContext.read.json(json_file)
    df = temp_df.select(
        col('id').cast(types.IntegerType()),
        col('name'),
        col('has_test'),
        col('published_at').cast(types.DateType()),
        col('created_at').cast(types.DateType()),
        col('url'),
        col('area.name').alias('area_name'),
        col('salary.from').alias('salary_from'),
        col('salary.to').alias('salary_to'),
        col('salary.currency').alias('salary_currency'),
        col('address.street').alias('address_street'),
        col('address.building').alias('address_building'),
        col('address.raw').alias('address_raw'),
        col('address.metro.station_name').alias('metro_name'),
        col('employer.id').alias('employer_id').cast(types.IntegerType()),
        col('employer.name').alias('employer_name'),
        col('snippet.requirement').alias('snippet_requirement'),
        col('snippet.responsibility').alias('snippet_responsibility'),
        col('contacts.name').alias('contacts_name'),
        col('contacts.email').alias('contacts_email'))
    return df
def process_immig_data(spark, input_data, output_data):
    """
    Summary: procedure to process the immigration data, clean it, and write the
    result back to S3 as parquet files partitioned by year and month.

    Parameters
    spark       - The spark session created in the main function
    input_data  - The location of the root folder on S3 under which all the input files are stored.
    output_data - The location of the root folder on S3 under which all the processed parquet files will be stored.

    A Python helper is used to convert the SAS epoch dates in the data to datetime values.
    """
    # get filepath to log data file
    log_data = input_data + 'log_data/*'

    # read immigration data file
    immg_data = spark.read.format('csv').options(header='true', delimiter=',') \
        .load('immigration_data_sample.csv')

    def convert_datetime(x):
        # SAS dates are days since 1960-01-01; invalid values become None
        try:
            start = datetime(1960, 1, 1)
            return start + timedelta(days=int(x))
        except:
            return None

    # cleanup
    udf_datetime_from_sas = udf(lambda x: convert_datetime(x), T.DateType())

    immg_data = immg_data \
        .withColumn("i94yr", col("i94yr").cast("integer")) \
        .withColumn("i94mon", col("i94mon").cast("integer")) \
        .withColumn("i94cit", col("i94cit").cast("integer")) \
        .withColumn("i94res", col("i94res").cast("integer")) \
        .withColumn("i94visa", col("i94visa").cast("integer")) \
        .withColumn("biryear", col("biryear").cast("integer")) \
        .withColumn("admnum", col("admnum").cast("integer")) \
        .withColumn("arrival_date", udf_datetime_from_sas(col("arrdate").cast("integer"))) \
        .withColumn("departure_date", udf_datetime_from_sas(col("depdate").cast("integer")))

    # drop duplicates
    immg_data = immg_data.distinct()
    immg_data.createOrReplaceTempView("immg_data")

    # write the immigration table to parquet files partitioned by year and month
    immg_data.write.partitionBy("i94yr", "i94mon").mode("overwrite") \
        .parquet(os.path.join(output_data, 'immg_data.parquet'))
def main(input_dir, keyspace, table):
    data = spark.sparkContext.textFile(input_dir)
    request = data.map(parse)
    df_schema = types.StructType([
        types.StructField('id', types.StringType(), True),
        types.StructField('host', types.StringType(), True),
        types.StructField('datetime', types.DateType(), True),
        types.StructField('path', types.StringType(), True),
        types.StructField('bytes', types.IntegerType(), True)
    ])
    df = spark.createDataFrame(request, df_schema).dropna().repartition('host')
    df.write.format("org.apache.spark.sql.cassandra").mode('overwrite') \
        .option('confirm.truncate', True) \
        .options(table=table, keyspace=keyspace).save()
def main(input, model_file):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(input, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25], seed=123)
    train = train.cache()
    validation = validation.cache()

    y_tmax = SQLTransformer(
        statement="SELECT today.station, today.latitude, today.longitude, today.elevation, "
                  "today.date, today.tmax, yesterday.tmax AS yesterday_tmax "
                  "FROM __THIS__ AS today "
                  "INNER JOIN __THIS__ AS yesterday "
                  "ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station")
    getvalues = SQLTransformer(
        statement="SELECT station, latitude, longitude, elevation, dayofyear(date) AS dayofyear, "
                  "tmax, yesterday_tmax FROM __THIS__")
    assemble_features = VectorAssembler(
        inputCols=['latitude', 'longitude', 'elevation', 'dayofyear', 'yesterday_tmax'],
        outputCol='features')
    classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
    pipeline = Pipeline(stages=[y_tmax, getvalues, assemble_features, classifier])

    model = pipeline.fit(train)
    predictions = model.transform(validation)

    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax', metricName='r2')
    r2 = r2_evaluator.evaluate(predictions)
    print('-----------------------------------')
    print('r2: %g' % (r2,))
    print('-----------------------------------')

    rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax', metricName='rmse')
    rmse = rmse_evaluator.evaluate(predictions)
    print('rmse: %g' % (rmse,))

    model.write().overwrite().save(model_file)
def main(inputs, model_file):
    tmax_schema = types.StructType([
        types.StructField('station', types.StringType()),
        types.StructField('date', types.DateType()),
        types.StructField('latitude', types.FloatType()),
        types.StructField('longitude', types.FloatType()),
        types.StructField('elevation', types.FloatType()),
        types.StructField('tmax', types.FloatType()),
    ])
    data = spark.read.csv(inputs, schema=tmax_schema)
    train, validation = data.randomSplit([0.75, 0.25])
    train = train.cache()
    validation = validation.cache()

    # data_process = SQLTransformer(statement="SELECT *, dayofyear(date) AS day_of_year FROM __THIS__ ")
    data_process = SQLTransformer(
        statement="SELECT today.latitude, today.longitude, today.tmax AS tmax, today.elevation, "
                  "dayofyear(today.date) AS day_of_year, yesterday.tmax AS yesterday_tmax "
                  "FROM __THIS__ AS today "
                  "INNER JOIN __THIS__ AS yesterday "
                  "ON date_sub(today.date, 1) = yesterday.date AND today.station = yesterday.station")

    # assemble_features = VectorAssembler(inputCols=['latitude', 'longitude', 'elevation', 'day_of_year'], outputCol='features')
    assemble_features = VectorAssembler(
        inputCols=['latitude', 'longitude', 'elevation', 'day_of_year', 'yesterday_tmax'],
        outputCol='features')
    classifier = GBTRegressor(featuresCol='features', labelCol='tmax')
    # classifier = GeneralizedLinearRegression(featuresCol='features', labelCol='tmax', family='gaussian', link='identity')
    pipeline = Pipeline(stages=[data_process, assemble_features, classifier])

    model = pipeline.fit(train)
    predictions = model.transform(validation)
    # predictions.show()

    r2_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax', metricName='r2')
    rmse_evaluator = RegressionEvaluator(predictionCol='prediction', labelCol='tmax', metricName='rmse')
    r2 = r2_evaluator.evaluate(predictions)
    rmse = rmse_evaluator.evaluate(predictions)
    print('r-square for GBT model: %g' % (r2,))
    print('root mean square error for GBT model: %g' % (rmse,))

    model.write().overwrite().save(model_file)
def test_prepending_a_mapping_with_duplicated_columns(self, input_columns, mapped_df):
    """Output schema is correct for a newly prepended mapping with columns that are
    also included in the input schema"""
    new_mapping = [
        ("created_date", "meta.created_at_sec", "DateType"),
        ("birthday", "birthday", "DateType"),
    ]
    new_columns = [name for (name, path, data_type) in new_mapping]
    new_columns_deduplicated = [x for x in new_columns if x not in input_columns]

    new_mapped_df = Mapper(
        mapping=new_mapping,
        mode="prepend",
        ignore_missing_columns=True).transform(mapped_df)

    assert new_columns_deduplicated + input_columns == new_mapped_df.columns
    assert mapped_df.schema["birthday"].dataType == T.TimestampType()
    assert new_mapped_df.schema["birthday"].dataType == T.DateType()
def get_spark_type(field, required_type):
    if isinstance(required_type, type(db_types.DATE())):
        return spk_types.StructField(field, spk_types.DateType(), True)
    elif isinstance(required_type, type(db_types.DATETIME())):
        return spk_types.StructField(field, spk_types.TimestampType(), True)
    elif isinstance(required_type, type(db_types.VARCHAR())):
        return spk_types.StructField(field, spk_types.StringType(), True)
    elif isinstance(required_type, type(db_types.INT())):
        # The db types are enforced earlier than the Spark ones, so the Spark type
        # needs to be less restrictive than the db type; choose LongType instead of IntegerType.
        return spk_types.StructField(field, spk_types.LongType(), True)
    elif isinstance(required_type, type(db_types.FLOAT())):
        return spk_types.StructField(field, spk_types.FloatType(), True)
    elif isinstance(required_type, type(db_types.BOOLEAN())):
        return spk_types.StructField(field, spk_types.BooleanType(), True)
    else:
        raise Exception(
            "Type not recognized, field={}, required_type={}".format(field, required_type))
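# Minimal usage sketch for get_spark_type, assuming `db_types` refers to
# sqlalchemy.types and `spk_types` to pyspark.sql.types (the column names and
# mapping below are illustrative, not from the original source).
from sqlalchemy import types as db_types
from pyspark.sql import types as spk_types

required = {'visit_date': db_types.DATE(), 'visits': db_types.INT(), 'source': db_types.VARCHAR()}
schema = spk_types.StructType(
    [get_spark_type(field, col_type) for field, col_type in required.items()])
# visit_date -> DateType, visits -> LongType, source -> StringType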