def test_pandas_udf_arrow_overflow(self): from distutils.version import LooseVersion import pandas as pd import pyarrow as pa df = self.spark.range(0, 1) @pandas_udf(returnType="byte") def udf(column): return pd.Series([128]) # Arrow 0.11.0+ allows enabling or disabling safe type check. if LooseVersion(pa.__version__) >= LooseVersion("0.11.0"): # When enabling safe type check, Arrow 0.11.0+ disallows overflow cast. with self.sql_conf({ "spark.sql.execution.pandas.arrowSafeTypeConversion": True}): with self.assertRaisesRegexp(Exception, "Exception thrown when converting pandas.Series"): df.withColumn('udf', udf('id')).collect() # Disabling safe type check, let Arrow do the cast anyway. with self.sql_conf({"spark.sql.execution.pandas.arrowSafeTypeConversion": False}): df.withColumn('udf', udf('id')).collect() else: # SQL config `arrowSafeTypeConversion` no matters for older Arrow. # Overflow cast causes an error. with self.sql_conf({"spark.sql.execution.pandas.arrowSafeTypeConversion": False}): with self.assertRaisesRegexp(Exception, "Integer value out of bounds"): df.withColumn('udf', udf('id')).collect()
def make_prediction(event, df): event_timestamp, event_dayofweek, pickup_lat, pickup_lon, dropoff_lat, dropoff_lon, event_passenger_count = event[0], event[1], event[2], event[3], event[4], event[5], event[6] udf_diff_timeofday=udf(utils.diff_timeofday, IntegerType()) udf_shortest_distance=udf(utils.shortest_distance, FloatType()) df = df.withColumn("diff_timeofday", udf_diff_timeofday(df.pickup, lit(event_timestamp))).filter("`diff_timeofday` < 30") df = df.withColumn("event_sum_distance", udf_shortest_distance(df.pick_lat, df.pick_lon, lit(pickup_lat), lit(pickup_lon))+udf_shortest_distance(df.drop_lat, df.drop_lon, lit(dropoff_lat), lit(dropoff_lon))).filter("`event_sum_distance` < 2") df = df.sort('event_sum_distance') if df.count() < 10: return [0,0] a = pd.DataFrame(df.take(50)) a.columns = df.columns speed_array = a.as_matrix(["avg_speed"]) dist_sf_array = a.as_matrix(["dist_sf"]) distance_array = a["trip_distance"].tolist() fare_array = a["total_notip"].tolist() time_array = a["trip_time_in_secs"].tolist() #set initial parameter values x0 = [0.5, 0.5, 3.0, 3.0] bnds = ((0.25, 0.75), (0.25, 0.75), (0.1,20), (0,10)) #perform the fit res = optimize.minimize(func_to_optimize, x0, args=(distance_array, time_array, fare_array), method='TNC', bounds=bnds) grid_dist = utils.grid_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon) #get the predictions time_pred = utils.time_prediction(speed_array.mean(), grid_dist, dist_sf_array.mean()) fare_pred = utils.fare_prediction(res.x[0], grid_dist, dist_sf_array.mean(), res.x[1], res.x[2], res.x[3]) if res.success == True: return [fare_pred, time_pred] else: return [0,0]
def test_pandas_udf_detect_unsafe_type_conversion(self): from distutils.version import LooseVersion import pandas as pd import numpy as np import pyarrow as pa values = [1.0] * 3 pdf = pd.DataFrame({'A': values}) df = self.spark.createDataFrame(pdf).repartition(1) @pandas_udf(returnType="int") def udf(column): return pd.Series(np.linspace(0, 1, 3)) # Since 0.11.0, PyArrow supports the feature to raise an error for unsafe cast. if LooseVersion(pa.__version__) >= LooseVersion("0.11.0"): with self.sql_conf({ "spark.sql.execution.pandas.arrowSafeTypeConversion": True}): with self.assertRaisesRegexp(Exception, "Exception thrown when converting pandas.Series"): df.select(['A']).withColumn('udf', udf('A')).collect() # Disabling Arrow safe type check. with self.sql_conf({ "spark.sql.execution.pandas.arrowSafeTypeConversion": False}): df.select(['A']).withColumn('udf', udf('A')).collect()
def cat2Num(self, df, indices): '''sbaronia - extract the categorical data and make df out of it so oneHotEncoding can be run on them''' protocol_ind0 = df.select(df.id,df.rawFeatures[indices[0]].alias("features0")).cache() protocol_ind1 = df.select(df.id,df.rawFeatures[indices[1]].alias("features1")).cache() ind0_enc = self.oneHotEncoding(protocol_ind0,"features0").cache() ind1_enc = self.oneHotEncoding(protocol_ind1,"features1").cache() '''sbaronia - add those hot encoded features columns to original df''' int_join_1 = df.join(ind0_enc, ind0_enc.id == df.id, 'inner').drop(ind0_enc.id).cache() int_join_2 = int_join_1.join(ind1_enc, int_join_1.id == ind1_enc.id, 'inner').drop(int_join_1.id).cache() '''sbaronia - now create a new column features which has converted vector form and drop rest columns''' comb_udf = udf(replaceCat2Num,StringType()) int_join_2 = int_join_2.select(int_join_2.id,int_join_2.rawFeatures, \ comb_udf(int_join_2.rawFeatures, \ int_join_2.num_features0, \ int_join_2.num_features1).alias("features")).cache() '''sbaronia - convert list of numerical features to DenseVector so they can be used in KMeans''' dense_udf = udf(lambda line: DenseVector.parse(line), VectorUDT()) feat = int_join_2.select(int_join_2.id,int_join_2.rawFeatures,dense_udf(int_join_2.features).alias("features")).cache() return feat
def extract_nlp(sql_sc): schema = StructType([ StructField("title", StringType()), StructField("first_sentence", StringType()), StructField("infoboxnames", StringType()), StructField("urlwords", StringType()) ]) noun_ex = udf(extract_nouns, StringType()) pos_ex = udf(extract_hypernyms, StringType()) df = sql_sc.read \ .format("com.databricks.spark.csv") \ .schema(schema) \ .option("header", "false") \ .option("quotechar", '|') \ .option("delimiter", ',') \ .load(DATAP + '/dump/articles_annotated_pre.csv') df = df.withColumn("nouns", noun_ex(df.first_sentence)) df = df.withColumn("pos_hyps", pos_ex(df.first_sentence)) df['title', 'urlwords', 'infoboxnames', 'nouns', 'pos_hyps'].write \ .format("com.databricks.spark.csv") \ .option("header", "false") \ .option("quotechar", '|') \ .option("delimiter", ",") \ .csv(DATAP + '/dump/articles_annotated')
def extract_pre(sql_sc): schema = StructType([ StructField("title", StringType()), StructField("text", StringType()) ]) first_ex = udf(extract_first_sentence, StringType()) inf_ex2 = udf(extract_names, StringType()) url_ex = udf(extract_urlpattern, StringType()) df = sql_sc.read \ .format("com.databricks.spark.csv") \ .schema(schema) \ .option("header", "false") \ .option("quotechar", '|') \ .option("delimiter", ',') \ .load(DATAP + '/dump/articles.csv') #df = df.repartition(100) df = df.withColumn("first_sentence", first_ex(df.text)) df = df.withColumn('infoboxnames', inf_ex2(df.text)) df = df.withColumn("urlwords", url_ex(df.title)) df['title', 'first_sentence', 'infoboxnames', 'urlwords'].write \ .format("com.databricks.spark.csv") \ .option("header", "false") \ .option("quotechar", '|') \ .option("delimiter", ",") \ .csv(DATAP + '/dump/articles_annotated_pre')
def text_features(p_df): """ Extracts features derived from the quora question texts. :param p_df: A DataFrame. :return: A DataFrame. """ diff_len = udf(lambda arr: arr[0] - arr[1], IntegerType()) common_words = udf(lambda arr: len(set(arr[0]).intersection(set(arr[1]))), IntegerType()) unique_chars = udf(lambda s: len(''.join(set(s.replace(' ', '')))), IntegerType()) p_df = p_df.withColumn("len_q1", length("question1")).withColumn("len_q2", length("question2")) p_df = p_df.withColumn("diff_len", diff_len(array("len_q1", "len_q2"))) p_df = p_df.withColumn("words_q1", size("question1_words")).withColumn("words_q2", size("question2_words")) p_df = p_df.withColumn("common_words", common_words(array("question1_words", "question2_words"))) p_df = p_df.withColumn( "unique_chars_q1", unique_chars("question1") ).withColumn("unique_chars_q2", unique_chars("question2")) assembler = VectorAssembler( inputCols=["len_q1", "len_q2", "diff_len", "words_q1", "words_q2", "common_words", "unique_chars_q1", "unique_chars_q2"], outputCol="text_features" ) p_df = assembler.transform(p_df) return p_df
def test_udf_in_generate(self): from pyspark.sql.functions import udf, explode df = self.spark.range(5) f = udf(lambda x: list(range(x)), ArrayType(LongType())) row = df.select(explode(f(*df))).groupBy().sum().first() self.assertEqual(row[0], 10) df = self.spark.range(3) res = df.select("id", explode(f(df.id))).collect() self.assertEqual(res[0][0], 1) self.assertEqual(res[0][1], 0) self.assertEqual(res[1][0], 2) self.assertEqual(res[1][1], 0) self.assertEqual(res[2][0], 2) self.assertEqual(res[2][1], 1) range_udf = udf(lambda value: list(range(value - 1, value + 1)), ArrayType(IntegerType())) res = df.select("id", explode(range_udf(df.id))).collect() self.assertEqual(res[0][0], 0) self.assertEqual(res[0][1], -1) self.assertEqual(res[1][0], 0) self.assertEqual(res[1][1], 0) self.assertEqual(res[2][0], 1) self.assertEqual(res[2][1], 0) self.assertEqual(res[3][0], 1) self.assertEqual(res[3][1], 1)
def test_udf_wrapper(self): from pyspark.sql.functions import udf from pyspark.sql.types import IntegerType def f(x): """Identity""" return x return_type = IntegerType() f_ = udf(f, return_type) self.assertTrue(f.__doc__ in f_.__doc__) self.assertEqual(f, f_.func) self.assertEqual(return_type, f_.returnType) class F(object): """Identity""" def __call__(self, x): return x f = F() return_type = IntegerType() f_ = udf(f, return_type) self.assertTrue(f.__doc__ in f_.__doc__) self.assertEqual(f, f_.func) self.assertEqual(return_type, f_.returnType) f = functools.partial(f, x=1) return_type = IntegerType() f_ = udf(f, return_type) self.assertTrue(f.__doc__ in f_.__doc__) self.assertEqual(f, f_.func) self.assertEqual(return_type, f_.returnType)
def setup_method(self, method): sparkConf = create_spark_conf().setMaster("local[4]")\ .setAppName("test wide and deep") self.sc = init_nncontext(sparkConf) self.sqlContext = SQLContext(self.sc) data_path = os.path.join(os.path.split(__file__)[0], "../../resources/recommender") categorical_gender_udf = udf(lambda gender: categorical_from_vocab_list(gender, ["F", "M"], start=1)) bucket_udf = udf(lambda feature1, feature2: hash_bucket(str(feature1) + "_" + str(feature2), bucket_size=100)) self.data_in = self.sqlContext.read.parquet(data_path) \ .withColumn("gender", categorical_gender_udf(col("gender")).cast("int")) \ .withColumn("occupation-gender", bucket_udf(col("occupation"), col("gender")).cast("int")) self.column_info = ColumnFeatureInfo( wide_base_cols=["occupation", "gender"], wide_base_dims=[21, 3], wide_cross_cols=["occupation-gender"], wide_cross_dims=[100], indicator_cols=["occupation", "gender"], indicator_dims=[21, 3], embed_cols=["userId", "itemId"], embed_in_dims=[100, 100], embed_out_dims=[20, 20], continuous_cols=["age"])
def test_nondeterministic_udf(self): # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations import random udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic() self.assertEqual(udf_random_col.deterministic, False) df = self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND')) udf_add_ten = udf(lambda rand: rand + 10, IntegerType()) [row] = df.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).collect() self.assertEqual(row[0] + 10, row[1])
def select_prediction_udf(self, column): if column not in self.get_output_names(): raise ValueError("Column '" + column + "' is not defined as the output column in MOJO Pipeline.") if self.get_named_mojo_output_columns(): func = udf(lambda d: d, DoubleType()) return func("prediction." + column).alias(column) else: idx = self.get_output_names().index(column) func = udf(lambda arr: arr[idx], DoubleType()) return func("prediction.preds").alias(column)
def processMSC(): """ Parses MSC records as per defined rules :return: Records returned in pipe-delimited format """ # Assumption: MSC folder under the provided input path inputDir = os.path.join(args.inputdir, "INPUT") lines = sc.textFile(inputDir) # Call the parsing function parsedMSCLines = lines.map(parseMSCRecords) # The schema is encoded in a string. schemaString = "RecordType FirstNum SecondNum CallDate CallHour Duration StartTower StartLAC CallType" fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()] schema = StructType(fields) # Apply the schema to the RDD. schemaData = sqlContext.createDataFrame(parsedMSCLines, schema) modify_phone_number_udf = udf(mod_number, StringType()) ph_num_mod = schemaData.select( schemaData.RecordType, modify_phone_number_udf(schemaData.FirstNum).alias('FirstNum'), modify_phone_number_udf(schemaData.SecondNum).alias('SecondNum'), schemaData.CallDate, schemaData.CallHour, schemaData.Duration, schemaData.StartTower, schemaData.StartLAC, schemaData.CallType) get_phone_type_udf = udf(get_phone_type, StringType()) first_ph_type = ph_num_mod.withColumn('FirstPhoneType', get_phone_type_udf(ph_num_mod.FirstNum)) sec_ph_type = first_ph_type.withColumn('SecondPhoneType', get_phone_type_udf(first_ph_type.SecondNum)) final_df = sec_ph_type.select( sec_ph_type.RecordType, sec_ph_type.FirstNum, sec_ph_type.SecondNum, sec_ph_type.CallDate, sec_ph_type.CallHour, sec_ph_type.Duration, sec_ph_type.StartTower, sec_ph_type.StartLAC, sec_ph_type.CallType, F.when(sec_ph_type.FirstPhoneType.isin(["mobile", "landline", "shortcode"]) & sec_ph_type.SecondPhoneType.isin(["mobile", "landline", "shortcode"]), "National") .otherwise("International").alias('PhoneType')) print final_df.show()
def cat2Num(self, df, indices): unique_values = [] for i in indices: d = udf(lambda r: r[i], StringType()) dt = df.select(d(df.rawFeatures)).distinct().collect() unique_values.extend(dt) unique_count = len(unique_values) convertUDF = udf(lambda r: to_onehot(r, indices, unique_values, unique_count), ArrayType(DoubleType())) newdf = df.withColumn("features", convertUDF(df.rawFeatures)) return newdf
def test_complex_nested_udt_in_df(self): from pyspark.sql.functions import udf schema = StructType().add("key", LongType()).add("val", PythonOnlyUDT()) df = self.spark.createDataFrame( [(i % 3, PythonOnlyPoint(float(i), float(i))) for i in range(10)], schema=schema) df.collect() gd = df.groupby("key").agg({"val": "collect_list"}) gd.collect() udf = udf(lambda k, v: [(k, v[0])], ArrayType(df.schema)) gd.select(udf(*gd)).collect()
def test_nonparam_udf_with_aggregate(self): import pyspark.sql.functions as f df = self.spark.createDataFrame([(1, 2), (1, 2)]) f_udf = f.udf(lambda: "const_str") rows = df.distinct().withColumn("a", f_udf()).collect() self.assertEqual(rows, [Row(_1=1, _2=2, a=u'const_str')])
def find_candidate_matches(match_field_value, data_df, match_column, id_column, sqlContext): # Change the name of we are working on # (id, match_col, col1, col2,..., [ngram1, ngram2,...]) udf_n_grams = udf(lambda r: ','.join(n_grams(2, r)), StringType()) data_with_ngrams_df = data_df.withColumn('ngrams', udf_n_grams(match_column)).\ withColumnRenamed(match_column, 'match_col').\ withColumnRenamed(id_column, 'id').cache() identical_removed_df = data_with_ngrams_df.dropDuplicates(['match_col']) id_ngram_rdd = identical_removed_df.flatMap(lambda r: unpack_n_grams(r.id, r.match_col, r.ngrams)) # (id, match_col,ngram1), (id, match_col, ngram2),... id_ngram_schema = StructType([StructField('id', IntegerType(), True), StructField('match_col', StringType(), True), StructField('ngram', StringType(), True)]) id_ngram_df = sqlContext.createDataFrame(id_ngram_rdd, id_ngram_schema) # (id, match_col, ngram1) n_grams_str = ','.join(['\"' + s + '\"' for s in n_grams(2, match_field_value)]) filtered_id_ngram_df = id_ngram_df.filter('ngram in (%s)'%n_grams_str)\ .dropDuplicates(['match_col'])\ .cache() sqlContext.registerDataFrameAsTable(filtered_id_ngram_df, 'maintable') sqlContext.registerFunction('mySim', lambda x: jaccard_similarity(x, '\"%s\"'%match_field_value), DoubleType()) candidate_match_df = sqlContext.sql('select id from maintable where mySim(match_col) >=0.5') return [r.id for r in candidate_match_df.collect()]
def test_udf_with_order_by_and_limit(self): from pyspark.sql.functions import udf my_copy = udf(lambda x: x, IntegerType()) df = self.spark.range(10).orderBy("id") res = df.select(df.id, my_copy(df.id).alias("copy")).limit(1) res.explain(True) self.assertEqual(res.collect(), [Row(id=0, copy=0)])
def call_speech_vocab(df): para_cleanup_udf=udf(speech_vocabulary, FloatType()) df_with_vocab_score = df.withColumn('vocab_score', para_cleanup_udf(df['para_cleaned_text'])) print(df_with_vocab_score.printSchema()) print(df_with_vocab_score.show(3)) print(df_with_vocab_score.count()) return df_with_vocab_score
def test_udf_with_aggregate_function(self): df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"]) from pyspark.sql.functions import udf, col, sum from pyspark.sql.types import BooleanType my_filter = udf(lambda a: a == 1, BooleanType()) sel = df.select(col("key")).distinct().filter(my_filter(col("key"))) self.assertEqual(sel.collect(), [Row(key=1)]) my_copy = udf(lambda x: x, IntegerType()) my_add = udf(lambda a, b: int(a + b), IntegerType()) my_strlen = udf(lambda x: len(x), IntegerType()) sel = df.groupBy(my_copy(col("key")).alias("k"))\ .agg(sum(my_strlen(col("value"))).alias("s"))\ .select(my_add(col("k"), col("s")).alias("t")) self.assertEqual(sel.collect(), [Row(t=4), Row(t=3)])
def visulisationData(): mindate = 820497600 # 1996 maxdate = 1104516000 # 2005 # request.arg.get('nbins') binborders = list(map(int,list(np.linspace(start=mindate,stop=maxdate,num=200,endpoint=False)))) def f(x): for i in range(len(binborders)): if x < binborders[i]: return binborders[i-1] return binborders[-1] stato = ( doctopdat .filter("timestamp > '" + str(mindate) + "'") .filter("timestamp < '" + str(maxdate) + "'") .withColumn('bin',udf(f,LongType())(doctopdat['timestamp'])) .select(['bin'] + ['C' + str(i + 1) for i in range(20)]) ) rows = stato.groupBy('bin').sum().sort('bin').collect() data = [{'key': 'topic ' + str(i), 'values': []} for i in range(20)] for row in rows: for i in range(20): col = 'sum(C' + str(i + 1) + ')' data[i]['values'].append([row['bin'], row[col]]) return jsonify(data=data)
def toPandas(self, df): """ This is similar to the Spark DataFrame built-in toPandas() method, but it handles MLlib Vector columns differently. It converts MLlib Vectors into rows of scipy.sparse.csr_matrix, which is generally friendlier for PyData tools like scikit-learn. .. note:: Experimental: This will likely be replaced in later releases with improved APIs. :param df: Spark DataFrame :return: Pandas dataframe """ cols = df.columns # Convert any MLlib Vector columns to scipy.sparse.csr_matrix matrixCols = [] def toscipy(v): if isinstance(v, DenseVector): return csr_matrix((v.values, np.array(range(v.size)), np.array([0, v.size])), shape=(1, v.size)) elif isinstance(v, SparseVector): return csr_matrix((v.values, v.indices, np.array([0, len(v.indices)])), shape=(1, v.size)) else: raise TypeError("Converter.toPandas found unknown Vector type: %s" % type(v)) tosparse = udf(lambda v: toscipy(v), CSRVectorUDT()) for i in range(len(cols)): c = cols[i] if isinstance(df.schema.fields[i].dataType, VectorUDT): cols[i] = tosparse(df[c]).alias(c) matrixCols.append(c) else: cols[i] = df[c] return df.select(*cols).toPandas()
def test_udf_in_subquery(self): f = udf(lambda x: x, "long") with self.tempView("v"): self.spark.range(1).filter(f("id") >= 0).createTempView("v") sql = self.spark.sql result = sql("select i from values(0L) as data(i) where i in (select id from v)") self.assertEqual(result.collect(), [Row(i=0)])
def test_udf_in_filter_on_top_of_outer_join(self): from pyspark.sql.functions import udf left = self.spark.createDataFrame([Row(a=1)]) right = self.spark.createDataFrame([Row(a=1)]) df = left.join(right, on='a', how='left_outer') df = df.withColumn('b', udf(lambda x: 'x')(df.a)) self.assertEqual(df.filter('b = "x"').collect(), [Row(a=1, b='x')])
def call_utf_encoder(df): utf_encoder_udf=udf(unicode_encode, StringType()) df_cleaned = df.withColumn('speech_text_utf', utf_encoder_udf(df['text'])).drop(df['text']) print(df_cleaned.printSchema()) print(df_cleaned.show(10)) print(df_cleaned.count()) return df_cleaned
def call_para_cleanup(df): para_cleanup_udf=udf(para_segmenter_and_cleanup, StringType()) df_cleaned = df.withColumn('para_cleaned_text', para_cleanup_udf(df['speech_text_utf'])).drop(df['speech_text_utf']) print(df_cleaned.printSchema()) print(df_cleaned.show(10)) print(df_cleaned.count()) return df_cleaned
def test_udf_with_udt(self): row = Row(label=1.0, point=ExamplePoint(1.0, 2.0)) df = self.spark.createDataFrame([row]) self.assertEqual(1.0, df.rdd.map(lambda r: r.point.x).first()) udf = UserDefinedFunction(lambda p: p.y, DoubleType()) self.assertEqual(2.0, df.select(udf(df.point)).first()[0]) udf2 = UserDefinedFunction(lambda p: ExamplePoint(p.x + 1, p.y + 1), ExamplePointUDT()) self.assertEqual(ExamplePoint(2.0, 3.0), df.select(udf2(df.point)).first()[0]) row = Row(label=1.0, point=PythonOnlyPoint(1.0, 2.0)) df = self.spark.createDataFrame([row]) self.assertEqual(1.0, df.rdd.map(lambda r: r.point.x).first()) udf = UserDefinedFunction(lambda p: p.y, DoubleType()) self.assertEqual(2.0, df.select(udf(df.point)).first()[0]) udf2 = UserDefinedFunction(lambda p: PythonOnlyPoint(p.x + 1, p.y + 1), PythonOnlyUDT()) self.assertEqual(PythonOnlyPoint(2.0, 3.0), df.select(udf2(df.point)).first()[0])
def read_receipts(path, ids=None): """ read receipts data from signals :param path: :return: """ lines = sc.textFile(path) receipts = lines.map(lambda x: x.split('|')) print receipts.take(10) # print receipts.take(10) schema_str = """cust_id product_id variant_id division_id source_id qty date origin price discount_type used_for_recs date_modified""" fields = [StructField(field_name, StringType(), True) for field_name in schema_str.split()] schema = StructType(fields) receipts_df = sqlContext.createDataFrame(receipts, schema) udfstring_to_date = udf(parse_created_on, IntegerType()) # udfstring_to_date = udf(parse, DateType()) receipts_df = (receipts_df .withColumn("delta_date", udfstring_to_date("date"))) receipts_out = (receipts_df .filter(receipts_df.qty > 1) .select(receipts_df.cust_id, receipts_df.qty, receipts_df.price, receipts_df.delta_date) # .pivot() not available until Spark 1.6 .groupby("cust_id").agg({"price": "mean", "qty": "mean", "delta_date": "mean"}) ) fig, axes = plt.subplots(nrows=1, ncols=1) axes.set_title('receipts') receipts_out.toPandas().boxplot(ax=axes) plt.show() receipts_out.show(n=30) return receipts_df
def test_udf_in_filter_on_top_of_join(self): # regression test for SPARK-18589 left = self.spark.createDataFrame([Row(a=1)]) right = self.spark.createDataFrame([Row(b=1)]) f = udf(lambda a, b: a == b, BooleanType()) df = left.crossJoin(right).filter(f("a", "b")) self.assertEqual(df.collect(), [Row(a=1, b=1)])
def splitStrCol(self, column, featureNames, mark): """This functions split a column into different ones. In the case of this method, the column provided should be a string of the following form 'word,foo'. :param column Name of the target column, this column is going to be replaced. :param featureNames List of strings of the new column names after splitting the strings. :param mark String that specifies the splitting mark of the string, this frequently is ',' or ';'. """ # Check if column argument is a string datatype: self.__assertTypeStr(column, "column") # Check if mark argument is a string datatype: self.__assertTypeStr(mark, "mark") assert (column in self.__df.columns), "Error: column specified does not exist in dataFrame." assert (type(featureNames) == type([])), "Error: featureNames must be a list of strings." # Setting a udf that split the string into a list of strings. # This is "word, foo" ----> ["word", "foo"] func = udf(lambda x: x.split(mark), ArrayType(StringType())) self.__df = self.__df.withColumn(column, func(col(column))) self.undoVecAssembler(column=column, featureNames=featureNames) self.__addTransformation() # checkpoint in case return self
# In[23]: dictCategories3 # In[24]: from pyspark.sql.functions import udf from pyspark.sql.types import IntegerType def categoriesToInt(cat): return dictCategories3[cat] udfCategoriesToInt = udf(categoriesToInt, IntegerType()) # In[25]: df3_incoded = df3.withColumn("authentication_orientation", udfCategoriesToInt("authentication_orientation") ) # In[26]: df3_incoded.limit(10).toPandas() # In[27]:
import random from datetime import datetime, timedelta import pyspark.sql.functions as F import uuid import sys output_path = sys.argv[1] def rand_date(): delta = random.randint(0, 10000) dt = datetime.now() - timedelta(days=delta) return dt.date().isoformat() def rand_string(): return str(uuid.uuid4()) df = sqlContext.range(1000000, 2000000) df = df.select('id', F.udf(rand_date)().alias('as_of'), F.rand().alias('a'), F.rand().alias('b'), F.rand().alias('c')).orderBy(F.asc('as_of')) df.write.json(output_path)
""" function to assign every timestamp in data to the time interval it belongs to a timestamp belongs to a specific time interval means: start_time of this time interval <= timestamp < start_time of this time interval """ timestamp = datetime.strptime(timestamp, '%H:%M:%S') for i in range(len(endPoints)): if timestamp < endPoints[i]: break return i from pyspark.sql import functions as f from pyspark.sql.types import StringType # convert into user defined function so that can be applied to pysaprk dataframe udf_assign_interval = f.udf(assign_interval,StringType()) data = data.withColumn('INTERVAL',udf_assign_interval('TIME')) # calculate mean price of each time intervel according to symbol, date, interval data = data.groupby(['SYMBOL','DATE','INTERVAL']).agg({'PRICE':'mean'}) data = data.withColumnRenamed('avg(PRICE)', 'AVG_PRICE') data = data.select(data.SYMBOL,data.DATE,data.INTERVAL.cast('double'),data.AVG_PRICE) data = data.orderBy(["SYMBOL","DATE","INTERVAL"], ascending=[1, 1]) # apply window function to get previous time interval avg_price w = Window().partitionBy(col('SYMBOL')).orderBy([col('SYMBOL'),col('DATE'),col('INTERVAL')]) data = data.select("*", lag('AVG_PRICE').over(w).alias('PRE_AVG_PRICE'))
def process_log_data(spark, input_data, output_data): """ Purpose: Process log/event data using Spark and output as parquet to S3 Description: Download JSON log/event data from S3 bucket and set up temporary view so Spark SQL queries may be run on the dataset and then again loaded back Parameters: spark : Spark Session input_data : location of song_data JSON files with metadata about events output_data : dimensional tables in parquet format will be stored """ # read log data file print('Read log data from JSON file') df = spark.read.json(os.path.join(input_data, "log_data/*/*/*.json")) print(df.count()) df.printSchema() # create temp view for Spark SQL queries print('create temp view for Spark SQL queries - logs_data') df.createOrReplaceTempView("logs_data") # filter by actions for song plays songplays_table = df['ts', 'userId', 'level', 'sessionId', 'location', 'userAgent'] # extract columns for users table print('Users table: ') users_table = spark.sql(""" SELECT DISTINCT userId, firstName, lastName, gender, level FROM logs_data WHERE page = 'NextSong' AND userId IS NOT NULL """).dropDuplicates(['userId']) print(users_table.limit(5).toPandas()) # write users table to parquet files users_table.write.parquet(os.path.join(output_data, 'users.parquet'), 'overwrite') print("users.parquet completed") # create timestamp column from original timestamp column get_timestamp = udf(lambda x: str(int(int(x) / 1000))) df = df.withColumn('timestamp', get_timestamp(df.ts)) # create datetime column from original timestamp column get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0))) df = df.withColumn("datetime", get_datetime(df.ts)) # create temp view for SQL queries print('create temp view for Spark SQL queries - time_data') df.createOrReplaceTempView("time_data") # extract columns to create time table print('Time table: ') time_table = spark.sql(""" SELECT DISTINCT datetime as start_time, hour(datetime) AS hour, day(datetime) AS day, weekofyear(datetime) AS week, month(datetime) AS month, year(datetime) AS year, dayofweek(datetime) AS weekday FROM time_data """) print(time_table.limit(5).toPandas()) # write time table to parquet files partitioned by year and month time_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'time.parquet'), 'overwrite') print("time.parquet completed") # read in song data to use for songplays table song_df = spark.read.json( os.path.join(input_data, "song-data/A/A/A/*.json")) # create temp view for Spark SQL queries print('create temp view for Spark SQL queries - song_df') song_df.createOrReplaceTempView("songs_data") # extract columns from joined song and log datasets to create songplays table print('Songplays table: ') songplays_table = spark.sql(""" SELECT monotonically_increasing_id() AS songplay_id, to_timestamp(logs_data.ts/1000) AS start_time, month(to_timestamp(logs_data.ts/1000)) AS month, year(to_timestamp(logs_data.ts/1000)) AS year, logs_data.userId AS user_id, logs_data.level AS level, songs_data.song_id AS song_id, songs_data.artist_id AS artist_id, logs_data.sessionId AS session_id, logs_data.location AS location, logs_data.userAgent AS user_agent FROM logs_data JOIN songs_data ON logs_data.artist = songs_data.artist_name """) print(songplays_table.limit(5).toPandas()) #songplays_table = songplays_table.selectExpr("start_time") songplays_table.select( monotonically_increasing_id().alias('songplay_id')).collect() # write songplays table to parquet files partitioned by year and month songplays_table.write.partitionBy('year', 'month').parquet( os.path.join(output_data, 'songplays.parquet'), 'overwrite') print("songplays.parquet completed") print("process_log_data IS DONE")
# In[152]: input_flat = input_f.flatMap(lambda x: x.split(' ')) input_dist = input_flat.distinct() input_filter = input_dist.filter(lambda x: x is not None and x!="") # In[153]: count_non_zero=udf(lambda row:len([x for x in row if x!=None]),IntegerType()) #udf to get count of non_zero columns sqr=udf(lambda row:(sum([x**2 for x in row if x!=None]))**(1/2),DoubleType()) #udf to get sqrt_of_sum # In[178]: #calculating tf data=input_flat.collect() dffilt = sqlContext.createDataFrame(input_filter.collect(), StringType()) for i in range(input_1.count()): d1=data.index('~') #print(i,d1) df = sqlContext.createDataFrame(data[0:d1], StringType())
def test_udf_with_order_by_and_limit(self): my_copy = udf(lambda x: x, IntegerType()) df = self.spark.range(10).orderBy("id") res = df.select(df.id, my_copy(df.id).alias("copy")).limit(1) self.assertEqual(res.collect(), [Row(id=0, copy=0)])
def test_udf_in_filter_on_top_of_outer_join(self): left = self.spark.createDataFrame([Row(a=1)]) right = self.spark.createDataFrame([Row(a=1)]) df = left.join(right, on="a", how="left_outer") df = df.withColumn("b", udf(lambda x: "x")(df.a)) self.assertEqual(df.filter('b = "x"').collect(), [Row(a=1, b="x")])
spark = SparkSession.builder.appName('cf').getOrCreate() input_file = '/home/ehsan/WIP/DataMininig/train_rating.txt' input_file = 'train_rating.txt' schema = StructType([\ StructField('train_id',IntegerType(), False),\ StructField('user_id', IntegerType(), False),\ StructField('business_id', IntegerType(), False),\ StructField('rating', IntegerType(), False),\ StructField('date', StringType(), False)]) lines = spark.read.csv(input_file, schema=schema, sep=',') lines = lines.drop('train_id').na.drop() date2intUDF = udf(date2int, IntegerType()) lines = lines.withColumn( 'date', date2intUDF('date')) #Converts the date from string to dateType #lines = lines.rdd #parts = lines.map(lambda row: row.value.split("::")) #ratingsRDD = parts.map(lambda p: Row(user_id=int(p[1]), business_id=int(p[2]),rating=float(p[3]), date=long(p[4]))) #ratings = spark.createDataFrame(ratingsRDD) #(training, test) = ratings.randomSplit([0.8, 0.2]) (training, test) = lines.randomSplit([0.8, 0.2]) # Build the recommendation model using ALS on the training data als = ALS(maxIter=10, rank=100, regParam=0.01, userCol="user_id",
def vectorize_searches_F(topics): return udf(lambda x: vectorize_searches(x, topics), MapType(IntegerType(), IntegerType()))
#string = string.strip(punctuation) # tokenize string tokens = re.findall(r"[\w'-]+", string) # remove stopwords tokens = [w for w in tokens if not w in bc_stopwords.value] # word stemming tokens = list(map(bc_stemmer.value, tokens)) else: tokens = [] return tokens smart_tokenize_F = udf(lambda x: smart_tokenize(x), ArrayType(StringType())) def stringTokenizer(tokens): stop_words = stopwords.words('english') stemmer = SnowballStemmer("english").stem if pd.isnull(tokens): tokens = "" return smart_tokenize(tokens, stop_words, stemmer) def combine_text(strings): strings = map(str, strings) return ' '.join(strings)
inferSchema='true') bs_df.show() print(bs_df.printSchema()) def valueToCategory(value, encoding_index): if (value == encoding_index): return 1 else: return 0 #Explode season column into separate columns such as season_<val> and drop season from pyspark.sql.functions import udf from pyspark.sql.functions import lit from pyspark.sql.types import * from pyspark.sql.functions import col udfValueToCategory = udf(valueToCategory, IntegerType()) bs_df_encoded = (bs_df.withColumn( "season_1", udfValueToCategory(col('season'), lit(1))).withColumn( "season_2", udfValueToCategory(col('season'), lit(2))).withColumn( "season_3", udfValueToCategory(col('season'), lit(3))).withColumn( "season_4", udfValueToCategory(col('season'), lit(4)))) bs_df_encoded = bs_df_encoded.drop('season') #https://stackoverflow.com/questions/40161879/pyspark-withcolumn-with-two-conditions-and-three-outcomes bs_df_encoded = (bs_df_encoded.withColumn( "weather_1", udfValueToCategory(col('weather'), lit(1))).withColumn( "weather_2", udfValueToCategory(col('weather'), lit(2))).withColumn( "weather_3", udfValueToCategory(col('weather'), lit(3))).withColumn(
Converts SAS date Parameters: arg1 (days) Returns: date object or None """ try: start = dt.datetime(1960, 1, 1).date() return start + dt.timedelta(days=int(x)) except: return None udf_to_datetime_sas = udf(lambda x: to_datetime(x), DateType()) def to_datetimefrstr(x): """ Summary line. Converts date format Parameters: arg1 (date) Returns: date object or None """ try: return dt.datetime.strptime(x, '%m%d%Y')
else: return r def country_exists(r): if r in tofullname: return tofullname[r] else: return '' def event_exists(r): if r in toevent: return toevent[r] else: return '' c_exists = udf(country_exists,StringType()) e_exists = udf(event_exists,StringType()) dfsub1 = df.withColumn("ActionGeo_CountryCode",c_exists(col("ActionGeo_CountryCode"))) \ .withColumn("Actor1Type1Code",e_exists(col("Actor1Type1Code"))) sqlContext.registerDataFrameAsTable(dfsub1, 'temp') df2 = sqlContext.sql("""SELECT ActionGeo_CountryCode, CAST(SQLDATE AS INTEGER), CAST(MonthYear AS INTEGER), CAST(Year AS INTEGER), CASE WHEN Actor1Type1Code = '' AND Actor2Type1Code <> '' THEN Actor2Type1Code ELSE Actor1Type1Code END AS Actor1Type1Code, CAST(NumArticles AS INTEGER), CAST(GoldsteinScale AS INTEGER), CAST(AvgTone AS INTEGER) FROM temp WHERE ActionGeo_CountryCode <> '' AND ActionGeo_CountryCode IS NOT NULL AND Actor1Type1Code <> '' AND Actor1Type1Code IS NOT NULL
def gapply(grouped_data, func, schema, *cols): """Applies the function ``func`` to data grouped by key. In particular, given a dataframe grouped by some set of key columns key1, key2, ..., keyn, this method groups all the values for each row with the same key columns into a single Pandas dataframe and by default invokes ``func((key1, key2, ..., keyn), values)`` where the number and order of the key arguments is determined by columns on which this instance's parent :class:`DataFrame` was grouped and ``values`` is a ``pandas.DataFrame`` of columns selected by ``cols``, in that order. If there is only one key then the key tuple is automatically unpacked, with ``func(key, values)`` called. ``func`` is expected to return a ``pandas.DataFrame`` of the specified schema ``schema``, which should be of type :class:`StructType` (output columns are of this name and order). If ``spark.conf.get("spark.sql.retainGroupColumns")`` is not ``u'true'``, then ``func`` is called with an empty key tuple (note it is set to ``u'true'`` by default). If no ``cols`` are specified, then all grouped columns will be offered, in the order of the columns in the original dataframe. In either case, the Pandas columns will be named according to the DataFrame column names. The order of the rows passed in as Pandas rows is not guaranteed to be stable relative to the original row order. :note: Users must ensure that the grouped values for every group must fit entirely in memory. :note: This method is only available if Pandas is installed. :param grouped_data: data grouped by key :param func: a two argument function, which may be either a lambda or named function :param schema: the return schema for ``func``, a :class:`StructType` :param cols: list of column names (string only) :raise ValueError: if ``"*"`` is in ``cols`` :raise ValueError: if ``cols`` contains duplicates :raise ValueError: if ``schema`` is not a :class:`StructType` :raise ImportError: if ``pandas`` module is not installed :raise ImportError: if ``pandas`` version is too old (less than 0.7.1) :return: the new :class:`DataFrame` with the original key columns replicated for each returned value in each group's resulting pandas dataframe, the schema being the original key schema prepended to ``schema``, where all the resulting groups' rows are concatenated. Of course, if retaining group columns is disabled, then the output will exactly match ``schema`` since no keys can be prepended. >>> import pandas as pd >>> from pyspark.sql import SparkSession >>> from spark_sklearn.group_apply import gapply >>> from spark_sklearn.util import createLocalSparkSession >>> spark = createLocalSparkSession() >>> df = (spark ... .createDataFrame([Row(course="dotNET", year=2012, earnings=10000), ... Row(course="Java", year=2012, earnings=20000), ... Row(course="dotNET", year=2012, earnings=5000), ... Row(course="dotNET", year=2013, earnings=48000), ... Row(course="Java", year=2013, earnings=30000)]) ... .select("course", "year", "earnings")) >>> def yearlyMedian(_, vals): ... all_years = set(vals['year']) ... # Note that interpolation is performed, so we need to cast back to int. ... yearly_median = [(year, int(vals['earnings'][vals['year'] == year].median())) ... for year in all_years] ... return pd.DataFrame.from_records(yearly_median) >>> newSchema = StructType().add("year", LongType()).add("median_earnings", LongType()) >>> gapply(df.groupBy("course"), yearlyMedian, newSchema).orderBy("median_earnings").show() +------+----+---------------+ |course|year|median_earnings| +------+----+---------------+ |dotNET|2012| 7500| | Java|2012| 20000| | Java|2013| 30000| |dotNET|2013| 48000| +------+----+---------------+ <BLANKLINE> >>> def twoKeyYearlyMedian(_, vals): ... return pd.DataFrame.from_records([(int(vals["earnings"].median()),)]) >>> newSchema = StructType([df.schema["earnings"]]) >>> gapply(df.groupBy("course", "year"), twoKeyYearlyMedian, newSchema, "earnings").orderBy( ... "earnings").show() +------+----+--------+ |course|year|earnings| +------+----+--------+ |dotNET|2012| 7500| | Java|2012| 20000| | Java|2013| 30000| |dotNET|2013| 48000| +------+----+--------+ <BLANKLINE> >>> spark.stop(); SparkSession._instantiatedContext = None """ import pandas as pd minPandasVersion = '0.7.1' if LooseVersion(pd.__version__) < LooseVersion(minPandasVersion): raise ImportError( 'Pandas installed but version is {}, {} required'.format( pd.__version__, minPandasVersion)) # Do a null aggregation to retrieve the keys first (should be no computation) # Also consistent with spark.sql.retainGroupColumns keySchema = grouped_data.agg({}).schema keyCols = grouped_data.agg({}).columns if not cols: # Extract the full column list with the parent df javaDFName = "org$apache$spark$sql$RelationalGroupedDataset$$df" parentDF = java_gateway.get_field(grouped_data._jgd, javaDFName) allCols = DataFrame(parentDF, None).columns keyColsSet = set(keyCols) cols = [col for col in allCols if col not in keyColsSet] if "*" in cols: raise ValueError("cols expected to contain only singular columns") if len(set(cols)) < len(cols): raise ValueError("cols expected not to contain duplicate columns") if not isinstance(schema, StructType): raise ValueError("output schema should be a StructType") inputAggDF = grouped_data.agg({col: 'collect_list' for col in cols}) # Recover canonical order (aggregation may change column order) canonicalOrder = chain( keyCols, [inputAggDF['collect_list(' + col + ')'] for col in cols]) inputAggDF = inputAggDF.select(*canonicalOrder) # Wraps the user-provided function with another python function, which prepares the # input in the form specified by the documentation. Then, once the function completes, # this wrapper prepends the keys to the output values and converts from pandas. def pandasWrappedFunc(*args): nvals = len(cols) keys, collectedCols = args[:-nvals], args[-nvals:] paramKeys = tuple(keys) if len(paramKeys) == 1: paramKeys = paramKeys[0] valuesDF = pd.DataFrame.from_dict(dict(zip(cols, collectedCols))) valuesDF = valuesDF[list(cols)] # reorder to canonical outputDF = func(paramKeys, valuesDF) valCols = outputDF.columns.tolist() for key, keyName in zip(keys, keyCols): outputDF[keyName] = key outputDF = outputDF[keyCols + valCols] # reorder to canonical # To recover native python types for serialization, we need # to convert the pandas dataframe to a numpy array, then to a # native list (can't go straight to native, since pandas will # attempt to perserve the numpy type). return outputDF.values.tolist() keyPrependedSchema = StructType(list(chain(keySchema, schema))) outputAggSchema = ArrayType(keyPrependedSchema, containsNull=False) pandasUDF = udf(pandasWrappedFunc, outputAggSchema) outputAggDF = inputAggDF.select(pandasUDF(*inputAggDF)) explodedDF = outputAggDF.select(explode(*outputAggDF).alias("gapply")) # automatically retrieves nested schema column names return explodedDF.select("gapply.*")
def process_log_data(spark, input_data, output_data): """ reads log file, filter the records by `NextSong` action. converts milliseconds into timestamp. fetches dimension specific attributes and insert them into respective table. also fills fact table `songplays`. :param spark: spark session. :param input_data: input data path. :param output_data: output data path. :return: None. """ # get file path to log data file log_data = f'{input_data}/log_data/2018/11/*.json' # read log data file df = spark.read.json(log_data).dropDuplicates() # filter by actions for song plays df = df.filter("page='NextSong'") # extract columns for users table user_table = df.select('userId', 'firstName', 'lastName', 'gender', 'level') # write users table to parquet files user_table.coalesce(1).write.format('json').mode('overwrite').save( f'{output_data}/users/') # create timestamp column from original timestamp column get_timestamp = udf(lambda x: str(int(int(x) / 1000))) df = df.withColumn("timestamp", get_timestamp(df.ts)) # create datetime column from original timestamp column get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0))) df = df.withColumn("datetime", get_datetime(df.ts)) # extract columns to create time table time_table = df.select('timestamp', hour('datetime').alias('hour'), dayofmonth('datetime').alias('day'), weekofyear('datetime').alias('week'), month('datetime').alias('month'), year('datetime').alias('year'), date_format('datetime', 'F').alias('weekday')) # write time table to parquet files partitioned by year and month time_table.coalesce(1).write.format("json").partitionBy(["year", "month"]) \ .mode("overwrite") \ .save(f'{output_data}/time/year=2018') # read in song data to use for songplays table song_df = spark.read.format('json').load( f'{output_data}/songs/year=*/artist_id=*') song_df.createOrReplaceTempView('songs') artists_songs_df = spark.read.format('json').load( f'{output_data}/artists_songs/') df.orderBy('artist').show(20) artists_songs_df.orderBy('artist_name').show(20) songplays_table = df.join(artists_songs_df, [df.artist == artists_songs_df.artist_name, df.song == artists_songs_df.song_title, artists_songs_df.length == df.length], how='left') \ .withColumn('start_time', get_datetime(df['ts']).cast('timestamp')) songplays_table = songplays_table.select( songplays_table['userId'].alias('user_id'), songplays_table['level'], artists_songs_df['song_id'], artists_songs_df['artist_id'], songplays_table['sessionId'].alias('session_id'), df['location'], month('start_time').alias('month'), year('start_time').alias('year'), songplays_table['userAgent'].alias('user_agent'), ) songplays_table.show(20) # write songplays table to parquet files partitioned by year and month songplays_table.coalesce(1).write.format("json") \ .partitionBy(["year", "month"]) \ .mode("overwrite") \ .save(f'{output_data}/songsplays/')
df = new_df else: df = df.unionAll(new_df) # Clean df_pickups = df.filter((df.Start_Lon >= -80) & (df.Start_Lon <= -70) & (df.Start_Lat >= 40) & (df.Start_Lat <= 50)) df_dropoffs = df.filter((df.End_Lon >= -80) & (df.End_Lon <= -70) & (df.End_Lat >= 40) & (df.End_Lat <= 50)) def is_float(value): try: float(value) return True except ValueError: return False is_float_udf = udf(is_float, BooleanType()) df_pickups = df_pickups.filter((is_float_udf(df.Start_Lon)) & (is_float_udf(df.Start_Lat))) df_dropoffs = df_dropoffs.filter((is_float_udf(df.End_Lon)) & (is_float_udf(df.End_Lat))) # Discretize Location df_pickups = df_pickups.withColumn("Lon", sqlfunctions.round(df.Start_Lon, 2)) df_pickups = df_pickups.withColumn("Lat", sqlfunctions.round(df.Start_Lat, 2)) df_dropoffs = df_dropoffs.withColumn("Lon", sqlfunctions.round(df.End_Lon, 2)) df_dropoffs = df_dropoffs.withColumn("Lat", sqlfunctions.round(df.End_Lat, 2)) # Select Columns df_pickups = df_pickups.select(df_pickups.Pickup_Time, df_pickups.Lon, df_pickups.Lat) df_dropoffs = df_dropoffs.select(df_dropoffs.Dropoff_Time, df_dropoffs.Lon, df_dropoffs.Lat) # Discretize Time def discretize_time(row):
def tel_to_matrix_udf(telemetry_keys): return udf(lambda x: telemetry_to_matrix(x, telemetry_keys), ArrayType(ArrayType(FloatType())))
#------------------------------------------------------------- import sys from pyspark import SparkConf, SparkContext from pyspark.ml import Pipeline from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler, IndexToString from pyspark.ml.regression import LinearRegression from pyspark.sql import SQLContext import pyspark.sql.functions as sf from pyspark.sql.functions import udf from sklearn.model_selection import train_test_split from slicing.spark_modules import spark_utils, join_data_parallel, union_data_parallel binner = udf(lambda arg: int(int(arg) // 5)) if __name__ == "__main__": args = sys.argv if len(args) > 1: k = int(args[1]) w = float(args[2].replace(',', '.')) alpha = int(args[3]) if args[4] == "True": b_update = True else: b_update = False debug = args[5] loss_type = int(args[6]) enumerator = args[7] dataset = args[8]
print("MY LIST Y = " , mylist[y]) print("Duration = " , duration) print("average = " , avg) break else: y = y + 1 else: x = x + 1 return bottleneck, avg, blist[0],blist[1],clist[0], clist[1] # COMMAND ---------- func = F.udf(lambda x: calculate(x), T.StructType( [T.StructField("val1", T.IntegerType(), True), T.StructField("val2", T.FloatType(), True), T.StructField("val3", T.IntegerType(), True), T.StructField("val4", T.IntegerType(), True), T.StructField("val5", T.IntegerType(), True), T.StructField("val6", T.IntegerType(), True)])) # COMMAND ---------- dfd = dfc.withColumn('vals', func('lowSpeed')) # COMMAND ---------- dfd.show() # COMMAND ---------- dfe = dfd.withColumn("BottleneckCount",col('vals.val1')).withColumn("AverageDuration",col('vals.val2')).drop("lowSpeed","vals")
def add_udf_column(points): udf_multiply_latlong = udf(multiply_latlong, IntegerType()) points = points.withColumn("multiplied_latlong", udf_multiply_latlong(col("lat"), col("long"))) return points
def calc_onehot_udf(feature_keys, u_features): return udf(lambda x: one_hot_features(x, feature_keys, u_features), ArrayType(FloatType()))
def python_plus_one(self): return udf(lambda v: v + 1, 'double')
spark = SparkSession.builder\ .master("local[*]")\ .appName("trial app 1")\ .config('spark.sql.warehouse.dir', 'file:///C:/tmp') \ .getOrCreate() #------------------- #select, filter row = Row(name="Alice", age=11) row['name'], row['age'] data = [('Alice', 1), ('Bob', 2)] df = spark.createDataFrame(data, ['name', 'age']) df.show() doubled = udf(lambda s: s * 2, IntegerType()) df_doubled = df.select(df.name, doubled(df.age).alias('age')) df_doubled_filter = df_doubled.filter(df_doubled.age > 3) #-------------------------- #distinct, sort data = [('Alice', 1), ('Bob', 2), ('Bob', 4)] df = spark.createDataFrame(data, ['name', 'age']) df.show() df_distinct = df.distinct() df_distinct.show() df_sorted = df.sort(df.age) df_sorted.show()
def udf_func(func): return udf(func, returnType=return_type)
from pyspark.sql.functions import udf from pyspark.sql.types import StringType def do_something(x): return x + 'hello' sample_udf = udf(lambda x: do_something(x), StringType())
def derive_basic_features_set(drug_reviews_df, fda_df, spark): reviews_spark_df_schema = StructType([StructField("drug_name", StringType(), False), \ StructField("condition", StringType(), True), \ StructField("review", StringType(), True), \ StructField("rating", IntegerType(), False), \ StructField("review_date", StringType(), False), \ StructField("useful_count", IntegerType(), False)]) fda_recalls_spark_df_schema = StructType([StructField("recall_date", StringType(), False), \ StructField("brand", StringType(), True), \ StructField("product_description", StringType(), True), \ StructField("recall_reason", StringType(), False), \ StructField("company", StringType(), False)]) reviews_spark_df = spark.createDataFrame(review_df, reviews_spark_df_schema) fda_recalls_spark_df = spark.createDataFrame(fda_df, fda_recalls_spark_df_schema) fda_product_descriptions = [] for row in fda_recalls_spark_df.select("product_description", "recall_date").collect(): fda_product_descriptions.append( [row.product_description, row.recall_date]) wordCount = udf(lambda s: len(s.split(" ")), IntegerType()) cleanWords = udf(lambda x: cleanText(x)) avgWordLength = udf(lambda x: average_word_length(x, is_list=True), FloatType()) drugRecalledUDF = udf(lambda x: drugRecalled(x, fda_product_descriptions)) drugRecallDateUDF = udf( lambda x: drugRecallDate(x, fda_product_descriptions)) textblob_sentimentUDF = udf(lambda x: TextBlobAnalysis(x)) vader_sentimentUDF = udf(lambda x: VaderAnalysis(x)) stanford_sentimentUDF = udf(lambda x: StanfordAnalysis(x)) reviews_spark_df = reviews_spark_df.withColumn("partial_name", f.lower(f.trim(f.split(reviews_spark_df.drug_name, "/")[0]))). \ withColumn("is_recalled", drugRecalledUDF(reviews_spark_df.drug_name)). \ withColumn("review_date", f.to_date(reviews_spark_df.review_date)).fillna('', subset=['review_date']). \ withColumn("condition", f.trim(f.lower(reviews_spark_df.condition))). \ withColumn("review_length", f.length(reviews_spark_df.review)). \ withColumn("review_word_count", wordCount(reviews_spark_df.review)). \ withColumn("cleaned_words", cleanWords(reviews_spark_df.review)) reviews_added_cols_spark_df = reviews_spark_df.withColumn("review_cleaned_word_count", f.length(reviews_spark_df.cleaned_words)). \ withColumn("review_avg_word_length", avgWordLength(reviews_spark_df.review)). \ withColumn("review_avg_cleaned_word_length", avgWordLength(reviews_spark_df.cleaned_words)). \ withColumn("combined_sentiment", textblob_sentimentUDF(reviews_spark_df.cleaned_words)). \ withColumn("combined_vader", vader_sentimentUDF(reviews_spark_df.cleaned_words)). \ withColumn("combined_stanford", stanford_sentimentUDF(reviews_spark_df.cleaned_words)) split_textblob = f.split(reviews_added_cols_spark_df['combined_sentiment'], '!') split_vader = f.split(reviews_added_cols_spark_df['combined_vader'], '!') split_stanford = f.split(reviews_added_cols_spark_df['combined_stanford'], '!') reviews_spark_df_split = reviews_added_cols_spark_df.withColumn('Sentimental_Polarity', split_textblob.getItem(0)). \ withColumn('Sentimental_Subjectivity', split_textblob.getItem(1)). \ withColumn('vader_neg', split_vader.getItem(0)). \ withColumn('vader_neu', split_vader.getItem(1)). \ withColumn('vader_pos', split_vader.getItem(2)). \ withColumn('vader_comp', split_vader.getItem(3)). \ withColumn('stanford_neg', split_stanford.getItem(0)). \ withColumn('stanford_neu', split_stanford.getItem(1)). \ withColumn('stanford_pos', split_stanford.getItem(2)) return reviews_spark_df_split
# Import dependencies ZIP execfile("./__pyfiles__/load.py") ### ### Sentence polarity using NLTK ### def compute_nltk_polarity(msg_body): nltk.data.path.append("./nltk_data.zip/nltk_data") sid = SentimentIntensityAnalyzer() msg_body = sid.polarity_scores(msg_body) return msg_body compute_nltk_polarity_udf = func.udf(compute_nltk_polarity, MapType(StringType(), FloatType(), False)) spark.udf.register('compute_nltk_polarity', compute_nltk_polarity_udf) # Load and preprocess sample data _, messages = load_data(sc, filter=[2015, 2016, 2017], sample=0.01) messages = messages.withColumn('created_utc', func.from_unixtime(messages['created_utc'], 'yyyy-MM-dd HH:mm:ss.SS').cast(DateType())) \ .withColumnRenamed('created_utc', 'creation_date') # Clean messages cleaned_messages = messages.filter( "body != '[removed]' and body != '[deleted]'") nlp_nltk_polartiy = cleaned_messages.selectExpr( 'id', 'creation_date', 'body', "compute_nltk_polarity(body) as nltk_scores") nlp_nltk_polartiy = nlp_nltk_polartiy.selectExpr(
token_doc_map_df = sqlContext.createDataFrame(token_doc_map, ['token', 'doc_id']) # Group By token and doc id : it gives in which all document a token appears token_doc_grp = token_doc_map_df.groupBy(token_doc_map_df.token, token_doc_map_df.doc_id).count() # Group By token. Gives Document Frequency. ToDo:Can do it in one pass doc_freq = token_doc_grp.groupBy(token_doc_grp.token).count().withColumnRenamed('count', 'df') print 'token_doc_map : \n', token_doc_map.take(10) token_doc_map_df.show(10, truncate=False) token_doc_grp.show(10, truncate=False) doc_freq.show(10, truncate=False) return doc_freq corpus_df = df(corpus) tokendfUDF = udf(idf, FloatType()) idf_df = corpus_df.withColumn("idf", tokendfUDF("df")) token_idf = idf_df.select(idf_df.token, idf_df.idf) print 'Calculated df : \n', corpus_df.show(10, truncate=False) print 'idf_df : \n', idf_df.show(10, truncate=False) print 'token_idf : ', token_idf.show(10, truncate=False) ''' idf_df : +--------------+---+---------+ |token |df |idf | +--------------+---+---------+ |adventure |44 |4.1764364| |check |40 |4.269527 |
def process_drones(rdd): ''' Driver function to process drone rdd's and select sensor data close to the event ''' if rdd.isEmpty(): print("RDD is empty") else: s3_bucket = 's3a://dronesensordata/drone_detect_data' df = rdd.toDF() df = df.selectExpr("_1 as device_id",\ "_2 as latitude",\ "_3 as longitude",\ "_4 as TimeStamp",\ "_5 as barometric_reading", \ "_6 as gyrometer_x",\ "_7 as gyrometer_y",\ "_8 as wind_speed") GroupedDF = df.groupBy("device_id").agg(f.collect_list('barometric_reading').\ alias('barometric_reading'),\ f.min('latitude').\ alias('latitude'),\ f.min('longitude').\ alias('longitude'),\ f.collect_list('gyrometer_x').\ alias('gyrometer_x'),\ f.collect_list('gyrometer_y').\ alias('gyrometer_y'),\ f.collect_list('wind_speed').\ alias('wind_speed'),\ f.collect_list('TimeStamp').\ alias('TimeStamp')) anamoly_udf = udf(detect_barometric_anamoly, BooleanType()) crashed_udf = udf(detect_crashed_drones, BooleanType()) minimum_udf = udf(get_min, FloatType()) processed_DF = GroupedDF.withColumn("malfunctioning", anamoly_udf("barometric_reading", \ "TimeStamp")) \ .withColumn("crashed", crashed_udf("barometric_reading"))\ malfunctioning_DF = processed_DF.filter(processed_DF['malfunctioning']) crashed_DF = processed_DF.filter(processed_DF['crashed']) tuple_list = [(row.latitude, row.longitude, row.device_id) for row in crashed_DF.select('latitude', 'longitude', 'device_id').collect()] for latitude, longitude, device_id in tuple_list: data = dumps({ "device_id": device_id, "latitude": latitude, "longitude": longitude }).encode('utf-8') Producer.send('crashed-devices', value=data) Producer.flush() malfunctioning_DF.write\ .mode('append')\ .parquet('{}/malfunctioning_devices_sensor_data.parquet'.format(s3_bucket)) connector = PostgresConnector(args.psnode, args.dbname, args.pusername, args.password) connector.write(processed_DF, 'devices', 'overwrite')
def get_weather_station_weather_df(spark, stations_id): ''' Download the weather station data during all hours of the 5 years for given station ids and return a dataframe ''' cache_file = workdir + 'data/weather_stations.parquet' if isdir(cache_file): print('Skip downloading weather station: already done') return spark.read.parquet(cache_file) get_station_weather_month_udf = \ udf(get_station_weather_month, ArrayType(StructType([ StructField('day', IntegerType()), StructField('hour', IntegerType()), StructField('dew_point_temp', FloatType()), StructField('rel_hum', FloatType()), StructField('wind_dir', FloatType()), StructField('wind_spd', FloatType()), StructField('visibility', FloatType()), StructField('stn_press', FloatType()), StructField('hmdx', FloatType()), StructField('wind_chill', FloatType()), StructField('temp', FloatType()), StructField('risky_weather', FloatType()) ]))) month_per_year_df = spark.createDataFrame(zip(range(1, 13),), ['month']) years_df = spark.createDataFrame(zip(range(2012, 2019),), ['year']) months_df = years_df.crossJoin(month_per_year_df) stations_months_df = stations_id.crossJoin(months_df) c = col('col') def create_date(year, month, day): return datetime.datetime.strptime(f'{year}-{month}-{day}', "%Y-%m-%d") create_date_udf = udf(create_date, DateType()) df = (stations_months_df .repartition(200, 'year', 'month') .withColumn('weather', get_station_weather_month_udf('station_id', 'year', 'month')) .select('station_id', 'year', 'month', explode('weather')) .select('station_id', create_date_udf('year', 'month', c['day']).alias('date'), c['hour'].alias('hour'), c['dew_point_temp'].alias('dew_point_temp'), c['rel_hum'].alias('rel_hum'), c['wind_dir'].alias('wind_dir'), c['wind_spd'].alias('wind_spd'), c['visibility'].alias('visibility'), c['stn_press'].alias('stn_press'), c['hmdx'].alias('hmdx'), c['wind_chill'].alias('wind_chill'), c['temp'].alias('temp'), c['risky_weather'].alias('risky_weather'))) # We make a moving average of risky_weather since the effect of a risky # weather is distributed in the next hours def weighted_average(c, window, offsets, weights): def value(i): return lag(c, -i).over(window) values = [coalesce(value(i) * w, lit(0)) for i, w in zip(offsets, weights)] return sum(values, lit(0)) window = (Window .partitionBy('station_id') .orderBy('date')) offsets = range(-23, 1) weights = [exp(0.5*t) for t in offsets] weights = [w/sum(weights) for w in weights] df = df.withColumn('risky_weather', weighted_average('risky_weather', window, offsets, weights)) df.write.parquet(cache_file) return df