Example #1
    def test_pandas_udf_arrow_overflow(self):
        from distutils.version import LooseVersion
        import pandas as pd
        import pyarrow as pa

        df = self.spark.range(0, 1)

        @pandas_udf(returnType="byte")
        def udf(column):
            return pd.Series([128])

        # Arrow 0.11.0+ allows enabling or disabling safe type check.
        if LooseVersion(pa.__version__) >= LooseVersion("0.11.0"):
            # When enabling safe type check, Arrow 0.11.0+ disallows overflow cast.
            with self.sql_conf({
                    "spark.sql.execution.pandas.arrowSafeTypeConversion": True}):
                with self.assertRaisesRegexp(Exception,
                                             "Exception thrown when converting pandas.Series"):
                    df.withColumn('udf', udf('id')).collect()

            # Disabling safe type check, let Arrow do the cast anyway.
            with self.sql_conf({"spark.sql.execution.pandas.arrowSafeTypeConversion": False}):
                df.withColumn('udf', udf('id')).collect()
        else:
            # The SQL config `arrowSafeTypeConversion` does not matter for older Arrow.
            # Overflow cast causes an error.
            with self.sql_conf({"spark.sql.execution.pandas.arrowSafeTypeConversion": False}):
                with self.assertRaisesRegexp(Exception,
                                             "Integer value out of bounds"):
                    df.withColumn('udf', udf('id')).collect()
Example #2
def make_prediction(event, df):
    event_timestamp, event_dayofweek, pickup_lat, pickup_lon, dropoff_lat, dropoff_lon, event_passenger_count = event[0], event[1], event[2], event[3], event[4], event[5], event[6]
    udf_diff_timeofday=udf(utils.diff_timeofday, IntegerType())
    udf_shortest_distance=udf(utils.shortest_distance, FloatType())
    df = df.withColumn("diff_timeofday", udf_diff_timeofday(df.pickup, lit(event_timestamp))).filter("`diff_timeofday` < 30")
    df = df.withColumn("event_sum_distance",
        udf_shortest_distance(df.pick_lat, df.pick_lon, lit(pickup_lat), lit(pickup_lon))+udf_shortest_distance(df.drop_lat, df.drop_lon, lit(dropoff_lat), lit(dropoff_lon))).filter("`event_sum_distance` < 2")
    df = df.sort('event_sum_distance')
    if df.count() < 10:
        return [0,0]
    a = pd.DataFrame(df.take(50))
    a.columns = df.columns

    speed_array = a.as_matrix(["avg_speed"])
    dist_sf_array = a.as_matrix(["dist_sf"])
    distance_array = a["trip_distance"].tolist()
    fare_array = a["total_notip"].tolist()
    time_array = a["trip_time_in_secs"].tolist()

    #set initial parameter values
    x0 = [0.5, 0.5, 3.0, 3.0]
    bnds = ((0.25, 0.75), (0.25, 0.75), (0.1,20), (0,10))
    
    #perform the fit
    res = optimize.minimize(func_to_optimize, x0, args=(distance_array, time_array, fare_array), method='TNC', bounds=bnds)
    grid_dist = utils.grid_distance(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

    #get the predictions
    time_pred = utils.time_prediction(speed_array.mean(), grid_dist, dist_sf_array.mean())
    fare_pred = utils.fare_prediction(res.x[0], grid_dist, dist_sf_array.mean(), res.x[1], res.x[2], res.x[3])
    if res.success == True:
        return [fare_pred, time_pred]
    else:
        return [0,0]
Example #3
    def test_pandas_udf_detect_unsafe_type_conversion(self):
        from distutils.version import LooseVersion
        import pandas as pd
        import numpy as np
        import pyarrow as pa

        values = [1.0] * 3
        pdf = pd.DataFrame({'A': values})
        df = self.spark.createDataFrame(pdf).repartition(1)

        @pandas_udf(returnType="int")
        def udf(column):
            return pd.Series(np.linspace(0, 1, 3))

        # Since 0.11.0, PyArrow supports the feature to raise an error for unsafe cast.
        if LooseVersion(pa.__version__) >= LooseVersion("0.11.0"):
            with self.sql_conf({
                    "spark.sql.execution.pandas.arrowSafeTypeConversion": True}):
                with self.assertRaisesRegexp(Exception,
                                             "Exception thrown when converting pandas.Series"):
                    df.select(['A']).withColumn('udf', udf('A')).collect()

        # Disabling Arrow safe type check.
        with self.sql_conf({
                "spark.sql.execution.pandas.arrowSafeTypeConversion": False}):
            df.select(['A']).withColumn('udf', udf('A')).collect()
Example #4
    def cat2Num(self, df, indices):
        '''sbaronia - extract the categorical data and make df out of it
        so oneHotEncoding can be run on them'''
        protocol_ind0 = df.select(df.id,df.rawFeatures[indices[0]].alias("features0")).cache()
        protocol_ind1 = df.select(df.id,df.rawFeatures[indices[1]].alias("features1")).cache()

        ind0_enc = self.oneHotEncoding(protocol_ind0,"features0").cache()
        ind1_enc = self.oneHotEncoding(protocol_ind1,"features1").cache()
        
        '''sbaronia - add those hot encoded features columns to original df'''
        int_join_1 = df.join(ind0_enc, ind0_enc.id == df.id, 'inner').drop(ind0_enc.id).cache()
        int_join_2 = int_join_1.join(ind1_enc, int_join_1.id == ind1_enc.id, 'inner').drop(int_join_1.id).cache()

        '''sbaronia - now create a new column features which has 
        converted vector form and drop rest columns'''
        comb_udf = udf(replaceCat2Num,StringType())
        int_join_2 = int_join_2.select(int_join_2.id,int_join_2.rawFeatures, \
                                       comb_udf(int_join_2.rawFeatures, \
                                       int_join_2.num_features0, \
                                       int_join_2.num_features1).alias("features")).cache()
        
        '''sbaronia - convert list of numerical features to DenseVector
        so they can be used in KMeans'''
        dense_udf = udf(lambda line: DenseVector.parse(line), VectorUDT())
        feat = int_join_2.select(int_join_2.id,int_join_2.rawFeatures,dense_udf(int_join_2.features).alias("features")).cache()
      
        return feat
Example #5
def extract_nlp(sql_sc):
    schema = StructType([
        StructField("title", StringType()),
        StructField("first_sentence", StringType()),
        StructField("infoboxnames", StringType()),
        StructField("urlwords", StringType())
    ])

    noun_ex = udf(extract_nouns, StringType())
    pos_ex = udf(extract_hypernyms, StringType())

    df = sql_sc.read \
        .format("com.databricks.spark.csv") \
        .schema(schema) \
        .option("header", "false") \
        .option("quotechar", '|') \
        .option("delimiter", ',') \
        .load(DATAP + '/dump/articles_annotated_pre.csv')
    df = df.withColumn("nouns", noun_ex(df.first_sentence))
    df = df.withColumn("pos_hyps", pos_ex(df.first_sentence))
    df['title', 'urlwords', 'infoboxnames', 'nouns', 'pos_hyps'].write \
        .format("com.databricks.spark.csv") \
        .option("header", "false") \
        .option("quotechar", '|') \
        .option("delimiter", ",") \
        .csv(DATAP + '/dump/articles_annotated')
Example #6
def extract_pre(sql_sc):
    schema = StructType([
        StructField("title", StringType()),
        StructField("text", StringType())
    ])
    first_ex = udf(extract_first_sentence, StringType())
    inf_ex2 = udf(extract_names, StringType())
    url_ex = udf(extract_urlpattern, StringType())

    df = sql_sc.read \
        .format("com.databricks.spark.csv") \
        .schema(schema) \
        .option("header", "false") \
        .option("quotechar", '|') \
        .option("delimiter", ',') \
        .load(DATAP + '/dump/articles.csv')
    #df = df.repartition(100)
    df = df.withColumn("first_sentence", first_ex(df.text))
    df = df.withColumn('infoboxnames', inf_ex2(df.text))
    df = df.withColumn("urlwords", url_ex(df.title))
    df['title', 'first_sentence', 'infoboxnames', 'urlwords'].write \
        .format("com.databricks.spark.csv") \
        .option("header", "false") \
        .option("quotechar", '|') \
        .option("delimiter", ",") \
        .csv(DATAP + '/dump/articles_annotated_pre')
Example #7
def text_features(p_df):
    """
    Extracts features derived from the quora question texts.
    :param p_df: A DataFrame.
    :return: A DataFrame.  
    """
    diff_len = udf(lambda arr: arr[0] - arr[1], IntegerType())
    common_words = udf(lambda arr: len(set(arr[0]).intersection(set(arr[1]))), IntegerType())
    unique_chars = udf(lambda s: len(''.join(set(s.replace(' ', '')))), IntegerType())


    p_df = p_df.withColumn("len_q1", length("question1")).withColumn("len_q2", length("question2"))
    p_df = p_df.withColumn("diff_len", diff_len(array("len_q1", "len_q2")))
    p_df = p_df.withColumn("words_q1", size("question1_words")).withColumn("words_q2", size("question2_words"))
    p_df = p_df.withColumn("common_words", common_words(array("question1_words", "question2_words")))
    p_df = p_df.withColumn(
        "unique_chars_q1", unique_chars("question1")
    ).withColumn("unique_chars_q2", unique_chars("question2"))

    assembler = VectorAssembler(
        inputCols=["len_q1", "len_q2", "diff_len", "words_q1", "words_q2", "common_words", "unique_chars_q1", "unique_chars_q2"],
        outputCol="text_features"
    )
    p_df = assembler.transform(p_df)    
    return p_df
Example #8
    def test_udf_in_generate(self):
        from pyspark.sql.functions import udf, explode
        df = self.spark.range(5)
        f = udf(lambda x: list(range(x)), ArrayType(LongType()))
        row = df.select(explode(f(*df))).groupBy().sum().first()
        self.assertEqual(row[0], 10)

        df = self.spark.range(3)
        res = df.select("id", explode(f(df.id))).collect()
        self.assertEqual(res[0][0], 1)
        self.assertEqual(res[0][1], 0)
        self.assertEqual(res[1][0], 2)
        self.assertEqual(res[1][1], 0)
        self.assertEqual(res[2][0], 2)
        self.assertEqual(res[2][1], 1)

        range_udf = udf(lambda value: list(range(value - 1, value + 1)), ArrayType(IntegerType()))
        res = df.select("id", explode(range_udf(df.id))).collect()
        self.assertEqual(res[0][0], 0)
        self.assertEqual(res[0][1], -1)
        self.assertEqual(res[1][0], 0)
        self.assertEqual(res[1][1], 0)
        self.assertEqual(res[2][0], 1)
        self.assertEqual(res[2][1], 0)
        self.assertEqual(res[3][0], 1)
        self.assertEqual(res[3][1], 1)
Example #9
    def test_udf_wrapper(self):
        from pyspark.sql.functions import udf
        from pyspark.sql.types import IntegerType

        def f(x):
            """Identity"""
            return x

        return_type = IntegerType()
        f_ = udf(f, return_type)

        self.assertTrue(f.__doc__ in f_.__doc__)
        self.assertEqual(f, f_.func)
        self.assertEqual(return_type, f_.returnType)

        class F(object):
            """Identity"""
            def __call__(self, x):
                return x

        f = F()
        return_type = IntegerType()
        f_ = udf(f, return_type)

        self.assertTrue(f.__doc__ in f_.__doc__)
        self.assertEqual(f, f_.func)
        self.assertEqual(return_type, f_.returnType)

        f = functools.partial(f, x=1)
        return_type = IntegerType()
        f_ = udf(f, return_type)

        self.assertTrue(f.__doc__ in f_.__doc__)
        self.assertEqual(f, f_.func)
        self.assertEqual(return_type, f_.returnType)
Example #10
 def setup_method(self, method):
     sparkConf = create_spark_conf().setMaster("local[4]")\
         .setAppName("test wide and deep")
     self.sc = init_nncontext(sparkConf)
     self.sqlContext = SQLContext(self.sc)
     data_path = os.path.join(os.path.split(__file__)[0], "../../resources/recommender")
     categorical_gender_udf = udf(lambda gender:
                                  categorical_from_vocab_list(gender, ["F", "M"], start=1))
     bucket_udf = udf(lambda feature1, feature2:
                      hash_bucket(str(feature1) + "_" + str(feature2), bucket_size=100))
     self.data_in = self.sqlContext.read.parquet(data_path) \
         .withColumn("gender", categorical_gender_udf(col("gender")).cast("int")) \
         .withColumn("occupation-gender",
                     bucket_udf(col("occupation"), col("gender")).cast("int"))
     self.column_info = ColumnFeatureInfo(
         wide_base_cols=["occupation", "gender"],
         wide_base_dims=[21, 3],
         wide_cross_cols=["occupation-gender"],
         wide_cross_dims=[100],
         indicator_cols=["occupation", "gender"],
         indicator_dims=[21, 3],
         embed_cols=["userId", "itemId"],
         embed_in_dims=[100, 100],
         embed_out_dims=[20, 20],
         continuous_cols=["age"])
Example #11
 def test_nondeterministic_udf(self):
     # Test that nondeterministic UDFs are evaluated only once in chained UDF evaluations
     import random
     udf_random_col = udf(lambda: int(100 * random.random()), IntegerType()).asNondeterministic()
     self.assertEqual(udf_random_col.deterministic, False)
     df = self.spark.createDataFrame([Row(1)]).select(udf_random_col().alias('RAND'))
     udf_add_ten = udf(lambda rand: rand + 10, IntegerType())
     [row] = df.withColumn('RAND_PLUS_TEN', udf_add_ten('RAND')).collect()
     self.assertEqual(row[0] + 10, row[1])
Example #12
    def select_prediction_udf(self, column):
        if column not in self.get_output_names():
            raise ValueError("Column '" + column + "' is not defined as the output column in MOJO Pipeline.")

        if self.get_named_mojo_output_columns():
            func = udf(lambda d: d, DoubleType())
            return func("prediction." + column).alias(column)
        else:
            idx = self.get_output_names().index(column)
            func = udf(lambda arr: arr[idx], DoubleType())
            return func("prediction.preds").alias(column)
Example #13
def processMSC():
    """
    Parses MSC records as per defined rules
    :return: Records returned in pipe-delimited format
    """
    # Assumption: MSC folder under the provided input path
    inputDir = os.path.join(args.inputdir, "INPUT")
    lines = sc.textFile(inputDir)

    # Call the parsing function
    parsedMSCLines = lines.map(parseMSCRecords)

    # The schema is encoded in a string.
    schemaString = "RecordType FirstNum SecondNum CallDate CallHour Duration StartTower StartLAC CallType"
    fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split()]
    schema = StructType(fields)

    # Apply the schema to the RDD.
    schemaData = sqlContext.createDataFrame(parsedMSCLines, schema)
    
    modify_phone_number_udf = udf(mod_number, StringType())
    ph_num_mod = schemaData.select(
        schemaData.RecordType,
        modify_phone_number_udf(schemaData.FirstNum).alias('FirstNum'),
        modify_phone_number_udf(schemaData.SecondNum).alias('SecondNum'),
        schemaData.CallDate,
        schemaData.CallHour,
        schemaData.Duration,
        schemaData.StartTower,
        schemaData.StartLAC,
        schemaData.CallType)

    get_phone_type_udf = udf(get_phone_type, StringType())

    first_ph_type = ph_num_mod.withColumn('FirstPhoneType', get_phone_type_udf(ph_num_mod.FirstNum))

    sec_ph_type = first_ph_type.withColumn('SecondPhoneType', get_phone_type_udf(first_ph_type.SecondNum))

    final_df = sec_ph_type.select(
        sec_ph_type.RecordType,
        sec_ph_type.FirstNum,
        sec_ph_type.SecondNum,
        sec_ph_type.CallDate,
        sec_ph_type.CallHour,
        sec_ph_type.Duration,
        sec_ph_type.StartTower,
        sec_ph_type.StartLAC,
        sec_ph_type.CallType,
        F.when(sec_ph_type.FirstPhoneType.isin(["mobile", "landline", "shortcode"])
               & sec_ph_type.SecondPhoneType.isin(["mobile", "landline", "shortcode"]), "National")
            .otherwise("International").alias('PhoneType'))

    final_df.show()
Example #14
    def cat2Num(self, df, indices):
        unique_values = []
        for i in indices:
            d = udf(lambda r: r[i], StringType())
            dt = df.select(d(df.rawFeatures)).distinct().collect()
            unique_values.extend(dt)

        unique_count = len(unique_values)
        convertUDF = udf(lambda r: to_onehot(r, indices, unique_values, unique_count), ArrayType(DoubleType()))
        newdf = df.withColumn("features", convertUDF(df.rawFeatures))

        return newdf
Example #15
    def test_complex_nested_udt_in_df(self):
        from pyspark.sql.functions import udf

        schema = StructType().add("key", LongType()).add("val", PythonOnlyUDT())
        df = self.spark.createDataFrame(
            [(i % 3, PythonOnlyPoint(float(i), float(i))) for i in range(10)],
            schema=schema)
        df.collect()

        gd = df.groupby("key").agg({"val": "collect_list"})
        gd.collect()
        udf = udf(lambda k, v: [(k, v[0])], ArrayType(df.schema))
        gd.select(udf(*gd)).collect()
Example #16
    def test_nonparam_udf_with_aggregate(self):
        import pyspark.sql.functions as f

        df = self.spark.createDataFrame([(1, 2), (1, 2)])
        f_udf = f.udf(lambda: "const_str")
        rows = df.distinct().withColumn("a", f_udf()).collect()
        self.assertEqual(rows, [Row(_1=1, _2=2, a=u'const_str')])
Example #17
def find_candidate_matches(match_field_value, data_df, match_column, id_column, sqlContext):
    # Rename the columns we are working on
    # (id, match_col, col1, col2,..., [ngram1, ngram2,...])
    udf_n_grams = udf(lambda r: ','.join(n_grams(2, r)), StringType())
    data_with_ngrams_df = data_df.withColumn('ngrams', udf_n_grams(match_column)).\
        withColumnRenamed(match_column, 'match_col').\
        withColumnRenamed(id_column, 'id').cache()

    identical_removed_df = data_with_ngrams_df.dropDuplicates(['match_col'])
    id_ngram_rdd = identical_removed_df.flatMap(lambda r: unpack_n_grams(r.id, r.match_col, r.ngrams))  # (id, match_col,ngram1), (id, match_col, ngram2),...

    id_ngram_schema = StructType([StructField('id', IntegerType(), True),
                                  StructField('match_col', StringType(), True),
                                  StructField('ngram', StringType(), True)])

    id_ngram_df = sqlContext.createDataFrame(id_ngram_rdd, id_ngram_schema)  # (id, match_col, ngram1)

    n_grams_str = ','.join(['\"' + s + '\"' for s in n_grams(2, match_field_value)])

    filtered_id_ngram_df = id_ngram_df.filter('ngram in (%s)'%n_grams_str)\
        .dropDuplicates(['match_col'])\
        .cache()

    sqlContext.registerDataFrameAsTable(filtered_id_ngram_df, 'maintable')
    sqlContext.registerFunction('mySim', lambda x: jaccard_similarity(x, '\"%s\"'%match_field_value), DoubleType())

    candidate_match_df = sqlContext.sql('select id from maintable where mySim(match_col) >=0.5')

    return [r.id for r in candidate_match_df.collect()]
Example #18
 def test_udf_with_order_by_and_limit(self):
     from pyspark.sql.functions import udf
     my_copy = udf(lambda x: x, IntegerType())
     df = self.spark.range(10).orderBy("id")
     res = df.select(df.id, my_copy(df.id).alias("copy")).limit(1)
     res.explain(True)
     self.assertEqual(res.collect(), [Row(id=0, copy=0)])
Example #19
def call_speech_vocab(df):
    para_cleanup_udf=udf(speech_vocabulary, FloatType())
    df_with_vocab_score = df.withColumn('vocab_score', para_cleanup_udf(df['para_cleaned_text']))
    print(df_with_vocab_score.printSchema())
    print(df_with_vocab_score.show(3))
    print(df_with_vocab_score.count())
    return df_with_vocab_score
Example #20
    def test_udf_with_aggregate_function(self):
        df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        from pyspark.sql.functions import udf, col, sum
        from pyspark.sql.types import BooleanType

        my_filter = udf(lambda a: a == 1, BooleanType())
        sel = df.select(col("key")).distinct().filter(my_filter(col("key")))
        self.assertEqual(sel.collect(), [Row(key=1)])

        my_copy = udf(lambda x: x, IntegerType())
        my_add = udf(lambda a, b: int(a + b), IntegerType())
        my_strlen = udf(lambda x: len(x), IntegerType())
        sel = df.groupBy(my_copy(col("key")).alias("k"))\
            .agg(sum(my_strlen(col("value"))).alias("s"))\
            .select(my_add(col("k"), col("s")).alias("t"))
        self.assertEqual(sel.collect(), [Row(t=4), Row(t=3)])
Example #21
def visulisationData():
    mindate = 820497600 # 1996
    maxdate = 1104516000 # 2005
    # request.arg.get('nbins')
    binborders = list(map(int,list(np.linspace(start=mindate,stop=maxdate,num=200,endpoint=False))))
    def f(x):
        for i in range(len(binborders)):
            if x < binborders[i]:
                return binborders[i-1]
        return binborders[-1]
    stato = (
        doctopdat
        .filter("timestamp > '" + str(mindate) + "'")
        .filter("timestamp < '" + str(maxdate) + "'")
        .withColumn('bin',udf(f,LongType())(doctopdat['timestamp']))
        .select(['bin'] + ['C' + str(i + 1) for i in range(20)])
    )
    rows = stato.groupBy('bin').sum().sort('bin').collect()
    data = [{'key': 'topic ' + str(i), 'values': []} for i in range(20)]

    for row in rows:
        for i in range(20):
            col = 'sum(C' + str(i + 1) + ')'
            data[i]['values'].append([row['bin'], row[col]])

    return jsonify(data=data)
Example #22
    def toPandas(self, df):
        """
        This is similar to the Spark DataFrame built-in toPandas() method, but it handles
        MLlib Vector columns differently.  It converts MLlib Vectors into rows of
        scipy.sparse.csr_matrix, which is generally friendlier for PyData tools like scikit-learn.

        .. note:: Experimental: This will likely be replaced in later releases with improved APIs.

        :param df: Spark DataFrame
        :return:  Pandas dataframe
        """
        cols = df.columns
        # Convert any MLlib Vector columns to scipy.sparse.csr_matrix
        matrixCols = []
        def toscipy(v):
            if isinstance(v, DenseVector):
                return csr_matrix((v.values, np.array(range(v.size)), np.array([0, v.size])),
                                  shape=(1, v.size))
            elif isinstance(v, SparseVector):
                return csr_matrix((v.values, v.indices, np.array([0, len(v.indices)])),
                                  shape=(1, v.size))
            else:
                raise TypeError("Converter.toPandas found unknown Vector type: %s" % type(v))
        tosparse = udf(lambda v: toscipy(v), CSRVectorUDT())
        for i in range(len(cols)):
            c = cols[i]
            if isinstance(df.schema.fields[i].dataType, VectorUDT):
                cols[i] = tosparse(df[c]).alias(c)
                matrixCols.append(c)
            else:
                cols[i] = df[c]
        return df.select(*cols).toPandas()
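For orientation, a minimal usage sketch of the converter above; the `converter` instance, the sample data, and the column names are assumptions for illustration, not part of the original snippet.

from pyspark.sql import SparkSession
from pyspark.mllib.linalg import Vectors

spark = SparkSession.builder.getOrCreate()
# Toy DataFrame with an MLlib Vector column named "features" (made up for this sketch).
vec_df = spark.createDataFrame(
    [(0, Vectors.dense([1.0, 2.0])), (1, Vectors.sparse(2, [0], [3.0]))],
    ["id", "features"])

# Assuming `converter` is an instance of the class that defines toPandas() above:
# pdf = converter.toPandas(vec_df)
# pdf["features"].iloc[0] would then be a 1xN scipy.sparse.csr_matrix rather than a Vector.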
Example #23
 def test_udf_in_subquery(self):
     f = udf(lambda x: x, "long")
     with self.tempView("v"):
         self.spark.range(1).filter(f("id") >= 0).createTempView("v")
         sql = self.spark.sql
         result = sql("select i from values(0L) as data(i) where i in (select id from v)")
         self.assertEqual(result.collect(), [Row(i=0)])
Example #24
 def test_udf_in_filter_on_top_of_outer_join(self):
     from pyspark.sql.functions import udf
     left = self.spark.createDataFrame([Row(a=1)])
     right = self.spark.createDataFrame([Row(a=1)])
     df = left.join(right, on='a', how='left_outer')
     df = df.withColumn('b', udf(lambda x: 'x')(df.a))
     self.assertEqual(df.filter('b = "x"').collect(), [Row(a=1, b='x')])
Example #25
def call_utf_encoder(df):
    utf_encoder_udf=udf(unicode_encode, StringType())
    df_cleaned = df.withColumn('speech_text_utf', utf_encoder_udf(df['text'])).drop(df['text'])
    print(df_cleaned.printSchema())
    print(df_cleaned.show(10))
    print(df_cleaned.count())
    return df_cleaned
Example #26
def call_para_cleanup(df):
    para_cleanup_udf=udf(para_segmenter_and_cleanup, StringType())
    df_cleaned = df.withColumn('para_cleaned_text', para_cleanup_udf(df['speech_text_utf'])).drop(df['speech_text_utf'])
    print(df_cleaned.printSchema())
    print(df_cleaned.show(10))
    print(df_cleaned.count())
    return df_cleaned
Example #27
    def test_udf_with_udt(self):
        row = Row(label=1.0, point=ExamplePoint(1.0, 2.0))
        df = self.spark.createDataFrame([row])
        self.assertEqual(1.0, df.rdd.map(lambda r: r.point.x).first())
        udf = UserDefinedFunction(lambda p: p.y, DoubleType())
        self.assertEqual(2.0, df.select(udf(df.point)).first()[0])
        udf2 = UserDefinedFunction(lambda p: ExamplePoint(p.x + 1, p.y + 1), ExamplePointUDT())
        self.assertEqual(ExamplePoint(2.0, 3.0), df.select(udf2(df.point)).first()[0])

        row = Row(label=1.0, point=PythonOnlyPoint(1.0, 2.0))
        df = self.spark.createDataFrame([row])
        self.assertEqual(1.0, df.rdd.map(lambda r: r.point.x).first())
        udf = UserDefinedFunction(lambda p: p.y, DoubleType())
        self.assertEqual(2.0, df.select(udf(df.point)).first()[0])
        udf2 = UserDefinedFunction(lambda p: PythonOnlyPoint(p.x + 1, p.y + 1), PythonOnlyUDT())
        self.assertEqual(PythonOnlyPoint(2.0, 3.0), df.select(udf2(df.point)).first()[0])
Example #28
def read_receipts(path, ids=None):
    """
    read receipts data from signals
    :param path:
    :return:
    """
    lines = sc.textFile(path)
    receipts = lines.map(lambda x: x.split('|'))
    print(receipts.take(10))
    # print receipts.take(10)
    schema_str = """cust_id product_id variant_id division_id source_id qty date origin price discount_type used_for_recs
    date_modified"""
    fields = [StructField(field_name, StringType(), True) for field_name in schema_str.split()]
    schema = StructType(fields)
    receipts_df = sqlContext.createDataFrame(receipts, schema)
    udfstring_to_date = udf(parse_created_on, IntegerType())
    # udfstring_to_date = udf(parse, DateType())
    receipts_df = (receipts_df
                   .withColumn("delta_date", udfstring_to_date("date")))
    receipts_out = (receipts_df
                    .filter(receipts_df.qty > 1)
                    .select(receipts_df.cust_id, receipts_df.qty, receipts_df.price, receipts_df.delta_date)
                    # .pivot() not available until Spark 1.6
                    .groupby("cust_id").agg({"price": "mean", "qty": "mean", "delta_date": "mean"})
                    )
    fig, axes = plt.subplots(nrows=1, ncols=1)
    axes.set_title('receipts')
    receipts_out.toPandas().boxplot(ax=axes)
    plt.show()
    receipts_out.show(n=30)
    return receipts_df
Example #29
 def test_udf_in_filter_on_top_of_join(self):
     # regression test for SPARK-18589
     left = self.spark.createDataFrame([Row(a=1)])
     right = self.spark.createDataFrame([Row(b=1)])
     f = udf(lambda a, b: a == b, BooleanType())
     df = left.crossJoin(right).filter(f("a", "b"))
     self.assertEqual(df.collect(), [Row(a=1, b=1)])
Example #30
    def splitStrCol(self, column, featureNames, mark):
        """This functions split a column into different ones. In the case of this method, the column provided should
        be a string of the following form 'word,foo'.

        :param column       Name of the target column, this column is going to be replaced.
        :param featureNames     List of strings of the new column names after splitting the strings.
        :param mark         String that specifies the splitting mark of the string, this frequently is ',' or ';'.
        """

        # Check if column argument is a string datatype:
        self.__assertTypeStr(column, "column")

        # Check if mark argument is a string datatype:
        self.__assertTypeStr(mark, "mark")

        assert (column in self.__df.columns), "Error: column specified does not exist in dataFrame."

        assert (type(featureNames) == type([])), "Error: featureNames must be a list of strings."

        # Setting a udf that split the string into a list of strings.
        # This is "word, foo" ----> ["word", "foo"]
        func = udf(lambda x: x.split(mark), ArrayType(StringType()))

        self.__df = self.__df.withColumn(column, func(col(column)))
        self.undoVecAssembler(column=column, featureNames=featureNames)
        self.__addTransformation()  # checkpoint in case

        return self
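As a self-contained sketch of the split-by-mark udf that splitStrCol builds internally (the DataFrame, column name, and values below are made up for illustration; the wrapping class instance is omitted):

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, StringType

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("word,foo",)], ["pair"])

mark = ","
# Same idea as the udf inside splitStrCol: "word,foo" -> ["word", "foo"]
split_udf = udf(lambda x: x.split(mark), ArrayType(StringType()))
df.withColumn("pair", split_udf(col("pair"))).show()
# The "pair" column now holds ["word", "foo"] as an array<string>.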
Example #31
# In[23]:


dictCategories3


# In[24]:


from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType
def categoriesToInt(cat):
    return dictCategories3[cat]

udfCategoriesToInt = udf(categoriesToInt, IntegerType())


# In[25]:


df3_incoded = df3.withColumn("authentication_orientation", udfCategoriesToInt("authentication_orientation") )


# In[26]:


df3_incoded.limit(10).toPandas()


# In[27]:
Example #32
import random
from datetime import datetime, timedelta
import pyspark.sql.functions as F
import uuid
import sys

output_path = sys.argv[1]


def rand_date():
    delta = random.randint(0, 10000)
    dt = datetime.now() - timedelta(days=delta)
    return dt.date().isoformat()


def rand_string():
    return str(uuid.uuid4())


df = sqlContext.range(1000000, 2000000)
df = df.select('id',
               F.udf(rand_date)().alias('as_of'),
               F.rand().alias('a'),
               F.rand().alias('b'),
               F.rand().alias('c')).orderBy(F.asc('as_of'))
df.write.json(output_path)
Example #33
    """
    function to assign every timestamp in data to the time interval it belongs to
    a timestamp belongs to a specific time interval means:
    start_time of this time interval <= timestamp < start_time of this time interval
    """
    timestamp = datetime.strptime(timestamp, '%H:%M:%S')
    for i in range(len(endPoints)):
        if timestamp < endPoints[i]:
            break
    return i

from pyspark.sql import functions as f
from pyspark.sql import Window
from pyspark.sql.functions import col, lag
from pyspark.sql.types import StringType

# convert into a user defined function so that it can be applied to a pyspark dataframe
udf_assign_interval = f.udf(assign_interval, StringType())

data = data.withColumn('INTERVAL',udf_assign_interval('TIME'))

# calculate mean price of each time interval according to symbol, date, interval
data = data.groupby(['SYMBOL','DATE','INTERVAL']).agg({'PRICE':'mean'})
data = data.withColumnRenamed('avg(PRICE)', 'AVG_PRICE')

data = data.select(data.SYMBOL,data.DATE,data.INTERVAL.cast('double'),data.AVG_PRICE)

data = data.orderBy(["SYMBOL","DATE","INTERVAL"], ascending=[1, 1])

# apply window function to get previous time interval avg_price
w = Window().partitionBy(col('SYMBOL')).orderBy([col('SYMBOL'),col('DATE'),col('INTERVAL')])
data = data.select("*", lag('AVG_PRICE').over(w).alias('PRE_AVG_PRICE'))
Example #34
def process_log_data(spark, input_data, output_data):
    """
    Purpose:
        Process log/event data using Spark and output as parquet to S3


    Description: 
        Download JSON log/event data from S3 bucket and set up temporary view
        so Spark SQL queries may be run on the dataset and then again loaded back 
        
    Parameters:
        spark       : Spark Session
        input_data  : location of song_data JSON files with metadata about events
        output_data : dimensional tables in parquet format will be stored
    """

    # read log data file
    print('Read log data from JSON file')
    df = spark.read.json(os.path.join(input_data, "log_data/*/*/*.json"))
    print(df.count())
    df.printSchema()

    # create temp view for Spark SQL queries
    print('create temp view for Spark SQL queries - logs_data')
    df.createOrReplaceTempView("logs_data")

    # filter by actions for song plays
    songplays_table = df['ts', 'userId', 'level', 'sessionId', 'location',
                         'userAgent']

    # extract columns for users table
    print('Users table: ')
    users_table = spark.sql("""
                              SELECT DISTINCT userId, firstName, lastName, gender, level
                              FROM logs_data 
                              WHERE page = 'NextSong' AND userId IS NOT NULL
                              """).dropDuplicates(['userId'])
    print(users_table.limit(5).toPandas())

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users.parquet'),
                              'overwrite')
    print("users.parquet completed")

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    df = df.withColumn('timestamp', get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0)))
    df = df.withColumn("datetime", get_datetime(df.ts))

    # create temp view for SQL queries
    print('create temp view for Spark SQL queries - time_data')
    df.createOrReplaceTempView("time_data")

    # extract columns to create time table
    print('Time table: ')
    time_table = spark.sql("""
                           SELECT DISTINCT datetime as start_time,
                               hour(datetime) AS hour,
                               day(datetime) AS day,
                               weekofyear(datetime) AS week,
                               month(datetime) AS month,
                               year(datetime) AS year,
                               dayofweek(datetime) AS weekday
                           FROM time_data
                           """)
    print(time_table.limit(5).toPandas())

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'time.parquet'), 'overwrite')
    print("time.parquet completed")

    # read in song data to use for songplays table
    song_df = spark.read.json(
        os.path.join(input_data, "song-data/A/A/A/*.json"))

    # create temp view for Spark SQL queries
    print('create temp view for Spark SQL queries - song_df')
    song_df.createOrReplaceTempView("songs_data")

    # extract columns from joined song and log datasets to create songplays table
    print('Songplays table: ')
    songplays_table = spark.sql("""
                                    SELECT monotonically_increasing_id() AS songplay_id,
                                        to_timestamp(logs_data.ts/1000) AS start_time,
                                        month(to_timestamp(logs_data.ts/1000)) AS month,
                                        year(to_timestamp(logs_data.ts/1000)) AS year,
                                        logs_data.userId AS user_id,
                                        logs_data.level AS level,
                                        songs_data.song_id AS song_id,
                                        songs_data.artist_id AS artist_id,
                                        logs_data.sessionId AS session_id,
                                        logs_data.location AS location,
                                        logs_data.userAgent AS user_agent
                                    FROM logs_data 
                                    JOIN songs_data ON logs_data.artist = songs_data.artist_name
                                """)
    print(songplays_table.limit(5).toPandas())

    #songplays_table = songplays_table.selectExpr("start_time")
    songplays_table.select(
        monotonically_increasing_id().alias('songplay_id')).collect()

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month').parquet(
        os.path.join(output_data, 'songplays.parquet'), 'overwrite')
    print("songplays.parquet completed")

    print("process_log_data IS DONE")
Example #35
# In[152]:


input_flat = input_f.flatMap(lambda x: x.split(' '))


input_dist = input_flat.distinct()


input_filter = input_dist.filter(lambda x: x is not None and x!="")


# In[153]:


count_non_zero = udf(lambda row: len([x for x in row if x is not None]), IntegerType())  # udf to count non-null columns
sqr = udf(lambda row: sum([x**2 for x in row if x is not None])**0.5, DoubleType())  # udf for the square root of the sum of squares


# In[178]:


#calculating tf

data=input_flat.collect()

dffilt = sqlContext.createDataFrame(input_filter.collect(), StringType())
for i in range(input_1.count()): 
    d1=data.index('~')
    #print(i,d1)
    df = sqlContext.createDataFrame(data[0:d1], StringType())
Example #36
 def test_udf_with_order_by_and_limit(self):
     my_copy = udf(lambda x: x, IntegerType())
     df = self.spark.range(10).orderBy("id")
     res = df.select(df.id, my_copy(df.id).alias("copy")).limit(1)
     self.assertEqual(res.collect(), [Row(id=0, copy=0)])
Example #37
 def test_udf_in_filter_on_top_of_outer_join(self):
     left = self.spark.createDataFrame([Row(a=1)])
     right = self.spark.createDataFrame([Row(a=1)])
     df = left.join(right, on="a", how="left_outer")
     df = df.withColumn("b", udf(lambda x: "x")(df.a))
     self.assertEqual(df.filter('b = "x"').collect(), [Row(a=1, b="x")])
Example #38
spark = SparkSession.builder.appName('cf').getOrCreate()

input_file = '/home/ehsan/WIP/DataMininig/train_rating.txt'
input_file = 'train_rating.txt'
schema = StructType([\
StructField('train_id',IntegerType(), False),\
StructField('user_id', IntegerType(), False),\
StructField('business_id', IntegerType(), False),\
StructField('rating', IntegerType(), False),\
StructField('date', StringType(), False)])

lines = spark.read.csv(input_file, schema=schema, sep=',')
lines = lines.drop('train_id').na.drop()

date2intUDF = udf(date2int, IntegerType())
lines = lines.withColumn(
    'date', date2intUDF('date'))  #Converts the date from string to dateType

#lines = lines.rdd
#parts = lines.map(lambda row: row.value.split("::"))
#ratingsRDD = parts.map(lambda p: Row(user_id=int(p[1]), business_id=int(p[2]),rating=float(p[3]), date=long(p[4])))
#ratings = spark.createDataFrame(ratingsRDD)
#(training, test) = ratings.randomSplit([0.8, 0.2])
(training, test) = lines.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
als = ALS(maxIter=10,
          rank=100,
          regParam=0.01,
          userCol="user_id",
Example #39
def vectorize_searches_F(topics):
    return udf(lambda x: vectorize_searches(x, topics),
               MapType(IntegerType(), IntegerType()))
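Example #39 is a udf factory: it closes over `topics` and returns a udf that is then applied to a column. A self-contained sketch of that pattern follows, with a stand-in vectorize_searches; the stand-in logic, DataFrame, and column names are assumptions, not the original code.

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType, MapType

def vectorize_searches(x, topics):
    # Stand-in: count occurrences of each topic id in a list of search topic ids.
    return {t: x.count(t) for t in topics}

def vectorize_searches_F(topics):
    return udf(lambda x: vectorize_searches(x, topics),
               MapType(IntegerType(), IntegerType()))

spark = SparkSession.builder.getOrCreate()
searches_df = spark.createDataFrame([([1, 2, 2, 3],)], ["searches"])
searches_df.withColumn("search_vector", vectorize_searches_F([1, 2])("searches")).show()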
Example #40
        #string = string.strip(punctuation)

        # tokenize string
        tokens = re.findall(r"[\w'-]+", string)

        # remove stopwords
        tokens = [w for w in tokens if not w in bc_stopwords.value]

        # word stemming
        tokens = list(map(bc_stemmer.value, tokens))
    else:
        tokens = []
    return tokens


smart_tokenize_F = udf(lambda x: smart_tokenize(x), ArrayType(StringType()))


def stringTokenizer(tokens):
    stop_words = stopwords.words('english')
    stemmer = SnowballStemmer("english").stem
    if pd.isnull(tokens):
        tokens = ""
    return smart_tokenize(tokens, stop_words, stemmer)


def combine_text(strings):
    strings = map(str, strings)
    return ' '.join(strings)

Example #41
                                 inferSchema='true')
    bs_df.show()
    print(bs_df.printSchema())

    def valueToCategory(value, encoding_index):
        if (value == encoding_index):
            return 1
        else:
            return 0

    #Explode season column into separate columns such as season_<val> and drop season
    from pyspark.sql.functions import udf
    from pyspark.sql.functions import lit
    from pyspark.sql.types import *
    from pyspark.sql.functions import col
    udfValueToCategory = udf(valueToCategory, IntegerType())
    bs_df_encoded = (bs_df.withColumn(
        "season_1", udfValueToCategory(col('season'), lit(1))).withColumn(
            "season_2", udfValueToCategory(col('season'), lit(2))).withColumn(
                "season_3",
                udfValueToCategory(col('season'), lit(3))).withColumn(
                    "season_4", udfValueToCategory(col('season'), lit(4))))
    bs_df_encoded = bs_df_encoded.drop('season')
    #https://stackoverflow.com/questions/40161879/pyspark-withcolumn-with-two-conditions-and-three-outcomes

    bs_df_encoded = (bs_df_encoded.withColumn(
        "weather_1", udfValueToCategory(col('weather'), lit(1))).withColumn(
            "weather_2",
            udfValueToCategory(col('weather'), lit(2))).withColumn(
                "weather_3",
                udfValueToCategory(col('weather'), lit(3))).withColumn(
Example #42
def to_datetime(x):
    """
    Converts SAS date
  
    Parameters: 
    arg1 (days)    
  
    Returns: 
    date object or None
    """
    try:
        start = dt.datetime(1960, 1, 1).date()
        return start + dt.timedelta(days=int(x))
    except:
        return None


udf_to_datetime_sas = udf(lambda x: to_datetime(x), DateType())


def to_datetimefrstr(x):
    """
    Summary line. 
    Converts date format
  
    Parameters: 
    arg1 (date)
  
    Returns: 
    date object or None
    """
    try:
        return dt.datetime.strptime(x, '%m%d%Y')
Example #43
   else:
        return r

def country_exists(r):
    if r in tofullname:
        return tofullname[r]
    else:
        return ''

def event_exists(r):
    if r in toevent:
        return toevent[r]
    else:
        return ''

c_exists = udf(country_exists,StringType())
e_exists = udf(event_exists,StringType())
dfsub1 = df.withColumn("ActionGeo_CountryCode", c_exists(col("ActionGeo_CountryCode"))) \
           .withColumn("Actor1Type1Code", e_exists(col("Actor1Type1Code")))

sqlContext.registerDataFrameAsTable(dfsub1, 'temp')
df2 = sqlContext.sql("""SELECT ActionGeo_CountryCode,
                               CAST(SQLDATE AS INTEGER), CAST(MonthYear AS INTEGER), CAST(Year AS INTEGER),
                               CASE WHEN Actor1Type1Code = '' AND Actor2Type1Code <> '' THEN Actor2Type1Code
				    ELSE Actor1Type1Code END AS Actor1Type1Code,
                               CAST(NumArticles AS INTEGER),
                               CAST(GoldsteinScale AS INTEGER),
                               CAST(AvgTone AS INTEGER)
                          FROM temp
                         WHERE ActionGeo_CountryCode <> '' AND ActionGeo_CountryCode IS NOT NULL
                            AND Actor1Type1Code <> '' AND Actor1Type1Code IS NOT NULL
Example #44
def gapply(grouped_data, func, schema, *cols):
    """Applies the function ``func`` to data grouped by key. In particular, given a dataframe
    grouped by some set of key columns key1, key2, ..., keyn, this method groups all the values
    for each row with the same key columns into a single Pandas dataframe and by default invokes
    ``func((key1, key2, ..., keyn), values)`` where the number and order of the key arguments is
    determined by columns on which this instance's parent :class:`DataFrame` was grouped and
    ``values`` is a ``pandas.DataFrame`` of columns selected by ``cols``, in that order.

    If there is only one key then the key tuple is automatically unpacked, with
    ``func(key, values)`` called.

    ``func`` is expected to return a ``pandas.DataFrame`` of the specified schema ``schema``,
    which should be of type :class:`StructType` (output columns are of this name and order).

    If ``spark.conf.get("spark.sql.retainGroupColumns")`` is not ``u'true'``, then ``func`` is
    called with an empty key tuple (note it is set to ``u'true'`` by default).

    If no ``cols`` are specified, then all grouped columns will be offered, in the order of the
    columns in the original dataframe. In either case, the Pandas columns will be named
    according to the DataFrame column names.

    The order of the rows passed in as Pandas rows is not guaranteed to be stable relative to
    the original row order.

    :note: Users must ensure that the grouped values for every group must fit entirely in memory.
    :note: This method is only available if Pandas is installed.

    :param grouped_data: data grouped by key
    :param func: a two argument function, which may be either a lambda or named function
    :param schema: the return schema for ``func``, a :class:`StructType`
    :param cols: list of column names (string only)

    :raise ValueError: if ``"*"`` is in ``cols``
    :raise ValueError: if ``cols`` contains duplicates
    :raise ValueError: if ``schema`` is not a :class:`StructType`
    :raise ImportError: if ``pandas`` module is not installed
    :raise ImportError: if ``pandas`` version is too old (less than 0.7.1)

    :return: the new :class:`DataFrame` with the original key columns replicated for each returned
             value in each group's resulting pandas dataframe, the schema being the original key
             schema prepended to ``schema``, where all the resulting groups' rows are concatenated.
             Of course, if retaining group columns is disabled, then the output will exactly match
             ``schema`` since no keys can be prepended.

    >>> import pandas as pd
    >>> from pyspark.sql import SparkSession
    >>> from spark_sklearn.group_apply import gapply
    >>> from spark_sklearn.util import createLocalSparkSession
    >>> spark = createLocalSparkSession()
    >>> df = (spark
    ...     .createDataFrame([Row(course="dotNET", year=2012, earnings=10000),
    ...                       Row(course="Java",   year=2012, earnings=20000),
    ...                       Row(course="dotNET", year=2012, earnings=5000),
    ...                       Row(course="dotNET", year=2013, earnings=48000),
    ...                       Row(course="Java",   year=2013, earnings=30000)])
    ...     .select("course", "year", "earnings"))
    >>> def yearlyMedian(_, vals):
    ...     all_years = set(vals['year'])
    ...     # Note that interpolation is performed, so we need to cast back to int.
    ...     yearly_median = [(year, int(vals['earnings'][vals['year'] == year].median()))
    ...                      for year in all_years]
    ...     return pd.DataFrame.from_records(yearly_median)
    >>> newSchema = StructType().add("year", LongType()).add("median_earnings", LongType())
    >>> gapply(df.groupBy("course"), yearlyMedian, newSchema).orderBy("median_earnings").show()
    +------+----+---------------+
    |course|year|median_earnings|
    +------+----+---------------+
    |dotNET|2012|           7500|
    |  Java|2012|          20000|
    |  Java|2013|          30000|
    |dotNET|2013|          48000|
    +------+----+---------------+
    <BLANKLINE>
    >>> def twoKeyYearlyMedian(_, vals):
    ...     return pd.DataFrame.from_records([(int(vals["earnings"].median()),)])
    >>> newSchema = StructType([df.schema["earnings"]])
    >>> gapply(df.groupBy("course", "year"), twoKeyYearlyMedian, newSchema, "earnings").orderBy(
    ...     "earnings").show()
    +------+----+--------+
    |course|year|earnings|
    +------+----+--------+
    |dotNET|2012|    7500|
    |  Java|2012|   20000|
    |  Java|2013|   30000|
    |dotNET|2013|   48000|
    +------+----+--------+
    <BLANKLINE>
    >>> spark.stop(); SparkSession._instantiatedContext = None
    """
    import pandas as pd
    minPandasVersion = '0.7.1'
    if LooseVersion(pd.__version__) < LooseVersion(minPandasVersion):
        raise ImportError(
            'Pandas installed but version is {}, {} required'.format(
                pd.__version__, minPandasVersion))

    # Do a null aggregation to retrieve the keys first (should be no computation)
    # Also consistent with spark.sql.retainGroupColumns
    keySchema = grouped_data.agg({}).schema
    keyCols = grouped_data.agg({}).columns

    if not cols:
        # Extract the full column list with the parent df
        javaDFName = "org$apache$spark$sql$RelationalGroupedDataset$$df"
        parentDF = java_gateway.get_field(grouped_data._jgd, javaDFName)
        allCols = DataFrame(parentDF, None).columns
        keyColsSet = set(keyCols)
        cols = [col for col in allCols if col not in keyColsSet]

    if "*" in cols:
        raise ValueError("cols expected to contain only singular columns")

    if len(set(cols)) < len(cols):
        raise ValueError("cols expected not to contain duplicate columns")

    if not isinstance(schema, StructType):
        raise ValueError("output schema should be a StructType")

    inputAggDF = grouped_data.agg({col: 'collect_list' for col in cols})
    # Recover canonical order (aggregation may change column order)
    canonicalOrder = chain(
        keyCols, [inputAggDF['collect_list(' + col + ')'] for col in cols])
    inputAggDF = inputAggDF.select(*canonicalOrder)

    # Wraps the user-provided function with another python function, which prepares the
    # input in the form specified by the documentation. Then, once the function completes,
    # this wrapper prepends the keys to the output values and converts from pandas.
    def pandasWrappedFunc(*args):
        nvals = len(cols)
        keys, collectedCols = args[:-nvals], args[-nvals:]
        paramKeys = tuple(keys)
        if len(paramKeys) == 1:
            paramKeys = paramKeys[0]
        valuesDF = pd.DataFrame.from_dict(dict(zip(cols, collectedCols)))
        valuesDF = valuesDF[list(cols)]  # reorder to canonical
        outputDF = func(paramKeys, valuesDF)
        valCols = outputDF.columns.tolist()
        for key, keyName in zip(keys, keyCols):
            outputDF[keyName] = key
        outputDF = outputDF[keyCols + valCols]  # reorder to canonical
        # To recover native python types for serialization, we need
        # to convert the pandas dataframe to a numpy array, then to a
        # native list (can't go straight to native, since pandas will
        # attempt to preserve the numpy type).
        return outputDF.values.tolist()

    keyPrependedSchema = StructType(list(chain(keySchema, schema)))
    outputAggSchema = ArrayType(keyPrependedSchema, containsNull=False)
    pandasUDF = udf(pandasWrappedFunc, outputAggSchema)
    outputAggDF = inputAggDF.select(pandasUDF(*inputAggDF))

    explodedDF = outputAggDF.select(explode(*outputAggDF).alias("gapply"))
    # automatically retrieves nested schema column names
    return explodedDF.select("gapply.*")
Example #45
def process_log_data(spark, input_data, output_data):
    """
    reads log file, filter the records by `NextSong` action.
    converts milliseconds into timestamp.
    fetches dimension specific attributes and insert them into respective table.
    also fills fact table `songplays`.
    :param spark: spark session.
    :param input_data: input data path.
    :param output_data: output data path.
    :return: None.
    """
    # get file path to log data file
    log_data = f'{input_data}/log_data/2018/11/*.json'

    # read log data file
    df = spark.read.json(log_data).dropDuplicates()

    # filter by actions for song plays
    df = df.filter("page='NextSong'")

    # extract columns for users table
    user_table = df.select('userId', 'firstName', 'lastName', 'gender',
                           'level')

    # write users table to parquet files
    user_table.coalesce(1).write.format('json').mode('overwrite').save(
        f'{output_data}/users/')

    # create timestamp column from original timestamp column
    get_timestamp = udf(lambda x: str(int(int(x) / 1000)))
    df = df.withColumn("timestamp", get_timestamp(df.ts))

    # create datetime column from original timestamp column
    get_datetime = udf(lambda x: str(datetime.fromtimestamp(int(x) / 1000.0)))
    df = df.withColumn("datetime", get_datetime(df.ts))

    # extract columns to create time table
    time_table = df.select('timestamp',
                           hour('datetime').alias('hour'),
                           dayofmonth('datetime').alias('day'),
                           weekofyear('datetime').alias('week'),
                           month('datetime').alias('month'),
                           year('datetime').alias('year'),
                           date_format('datetime', 'F').alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.coalesce(1).write.format("json").partitionBy(["year", "month"]) \
        .mode("overwrite") \
        .save(f'{output_data}/time/year=2018')

    # read in song data to use for songplays table
    song_df = spark.read.format('json').load(
        f'{output_data}/songs/year=*/artist_id=*')
    song_df.createOrReplaceTempView('songs')

    artists_songs_df = spark.read.format('json').load(
        f'{output_data}/artists_songs/')

    df.orderBy('artist').show(20)
    artists_songs_df.orderBy('artist_name').show(20)

    songplays_table = df.join(artists_songs_df,
                              [df.artist == artists_songs_df.artist_name,
                               df.song == artists_songs_df.song_title,
                               artists_songs_df.length == df.length], how='left') \
        .withColumn('start_time', get_datetime(df['ts']).cast('timestamp'))

    songplays_table = songplays_table.select(
        songplays_table['userId'].alias('user_id'),
        songplays_table['level'],
        artists_songs_df['song_id'],
        artists_songs_df['artist_id'],
        songplays_table['sessionId'].alias('session_id'),
        df['location'],
        month('start_time').alias('month'),
        year('start_time').alias('year'),
        songplays_table['userAgent'].alias('user_agent'),
    )

    songplays_table.show(20)

    # write songplays table to parquet files partitioned by year and month
    songplays_table.coalesce(1).write.format("json") \
        .partitionBy(["year", "month"]) \
        .mode("overwrite") \
        .save(f'{output_data}/songsplays/')
Example #46
        df = new_df
    else:
        df = df.unionAll(new_df)

# Clean
df_pickups = df.filter((df.Start_Lon >= -80) & (df.Start_Lon <= -70) & (df.Start_Lat >= 40) & (df.Start_Lat <= 50))
df_dropoffs = df.filter((df.End_Lon >= -80) & (df.End_Lon <= -70) & (df.End_Lat >= 40) & (df.End_Lat <= 50))

def is_float(value):
  try:
    float(value)
    return True
  except ValueError:
    return False

is_float_udf = udf(is_float, BooleanType())
df_pickups = df_pickups.filter((is_float_udf(df.Start_Lon)) & (is_float_udf(df.Start_Lat)))
df_dropoffs = df_dropoffs.filter((is_float_udf(df.End_Lon)) & (is_float_udf(df.End_Lat)))

# Discretize Location
df_pickups = df_pickups.withColumn("Lon", sqlfunctions.round(df.Start_Lon, 2))
df_pickups = df_pickups.withColumn("Lat", sqlfunctions.round(df.Start_Lat, 2))
df_dropoffs = df_dropoffs.withColumn("Lon", sqlfunctions.round(df.End_Lon, 2))
df_dropoffs = df_dropoffs.withColumn("Lat", sqlfunctions.round(df.End_Lat, 2))

# Select Columns
df_pickups = df_pickups.select(df_pickups.Pickup_Time, df_pickups.Lon, df_pickups.Lat)
df_dropoffs = df_dropoffs.select(df_dropoffs.Dropoff_Time, df_dropoffs.Lon, df_dropoffs.Lat)

# Discretize Time
def discretize_time(row):
Example #47
def tel_to_matrix_udf(telemetry_keys):
  return udf(lambda x: telemetry_to_matrix(x, telemetry_keys), ArrayType(ArrayType(FloatType())))
Example #48
#-------------------------------------------------------------

import sys

from pyspark import SparkConf, SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler, IndexToString
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SQLContext
import pyspark.sql.functions as sf
from pyspark.sql.functions import udf
from sklearn.model_selection import train_test_split

from slicing.spark_modules import spark_utils, join_data_parallel, union_data_parallel

binner = udf(lambda arg: int(int(arg) // 5))

if __name__ == "__main__":
    args = sys.argv
    if len(args) > 1:
        k = int(args[1])
        w = float(args[2].replace(',', '.'))
        alpha = int(args[3])
        if args[4] == "True":
            b_update = True
        else:
            b_update = False
        debug = args[5]
        loss_type = int(args[6])
        enumerator = args[7]
        dataset = args[8]
                    print("MY LIST Y = " , mylist[y])
                    print("Duration = " , duration)
                    print("average = " , avg)
                    break
                else:
                    y = y + 1
        else:
            x = x + 1
    return bottleneck, avg, blist[0],blist[1],clist[0], clist[1]

# COMMAND ----------

func = F.udf(lambda x: calculate(x), T.StructType(
        [T.StructField("val1", T.IntegerType(), True),
         T.StructField("val2", T.FloatType(), True),
         T.StructField("val3", T.IntegerType(), True),
         T.StructField("val4", T.IntegerType(), True),
         T.StructField("val5", T.IntegerType(), True),
         T.StructField("val6", T.IntegerType(), True)]))

# COMMAND ----------

dfd = dfc.withColumn('vals', func('lowSpeed'))

# COMMAND ----------

dfd.show()

# COMMAND ----------

dfe = dfd.withColumn("BottleneckCount",col('vals.val1')).withColumn("AverageDuration",col('vals.val2')).drop("lowSpeed","vals")
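# Added sketch (hedged): the struct returned by `func` can also be expanded in one
# step with 'vals.*' and then renamed/trimmed, e.g.
#   dfd.select('*', 'vals.*').withColumnRenamed('val1', 'BottleneckCount')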
Example #50
0
def add_udf_column(points):
    udf_multiply_latlong = udf(multiply_latlong, IntegerType())
    points = points.withColumn("multiplied_latlong", udf_multiply_latlong(col("lat"), col("long")))
    return points
def calc_onehot_udf(feature_keys, u_features):
  return udf(lambda x: one_hot_features(x, feature_keys, u_features), ArrayType(FloatType()))
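# Hedged usage sketch (DataFrame and column names are placeholders, not from the
# original source): the factory above returns a UDF, so it is applied in two steps --
# first bind the feature keys, then pass the column to encode:
#   df.withColumn("onehot", calc_onehot_udf(feature_keys, u_features)("features"))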
Example #52
0
 def python_plus_one(self):
     return udf(lambda v: v + 1, 'double')
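 # Added note (hedged): the return type here is given as the DDL string 'double',
 # which PySpark accepts as an equivalent of DoubleType() from pyspark.sql.types.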
Example #53
0
spark = SparkSession.builder\
        .master("local[*]")\
        .appName("trial app 1")\
        .config('spark.sql.warehouse.dir', 'file:///C:/tmp') \
        .getOrCreate()

#-------------------
#select, filter
row = Row(name="Alice", age=11)
row['name'], row['age']

data = [('Alice', 1), ('Bob', 2)]
df = spark.createDataFrame(data, ['name', 'age'])
df.show()

doubled = udf(lambda s: s * 2, IntegerType())
df_doubled = df.select(df.name, doubled(df.age).alias('age'))

df_doubled_filter = df_doubled.filter(df_doubled.age > 3)
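# Added sketch (hedged): for simple arithmetic like this, a built-in column expression
# gives the same result without the Python UDF round trip.
df_doubled_builtin = df.select(df.name, (df.age * 2).alias('age'))
df_doubled_builtin_filter = df_doubled_builtin.filter(df_doubled_builtin.age > 3)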

#--------------------------
#distinct, sort
data = [('Alice', 1), ('Bob', 2), ('Bob', 4)]
df = spark.createDataFrame(data, ['name', 'age'])
df.show()

df_distinct = df.distinct()
df_distinct.show()

df_sorted = df.sort(df.age)
df_sorted.show()
Example #54
0
 def udf_func(func):
     return udf(func, returnType=return_type)
Example #55
0
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


def do_something(x):
    return x + 'hello'


sample_udf = udf(lambda x: do_something(x), StringType())
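# Added note (hedged): the lambda wrapper is not required -- passing the function
# directly is equivalent:
sample_udf_direct = udf(do_something, StringType())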
Example #56
0
def derive_basic_features_set(drug_reviews_df, fda_df, spark):
    reviews_spark_df_schema = StructType([StructField("drug_name", StringType(), False), \
                                      StructField("condition", StringType(), True), \
                                      StructField("review", StringType(), True), \
                                      StructField("rating", IntegerType(), False), \
                                      StructField("review_date", StringType(), False), \
                                      StructField("useful_count", IntegerType(), False)])
    fda_recalls_spark_df_schema = StructType([StructField("recall_date", StringType(), False), \
                                      StructField("brand", StringType(), True), \
                                      StructField("product_description", StringType(), True), \
                                      StructField("recall_reason", StringType(), False), \
                                      StructField("company", StringType(), False)])
    reviews_spark_df = spark.createDataFrame(drug_reviews_df,
                                             reviews_spark_df_schema)
    fda_recalls_spark_df = spark.createDataFrame(fda_df,
                                                 fda_recalls_spark_df_schema)
    fda_product_descriptions = []
    for row in fda_recalls_spark_df.select("product_description",
                                           "recall_date").collect():
        fda_product_descriptions.append(
            [row.product_description, row.recall_date])

    wordCount = udf(lambda s: len(s.split(" ")), IntegerType())
    cleanWords = udf(lambda x: cleanText(x))
    avgWordLength = udf(lambda x: average_word_length(x, is_list=True),
                        FloatType())
    drugRecalledUDF = udf(lambda x: drugRecalled(x, fda_product_descriptions))
    drugRecallDateUDF = udf(
        lambda x: drugRecallDate(x, fda_product_descriptions))

    textblob_sentimentUDF = udf(lambda x: TextBlobAnalysis(x))
    vader_sentimentUDF = udf(lambda x: VaderAnalysis(x))
    stanford_sentimentUDF = udf(lambda x: StanfordAnalysis(x))
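    # Added note (hedged): apart from wordCount and avgWordLength, the UDFs above do
    # not declare a return type and so default to StringType; that is consistent with
    # the later f.split(...) calls, which treat the combined_* columns as '!'-joined strings.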

    reviews_spark_df = reviews_spark_df.withColumn("partial_name", f.lower(f.trim(f.split(reviews_spark_df.drug_name, "/")[0]))). \
                                        withColumn("is_recalled", drugRecalledUDF(reviews_spark_df.drug_name)). \
                                        withColumn("review_date", f.to_date(reviews_spark_df.review_date)).fillna('', subset=['review_date']). \
                                        withColumn("condition", f.trim(f.lower(reviews_spark_df.condition))). \
                                        withColumn("review_length", f.length(reviews_spark_df.review)). \
                                        withColumn("review_word_count", wordCount(reviews_spark_df.review)). \
                                        withColumn("cleaned_words", cleanWords(reviews_spark_df.review))

    reviews_added_cols_spark_df = reviews_spark_df.withColumn("review_cleaned_word_count", f.length(reviews_spark_df.cleaned_words)). \
                                                    withColumn("review_avg_word_length", avgWordLength(reviews_spark_df.review)). \
                                                    withColumn("review_avg_cleaned_word_length", avgWordLength(reviews_spark_df.cleaned_words)). \
                                                    withColumn("combined_sentiment", textblob_sentimentUDF(reviews_spark_df.cleaned_words)). \
                                                    withColumn("combined_vader", vader_sentimentUDF(reviews_spark_df.cleaned_words)). \
                                                    withColumn("combined_stanford", stanford_sentimentUDF(reviews_spark_df.cleaned_words))

    split_textblob = f.split(reviews_added_cols_spark_df['combined_sentiment'],
                             '!')
    split_vader = f.split(reviews_added_cols_spark_df['combined_vader'], '!')
    split_stanford = f.split(reviews_added_cols_spark_df['combined_stanford'],
                             '!')

    reviews_spark_df_split = reviews_added_cols_spark_df.withColumn('Sentimental_Polarity', split_textblob.getItem(0)). \
                                                    withColumn('Sentimental_Subjectivity', split_textblob.getItem(1)). \
                                                    withColumn('vader_neg', split_vader.getItem(0)). \
                                                    withColumn('vader_neu', split_vader.getItem(1)). \
                                                    withColumn('vader_pos', split_vader.getItem(2)). \
                                                    withColumn('vader_comp', split_vader.getItem(3)). \
                                                    withColumn('stanford_neg', split_stanford.getItem(0)). \
                                                    withColumn('stanford_neu', split_stanford.getItem(1)). \
                                                    withColumn('stanford_pos', split_stanford.getItem(2))
    return reviews_spark_df_split
# Import dependencies ZIP
execfile("./__pyfiles__/load.py")

###
### Sentence polarity using NLTK
###


def compute_nltk_polarity(msg_body):
    nltk.data.path.append("./nltk_data.zip/nltk_data")
    sid = SentimentIntensityAnalyzer()
    msg_body = sid.polarity_scores(msg_body)
    return msg_body


compute_nltk_polarity_udf = func.udf(compute_nltk_polarity,
                                     MapType(StringType(), FloatType(), False))
spark.udf.register('compute_nltk_polarity', compute_nltk_polarity_udf)

# Load and preprocess sample data
_, messages = load_data(sc, filter=[2015, 2016, 2017], sample=0.01)
messages = messages.withColumn('created_utc', func.from_unixtime(messages['created_utc'], 'yyyy-MM-dd HH:mm:ss.SS').cast(DateType())) \
                                .withColumnRenamed('created_utc', 'creation_date')

# Clean messages
cleaned_messages = messages.filter(
    "body != '[removed]' and body != '[deleted]'")

nlp_nltk_polartiy = cleaned_messages.selectExpr(
    'id', 'creation_date', 'body',
    "compute_nltk_polarity(body) as nltk_scores")
nlp_nltk_polartiy = nlp_nltk_polartiy.selectExpr(
Example #58
0
    token_doc_map_df = sqlContext.createDataFrame(token_doc_map, ['token', 'doc_id'])
    # Group By token and doc id : it gives in which all document a token appears
    token_doc_grp = token_doc_map_df.groupBy(token_doc_map_df.token, token_doc_map_df.doc_id).count()
    # Group By token. Gives Document Frequency. ToDo:Can do it in one pass
    doc_freq = token_doc_grp.groupBy(token_doc_grp.token).count().withColumnRenamed('count', 'df')

    print 'token_doc_map    : \n', token_doc_map.take(10)
    token_doc_map_df.show(10, truncate=False)
    token_doc_grp.show(10, truncate=False)
    doc_freq.show(10, truncate=False)

    return doc_freq

corpus_df = df(corpus)

tokendfUDF = udf(idf, FloatType())
idf_df = corpus_df.withColumn("idf", tokendfUDF("df"))
token_idf = idf_df.select(idf_df.token, idf_df.idf)


print 'Calculated df : \n', corpus_df.show(10, truncate=False)
print 'idf_df        : \n', idf_df.show(10, truncate=False)
print 'token_idf     : ', token_idf.show(10, truncate=False)

'''
idf_df :
+--------------+---+---------+
|token         |df |idf      |
+--------------+---+---------+
|adventure     |44 |4.1764364|
|check         |40 |4.269527 |
Example #59
0
def process_drones(rdd):
    '''
        Driver function to process drone rdd's and select sensor data close to the event
    '''

    if rdd.isEmpty():
        print("RDD is empty")
    else:
        s3_bucket = 's3a://dronesensordata/drone_detect_data'
        df = rdd.toDF()
        df = df.selectExpr("_1 as device_id",\
                            "_2 as latitude",\
                            "_3 as longitude",\
                            "_4 as TimeStamp",\
                            "_5 as barometric_reading", \
                            "_6 as gyrometer_x",\
                            "_7 as gyrometer_y",\
                            "_8 as wind_speed")

        GroupedDF = df.groupBy("device_id").agg(f.collect_list('barometric_reading').\
                                alias('barometric_reading'),\
                                f.min('latitude').\
                                alias('latitude'),\
                                f.min('longitude').\
                                alias('longitude'),\
                                f.collect_list('gyrometer_x').\
                                alias('gyrometer_x'),\
                                f.collect_list('gyrometer_y').\
                                alias('gyrometer_y'),\
                                f.collect_list('wind_speed').\
                                alias('wind_speed'),\
                                f.collect_list('TimeStamp').\
                                alias('TimeStamp'))

        anamoly_udf = udf(detect_barometric_anamoly, BooleanType())
        crashed_udf = udf(detect_crashed_drones, BooleanType())
        minimum_udf = udf(get_min, FloatType())
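        # Added comments (hedged): each of these UDFs receives the per-device Python
        # lists produced by collect_list above (one call per device_id group);
        # detect_barometric_anamoly, detect_crashed_drones and get_min are defined
        # elsewhere in the original module.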

        processed_DF = GroupedDF.withColumn("malfunctioning", anamoly_udf("barometric_reading", \
                                                                            "TimeStamp")) \
                                .withColumn("crashed", crashed_udf("barometric_reading"))\

        malfunctioning_DF = processed_DF.filter(processed_DF['malfunctioning'])
        crashed_DF = processed_DF.filter(processed_DF['crashed'])

        tuple_list = [(row.latitude, row.longitude, row.device_id)
                      for row in crashed_DF.select('latitude', 'longitude',
                                                   'device_id').collect()]

        for latitude, longitude, device_id in tuple_list:
            data = dumps({
                "device_id": device_id,
                "latitude": latitude,
                "longitude": longitude
            }).encode('utf-8')
            Producer.send('crashed-devices', value=data)
            Producer.flush()

        malfunctioning_DF.write\
                         .mode('append')\
                         .parquet('{}/malfunctioning_devices_sensor_data.parquet'.format(s3_bucket))

        connector = PostgresConnector(args.psnode, args.dbname, args.pusername,
                                      args.password)
        connector.write(processed_DF, 'devices', 'overwrite')
Example #60
0
def get_weather_station_weather_df(spark, stations_id):
    ''' Download the weather station data during all hours of
        the years 2012-2018 for the given station ids and return a dataframe
    '''
    cache_file = workdir + 'data/weather_stations.parquet'
    if isdir(cache_file):
        print('Skip downloading weather station: already done')
        return spark.read.parquet(cache_file)

    get_station_weather_month_udf = \
        udf(get_station_weather_month, ArrayType(StructType([
            StructField('day', IntegerType()),
            StructField('hour', IntegerType()),
            StructField('dew_point_temp', FloatType()),
            StructField('rel_hum', FloatType()),
            StructField('wind_dir', FloatType()),
            StructField('wind_spd', FloatType()),
            StructField('visibility', FloatType()),
            StructField('stn_press', FloatType()),
            StructField('hmdx', FloatType()),
            StructField('wind_chill', FloatType()),
            StructField('temp', FloatType()),
            StructField('risky_weather', FloatType())
        ])))

    month_per_year_df = spark.createDataFrame(zip(range(1, 13),), ['month'])
    years_df = spark.createDataFrame(zip(range(2012, 2019),), ['year'])
    months_df = years_df.crossJoin(month_per_year_df)
    stations_months_df = stations_id.crossJoin(months_df)

    c = col('col')

    def create_date(year, month, day):
        return datetime.datetime.strptime(f'{year}-{month}-{day}', "%Y-%m-%d")
    create_date_udf = udf(create_date, DateType())
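    # Added note (hedged): a built-in alternative to this UDF would be
    #   to_date(concat_ws('-', 'year', 'month', c['day']), 'yyyy-M-d')
    # using to_date/concat_ws from pyspark.sql.functions.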

    df = (stations_months_df
          .repartition(200, 'year', 'month')
          .withColumn('weather', get_station_weather_month_udf('station_id',
                                                               'year',
                                                               'month'))
          .select('station_id', 'year', 'month', explode('weather'))
          .select('station_id',
                  create_date_udf('year', 'month', c['day']).alias('date'),
                  c['hour'].alias('hour'),
                  c['dew_point_temp'].alias('dew_point_temp'),
                  c['rel_hum'].alias('rel_hum'),
                  c['wind_dir'].alias('wind_dir'),
                  c['wind_spd'].alias('wind_spd'),
                  c['visibility'].alias('visibility'),
                  c['stn_press'].alias('stn_press'),
                  c['hmdx'].alias('hmdx'),
                  c['wind_chill'].alias('wind_chill'),
                  c['temp'].alias('temp'),
                  c['risky_weather'].alias('risky_weather')))

    # We make a moving average of risky_weather since the effect of a risky
    # weather is distributed in the next hours
    def weighted_average(c, window, offsets, weights):
        def value(i):
            return lag(c, -i).over(window)

        values = [coalesce(value(i) * w, lit(0))
                  for i, w in zip(offsets, weights)]

        return sum(values, lit(0))

    window = (Window
              .partitionBy('station_id')
              .orderBy('date'))
    offsets = range(-23, 1)
    weights = [exp(0.5*t) for t in offsets]
    weights = [w/sum(weights) for w in weights]
    df = df.withColumn('risky_weather',
                       weighted_average('risky_weather',
                                        window,
                                        offsets,
                                        weights))
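    # Added comment (hedged): with offsets -23..0 and weights exp(0.5*t), lag(c, -i)
    # looks backwards for negative i, so this is an exponentially decaying average of
    # the current hour and the 23 preceding hours, weighting recent hours most.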

    df.write.parquet(cache_file)

    return df