Example #1
    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        tRatio = self.getOrDefault(self.trainRatio)
        seed = self.getOrDefault(self.seed)
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        condition = (df[randCol] >= tRatio)
        validation = df.filter(condition).cache()
        train = df.filter(~condition).cache()

        subModels = None
        collectSubModelsParam = self.getCollectSubModels()
        if collectSubModelsParam:
            subModels = [None for i in range(numModels)]

        tasks = _parallelFitTasks(est, train, eva, validation, epm, collectSubModelsParam)
        pool = ThreadPool(processes=min(self.getParallelism(), numModels))
        metrics = [None] * numModels
        for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks):
            metrics[j] = metric
            if collectSubModelsParam:
                subModels[j] = subModel

        train.unpersist()
        validation.unpersist()

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)
        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(TrainValidationSplitModel(bestModel, metrics, subModels))
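For context, a minimal sketch of reaching this `_fit` through the public `TrainValidationSplit` API; `train_df` is an assumed DataFrame with `features` and `label` columns:

from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

lr = LinearRegression(featuresCol="features", labelCol="label")
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
tvs = TrainValidationSplit(estimator=lr, estimatorParamMaps=grid,
                           evaluator=RegressionEvaluator(),
                           trainRatio=0.8, parallelism=2, seed=42)
tvs_model = tvs.fit(train_df)       # train_df is an assumed DataFrame
print(tvs_model.validationMetrics)  # one metric per param map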
Example #2
 def fit(self, dataset, params={}):
     paramMap = self.extractParamMap(params)
     est = paramMap[self.estimator]
     epm = paramMap[self.estimatorParamMaps]
     numModels = len(epm)
     eva = paramMap[self.evaluator]
     nFolds = paramMap[self.numFolds]
     h = 1.0 / nFolds
     randCol = self.uid + "_rand"
     df = dataset.select("*", rand(0).alias(randCol))
     metrics = np.zeros(numModels)
     for i in range(nFolds):
         validateLB = i * h
         validateUB = (i + 1) * h
         condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
         validation = df.filter(condition)
         train = df.filter(~condition)
         for j in range(numModels):
             model = est.fit(train, epm[j])
             # TODO: duplicate evaluator to take extra params from input
             metric = eva.evaluate(model.transform(validation, epm[j]))
             metrics[j] += metric
     bestIndex = np.argmax(metrics)
     bestModel = est.fit(dataset, epm[bestIndex])
     return CrossValidatorModel(bestModel)
Example #3
    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        tRatio = self.getOrDefault(self.trainRatio)
        seed = self.getOrDefault(self.seed)
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        condition = (df[randCol] >= tRatio)
        validation = df.filter(condition).cache()
        train = df.filter(~condition).cache()

        def singleTrain(paramMap):
            model = est.fit(train, paramMap)
            metric = eva.evaluate(model.transform(validation, paramMap))
            return metric

        pool = ThreadPool(processes=min(self.getParallelism(), numModels))
        metrics = pool.map(singleTrain, epm)
        train.unpersist()
        validation.unpersist()

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)
        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(TrainValidationSplitModel(bestModel, metrics))
Example #4
    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        nFolds = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)
        h = 1.0 / nFolds
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        metrics = [0.0] * numModels
        for i in range(nFolds):
            validateLB = i * h
            validateUB = (i + 1) * h
            condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
            validation = df.filter(condition)
            train = df.filter(~condition)
            for j in range(numModels):
                model = est.fit(train, epm[j])
                # TODO: duplicate evaluator to take extra params from input
                metric = eva.evaluate(model.transform(validation, epm[j]))
                metrics[j] += metric/nFolds

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)
        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(CrossValidatorModel(bestModel, metrics))
Example #5
 def test_rand_functions(self):
     df = self.df
     from pyspark.sql import functions
     rnd = df.select('key', functions.rand()).collect()
     for row in rnd:
         assert row[1] >= 0.0 and row[1] <= 1.0, "got: %s" % row[1]
     rndn = df.select('key', functions.randn(5)).collect()
     for row in rndn:
         assert row[1] >= -4.0 and row[1] <= 4.0, "got: %s" % row[1]
Example #6
    def test_rand_functions(self):
        df = self.df
        from pyspark.sql import functions
        rnd = df.select('key', functions.rand()).collect()
        for row in rnd:
            assert row[1] >= 0.0 and row[1] <= 1.0, "got: %s" % row[1]
        rndn = df.select('key', functions.randn(5)).collect()
        for row in rndn:
            assert row[1] >= -4.0 and row[1] <= 4.0, "got: %s" % row[1]

        # If the specified seed is 0, we should use it.
        # https://issues.apache.org/jira/browse/SPARK-9691
        rnd1 = df.select('key', functions.rand(0)).collect()
        rnd2 = df.select('key', functions.rand(0)).collect()
        self.assertEqual(sorted(rnd1), sorted(rnd2))

        rndn1 = df.select('key', functions.randn(0)).collect()
        rndn2 = df.select('key', functions.randn(0)).collect()
        self.assertEqual(sorted(rndn1), sorted(rndn2))
Example #7
    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        nFolds = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)
        h = 1.0 / nFolds
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        metrics = [0.0] * numModels

        pool = ThreadPool(processes=min(self.getParallelism(), numModels))
        subModels = None
        collectSubModelsParam = self.getCollectSubModels()
        if collectSubModelsParam:
            subModels = [[None for j in range(numModels)] for i in range(nFolds)]

        for i in range(nFolds):
            validateLB = i * h
            validateUB = (i + 1) * h
            condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
            validation = df.filter(condition).cache()
            train = df.filter(~condition).cache()

            tasks = _parallelFitTasks(est, train, eva, validation, epm, collectSubModelsParam)
            for j, metric, subModel in pool.imap_unordered(lambda f: f(), tasks):
                metrics[j] += (metric / nFolds)
                if collectSubModelsParam:
                    subModels[i][j] = subModel

            validation.unpersist()
            train.unpersist()

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)
        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(CrossValidatorModel(bestModel, metrics, subModels))
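A minimal sketch of the corresponding public `CrossValidator` call, assuming a `train_df` DataFrame with `features`/`label` columns:

from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

lr = LogisticRegression(featuresCol="features", labelCol="label")
grid = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).build()
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                    evaluator=BinaryClassificationEvaluator(),
                    numFolds=3, parallelism=2, collectSubModels=True, seed=42)
cv_model = cv.fit(train_df)  # train_df is an assumed DataFrame
print(cv_model.avgMetrics)   # metric per param map, averaged over the folds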
Example #8
def split_data(frame, num_folds, tc=TkContext.implicit):
    """
    Randomly split data based on num_folds specified. Implementation logic borrowed from pyspark.
    :param frame: The frame to be split into train and validation frames
    :param num_folds: Number of folds to be split into
    :param tc: spark-tk context passed implicitly
    :return: train frame and test frame for each fold
    """
    from pyspark.sql.functions import rand
    df = frame.dataframe
    h = 1.0/num_folds
    rand_col = "rand_1"
    df_indexed = df.select("*", rand(0).alias(rand_col))
    for i in range(num_folds):
        test_lower_bound = i*h
        test_upper_bound = (i+1)*h
        condition = (df_indexed[rand_col] >= test_lower_bound) & (df_indexed[rand_col] < test_upper_bound)
        test_df = df_indexed.filter(condition)
        train_df = df_indexed.filter(~condition)
        train_frame = tc.frame.create(train_df)
        test_frame = tc.frame.create(test_df)
        yield train_frame, test_frame
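A hypothetical usage sketch of this generator, assuming a spark-tk TkContext named `tc` and an existing frame `my_frame`:

# Iterate over the folds produced by split_data and inspect their sizes
for fold, (train_frame, test_frame) in enumerate(split_data(my_frame, num_folds=5, tc=tc)):
    print(fold, train_frame.dataframe.count(), test_frame.dataframe.count())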
Example #9
    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        nFolds = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)
        h = 1.0 / nFolds
        randCol = self.uid + "_rand"
        df = dataset.select("*", rand(seed).alias(randCol))
        metrics = [0.0] * numModels

        pool = ThreadPool(processes=min(self.getParallelism(), numModels))

        for i in range(nFolds):
            validateLB = i * h
            validateUB = (i + 1) * h
            condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB)
            validation = df.filter(condition).cache()
            train = df.filter(~condition).cache()

            def singleTrain(paramMap):
                model = est.fit(train, paramMap)
                # TODO: duplicate evaluator to take extra params from input
                metric = eva.evaluate(model.transform(validation, paramMap))
                return metric

            currentFoldMetrics = pool.map(singleTrain, epm)
            for j in range(numModels):
                metrics[j] += (currentFoldMetrics[j] / nFolds)
            validation.unpersist()
            train.unpersist()

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)
        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(CrossValidatorModel(bestModel, metrics))
Example #10
 def _fit(self, dataset):
     est = self.getOrDefault(self.estimator)
     epm = self.getOrDefault(self.estimatorParamMaps)
     numModels = len(epm)
     eva = self.getOrDefault(self.evaluator)
     tRatio = self.getOrDefault(self.trainRatio)
     seed = self.getOrDefault(self.seed)
     randCol = self.uid + "_rand"
     df = dataset.select("*", rand(seed).alias(randCol))
     metrics = [0.0] * numModels
     condition = (df[randCol] >= tRatio)
     validation = df.filter(condition)
     train = df.filter(~condition)
     for j in range(numModels):
         model = est.fit(train, epm[j])
         metric = eva.evaluate(model.transform(validation, epm[j]))
         metrics[j] += metric
     if eva.isLargerBetter():
         bestIndex = np.argmax(metrics)
     else:
         bestIndex = np.argmin(metrics)
     bestModel = est.fit(dataset, epm[bestIndex])
     return self._copyValues(TrainValidationSplitModel(bestModel, metrics))
Example #11
    y = tf.placeholder(tf.float32, shape=[None, 10], name='y')
    layer1 = tf.layers.dense(x, 256, activation=tf.nn.relu)
    layer2 = tf.layers.dense(layer1, 256, activation=tf.nn.relu)
    out = tf.layers.dense(layer2, 10)
    z = tf.argmax(out, 1, name='out')
    loss = tf.losses.softmax_cross_entropy(y, out)
    return loss


if __name__ == '__main__':
    spark = SparkSession.builder \
        .appName("examples") \
        .master('local[8]').config('spark.driver.memory', '4g') \
        .getOrCreate()

    df = spark.read.option("inferSchema", "true").csv('mnist_train.csv').orderBy(rand())
    mg = build_graph(small_model)
    adam_config = build_adam_config(learning_rate=0.001, beta1=0.9, beta2=0.999)

    va = VectorAssembler(inputCols=df.columns[1:785], outputCol='features').transform(df)
    encoded = OneHotEncoder(inputCol='_c0', outputCol='labels', dropLast=False).transform(va).select(['features', 'labels'])

    #demonstration of options. Not all are required
    spark_model = SparkAsyncDL(
        inputCol='features',
        tensorflowGraph=mg,
        tfInput='x:0',
        tfLabel='y:0',
        tfOutput='out:0',
        tfOptimizer='adam',
        miniBatchSize=300,
Example #12
    def _user_id(i):
        usrId, docId, value = i
        return usrId

    def _movie_id(i):
        usrId, docId, value = i
        return docId

    num_users = ui_mat_rdd.map(_user_id).distinct().count()
    num_movies = ui_mat_rdd.map(_movie_id).distinct().count()
    print('users:', num_users, 'products:', num_movies)

    # Create Spark dataframe
    df = spark.createDataFrame(ui_mat_rdd, ['userId', 'movieId', 'value'])

    ui_mat_rdd.unpersist()

    print('Splitting data set...')
    df = df.orderBy(F.rand())

    train_df, test_df = df.randomSplit([0.9, 0.1], seed=45)
    train_df, val_df = train_df.randomSplit([0.95, 0.05], seed=45)

    train_df = train_df.withColumn('flag', F.lit(0))
    val_df = val_df.withColumn('flag', F.lit(1))
    val_df = val_df.union(train_df)
    test_df = test_df.withColumn('flag', F.lit(2))
    test_df = test_df.union(train_df)
    test_df = test_df.union(val_df)

    train_size = train_df.count()
    val_size = val_df.count()
    test_size = test_df.count()
Example #13
def ROEM_cv(ratings_df, userCol = "userId", itemCol = "songId", ratingCol = "num_plays", ranks = [10, 50, 100, 150, 200], maxIters = [10, 25, 50, 100, 200, 400], regParams = [.05, .1, .15], alphas = [10, 40, 80, 100]):

  #Originally run on a subset of the Echo Nest Taste Profile dataset found here:
  #https://labrosa.ee.columbia.edu/millionsong/tasteprofile

  from pyspark.sql.functions import rand
  from pyspark.ml.recommendation import ALS

  ratings_df = ratings_df.orderBy(rand()) #Shuffling to ensure randomness

  #Building train and validation test sets
  train, validate = ratings_df.randomSplit([0.8, 0.2], seed = 0)

  #Building 5 folds within the training set.
  test1, test2, test3, test4, test5 = train.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2], seed = 1)
  train1 = test2.union(test3).union(test4).union(test5)
  train2 = test3.union(test4).union(test5).union(test1)
  train3 = test4.union(test5).union(test1).union(test2)
  train4 = test5.union(test1).union(test2).union(test3)
  train5 = test1.union(test2).union(test3).union(test4)
  

  #Creating variables that will be replaced by the best model's hyperparameters for subsequent printing
  best_validation_performance = 9999999999999
  best_rank = 0
  best_maxIter = 0
  best_regParam = 0
  best_alpha = 0
  best_model = 0
  best_predictions = 0

  #Looping through each combination of hyperparameters to ensure all combinations are tested.
  for r in ranks:
    for mi in maxIters:
      for rp in regParams:
        for a in alphas:
          #Create ALS model
          als = ALS(rank = r, maxIter = mi, regParam = rp, alpha = a, userCol=userCol, itemCol=itemCol, ratingCol=ratingCol,
                    coldStartStrategy="drop", nonnegative = True, implicitPrefs = True)

          #Fit model to each fold in the training set
          model1 = als.fit(train1)
          model2 = als.fit(train2)
          model3 = als.fit(train3)
          model4 = als.fit(train4)
          model5 = als.fit(train5)

          #Generating model's predictions for each fold in the test set
          predictions1 = model1.transform(test1)
          predictions2 = model2.transform(test2)
          predictions3 = model3.transform(test3)
          predictions4 = model4.transform(test4)
          predictions5 = model5.transform(test5)

          #Expected percentile rank error metric function
          def ROEM(predictions, userCol = "userId", itemCol = "songId", ratingCol = "num_plays"):
              #Creates table that can be queried
              predictions.createOrReplaceTempView("predictions")

              #Sum of total number of plays of all songs
              denominator = predictions.groupBy().sum(ratingCol).collect()[0][0]

              #Calculating rankings of songs predictions by user
              spark.sql("SELECT " + userCol + " , " + ratingCol + " , PERCENT_RANK() OVER (PARTITION BY " + userCol + " ORDER BY prediction DESC) AS rank FROM predictions").createOrReplaceTempView("rankings")

              #Multiplies the rank of each song by the number of plays and adds the products together
              numerator = spark.sql('SELECT SUM(' + ratingCol + ' * rank) FROM rankings').collect()[0][0]

              performance = numerator/denominator

              return performance

          #Calculating expected percentile rank error metric for the model on each fold's prediction set
          performance1 = ROEM(predictions1)
          performance2 = ROEM(predictions2)
          performance3 = ROEM(predictions3)
          performance4 = ROEM(predictions4)
          performance5 = ROEM(predictions5)

          #Printing the model's performance on each fold
          print("Model Parameters:  Rank:", r, "  MaxIter:", mi, "  RegParam:", rp, "  Alpha:", a)
          print("Test Percent Rank Errors: ", performance1, performance2, performance3, performance4, performance5)

          #Validating the model's performance on the validation set
          validation_model = als.fit(train)
          validation_predictions = validation_model.transform(validate)
          validation_performance = ROEM(validation_predictions)

          #Printing model's final expected percentile ranking error metric
          print("Validation Percent Rank Error: "), validation_performance
          print(" ")

          #Filling in final hyperparameters with those of the best-performing model
          if validation_performance < best_validation_performance:
            best_validation_performance = validation_performance
            best_rank = r
            best_maxIter = mi
            best_regParam = rp
            best_alpha = a
            best_model = validation_model
            best_predictions = validation_predictions

  #Printing best model's expected percentile rank and hyperparameters
  print ("**Best Model** ")
  print ("  Percent Rank Error: "), best_validation_performance
  print ("  Rank: "), best_rank
  print ("  MaxIter: "), best_maxIter
  print ("  RegParam: "), best_regParam
  print ("  Alpha: "), best_alpha
  return best_model, best_predictions
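A hedged usage sketch; `implicit_plays_df` is an assumed implicit-feedback DataFrame with `userId`, `songId`, and `num_plays` columns, and a `spark` session must be in scope for ROEM's SQL queries:

# Small hyperparameter grid to keep the run cheap
best_model, best_predictions = ROEM_cv(implicit_plays_df,
                                       ranks=[10, 50], maxIters=[10],
                                       regParams=[0.1], alphas=[40])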
Example #14
def spark_stratified_split(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    seed=42,
):
    """Spark stratified splitter
    For each user / item, the split function takes proportions of ratings which is
    specified by the split ratio(s). The split is stratified.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two halves and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios are not summed to 1, they will be normalized.
            Earlier indexed splits will have earlier times
            (e.g the latest time per user or item in split[0] <= the earliest time per user or item in split[1])
        seed (int): Seed.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the two is to filter
            with min_rating.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    if not (filter_by == "user" or filter_by == "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError("min_rating should be integer and larger than or equal to 1.")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    if min_rating > 1:
        data = min_rating_filter_spark(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.partitionBy(split_by_column).orderBy(rand(seed=seed))

    rating_grouped = (
        data.groupBy(split_by_column)
        .agg({col_rating: "count"})
        .withColumnRenamed("count(" + col_rating + ")", "count")
    )
    rating_all = data.join(broadcast(rating_grouped), on=split_by_column)

    rating_rank = rating_all.withColumn(
        "rank", row_number().over(window_spec) / col("count")
    )

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i])
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            )

        splits.append(rating_split)

    return splits
Example #15
# COMMAND ----------

from pyspark.sql.types import StringType

manualSplitPythonUDF = spark.udf.register("manualSplitSQLUDF", manual_split, StringType())

# COMMAND ----------

# MAGIC %md
# MAGIC Create a dataframe of 100k values with a string to index. Do this by using a hash function.

# COMMAND ----------

from pyspark.sql.functions import sha1, rand
randomDF = (spark.range(1, 10000 * 10 * 10 * 10)
  .withColumn("random_value", rand(seed=10).cast("string"))
  .withColumn("hash", sha1("random_value"))
  .drop("random_value")
)

display(randomDF)

# COMMAND ----------

# MAGIC %md
# MAGIC Apply the UDF by using it just like any other Spark function.

# COMMAND ----------

randomAugmentedDF = randomDF.select("*", manualSplitPythonUDF("hash").alias("augmented_col"))
Example #16
#cuminv=np.loadtxt('scripts/cuminv_bdt.txt')
# we know the binnings that were used
dz = 0.01
du = 1 / 1000.


#find indices and return the table values
@pandas_udf('float', PandasUDFType.SCALAR)
def z_PZ(zr, u):
    iz = np.array(zr / dz, dtype='int')
    iu = np.array(u / du, dtype='int')
    return pd.Series(cuminv[iz, iu])


#add column of uniform random numbers
gal = gal.withColumn("u", F.rand())

#transform with the inverse-cumulative table
gal=gal.withColumn("zrec",z_PZ("z","u")+dz/2)\
       .drop("u")  #do not need u anymore

gal.show(5)
ddt.append(timer.step())
timer.print(ana)

####
ana = "3: cache (count)"
gal = gal.cache()
print("N={}".format(gal.count()))
ddt.append(timer.step())
timer.print(ana)
Example #17
# 0      1   16382  0.049476
# 1      0  314728  0.950524

# test
# 82183
   # Class  count     ratio
# 0      1   4184  0.050911
# 1      0  77999  0.949089
 
 
#way 2-2 Exact stratification using Window     ===multi-class variant in comments

temp = (
    JMM_binary_Vfeature
    .withColumn("id", F.monotonically_increasing_id())
    .withColumn("Random", F.rand(seed=1000))
    .withColumn(
        "Row",
        F.row_number()
        .over(
            Window
            .partitionBy("Class")
            .orderBy("Random")
        )
    )
)
#top 20899 rows are class  1
 
num_P = 20566
num_N = 392727
training_stratification = temp.where(
Example #18
schema = StructType() \
    .add("order_id", StringType()) \
    .add("customer_id", StringType()) \
    .add("order_status", StringType()) \
    .add("order_purchase_timestamp", StringType()) \
    .add("order_approved_at", StringType()) \
    .add("order_delivered_carrier_date", StringType()) \
    .add("order_delivered_customer_date", StringType()) \
    .add("order_estimated_delivery_date", StringType())

parsed_orders = raw_orders \
    .select(F.from_json(F.col("value").cast("String"), schema).alias("value"), "offset") \
    .select("value.*", "offset")

extended_orders = parsed_orders \
    .withColumn("my_extra_column", F.round( F.rand() * 100 ) ) \
    .withColumn("my_current_time", F.current_timestamp())


#FOREACH BATCH SINK
def foreach_batch_sink(df, freq):
    return  df \
        .writeStream \
        .foreachBatch(foreach_batch_function) \
        .trigger(processingTime='%s seconds' % freq ) \
        .start()


def foreach_batch_function(df, epoch_id):
    print("starting epoch " + str(epoch_id))
    df.persist()
Example #19
result = result.join(data_pid_profile_emb,
                     result.row_num == data_pid_profile_emb.row_num,
                     'inner').drop(result['row_num'])
result = result.join(data_click_emb, result.row_num == data_click_emb.row_num,
                     'inner').drop(result['row_num'])
result = result.join(data_click_cross_emb,
                     result.row_num == data_click_cross_emb.row_num,
                     'inner').drop(result['row_num'])
result = result.join(data_order_cross_emb,
                     result.row_num == data_order_cross_emb.row_num,
                     'inner').drop(result['row_num'])

data_stage4_1 = result.withColumn(
    'features',
    concat_ws('|', result.cid, result.pid, result.click, result.click_cross,
              result.order_cross,
              *[getattr(result, str(col)) for col in item_col]))
data_stage4_2 = data_stage4_1.withColumn(
    'merge', concat_ws('@', data_stage4_1.label, data_stage4_1.features))

trainDF, testDF = data_stage4_2.orderBy(rand()).randomSplit([rate, 1.0 - rate],
                                                            777)
trainDF.select('merge').rdd.map(lambda x: '|'.join([
    item[1:-1] if idx in [2, 3, 4] else item
    for idx, item in enumerate(x[0].split('|'))
])).saveAsTextFile(train_out_path)
testDF.select('merge').rdd.map(lambda x: '|'.join([
    item[1:-1] if idx in [2, 3, 4] else item
    for idx, item in enumerate(x[0].split('|'))
])).saveAsTextFile(eval_out_path)
Example #20
        axis=0)
    labPt = LabeledPoint(line.tip_amount, features)
    return labPt


## SPLIT DATA ##

sample_size = 0.25
#test with sample of data
train_ = 0.75
test_ = (1 - train_)
seed = 5767
encoded_sample = encoded.sample(False, sample_size, seed=seed)

temp_rand = encoded_sample.select("*",
                                  rand(0).alias("rand"))
train_data, test_data = temp_rand.randomSplit([train_, test_], seed=seed)

indexed_train_bin = train_data.map(parseRowIndexingBinary)
indexed_test_bin = test_data.map(parseRowIndexingBinary)
oneHot_train_bin = train_data.map(parseRowOneHotBinary)
oneHot_test_bin = test_data.map(parseRowOneHotBinary)
indexed_train_reg = train_data.map(parseRowIndexingRegression)
indexed_test_reg = test_data.map(parseRowIndexingRegression)
oneHot_train_reg = train_data.map(parseRowOneHotRegression)
oneHot_test_reg = test_data.map(parseRowOneHotRegression)

## FEATURE SCALING ##

label = oneHot_train_reg.map(lambda x: x.label)
features = oneHot_train_reg.map(lambda x: x.features)
Example #21
def fallback_prediction(x):
    """
    Make a random Guess if model made no predicitons
    """
    return when(isnan(x), rand()).otherwise(col(x))
Example #22
import pyspark.sql.functions as f

rate = 10000000
item_nums = 10000000

ratePartitions = 40
dfInput = (spark
       .readStream
       .format("rate")
       .option("rowsPerSecond", rate)
       .option("numPartitions",ratePartitions)
       .load())

dfSales = (dfInput
 .withColumn("item_id",f.col("value") % item_nums)
 .withColumn("sales", (f.lit(1) + 10 * f.rand(seed = 42)).cast("int"))
 .select("timestamp","item_id","sales")
)


# Define table name and checkpoint location of the streaming table. (checkpoint_location for database has been defined in setup notebook)
table_name = "sw_db.bronze_compact"
checkpointTable = checkpoint_location + table_name 

# Write to Delta
(dfSales
 .writeStream
 .option("checkpointLocation", checkpointTable)
 .format("delta")
 .outputMode("append")
 .table(table_name)
)
Example #23
def spark_stratified_split(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    seed=42,
):
    """Spark stratified splitter
    For each user / item, the split function takes proportions of ratings which is
    specified by the split ratio(s). The split is stratified.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two halves and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios are not summed to 1, they will be normalized.
            Earlier indexed splits will have earlier times
            (e.g the latest time per user or item in split[0] <= the earliest time per user or item in split[1])
        seed (int): Seed.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the two is to filter
            with min_rating.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    if not (filter_by == "user" or filter_by == "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError(
            "min_rating should be integer and larger than or equal to 1.")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    if min_rating > 1:
        data = min_rating_filter_spark(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.partitionBy(split_by_column).orderBy(rand(seed=seed))

    rating_grouped = (data.groupBy(split_by_column).agg({
        col_rating: "count"
    }).withColumnRenamed("count(" + col_rating + ")", "count"))
    rating_all = data.join(broadcast(rating_grouped), on=split_by_column)

    rating_rank = rating_all.withColumn(
        "rank",
        row_number().over(window_spec) / col("count"))

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i])
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i])
                & (col("rank") > ratio_index[i - 1]))

        splits.append(rating_split)

    return splits
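A small usage sketch under assumed defaults (a `ratings` DataFrame that already uses the default user/item/rating column names expected by this splitter):

# 75/25 stratified split per user
train, test = spark_stratified_split(ratings, ratio=0.75, seed=42)
print(train.count(), test.count())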
Example #24
schema = StructType(
    [get_structfield(colname) for colname in header.split(',')])

#inputs = 'gs://{}/flights/tzcorr/all_flights-00000-*'.format(BUCKET) # 1/30th
inputs = 'gs://{}/flights/tzcorr/all_flights-*'.format(BUCKET)  # FULL
flights = spark.read\
            .schema(schema)\
            .csv(inputs)
flights.createOrReplaceTempView('flights')

# separate training and validation data
from pyspark.sql.functions import rand
SEED = 13
traindays = traindays.withColumn(
    "holdout",
    rand(SEED) > 0.8)  # 80% of data is for training
traindays.createOrReplaceTempView('traindays')

# logistic regression
trainquery = """
SELECT
  DEP_DELAY, TAXI_OUT, ARR_DELAY, DISTANCE, DEP_TIME, DEP_AIRPORT_TZOFFSET
FROM flights f
JOIN traindays t
ON f.FL_DATE == t.FL_DATE
WHERE
  t.is_train_day == 'True' AND
  t.holdout == False AND
  f.CANCELLED == '0.00' AND 
  f.DIVERTED == '0.00'
"""
Example #25
import pyspark.sql.functions as F
import time

# COMMAND ----------

days_back = 14
values_per_second = 337
nowTimestamp = time.time()

# COMMAND ----------

dfTimeSeries = sqlContext.range(0, days_back * 24 * 60 * 60 * values_per_second) \
  .withColumn("Timestamp", (nowTimestamp - (F.col("id") / values_per_second)).cast("Timestamp")) \
  .drop("id") \
  .withColumn("Sensor", F.concat_ws('-',
                               1 + (F.rand() * 10).cast("Int"),
                               1 + (F.rand() * 100).cast("Int"),
                               1 + (F.rand() * 350).cast("Int"))) \
  .withColumn("Value", F.round(F.rand() * 100, 3)) \
  .withColumn("year", F.year("Timestamp")) \
  .withColumn("month", F.month("Timestamp")) \
  .withColumn("day", F.dayofmonth("Timestamp"))

display(dfTimeSeries)

# COMMAND ----------

spark.conf.set("fs.azure.account.key.<StorageAccountName>.blob.core.windows.net", \
  "<StorageAccountKey>")

dfTimeSeries.write \
Example #26
import geopandas as gpd

# Read csv
uber_df = spark.read.csv("uber14.csv", inferSchema=True, header=True)
nyc = gpd.read_file('NYC_map/nyc.shp')

# Change lat/long to float
uber_df = uber_df.withColumn("Lat", uber_df["Lat"].cast(FloatType()))
uber_df = uber_df.withColumn("Lon", uber_df["Lon"].cast(FloatType()))

# Add columns: Burrow , Month
uber_df = uber_df.withColumn('Burrow', lit(None))
uber_df = uber_df.withColumn('Month', lit(None))

# Take sample
sample_uber_df = uber_df.select("*").orderBy(rand()).limit(100000)


def burrow_column(X, Y):
    point = Point(Y, X)
    if nyc['geometry'][0].contains(point):
        return ('Bronx')
    if nyc['geometry'][1].contains(point):
        return ('Staten Island')
    if nyc['geometry'][2].contains(point):
        return ('Brooklyn')
    if nyc['geometry'][3].contains(point):
        return ('Queens')
    if nyc['geometry'][4].contains(point):
        return ('Manhattan')
Example #27
def sample_from_healthy_population(df, frac, withreplacement=True):
    return df.sample(withreplacement, frac) \
        .select(
                col("id")
                ,(rand()*365).alias("timestamp").cast("int")
               )
Example #28
 def _transform(self, dataset):
     return dataset.withColumn(
         "prediction", dataset.feature + (rand(0) * self.getInducedError()))
Example #29
non_escalation_case_status_history = (case_status_history.join(
    escalation_starts, on=['reference_id'], how='left_anti'))
non_escalation_case_status_history.count()  # 783586

non_escalation_decision_times = (
    non_escalation_case_status_history.groupby('reference_id').agg(
        F.max('seconds_since_case_start').alias('case_end')).crossJoin(
            escalation_points_distribution).withColumn(
                'time_cut',
                F.col('case_end') *
                F.col("average_percentile_escalation_point")).withColumn(
                    'random_row_rank_for_sampling',
                    F.row_number().over(
                        Window.partitionBy('reference_id').orderBy(
                            F.rand()))).filter(
                                F.col('random_row_rank_for_sampling') == 1))
non_escalation_case_status_history.count()  # 783586
non_escalation_decision_times.count()  # 52989

non_escalation_training_targets = (non_escalation_decision_times.join(
    non_escalation_case_status_history,
    on=['reference_id'], how='inner').filter(
        F.col('seconds_since_case_start') < F.col('time_cut')
    ).groupBy('reference_id').agg(
        F.max('seconds_since_case_start').alias('decision_time')).withColumn(
            'target', F.lit(0)))

non_escalation_training_targets.show()
non_escalation_training_targets.count()  # 51443 (we lose 52989 - 51443 = 1546)
Example #30
 def _sample_using_random(df, p: float = 0.1, seed: int = 1):
     """Take a random sample of rows, retaining proportion p"""
     res = (df.withColumn(
         "rand", F.rand(seed=seed)).filter(F.col("rand") < p).drop("rand"))
     return res
Example #31
def shuffle(dataset):
    dataset = dataset.orderBy(rand())

    return dataset
regexTokenizer = RegexTokenizer(inputCol="comment",
                                outputCol="text",
                                pattern="\\W")
df_clean = regexTokenizer.transform(df_clean)
df_clean.show(10)

# COMMAND ----------

# MAGIC %md
# MAGIC ##### Alert: First try is to use 1,000,000 rows for testing

# COMMAND ----------

from pyspark.sql.functions import rand

df_clean.orderBy(rand(seed=0)).createOrReplaceTempView("table1")
df_clean = spark.sql("select * from table1 limit 1000000")

# COMMAND ----------

# use word2vec get text vector feature.
from pyspark.ml.feature import Word2Vec
# Learn a mapping from words to Vectors. (choose higher vectorSize here)
#word2Vec = Word2Vec(vectorSize=20, minCount=1, inputCol="text", outputCol="wordVector")
word2Vec = Word2Vec(vectorSize=50,
                    minCount=1,
                    inputCol="text",
                    outputCol="wordVector")
model = word2Vec.fit(df_clean)

df_model = model.transform(df_clean)
Example #33
output_path = "s3://emr-rwes-pa-spark-dev-datastore/BI_IPF_2016/02_results/"
start_time = time.time()
st = datetime.datetime.fromtimestamp(start_time).strftime('%Y%m%d_%H%M%S')

table_name = "hive_test_" + st
datafactz_table_name = "hive_test_datafactz_" + st

pos = sqlContext.read.load((data_path + pos_file),
                           format='com.databricks.spark.csv',
                           header='true',
                           inferSchema='true')

neg = sqlContext.read.load((data_path + neg_file),
                           format='com.databricks.spark.csv',
                           header='true',
                           inferSchema='true')

dataColumns = pos.columns

data = pos.select(dataColumns).unionAll(neg.select(dataColumns))

#for IMS
data.write.save(path=output_path + table_name, format='orc')

#for datafactz
df = sqlContext.range(0, numRowsReq)
datafactz_df = df.select(rand().alias("Col1"),
                         rand().alias("Col2"),
                         rand().alias("Col3"))
datafactz_df.write.save(path=output_path + datafactz_table_name, format='orc')
Example #34
def array_choice(col):
    index = (F.rand() * F.size(col)).cast("int")
    return col[index]
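A small hypothetical usage of `array_choice`, picking one random element per row from an array column (an active `spark` session is assumed):

import pyspark.sql.functions as F

df = spark.createDataFrame([(["a", "b", "c"],), (["x", "y"],)], ["letters"])
df.withColumn("pick", array_choice(F.col("letters"))).show()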
Example #35
def main():
    # Setup Spark
    spark = SparkSession.builder.master("local[*]").getOrCreate()

    # Nice way to write a tmp file onto the system
    temp_csv_file = tempfile.mktemp()
    with open(temp_csv_file, mode="wb") as f:
        data_https = requests.get("https://teaching.mrsharky.com/data/iris.data")
        f.write(data_https.content)

    fisher_df_orig = spark.read.csv(temp_csv_file, inferSchema="true", header="false")
    fisher_df_orig.persist(StorageLevel.MEMORY_ONLY)
    fisher_df_orig.show()

    # Change column names
    column_names = [
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
        "class",
    ]
    fisher_df_orig = fisher_df_orig.toDF(*column_names)

    # Randomize order of rows
    fisher_df_orig = fisher_df_orig.withColumn("random", rand()).orderBy("random")

    # Make a copy of the DataFrame (so we can start over)
    fisher_df = fisher_df_orig
    fisher_df.createOrReplaceTempView("fisher")
    print_heading("Original Dataset")
    fisher_df.show()

    # Get the average of all continuous fields
    print_heading("Population Average")
    fisher_avg_df = spark.sql(
        """
        SELECT
                AVG(sepal_length) AS avg_sepal_length
                , AVG(sepal_width) AS avg_sepal_width
                , AVG(petal_length) AS avg_petal_length
                , AVG(petal_width) AS avg_petal_width
            FROM fisher
        """
    )
    fisher_avg_df.show()

    # Get the average of all continuous fields by class
    print_heading("Average by class")
    fisher_avg_df = spark.sql(
        """
        SELECT
                class
                , AVG(sepal_length) AS avg_sepal_length
                , AVG(sepal_width) AS avg_sepal_width
                , AVG(petal_length) AS avg_petal_length
                , AVG(petal_width) AS avg_petal_width
            FROM fisher
            GROUP BY class
            ORDER BY class
        """
    )
    fisher_avg_df.show()

    # Build a features vector
    print_heading("VectorAssembler")
    vector_assembler = VectorAssembler(
        inputCols=["sepal_length", "sepal_width", "petal_length", "petal_width"],
        outputCol="features",
    )
    fisher_df = vector_assembler.transform(fisher_df)
    fisher_df.show()

    # Label String Indexer
    print_heading("StringIndexer")
    label_indexer = StringIndexer(inputCol="class", outputCol="class_idx")
    label_indexer_model = label_indexer.fit(fisher_df)
    fisher_df = label_indexer_model.transform(fisher_df)
    fisher_df.show()

    # Random forest
    print_heading("RandomForestClassifier")
    random_forest = RandomForestClassifier(
        labelCol="class_idx",
        featuresCol="features",
    )
    random_forest_model = random_forest.fit(fisher_df)
    fisher_df_predicted = random_forest_model.transform(fisher_df)
    fisher_df_predicted.createOrReplaceTempView("predicted")
    fisher_df_predicted.show()

    print_heading("Accuracy")
    fisher_df_accuracy = spark.sql(
        """
        SELECT
                SUM(correct)/COUNT(*) AS accuracy
            FROM
                (SELECT
                        CASE WHEN prediction == class_idx THEN 1
                        ELSE 0 END AS correct
                    FROM predicted) AS TMP
              """
    )
    fisher_df_accuracy.show()

    # Pipeline
    print_heading("Pipeline")
    fisher_df = fisher_df_orig
    fisher_df.createOrReplaceTempView("fisher")
    pipeline = Pipeline(stages=[vector_assembler, label_indexer, random_forest])
    model = pipeline.fit(fisher_df)
    fisher_df_predicted = model.transform(fisher_df)
    fisher_df_predicted.show()
    return
Example #36
from pyspark.sql.functions import rand, randn, mean, min, max
from pyspark.sql.context import SQLContext
from pyspark.context import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("sparkDataFrame")
sc = SparkContext(conf = conf)
sqlcontext = SQLContext(sc)

# 1. Create a DataFrame with one int column and 10 rows.
df = sqlcontext.range(0, 10)
df.show()

# Generate two other columns using uniform distribution and normal distribution.
df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal"))
df.show()

# 2. Summary and Descriptive Statistics
df = sqlcontext.range(0, 10).withColumn('uniform', rand(seed=10)).withColumn('normal', randn(seed=27))
df.describe('uniform', 'normal').show()

df.select([mean('uniform'), min('uniform'), max('uniform')]).show()

# 3. Sample covariance and correlation
# Covariance is a measure of how two variables change with respect to each other. 
# A positive number would mean that there is a tendency that as one variable increases, 
# the other increases as well. 
# A negative number would mean that as one variable increases, 
# the other variable has a tendency to decrease.
df = sqlcontext.range(0, 10).withColumn('rand1', rand(seed=10)).withColumn('rand2', rand(seed=27))
df.stat.cov('rand1', 'rand2')
df.stat.cov('id', 'id')
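The correlation counterpart would follow the same pattern (a small addition, not part of the original snippet):

# Correlation is the normalized covariance; a column is perfectly correlated with itself.
df.stat.corr('rand1', 'rand2')
df.stat.corr('id', 'id')  # 1.0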
Example #37
from pyspark.shell import sqlContext
from pyspark.sql.functions import rand, randn
from pyspark.sql import *
from pyspark.sql.functions import mean, min, max

df = sqlContext.range(0, 7)

df.show()

df.select("id",
          rand(seed=10).alias("uniform"),
          randn(seed=27).alias("normal")).show()

df.describe("uniform", "normal").show()

dfNew = df.describe("uniform", "normal").show()

dfNew.select([mean("uniform"), min("uniform"), max("uniform")]).show()
Example #38
            'prediction_vote',
            f.when(df.confidence_vote > 0.5, 1.0).otherwise(0.0))
        df = df.withColumn(
            'confidence_vote',
            f.when(df.prediction_vote == 0.0,
                   1 - df.prediction_vote).otherwise(df.prediction_vote))
        return (df)


# Import des data
training_large = [dir + '/Data/stemmed_amazon_500k_train.txt']
test_large = ['./Data/stemmed_amazon_500k_test.txt']
test_imbd = [dir + '/Data/imdb_yelp.txt']

X_train_large = loadData(training_large, minDF=1, TFIDF_b=True)
X_train_large = X_train_large.orderBy(rand())
X_test_large = loadData(test_large, train_cv=0, TFIDF_b=True)
X_test_imbd = loadData(test_imbd, train_cv=0, TFIDF_b=True)

X_train_large.groupby('score').count().show()
X_test_large.groupby('score').count().show()
X_test_imbd.groupby('score').count().show()

# Model path
NB_model_path = './Model/NB_model_500k'
LR_model_path = './Model/LR_model_500k'
RF_model_path = './Model/RF_model_500k'

# Naive Bayes
NB = NaiveBayes(modelType="multinomial", labelCol="score", featuresCol="X")
NB_model = NB.fit(X_train_large)
Example #39
def _do_stratification_spark(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    is_partitioned=True,
    is_random=True,
    seed=42,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Helper function to perform stratified splits.

    This function splits data in a stratified manner. That is, the same values for the
    filter_by column are retained in each split, but the corresponding set of entries
    are divided according to the ratio provided.

    Args:
        data (pyspark.sql.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two sets and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios are not summed to 1, they will be normalized.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the two is to filter
            with min_rating.
        is_partitioned (bool): flag to partition data by filter_by column
        is_random (bool): flag to make split randomly or use timestamp column
        seed (int): Seed.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps.

    Returns:
        list: Splits of the input data as pyspark.sql.DataFrame.
    """
    # A few preliminary checks.
    if filter_by not in ["user", "item"]:
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError(
            "min_rating should be integer and larger than or equal to 1.")

    if col_user not in data.columns:
        raise ValueError("Schema of data not valid. Missing User Col")

    if col_item not in data.columns:
        raise ValueError("Schema of data not valid. Missing Item Col")

    if not is_random:
        if col_timestamp not in data.columns:
            raise ValueError("Schema of data not valid. Missing Timestamp Col")

    if min_rating > 1:
        data = min_rating_filter_spark(
            data=data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    split_by = col_user if filter_by == "user" else col_item
    partition_by = split_by if is_partitioned else []

    col_random = "_random"
    if is_random:
        data = data.withColumn(col_random, F.rand(seed=seed))
        order_by = F.col(col_random)
    else:
        order_by = F.col(col_timestamp)

    window_count = Window.partitionBy(partition_by)
    window_spec = Window.partitionBy(partition_by).orderBy(order_by)

    data = (data.withColumn("_count",
                            F.count(split_by).over(window_count)).withColumn(
                                "_rank",
                                F.row_number().over(window_spec) /
                                F.col("_count")).drop("_count", col_random))
    # Persist to avoid duplicate rows in splits caused by lazy evaluation
    data.persist(StorageLevel.MEMORY_AND_DISK_2).count()

    multi_split, ratio = process_split_ratio(ratio)
    ratio = ratio if multi_split else [ratio, 1 - ratio]

    splits = []
    prev_split = None
    for split in np.cumsum(ratio):
        condition = F.col("_rank") <= split
        if prev_split is not None:
            condition &= F.col("_rank") > prev_split
        splits.append(data.filter(condition).drop("_rank"))
        prev_split = split

    return splits
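A hedged usage sketch of this helper: a per-user stratified 75/25 split, random within each user, for an assumed `ratings` DataFrame:

train, test = _do_stratification_spark(ratings, ratio=0.75, filter_by="user",
                                       is_partitioned=True, is_random=True, seed=42)
print(train.count(), test.count())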
Example #40
counts2 = (
    inventory
    # Select only the columns that are needed
    .select(['id','element'])
    # Group by source and count destinations
    .groupBy('id')
    .agg({'element': 'count'})
    .orderBy('count(element)', ascending=False)
    .select(
        F.col('id'),
        F.col('count(element)').alias('element_count')
    )
)
# By adding 'element' to the groupby we can determine that there are no duplicates for each element of each inventory

inventory2 = inventory.withColumn("core_flag", F.rand())
inventory2 = inventory.withColumn("prcp_flag", F.rand())
inventory2.show()

inventory2 = inventory2.withColumn(
    "core_flag",
    F.when((F.col('element') == "TMAX")
           | (F.col('element') == "TMIN")
           | (F.col('element') == "PRCP")
           | (F.col('element') == "SNOW")
           | (F.col('element') == "SNWD"), 1).otherwise(0))

inventory2 = inventory2.withColumn(
    "prcp_flag",
    F.when((F.col('element') == "PRCP"), 1).otherwise(0))
Example #41
from pyspark.ml.feature import StandardScaler

# Initialize the `standardScaler`
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled")

# Fit the DataFrame to the scaler
scaler = standardScaler.fit(df)

# Transform the data in `df` with the scaler
scaled_df = scaler.transform(df)

# Inspect the result
scaled_df.take(2)

from pyspark.sql.functions import rand 
df = df.orderBy(rand())
train_data, test_data = df.randomSplit([0.8, 0.2],seed=1234)


# Fitting the LogisticRegression: Change the below code for all the types of algorithms that we need for project
from pyspark.ml.classification import LogisticRegression
mlr = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10)

# Fit the model
mlrModel = mlr.fit(train_data)

#Predict the values for test_data
predicted = mlrModel.transform(test_data)
predicted.head(5)

Example #42
# COMMAND ----------

from pyspark.sql.functions import rand, randn
# Create a DataFrame with one int column and 10 rows.
df = sqlContext.range(0, 10)
df.show()

# COMMAND ----------

display(df)

# COMMAND ----------

# Generate two other columns using uniform distribution and normal distribution.
df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")).show()


# COMMAND ----------

display(df.select("id", rand(seed=10).alias("uniform"), randn(seed=27).alias("normal")))

# COMMAND ----------

# MAGIC %md ### Summary and Descriptive Statistics
# MAGIC 
# MAGIC The first operation to perform after importing data is to get some sense of what it looks like. For numerical columns, knowing the descriptive summary statistics can help a lot in understanding the distribution of your data. The function `describe` returns a DataFrame containing information such as number of non-null entries (count), mean, standard deviation, and minimum and maximum value for each numerical column.

# COMMAND ----------

from pyspark.sql.functions import rand, randn
Example #43
def combine_matrix(X, y, top = 4):
    """Create the data matrix for predictive modeling

    Notes: The default top n number is 4

    Args:
        X(SparkSQL DataFrame):
        y(SparkSQL DataFrame):

    Return:
        matrixAll(SparkSQL DataFrame):

    """
    # logging.info('Creating the big matrix X:y...')
    # y = hc.createDataFrame(y)
    ### Change y's column name 'serial_number' to 'SN'
    y = y.withColumnRenamed('serial_number', 'SN')
    ### Join X and y on serial_number, SN
    ### Add a new column 'y' specify return (1) or pass (0)
    matrixAll = (X.join(y, X.serial_number == y.SN, how = 'left_outer')
                  .withColumn('y', y['SN'].isNotNull().cast('int')))

    # matrixAll.cache()
    ### Drop row that has null values
    matrixAllDropNa = matrixAll.dropna(how = 'any')
    
    # matrixAllDropNa.cache()
    print('to pandas()')
    symptomLocationPdf = matrixAllDropNa[['check_in_code', 'fail_location']].toPandas()
    print('complete toPandas()')
    # locationPdf = matrixAllDropNa[['fail_location']].toPandas()
    #return symptomPdf
    #return matrixAllDropNa, matrixAll
    
    codeSeries = symptomLocationPdf['check_in_code'].value_counts()
    #print codeSeries
    locationSeries = symptomLocationPdf['fail_location'].value_counts()
    ### Top N symptoms and fail locations
    codeDict = {}
    locationDict = {}
    for i in range(top):
        # top n check in codes
        code = codeSeries.index[i]
        #codeLabel = 'code_{}'.format(i)
        codeLabel = '{}'.format(code)
        codeDict[code] = codeSeries[i]
        print('top {} symptom: {}, count: {}'.format(i+1, code, codeSeries[i]))
        matrixAll = (matrixAll.withColumn(codeLabel, (matrixAll['check_in_code'].like('%{}'.format(code))).cast('int'))
                              .fillna({codeLabel: 0}))

        # top n fail locations
        location = locationSeries.index[i]
        #locationLabel = 'location_{}'.format(i)
        locationLabel = '{}'.format(location)
        locationDict[location] = locationSeries[i]
        #print location
        print('top {} fail location: {}, count: {}'.format(i+1, location, locationSeries[i]))
        matrixAll = (matrixAll.withColumn(locationLabel, (matrixAll['fail_location'].like('%{}'.format(location))).cast('int'))
                              .fillna({locationLabel: 0}))

    # add a random integer column from 1 to 100 for later on sampling of training samples
    matrixAllRandDf = matrixAll.withColumn('random', rand())

    # transform the float random number to integer between 1 to 100
    matrixAllIntDf = matrixAllRandDf.withColumn('randInt', (matrixAllRandDf.random * 100).cast('int'))
    
    # cache the whole matrix table
    matrixAllIntDf.cache()
    
    return matrixAllIntDf
Example #44
 def _transform(self, dataset):
     return dataset.withColumn("prediction",
                               dataset.feature + (rand(0) * self.getInducedError()))
Example #45
    'RNTP', 'SMP', 'VALP', 'WATP', 'GRNTP', 'GRPIP', 'GASP', 'NOC', 'NPF',
    'NRC', 'OCPIP', 'SMOCP', 'AGEP', 'INTP', 'JWMNP', 'OIP', 'PAP', 'RETP',
    'SEMP', 'SSIP', 'SSP', 'WKHP', 'POVPIP'
]
ordinals = [
    'AGS', 'YBL', 'MV', 'TAXP', 'CITWP', 'DRAT', 'JWRIP', 'MARHT', 'MARHYP',
    'SCHG', 'SCHL', 'WKW', 'YOEP', 'DECADE', 'JWAP', 'JWDP', 'SFN'
]

################################################################
#fill all null numericals value with 0
df = df.fillna(0, numericals)

# SPLIT DATASET
from pyspark.sql.functions import rand
(train_set, test_set) = df.orderBy(rand()).randomSplit([0.7, 0.3])

###############################################################
#INDEXING AND SCALING

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.feature import StandardScaler

utils.printNowToFile("starting pipeline")

ordinals_input = [col + "_index" for col in ordinals]
stdFeatures = ['numericals_std', 'ordinals_std']

# stages for index and scaling pipeline
stages = [