Example #1
    def test_vectorized_udf_complex(self):
        df = self.spark.range(10).select(
            col('id').cast('int').alias('a'),
            col('id').cast('int').alias('b'),
            col('id').cast('double').alias('c'))
        add = pandas_udf(lambda x, y: x + y, IntegerType())
        power2 = pandas_udf(lambda x: 2 ** x, IntegerType())
        mul = pandas_udf(lambda x, y: x * y, DoubleType())
        res = df.select(add(col('a'), col('b')), power2(col('a')), mul(col('b'), col('c')))
        expected = df.select(expr('a + b'), expr('power(2, a)'), expr('b * c'))
        self.assertEqual(expected.collect(), res.collect())
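
A minimal standalone sketch of the same pattern outside the test harness (assumes Spark >= 2.3 with PyArrow installed): a scalar pandas_udf receives whole columns as pandas Series and is compared against the equivalent SQL expression.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, pandas_udf
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.getOrCreate()
df = spark.range(5).select(col("id").cast("int").alias("a"),
                           col("id").cast("int").alias("b"))
add = pandas_udf(lambda x, y: x + y, IntegerType())  # x and y arrive as pandas Series
df.select(add(col("a"), col("b")).alias("udf_sum"), expr("a + b").alias("sql_sum")).show()
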
Example #2
    def test_expr(self):
        from pyspark.sql import functions

        row = Row(a="length string", b=75)
        df = self.sqlCtx.createDataFrame([row])
        result = df.select(functions.expr("length(a)")).collect()[0].asDict()
        self.assertEqual(13, result["'length(a)"])
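
The same expr call outside the test harness; a small sketch that aliases the column so the result does not depend on the auto-generated column name.

from pyspark.sql import Row, SparkSession
from pyspark.sql.functions import expr

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([Row(a="length string", b=75)])
print(df.select(expr("length(a)").alias("len_a")).collect()[0]["len_a"])  # 13
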
Example #3
    def test_register_vectorized_udf_basic(self):
        df = self.spark.range(10).select(
            col('id').cast('int').alias('a'),
            col('id').cast('int').alias('b'))
        original_add = pandas_udf(lambda x, y: x + y, IntegerType())
        self.assertEqual(original_add.deterministic, True)
        self.assertEqual(original_add.evalType, PythonEvalType.SQL_SCALAR_PANDAS_UDF)
        new_add = self.spark.catalog.registerFunction("add1", original_add)
        res1 = df.select(new_add(col('a'), col('b')))
        res2 = self.spark.sql(
            "SELECT add1(t.a, t.b) FROM (SELECT id as a, id as b FROM range(10)) t")
        expected = df.select(expr('a + b'))
        self.assertEqual(expected.collect(), res1.collect())
        self.assertEqual(expected.collect(), res2.collect())
Example #4
    def _calculate_metrics(self):
        """Calculate ranking metrics."""
        self._items_for_user_pred = self.rating_pred

        self._items_for_user_true = (
            self.rating_true
            .groupBy(self.col_user)
            .agg(expr("collect_list(" + self.col_item + ") as ground_truth"))
            .select(self.col_user, "ground_truth")
        )

        self._items_for_user_all = self._items_for_user_pred.join(
            self._items_for_user_true, on=self.col_user
        ).drop(self.col_user)

        return RankingMetrics(self._items_for_user_all.rdd)
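
For reference, RankingMetrics expects an RDD in which each element is a pair of (predicted item list, ground-truth item list) per user; a tiny sketch with made-up data:

from pyspark.sql import SparkSession
from pyspark.mllib.evaluation import RankingMetrics

spark = SparkSession.builder.getOrCreate()
pairs = spark.sparkContext.parallelize([
    (["b1", "b2", "b3"], ["b2", "b4"]),  # user 1: predictions vs. ground truth
    (["b5", "b6"], ["b5"]),              # user 2
])
metrics = RankingMetrics(pairs)
print(metrics.precisionAt(2), metrics.meanAveragePrecision)
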
Example #5
def distance_bw_parking_spots(data):
    if not isinstance(data, DataFrame):
        raise ValueError('Type passed to distance_bw_parking_spots() should be DataFrame.')
    data = data.\
        withColumn('LONGITUDE_ORIGINE_rad', F.expr('radians(LONGITUDE_ORIGINE)')).\
        withColumn('LATITUDE_ORIGINE_rad', F.expr('radians(LATITUDE_ORIGINE)')).\
        withColumn('LONGITUDE_DESTINATION_rad', F.expr('radians(LONGITUDE_DESTINATION)')).\
        withColumn('LATITUDE_DESTINATION_rad', F.expr('radians(LATITUDE_DESTINATION)')).\
        withColumn('Diff_long', F.expr('(LONGITUDE_DESTINATION_rad-LONGITUDE_ORIGINE_rad)/2')).\
        withColumn('Diff_lat', F.expr('(LATITUDE_DESTINATION_rad-LATITUDE_ORIGINE_rad)/2')).\
        withColumn('LATITUDE_DESTINATION_cos', F.expr('cos(LATITUDE_DESTINATION_rad)')).\
        withColumn('LATITUDE_ORIGINE_cos', F.expr('cos(LATITUDE_ORIGINE_rad)')).\
        withColumn('Diff_long', F.expr('sin(Diff_long)')).\
        withColumn('Diff_lat', F.expr('sin(Diff_lat)')).\
        withColumn('A', F.expr('Diff_lat*Diff_lat + LATITUDE_DESTINATION_cos * LATITUDE_ORIGINE_cos * Diff_long * Diff_long')).\
        withColumn('One_minus_A', F.expr('1-A')).\
        withColumn('C', F.expr('2 * atan2( sqrt(A), sqrt(One_minus_A))')).\
        withColumn('Distance_km', F.expr('6373.0*C'))

    # cols_needed = ['DATE_ORIGINE', 'LONGITUDE_ORIGINE', 'LATITUDE_ORIGINE', 'Distance(Km)', 'MOTIF_REMORQUAGE']
    
    df_final = data.select('DATE_ORIGINE', 'LONGITUDE_ORIGINE', 'LATITUDE_ORIGINE', 'Distance_km', 'MOTIF_REMORQUAGE')
    try:
        assert df_final.count()==250077
    except AssertionError:
        logging.error('Final count does not match before removing NA. Saving to file anyway...')

    df_final = df_final.na.drop()

    try:
        assert df_final.count()==248476
    except AssertionError:
        logging.error('Final count does not match after removing NA. Saving to file anyway...')
    
    return df_final
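
The column chain above implements the haversine great-circle distance with an Earth radius of 6373 km. A more compact equivalent (a sketch, assuming the same LONGITUDE_*/LATITUDE_* column names) computes it in a single SQL expression:

import pyspark.sql.functions as F

# haversine: d = 2 * R * asin(sqrt(sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2)))
haversine_expr = (
    "2 * 6373.0 * asin(sqrt("
    "pow(sin((radians(LATITUDE_DESTINATION) - radians(LATITUDE_ORIGINE)) / 2), 2)"
    " + cos(radians(LATITUDE_ORIGINE)) * cos(radians(LATITUDE_DESTINATION))"
    " * pow(sin((radians(LONGITUDE_DESTINATION) - radians(LONGITUDE_ORIGINE)) / 2), 2)))"
)
# data = data.withColumn('Distance_km', F.expr(haversine_expr))
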
def main(spark):
    # Format the VIIRS dataset
    viirsDf = spark.read \
            .format("csv") \
            .option("header", True) \
            .option("inferSchema", True) \
            .load("/tmp/{}".format(viirs_file))

    viirsDf2 = viirsDf \
            .withColumn("acq_time_min", F.expr("acq_time % 100")) \
            .withColumn("acq_time_hr", F.expr("int(acq_time / 100)")) \
            .withColumn("acq_time2", F.unix_timestamp(F.col("acq_date"))) \
            .withColumn("acq_time3", F.expr("acq_time2 + acq_time_min * 60 + acq_time_hr * 3600")) \
            .withColumn("acq_datetime", F.from_unixtime(F.col("acq_time3"))) \
            .drop("acq_date", "acq_time", "acq_time_min", "acq_time_hr", "acq_time2", "acq_time3") \
            .withColumnRenamed("confidence", "confidence_level") \
            .withColumn("brightness", F.lit(None)) \
            .withColumn("bright_t31", F.lit(None))

    viirsDf2.show()
    viirsDf2.printSchema()

    # This piece of code shows the repartition by confidence level, so you
    # can compare when you convert the confidence as a % to a level for the
    # MODIS dataset.
    df = viirsDf2.groupBy("confidence_level").count()
    count = viirsDf2.count()
    df = df.withColumn("%", F.round(F.expr("100 / {} * count".format(count)),
                                    2))
    df.show()

    # Format the MODIS dataset
    low = 40
    high = 100

    modisDf = spark.read.format("csv") \
            .option("header", True) \
            .option("inferSchema", True) \
            .load("/tmp/{}".format(modis_file)) \
            .withColumn("acq_time_min", F.expr("acq_time % 100")) \
            .withColumn("acq_time_hr", F.expr("int(acq_time / 100)")) \
            .withColumn("acq_time2", F.unix_timestamp(F.col("acq_date"))) \
            .withColumn("acq_time3", F.expr("acq_time2 + acq_time_min * 60 + acq_time_hr * 3600")) \
            .withColumn("acq_datetime", F.from_unixtime(F.col("acq_time3"))) \
            .drop("acq_date", "acq_time", "acq_time_min", "acq_time_hr", "acq_time2", "acq_time3") \
            .withColumn("confidence_level", F.when(F.col("confidence") <= F.lit(low), "low")
                        .when((F.col("confidence") > F.lit(low)) & (F.col("confidence") < F.lit(high)), "nominal")
                        .when(F.isnull(F.col("confidence")), "high")
                        .otherwise(F.col("confidence"))) \
            .drop("confidence") \
            .withColumn("bright_ti4", F.lit(None)) \
            .withColumn("bright_ti5", F.lit(None))

    modisDf.show()
    modisDf.printSchema()

    # This piece of code shows the repartition by confidence level, so you
    # can compare when you convert the confidence as a % to a level for the
    # MODIS dataset.
    df = modisDf.groupBy("confidence_level").count()
    count = modisDf.count()
    df = df.withColumn("%", F.round(F.expr("100 / {} * count".format(count)),
                                    2))
    df.show()

    wildfireDf = viirsDf2.unionByName(modisDf)
    wildfireDf.show()
    wildfireDf.printSchema()

    logging.info("# of partitions: {}".format(
        wildfireDf.rdd.getNumPartitions()))

    wildfireDf.write.format("parquet") \
            .mode("overwrite") \
            .save("/tmp/fires_parquet")

    outputDf = wildfireDf.filter("confidence_level = 'high'") \
     .repartition(1)

    outputDf.write.format("csv") \
            .option("header", True) \
            .mode("overwrite") \
            .save("/tmp/high_confidence_fires_csv")
Example #7
# COMMAND ----------

motifs = stationGraph.find("(a)-[ab]->(b); (b)-[bc]->(c); (c)-[ca]->(a)")


# COMMAND ----------

from pyspark.sql.functions import expr
motifs.selectExpr("*",
    "to_timestamp(ab.`Start Date`, 'MM/dd/yyyy HH:mm') as abStart",
    "to_timestamp(bc.`Start Date`, 'MM/dd/yyyy HH:mm') as bcStart",
    "to_timestamp(ca.`Start Date`, 'MM/dd/yyyy HH:mm') as caStart")\
  .where("ca.`Bike #` = bc.`Bike #`").where("ab.`Bike #` = bc.`Bike #`")\
  .where("a.id != b.id").where("b.id != c.id")\
  .where("abStart < bcStart").where("bcStart < caStart")\
  .orderBy(expr("cast(caStart as long) - cast(abStart as long)"))\
  .selectExpr("a.id", "b.id", "c.id", "ab.`Start Date`", "ca.`End Date`")
  .limit(1).show(1, False)


# COMMAND ----------

from pyspark.sql.functions import desc
ranks = stationGraph.pageRank(resetProbability=0.15, maxIter=10)
ranks.vertices.orderBy(desc("pagerank")).select("id", "pagerank").show(10)


# COMMAND ----------

inDeg = stationGraph.inDegrees
inDeg.orderBy(desc("inDegree")).show(5, False)
Example #8
def basic_rec_val(spark, dirname, rank, regParam, k, random_seed):
	
	val_set = spark.read.parquet(f'{dirname}/val.parquet')
	
	print(f'Validating on model with rank = {rank} and regParam = {regParam} trained using {dirname} data ...')
	
	# load corresponding trained model
	model = ALSModel.load(f'{dirname}/{rank}_{regParam}_model')

	# computing RMSE on validation set
	predictions = model.transform(val_set)
	evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')
	rmse = evaluator.evaluate(predictions)

	print(f'rmse: {rmse}')
							
	print(f'Constructing top {k} books recommended to per user ...')
	val_users = val_set.select('user_id').distinct()

	start_time = time.time()

	perUserPredictedTopKItemsDF = model.recommendForUserSubset(val_users, k)

	myudf = udf(extract_item, ArrayType(IntegerType()))
	perUserPredictedTopKItemsDF = perUserPredictedTopKItemsDF.withColumn('predictions', myudf(perUserPredictedTopKItemsDF['recommendations'])).drop('recommendations')

	print('Constructing actual books per user ...')
	perUserActualItemsDF = val_set.filter(column('rating') >= 3.0).groupBy('user_id').agg(expr('collect_list(book_id) as book_ids'))

	print('Constructing Ranking Metrics ...')
	perUserItemsRDD = perUserPredictedTopKItemsDF.join(perUserActualItemsDF, 'user_id').rdd.map(lambda row: (row[1], row[2]))

	rankingMetrics = RankingMetrics(perUserItemsRDD)

	precisionAtK = rankingMetrics.precisionAt(k)
	mAP = rankingMetrics.meanAveragePrecision

	end_time = time.time()
	time_delta = str(datetime.timedelta(seconds = end_time - start_time))

	print(f'p@{k}: {precisionAtK}')
	print(f'mAP: {mAP}')
	print(f'run time: {time_delta}')
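
The extract_item UDF is referenced above but not shown. Assuming ALS was trained with itemCol='book_id', each element of the recommendations column is a struct with fields (book_id, rating), so the ids can also be pulled out without a Python UDF; a sketch:

from pyspark.sql.functions import expr

def top_k_items_without_udf(per_user_recs_df):
    # recommendations.book_id projects the book_id field out of every struct in the array
    return (per_user_recs_df
            .withColumn('predictions', expr('recommendations.book_id'))
            .drop('recommendations'))
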
Example #9
print("Recommendations: ------------------------------")
user_recs = best_model.recommendForAllUsers(500)
print(user_recs.count())


prediction_val = best_model.transform(df_validation)
print(" Predictions for validation dataset: ------------------------------")
prediction_val.show()


prediction_test = best_model.transform(df_test)
print(" Predictions for test dataset: ------------------------------")
prediction_test.show()


actual_val = df_validation.groupBy("user_id").agg(expr("collect_set(book_id) as books"))
pred_val = user_recs.select('user_id','recommendations.book_id')
output_val = pred_val.join(actual_val, ['user_id']).select('book_id', 'books')
metrics_val = RankingMetrics(output_val.rdd)
result_val = metrics_val.meanAveragePrecision
result_val2 = metrics_val.precisionAt(20)

print("Mean average precision for validation dataset: " + str(result_val))
print("Precision @ 20 for validation dataset: " + str(result_val2))
rmse_val = evaluator.evaluate(prediction_val)
print("RMSE for validation dataset=" + str(rmse_val))


actual_test = df_test.groupBy("user_id").agg(expr("collect_set(book_id) as books"))
pred_test = user_recs.select('user_id','recommendations.book_id')
output_test = pred_test.join(actual_test, ['user_id']).select('book_id', 'books')
])
df = spark.read.format("json").schema(myManualSchema)\
  .load("/data/flight-data/json/2015-summary.json")


# COMMAND ----------

from pyspark.sql.functions import col, column
col("someColumnName")
column("someColumnName")


# COMMAND ----------

from pyspark.sql.functions import expr
expr("(((someCol + 5) * 200) - 6) < otherCol")


# COMMAND ----------

from pyspark.sql import Row
myRow = Row("Hello", None, 1, False)


# COMMAND ----------

myRow[0]
myRow[2]


# COMMAND ----------
def prepare_df(
    df: pyspark.sql.DataFrame,
    store_csv: pyspark.sql.DataFrame,
    store_states_csv: pyspark.sql.DataFrame,
    state_names_csv: pyspark.sql.DataFrame,
    google_trend_csv: pyspark.sql.DataFrame,
    weather_csv: pyspark.sql.DataFrame,
) -> pyspark.sql.DataFrame:
    num_rows = df.count()

    # expand dates
    df = expand_date(df)

    # convert the special-event flag columns (Promo/Holiday where sales were zero or the store was closed) to booleans.
    df = (df.withColumn("Open", df.Open != "0").withColumn(
        "Promo",
        df.Promo != "0").withColumn("StateHoliday",
                                    df.StateHoliday != "0").withColumn(
                                        "SchoolHoliday",
                                        df.SchoolHoliday != "0"))

    # merge store information
    store = store_csv.join(store_states_csv, "Store")
    df = df.join(store, "Store")

    # merge Google Trend information
    google_trend_all = prepare_google_trend(google_trend_csv)
    df = df.join(google_trend_all,
                 ["State", "Year", "Week"]).select(df["*"],
                                                   google_trend_all.trend)

    # merge in Google Trend for whole Germany
    google_trend_de = google_trend_all[google_trend_all.file ==
                                       "Rossmann_DE"].withColumnRenamed(
                                           "trend", "trend_de")
    df = df.join(google_trend_de,
                 ["Year", "Week"]).select(df["*"], google_trend_de.trend_de)

    # merge weather
    weather = weather_csv.join(state_names_csv,
                               weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ["State", "Date"])

    # fix null values
    df = (df.withColumn(
        "CompetitionOpenSinceYear",
        F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900)),
    ).withColumn(
        "CompetitionOpenSinceMonth",
        F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1)),
    ).withColumn("Promo2SinceYear",
                 F.coalesce(df.Promo2SinceYear, F.lit(1900))).withColumn(
                     "Promo2SinceWeek", F.coalesce(df.Promo2SinceWeek,
                                                   F.lit(1))))

    # days and months since the competition has been open, cap it to 2 years
    df = df.withColumn(
        "CompetitionOpenSince",
        F.to_date(
            F.format_string("%s-%s-15", df.CompetitionOpenSinceYear,
                            df.CompetitionOpenSinceMonth)),
    )
    df = df.withColumn(
        "CompetitionDaysOpen",
        F.when(
            df.CompetitionOpenSinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(360 * 2),
                        F.datediff(df.Date, df.CompetitionOpenSince)),
            ),
        ).otherwise(0),
    )
    df = df.withColumn("CompetitionMonthsOpen",
                       (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # days and weeks of promotion, cap it to 25 weeks
    df = df.withColumn(
        "Promo2Since",
        F.expr(
            'date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'
        ),
    )
    df = df.withColumn(
        "Promo2Days",
        F.when(
            df.Promo2SinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))),
        ).otherwise(0),
    )
    df = df.withColumn("Promo2Weeks",
                       (df.Promo2Days / 7).cast(T.IntegerType()))

    # ensure that no row was lost through inner joins
    assert num_rows == df.count(), "lost rows in joins"
    return df
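
The Promo2Since expression above approximates ISO week N of a year as January 1 plus (N - 1) * 7 days; a small check with made-up values:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([(2014, 5)], ["Promo2SinceYear", "Promo2SinceWeek"])
demo.withColumn(
    "Promo2Since",
    F.expr('date_add(format_string("%s-01-01", Promo2SinceYear), '
           '(cast(Promo2SinceWeek as int) - 1) * 7)')).show()  # 2014-01-29
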
Example #12
# Databricks notebook source
from pyspark.sql.functions import expr, pow, col, round, bround, lit, corr
# create the DataFrame
df = spark.read.format("csv")\
  .option("header","true")\
  .option("inferSchema","true")\
  .load("/databricks-datasets/definitive-guide/data/retail-data/by-day/2010-12-01.csv")
df.createOrReplaceGlobalTempView("dfTable")

# pow
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("RealQuantity")).show(2)
# both columns are numeric, so the computation works

# rounding (round / bround)
df.select(round(lit("1.6")), bround(lit("1.6"))).show(2)
# correlation between the two columns
df.stat.corr("Quantity", "UnitPrice")
df.select(corr("Quantity", "UnitPrice")).show(2)

# describe (mean, standard deviation, min, max, count)
# the statistics schema may change, so use it for inspection only
df.describe().show(6)

# StatFunctions package
# accessed via the .stat attribute
ColName = "UnitPrice"
quantiledProbs = [0.5]
relError = 0.05
df.stat.approxQuantile(ColName, quantiledProbs, relError)
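
approxQuantile(col, probabilities, relativeError) returns approximate quantiles within the given relative error; a minimal sketch on generated data:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
nums = spark.range(100).withColumnRenamed("id", "UnitPrice")
print(nums.stat.approxQuantile("UnitPrice", [0.5], 0.05))  # roughly the median of 0..99
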

# COMMAND ----------

wrongJoinExpression = person["name"] == graduateProgram["school"]


# COMMAND ----------

joinType = "inner"


# COMMAND ----------

gradProgram2 = graduateProgram.union(spark.createDataFrame([
    (0, "Masters", "Duplicated Row", "Duplicated School")]))

gradProgram2.createOrReplaceTempView("gradProgram2")


# COMMAND ----------

from pyspark.sql.functions import expr

person.withColumnRenamed("id", "personId")\
  .join(sparkStatus, expr("array_contains(spark_status, id)")).show()


# COMMAND ----------

# COMMAND ----------

from pyspark.sql.functions import sumDistinct
df.select(sumDistinct("Quantity")).show() # 29310


# COMMAND ----------

from pyspark.sql.functions import sum, count, avg, expr

df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()


# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"),
  stddev_pop("Quantity"), stddev_samp("Quantity")).show()


# COMMAND ----------
Example #15
    def aggregate(self, func_or_funcs, *args, **kwargs):
        """Aggregate using one or more operations over the specified axis.

        Parameters
        ----------
        func : dict
             a dict mapping from column name (string) to aggregate functions (string).

        Returns
        -------
        Series or DataFrame

            The return can be:

            * Series : when DataFrame.agg is called with a single function
            * DataFrame : when DataFrame.agg is called with several functions

            Return Series or DataFrame.

        Notes
        -----
        `agg` is an alias for `aggregate`. Use the alias.

        Examples
        --------

        >>> df = ks.DataFrame({'A': [1, 1, 2, 2],
        ...                    'B': [1, 2, 3, 4],
        ...                    'C': [0.362, 0.227, 1.267, -0.562]},
        ...                   columns=['A', 'B', 'C'])

        >>> df
           A  B      C
        0  1  1  0.362
        1  1  2  0.227
        2  2  3  1.267
        3  2  4 -0.562

        Different aggregations per column

        >>> aggregated = df.groupby('A').agg({'B': 'min', 'C': 'sum'})
        >>> aggregated[['B', 'C']]  # doctest: +NORMALIZE_WHITESPACE
           B      C
        A
        1  1  0.589
        2  3  0.705

        """
        if not isinstance(func_or_funcs, dict) or \
                not all(isinstance(key, str) and isinstance(value, str)
                        for key, value in func_or_funcs.items()):
            raise ValueError("aggs must be a dict mapping from column name (string) to aggregate "
                             "functions (string).")

        sdf = self._kdf._sdf
        groupkeys = self._groupkeys
        groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i))
                         for i, s in enumerate(groupkeys)]
        reordered = [F.expr('{1}({0}) as {0}'.format(key, value))
                     for key, value in func_or_funcs.items()]
        sdf = sdf.groupby(*groupkey_cols).agg(*reordered)
        metadata = Metadata(data_columns=[key for key, _ in func_or_funcs.items()],
                            index_map=[('__index_level_{}__'.format(i), s.name)
                                       for i, s in enumerate(groupkeys)])
        return DataFrame(sdf, metadata)
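
For the docstring example, the string templating above produces plain SQL aggregate expressions; a sketch of what gets wrapped in F.expr and passed to agg, independent of the Koalas internals:

func_or_funcs = {'B': 'min', 'C': 'sum'}
reordered = ['{1}({0}) as {0}'.format(key, value) for key, value in func_or_funcs.items()]
print(reordered)  # ['min(B) as B', 'sum(C) as C']
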
Example #16
    kafka_source_df = spark \
        .readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "invoice-items") \
        .option("startingOffsets", "earliest") \
        .load()

    avroSchema = open('schema/invoice-items', mode='r').read()

    value_df = kafka_source_df.select(from_avro(col("value"), avroSchema).alias("value"))

    rewards_df = value_df.filter("value.CustomerType == 'PRIME'") \
        .groupBy("value.CustomerCardNo") \
        .agg(sum("value.TotalValue").alias("TotalPurchase"),
             sum(expr("value.TotalValue * 0.2").cast("integer")).alias("AggregatedRewards"))

    kafka_target_df = rewards_df.select(expr("CustomerCardNo as key"),
                                        to_json(struct("TotalPurchase", "AggregatedRewards")).alias("value"))

    # kafka_target_df.show(truncate=False)

    rewards_writer_query = kafka_target_df \
        .writeStream \
        .queryName("Rewards Writer") \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("topic", "customer-rewards") \
        .outputMode("update") \
        .option("checkpointLocation", "chk-point-dir") \
        .start()
predictions = model.transform(test)
window = Window.partitionBy(predictions['user_id']).orderBy(
    predictions['prediction'].desc())
test_pred_order = predictions.select(
    '*',
    rank().over(window).alias('rank')).filter(col('rank') <= 500)

evaluator = RegressionEvaluator(metricName="rmse",
                                labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(test_pred_order)

# evaluate the model by computing the MAP on the validation data
test_pred_list = test_pred_order.select(
    'user_id',
    'book_id').groupBy('user_id').agg(expr('collect_list(book_id) as books'))
test_RDD = test_pred_list.join(test_true_list,
                               'user_id').rdd.map(lambda row: (row[1], row[2]))
rankingMetrics = RankingMetrics(test_RDD)
current_map = rankingMetrics.meanAveragePrecision

print(
    '\nThe best baseline model select by RMSE = {} has {} latent factors and '
    'regularization = {}  with maxIter = {} MAP = {}'.format(
        rmse, current_rank, reg, iteration, current_map))
"""
# evaluate read model

train_new = train.withColumn('rating',when(train.is_read == 0,float('nan')).otherwise(train.rating))
train_read = train_new.na.drop()
train_unread = train.subtract(train_read)
Example #18
    def test_udf_plus1(self):
        scala('''val plus1 = udf { x: Int => x + 1 }; spark.udf.register("plus1", plus1)''')
        from pyspark.sql.functions import expr
        res = self.spark.createDataFrame(range(10), "int").select(expr('plus1(value) value')).collect()
        res = [x.value for x in res]
        self.assertListEqual(res, list(range(1, 11)))
  .format("memory").outputMode("complete")\
  .start()


# COMMAND ----------

from time import sleep
for x in range(5):
    spark.sql("SELECT * FROM activity_counts").show()
    sleep(1)


# COMMAND ----------

from pyspark.sql.functions import expr
simpleTransform = streaming.withColumn("stairs", expr("gt like '%stairs%'"))\
  .where("stairs")\
  .where("gt is not null")\
  .select("gt", "model", "arrival_time", "creation_time")\
  .writeStream\
  .queryName("simple_transform")\
  .format("memory")\
  .outputMode("append")\
  .start()


# COMMAND ----------

deviceModelStats = streaming.cube("gt", "model").avg()\
  .drop("avg(Arrival_time)")\
  .drop("avg(Creation_Time)")\
Example #20
     StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
     StructField("count", LongType(), False, metadata={"hello": "world"})])

myRow = Row("Hello", None, 1)
myDf = spark.createDataFrame([myRow],myManualSchema)
myDf.show()

# Select and SelectExpr
# Selecting single column
df.select("DEST_COUNTRY_NAME").show(2)

# Selecting multiple columns
df.select("DEST_COUNTRY_NAME","ORIGIN_COUNTRY_NAME").show(2)

df.select(
    expr("DEST_COUNTRY_NAME"),
    col("DEST_COUNTRY_NAME"),
    column("DEST_COUNTRY_NAME")
).show(2)

# Using Alias -- select DEST_COUNTRY_NAME as destination from table
df.select(expr("DEST_COUNTRY_NAME As destination")).show(2)
df.select(expr("DEST_COUNTRY_NAME As destination").alias("DEST_COUNTRY_NAME")).show(2)
df.selectExpr("DEST_COUNTRY_NAME As destination","DEST_COUNTRY_NAME").show(2)

# SelectExpr example -- comparing the column values returns a boolean
df.selectExpr(
    "*","(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME ) as withinCountry"
).show(2)

df.selectExpr(
training_df, validation_df, testing_df = review.randomSplit([0.6, 0.3, 0.1],
                                                            seed=0)
[training_df.count(), validation_df.count(), testing_df.count()]

lambda_par = 0.02
alpha_par = 0.3
en_lr = LogisticRegression().\
        setLabelCol('sentiment').\
        setFeaturesCol('tfidf').\
        setRegParam(lambda_par).\
        setMaxIter(100).\
        setElasticNetParam(alpha_par)

en_lr_pipeline = Pipeline(stages=[idf_pipeline, en_lr]).fit(review)
en_lr_pipeline.transform(review).select(
    fn.avg(fn.expr('float(prediction = sentiment)'))).show()

en_weights = en_lr_pipeline.stages[-1].coefficients.toArray()
en_coeffs_df = pd.DataFrame({'word': vocabulary, 'weight': en_weights})

#en_coeffs_df.sort_values('weight').head(15)
#en_coeffs_df.sort_values('weight', ascending=False).head(15)
#en_coeffs_df.query('weight == 0.0').shape
en_coeffs_df.query('weight == 0.0').shape[0] / en_coeffs_df.shape[0]

from pyspark.ml.tuning import ParamGridBuilder

en_lr_estimator = Pipeline(stages=[idf_pipeline, en_lr])

grid = ParamGridBuilder().\
    addGrid(en_lr.regParam, [0., 0.01, 0.02]).\
Example #22
def get_aggregations(year, month, in_category, all=False, flipsign=True):
    """
    This function builds the aggregations as needed for that year and month.
    It also produces histograms in a specific category.

    :param year: input year
    :param month: input month
    :param in_category: input category for histogram
    :param all: if True, build histograms over only the past 10 months
    :param flipsign: reverses the sign of the bars; required only for a few categories such as pay.
    :return: returns aggregations and histograms
    """

    # getting the main master file path
    all_transactions_path = config["target"]["all_master"]
    # reading the category flags
    category_flags_path = config["lookup"]["category_flags"]
    # reading all the descriptions
    desc_flags_path = config["lookup"]["description_flags"]

    # extracting date values as needed
    start_date = str(year) + month + '01'
    _, end_day = calendar.monthrange(year, int(month))
    end_date = str(year) + month + str(end_day)

    # reading in the latest all master file
    latest_file = read_latest_file_from_hdfs(spark,
                                             all_transactions_path,
                                             match_filename='20')
    all_transactions = spark.read.csv(latest_file, sep=',', header=True)

    # if all is True we build the histograms; otherwise only the monthly expenditure

    if not all:
        # filter that month's transactions
        monthly_transactions = all_transactions.filter(
            f"trndt between {start_date} and {end_date}")
        category_flags = spark.read.csv(category_flags_path,
                                        sep=',',
                                        header=True)
        desc_flags = spark.read.csv(desc_flags_path, sep=',', header=True)

        desc_only_flags = desc_flags.select('DESCRIPTIONS')

        Flag_df = [str(i.DESCRIPTIONS) for i in desc_only_flags.collect()]

        # converting it to pandas dataframe
        pandas_monthly_transactions = monthly_transactions.toPandas()

        # rewrite the new descriptions onto the transactions
        for i in Flag_df:
            pandas_monthly_transactions.loc[
                pandas_monthly_transactions['Description'].str.
                contains(i, case=False), 'new_Description'] = i

        headers = [
            "Transaction_date", "Description", "Amount", "trndt", "act_type",
            "new_Description"
        ]
        schema = StructType(
            [StructField(col, StringType()) for col in headers])

        # converting back to spark dataframe
        transactions = spark.createDataFrame(pandas_monthly_transactions,
                                             schema=schema)

        t = transactions.alias('t')
        f = desc_flags.alias('f')

        # flags the transactions
        transactions = t.join(f, t.new_Description == f.DESCRIPTIONS,
                              "left_outer").drop("DESCRIPTIONS")

        print('No of null amounts are',
              transactions.filter("Amount is null").count())

        transactions = transactions.filter("Amount is not null")

        # replaces commas in the amount field
        transactions = transactions.withColumn("Amount",
                                               replace_comma("Amount"))

        transactions.show(20, False)

        # gives the number of transactions per account type in a month
        print('number of transactions per account type in a month')
        transactions.groupby('act_type').agg(
            F.count("act_type").alias("total_act_type_transactions")).show()

        # gives the latest transaction date per each account
        print('latest transaction date per each account')
        transactions.groupby('act_type').agg(
            F.max("trndt").alias("max_trndt")).sort("max_trndt").show()

        # gives the new transaction list
        print('brand new transactions')
        null_flags = transactions.filter("FLAG is null")
        null_flags.orderBy(F.asc("trndt")).show(100, False)

        print(f'checking for transactions under category {in_category}')
        filter_flags = transactions.filter(f"FLAG='{in_category}'")
        filter_flags.orderBy(F.asc("trndt")).show(100, False)

        grouped_df = transactions.groupby('FLAG').agg(
            F.sum("Amount").alias("total_amt"))
        print('grouped_df per category')
        grouped_df = grouped_df.join(category_flags, "FLAG", "left_outer")\
                     .orderBy(F.desc("total_amt"))

        grouped_df = grouped_df.filter("FLAG<>'PAY'")
        grouped_df.show(200, False)

        y = grouped_df.select("total_amt").rdd.map(
            lambda row: row[0]).collect()

        print(y)
        print('incoming', sum(i for i in y if i > 0))
        print('outgoing', sum(i for i in y if i < 0))

        # all_transactions = all_transactions.withColumn("year_month", F.substring(F.col("trndt"), 1, 6))
        # all_transactions.groupBy("year_month","")
        # all_transactions.show(200, False)
    else:
        category_flags = spark.read.csv(category_flags_path,
                                        sep=',',
                                        header=True)
        desc_flags = spark.read.csv(desc_flags_path, sep=',', header=True)

        category_flags.show(100, False)

        desc_only_flags = desc_flags.select('DESCRIPTIONS')
        Flag_df = [str(i.DESCRIPTIONS) for i in desc_only_flags.collect()]

        pandas_monthly_transactions = all_transactions.toPandas()

        for i in Flag_df:
            pandas_monthly_transactions.loc[
                pandas_monthly_transactions['Description'].str.
                contains(i, case=False), 'new_Description'] = i

        headers = [
            "Transaction_date", "Description", "Amount", "trndt", "act_type",
            "new_Description"
        ]
        schema = StructType(
            [StructField(col, StringType()) for col in headers])

        transactions = spark.createDataFrame(pandas_monthly_transactions,
                                             schema=schema)

        t = transactions.alias('t')
        f = desc_flags.alias('f')

        transactions = t.join(f, t.new_Description == f.DESCRIPTIONS,
                              "left_outer").drop("DESCRIPTIONS")

        print('No of null amounts are',
              transactions.filter("Amount is null").count())

        transactions = transactions.filter("Amount is not null")

        transactions = transactions.withColumn("Amount",
                                               replace_comma("Amount"))

        transactions.show(20, False)

        transactions.groupby('act_type').agg(
            F.count("act_type").alias("total_act_type_transactions")).show()

        transactions.groupby('act_type').agg(
            F.max("trndt").alias("max_trndt")).sort("max_trndt").show()

        transactions = transactions.withColumn(
            "trn_month",
            F.expr("concat(substr(trndt, 3, 2),'-',substr(trndt, 5, 2))"))

        # F.concat(F.substring(F.col("trndt"), 3, 4), "-", F.substring(F.col("trndt"), 4, 5)))

        null_flags = transactions.filter("FLAG is null")
        null_flags.orderBy(F.asc("trndt")).show(100, False)

        filter_flags = transactions.filter(f"FLAG='{in_category}'")
        # filter_flags = transactions.filter("act_type='bofacredit'")
        filter_flags.orderBy(F.asc("trndt")).show(100, False)

        # filter_flags_2 = transactions.filter("FLAG='GR'")
        # # filter_flags = transactions.filter("act_type='bofacredit'")
        # filter_flags_2.orderBy(F.asc("trndt")).show(100, False)

        grouped_df = transactions.groupby('FLAG', 'trn_month').agg(
            F.sum("Amount").alias("total_amt"))
        print('grouped_df')
        grouped_df = grouped_df.join(category_flags, "FLAG", "left_outer") \
            .orderBy(F.desc("trn_month"))

        grouped_df.show(200, False)

        grouped_df = grouped_df.filter(f"FLAG='{in_category}'").orderBy(
            F.asc("trn_month"))

        grouped_df = grouped_df.filter("FLAG<>'PAY'")

        print_category = grouped_df.select("CATEGORY").filter(
            f"FLAG='{in_category}'").distinct().rdd.map(
                lambda row: row[0]).collect()

        grouped_df.orderBy(F.desc("trn_month")).show()

        if flipsign:
            grouped_df = grouped_df.withColumn("total_amt",
                                               flip_sign("total_amt"))

        # plotting the histograms
        x = grouped_df.select("trn_month").rdd.map(
            lambda row: row[0]).collect()
        y = grouped_df.select("total_amt").rdd.map(
            lambda row: row[0]).collect()

        x = x[-10:]
        y = y[-10:]

        avg = Average(y)

        print(x)
        print(y)
        print(avg)

        plt.bar(x, y, align='center')
        plt.ylabel(f'{print_category}')
        plt.xlabel('[months]')
        plt.title(f'avg in 10 months is {avg}')
        for i in range(len(y)):
            plt.hlines(y[i], 0,
                       x[i])  # Here you are drawing the horizontal lines
        plt.show()

        # all_transactions = all_transactions.withColumn("year_month", F.substring(F.col("trndt"), 1, 6))
        # all_transactions.groupBy("year_month","")
        # all_transactions.show(200, False)

    pass
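
replace_comma and flip_sign are referenced above but not defined in this excerpt; plausible (hypothetical) column helpers consistent with how they are called:

import pyspark.sql.functions as F

def replace_comma(col_name):
    # hypothetical: strip thousands separators ("1,234.56" -> 1234.56) and cast to double
    return F.regexp_replace(F.col(col_name), ",", "").cast("double")

def flip_sign(col_name):
    # hypothetical: reverse the sign so outgoing amounts plot as positive bars
    return -F.col(col_name)
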
Example #23
                    StructField("TotalValue", DoubleType())
                ]))),
    ])

    kafka_df = spark.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "localhost:9092") \
        .option("subscribe", "invoices") \
        .option("startingOffsets", "earliest") \
        .load()

    value_df = kafka_df.select(
        from_json(col("value").cast("string"), schema).alias("value"))

    notification_df = value_df.select("value.InvoiceNumber", "value.CustomerCardNo", "value.TotalAmount") \
        .withColumn("EarnedLoyaltyPoints", expr("TotalAmount * 0.2"))

    # kafka_target_df = notification_df.selectExpr("InvoiceNumber as key", "to_json(struct(*)) as value")

    kafka_target_df = notification_df.selectExpr(
        "InvoiceNumber as key", """to_json(named_struct(
                                                 'CustomerCardNo', CustomerCardNo,
                                                 'TotalAmount', TotalAmount,
                                                 'EarnedLoyaltyPoints', TotalAmount * 0.2)) as value"""
    )
    '''
    notification_writer_query = kafkaTarget_df.writeStream \
        .format("console") \
        .outputMode("append") \
        .option("truncate", "false") \
        .option("checkpointLocation", "chk-point-dir") \
Example #24
df.select(sum("Quantity")).show()  # 5176450

# COMMAND ----------

from pyspark.sql.functions import sumDistinct
df.select(sumDistinct("Quantity")).show()  # 29310

# COMMAND ----------

from pyspark.sql.functions import sum, count, avg, expr

df.select(
    count("Quantity").alias("total_transactions"),
    sum("Quantity").alias("total_purchases"),
    avg("Quantity").alias("avg_purchases"),
    expr("mean(Quantity)").alias("mean_purchases"))\
  .selectExpr(
    "total_purchases/total_transactions",
    "avg_purchases",
    "mean_purchases").show()

# COMMAND ----------

from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp
df.select(var_pop("Quantity"), var_samp("Quantity"), stddev_pop("Quantity"),
          stddev_samp("Quantity")).show()

# COMMAND ----------

from pyspark.sql.functions import skewness, kurtosis
Example #25
def build_attribute_matrix(
        spark,
        sub=0,
        book_df='hdfs:/user/yw2115/goodreads_books.json.gz',
        author_df='hdfs:/user/yw2115/goodreads_book_authors.json.gz',
        genre_df='hdfs:/user/yw2115/gooreads_book_genres_initial.json.gz',
        records_path="hdfs:/user/xc1511/onepct_int_001.parquet"):

    ####Create Attribute Matrix for Genres####
    '''
    10 categories: 
    children| comics, graphic| fantasy, paranormal| fiction| history, historical fiction, biography,
    mystery, thriller, crime| non-fiction| poetry| romance| young-adult

    '''

    # the parameters hold the HDFS paths; read from them rather than re-hardcoding the defaults
    book_df = spark.read.json(book_df)
    author_df = spark.read.json(author_df)
    genre_df = spark.read.json(genre_df)

    genre_at = genre_df.select('book_id',f.expr('genres.children'),f.expr('genres.`comics, graphic`'),\
        f.expr('genres.`fantasy, paranormal`'),f.expr('genres.fiction'), \
        f.expr('genres.`history, historical fiction, biography`'), f.expr('genres.`mystery, thriller, crime`'),\
        f.expr('genres.`non-fiction`'),f.expr('genres.poetry'),f.expr('genres.romance'),f.expr('genres.`young-adult`'))

    #change col names
    new_col = [
        'book_id', 'g1', 'g2', 'g3', 'g4', 'g5', 'g6', 'g7', 'g8', 'g9', 'g10'
    ]
    genre_at = genre_at.toDF(*new_col)
    #genre_at.show(3)

    #0/1 Encoding
    #change Null value to 0 (meaning the book is not in this genre)
    # and other int to 1 (meaning the book in this genre)

    for i in range(1, len(new_col)):
        col_name = new_col[i]
        genre_at = genre_at.withColumn(
            col_name,
            when(genre_at[col_name].isNotNull(), 1).otherwise(0))

    #genre_at.show(10)

    #subsample 1% data
    if sub == 0.01:
        records_pq = spark.read.parquet(records_path)
        records_pq.createOrReplaceTempView('records_pq')
        book_pq = spark.sql('SELECT DISTINCT book_id FROM records_pq')

        book_pq.createOrReplaceTempView('book_pq')
        book_df.createOrReplaceTempView('book_df')
        genre_at.createOrReplaceTempView('genre_at')
        genre_at = spark.sql('SELECT genre_at.* FROM genre_at JOIN book_pq ON \
            genre_at.book_id = book_pq.book_id')
        book_df = spark.sql('SELECT book_df.* FROM book_df JOIN book_pq ON \
            book_df.book_id = book_pq.book_id')

    ####Add Author Rating as Additional Attribute####
    #Select the first author (there are books with more than 1 author, first author is the main author)
    book_df = book_df.select('book_id', f.expr('authors[0]').alias('a'))
    #Add author_id
    book_df = book_df.select('book_id', f.expr('a.author_id'))

    #Join book_df and author_df
    book_df.createOrReplaceTempView('book_df')
    author_df.createOrReplaceTempView('author_df')

    author_at = spark.sql('SELECT book_df.book_id, book_df.author_id,\
     author_df.average_rating FROM book_df JOIN author_df ON \
     book_df.author_id=author_df.author_id')
    #author_at.show(10)

    ####Join The Two Matrix to Get Book Attribute Matrix####
    genre_at.createOrReplaceTempView('genre_at')
    author_at.createOrReplaceTempView('author_at')

    book_at = spark.sql('SELECT genre_at.book_id, genre_at.g1, genre_at.g2,\
     genre_at.g3, genre_at.g4, genre_at.g5, genre_at.g6, genre_at.g7, genre_at.g8, \
     genre_at.g9, genre_at.g10, author_at.average_rating AS author_rating \
     FROM genre_at JOIN author_at ON genre_at.book_id=author_at.book_id')

    book_at = book_at.withColumn('author_rating',
                                 book_at['author_rating'].cast('float'))

    #return the I*N attribute matrix for book
    #I is number of items (books)
    #N = 11 is number of attribute features of the books

    #add a features col
    vecAssembler = VectorAssembler(inputCols=[
        'g1', 'g2', 'g3', 'g4', 'g5', 'g6', 'g7', 'g8', 'g9', 'g10',
        'author_rating'
    ],
                                   outputCol="features")
    book_at = vecAssembler.transform(book_at)
    #note here 'features' is a SparseVector type due to spark memory default

    #book_at.show(3)

    return book_at
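
A tiny standalone VectorAssembler demo on made-up rows, mirroring the features column assembled above (one SparseVector or DenseVector per row):

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([(1, 0, 1, 4.1), (0, 1, 0, 3.7)],
                            ["g1", "g2", "g3", "author_rating"])
assembler = VectorAssembler(inputCols=["g1", "g2", "g3", "author_rating"],
                            outputCol="features")
assembler.transform(toy).show(truncate=False)
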
Example #26
def main(spark, log_comp=False, drop_low=False, drop_thr=0):
    '''

    Parameters
    ----------
    spark : SparkSession object

    log_comp : bool, whether to apply log-compression (count -> log(1 + count)) to the interaction counts

    drop_low : bool, whether to drop interactions with counts at or below drop_thr

    drop_thr : int, count threshold used when drop_low is True
    '''
    ## Load in datasets
    train_path = 'hdfs:/user/bm106/pub/project/cf_train.parquet'
    val_path = 'hdfs:/user/bm106/pub/project/cf_validation.parquet'
    test_path = 'hdfs:/user/bm106/pub/project/cf_test.parquet'

    train = spark.read.parquet(train_path)
    val = spark.read.parquet(val_path)
    test = spark.read.parquet(test_path)

    ## Downsample the data
    # Pick out user list in training set
    user_train = set(row['user_id']
                     for row in train.select('user_id').distinct().collect())
    # Pick out user list in validation set
    user_val = set(row['user_id']
                   for row in val.select('user_id').distinct().collect())
    # Get the previous 1M users
    user_prev = list(user_train - user_val)
    # Random sampling to get 20%
    k = int(0.2 * len(user_prev))
    user_prev_filtered = random.sample(user_prev, k)
    train = train.where(train.user_id.isin(user_prev_filtered +
                                           list(user_val)))

    ## Create StringIndexer
    indexer_user = StringIndexer(inputCol="user_id",
                                 outputCol="user_id_indexed",
                                 handleInvalid='skip')
    indexer_user_model = indexer_user.fit(train)
    indexer_track = StringIndexer(inputCol="track_id",
                                  outputCol="track_id_indexed",
                                  handleInvalid='skip')
    indexer_track_model = indexer_track.fit(train)

    train = indexer_user_model.transform(train)
    train = indexer_track_model.transform(train)

    val = indexer_user_model.transform(val)
    val = indexer_track_model.transform(val)

    test = indexer_user_model.transform(test)
    test = indexer_track_model.transform(test)

    ## ALS model
    rank_ = [5, 10, 20]
    regParam_ = [0.1, 1, 10]
    alpha_ = [1, 5, 10]
    param_grid = it.product(rank_, regParam_, alpha_)

    ## Pick out users from validation set
    user_id = val.select('user_id_indexed').distinct()
    true_label = val.select('user_id_indexed', 'track_id_indexed')\
                    .groupBy('user_id_indexed')\
                    .agg(expr('collect_list(track_id_indexed) as true_item'))

    ## Log-Compression
    ## count -> log(1+count)
    if log_comp == True:
        train = train.select('*', F.log1p('count').alias('count_log1p'))
        val = val.select('*', F.log1p('count').alias('count_log1p'))
        rateCol = "count_log1p"
    else:
        rateCol = "count"

    ## Drop interactions that have counts lower than the specified threshold
    if drop_low == True:
        train = train.filter(train['count'] > drop_thr)
        val = val.filter(val['count'] > drop_thr)

    for i in param_grid:
        print('Start Training for {}'.format(i))
        als = ALS(rank = i[0], maxIter=10, regParam=i[1], userCol="user_id_indexed", itemCol="track_id_indexed", ratingCol=rateCol, implicitPrefs=True, \
            alpha=i[2], nonnegative=True, coldStartStrategy="drop")
        model = als.fit(train)
        print('Finish Training for {}'.format(i))

        # Make top 500 recommendations for users in validation test
        res = model.recommendForUserSubset(user_id, 500)
        pred_label = res.select('user_id_indexed',
                                'recommendations.track_id_indexed')

        pred_true_rdd = pred_label.join(F.broadcast(true_label), 'user_id_indexed', 'inner') \
                    .rdd \
                    .map(lambda row: (row[1], row[2]))

        print('Start Evaluating for {}'.format(i))
        metrics = RankingMetrics(pred_true_rdd)
        map_ = metrics.meanAveragePrecision
        ndcg = metrics.ndcgAt(500)
        mpa = metrics.precisionAt(500)
        print(i, 'map score: ', map_, 'ndcg score: ', ndcg, 'precision at 500: ', mpa)

    pass
def prepare_df(df):
    num_rows = df.count()

    # Expand dates.
    df = expand_date(df)

    df = df \
        .withColumn('Open', df.Open != '0') \
        .withColumn('Promo', df.Promo != '0') \
        .withColumn('StateHoliday', df.StateHoliday != '0') \
        .withColumn('SchoolHoliday', df.SchoolHoliday != '0')

    # Merge in store information.
    store = store_csv.join(store_states_csv, 'Store')
    df = df.join(store, 'Store')

    # Merge in Google Trend information.
    google_trend_all = prepare_google_trend()
    df = df.join(google_trend_all,
                 ['State', 'Year', 'Week']).select(df['*'],
                                                   google_trend_all.trend)

    # Merge in Google Trend for whole Germany.
    google_trend_de = google_trend_all[google_trend_all.file == 'Rossmann_DE']
    df = df.join(google_trend_de, ['Year', 'Week']).select(
        df['*'], google_trend_all.trend.alias('trend_de'))

    # Merge in weather.
    weather = weather_csv.join(state_names_csv,
                               weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ['State', 'Date'])

    # Fix null values.
    df = df \
        .withColumn('CompetitionOpenSinceYear', F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900))) \
        .withColumn('CompetitionOpenSinceMonth', F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1))) \
        .withColumn('Promo2SinceYear', F.coalesce(df.Promo2SinceYear, F.lit(1900))) \
        .withColumn('Promo2SinceWeek', F.coalesce(df.Promo2SinceWeek, F.lit(1)))

    # Days & months competition was open, cap to 2 years.
    df = df.withColumn(
        'CompetitionOpenSince',
        F.to_date(
            F.format_string('%s-%s-15', df.CompetitionOpenSinceYear,
                            df.CompetitionOpenSinceMonth)))
    df = df.withColumn(
        'CompetitionDaysOpen',
        F.when(
            df.CompetitionOpenSinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(360 * 2),
                        F.datediff(df.Date,
                                   df.CompetitionOpenSince)))).otherwise(0))
    df = df.withColumn('CompetitionMonthsOpen',
                       (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # Days & weeks of promotion, cap to 25 weeks.
    df = df.withColumn(
        'Promo2Since',
        F.expr(
            'date_add(format_string("%s-01-01", Promo2SinceYear), (Promo2SinceWeek - 1) * 7)'
        ))
    df = df.withColumn(
        'Promo2Days',
        F.when(
            df.Promo2SinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(25 * 7),
                        F.datediff(df.Date, df.Promo2Since)))).otherwise(0))
    df = df.withColumn('Promo2Weeks',
                       (df.Promo2Days / 7).cast(T.IntegerType()))

    # Check that we did not lose any rows through inner joins.
    assert num_rows == df.count(), 'lost rows in joins'
    return df
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, avg, col
import pyspark.sql.functions as SQLFunctions
import os
import time

os.environ["PYSPARK_PYTHON"] = '/usr/bin/python3'
spark = SparkSession.builder.getOrCreate()
# Load data from a CSV
filePath = "/home/varun/PycharmProjects/BatchProcessingAirQuality/weatherAUS.csv"
df = spark.read.format("CSV").option("inferSchema",
                                     True).option("header",
                                                  True).load(filePath)
df = df.withColumn("Date", expr("to_date(Date)"))
print(df.show(5))

# time.sleep(10)
# Average rainfall overall
avgRain = df.filter(SQLFunctions.col('Date') >= '2008-12-01').select(
    SQLFunctions.round(avg('Rainfall'), 2).alias("Avg. Rainfall")).show()
# time.sleep(10)

# Min and Max Temperatures where MaxTemp >= 10
Temp = df.filter(SQLFunctions.col('MaxTemp') >= '10').select(
    'Date', 'MinTemp', 'MaxTemp').dropDuplicates(subset=['MaxTemp']).show(5)
# time.sleep(10)

# Average Temperature of the day where Wind Direction is North
meanCols = [col('MaxTemp'), col('MinTemp')]
avgCol = sum(x for x in meanCols) / len(meanCols)
avgTempOfDay = df.filter(SQLFunctions.col('WindGustDir') == 'N').select(
Example #29
FROM tC
""").show()

# %% [markdown]
# ## DataFrames and Spark SQL Common Relational Operators
# %%
delays_path = os.path.join(DATA_DIRECTORY, "flights", "departuredelays.csv")
airports_path = os.path.join(DATA_DIRECTORY, "flights", "airport-codes-na.txt")

airports = spark.read.options(header='true', inferSchema='true',
                              sep='\t').csv(airports_path)
airports.createOrReplaceTempView("airports_na")

delays = spark.read.options(header='true').csv(delays_path)
delays = (delays.withColumn("delay",
                            F.expr("CAST(delay as INT) as delay")).withColumn(
                                "distance",
                                F.expr("CAST(distance as INT) as distance")))

delays.createOrReplaceTempView("departureDelays")

# Create temporary small table
foo = delays.where(
    F.expr("""
        origin == 'SEA' AND
        destination == 'SFO' AND
        date like '01010%' AND
        delay > 0
    """))

foo.createOrReplaceTempView("foo")
Example #30
    gq_revised = gq_revised.filter((gq_revised.MAFID != "MAFID"))

    # Clean hu data
    hu_revised = pp10_hu_edited.select("_c0", "_c1", "_c2", "_c21", "_c22")

    hu_revised = hu_revised.withColumnRenamed(
        "_c0", "COLBLKST").withColumnRenamed("_c1", "LCO").withColumnRenamed(
            "_c2", "MAFID").withColumnRenamed("_c21",
                                              "EDIT_SEQ").withColumnRenamed(
                                                  "_c22", "FINAL_POP")

    hu_revised = hu_revised.filter((hu_revised.MAFID != "MAFID"))

    # Clean gq file by removing extra digits not needed
    gq_revised = gq_revised.withColumn(
        "mafid_temp", sf.expr("substring(MAFID, 1, length(MAFID)-2)"))
    gq_revised = gq_revised.withColumn(
        "edit_seq2", sf.expr("substring(EDIT_SEQ, 1, length(EDIT_SEQ)-2)"))
    gq_revised = gq_revised.drop("MAFID", "GQTYPE", "PEGQTYPE", "FGQTYPE",
                                 "PP_GQ_MEDIAN_AGE", "EDIT_SEQ")
    gq_revised = gq_revised.withColumnRenamed("mafid_temp",
                                              "MAFID").withColumnRenamed(
                                                  "edit_seq2", "EDIT_SEQ")

    # Perform Union of gq and hu
    gq_hu_union = hu_revised.union(gq_revised)

    # Read and clean ops file
    op_revised = spark.read.csv(
        "s3://uscb-decennial-ite-das/2010/cef/pp10_op.csv")
    op_revised = op_revised.select("_c0", "_c16", "_c17", "_c55", "_c68")
def as_of_join(
    entity_df: DataFrame,
    entity_event_timestamp_column: str,
    feature_table_df: DataFrame,
    feature_table: FeatureTable,
) -> DataFrame:
    """Perform an as of join between entity and feature table, given a maximum age tolerance.
    Join conditions:
    1. Entity primary key(s) value matches.
    2. Feature event timestamp is the closest match possible to the entity event timestamp,
       but must not be more recent than the entity event timestamp, and the difference must
       not be greater than max_age, unless max_age is not specified.
    3. If more than one feature table rows satisfy condition 1 and 2, feature row with the
       most recent created timestamp will be chosen.
    4. If none of the above conditions are satisfied, the feature rows will have null values.

    Args:
        entity_df (DataFrame): Spark dataframe representing the entities, to be joined with
            the feature tables.
        entity_event_timestamp_column (str): Column name in entity_df which represents
            event timestamp.
        feature_table_df (Dataframe): Spark dataframe representing the feature table.
        feature_table (FeatureTable): Feature table specification, which provide information on
            how the join should be performed, such as the entity primary keys and max age.

    Returns:
        DataFrame: Join result, which contains all the original columns from entity_df, as well
            as all the features specified in feature_table, where the feature columns will
            be prefixed with feature table name.

    Example:
        >>> entity_df.show()
            +------+-------------------+
            |entity|    event_timestamp|
            +------+-------------------+
            |  1001|2020-09-02 00:00:00|
            +------+-------------------+

        >>> feature_table_1_df.show()
            +------+-------+-------------------+-------------------+
            |entity|feature|    event_timestamp|  created_timestamp|
            +------+-------+-------------------+-------------------+
            |    10|    200|2020-09-01 00:00:00|2020-09-02 00:00:00|
            +------+-------+-------------------+-------------------+
            |    10|    400|2020-09-01 00:00:00|2020-09-01 00:00:00|
            +------+-------+-------------------+-------------------+
        >>> feature_table_1.max_age
            None
        >>> feature_table_1.name
            'table1'
        >>> df = as_of_join(entity_df, "event_timestamp", feature_table_1_df, feature_table_1)
        >>> df.show()
            +------+-------------------+---------------+
            |entity|    event_timestamp|table1__feature|
            +------+-------------------+---------------+
            |  1001|2020-09-02 00:00:00|            200|
            +------+-------------------+---------------+

        >>> feature_table_2.df.show()
            +------+-------+-------------------+-------------------+
            |entity|feature|    event_timestamp|  created_timestamp|
            +------+-------+-------------------+-------------------+
            |    10|    200|2020-09-01 00:00:00|2020-09-02 00:00:00|
            +------+-------+-------------------+-------------------+
            |    10|    400|2020-09-01 00:00:00|2020-09-01 00:00:00|
            +------+-------+-------------------+-------------------+
        >>> feature_table_2.max_age
            43200
        >>> feature_table_2.name
            'table2'
        >>> df = as_of_join(entity_df, "event_timestamp", feature_table_2_df, feature_table_2)
        >>> df.show()
            +------+-------------------+---------------+
            |entity|    event_timestamp|table2__feature|
            +------+-------------------+---------------+
            |  1001|2020-09-02 00:00:00|           null|
            +------+-------------------+---------------+

    """
    entity_with_id = entity_df.withColumn("_row_nr",
                                          monotonically_increasing_id())

    feature_event_timestamp_column_with_prefix = (
        f"{feature_table.name}__{EVENT_TIMESTAMP_ALIAS}")
    feature_created_timestamp_column_with_prefix = (
        f"{feature_table.name}__{CREATED_TIMESTAMP_ALIAS}")

    projection = [
        col(col_name).alias(f"{feature_table.name}__{col_name}")
        for col_name in feature_table_df.columns
    ]

    aliased_feature_table_df = feature_table_df.select(projection)

    join_cond = (
        entity_with_id[entity_event_timestamp_column] >=
        aliased_feature_table_df[feature_event_timestamp_column_with_prefix])
    if feature_table.max_age:
        join_cond = join_cond & (
            aliased_feature_table_df[feature_event_timestamp_column_with_prefix]
            >= entity_with_id[entity_event_timestamp_column] -
            expr(f"INTERVAL {feature_table.max_age} seconds"))

    for key in feature_table.entity_names:
        join_cond = join_cond & (
            entity_with_id[key]
            == aliased_feature_table_df[f"{feature_table.name}__{key}"])

    conditional_join = entity_with_id.join(aliased_feature_table_df, join_cond,
                                           "leftOuter")
    for key in feature_table.entity_names:
        conditional_join = conditional_join.drop(
            aliased_feature_table_df[f"{feature_table.name}__{key}"])

    window = Window.partitionBy(
        "_row_nr", *feature_table.entity_names).orderBy(
            col(feature_event_timestamp_column_with_prefix).desc(),
            col(feature_created_timestamp_column_with_prefix).desc(),
        )
    filter_most_recent_feature_timestamp = conditional_join.withColumn(
        "_rank",
        row_number().over(window)).filter(col("_rank") == 1)

    return filter_most_recent_feature_timestamp.select(entity_df.columns + [
        f"{feature_table.name}__{feature}"
        for feature in feature_table.feature_names
    ])
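
# A minimal usage sketch (added here, not part of the original example) that
# reproduces the docstring scenario above. The exact FeatureTable specification
# class is not shown in this snippet, so its construction is only described in
# comments; everything named below that is not in the original is an assumption.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

entity_df = spark.createDataFrame(
    [(1001, "2020-09-02 00:00:00")],
    ["entity", "event_timestamp"],
).withColumn("event_timestamp", col("event_timestamp").cast("timestamp"))

feature_table_1_df = spark.createDataFrame(
    [
        (1001, 200, "2020-09-01 00:00:00", "2020-09-02 00:00:00"),
        (1001, 400, "2020-09-01 00:00:00", "2020-09-01 00:00:00"),
    ],
    ["entity", "feature", "event_timestamp", "created_timestamp"],
).withColumn("event_timestamp", col("event_timestamp").cast("timestamp")) \
 .withColumn("created_timestamp", col("created_timestamp").cast("timestamp"))

# feature_table_1 is assumed to be a FeatureTable spec with name "table1",
# entity_names ["entity"], feature_names ["feature"] and no max_age:
# joined = as_of_join(entity_df, "event_timestamp", feature_table_1_df, feature_table_1)
# joined.show()  # expected: entity 1001 paired with table1__feature = 200
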
Пример #32
0
print(test_op)
#%%
"""
check if col and expr result to same output
"""

bankDf = (spark.read.option('header',
                            'true').csv(dataset_folder + '/bank.csv'))

print(bankDf.show(5))

#%%
from pyspark.sql.functions import expr, col

data1 = expr('(((balance + 5)* 100) > loan)')
print(data1)

data2 = (((col('balance') + 5) * 100) > col('loan'))
print(data2)
print(bankDf.columns)
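
# A quick follow-up check (a sketch added here, not part of the original cell):
# applying both predicates as filters should select exactly the same rows, since
# expr() parses the same logical expression that the col() arithmetic builds up.
print(bankDf.filter(data1).count() == bankDf.filter(data2).count())  # expected: True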

#%%
"""
Create a datafrane by creating row objs
"""
from pyspark.sql import Row
from pyspark.sql.types import StructField, StructType, StringType, LongType

myManualSchema = StructType([
    StructField("some", StringType(), True),
Пример #33
0
def run_transactions(spark, config, cycle_date, account_type):
    """
    This function writes the monthly csv files to its respective master

    :param spark:
    :param config: it has all the source and target file paths
    :param cycle_date: YYYYMM:  cycle year and month that is being executed
    :param account_type: type of account that is being executed
    :return: writes the monthly transaction to its master csv file
    """
    if account_type == 'citi':

        # extracts year from yyyymm
        cycle_year = '{}'.format(cycle_date[0:4])
        # extracts month from yyyymm
        cycle_month_num = '{}'.format(cycle_date[4:6])
        # gets the MON name from the month number
        cycle_month = calendar.month_name[int(cycle_month_num)]

        previous_year = int(cycle_year) - 1

        # source path of citi csv files
        src_path = config["source"][
            "citi"] + 'tabula-' + cycle_year + ' ' + cycle_month + '*.csv'
        # master path of citi data
        master_path = config["target"]["citi_master"] + cycle_date

        # source headers of citi
        headers = ["transdate", "Posting_date", "Description", "Amount"]

        src_df = spark.read.csv(src_path, sep=',')

        new_df = src_df.toDF(*headers).filter('Amount is not null')

        # handling January transactions, where the year is missing in the transaction date.
        if cycle_month_num == '01':
            new_df = new_df.withColumn(
                "transdate",
                F.expr(
                    f"case when transdate like '%Dec%' then concat(transdate,'-',{previous_year})"
                    f" else concat(transdate,'-',{cycle_year}) end"))
        else:
            new_df = new_df.withColumn(
                "transdate", F.expr(f"concat(transdate,'-',{cycle_year})"))

        src_df = convert_date_format(new_df, 'transdate', '%d-%b-%Y', '%Y%m%d')

        # writing it to master
        new_df = write_src_csv_to_master(src_df, master_path, account_type)

    elif account_type == 'discover':

        # src path of discover csv statements
        src_path = config["source"][
            "discover"] + 'Discover-Statement-' + cycle_date + '*.csv'
        # master file path of discover
        master_path = config["target"]["discover_master"] + cycle_date

        src_df = spark.read.csv(src_path, header=True, sep=',')

        columns = [c.replace('. ', '_') for c in src_df.columns]

        src_df = src_df.toDF(*columns)

        # convert date format as needed
        src_df = convert_date_format(src_df, 'Trans_Date', '%m/%d/%Y',
                                     '%Y%m%d')

        new_df = write_src_csv_to_master(src_df, master_path, account_type)

    elif account_type == 'bofachk':

        # date formats as needed
        cycle_date_input = '{}-{}'.format(cycle_date[0:4], cycle_date[4:6])
        trg_cycle_date = '{}{}'.format(cycle_date[0:4], cycle_date[4:6])

        # src path for bofa chk files
        src_path = config["source"][
            "bofa_checking"] + 'tabula-eStmt_' + cycle_date_input + '*.csv'
        # master path for bofa chk
        master_path = config["target"]["bofa_chk_master"] + trg_cycle_date

        src_df = spark.read.csv(src_path, header=True,
                                sep=',').filter('Amount is not null')

        src_df = convert_date_format(src_df, 'Date', '%m/%d/%y', '%Y%m%d')

        new_df = write_src_csv_to_master(src_df, master_path, account_type)

    elif account_type == 'bofacredit':

        # extracting dates as needed
        cycle_year = '{}'.format(cycle_date[0:4])
        cycle_month = '{}'.format(cycle_date[4:6])
        previous_year = int(cycle_year) - 1

        # cycle_date_input = '{}-{}'.format(cycle_date[0:4], cycle_date[4:6])
        # trg_cycle_date = '{}{}'.format(cycle_date[0:4], cycle_date[4:6])
        #
        # src = config["source"]["bofa_credit"] + 'eStmt_' + cycle_date_input + '-15.pdf'
        # master = config["target"]["bofa_cc_master"] + trg_cycle_date
        #
        # new_df = write_bofa_cc_to_master(spark, src, cycle_date)
        #
        # new_df = convert_date_format(new_df, 'Transaction_date', '%m/%d/%Y', '%Y%m%d')
        #
        # new_df = new_df.withColumn('act_type', F.lit(account_type))
        #
        # new_df.coalesce(1).write.format("csv").mode("overwrite").save(master, header="true")

        headers = ["Transaction_date", "Posting_date", "Description", "Amount"]

        cycle_date_input = '{}-{}'.format(cycle_date[0:4], cycle_date[4:6])
        trg_cycle_date = '{}{}'.format(cycle_date[0:4], cycle_date[4:6])

        src_path = config["source"][
            "bofa_credit"] + 'tabula-eStmt_' + cycle_date_input + '*.csv'
        master_path = config["target"]["bofa_cc_master"] + trg_cycle_date

        src_df = spark.read.csv(src_path, header=False, sep=',')

        src_df = src_df.toDF(*headers)

        # handling January transactions, where the year is missing in the transaction date.
        if cycle_month == '01':
            src_df = src_df.withColumn(
                "Transaction_date",
                F.expr(
                    f"case when Transaction_date like '%Dec%' then concat(Transaction_date,'-',{previous_year})"
                    f" else concat(Transaction_date,'-',{cycle_year}) end"))
        else:
            src_df = src_df.withColumn(
                "Transaction_date",
                F.expr(f"concat(Transaction_date,'-',{cycle_year})"))

        src_df = src_df.filter('AMOUNT is not null')

        src_df = convert_date_format(src_df, 'Transaction_date', '%d-%b-%Y',
                                     '%Y%m%d')

        new_df = write_src_csv_to_master(src_df, master_path, account_type)

    elif account_type == 'chase':

        # extracting dates as needed
        cycle_year = cycle_date[0:4]
        cycle_month = cycle_date[4:6]
        previous_year = int(cycle_year) - 1
        trg_cycle_date = '{}{}'.format(cycle_date[0:4], cycle_date[4:6])

        src_path = config["source"]["chase"]
        master = config["target"]["chase_master"] + trg_cycle_date
        headers = ["date", "description", "amount", "balance"]

        # gets the file with the match filename
        src_file_name = get_file_starting_with(spark,
                                               src_path,
                                               match_filename=cycle_date)
        src_path = os.path.join(src_path, src_file_name)

        src_df = spark.read.csv(src_path, header=True,
                                sep=',').filter('AMOUNT is not null')

        new_df = src_df.toDF(*headers)

        #new_df = write_chase_chk_pdf_to_master(spark, src, cycle_date)

        # handling January transactions, where the year is missing in the transaction date.
        if cycle_month == '01':
            new_df = new_df.withColumn(
                "date",
                F.expr(
                    f"case when date like '%Dec%' then concat(date,'-',{previous_year})"
                    f" else concat(date,'-',{cycle_year}) end"))
        else:
            new_df = new_df.withColumn(
                "date", F.expr(f"concat(date,'-',{cycle_year})"))

        new_df = convert_date_format(new_df, 'DATE', '%d-%b-%Y', '%Y%m%d')

        new_df = new_df.withColumn('act_type', F.lit(account_type))

        new_df.coalesce(1).write.format("csv").mode("overwrite").save(
            master, header="true")

    else:
        print(
            'Please pass a valid account_type: one of citi, discover, bofachk, bofacredit, chase'
        )
        return

    new_df.show(200, False)
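
# A small standalone sketch (not part of the original function) of the
# December/January year-completion pattern used in run_transactions above.
# It assumes the same `spark` session and `F` (pyspark.sql.functions) alias as
# the snippet; demo_df and the cycle values are illustrative only.
demo_df = spark.createDataFrame([("31-Dec",), ("02-Jan",)], ["transdate"])
cycle_year = "2021"
previous_year = int(cycle_year) - 1
demo_df = demo_df.withColumn(
    "transdate",
    F.expr(
        f"case when transdate like '%Dec%' then concat(transdate,'-',{previous_year})"
        f" else concat(transdate,'-',{cycle_year}) end"))
demo_df.show()  # 31-Dec -> 31-Dec-2020, 02-Jan -> 02-Jan-2021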
Пример #34
0
# this creates a temporary streaming view based on the streaming dataframe
# it can later be queried with spark.sql, we will cover that in the next section
vehicleCheckinStreamingDF.withColumn("value",from_json("value",vehicleCheckinSchema))\
        .select(col('value.*')) \
        .createOrReplaceTempView("VehicleCheckin")

# Using spark.sql we can select any valid select statement from the spark view
vehicleCheckinSelectStarDF = spark.sql(
    "select reservationId, locationName, truckNumber as checkinTruckNumber, status from VehicleCheckin"
)

# Join the vehicle status and vehicle check-in dataframes on the truck number fields
checkinStatusDF = vehicleStatusSelectStarDF.join(
    vehicleCheckinSelectStarDF,
    expr("""
    statusTruckNumber = checkinTruckNumber
"""))

# this takes the stream and "sinks" it to the console as it is updated one message at a time:
# +-----------------+------------+-------------+---------------+-------------+------------+------------------+------+
# |statusTruckNumber| destination|milesFromShop|odometerReading|reservationId|locationName|checkinTruckNumber|status|
# +-----------------+------------+-------------+---------------+-------------+------------+------------------+------+
# |             1445|Pennsylvania|          447|         297465|1602364379489|    Michigan|              1445|    In|
# |             1445|     Colardo|          439|         298038|1602364379489|    Michigan|              1445|    In|
# |             1445|    Maryland|          439|         298094|1602364379489|    Michigan|              1445|    In|
# |             1445|       Texas|          439|         298185|1602364379489|    Michigan|              1445|    In|
# |             1445|    Maryland|          439|         298234|1602364379489|    Michigan|              1445|    In|
# |             1445|      Nevada|          438|         298288|1602364379489|    Michigan|              1445|    In|
# |             1445|   Louisiana|          438|         298369|1602364379489|    Michigan|              1445|    In|
# |             1445|       Texas|          438|         298420|1602364379489|    Michigan|              1445|    In|
# |             1445|       Texas|          436|         298471|1602364379489|    Michigan|              1445|    In|
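
# The write-to-console step described in the comment above is not included in
# this snippet; a minimal sketch of what it might look like, using the standard
# Structured Streaming console sink:
checkinStatusDF.writeStream \
    .outputMode("append") \
    .format("console") \
    .start() \
    .awaitTermination()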
Пример #35
0
lmm_udf = fx.pandas_udf(lmm, returnType=DoubleType())

# COMMAND ----------

# DBTITLE 1,Prepare the input DataFrame
"""
Read in 1000genomes phase 3 chr 22 and split multiallelic sites to biallelic.

Add the phenotypes by cross joining with the genomic DataFrame.

The input to the lmm is the genotype represented as the number of alt alleles (0, 1, or 2).
In this example, we remove all sites where some samples are missing (as represented by -1).
"""

df = glow.transform( \
         "split_multiallelics", \
         spark.read.format("vcf").load("/databricks-datasets/genomics/1kg-vcfs/*chr22*.vcf.gz") \
     ) \
     .crossJoin(spark.read.format("parquet").load("/databricks-datasets/genomics/1000G/phenotypes.normalized/")) \
     .withColumn('genotype_states', fx.expr("genotype_states(genotypes)")) \
     .where(~fx.array_contains(fx.col('genotype_states'), -1))

# COMMAND ----------

# DBTITLE 1,Run the UDF and display results
by_pvalue = df.limit(1000).select("contigName", "start", "names", lmm_udf(df['genotype_states'], df['values']).alias("pValue"))\
  .na.drop(subset=["pValue"])\
  .orderBy("pValue", ascending=True)

display(by_pvalue)
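
# COMMAND ----------

# The lmm function wrapped by lmm_udf at the top of this snippet is not shown;
# the stub below is a hypothetical placeholder of the same shape (two pandas
# Series in, one Series of p-values out), reusing the fx and DoubleType imports
# assumed earlier in the notebook:
import pandas as pd

def lmm_stub(genotype_states: pd.Series, phenotype_values: pd.Series) -> pd.Series:
    # stand-in for the real linear mixed model fit; returns a dummy p-value per row
    return pd.Series([1.0] * len(genotype_states))

lmm_stub_udf = fx.pandas_udf(lmm_stub, returnType=DoubleType())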
Пример #36
0
]).toDF("id", "name", "graduate_program", "spark_status"))

graduateProgram = (spark.createDataFrame([
    (0, "Masters", "School of Information", "UC Berkeley"),
    (2, "Masters", "EECS", "UC Berkeley"), (1, "Ph.D.", "EECS", "UC Berkeley")
]).toDF("id", "degree", "department", "school"))

sparkStatus = (spark.createDataFrame([(500, "Vice President"),
                                      (250, "PMC Member"), (100, "Contributor")
                                      ]).toDF("id", "status"))

person.createOrReplaceTempView("person")
graduateProgram.createOrReplaceTempView("graduateProgram")
sparkStatus.createOrReplaceTempView("sparkStatus")

#joinExpression=person.graduate_program == graduateProgram.id
joinExpression = expr("graduate_program = id")

person.withColumnRenamed("id",
                         "personId").join(graduateProgram, joinExpression,
                                          "inner").show()
#person.join(graduateProgram,joinExpression,"inner").explain(extended=True)

#person.join(graduateProgram,joinExpression,"outer").show()

# (
#     person.withColumnRenamed("id","personId")
#     .join(sparkStatus,expr("array_contains(spark_status,id)")).show()
# )

spark.stop()
Пример #37
0
def main(argv):
    mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf(
        "SC_PHYS_PAGES")  # e.g. 4015976448
    mem_gib = int((mem_bytes / (1024.0**3)) * 0.9)
    tar_jar = os.path.join(find_runfiles(),
                           "__main__/galvasr2/spark/tar_spark_datasource.jar")
    spark = (pyspark.sql.SparkSession.builder.master(
        f"local[{os.cpu_count() - 1}]").config(
            "spark.eventLog.enabled",
            "true").config("spark.eventLog.dir", "/spark-events").config(
                "spark.sql.execution.arrow.pyspark.enabled", "true").config(
                    "spark.driver.extraJavaOptions",
                    "-Dio.netty.tryReflectionSetAccessible=true",
                ).config(
                    "spark.executor.extraJavaOptions",
                    "-Dio.netty.tryReflectionSetAccessible=true",
                ).config("spark.driver.memory", f"{mem_gib}g").config(
                    "spark.history.fs.logDirectory", "/spark-events").config(
                        "spark.sql.execution.arrow.maxRecordsPerBatch",
                        "1").config("spark.jars", tar_jar).config(
                            "spark.local.dir",
                            "/mnt/disks/spark-scratch/").getOrCreate())
    spark.sparkContext.setLogLevel("INFO")  # "ALL" for very verbose logging
    logging.getLogger("py4j").setLevel(logging.ERROR)

    catalogue_df = load_audio_id_text_id_mapping(spark, FLAGS.input_catalogue)

    _, licenseurl_df = load_audio_and_text_dfs(spark, FLAGS.input_catalogue)
    licenseurl_df = licenseurl_df.select(
        [F.col("identifier"),
         F.col("text_document_id"),
         F.col("licenseurl")])

    # Kaldi's wav.scp format does not support space characters in the key field of a wav.scp file
    # We write the transcript to a file called "{kaldi_normalized_uttid}.ctm", so we also need to change all instances of "/" to "_"
    catalogue_df = catalogue_df.withColumn(
        "kaldi_normalized_uttid",
        F.concat_ws(
            "-",
            F.translate(catalogue_df.identifier, " /", "__"),
            F.translate(catalogue_df.audio_document_id, " /", "__"),
        ),
    )
    # key_int_mapping = os.path.join(FLAGS.work_dir, "key_int_mapping_csv")
    if not FLAGS.work_dir.startswith("gs://"):
        os.makedirs(FLAGS.work_dir, exist_ok=True)
    wav_scp = os.path.join(FLAGS.work_dir, "wav.scp")
    ctm_out_dir = os.path.join(FLAGS.work_dir, "decoder_ctm_dir")
    if FLAGS.stage <= 0:
        catalogue_df = catalogue_df.cache()
        # catalogue_df.write.mode("overwrite").format("csv").options(header="true").save(key_int_mapping)
        training_sample_rows = catalogue_df.collect()
        catalogue_df.unpersist()

        with TemporaryMountDirectory(
                mount_cmd=[
                    "gcsfuse",
                    "--implicit-dirs",
                    FLAGS.input_gcs_bucket.lstrip("gs://"),
                ],
                unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:
            posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                   temp_dir_name, wav_scp)
            create_wav_scp(posix_wav_scp, training_sample_rows,
                           FLAGS.input_dir, ctm_out_dir)

    # /development/lingvo-source/output_ctm_dir/

    # nvprof --analysis-metrics -o  decoder-analysis.nvprof \
    # We want only the best path, so we set lattice-beam to 0.1
    # --main-q-capacity=35000 \
    # Can get 266x RTF with this configuration. Keep it?
    # bath size of 100 and num channels of 100 works just fine

    if FLAGS.stage <= 1:
        if not FLAGS.work_dir.startswith("gs://"):
            os.makedirs(ctm_out_dir, exist_ok=True)
        with TemporaryMountDirectory(
                mount_cmd=[
                    "gcsfuse",
                    "--implicit-dirs",
                    FLAGS.input_gcs_bucket.lstrip("gs://"),
                ],
                unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:

            posix_ctm_out_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                       temp_dir_name, ctm_out_dir)
            posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                   temp_dir_name, wav_scp)
            posix_work_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                    temp_dir_name, FLAGS.work_dir)
            num_gpus = 4
            posix_wav_scp_shards = split_wav_scp(posix_wav_scp, posix_work_dir,
                                                 num_gpus)

            executor = ThreadPoolExecutor(max_workers=num_gpus)

            def run_gpu(posix_wav_scp_shard, gpu_number):
                cmd = f"""\
  /opt/kaldi/src/cudadecoderbin/batched-wav-nnet3-cuda3 \
  --frame-subsampling-factor=3 \
  --config=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/conf/online.conf \
  --max-active=7000 \
  --beam=15.0 \
  --lattice-beam=0.1 \
  --acoustic-scale=1.0 \
  --cuda-decoder-copy-threads=2 \
  --cuda-worker-threads={os.cpu_count() // num_gpus} \
  --segmentation=true \
  --cuda-use-tensor-cores=true \
  --max-batch-size=150 \
  --num-channels=250 \
  --lattice-postprocessor-rxfilename=/development/lingvo-source/lattice_postprocess.conf \
  --word-symbol-table=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/words.txt \
  /opt/kaldi/egs/aspire/s5/exp/chain/tdnn_7b/final.mdl \
  /opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/HCLG.fst \
  scp,p:{posix_wav_scp_shard} \
  {posix_ctm_out_dir}
  """
                env = deepcopy(os.environ)
                env["CUDA_VISIBLE_DEVICES"] = f"{gpu_number}"
                subprocess.check_call(shlex.split(cmd), env=env)

            for i, shard in enumerate(posix_wav_scp_shards):
                executor.submit(run_gpu, shard, i)
            executor.shutdown(wait=True)

    alignments_dir = os.path.join(FLAGS.alignments_work_dir,
                                  "alignments_json_jul_28")
    if FLAGS.stage <= 2:
        # TODO: Add options to DSAlign here
        dsalign_args = dsalign_main.parse_args(
            ["--output-wer",
             "--output-cer"])  # , "--output-sws", "--output-levenshtein"])

        alphabet_normalized_path = (
            "/development/lingvo-source/galvasr2/align/spark/alphabet2.txt")
        align_udf = prepare_align_udf(dsalign_args, alphabet_normalized_path,
                                      15_000, 3_000)

        ctm_df = (spark.read.format("binaryFile").option(
            "pathGlobFilter", "*.ctm").load(ctm_out_dir))
        ctm_df = ctm_df.withColumn(
            "kaldi_normalized_uttid",
            F.regexp_replace(
                F.reverse(F.split(ctm_df.path, "/"))[0], r"[.]ctm$", ""),
        )
        ctm_df = ctm_df.withColumn("ctm_content",
                                   fix_text_udf(F.col("content"))).drop(
                                       "path", "length", "modificationTime",
                                       "content")

        ctm_df = ctm_df.join(catalogue_df, "kaldi_normalized_uttid")
        downsampled_catalogue_df = ctm_df.drop("ctm_content")

        training_sample_rows = downsampled_catalogue_df.collect()
        transcripts_df = load_transcripts(spark, FLAGS.input_gcs_path,
                                          training_sample_rows)
        transcripts_df = transcripts_df.withColumn(
            "transcript",
            normalize_english_text_udf(transcripts_df.transcript))
        ctm_df = ctm_df.join(transcripts_df,
                             ["identifier", "text_document_id"])
        ctm_df = ctm_df.repartition(960)

        # alignments_df = ctm_df.select(align_udf(F.concat(ctm_df.identifier, F.lit("/"), ctm_df.text_document_id),
        #                                         F.concat(ctm_df.identifier, F.lit("/"), ctm_df.audio_document_id),
        #                                         ctm_df.transcript, ctm_df.ctm_content))
        alignments_df = ctm_df.withColumn(
            "alignments",
            align_udf(
                F.concat(ctm_df.identifier, F.lit("/"),
                         ctm_df.text_document_id),
                F.concat(ctm_df.identifier, F.lit("/"),
                         ctm_df.audio_document_id),
                ctm_df.transcript,
                ctm_df.ctm_content,
            ),
        ).drop("ctm_content")
        print("GALVEZ:schema")
        alignments_df.printSchema()

        sys.stdout.flush()

        alignments_df.write.mode("overwrite").format("json").save(
            alignments_dir)

    manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest")
    tars_dir = os.path.join(FLAGS.work_dir, "dataset_tars")
    if FLAGS.stage <= 3:
        duplicate_data_path = "gs://the-peoples-speech-west-europe/forced-aligner/data_deduplication/data_deduplication_v2_lines.json"
        duplicates_df = spark.read.format("json").load(duplicate_data_path)

        alignments_df = spark.read.json(alignments_dir)

        alignments_df = alignments_df.join(
            duplicates_df,
            on=(alignments_df.identifier == duplicates_df.identifier)
            &
            (alignments_df.text_document_id == duplicates_df.text_document_id),
            how="anti",
        )

        if FLAGS.license_filter == "":
            pass
        else:
            if FLAGS.license_filter == "Not CC-BY-SA":
                filtered_licenseurl_df = licenseurl_df.filter(
                    ~is_cc_by_sa(F.col("licenseurl")))
            elif FLAGS.license_filter == "CC-BY-SA":
                filtered_licenseurl_df = licenseurl_df.filter(
                    is_cc_by_sa(F.col("licenseurl")))
            else:
                raise Exception("Unknown license_filter provided.")
            filtered_licenseurl_df = filtered_licenseurl_df.drop("licenseurl")

            alignments_df = alignments_df.join(
                filtered_licenseurl_df,
                on=(alignments_df.identifier
                    == filtered_licenseurl_df.identifier)
                & (alignments_df.text_document_id
                   == filtered_licenseurl_df.text_document_id),
                how="inner",
            )
            alignments_df = alignments_df.drop(
                filtered_licenseurl_df.identifier).drop(
                    filtered_licenseurl_df.text_document_id)

        # We would like the number of partitions to be some large multiple
        # of the number of executors. Not every audio file is the same
        # length, so this helps with load balancing.
        alignments_df = alignments_df.withColumn(
            "duration_ms",
            F.expr(
                "transform(arrays_zip(alignments.end_ms, alignments.start_ms), x -> x.end_ms - x.start_ms)"
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.arrays_zip(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.duration_ms,
            ).cast(
                T.ArrayType(
                    T.StructType([
                        T.StructField("cer", T.FloatType()),
                        T.StructField("end_ms", T.LongType()),
                        T.StructField("label", T.StringType()),
                        T.StructField("start_ms", T.LongType()),
                        T.StructField("wer", T.FloatType()),
                        T.StructField("duration_ms", T.LongType()),
                    ]))),
        )

        alignments_df = alignments_df.drop("duration_ms")

        alignments_df = alignments_df.withColumn(
            "alignments",
            F.filter(
                alignments_df.alignments,
                # Need to select this filter such that total number of
                # hours is 31,400
                lambda alignment:
                (alignment.duration_ms < FLAGS.max_duration_ms)
                & (alignment.duration_ms >= FLAGS.min_duration_ms)
                & (alignment.cer < FLAGS.max_cer)
                & (alignment.cer >= FLAGS.min_cer),
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.struct(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.alignments.duration_ms,
            ).cast(
                T.StructType([
                    T.StructField("cer", T.ArrayType(T.FloatType())),
                    T.StructField("end_ms", T.ArrayType(T.LongType())),
                    T.StructField("label", T.ArrayType(T.StringType())),
                    T.StructField("start_ms", T.ArrayType(T.LongType())),
                    T.StructField("wer", T.ArrayType(T.FloatType())),
                    T.StructField("duration_ms", T.ArrayType(T.LongType())),
                ])),
        )

        alignments_df = alignments_df.repartition(960)

        abc = alignments_df.select(
            F.sum(
                F.expr(
                    "aggregate(alignments.duration_ms, 0L, (x, acc) -> acc + x)"
                )) / 1000.0 / 60.0 / 60.0).collect()
        print("GALVEZ:total number of hours=", abc)
        sys.stdout.flush()

        alignments_df = alignments_df.select(
            alignments_df.identifier,
            alignments_df.audio_document_id,
            alignments_df.text_document_id,
            alignments_df.alignments,
        )

        alignments_df = F.broadcast(alignments_df)

        audio_paths = F.concat(
            F.lit(FLAGS.input_gcs_path),
            F.lit("/"),
            F.col("identifier"),
            F.lit("/"),
            F.col("audio_document_id"),
        )
        rows = alignments_df.select(audio_paths).collect()
        paths = [row[0] for row in rows]  # [:1] # GALVEZ: WARNING test!
        # print(f"number of paths = {len(paths)}")
        audio_df = (spark.read.format("binaryFile").load(paths).drop(
            "modificationTime", "length"))

        alignments_audio_df = alignments_df.join(audio_df,
                                                 audio_paths == audio_df.path)
        # from IPython import embed; embed()

        # Remove "/" so that, if someat untars the tar files, everything will be dumped into one directory
        # Remove "." becasue it has special meaning in webdataset format.
        # Remove " " because kaldi keys may not contain " " (this is not striclty necessary, but convenient)
        name = F.concat(F.col("identifier"), F.lit("/"),
                        F.col("audio_document_id"))
        # name = F.regexp_replace(name, r"/", "_SLASH_")
        name = F.regexp_replace(name, r"\.", "_DOT_")
        name = F.regexp_replace(name, r" ", "_SPACE_")
        # glob.glob("**/*.flac")

        # Sanity-check that every generated name fits into path-length limits.
        # Iterate over collected rows, and avoid shadowing the `name` column
        # expression, which is used again below.
        name_rows = alignments_audio_df.select(name.alias("name")).collect()
        for row in name_rows:
            assert len(row.name) < 4096
            for chunk in row.name.split("/"):
                assert len(chunk) < 256
        # name = F.regexp_replace(F.concat(F.col("identifier"),
        #                                  F.lit("-"),
        #                                  F.col("audio_document_id")),
        #                         r"(\.|/)",
        #                         "_"
        # )

        # The name of each thing in the tar file. May not exceed 100 characters in length
        # substr indexes from 1!
        # name = name.substr(
        #     F.length(name) - F.least(F.length(name), F.lit(88)) + 1,
        #     F.least(F.length(name), F.lit(88))
        # )

        alignments_audio_df = alignments_audio_df.withColumn(
            "aligned_chunks",
            create_audio_segments_udf(
                alignments_audio_df.content,
                F.lit("mp3"),
                name,
                alignments_audio_df.alignments.start_ms,
                alignments_audio_df.alignments.end_ms,
                F.lit("flac"),
            ),
        )
        a = alignments_audio_df.select(
            F.explode(
                F.arrays_zip("aligned_chunks.audio_name",
                             "aligned_chunks.audio"))).select(
                                 "col.0", "col.1")
        a.write.mode("overwrite").format("tar").save(tars_dir)

        output_df = alignments_audio_df.select(
            alignments_audio_df.identifier,
            alignments_audio_df.audio_document_id,
            alignments_audio_df.text_document_id,
            F.struct(
                alignments_audio_df.alignments.label.alias("label"),
                create_audio_segment_names_udf(
                    # Is F.size right here?
                    name,
                    F.size(alignments_audio_df.alignments.start_ms),
                    F.lit("flac"),
                ).alias("name"),
                alignments_audio_df.alignments.duration_ms.alias(
                    "duration_ms"),
            ).alias("training_data"),
        )
        output_df = output_df.coalesce(960)

        # coalesce(1) seems to make the create_audio_segments_udf function run serially
        output_df.write.mode("overwrite").json(manifest_dir)

    repartitioned_tars_dir = os.path.join(FLAGS.work_dir,
                                          "repartitioned_dataset_tars")
    tmp_tars_dir = os.path.join(FLAGS.work_dir,
                                "repartitioned_dataset_tmp_dir")
    if FLAGS.stage <= 4:
        tars_df = spark.read.format("tar").load(tars_dir)  # .limit(100)
        number_of_rows = tars_df.count()

        spark2 = spark.newSession()
        spark2.conf.set(
            "spark.sql.execution.rangeExchange.sampleSizePerPartition",
            number_of_rows)
        spark2.conf.set("spark.sql.files.minPartitionNum",
                        FLAGS.number_of_shards)
        # tars_df = spark2.read.format("tar").load(tars_dir)#.limit(100)

        # print("GALVEZ:", tars_df.select(F.col("key")).collect())
        # import sys; sys.exit()
        tars_df = spark2.read.format("tar").load(tars_dir)  # .limit(100)
        tars_df = tars_df.repartitionByRange(FLAGS.number_of_shards,
                                             F.col("key"))
        # # May need to write this out to GCS, and then delete it, to prevent different behavior between runs.
        # # tars_df = tars_df.persist()
        tars_df.write.mode("overwrite").format("tar").save(tmp_tars_dir)
        tars_df = spark2.read.format("tar").load(
            tmp_tars_dir)  # .repartitionByRange()  # coalesce(1024)
        # counts_df = (
        #     tars_df.withColumn("partitionId", F.spark_partition_id())
        #     .groupBy("partitionId")
        #     .count()
        # )
        # num_rows_to_keep = counts_df.select(F.min(F.col("count"))).collect()[0][0]
        # # Consider doing this in java
        # def drop_final_rows(rows):
        #     for _ in range(num_rows_to_keep):
        #         yield next(rows)
        #     for _ in rows:
        #         pass
        #     return

        # print("GALVEZ:before=", tars_df.rdd.getNumPartitions())
        # # , preservesPartitioning=True
        # tars_df = spark2.createDataFrame(
        #     tars_df.rdd.mapPartitions(drop_final_rows), schema=tars_df.schema
        # )
        # print("GALVEZ:after=", tars_df.rdd.getNumPartitions())
        # import sys

        # sys.stdout.flush()
        # # Don't actually write this out right now. It doesn't benefit us unless we are doing nemo training in a specific mode.
        # tars_df.write.mode("overwrite").format("tar").save(repartitioned_tars_dir)

        # manifest_df = spark2.read.json(manifest_dir)
        # number_of_utterances = manifest_df.select(F.explode(F.col("training_data.name"))).count()
        # print(f"GALVEZ:number_of_utterances={number_of_utterances}")
        # utterances_per_shard = number_of_utterances // FLAGS.number_of_shards
        # repartition_tar_files(os.path.join(tars_dir, "*.tar"), repartitioned_tars_dir, utterances_per_shard)

    nemo_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_nemo")
    nemo_single_manifest_dir = os.path.join(FLAGS.work_dir,
                                            "dataset_manifest_nemo_single")

    if FLAGS.stage <= 5:
        json_df = spark.read.format("json").load(manifest_dir)
        nemo_df = json_df.select(
            F.explode(
                F.arrays_zip(
                    F.col("training_data.name").alias("audio_filepath"),
                    F.col("training_data.label").alias("text"),
                    F.col("training_data.duration_ms").alias("duration_ms"),
                )))
        nemo_df = nemo_df.select(
            F.col("col.name").alias("audio_filepath"),
            F.col("col.label").alias("text"),
            (F.col("col.duration_ms").cast(T.DoubleType()) /
             1000.0).alias("duration"),
            F.lit(-1).alias("shard_id"),
        )
        if False:
            tars_df = spark.read.format("tar").load(repartitioned_tars_dir)
            tars_df = tars_df.select(tars_df.key)
            nemo_df = F.broadcast(nemo_df)
            nemo_df = nemo_df.join(
                tars_df,
                F.col("audio_filepath") == F.col("key")).drop(F.col("key"))

        # TODO: Join against tar files that have been made to contain the
        # same number of files to filter out removed files
        nemo_df.write.mode("overwrite").format("json").save(nemo_manifest_dir)

        nemo_single_df = spark.read.format("json").load(nemo_manifest_dir)
        nemo_single_df.coalesce(1).write.mode("overwrite").format("json").save(
            nemo_single_manifest_dir)

    single_manifest_dir = os.path.join(FLAGS.work_dir,
                                       "dataset_manifest_single")
    single_tar_dir = os.path.join(FLAGS.work_dir, "dataset_tars_single")
    # Create single tar file and single json file
    if FLAGS.stage <= 6:
        json_df = spark.read.format("json").load(manifest_dir)
        json_df.coalesce(1).write.format("json").mode("overwrite").save(
            single_manifest_dir)

        tars_df = spark.read.format("tar").load(tmp_tars_dir)
        tars_df.coalesce(1).write.format("tar").mode("overwrite").save(
            single_tar_dir)
    spark.read.format("json")
    .load("data/flight_data/json/2015-summary.json")
)

df.printSchema()
df.createOrReplaceTempView("dfTable")

df.select("DEST_COUNTRY_NAME").show(2)

spark.sql("""
SELECT DEST_COUNTRY_NAME FROM dfTable LIMIT 2
""").show()

df.select("DEST_COUNTRY_NAME","ORIGIN_COUNTRY_NAME").show(2)

df.select(expr("DEST_COUNTRY_NAME as destination")).show(2)

df.select(expr("DEST_COUNTRY_NAME as destination").alias("DEST")).show(2)

df.selectExpr("DEST_COUNTRY_NAME as newColumn","DEST_COUNTRY_NAME").show(4)

df.selectExpr(
    "*",
    "(DEST_COUNTRY_NAME = ORIGIN_COUNTRY_NAME) AS withinCountry"
).show(5,truncate=False)

df.selectExpr(
    "AVG(count)",
    "COUNT(DISTINCT(DEST_COUNTRY_NAME))"
).show()
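
# An equivalent of the aggregation above using the functions API instead of
# selectExpr (added as a sketch; not in the original notebook):
from pyspark.sql.functions import avg, countDistinct
df.select(avg("count"), countDistinct("DEST_COUNTRY_NAME")).show()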
# COMMAND ----------

from pyspark.sql.functions import instr
DOTCodeFilter = col("StockCode") == "DOT"
priceFilter = col("UnitPrice") > 600
descripFilter = instr(col("Description"), "POSTAGE") >= 1
df.withColumn("isExpensive", DOTCodeFilter & (priceFilter | descripFilter))\
  .where("isExpensive")\
  .select("unitPrice", "isExpensive").show(5)


# COMMAND ----------

from pyspark.sql.functions import expr
df.withColumn("isExpensive", expr("NOT UnitPrice <= 250"))\
  .where("isExpensive")\
  .select("Description", "UnitPrice").show(5)


# COMMAND ----------

from pyspark.sql.functions import expr, pow
fabricatedQuantity = pow(col("Quantity") * col("UnitPrice"), 2) + 5
df.select(expr("CustomerId"), fabricatedQuantity.alias("realQuantity")).show(2)


# COMMAND ----------

df.selectExpr(
  "CustomerId",
  "(POWER((Quantity * UnitPrice), 2.0) + 5) as realQuantity").show(2)

graduateProgram.join(person, joinExpression, joinType).show()
# Be careful when you cross join on large datasets: it can cause an explosion in the number of rows in the result DataFrame.

# COMMAND ----------

# When performing joins, there are some specific challenges and some common questions that arise.
# The rest of the notebook will provide answers to these common questions and then explain how, at a high level, Spark performs joins.

# COMMAND ----------

# Even though this might seem like a challenge, it’s actually not. Any expression is a valid join expression, assuming that it returns a Boolean:
# For ex: Joining by the id from the person DF and spark_status array from the SparkStatus DF.
from pyspark.sql.functions import expr
# Since spark_status is an array column, use an expression to check whether spark_status contains the id value.
person.withColumnRenamed("id", "personId")\
  .join(sparkStatus, expr("array_contains(spark_status, id)")).show()

# COMMAND ----------

# One of the tricky things that come up in joins is dealing with duplicate column names in your results DataFrame.
# In a DataFrame, each column has a unique ID within Spark’s SQL Engine, Catalyst. This unique ID is purely internal and not something that you can directly reference.
# This makes it quite difficult to refer to a specific column when you have a DataFrame with duplicate column names.

# COMMAND ----------

# This can occur in two distinct situations:

# The join expression that you specify does not remove one key from one of the input DataFrames and the keys have the same column name
# Two columns on which you are not performing the join have the same name

# COMMAND ----------
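
# A sketch (not from the original notebook) of two common ways to avoid the
# duplicate-column situations described above. gradProgramDupe is a hypothetical
# copy of graduateProgram whose "id" column is renamed to the same name as the
# join key in person:
gradProgramDupe = graduateProgram.withColumnRenamed("id", "graduate_program")

# Option 1: join on the column *name* (a string) so Spark keeps only one copy of the key
person.join(gradProgramDupe, "graduate_program").show()

# Option 2: keep the expression join, then explicitly drop one of the duplicate columns
joinExpr = person["graduate_program"] == gradProgramDupe["graduate_program"]
person.join(gradProgramDupe, joinExpr).drop(gradProgramDupe["graduate_program"]).show()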