Example #1
def extract_field(df):
    # Hash the first '$'-delimited field of 's' and conditionally re-prefix the
    # result. Assumes an active SparkSession `spark` and
    # `from pyspark.sql import functions as F`.
    text = 'bbbb'
    df = spark.createDataFrame([('A$B', 'a$b$c')], ['k', 's'])
    df.select(
        F.col('s'),
        F.sha1(F.split(F.col('s'), r'\$').getItem(0)).alias('k')
    ).withColumn(
        'k',
        F.when(
            F.col('k') == f"{text}$",
            F.concat(F.lit(f"{text}-"),
                     F.sha1(F.split(F.col('s'), r'\$').getItem(0)))
        ).otherwise(F.col('k'))
    ).collect()
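Example #2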
    def normal_df(self, num_rows, num_dummy_columns=1, num_partitions=200):

        _df = self._spark.range(
            num_rows,
            numPartitions=num_partitions).withColumnRenamed('id', 'key')

        for dummy in range(0, num_dummy_columns):
            _df = _df.withColumn('dummy_' + str(dummy + 1),
                                 sha1(col('key').cast('string')))

        return _df.repartition(num_partitions)
Example #3
def __sha1_concat(hash_columns):
  """
  Private function - concatenates the provided columns using the ':' delimiter
  and hashes the result with SHA-1.

  Parameters:
  hash_columns (list): A list of column names (strings) to hash

  Returns:
  Column: the sha1 hash of 'column1:column2:...' (a null column contributes only ':')
  """
  column_ref_list = [
      when(col(hash_col_name).isNull(), lit(":"))
      .otherwise(concat(col(hash_col_name).cast("string"), lit(":")))
      for hash_col_name in hash_columns
  ]
  return sha1(concat(*column_ref_list))
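A minimal usage sketch (not part of the original snippet), assuming an active SparkSession named `spark`, the same `pyspark.sql.functions` imports the helper itself uses (`when`, `col`, `lit`, `concat`, `sha1`), and hypothetical column names:

df = spark.createDataFrame([(1, "a"), (2, None)], ["id", "val"])
df = df.withColumn("row_hash", __sha1_concat(["id", "val"]))
df.show(truncate=False)
Example #4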
    def skewed_df(self,
                  num_rows,
                  percentage_rows_skewed,
                  num_dummy_columns=1,
                  num_partitions=200):

        _df = self._spark.range(num_rows, numPartitions=num_partitions) \
            .withColumn('id', when(col('id') <= percentage_rows_skewed * num_rows, lit(1)).otherwise(col('id'))) \
            .withColumnRenamed('id', 'key')

        for dummy in range(0, num_dummy_columns):
            _df = _df.withColumn('dummy_' + str(dummy + 1),
                                 sha1(col('key').cast('string')))

        return _df.repartition(num_partitions)
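A hypothetical call site for the two generator methods above (they are assumed to be methods of a data-generator class whose `_spark` attribute holds an active SparkSession; `gen` is an instance of that class, and the argument values are illustrative only):

# even_df = gen.normal_df(num_rows=1_000_000, num_dummy_columns=2)
# skew_df = gen.skewed_df(num_rows=1_000_000, percentage_rows_skewed=0.3)
Example #5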
def benchmark2():
    print("===Benchmark 2===")
    print(
        "Comparing JDBC writes to InnoDB and API writes to ColumnStore with larger datasets"
    )
    print("")

    emptyDatabase()

    print("creating dataframe 1: two random generated doubles")
    randDF = sqlContext.range(0, 7000000).withColumn(
        'uniform', rand(seed=23)).withColumn('normal', randn(seed=42)).cache()
    randDFRows = randDF.count()
    randDFItems = randDFRows * len(randDF.columns)
    randDF.printSchema()
    print("bemchmarking dataframe 1")
    rand_benchmark = benchmark2execution(
        "rand", randDF, "id BIGINT, uniform DOUBLE, normal DOUBLE")
    randDF.unpersist()

    print(
        "creating dataframe 2: sha1, sha256, sha512 and md5 hashes of integers"
    )
    tmpDF = sqlContext.createDataFrame(
        sc.parallelize(range(
            0, 3000000)).map(lambda i: Row(number=i, string=str(i))))
    hashDF = tmpDF.select(tmpDF.number,
                          sha1(tmpDF.string).alias("sha1"),
                          sha2(tmpDF.string, 256).alias("sha256"),
                          sha2(tmpDF.string, 512).alias("sha512"),
                          md5(tmpDF.string).alias("md5")).cache()
    hashDFRows = hashDF.count()
    hashDFItems = hashDFRows * len(hashDF.columns)
    hashDF.printSchema()
    print("bemchmarking dataframe 2")
    hash_benchmark = benchmark2execution(
        "hash", hashDF,
        "number BIGINT, sha1 VARCHAR(40), sha256 VARCHAR(64), sha512 VARCHAR(128), md5 VARCHAR(32)"
    )
    hashDF.unpersist()

    print("jdbc_innodb\tapi_columnstore\t\trows\t\titems")
    print("%.3fs\t\t%.3fs\t\t%i\t\t%i" %
          (rand_benchmark[0], rand_benchmark[1], randDFRows, randDFItems))
    print("%.3fs\t\t%.3fs\t\t%i\t\t%i" %
          (hash_benchmark[0], hash_benchmark[1], hashDFRows, hashDFItems))
Example #6
def make_songplay_data(d_artist_df, d_song_df, event_df):
    """
    Create the songplay fact dataframe.

    Parameters:
    d_artist_df (DataFrame): The artist dimension dataframe.
    d_song_df (DataFrame): The song dimension dataframe.
    event_df (DataFrame): The raw song play event dataframe.

    Returns:
    f_songplay_df (DataFrame): A songplay fact dataframe.
    """
    print('\nmake_songplay_data...')

    tmp_df = d_song_df.withColumnRenamed('artist_id', 'song_artist_id')
    tmp_df = tmp_df.join(d_artist_df, d_artist_df.artist_id == tmp_df.song_artist_id) \
        .select('song_id', 'title', 'duration', 'artist_id', 'artist_name')

    comparison = [
        event_df.song == tmp_df.title,
        event_df.length.cast(ShortType()) == tmp_df.duration.cast(ShortType())
    ]

    # extract columns from the joined song and log datasets to create the songplays table
    # create a sha1 hash of timestamp, userId and song for a unique songplay ID
    # year and month columns exist for partitioning the parquet files
    f_songplay_df = event_df.withColumn('songplay_id', F.sha1(F.concat_ws('|', 'timestamp', 'userId', 'song'))) \
        .withColumn('year', F.year('timestamp')) \
        .withColumn('month', F.month('timestamp')) \
        .join(tmp_df, comparison, 'left') \
        .select(['songplay_id', 'start_time', 'year', 'month', 'userId', 'level', 'song_id', 'artist_id', 'sessionId', 'location', 'userAgent'])
    print('songplay fact record count:', f_songplay_df.count())

    not_null_count = f_songplay_df.filter(F.col('song_id').isNotNull()).count()
    print('songplay fact records with song_id value:', not_null_count)

    return f_songplay_df
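The sha1-based songplay_id above is deterministic, so re-running the pipeline reproduces the same ID for the same event. A minimal illustration (hypothetical literal values, assuming an active SparkSession named `spark` and `pyspark.sql.functions` imported as `F`):

spark.range(1).select(
    F.sha1(F.concat_ws('|', F.lit('2018-11-15 00:30:26'), F.lit('26'), F.lit('Some Song'))).alias('songplay_id')
).show(truncate=False)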
Example #7
def get_client_details(client_details, fn):

    # hash client_id using secure sha1
    client_details = client_details.withColumn('client_id',
                                               sha1(client_details.client_id))

    # calculate attribution_site column as :
    # Value assigned to 'www.mozilla.org or unknown site'.
    # CASE WHEN medium = unknown AND source = unknown AND campaign = unknown THEN 'unknown site' ELSE 'www_mozilla_org' END
    #def attribution_site(attribution):
    #   if   attribution.medium == 'unknown' and attribution.source == 'unknown' and attribution.campaign == 'unknown' : return 'unknown_site'
    #   else: return 'www_mozilla_org'
    # NOTE: it seems that calls to udf() must be after SparkContext() is called
    attribution_site_udf = udf(
        lambda attribution: None if (attribution is None) else
        ("unknown_site"
         if attribution.medium == 'unknown' and attribution.source == 'unknown'
         and attribution.campaign == 'unknown' else 'www_mozilla_org'),
        StringType())
    client_details = client_details.withColumn(
        "attribution_site", attribution_site_udf("attribution"))

    # convert submission date to string
    client_details_with_date = client_details.withColumn(
        "date",
        unix_timestamp("submission_date_s3",
                       "yyyyMMdd").cast("timestamp").cast("date"))

    w = Window.partitionBy(client_details_with_date["client_id"]).orderBy(
        client_details_with_date["date"].desc())
    client_details_with_date_latest = client_details_with_date.select(
        '*',
        F.rank().over(w).alias('rank')).filter(F.col('rank') <= 1)

    # calculate most recent active date column (has to be done here because we are not storing history in Vertica):
    # most recent date where uri count (pageviews) scalar_parent_browser_engagement_total_uri_count_sum >= 5
    uri_dt = client_details_with_date.filter(
        "scalar_parent_browser_engagement_total_uri_count_sum>=5").groupBy(
            "client_id").agg(F.max("date").alias("max_activity_date"))

    # left join onto client_details_with_date_latest
    df = client_details_with_date_latest.join(uri_dt, ["client_id"], 'left')

    # add max_activity_date_int column (days since the most recent active date)
    #df = df.withColumn("max_activity_date_int", F.datediff(F.lit(date.today()), df.max_activity_date))
    df = df.withColumn("max_activity_date_int",
                       F.datediff(F.lit(end_date), df.max_activity_date))

    # activity_group - calculate in Vertica instead?
    # Based on most_recent_active_day, relative to the calculation date:
    #   most recent active day < 2 days ago  -> aDAU
    #   most recent active day 2-7 days ago  -> aWAU
    #   most recent active day 8-31 days ago -> aMAU
    activity_group_udf = udf(
        lambda days_passed: None if (days_passed is None) else
        ("aDAU" if days_passed < 2 else
         ("aWAU" if days_passed >= 2 and days_passed < 8 else
          ("aMAU" if days_passed >= 8 and days_passed <= 31 else None))),
        StringType())
    df = df.withColumn("activity_group",
                       activity_group_udf("max_activity_date_int"))

    # recast max_activity_date to string yyyyMMdd
    df = df.withColumn("max_activity_date",
                       F.date_format(uri_dt.max_activity_date, "yyyyMMdd"))

    # select ordered columns for writing to file
    df = df.select(
        "client_id", "os", "os_version", "city", "geo_subdivision1",
        "geo_subdivision2", "country", "default_search_engine",
        "default_search_engine_data_submission_url",
        "default_search_engine_data_load_path",
        "default_search_engine_data_origin", "e10s_enabled", "channel",
        "locale", "is_default_browser", "memory_mb", "os_service_pack_major",
        "os_service_pack_minor", "sample_id", "profile_creation_date",
        "profile_age_in_days", "active_addons_count_mean", "sync_configured",
        "sync_count_desktop_sum", "sync_count_mobile_sum",
        "places_bookmarks_count_mean", "timezone_offset", "attribution_site",
        "source", "medium", "campaign", "content", "submission_date_s3",
        "max_activity_date", "activity_group", "distribution_id")
    #df=df.select("client_id","os","os_version","city","geo_subdivision1","geo_subdivision2","country","default_search_engine","default_search_engine_data_submission_url","default_search_engine_data_load_path","default_search_engine_data_origin","e10s_enabled","channel","locale","is_default_browser","memory_mb","os_service_pack_major","os_service_pack_minor","profile_creation_date","profile_age_in_days","active_addons_count_mean","sync_configured","sync_count_desktop_sum","sync_count_mobile_sum","places_bookmarks_count_mean","timezone_offset","attribution_site","source","medium","campaign","content","max_activity_date","activity_group","distribution_id")

    #return df
    #df.write.partitionBy("sample_id").format("parquet").mode("overwrite").save(fn)
    duplicated = df.withColumn("sample_id_dupe", df["sample_id"])
    duplicated.write.partitionBy("sample_id_dupe").format("parquet").mode(
        "overwrite").save(fn)
Example #8
from pyspark.sql.types import StringType
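# NOTE: `manual_split` is defined in an earlier notebook cell that is not shown
# here. A hypothetical stand-in with a matching signature (string in, string out),
# for illustration only:
def manual_split(value):
    # illustrative only: put a space between every character of the input
    return " ".join(value)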

manualSplitPythonUDF = spark.udf.register("manualSplitSQLUDF", manual_split, StringType())

# COMMAND ----------

# MAGIC %md
# MAGIC Create a DataFrame of 100k values with a string to index. Do this by using a hash function, in this case `SHA-1`.

# COMMAND ----------

from pyspark.sql.functions import sha1, rand
randomDF = (spark.range(1, 10000 * 10 * 10 * 10)
  .withColumn("random_value", rand(seed=10).cast("string"))
  .withColumn("hash", sha1("random_value"))
  .drop("random_value")
)

display(randomDF)

# COMMAND ----------

# MAGIC %md
# MAGIC Apply the UDF by using it just like any other Spark function.

# COMMAND ----------

randomAugmentedDF = randomDF.select("*", manualSplitPythonUDF("hash").alias("augmented_col"))

display(randomAugmentedDF)
Example #9
def summary_df(df,fn): #,max_date):
    # drop null ad_click values
    df = df.na.drop(subset=["ad_click"])
    # Remove non-search sessions
    df = df[df['ad_click']>0]

    # sum ad_click
    sum_search_clients_daily = df.groupBy("client_id", "country", "submission_date_s3", "activity_date")\
                                        .agg(F.sum("ad_click").alias("ad_click"))
        
    # read revenue_by_country
    rev_by_country_s3_path = "s3://net-mozaws-prod-us-west-2-pipeline-analysis/nawong/revenue_by_country.csv"
    rev_by_country = sqlContext.read.csv(rev_by_country_s3_path, header=True)
    rev_by_country = rev_by_country.withColumn("rev_per_search_float", F.col("rev_per_search").cast("double"))\
                               .withColumn("yyyyMM_timestamp", F.to_timestamp(F.col("yyyymm"), "yyyyMM"))\
                               .withColumn("country_code", F.upper(F.col("country_code")))

    # add country field and revenue table - need transform to calculate transaction-level monetary value
    tbl = sum_search_clients_daily.join(rev_by_country, sum_search_clients_daily.country == rev_by_country.country_code,how='left_outer')
    spec = Window.partitionBy("client_id","country","submission_date_s3").orderBy(F.col("yyyyMM_timestamp").desc())
    # NOTE partition includes country because client may change country over time

    no_country=(
    tbl
        .where(F.isnull(F.col("yyyymm")))
        .withColumn("rev_per_search_float", F.lit(.005))
    )

    has_country=(
    tbl
        .na.drop(subset=["yyyymm"])
        .where("yyyyMM_timestamp <= activity_date")
        .withColumn('rank', F.row_number().over(spec))
        .where("rank = 1")
        .drop('rank')
    )

    tbl2=(no_country.union(has_country))

    # drop first purchase to calculate revenue
    spec2 = Window.partitionBy("client_id").orderBy(F.col("activity_date").asc()) # earliest date has row #1
    search_rev = (tbl2
     .withColumn("rank", F.row_number().over(spec2))
     .where("rank > 1")
    ).groupBy("client_id").agg(F.avg(F.col('rev_per_search_float')*F.col('ad_click')).alias("monetary_value"))
    
    # compute the final dataset for the BG/NBD model
    dataset = (
        tbl2
        .groupBy("client_id")
        .agg(F.datediff(F.max('activity_date'),F.min("activity_date")).alias("recency"), 
             (F.countDistinct('activity_date')-1).alias("frequency"),
             (F.datediff(F.lit(end_date.strftime("%Y-%m-%d")).cast("date"),F.min("activity_date"))).alias("T"),
             F.sum("ad_click").alias("historical_searches"),
             F.sum(F.col('rev_per_search_float')*F.col('ad_click')).alias("historical_clv"))
        .join(search_rev, "client_id", how="left")
        .where("frequency >= 0 AND recency >= 0 AND T >= 0")
        .select("client_id", (F.crc32("client_id") % 100).alias("sample_id"), "frequency","recency","T","monetary_value","historical_searches","historical_clv")
      ).fillna(0, subset=['monetary_value'])

    # anonymize client_id    
    dataset = dataset.withColumn('client_id',sha1(dataset.client_id))

    # write dataset recency, freq, age, revenue table per client
    #dataset.write.partitionBy("sample_id").format("parquet").mode("overwrite").save(fn)
    duplicated = dataset.withColumn("sample_id_dupe", dataset["sample_id"])
    duplicated.write.partitionBy("sample_id_dupe").format("parquet").mode("append").save(fn)
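Example #10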
    def addChecksumCol(self, colName):
        self.inputDf = self.inputDf.withColumn(
            colName, sha1(concat_ws("~~", *self.inputDf.columns)))
        return self.inputDf
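A self-contained sketch of the same row-checksum technique outside the class (assuming an active SparkSession named `spark`; the DataFrame and column names are hypothetical):

from pyspark.sql.functions import concat_ws, sha1

df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "val"])
df = df.withColumn("checksum", sha1(concat_ws("~~", *df.columns)))
df.show(truncate=False)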