from typing import Optional, Tuple

from pyspark.sql.functions import col, crc32

# Defined at module level in the original source; reproduced here so the
# snippet is self-contained.
MAX_UINT32 = 2 ** 32 - 1  # 4294967295


def hash_mdp_id_and_subsample(df, sample_range: Optional[Tuple[float, float]] = None):
    """Since mdp_id is a string but PyTorch Tensors do not store strings,
    we hash it with crc32. crc32 is not a cryptographic hash, but we treat
    its output as roughly uniform over [0, MAX_UINT32], which also lets us
    perform an optional subsampling based on the hash value.
    NOTE: we're assuming no collisions in this hash! Otherwise, two mdp_ids
    can be indistinguishable after the hash.
    TODO: change this to a deterministic subsample.
    """
    if sample_range:
        assert (
            0.0 <= sample_range[0]
            and sample_range[0] <= sample_range[1]
            and sample_range[1] <= 100.0
        ), f"{sample_range} is invalid."

    # pyre-fixme[16]: Module `functions` has no attribute `col`.
    df = df.withColumn("mdp_id", crc32(col("mdp_id")))
    if sample_range:
        lower_bound = sample_range[0] / 100.0 * MAX_UINT32
        upper_bound = sample_range[1] / 100.0 * MAX_UINT32
        # pyre-fixme[16]: Module `functions` has no attribute `col`.
        df = df.filter(
            (lower_bound <= col("mdp_id")) & (col("mdp_id") <= upper_bound)
        )
    return df
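# A minimal usage sketch for hash_mdp_id_and_subsample. The SparkSession
# setup and the toy rows are assumptions, not part of the original code.
# Keeping rows whose hashed mdp_id falls in the [0%, 10%] slice of the
# uint32 range retains roughly 10% of distinct mdp_ids, since the crc32
# output is treated as uniform.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("demo").getOrCreate()
df = spark.createDataFrame(
    [("mdp_a", 1.0), ("mdp_b", 2.0), ("mdp_c", 3.0)], ["mdp_id", "reward"]
)
sampled = hash_mdp_id_and_subsample(df, sample_range=(0.0, 10.0))
sampled.show()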
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql.functions import col, lit

# _TABLE_SOURCE, _COL_ID, _DATE_PARSED, _COL_URI_COUNT, _NUM_ADAU_THRESHOLD,
# _COL_PC_DATE, _COL_SAMPLE, and _MAP_NATURAL_DIMENSIONS are module-level
# constants defined elsewhere in this module.


def createDataFile(start_date, end_date, spark_instance, jackknife_buckets,
                   sample_percent, output_path):
    feature_data_phase1 = spark_instance.table(_TABLE_SOURCE).select([
        _COL_ID.alias("id"),
        _DATE_PARSED.alias("date"),
        # TODO: Use MD5 instead of CRC32.
        # floor() keeps the bucket id integral (Spark's `/` always returns a
        # double, so the bare `(crc32 / 100) % buckets` would yield
        # fractional bucket ids). Dividing by 100 first decorrelates the
        # bucket from the `crc32 % 100` sampling column.
        (F.floor(F.crc32(_COL_ID) / 100) % jackknife_buckets).alias("bucket"),
        lit(1).alias("is_active"),
        F.when(_COL_URI_COUNT >= _NUM_ADAU_THRESHOLD, 1).otherwise(0).alias(
            "is_active_active"),
        F.to_date(_COL_PC_DATE).alias("profile_creation_date"),
    ] + list(_MAP_NATURAL_DIMENSIONS.keys())).filter(  # list() for Python 3
        (_DATE_PARSED.between(start_date, end_date))
        & (_COL_SAMPLE < sample_percent)
    ).withColumn(
        "young_profile",
        F.when(
            col("date") < F.date_add(col("profile_creation_date"), 14),
            "TRUE").otherwise("FALSE"))

    # One row per profile: its first active day within a week of creation,
    # re-dated to the creation date itself.
    new_profile_window = Window.partitionBy(col("id")).orderBy(col("date"))
    new_profile_data = feature_data_phase1.filter(
        (col("date") >= col("profile_creation_date"))
        & (col("date") <= F.date_add(col("profile_creation_date"), 6))
    ).select(
        "*", F.rank().over(new_profile_window).alias("rank")
    ).filter(col("rank") == 1).withColumn(
        "new_profile", lit(1)
    ).drop("date").withColumn("date", col("profile_creation_date"))

    # Full outer join so a profile appears on its creation date even if it
    # had no activity row that day.
    feature_data = feature_data_phase1.alias("fd").join(
        new_profile_data.alias("np"),
        (col("fd.id") == col("np.id")) & (col("fd.date") == col("np.date")),
        how="full",
    ).select(
        [F.coalesce(col("np.new_profile"), lit(0)).alias("new_profile")]
        + [F.coalesce(col("fd.is_active"), lit(0)).alias("is_active")]
        + [F.coalesce(col("fd.is_active_active"), lit(0)).alias(
            "is_active_active")]
        + [
            F.coalesce(col("fd.{}".format(c)), col("np.{}".format(c))).alias(c)
            for c in feature_data_phase1.columns
            if c not in ["is_active", "is_active_active"]
        ])

    # Flag profiles that were active on exactly one day ever.
    once_ever_profiles = feature_data.filter(
        col("is_active") == 1).groupBy("id").count().filter(
        col("count") == 1).select("id").withColumn(
        "single_day_profile", lit("1"))
    feature_data = feature_data.alias("fd").join(
        once_ever_profiles.alias("oep"), "id",
        "outer").fillna({"single_day_profile": "0"})

    feature_data.write.partitionBy("date").mode("overwrite").parquet(
        output_path)
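# A standalone sketch of the jackknife bucket expression used above, on toy
# data; the SparkSession, the column name "id", and the bucket count of 20
# are illustrative assumptions. It shows that dividing the crc32 value by
# 100 before the modulo drives the bucket from higher-order digits of the
# hash, so it stays independent of a `crc32 % 100` sampling column.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").appName("buckets").getOrCreate()
ids = spark.createDataFrame([("client-%d" % i,) for i in range(1000)], ["id"])
buckets = ids.withColumn("bucket", F.floor(F.crc32(F.col("id")) / 100) % 20)
# Counts per bucket should be roughly uniform for a well-mixed hash.
buckets.groupBy("bucket").count().orderBy("bucket").show(20)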
from pyspark.sql import functions as f


def hash_histogram(df, column_name, max_bins):
    # Cast to string so crc32 accepts any input type; nulls hash to null.
    hashed = df.select(
        (f.crc32(f.col(column_name).cast("string")) % max_bins)
        .alias(column_name))
    hist = hashed.groupBy(column_name).count()
    hist = hist.rdd.collectAsMap()
    # Re-key null counts under the string "None" so the dict has no None key.
    if None in hist:
        hist["None"] = hist[None]
        del hist[None]
    # Normalize counts to fractions of the total.
    values_total = float(sum(hist.values()))
    hist = dict((k, v / values_total) for k, v in hist.items())
    return hist
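# Hypothetical usage of hash_histogram; the SparkSession and the toy "page"
# column are assumptions. A high-cardinality string column is folded into at
# most 16 hash bins, and the result maps each bin to its share of the rows.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("hist").getOrCreate()
events = spark.createDataFrame(
    [("alpha",), ("beta",), ("alpha",), (None,)], ["page"]
)
print(hash_histogram(events, "page", max_bins=16))
# e.g. {14: 0.5, 3: 0.25, "None": 0.25} -- exact integer keys depend on crc32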
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql.functions import sha1

# `sqlContext` and `end_date` are expected to be defined in the surrounding
# notebook/session before this function is called.


def summary_df(df, fn):
    # Drop null ad_click values.
    df = df.na.drop(subset=["ad_click"])
    # Remove non-search sessions: keep rows with at least one ad click.
    df = df[df["ad_click"] > 0]

    # Sum ad clicks per client per day.
    sum_search_clients_daily = (
        df.groupBy("client_id", "country", "submission_date_s3",
                   "activity_date")
        .agg(F.sum("ad_click").alias("ad_click")))

    # Read the revenue-by-country table.
    rev_by_country_s3_path = ("s3://net-mozaws-prod-us-west-2-pipeline-"
                              "analysis/nawong/revenue_by_country.csv")
    rev_by_country = sqlContext.read.csv(rev_by_country_s3_path, header=True)
    rev_by_country = (
        rev_by_country
        .withColumn("rev_per_search_float",
                    F.col("rev_per_search").cast("double"))
        .withColumn("yyyyMM_timestamp",
                    F.to_timestamp(F.col("yyyymm"), "yyyyMM"))
        .withColumn("country_code", F.upper(F.col("country_code"))))

    # Add the country field and revenue table; this transform is needed to
    # calculate transaction-level monetary value.
    tbl = sum_search_clients_daily.join(
        rev_by_country,
        sum_search_clients_daily.country == rev_by_country.country_code,
        how="left_outer")
    # NOTE: the partition includes country because a client may change
    # country over time.
    spec = Window.partitionBy(
        "client_id", "country",
        "submission_date_s3").orderBy(F.col("yyyyMM_timestamp").desc())
    # Clients with no matching country row get a flat default rev-per-search.
    no_country = (
        tbl
        .where(F.isnull(F.col("yyyymm")))
        .withColumn("rev_per_search_float", F.lit(0.005)))
    # Otherwise use the most recent revenue figure at or before the
    # activity date.
    has_country = (
        tbl
        .na.drop(subset=["yyyymm"])
        .where("yyyyMM_timestamp <= activity_date")
        .withColumn("rank", F.row_number().over(spec))
        .where("rank = 1")
        .drop("rank"))
    tbl2 = no_country.union(has_country)

    # Drop each client's first active day when computing average revenue.
    spec2 = Window.partitionBy("client_id").orderBy(
        F.col("activity_date").asc())  # earliest date gets row number 1
    search_rev = (
        tbl2
        .withColumn("rank", F.row_number().over(spec2))
        .where("rank > 1")
    ).groupBy("client_id").agg(
        F.avg(F.col("rev_per_search_float") * F.col("ad_click"))
        .alias("monetary_value"))

    # Compute the final dataset for the BG/NBD model.
    dataset = (
        tbl2
        .groupBy("client_id")
        .agg(
            F.datediff(F.max("activity_date"),
                       F.min("activity_date")).alias("recency"),
            (F.countDistinct("activity_date") - 1).alias("frequency"),
            F.datediff(
                F.lit(end_date.strftime("%Y-%m-%d")).cast("date"),
                F.min("activity_date")).alias("T"),
            F.sum("ad_click").alias("historical_searches"),
            F.sum(F.col("rev_per_search_float") * F.col("ad_click"))
            .alias("historical_clv"))
        .join(search_rev, "client_id", how="left")
        .where("frequency >= 0 AND recency >= 0 AND T >= 0")
        .select("client_id",
                (F.crc32("client_id") % 100).alias("sample_id"),
                "frequency", "recency", "T", "monetary_value",
                "historical_searches", "historical_clv")
    ).fillna(0, subset=["monetary_value"])

    # Anonymize client_id.
    dataset = dataset.withColumn("client_id", sha1(dataset.client_id))

    # Write the per-client recency/frequency/age/revenue table. sample_id is
    # duplicated before partitioning because partitionBy drops the partition
    # column from the written files.
    duplicated = dataset.withColumn("sample_id_dupe", dataset["sample_id"])
    duplicated.write.partitionBy("sample_id_dupe").format("parquet").mode(
        "append").save(fn)
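# A standalone sketch of the two client_id transforms used above, on toy
# data; the SparkSession and the rows are assumptions. crc32 % 100 assigns a
# stable sample_id from the raw id, and sha1 then anonymizes the id itself,
# so the sample assignment must happen before the hash-anonymization step.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").appName("ids").getOrCreate()
clients = spark.createDataFrame([("c-1",), ("c-2",)], ["client_id"])
anon = (
    clients
    .withColumn("sample_id", F.crc32("client_id") % 100)
    .withColumn("client_id", F.sha1("client_id"))
)
anon.show(truncate=False)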