Example #1
    def append_ranks(self, hour_df, day_df, month_df, year_df):
        """
        Ranks based on pageviews within categories. For hour_df the category is a single hour,
         for day_df the category is a single day, and so on.
        Args
            :param hour_df: Dataframe with hourly aggregated data.
            :param day_df: Dataframe with daily aggregated data.
            :param month_df: Dataframe with monthly aggregated data.
            :param year_df: Dataframe with yearly aggregated data.
        Returns:
            :return: hour_df, day_df, month_df, year_df with ranks added.
        """
        #Specify windows to rank within.
        hour_window = Window.partitionBy(['year', 'month', 'day', 'hour']).orderBy(hour_df['pageviews'].desc())
        day_window = Window.partitionBy(['year', 'month', 'day']).orderBy(day_df['pageviews'].desc())
        month_window = Window.partitionBy(['year', 'month']).orderBy(month_df['pageviews'].desc())
        year_window = Window.partitionBy(['year']).orderBy(year_df['pageviews'].desc())

        #Create ranking functions
        hour_rank = dense_rank().over(hour_window)
        day_rank = dense_rank().over(day_window)
        month_rank = dense_rank().over(month_window)
        year_rank = dense_rank().over(year_window)

        #Apply the ranking functions to make new columns
        hour_df = hour_df.withColumn("hour_rank", hour_rank)
        day_df = day_df.withColumn("day_rank", day_rank)
        month_df = month_df.withColumn("month_rank", month_rank)
        year_df = year_df.withColumn("year_rank", year_rank)
        return hour_df, day_df, month_df, year_df
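A minimal sketch of the same ranking step outside the class, assuming a local SparkSession; the hourly aggregate and its column names are hypothetical stand-ins for the DataFrames passed to append_ranks:

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import dense_rank

spark = SparkSession.builder.appName("rank-sketch").getOrCreate()

# Hypothetical hourly aggregate: one row per (year, month, day, hour, page).
hour_df = spark.createDataFrame(
    [(2023, 1, 1, 0, "Main_Page", 120), (2023, 1, 1, 0, "Spark", 80)],
    ["year", "month", "day", "hour", "page", "pageviews"],
)
hour_window = Window.partitionBy("year", "month", "day", "hour").orderBy(hour_df["pageviews"].desc())
hour_df.withColumn("hour_rank", dense_rank().over(hour_window)).show()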
Example #2
 def gen_report_table(hc,curUnixDay):
     rows_indoor=sc.textFile("/data/indoor/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4]),utoday=int(p[5]),ufirstday=int(p[6])))
     HiveContext.createDataFrame(hc,rows_indoor).registerTempTable("df_indoor")
     #ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY 
     sql="select entityid,clientmac,utoday,UFIRSTDAY,seconds,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_indoor order by entityid,clientmac,utoday" 
     df_id_stat=hc.sql(sql)
     df_id_mm=df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     # df_id_mm adds the per-(entityid, clientmac) min/max of utoday, used to calculate first and last arrival
     df_id_stat_distinct=df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     # the distinct dataframe is needed for the lag function to work correctly
     df_id_prepremon=df_id_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac, df_id_mm.entityid == df_id_prepremon.entityid, df_id_mm.UFIRSTDAY==df_id_prepremon.UFIRSTDAY]
     df_indoor_fin_tmp=df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(df_id_mm.entityid,df_id_mm.clientmac,df_id_mm.utoday,df_id_mm.UFIRSTDAY,df_id_mm.seconds,df_id_mm.day_30,df_id_mm.day_7,df_id_mm.min,df_id_mm.max,df_id_mm.total_cnt,df_id_prepremon.prepre_mon)
     df_indoor_fin_tmp=df_indoor_fin_tmp.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","seconds as secondsbyday","day_30 as indoors30","day_7 as indoors7","min as FirstIndoor","max as LastIndoor","total_cnt as indoors","prepre_mon as indoorsPrevMonth")
     
     #newly added part for indoors7 and indoors30 based on current date
     df_indoor_fin_tmp1= df_indoor_fin_tmp.withColumn("r_day_7", func.when((curUnixDay- df_indoor_fin_tmp.utoday)/86400<7 , 1).otherwise(0))
     df_indoor_fin_tmp2=df_indoor_fin_tmp1.withColumn("r_day_30", func.when((curUnixDay- df_indoor_fin_tmp1.utoday)/86400<30 , 1).otherwise(0))
     df_indoor_fin_tmp3=df_indoor_fin_tmp2.withColumn("r_indoors7",func.sum("r_day_7").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin_tmp4=df_indoor_fin_tmp3.withColumn("r_indoors30",func.sum("r_day_30").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin=df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30")
     hc.sql("drop table if exists df_indoor_fin")
     df_indoor_fin.write.saveAsTable("df_indoor_fin")
     
     rows_flow=sc.textFile("/data/flow/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),utoday=int(p[4]),ufirstday=int(p[5])))
     HiveContext.createDataFrame(hc,rows_flow).registerTempTable("df_flow")
     
     # ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday
     sql="select entityid,clientmac,utoday,UFIRSTDAY,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_flow order by entityid,clientmac,utoday" 
     df_fl_stat=hc.sql(sql)
     df_fl_mm=df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     # df_fl_mm adds the per-(entityid, clientmac) min/max of utoday, used to calculate first and last arrival
     df_fl_stat_distinct=df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     # the distinct dataframe is needed for the lag function to work correctly
     df_fl_prepremon=df_fl_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac, df_fl_mm.entityid == df_fl_prepremon.entityid, df_fl_mm.UFIRSTDAY==df_fl_prepremon.UFIRSTDAY]
     df_flow_fin=df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(df_fl_mm.entityid,df_fl_mm.clientmac,df_fl_mm.utoday,df_fl_mm.UFIRSTDAY,df_fl_mm.day_30,df_fl_mm.day_7,df_fl_mm.min,df_fl_mm.max,df_fl_mm.total_cnt,df_fl_prepremon.prepre_mon)
     df_flow_fin=df_flow_fin.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","day_30 as visits30","day_7 as visits7","min as FirstVisit","max as LastVisit","total_cnt as visits","prepre_mon as visitsPrevMonth")
     hc.sql("drop table if exists df_flow_fin")
     df_flow_fin.write.saveAsTable("df_flow_fin") 
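The SQL frames above ("range 2505600 preceding", "range 518400 preceding") count rows within a trailing 29- or 6-day span of the unix timestamp utoday. A sketch of the 30-day count expressed with the DataFrame API instead of SQL, assuming the df_indoor temp table registered above:

from pyspark.sql import Window
from pyspark.sql import functions as func

# Same frame as "range 2505600 preceding ... current row" in the SQL version.
day30_window = (Window.partitionBy("entityid", "clientmac")
                .orderBy("utoday")
                .rangeBetween(-29 * 86400, Window.currentRow))
df_day30 = hc.table("df_indoor").withColumn("day_30", func.count(func.lit(1)).over(day30_window))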
Example #3
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove previously generated parquet output if it still exists
    for parquet_path in glob(os.path.join(save_dir, 'medline_*.parquet')):
        subprocess.call(['rm', '-rf', parquet_path])

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')

    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite')
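The pmid window keeps only the latest, non-deleted record per article. A standalone sketch of the same keep-one-row-per-key pattern on a toy DataFrame, assuming an active SparkSession named spark; the values are illustrative only:

from pyspark.sql import Window
from pyspark.sql.functions import desc, rank

toy_df = spark.createDataFrame(
    [("p1", "file_b"), ("p1", "file_a"), ("p2", "file_a")],
    ["pmid", "file_name"],
)
w = Window.partitionBy("pmid").orderBy(desc("file_name"))
# pos == 1 selects the row with the lexicographically largest file_name per pmid.
toy_df.withColumn("pos", rank().over(w)).where("pos = 1").show()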
Example #4
def _get_relevant_items_by_timestamp(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_timestamp=DEFAULT_TIMESTAMP_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
        k=DEFAULT_K
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently
    according to their timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
            tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k (int): number of relevant items to keep for each user.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user
Example #5
def _get_top_k_items(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
        k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.
    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
        customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # NOTE: row_number() orders tied ratings arbitrarily, so results are not deterministic when ratings are equal.
    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user
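As the comment notes, row_number() orders tied ratings arbitrarily. A sketch of one way to make the selection deterministic by adding a secondary sort key, assuming an active SparkSession named spark and hypothetical column names:

from pyspark.sql import Window
from pyspark.sql import functions as F

ratings = spark.createDataFrame(
    [("a", "i1", 5.0), ("a", "i2", 5.0), ("a", "i3", 3.0)],
    ["customerID", "itemID", "rating"],
)
# Tie-break equal ratings by itemID so repeated runs return the same top-k rows.
window_spec = Window.partitionBy("customerID").orderBy(F.col("rating").desc(), F.col("itemID"))
ratings.withColumn("rank", F.row_number().over(window_spec)).where("rank <= 2").show()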
Example #6
def _get_relevant_items_by_threshold(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
        threshold=DEFAULT_THRESHOLD
):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those having ratings above a certain threshold.
    The threshold is defined as a statistical measure of the ratings for a
    user, e.g., median.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known
            distribution.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant
            items.
    """
    items_for_user = (
        dataframe
        .orderBy(col_rating, ascending=False)
        .where(col_rating + " >= " + str(threshold))
        .select(
            col_user, col_item, col_rating
        )
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates()
    )

    return items_for_user
Example #7
    .config("spark.executor.memory", "3g") \
    .appName("Exercise1") \
    .getOrCreate()

# Read the source tables
products_table = spark.read.parquet("./data/products_parquet")
sales_table = spark.read.parquet("./data/sales_parquet")
sellers_table = spark.read.parquet("./data/sellers_parquet")

# Calculate the number of pieces sold by each seller for each product
sales_table = sales_table.groupby(col("product_id"), col("seller_id")) \
                         .agg(sum("num_pieces_sold").alias("num_pieces_sold"))

# Create the window functions, one will sort ascending the other one descending. Partition by the product_id
# and sort by the pieces sold
window_desc = Window.partitionBy(col("product_id")).orderBy(
    col("num_pieces_sold").desc())
window_asc = Window.partitionBy(col("product_id")).orderBy(
    col("num_pieces_sold").asc())

# Create a Dense Rank (to avoid holes)
sales_table = sales_table.withColumn("rank_asc", dense_rank().over(window_asc)). \
    withColumn("rank_desc", dense_rank().over(window_desc))

# Get products that only have one row OR products for which multiple sellers sold the same amount
# (i.e. every seller that ever sold the product sold exactly the same amount)
single_seller = sales_table.where(col("rank_asc") == col("rank_desc")) \
                           .select(col("product_id").alias("single_seller_product_id"), \
                                   col("seller_id").alias("single_seller_seller_id"), \
                                   lit("Only seller or multiple sellers with the same results").alias("type"))

# Get the second top sellers
Example #8
 def gpu_run(spark):
     df = _create_df(spark)
     w = Window.partitionBy('id').rowsBetween(Window.unboundedPreceding,
                                              Window.unboundedFollowing)
     return df.withColumn('sum_v', _sum_gpu_func('v').over(w)).collect()
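_create_df and _sum_gpu_func are test helpers that are not shown in this excerpt. A CPU-side stand-in for the same unbounded-frame sum, assuming an active SparkSession named spark and a toy DataFrame in place of _create_df:

from pyspark.sql import Window
from pyspark.sql import functions as F

df = spark.createDataFrame([(1, 10.0), (1, 20.0), (2, 5.0)], ["id", "v"])
w = Window.partitionBy("id").rowsBetween(Window.unboundedPreceding,
                                         Window.unboundedFollowing)
# With an unbounded frame, every row in a partition receives the same per-id total.
df.withColumn("sum_v", F.sum("v").over(w)).show()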
Example #9
def clean_dataframe(df):
    # Clean
    # -----
    # Dropping the missing User's ID rows
    df = df.filter(df.userId != '')
    # Dropping null session IDs if any
    df = df.filter(df.sessionId != '')

    # Adding churn column
    # ...................
    # Define a flag function
    flag_cancelation_event = udf(
        lambda x: 1 if x == "Cancellation Confirmation" else 0, IntegerType())
    # apply to the dataframe
    df = df.withColumn("churn", flag_cancelation_event("page"))
    #Define window bounds
    windowval = Window.partitionBy("userId").rangeBetween(
        Window.unboundedPreceding, Window.unboundedFollowing)
    # Applying the window
    df = df.withColumn("churn", Fsum("churn").over(windowval))

    # Adding Time columns
    # ...................
    # Defining user-defined functions to get the hour, day, month, and weekday from the event timestamp
    get_hour = udf(lambda x: datetime.datetime.fromtimestamp(x / 1000.0).hour)
    get_day = udf(lambda x: datetime.datetime.fromtimestamp(x / 1000.0).day)
    get_month = udf(
        lambda x: datetime.datetime.fromtimestamp(x / 1000.0).month)
    # Source https://stackoverflow.com/questions/38928919/how-to-get-the-weekday-from-day-of-month-using-pyspark
    get_weekday = udf(
        lambda x: datetime.datetime.fromtimestamp(x / 1000.0).strftime('%w'))

    # Adding columns to the dataframe
    funcs = {
        'hour': get_hour,
        'day': get_day,
        'month': get_month,
        'week_day': get_weekday
    }
    for label, func in funcs.items():
        df = df.withColumn(label, func(df.ts))
        print(f'Column {label} added successfully.')

    # Adding Operating Systems columns
    # ................................
    # To get the data between parentheses
    ex = r'\(([^\)]*)\)'
    # Create mappers for the os
    mapper_general = {
        'Compatible': 'Windows',
        'Ipad': 'Mac',
        'Iphone': 'Mac',
        'Macintosh': 'Mac',
        'Windows nt 5.1': 'Windows',
        'Windows nt 6.0': 'Windows',
        'Windows nt 6.1': 'Windows',
        'Windows nt 6.2': 'Windows',
        'Windows nt 6.3': 'Windows',
        'X11': 'Linux'
    }
    mapper_specific = {
        'Compatible': 'Windows 7',
        'Ipad': 'iPad',
        'Iphone': 'iPhone',
        'Macintosh': 'MacOS',
        'Windows nt 5.1': 'Windows XP',
        'Windows nt 6.0': 'Windows Vista',
        'Windows nt 6.1': 'Windows 7',
        'Windows nt 6.2': 'Windows 8.0',
        'Windows nt 6.3': 'Windows 8.1',
        'X11': 'Linux'
    }
    # Define user defined functions
    os_general = udf(lambda x: mapper_general[re.findall(ex, x)[0].split(';')[
        0].capitalize()])
    os_specific = udf(lambda x: mapper_specific[re.findall(ex, x)[0].split(';')
                                                [0].capitalize()])
    df = df.withColumn("os_general", os_general(df.userAgent))
    df = df.withColumn("os_specific", os_specific(df.userAgent))
    return df
Example #10
    master_regex_one_df = df_regex_make(glob.glob(directory))

    # Check number of partitions -- should be 1
    print('Checking number of partitions - should be 1 b/c df.repartition(1)')
    master_regex_one_df = master_regex_one_df.repartition(args.num_partitions)
    print(master_regex_one_df.rdd.getNumPartitions())
    # print(master_regex_one_df.rdd.getNumPartitions())

    print("--- %s seconds ---" % (time.time() - start_time))

    #master_regex_one_df.orderBy('articleid').show(n=3,vertical=True)
    #master_regex_one_df.orderBy(master_regex_one_df.articleid.asc()).show(n=3,vertical=True)
    master_regex_one_df = master_regex_one_df.orderBy('articleid')

    print("First we sort the master_regex_one_df by articleid,timestamp and add regexes_prev")
    my_window = Window.partitionBy('articleid').orderBy('date_time')
    master_regex_one_df = master_regex_one_df.withColumn('regexes_prev', f.lag(master_regex_one_df.regexes).over(my_window))
    master_regex_one_df = master_regex_one_df.withColumn('core_prev', f.lag(master_regex_one_df.core_regexes).over(my_window))

    #master_regex_one_df = master_regex_one_df.na.replace('{{EMPTYBABY}}',None)
    master_regex_one_df = master_regex_one_df.na.fill('{{EMPTYBABY}}')

    ## regexes_diff_bool, core_diff_bool keep track of # of revisions that have a new regex / 0 for no new regex, 1 for diff
    ## we can sum this for the # of revisions with difference in regex / total number of revisions
    ## regexes_diff, core_diff keep track of the actual additions (string)
    ## regexes_diff_count, core_diff_count count the number of new policy invocations from core/regexes_diff (per revision)

    master_regex_one_df = master_regex_one_df.withColumn("regexes_diff_bool", f.when(master_regex_one_df.regexes == master_regex_one_df.regexes_prev, 0).otherwise(1))
    master_regex_one_df = master_regex_one_df.withColumn("core_diff_bool", f.when(master_regex_one_df.core_regexes == master_regex_one_df.core_prev, 0).otherwise(1))

    master_regex_one_df.orderBy('articleid','YYYY_MM','date_time').show(n=100)
Example #11
    df_cleanning = df_cleanning.withColumn(
        "NAME_CHANGE",
        pudf_name_replace(df_cleanning.PROVINCE, df_cleanning.CITY,
                          df_cleanning.DISTRICT, df_cleanning.NAME))

    # 2. Do the join
    df_cleanning = df_cleanning.join(broadcast(df_standard),
                                     on=["PROVINCE", "CITY"],
                                     how="left")
    df_cleanning.persist()
    df_not_match = df_cleanning.where(isnull(df_cleanning.STANDARD_NAME))
    df_cleanning = df_cleanning.where(~isnull(df_cleanning.STANDARD_NAME))
    df_cleanning = df_cleanning.withColumn(
        "SIMILARITY",
        efftiveness_with_jaro_winkler_similarity_in_hc_mapping(
            df_cleanning.NAME_CHANGE, df_cleanning.STANDARD_NAME))
    windowSpec = Window.partitionBy("ID").orderBy(desc("SIMILARITY"))
    df_cleanning = df_cleanning.withColumn("RANK", rank().over(windowSpec))
    df_cleanning = df_cleanning.where(df_cleanning.RANK == 1)
    df_cleanning.orderBy("ID").repartition(1).write.mode("overwrite").option(
        "header", "true"
    ).csv(
        "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/tmp/chc_hc_pov_mapping/hc_result"
    )
    # df_not_match.repartition(1).orderBy("ID").write.mode("overwrite").option("header", "true").csv("s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/tmp/chc_hc_pov_mapping/hc_result_not_match")
    df_not_match.union(df_drop).repartition(1).write.mode("overwrite").option(
        "header", "true"
    ).csv(
        "s3a://ph-max-auto/2020-08-11/BPBatchDAG/refactor/alfred/tmp/chc_hc_pov_mapping/hc_result_not_match"
    )
Example #12
#print(trainDF.count())
#print(valDF.count())

print('done with train validation split')

# filter to just trainDF for globalMean
globalMean = trainDF.groupBy().avg("rating").take(1)[0][0]

print('done with globalMean calculation')

# find user mean and book mean of trainDF and add to trainDF
# add columns with globalMean, userMean and bookMean to trainDF

trainDF = trainDF.withColumn("globalMean", lit(globalMean)).cache()

w = Window.partitionBy('userIdNum')
q = Window.partitionBy('bookIdNum')

trainDF = trainDF.select('time', 'userIdNum', 'bookIdNum', 'rating', 'globalMean', avg('rating').over(w).alias('userMean'))\
                    .select('time', 'userIdNum', 'bookIdNum', 'rating', 'globalMean','userMean', avg('rating').over(q).alias('bookMean'))\
                        .cache()#.sort('userIdNum', 'bookIdNum').cache()

print(
    "done with trainDF add columns globalMean userMean bookMean create and cache"
)

# add columns for user bias and book bias
trainDF = trainDF.withColumn('userBias', col('userMean') - col('globalMean'))\
                    .withColumn('bookBias', col('bookMean') - col('globalMean'))\
                        .cache()
Example #13
df_areas.filter(df_areas.payment_type == "paypal").groupBy(df_areas.areas).agg(
    sf.count(df_areas.areas).alias("Paypal_by_areas")) \
    .orderBy("Paypal_by_areas", ascending=False).show(1)

# b. Top 3 most purchased products in each area
print "Top purchased products by area (total):"
df_areas.groupBy(df_areas.areas, df_areas.product_id, df_areas.item_type) \
    .agg(sf.count(df_areas.product_id).alias("top_items_by_area")) \
    .orderBy("top_items_by_area", ascending=False).show()

temporal = df_areas.groupBy(df_areas.areas, df_areas.product_id, df_areas.item_type) \
    .agg(sf.count(df_areas.product_id).alias("top_items_by_area")) \
    .orderBy(df_areas.areas, "top_items_by_area", ascending=False)

print "Top 3 most purchased products in each area:"
temporal.withColumn("top_products", sf.row_number().over(Window.partitionBy("areas"). \
                                                         orderBy(temporal['top_items_by_area'].desc()))).filter(
    sf.col('top_products') <= 3).show()

# c. Area that has billed the least money
print "Least sales by areas:"
df_areas.groupBy(df_areas.areas) \
    .agg(sf.sum(df_areas.price).alias("Top_Sales")) \
    .orderBy("Top_Sales", ascending=True).show()

# 6. Products that do not have enough stock for purchases made
print "Read the stock.csv file:"
print ""

# DataFrame creation from CSV file
df_stock = spark.read.option("header", "true") \
    .option("inferSchema", "true") \
    .csv("data/stock.csv")
Example #14
def nb_previous_ratings(df):
    df = df.withColumn("nb_previous_ratings",
                       rank().over(Window.partitionBy("userId").
                                   orderBy("timestamp")) - 1)
    return df
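rank() gives tied timestamps the same value, so nb_previous_ratings can repeat when a user rates several items at the same instant. A hedged variant, not part of the original, that uses row_number() for a strict count:

from pyspark.sql import Window
from pyspark.sql.functions import row_number

def nb_previous_ratings_strict(df):
    # row_number() is strictly increasing even for duplicate timestamps.
    w = Window.partitionBy("userId").orderBy("timestamp")
    return df.withColumn("nb_previous_ratings", row_number().over(w) - 1)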
Example #15
def main():
    # sensor_data_df = spark.read.format("org.apache.spark.sql.cassandra").options(table=sensor_data_table,
    #                                                                              keyspace=keyspace).load()
    sensor_data_df = spark.read.parquet('sensor_data_ts')
    """ REGRESSION PROBLEM FOR GAS DATA PREDICTION"""
    regression_df = sensor_data_df.select(
        sensor_data_df['datetime'], sensor_data_df['H2S'],
        sensor_data_df['CO'], sensor_data_df['LEL'], sensor_data_df['O2'],
        sensor_data_df['message_code_name']).orderBy('datetime')

    day_df = regression_df.withColumn(
        'date', regression_df['datetime'].cast(types.DateType()))
    # day_df.show()
    group_df = day_df.groupby(day_df['date']).agg(
        functions.max('H2S').alias('H2S_max'))

    # Window operation over the date column where max(H2S) of the next day will be assigned to each row
    w = Window.partitionBy().orderBy('date')
    max_H2S_tmrw_df = group_df.withColumn(
        'max_H2S_tmrw',
        functions.lead('H2S_max').over(w)).cache()
    max_H2S_tmrw_df.show()

    final_df = day_df.join(max_H2S_tmrw_df, 'date').repartition(100)
    final_df = final_df.withColumn('timestamp', functions.unix_timestamp(final_df['datetime'])) \
        .orderBy(final_df['datetime']).dropna()

    split_date = datetime(2020, 11, 1)
    train_set = final_df.where(functions.col('datetime') < split_date) \
        .select(final_df['CO'], final_df['LEL'], final_df['O2'], final_df['H2S'], final_df['H2S_max'],
                final_df['max_H2S_tmrw'])
    test_set = final_df.where(functions.col('datetime') >= split_date) \
        .select(final_df['CO'], final_df['LEL'], final_df['O2'], final_df['H2S'], final_df['H2S_max'],
                final_df['max_H2S_tmrw']).cache()

    x_train, x_val = train_set.randomSplit([0.75, 0.25])
    x_train = x_train.cache()
    x_val = x_val.cache()

    feature_assembler = VectorAssembler(inputCols=['H2S', 'CO', 'O2', 'LEL'],
                                        outputCol='features')
    # pca = PCA(k=10, inputCol="Features", outputCol="pcaFeatures")

    gbt = GBTRegressor(featuresCol='features', labelCol='max_H2S_tmrw')
    # dt = DecisionTreeRegressor(featuresCol='features', labelCol='max_H2S_tmrw')
    # rf = RandomForestRegressor(featuresCol='features', labelCol='max_H2S_tmrw')
    # fm = FMRegressor(featuresCol='features', labelCol='max_H2S_tmrw')

    pipeline = Pipeline(stages=[feature_assembler, gbt])
    # pipeline = Pipeline(stages=[feature_assembler, dt])
    # pipeline = Pipeline(stages=[feature_assembler, rf])
    # pipeline = Pipeline(stages=[feature_assembler, fm])

    model_fit = pipeline.fit(x_train)

    y_hat = model_fit.transform(x_val)

    rmse_evaluator = RegressionEvaluator(predictionCol='prediction',
                                         labelCol='max_H2S_tmrw',
                                         metricName='rmse')
    rmse_score = rmse_evaluator.evaluate(y_hat)
    print('rmse validation score: ', rmse_score)

    r2_evaluator = RegressionEvaluator(predictionCol='prediction',
                                       labelCol='max_H2S_tmrw',
                                       metricName='r2')
    r2_score = r2_evaluator.evaluate(y_hat)
    print('r2 validation score : ', r2_score)

    print(model_fit.stages[-1].featureImportances)
Example #16
    today = date.today().isoformat()
    prices_path = "s3://" + bucket_name + "/data/price-data-" + today + ".csv"
    df = spark.read.csv(prices_path, header=True)

    # extract day of year from timestamp and convert column types
    df = df.withColumn("timeframe", col("timeframe").cast(TimestampType()))
    df = df.withColumn("dayofyear", dayofyear("timeframe"))
    for col_name in df.columns[2:]:
        df = df.withColumn(col_name, col(col_name).cast(FloatType()))

    # filter data from last two weeks
    # two_weeks_ago = (date.today() - timedelta(days=13))
    # df = df.filter(to_date(df["timeframe"]) > two_weeks_ago)

    # extract open, high, low, close of the last known price
    window = Window.partitionBy()
    price_data = df.withColumn("last_day", Max(col("dayofyear")).over(window)) \
    .filter(col("dayofyear") == col("last_day")) \
    .drop("max", "dayofyear") \
    .withColumn("high", Max(col("high")).over(Window.partitionBy("symbol").orderBy())) \
    .withColumn("low", Min(col("low")).over(Window.partitionBy("symbol").orderBy()))

    temp = price_data.select("symbol","timeframe", "open") \
    .withColumn("open_time", Min(col("timeframe")).over(Window.partitionBy("symbol").orderBy())) \
    .filter(col("timeframe") == col("open_time")) \
    .drop("timeframe", "open_time")

    ohlc = temp.join(price_data.select("symbol", "high", "low", "close", "last_day", "timeframe") \
            .withColumn("close_time", Max(col("timeframe")).over(Window.partitionBy("symbol").orderBy())) \
            .filter(col("timeframe") == col("close_time")) \
            .drop("timeframe", "open_time", "last_day", "close_time"), ["symbol"])
Example #17
def get_pasr_realtime_value(spark_df, max_cal_period):
    """
    实时计算pasr的值 按照存储过程cd_11_sec.dbo.compute_mkt_pasr_v2翻译过来 未做改动
    :param spark_df: 行情数据 含 open high low close 为spark类型dataframe
    :param max_cal_period: 计算指标值需要多少根bar
    :return:
    """
    @func.pandas_udf("sec_cd string,var_cl string,mkt_cl string," \
                     "pub_dt timestamp," \
                     "pasr float,raw_pi string, f10 float", func.PandasUDFType.GROUPED_MAP)
    def get_pasr(group):
        group = group.sort_values('pub_dt')
        sar_t = group['low'].iloc[:5].min()
        ep_t = group['high'].iloc[:5].max()
        ep_sar_t = sar_t - ep_t
        tempVal_h = group.iloc[4]['high']
        tempVal_l = group.iloc[4]['low']
        p_t1 = 'Long'
        if p_t1 == 'Long':
            if tempVal_l > sar_t:
                p_t = 'Long'
            else:
                p_t = 'Short'
        else:
            if tempVal_h < sar_t:
                p_t = 'Short'
            else:
                p_t = 'Long'
        if p_t != p_t1:
            af_t = 0.02
            afd_t = af_t * ep_sar_t
        else:
            af_t = np.nan
            afd_t = np.nan
        step = 0.02
        z = 0.2
        y = 0.02
        is_updt = 0
        n = len(group)
        pasr_list = [np.nan] * n
        pi_list = [''] * n
        f10_list = [np.nan] * n
        pasr_list[4] = sar_t
        pi_list[4] = p_t
        for i in range(5, n):
            sar_t1 = sar_t
            p_t1 = p_t
            ep_t1 = ep_t
            afd_t1 = afd_t
            af_t1 = af_t
            ep_sar_t1 = ep_sar_t
            l_t = group.iloc[i]['low']
            h_t = group.iloc[i]['high']
            if p_t1 == 'Long' and sar_t1 + afd_t1 < l_t:
                sar_t = sar_t1 + afd_t1
                if h_t > ep_t1:
                    ep_t = h_t
                else:
                    ep_t = ep_t1
                is_updt = 1
                if ep_t > h_t:
                    af_t = af_t1
                elif af_t1 + step < z:
                    af_t = af_t1 + step
                else:
                    af_t = z
            if p_t1 == 'Long' and is_updt == 0:
                p_t = 'Short'
                ep_t = l_t
                sar_t = ep_t1
                af_t = y
                is_updt = 1
            if is_updt == 0 and sar_t1 - afd_t1 > h_t:
                p_t = 'Short'
                sar_t = sar_t1 - afd_t1
                if l_t < ep_t1:
                    ep_t = l_t
                else:
                    ep_t = ep_t1
                if ep_t < l_t:
                    af_t = af_t1
                elif af_t1 + step < z:
                    af_t = af_t1 + step
                else:
                    af_t = z
                is_updt = 1
            if is_updt == 0:
                p_t = 'Long'
                sar_t = ep_t1
                ep_t = h_t
                af_t = y
                is_updt = 1
            ep_sar_t = abs(ep_t - sar_t)
            afd_t = ep_sar_t * af_t
            is_updt = 0
            pasr_list[i] = sar_t
            pi_list[i] = p_t
            f10_list[i] = afd_t
        group['pasr'] = pasr_list
        group['raw_pi'] = pi_list
        group['f10'] = f10_list
        group = group[['sec_cd', 'var_cl', 'mkt_cl', 'pub_dt', 'pasr', 'raw_pi', 'f10']]
        return group
    spark_df = spark_df.groupby('sec_cd').apply(get_pasr)
    # get the number of bars (periods) for each security
    w = Window.partitionBy('sec_cd')
    spark_df = spark_df.withColumn('sec_length', func.count('pub_dt').over(w))
    spark_df = spark_df.withColumn('cal_threshold', func.when(func.col('sec_length') < max_cal_period,
                                                           func.col('sec_length')-1).otherwise(max_cal_period - 1))
    w = Window.partitionBy('sec_cd').orderBy('pub_dt')
    spark_df = spark_df.withColumn('row_no', func.count('pub_dt').over(w))
    spark_df = spark_df.filter(spark_df.row_no >= spark_df.cal_threshold)
    spark_df = spark_df.select('sec_cd', 'var_cl', 'mkt_cl', 'pub_dt', 'pasr', 'raw_pi','f10')
    return spark_df
Example #18
     StructField('timestamp', StringType(), True),
     StructField('open', FloatType(), True),
     StructField('high', FloatType(), True),
     StructField('low', FloatType(), True),
     StructField('close', FloatType(), True),
     StructField('vol', FloatType(), True),
 ])
 logging.info('Successfully fetched data from the real-time interface')
 spark.conf.set("spark.sql.execution.arrow.enabled", "true")
 k_line_df = spark.createDataFrame(result_df, schema=schema)
 k_line_df = k_line_df.withColumn('var_cl', func.lit('A'))\
     .withColumn('mkt_cl', func.when(func.substring('sec_cd', 8, 9) == 'SZ', 'Z').otherwise('S'))
 udf_time_format = func.udf(lambda x: datetime.fromtimestamp(x), TimestampType())
 k_line_df = k_line_df.withColumn('pub_dt', udf_time_format(func.substring('timestamp', 1, 10).cast(IntegerType())))
 # filter out securities with insufficient data
 w = Window.partitionBy('sec_cd')
 k_line_df = k_line_df.withColumn('sec_length', func.count('pub_dt').over(w))
 k_line_df = k_line_df.filter(k_line_df.sec_length >= min_cal_period)
 k_line_df = k_line_df.drop('sec_length', 'timestamp')
 # get the corrected PASR value and the corrected signal
 pasr_spark_df = get_pasr_realtime_value(k_line_df, max_cal_period)
 dmi_spark_df = get_dmi_realtime_value(k_line_df, max_cal_period)
 pasr_spark_df = pasr_spark_df.join(dmi_spark_df, on=['sec_cd', 'var_cl', 'mkt_cl','pub_dt'], how='inner')
 pasr_spark_df = get_amended_pasr(pasr_spark_df)
 # get the KDJ value and signal
 kdj_spark_df = get_kdj_realtime_value(k_line_df, max_cal_period)
 kdj_spark_df = get_kdj_realtime_signal(kdj_spark_df)
 # get the PASR value, PASR signal, KDJ value, and KDJ signal
 all_spark_df = pasr_spark_df.join(kdj_spark_df, on=['sec_cd', 'var_cl', 'mkt_cl', 'pub_dt'], how='inner')
 # combine PASR and KDJ to derive the PK signal; signals that cannot be determined are replaced with -1
 all_spark_df = all_spark_df.withColumn('pk_sign', func.when((func.col('pi') == 'Long'), 1)\
Example #19
  .read \
  .format('csv') \
  .options(delimiter = ',') \
  .load('datafeed') \
  .toDF('control_area', 'unit', 'scp', 'raw_station', 'lines', 'division', 'raw_date', 'time', 'description', 'raw_entries', 'raw_exits')

# CLEAN AND CAST RELEVANT VARIABLES
df = df \
  .withColumn('date', to_timestamp('raw_date', 'MM/dd/yyyy').cast(DateType())) \
  .withColumn('hour', split('time', ':').getItem(0).cast(IntegerType())) \
  .withColumn('total_entries', df['raw_entries'].cast(LongType())) \
  .withColumn('total_exits', split('raw_exits', ' ').getItem(0).cast(LongType()))

# CALCULATE HOURLY ENTRIES AND EXITS
df = df.withColumn('dummy', lit(1))
window = Window.partitionBy('dummy').orderBy('control_area', 'unit', 'scp',
                                             'date', 'hour')

df = df \
  .withColumn('last_entries', lag('total_entries', 1).over(window)) \
  .withColumn('last_exits', lag('total_exits', 1).over(window))

df = df \
  .withColumn('entries', when(df['total_entries'] >= df['last_entries'], df['total_entries'] - df['last_entries']).otherwise(df['total_entries'])) \
  .withColumn('exits', when(df['total_exits'] >= df['last_exits'], df['total_exits'] - df['last_exits']).otherwise(df['total_exits']))

# FILTER FOR OUTLIERS AND RELEVANT TIME PERIOD 3/1/2015 - 4/30/2020
df = df \
  .filter(df['entries'] <= 88000) \
  .filter(df['exits'] <= 88000) \
  .filter(df['date'] >= '2015-03-01') \
  .filter(df['date'] <= '2020-04-30')
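Partitioning by the constant 'dummy' column pushes every row through a single window partition. A sketch of an alternative, not part of the original pipeline, that partitions the lag by turnstile key so it never crosses devices and Spark can parallelize the work; it reuses the df built above:

from pyspark.sql import Window
from pyspark.sql.functions import lag

turnstile_window = Window.partitionBy('control_area', 'unit', 'scp') \
                         .orderBy('date', 'hour')

df = df \
  .withColumn('last_entries', lag('total_entries', 1).over(turnstile_window)) \
  .withColumn('last_exits', lag('total_exits', 1).over(turnstile_window))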
Example #20
    def processData(sc,hc,fs,con,incFileName,inThresh,outThresh,progTag):
        #incFileName="hdfs://namenode.navroomhdp.com:8020/data/t_cf_inc/100033/t_cf_20161028.txt"
        
        #inThresh=10
        #outThresh=300
        
        #**************************************
        #
        # this procedure uses the inc file to calculate:
        #   flow (one record per day), stored as a file
        #   indoor (one record per day), stored as a file
        #   indoor_for_delete (every indoor record), stored in HBase
        #   indoor detail (every indoor record), stored in HBase and as a file
        #
        # destIndoorFile     : /data/indoor/entityid/year/id_date.json      used to generate reports
        # destFlowFile       : /data/flow/entityid/year/fl_date.json        used to generate reports
        # rec_destIndoorfile : /data/rec_indoor/entityid/year/id_date.json  this folder is a mirror of the HBase records
        #
        #
        #**************************************

        destIndoorFile=get_str_indoorFileName(incFileName)
        #hdfs://namenode.navroomhdp.com:8020/data/indoor/100033/2016/id_20161028.txt
        rec_destIndoorfile=destIndoorFile.replace("/indoor/","/rec_indoor/")
        #hdfs://namenode.navroomhdp.com:8020/data/rec_indoor/101762/2016/id_20161011.txt
        destFlowFile  =destIndoorFile.replace("/indoor/","/flow/").replace("id_","fl_")
        #hdfs://namenode.navroomhdp.com:8020/data/flow/101762/2016/fl_20161011.txt
        tmp_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/indoor"+str(progTag)
        tmp_destFlowFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/flow"+str(progTag)
        tmp_rec_destIndoorFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/rec_indoor"+str(progTag)
        EntityID=int(get_str_entityID(incFileName))
        #101762
        histFileName=get_str_histFileName(incFileName) #processed file will be place here
        #hdfs://namenode.navroomhdp.com:8020/data/t_cf/101762/t_cf_20161011.txt
        if fs.exists(sc._jvm.Path(histFileName)):
            tmpFileName=get_str_tmpFileName(histFileName)
            #tmpFileName = hdfs://namenode.navroomhdp.com:8020/data/tmp/101762/t_cf_20161011.txt
            tmpFolderName=tmpFileName.rsplit('/',1)[0]+"tmp"
            #tmpFolderName=hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp
            #copy the hist file to the temp folder as hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp/hist and destroy the original hist file
            sc._jvm.FileUtil.copy(fs,sc._jvm.Path(histFileName),fs,sc._jvm.Path(tmpFolderName+"/hist"),True,True,con) 
            #copy inc file to temp folder and name it as hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp/inc and  destroy the inc file
            sc._jvm.FileUtil.copy(fs,sc._jvm.Path(incFileName),fs,sc._jvm.Path(tmpFolderName+"/inc"),True,True,con)
            #copymerge the 2 files (inc and hist) into one file 
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmpFolderName),fs,sc._jvm.Path(tmpFileName),True,con,None)
            sc._jvm.FileUtil.copy(fs,sc._jvm.Path(tmpFileName),fs,sc._jvm.Path(incFileName),True,True,con)

        unixFirtDayofMonth = get_int_firstDayUnixDate(incFileName)
        # unixFirtDayofMonth = 1475251200, which is 2016-10-01 as a unix date
        startUnixTime=get_int_fileNameUnixDate(incFileName) #1456808400 this is today's unix datetime
 
        rows_t_cf=sc.textFile(incFileName).map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], stime=p[1],flag=p[2]))
        HiveContext.createDataFrame(hc,rows_t_cf).registerTempTable("t_cf_inc_tmp")
        hc.sql("select distinct clientmac,stime,flag from t_cf_inc_tmp").registerTempTable("t_cf")
        
        df=hc.sql("select distinct ClientMac,stime ,lag(stime) over (partition by ClientMac order by stime) as lag_time ,lead(stime) over (partition by ClientMac order by stime) as lead_time from t_cf where flag=1")
        df1=df.withColumn("diff" , df["stime"]-df["lag_time"]).na.fill(-1)
        df1.filter((df1.diff>=outThresh)|(df1.lag_time ==-1)|( df1.lead_time==-1)).registerTempTable("df2")
        df2=hc.sql("select ClientMac,stime,lag_time,lead_time,case when (diff < "+ str(outThresh) +" and diff>0) then diff ELSE 0 end as diff from df2")
        df3=df2.withColumn("lag_time1",df2.lag_time+df2.diff).drop( "lag_time")
        df3.withColumn("lag_time2",func.lead("lag_time1").over(Window.partitionBy("clientMac"))).registerTempTable("df3")
        
        df4=hc.sql("select ClientMac,cast(stime as int) as ETime ,cast(lag_time2 as int) as LTime,cast((lag_time2- stime) as int) as Seconds from df3").na.fill(-1)
        df5=df4.filter((df4.LTime>0)&(df4.Seconds>=inThresh)&(df4.ETime>startUnixTime)&(df4.ETime<(startUnixTime+86400))).withColumn("ENTITYID",lit(EntityID)) #86400 is the number of seconds in one day
        df5.registerTempTable("df5")
        #df5 will be saved to HBase as indoor details (rec_destIndoorfile); df6 and df7 will be used for stats calculation
        
        df6=hc.sql("select ClientMac,ETime, LTime, Seconds ,unix_timestamp(date_sub(from_unixtime(etime),0),'yyyy-MM-dd') as utoday from df5")
        df6.registerTempTable("df6_indoor")
        df7=hc.sql("select ClientMac,min(etime) as etime,max(ltime) as ltime,sum(Seconds) as seconds,utoday from df6_indoor group by ClientMac,utoday")
        df_current_result=df7.withColumn("ENTITYID",lit(EntityID)).withColumn('UFIRSTDAY',lit(unixFirtDayofMonth))

        flow_sql=  "select ClientMac,min(stime) as etime,max(stime) as ltime from t_cf where stime >"+str(startUnixTime) + " and stime <"+str(startUnixTime+86400)+" group by clientmac"
        hc.sql(flow_sql).registerTempTable("df_flow_tmp")
        df_flow=hc.sql("select ClientMac,etime,ltime,unix_timestamp(date_sub(from_unixtime(etime),0),'yyyy-MM-dd') as utoday from df_flow_tmp").withColumn("ENTITYID",lit(EntityID)).withColumn('UFIRSTDAY',lit(unixFirtDayofMonth))
       
        #df_flow.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_FLOW_TODAY") .option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
        #df_flow.saveAsTable("T_FLOW")
        if len(df5.head(1))==1:  #df5 is not empty; this is cheaper than df5.rdd.isEmpty()
            tmp_rec_destIndoorFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/rec_indoor"+str(progTag)
            df5.select('clientmac','entityid','etime','ltime','seconds').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_rec_destIndoorFolder)            
            #df5.write.mode('overwrite').json(tmp_rec_destIndoorFolder)
            df5.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_INDOOR") .option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
            if fs.exists(sc._jvm.Path(rec_destIndoorfile)):  #the old indoor folder exists; generate df_delete_pk so Phoenix can delete the invalid rows
                rows_rec_indoor=sc.textFile(rec_destIndoorfile).map(lambda r: r.split(",")).map(lambda p: Row(clientmac=str(p[0]), entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4])))
                HiveContext.createDataFrame(hc,rows_rec_indoor).registerTempTable("df_old_indoor")
                df_old_indoor_pk=hc.sql("select ClientMac,ENTITYID,ETime from df_old_indoor") 
                df_current_result_pk=hc.sql("select ClientMac,ENTITYID,ETime from df5") 
                df_delete_pk = df_old_indoor_pk.subtract(df_current_result_pk)
                if len(df_delete_pk.head(1))==1:
                    df_delete_pk.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_INDOOR_FOR_DELETE").option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
        else:
            tmp_rec_destIndoorFolder="NONE"
            
        if len(df_flow.head(1))==1:
            tmp_destFlowFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/flow"+str(progTag)
            df_flow.select('clientmac','entityid','etime','ltime','utoday','ufirstday').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_destFlowFolder)
            #df_flow.write.mode('overwrite').json(tmp_destFlowFolder)
        else:
            tmp_destFlowFolder="NONE"
            
        if len(df_current_result.head(1))==1:
            tmp_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/indoor"+str(progTag)
            df_current_result.select('clientmac','entityid','etime','ltime','seconds','utoday','ufirstday').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_destIndoorFolder)
            #df_current_result.write.mode('overwrite').json(tmp_destIndoorFolder)
        else:
            tmp_destIndoorFolder="NONE"
        
        sc._jvm.FileUtil.copy(fs,sc._jvm.Path(incFileName),fs,sc._jvm.Path(histFileName),True,True,con) 

        if fs.exists(sc._jvm.Path(destIndoorFile)):
            fs.delete(sc._jvm.Path(destIndoorFile))
        if fs.exists(sc._jvm.Path(destFlowFile)):
            fs.delete(sc._jvm.Path(destFlowFile))
        if fs.exists(sc._jvm.Path(rec_destIndoorfile)):
            fs.delete(sc._jvm.Path(rec_destIndoorfile))        
        #deleting is a must if the file already exists, otherwise copyMerge will fail
        
        if tmp_destIndoorFolder!="NONE":
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_destIndoorFolder),fs,sc._jvm.Path(destIndoorFile),True,con,None)
            #destIndoorFile=get_str_indoorFileName(incFileName)
        if tmp_destFlowFolder!="NONE":
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_destFlowFolder),fs,sc._jvm.Path(destFlowFile),True,con,None)
        if tmp_rec_destIndoorFolder!="NONE":
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_rec_destIndoorFolder),fs,sc._jvm.Path(rec_destIndoorfile),True,con,None)
Example #21
# Case 2 #

SparkConf().setAppName("Spark_DF")

#imports
from pyspark.sql import Window
from pyspark.sql.functions import col, rank

#Creating Df
df = spark.createDataFrame(
    [
        (1,'São Paulo', 'SP', '01/01/2021'),
        (2,'São Paulo', 'SP', '02/01/2021'),
        (3,'Guarulhos', 'SP', '03/01/2021'),
        (4,'Guarulhos', 'SP', '04/01/2021'),
        (5,'Campinas', 'SP', '05/01/2021'),
        (6,'Campinas', 'SP', '07/01/2021'),
        (7,'São Paulo', 'SP', '08/01/2021'),
        (8,'Guarulhos', 'SP', '10/01/2021'),
        (9,'Campinas', 'SP', '15/01/2021')
    ],
    ['Transacao','Municipio', 'Estado', 'DtAtualizacao']
)

# ranking transactions by municipio considering date
window = Window.partitionBy("Municipio").orderBy(col("DtAtualizacao").asc())
df1 = df.withColumn("OrdemTransacao",rank().over(window))
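DtAtualizacao is a 'dd/MM/yyyy' string, so the ordering above is lexicographic; it happens to work for this January-only sample but would misorder dates across months. A hedged variant that parses the string before ordering:

from pyspark.sql.functions import to_date

# Parse the string into a date so February rows cannot sort before January ones.
window = Window.partitionBy("Municipio").orderBy(
    to_date(col("DtAtualizacao"), "dd/MM/yyyy").asc())
df1 = df.withColumn("OrdemTransacao", rank().over(window))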
Example #22
def avg_ratings_previous(df):
    # Average of all of the user's ratings strictly before the current row.
    window_spec = Window.partitionBy("userId").orderBy("userId", "timestamp") \
        .rowsBetween(Window.unboundedPreceding, -1)
    df = df.withColumn("avg_ratings_previous", F.avg(F.col("rating")).over(window_spec))
    return df
Example #23
    .otherwise(0)
)

# COMMAND ----------

from pyspark.sql import Window

# COMMAND ----------

prediction_summary = (post_model_predictions.groupBy(
    F.window(F.col('timestamp'), '1 day').alias('window'),
    F.col('predicted_quality')).count().withColumn(
        'window_day', F.expr('to_date(window.start)')).withColumn(
            'total',
            F.sum(F.col('count')).over(
                Window.partitionBy('window_day'))).withColumn(
                    'ratio',
                    F.col('count') * 100 / F.col('total')).select(
                        'window_day', 'predicted_quality', 'count', 'total',
                        'ratio').orderBy('window_day'))
display(prediction_summary)

# COMMAND ----------

# MAGIC %md
# MAGIC ### Trend Showing Model Drift

# COMMAND ----------

accurate_prediction_summary = (post_model_predictions.groupBy(
    F.window(F.col('timestamp'), '1 day').alias('window'),
Example #24
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Save Instant Variates
"""
from __future__ import unicode_literals
from pyspark.sql import Window
from pyspark.sql import functions as F

__author__ = 'WiGi'
__mtime__ = 'Nov 2, 2018'

PORTFOLIO_ACCUM_RET_TABLE = 'fbidm.fnd_portfolio_accum_ret'
RISK_FREE_RATE_ADJUST_TABLE = 'fdm_sor.sor_fbicics_t_eastmoney_yhll'
SECURITY_TRADING_DAY_TABLE = 'fdm_dpa.dpa_zt_tradingday_day'
INTERNAL_WORKDAYS = 'fdm_sor.sor_bps_working_day_config'

w_order = Window.orderBy('effective_date') \
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)

w_h = Window.partitionBy('com_id').orderBy('trading_date') \
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)

w_y = Window.partitionBy('com_id').orderBy('trading_date') \
    .rowsBetween(-242, Window.currentRow)

w_unbnd = Window.partitionBy('com_id').orderBy('trading_date')

w_h_desc = Window.partitionBy('com_id').orderBy(F.desc('trading_date')) \
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
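These specs define the frames used downstream: a cumulative frame per instrument (w_h), a trailing window of roughly one trading year (w_y, 242 preceding rows plus the current one), an unframed ordered spec (w_unbnd), and a descending-date cumulative frame (w_h_desc). A minimal sketch of applying w_h, assuming a local SparkSession and a hypothetical daily-return column:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("accum-ret-sketch").getOrCreate()

# Hypothetical per-day returns; com_id and trading_date mirror the spec columns above.
df = spark.createDataFrame(
    [('600000', '2018-11-01', 0.010), ('600000', '2018-11-02', -0.020)],
    ['com_id', 'trading_date', 'daily_ret'],
)
# Running total per instrument using the cumulative frame w_h defined above.
df.withColumn('cum_ret', F.sum('daily_ret').over(w_h)).show()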
Example #25
dfl.select(dfl.l.getItem(0), dfl.d.getItem("key")).show()

# In[188]:

dfl.select(dfl.l[0], dfl.d["key"]).show()

# In[189]:

from pyspark.sql.functions import *
from pyspark.sql import functions as F
df.select(df.name, F.when(df.age > 3, 1).otherwise(0)).show()

# In[190]:

from pyspark.sql import Window
window = Window.partitionBy("name").orderBy("age").rowsBetween(-1, 1)
from pyspark.sql.functions import rank, min

# In[191]:

df.select(df.name.substr(1, 3).alias("col")).collect()

# In[192]:

df.show()

# In[193]:

df.select(df.name,
          F.when(df.age > 4, 1).when(df.age < 3, -1).otherwise(0)).show()
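The (-1, 1) rolling window defined above is never applied in this excerpt. A hypothetical use, assuming df has the name and age columns seen in the surrounding cells:

# Minimum age within a one-row neighborhood (previous, current, next) per name.
df.select(df.name, df.age,
          F.min(df.age).over(window).alias("min_age_neighbors")).show()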
Example #26
def createDataFile(
    start_date, end_date, spark_instance, jackknife_buckets, sample_percent, output_path
):
    feature_data_phase1 = spark_instance.table(_TABLE_SOURCE).select(
        [
            _COL_ID.alias("id"),
            _DATE_PARSED.alias("date"),
            # TODO: Use MD5 instead of CRC32
            (F.floor(F.crc32(_COL_ID) / 100) % jackknife_buckets).alias("bucket"),
            lit(1).alias("is_active"),
            F.when(
                _COL_URI_COUNT >= _NUM_ADAU_THRESHOLD, 1
            ).otherwise(
                0
            ).alias("is_active_active"),
            F.to_date(_COL_PC_DATE).alias("profile_creation_date")
        ] + _MAP_NATURAL_DIMENSIONS.keys()
    ).filter(
        (_DATE_PARSED.between(start_date, end_date)) &
        (_COL_SAMPLE < sample_percent)
    ).withColumn(
        "young_profile",
        F.when(
            col("date") < F.date_add(col("profile_creation_date"), 14), "TRUE"
        ).otherwise(
            "FALSE"
        )
    )

    new_profile_window = Window.partitionBy(col("id")).orderBy(col("date"))
    new_profile_data = feature_data_phase1.filter(
        (col("date") >= col("profile_creation_date")) &
        (col("date") <= F.date_add(col("profile_creation_date"), 6))
    ).select(
        "*",
        F.rank().over(new_profile_window).alias('rank')
    ).filter(
        col('rank') == 1
    ).withColumn(
        "new_profile", lit(1)
    ).drop(
        "date"
    ).withColumn(
        "date", col("profile_creation_date")
    )

    feature_data = feature_data_phase1.alias("fd").join(
        new_profile_data.alias("np"),
        (col("fd.id") == col("np.id")) & (col("fd.date") == col("np.date")),
        how='full',
    ).select(
        [F.coalesce(col("np.new_profile"), lit(0)).alias("new_profile")] +
        [F.coalesce(col("fd.is_active"), lit(0)).alias("is_active")] +
        [F.coalesce(col("fd.is_active_active"), lit(0)).alias("is_active_active")] +
        [
            F.coalesce(col("fd.{}".format(c)), col("np.{}".format(c))).alias(c)
            for c in feature_data_phase1.columns
            if c not in ["is_active", "is_active_active"]
        ]
    )

    once_ever_profiles = feature_data.filter(col("is_active") == 1).groupBy(
        "id"
    ).count(
    ).filter(
        col("count") == 1
    ).select(
        "id"
    ).withColumn(
        "single_day_profile", lit("1")
    )

    feature_data = feature_data.alias("fd").join(
        once_ever_profiles.alias("oep"),
        "id",
        "outer"
    ).fillna(
        {"single_day_profile": "0"}
    )

    ppi_profiles = spark_instance.table("main_summary").select(
      col("client_id").alias("id"),
      lit(1).alias("ppi")
    ).filter(
        '''submission_date_s3 >= '20190121'
        AND scalar_parent_startup_profile_selection_reason IN (
            'firstrun-skipped-default', 'restart-skipped-default'
        )'''
    )

    feature_data = feature_data.alias("fd").join(
        ppi_profiles.alias("ppip"),
        "id",
        "left"
    ).fillna(
        {"ppi": 0}
    )

    feature_data.write.partitionBy("date").mode('overwrite').parquet(output_path)
Example #27
def mysolution(sc,spark):
	# Load data/DataSample.csv to Spark DataFrame

	df_dataSample = spark.read.option("header",True).csv("data\DataSample.csv")
	print('\nDisplay Schema of DataSample.csv dataset table\n')
	df_dataSample.printSchema()


	# In[9]:


	#Display the contents of DataSample data
	print('\nDisplay contents of DataSample.csv dataset table\n')
	df_dataSample.show()


	# ### 1. Cleanup
	# 
	# A sample dataset of request logs is given in data/DataSample.csv. We consider records that have identical geoinfo and timest as suspicious. Please clean up the sample dataset by filtering out those suspicious request records.

	# In[22]:


	# Drop duplicate rows based on columns TimeSt, Latitude and Longitude

	df_clean = df_dataSample.dropDuplicates(['Latitude', 'Longitude']).dropDuplicates([' TimeSt'])
	print ('\nDisplay clean dataset after dropping suspicious requests (i.e., duplicate geoinfo and timest)\n')
	df_clean.show()

	print ("\n------------------------END OF ANSWER #1------------------------\n")
	# **End of Answer #1**
	# 
	# ---

	# ### 2. Label
	# Assign each request (from data/DataSample.csv) to the closest (i.e. minimum distance) POI (from data/POIList.csv).
	# 
	# **Note:** A POI is a geographical Point of Interest.

	# In[23]:


	#Load data from data/POIList.csv in Spark Dataframe

	df_poil = spark.read.option("header",True).csv("data\POIList.csv")
	
	print ('\nDisplay Schema and data of POIList dataset table\n')
	df_poil.printSchema()
	df_poil.show(5)


	# In[24]:


	#Convert pois Spark DataFrame to Pandas Dataframe
	df_pd_pois = df_poil.toPandas()


	# In[69]:


	#Python-UDF to find POI with minimum distance to each entry of DataSample
	def myfun(la2, lo2):
		
		min_dis = 1.0e10
		poi_id = df_pd_pois.loc[0,'POIID']
		
		for i, (la1,lo1) in enumerate( zip(df_pd_pois[' Latitude'], df_pd_pois['Longitude'])):
			la1, lo1 = float(la1), float(lo1)
			dis = math.sqrt((la1-la2)**2 + (lo1-lo2)**2)
			if min_dis > dis:
				min_dis = dis
				poi_id = df_pd_pois.loc[i,'POIID']
				
		return ([poi_id, min_dis])

	#Register Python-UDF with Spark-UDF
	myfun_spark = F.udf(myfun, ArrayType(StringType()))

	df_poi = df_clean.withColumn('temp_col', myfun_spark(  F.col('Latitude').cast(FloatType()),
											  F.col('Longitude').cast(FloatType())  )).cache()\
				.withColumn('POI', F.col('temp_col')[0])\
				.withColumn('POI_DIS', F.col('temp_col')[1].cast(DoubleType()))\
				.drop('temp_col')

	print('Display the dataframe with new columns of nearest POI and POI_DIS(i.e, distance to POI from request)')
	df_poi.show(5)

	print ("\n------------------------END OF ANSWER #2------------------------\n")
	# **End of Answer #2**
	# 
	# ---

	# ### 3. Analysis
	# For each POI, calculate the average and standard deviation of the distance between the POI to each of its assigned requests.
	# 
	# At each POI, draw a circle (with the center at the POI) that includes all of its assigned requests. Calculate the radius and density (requests/area) for each POI.

	# In[70]:


	#Group the dataframe df_poi on 'POI' column and calculate average and standard deviation on each group
	df_avgSD = df_poi.groupby('POI').agg(F.avg('POI_DIS').alias('Average'), F.stddev('POI_DIS').alias('Std_Dev'))

	#Left Join df_avgSD dataframe to df_poil dataframe for completeness
	df_avgSD = df_poil.join(df_avgSD, df_poil.POIID == df_avgSD.POI, how = 'Left').drop(df_avgSD.POI)

	print('Display distance Average and Std_Dev for each POI')
	df_avgSD.show()

	print ("Note: Based on above output, it can be concluded that POI2 radius of influence is ZERO\n")
	# **Note:** Based on above output, it can be concluded that POI2 radius of influence is ZERO

	# In[71]:


	#The radius of a POI's influence circle is the distance to its farthest assigned request

	w = Window.partitionBy('POI')

	df_radius = df_poi.withColumn('max_r', F.max('POI_DIS').over(w))\
				.where(F.col('POI_DIS') == F.col('max_r'))\
				.drop('max_r')

	#Left join df_radius onto df_avgSD so each POI keeps its radius
	df_avgSD_r = df_avgSD.join(df_radius['POI', 'POI_DIS'], df_avgSD.POIID == df_radius.POI, how='Left')\
				.drop(df_radius.POI)\
				.withColumnRenamed('POI_DIS', 'POI_RADIUS')

	print('Display the maximum POI_DIS (i.e, POI_RADIUS) values for each group\n')
	df_avgSD_r.show()


	# In[72]:


	#Calculate number of requests for each POI
	df_no_of_req = df_poi.groupby('POI').agg(F.count('POI').alias('Requests'))

	#Append the request count for each POI
	df_poi_req = df_avgSD_r.join(df_no_of_req, df_avgSD_r.POIID == df_no_of_req.POI, 'Left')\
				.drop(df_no_of_req['POI'])

	#Calculate the density (requests per unit area, Requests / (pi * r^2))
	df_poi_density = df_poi_req.withColumn('Density', F.col('Requests') / (math.pi * F.col('POI_RADIUS')**2))

	print('Display No. of Requests and Density for each POI')
	df_poi_density.show()

	print ("\n------------------------END OF ANSWER #3------------------------\n")
	# **End of Answer #3**
	# 
	# ---

	# ### 4. Data Science/Engineering Tracks
	# Please complete either 4a or 4b. Extra points will be awarded for completing both tasks.
	# 
	# #### 4a. Model
	# To visualize the popularity of each POI, they need to be mapped to a scale that ranges from -10 to 10. Please provide a mathematical model to implement this, taking into consideration of extreme cases and outliers. Aim to be more sensitive around the average and provide as much visual differentiability as possible.
	# Bonus: Try to come up with some reasonable hypotheses regarding POIs, state all assumptions, testing steps and conclusions. Include this as a text file (with a name bonus) in your final submission.

	# In[61]:


	#Import PySpark Libraries for Data Analytics
	from pyspark.ml.feature import MinMaxScaler
	from pyspark.ml.feature import VectorAssembler
	from pyspark.ml import Pipeline


	# In[88]:


	df_poi_density_temp = df_poi_density.filter(df_poi_density.Density.isNotNull())
	#df_poi_density_temp.show()


	# In[109]:


	# Spark-udf for converting column from vector type to double type
	myfun_vec2double = F.udf(lambda x: round(float(list(x)[0]),3), DoubleType())

	# Use Spark VectorAssembler Transformation - Converting column to vector type
	assembler = VectorAssembler(inputCols=['Density'],outputCol="Density_Vector")

	# Use Spark MinMaxScaler Transformation to scale the column within (min,max) range
	scaler = MinMaxScaler(min = -10, max = 10, inputCol="Density_Vector", outputCol="Density_Scaled")

	# Create a Spark Pipeline of VectorAssembler and MinMaxScaler
	pipeline = Pipeline(stages=[assembler, scaler])

	#Drop POI2 as outlier 
	df_poi_density_temp = df_poi_density.filter(df_poi_density.Density.isNotNull())

	# Spark fitting pipeline on dataframe
	df_norm = pipeline.fit(df_poi_density_temp).transform(df_poi_density_temp)\
				.withColumn("Density_Scaled", myfun_vec2double("Density_Scaled"))\
				.drop("Density_Vector")

	print('Display scaled density for each POI')
	df_norm.select(*['POIID'], *[F.round(c, 3).alias(c) for c in df_norm.columns[1:] ]).show()


	# In[112]:


	df_lognorm = df_norm.withColumn('log_Density', F.log10(F.col('Density')) )

	# Use Spark VectorAssembler Transformation - Converting column to vector type
	assembler_log = VectorAssembler(inputCols=['log_Density'],outputCol="log_Density_Vector")

	# Use Spark MinMaxScaler Transformation to scale the column within (min,max) range
	scaler_log = MinMaxScaler(min = -10.0, max = 10.0, inputCol="log_Density_Vector", outputCol="log_Density_Scaled")

	# Create a Spark Pipeline of VectorAssembler and MinMaxScaler
	pipeline_log = Pipeline(stages=[assembler_log, scaler_log])


	# Spark fitting pipeline on dataframe
	df_lognorm = pipeline_log.fit(df_lognorm).transform(df_lognorm)\
				.withColumn("log_Density_Scaled", myfun_vec2double("log_Density_Scaled"))\
				.drop("log_Density_Vector")

	print('Display scaled log_density for each POI')
	df_lognorm.select(*['POIID'], *[F.round(c, 3).alias(c) for c in df_lognorm.columns[1:] ]).show()
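
	# Hedged alternative sketch (an assumption, not part of the original answer): a model
	# that is more sensitive around the average maps the z-score of log-density through
	# tanh, scaled = 10 * tanh((x - mean) / (2 * std)), saturating smoothly at +/-10 for
	# extreme values and outliers. It reuses df_lognorm from above.
	stats = df_lognorm.select(
		F.avg('log_Density').alias('mu'),
		F.stddev('log_Density').alias('sigma')).first()
	df_tanh = df_lognorm.withColumn(
		'tanh_scaled',
		10 * F.tanh((F.col('log_Density') - stats['mu']) / (2 * stats['sigma'])))
	df_tanh.select('POIID', F.round('tanh_scaled', 3).alias('tanh_scaled')).show()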


	#Save the interpretation of the results to a 'bonus' file
	bonus = """
	Interpretation:
	The Density column is the ratio of Requests to POI area. log_Density was calculated by taking log10 of the Density values, and it was then scaled to the range (-10, 10) to obtain log_Density_Scaled.

	It is difficult to draw statistical conclusions from only 3 valid POIs.

	Nonetheless, the density values of POI1 and POI3 are three orders of magnitude higher than POI4's, so the Density_Scaled, log_Density and log_Density_Scaled values are also skewed.
	POI1 and POI3 attract more customers (requests) per unit area of influence.

	Assumptions: POI2 was dropped as an outlier. The POI2 data should be investigated to identify the cause of its zero zone of influence; bad data collection or formatting may explain why POI2 is an outlier.
	"""

	with open('bonus', 'w') as f:
		f.write(bonus)

	# 
	# **Interpretation:**
	# The Density column is the ratio of Requests to POI area. log_Density was calculated by taking log10 of the Density values, and it was then scaled to the range (-10, 10) to obtain log_Density_Scaled.
	# 
	# It is difficult to draw statistical conclusions from only 3 valid POIs.
	# 
	# Nonetheless, the density values of POI1 and POI3 are three orders of magnitude higher than POI4's, so the Density_Scaled, log_Density and log_Density_Scaled values are also skewed.
	# POI1 and POI3 attract more customers (requests) per unit area of influence.
	# 
	# **Assumptions:** POI2 was dropped as an outlier. The POI2 data should be investigated to identify the cause of its zero zone of influence; bad data collection or formatting may explain why POI2 is an outlier.

	print ("\n------------------------END OF ANSWER #4a------------------------\n")
	# **End of Answer #4a**
	# 
	# ----

	# #### 4b. Pipeline Dependency
	# We use a modular design on all of our data analysis tasks. To get to a final product, we organize steps using a data pipeline. One task may require the output of one or multiple other tasks to run successfully. This creates dependencies between tasks.
	# 
	# We also require the pipeline to be flexible. This means a new task may enter a running pipeline anytime that may not have the tasks' dependencies satisfied. In this event, we may have a set of tasks already running or completed in the pipeline, and we will need to map out which tasks are prerequisites for the newest task so the pipeline can execute them in the correct order. For optimal pipeline execution, when we map out the necessary tasks required to execute the new task, we want to avoid scheduling tasks that have already been executed.
	# 
	# If we treat each task as a node and the dependencies between a pair of tasks as directed edges, we can construct a DAG (Wiki: Directed Acyclic Graph).
	# 
	# Consider the following scenario. At a certain stage of our data processing, we have a set of tasks (starting tasks) whose prerequisite tasks we know have all been executed, and we wish to reach a later goal task. We need to map out a path that indicates the order of execution of the tasks that finally leads to the goal task. We are looking for a solution that satisfies both necessity and sufficiency -- if a task is not a prerequisite of the goal, or it is a prerequisite of the starting tasks (already executed), then it shouldn't be included in the path. The path needs to follow a correct topological ordering of the DAG, hence a task needs to be placed after all of its necessary prerequisite tasks in the path.
	# 
	# Note: A starting task should be included in the path, if and only if it's a prerequisite of the goal task
	# 
	# For example, we have 6 tasks [A, B, C, D, E, F], C depends on A (denoted as A->C), B->C, C->E, E->F. A new job has at least 2 tasks and at most 6 tasks, each task can only appear once.
	# 
	# Examples:
	# 
	# Input: starting task: A, goal task: F; output: A,B,C,E,F or B,A,C,E,F.
	# Input: starting tasks: A,C, goal task: F; output: C,E,F.
	# You will find the starting task and the goal task in question.txt file, list of all tasks in task_ids.txt and dependencies in relations.txt.
	# 
	# Please submit your implementation and result.

	# In[113]:


	#Assign questions data
	questions = {'starting task': '73', 'goal task': '36'}
	questions


	# In[114]:


	#Assign relations data
	relations = [(97,102),
				 (75,31),
				 (75,37),
				 (100,20),
				 (102,36),
				 (102,37),
				 (102,31),
				 (16,37),
				 (39,73),
				 (39,100),
				 (41,73),
				 (41,112),
				 (62,55),
				 (112,97),
				 (20,94),
				 (20,97),
				 (21,20),
				 (73,20),
				 (56,102),
				 (56,75),
				 (56,55),
				 (55,31),
				 (55,37),
				 (94,56),
				 (94,102)]


	# In[117]:


	#Assign Task-IDs data
	task_ids = [97,75,100,102,16,39,41,62,112,20,21,73,56,55,36,37,94,31]


	# In[118]:


	#Create a pandas-dataframe of relations data
	r_pd = pd.DataFrame(relations, columns = ['from', 'to'])
	#r_pd.head()


	# In[119]:


	#Get starting target (st) and goal target (gt)
	st = int(questions['starting task']); print ('Starting Task: %2d'%(st))
	gt = int(questions['goal task']); print ('Goal Task: %2d'%(gt))


	# In[171]:


	#A recursive function to enumerate paths from the starting task to the goal task
	def replicate_recur(st, gt, mylist=None):

		# If a list has not been passed as an argument, create one seeded with the start
		if mylist is None:
			mylist = [st]

		if st == gt:
			return mylist

		temp = r_pd[r_pd['from'] == st].values

		if temp.size == 0:
			# Dead end: this branch cannot reach the goal
			mylist.append('Error')
			return mylist

		# Branch: replicate the current path once per outgoing edge
		mylist = [[i for i in mylist] for _ in range(len(temp))]
		for idx, val in enumerate(temp[:, 1]):
			mylist[idx].append(val)
			mylist[idx] = replicate_recur(val, gt, mylist[idx])

		return mylist

	output = []
	# Flatten the nested path lists, keeping only the paths that reached the goal
	def removeNestings(l):
		for i in l:
			if isinstance(i, list) and isinstance(i[0], list):
				removeNestings(i)
			elif 'Error' not in i:
				output.append(i)

	print ('\nThe different paths from Starting Target to Goal Target\n')
	removeNestings([replicate_recur(st, gt)])
	pprint(output)
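
	# Hedged alternative sketch (an assumption, not the author's solution): gather every
	# prerequisite of the goal task, drop the tasks already implied by the starting task's
	# own prerequisites, and order the remainder topologically with Kahn's algorithm.
	# It reuses `relations`, `st` and `gt` from above; the helper names are hypothetical.
	from collections import defaultdict, deque

	def schedule(starting, goal, edges):
		parents = defaultdict(set)   # task -> its direct prerequisites
		children = defaultdict(set)  # task -> tasks that depend on it
		for a, b in edges:           # edge a -> b means b depends on a
			parents[b].add(a)
			children[a].add(b)

		def ancestors(nodes):
			seen, stack = set(), list(nodes)
			while stack:
				for p in parents[stack.pop()]:
					if p not in seen:
						seen.add(p)
						stack.append(p)
			return seen

		already_done = ancestors(starting)                    # prerequisites of the starting tasks
		needed = ({goal} | ancestors([goal])) - already_done  # tasks that still have to run

		# Kahn's algorithm restricted to the needed subgraph
		indeg = {n: len(parents[n] & needed) for n in needed}
		queue = deque(sorted(n for n in needed if indeg[n] == 0))
		order = []
		while queue:
			n = queue.popleft()
			order.append(n)
			for c in children[n] & needed:
				indeg[c] -= 1
				if indeg[c] == 0:
					queue.append(c)
		return order

	print('\nTopologically ordered execution path (alternative sketch):')
	print(schedule([st], gt, relations))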

	print ("\n------------------------END OF ANSWER #4b------------------------\n")
Example #28
0
)


# In[328]:


match_won_lose_count.orderBy("team").show(40)


# # 5. Which are the home and away grounds where India has played the most matches?
# 

# In[ ]:


window = Window.partitionBy("home_flag").orderBy(F.col("match_count").desc())


# In[293]:


home_away_count = grouped_data.groupby(
    F.col("ground_new").alias("ground"),
    "home_flag"
).agg(
    F.count(F.col("Scorecard")).alias("match_count")
).withColumn(
    "rank",
    F.dense_rank().over(window)
).filter(F.col("rank") <= 5)
Example #29
0
# Now we do a selection of aggregations over each window, and add these new
# columns to our dataframe. If we had many features, we could add a second
# loop to do this operation per feature.
for days in settings['lookback_windows']:
    # Create unix timestamp col (seconds since epoch). This allows us to use
    # the `rangeBetween` window specification to include rows in the last `days`
    # days. We use this instead of `rowsBetween` in case there are any days
    # without events, since those days do not show up as rows in our aggregated
    # daily dataset.
    unix_ts_col = f.unix_timestamp(f.col('date'))

    # Create the sliding `window` object, including the `rangeBetween` between
    # `days` days ago (`-60*60*24*days`) and today (`0`).
    # Don't forget to partition by `machine`!
    window = Window.partitionBy(['machine'])\
                   .orderBy(unix_ts_col)\
                   .rangeBetween(-60*60*24*days, 0)

    # Make the summary of each feature for the last `days` days. We'll store
    # the new column objects in a `dict` of the form {'new_col_name':
    # new_col_object}, and add all the columns to `df_agg` in a loop at the end.
    #
    # We name the new columns with a suffix like `_2d` indicating that the
    # column is an aggregation of days up to 2 days back in time
    #
    # These summary columns are generally some aggregation function of the
    # corresponding `_0d` summary, over the new sliding window:
    # `f.some_func(f.col(`feat_some_func_0d`)).over(window).
    # We show a few simple examples, feel free to add your own!
    feat = settings['feature'] + '_'  # convenience
    dd = '_{}d'.format(days)  # column name suffix indicating window size
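
    # Hedged sketch of the rest of the loop body: the feature list and the specific
    # aggregations below are assumptions that follow the `_0d` naming convention
    # described in the comments above; `df_agg` is the aggregated daily dataframe.
    new_cols = {
        feat + 'sum' + dd: f.sum(f.col(feat + 'sum_0d')).over(window),
        feat + 'max' + dd: f.max(f.col(feat + 'max_0d')).over(window),
        feat + 'mean' + dd: f.avg(f.col(feat + 'sum_0d')).over(window),
    }
    for col_name, col_obj in new_cols.items():
        df_agg = df_agg.withColumn(col_name, col_obj)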
Example #30
0
def as_of_join(
    entity_df: DataFrame,
    entity_event_timestamp_column: str,
    feature_table_df: DataFrame,
    feature_table: FeatureTable,
    feature_event_timestamp_column: str,
    feature_created_timestamp_column: str,
) -> DataFrame:
    """Perform an as of join between entity and feature table, given a maximum age tolerance.
    Join conditions:
    1. Entity primary key(s) value matches.
    2. Feature event timestamp is the closest match possible to the entity event timestamp,
       but must not be more recent than the entity event timestamp, and the difference must
       not be greater than max_age, unless max_age is not specified.
    3. If more than one feature table rows satisfy condition 1 and 2, feature row with the
       most recent created timestamp will be chosen.
    4. If none of the above conditions are satisfied, the feature rows will have null values.

    Args:
        entity_df (DataFrame): Spark dataframe representing the entities, to be joined with
            the feature tables.
        entity_event_timestamp_column (str): Column name in entity_df which represents
            event timestamp.
        feature_table_df (Dataframe): Spark dataframe representing the feature table.
        feature_table (FeatureTable): Feature table specification, which provide information on
            how the join should be performed, such as the entity primary keys and max age.
        feature_event_timestamp_column (str): Column name in feature_table_df which represents
            event timestamp.
        feature_created_timestamp_column (str): Column name in feature_table_df which represents
            when the feature is created.

    Returns:
        DataFrame: Join result, which contains all the original columns from entity_df, as well
            as all the features specified in feature_table, where the feature columns will
            be prefixed with feature table name.

    Example:
        >>> entity_df.show()
            +------+-------------------+
            |entity|    event_timestamp|
            +------+-------------------+
            |  1001|2020-09-02 00:00:00|
            +------+-------------------+

        >>> feature_table_1_df.show()
            +------+-------+-------------------+-------------------+
            |entity|feature|    event_timestamp|  created_timestamp|
            +------+-------+-------------------+-------------------+
            |  1001|    200|2020-09-01 00:00:00|2020-09-02 00:00:00|
            |  1001|    400|2020-09-01 00:00:00|2020-09-01 00:00:00|
            +------+-------+-------------------+-------------------+
        >>> feature_table_1.max_age
            None
        >>> feature_table_1.name
            'table1'
        >>> df = as_of_join(entity_df, "event_timestamp", feature_table_1_df, feature_table_1,
                            "event_timestamp", "created_timestamp")
        >>> df.show()
            +------+-------------------+---------------+
            |entity|    event_timestamp|table1__feature|
            +------+-------------------+---------------+
            |  1001|2020-09-02 00:00:00|            200|
            +------+-------------------+---------------+

        >>> feature_table_2_df.show()
            +------+-------+-------------------+-------------------+
            |entity|feature|    event_timestamp|  created_timestamp|
            +------+-------+-------------------+-------------------+
            |  1001|    200|2020-09-01 00:00:00|2020-09-02 00:00:00|
            |  1001|    400|2020-09-01 00:00:00|2020-09-01 00:00:00|
            +------+-------+-------------------+-------------------+
        >>> feature_table_2.max_age
            43200
        >>> feature_table_2.name
            'table2'
        >>> df = as_of_join(entity_df, "event_timestamp", feature_table_2_df, feature_table_2,
                            "event_timestamp", "created_timestamp")
        >>> df.show()
            +------+-------------------+---------------+
            |entity|    event_timestamp|table2__feature|
            +------+-------------------+---------------+
            |  1001|2020-09-02 00:00:00|           null|
            +------+-------------------+---------------+

    """
    entity_with_id = entity_df.withColumn("_row_nr", monotonically_increasing_id())

    feature_event_timestamp_column_with_prefix = (
        f"{feature_table.name}__{feature_event_timestamp_column}"
    )
    feature_created_timestamp_column_with_prefix = (
        f"{feature_table.name}__{feature_created_timestamp_column}"
    )

    projection = [
        col(col_name).alias(f"{feature_table.name}__{col_name}")
        for col_name in feature_table_df.columns
    ]

    aliased_feature_table_df = feature_table_df.select(projection)

    join_cond = (
        entity_with_id[entity_event_timestamp_column]
        >= aliased_feature_table_df[feature_event_timestamp_column_with_prefix]
    )
    if feature_table.max_age:
        join_cond = join_cond & (
            aliased_feature_table_df[feature_event_timestamp_column_with_prefix]
            >= entity_with_id[entity_event_timestamp_column]
            - expr(f"INTERVAL {feature_table.max_age} seconds")
        )

    for key in feature_table.entity_names:
        join_cond = join_cond & (
            entity_with_id[key]
            == aliased_feature_table_df[f"{feature_table.name}__{key}"]
        )

    conditional_join = entity_with_id.join(
        aliased_feature_table_df, join_cond, "leftOuter"
    )
    for key in feature_table.entity_names:
        conditional_join = conditional_join.drop(
            aliased_feature_table_df[f"{feature_table.name}__{key}"]
        )

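    # For each entity row, rank the joined feature candidates so the most recent
    # event timestamp wins (ties broken by created timestamp), then keep only rank 1.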
    window = Window.partitionBy("_row_nr", *feature_table.entity_names).orderBy(
        col(feature_event_timestamp_column_with_prefix).desc(),
        col(feature_created_timestamp_column_with_prefix).desc(),
    )
    filter_most_recent_feature_timestamp = conditional_join.withColumn(
        "_rank", row_number().over(window)
    ).filter(col("_rank") == 1)

    return filter_most_recent_feature_timestamp.select(
        entity_df.columns
        + [
            f"{feature_table.name}__{feature}"
            for feature in feature_table.feature_names
        ]
    )
Example #31
0
    # Also creating a feature with the PySpark DateType() just in case
    get_date = udf(lambda x: datetime.datetime.fromtimestamp(x / 1000),
                   DateType())
    df = df.withColumn('date', get_date(col('ts')))

    # Creating a column containing 1 if the event was a "NextSong" page visit or 0 otherwise
    listen_flag = udf(lambda x: 1 if x == 'NextSong' else 0, IntegerType())
    df = df.withColumn('listen_flag', listen_flag('page'))

    # Creating a second table where I will create this feature, then join it back to the main table later
    df_listen_day = df.select(['userId', 'date', 'listen_flag']).groupBy([
        'userId', 'date'
    ]).agg(Fmax('listen_flag')).alias('listen_flag').sort(['userId', 'date'])

    # Defining a window partitioned by User and ordered by date
    window = Window.partitionBy('userId').orderBy(col('date'))

    # Using the above defined window and a lag function to create a previous day column
    df_listen_day = df_listen_day.withColumn('prev_day',
                                             lag(col('date')).over(window))

    # Creating a udf to compare one date to another
    date_group = udf(compare_date_cols, IntegerType())

    # Creating another window partitioned by userId and ordered by date
    windowval = (Window.partitionBy('userId').orderBy('date').rangeBetween(
        Window.unboundedPreceding, 0))

    df_listen_day = df_listen_day \
                                .withColumn('date_group',
                                date_group(col('date'), date_add(col('prev_day'),1)) \
# Remove the rows with no category 
user_category=user_category.filter(F.size('category')>0)

user_category.count()

# Order by the user id and category
user_category.createOrReplaceTempView('user_category')
user_category_order=spark.sql('''select * from user_category order by user1,category''')

user_category_order.show(5)

user1_category_order=user_category_order.select('user1', F.explode('category').alias("category"))\
                                       .groupBy('user1','category')\
                                       .count()\
                                       .select('user1','category', 'count',F.sum('count').over(Window.partitionBy("user1")).alias('total_count'))\
                                       .sort('user1', 'category')

# Calculate the proportion of each category
user1_category_order=user1_category_order.select('user1','category',(100*F.round(F.col("count")/F.col("total_count"),2)).alias("proportion_%"))

# Pivot the table
user1_category_order=user1_category_order.groupBy('user1')\
                       .pivot("category").sum("proportion_%").sort("user1")
user1_category_order.show()

"""**explore how a user’s spending profile is evolving over her lifetime in Venmo, starting from 0 up to 12.*"""

# Create table for user profile every month

# Break the datetime into months
    test_df = data_df.filter(data_df['dt'] >= test1_begin_date).filter(
        data_df['dt'] < test1_end_date)  #.filter(F.col('model')==1)
    val_df = data_df.filter(data_df['dt'] >= val_begin_date).filter(
        data_df['dt'] < val_end_date)
    use_only_fault_disk = False
    if use_only_fault_disk is True:

        # only fault tag
        train_df_n = train_df.filter(
            F.col('fault_time').isNotNull()).filter(F.col('label') == 0)
        train_df_p = data_df.filter(F.col('label') == 1).filter(F.datediff(F.col('fault_time'), F.col('dt')) <= 30) \
            .filter(F.col('dt') >= '2017-08-01').filter(F.col('fault_time') < tags_end_date)
        train_df = train_df_n.union(train_df_p)

    else:
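        # Down-sample the healthy rows: keep at most 2 randomly chosen daily records
        # per (serial_number, model, month) so negatives do not swamp the positives.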
        window = Window.partitionBy('serial_number', 'model',
                                    'month').orderBy(F.rand(seed=2019))
        train_df_p = data_df.filter(F.col('label') == 1).filter(F.datediff(F.col('fault_time'),F.col('dt'))<=30)\
            .filter(F.col('dt')>='2017-08-01').filter(F.col('fault_time')<tags_end_date)#.withColumn('topn', F.row_number().over(window)).where(
        #F.col('topn') <= 2).select(train_df.columns)
        train_df_n = train_df.filter(F.col('label') == 0).withColumn(
            'topn',
            F.row_number().over(window)).where(F.col('topn') <= 2).select(
                train_df.columns)

        train_df_ext_n = train_df.filter(
            F.col('fault_time').isNotNull()).filter(F.col('label') == 0)
        train_df = train_df_n.union(train_df_p)  #.union(train_df_ext_n)
        #train_df=train_df.withColumn('model_serial',F.when(F.col('fault_time').isNotNull(),F.col('model_serial')).otherwise(F.lit(-1)))

    #window_model_serial = Window.partitionBy('serial_number', 'model').orderBy(F.rand(seed=2019))
    #label_data = data_df.withColumn('topn', F.row_number().over(window_model_serial)).where(F.col('topn') <= 1).select(
Example #34
0
        'Itau_open', 'BVSP_open', 'USDBRL_open', 'lag_1', 'lag_2', 'lag_3'
    ]
    vectorAssembler = VectorAssembler(inputCols=FEATURES_COL1,
                                      outputCol="features")
    vdf = vectorAssembler.transform(df.na.drop())
    vdf = vdf.select(
        ['Date', 'Itau_Close', 'features', 'class_N', 'class_NN', 'class_P'])
    scale_features = MinMaxScaler(inputCol='features',
                                  outputCol='scaled_features')
    model_scale = scale_features.fit(vdf)
    df_scaled = model_scale.transform(vdf)
    FEATURES_COL1 = ['scaled_features', 'class_N', 'class_NN', 'class_P']
    vectorAssembler = VectorAssembler(inputCols=FEATURES_COL1,
                                      outputCol="Col_features")
    df_completed = vectorAssembler.transform(df_scaled)
    df_completed = df_completed.select(['Date', 'Itau_Close', 'Col_features'])
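    # Chronological 95/5 split: percent_rank over a single Date-ordered window gives
    # each row its position in time, so the most recent 5% of rows form the test set.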
    df_completed = df_completed.withColumn(
        "rank",
        percent_rank().over(Window.partitionBy().orderBy("Date")))
    train_df = df_completed.where("rank <= .95").drop("rank")
    test_df = df_completed.where("rank > .95").drop("rank")
    lr = LinearRegression(featuresCol='Col_features', labelCol='Itau_Close')
    lr_model = lr.fit(train_df)
    lr_predictions = lr_model.transform(test_df)
    lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                     labelCol="Itau_Close",metricName="r2")
    test_result = lr_model.evaluate(test_df)
    predictions = lr_model.transform(test_df)
    predictions.select("prediction", "Itau_Close").write.option(
        "header", "true").mode("overwrite").parquet('pred.parquet')
Example #35
0
def spark_chrono_split(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Spark chronological splitter
    This function splits data in a chronological manner. That is, for each user / item, the
    split function takes proportions of ratings which is specified by the split ratio(s).
    The split is stratified.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two sets and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits 
            data into several portions corresponding to the split ratios. If a list is 
            provided and the ratios are not summed to 1, they will be normalized.
        seed (int): Seed.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the two is to filter
            with min_rating.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    if not (filter_by == "user" or filter_by == "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError("min_rating should be integer and larger than or equal to 1.")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    if min_rating > 1:
        data = min_rating_filter_spark(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.partitionBy(split_by_column).orderBy(col(col_timestamp))

    rating_grouped = (
        data.groupBy(split_by_column)
        .agg({col_timestamp: "count"})
        .withColumnRenamed("count(" + col_timestamp + ")", "count")
    )
    rating_all = data.join(broadcast(rating_grouped), on=split_by_column)

    rating_rank = rating_all.withColumn(
        "rank", row_number().over(window_spec) / col("count")
    )

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i])
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            )

        splits.append(rating_split)

    return splits
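
# A minimal usage sketch (the toy data and column names below are assumptions, not
# from the source): chronologically split each user's ratings 75% train / 25% test.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
ratings = spark.createDataFrame(
    [(1, 10, 4.0, 1577836800), (1, 11, 3.0, 1577923200),
     (1, 12, 5.0, 1578009600), (2, 10, 2.0, 1577836800)],
    ["userID", "itemID", "rating", "timestamp"],
)
train, test = spark_chrono_split(
    ratings, ratio=0.75, col_user="userID", col_item="itemID", col_timestamp="timestamp"
)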
Example #36
0
    def _attach_distributed_sequence_column(sdf, column_name):
        """
        >>> sdf = ks.DataFrame(['a', 'b', 'c']).to_spark()
        >>> sdf = InternalFrame._attach_distributed_sequence_column(sdf, column_name="sequence")
        >>> sdf.sort("sequence").show()  # doctest: +NORMALIZE_WHITESPACE
        +--------+---+
        |sequence|  0|
        +--------+---+
        |       0|  a|
        |       1|  b|
        |       2|  c|
        +--------+---+
        """
        scols = [scol_for(sdf, column) for column in sdf.columns]

        spark_partition_column = verify_temp_column_name(
            sdf, "__spark_partition_id__")
        offset_column = verify_temp_column_name(sdf, "__offset__")
        row_number_column = verify_temp_column_name(sdf, "__row_number__")

        # 1. Calculates counts per each partition ID. `counts` here is, for instance,
        #     {
        #         1: 83,
        #         6: 83,
        #         3: 83,
        #         ...
        #     }
        sdf = sdf.withColumn(spark_partition_column, F.spark_partition_id())

        # Checkpoint the DataFrame to fix the partition ID.
        sdf = sdf.localCheckpoint(eager=False)

        counts = map(
            lambda x: (x["key"], x["count"]),
            sdf.groupby(
                sdf[spark_partition_column].alias("key")).count().collect(),
        )

        # 2. Calculates cumulative sum in an order of partition id.
        #     Note that it does not matter if partition id guarantees its order or not.
        #     We just need a one-by-one sequential id.

        # sort by partition key.
        sorted_counts = sorted(counts, key=lambda x: x[0])
        # get cumulative sum in an order of partition key.
        cumulative_counts = [0] + list(
            accumulate(map(lambda count: count[1], sorted_counts)))
        # zip it with partition key.
        sums = dict(
            zip(map(lambda count: count[0], sorted_counts), cumulative_counts))

        # 3. Attach offset for each partition.
        @pandas_udf(LongType(), PandasUDFType.SCALAR)
        def offset(id):
            current_partition_offset = sums[id.iloc[0]]
            return pd.Series(current_partition_offset).repeat(len(id))

        sdf = sdf.withColumn(offset_column, offset(spark_partition_column))

        # 4. Calculate row_number in each partition.
        w = Window.partitionBy(spark_partition_column).orderBy(
            F.monotonically_increasing_id())
        row_number = F.row_number().over(w)
        sdf = sdf.withColumn(row_number_column, row_number)

        # 5. Calculate the index.
        return sdf.select((sdf[offset_column] + sdf[row_number_column] -
                           1).alias(column_name), *scols)
Example #37
0
        ccsProc=[],
        como=[],
        DispCD='01')
]

# In[26]:

test2 = spark.createDataFrame(test).repartition(2)

# In[27]:

test2.rdd.getNumPartitions()

# In[28]:

wndw = Window.partitionBy('MemberSK').orderBy(['StartDT', 'EndDT'])

# In[29]:

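# A new GroupID starts whenever a row's StartDT does not fall inside the previous
# row's [StartDT, EndDT] interval (per MemberSK); overlapping rows get null and then
# inherit the last non-null GroupID through the forward-fill, so they share one ID.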
test3 = test2.withColumn(
    "GroupID",
    F.when(
        F.col("StartDT").between(
            F.lag(F.col("StartDT"), 1).over(wndw),
            F.lag(F.col("EndDT"), 1).over(wndw)),
        None).otherwise(F.monotonically_increasing_id())).withColumn(
            "GroupID",
            F.last(F.col("GroupID"), ignorenulls=True).over(
                wndw.rowsBetween(Window.unboundedPreceding, 0))).cache()

# In[30]:
        Product("Thin", "Cell phone", 6000),
        Product("Normal", "Tablet", 1500),
        Product("Mini", "Tablet", 5500),
        Product("Ultra Thin", "Cell phone", 5000),
        Product("Very Thin", "Cell phone", 6000),
        Product("Big", "Tablet", 2500),
        Product("Bendable", "Cell phone", 3000),
        Product("Foldable", "Cell phone", 3000),
        Product("Pro", "Tablet", 4500),
        Product("Pro2", "Tablet", 6500)
    ]
    products = sparkSession.createDataFrame(productList1)
    products.createOrReplaceTempView("products")
    products.printSchema()

    catRevenueWindowSpec = Window.partitionBy("category")\
        .orderBy("revenue")

    sparkSession.sql("""
            select
              product,
              category,
              revenue,
              lag(revenue, 1) over (partition by category order by revenue) as prevRevenue,
              lag(revenue, 2, 0) over(partition by category order by revenue) as prev2Revenue,
              row_number() over (partition by category order by revenue) as row_number,
              rank() over(partition by category order by revenue) as rev_rank,
              dense_rank() over(partition by category order by revenue) as rev_dense_rank
             from
              products
          """)\
        .show(5,False)
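
    # Hedged equivalent of the SQL above using the DataFrame API and the
    # catRevenueWindowSpec window defined earlier (a sketch, not from the source).
    from pyspark.sql import functions as F

    products.select(
        "product", "category", "revenue",
        F.lag("revenue", 1).over(catRevenueWindowSpec).alias("prevRevenue"),
        F.lag("revenue", 2, 0).over(catRevenueWindowSpec).alias("prev2Revenue"),
        F.row_number().over(catRevenueWindowSpec).alias("row_number"),
        F.rank().over(catRevenueWindowSpec).alias("rev_rank"),
        F.dense_rank().over(catRevenueWindowSpec).alias("rev_dense_rank"),
    ).show(5, False)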