Example #1
    def append_ranks(self, hour_df, day_df, month_df, year_df):
        """
        Ranks based on pageviews within categories. For hour_df the category is a single hour,
         for day_df the category is a single day, and so on.
        Args:
            :param hour_df: Dataframe with hourly aggregated data.
            :param day_df: Dataframe with daily aggregated data.
            :param month_df: Dataframe with monthly aggregated data.
            :param year_df: Dataframe with yearly aggregated data.
        Returns:
            :return: hour_df, day_df, month_df, year_df with ranks added.
        """
        #Specify windows to rank within.
        hour_window = Window.partitionBy(['year', 'month', 'day', 'hour']).orderBy(hour_df['pageviews'].desc())
        day_window = Window.partitionBy(['year', 'month', 'day']).orderBy(day_df['pageviews'].desc())
        month_window = Window.partitionBy(['year', 'month']).orderBy(month_df['pageviews'].desc())
        year_window = Window.partitionBy(['year']).orderBy(year_df['pageviews'].desc())

        #Create ranking functions
        hour_rank = dense_rank().over(hour_window)
        day_rank = dense_rank().over(day_window)
        month_rank = dense_rank().over(month_window)
        year_rank = dense_rank().over(year_window)

        #Apply the ranking functions to make new columns
        hour_df = hour_df.withColumn("hour_rank", hour_rank)
        day_df = day_df.withColumn("day_rank", day_rank)
        month_df = month_df.withColumn("month_rank", month_rank)
        year_df = year_df.withColumn("year_rank", year_rank)
        return hour_df, day_df, month_df, year_df
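A minimal usage sketch of the hour-level ranking above (the SparkSession, toy column names, and values here are assumptions for illustration, not part of the original class):

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import dense_rank

spark = SparkSession.builder.getOrCreate()
# Toy hourly aggregate: rank pages by pageviews within each (year, month, day, hour).
toy_hour_df = spark.createDataFrame(
    [(2021, 1, 1, 0, "Main_Page", 500),
     (2021, 1, 1, 0, "Spark", 300),
     (2021, 1, 1, 1, "Spark", 700)],
    ["year", "month", "day", "hour", "page", "pageviews"],
)
hour_window = Window.partitionBy(["year", "month", "day", "hour"]).orderBy(toy_hour_df["pageviews"].desc())
toy_hour_df.withColumn("hour_rank", dense_rank().over(hour_window)).show()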
Example #2
 def gen_report_table(hc,curUnixDay):
     rows_indoor=sc.textFile("/data/indoor/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4]),utoday=int(p[5]),ufirstday=int(p[6])))
     hc.createDataFrame(rows_indoor).registerTempTable("df_indoor")
     #ClientMac|etime|ltime|seconds|utoday|ENTITYID|UFIRSTDAY 
     sql="select entityid,clientmac,utoday,UFIRSTDAY,seconds,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_indoor order by entityid,clientmac,utoday" 
     df_id_stat=hc.sql(sql)
     df_id_mm=df_id_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     # df_id_mm is the min/max df, used to calculate first arrival and last arrival
     df_id_stat_distinct=df_id_stat.drop("seconds").drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     #distinct df is for lag function to work
     df_id_prepremon=df_id_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_id = [df_id_mm.clientmac == df_id_prepremon.clientmac, df_id_mm.entityid == df_id_prepremon.entityid, df_id_mm.UFIRSTDAY==df_id_prepremon.UFIRSTDAY]
     df_indoor_fin_tmp=df_id_mm.join(df_id_prepremon, cond_id, 'outer').select(df_id_mm.entityid,df_id_mm.clientmac,df_id_mm.utoday,df_id_mm.UFIRSTDAY,df_id_mm.seconds,df_id_mm.day_30,df_id_mm.day_7,df_id_mm.min,df_id_mm.max,df_id_mm.total_cnt,df_id_prepremon.prepre_mon)
     df_indoor_fin_tmp=df_indoor_fin_tmp.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","seconds as secondsbyday","day_30 as indoors30","day_7 as indoors7","min as FirstIndoor","max as LastIndoor","total_cnt as indoors","prepre_mon as indoorsPrevMonth")
     
     #newly added part for indoors7 and indoors30 based on current date
     df_indoor_fin_tmp1= df_indoor_fin_tmp.withColumn("r_day_7", func.when((curUnixDay- df_indoor_fin_tmp.utoday)/86400<7 , 1).otherwise(0))
     df_indoor_fin_tmp2=df_indoor_fin_tmp1.withColumn("r_day_30", func.when((curUnixDay- df_indoor_fin_tmp1.utoday)/86400<30 , 1).otherwise(0))
     df_indoor_fin_tmp3=df_indoor_fin_tmp2.withColumn("r_indoors7",func.sum("r_day_7").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin_tmp4=df_indoor_fin_tmp3.withColumn("r_indoors30",func.sum("r_day_30").over(Window.partitionBy("entityid","clientmac")))
     df_indoor_fin=df_indoor_fin_tmp4.drop("r_day_7").drop("r_day_30")
     hc.sql("drop table if exists df_indoor_fin")
     df_indoor_fin.write.saveAsTable("df_indoor_fin")
     
     rows_flow=sc.textFile("/data/flow/*/*").map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),utoday=int(p[4]),ufirstday=int(p[5])))
     hc.createDataFrame(rows_flow).registerTempTable("df_flow")
     
     # ClientMac|ENTITYID|UFIRSTDAY|etime|ltime|utoday
     sql="select entityid,clientmac,utoday,UFIRSTDAY,"
     sql=sql+"count(1) over(partition by entityid,clientmac) as total_cnt,"
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  2505600 preceding) as day_30," # 2505600 is 29 days
     sql=sql+"count(1) over (partition by entityid,clientmac order by utoday range  518400 preceding)  as day_7," #518400 is 6 days
     sql=sql+"count(1) over (partition by entityid,clientmac,UFIRSTDAY order by UFIRSTDAY  range 1 preceding) as pre_mon "
     sql=sql+"from df_flow order by entityid,clientmac,utoday" 
     df_fl_stat=hc.sql(sql)
     df_fl_mm=df_fl_stat.withColumn("min", func.min("utoday").over(Window.partitionBy("entityid","clientmac"))).withColumn("max", func.max("utoday").over(Window.partitionBy("entityid","clientmac")))
     # df_fl_mm is the min/max df, used to calculate first arrival and last arrival
     df_fl_stat_distinct=df_fl_stat.drop("day_30").drop("day_7").drop("utoday").drop("total_cnt").distinct()
     #distinct df is for lag function to work
     df_fl_prepremon=df_fl_stat_distinct.withColumn("prepre_mon",func.lag("pre_mon").over(Window.partitionBy("entityid","clientmac").orderBy("entityid","clientmac","UFIRSTDAY"))).drop("pre_mon").na.fill(0)
     
     cond_fl = [df_fl_mm.clientmac == df_fl_prepremon.clientmac, df_fl_mm.entityid == df_fl_prepremon.entityid, df_fl_mm.UFIRSTDAY==df_fl_prepremon.UFIRSTDAY]
     df_flow_fin=df_fl_mm.join(df_fl_prepremon, cond_fl, 'outer').select(df_fl_mm.entityid,df_fl_mm.clientmac,df_fl_mm.utoday,df_fl_mm.UFIRSTDAY,df_fl_mm.day_30,df_fl_mm.day_7,df_fl_mm.min,df_fl_mm.max,df_fl_mm.total_cnt,df_fl_prepremon.prepre_mon)
     df_flow_fin=df_flow_fin.selectExpr("entityid as entityid","clientmac as  clientmac","utoday as utoday","UFIRSTDAY as ufirstday","day_30 as visits30","day_7 as visits7","min as FirstVisit","max as LastVisit","total_cnt as visits","prepre_mon as visitsPrevMonth")
     hc.sql("drop table if exists df_flow_fin")
     df_flow_fin.write.saveAsTable("df_flow_fin") 
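The trailing-window counts in the SQL above rely on a RANGE frame over the Unix-timestamp column utoday. A hedged, self-contained sketch of just that pattern (table name, MAC address, and values are made up; RANGE BETWEEN ... AND CURRENT ROW is the explicit form of the RANGE n PRECEDING shorthand used above):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
spark.createDataFrame(
    [("aa:bb:cc", 1, 86400 * 1), ("aa:bb:cc", 1, 86400 * 5), ("aa:bb:cc", 1, 86400 * 20)],
    ["clientmac", "entityid", "utoday"],
).createOrReplaceTempView("visits")
# day_7 counts rows whose utoday falls within the trailing 6 days (518400 seconds)
# of the current row, per (entityid, clientmac).
spark.sql("""
    SELECT clientmac, entityid, utoday,
           count(1) OVER (PARTITION BY entityid, clientmac
                          ORDER BY utoday
                          RANGE BETWEEN 518400 PRECEDING AND CURRENT ROW) AS day_7
    FROM visits
""").show()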
Example #3
def process_file(date_update):
    """Process downloaded MEDLINE folder to parquet file"""
    print("Process MEDLINE file to parquet")
    # remove previously generated parquet folders if they still exist
    old_parquet_dirs = glob(os.path.join(save_dir, 'medline_*.parquet'))
    if old_parquet_dirs:
        subprocess.call(['rm', '-rf'] + old_parquet_dirs)

    date_update_str = date_update.strftime("%Y_%m_%d")
    path_rdd = sc.parallelize(glob(os.path.join(download_dir, 'medline*.xml.gz')), numSlices=1000)
    parse_results_rdd = path_rdd.\
        flatMap(lambda x: [Row(file_name=os.path.basename(x), **publication_dict)
                           for publication_dict in pp.parse_medline_xml(x)])
    medline_df = parse_results_rdd.toDF()
    medline_df.write.parquet(os.path.join(save_dir, 'medline_raw_%s.parquet' % date_update_str),
                             mode='overwrite')

    window = Window.partitionBy(['pmid']).orderBy(desc('file_name'))
    windowed_df = medline_df.select(
        max('delete').over(window).alias('is_deleted'),
        rank().over(window).alias('pos'),
        '*')
    windowed_df.\
        where('is_deleted = False and pos = 1').\
        write.parquet(os.path.join(save_dir, 'medline_lastview_%s.parquet' % date_update_str),
                      mode='overwrite')

    # parse grant database
    parse_grant_rdd = path_rdd.flatMap(lambda x: pp.parse_medline_grant_id(x))\
        .filter(lambda x: x is not None)\
        .map(lambda x: Row(**x))
    grant_df = parse_grant_rdd.toDF()
    grant_df.write.parquet(os.path.join(save_dir, 'medline_grant_%s.parquet' % date_update_str),
                           mode='overwrite')
Example #4
def spark_timestamp_split(
    data,
    ratio=0.75,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Spark timestamp based splitter
    The splitter splits the data into sets by timestamps without stratification on either
    user or item.
    The ratios are applied on the timestamp column which is divided accordingly into
    several partitions.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two sets and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios do not sum to 1, they will be normalized.
            Earlier indexed splits will have earlier times
            (e.g. the latest time in split[0] <= the earliest time in split[1]).
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps. Float number represented in
            seconds since Epoch.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    multi_split, ratio = process_split_ratio(ratio)

    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.orderBy(col(col_timestamp))
    rating = data.withColumn("rank", row_number().over(window_spec))

    data_count = rating.count()
    rating_rank = rating.withColumn("rank", row_number().over(window_spec) / data_count)

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i]).drop(
                "rank"
            )
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            ).drop("rank")

        splits.append(rating_split)

    return splits
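The splitter above boils down to a row_number over a global timestamp ordering, normalized by the row count and cut at the cumulative ratios. A minimal sketch of that core step, with made-up column names and a fixed 0.75 ratio (the real function derives these from process_split_ratio and the DEFAULT_* constants):

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import col, row_number

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [(1, 10, 100.0), (2, 11, 200.0), (3, 12, 300.0), (4, 13, 400.0)],
    ["userID", "itemID", "timestamp"],
)
w = Window.orderBy(col("timestamp"))
n = df.count()
ranked = df.withColumn("rank", row_number().over(w) / n)  # normalized rank in (0, 1]
train = ranked.filter(col("rank") <= 0.75).drop("rank")   # earliest 75% of rows
test = ranked.filter(col("rank") > 0.75).drop("rank")     # latest 25% of rows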
Example #5
def _get_relevant_items_by_timestamp(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_timestamp=DEFAULT_TIMESTAMP_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
        k=DEFAULT_K
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently
    according to timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
            tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k: number of relevant items to be filtered by the function.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user
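A hedged usage sketch of the two-window pattern above on a toy log, assuming column names userID / itemID / rating / timestamp and k = 2 (none of these names come from the original module):

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from pyspark.sql.functions import col, row_number

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("u1", "i1", 5.0, 1), ("u1", "i2", 3.0, 2), ("u1", "i3", 4.0, 3)],
    ["userID", "itemID", "rating", "timestamp"],
)
w = Window.partitionBy("userID").orderBy(col("timestamp").desc())
recent = (
    df.select("userID", "itemID", "rating", row_number().over(w).alias("rank"))
    .where(col("rank") <= 2)                     # keep the 2 most recent items per user
    .withColumn("prediction", F.collect_list("itemID").over(Window.partitionBy("userID")))
    .select("userID", "prediction")
    .dropDuplicates(["userID", "prediction"])    # one row per user
)
recent.show(truncate=False)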
Example #6
def _get_relevant_items_by_threshold(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
        threshold=DEFAULT_THRESHOLD
):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those having ratings above a certain threshold.
    The threshold is defined as a statistical measure of the ratings for a
    user, e.g., median.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known
            distribution.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant
            items.
    """
    items_for_user = (
        dataframe
        .orderBy(col_rating, ascending=False)
        .where(col_rating + " >= " + str(threshold))
        .select(
            col_user, col_item, col_rating
        )
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates()
    )

    return items_for_user
Example #7
def _get_top_k_items(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
        k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.
    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
        customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # note: row_number does not handle ties; rows with the same rating value are ordered arbitrarily.
    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user
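A hedged usage sketch of the same top-k pattern (rank filter, then groupby + collect_list), again with assumed column names and k = 2:

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F
from pyspark.sql.functions import col, row_number

spark = SparkSession.builder.getOrCreate()
ratings = spark.createDataFrame(
    [("u1", "i1", 5.0), ("u1", "i2", 3.0), ("u1", "i3", 4.0), ("u2", "i1", 2.0)],
    ["userID", "itemID", "rating"],
)
w = Window.partitionBy("userID").orderBy(col("rating").desc())
top_k = (
    ratings.select("userID", "itemID", "rating", row_number().over(w).alias("rank"))
    .where(col("rank") <= 2)                                 # top 2 items per user
    .groupby("userID")
    .agg(F.collect_list("itemID").alias("prediction"))       # dense list of top items
)
top_k.show(truncate=False)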
Example #8
def add_recent_encounters(encounter_features):

    lowest_date = (encounter_features.select('START').orderBy('START').limit(
        1).withColumnRenamed('START', 'EARLIEST_TIME'))

    output_df = (encounter_features.crossJoin(lowest_date).withColumn(
        "day", datediff(col('START'), col('EARLIEST_TIME'))).withColumn(
            "patient_age", datediff(col('START'), col('BIRTHDATE'))))

    w = (Window.orderBy(output_df['day']).partitionBy(
        output_df['PATIENT']).rangeBetween(-int(num_days), -1))

    for comorbidity_idx in range(num_conditions):
        col_name = "recent_%d" % comorbidity_idx

        output_df = (output_df.withColumn(
            col_name,
            sum(col("comorbidity_%d" % comorbidity_idx)).over(w)).withColumn(
                col_name, coalesce(col(col_name), lit(0))))

    return (output_df)
Example #9
def fill_missing_values_forward(sdf, ordering_col, cols_to_fill, *id_cols):
    """ Fill missing values by carrying previous values forward.
  
  Args:
    sdf: a Spark DataFrame
    ordering_col: column by which rows should be sorted
    cols_to_fill: list of columns where missing values will be filled
    id_cols: list of columns that collectively form a unique identifier that can be used to partition cases.
  
  """
    lookback_window = Window.partitionBy(*id_cols)\
                   .orderBy(ordering_col)\
                   .rowsBetween( Window.unboundedPreceding, 0)

    for ctf in cols_to_fill:
        filled_col = ctf + "_filled"
        sdf = sdf.withColumn(
            filled_col,
            fn.last(sdf[ctf], ignorenulls=True).over(lookback_window))

    return sdf
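A minimal usage sketch for the forward fill above, assuming the function and its `fn` alias for pyspark.sql.functions are in scope (the toy columns and values are made up):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [("case1", 1, 10.0), ("case1", 2, None), ("case1", 3, None), ("case2", 1, None)],
    ["case_id", "t", "temperature"],
)
# case1 rows at t=2 and t=3 get temperature_filled = 10.0 carried forward;
# case2 stays null because there is nothing earlier to carry.
fill_missing_values_forward(events, "t", ["temperature"], "case_id").show()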
Example #10
    def converting_file_into_chunks(self, df, chunk_size=100):
        """
        Convert a large PySpark dataframe into chunks and convert each chunk
        into a pandas dataframe.
        :param df: orig dataframe
        :param chunk_size: size of each dataframe chunk
        :return:
        """
        # created window using first column
        window = Window.orderBy(funct.col(df.columns[0]))
        df = df.withColumn('row_number', funct.row_number().over(window))

        for i in range(0, df.count(), chunk_size):
            chunk = df.where((funct.col('row_number') >= i) & (funct.col('row_number') < (i + chunk_size)))
            logger.info(chunk.count())
            if chunk.count() != 0:
                pd_df = chunk.toPandas()

                ### you can do whatever you want with the pandas dataframe
                pd_df.to_csv("{}_file.csv".format(i))
                print("############")
Example #11
    def etl_load_quote(self):
        logging.debug("Inside transform parking occupancy dataset module")

        quote_common = self.spark.read.\
                       parquet(self._load_path+"/partition=Q/*.parquet")

        quote=quote_common.select("trade_dt","symbol","exchange","event_tm","event_seq_nb","arrival_tm", \
                                 "bid_pr","bid_size","ask_pr","ask_size")

        quote_corrected=quote.withColumn("row_number",F.row_number().over(Window.partitionBy(quote.trade_dt,quote.symbol,\
                                            quote.exchange,quote.event_tm,quote.event_seq_nb).\
                                            orderBy(quote.arrival_tm.desc()))).filter(F.col("row_number")==1).drop("row_number")

        quote_corrected.show(3, truncate=False)

        quote_corrected.printSchema()

        logging.debug(
            "Writting transformed quote dataframe to a trade partition")
        quote_corrected.withColumn("trade_date", F.col("trade_dt")).write.\
                       partitionBy("trade_dt").mode("overwrite").parquet(self._save_path+"/quote/")
Example #12
 def test_window_functions_without_partitionBy(self):
     df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"),
                                      (1, "2")], ["key", "value"])
     w = Window.orderBy("key", df.value)
     from pyspark.sql import functions as F
     sel = df.select(
         df.value, df.key,
         F.max("key").over(w.rowsBetween(0, 1)),
         F.min("key").over(w.rowsBetween(0, 1)),
         F.count("key").over(w.rowsBetween(float('-inf'), float('inf'))),
         F.row_number().over(w),
         F.rank().over(w),
         F.dense_rank().over(w),
         F.ntile(2).over(w))
     rs = sorted(sel.collect())
     expected = [
         ("1", 1, 1, 1, 4, 1, 1, 1, 1), ("2", 1, 1, 1, 4, 2, 2, 2, 1),
         ("2", 1, 2, 1, 4, 3, 2, 2, 2), ("2", 2, 2, 2, 4, 4, 4, 3, 2)
     ]
     for r, ex in zip(rs, expected):
         self.assertEqual(tuple(r), ex[:len(r)])
Example #13
def add_frequency_total(df_impressions,
                        id_col="user_id",
                        order_col="timestamp"):
    """
    Calculate the `frequency_total`, i.e. the frequency of exposure for each unique `id_col`.
    This method also adds the `exposure_index` (1 to `frequency_total`) in case it is needed
    for other calculations.
    The table is partitioned by `id_col` and ordered by `order_col`.
    """
    if id_col not in df_impressions.columns:
        raise Exception(f"The id_col {id_col} does not exist!")
    if order_col not in df_impressions.columns:
        raise Exception(f"The order_col {order_col} does not exist!")

    window = Window.partitionBy(id_col)
    df = (df_impressions.withColumn(
        "exposure_index",
        F.row_number().over(window.orderBy(order_col))).withColumn(
            "frequency_total",
            F.count("*").over(window)))
    return df
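A hedged usage sketch for add_frequency_total, assuming the function above and its F / Window imports are in scope:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
impressions = spark.createDataFrame(
    [("u1", 100), ("u1", 105), ("u2", 101)],
    ["user_id", "timestamp"],
)
# u1 rows get frequency_total = 2 with exposure_index 1 and 2; the u2 row gets 1 and 1.
add_frequency_total(impressions).show()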
Example #14
def main(spark, model_file, data_file, K):
    '''Main routine for Collaborative Filtering Model testing

        Parameters
        ----------
        spark: SparkSession object

        model_file: string, path to the stored model to load

        data_file: string, path to the parquet file to load

        K: int, evaluations are based on predictions of the top K items for each user
        '''
    testIdx = spark.read.parquet(data_file)
    model = ALSModel.load(model_file)

    users_val = testIdx.select("user_idx").distinct()

    perUserPredictedItemsDF = model.recommendForUserSubset(users_val, K)
    perUserPredictedItemsDF = perUserPredictedItemsDF.select(
        "user_idx", "recommendations.track_idx").withColumnRenamed(
            'user_idx', 'user').withColumnRenamed('recommendations.track_idx',
                                                  'items')

    w2 = Window.partitionBy('user_idx').orderBy(col('count').desc())
    perUserActualItemsDF = testIdx.select(
        'user_idx', 'track_idx', 'count',
        F.rank().over(w2).alias('rank')).where(
            'rank <= {0}'.format(K)).groupBy('user_idx').agg(
                expr('collect_list(track_idx) as items')).withColumnRenamed(
                    'user_idx', 'user')

    perUserItemsRDD = perUserPredictedItemsDF.join(
        perUserActualItemsDF, 'user').rdd.map(lambda row: (row[1], row[2]))
    rankingMetrics = RankingMetrics(perUserItemsRDD)

    print("============================================")
    print("meanAveragePrecision = %.8f" % rankingMetrics.meanAveragePrecision)
    print("precisionAt(K) = %.8f" % rankingMetrics.precisionAt(K))
    print("ndcgAt(K) = %.8f" % rankingMetrics.ndcgAt(K))
Example #15
def _get_top_k_items(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    k=DEFAULT_K,
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.

    .. note::
        if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (pyspark.sql.DataFrame): DataFrame of rating data (in the format of
        customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        pyspark.sql.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # note: row_number does not handle ties; rows with the same rating value are ordered arbitrarily.
    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user
Example #16
def org_predictData(df_buildFeatures, df_holidayList):
    """
       Every day at midnight, generate the data to be predicted for the following day; the prediction dates start from 'today'.
       Logic for preceding3day_price in the prediction dataset: sort the existing data with
       partitionBy('flineId_noDate', 'intervalDays').orderBy(desc('departDate')), take the mean of the 3 most recent prices,
       and use it as the reference price for the corresponding ('flineId_noDate', 'intervalDays') in the prediction dataset.
       """
    window = Window.partitionBy('flineId_noDate',
                                'intervalDays').orderBy(desc('departDate'))
    df_preceding3day_priceFeatures = df_buildFeatures.withColumn('rank', row_number().over(window)) \
           .filter("rank <= 3") \
           .groupBy('flineId_noDate', 'intervalDays') \
           .agg(avg('price').alias('preceding3day_price'))
    df_predictData = df_buildFeatures.filter((col('departDate') >= to_date(lit(todayStr2))) & (col('departDate') <= date_add(to_date(lit(todayStr2)), 90))) \
           .select('departDate', 'flineId_noDate') \
           .distinct()\
           .withColumn('todayStr2', lit(todayStr2))
    df_predictData.createOrReplaceTempView('predict_data')
    df_predictData = spark.sql("select t.flineId_noDate,t.departDate,date_add(date_add(todayStr2, 1), p.k) as predictDate from predict_data t \
                                  lateral view posexplode(split(space(datediff(t.departDate,date_add(todayStr2,1))),'')) p as k,v") \
                          .join(df_holidayList, col('departDate') == col('dateList'), 'left') \
                          .drop('dateList') \
                          .na.fill({'is_vacation': 'no', 'vacation_days': 0, 'day_of_vacation': 0}) \
                          .withColumn('org', split(col('flineId_noDate'), '_').getItem(0)) \
                          .withColumn('dst', split(col('flineId_noDate'), '_').getItem(1)) \
                          .withColumn('isReturn_type', split(col('flineId_noDate'), '_').getItem(2)) \
                          .withColumn('isDirect_type', split(col('flineId_noDate'), '_').getItem(4)) \
                          .withColumn('departYear', year('departDate')) \
                          .withColumn('departMonth', month('departDate')) \
                          .withColumn('depart_dayofmonth', dayofmonth('departDate')) \
                          .withColumn('depart_weekofyear', weekofyear('departDate')) \
                          .withColumn('depart_dayofweek', datediff('departDate', to_date(lit('2017-01-09'))) % 7 + 1)\
                          .withColumn('departQuarter', quarter('departDate')) \
                          .withColumn('intervalDays', datediff('departDate', 'predictDate')) \
                          .withColumn('intervalMonths', round(months_between('departDate', 'predictDate')))\
                          .withColumn('intervalWeeks', round(col('intervalDays') / 7)) \
                          .withColumn('intervalQuarters', round(col('intervalDays') / 120))\
                          .join(df_preceding3day_priceFeatures, ['flineId_noDate', 'intervalDays'], 'left')\
                          .dropna(how='any')
    # col('preceding3day_price') contains nulls, so those rows must be dropped; 4,363,499 rows were missing
    return df_predictData
Example #17
def locale_on_date(data, date, topN, period=7, country_list=None):
    """ Gets the ratio of the top locales in each country over the last week.

    parameters:
        data: The main ping data
        date: The date to find the locale distribution
        topN: The number of locales to get for each country. Only does the top N.
        period: The number of days before `date` looked at in the analysis
        country_list: The list of countries to look at in the analysis

    output:
       dataframe with columns:
           ['country', 'submission_date_s3', 'locale', 'pct_on_locale']
    """
    data_all = keep_countries_and_all(data, country_list)
    begin = date_plus_x_days(date, -period)

    wau = data_all\
        .filter((col('submission_date_s3') <= date) & (col('submission_date_s3') > begin))\
        .groupBy('country')\
        .agg(countDistinct('client_id').alias('WAU'))

    locale_wau = data_all\
        .filter((col('submission_date_s3') <= date) & (col('submission_date_s3') > begin))\
        .groupBy('country', 'locale')\
        .agg(countDistinct('client_id').alias('WAU_on_locale'))\
        .select(lit(begin).alias('start_date'), lit(date).alias('submission_date_s3'),
                'country', 'WAU_on_locale', 'locale')

    res = locale_wau.join(wau, 'country', how='left')\
        .select('start_date', 'submission_date_s3',
                'country', 'WAU_on_locale', 'locale', 'WAU')

    rank_window = Window.partitionBy('country', 'submission_date_s3').orderBy(
        desc('WAU_on_locale'))

    return res.select('*', F.row_number().over(rank_window).alias('rank'))\
        .filter(col('rank') <= topN)\
        .select('submission_date_s3', 'country', 'locale',
                (100.0 * col('WAU_on_locale') / col('WAU')).alias('pct_on_locale'))
Example #18
def test_rolling_with_non_window_op(client, ibis_windows, spark_range):
    """ Test context adjustment for rolling window and non window ops

    non window ops should calculate only within the user's context,
    while the rolling window should calculate on the expanded context.
    For a rolling window of 1 day, and a `count` aggregation:
    time       value
    2020-01-01   a
    2020-01-02   b
    2020-01-03   c
    2020-01-04   d
    with context = (2020-01-02, 2020-01-04), result will be:
    time       value  roll_count    count
    2020-01-02   b      2            3
    2020-01-03   c      2            3
    2020-01-04   d      2            3
    Because there are 3 rows within the user context (01-02, 01-04),
    count should return 3 for every row, rather than 4, which would be based on the
    adjusted context (01-01, 01-04).
    """
    table = client.table('time_indexed_table')
    context = (
        pd.Timestamp('20170102 07:00:00', tz='UTC'),
        pd.Timestamp('20170105', tz='UTC'),
    )
    result_pd = table.mutate(
        count_1h=table['value'].count().over(ibis_windows[0]),
        count=table['value'].count(),
    ).execute(timecontext=context)

    spark_table = table.compile()
    spark_window_1h = (Window.partitionBy('key').orderBy(
        F.col('time').cast('long')).rangeBetween(*spark_range[0]))
    expected = (spark_table.withColumn(
        'count_1h',
        F.count(spark_table['value']).over(spark_window_1h)).withColumn(
            'count', F.count(spark_table['value'])).toPandas())
    expected = expected[expected.time.between(
        *(t.tz_convert(None) for t in context))].reset_index(drop=True)
    tm.assert_frame_equal(result_pd, expected)
Example #19
def run():
    try:    
        args = read_args()  
        data = [ "{pwd}/data/part-0000{x}.json.gz".format(pwd=PWD, x=y) for y in range(args.files) ]

        spark = SparkSession.builder \
                            .appName("Calculates Sessions Spark") \
                            .getOrCreate()
        
        logger.info("build a new instance.")
        logger.info("read data: {data}.".format(data=data))
        df = spark.read.json(path=data, schema=schema())
        logger.info("rows count: {rows}.".format(rows=df.count()))
        
        # -1800 seconds = -60 seconds * 30 minutes
        w = Window.partitionBy("anonymous_id").orderBy("device_sent_timestamp").rangeBetween(start=-1800, end=0)
        
        # add column events_on_session, based on window spec
        df = df.withColumn("events_on_session", F.count(df.device_sent_timestamp).over(w))
        
        # add column event_time, based on column device_sent_timestamp
        df = df.withColumn("event_time", F.to_timestamp(df.device_sent_timestamp))
        
        # filter df by first event on session
        df = df.filter(df.events_on_session == 1)
        logger.info("distinct sessions: {rows}.".format(rows=df.count()))

        if args.all or args.browser:
            sessions_by_column(df, "browser_family", args)
            
        if args.all or args.os:
            sessions_by_column(df, "os_family", args)

        if args.all or args.device:
            sessions_by_column(df, "device_family", args)
        
        logger.info("finished.")

    except Exception as e:
        logger.error("failed: {msg}.".format(msg=e))
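The session logic above marks a row as the start of a new session when no other event from the same anonymous_id falls within the preceding 30 minutes, i.e. when events_on_session == 1. A hedged, standalone sketch of that window (schema and timestamps are made up):

from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [("u1", 1000), ("u1", 1500), ("u1", 4000)],   # third event is ~42 min after the second
    ["anonymous_id", "device_sent_timestamp"],
)
w = (Window.partitionBy("anonymous_id")
           .orderBy("device_sent_timestamp")
           .rangeBetween(-1800, 0))               # trailing 30-minute window
sessions = events.withColumn("events_on_session", F.count("device_sent_timestamp").over(w))
sessions.filter(F.col("events_on_session") == 1).show()   # two session-start rows: 1000 and 4000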
Example #20
def _get_relevant_items_by_threshold(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    threshold=DEFAULT_THRESHOLD,
):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those having ratings above a certain threshold.
    The threshold is defined as a statistical measure of the ratings for a
    user, e.g., median.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known
            distribution.

    Return:
        pyspark.sql.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant
        items.
    """
    items_for_user = (
        dataframe.orderBy(col_rating, ascending=False)
        .where(col_rating + " >= " + str(threshold))
        .select(col_user, col_item, col_rating)
        .withColumn(
            col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user))
        )
        .select(col_user, col_prediction)
        .dropDuplicates()
    )

    return items_for_user
Example #21
    def is_monotonic_decreasing(self):
        """
        Return boolean if values in the object are monotonically decreasing.

        .. note:: the current implementation of is_monotonic_decreasing uses Spark's
            Window without specifying a partition specification. This moves all data into a
            single partition on a single machine and could cause serious
            performance degradation. Avoid this method on very large datasets.

        Returns
        -------
        is_monotonic : boolean

        Examples
        --------
        >>> ser = ks.Series(['4/1/2018', '3/1/2018', '1/1/2018'])
        >>> ser.is_monotonic_decreasing
        True

        >>> df = ks.DataFrame({'dates': [None, '3/1/2018', '2/1/2018', '1/1/2018']})
        >>> df.dates.is_monotonic_decreasing
        False

        >>> df.index.is_monotonic_decreasing
        False

        >>> ser = ks.Series([1])
        >>> ser.is_monotonic_decreasing
        True

        >>> ser = ks.Series([])
        >>> ser.is_monotonic_decreasing
        True

        >>> ser.rename("a").to_frame().set_index("a").index.is_monotonic_decreasing
        True
        """
        col = self._scol
        window = Window.orderBy(self._kdf._internal.index_scols).rowsBetween(-1, -1)
        return self._with_new_scol((col <= F.lag(col, 1).over(window)) & col.isNotNull()).all()
Example #22
    def _is_monotonic_decreasing(self):
        scol = self.spark.column
        window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)
        prev = F.lag(scol, 1).over(window)

        cond = F.lit(True)
        has_not_null = F.lit(True)
        for field in self.spark.data_type[::-1]:
            left = scol.getField(field.name)
            right = prev.getField(field.name)
            compare = MultiIndex._comparator_for_monotonic_decreasing(
                field.dataType)
            # Since pandas 1.1.4, null value is not allowed at any levels of MultiIndex.
            # Therefore, we should check `has_not_null` over the all levels.
            has_not_null = has_not_null & left.isNotNull()
            cond = F.when(left.eqNullSafe(right), cond).otherwise(
                compare(left, right, spark.Column.__lt__))

        cond = has_not_null & (prev.isNull() | cond)

        cond_name = verify_temp_column_name(
            self._internal.spark_frame.select(
                self._internal.index_spark_columns),
            "__is_monotonic_decreasing_cond__",
        )

        sdf = self._internal.spark_frame.select(
            self._internal.index_spark_columns + [cond.alias(cond_name)])

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, col)
                for col in self._internal.index_spark_column_names
            ],
            index_names=self._internal.index_names,
            index_dtypes=self._internal.index_dtypes,
        )

        return first_series(DataFrame(internal))
Example #23
def aggregation(dataframe, aggregation_function, window_size, step_size):
    grouping_functions = ["avg", "sum", "min", "max"]
    if aggregation_function in grouping_functions:
        if "timestamp" not in dataframe.columns:
            raise ValueError("timestamp column not found!")
        w = Window.orderBy('timestamp').rowsBetween(0, int(window_size))
        signals = dataframe.columns.copy()
        signals.remove('timestamp')
        method = getattr(pyspark.sql.functions, aggregation_function)
        dataframe = dataframe.select(
            'timestamp', *(method(column).over(w) for column in signals),
            pyspark.sql.functions.monotonically_increasing_id().alias("idx"))
    elif aggregation_function == "first":
        dataframe = dataframe.withColumn(
            "idx", pyspark.sql.functions.monotonically_increasing_id())
    else:
        raise ValueError(
            "aggregation_function should be one of avg, sum. min, max or first"
        )
    dataframe = dataframe.filter(dataframe.idx % int(step_size) == 0)

    return dataframe.drop("idx")
Example #24
    def __init__(self,
                 spark=None,
                 code=None,
                 lim=None,
                 dataset_type='full',
                 sample=True,
                 balanced=False):
        """
        Defines the arguments to be used in the data loader class: 
        code: the census type of data you are using, {"sic", "soc"}
        lim: integer limit of dataframe when loading it from Hue. 
        sample: data taken from the 1% sample
            
        """
        self.code = code

        if not sample:
            if self.code == 'sic':
                source_string = "SELECT * FROM 2011_census_identified.cen_ident_2011_stage_0_writein_industry_std"
            elif self.code == 'soc':
                #                 source_string = "SELECT * FROM 2011_census_identified.cen_ident_2011_stage_0_writein_occupation_std where occ_job_title is not null and occupation_code is not null"
                source_string = "SELECT * FROM 2011_census_identified.cen_ident_2011_stage_0_writein_occupation_std"
            data_source = 'sql'
            return_as_pandas = False
        elif self.code in ['sic', 'soc']:
            source_string = get_file_path(self.code, dataset_type, balanced)
            data_source = 'file'
            return_as_pandas = True
        else:
            raise ValueError(
                "Code is not recognised. Please use either sic or soc")
        self.df = load_data(source_string, spark, data_source, lim,
                            return_as_pandas)
        if not sample:
            # add index
            self.df = self.df.withColumn(
                "id",
                F.row_number().over(
                    Window.orderBy(F.monotonically_increasing_id())) - 1)
Example #25
def main():
    conf = SparkConf().setAppName("SearchECPMStat")
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
    sc.setLogLevel("WARN")

    # data clean
    print('ECPMStat Start get_order_table')
    orderTable = get_order_table(sc, sqlContext)
    print('ECPMStat Finish get_order_table')

    print('ECPMStat Start get_sensor_table')
    sensorTable = get_sensor_table(sc, sqlContext)
    print('ECPMStat Finish get_sensor_table')

    print('ECPMStat Start calculate ecpm value')
    cond = [
        orderTable['sell_label'] == sensorTable['search_word'],
        orderTable['productId'] == sensorTable['product_id']
    ]
    table = orderTable.join(sensorTable, cond, 'inner').\
                       withColumn('exposure_ecpm', orderTable.sales_amount / sensorTable.exposure_count * 100.0).\
                       withColumn('click_ecpm', orderTable.sales_amount / sensorTable.click_count * 100.0).\
                       select('search_word', 'product_id', 'exposure_ecpm', 'click_ecpm')
    # result limit
    table = table.withColumn(
        'row_number',
        row_number().over(
            Window.partitionBy("search_word").orderBy(desc("exposure_ecpm"))))
    table = table.filter(table['row_number'] <= CONFIG['product_list_limit_per_query']).\
                  drop('row_number')
    if CONFIG['do_save_table']:
        table.write.saveAsTable('recommend.ecpm_final' +
                                CONFIG['table_suffix'],
                                mode='overwrite')
    print('ECPMStat Finish calculate ecpm value')

    print('ECPMStat finished!############################################')
    sc.stop()
Example #26
def create_city_dimention(spark, cities_fact_df, temprature_df):
    try:
        # Select distinct cities
        cities_dims_df = cities_fact_df.select("City", "State",
                                               "State Code").distinct()

        # Assign unique id to each city as city_code
        cities_window = Window.orderBy(f.col("City"), f.col("State"))
        cities_dims_df = cities_dims_df.withColumn(
            "city_code",
            f.row_number().over(cities_window)).withColumnRenamed(
                "State Code", "state_code")

        # Joining with temperature DF to get Lat, Long & Temp
        cities_dims_df = cities_dims_df.join(temprature_df,
                                             on=['City'],
                                             how='left')

        return cities_dims_df

    except Exception as e:
        print(str(e))
Example #27
def SupplementTags2(spark, TagSynMapping, TagName):
    # http://ec2-44-235-91-5.us-west-2.compute.amazonaws.com:8888/notebooks/findspark.py.ipynb
    TagSynMapping = TagSynMapping.select(
        col("key").alias("TagName"),
        col("value").alias("id"))
    TagName = TagName.drop('Count').withColumn("id",
                                               lit(None).cast(
                                                   StringType())).select('*')
    # Concatenate the two columns
    result = TagSynMapping.union(TagName)
    w = Window.partitionBy('TagName')
    result = result.select('TagName', 'id',
                           count('TagName').over(w).alias('n'))
    # Get rid of duplicates
    result = result.dropDuplicates(subset=['TagName'])
    # Generate a mapping from the existing id to a new sequential id
    mapping = result.groupBy(result.id).count().select(
        'id',
        col('count').alias('n')).withColumn(
            "id2", monotonically_increasing_id()).select('*')

    #mapping.show()
    def working_fun2(mapping):
        def f(x):
            return mapping.value.get(x)

        return udf(f)

    mapping = {entry[0]: entry[2] for entry in mapping.collect()}

    b = spark.sparkContext.broadcast(mapping)
    result = result.withColumn(
        'new_id',
        when(col('id').isNotNull(),
             working_fun2(b)(col('id'))).otherwise(
                 monotonically_increasing_id()))
    result = result.drop('id', 'n')
    #result.show()
    return result
Example #28
    def stratify_data(self, dataset):
        nfolds = self.numFolds
        df = dataset.withColumn("id", monotonically_increasing_id())
        windowval = (Window.partitionBy(
            self.labelCol).orderBy('id').rangeBetween(
                Window.unboundedPreceding, 0))
        stratified_data = df.withColumn('cum_sum',
                                        f.sum(f.lit(1)).over(windowval))
        stratified_data = stratified_data.withColumn("bucket_fold",
                                                     f.col("cum_sum") % nfolds)

        if self.stratify_summary:
            self.stratify_summary = stratified_data.withColumn("bucket_fold",f.concat(f.lit("fold_"),f.col("bucket_fold") + 1)).\
                                                  groupby(self.labelCol).\
                                                  pivot("bucket_fold").\
                                                  agg(f.count("id"))
        else:
            self.stratify_summary = "To create summary rerun with stratify_summary = True"

        stratified_data = stratified_data.drop(*["id", "cum_sum"])

        return stratified_data
Example #29
def transformData(prices):
    # Window to get last trading day's metrics for each stock
    w = Window.partitionBy('Symbol').orderBy('Date')

    # Compute daily stock price changes
    returns = prices.withColumn('Return', (F.col('Adj Close')/F.col('Open')).astype('decimal(10,4)'))\
      .withColumn('Spread', (F.col('High')/F.col('Low')).astype('decimal(10,5)'))\
      .na.fill({'Return': 0.0, 'Spread': 0.0})\
      .select('Date','Symbol','Sector','Industry','Return','Spread','Volume')\
      .withColumn('yReturn', F.lag('Return').over(w))\
      .withColumn('ySpread', F.lag('Spread').over(w))\
      .withColumn('yVolume', F.lag('Volume').over(w))\
      .na.fill({'yReturn': 0.0, 'ySpread': 0.0, 'yVolume': 0})

    # Filter out extraordinary daily returns or trading volumes (noise)
    returns = returns.filter(returns.Return.between(0.5,1.5) &
                         returns.yReturn.between(0.5,1.5) &
                         returns.Spread.between(0.5,1.5) &
                         returns.ySpread.between(0.5,1.5))\
      .filter('Volume>0 and yVolume>0')

    return returns
Example #30
    def etl_load_trade(self):
        logging.debug("Inside transform parking occupancy dataset module")

        trade_common = self.spark.read.\
                       parquet(self._load_path+"/partition=T/*.parquet")

        trade = trade_common.select("trade_dt", "symbol", "exchange",\
                             "event_tm","event_seq_nb", "arrival_tm", "trade_pr")




        trade_corrected=trade.withColumn("row_number",F.row_number().over(Window.partitionBy(trade.trade_dt,\
                        trade.symbol,trade.exchange,trade.event_tm,trade.event_seq_nb) \
                        .orderBy(trade.arrival_tm.desc()))).filter(F.col("row_number")==1).drop("row_number")

        trade_corrected.show(3, truncate=False)

        logging.debug(
            "Writting transformed trade dataframe to a trade partition")
        trade_corrected.withColumn("trade_date", F.col("trade_dt")).write.\
                       partitionBy("trade_dt").mode("overwrite").parquet(self._save_path+"/trade/")
Example #31
def get_relevant_items_by_threshold(
    dataframe,
    col_user="******",
    col_item="itemID",
    col_rating="rating",
    threshold=3.5,
):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those having ratings above a certain threshold.
    The threshold is defined as a statistical measure of the ratings for a
    user, e.g., median.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        threshold: threshold for determining the relevant recommended items.
        This is used for the case that predicted ratings follow a known
        distribution.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant
            items.
    """
    items_for_user = (
        dataframe
        .orderBy(col_rating, ascending=False)
        .where(col_rating + " >= " + str(threshold))
        .select(
            col_user, col_item, col_rating
        )
        .withColumn("prediction", F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, "prediction")
        .dropDuplicates()
    )

    return items_for_user
Example #32
    def back_fill_dataframe(df, partition_col, columns_to_fill):
        df = df.withColumn('Dummy_ID', F.monotonically_increasing_id())
        # backfilled_df = df.withColumn('Dummy_ID', F.monotonically_increasing_id())
        backfilled_final_df = df.select('Dummy_ID', 'PropHash')
        for column_to_fill in columns_to_fill:
            df_a = df.select('Dummy_ID', 'PropHash', column_to_fill)
            df_b = df.select('Dummy_ID', 'PropHash', column_to_fill)
            backfilled_df = df_a.crossJoin(df_b).where((df_a[partition_col] >= df_b[partition_col]) &
                                                       (df_a[column_to_fill].isNotNull() |
                                                        df_b[column_to_fill].isNotNull()))
            #for c in backfilled_df.columns:
            print('Column is {}'.format(column_to_fill))
            back_fill_window = Window.partitionBy(df_a[partition_col]).orderBy(df_b[partition_col])
            backfilled_df = backfilled_df.withColumn('row_num', F.row_number().over(back_fill_window))
            backfilled_df = backfilled_df.filter(F.col('row_num') == 1)

            backfilled_df = backfilled_df.select(df_a.Dummy_ID, df_a.PropHash,
                                                 F.coalesce(df_a[column_to_fill], df_b[column_to_fill]).alias(column_to_fill))
            backfilled_final_df = backfilled_final_df.join(backfilled_df,
                                                           ['Dummy_ID', 'PropHash'],
                                                           'left')
        return backfilled_final_df
Example #33
def assign_group(model_dict, df, d, strata_cols, colname):
    '''given (1) a dictionary of ranges,
    (2) a DF with random values ranked 
    by random block, and 
    (3) a name for the grouped columns,
    return DF with a new column that 
    assigns group membership'''
    window = Window.orderBy('dataset_rnd')\
                   .partitionBy(*model_dict['strata_cols'])
    df = df.withColumn('dataset_rk', F.percent_rank().over(window))
    for i, (k, v) in enumerate(d.items()):
        ## if the bottom is 0, make it -1 to include 0
        min_val = -1 if v[0] == 0 else v[0]
        if i == 0:
            group_assign_cond = F.when((col('dataset_rk') > min_val)
                                       & (col('dataset_rk') <= v[1]), F.lit(k))
        else:
            group_assign_cond = group_assign_cond.when(
                (col('dataset_rk') > min_val)
                & (col('dataset_rk') <= v[1]), F.lit(k))

    return df.withColumn(colname, group_assign_cond)
Example #34
def window_sample():
    def make_keywords_count_kv(r):
        for keyword in r.keywords:
            yield (r.region, keyword), r.twi

    def format_db((region, records)):
        out = [{'keyword': r.keyword, 'score': r.score} for r in records]
        return region, 'keywords', json.dumps(out)

    sc = SparkContext()
    sql_ctx = SQLContext(sc)

    scheme = StructType([
        StructField('region', StringType()),
        StructField('keywords', ArrayType(StringType())),
        StructField('twi', IntegerType()),
    ])

    rdd = sc.parallelize(
        [('bj', ['ni'], 10), ('bj', ['ni'], 10), ('sh', ['ni'], 20), ('sh', ['ni'], 30), ('sh', ['wo'], 70)])
    df = sql_ctx.createDataFrame(rdd, scheme)

    window = Window.partitionBy('region').orderBy(functions.desc('score'))

    df = df.rdd \
        .flatMap(make_keywords_count_kv) \
        .reduceByKey(add) \
        .map(lambda (k, v): list(k) + [v]) \
        .toDF(['region', 'keyword', 'score']) \
        .withColumn('rank', functions.row_number().over(window)) \
        .where('rank<=30') \
        .coalesce(500) \
        .rdd \
        .keyBy(lambda r: r.region) \
        .groupByKey() \
        .map(format_db).toDF(['region', 'keywords', 'scores'])

    for item in df.toJSON().take(10):
        print(item)
Example #35
def compile_window_op(t, expr, scope, **kwargs):
    op = expr.op()
    window = op.window
    operand = op.expr

    group_by = window._group_by
    grouping_keys = [
        key_op.name if isinstance(key_op, ops.TableColumn) else
        compile_with_scope(t, key, scope) for key, key_op in zip(
            group_by, map(operator.methodcaller('op'), group_by))
    ]

    order_by = window._order_by
    ordering_keys = [
        key.to_expr().get_name()
        for key in map(operator.methodcaller('op'), order_by)
    ]

    pyspark_window = Window.partitionBy(grouping_keys).orderBy(ordering_keys)
    result = t.translate(operand, scope, window=pyspark_window)

    return result
Example #36
def history_report(year_month):
    sc = SparkContext()
    spark = SparkSession.builder\
                        .appName("history_report")\
                        .getOrCreate()

    history_report = spark.read.parquet(
        f"hdfs://teb101-1.cloudera.com//user/admin/twstock/analysis/daily_report/{year_month}*.parquet"
    )
    history_report.createOrReplaceTempView("history_report")

    sql = "SELECT sID,bID,Date,total_buy,total_sell,\
                  (total_buy*avg_buy_price) as cost,(total_sell*avg_sell_price) as recieve,\
                  storage,close\
           FROM history_report\
           ORDER BY sID,bID,Date"

    ## Using Window to cumsum data
    window = Window.orderBy("Date").partitionBy("sID", "bID").rangeBetween(
        Window.unboundedPreceding, 0)
    history_report = spark.sql(sql).withColumn('cumsum_storage', sum('storage').over(window))\
                                   .withColumn('cumsum_cost', (sum('cost')).over(window))\
                                   .withColumn('cumsum_buy', (sum("total_buy")).over(window))\
                                   .withColumn('cumsum_recieve', (sum('recieve')).over(window))\
                                   .withColumn('cumsum_sell', (sum('total_sell')).over(window))

    # alias column name
    history_report = history_report.select("sID","bID","Date","cumsum_cost","cumsum_buy","cumsum_recieve","cumsum_sell",\
                                           (col("cumsum_cost") / col("cumsum_buy")).alias("avg_buy_price"),\
                                           (col("cumsum_recieve") / col("cumsum_sell")).alias("avg_sell_price"),\
                                           "close","cumsum_storage")
    # changing Type of data
    history_report = history_report.withColumn("avg_buy_price",col("avg_buy_price").cast(FloatType()))\
                                   .withColumn("avg_sell_price",col("avg_sell_price").cast(FloatType()))\
                                   .withColumn("cumsum_storage",col("cumsum_storage").cast(FloatType()))

    history_report.write.mode("append").parquet(
        f"/user/admin/twstock/analysis/history/")
Example #37
def extract_road_segments_df(spark, use_cache=True):
    cache = workdir + "data/road-network.parquet"
    if os.path.isdir(cache) and use_cache:
        print("Skip extraction of road network dataframe: already done,"
              " reading from file")
        return spark.read.parquet(cache)

    print("Extracting road network dataframe...")
    cols = [
        "street_name",
        "street_type",
        "center_long",
        "center_lat",
        "coord_long",
        "coord_lat",
        "nid",
    ]

    road_seg_df = get_road_segments_RDD(spark).flatMap(kml_extract_RDD).toDF(
        cols)

    # Some specific road segments have the same nid
    w = Window.partitionBy("nid").orderBy("center_lat")
    street_ids = (road_seg_df.select(
        "nid", "center_lat", "center_long").distinct().select(
            "center_lat",
            "center_long",
            concat("nid",
                   row_number().over(w)).alias("street_id"),
        ))

    road_seg_df = road_seg_df.join(street_ids,
                                   ["center_lat", "center_long"]).drop("nid")

    if use_cache:
        road_seg_df.write.parquet(cache)
    print("Extracting road network dataframe done")
    return road_seg_df
Example #38
	|   3|  a|
	|   1|  b|
	|   2|  b|
	|   3|  b|
	|   1|  c|
	|   3|  c|
	+----+---+
"""

schema = StructType([StructField("attr", StringType(), True), StructField("obj", StringType(), True)])

aoDF = sqlCtx.createDataFrame(aoPair, schema)

#Window over rows of the same obj, ordered by attr

window = Window.orderBy("attr").partitionBy("obj")

## Prev column contains previous attr of the same object
"""
	Transformed Table	
	+----+---+----+
	|attr|obj|prev|
	+----+---+----+
	|   1|  a|null|
	|   2|  a|   1|
	|   3|  a|   2|
	|   1|  b|null|
	|   2|  b|   1|
	|   3|  b|   2|
	|   1|  c|null|
	|   3|  c|   1|
	+----+---+----+
"""
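# The snippet appears to be cut off here; the step that actually builds the prev column was
# presumably a lag over the window defined above. A sketch along those lines (not the original code):
from pyspark.sql.functions import lag

prevDF = aoDF.withColumn("prev", lag("attr").over(window))
prevDF.show()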
Exemplo n.º 39
0
def spark_chrono_split(
    data,
    ratio=0.75,
    min_rating=1,
    filter_by="user",
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Spark chronological splitter
    This function splits data in a chronological manner. That is, for each user / item, the
    split function takes the proportions of ratings specified by the split ratio(s).
    The split is stratified.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number,
            the data is split into two sets and the ratio indicates the proportion of
            the training set; if it is a list of floats, the data is split into several
            portions corresponding to the split ratios. If a list is provided and the
            ratios do not sum to 1, they are normalized.
        min_rating (int): minimum number of ratings for user or item.
        filter_by (str): either "user" or "item", depending on which of the two is to filter
            with min_rating.
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    if not (filter_by == "user" or filter_by == "item"):
        raise ValueError("filter_by should be either 'user' or 'item'.")

    if min_rating < 1:
        raise ValueError("min_rating should be integer and larger than or equal to 1.")

    multi_split, ratio = process_split_ratio(ratio)

    split_by_column = col_user if filter_by == "user" else col_item

    if min_rating > 1:
        data = min_rating_filter_spark(
            data,
            min_rating=min_rating,
            filter_by=filter_by,
            col_user=col_user,
            col_item=col_item,
        )

    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.partitionBy(split_by_column).orderBy(col(col_timestamp))

    rating_grouped = (
        data.groupBy(split_by_column)
        .agg({col_timestamp: "count"})
        .withColumnRenamed("count(" + col_timestamp + ")", "count")
    )
    rating_all = data.join(broadcast(rating_grouped), on=split_by_column)

    rating_rank = rating_all.withColumn(
        "rank", row_number().over(window_spec) / col("count")
    )
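    # "rank" is the position of each interaction within its user's (or item's) timeline,
    # normalized to (0, 1], so comparing it against the cumulative ratios below yields the
    # chronological splits.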

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i])
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            )

        splits.append(rating_split)

    return splits
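A hedged usage sketch (ratings_df is a hypothetical Spark DataFrame using the default
user/item/timestamp column names):

# Two-way split: the earliest ~75% of each user's interactions go to training.
train, test = spark_chrono_split(ratings_df, ratio=0.75)

# Multi-way split; ratios are normalized if they do not sum to 1.
train, valid, test = spark_chrono_split(ratings_df, ratio=[0.7, 0.2, 0.1], filter_by="user")

Note that, as written, each returned split still carries the helper count and rank columns
introduced by the windowed ranking.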
Exemplo n.º 40
0
    def processData(sc,hc,fs,con,incFileName,inThresh,outThresh,progTag):
        #incFileName="hdfs://namenode.navroomhdp.com:8020/data/t_cf_inc/100033/t_cf_20161028.txt"
        
        #inThresh=10
        #outThresh=300
        
        #**************************************
        #
        # This procedure uses the inc file to calculate:
        #   flow              (one record per day)    stored as a file
        #   indoor            (one record per day)    stored as a file
        #   indoor_for_delete (every indoor record)   stored in HBase
        #   indoor detail     (every indoor record)   stored in HBase and as a file
        #
        # destIndoorFile     : /data/indoor/entityid/year/id_date.json      used to generate reports
        # destFlowFile       : /data/flow/entityid/year/fl_date.json        used to generate reports
        # rec_destIndoorfile : /data/rec_indoor/entityid/year/id_date.json  this folder mirrors the HBase records
        #
        #**************************************

        destIndoorFile=get_str_indoorFileName(incFileName)
        #hdfs://namenode.navroomhdp.com:8020/data/indoor/100033/2016/id_20161028.txt
        rec_destIndoorfile=destIndoorFile.replace("/indoor/","/rec_indoor/")
        #hdfs://namenode.navroomhdp.com:8020/data/rec_indoor/101762/2016/id_20161011.txt
        destFlowFile  =destIndoorFile.replace("/indoor/","/flow/").replace("id_","fl_")
        #hdfs://namenode.navroomhdp.com:8020/data/flow/101762/2016/fl_20161011.txt
        tmp_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/indoor"+str(progTag)
        tmp_destFlowFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/flow"+str(progTag)
        tmp_rec_destIndoorFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/rec_indoor"+str(progTag)
        EntityID=int(get_str_entityID(incFileName))
        #101762
        histFileName=get_str_histFileName(incFileName) #the processed file will be placed here
        #hdfs://namenode.navroomhdp.com:8020/data/t_cf/101762/t_cf_20161011.txt
        if fs.exists(sc._jvm.Path(histFileName)):
            tmpFileName=get_str_tmpFileName(histFileName)
            #tmpFileName = hdfs://namenode.navroomhdp.com:8020/data/tmp/101762/t_cf_20161011.txt
            tmpFolderName=tmpFileName.rsplit('/',1)[0]+"tmp"
            #tmpFolderName=hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp
            #copy the hist file to the temp folder as hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp/hist and delete the original hist file
            sc._jvm.FileUtil.copy(fs,sc._jvm.Path(histFileName),fs,sc._jvm.Path(tmpFolderName+"/hist"),True,True,con) 
            #copy the inc file to the temp folder as hdfs://namenode.navroomhdp.com:8020/data/tmp/101762tmp/inc and delete the original inc file
            sc._jvm.FileUtil.copy(fs,sc._jvm.Path(incFileName),fs,sc._jvm.Path(tmpFolderName+"/inc"),True,True,con)
            #copyMerge the two files (inc and hist) into one file
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmpFolderName),fs,sc._jvm.Path(tmpFileName),True,con,None)
            sc._jvm.FileUtil.copy(fs,sc._jvm.Path(tmpFileName),fs,sc._jvm.Path(incFileName),True,True,con)

        unixFirtDayofMonth = get_int_firstDayUnixDate(incFileName)
        # unixFirtDayofMonth = 1475251200, i.e. 2016-10-01 as a unix date
        startUnixTime=get_int_fileNameUnixDate(incFileName) #e.g. 1456808400, today's unix timestamp
 
        rows_t_cf=sc.textFile(incFileName).map(lambda r: r.split(",")).map(lambda p: Row(clientmac=p[0], stime=p[1],flag=p[2]))
        HiveContext.createDataFrame(hc,rows_t_cf).registerTempTable("t_cf_inc_tmp")
        hc.sql("select distinct clientmac,stime,flag from t_cf_inc_tmp").registerTempTable("t_cf")
        
        df=hc.sql("select distinct ClientMac,stime ,lag(stime) over (partition by ClientMac order by stime) as lag_time ,lead(stime) over (partition by ClientMac order by stime) as lead_time from t_cf where flag=1")
        df1=df.withColumn("diff" , df["stime"]-df["lag_time"]).na.fill(-1)
        df1.filter((df1.diff>=outThresh)|(df1.lag_time ==-1)|( df1.lead_time==-1)).registerTempTable("df2")
        df2=hc.sql("select ClientMac,stime,lag_time,lead_time,case when (diff < "+ str(outThresh) +" and diff>0) then diff ELSE 0 end as diff from df2")
        df3=df2.withColumn("lag_time1",df2.lag_time+df2.diff).drop( "lag_time")
        df3.withColumn("lag_time2",func.lead("lag_time1").over(Window.partitionBy("clientMac"))).registerTempTable("df3")
        
        df4=hc.sql("select ClientMac,cast(stime as int) as ETime ,cast(lag_time2 as int) as LTime,cast((lag_time2- stime) as int) as Seconds from df3").na.fill(-1)
        df5=df4.filter((df4.LTime>0)&(df4.Seconds>=inThresh)&(df4.ETime>startUnixTime)&(df4.ETime<(startUnixTime+86400))).withColumn("ENTITYID",lit(EntityID)) #86400 is the number of seconds in one day
        df5.registerTempTable("df5")
        #df5 will be saved to HBase as indoor details (rec_destIndoorfile); df6 and df7 will be used for the stats calculation
        
        df6=hc.sql("select ClientMac,ETime, LTime, Seconds ,unix_timestamp(date_sub(from_unixtime(etime),0),'yyyy-MM-dd') as utoday from df5")
        df6.registerTempTable("df6_indoor")
        df7=hc.sql("select ClientMac,min(etime) as etime,max(ltime) as ltime,sum(Seconds) as seconds,utoday from df6_indoor group by ClientMac,utoday")
        df_current_result=df7.withColumn("ENTITYID",lit(EntityID)).withColumn('UFIRSTDAY',lit(unixFirtDayofMonth))

        flow_sql=  "select ClientMac,min(stime) as etime,max(stime) as ltime from t_cf where stime >"+str(startUnixTime) + " and stime <"+str(startUnixTime+86400)+" group by clientmac"
        hc.sql(flow_sql).registerTempTable("df_flow_tmp")
        df_flow=hc.sql("select ClientMac,etime,ltime,unix_timestamp(date_sub(from_unixtime(etime),0),'yyyy-MM-dd') as utoday from df_flow_tmp").withColumn("ENTITYID",lit(EntityID)).withColumn('UFIRSTDAY',lit(unixFirtDayofMonth))
       
        #df_flow.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_FLOW_TODAY") .option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
        #df_flow.saveAsTable("T_FLOW")
        if len(df5.head(1))==1:  #df5 is not empty (cheaper than df5.rdd.isEmpty())
            tmp_rec_destIndoorFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/rec_indoor"+str(progTag)
            df5.select('clientmac','entityid','etime','ltime','seconds').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_rec_destIndoorFolder)            
            #df5.write.mode('overwrite').json(tmp_rec_destIndoorFolder)
            df5.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_INDOOR") .option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
            if fs.exists(sc._jvm.Path(rec_destIndoorfile)):  #the old indoor folder exists; generate df_delete_pk so Phoenix can delete invalid rows
                rows_rec_indoor=sc.textFile(rec_destIndoorfile).map(lambda r: r.split(",")).map(lambda p: Row(clientmac=str(p[0]), entityid=int(p[1]),etime=int(p[2]),ltime=int(p[3]),seconds=int(p[4])))
                HiveContext.createDataFrame(hc,rows_rec_indoor).registerTempTable("df_old_indoor")
                df_old_indoor_pk=hc.sql("select ClientMac,ENTITYID,ETime from df_old_indoor") 
                df_current_result_pk=hc.sql("select ClientMac,ENTITYID,ETime from df5") 
                df_delete_pk = df_old_indoor_pk.subtract(df_current_result_pk)
                if len(df_delete_pk.head(1))==1:
                    df_delete_pk.write.format("org.apache.phoenix.spark").mode("overwrite").option("table", "T_INDOOR_FOR_DELETE").option("zkUrl", "namenode.navroomhdp.com:2181:/hbase-unsecure").save()
        else:
            tmp_rec_destIndoorFolder="NONE"
            
        if len(df_flow.head(1))==1:
            tmp_destFlowFolder   = "hdfs://namenode.navroomhdp.com:8020/data/tmp/flow"+str(progTag)
            df_flow.select('clientmac','entityid','etime','ltime','utoday','ufirstday').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_destFlowFolder)
            #df_flow.write.mode('overwrite').json(tmp_destFlowFolder)
        else:
            tmp_destFlowFolder="NONE"
            
        if len(df_current_result.head(1))==1:
            tmp_destIndoorFolder = "hdfs://namenode.navroomhdp.com:8020/data/tmp/indoor"+str(progTag)
            df_current_result.select('clientmac','entityid','etime','ltime','seconds','utoday','ufirstday').write.mode('overwrite').format('com.databricks.spark.csv').options(header='false').save(tmp_destIndoorFolder)
            #df_current_result.write.mode('overwrite').json(tmp_destIndoorFolder)
        else:
            tmp_destIndoorFolder="NONE"
        
        sc._jvm.FileUtil.copy(fs,sc._jvm.Path(incFileName),fs,sc._jvm.Path(histFileName),True,True,con) 

        if fs.exists(sc._jvm.Path(destIndoorFile)):
            fs.delete(sc._jvm.Path(destIndoorFile))
        if fs.exists(sc._jvm.Path(destFlowFile)):
            fs.delete(sc._jvm.Path(destFlowFile))
        if fs.exists(sc._jvm.Path(rec_destIndoorfile)):
            fs.delete(sc._jvm.Path(rec_destIndoorfile))        
        #deleting first is required if the file already exists, otherwise copyMerge will fail
        
        if tmp_destIndoorFolder!="NONE":
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_destIndoorFolder),fs,sc._jvm.Path(destIndoorFile),True,con,None)
            #destIndoorFile=get_str_indoorFileName(incFileName)
        if tmp_destFlowFolder!="NONE":
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_destFlowFolder),fs,sc._jvm.Path(destFlowFile),True,con,None)
        if tmp_rec_destIndoorFolder!="NONE":
            sc._jvm.FileUtil.copyMerge(fs, sc._jvm.Path(tmp_rec_destIndoorFolder),fs,sc._jvm.Path(rec_destIndoorfile),True,con,None)