Example #1
    def test_collect_functions(self):
        df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        from pyspark.sql import functions

        self.assertEqual(
            sorted(df.select(functions.collect_set(df.key).alias('r')).collect()[0].r),
            [1, 2])
        self.assertEqual(
            sorted(df.select(functions.collect_list(df.key).alias('r')).collect()[0].r),
            [1, 1, 1, 2])
        self.assertEqual(
            sorted(df.select(functions.collect_set(df.value).alias('r')).collect()[0].r),
            ["1", "2"])
        self.assertEqual(
            sorted(df.select(functions.collect_list(df.value).alias('r')).collect()[0].r),
            ["1", "2", "2", "2"])
Example #2
def runAggregateFunctions(spark, df1, df2):
    # collect_list, collect_set
    doubledDf1 = df1.union(df1)
    doubledDf1.select(functions.collect_list(doubledDf1["name"])).show(truncate=False)
    doubledDf1.select(functions.collect_set(doubledDf1["name"])).show(truncate=False)

    # count, countDistinct
    doubledDf1.select(functions.count(doubledDf1["name"]), functions.countDistinct(doubledDf1["name"])).show(
        truncate=False)

    # sum
    df2.printSchema()
    df2.select(functions.sum(df2["price"])).show(truncate=False)

    # grouping, grouping_id
    df2.cube(df2["store"], df2["product"]).agg(functions.sum(df2["amount"]), functions.grouping(df2["store"])).show(truncate=False)
    df2.cube(df2["store"], df2["product"]).agg(functions.sum(df2["amount"]), functions.grouping_id(df2["store"], df2["product"])).show(
        truncate=False)
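A minimal driver sketch for the function above (not part of the original source); the column names `name`, `store`, `product`, `amount`, and `price` are assumptions inferred from how `df1` and `df2` are used:

# Hypothetical usage sketch for runAggregateFunctions; all column names are assumed.
from pyspark.sql import SparkSession, functions

spark = SparkSession.builder.master("local[*]").getOrCreate()
df1 = spark.createDataFrame([("alice",), ("bob",)], ["name"])
df2 = spark.createDataFrame(
    [("store1", "apple", 3, 1000), ("store1", "pear", 5, 2000), ("store2", "apple", 2, 1500)],
    ["store", "product", "amount", "price"])
runAggregateFunctions(spark, df1, df2)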
Example #3
def _get_top_k_items(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
        k=DEFAULT_K
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.
    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
        customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # this does not work for rating of the same value.
    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )

    return items_for_user
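The comment above notes that row_number() breaks ties between equal ratings arbitrarily. One possible workaround (a sketch, not part of the original code) is to add the item id as a secondary sort key so the cut-off is deterministic:

from pyspark.sql import functions as F
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window


def get_top_k_items_deterministic(dataframe, col_user, col_item, col_rating, col_prediction, k):
    # Same logic as above, but ties on the rating are broken by the item id,
    # so repeated runs return the same top-k list per user.
    window_spec = Window.partitionBy(col_user).orderBy(
        col(col_rating).desc(), col(col_item).asc())
    return (
        dataframe.select(
            col_user, col_item, col_rating,
            row_number().over(window_spec).alias("rank"))
        .where(col("rank") <= k)
        .groupby(col_user)
        .agg(F.collect_list(col_item).alias(col_prediction))
    )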
Example #4
def _get_relevant_items_by_threshold(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
        threshold=DEFAULT_THRESHOLD
):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those having ratings above certain threshold.
    The threshold is defined as a statistical measure of the ratings for a
    user, e.g., median.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known
            distribution.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant
            items.
    """
    items_for_user = (
        dataframe
        .orderBy(col_rating, ascending=False)
        .where(col_rating + " >= " + str(threshold))
        .select(
            col_user, col_item, col_rating
        )
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates()
    )

    return items_for_user
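The docstring mentions using a per-user statistic such as the median as the threshold. A hedged sketch of how such a threshold could be computed (assumed column names, not part of the original module):

from pyspark.sql import functions as F


def get_user_median_rating(dataframe, col_user="userID", col_rating="rating"):
    # Per-user median rating via percentile_approx; the result can be joined back
    # to the ratings and used as the `threshold` applied above.
    return dataframe.groupBy(col_user).agg(
        F.expr("percentile_approx({}, 0.5)".format(col_rating)).alias("rating_median"))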
Example #5
def _get_relevant_items_by_timestamp(
        dataframe,
        col_user=DEFAULT_USER_COL,
        col_item=DEFAULT_ITEM_COL,
        col_rating=DEFAULT_RATING_COL,
        col_timestamp=DEFAULT_TIMESTAMP_COL,
        col_prediction=DEFAULT_PREDICTION_COL,
        k=DEFAULT_K
):
    """Get relevant items for each customer defined by timestamp.

    Relevant items are defined as the k items that appear most recently
    according to their timestamps.

    Args:
        dataframe (spark.DataFrame): A Spark DataFrame of customerID-itemID-rating-timeStamp
            tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_timestamp (str): column name for timestamp.
        col_prediction (str): column name for prediction.
        k: number of relevant items to be filtered by the function.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant items.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_timestamp).desc())

    items_for_user = (
        dataframe.select(
            col_user, col_item, col_rating, row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn(col_prediction, F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, col_prediction)
        .dropDuplicates([col_user, col_prediction])
    )

    return items_for_user
Example #6
    def loss(self, period, agg_method, loss_method):
        end_date = self.raw_data.select(
            F.date_format(
                F.max(self.ds),
                self.ds_format)).rdd.map(lambda x: list(x)).first()[0]
        end_date_be = (parse(end_date) -
                       datetime.timedelta(period)).strftime('%Y-%m-%d')
        calculate_loss = self.raw_data.filter(F.col(self.ds) < end_date_be)
        loss_result_rdd = calculate_loss.map(lambda x: ((x[0], x[1]), (x[2], x[3]))).groupByKey(). \
            map(lambda x: _run(x, period))
        pre_loss_sp = loss_result_rdd.toDF().toDF(self.key, 'ts_type',
                                                  self.target)
        check_loss_sp = self.raw_data.where(
            ''' {0} >= '{1}' and {0} < '{2}' '''.format(
                'dt', end_date_be, end_date))

        date_list = get_date_range(end_date_be, period)
        date_sp = spark.createDataFrame(map(lambda x: [x], date_list), ['dt'])

        main_sp = pre_loss_sp.select('sku').drop_duplicates().crossJoin(
            date_sp)
        pre_sp = split_pre_data(pre_loss_sp, date_list, ['sku', 'dt', 'pre'])
        main_sp = main_sp.join(check_loss_sp, on=['sku', 'dt'],
                               how='left').join(pre_sp,
                                                on=['sku', 'dt'],
                                                how='left').na.fill(0)

        main_sp = main_sp.join(pre_sp, on=['sku']).join(check_loss_sp,
                                                        on=['sku', 'dt'],
                                                        how='left')

        loss_sp = main_sp.groupBy('sku').agg(
            F.udf(agg_cal_loss)(F.collect_list(
                F.struct(F.col('dt'), F.col('sale'), F.col('pre'))),
                                F.lit(agg_method),
                                F.lit(loss_method)).alias('loss'))
        return loss_sp
Example #7
 def daily_compute(self, busi_date):
     """
     """
     spark = self.spark
     # Compute the T-1 date (previous day on the trading calendar)
     yesterday = get_date(
         self.date_order, self.order_date, busi_date, -1)
     unclose = spark.sql("select * from %s.%s where busi_date='%s'"
                         % (self.fdata, self.unclose_table, yesterday))
     trd = spark.sql("select * from %s.%s where busi_date='%s'"
                     % (self.odata, self.cash_flow_table, busi_date))
     trd = trd.filter("trd_type='long_related' and prd_no!='0.0'")
     asset = spark.sql("select * from %s.%s where busi_date='%s'"
                       % (self.odata, self.asset_table, busi_date))
     asset = asset.select("trade_id", "secu_acc_id", "prd_no", "qty", "mkt_val")
     asset = asset.filter("prd_no!='0.0'")
     trd = trd.withColumn(
         "now_trd", func.struct("trd_qty", "trd_cash_flow", "timestamp"))
     trd = trd.groupBy("trade_id", "secu_acc_id", "prd_no") \
         .agg(func.collect_list("now_trd").alias("now_trd"))
     trd.persist(StorageLevel.DISK_ONLY)
     trd.count()
     df = full_outer_join(unclose, trd, ["trade_id", "secu_acc_id", "prd_no"])
     df = full_outer_join(df, asset, ["trade_id", "secu_acc_id", "prd_no"])
     df.persist(StorageLevel.DISK_ONLY)
     df.count()
     data = df.rdd.map(lambda x: Row(**long_compute(x.asDict(recursive=True), busi_date)))
     if data.count() > 0:
         data = data.toDF()
         close = data.filter("remain_qty = 0")
         close = close.withColumn("close_date", close.busi_date)
         self.check_data(close)
         unclose = data.filter("remain_qty != 0")
         self.save_close_data(close, busi_date)
         self.save_unclose_data(unclose, busi_date)
     else:
         print "清仓股票收益做多没有输出"
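full_outer_join is an external helper that the snippet does not show; a plausible sketch of what it does (an assumption based on how it is called above):

def full_outer_join(left_df, right_df, key_cols):
    # Assumed behavior: full outer join on the key columns so that yesterday's
    # unclosed positions, today's trades and today's asset rows are all kept.
    return left_df.join(right_df, key_cols, "full_outer")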
Example #8
def get_client_similar_cigar(spark):

    # Register a UDF and select the last three months of data for analysis
    strlen = spark.udf.register("strlen", lambda x: len(x))
    begin_time = (date.today() - timedelta(days=90)).strftime("%Y%m%d")

    lines = spark.sql(
        f"SELECT co_num, item_id FROM DB2_DB2INST1_CO_CO_LINE WHERE \
            born_date > {begin_time} AND qty_rsn > 0 AND qty_ord > 0"
    )
    # Build the training set
    # Orders containing fewer than 2 item specs are excluded from training
    items_df = lines.groupBy("co_num").agg(f.collect_list("item_id").alias("sentence")).select("sentence")
    items_df = items_df.filter(strlen("sentence") >= 2)

    # Word2Vec hyperparameters may need tuning
    word2Vec = Word2Vec(vectorSize=30, seed=88, minCount=1, inputCol="sentence", outputCol="model")
    model = word2Vec.fit(items_df)

    # Build the test set
    raw_data = lines.select(col("item_id").alias("items")).distinct()
    raw_data = raw_data.dropna()
    raw_data = list(map(lambda x: x['items'], raw_data.collect()))

    predict_df = pd.DataFrame(raw_data, columns=["origin"])
    predict_df["near_list"] = predict_df["origin"].apply(lambda x: combineWithSyn(x, model))

    out = []
    for v in predict_df.values:
        df = pd.DataFrame(v[1], columns=["other", "distance"])
        df["origin"] = v[0]
        out.append(df.copy())
    df = pd.concat(out)
    # Build the corresponding Spark DataFrame: cigarette, similar cigarette, similarity
    df = spark.createDataFrame(df[["origin", "other", "distance"]], ["origin", "other", "distance"])

    return df
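combineWithSyn is not defined in the snippet; a hedged sketch of what it might look like, assuming it wraps Word2VecModel.findSynonyms:

def combineWithSyn(item, model, top_n=10):
    # Assumed helper: return (similar_item, similarity) pairs for one item id,
    # or an empty list if the item never appeared in the training sentences.
    try:
        rows = model.findSynonyms(item, top_n).collect()
    except Exception:
        return []
    return [(row.word, float(row.similarity)) for row in rows]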
Example #9
def get_variants_df_with_case_duration(df, parameters=None):
    """Gets variants dataframe from the Spark dataframe, with case duration that is included
    """
    if parameters is None:
        parameters = {}

    case_id_glue = parameters[
        PARAMETER_CONSTANT_CASEID_KEY] if PARAMETER_CONSTANT_CASEID_KEY in parameters else CASE_CONCEPT_NAME
    activity_key = parameters[
        PARAMETER_CONSTANT_ACTIVITY_KEY] if PARAMETER_CONSTANT_ACTIVITY_KEY in parameters else DEFAULT_NAME_KEY
    timestamp_key = parameters[
        PARAMETER_CONSTANT_TIMESTAMP_KEY] if PARAMETER_CONSTANT_TIMESTAMP_KEY in parameters else DEFAULT_TIMESTAMP_KEY

    ordered_df = df.orderBy(timestamp_key).select(case_id_glue, timestamp_key,
                                                  activity_key)
    grouped_df = ordered_df.groupby(case_id_glue)
    df1 = grouped_df.agg(F.collect_list(activity_key).alias("variant"))
    df1 = df1.withColumn("variant",
                         F.concat_ws(",", "variant")).orderBy(case_id_glue)

    start_df = grouped_df.agg(F.min(timestamp_key).alias(timestamp_key))
    first_eve_df = ordered_df.join(F.broadcast(start_df), start_df.columns)
    end_df = grouped_df.agg(F.max(timestamp_key).alias(timestamp_key))
    last_eve_df = ordered_df.join(F.broadcast(end_df), end_df.columns)
    last_eve_df = last_eve_df.withColumnRenamed(timestamp_key,
                                                timestamp_key + "_2")
    last_eve_df = last_eve_df.withColumnRenamed(activity_key,
                                                activity_key + "_2")

    stacked_df = first_eve_df.join(last_eve_df,
                                   case_id_glue).orderBy(case_id_glue)
    stacked_df = stacked_df.withColumn(
        "caseDuration",
        F.unix_timestamp(stacked_df[timestamp_key + "_2"]) -
        F.unix_timestamp(stacked_df[timestamp_key]))
    new_df = df1.join(stacked_df, case_id_glue)
    return new_df
Example #10
def shuffle_generator_method_2(df):
    df = df.select(
        ['source_id', 'cited_source_uid', 'reference_issn', 'reference_year'])

    df = df.sort("reference_year")

    df_g = df.groupby('reference_year').agg(collect_list(df.cited_source_uid))
    df_g = df_g.withColumnRenamed('collect_list(cited_source_uid)', 'csi')
    # df_g.show()

    df_g = df_g.select(shuffle(df_g.csi).alias('s_csi'))
    # df_g.show()

    df_g = df_g.select(explode(df_g.s_csi).alias('s_csi'))
    # df_g.show()
    # df.show()

    df = df.withColumn('id', monotonically_increasing_id())
    df_g = df_g.withColumn('id', monotonically_increasing_id())

    df = df.join(df_g, df.id == df_g.id,
                 'inner').select(df.source_id, df.cited_source_uid,
                                 df.reference_issn, df_g.s_csi,
                                 df.reference_year)
    # df.show()
    # print("lenght of dataframe before filter ",df.count())

    # Filter out rows where the shuffled value duplicates the original
    # df=df.filter("cited_source_uid != s_csi")
    filter_df = df.filter("cited_source_uid == s_csi").select(
        'source_id').distinct()
    df = df.join(filter_df, ['source_id'],
                 'leftanti').select(df.source_id, df.reference_issn,
                                    df.cited_source_uid, df.s_csi,
                                    df.reference_year)
    df = df.withColumnRenamed('s_csi', 's_cited_source_uid')
    return df
Example #11
def get_top_10_coinstalls(addons_expanded_day):
    def str_map_to_dict(m):
        result = {}
        for i in m:
            k, v = i.split("=")
            result[k] = v
        return result

    def format_row(row):
        return Row(
            addon_id=row.addon_id,
            top_10_coinstalls=str_map_to_dict(row.top_10_coinstalls),
        )

    w = Window().partitionBy("addon_id").orderBy(F.col("count").desc())
    d = (
        addons_expanded_day.join(
            addons_expanded_day.filter("is_system=false").withColumnRenamed(
                "addon_id", "coaddon"
            ),
            on="client_id",
        )
        .groupby("client_id", "addon_id", "coaddon")
        .count()
        .withColumn("rn", (F.row_number().over(w) - F.lit(1)))  # start at 0
        .filter("rn BETWEEN 1 and 10")  # ignore 0th addon (where coaddon==addon_id)
        .groupby("addon_id")
        .agg(
            F.collect_list(F.concat(F.col("rn"), F.lit("="), "coaddon")).alias(
                "top_10_coinstalls"
            )
        )
        .rdd.map(format_row)
        .toDF()
    )

    return d
Example #12
def _get_top_k_items(
    dataframe,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_rating=DEFAULT_RATING_COL,
    col_prediction=DEFAULT_PREDICTION_COL,
    k=DEFAULT_K,
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.

    .. note::
        if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (pyspark.sql.DataFrame): DataFrame of rating data (in the format of
        customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        k (int): number of items for each user.

    Return:
        pyspark.sql.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # this does not work for rating of the same value.
    items_for_user = (dataframe.select(
        col_user, col_item, col_rating,
        row_number().over(window_spec).alias("rank")).where(
            col("rank") <= k).groupby(col_user).agg(
                F.collect_list(col_item).alias(col_prediction)))

    return items_for_user
Example #13
    def group_batched_logs(df_logs):
        # group logs from did + interval_time + keyword.
        # group 1: group by did + interval_starting_time + keyword
        df = df_logs.groupBy(
            'aid', 'interval_starting_time', 'keyword_index').agg(
                first('keyword').alias('keyword'),
                first('age').alias('age'),
                first('gender_index').alias('gender_index'),
                first('aid_bucket').alias('aid_bucket'),
                fn.sum(col('is_click')).alias('kw_clicks_count'),
                fn.sum(fn.when(col('is_click') == 0,
                               1).otherwise(0)).alias('kw_shows_count'),
            )
        # df = df.orderBy('keyword_index')
        df = df.withColumn(
            'kwi_clicks_count',
            concat_ws(":", col('keyword_index'), col('kw_clicks_count')))
        df = df.withColumn(
            'kwi_shows_count',
            concat_ws(":", col('keyword_index'), col('kw_shows_count')))
        df = df.withColumn(
            'kw_clicks_count',
            concat_ws(":", col('keyword'), col('kw_clicks_count')))
        df = df.withColumn(
            'kw_shows_count',
            concat_ws(":", col('keyword'), col('kw_shows_count')))
        # group 2: group by did + interval_starting_time
        df = df.groupBy('aid', 'interval_starting_time').agg(
            concat_ws(",", collect_list('keyword_index')).alias('kwi'),
            concat_ws(
                ",",
                collect_list('kwi_clicks_count')).alias('kwi_click_counts'),
            concat_ws(
                ",", collect_list('kwi_shows_count')).alias('kwi_show_counts'),
            concat_ws(",", collect_list('keyword')).alias('interval_keywords'),
            concat_ws(
                ",", collect_list('kw_clicks_count')).alias('kw_click_counts'),
            concat_ws(",",
                      collect_list('kw_shows_count')).alias('kw_show_counts'),
            first('age').alias('age'),
            first('gender_index').alias('gender_index'),
            first('aid_bucket').alias('aid_bucket'))

        return df
Example #14
def joinDF(rev_df, prod_df, cat):
    postgres_url = os.getenv('POSTGRES_URL', 'default')
    postgres_user = os.getenv('POSTGRES_USER', 'default')
    postgres_pw = os.getenv('POSTGRES_PW', 'default')
    postgres_properties = {"user": postgres_user, "password": postgres_pw}

    table_name = cat + "joined"
    prod_df = prod_df.filter(functions.array_contains("categories", cat))
    joined_df = rev_df.join(prod_df, rev_df.asin == prod_df.productid)
    joined_df = joined_df.groupby("reviewerid").agg(
        functions.collect_list("categories").alias("categories"))
    flatAll_udf = functions.udf(flatAll, ArrayType(StringType()))
    joined_df = joined_df.withColumn("categories",
                                     flatAll_udf(joined_df.categories))
    joined_df = joined_df.rdd.flatMap(
        lambda row: [(row[0], c) for c in row[1]]).toDF(
            ["reviewerid", "category"])
    joined_df = joined_df.groupby("reviewerid").pivot("category").count()
    joined_df = joined_df.na.fill(0)
    joined_df.write.jdbc(url=postgres_url,
                         table=table_name,
                         mode='overwrite',
                         properties=postgres_properties)
    return
Example #15
 def create_buckets(sc, dataframe, buckets=20, prediction_col='prediction'):
     n_buckets = sc.broadcast(buckets)
     generate_list_udf = F.udf(
         f=lambda l, minimum, maximum, boundary: ShowResults.create_linspace(
             data=l, min=minimum, max=maximum,
             boundary=boundary, buckets=n_buckets.value
         ),
         returnType=T.ArrayType(
             elementType=T.ArrayType(
                 elementType=T.IntegerType(),
                 containsNull=True),
             containsNull=True
         )
     )
     tmp = (dataframe
            .groupBy(prediction_col, F.col('computed_boundary'))
            .agg(F.min('distance').alias('min'), F.max('distance').alias('max'),
                 F.sum('is_outlier').alias('n_outliers'),
                 F.collect_list('distance').alias('distances'))
            .withColumn(colName='buckets', col=generate_list_udf(
                         'distances', 'min', 'max',
                         'computed_boundary'))
            )
     return tmp.select(prediction_col, 'buckets')
Example #16
def main(spark, test_file, index_file, model_file):
    # load test data and create dataframe
    test_df = spark.read.parquet(test_file)
    model_indexer = PipelineModel.load(index_file)
    # transform user and track ids for test data
    test_df = model_indexer.transform(test_df)
    # store ground truth for user
    user_truth = test_df.groupby('user_label').agg(
        F.collect_list('book_label').alias('truth'))
    print('created ground truth df')
    als_model = ALSModel.load(model_file)

    # predict based on the top 500 item of each user
    recommend = als_model.recommendForAllUsers(500)
    print('recommendation has been created.')
    # RMSE
    predict = als_model.transform(test_df)
    evaluator = RegressionEvaluator(metricName='rmse',
                                    labelCol='rating',
                                    predictionCol='prediction')
    rmse = evaluator.evaluate(predict)
    print('Root mean square error is ' + str(rmse))

    # prediction = spark.sql('SELECT * FROM recommend INNER JOIN user_truth WHERE recommend.user_label=user_truth.user_label')
    # after running panda udf is faster than using sparksql
    prediction = recommend.join(user_truth,
                                recommend.user_label == user_truth.user_label,
                                'inner')

    score = prediction.select('recommendations.book_label',
                              'truth').rdd.map(tuple)
    rank_metric = RankingMetrics(score)
    precision = rank_metric.precisionAt(500)
    mean_precision = rank_metric.meanAveragePrecision
    print(' precision at 500 ' + str(precision) +
          'mean average precision of ' + str(mean_precision))
Example #17
def task_a_1_step_1_final(spark):
    a1_struct = T.StructType([
        T.StructField("datetime_start", T.TimestampType()),
        T.StructField("datetime_end", T.TimestampType()),
        T.StructField("map_item", T.MapType(T.StringType(), T.IntegerType())),
    ])

    result = kafka_source(spark, config.BOOTSTRAP_SERVERS, "topics-by-country_step-0").parse_json(a1_struct) \
        .withWatermark("datetime_end", "1 minute").groupBy(
        F.window("datetime_end", "6 hour", "1 hour")
    ).agg(
        F.first("window.start").alias("timestamp_start"),
        F.first("window.end").alias("timestamp_end"),
        F.collect_list("map_item").alias("statistics")
    ).select(
        F.struct(
            F.concat(F.hour('timestamp_start'), lit(":"), F.minute('timestamp_start')).alias("time_start"),
            F.concat(F.hour('timestamp_end'), lit(":"), F.minute('timestamp_end')).alias("time_end"),
            col('timestamp_end').alias("time_end"),
            sum_maps_udf(col('statistics')).alias("statistics")
        ).alias("res")
    ).send_to_kafka(config.BOOTSTRAP_SERVERS, "topics-by-country", config.LOG_PREFIX)

    return result
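sum_maps_udf is not defined in the snippet; a sketch of a UDF with the assumed behavior of merging the collected maps by summing counts per key:

from collections import Counter

from pyspark.sql import functions as F
from pyspark.sql import types as T


@F.udf(T.MapType(T.StringType(), T.IntegerType()))
def sum_maps_udf(maps):
    # Assumed behavior: fold the list of {item: count} maps collected over the
    # 6-hour window into a single map by summing the counts per key.
    total = Counter()
    for m in maps:
        total.update(m)
    return dict(total)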
Example #18
def remove_duplicated_paths(paths, trips):
    """ Given a paths dataframe, it will remove the duplicated paths in `paths` and update the `trips`
        with the non-duplicated paths
    """
    old_paths_count = paths.count()
    old_trips_count = trips.count()

    # Group the paths that are the same
    similar_paths = paths.groupBy("path_latitudes", "path_longitudes").agg(
        F.min("path_id").alias("new_path_id"),
        F.collect_list("path_id").alias("old_path_ids"),
    )

    # Make a map of the old path IDs to the new path IDs
    # NOTE: the exploded column name is called 'col'
    path_id_mappings = similar_paths.select("new_path_id", F.explode("old_path_ids"))

    # Update the trips' reference to path ids
    trips = trips.join(
        path_id_mappings, trips.path_id == path_id_mappings.col, how="leftouter"
    )
    trips = trips.withColumn("path_id", trips.new_path_id)

    # Grab the unique paths
    unique_paths = (
        similar_paths.select("new_path_id", "path_latitudes", "path_longitudes",)
        .withColumnRenamed("new_path_id", "path_id")
        .drop("new_path_id")
    )

    new_paths_count = unique_paths.count()
    new_trips_count = trips.count()

    assert new_paths_count <= old_paths_count and new_trips_count == old_trips_count

    return unique_paths, trips
Example #19
def get_top_k_items(
    dataframe, col_user="******", col_item="itemID", col_rating="rating", k=10
):
    """Get the input customer-item-rating tuple in the format of Spark
    DataFrame, output a Spark DataFrame in the dense format of top k items
    for each user.
    NOTE: if it is implicit rating, just append a column of constants to be ratings.

    Args:
        dataframe (spark.DataFrame): DataFrame of rating data (in the format of
        customerID-itemID-rating tuple).
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        k (int): number of items for each user.

    Return:
        spark.DataFrame: DataFrame of top k items for each user.
    """
    window_spec = Window.partitionBy(col_user).orderBy(col(col_rating).desc())

    # this does not work for rating of the same value.
    items_for_user = (
        dataframe.select(
            col_user,
            col_item,
            col_rating,
            row_number().over(window_spec).alias("rank")
        )
        .where(col("rank") <= k)
        .withColumn("prediction", F.collect_list(col_item).over(Window.partitionBy(col_user)))
        .select(col_user, "prediction")
        .dropDuplicates([col_user, "prediction"])
    )

    return items_for_user
Example #20
File: queries.py  Project: raalesir/ikea
    def top_5_selling_each_day_pyspark(self):
        """
        returns top 5  selling products for each day using pyspark syntax

        :return:
        """

        window = Window.partitionBy('date').orderBy(
            F.col('total_amount').desc())


        tmp = \
            self.df\
                .groupBy([F.to_date(F.col('timestamp')).alias('date'), 'ID']) \
                .agg(F.sum('quantity').alias('total_amount')) \
                .orderBy('date', ascending=False) \
                .withColumn('row_number', F.row_number().over(window)) \
                .filter(F.col('row_number') <= 5) \
                .withColumn('zipped', F.concat("ID", F.lit(':'), "total_amount")) \
                .groupBy('date').agg(F.collect_list('zipped').alias("product_id:number_sold")) \
                .orderBy('date') \
                .show(50, truncate=False)

        return tmp
Example #21
def get_n_freq_str(df_counts: DataFrame, top_n: int) -> DataFrame:
    """
    Extract the top-n frequent values from get_counts and collect them into a string
    representation, so that each column has one row mapping it to a string of its
    n most frequent values and their counts.
    """
    conc = udf(lambda lst: "{}->{}".format(str(lst[0]), str(lst[1])),
               StringType())  # take top n
    mapped_counts = df_counts.select(
        'col_name',
        conc(struct('frequent_value', 'count')).alias('frequent_value->count'))
    # mapped_counts = df_counts.select('col_name', create_map(['frequent_value', 'count']).alias('frequent_value->count'))  # to collect to map

    df_output = mapped_counts.groupBy('col_name').agg(
        collect_list(spark_col('frequent_value->count')).alias(
            'frequent_values->counts'))

    freq_col_name = 'frequent_values->counts_(top_{:d})'.format(top_n)

    conv = udf(lambda lst: [str(dct) for dct in lst[:top_n]],
               ArrayType(StringType()))  # take top n
    df_output = df_output.select(
        'col_name',
        conv('frequent_values->counts').alias(freq_col_name))

    return df_output
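The upstream df_counts comes from a get_counts helper that is not shown; a hedged sketch of a compatible implementation, assuming it stacks the n most frequent values of every column into (col_name, frequent_value, count) rows:

from functools import reduce

from pyspark.sql import functions as F


def get_counts(df, n=10):
    # Assumed shape of the upstream helper used by get_n_freq_str.
    parts = [
        df.groupBy(F.col(c).cast("string").alias("frequent_value"))
          .count()
          .orderBy(F.col("count").desc())
          .limit(n)
          .withColumn("col_name", F.lit(c))
          .select("col_name", "frequent_value", "count")
        for c in df.columns
    ]
    return reduce(lambda a, b: a.union(b), parts)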
Example #22
def create_normalization_spec_spark(df,
                                    column,
                                    num_samples: int,
                                    seed: Optional[int] = None):
    """Returns approximately num_samples random rows from column of df."""

    # assumes column has a type of map
    df = df.select(
        explode(col(column).alias("features")).alias("feature_name",
                                                     "feature_value"))

    # calculate fractions
    counts_df = df.groupBy("feature_name").count()
    frac = {}
    for row in counts_df.collect():
        assert num_samples <= row["count"]
        frac[row["feature_name"]] = num_samples / row["count"]

    # TODO(T64843081): change to reservoir sampling, currently it approximates
    # perform sampling and collect them
    df = df.sampleBy("feature_name", fractions=frac, seed=seed)
    df = df.groupBy("feature_name").agg(
        collect_list("feature_value").alias("feature_values"))
    return df
Example #23
def user_personalization(df_similarity, df_active):

    df_pop=user_item_popularity(df_active)

    df_tmp=user_item_similarities(df_pop,df_similarity)



    df_save=df_tmp.select("user_id","_id","score")\
    .withColumn("Recommendations", F.struct(F.col("_id"), F.col("score")))\
    .select("user_id","Recommendations")\
    .groupby("user_id").agg(F.collect_list("Recommendations").alias("Recommendations"))  
    
    
    start_time = time.time()


    df_save.repartition(1) \
        .write.format("json") \
        .mode("overwrite") \
        .option("header","true")\
        .save("userrr_2")
    
    print("Saving user personalization json took - ", (time.time() -  start_time) )
Example #24
def top20_channels(data):
    result = {"channels": []}
    channels = data.groupBy(["channel_title"]) \
        .agg(collect_list(array('video_id', 'trending_date', "views")).alias("trending_days"),
             countDistinct("video_id"),
             sum("views").alias("all_views")).orderBy("all_views", ascending=False).head(20)

    i = 0
    for row in channels:
        result["channels"].append({
            "channel_name": row["channel_title"],
            "start_date": row["trending_days"][0][1],
            "end_date": row["trending_days"][-1][1],
            "total_view": row["all_views"],
            "videos_views": []
        })
        for video in row["trending_days"]:
            result["channels"][i]["videos_views"].append({
                "video_id": video[0],
                "views": video[2]
            })
        i += 1

    return result
Example #25
    def process_df(self, df):
        def detect_anomaly(ts):
            """
            Args ts: pandas.series
            rtype: int
            """
            outliers_indices = seasonal_esd(
                ts, hybrid=True, max_anomalies=10)
            return len(outliers_indices)

        grouped_df = df.groupBy(["id"]).agg(F.collect_list("downsample_avg").alias(
            "downsampled_ts"), first("start_ts").alias("start_ts"), last("end_ts").alias("end_ts"))
        anomaly_udf = udf(detect_anomaly, IntegerType())
        processed_df = grouped_df.withColumn("num_anomaly", anomaly_udf(
            "downsampled_ts")).sort(desc("num_anomaly"))
        final_df = processed_df.select(
            "id", "start_ts", "end_ts", "num_anomaly")
        try:
            connector = pgConnector.PostgresConnector(
                "ec2-3-94-71-208.compute-1.amazonaws.com", "datanodedb", "datanode", "password")
            connector.write(final_df, "global_anomalies_table", "append")
        except Exception as e:
            print(e)
            pass
Example #26
def run(spark, config, **kwargs):
    df = read_tidb(spark, config, "select id, title, mp_id from content.content")

    df = df.withColumn('words', jieba_seg_udf(df['title']))
    # df.show(10)

    model = Word2Vec(numPartitions=10, inputCol='words', outputCol='vecs', seed=42).fit(df)
    df_transformed = model.transform(df)
    df_cross = df_transformed.select(
        col('id').alias('id1'),
        col('vecs').alias('vecs1')).crossJoin(df_transformed.select(
            col('id').alias('id2'),
            col('vecs').alias('vecs2'))
    )
    
    df_cross = df_cross.withColumn('sim', sim(df_cross['vecs1'], df_cross['vecs2']))
    df_simi = df_cross.filter(col('sim')<1)
    
    df_simi_top10 = get_topN(df_simi, col('id1'), col('sim'), n=10)
    
    df_simi_top10.show()
    df_simi_top10.groupby("id1").agg(collect_list("id2").alias("sim_ids")).show(10)

    return 
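jieba_seg_udf, sim and get_topN are helpers defined elsewhere in the project; hedged sketches of the latter two, assuming sim is a cosine-similarity UDF over the Word2Vec vectors and get_topN keeps the n most similar ids per id1:

import numpy as np
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType
from pyspark.sql.window import Window


@F.udf(DoubleType())
def sim(v1, v2):
    # Assumed cosine similarity between two document vectors.
    a, b = np.array(v1.toArray()), np.array(v2.toArray())
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    return float(a.dot(b) / denom) if denom else 0.0


def get_topN(df, key_col, sort_col, n=10):
    # Assumed behavior: keep the n rows with the highest sort_col per key_col.
    w = Window.partitionBy(key_col).orderBy(sort_col.desc())
    return (df.withColumn("rn", F.row_number().over(w))
              .filter(F.col("rn") <= n)
              .drop("rn"))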
Example #27
def main(spark, val_file, model_file):
    model = ALSModel.load(model_file)
    print('finish loading models')
    val_df = spark.read.parquet(val_file)
    val_df = val_df.select('user_label', 'track_label')
    val_grouped = val_df.groupBy('user_label').agg(F.collect_list(F.col('track_label')).alias('track_label'))
    print('Finish preparing test data')
    val_grouped.cache()

    predictions = model.recommendForAllUsers(500)
    print('finish making predictions')
    prediction_df = predictions.rdd.map(lambda r: (r.user_label, [i[0] for i in r.recommendations])).toDF()
    prediction_df = prediction_df.selectExpr("_1 as user_label", "_2 as recommendations")

    # Join table
    val_pred = val_grouped.join(prediction_df, "user_label", "inner")
    print('finish joining data')
    # Instantiate regression metrics to compare predicted and actual ratings
    rdd = val_pred.select('recommendations', 'track_label').rdd
    print('final steps')
    ranking_metrics = RankingMetrics(rdd)

    # MAP
    print("MAP = %s" % ranking_metrics.meanAveragePrecision)
Example #28
def _get_relevant_items_by_threshold(dataframe,
                                     col_user=DEFAULT_USER_COL,
                                     col_item=DEFAULT_ITEM_COL,
                                     col_rating=DEFAULT_RATING_COL,
                                     col_prediction=DEFAULT_PREDICTION_COL,
                                     threshold=DEFAULT_THRESHOLD):
    """Get relevant items for each customer in the input rating data.

    Relevant items are defined as those having ratings above certain threshold.
    The threshold is defined as a statistical measure of the ratings for a
    user, e.g., median.

    Args:
        dataframe: Spark DataFrame of customerID-itemID-rating tuples.
        col_user (str): column name for user.
        col_item (str): column name for item.
        col_rating (str): column name for rating.
        col_prediction (str): column name for prediction.
        threshold (float): threshold for determining the relevant recommended items.
            This is used for the case that predicted ratings follow a known
            distribution.

    Return:
        spark.DataFrame: DataFrame of customerID-itemID-rating tuples with only relevant
            items.
    """
    items_for_user = (dataframe.orderBy(
        col_rating,
        ascending=False).where(col_rating + " >= " + str(threshold)).select(
            col_user, col_item, col_rating).withColumn(
                col_prediction,
                F.collect_list(col_item).over(
                    Window.partitionBy(col_user))).select(
                        col_user, col_prediction).dropDuplicates())

    return items_for_user
Example #29
    def group_batched_logs(df_logs_batched):
        # group the logs to generate the train ready data from the basic unit of uckey + interval_time + keyword.
        # group 1: group by uckey + interval_starting_time + keyword
        df = df_logs_batched.groupBy('uckey', 'interval_starting_time', 'keyword_index').agg(
            first('keyword').alias('keyword'),
            fn.sum(col('is_click')).alias('keyword_click_count'),
            fn.count(fn.when(col('is_click') == 0, 1).otherwise(
                0)).alias('keyword_show_count')
        )
        df = df.withColumn('keyword_index_click_count', concat_ws(":", col('keyword_index'), col('keyword_click_count')))
        df = df.withColumn('keyword_index_show_count', concat_ws(":", col('keyword_index'), col('keyword_show_count')))
        df = df.withColumn('keyword_click_count', concat_ws(":", col('keyword'), col('keyword_click_count')))
        df = df.withColumn('keyword_show_count', concat_ws(":", col('keyword'), col('keyword_show_count')))

        # group 2: group by uckey + interval_starting_time
        df = df.groupBy('uckey', 'interval_starting_time').agg(
            concat_ws(",", collect_list('keyword_index')).alias('interval_keyword_indexes'),
            concat_ws(",", collect_list('keyword_index_click_count')).alias('interval_keyword_indexes_click_counts'),
            concat_ws(",", collect_list('keyword_index_show_count')).alias('interval_keyword_indexes_show_counts'),
            concat_ws(",", collect_list('keyword')).alias('interval_keywords'),
            concat_ws(",", collect_list('keyword_click_count')).alias('interval_keywords_click_counts'),
            concat_ws(",", collect_list('keyword_show_count')).alias('interval_keywords_show_counts')
        )
        return df
Example #30
    def run(self):
        """
        Run application
        :return:
        """
        join_df = self.join_df
        self.persist()

        # Create dataframes for intersection , cogo labs and liveworks only dataframes
        logger.info(
            " Creating common dataframe emd5 present both in live works and cogo labs"
        )
        """
        Cogo labs
        emd5    Name
        1       Sam
        2       Henry
        
        Liveworks
        emd5    Name
        2       John
        3       Smith
        
        Full Outer Join
        c_emd5    l_emd5   c_name  l_name
        1           Null     Sam     Null
        2           2       Henry   John
        Null        3        Null    Smith
        
        Intersection from cogo labs and Live works, where c_emd5 and l_emd5 is not null
        c_emd5    l_emd5   c_name  l_name
        2           2       Henry   John   
        
        Users only from cogo labs, where c_emd5 is not null and l_emd5 is null
        c_emd5    l_emd5   c_name  l_name
        1           Null     Sam     Null
        
        Users only from live works, where l_emd is not null and c_emd5 is null
        c_emd5    l_emd5   c_name  l_name
        Null        3        Null    Smith
        
        """

        common_df = join_df.filter(~join_df.emd5.isNull()
                                   & ~join_df.cogo_emd5.isNull())
        logger.info(
            " Creating cogo labs only dataframe where emd5 present in cogo labs and not present in live works"
        )
        cogo_labs_only_df = join_df.filter(~join_df.cogo_emd5.isNull()
                                           & join_df.emd5.isNull())
        logger.info(
            " Creating live works only dataframe where emd5 present in liveworks and not present in cogo labs"
        )
        live_works_only_df = join_df.filter(join_df.cogo_emd5.isNull()
                                            & ~join_df.emd5.isNull())

        # counting distinct emd5 counts
        intersection_count = common_df.select(
            common_df.cogo_emd5).distinct().count()
        cogo_labs_only_count = cogo_labs_only_df.select(
            cogo_labs_only_df.cogo_emd5).distinct().count()
        live_works_only_count = live_works_only_df.select(
            live_works_only_df.emd5).distinct().count()

        logger.info(
            "Number of Unique users present in both cogo labs and liveworks %s",
            intersection_count)
        logger.info("Number of Unique users present only in cogo labs data %s",
                    cogo_labs_only_count)
        logger.info(
            "Number of Unique users present only in live works data %s",
            live_works_only_count)

        # Create common job data frame with users having same job title
        logger.info(
            "Creating common job data frame where common users have same job title"
        )
        common_job_df = common_df.where(common_df.cogo_job == common_df.job)
        common_job_df.persist(StorageLevel.DISK_ONLY)
        print("Output with common emd5 users having same job title")
        common_job_df.show()
        common_job_count = common_job_df.count()

        # Calculate percentage common emd5 users have different job titles
        different_jobs_percent = (
            (intersection_count - common_job_count) / intersection_count) * 100
        logger.info(
            "Number of  users with common job present in both cogo labs and liveworks %s",
            common_job_df.count())
        logger.info("Percent have different job titles in intersection %s",
                    different_jobs_percent)

        # jsonsify data from common data frame
        """
        Create Key:Value pair
        Key = Job title , Value = Company Name
        
        cogolabs_emd5   cogolabs_job    cogolabs_company    liveworks_job   liveworks_company
        1              Hotel manager     Bender PLC          Barrister           Brown PLC
        1            Immigration officer Diaz Ltd                                           
        
        cogolabs_emd5   cogo_labs_c                         liveworks_c
        1               {"Hotel manager":"Bender PLC"}      {"Barrister":"Brown PLC"}
        1               {"Immigration officer":"Diaz Ltd"} 
        """

        common_json_df = common_df.withColumn("live_works_c", concat(lit("{\""), common_df.job, lit("\":\""),
                                                                     common_df.company, lit("\"}"))) \
            .withColumn("cogo_labs_c", concat(lit("{\""), common_df.cogo_job,
                                              lit("\":\""), common_df.cogo_company, lit("\"}")))
        """
        Concatenate results to form Array of key value pairs group by emd5
        emd5            cogolabs_json                                               liveworks_json
        1   [{"Hotel manager":"Bender PLC"},{"Immigration officer":"Diaz Ltd"}]     [{"Barrister":"Brown PLC"}]

        """

        common_agg_df = common_json_df.select(common_json_df.emd5, common_json_df.cogo_labs_c,
                                               common_json_df.live_works_c) \
            .groupBy(common_json_df.emd5).agg(concat_ws(",", collect_list(common_json_df.cogo_labs_c)),
                                                concat_ws(",", collect_list(common_json_df.live_works_c))) \
            .withColumnRenamed("concat_ws(,, collect_list(cogo_labs_c))", "cogo_labs_json") \
            .withColumnRenamed("concat_ws(,, collect_list(live_works_c))", "live_works_json")

        final_df = common_agg_df.withColumn("cogo_labs_json",
                                             concat(lit("["), common_agg_df.cogo_labs_json, lit("]"))) \
            .withColumn("live_works_json", concat(lit("["), common_agg_df.live_works_json, lit("]")))
        print("Final output with emd5 , cogolabs json and liveworks json")
        final_df.persist(StorageLevel.DISK_ONLY)
        final_df.show()

        # save final output as csv

        if os.path.exists("data/final_output/"):
            remove_directory("data/final_output/")
        logger.info("Save final output as csv")
        final_df.repartition(1).write.format("csv").save(
            os.path.join(DATA_PATH, "final_output"))

        self.unpersist()
Example #31
    HashTagsTable = extracted_SQL_table.select("created_at",
                                               explode("hashtags"))

    HashTagsTable_WithDates = HashTagsTable.withColumn(
        'Keyword', get_hash_tag('col')).withColumn('Time',
                                                   tweet_time('created_at'))

    # clean up table
    columns_to_drop = ['created_at', 'col']
    HashTagsTable_WithDates = HashTagsTable_WithDates.drop(*columns_to_drop)

    #HashTagsTable_WithDates.show()
    to_mongo = HashTagsTable_WithDates.groupBy('Keyword', 'Time').count()

    groupby_result = (to_mongo.groupBy("Keyword").agg(
        collect_list(struct("Time", "count")).alias('occurances')))

    # loading to REDIS
    redis_host = config_dict['redis_path']
    redis_port = config_dict['redis_port']
    redis_password = ""
    r = redis.StrictRedis(host=redis_host,
                          port=redis_port,
                          decode_responses=True)

    for row in groupby_result.rdd.collect():
        r.set(row.Keyword, json.dumps(row.occurances))
        # set TTL : expire in 60 seconds * 60 min * 24 hours = 86400 seconds
        r.expire(row.Keyword, 86400)
    spark.stop()
Example #32
    argparser.add_argument('outdir', help='Output directory')
    args = argparser.parse_args()

    spark = SparkSession.builder.appName('Cluster Features').getOrCreate()

    df = spark.read.load(args.indir)

    raw = df.filter(col('size') >= args.clusterSize) \
            .select('cluster', 'size', regexp_replace('text', u'\xad\s*', '').alias('text'))
    raw.cache()

    tok = RegexTokenizer(inputCol='text', outputCol='terms', gaps=False, pattern='\w+') \
          .transform(raw)
    counts = CountVectorizer(inputCol='terms', outputCol='counts', minDF=2.0) \
             .fit(tok).transform(tok)
    
    mergeCounts = udf(lambda va, size: threshold_sparse(scale_sparse(reduce(add_sparse, va), 1.0/size), args.minCount),
                      VectorUDT())

    res = counts.groupBy('cluster', 'size') \
                .agg(mergeCounts(collect_list('counts'), 'size').alias('counts'))

    # lda = LDA(k=2, featuresCol='counts', seed=1, optimizer='em')
    # model = lda.fit(res)

    # model.describeTopics().write.json(args.outdir)

    res.write.json(args.outdir)

    spark.stop()
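add_sparse, scale_sparse and threshold_sparse are helpers defined earlier in the script; hedged sketches of what they are assumed to do to the CountVectorizer vectors:

from pyspark.ml.linalg import SparseVector


def add_sparse(v1, v2):
    # Assumed: element-wise sum of two sparse term-count vectors of equal size.
    totals = dict(zip(v1.indices.tolist(), v1.values.tolist()))
    for i, x in zip(v2.indices.tolist(), v2.values.tolist()):
        totals[i] = totals.get(i, 0.0) + x
    return SparseVector(v1.size, totals)


def scale_sparse(v, factor):
    # Assumed: multiply every stored value by factor (1.0 / cluster size above).
    return SparseVector(v.size, v.indices.tolist(), [x * factor for x in v.values.tolist()])


def threshold_sparse(v, min_count):
    # Assumed: drop entries whose scaled count falls below min_count.
    kept = {int(i): x for i, x in zip(v.indices.tolist(), v.values.tolist()) if x >= min_count}
    return SparseVector(v.size, kept)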
Example #33
File: project.py  Project: sam46/Yelper
def prepareDatasets(sc, spark):
    buisHeader = ['business_id', 'name', 'neighborhood', 'address', 'city', 'state', 'postal_code',
        'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'categories']
    buis = sc.textFile(datapath+'yelp_business.csv', use_unicode=False)
    buis = buis.filter(lambda row: not row.startswith('business_id,name'))\
        .map(lambda row: re.findall(r'(?:[^,"]|"(?:\\.|[^"])*")+', row.replace(',,', ', ,')))\
        .map(lambda row: map(lambda x: x.replace('"', ''), row))\
        .map(lambda row: dict(zip(buisHeader, row)))\
        .filter(lambda row: row['business_id'] and row['longitude'] and row['latitude'])\
        .filter(lambda row: row['business_id'].strip() and row['longitude'].strip() and row['latitude'].strip())\
        .toDF()
    buis = buis.select('business_id', 'name', 'city', 'state', 'postal_code', 'categories',
                        buis['latitude'].cast('float'), buis['longitude'].cast('float'),
                        buis['stars'].cast('float'), buis['review_count'].cast('int'),
                        buis['is_open'].cast('int'))\
        .dropna(how='any', subset=['business_id','longitude', 'latitude'])

    def reviews_mapper(index, lines):
        import csv
        reader = csv.reader(lines)
        if index==0: lines.next()
        for row in reader:
            if len(row) == 9 and len(row[1])==22:
                yield row
    reviewsHeader = ["review_id","user_id","business_id","stars","date","text","useful","funny","cool"]
    reviews = sc.textFile(datapath+'yelp_review.csv', use_unicode=False)\
        .mapPartitionsWithIndex(reviews_mapper)\
        .map(lambda x: dict(zip(reviewsHeader, x)))\
        .toDF()
    reviews = reviews.select(
        "review_id", "user_id", "business_id", "text",
        reviews["stars"].cast('float'), reviews["date"].cast('date'),
        reviews["useful"].cast('int'), reviews["funny"].cast('int'),
        reviews["cool"].cast('int'))\
        .filter(reviews.text.isNotNull())\
        .filter(reviews.business_id.isNotNull())
    reviews = reviews.alias('a').join(buis.alias('b'),
        sf.col('b.business_id') == sf.col('a.business_id'))\
        .select('b.*','a.text') #,'a.user_id')
    reviews = reviews.where(
        'longitude > {:f} and longitude < {:f} and latitude > {:f} and latitude < {:f}'\
        .format(westAMER, eastAMER, southAMER, northAMER)
    ).cache()

    id_text = reviews.select('business_id', 'text')\
        .groupBy('business_id').agg(sf.concat_ws(' ', sf.collect_list("text")).alias('text_concat'))
    reviews = reviews.drop(reviews.text)\
        .select('business_id','categories','state', 'stars')\
        .alias('a').join(id_text.alias('b'),
        sf.col('b.business_id') == sf.col('a.business_id'))\
        .select('a.*','b.text_concat')\
        .distinct()\
        .withColumnRenamed('text_concat', 'text')

    # some data cleansing:
    reviews = reviews.withColumn('text', sf.regexp_replace(reviews.text, '\\/', '/'))
    def cleanse(text):
        re_punc = re.compile('[' + re.escape(punctuation) + '0-9\\n\\t\\r]')
        re_spc = re.compile('[ ]+') # get rid of extra spaces
        return re_spc.sub(' ', re_punc.sub(" ", text))
    cleanser = sf.udf(lambda x: cleanse(x))
    reviews = reviews.withColumn('text', cleanser('text'))
    # tokinizing and removing stop words:
    import pyspark.ml.feature as sparkml
    from pyspark.ml import Pipeline
    tokenizer = sparkml.Tokenizer(inputCol="text", outputCol="words")
    swremover = sparkml.StopWordsRemover(inputCol='words', outputCol='words_clean')
    pipeline = Pipeline(stages=[tokenizer, swremover])
    reviews = pipeline.fit(reviews).transform(reviews)
    reviews = reviews.drop('text', 'words')
    return reviews.cache()
Example #34
sc   = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# Import Data
data   = sc.textFile("hdfs://sandbox.hortonworks.com:/tmp/household_power_consumption_uci_edu.txt")
header = data.first().split(';')
data   = data.map(lambda x: x.split(';')).filter(lambda x: x!=header)

df = data.toDF(header)

# Use schema
df1 = df.withColumn('Sub_metering_1',df.Sub_metering_1.cast('float')) \
    .withColumn('Sub_metering_2',df.Sub_metering_2.cast('float')) \
    .withColumn('Sub_metering_3',df.Sub_metering_3.cast('float')) \
    .withColumn('Voltage',df.Voltage.cast('float'))

df1.show(5)

df2 = df1.groupby("Date").agg(F.collect_list("Sub_metering_1").alias('meter1'),F.collect_list("Sub_metering_2").alias('meter2'),F.collect_list("Sub_metering_3").alias('meter3'))

df2.show(5)

def get_udf_distance(array1, array2):
    distance, path = fastdtw(array1, array2, dist=euclidean)
    return distance

udf_dtw = udf(get_udf_distance , FloatType())
df3 = df2.select('Date', udf_dtw(df2.meter1, df2.meter2).alias('dtw_distance (meter1-meter2)'))

df3.show()
Example #35
attributes_path = '../input/attributes.csv'

spark = SparkSession.builder.getOrCreate()

df = spark.read.csv(data_path, header=True)

#descr_df = spark.read.csv(description_path,
#	header=True)

attr_df = spark.read.csv(attributes_path, header=True)

#df = df.join(descr_df, ['product_uid'])
#attr_df = attr_df.groupBy('product_uid').agg(f.concat_ws(" ", f.collect_list(attr_df.name))).show()

attr_df = attr_df.select('product_uid', 'value').groupBy('product_uid').agg(
    concat_ws(' ', collect_list('value')).alias('attribute_names'))

df = df.join(attr_df, ['product_uid'])

df.show()

df = df.withColumn('new_relevance', df['relevance'].cast(FloatType()))

replaceDigitsUdf = udf(replaceDigits, StringType())

createClassLabelUdf = udf(createClassLabel, IntegerType())

df = df.withColumn('label', createClassLabelUdf('new_relevance'))

df = df.withColumn('nodigits_title', replaceDigitsUdf('product_title'))
df = df.withColumn('nodigits_sterm', replaceDigitsUdf('search_term'))
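replaceDigits and createClassLabel are helper functions defined elsewhere in the script; hedged sketches of their assumed behavior:

import re


def replaceDigits(text):
    # Assumed behavior of the UDF above: strip digits from product titles and
    # search terms so numeric tokens do not dominate the text features.
    return re.sub(r"\d+", " ", text) if text is not None else None


def createClassLabel(relevance):
    # Assumed behavior: bucket the continuous relevance score into a binary
    # class label; the 2.5 cut-off is an arbitrary illustrative choice.
    return 1 if relevance is not None and relevance >= 2.5 else 0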
Example #36
def run_spark_job(sc, run_N_times = 1):

    ###############################################################################################################
    ## utility functions:
    ###############################################################################################################

    def print_df(df, only_display=100):
        outout = df.collect()
        for idx, o in enumerate(outout):
            if idx > only_display:
                print(' And more ...')
                break
            print(o)

    # print spark configurations
    def printSparkConfigurations():
        c = SparkConf()
        print("Spark configurations: {}".format(c.getAll()))

    # Use udf to define a row-at-a-time udf
    def myFunc(data_list):
        s = set()
        for d in data_list:
            s.add(d)
        count = len(s)
        output_str = reduce(lambda x,y: x+', '+y, s)
        return "{} {}".format(count, output_str)

    def read_csv_in_chunk(input_file):
        def optimize_chunk(chunk, column_names):
            for n in column_names:
                chunk[n] = chunk[n].astype('category')
            return chunk

        df = pd.DataFrame()
        chunk_size = 50000
        for idx, chunk in enumerate(pd.read_csv(input_file, sep=",", delimiter=",", chunksize=chunk_size)):   # assuming the file contains a header
            print("Reading the csv input in chunk index: {} on chunk_size: {} ".format(idx, chunk_size))
            # optimize the rows by changing it to category type
            chunk = optimize_chunk(chunk, ['trnk_wire', 'trnk_light', 'trnk_other', 'state', 'root_grate', 'root_other', 'root_stone', 'sidewalk', 'health', 'curb_loc', 'brnch_ligh', 'brnch_othe', 'brnch_shoe', 'guards',
                                           'user_type', 'status', 'steward', 'boroname', 'zip_city'])
            df = pd.concat([df, chunk], ignore_index=True)
        return df

    printSparkConfigurations()

    input_file = './2015StreetTreesCensus_TREES.csv'
    # input_file = './2015StreetTreesCensus_TREES_small.csv'
    pandas_df_raw = read_csv_in_chunk(input_file)

    for i in range(run_N_times):
        pandas_df = pandas_df_raw.replace(np.nan, '', regex=True)   # there are some empty columns (internally interpreted as nan inside dataframe) in the file that caused exceptions so need to map it to an empty string
        pandas_df = pandas_df[['created_at', 'block_id', 'spc_common']]

        pandas_df = sc.createDataFrame(pandas_df)

        pandas_df.repartition(5000, 'created_at')     # optimization trick !

        myUdf = udf(myFunc, StringType())

        created_at_block_id_summaries = pandas_df.groupby(['created_at', 'block_id']).agg(collect_list('spc_common').alias('spc_common')).withColumn('spc_common', myUdf('spc_common'))

        print_df(created_at_block_id_summaries)

        block_id_created_at_summaries = pandas_df.groupby(['block_id', 'created_at']).agg(collect_list('spc_common').alias('spc_common')).withColumn('spc_common', myUdf('spc_common'))

        print_df(block_id_created_at_summaries)
    ("Kumar", "Marketing", 2000),
    ("Saif", "Sales", 4100)
  ]
schema = ["employee_name", "department", "salary"]
  
  
df = spark.createDataFrame(data=simpleData, schema = schema)
df.printSchema()
df.show(truncate=False)

print("approx_count_distinct: " + \
      str(df.select(approx_count_distinct("salary")).collect()[0][0]))

print("avg: " + str(df.select(avg("salary")).collect()[0][0]))

df.select(collect_list("salary")).show(truncate=False)

df.select(collect_set("salary")).show(truncate=False)

df2 = df.select(countDistinct("department", "salary"))
df2.show(truncate=False)
print("Distinct Count of Department &amp; Salary: "+str(df2.collect()[0][0]))

print("count: "+str(df.select(count("salary")).collect()[0]))
df.select(first("salary")).show(truncate=False)
df.select(last("salary")).show(truncate=False)
df.select(kurtosis("salary")).show(truncate=False)
df.select(max("salary")).show(truncate=False)
df.select(min("salary")).show(truncate=False)
df.select(mean("salary")).show(truncate=False)
df.select(skewness("salary")).show(truncate=False)
Example #38
udf_cos_sim = udf(lambda cell: float(ref_vec.dot(cell) / (ref_vec.norm(2) * cell.norm(2))), FloatType())
df_transformed \
        .withColumn('cos_similarity', udf_cos_sim(df_transformed.word2vec_large)) \
        .sort(col('cos_similarity'), ascending=False) \
        .select('resource_id', 'cos_similarity') \
        .show(10)

# create a list of terms connected to their stems
df_wordcount = spark \
        .createDataFrame(df_transformed.select(explode(df_transformed.tokens_stop).alias('term')) \
        .groupBy('term') \
        .agg({"*": "count"}) \
        .collect())
df_stems = df_wordcount \
        .withColumn('stem', udf(lambda term: opinion_stemm.stem(term), StringType())(col('term'))) \
        .groupBy('stem') \
        .agg(collect_list('term').alias('terms'))
df_stems \
        .select('terms') \
        .filter(df_stems.stem == opinion_stemm.stem('artful')) \
        .first()[0]

# create a count for each opinion of the number of times it has been cited by other Washington opinions
df_citecount = spark.createDataFrame(
        df_transformed.select(explode(df_transformed.opinions_cited).alias('cites')) \
                .groupBy('cites') \
                .count() \
                .collect())
df_citecount.orderBy('count', ascending=False).show()

Example #39
less_than_week_funds = fund_count.filter(fund_count['fund_count'] <= 5)

less_than_week_funds = less_than_week_funds.select(
    f.collect_set('fund_id')).collect()[0][0]
less_than_week_funds = [
    str(less_than_week_funds) for less_than_week_funds in less_than_week_funds
]

#df_uniqTicker30 = df_uniqTicker.where(df_uniqTicker['ticker'].isin(df_Ticker30['ticker']))

df_less = df_uniqfund.where(df_uniqfund.fund_id.isin(less_than_week_funds))

#df_less = df_uniqTicker30

df_less_quantiles = df_less.groupby('client_name', 'fund_id').agg(
    f.collect_list("event_time_numeric").alias("lis_2_event_time_numeric"))
df_less_quantiles = df_less_quantiles.withColumn(
    'Quantile01(fund_MinimumTime)',
    udf_percent_rank('lis_2_event_time_numeric', f.lit(1)))
df_less_quantiles = df_less_quantiles.withColumn(
    'Predicted_Time', udf_percent_rank('lis_2_event_time_numeric', f.lit(50)))

#df_less_quantiles = df_less_quantiles.withColumn('Quantile01(Ticker_MinimumTime)', df_less_quantiles['Quantile01(Ticker_MinimumTime)'].cast(IntegerType()))
#df_less_quantiles = df_less_quantiles.withColumn('Predicted_Time',df_less_quantiles['Predicted_Time'].cast(IntegerType()))

df_less_quantiles = df_less_quantiles.withColumn(
    'Quantile01(fund_MinimumTime)',
    udf_string_time('Quantile01(fund_MinimumTime)'))
df_less_quantiles = df_less_quantiles.withColumn(
    'Predicted_Time', udf_string_time('Predicted_Time'))
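udf_percent_rank and udf_string_time are UDFs defined earlier in the script; hedged sketches of what they are assumed to compute from the collected event-time values:

import numpy as np
from pyspark.sql import functions as f
from pyspark.sql.types import DoubleType, StringType


@f.udf(DoubleType())
def udf_percent_rank(times, pct):
    # Assumed behavior: pct-th percentile of the collected numeric event times
    # for one fund (pct is passed above as f.lit(1) and f.lit(50)).
    return float(np.percentile(times, pct))


@f.udf(StringType())
def udf_string_time(seconds):
    # Assumed behavior: render a seconds-since-midnight value as HH:MM:SS.
    s = int(seconds)
    return "{:02d}:{:02d}:{:02d}".format(s // 3600, (s % 3600) // 60, s % 60)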
Example #40
from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
    covar_pop("InvoiceNo", "Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show()


# COMMAND ----------

from pyspark.sql.functions import count, expr

df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"),
    expr("count(Quantity)")).show()


# COMMAND ----------

df.groupBy("InvoiceNo").agg(expr("avg(Quantity)"),expr("stddev_pop(Quantity)"))\
  .show()