Example #1
    def __init__(self, kdf_or_kser, window, min_periods=None):
        from databricks.koalas import DataFrame, Series
        from databricks.koalas.groupby import SeriesGroupBy, DataFrameGroupBy

        if window < 0:
            raise ValueError("window must be >= 0")
        if (min_periods is not None) and (min_periods < 0):
            raise ValueError("min_periods must be >= 0")
        self._window_val = window
        if min_periods is not None:
            self._min_periods = min_periods
        else:
            # TODO: 'min_periods' is not equivalent in pandas because it does not count NA as
            #  a value.
            self._min_periods = window

        self.kdf_or_kser = kdf_or_kser
        if not isinstance(kdf_or_kser, (DataFrame, Series, DataFrameGroupBy, SeriesGroupBy)):
            raise TypeError(
                "kdf_or_kser must be a series or dataframe; however, got: %s" % type(kdf_or_kser))
        if isinstance(kdf_or_kser, (DataFrame, Series)):
            self._index_scols = kdf_or_kser._internal.index_scols
            self._window = Window.orderBy(self._index_scols).rowsBetween(
                Window.currentRow - (self._window_val - 1), Window.currentRow)

            self._unbounded_window = Window.orderBy(self._index_scols)
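The snippet above builds the rolling frame as rowsBetween(Window.currentRow - (window - 1), Window.currentRow). A minimal plain-PySpark sketch of the same bounded frame, assuming a local SparkSession (the column names here are illustrative):

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, 10.0), (2, 20.0), (3, 30.0), (4, 40.0)], ["idx", "val"])

# frame covering the current row and the 2 preceding rows, i.e. a window of size 3:
# rowsBetween(Window.currentRow - (3 - 1), Window.currentRow)
w = Window.orderBy("idx").rowsBetween(Window.currentRow - 2, Window.currentRow)
df.withColumn("rolling_mean_3", F.avg("val").over(w)).show()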
Example #2
def main(spark, model_file, test_file):

    model = MatrixFactorizationModel.load(sc, model_file)
    #model = MatrixFactorizationModel.load(sc, 'hdfs:/user/xx852/als_model')
    test_df = spark.read.parquet(test_file)
    #test_df = spark.read.parquet('hdfs:/user/xx852/cf_test_small.parquet')
    test_df = test_df.select('user_label', 'track_label', 'count')

    #predictions = model.recommendProductsForUsers(500)
    predictions = model.recommendProductsForUsers(2)
    prediction_flat = predictions.flatMap(lambda p: p[1])
    prediction_df = prediction_flat.toDF()
    intersections = prediction_df.join(test_df, (prediction_df.product == test_df.track_label) &
                                       (prediction_df.user == test_df.user_label), how='inner')
    predLabel = intersections.select('rating', 'count')
    #predLabel_rdd = predLabel.rdd.map(lambda x: Row(x[0], x[1]))
    #metrics = RankingMetrics(predLabel_rdd)
    #print(metrics.meanAveragePrecision)

    from pyspark.sql import Window
    import pyspark.sql.functions as psf
    w_rating = Window.orderBy(psf.desc('rating'))
    w_count = Window.orderBy(psf.desc('count'))
    predLabel = predLabel.withColumn('rating_rank', psf.dense_rank().over(w_rating)) \
                         .withColumn('count_rank', psf.dense_rank().over(w_count))

    predLabel = predLabel.select('rating_rank', 'count_rank')
    #predLabel_rdd = predLabel.rdd.map(lambda x: Row(x[0], x[1]))
    predLabel_rdd = predLabel.rdd
    metrics = RankingMetrics(predLabel_rdd)
Example #3
def AnadeColumnaSpark(SQLContext, DFInput, NombreColumna, ListaAInsertar,
                      cambioADouble):

    # Appends a column to a Spark DataFrame from a list, giving the column
    # the provided name

    # Create a DataFrame from the list of values to insert
    DFListaAInsertar = SQLContext.createDataFrame([(l, )
                                                   for l in ListaAInsertar],
                                                  [NombreColumna])

    #add 'sequential' index and join both dataframe to get the final result
    DFInput = DFInput.withColumn(
        "row_idx",
        row_number().over(Window.orderBy(monotonically_increasing_id())))
    DFListaAInsertar = DFListaAInsertar.withColumn(
        "row_idx",
        row_number().over(Window.orderBy(monotonically_increasing_id())))

    DFSalida = DFInput.join(
        DFListaAInsertar,
        DFInput.row_idx == DFListaAInsertar.row_idx).drop("row_idx")

    if cambioADouble:
        DFSalida = DFSalida.withColumn(
            NombreColumna, DFSalida[NombreColumna].cast(DecimalType(15, 3)))

    return DFSalida
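A minimal usage sketch (the names entrada/salida/letra/valor are illustrative), assuming the imports the snippet relies on — row_number, Window, monotonically_increasing_id, DecimalType — are in scope; a SparkSession works in place of the SQLContext argument since both expose createDataFrame. Note that pairing rows via row_number over monotonically_increasing_id relies on the current row order of each DataFrame, which Spark does not guarantee after shuffles.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
entrada = spark.createDataFrame([("a",), ("b",), ("c",)], ["letra"])

# append the list as a new column "valor", cast to Decimal(15, 3)
salida = AnadeColumnaSpark(spark, entrada, "valor", [1.5, 2.5, 3.5], cambioADouble=True)
salida.show()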
Example #4
def with_row_number(output_col: str,
                    order_by: list,
                    df: DataFrame,
                    sort="asc",
                    zero_indexed=True) -> DataFrame:
    """Assign a sequential row number to each member of a dataframe"""

    is_desc = sort.lower() in ["desc", "descending"]
    if isinstance(order_by, (str, Column)):
        order_by = [order_by]
    elif not isinstance(order_by, list):
        msg = "Ordering criteria must be a column name, a Column, or a list of them"
        raise TypeError(msg)

    # create a window function depending on the sort order
    if is_desc:
        window = Window.orderBy(*[F.desc(i) for i in order_by])
    else:
        window = Window.orderBy(*[F.asc(i) for i in order_by])

    # if the client wants to start from row 1 then that's fine
    if not zero_indexed:
        return df.withColumn(output_col, F.row_number().over(window))

    # otherwise start from row number 0
    return df.withColumn(output_col, F.row_number().over(window) - 1)
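A short usage sketch, assuming DataFrame, Column, F (pyspark.sql.functions) and Window are already imported as the function requires; the people/name/age names are illustrative:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
people = spark.createDataFrame([("carol", 35), ("alice", 29), ("bob", 41)], ["name", "age"])

# zero-indexed row numbers, ordered by name ascending
with_row_number("row_id", ["name"], people).show()

# one-indexed row numbers, ordered by age descending
with_row_number("row_id", ["age"], people, sort="desc", zero_indexed=False).show()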
Example #5
def calc_profit2(df):
    '''
    Creates a new column with the Open price shifted by one day ("prev_day_price")
    and calculates the Profit label: 1 if the stock went up, 0 if it went down.
    :param df:
    :return:
    '''
    df_daily_return = df.withColumn(
        'prev_day_price',
        F.lag(df['Open']).over(Window.orderBy("id")))

    df_daily_return = df_daily_return.filter(
        df_daily_return.prev_day_price.isNotNull())

    df_profit = df_daily_return.withColumn(
        'Profit',
        profit_udf(df_daily_return.Open, df_daily_return.prev_day_price))

    # df_profit = df_profit.withColumn(
    #     'prediction', BHC(df_profit.Profit, F.lag(df_profit['Profit']).over(
    #         Window.orderBy("id"))))

    df_profit = df_profit.withColumn('prediction', RC(df_profit.Profit))

    df_shifted_profit = df_profit.withColumn(
        'Profit',
        F.lag(df_profit['Profit'], count=-1).over(Window.orderBy("id")))

    final_df = df_shifted_profit.filter(df_shifted_profit.Profit.isNotNull())

    final_df = final_df.drop("Daily return")
    final_df = final_df.drop("prev_day_price")
    return final_df
Example #6
def filterEnglish(df, sqlContext):

    before = df.count()
    df = df.dropna(subset=['tweet'])
    after = df.count()
    df = df.filter(df.country == 'United States of America')
    print("Success: removed " + str(before - after) +
          " null tweets and filtered USA")

    tweetList = list(df.select('tweet').toPandas()['tweet'])

    print("Success: made tweet list")

    languages = [ld.detect_language(text) for text in tweetList]

    print("Success: languages detected")

    languageDf = sqlContext.createDataFrame([(l, ) for l in languages],
                                            ['languages'])
    df = df.withColumn(
        "row_idx",
        row_number().over(Window.orderBy(monotonically_increasing_id())))
    languageDf = languageDf.withColumn(
        "row_idx",
        row_number().over(Window.orderBy(monotonically_increasing_id())))
    df = df.join(languageDf, df.row_idx == languageDf.row_idx).drop("row_idx")
    df = df.filter(df.languages == 'english').drop("languages")
    print("Success: filtered languages")

    return df
Example #7
def spark_join(df, column, column_name, sql_context):
    b = sql_context.createDataFrame([(int(l),) for l in column], [column_name])

    # add 'sequential' index and join both dataframe to get the final result
    df = df.withColumn("row_idx_2", row_number().over(Window.orderBy(monotonically_increasing_id())))
    b = b.withColumn("row_idx", row_number().over(Window.orderBy(monotonically_increasing_id())))

    final_df = df.join(b, df.row_idx_2 == b.row_idx) \
        .drop("row_idx_2", "row_idx")
    return final_df
Example #8
    def _cal_percent_rank(self, dfBase, col_name, order):
        print("sort column[{0}], order by[{1}]".format(col_name, order))
        if order == "asc":
            window = Window.orderBy(col_name)
        else:
            window = Window.orderBy(fn.desc(col_name))
        df = dfBase.where("%s is not null" % col_name) \
            .withColumn(col_name + "_rank", fn.percent_rank().over(window)) \
            .persist(storageLevel=StorageLevel.DISK_ONLY) \
            .union(dfBase.where("%s is null" % col_name)
                   .withColumn(col_name + "_rank", fn.lit(None))) \
            .repartition(20).persist(storageLevel=StorageLevel.DISK_ONLY)
        df.count()
        return df
Example #9
    def _is_monotonic_decreasing(self) -> Series:
        window = Window.orderBy(NATURAL_ORDER_COLUMN_NAME).rowsBetween(-1, -1)

        cond = SF.lit(True)
        has_not_null = SF.lit(True)
        for scol in self._internal.index_spark_columns[::-1]:
            data_type = self._internal.spark_type_for(scol)
            prev = F.lag(scol, 1).over(window)
            compare = MultiIndex._comparator_for_monotonic_increasing(data_type)
            # Since pandas 1.1.4, null values are not allowed at any level of a MultiIndex.
            # Therefore, we should check `has_not_null` over all the levels.
            has_not_null = has_not_null & scol.isNotNull()
            cond = F.when(scol.eqNullSafe(prev), cond).otherwise(compare(scol, prev, Column.__lt__))

        cond = has_not_null & (prev.isNull() | cond)

        cond_name = verify_temp_column_name(
            self._internal.spark_frame.select(self._internal.index_spark_columns),
            "__is_monotonic_decreasing_cond__",
        )

        sdf = self._internal.spark_frame.select(
            self._internal.index_spark_columns + [cond.alias(cond_name)]
        )

        internal = InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[
                scol_for(sdf, col) for col in self._internal.index_spark_column_names
            ],
            index_names=self._internal.index_names,
            index_fields=self._internal.index_fields,
        )

        return first_series(DataFrame(internal))
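The method above depends on Koalas/pandas-on-Spark internals (InternalFrame, NATURAL_ORDER_COLUMN_NAME, SF), so it is hard to run on its own. A minimal plain-PySpark sketch of the same idea for a single index column — compare each value with the previous row over the natural order and require the comparison to hold everywhere (toy column name assumed):

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(5,), (4,), (4,), (2,)], ["idx"])

# lag(1) needs exactly the (-1, -1) row frame, matching the window used above
w = Window.orderBy(F.monotonically_increasing_id()).rowsBetween(-1, -1)
prev = F.lag("idx", 1).over(w)

# a row is fine if it has no predecessor or is <= the previous value
flagged = df.withColumn("ok", prev.isNull() | (F.col("idx") <= prev))
flagged.select((F.min(F.col("ok").cast("int")) == 1).alias("is_monotonic_decreasing")).show()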
Example #10
def modify_group_for_dim(model_dict, df, d, colname):
    '''Given a DF with groups already assigned (column `colname`),
    apply a dictionary to post-process the groups along that one
    dimension. Returns the original DF with the `colname` column
    modified, e.g. to move specific seasons to the holdout or
    throwaway sets.
    '''
    dim_props = model_dict['dimensional_dataset_proportions'].items()
    for grp, grp_dict_list in dim_props:
        for grp_dict in grp_dict_list:
            window = Window.orderBy('dim_rnd')\
                           .partitionBy(grp_dict['dim'], colname)
            df = df.withColumn('dim_rk', F.percent_rank().over(window))

            ## if (1) the column is within the set values,
            ## (2) the pre-existing group falls within those set values, and
            ## (3) the random value is below the set threshold,
            ## then override and modify the group membership
            if grp_dict['prop_to_move'] > 0:
                df = df.withColumn(
                    colname,
                    F.when((col(grp_dict['dim']).isin(grp_dict['vals']))
                           & (col(colname).isin(grp_dict['from_groups']))
                           & (col('dim_rk') >= 1 - grp_dict['prop_to_move']),
                           grp).otherwise(col(colname)))
    return df
Example #11
def generate_brand_data():
    try:

        hbase = {"table": brand_table, "families": ["0"], "row": "brand_id"}
        print(f"{dt.now()} 生成品牌名称 {brand_table}")

        # Write to the brand table
        plm_item = get_plm_item(spark).select("item_id", "item_name",
                                              "item_kind", "is_mrb")
        # item_kind != 4 (4 = confiscated); is_mrb = 1 (1 = in use); strip spaces,
        # strip '*', replace Chinese parentheses with ASCII parentheses
        # Process item_name
        # ??? could use all the data and then drop the rows without parentheses
        plm_item_brand = plm_item\
                    .where(col("item_kind") != "4") \
                    .where(col("is_mrb") == "1") \
                    .withColumn("brand_name",item_name_udf(col("item_name")))\
                    .dropDuplicates(["brand_name"]) \
                    .where(col("brand_name") != "")

        from pyspark.sql import Window
        win = Window.orderBy("item_id")
        brand = plm_item_brand.select("item_id", "item_name", "brand_name") \
                              .withColumn("brand_id", f.row_number().over(win))
        brand.foreachPartition(
            lambda x: write_hbase1(x, ["brand_name"], hbase))
    except Exception:
        tb.print_exc()
Example #12
def state_dimension(business_df):
    # state dimension
    states = business_df.select(business_df.state).distinct()
    state_dim = states.withColumn(
        "state_id",
        F.row_number().over(Window.orderBy(F.monotonically_increasing_id())))
    return state_dim
Example #13
    def shift(self, periods=1, fill_value=None):
        """
        Shift Series/Index by desired number of periods.

        .. note:: the current implementation of shift uses Spark's Window without
            specifying a partition specification. This moves all the data into a
            single partition on a single machine and could cause serious
            performance degradation. Avoid this method on very large datasets.

        Parameters
        ----------
        periods : int
            Number of periods to shift. Can be positive or negative.
        fill_value : object, optional
            The scalar value to use for newly introduced missing values.
            The default depends on the dtype of self. For numeric data, np.nan is used.

        Returns
        -------
        Copy of input Series/Index, shifted.

        Examples
        --------
        >>> df = ks.DataFrame({'Col1': [10, 20, 15, 30, 45],
        ...                    'Col2': [13, 23, 18, 33, 48],
        ...                    'Col3': [17, 27, 22, 37, 52]},
        ...                   columns=['Col1', 'Col2', 'Col3'])

        >>> df.Col1.shift(periods=3)
        0     NaN
        1     NaN
        2     NaN
        3    10.0
        4    20.0
        Name: Col1, dtype: float64

        >>> df.Col2.shift(periods=3, fill_value=0)
        0     0
        1     0
        2     0
        3    13
        4    23
        Name: Col2, dtype: int64

        """
        if len(self._internal.index_columns) == 0:
            raise ValueError("Index must be set.")

        if not isinstance(periods, int):
            raise ValueError('periods should be an int; however, got [%s]' % type(periods))

        col = self._scol
        index_columns = self._kdf._internal.index_columns
        window = Window.orderBy(index_columns).rowsBetween(-periods, -periods)
        shifted_col = F.lag(col, periods).over(window)
        col = F.when(
            shifted_col.isNull() | F.isnan(shifted_col), fill_value
        ).otherwise(shifted_col)

        return self._with_new_scol(col).alias(self.name)
Example #14
def to_indexed_ids(df: DataFrame, col_name: str) -> DataFrame:
    """node id to index number.

    Args:
        df (DataFrame): source data frame.
        col_name (str): column to generate index.

    Returns:
        DataFrame:
        ```
        +-----------------+-----------+..+
        |customer_id_index|customer_id|..|
        +-----------------+-----------+..+
        |                0|   10001105|..|
        |                1|   10007421|..|
        |                2|   10008274|..|
        |                3|   10010722|..|
        |                4|   10012171|..|
        +-----------------+-----------+..+
        ```
    """
    # ref: https://towardsdatascience.com/adding-sequential-ids-to-a-spark-dataframe-fa0df5566ff6
    index_window = Window.orderBy(F.col(col_name))
    index_df = df.withColumn(f"{col_name}_index",
                             F.row_number().over(index_window) - 1)
    return index_df
Example #15
def tr_create_state_dim(df):

    df_dim = df.select(col('state').alias('StateName')).drop_duplicates()
    window = Window.orderBy(df_dim.StateName)
    df_dim_final = df_dim.withColumn('StateId', row_number().over(window))

    return df_dim_final
Example #16
    def ffill_windows(cls, df, time_col, columns_to_fill):
        """
    Forward filling strategy. This strategy fills empty 
    spots using the last know value of a column

    """
        import sys
        from pyspark.sql import Window
        from pyspark.sql.functions import last

        # define the window (and order it by time)
        window = Window.orderBy(time_col)\
                    .rowsBetween(-sys.maxsize, 0)

        # fill every column and replace columns
        for col_entry in columns_to_fill:
            col_name_to_fill = col_entry[0]
            col_name_new = col_entry[1]

            if (col_name_new is None):
                col_name_new = col_name_to_fill

            df = df.withColumn(
                col_name_new,
                last(df[col_name_to_fill], ignorenulls=True).over(window))

        return df
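A standalone sketch of the same forward-fill pattern on a toy DataFrame, assuming a local SparkSession; Window.unboundedPreceding is equivalent to the -sys.maxsize lower bound used above:

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
ticks = spark.createDataFrame(
    [(1, 10.0), (2, None), (3, None), (4, 12.5)], "ts INT, price DOUBLE")

# every row from the start of the ordered frame up to the current row
w = Window.orderBy("ts").rowsBetween(Window.unboundedPreceding, 0)
ticks.withColumn("price_filled", F.last("price", ignorenulls=True).over(w)).show()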
Example #17
def cal_sku_band(sp):
    """ Calculate the band of sp. """

    sku_id = 'sku_id'
    sale = 'sale'

    split_list = [0.2, 0.4, 0.6, 0.8]
    split_name = [1, 2, 3, 4, 5]
    cond_str = ''
    for i, percent in enumerate(split_list):
        cond_str += '''        WHEN rank_percent < {percent} THEN {name} '''.format(
            percent=percent, name=split_name[i])
    cond_str += '''        ELSE {name}'''.format(name=split_name[-1])
    split_cond = '''
    CASE 
        {cond_str}
    END 
        AS band
    '''.format(cond_str=cond_str)

    group_sp = sp.groupBy(sku_id).agg(F.sum(F.col(sale)).alias('sale_sum'))
    distinct_sp = group_sp.select(F.countDistinct(sku_id).alias('cnt'))
    windowspec_r = Window.orderBy(F.col('sale_sum').desc())
    rank_sp = group_sp.select(
        'sku_id',
        F.rank().over(windowspec_r).alias('rank')).crossJoin(distinct_sp)
    rank_sp = rank_sp.withColumn('rank_percent',
                                 F.col('rank') / F.col('cnt'))
    band_sp = rank_sp.selectExpr('sku_id', split_cond)

    return band_sp
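A usage sketch, assuming a SparkSession named spark and the F = pyspark.sql.functions / Window imports used above; with the rank normalized by the distinct SKU count, band 1 roughly covers the top 20% of SKUs by total sales:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sp = spark.createDataFrame(
    [("sku1", 100.0), ("sku2", 80.0), ("sku3", 60.0), ("sku4", 40.0), ("sku5", 20.0)],
    ["sku_id", "sale"])

cal_sku_band(sp).orderBy("sku_id").show()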
Example #18
    def test_window_functions_without_partitionBy(self):
        df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"),
                                         (1, "2")], ["key", "value"])
        w = Window.orderBy("key", df.value)
        from pyspark.sql import functions as F

        sel = df.select(
            df.value,
            df.key,
            F.max("key").over(w.rowsBetween(0, 1)),
            F.min("key").over(w.rowsBetween(0, 1)),
            F.count("key").over(w.rowsBetween(float("-inf"), float("inf"))),
            F.row_number().over(w),
            F.rank().over(w),
            F.dense_rank().over(w),
            F.ntile(2).over(w),
        )
        rs = sorted(sel.collect())
        expected = [
            ("1", 1, 1, 1, 4, 1, 1, 1, 1),
            ("2", 1, 1, 1, 4, 2, 2, 2, 1),
            ("2", 1, 2, 1, 4, 3, 2, 2, 2),
            ("2", 2, 2, 2, 4, 4, 4, 3, 2),
        ]
        for r, ex in zip(rs, expected):
            self.assertEqual(tuple(r), ex[:len(r)])
Example #19
    def execute(self):
        """Execute the link.

        :returns: status code of execution
        :rtype: StatusCode
        """
        ds = process_manager.service(DataStore)

        # --- your algorithm code goes here
        self.logger.debug('Now executing link: {link}.', link=self.name)

        df = ds[self.read_key]

        window = Window.orderBy(self.datetime_col) \
            .rowsBetween(0, Window.unboundedFollowing)

        if self.partitionby_cols:
            window = window.partitionBy(self.partitionby_cols)

        date_col = f.to_date(self.datetime_col)  # only dates per row
        reset_col = f.when(f.col(self.event_col) > 0,
                           date_col)  # dates of events
        bfilled_reset_col = f.first(reset_col, ignorenulls=True).over(
            window)  # backfilled dates of events
        countdown_col = f.datediff(
            bfilled_reset_col,
            date_col)  # difference between next event date and today

        ds[self.store_key] = df.withColumn(self.countdown_col_name,
                                           countdown_col)

        return StatusCode.Success
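The core of the link above — a backfilled "date of the next event" and a day count down to it — can be sketched standalone on toy data, assuming a local SparkSession and f = pyspark.sql.functions as in the link:

from pyspark.sql import SparkSession, functions as f
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
log = spark.createDataFrame(
    [("2021-01-01", 0), ("2021-01-02", 1), ("2021-01-03", 0), ("2021-01-05", 1)],
    "dt STRING, event INT")

# forward-looking frame: the current row and everything after it
w = Window.orderBy("dt").rowsBetween(0, Window.unboundedFollowing)

date_col = f.to_date("dt")
reset_col = f.when(f.col("event") > 0, date_col)            # dates on which an event happened
next_event = f.first(reset_col, ignorenulls=True).over(w)   # backfilled date of the next event
log.withColumn("days_to_next_event", f.datediff(next_event, date_col)).show()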
Example #20
def events_popularity_score(df):
    df.registerTempTable("df_tbl")

    df_popularity = spark.sql('''
                    select item_id, country, count(distinct USER_ID) as count_score
                    from df_tbl group by 1,2 order by 3 desc
                    ''')

    '''
    df_popularity=df_popularity.coalesce(1)
    from pyspark.sql.functions import monotonically_increasing_id
    df_popularity=df_popularity.withColumn('score', monotonically_increasing_id())
    '''

    from pyspark.sql import Window
    import pyspark.sql.functions as psf

    cnt_score = Window.orderBy(psf.desc("count_score"))

    df_popularity_sc = df_popularity.withColumn("score",
                                                psf.dense_rank().over(cnt_score))

    return df_popularity_sc
Example #21
    def run(self):

        session = SparkSession.builder.appName("KueskiRaitings").master(
            "local[*]").config("spark.driver.memory",
                               "8g").config("spark.executor.memory",
                                            "8g").getOrCreate()

        dataFrameReader = session.read

        df = dataFrameReader.option("header",
                                    "true").option("inferSchema",
                                                   "true").csv(self.get_data())
        df = df.withColumn(
            "index",
            row_number().over(Window.orderBy(monotonically_increasing_id())) -
            1)

        df = df.withColumn(USER_ID, df[USER_ID].cast(IntegerType()))
        df = df.withColumn(MOVIE_ID, df[MOVIE_ID].cast(IntegerType()))
        df = df.withColumn(RATING, df[RATING].cast(FloatType()))
        df = df.withColumn(TS, df[TS].cast(TimestampType()))

        w1 = Window.partitionBy(USER_ID).orderBy(TS)

        df = df.withColumn("nb_previous_ratings", row_number().over(w1) - 1)
        df = df.withColumn("avg_ratings_previous", mean(RATING).over(w1))
        df = df.withColumn("avg_ratings_previous",
                           lag('avg_ratings_previous', 1, 0).over(w1))

        obj = df.select([
            c for c in df.columns
            if c in ['nb_previous_ratings', 'avg_ratings_previous']
        ])

        return obj.toPandas().to_json(orient="records")
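The two per-user features above (count and running average of previous ratings) can be reproduced on toy data like this; the column names are illustrative and a local SparkSession is assumed:

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
ratings = spark.createDataFrame(
    [(1, 10, 4.0, 1), (1, 11, 3.0, 2), (1, 12, 5.0, 3), (2, 10, 2.0, 1)],
    "user_id INT, movie_id INT, rating DOUBLE, ts INT")

w = Window.partitionBy("user_id").orderBy("ts")

features = (ratings
            .withColumn("nb_previous_ratings", F.row_number().over(w) - 1)
            # running mean including the current rating...
            .withColumn("avg_ratings_previous", F.mean("rating").over(w))
            # ...shifted down by one so it only covers previous ratings (0 when there are none)
            .withColumn("avg_ratings_previous", F.lag("avg_ratings_previous", 1, 0).over(w)))
features.show()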
Example #22
def get_street_categories_index(spark, col_name, pos_samples, road_features):
    n_roads = road_features.count()
    street_cat_distrib = (
        road_features.select(col_name)
        .na.fill("unknown")
        .groupBy(col_name)
        .count()
        .withColumn("p", col("count") / lit(n_roads))
        .orderBy(col("p").desc())
        .drop("count")
    )
    return (
        pos_samples.select(col_name)
        .na.fill("unknown")
        .groupBy(col_name)
        .count()
        .withColumn("p_pos", col("count") / lit(pos_samples.count()))
        .drop("count")
        .join(street_cat_distrib, col_name)
        .na.fill(0, ["p_pos"])
        .select(col_name, (col("p_pos") - col("p")).alias("risk"))
        .withColumn(
            col_name + "_indexed", row_number().over(Window.orderBy(col("risk").desc()))
        )
        .drop("risk")
    )
Example #23
    def Run(self):
        def Lift(cum_resp, decil_no):
            return cum_resp / float(decil_no + 1)
        self._df = self._df.orderBy(self._proba_column, ascending=False)
        self._df = self._df.withColumn("id", monotonically_increasing_id())
        # w = Window.orderBy(desc(self._proba_column))
        # self._df = self._df.withColumn('id', row_number().over(w))
        discretizer = QuantileDiscretizer(numBuckets=10, inputCol="id", outputCol="deciles")
        self._df = discretizer.fit(self._df).transform(self._df)
        Rank = self._df.groupby('deciles').agg(
            F.count(self._df[self._proba_column]).alias('cnt'),
            F.count(when(self._df[self._target_column] == int(self._posLabel), True)).alias('cnt_resp'))
        Rank = Rank.withColumn('cnt_non_resp', Rank['cnt'] - Rank['cnt_resp'])
        Rank = Rank.orderBy('deciles', ascending=True)
        cumsum_window = (Window.orderBy(Rank['deciles'])
                         .rangeBetween(Window.unboundedPreceding, Window.currentRow))
        Rank = Rank.withColumn("cum_resp", F.sum('cnt_resp').over(cumsum_window))
        Rank = Rank.withColumn("cum_non_resp", F.sum('cnt_non_resp').over(cumsum_window))
        Rank = Rank.withColumn(
            "% Responders(Cumulative)",
            F.round(old_div(Rank["cum_resp"] * 100,
                            Rank.select(F.sum('cnt_resp')).collect()[0][0]), 2))
        Rank = Rank.withColumn(
            "% Non-Responders(Cumulative)",
            F.round(old_div(Rank["cum_non_resp"] * 100,
                            Rank.select(F.sum('cnt_non_resp')).collect()[0][0]), 2))
        Rank = Rank.withColumn("cum_population", F.sum("cnt").over(cumsum_window))
        Rank = Rank.withColumn(
            "pop_pct_per_decile",
            F.round(old_div(Rank["cnt"] * 100,
                            Rank.select(F.sum('cnt')).collect()[0][0])))
        Rank = Rank.withColumn(
            "% Population(Cumulative)",
            F.round(F.sum('pop_pct_per_decile').over(cumsum_window)))
        Rank = Rank.withColumn(
            "KS",
            F.round(Rank["% Responders(Cumulative)"] - Rank["% Non-Responders(Cumulative)"], 2))
        Rank = Rank.withColumn(
            "Lift at Decile",
            F.round(old_div(Rank["cnt_resp"] * Rank["pop_pct_per_decile"] * 100,
                            Rank.select(F.sum('cnt_resp')).collect()[0][0]), 2))
        Rank = Rank.withColumn("id", monotonically_increasing_id())
        Lift_udf = udf(lambda x, y: Lift(x, y), FloatType())
        Rank = Rank.withColumn("Total_Lift", F.round(Lift_udf("cum_resp", "id"), 2))
        Rank = Rank.drop('id')
        return Rank
Example #24
def main():
    """
    Run pipeline:
    - Create spark session
    - Get config
    - Read all dataframes with meta
    - Merge country names
    - Correct country names
    - Generate an id column
    - Write with meta
    :return: None
    """
    spark = create_spark_session()

    config_path = get_config_path_from_cli()
    config = provide_config(config_path).get('scripts').get('country')
    country_mapping_path = config.get('country_mapping_path')

    (gdp_per_capita, human_capital_index, press_freedom_index,
     temperatures_by_country, immigration) = read_data(spark, config=config)
    df = merge_country_names(gdp_per_capita, human_capital_index,
                             press_freedom_index, temperatures_by_country,
                             immigration)

    df = correct_country_names(df=df,
                               country_col='country_name',
                               country_mapping_path=country_mapping_path)
    df = df.withColumn('country_id',
                       F.row_number().over(Window.orderBy('country_name')))

    write_with_meta(df=df, df_meta=config['output_meta'])
Example #25
def induce_graph(graph, relabel=True, partitions=[]):
    """Remove extra edges that do not belong to the graph"""
    # small dataframe for reindexing/relabeling

    window = Window.orderBy("id")
    if partitions:
        window = window.partitionBy(partitions)

    # ensure 0 index for mapping into a scipy.sparse matrix
    rank = graph.vertices.select(
        "id",
        F.row_number().over(window).alias("rank")).withColumn(
            "rank", F.expr("rank - 1"))

    vertices = graph.vertices.join(rank, on="id", how="left")

    edges = graph.edges.join(vertices.selectExpr("id as src",
                                                 "rank as rank_src"),
                             on="src",
                             how="inner").join(vertices.selectExpr(
                                 "id as dst", "rank as rank_dst"),
                                               on="dst",
                                               how="inner")

    if relabel:
        vertices = vertices.withColumn("relabeled_id", F.col("id")).withColumn(
            "id", F.col("rank"))
        edges = (edges.withColumn("relabeled_src", F.col("src")).withColumn(
            "relabeled_dst",
            F.col("dst")).withColumn("src", F.col("rank_src")).withColumn(
                "dst", F.col("rank_dst")))

    vertices = vertices.drop("rank")
    edges = edges.drop("rank_src", "rank_dst")
    return GraphFrame(vertices, edges)
Example #26
def tr_create_city_dim(df):

    df_dim = df.select(col('county').alias('CityName')).drop_duplicates()
    window = Window.orderBy(df_dim.CityName)
    df_dim_final = df_dim.withColumn('CityId', row_number().over(window))

    return df_dim_final
Example #27
def create_ids(df):
    df_titles = df.select("title").distinct()
    windowspec = Window.orderBy("title")
    df_ids = df_titles.withColumn("id", row_number().over(windowspec))
    df_ids = df_ids.withColumn("id", col("id") - 1)
    count = df_ids.count()
    return df_ids, count
Example #28
def _calc_model_metrics(df: DataFrame, prob_col: str,
                        label_col: str) -> Dict[str, float]:
    r"""calc model metrics at max f1 given probabilities and labels

    Parameters
    ----------
    df : pyspark.sql.DataFrame
    prob_col : str
        colname w/ raw probabilities of being in class 1
    label_col : str

    Returns
    -------
    max_metrics : dict
        dict with keys : 'tp', 'tn', 'fp', 'fn', 'f1', 'accuracy',
        'precision', 'recall' and corresponding values as floats

    Raises
    ------
    UncaughtException
    """
    _persist_if_unpersisted(df)

    metrics_df = df.groupby(prob_col).pivot(label_col).count().fillna(value=0)
    metrics_df.persist(StorageLevel(False, True, False, False))

    window = Window.orderBy(prob_col).rowsBetween(Window.unboundedPreceding,
                                                  -1)
    metrics_df = metrics_df.withColumn('fn', F.sum(F.col(str(1))).over(window))
    metrics_df = metrics_df.withColumn(
        'tn',
        F.sum(F.col(str(0))).over(window)).fillna(value=0)
    metrics_df.persist(StorageLevel(False, True, False, False, 1))

    all_count = df.count()
    pos_count = df.where(F.col(label_col) == 1).count()
    neg_count = all_count - pos_count

    metrics_df = metrics_df.withColumn('tp', pos_count - F.col('fn'))
    metrics_df = metrics_df.withColumn('fp', neg_count - F.col('tn'))

    metrics_df = metrics_df.withColumn('precision', (F.col('tp')) /
                                       (F.col('tp') + F.col('fp')))
    metrics_df = metrics_df.withColumn('recall', F.col('tp') / pos_count)

    metrics_df = metrics_df.withColumn(
        'informativeness', 2 * (F.col('precision') * F.col('recall')) /
        (F.col('precision') + F.col('recall')))
    metrics_df.persist(StorageLevel(False, True, False, False))

    max_metrics = metrics_df.where(
        F.col('informativeness') == metrics_df.select(
            F.max(F.col('informativeness'))).take(1)[0][0]).take(
                1)[0].asDict()
    max_metrics['accuracy'] = (max_metrics['tp'] +
                               max_metrics['tn']) / all_count
    max_metrics['threshold'] = max_metrics[prob_col]

    return max_metrics
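The helper above leans on _persist_if_unpersisted, but the threshold sweep itself — cumulative false-negative/true-negative counts over probabilities ordered ascending, then precision/recall/F1 per threshold — can be sketched on toy data (a local SparkSession is assumed):

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
scored = spark.createDataFrame(
    [(0.9, 1), (0.8, 1), (0.7, 0), (0.6, 1), (0.4, 0), (0.2, 0)], "prob DOUBLE, label INT")

counts = scored.groupBy("prob").pivot("label", [0, 1]).count().fillna(0)

# rows strictly below the current probability are predicted negative
w = Window.orderBy("prob").rowsBetween(Window.unboundedPreceding, -1)
counts = (counts
          .withColumn("fn", F.coalesce(F.sum(F.col("1")).over(w), F.lit(0)))
          .withColumn("tn", F.coalesce(F.sum(F.col("0")).over(w), F.lit(0))))

pos = scored.where(F.col("label") == 1).count()
neg = scored.count() - pos

metrics = (counts
           .withColumn("tp", pos - F.col("fn"))
           .withColumn("fp", neg - F.col("tn"))
           .withColumn("precision", F.col("tp") / (F.col("tp") + F.col("fp")))
           .withColumn("recall", F.col("tp") / pos)
           .withColumn("f1", 2 * F.col("precision") * F.col("recall")
                       / (F.col("precision") + F.col("recall"))))
metrics.orderBy(F.col("f1").desc()).show()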
Example #29
def topK_df(df, key_col, K):
    """
    Using window functions.  Handles ties OK.
    """
    window = Window.orderBy(functions.col(key_col).desc())
    return (df.withColumn("rank",
                          functions.rank().over(window)).filter(
                              functions.col('rank') <= K).drop('rank'))
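A quick usage sketch showing the tie handling: rank() gives tied rows the same rank, so with K=2 this returns three rows (the two 8s share rank 2). A local SparkSession is assumed and the functions import matches the one the snippet uses:

from pyspark.sql import SparkSession, functions

spark = SparkSession.builder.getOrCreate()
scores = spark.createDataFrame(
    [("a", 10), ("b", 8), ("c", 8), ("d", 5)], "name STRING, score INT")

topK_df(scores, "score", 2).show()   # keeps a, b and c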
Example #30
def category_dimension(b_df):
    # category dimension
    categories = b_df.select(b_df.restaurant_category).filter(
        b_df.restaurant_category.isNotNull()).distinct()
    category_dim = categories.withColumn(
        "category_id",
        F.row_number().over(Window.orderBy(F.monotonically_increasing_id())))
    return category_dim
Example #31
def spark_timestamp_split(
    data,
    ratio=0.75,
    col_user=DEFAULT_USER_COL,
    col_item=DEFAULT_ITEM_COL,
    col_timestamp=DEFAULT_TIMESTAMP_COL,
):
    """Spark timestamp based splitter
    The splitter splits the data into sets by timestamps without stratification on either
    user or item.
    The ratios are applied on the timestamp column which is divided accordingly into
    several partitions.

    Args:
        data (spark.DataFrame): Spark DataFrame to be split.
        ratio (float or list): Ratio for splitting data. If it is a single float number
            it splits data into two sets and the ratio argument indicates the ratio of
            training data set; if it is a list of float numbers, the splitter splits
            data into several portions corresponding to the split ratios. If a list is
            provided and the ratios are not summed to 1, they will be normalized.
            Earlier indexed splits will have earlier times
            (e.g the latest time in split[0] <= the earliest time in split[1])
        col_user (str): column name of user IDs.
        col_item (str): column name of item IDs.
        col_timestamp (str): column name of timestamps. Float number represented in
        seconds since Epoch.

    Returns:
        list: Splits of the input data as spark.DataFrame.
    """
    multi_split, ratio = process_split_ratio(ratio)

    ratio = ratio if multi_split else [ratio, 1 - ratio]
    ratio_index = np.cumsum(ratio)

    window_spec = Window.orderBy(col(col_timestamp))
    rating = data.withColumn("rank", row_number().over(window_spec))

    data_count = rating.count()
    rating_rank = rating.withColumn("rank", row_number().over(window_spec) / data_count)

    splits = []
    for i, _ in enumerate(ratio_index):
        if i == 0:
            rating_split = rating_rank.filter(col("rank") <= ratio_index[i]).drop(
                "rank"
            )
        else:
            rating_split = rating_rank.filter(
                (col("rank") <= ratio_index[i]) & (col("rank") > ratio_index[i - 1])
            ).drop("rank")

        splits.append(rating_split)

    return splits
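Stripped of the helper imports (process_split_ratio, the DEFAULT_* column constants), the core of the splitter is a normalized row_number over the timestamp ordering; a self-contained two-way 75/25 sketch:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [(1, 100, 1.0), (2, 101, 2.0), (3, 102, 3.0), (4, 103, 4.0)],
    "user INT, item INT, ts DOUBLE")

# normalized rank in (0, 1]; earlier timestamps get smaller values
w = Window.orderBy(col("ts"))
ranked = events.withColumn("rank", row_number().over(w) / events.count())

train = ranked.filter(col("rank") <= 0.75).drop("rank")  # earliest 75%
test = ranked.filter(col("rank") > 0.75).drop("rank")    # latest 25%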
Example #32
	|   3|  a|
	|   1|  b|
	|   2|  b|
	|   3|  b|
	|   1|  c|
	|   3|  c|
	+----+---+
"""

schema = StructType([StructField("attr", StringType(), True), StructField("obj", StringType(), True)])

aoDF = sqlCtx.createDataFrame(aoPair, schema)

# Window that moves over rows of the same obj, sorted by attr

window = Window.orderBy("attr").partitionBy("obj")

## Prev column contains previous attr of the same object
"""
	Transformed Table	
	+----+---+----+
	|attr|obj|prev|
	+----+---+----+
	|   1|  a|null|
	|   2|  a|   1|
	|   3|  a|   2|
	|   1|  b|null|
	|   2|  b|   1|
	|   3|  b|   2|
	|   1|  c|null|
	|   3|  c|   1|