Example #1
def _filter_feature_table_by_time_range(
    feature_table_df: DataFrame,
    feature_table: FeatureTable,
    feature_event_timestamp_column: str,
    entity_df: DataFrame,
    entity_event_timestamp_column: str,
):
    entity_max_timestamp = entity_df.agg({
        entity_event_timestamp_column: "max"
    }).collect()[0][0]
    entity_min_timestamp = entity_df.agg({
        entity_event_timestamp_column: "min"
    }).collect()[0][0]

    feature_table_timestamp_filter = (
        col(feature_event_timestamp_column).between(
            entity_min_timestamp - timedelta(seconds=feature_table.max_age),
            entity_max_timestamp,
        ) if feature_table.max_age else
        col(feature_event_timestamp_column) <= entity_max_timestamp)

    time_range_filtered_df = feature_table_df.filter(
        feature_table_timestamp_filter)

    return time_range_filtered_df
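
A minimal standalone usage sketch of the time-range filter above, with made-up entity and feature frames and a hypothetical max_age of seven days (the Feast FeatureTable object is replaced by a plain constant):

from datetime import datetime, timedelta

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

entity_df = spark.createDataFrame(
    [("u1", datetime(2023, 1, 10)), ("u2", datetime(2023, 1, 20))],
    ["user_id", "event_timestamp"])
feature_df = spark.createDataFrame(
    [("u1", datetime(2023, 1, 8), 0.5), ("u2", datetime(2023, 2, 1), 0.7)],
    ["user_id", "event_timestamp", "score"])

max_age_seconds = 7 * 24 * 3600  # stands in for feature_table.max_age
entity_min = entity_df.agg({"event_timestamp": "min"}).collect()[0][0]
entity_max = entity_df.agg({"event_timestamp": "max"}).collect()[0][0]

# Same predicate as above: keep feature rows inside [min - max_age, max].
filtered = feature_df.filter(
    col("event_timestamp").between(
        entity_min - timedelta(seconds=max_age_seconds), entity_max))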
Example #2
    def _get_base_dataframe(self, client: SparkClient, dataframe: DataFrame,
                            end_date: str) -> DataFrame:
        start_date = dataframe.agg(functions.min(
            self.timestamp_column)).take(1)[0][0]
        end_date = end_date or dataframe.agg(
            functions.max(
                self.timestamp_column)).take(1)[0][0] + timedelta(days=1)
        date_df = date_range.get_date_range(client, start_date, end_date)
        unique_keys = dataframe.dropDuplicates(
            subset=self.keys_columns).select(*self.keys_columns)

        return unique_keys.crossJoin(date_df)
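
A self-contained sketch of the same keys-times-dates expansion, replacing the project-specific date_range.get_date_range helper with a Spark sequence expression (column names are invented for illustration):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [("a", "2023-01-01"), ("a", "2023-01-03"), ("b", "2023-01-02")],
    ["id", "ts"]).withColumn("ts", F.to_date("ts"))

start_date = df.agg(F.min("ts")).take(1)[0][0]
end_date = df.agg(F.max("ts")).take(1)[0][0]

# One row per calendar day between start_date and end_date, inclusive.
date_df = spark.createDataFrame([(start_date, end_date)], ["start", "end"]) \
    .select(F.explode(
        F.sequence("start", "end", F.expr("interval 1 day"))).alias("date"))

unique_keys = df.dropDuplicates(subset=["id"]).select("id")
base_df = unique_keys.crossJoin(date_df)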
Example #3
def create_report(data: DataFrame) -> DataFrame:
    log_metric("Column Count", len(data.columns))
    log_metric(
        "Avg Score",
        int(
            data.agg({
                "score": "sum"
            }).collect()[0][0] +
            randint(-2 * len(data.columns), 2 * len(data.columns))),
    )
    return data
Example #4
    def _get_ks_statistic(self, df_distances: DataFrame) -> dict:
        if not self.partitions:
            ks_stat_row = df_distances.agg(
                F.max(self.DISTANCE).alias(self.DISTANCE)).collect()
            ks_stat = ks_stat_row[0][self.DISTANCE]

            result = {
                'statistic': ks_stat,
                'ks_table': [{
                    'upper_bound': 1,
                    'lower_bound': 0,
                    'statistic': ks_stat
                }]
            }
        else:
            df_ks_partitioned = df_distances.withColumn(
                self.PARTITION, F.lit(0))

            for idx, threshold in enumerate(self.partitions):
                df_ks_partitioned = df_ks_partitioned\
                    .withColumn(self.PARTITION,
                                F.when(F.col(self.probability_col) <= threshold, F.lit(idx+1))
                                .otherwise(F.col(self.PARTITION)))
            ks_stat_partitioned = df_ks_partitioned.groupBy(self.PARTITION)\
                .agg(F.max(self.DISTANCE).alias(self.DISTANCE))

            ks_stat_row = ks_stat_partitioned.agg(
                F.max(self.DISTANCE).alias(self.DISTANCE)).collect()
            ks_stat = ks_stat_row[0][self.DISTANCE]

            bounded_partition = [1.0] + self.partitions + [0.0]
            ks_stat_rows = ks_stat_partitioned.collect()
            ks_table = [{
                'upper_bound': bounded_partition[row[self.PARTITION]],
                'lower_bound': bounded_partition[1 + row[self.PARTITION]],
                'statistic': row[self.DISTANCE]
            } for row in ks_stat_rows]

            result = {'statistic': ks_stat, 'ks_table': ks_table}

        return result
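
For context, the unpartitioned branch above reduces to a single max over a per-threshold distance column; a minimal sketch with hypothetical distance values (not the class above):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# df_distances would normally hold the per-threshold CDF distances.
df_distances = spark.createDataFrame(
    [(0.1, 0.05), (0.5, 0.32), (0.9, 0.12)], ["probability", "distance"])

ks_stat = df_distances.agg(
    F.max("distance").alias("distance")).collect()[0]["distance"]
print(ks_stat)  # 0.32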
Example #5
def to_user_product_pairs(products_by_customer_df: DataFrame,
                          indexed_user_items: DataFrame) -> DataFrame:
    """
    Create positive and negative product pairs for each customer.

    Args:
        products_by_customer_df (DataFrame):
        +-----------+--------------------+
        |customer_id|            products|
        +-----------+--------------------+
        |   10007421|[12537, 27265, 32...|
        |   10036418|[46420, 34635, 27...|
        |   10058663|[42536, 42984, 37...|
        |   10083340|[34617, 45656, 42...|
        |   10108034|[43418, 46650, 42...|
        +-----------+--------------------+
        indexed_user_items (DataFrame): product id to product index mapping.

    Returns:
        DataFrame:
        root
            |-- <partition_column> customer_id: string (nullable = true)
            |-- positives: array (nullable = true)
            |    |-- element: integer (containsNull = false)

    """
    products_collection = indexed_user_items.agg(
        F.collect_set("product_id_index").alias("full_product_index"))
    positive_and_neg = (
        products_by_customer_df.crossJoin(F.broadcast(products_collection))
        .withColumn("negative_ids",
                    F.array_except("full_product_index", "positives_ids"))
        .select(
            F.col("customer_id"),
            F.col("customer_id_index"),
            F.col("cross_bin_number"),
            F.col("positives_ids").alias("positives"),
            F.expr("slice(shuffle(negative_ids), 1, size(positives_ids))")
            .alias("negatives"),
        ))
    return positive_and_neg
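
A usage sketch of the negative-sampling step above with toy data (the column names follow the function; the values and catalogue size are made up). array_except removes a customer's positives from the full catalogue, and slice(shuffle(...)) samples as many negatives as there are positives:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

by_customer = spark.createDataFrame(
    [("10007421", [1, 2, 3]), ("10036418", [2, 4])],
    ["customer_id", "positives_ids"])
catalogue = spark.createDataFrame([(i,) for i in range(1, 7)],
                                  ["product_id_index"])

full_index = catalogue.agg(
    F.collect_set("product_id_index").alias("full_product_index"))

pairs = (by_customer.crossJoin(F.broadcast(full_index))
         .withColumn("negative_ids",
                     F.array_except("full_product_index", "positives_ids"))
         .select("customer_id",
                 F.col("positives_ids").alias("positives"),
                 F.expr("slice(shuffle(negative_ids), 1, size(positives_ids))")
                 .alias("negatives")))
pairs.show(truncate=False)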
Example #6
def create_report(data: DataFrame) -> DataFrame:
    log_metric("Column Count", len(data.columns))
    log_dataframe("ready_data", data, with_histograms=True)
    avg_score = data.agg({"score_label": "sum"}).collect()[0][0]
    log_metric("Avg Score", chaos_float(avg_score))
    return data
Example #7
def aggregateBlocks(df: DataFrame):
    df.agg(func.sum("blockCount")).show()
Example #8
    def _pyspark_quantile_filtering(self, data: DataFrame, kpis: List[str],
                                    thresholds: Dict[str, Tuple[str, float]]):
        """ Make the filtering based on the given quantile level.
        Filtering is performed for each kpi independently.

        :param kpis: the kpis to perform filtering
        :type  kpis: list[str]
        :param thresholds: dict of thresholds mapping KPI names to (type, percentile) tuples
        :type  thresholds: dict

        :return: boolean values indicating whether the row should be filtered
        :rtype: pd.Series
        """

        warnings.warn(
            'Only approximated filtering is supported for PySpark DataFrame.')
        logger.warning(
            'Only approximated filtering is supported for PySpark DataFrame.')

        def find_smallest(data: DataFrame,
                          col_name: str,
                          quantile: float,
                          error: float = 0.01):
            """ Return boolean vector of data points smaller than given quantile."""
            threshold, = data.approxQuantile(col_name, [quantile], error)
            return data.withColumn("flags", (F.col("flags") |
                                             (F.col(col_name) <= threshold)))

        def find_largest(data: DataFrame,
                         col_name: str,
                         quantile: float,
                         error: float = 0.01):
            """ Return boolean vector of data points larger than given quantile."""
            threshold, = data.approxQuantile(col_name, [quantile], error)
            return data.withColumn("flags", (F.col("flags") |
                                             (F.col(col_name) >= threshold)))

        def find_smallest_and_largest(data: DataFrame,
                                      col_name: str,
                                      quantile: float,
                                      error: float = 0.01):
            """ Return boolean vector of data points outside of the given quantile."""
            rest = 1.0 - quantile
            quantiles = [rest / 2.0, 1.0 - rest / 2.0]
            low_threshold, high_threshold = data.approxQuantile(
                col_name, quantiles, error)
            return data.withColumn("flags",
                                   (F.col("flags") |
                                    ((F.col(col_name) <= low_threshold) |
                                     (F.col(col_name) >= high_threshold))))

        def find_smallest_and_largest_asym(data: DataFrame,
                                           col_name: str,
                                           quantile: float,
                                           error: float = 0.01):
            """ Return boolean vector of data to remove such that quantile/2
                is kept in both non-negative and non-positive subsets
                of data."""
            rest = 1.0 - quantile

            neg_threshold, = data.filter(F.col(col_name) < 0).approxQuantile(
                col_name, [rest / 2.0], error)

            pos_threshold, = data.filter(F.col(col_name) >= 0).approxQuantile(
                col_name, [1.0 - rest / 2.0], error)

            return data.withColumn("flags",
                                   (F.col("flags") |
                                    ((F.col(col_name) < neg_threshold) |
                                     (F.col(col_name) > pos_threshold))))

        data = data.withColumn("flags", F.lit(False))

        method_table = {
            'upper': find_largest,
            'lower': find_smallest,
            'two-sided': find_smallest_and_largest,
            'two-sided-asym': find_smallest_and_largest_asym
        }

        for col in data[kpis].columns:
            data = data.withColumn(f"replaced_{col}", F.col(col)).replace(
                [np.inf, -np.inf], np.nan, subset=[f"replaced_{col}"])

            if col in thresholds:
                threshold_type, percentile = thresholds[col]
                quantile = percentile / 100.0
            else:
                quantile = DEFAULT_OUTLIER_QUANTILE
                min_ = data.agg({f"replaced_{col}": "min"}).collect()[0][0]
                max_ = data.agg({f"replaced_{col}": "max"}).collect()[0][0]
                threshold_type = _get_threshold_type(min_, max_)

            if threshold_type not in method_table:
                raise ValueError("Unknown outlier filtering method '%s'." %
                                 (threshold_type, ))
            else:
                method = method_table[threshold_type]
                data = method(data, f"replaced_{col}", quantile,
                              self.error).drop(f"replaced_{col}")
        return data
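
The building block behind each of the inner helpers is DataFrame.approxQuantile combined with a boolean flags column; a standalone sketch of the two-sided case with synthetic data:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

data = spark.range(0, 1000).withColumn("kpi", F.rand(seed=42))
data = data.withColumn("flags", F.lit(False))

quantile = 0.98  # keep roughly the central 98% of values
rest = 1.0 - quantile
low, high = data.approxQuantile("kpi", [rest / 2.0, 1.0 - rest / 2.0], 0.01)

data = data.withColumn(
    "flags",
    F.col("flags") | (F.col("kpi") <= low) | (F.col("kpi") >= high))

# Rows with flags == True are the (approximate) outliers to drop.
kept = data.filter(~F.col("flags"))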
Example #9
    def avg_lvl(self, df: DataFrame):
        return df.agg(avg(self.elevation_column).alias("average_elevation"))
Example #10
    def min_max_lvl(self, df: DataFrame):
        return df.agg(
            min(self.elevation_column).alias("min_elevation"),
            max(self.elevation_column).alias("max_elevation"))
Example #11
    def _get_shap_values(
        self,
        sdf: DataFrame,
        label_col: str = "label",
        shuffle: bool = False,
        sdf_validation: DataFrame = None,
        estimator_params: Optional[Dict[str, object]] = None,
        explainer_type_params: Optional[Dict[str, object]] = None,
        explainer_params: Optional[Dict[str, object]] = None,
        broadcast: bool = True,
    ) -> Tuple[List[Series], List[Series]]:
        # Don't shuffle to get true shap values, shuffle to get null shap values
        if shuffle:
            shuffling_seed = self.random_state + self._current_iter if self.random_state is not None else None
            sdf = self._shuffle(sdf,
                                label_col=label_col,
                                broadcast=broadcast,
                                seed=shuffling_seed)

        # Train the model
        model = self.estimator.fit(sdf, **estimator_params or {})

        # Explain the model
        explainer = self.explainer_type(model, **explainer_type_params or {})

        # If we have a validation set, compute shap values on it instead of the training set
        if sdf_validation is not None:
            sdf = sdf_validation

        # Select features for shap
        # The features dataframe never changes, so we can compute it only the first time
        features = [
            col for col in sdf.columns
            if col not in (label_col, SPARK_FEATURES_NAME)
        ]
        if self._X_for_shap is None:
            self._X_for_shap = sdf.select(features).cache()

        # Compute shap values
        sdf = self._compute_shap_values(self._X_for_shap, explainer,
                                        explainer_params)
        if self._n_outputs is None:
            self._n_outputs = sdf.agg(
                F.countDistinct(SPARK_CLS_NAME)).head()[0]
        shap_values = [
            sdf.filter(F.col(SPARK_CLS_NAME) == i).drop(SPARK_CLS_NAME)
            for i in range(self._n_outputs)
        ]

        # Average positive and negative shap values for each class
        pos_shap_values = []
        neg_shap_values = []
        for cls_shap_values in shap_values:
            sdf_pos = cls_shap_values.agg(*[
                F.mean(
                    F.when(F.col(col_name) >= 0, F.col(col_name)).otherwise(
                        0)).name(col_name)
                for col_name in cls_shap_values.columns
            ])
            sdf_neg = cls_shap_values.agg(*[
                F.mean(
                    F.when(F.col(col_name) < 0, F.col(col_name)).otherwise(
                        0)).name(col_name)
                for col_name in cls_shap_values.columns
            ])
            # Back to "little data" regime with pandas
            s_pos = pd.Series(data=sdf_pos.head(),
                              index=features,
                              name="shap_values")
            s_neg = pd.Series(data=sdf_neg.head(),
                              index=features,
                              name="shap_values")
            pos_shap_values.append(s_pos)
            neg_shap_values.append(s_neg)

        return (pos_shap_values, neg_shap_values)
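
The per-class aggregation at the end of _get_shap_values (mean positive and mean negative contribution per feature column) can be exercised in isolation; a sketch with hypothetical SHAP-value columns:

import pandas as pd
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

cls_shap_values = spark.createDataFrame(
    [(0.2, -0.1), (-0.3, 0.4), (0.1, 0.0)], ["f1", "f2"])

# Mean of the positive part of each column (negative values count as 0).
sdf_pos = cls_shap_values.agg(*[
    F.mean(F.when(F.col(c) >= 0, F.col(c)).otherwise(0)).alias(c)
    for c in cls_shap_values.columns
])
s_pos = pd.Series(data=sdf_pos.head(), index=cls_shap_values.columns,
                  name="shap_values")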
Example #12
    def apply(self, data: DataFrame) -> DataFrame:
        return data.agg({"age": "max"})