def _filter_feature_table_by_time_range(
    feature_table_df: DataFrame,
    feature_table: FeatureTable,
    feature_event_timestamp_column: str,
    entity_df: DataFrame,
    entity_event_timestamp_column: str,
):
    entity_max_timestamp = entity_df.agg(
        {entity_event_timestamp_column: "max"}
    ).collect()[0][0]
    entity_min_timestamp = entity_df.agg(
        {entity_event_timestamp_column: "min"}
    ).collect()[0][0]

    feature_table_timestamp_filter = (
        col(feature_event_timestamp_column).between(
            entity_min_timestamp - timedelta(seconds=feature_table.max_age),
            entity_max_timestamp,
        )
        if feature_table.max_age
        else col(feature_event_timestamp_column) <= entity_max_timestamp
    )

    time_range_filtered_df = feature_table_df.filter(feature_table_timestamp_filter)

    return time_range_filtered_df
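# A minimal, self-contained sketch of the pattern above: dict-style agg plus
# collect()[0][0] to pull the min/max timestamps out as Python datetimes, then a
# between() filter widened by a max-age window. The table contents and the
# 3600-second max age are invented for illustration.
from datetime import datetime, timedelta

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

entity_df = spark.createDataFrame(
    [("c1", datetime(2021, 1, 1, 10)), ("c2", datetime(2021, 1, 2, 18))],
    ["customer_id", "event_timestamp"],
)
feature_df = spark.createDataFrame(
    [("c1", datetime(2021, 1, 1, 8), 0.3), ("c2", datetime(2020, 12, 1, 9), 0.7)],
    ["customer_id", "event_timestamp", "feature"],
)

# Dict-style agg returns a one-row DataFrame; collect()[0][0] extracts the scalar.
max_ts = entity_df.agg({"event_timestamp": "max"}).collect()[0][0]
min_ts = entity_df.agg({"event_timestamp": "min"}).collect()[0][0]

max_age_seconds = 3600  # hypothetical max age
filtered = feature_df.filter(
    col("event_timestamp").between(min_ts - timedelta(seconds=max_age_seconds), max_ts)
)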
def _get_base_dataframe(
    self, client: SparkClient, dataframe: DataFrame, end_date: str
) -> DataFrame:
    start_date = dataframe.agg(functions.min(self.timestamp_column)).take(1)[0][0]
    end_date = end_date or dataframe.agg(
        functions.max(self.timestamp_column)
    ).take(1)[0][0] + timedelta(days=1)
    date_df = date_range.get_date_range(client, start_date, end_date)
    unique_keys = dataframe.dropDuplicates(subset=self.keys_columns).select(
        *self.keys_columns
    )
    return unique_keys.crossJoin(date_df)
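# A standalone sketch of the same idea using only built-in PySpark: min/max bounds
# via agg, a calendar-day range as a stand-in for date_range.get_date_range (which
# is specific to the codebase above), and a crossJoin against the distinct keys.
# All column names and data are assumptions.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [("a", "2021-01-01"), ("a", "2021-01-03"), ("b", "2021-01-02")],
    ["id", "ts"],
).withColumn("ts", F.to_date("ts"))

start_date = df.agg(F.min("ts")).take(1)[0][0]
end_date = df.agg(F.max("ts")).take(1)[0][0]

# One row per calendar day between the bounds (inclusive).
date_df = spark.sql(
    f"SELECT explode(sequence(to_date('{start_date}'), to_date('{end_date}'), "
    "interval 1 day)) AS ts"
)

unique_keys = df.dropDuplicates(subset=["id"]).select("id")
base_df = unique_keys.crossJoin(date_df)  # every key paired with every day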
def create_report(data: DataFrame) -> DataFrame:
    log_metric("Column Count", len(data.columns))
    log_metric(
        "Avg Score",
        int(
            data.agg({"score": "sum"}).collect()[0][0]
            + randint(-2 * len(data.columns), 2 * len(data.columns))
        ),
    )
    return data
def _get_ks_statistic(self, df_distances: DataFrame) -> dict:
    if not self.partitions:
        ks_stat_row = df_distances.agg(
            F.max(self.DISTANCE).alias(self.DISTANCE)
        ).collect()
        ks_stat = ks_stat_row[0][self.DISTANCE]
        result = {
            'statistic': ks_stat,
            'ks_table': [{
                'upper_bound': 1,
                'lower_bound': 0,
                'statistic': ks_stat
            }]
        }
    else:
        df_ks_partitioned = df_distances.withColumn(self.PARTITION, F.lit(0))
        for idx, threshold in enumerate(self.partitions):
            df_ks_partitioned = df_ks_partitioned.withColumn(
                self.PARTITION,
                F.when(F.col(self.probability_col) <= threshold, F.lit(idx + 1))
                .otherwise(F.col(self.PARTITION)),
            )

        ks_stat_partitioned = df_ks_partitioned.groupBy(self.PARTITION).agg(
            F.max(self.DISTANCE).alias(self.DISTANCE)
        )

        ks_stat_row = ks_stat_partitioned.agg(
            F.max(self.DISTANCE).alias(self.DISTANCE)
        ).collect()
        ks_stat = ks_stat_row[0][self.DISTANCE]

        bounded_partition = [1.0] + self.partitions + [0.0]
        ks_stat_rows = ks_stat_partitioned.collect()
        ks_table = [{
            'upper_bound': bounded_partition[row[self.PARTITION]],
            'lower_bound': bounded_partition[1 + row[self.PARTITION]],
            'statistic': row[self.DISTANCE]
        } for row in ks_stat_rows]

        result = {'statistic': ks_stat, 'ks_table': ks_table}
    return result
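# A compact sketch of the two-level aggregation above: the per-partition maximum via
# groupBy().agg(), then a global agg() over the partitioned result. Column names and
# data are illustrative only.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [(0, 0.12), (0, 0.31), (1, 0.27), (1, 0.05)],
    ["partition", "distance"],
)

# Maximum distance within each partition.
per_partition = df.groupBy("partition").agg(F.max("distance").alias("distance"))

# Overall statistic: the maximum over all partition maxima.
ks_stat = per_partition.agg(F.max("distance").alias("distance")).collect()[0]["distance"]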
def to_user_product_pairs(products_by_customer_df: DataFrame,
                          indexed_user_items: DataFrame) -> DataFrame:
    """Create positive and negative item pairs for each customer.

    Args:
        products_by_customer_df (DataFrame):
            +-----------+--------------------+
            |customer_id|            products|
            +-----------+--------------------+
            |   10007421|[12537, 27265, 32...|
            |   10036418|[46420, 34635, 27...|
            |   10058663|[42536, 42984, 37...|
            |   10083340|[34617, 45656, 42...|
            |   10108034|[43418, 46650, 42...|
            +-----------+--------------------+
        indexed_user_items (DataFrame): product id and product index mapping.

    Returns:
        DataFrame:
            root
             |-- <partition_column> customer_id: string (nullable = true)
             |-- positives: array (nullable = true)
             |    |-- element: integer (containsNull = false)
    """
    products_collection = indexed_user_items.agg(
        F.collect_set("product_id_index").alias("full_product_index"),
    )
    positive_and_neg = (
        products_by_customer_df.crossJoin(F.broadcast(products_collection))
        .withColumn(
            "negative_ids",
            F.array_except("full_product_index", "positives_ids"),
        )
        .select(
            F.col("customer_id"),
            F.col("customer_id_index"),
            F.col("cross_bin_number"),
            F.col("positives_ids").alias("positives"),
            F.expr("slice(shuffle(negative_ids), 1, size(positives_ids))").alias(
                "negatives"
            ),
        )
    )
    return positive_and_neg
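# A self-contained sketch of the negative-sampling pattern above: collect_set builds
# the full item vocabulary in a one-row DataFrame, a broadcast crossJoin attaches it
# to every user row, and array_except plus slice(shuffle(...)) draws as many negatives
# as there are positives. Column names mirror the function; the data is invented.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

by_customer = spark.createDataFrame(
    [("u1", [1, 2]), ("u2", [3])], ["customer_id", "positives_ids"]
)
indexed_items = spark.createDataFrame(
    [(i,) for i in range(10)], ["product_id_index"]
)

vocab = indexed_items.agg(
    F.collect_set("product_id_index").alias("full_product_index")
)

pairs = (
    by_customer.crossJoin(F.broadcast(vocab))
    .withColumn("negative_ids", F.array_except("full_product_index", "positives_ids"))
    .select(
        "customer_id",
        F.col("positives_ids").alias("positives"),
        F.expr("slice(shuffle(negative_ids), 1, size(positives_ids))").alias("negatives"),
    )
)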
def create_report(data: DataFrame) -> DataFrame:
    log_metric("Column Count", len(data.columns))
    log_dataframe("ready_data", data, with_histograms=True)

    avg_score = data.agg({"score_label": "sum"}).collect()[0][0]
    log_metric("Avg Score", chaos_float(avg_score))
    return data
def aggregateBlocks(df: DataFrame):
    df.agg(func.sum("blockCount")).show()
def _pyspark_quantile_filtering(self, data: DataFrame, kpis: List[str],
                                thresholds: Dict[str, Tuple[str, float]]):
    """ Make the filtering based on the given quantile level.

    Filtering is performed for each kpi independently.

    :param data: the data to filter
    :type data: DataFrame
    :param kpis: the kpis to perform filtering
    :type kpis: list[str]
    :param thresholds: dict of thresholds mapping KPI names to (type, percentile) tuples
    :type thresholds: dict

    :return: the input data with an additional boolean ``flags`` column indicating
        whether the row should be filtered
    :rtype: DataFrame
    """
    warnings.warn(
        'Only approximated filtering is supported for PySpark DataFrame.')
    logger.warning(
        'Only approximated filtering is supported for PySpark DataFrame.')

    def find_smallest(data: DataFrame, col_name: str, quantile: float,
                      error: float = 0.01):
        """ Flag data points smaller than or equal to the given quantile. """
        threshold, = data.approxQuantile(col_name, [quantile], error)
        return data.withColumn(
            "flags", (F.col("flags") | (F.col(col_name) <= threshold)))

    def find_largest(data: DataFrame, col_name: str, quantile: float,
                     error: float = 0.01):
        """ Flag data points larger than or equal to the given quantile. """
        threshold, = data.approxQuantile(col_name, [quantile], error)
        return data.withColumn(
            "flags", (F.col("flags") | (F.col(col_name) >= threshold)))

    def find_smallest_and_largest(data: DataFrame, col_name: str,
                                  quantile: float, error: float = 0.01):
        """ Flag data points outside of the given central quantile range. """
        rest = 1.0 - quantile
        quantiles = [rest / 2.0, 1.0 - rest / 2.0]
        low_threshold, high_threshold = data.approxQuantile(
            col_name, quantiles, error)
        return data.withColumn(
            "flags",
            (F.col("flags")
             | ((F.col(col_name) <= low_threshold)
                | (F.col(col_name) >= high_threshold))))

    def find_smallest_and_largest_asym(data: DataFrame, col_name: str,
                                       quantile: float, error: float = 0.01):
        """ Flag data points to remove such that quantile/2 is kept in both the
        non-negative and the negative subsets of the data. """
        rest = 1.0 - quantile
        neg_threshold, = data.filter(F.col(col_name) < 0).approxQuantile(
            col_name, [rest / 2.0], error)
        pos_threshold, = data.filter(F.col(col_name) >= 0).approxQuantile(
            col_name, [1.0 - rest / 2.0], error)
        return data.withColumn(
            "flags",
            (F.col("flags")
             | ((F.col(col_name) < neg_threshold)
                | (F.col(col_name) > pos_threshold))))

    data = data.withColumn("flags", F.lit(False))
    method_table = {
        'upper': find_largest,
        'lower': find_smallest,
        'two-sided': find_smallest_and_largest,
        'two-sided-asym': find_smallest_and_largest_asym
    }
    for col in data[kpis].columns:
        # Work on a copy of the KPI column with infinities replaced by NaN.
        data = data.withColumn(f"replaced_{col}", F.col(col)).replace(
            [np.inf, -np.inf], np.nan, subset=[f"replaced_{col}"])
        if col in thresholds:
            threshold_type, percentile = thresholds[col]
            quantile = percentile / 100.0
        else:
            quantile = DEFAULT_OUTLIER_QUANTILE
            min_ = data.agg({f"replaced_{col}": "min"}).collect()[0][0]
            max_ = data.agg({f"replaced_{col}": "max"}).collect()[0][0]
            threshold_type = _get_threshold_type(min_, max_)
        if threshold_type not in method_table:
            raise ValueError("Unknown outlier filtering method '%s'."
                             % (threshold_type, ))
        else:
            method = method_table[threshold_type]
            data = method(data, f"replaced_{col}", quantile,
                          self.error).drop(f"replaced_{col}")
    return data
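# A minimal sketch of the approxQuantile-based flagging used by the helpers above,
# outside of any class; the 95th-percentile cutoff, the relative error of 0.01 and
# the column names are assumptions.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([(float(x),) for x in range(100)], ["kpi"])
df = df.withColumn("flags", F.lit(False))

# Approximate 95th percentile; the third argument is the relative error.
threshold, = df.approxQuantile("kpi", [0.95], 0.01)

# Flag everything at or above the threshold, OR-ing into the existing flags column.
df = df.withColumn("flags", F.col("flags") | (F.col("kpi") >= threshold))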
def avg_lvl(self, df: DataFrame):
    return df.agg(avg(self.elevation_column).alias("average_elevation"))
def min_max_lvl(self, df: DataFrame):
    return df.agg(
        min(self.elevation_column).alias("min_elevation"),
        max(self.elevation_column).alias("max_elevation"),
    )
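# For reference, a standalone sketch of the two methods above: several named aggregates
# in a single agg() call, read back by alias from the collected Row. The elevation
# values are made up.
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, max, min

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([(120.0,), (340.5,), (88.2,)], ["elevation"])

row = df.agg(
    min("elevation").alias("min_elevation"),
    max("elevation").alias("max_elevation"),
    avg("elevation").alias("average_elevation"),
).collect()[0]

print(row["min_elevation"], row["max_elevation"], row["average_elevation"])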
def _get_shap_values(
    self,
    sdf: DataFrame,
    label_col: str = "label",
    shuffle: bool = False,
    sdf_validation: DataFrame = None,
    estimator_params: Optional[Dict[str, object]] = None,
    explainer_type_params: Optional[Dict[str, object]] = None,
    explainer_params: Optional[Dict[str, object]] = None,
    broadcast: bool = True,
) -> Tuple[List[Series], List[Series]]:
    # Don't shuffle to get true shap values, shuffle to get null shap values
    if shuffle:
        shuffling_seed = (
            self.random_state + self._current_iter
            if self.random_state is not None
            else None
        )
        sdf = self._shuffle(sdf, label_col=label_col, broadcast=broadcast,
                            seed=shuffling_seed)

    # Train the model
    model = self.estimator.fit(sdf, **(estimator_params or {}))

    # Explain the model
    explainer = self.explainer_type(model, **(explainer_type_params or {}))

    # If we have a validation set, compute shap values on it instead of the training set
    if sdf_validation is not None:
        sdf = sdf_validation

    # Select features for shap
    # The features dataframe never changes, so we can compute it only the first time
    features = [
        col for col in sdf.columns if col not in (label_col, SPARK_FEATURES_NAME)
    ]
    if self._X_for_shap is None:
        self._X_for_shap = sdf.select(features).cache()

    # Compute shap values
    sdf = self._compute_shap_values(self._X_for_shap, explainer, explainer_params)
    if self._n_outputs is None:
        self._n_outputs = sdf.agg(F.countDistinct(SPARK_CLS_NAME)).head()[0]
    shap_values = [
        sdf.filter(F.col(SPARK_CLS_NAME) == i).drop(SPARK_CLS_NAME)
        for i in range(self._n_outputs)
    ]

    # Average positive and negative shap values for each class
    pos_shap_values = []
    neg_shap_values = []
    for cls_shap_values in shap_values:
        sdf_pos = cls_shap_values.agg(*[
            F.mean(
                F.when(F.col(col_name) >= 0, F.col(col_name)).otherwise(0)
            ).name(col_name)
            for col_name in cls_shap_values.columns
        ])
        sdf_neg = cls_shap_values.agg(*[
            F.mean(
                F.when(F.col(col_name) < 0, F.col(col_name)).otherwise(0)
            ).name(col_name)
            for col_name in cls_shap_values.columns
        ])

        # Back to "little data" regime with pandas
        s_pos = pd.Series(data=sdf_pos.head(), index=features, name="shap_values")
        s_neg = pd.Series(data=sdf_neg.head(), index=features, name="shap_values")
        pos_shap_values.append(s_pos)
        neg_shap_values.append(s_neg)

    return (pos_shap_values, neg_shap_values)
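# A small sketch of the conditional-mean trick used above: averaging only the positive
# (or only the negative) part of each column inside a single agg(), with the other sign
# contributing zero. The column names and values are illustrative.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

shap_df = spark.createDataFrame(
    [(0.2, -0.1), (-0.4, 0.3), (0.1, 0.0)], ["f1", "f2"]
)

# Mean of the non-negative contributions per column (negatives count as 0).
pos_means = shap_df.agg(*[
    F.mean(F.when(F.col(c) >= 0, F.col(c)).otherwise(0)).alias(c)
    for c in shap_df.columns
]).head()

# Mean of the negative contributions per column (non-negatives count as 0).
neg_means = shap_df.agg(*[
    F.mean(F.when(F.col(c) < 0, F.col(c)).otherwise(0)).alias(c)
    for c in shap_df.columns
]).head()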
def apply(self, data: DataFrame) -> DataFrame:
    return data.agg({"age": "max"})