def _make_prefilter(aggregation_params, transform_params):
    timestamp_expr = Column(aggregation_params.timestamp_column)
    ref_date_expr = Constant(transform_params.ref_date).cast(ColumnType.DATE)
    time_filter = None
    if aggregation_params.is_rolling_window():
        time_filter = timestamp_expr.le(ref_date_expr)
    elif transform_params.buffer_unit is not None:
        buffer_interval = Interval(transform_params.buffer_width, transform_params.buffer_unit)
        upper = ref_date_expr.minus(buffer_interval)
        if aggregation_params.whole_history_window_enabled:
            time_filter = timestamp_expr.le(upper)
        else:
            window_width, window_unit = _get_widest_window(aggregation_params)
            base_interval = Interval(window_width, window_unit)
            lower = upper.minus(base_interval)
            time_filter = timestamp_expr.ge(lower).and_(timestamp_expr.le(upper))
    else:
        if aggregation_params.whole_history_window_enabled:
            time_filter = timestamp_expr.le(ref_date_expr)
        else:
            window_width, window_unit = _get_widest_window(aggregation_params)
            base_interval = Interval(window_width, window_unit)
            lower = ref_date_expr.minus(base_interval)
            time_filter = timestamp_expr.ge(lower).and_(timestamp_expr.le(ref_date_expr))
    return time_filter
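# A hedged summary of the filters _make_prefilter can emit, in pseudo-SQL
# (ts = timestamp column, ref = ref_date, W = widest window; illustrative only):
#   rolling window:               ts <= ref
#   buffer set, whole history:    ts <= ref - buffer
#   buffer set, widest window:    ref - buffer - W <= ts <= ref - buffer
#   no buffer, whole history:     ts <= ref
#   no buffer, widest window:     ref - W <= ts <= ref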
def _get_similarity_formula(self):
    rounding_decimals = 15
    rounding_expression = Constant(10**rounding_decimals)
    logger.debug(f"Rounding similarity to {rounding_decimals} decimals")
    if self.use_explicit:
        # compute Pearson correlation
        left_centered_rating = Column(
            self.dku_config.ratings_column_name, table_name=self.LEFT_NORMALIZATION_FACTOR_AS
        ).minus(Column(self.RATING_AVERAGE, table_name=self.LEFT_NORMALIZATION_FACTOR_AS))
        right_centered_rating = Column(
            self.dku_config.ratings_column_name, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS
        ).minus(Column(self.RATING_AVERAGE, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS))
        rating_product = left_centered_rating.times(right_centered_rating)
    else:
        rating_product = Constant(1)
    return (
        rating_product
        .times(Column(self.NORMALIZATION_FACTOR_AS, table_name=self.LEFT_NORMALIZATION_FACTOR_AS))
        .times(Column(self.NORMALIZATION_FACTOR_AS, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS))
        .sum()
        .times(rounding_expression)
        .round()
        .div(rounding_expression)
    )
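# The trailing times/round/div chain above appears to round the similarity to 15
# decimals via the classic scale-round-unscale trick, which stays portable where
# ROUND(x, n) is not. A minimal plain-Python sketch of the same idea (the helper
# name and values below are illustrative only):
def _round_to_decimals_sketch(x, decimals=15):
    factor = 10 ** decimals
    return round(x * factor) / factor

assert _round_to_decimals_sketch(0.1 + 0.2) == 0.3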
def _build_normalization_factor(self, select_from, select_from_as="_visit_count"):
    # compute normalization factor
    normalization_factor = SelectQuery()
    normalization_factor.select_from(select_from, alias=select_from_as)
    self._select_columns_list(
        normalization_factor,
        column_names=self.similarity_computation_columns + self.filtering_columns,
        table_name=select_from_as,
    )
    rating_column = (
        Column(self.dku_config.ratings_column_name).minus(Column(self.RATING_AVERAGE))
        if self.use_explicit
        else Constant(1)
    )
    normalization_factor.select(
        self._get_normalization_factor_formula(Column(self.based_column), rating_column),
        alias=self.NORMALIZATION_FACTOR_AS,
    )
    self.similarity_computation_columns += [self.NORMALIZATION_FACTOR_AS]
    # keep only items and users with enough visits
    normalization_factor.where(
        Column(self.NB_VISIT_USER_AS, table_name=select_from_as).ge(
            Constant(self.dku_config.user_visit_threshold)
        )
    )
    normalization_factor.where(
        Column(self.NB_VISIT_ITEM_AS, table_name=select_from_as).ge(
            Constant(self.dku_config.item_visit_threshold)
        )
    )
    return normalization_factor
def _build_unordered_similarity(
    self,
    select_from,
    left_select_from_as="_left_ordered_similarity",
    right_select_from_as="_right_ordered_similarity",
    with_clause_as="_with_clause_ordered_similarity",
):
    """Retrieve both pairs (when col_1 < col_2 and col_1 > col_2) from the ordered similarity table"""
    similarity = SelectQuery()
    if self.supports_with_clause:
        similarity.with_cte(select_from, alias=with_clause_as)
        select_from = with_clause_as
    similarity.select_from(select_from, alias=left_select_from_as)
    # never-true condition: the FULL join keeps every row from both sides unmatched
    join_condition = Constant(1).eq_null_unsafe(Constant(0))
    similarity.join(select_from, JoinTypes.FULL, join_condition, alias=right_select_from_as)
    similarity.select(
        Column(f"{self.based_column}_1", table_name=left_select_from_as).coalesce(
            Column(f"{self.based_column}_2", table_name=right_select_from_as)
        ),
        alias=f"{self.based_column}_1",
    )
    similarity.select(
        Column(f"{self.based_column}_2", table_name=left_select_from_as).coalesce(
            Column(f"{self.based_column}_1", table_name=right_select_from_as)
        ),
        alias=f"{self.based_column}_2",
    )
    similarity.select(
        Column("similarity", table_name=left_select_from_as).coalesce(
            Column("similarity", table_name=right_select_from_as)
        ),
        alias=constants.SIMILARITY_COLUMN_NAME,
    )
    return similarity
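# A hedged sketch of the trick above: a FULL join on a never-true condition leaves
# every row from both sides unmatched, so the query stacks the two tables row-wise
# (like a UNION ALL) and COALESCE folds them back into one column set. Table and
# column names below are hypothetical:
never_true = Constant(1).eq_null_unsafe(Constant(0))
stacked = SelectQuery() \
    .select(Column("a", "t1").coalesce(Column("a", "t2")), alias="a") \
    .select_from("t1") \
    .join("t2", JoinTypes.FULL, never_true)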
def _build_remove_historical_samples(self, select_from, select_from_as="_remove_negative_samples_seen"):
    historical_negative_samples_removed = SelectQuery()
    historical_negative_samples_removed.select_from(select_from, alias=select_from_as)
    columns_to_select = self.sample_keys + [constants.TARGET_COLUMN_NAME] + self.dku_config.score_column_names
    self._select_columns_list(select_query=historical_negative_samples_removed, column_names=columns_to_select)
    unseen_samples_condition = (
        Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)).or_(Column(self.SCORE_SAMPLE).eq(Constant(0)))
    )
    historical_negative_samples_removed.where(unseen_samples_condition)
    return historical_negative_samples_removed
def _build_filtered_samples(inner_select_from, inner_select_from_as):
    # nested helper: `self`, `select_from_as`, and NB_POSITIVE_PER_USER come from the enclosing scope
    ratio = self.dku_config.negative_samples_percentage / 100.0
    filtered_samples = SelectQuery()
    filtered_samples.select_from(inner_select_from, inner_select_from_as)
    columns_to_select = self.sample_keys + self.dku_config.score_column_names + [constants.TARGET_COLUMN_NAME]
    self._select_columns_list(filtered_samples, columns_to_select)
    # keep all positives, and at most ceil(n_pos * ratio / (1 - ratio)) negatives per user
    nb_negative_threshold_expr = (
        Column(NB_POSITIVE_PER_USER, table_name=select_from_as)
        .times(Constant(ratio))
        .div(Constant(1).minus(Constant(ratio)))
        .ceil()
    )
    filtered_samples.where(
        Column(constants.TARGET_COLUMN_NAME, table_name=select_from_as)
        .eq(Constant(1))
        .or_(Column(self.ROW_NUMBER_AS, table_name=select_from_as).le(nb_negative_threshold_expr))
    )
    return filtered_samples
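# Worked example of the threshold above (assumed numbers, illustrative only): to make
# negatives about 25% of a user's samples when they have 100 positives, keep at most
# ceil(100 * 0.25 / 0.75) = 34 negatives, since 34 / (100 + 34) ~= 25.4%.
import math
assert math.ceil(100 * 0.25 / (1 - 0.25)) == 34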
def _build_timestamp_filtered(self, select_from, select_from_as="_prepared_input_dataset"):
    def _build_timestamp_filtered_row_number(select_from_inner, select_from_as_inner):
        ts_row_numbers = SelectQuery()
        ts_row_numbers.select_from(select_from_inner, alias=select_from_as_inner)
        self._select_columns_list(
            ts_row_numbers,
            column_names=self.similarity_computation_columns,
            table_name=select_from_as_inner,
        )
        ts_row_number_expression = (
            Expression()
            .rowNumber()
            .over(
                Window(
                    partition_by=[Column(self.based_column, table_name=select_from_as_inner)],
                    order_by=[
                        Column(self.dku_config.timestamps_column_name, table_name=select_from_as_inner),
                        Column(self.pivot_column, table_name=select_from_as_inner),
                    ],
                    order_types=["DESC", "DESC"],
                    mode=None,
                )
            )
        )
        ts_row_numbers.select(ts_row_number_expression, alias=self.TIMESTAMP_FILTERED_ROW_NB)
        return ts_row_numbers

    built_ts_row_numbers = _build_timestamp_filtered_row_number(select_from, select_from_as)
    ts_row_numbers_alias = "_ts_row_numbers"
    timestamp_filtered = SelectQuery()
    timestamp_filtered.select_from(built_ts_row_numbers, alias=ts_row_numbers_alias)
    self._select_columns_list(
        timestamp_filtered,
        column_names=self.similarity_computation_columns,
        table_name=ts_row_numbers_alias,
    )
    timestamp_filtered.where(
        Column(self.TIMESTAMP_FILTERED_ROW_NB, table_name=ts_row_numbers_alias).le(
            Constant(self.dku_config.top_n_most_recent)
        )
    )
    return timestamp_filtered
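# The window above numbers each entity's rows from most to least recent, so keeping
# row numbers <= top_n_most_recent retains the N newest events per entity. A
# plain-Python sketch of the same semantics (toy data, illustrative only):
events = [("u1", 3), ("u1", 1), ("u1", 2)]  # (entity, timestamp)
newest_two = sorted(events, key=lambda e: e[1], reverse=True)[:2]
assert newest_two == [("u1", 3), ("u1", 2)]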
def make_max_time_interval_query(timestamp_column, resolved_ref_date, dataset):
    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
    max_time_interval = Constant(resolved_ref_date).minus(Column(timestamp_column)).extract(TimeUnit.DAY).max()
    query = SelectQuery()
    query.select(max_time_interval, alias="max_time_interval")  # TODO naming
    if is_hdfs:
        # for HDFS datasets, derive the table name by joining the dataset name parts with '_'
        query.select_from('_'.join(dataset.name.split('.')))
    else:
        query.select_from(dataset)
    dialect_handler = dialectHandler(dataset)
    return dialect_handler.convertToSQL(query)  # toSQL(query, dataset=dataset)
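# A hedged usage sketch (the dataset and column names are hypothetical); the generated
# statement amounts to SELECT MAX(EXTRACT(DAY FROM '<ref_date>' - event_ts)) AS
# max_time_interval, i.e. roughly the age in days of the oldest row relative to
# resolved_ref_date:
# import dataiku
# sql_query = make_max_time_interval_query("event_ts", "2023-01-01", dataiku.Dataset("events"))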
def _get_user_item_similarity_formula(self, similarity_table, samples_table):
    if self.use_explicit:
        # weighted average of centered ratings: sum(sim * (rating - avg)) / sum(|sim|)
        centered_rating = Column(self.dku_config.ratings_column_name, table_name=samples_table).minus(
            Column(self.RATING_AVERAGE, table_name=samples_table)
        )
        return (
            Column(constants.SIMILARITY_COLUMN_NAME, table_name=similarity_table)
            .times(centered_rating)
            .sum()
            .div(Column(constants.SIMILARITY_COLUMN_NAME, table_name=similarity_table).abs().sum())
        )
    else:
        return (
            Column(constants.SIMILARITY_COLUMN_NAME, table_name=similarity_table)
            .sum()
            .div(Constant(self.dku_config.top_n_most_similar))
        )
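# Worked example of the explicit branch above (assumed numbers, illustrative only):
# with similarities [0.5, -0.25] and centered ratings [1.0, 2.0], the score is
# (0.5 * 1.0 + (-0.25) * 2.0) / (0.5 + 0.25) = 0.0.
sims, centered = [0.5, -0.25], [1.0, 2.0]
score = sum(s * r for s, r in zip(sims, centered)) / sum(abs(s) for s in sims)
assert score == 0.0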
def _build_top_n(self, select_from, select_from_as="_row_number_table"):
    top_n = SelectQuery()
    top_n.select_from(select_from, alias=select_from_as)
    columns_to_select = [f"{self.based_column}_1", f"{self.based_column}_2", constants.SIMILARITY_COLUMN_NAME]
    self._select_columns_list(top_n, column_names=columns_to_select, table_name=select_from_as)
    top_n.where(
        Column(self.ROW_NUMBER_AS, table_name=select_from_as).le(Constant(self.dku_config.top_n_most_similar))
    )
    return top_n
def _get_normalization_factor_formula(self, partition_column, rating_column):
    logger.debug("Using L2 normalization")
    return Constant(1).div(
        rating_column.times(rating_column).sum().over(Window(partition_by=[partition_column], mode=None)).sqrt()
    )
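# Worked example of the formula above (assumed ratings, illustrative only): for one
# partition with ratings [3, 4], the factor is 1 / sqrt(3*3 + 4*4) = 1/5, so the
# partition's rating vector has unit L2 norm after scaling.
import math
assert 1 / math.sqrt(3 * 3 + 4 * 4) == 0.2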
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Sample code for the SQL generation Python API
# It does not replace proper documentation, tutorials, and samples, but for now it will do the trick

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku.sql as sql
import json
from dataiku.sql import JoinTypes, Expression, Column, Constant, InlineSQL, SelectQuery, Table, Dialects, toSQL

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Expression: 1 * 2 * 3
eb = Constant(1).times(Constant(2), Constant(3))
toSQL(eb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Simple select
sb = SelectQuery()
sb.select_from(Table('myTable'))
toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Join in fluent style
sb = SelectQuery() \
    .select(Constant('Hello')) \
    .select_from('t1') \
    .join('t2', JoinTypes.INNER, Column('c', 't1').eq(Column('c', 't2')))
toSQL(sb, Dialects.MYSQL)
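# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Filtering and aliasing: a hedged sketch reusing only calls shown elsewhere in this
# file; the table and column names are made up
sb = SelectQuery()
sb.select(Column('amount', 't1'), alias='amount')
sb.select_from('t1')
sb.where(Column('amount', 't1').ge(Constant(10)))
toSQL(sb, Dialects.MYSQL)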
def _build_samples_for_scoring(self, select_from, select_from_as="_samples_for_scores"):
    samples_for_scores = SelectQuery()
    samples_for_scores.select_from(select_from, alias=select_from_as)
    self._select_columns_list(samples_for_scores, self.sample_keys, table_name=select_from_as)
    samples_for_scores.select(Constant(1), alias=self.IS_SCORE_SAMPLE)
    return samples_for_scores
def _build_samples_for_training(self, select_from, select_from_as="_samples_for_training"):
    samples_for_training = SelectQuery()
    samples_for_training.select_from(select_from, alias=select_from_as)
    self._select_columns_list(samples_for_training, self.sample_keys, table_name=select_from_as)
    samples_for_training.select(Constant(1), alias=self.IS_TRAINING_SAMPLE)
    return samples_for_training
def _build_samples_with_only_positives(inner_select_from, inner_select_from_as=ONLY_POSITIVE_TABLE_NAME):
    # nested helper: `self`, NB_POSITIVE_PER_USER, and ONLY_POSITIVE_TABLE_NAME come from the enclosing scope
    samples_with_only_positives = SelectQuery()
    samples_with_only_positives.select_from(inner_select_from, inner_select_from_as)
    samples_with_only_positives.select(Column(self.dku_config.users_column_name))
    samples_with_only_positives.select(Column("*").count(), alias=NB_POSITIVE_PER_USER)
    samples_with_only_positives.where(Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)))
    samples_with_only_positives.group_by(Column(self.dku_config.users_column_name))
    return samples_with_only_positives