def _make_prefilter(aggregation_params, transform_params):
    """Build the time-window filter expression applied before aggregation.

    Three regimes, depending on the parameters:
    - rolling window: keep everything up to the reference date;
    - with a buffer: shift the upper bound back by the buffer interval;
    - otherwise: upper bound is the reference date itself.
    In the non-whole-history cases the lower bound is the upper bound minus
    the widest configured window.
    """
    ts_col = Column(aggregation_params.timestamp_column)
    ref_date = Constant(transform_params.ref_date).cast(ColumnType.DATE)

    # Rolling windows only need the upper bound.
    if aggregation_params.is_rolling_window():
        return ts_col.le(ref_date)

    # The buffer pushes the upper bound back from the reference date.
    if transform_params.buffer_unit is not None:
        buffer_interval = Interval(transform_params.buffer_width,
                                   transform_params.buffer_unit)
        upper_bound = ref_date.minus(buffer_interval)
    else:
        upper_bound = ref_date

    if aggregation_params.whole_history_window_enabled:
        return ts_col.le(upper_bound)

    # Bounded window: go back by the widest window from the upper bound.
    window_width, window_unit = _get_widest_window(aggregation_params)
    lower_bound = upper_bound.minus(Interval(window_width, window_unit))
    return ts_col.ge(lower_bound).and_(ts_col.le(upper_bound))
    def _get_similarity_formula(self):
        """Build the expression computing the similarity between two entities.

        With explicit feedback the per-row term is the product of
        mean-centered ratings from both sides (Pearson-style numerator);
        with implicit feedback it is simply 1. The term is then weighted by
        both normalization factors, summed, and rounded to a fixed number of
        decimals for cross-dialect stability.
        """
        rounding_decimals = 15
        rounding_expression = Constant(10**rounding_decimals)
        logger.debug(f"Rounding similarity to {rounding_decimals} decimals")

        if self.use_explicit:
            # Mean-centered rating on each side of the pair.
            left_centered = Column(
                self.dku_config.ratings_column_name,
                table_name=self.LEFT_NORMALIZATION_FACTOR_AS,
            ).minus(
                Column(self.RATING_AVERAGE,
                       table_name=self.LEFT_NORMALIZATION_FACTOR_AS))
            right_centered = Column(
                self.dku_config.ratings_column_name,
                table_name=self.RIGHT_NORMALIZATION_FACTOR_AS,
            ).minus(
                Column(self.RATING_AVERAGE,
                       table_name=self.RIGHT_NORMALIZATION_FACTOR_AS))
            rating_product = left_centered.times(right_centered)
        else:
            rating_product = Constant(1)

        left_factor = Column(self.NORMALIZATION_FACTOR_AS,
                             table_name=self.LEFT_NORMALIZATION_FACTOR_AS)
        right_factor = Column(self.NORMALIZATION_FACTOR_AS,
                              table_name=self.RIGHT_NORMALIZATION_FACTOR_AS)
        summed = rating_product.times(left_factor).times(right_factor).sum()

        # Multiply/divide by 10**15 around round() to emulate
        # round-to-n-decimals in SQL.
        return summed.times(rounding_expression).round().div(
            rounding_expression)
    def _build_normalization_factor(self,
                                    select_from,
                                    select_from_as="_visit_count"):
        """Build the query adding the normalization-factor column.

        Also filters out users and items with fewer visits than the
        configured thresholds. Side effect: registers the new column in
        ``self.similarity_computation_columns``.
        """
        query = SelectQuery()
        query.select_from(select_from, alias=select_from_as)

        # Carry over the columns needed for similarity and filtering.
        self._select_columns_list(
            query,
            column_names=self.similarity_computation_columns +
            self.filtering_columns,
            table_name=select_from_as,
        )

        if self.use_explicit:
            # Mean-centered rating for explicit feedback.
            rating_column = Column(self.dku_config.ratings_column_name).minus(
                Column(self.RATING_AVERAGE))
        else:
            rating_column = Constant(1)

        query.select(
            self._get_normalization_factor_formula(Column(self.based_column),
                                                   rating_column),
            alias=self.NORMALIZATION_FACTOR_AS,
        )

        # The new column takes part in the similarity computation downstream.
        self.similarity_computation_columns.append(self.NORMALIZATION_FACTOR_AS)

        # Keep only users and items with enough visits.
        query.where(
            Column(self.NB_VISIT_USER_AS, table_name=select_from_as).ge(
                Constant(self.dku_config.user_visit_threshold)))
        query.where(
            Column(self.NB_VISIT_ITEM_AS, table_name=select_from_as).ge(
                Constant(self.dku_config.item_visit_threshold)))
        return query
    def _build_unordered_similarity(
        self,
        select_from,
        left_select_from_as="_left_ordered_similarity",
        right_select_from_as="_right_ordered_similarity",
        with_clause_as="_with_clause_ordered_similarity",
    ):
        """Retrieve both pairs (when col_1 < col_2 and col_1 > col_2) from the ordered similarity table"""
        query = SelectQuery()

        # Factor the input into a CTE when supported, so it is not inlined
        # twice (once per side of the full join).
        if self.supports_with_clause:
            query.with_cte(select_from, alias=with_clause_as)
            select_from = with_clause_as

        query.select_from(select_from, alias=left_select_from_as)

        # 1 = 0 never matches, so the FULL join keeps every row from both
        # sides unmatched — effectively a union of left and right tables.
        never_matching = Constant(1).eq_null_unsafe(Constant(0))
        query.join(select_from,
                   JoinTypes.FULL,
                   never_matching,
                   alias=right_select_from_as)

        col_1 = f"{self.based_column}_1"
        col_2 = f"{self.based_column}_2"

        # Swap the pair columns coming from the right side so that both
        # orientations (a, b) and (b, a) appear in the output.
        query.select(
            Column(col_1, table_name=left_select_from_as).coalesce(
                Column(col_2, table_name=right_select_from_as)),
            alias=col_1,
        )
        query.select(
            Column(col_2, table_name=left_select_from_as).coalesce(
                Column(col_1, table_name=right_select_from_as)),
            alias=col_2,
        )
        query.select(
            Column("similarity", table_name=left_select_from_as).coalesce(
                Column("similarity", table_name=right_select_from_as)),
            alias=constants.SIMILARITY_COLUMN_NAME,
        )

        return query
# Example #5
 def _build_remove_historical_samples(self, select_from, select_from_as="_remove_negative_samples_seen"):
     """Drop negative samples that were already seen historically."""
     query = SelectQuery()
     query.select_from(select_from, alias=select_from_as)

     selected = self.sample_keys + [constants.TARGET_COLUMN_NAME] + self.dku_config.score_column_names
     self._select_columns_list(select_query=query, column_names=selected)

     # Keep positives, plus negatives whose historical score sample is 0
     # (i.e. never seen before).
     keep_condition = Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)).or_(
         Column(self.SCORE_SAMPLE).eq(Constant(0))
     )
     query.where(keep_condition)
     return query
# Example #6
        def _build_filtered_samples(inner_select_from, inner_select_from_as):
            """Cap the negatives kept per user at the configured percentage."""
            negative_ratio = float(self.dku_config.negative_samples_percentage / 100.0)

            query = SelectQuery()
            query.select_from(inner_select_from, inner_select_from_as)
            selected = self.sample_keys + self.dku_config.score_column_names + [constants.TARGET_COLUMN_NAME]
            self._select_columns_list(query, selected)

            # ceil(P * r / (1 - r)) negatives per user, where P is the number
            # of positives — yields the requested negative percentage overall.
            max_negatives = (
                Column(NB_POSITIVE_PER_USER, table_name=select_from_as)
                .times(Constant(negative_ratio))
                .div(Constant(1).minus(Constant(negative_ratio)))
                .ceil()
            )
            # NOTE(review): table_name refers to the enclosing scope's
            # select_from_as, not inner_select_from_as — looks intentional
            # (the count column lives on the outer table); verify.
            query.where(
                Column(constants.TARGET_COLUMN_NAME, table_name=select_from_as)
                .eq(Constant(1))
                .or_(Column(self.ROW_NUMBER_AS, table_name=select_from_as).le(max_negatives))
            )
            return query
    def _build_timestamp_filtered(self,
                                  select_from,
                                  select_from_as="_prepared_input_dataset"):
        """Keep only the ``top_n_most_recent`` rows per entity.

        Rows are ranked per ``based_column`` partition by timestamp then
        pivot column, both descending, and the rank is compared against the
        configured threshold.
        """

        def _rank_by_recency(inner_from, inner_from_as):
            # Annotate each row with its rank inside its entity partition,
            # most recent first.
            ranked = SelectQuery()
            ranked.select_from(inner_from, alias=inner_from_as)

            self._select_columns_list(
                ranked,
                column_names=self.similarity_computation_columns,
                table_name=inner_from_as)

            rank_expression = Expression().rowNumber().over(
                Window(
                    partition_by=[
                        Column(self.based_column, table_name=inner_from_as)
                    ],
                    order_by=[
                        Column(self.dku_config.timestamps_column_name,
                               table_name=inner_from_as),
                        Column(self.pivot_column, table_name=inner_from_as),
                    ],
                    order_types=["DESC", "DESC"],
                    mode=None,
                ))
            ranked.select(rank_expression, alias=self.TIMESTAMP_FILTERED_ROW_NB)
            return ranked

        ranked_rows = _rank_by_recency(select_from, select_from_as)
        ranked_alias = "_ts_row_numbers"

        filtered = SelectQuery()
        filtered.select_from(ranked_rows, alias=ranked_alias)

        self._select_columns_list(
            filtered,
            column_names=self.similarity_computation_columns,
            table_name=ranked_alias)

        # Keep only the top-n most recent rows of each partition.
        filtered.where(
            Column(self.TIMESTAMP_FILTERED_ROW_NB,
                   table_name=ranked_alias).le(
                       Constant(self.dku_config.top_n_most_recent)))

        return filtered
def make_max_time_interval_query(timestamp_column, resolved_ref_date, dataset):
    """Build the SQL text computing the maximum day interval in a dataset.

    The query returns a single ``max_time_interval`` column holding
    ``max(extract(day from resolved_ref_date - timestamp_column))``.

    Parameters
    ----------
    timestamp_column : str
        Name of the timestamp column of the dataset.
    resolved_ref_date :
        Reference date the intervals are measured against.
    dataset :
        Dataiku dataset; used as the FROM clause and to resolve the SQL
        dialect.

    Returns
    -------
    str
        The dialect-specific SQL statement.
    """
    # HDFS-backed datasets (marked by a hiveTableName param) are addressed
    # by a Hive-safe table name: dots in the dataset name become underscores.
    # `.get('params', {})` avoids an AttributeError when params is absent;
    # `in dict` replaces the redundant `.keys()` call.
    is_hdfs = 'hiveTableName' in dataset.get_config().get('params', {})

    max_time_interval = Constant(resolved_ref_date).minus(
        Column(timestamp_column)).extract(TimeUnit.DAY).max()

    query = SelectQuery()
    query.select(max_time_interval, alias="max_time_interval")
    if is_hdfs:
        query.select_from(dataset.name.replace('.', '_'))
    else:
        query.select_from(dataset)

    return dialectHandler(dataset).convertToSQL(query)
 def _get_user_item_similarity_formula(self, similarity_table,
                                       samples_table):
     """Expression scoring a (user, item) pair from its neighbours.

     Explicit feedback: similarity-weighted average of mean-centered
     ratings, i.e. sum(sim * (rating - avg)) / sum(|sim|).
     Implicit feedback: average similarity over the configured top-n
     most similar neighbours.
     """
     if self.use_explicit:
         centered_rating = Column(
             self.dku_config.ratings_column_name,
             table_name=samples_table).minus(
                 Column(self.RATING_AVERAGE, table_name=samples_table))
         weighted_sum = Column(
             constants.SIMILARITY_COLUMN_NAME,
             table_name=similarity_table).times(centered_rating).sum()
         abs_similarity_sum = Column(
             constants.SIMILARITY_COLUMN_NAME,
             table_name=similarity_table).abs().sum()
         return weighted_sum.div(abs_similarity_sum)
     return Column(
         constants.SIMILARITY_COLUMN_NAME,
         table_name=similarity_table).sum().div(
             Constant(self.dku_config.top_n_most_similar))
    def _build_top_n(self, select_from, select_from_as="_row_number_table"):
        """Keep only the ``top_n_most_similar`` rows, using the precomputed
        row-number column to rank them."""
        top_n = SelectQuery()
        top_n.select_from(select_from, alias=select_from_as)

        self._select_columns_list(
            top_n,
            column_names=[
                f"{self.based_column}_1",
                f"{self.based_column}_2",
                constants.SIMILARITY_COLUMN_NAME,
            ],
            table_name=select_from_as)

        row_rank = Column(self.ROW_NUMBER_AS, table_name=select_from_as)
        top_n.where(row_rank.le(Constant(self.dku_config.top_n_most_similar)))
        return top_n
 def _get_normalization_factor_formula(self, partition_column,
                                       rating_column):
     """Return 1 / sqrt(sum(rating^2)) over the partition (L2 norm)."""
     logger.debug("Using L2 normalization")
     squared_sum = rating_column.times(rating_column).sum().over(
         Window(partition_by=[partition_column], mode=None))
     return Constant(1).div(squared_sum.sqrt())
# Example #12
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Sample code for SQL generation python API
# It does not replace proper documentation, tutos and samples but for now it will do the trick

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import dataiku.sql as sql
import json
from dataiku.sql import JoinTypes, Expression, Column, Constant, InlineSQL, SelectQuery, Table, Dialects, toSQL

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Expression: 1 * 2 * 3
# `times` accepts several operands at once, multiplying them in one call.
eb = Constant(1).times(Constant(2), Constant(3))

# Render the expression as MySQL-dialect SQL text.
toSQL(eb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Simple select
# With no explicit .select(), presumably renders as SELECT * — verify.
sb = SelectQuery()
sb.select_from(Table('myTable'))

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# join in fluent style
# Each builder method returns the query object, so calls can be chained.
sb = SelectQuery() \
    .select(Constant('Hello')) \
    .select_from('t1') \
    .join('t2', JoinTypes.INNER, Column('c', 't1').eq(Column('c', 't2')))

toSQL(sb, Dialects.MYSQL)
# Example #13
 def _build_samples_for_scoring(self, select_from, select_from_as="_samples_for_scores"):
     """Select the sample keys and flag every row as a scoring sample."""
     query = SelectQuery()
     query.select_from(select_from, alias=select_from_as)
     self._select_columns_list(query, self.sample_keys, table_name=select_from_as)
     # Constant marker column used downstream to identify scoring rows.
     query.select(Constant(1), alias=self.IS_SCORE_SAMPLE)
     return query
# Example #14
 def _build_samples_for_training(self, select_from, select_from_as="_samples_for_training"):
     """Select the sample keys and flag every row as a training sample."""
     query = SelectQuery()
     query.select_from(select_from, alias=select_from_as)
     self._select_columns_list(query, self.sample_keys, table_name=select_from_as)
     # Constant marker column used downstream to identify training rows.
     query.select(Constant(1), alias=self.IS_TRAINING_SAMPLE)
     return query
# Example #15
 def _build_samples_with_only_positives(inner_select_from, inner_select_from_as=ONLY_POSITIVE_TABLE_NAME):
     """Count, per user, the number of positive samples (target == 1).

     Closure: relies on ``self``, ``ONLY_POSITIVE_TABLE_NAME`` and
     ``NB_POSITIVE_PER_USER`` from the enclosing scope.
     """
     query = SelectQuery()
     query.select_from(inner_select_from, inner_select_from_as)
     query.select(Column(self.dku_config.users_column_name))
     query.select(Column("*").count(), alias=NB_POSITIVE_PER_USER)
     # Restrict to positive samples before counting.
     query.where(Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)))
     query.group_by(Column(self.dku_config.users_column_name))
     return query