def _build_row_numbers(self,
                           select_from,
                           select_from_as="_similarity_matrix"):
        row_numbers = SelectQuery()
        row_numbers.select_from(select_from, alias=select_from_as)

        columns_to_select = [
            f"{self.based_column}_1", f"{self.based_column}_2",
            constants.SIMILARITY_COLUMN_NAME
        ]
        self._select_columns_list(row_numbers,
                                  column_names=columns_to_select,
                                  table_name=select_from_as)

        row_number_expression = (Expression().rowNumber().over(
            Window(
                partition_by=[
                    Column(f"{self.based_column}_1", table_name=select_from_as)
                ],
                order_by=[
                    Column(constants.SIMILARITY_COLUMN_NAME,
                           table_name=select_from_as),
                    Column(f"{self.based_column}_2",
                           table_name=select_from_as),
                ],
                order_types=["DESC", "DESC"],
                mode=None,
            )))
        row_numbers.select(row_number_expression, alias=self.ROW_NUMBER_AS)
        return row_numbers
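
For reference, a minimal sketch of the ranking this builder expresses, run against an in-memory SQLite database (window functions require SQLite >= 3.25; the table and column names below are illustrative, not the plugin's own):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE similarity_matrix (item_1 TEXT, item_2 TEXT, similarity REAL)")
conn.executemany(
    "INSERT INTO similarity_matrix VALUES (?, ?, ?)",
    [("a", "b", 0.9), ("a", "c", 0.4), ("b", "a", 0.9), ("b", "c", 0.7)],
)
# Same shape as the built query: rank each item's neighbors by descending similarity
for row in conn.execute("""
    SELECT item_1, item_2, similarity,
           ROW_NUMBER() OVER (
               PARTITION BY item_1
               ORDER BY similarity DESC, item_2 DESC
           ) AS row_number
    FROM similarity_matrix
"""):
    print(row)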
Example #2
    def _build_all_cf_scores(
        self,
        select_from,
        samples_for_training,
        samples_for_scores=None,
        select_from_as="_all_cf_scores",
        samples_for_training_as="_samples_for_training",
        samples_for_scores_as="_samples_for_scores",
    ):
        all_cf_scores = SelectQuery()
        all_cf_scores.select_from(select_from, alias=select_from_as)

        self._left_join_samples(
            all_cf_scores, select_from_as, samples_for_training, samples_for_training_as, self.sample_keys
        )
        all_cf_scores.select(Column(self.IS_TRAINING_SAMPLE, table_name=samples_for_training_as))

        if samples_for_scores:
            self._left_join_samples(
                all_cf_scores, select_from_as, samples_for_scores, samples_for_scores_as, self.sample_keys
            )
            all_cf_scores.select(Column(self.IS_SCORE_SAMPLE, table_name=samples_for_scores_as))

        columns_to_select = self.sample_keys + self.dku_config.score_column_names
        self._select_columns_list(all_cf_scores, columns_to_select, table_name=select_from_as)
        return all_cf_scores
Example #3
        def _build_samples_with_all_infos(inner_select_from, join_with, inner_select_from_as=ALL_INFOS_TABLE_NAME):
            samples_with_all_infos = SelectQuery()
            samples_with_all_infos.select_from(inner_select_from, alias=inner_select_from_as)

            columns_to_select = self.sample_keys + self.dku_config.score_column_names + [constants.TARGET_COLUMN_NAME]
            self._select_columns_list(samples_with_all_infos, columns_to_select, table_name=inner_select_from_as)

            row_number_expression = (
                Expression()
                .rowNumber()
                .over(
                    Window(
                        partition_by=[
                            Column(self.dku_config.users_column_name, table_name=inner_select_from_as),
                            Column(constants.TARGET_COLUMN_NAME, table_name=inner_select_from_as),
                        ],
                        order_by=[Column(constants.TARGET_COLUMN_NAME, table_name=inner_select_from_as)],
                        order_types=["DESC"],
                        mode=None,
                    )
                )
            )
            samples_with_all_infos.select(row_number_expression, alias=self.ROW_NUMBER_AS)
            samples_with_all_infos.select(Column(NB_POSITIVE_PER_USER, table_name=ONLY_POSITIVE_TABLE_NAME))

            self._left_join_samples(
                left_select_query=samples_with_all_infos,
                left_table_name=inner_select_from_as,
                right_select_query=join_with,
                right_table_name=ONLY_POSITIVE_TABLE_NAME,
                keys=[self.dku_config.users_column_name],
            )
            return samples_with_all_infos
    def _build_normalization_factor(self,
                                    select_from,
                                    select_from_as="_visit_count"):
        # compute normalization factor
        normalization_factor = SelectQuery()
        normalization_factor.select_from(select_from, alias=select_from_as)

        self._select_columns_list(
            normalization_factor,
            column_names=self.similarity_computation_columns +
            self.filtering_columns,
            table_name=select_from_as,
        )

        rating_column = (Column(self.dku_config.ratings_column_name).minus(
            Column(self.RATING_AVERAGE)) if self.use_explicit else Constant(1))

        normalization_factor.select(
            self._get_normalization_factor_formula(Column(self.based_column),
                                                   rating_column),
            alias=self.NORMALIZATION_FACTOR_AS,
        )

        self.similarity_computation_columns += [self.NORMALIZATION_FACTOR_AS]

        # keep only items and users with enough visits
        normalization_factor.where(
            Column(self.NB_VISIT_USER_AS, table_name=select_from_as).ge(
                Constant(self.dku_config.user_visit_threshold)))
        normalization_factor.where(
            Column(self.NB_VISIT_ITEM_AS, table_name=select_from_as).ge(
                Constant(self.dku_config.item_visit_threshold)))
        return normalization_factor
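
_get_normalization_factor_formula is defined outside this excerpt. Assuming it implements the usual L2 normalization used for cosine-style similarity (an assumption, not confirmed here), the per-entity factor would behave like this sketch:

import math

# Hypothetical L2 normalization factor: 1 / sqrt(sum of squared ratings) per
# based_column entity; implicit feedback uses a constant rating of 1.
ratings_by_entity = {"u1": [1, 1, 1, 1], "u2": [1]}
factors = {
    entity: 1 / math.sqrt(sum(r * r for r in ratings))
    for entity, ratings in ratings_by_entity.items()
}
print(factors)  # {'u1': 0.5, 'u2': 1.0}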
    def _build_ordered_similarity(
            self,
            select_from,
            with_clause_as="_with_clause_normalization_factor"):
        """Build a similarity table col_1, col_2, similarity where col_1 < col_2 """
        similarity = SelectQuery()

        if self.supports_with_clause:
            similarity.with_cte(select_from, alias=with_clause_as)
            select_from = with_clause_as

        similarity.select_from(select_from,
                               alias=self.LEFT_NORMALIZATION_FACTOR_AS)

        join_conditions = [
            Column(self.pivot_column,
                   self.LEFT_NORMALIZATION_FACTOR_AS).eq_null_unsafe(
                       Column(self.pivot_column,
                              self.RIGHT_NORMALIZATION_FACTOR_AS))
        ]

        if self.supports_full_outer_join:
            join_conditions += [
                Column(self.based_column,
                       self.LEFT_NORMALIZATION_FACTOR_AS).lt(
                           Column(self.based_column,
                                  self.RIGHT_NORMALIZATION_FACTOR_AS))
            ]
        else:
            join_conditions += [
                Column(self.based_column,
                       self.LEFT_NORMALIZATION_FACTOR_AS).ne(
                           Column(self.based_column,
                                  self.RIGHT_NORMALIZATION_FACTOR_AS))
            ]

        similarity.join(select_from,
                        JoinTypes.INNER,
                        join_conditions,
                        alias=self.RIGHT_NORMALIZATION_FACTOR_AS)

        similarity.group_by(
            Column(self.based_column,
                   table_name=self.LEFT_NORMALIZATION_FACTOR_AS))
        similarity.group_by(
            Column(self.based_column,
                   table_name=self.RIGHT_NORMALIZATION_FACTOR_AS))

        similarity.select(Column(self.based_column,
                                 table_name=self.LEFT_NORMALIZATION_FACTOR_AS),
                          alias=f"{self.based_column}_1")
        similarity.select(Column(
            self.based_column, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS),
                          alias=f"{self.based_column}_2")

        similarity.select(self._get_similarity_formula(),
                          alias=constants.SIMILARITY_COLUMN_NAME)

        return similarity
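
The asymmetric join condition is the point of this builder: with FULL OUTER JOIN support, "<" keeps each unordered pair exactly once (the mirrored rows are recreated later by _build_unordered_similarity), while "!=" materializes both orientations directly. A toy illustration:

items = ["a", "b", "c"]

half = [(x, y) for x in items for y in items if x < y]   # one row per pair
full = [(x, y) for x in items for y in items if x != y]  # both orientations
print(half)       # [('a', 'b'), ('a', 'c'), ('b', 'c')]
print(len(full))  # 6 == 2 * len(half)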
Example #7
 def _build_samples_with_only_positives(inner_select_from, inner_select_from_as=ONLY_POSITIVE_TABLE_NAME):
     samples_with_only_positives = SelectQuery()
     samples_with_only_positives.select_from(inner_select_from, inner_select_from_as)
     samples_with_only_positives.select(Column(self.dku_config.users_column_name))
     samples_with_only_positives.select(Column("*").count(), alias=NB_POSITIVE_PER_USER)
     samples_with_only_positives.where(Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)))
     samples_with_only_positives.group_by(Column(self.dku_config.users_column_name))
     return samples_with_only_positives
Example #8
 def _or_condition_columns_list(self, select_query, column_names,
                                condition_method):
     for i, col_name in enumerate(column_names):
         if i == 0:
             or_condition = condition_method(Column(col_name))
         else:
             or_condition = or_condition.or_(
                 condition_method(Column(col_name)))
     select_query.where(or_condition)
Example #9
 def _build_remove_historical_samples(self, select_from, select_from_as="_remove_negative_samples_seen"):
     historical_negative_samples_removed = SelectQuery()
     historical_negative_samples_removed.select_from(select_from, alias=select_from_as)
     columns_to_select = self.sample_keys + [constants.TARGET_COLUMN_NAME] + self.dku_config.score_column_names
     self._select_columns_list(select_query=historical_negative_samples_removed, column_names=columns_to_select)
     unseen_samples_condition = (
         Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)).or_(Column(self.SCORE_SAMPLE).eq(Constant(0)))
     )
     historical_negative_samples_removed.where(unseen_samples_condition)
     return historical_negative_samples_removed
def _make_prefilter(aggregation_params, transform_params):
    timestamp_expr = Column(aggregation_params.timestamp_column)
    ref_date_expr = Constant(transform_params.ref_date).cast(ColumnType.DATE)
    time_filter = None

    if aggregation_params.is_rolling_window():
        time_filter = timestamp_expr.le(ref_date_expr)

    elif transform_params.buffer_unit is not None:
        buffer_interval = Interval(transform_params.buffer_width,
                                   transform_params.buffer_unit)
        upper = ref_date_expr.minus(buffer_interval)

        if aggregation_params.whole_history_window_enabled:
            time_filter = timestamp_expr.le(upper)
        else:
            window_width, window_unit = _get_widest_window(aggregation_params)
            base_interval = Interval(window_width, window_unit)
            lower = upper.minus(base_interval)
            time_filter = timestamp_expr.ge(lower).and_(
                timestamp_expr.le(upper))
    else:
        if aggregation_params.whole_history_window_enabled:
            time_filter = timestamp_expr.le(ref_date_expr)
        else:
            window_width, window_unit = _get_widest_window(aggregation_params)
            base_interval = Interval(window_width, window_unit)
            lower = ref_date_expr.minus(base_interval)
            time_filter = timestamp_expr.ge(lower).and_(
                timestamp_expr.le(ref_date_expr))

    return time_filter
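
A worked example of the bounds in the buffered, non-whole-history branch, using plain datetimes in place of the SQL expressions (the 7-day buffer and 90-day widest window are illustrative values):

from datetime import date, timedelta

ref_date = date(2024, 1, 31)
upper = ref_date - timedelta(days=7)    # ref_date minus the buffer interval
lower = upper - timedelta(days=90)      # upper bound minus the widest window
print(lower, "<= timestamp <=", upper)  # 2023-10-26 <= timestamp <= 2024-01-24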
Example #11
 def _build_all_cf_scores_with_target(self, select_from, select_from_as="_all_cf_scores_with_target"):
     all_cf_scores_with_target = SelectQuery()
     all_cf_scores_with_target.select_from(select_from, alias=select_from_as)
     columns_to_select = self.sample_keys + self.dku_config.score_column_names
     self._select_columns_list(select_query=all_cf_scores_with_target, column_names=columns_to_select)
     all_cf_scores_with_target.select(
         Column(self.IS_TRAINING_SAMPLE).coalesce(0).cast("int"), alias=constants.TARGET_COLUMN_NAME
     )
     if self.has_historical_data:
         all_cf_scores_with_target.select(
             Column(self.IS_SCORE_SAMPLE).coalesce(0).cast("int"), alias=self.SCORE_SAMPLE
         )
     return all_cf_scores_with_target
 def _get_user_item_similarity_formula(self, similarity_table,
                                       samples_table):
     if self.use_explicit:
         return (Column(constants.SIMILARITY_COLUMN_NAME,
                        table_name=similarity_table).times(
                            Column(self.dku_config.ratings_column_name,
                                   table_name=samples_table).minus(
                                       Column(self.RATING_AVERAGE,
                                              table_name=samples_table))).
                 sum().div(
                     Column(constants.SIMILARITY_COLUMN_NAME,
                            table_name=similarity_table).abs().sum()))
     else:
         return (Column(constants.SIMILARITY_COLUMN_NAME,
                        table_name=similarity_table).sum().div(
                            Constant(self.dku_config.top_n_most_similar)))
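
A quick numeric check of the explicit branch, which predicts a similarity-weighted average of the neighbors' mean-centered ratings, sum(sim * (rating - avg)) / sum(|sim|); the values below are illustrative:

similarities = [0.8, 0.5, -0.2]
ratings = [4.0, 5.0, 2.0]
averages = [3.5, 4.0, 3.0]

numerator = sum(s * (r - a) for s, r, a in zip(similarities, ratings, averages))
denominator = sum(abs(s) for s in similarities)
print(numerator / denominator)  # 0.7333...: predicted deviation from the user's mean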
Example #13
 def _rename_table(self, to_rename, renaming_mapping):
     renamed_table = SelectQuery()
     renamed_table.select_from(to_rename, alias="_renamed")
     for input_column, renamed_column in renaming_mapping.items():
         renamed_table.select(Column(input_column, table_name="_renamed"),
                              alias=renamed_column)
     return renamed_table
    def _encoding_feature_name(self, dataset):

        is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
        dataset_schema = dataset.read_schema()

        col_list = []
        new_col_alias = []

        for col_index, col_info in enumerate(dataset_schema):
            col_name = col_info.get('name')
            col_list.append(Column(col_name))
            col_name_mapping = self.params.feature_name_mapping.get(col_name)
            if col_name_mapping:
                col_alias = '{}_{}'.format(col_name_mapping, col_index)
            else:
                col_alias = col_name
            new_col_alias.append(col_alias)

        query_to_rename = SelectQuery().select(col_list, new_col_alias)
        if is_hdfs:
            query_to_rename.select_from('_'.join(dataset.name.split('.')))
        else:
            query_to_rename.select_from(dataset)
        dialect_handler = dialectHandler(dataset)
        dialect_handler.get_executor().exec_recipe_fragment(
            dataset, query=dialect_handler.convertToSQL(query_to_rename))
Example #15
 def _cast_table(self, to_cast, cast_mapping, alias):
     cast_table = SelectQuery()
     cast_table.select_from(to_cast, alias=alias)
     for input_column, target_type in cast_mapping.items():
         cast_table.select(Column(input_column,
                                  table_name=alias).cast(target_type),
                           alias=input_column)
     return cast_table
def make_full_transform_query(aggregation_queries,
                              dataset,
                              aggregation_params,
                              transform_params,
                              encoding_feature=False):

    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
    inner = SelectQuery()
    if is_hdfs:
        inner.select_from('_'.join(dataset.name.split('.')))
    else:
        inner.select_from(dataset)

    if aggregation_params.is_rolling_window():
        inner.select(Column('*'))
    else:
        inner.distinct()  # TODO: why? -> avoids duplicate key rows
        for key in aggregation_params.get_effective_keys():
            inner.select(Column(key))
    prefilter = _make_prefilter(aggregation_params, transform_params)
    inner.where(prefilter)

    outer = SelectQuery()
    outer.select_from(inner, alias='inner')
    if aggregation_params.is_rolling_window():
        outer.select(Column('*', 'inner'))
    else:
        for col in aggregation_params.get_effective_keys():  #+ feature_names:
            outer.select(Column(col, 'inner'))

    reverse_mapping_dict = {}

    for idx, agg_query in enumerate(aggregation_queries):
        agg_query.alias(
            agg_query.get_alias()
            or 'cte_' + str(idx))  #TODO remove, make sure they have ids
        outer.with_cte(agg_query)
        join_cond = Expression()
        for key in aggregation_params.get_effective_keys():
            join_cond = join_cond.and_(
                Column(key, 'inner').eq_null_unsafe(
                    Column(key, agg_query.get_alias())))
        outer.join(agg_query.get_alias(), JoinTypes.LEFT, join_cond)

        for idx2, col in enumerate(agg_query.get_columns_alias()):
            if encoding_feature:
                if aggregation_params.feature_name_mapping.get(col):
                    new_alias = '{}_{}_{}'.format(
                        aggregation_params.feature_name_mapping.get(col), idx,
                        idx2)
                    outer.select(Column(col, agg_query.get_alias()), new_alias)
                    reverse_mapping_dict[new_alias] = col
            else:
                outer.select(Column(col, agg_query.get_alias()))

    return dialectHandler(dataset).convertToSQL(outer), reverse_mapping_dict
Example #17
        def _build_filtered_samples(inner_select_from, inner_select_from_as):
            ratio = float(self.dku_config.negative_samples_percentage / 100.0)
            filtered_samples = SelectQuery()
            filtered_samples.select_from(inner_select_from, inner_select_from_as)
            columns_to_select = self.sample_keys + self.dku_config.score_column_names + [constants.TARGET_COLUMN_NAME]
            self._select_columns_list(filtered_samples, columns_to_select)

            nb_negative_threshold_expr = (
                Column(NB_POSITIVE_PER_USER, table_name=select_from_as)
                .times(Constant(ratio))
                .div(Constant(1).minus(Constant(ratio)))
                .ceil()
            )
            filtered_samples.where(
                Column(constants.TARGET_COLUMN_NAME, table_name=select_from_as)
                .eq(Constant(1))
                .or_(Column(self.ROW_NUMBER_AS, table_name=select_from_as).le(nb_negative_threshold_expr))
            )
            return filtered_samples
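
The threshold expression comes from solving N / (N + P) = r for N, where P is a user's positive count and r the requested negative share: N = P * r / (1 - r), rounded up. A quick check:

import math

positives = 40
ratio = 0.2  # negative_samples_percentage / 100
negatives = math.ceil(positives * ratio / (1 - ratio))
print(negatives, negatives / (negatives + positives))  # 10 0.2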
def make_most_recent_timestamp_query(timestamp_column, dataset):

    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
    query = SelectQuery()
    query.select(Column(timestamp_column).max(), alias='most_recent_timestamp')
    if is_hdfs:
        query.select_from('_'.join(dataset.name.split('.')))
    else:
        query.select_from(dataset)
    dialect_handler = dialectHandler(dataset)
    return dialect_handler.convertToSQL(query)
def make_distinct_values_query(column, dataset):

    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()

    column = Column(column)

    layer_1 = SelectQuery()
    layer_1.select(column)
    layer_1.select(Column('*').count(), 'count')
    if is_hdfs:
        layer_1.select_from('_'.join(dataset.name.split('.')))
    else:
        layer_1.select_from(dataset)
    layer_1.group_by(column)

    count = Column('count')
    layer_2 = SelectQuery()
    layer_2.select(column)
    layer_2.select(
        count.div(count.sum().over(Window())).times(100), 'distribution')
    layer_2.select_from(layer_1, alias='layer_1')

    dialect_handler = dialectHandler(dataset)
    return dialect_handler.convertToSQL(layer_2)
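
The two layers compute a per-value count and then its share of the grand total through a windowed sum. A minimal standalone equivalent with SQLite (names illustrative; window functions require SQLite >= 3.25):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE t (category TEXT)")
conn.executemany("INSERT INTO t VALUES (?)", [("x",), ("x",), ("y",)])
rows = conn.execute("""
    SELECT category, cnt,
           100.0 * cnt / SUM(cnt) OVER () AS distribution
    FROM (SELECT category, COUNT(*) AS cnt FROM t GROUP BY category)
""").fetchall()
print(rows)  # [('x', 2, 66.66...), ('y', 1, 33.33...)]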
    def _get_similarity_formula(self):
        rounding_decimals = 15
        rounding_expression = Constant(10**rounding_decimals)
        logger.debug(f"Rounding similarity to {rounding_decimals} decimals")

        if self.use_explicit:
            # compute Pearson correlation
            rating_product = (Column(
                self.dku_config.ratings_column_name,
                table_name=self.LEFT_NORMALIZATION_FACTOR_AS).minus(
                    Column(
                        self.RATING_AVERAGE,
                        table_name=self.LEFT_NORMALIZATION_FACTOR_AS)).times(
                            Column(self.dku_config.ratings_column_name,
                                   table_name=self.
                                   RIGHT_NORMALIZATION_FACTOR_AS).minus(
                                       Column(self.RATING_AVERAGE,
                                              table_name=self.
                                              RIGHT_NORMALIZATION_FACTOR_AS))))
        else:
            rating_product = Constant(1)

        return (rating_product.times(
            Column(
                self.NORMALIZATION_FACTOR_AS,
                table_name=self.LEFT_NORMALIZATION_FACTOR_AS)).times(
                    Column(
                        self.NORMALIZATION_FACTOR_AS,
                        table_name=self.RIGHT_NORMALIZATION_FACTOR_AS)).sum().
                times(rounding_expression).round().div(rounding_expression))
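
The trailing times(10**15).round().div(10**15) chain is the multiply-round-divide trick for rounding to a fixed number of decimals, presumably because some target dialects' ROUND takes no precision argument. The same thing in plain Python:

x = 0.123456789012345678
scale = 10 ** 15
print(round(x * scale) / scale)  # 0.123456789012346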
    def _build_timestamp_filtered(self,
                                  select_from,
                                  select_from_as="_prepared_input_dataset"):
        def _build_timestamp_filtered_row_number(select_from_inner,
                                                 select_from_as_inner):
            ts_row_numbers = SelectQuery()
            ts_row_numbers.select_from(select_from_inner,
                                       alias=select_from_as_inner)

            self._select_columns_list(
                ts_row_numbers,
                column_names=self.similarity_computation_columns,
                table_name=select_from_as_inner)

            ts_row_number_expression = (Expression().rowNumber().over(
                Window(
                    partition_by=[
                        Column(self.based_column,
                               table_name=select_from_as_inner)
                    ],
                    order_by=[
                        Column(self.dku_config.timestamps_column_name,
                               table_name=select_from_as_inner),
                        Column(self.pivot_column,
                               table_name=select_from_as_inner),
                    ],
                    order_types=["DESC", "DESC"],
                    mode=None,
                )))
            ts_row_numbers.select(ts_row_number_expression,
                                  alias=self.TIMESTAMP_FILTERED_ROW_NB)
            return ts_row_numbers

        built_ts_row_numbers = _build_timestamp_filtered_row_number(
            select_from, select_from_as)

        ts_row_numbers_alias = "_ts_row_numbers"
        timestamp_filtered = SelectQuery()
        timestamp_filtered.select_from(built_ts_row_numbers,
                                       alias=ts_row_numbers_alias)

        self._select_columns_list(
            timestamp_filtered,
            column_names=self.similarity_computation_columns,
            table_name=ts_row_numbers_alias)

        timestamp_filtered.where(
            Column(self.TIMESTAMP_FILTERED_ROW_NB,
                   table_name=ts_row_numbers_alias).le(
                       Constant(self.dku_config.top_n_most_recent)))

        return timestamp_filtered
def make_max_time_interval_query(timestamp_column, resolved_ref_date, dataset):

    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
    max_time_interval = Constant(resolved_ref_date).minus(
        Column(timestamp_column)).extract(TimeUnit.DAY).max()
    query = SelectQuery()
    query.select(max_time_interval, alias="max_time_interval")  #TODO naming
    if is_hdfs:
        query.select_from('_'.join(dataset.name.split('.')))
    else:
        query.select_from(dataset)
    dialect_handler = dialectHandler(dataset)
    return dialect_handler.convertToSQL(query)
    def _build_top_n(self, select_from, select_from_as="_row_number_table"):
        top_n = SelectQuery()
        top_n.select_from(select_from, alias=select_from_as)

        columns_to_select = [
            f"{self.based_column}_1", f"{self.based_column}_2",
            constants.SIMILARITY_COLUMN_NAME
        ]
        self._select_columns_list(top_n,
                                  column_names=columns_to_select,
                                  table_name=select_from_as)

        top_n.where(
            Column(self.ROW_NUMBER_AS, table_name=select_from_as).le(
                Constant(self.dku_config.top_n_most_similar)))
        return top_n
    def _build_sum_of_similarity_scores(
            self,
            top_n,
            normalization_factor,
            top_n_as="_top_n",
            normalization_factor_as="_normalization_factor"):
        cf_scores = SelectQuery()
        cf_scores.select_from(top_n, alias=top_n_as)

        join_condition = Column(f"{self.based_column}_2",
                                top_n_as).eq_null_unsafe(
                                    Column(self.based_column,
                                           normalization_factor_as))
        cf_scores.join(normalization_factor,
                       JoinTypes.INNER,
                       join_condition,
                       alias=normalization_factor_as)

        cf_scores.group_by(
            Column(f"{self.based_column}_1", table_name=top_n_as))
        cf_scores.group_by(
            Column(self.pivot_column, table_name=normalization_factor_as))

        cf_scores.select(Column(f"{self.based_column}_1", table_name=top_n_as),
                         alias=self.based_column)
        cf_scores.select(
            Column(self.pivot_column, table_name=normalization_factor_as))

        cf_scores.select(self._get_user_item_similarity_formula(
            top_n_as, normalization_factor_as),
                         alias=constants.SCORE_COLUMN_NAME)

        cf_scores.order_by(Column(self.based_column))
        cf_scores.order_by(Column(constants.SCORE_COLUMN_NAME),
                           direction="DESC")
        return cf_scores
    def _build_unordered_similarity(
        self,
        select_from,
        left_select_from_as="_left_ordered_similarity",
        right_select_from_as="_right_ordered_similarity",
        with_clause_as="_with_clause_ordered_similarity",
    ):
        """Retrieve both pairs (when col_1 < col_2 and col_1 > col_2) from the ordered similarity table"""
        similarity = SelectQuery()

        if self.supports_with_clause:
            similarity.with_cte(select_from, alias=with_clause_as)
            select_from = with_clause_as

        similarity.select_from(select_from, alias=left_select_from_as)

        join_condition = Constant(1).eq_null_unsafe(Constant(0))

        similarity.join(select_from,
                        JoinTypes.FULL,
                        join_condition,
                        alias=right_select_from_as)

        similarity.select(
            Column(f"{self.based_column}_1",
                   table_name=left_select_from_as).coalesce(
                       Column(f"{self.based_column}_2",
                              table_name=right_select_from_as)),
            alias=f"{self.based_column}_1",
        )
        similarity.select(
            Column(f"{self.based_column}_2",
                   table_name=left_select_from_as).coalesce(
                       Column(f"{self.based_column}_1",
                              table_name=right_select_from_as)),
            alias=f"{self.based_column}_2",
        )
        similarity.select(
            Column("similarity", table_name=left_select_from_as).coalesce(
                Column("similarity", table_name=right_select_from_as)),
            alias=constants.SIMILARITY_COLUMN_NAME,
        )

        return similarity
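
The join condition 1 = 0 never matches, so the FULL join returns every left row with NULL right-hand columns and every right row with NULL left-hand columns; the coalesce calls then emit the right-hand copies with col_1 and col_2 swapped. The net effect is a UNION ALL of each ordered pair with its mirror, as this plain-Python sketch shows:

ordered = [("a", "b", 0.9), ("a", "c", 0.4)]
unordered = ordered + [(col_2, col_1, sim) for (col_1, col_2, sim) in ordered]
print(unordered)  # both ('a', 'b') and ('b', 'a') carry the same similarity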
 def _check_populations(self):
     if self.params.populations_mode == PopulationsDefinitionMode.MANUAL:
         if len(self.params.populations_dict) > 0:
             populations_values = self.params.populations_dict.values()
         else:
             populations_values = [[]]
     else:  # brute_force (or None?)
         # Loop over populations_dict and take one list of value groups per
         # granularity column. For example, if the granularity dict is
         # {'event_type': [['buy_order']], 'event_dayofweek': [['1', '2', '3'], ['4', '5']]},
         # the per-column groups are [[['buy_order']], [['1', '2', '3'], ['4', '5']]],
         # and itertools.product yields every combination of one group per column.
         populations_values = [
             zip(self.params.populations_dict, prod)
             for prod in itertools.product(
                 *(self.params.populations_dict[key]
                   for key in self.params.populations_dict))
         ]
         # The result looks like [[('event_type', ['buy_order']), ('event_dayofweek', ['1', '2', '3'])],
         #                        [('event_type', ['buy_order']), ('event_dayofweek', ['4', '5'])], ...]
     self.params.populations = []
     for population_combination in populations_values:
         logger.debug("POPULATION: {}".format(population_combination))
         conditions = []
         # e.g. population_combination = [('event_type', ['buy_order']), ('event_dayofweek', ['1', '2', '3'])]
         for combination_val in population_combination:
             condition_column = combination_val[0]
             single_cond_expr = Column(condition_column).in_(
                 List(*combination_val[1]))
             conditions.append(single_cond_expr)
         if len(conditions) > 0:
             population = conditions[0]
             for condition_expr in conditions[1:]:
                 population = population.and_(condition_expr)
             self.params.populations.append(population)
         else:
             self.params.populations.append(None)
     logger.info("There are {} population(s)".format(
         len(self.params.populations)))
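
The brute-force branch in isolation: itertools.product combines one value group per granularity column, and zip re-attaches the column names. The dict below is illustrative, with nested lists as the List(*...) usage expects:

import itertools

populations_dict = {
    "event_type": [["buy_order"]],
    "event_dayofweek": [["1", "2", "3"], ["4", "5"]],
}
populations_values = [
    list(zip(populations_dict, prod))
    for prod in itertools.product(*populations_dict.values())
]
print(populations_values)
# [[('event_type', ['buy_order']), ('event_dayofweek', ['1', '2', '3'])],
#  [('event_type', ['buy_order']), ('event_dayofweek', ['4', '5'])]]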
    def _build_visit_count(self,
                           select_from,
                           select_from_as="_filtered_input_dataset"):
        # total user and item visits
        visit_count = SelectQuery()
        visit_count.select_from(select_from, alias=select_from_as)

        self._select_columns_list(
            visit_count,
            column_names=self.similarity_computation_columns +
            self.filtering_columns,
            table_name=select_from_as,
        )

        visit_count.select(
            Column("*").count().over(
                Window(
                    partition_by=[Column(self.dku_config.users_column_name)],
                    mode=None,
                )),
            alias=self.NB_VISIT_USER_AS,
        )
        visit_count.select(
            Column("*").count().over(
                Window(
                    partition_by=[Column(self.dku_config.items_column_name)],
                    mode=None,
                )),
            alias=self.NB_VISIT_ITEM_AS,
        )
        if self.use_explicit:
            visit_count.select(
                Column(self.dku_config.ratings_column_name).avg().over(
                    Window(partition_by=[Column(self.based_column)],
                           mode=None)),
                alias=self.RATING_AVERAGE,
            )
            self.similarity_computation_columns += [self.RATING_AVERAGE]

        return visit_count
Example #28
operator = get_recipe_config()['operator']
columns_left = get_recipe_config()['columns_1']
columns_right = get_recipe_config()['columns_2']


#############################
# Original recipe
#############################

# Build one join condition per key pair, then one query per join type
join_types = ['LEFT', 'RIGHT', 'INNER']
join_conds = []

for left_key, right_key in zip(key_a, key_b):
    join_cond = Expression().and_(
        Column(left_key, input_A_names[0]).eq_null_unsafe(
            Column(right_key, input_B_names[0])))
    join_conds.append(join_cond)

sql_by_join_type = {}
for join_type in join_types:
    query = SelectQuery()
    query.select_from(input_A_datasets[0], alias=input_A_names[0])
    for column_name in columns_left:
        query.select(Column(column_name, input_A_names[0]), alias=column_name)
    for column_name in columns_right:
        query.select(Column(column_name, input_B_names[0]), alias=column_name)
    query.join(input_B_datasets[0], join_type, join_conds,
               operatorBetweenConditions=operator, alias=input_B_names[0])
    sql_by_join_type[join_type] = toSQL(query, input_A_datasets[0])


e = SQLExecutor2()
e.exec_recipe_fragment(output_A_datasets[0], sql_by_join_type['LEFT'])
Example #29
toSQL(eb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Simple select
sb = SelectQuery()
sb.select_from(Table('myTable'))

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# join in fluent style
sb = SelectQuery() \
    .select(Constant('Hello')) \
    .select_from('t1') \
    .join('t2', JoinTypes.INNER, Column('c', 't1').eq(Column('c', 't2')))

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Group by and limit
sb = SelectQuery()
sb.select_from('myTable')
sb.group_by(Column('groupCol'))
sb.limit(1000)

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Where clause and select distinct
sb = SelectQuery()
sb.distinct()
sb.select_from('myTable')
sb.where(Column('whereCol').eq(Constant(1)))

toSQL(sb, Dialects.MYSQL)
Example #30
 def _select_columns_list(self,
                          select_query,
                          column_names,
                          table_name=None):
     for col_name in column_names:
         select_query.select(Column(col_name, table_name=table_name))