def _build_row_numbers(self, select_from, select_from_as="_similarity_matrix"):
    row_numbers = SelectQuery()
    row_numbers.select_from(select_from, alias=select_from_as)

    columns_to_select = [f"{self.based_column}_1", f"{self.based_column}_2", constants.SIMILARITY_COLUMN_NAME]
    self._select_columns_list(row_numbers, column_names=columns_to_select, table_name=select_from_as)

    row_number_expression = (
        Expression()
        .rowNumber()
        .over(
            Window(
                partition_by=[Column(f"{self.based_column}_1", table_name=select_from_as)],
                order_by=[
                    Column(constants.SIMILARITY_COLUMN_NAME, table_name=select_from_as),
                    Column(f"{self.based_column}_2", table_name=select_from_as),
                ],
                order_types=["DESC", "DESC"],
                mode=None,
            )
        )
    )
    row_numbers.select(row_number_expression, alias=self.ROW_NUMBER_AS)
    return row_numbers

def _build_all_cf_scores(
    self,
    select_from,
    samples_for_training,
    samples_for_scores=None,
    select_from_as="_all_cf_scores",
    samples_for_training_as="_samples_for_training",
    samples_for_scores_as="_samples_for_scores",
):
    all_cf_scores = SelectQuery()
    all_cf_scores.select_from(select_from, alias=select_from_as)

    self._left_join_samples(
        all_cf_scores, select_from_as, samples_for_training, samples_for_training_as, self.sample_keys
    )
    all_cf_scores.select(Column(self.IS_TRAINING_SAMPLE, table_name=samples_for_training_as))

    if samples_for_scores:
        self._left_join_samples(
            all_cf_scores, select_from_as, samples_for_scores, samples_for_scores_as, self.sample_keys
        )
        all_cf_scores.select(Column(self.IS_SCORE_SAMPLE, table_name=samples_for_scores_as))

    columns_to_select = self.sample_keys + self.dku_config.score_column_names
    self._select_columns_list(all_cf_scores, columns_to_select, table_name=select_from_as)
    return all_cf_scores

def _build_samples_with_all_infos(inner_select_from, join_with, inner_select_from_as=ALL_INFOS_TABLE_NAME):
    samples_with_all_infos = SelectQuery()
    samples_with_all_infos.select_from(inner_select_from, alias=inner_select_from_as)

    columns_to_select = self.sample_keys + self.dku_config.score_column_names + [constants.TARGET_COLUMN_NAME]
    self._select_columns_list(samples_with_all_infos, columns_to_select, table_name=inner_select_from_as)

    row_number_expression = (
        Expression()
        .rowNumber()
        .over(
            Window(
                partition_by=[
                    Column(self.dku_config.users_column_name, table_name=inner_select_from_as),
                    Column(constants.TARGET_COLUMN_NAME, table_name=inner_select_from_as),
                ],
                order_by=[Column(constants.TARGET_COLUMN_NAME, table_name=inner_select_from_as)],
                order_types=["DESC"],
                mode=None,
            )
        )
    )
    samples_with_all_infos.select(row_number_expression, alias=self.ROW_NUMBER_AS)
    samples_with_all_infos.select(Column(NB_POSITIVE_PER_USER, table_name=ONLY_POSITIVE_TABLE_NAME))

    self._left_join_samples(
        left_select_query=samples_with_all_infos,
        left_table_name=inner_select_from_as,
        right_select_query=join_with,
        right_table_name=ONLY_POSITIVE_TABLE_NAME,
        keys=[self.dku_config.users_column_name],
    )
    return samples_with_all_infos

def _build_normalization_factor(self, select_from, select_from_as="_visit_count"):
    # compute normalization factor
    normalization_factor = SelectQuery()
    normalization_factor.select_from(select_from, alias=select_from_as)

    self._select_columns_list(
        normalization_factor,
        column_names=self.similarity_computation_columns + self.filtering_columns,
        table_name=select_from_as,
    )

    rating_column = (
        Column(self.dku_config.ratings_column_name).minus(Column(self.RATING_AVERAGE))
        if self.use_explicit
        else Constant(1)
    )
    normalization_factor.select(
        self._get_normalization_factor_formula(Column(self.based_column), rating_column),
        alias=self.NORMALIZATION_FACTOR_AS,
    )
    self.similarity_computation_columns += [self.NORMALIZATION_FACTOR_AS]

    # keep only items and users with enough visits
    normalization_factor.where(
        Column(self.NB_VISIT_USER_AS, table_name=select_from_as).ge(Constant(self.dku_config.user_visit_threshold))
    )
    normalization_factor.where(
        Column(self.NB_VISIT_ITEM_AS, table_name=select_from_as).ge(Constant(self.dku_config.item_visit_threshold))
    )
    return normalization_factor

def _build_ordered_similarity(self, select_from, with_clause_as="_with_clause_normalization_factor"):
    """Build a similarity table (col_1, col_2, similarity) where col_1 < col_2."""
    similarity = SelectQuery()

    if self.supports_with_clause:
        similarity.with_cte(select_from, alias=with_clause_as)
        select_from = with_clause_as

    similarity.select_from(select_from, alias=self.LEFT_NORMALIZATION_FACTOR_AS)

    join_conditions = [
        Column(self.pivot_column, self.LEFT_NORMALIZATION_FACTOR_AS).eq_null_unsafe(
            Column(self.pivot_column, self.RIGHT_NORMALIZATION_FACTOR_AS)
        )
    ]
    if self.supports_full_outer_join:
        join_conditions += [
            Column(self.based_column, self.LEFT_NORMALIZATION_FACTOR_AS).lt(
                Column(self.based_column, self.RIGHT_NORMALIZATION_FACTOR_AS)
            )
        ]
    else:
        join_conditions += [
            Column(self.based_column, self.LEFT_NORMALIZATION_FACTOR_AS).ne(
                Column(self.based_column, self.RIGHT_NORMALIZATION_FACTOR_AS)
            )
        ]

    similarity.join(select_from, JoinTypes.INNER, join_conditions, alias=self.RIGHT_NORMALIZATION_FACTOR_AS)

    similarity.group_by(Column(self.based_column, table_name=self.LEFT_NORMALIZATION_FACTOR_AS))
    similarity.group_by(Column(self.based_column, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS))

    similarity.select(
        Column(self.based_column, table_name=self.LEFT_NORMALIZATION_FACTOR_AS), alias=f"{self.based_column}_1"
    )
    similarity.select(
        Column(self.based_column, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS), alias=f"{self.based_column}_2"
    )
    similarity.select(self._get_similarity_formula(), alias=constants.SIMILARITY_COLUMN_NAME)
    return similarity

def _build_samples_with_only_positives(inner_select_from, inner_select_from_as=ONLY_POSITIVE_TABLE_NAME):
    samples_with_only_positives = SelectQuery()
    samples_with_only_positives.select_from(inner_select_from, inner_select_from_as)

    samples_with_only_positives.select(Column(self.dku_config.users_column_name))
    samples_with_only_positives.select(Column("*").count(), alias=NB_POSITIVE_PER_USER)

    samples_with_only_positives.where(Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)))
    samples_with_only_positives.group_by(Column(self.dku_config.users_column_name))
    return samples_with_only_positives

def _or_condition_columns_list(self, select_query, column_names, condition_method):
    # Chain the per-column conditions into a single OR expression, then apply it once
    for i, col_name in enumerate(column_names):
        if i == 0:
            or_condition = condition_method(Column(col_name))
        else:
            or_condition = or_condition.or_(condition_method(Column(col_name)))
    select_query.where(or_condition)

def _build_remove_historical_samples(self, select_from, select_from_as="_remove_negative_samples_seen"):
    historical_negative_samples_removed = SelectQuery()
    historical_negative_samples_removed.select_from(select_from, alias=select_from_as)

    columns_to_select = self.sample_keys + [constants.TARGET_COLUMN_NAME] + self.dku_config.score_column_names
    self._select_columns_list(select_query=historical_negative_samples_removed, column_names=columns_to_select)

    unseen_samples_condition = (
        Column(constants.TARGET_COLUMN_NAME).eq(Constant(1)).or_(Column(self.SCORE_SAMPLE).eq(Constant(0)))
    )
    historical_negative_samples_removed.where(unseen_samples_condition)
    return historical_negative_samples_removed

def _make_prefilter(aggregation_params, transform_params):
    timestamp_expr = Column(aggregation_params.timestamp_column)
    ref_date_expr = Constant(transform_params.ref_date).cast(ColumnType.DATE)
    time_filter = None

    if aggregation_params.is_rolling_window():
        time_filter = timestamp_expr.le(ref_date_expr)
    elif transform_params.buffer_unit is not None:
        buffer_interval = Interval(transform_params.buffer_width, transform_params.buffer_unit)
        upper = ref_date_expr.minus(buffer_interval)
        if aggregation_params.whole_history_window_enabled:
            time_filter = timestamp_expr.le(upper)
        else:
            window_width, window_unit = _get_widest_window(aggregation_params)
            base_interval = Interval(window_width, window_unit)
            lower = upper.minus(base_interval)
            time_filter = timestamp_expr.ge(lower).and_(timestamp_expr.le(upper))
    else:
        if aggregation_params.whole_history_window_enabled:
            time_filter = timestamp_expr.le(ref_date_expr)
        else:
            window_width, window_unit = _get_widest_window(aggregation_params)
            base_interval = Interval(window_width, window_unit)
            lower = ref_date_expr.minus(base_interval)
            time_filter = timestamp_expr.ge(lower).and_(timestamp_expr.le(ref_date_expr))

    return time_filter

def _build_all_cf_scores_with_target(self, select_from, select_from_as="_all_cf_scores_with_target"):
    all_cf_scores_with_target = SelectQuery()
    all_cf_scores_with_target.select_from(select_from, alias=select_from_as)

    columns_to_select = self.sample_keys + self.dku_config.score_column_names
    self._select_columns_list(select_query=all_cf_scores_with_target, column_names=columns_to_select)

    all_cf_scores_with_target.select(
        Column(self.IS_TRAINING_SAMPLE).coalesce(0).cast("int"), alias=constants.TARGET_COLUMN_NAME
    )
    if self.has_historical_data:
        all_cf_scores_with_target.select(
            Column(self.IS_SCORE_SAMPLE).coalesce(0).cast("int"), alias=self.SCORE_SAMPLE
        )
    return all_cf_scores_with_target

def _get_user_item_similarity_formula(self, similarity_table, samples_table):
    if self.use_explicit:
        # sum(similarity * (rating - rating_average)) / sum(|similarity|)
        return (
            Column(constants.SIMILARITY_COLUMN_NAME, table_name=similarity_table)
            .times(
                Column(self.dku_config.ratings_column_name, table_name=samples_table).minus(
                    Column(self.RATING_AVERAGE, table_name=samples_table)
                )
            )
            .sum()
            .div(Column(constants.SIMILARITY_COLUMN_NAME, table_name=similarity_table).abs().sum())
        )
    else:
        # sum(similarity) / top_n_most_similar
        return (
            Column(constants.SIMILARITY_COLUMN_NAME, table_name=similarity_table)
            .sum()
            .div(Constant(self.dku_config.top_n_most_similar))
        )

def _rename_table(self, to_rename, renaming_mapping):
    renamed_table = SelectQuery()
    renamed_table.select_from(to_rename, alias="_renamed")
    for input_column, renamed_column in renaming_mapping.items():
        renamed_table.select(Column(input_column, table_name="_renamed"), alias=renamed_column)
    return renamed_table

def _encoding_feature_name(self, dataset):
    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
    dataset_schema = dataset.read_schema()

    col_list = []
    new_col_alias = []
    for col_index, col_info in enumerate(dataset_schema):
        col_name = col_info.get('name')
        col_list.append(Column(col_name))
        col_name_mapping = self.params.feature_name_mapping.get(col_name)
        if col_name_mapping:
            col_alias = '{}_{}'.format(col_name_mapping, col_index)
        else:
            col_alias = col_name
        new_col_alias.append(col_alias)

    query_to_rename = SelectQuery().select(col_list, new_col_alias)
    if is_hdfs:
        query_to_rename.select_from('_'.join(dataset.name.split('.')))
    else:
        query_to_rename.select_from(dataset)

    dialect_handler = dialectHandler(dataset)
    dialect_handler.get_executor().exec_recipe_fragment(
        dataset, query=dialect_handler.convertToSQL(query_to_rename))

def _cast_table(self, to_cast, cast_mapping, alias):
    cast_table = SelectQuery()
    cast_table.select_from(to_cast, alias=alias)
    for input_column, target_type in cast_mapping.items():
        cast_table.select(Column(input_column, table_name=alias).cast(target_type), alias=input_column)
    return cast_table

def make_full_transform_query(aggregation_queries, dataset, aggregation_params, transform_params, encoding_feature=False):
    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()

    inner = SelectQuery()
    if is_hdfs:
        inner.select_from('_'.join(dataset.name.split('.')))
    else:
        inner.select_from(dataset)

    if aggregation_params.is_rolling_window():
        inner.select(Column('*'))
    else:
        inner.distinct()  # TODO why?! -> avoid duplicates
        for key in aggregation_params.get_effective_keys():
            inner.select(Column(key))

    prefilter = _make_prefilter(aggregation_params, transform_params)
    inner.where(prefilter)

    outer = SelectQuery()
    outer.select_from(inner, alias='inner')
    if aggregation_params.is_rolling_window():
        outer.select(Column('*', 'inner'))
    else:
        for col in aggregation_params.get_effective_keys():  # + feature_names
            outer.select(Column(col, 'inner'))

    reverse_mapping_dict = {}
    for idx, agg_query in enumerate(aggregation_queries):
        agg_query.alias(agg_query.get_alias() or 'cte_' + str(idx))  # TODO remove, make sure they have ids
        outer.with_cte(agg_query)

        join_cond = Expression()
        for key in aggregation_params.get_effective_keys():
            join_cond = join_cond.and_(
                Column(key, 'inner').eq_null_unsafe(Column(key, agg_query.get_alias())))
        outer.join(agg_query.get_alias(), JoinTypes.LEFT, join_cond)

        for idx2, col in enumerate(agg_query.get_columns_alias()):
            if encoding_feature:
                if aggregation_params.feature_name_mapping.get(col):
                    new_alias = '{}_{}_{}'.format(aggregation_params.feature_name_mapping.get(col), idx, idx2)
                    outer.select(Column(col, agg_query.get_alias()), new_alias)
                    reverse_mapping_dict[new_alias] = col
            else:
                outer.select(Column(col, agg_query.get_alias()))

    return dialectHandler(dataset).convertToSQL(outer), reverse_mapping_dict

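# Illustrative sketch only (not part of the plugin): one way the full transform query might be
# executed, mirroring the executor pattern used elsewhere in this file
# (dialectHandler(...).get_executor().exec_recipe_fragment(...)). The function name
# "run_full_transform" and the "output_dataset" argument are hypothetical.
def run_full_transform(aggregation_queries, dataset, output_dataset, aggregation_params, transform_params):
    # Build the SQL once, then let the dialect-specific executor run it against the output dataset
    sql, reverse_mapping = make_full_transform_query(
        aggregation_queries, dataset, aggregation_params, transform_params, encoding_feature=False)
    dialectHandler(output_dataset).get_executor().exec_recipe_fragment(output_dataset, query=sql)
    return reverse_mapping
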
def _build_filtered_samples(inner_select_from, inner_select_from_as):
    ratio = float(self.dku_config.negative_samples_percentage / 100.0)
    filtered_samples = SelectQuery()
    filtered_samples.select_from(inner_select_from, inner_select_from_as)

    columns_to_select = self.sample_keys + self.dku_config.score_column_names + [constants.TARGET_COLUMN_NAME]
    self._select_columns_list(filtered_samples, columns_to_select)

    # Maximum number of negative samples to keep per user, derived from the requested ratio
    nb_negative_threshold_expr = (
        Column(NB_POSITIVE_PER_USER, table_name=inner_select_from_as)
        .times(Constant(ratio))
        .div(Constant(1).minus(Constant(ratio)))
        .ceil()
    )
    filtered_samples.where(
        Column(constants.TARGET_COLUMN_NAME, table_name=inner_select_from_as)
        .eq(Constant(1))
        .or_(Column(self.ROW_NUMBER_AS, table_name=inner_select_from_as).le(nb_negative_threshold_expr))
    )
    return filtered_samples

def make_most_recent_timestamp_query(timestamp_column, dataset):
    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()

    query = SelectQuery()
    query.select(Column(timestamp_column).max(), alias='most_recent_timestamp')
    if is_hdfs:
        query.select_from('_'.join(dataset.name.split('.')))
    else:
        query.select_from(dataset)

    dialect_handler = dialectHandler(dataset)
    return dialect_handler.convertToSQL(query)

def make_distinct_values_query(column, dataset):
    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
    column = Column(column)

    layer_1 = SelectQuery()
    layer_1.select(column)
    layer_1.select(Column('*').count(), 'count')
    if is_hdfs:
        layer_1.select_from('_'.join(dataset.name.split('.')))
    else:
        layer_1.select_from(dataset)
    layer_1.group_by(column)

    count = Column('count')
    layer_2 = SelectQuery()
    layer_2.select(column)
    layer_2.select(count.div(count.sum().over(Window())).times(100), 'distribution')
    layer_2.select_from(layer_1, alias='layer_1')

    dialect_handler = dialectHandler(dataset)
    return dialect_handler.convertToSQL(layer_2)

def _get_similarity_formula(self):
    rounding_decimals = 15
    rounding_expression = Constant(10 ** rounding_decimals)
    logger.debug(f"Rounding similarity to {rounding_decimals} decimals")

    if self.use_explicit:
        # compute Pearson correlation
        rating_product = (
            Column(self.dku_config.ratings_column_name, table_name=self.LEFT_NORMALIZATION_FACTOR_AS)
            .minus(Column(self.RATING_AVERAGE, table_name=self.LEFT_NORMALIZATION_FACTOR_AS))
            .times(
                Column(self.dku_config.ratings_column_name, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS)
                .minus(Column(self.RATING_AVERAGE, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS))
            )
        )
    else:
        rating_product = Constant(1)

    return (
        rating_product
        .times(Column(self.NORMALIZATION_FACTOR_AS, table_name=self.LEFT_NORMALIZATION_FACTOR_AS))
        .times(Column(self.NORMALIZATION_FACTOR_AS, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS))
        .sum()
        .times(rounding_expression)
        .round()
        .div(rounding_expression)
    )

def _build_timestamp_filtered(self, select_from, select_from_as="_prepared_input_dataset"):
    def _build_timestamp_filtered_row_number(select_from_inner, select_from_as_inner):
        ts_row_numbers = SelectQuery()
        ts_row_numbers.select_from(select_from_inner, alias=select_from_as_inner)

        self._select_columns_list(
            ts_row_numbers, column_names=self.similarity_computation_columns, table_name=select_from_as_inner
        )

        ts_row_number_expression = (
            Expression()
            .rowNumber()
            .over(
                Window(
                    partition_by=[Column(self.based_column, table_name=select_from_as_inner)],
                    order_by=[
                        Column(self.dku_config.timestamps_column_name, table_name=select_from_as_inner),
                        Column(self.pivot_column, table_name=select_from_as_inner),
                    ],
                    order_types=["DESC", "DESC"],
                    mode=None,
                )
            )
        )
        ts_row_numbers.select(ts_row_number_expression, alias=self.TIMESTAMP_FILTERED_ROW_NB)
        return ts_row_numbers

    built_ts_row_numbers = _build_timestamp_filtered_row_number(select_from, select_from_as)

    ts_row_numbers_alias = "_ts_row_numbers"
    timestamp_filtered = SelectQuery()
    timestamp_filtered.select_from(built_ts_row_numbers, alias=ts_row_numbers_alias)

    self._select_columns_list(
        timestamp_filtered, column_names=self.similarity_computation_columns, table_name=ts_row_numbers_alias
    )
    timestamp_filtered.where(
        Column(self.TIMESTAMP_FILTERED_ROW_NB, table_name=ts_row_numbers_alias).le(
            Constant(self.dku_config.top_n_most_recent)
        )
    )
    return timestamp_filtered

def make_max_time_interval_query(timestamp_column, resolved_ref_date, dataset):
    is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys()
    max_time_interval = Constant(resolved_ref_date).minus(Column(timestamp_column)).extract(TimeUnit.DAY).max()

    query = SelectQuery()
    query.select(max_time_interval, alias="max_time_interval")  # TODO naming
    if is_hdfs:
        query.select_from('_'.join(dataset.name.split('.')))
    else:
        query.select_from(dataset)

    dialect_handler = dialectHandler(dataset)
    return dialect_handler.convertToSQL(query)

def _build_top_n(self, select_from, select_from_as="_row_number_table"):
    top_n = SelectQuery()
    top_n.select_from(select_from, alias=select_from_as)

    columns_to_select = [f"{self.based_column}_1", f"{self.based_column}_2", constants.SIMILARITY_COLUMN_NAME]
    self._select_columns_list(top_n, column_names=columns_to_select, table_name=select_from_as)

    top_n.where(
        Column(self.ROW_NUMBER_AS, table_name=select_from_as).le(Constant(self.dku_config.top_n_most_similar))
    )
    return top_n

def _build_sum_of_similarity_scores(
    self, top_n, normalization_factor, top_n_as="_top_n", normalization_factor_as="_normalization_factor"
):
    cf_scores = SelectQuery()
    cf_scores.select_from(top_n, alias=top_n_as)

    join_condition = Column(f"{self.based_column}_2", top_n_as).eq_null_unsafe(
        Column(self.based_column, normalization_factor_as)
    )
    cf_scores.join(normalization_factor, JoinTypes.INNER, join_condition, alias=normalization_factor_as)

    cf_scores.group_by(Column(f"{self.based_column}_1", table_name=top_n_as))
    cf_scores.group_by(Column(self.pivot_column, table_name=normalization_factor_as))

    cf_scores.select(Column(f"{self.based_column}_1", table_name=top_n_as), alias=self.based_column)
    cf_scores.select(Column(self.pivot_column, table_name=normalization_factor_as))
    cf_scores.select(
        self._get_user_item_similarity_formula(top_n_as, normalization_factor_as),
        alias=constants.SCORE_COLUMN_NAME,
    )

    cf_scores.order_by(Column(self.based_column))
    cf_scores.order_by(Column(constants.SCORE_COLUMN_NAME), direction="DESC")
    return cf_scores

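# Illustrative sketch only: one way the builders in this class might be chained, inferred from
# their default aliases. The method name "build_collaborative_filtering_scores" and the
# "prepared_input" argument are hypothetical; everything else is defined in this file.
def build_collaborative_filtering_scores(self, prepared_input):
    # Each builder returns a SelectQuery that the next step consumes through select_from(),
    # so the whole pipeline ends up as one nested query.
    visit_count = self._build_visit_count(prepared_input)
    normalization_factor = self._build_normalization_factor(visit_count)
    ordered_similarity = self._build_ordered_similarity(normalization_factor)
    similarity_matrix = self._build_unordered_similarity(ordered_similarity)
    row_numbers = self._build_row_numbers(similarity_matrix)
    top_n = self._build_top_n(row_numbers)
    return self._build_sum_of_similarity_scores(top_n, normalization_factor)
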
def _build_unordered_similarity(
    self,
    select_from,
    left_select_from_as="_left_ordered_similarity",
    right_select_from_as="_right_ordered_similarity",
    with_clause_as="_with_clause_ordered_similarity",
):
    """Retrieve both pairs (col_1 < col_2 and col_1 > col_2) from the ordered similarity table."""
    similarity = SelectQuery()

    if self.supports_with_clause:
        similarity.with_cte(select_from, alias=with_clause_as)
        select_from = with_clause_as

    similarity.select_from(select_from, alias=left_select_from_as)

    # Always-false join condition: the FULL join keeps every row of both sides unmatched,
    # so each ordered pair appears once per side and is mirrored by the coalesce below
    join_condition = Constant(1).eq_null_unsafe(Constant(0))
    similarity.join(select_from, JoinTypes.FULL, join_condition, alias=right_select_from_as)

    similarity.select(
        Column(f"{self.based_column}_1", table_name=left_select_from_as).coalesce(
            Column(f"{self.based_column}_2", table_name=right_select_from_as)
        ),
        alias=f"{self.based_column}_1",
    )
    similarity.select(
        Column(f"{self.based_column}_2", table_name=left_select_from_as).coalesce(
            Column(f"{self.based_column}_1", table_name=right_select_from_as)
        ),
        alias=f"{self.based_column}_2",
    )
    similarity.select(
        Column("similarity", table_name=left_select_from_as).coalesce(
            Column("similarity", table_name=right_select_from_as)
        ),
        alias=constants.SIMILARITY_COLUMN_NAME,
    )
    return similarity

def _check_populations(self):
    if self.params.populations_mode == PopulationsDefinitionMode.MANUAL:
        if len(self.params.populations_dict) > 0:
            populations_values = self.params.populations_dict.values()
        else:
            populations_values = [[]]
    else:  # brute force (or None?)
        # Loop over populations_dict and build one list per granularity column. For example, with a
        # granularity dict like {'event_type': ['buy_order'], 'event_dayofweek': [1, 2, 3]} we get
        # [['buy_order'], [1, 2, 3]], and we then take every combination of those lists
        # (buy_order and 1, buy_order and 2, ...).
        populations_values = [
            zip(self.params.populations_dict, prod)
            for prod in itertools.product(
                *(self.params.populations_dict[key] for key in self.params.populations_dict))
        ]
        # In the end we have something like:
        # [[('event_dayofweek', ['1', '2', '3']), ('event_type', ['buy_order'])],
        #  [('event_dayofweek', ['4', '5']), ('event_type', ['buy_order'])], ...]

    self.params.populations = []
    for population_combination in populations_values:
        print('POPULATION: ', population_combination)
        conditions = []
        # For example, population_combination = [('event_type', ['buy_order']), ('event_dayofweek', ['1', '2', '3'])]
        for combination_val in population_combination:
            condition_column = combination_val[0]
            single_cond_expr = Column(condition_column).in_(List(*combination_val[1]))
            conditions.append(single_cond_expr)
        if len(conditions) > 0:
            population = conditions[0]
            for condition_expr in conditions[1:]:
                population = population.and_(condition_expr)
            self.params.populations.append(population)
        else:
            self.params.populations.append(None)

    logger.info("There are {} population(s)".format(len(self.params.populations)))

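# Rough sketch of the brute-force combination above, runnable outside the class. It assumes each
# entry of populations_dict maps a granularity column to a list of value groups (the structure
# implied by the final shape shown in the comment above); the dict contents here are made up.
import itertools

populations_dict = {"event_type": [["buy_order"]], "event_dayofweek": [["1", "2", "3"], ["4", "5"]]}
combinations = [
    list(zip(populations_dict, prod))
    for prod in itertools.product(*(populations_dict[key] for key in populations_dict))
]
# combinations ==
# [[('event_type', ['buy_order']), ('event_dayofweek', ['1', '2', '3'])],
#  [('event_type', ['buy_order']), ('event_dayofweek', ['4', '5'])]]
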
def _build_visit_count(self, select_from, select_from_as="_filtered_input_dataset"):
    # total user and item visits
    visit_count = SelectQuery()
    visit_count.select_from(select_from, alias=select_from_as)

    self._select_columns_list(
        visit_count,
        column_names=self.similarity_computation_columns + self.filtering_columns,
        table_name=select_from_as,
    )
    visit_count.select(
        Column("*").count().over(Window(partition_by=[Column(self.dku_config.users_column_name)], mode=None)),
        alias=self.NB_VISIT_USER_AS,
    )
    visit_count.select(
        Column("*").count().over(Window(partition_by=[Column(self.dku_config.items_column_name)], mode=None)),
        alias=self.NB_VISIT_ITEM_AS,
    )
    if self.use_explicit:
        visit_count.select(
            Column(self.dku_config.ratings_column_name).avg().over(
                Window(partition_by=[Column(self.based_column)], mode=None)
            ),
            alias=self.RATING_AVERAGE,
        )
        self.similarity_computation_columns += [self.RATING_AVERAGE]
    return visit_count

operator = get_recipe_config()['operator']
columns_left = get_recipe_config()['columns_1']
columns_right = get_recipe_config()['columns_2']

#############################
# Original recipe
#############################

# Start the loop
joins = ['LEFT', 'RIGHT', 'INNER']
join_conds = []
for key in range(len(key_a)):
    join_cond = Expression()
    globals()['join_cond_' + str(key)] = join_cond.and_(
        Column(key_a[key], input_A_names[0]).eq_null_unsafe(Column(key_b[key], input_B_names[0])))
    join_conds.append(globals()['join_cond_' + str(key)])

for i in joins:
    query = SelectQuery()
    query.select_from(input_A_datasets[0], alias=input_A_names[0])
    for j in columns_left:
        query.select(Column(j, input_A_names[0]), alias=j)
    for k in columns_right:
        query.select(Column(k, input_B_names[0]), alias=k)
    query.join(input_B_datasets[0], i, join_conds, operatorBetweenConditions=operator, alias=input_B_names[0])
    globals()['sql_' + str(i)] = toSQL(query, input_A_datasets[0])

e = SQLExecutor2()
e.exec_recipe_fragment(output_A_datasets[0], sql_LEFT)

toSQL(eb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Simple select
sb = SelectQuery()
sb.select_from(Table('myTable'))

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Join in fluent style
sb = SelectQuery() \
    .select(Constant('Hello')) \
    .select_from('t1') \
    .join('t2', JoinTypes.INNER, Column('c', 't1').eq(Column('c', 't2')))

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Group by and limit
sb = SelectQuery()
sb.select_from('myTable')
sb.group_by(Column('groupCol'))
sb.limit(1000)

toSQL(sb, Dialects.MYSQL)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Where clause and select distinct
sb = SelectQuery()
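
# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Illustrative extra cell (not from the original notebook): a window function in the same fluent
# style, using only constructs that appear elsewhere in this file (Expression().rowNumber().over(...),
# Window, order_types, toSQL). The table and column names are made up.
sb = SelectQuery()
sb.select_from('events')
sb.select(Column('user_id'))
sb.select(
    Expression().rowNumber().over(
        Window(
            partition_by=[Column('user_id')],
            order_by=[Column('event_timestamp')],
            order_types=['DESC'],
            mode=None,
        )
    ),
    alias='event_rank',
)

toSQL(sb, Dialects.MYSQL)
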
def _select_columns_list(self, select_query, column_names, table_name=None):
    for col_name in column_names:
        select_query.select(Column(col_name, table_name=table_name))