def make_distinct_values_query(column, dataset): is_hdfs = 'hiveTableName' in dataset.get_config().get('params').keys() column = Column(column) layer_1 = SelectQuery() layer_1.select(column) layer_1.select(Column('*').count(), 'count') if is_hdfs: layer_1.select_from('_'.join(dataset.name.split('.'))) else: layer_1.select_from(dataset) layer_1.group_by(column) count = Column('count') layer_2 = SelectQuery() layer_2.select(column) layer_2.select( count.div(count.sum().over(Window())).times(100), 'distribution') layer_2.select_from(layer_1, alias='layer_1') dialect_handler = dialectHandler(dataset) return dialect_handler.convertToSQL( layer_2) #toSQL(query, dialect='Hive')#dataset=dataset)
def _build_ordered_similarity( self, select_from, with_clause_as="_with_clause_normalization_factor"): """Build a similarity table col_1, col_2, similarity where col_1 < col_2 """ similarity = SelectQuery() if self.supports_with_clause: similarity.with_cte(select_from, alias=with_clause_as) select_from = with_clause_as similarity.select_from(select_from, alias=self.LEFT_NORMALIZATION_FACTOR_AS) join_conditions = [ Column(self.pivot_column, self.LEFT_NORMALIZATION_FACTOR_AS).eq_null_unsafe( Column(self.pivot_column, self.RIGHT_NORMALIZATION_FACTOR_AS)) ] if self.supports_full_outer_join: join_conditions += [ Column(self.based_column, self.LEFT_NORMALIZATION_FACTOR_AS).lt( Column(self.based_column, self.RIGHT_NORMALIZATION_FACTOR_AS)) ] else: join_conditions += [ Column(self.based_column, self.LEFT_NORMALIZATION_FACTOR_AS).ne( Column(self.based_column, self.RIGHT_NORMALIZATION_FACTOR_AS)) ] similarity.join(select_from, JoinTypes.INNER, join_conditions, alias=self.RIGHT_NORMALIZATION_FACTOR_AS) similarity.group_by( Column(self.based_column, table_name=self.LEFT_NORMALIZATION_FACTOR_AS)) similarity.group_by( Column(self.based_column, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS)) similarity.select(Column(self.based_column, table_name=self.LEFT_NORMALIZATION_FACTOR_AS), alias=f"{self.based_column}_1") similarity.select(Column( self.based_column, table_name=self.RIGHT_NORMALIZATION_FACTOR_AS), alias=f"{self.based_column}_2") similarity.select(self._get_similarity_formula(), alias=constants.SIMILARITY_COLUMN_NAME) return similarity
def _build_samples_with_only_positives(inner_select_from, inner_select_from_as=ONLY_POSITIVE_TABLE_NAME): samples_with_only_positives = SelectQuery() samples_with_only_positives.select_from(inner_select_from, inner_select_from_as) samples_with_only_positives.select(Column(self.dku_config.users_column_name)) samples_with_only_positives.select(Column("*").count(), alias=NB_POSITIVE_PER_USER) samples_with_only_positives.where(Column(constants.TARGET_COLUMN_NAME).eq(Constant(1))) samples_with_only_positives.group_by(Column(self.dku_config.users_column_name)) return samples_with_only_positives
def _build_sum_of_similarity_scores( self, top_n, normalization_factor, top_n_as="_top_n", normalization_factor_as="_normalization_factor"): cf_scores = SelectQuery() cf_scores.select_from(top_n, alias=top_n_as) join_condition = Column(f"{self.based_column}_2", top_n_as).eq_null_unsafe( Column(self.based_column, normalization_factor_as)) cf_scores.join(normalization_factor, JoinTypes.INNER, join_condition, alias=normalization_factor_as) cf_scores.group_by( Column(f"{self.based_column}_1", table_name=top_n_as)) cf_scores.group_by( Column(self.pivot_column, table_name=normalization_factor_as)) cf_scores.select(Column(f"{self.based_column}_1", table_name=top_n_as), alias=self.based_column) cf_scores.select( Column(self.pivot_column, table_name=normalization_factor_as)) cf_scores.select(self._get_user_item_similarity_formula( top_n_as, normalization_factor_as), alias=constants.SCORE_COLUMN_NAME) cf_scores.order_by(Column(self.based_column)) cf_scores.order_by(Column(constants.SCORE_COLUMN_NAME), direction="DESC") return cf_scores
toSQL(sb, Dialects.MYSQL) # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE # join in fluent style sb = SelectQuery() \ .select(Constant('Hello')) \ .select_from('t1') \ .join('t2', JoinTypes.INNER, Column('c', 't1').eq(Column('c', 't2'))) toSQL(sb, Dialects.MYSQL) # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE # Group by and limit sb = SelectQuery() sb.select_from('myTable') sb.group_by(Column('groupCol')) sb.limit(1000) toSQL(sb, Dialects.MYSQL) # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE # Where clause and select distinct sb = SelectQuery() sb.distinct() sb.select_from('myTable') sb.where(Column('wherecol').le(Constant(33))) toSQL(sb, Dialects.MYSQL) # -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE # Having clause