def max(self, src_col_name, new_col_name='max'):
    """
    Find the maximum of the values of a column.

    :param src_col_name: the column to find the max of its values
    :param new_col_name: the name of the new column holding the max
    :return: this dataset (``self``) with ``new_col_name`` added; a grouped
        aggregation is generated when ``src_col_name`` is a non-grouping
        column of the parent dataset, otherwise a plain aggregation
    :raises Exception: if ``src_col_name`` exists in neither this dataset
        nor its parent dataset
    """
    # A cached dataset delegates to its materialized copy.
    if self.cached:
        ds = self._cache_dataset()
        return ds.max(src_col_name, new_col_name)
    param = None
    # Accept columns known to this dataset OR to its parent. The previous
    # check looked at self.columns only, which rejected parent-only columns
    # even though the grouped branch below explicitly handles them; this
    # mirrors the validation already used by count().
    if src_col_name not in self.columns and src_col_name not in self.parent_dataset.columns:
        raise Exception("Aggregation column {} doesn't exist in this dataset".format(src_col_name))
    agg_col = src_col_name
    if agg_col not in self.grouping_cols and agg_col in self.parent_dataset.columns:
        # Per-group aggregation: MAX(agg_col) alongside the grouping columns.
        agg_node = GroupedAggregationOperator(self.name, agg_col, AggregationFunction.MAX, new_col_name, param)
        self.query_queue.append_node(agg_node)
        self.add_column(new_col_name)
        self.agg_columns.append(new_col_name)
    # TODO: Generate a subquery when running an aggregation over an
    # already-aggregated column after a groupby (currently unsupported).
    else:
        # TODO: Don't allow any more operations on the dataset
        # Ungrouped aggregation over the whole dataset.
        agg_node = AggregationOperator(self.name, agg_col, AggregationFunction.MAX, new_col_name, param)
        self.query_queue.append_node(agg_node)
        self.add_column(new_col_name)
        self.agg_columns.append(new_col_name)
    return self
def sum(self, src_col_name, new_col_name='sum'):
    """
    Find the sum of the values of a column.

    :param src_col_name: the column to find the sum of its values
    :param new_col_name: the name of the new column holding the sum
    :return: this dataset (``self``) with ``new_col_name`` added; a grouped
        aggregation is generated when ``src_col_name`` is a non-grouping
        column of the parent dataset, otherwise a plain aggregation
    :raises Exception: if ``src_col_name`` exists in neither this dataset
        nor its parent dataset
    """
    # A cached dataset delegates to its materialized copy.
    if self.cached:
        ds = self._cache_dataset()
        return ds.sum(src_col_name, new_col_name)
    param = None
    # Accept columns known to this dataset OR to its parent, matching the
    # validation used by count(); the grouped branch below explicitly
    # handles parent-only columns.
    if src_col_name not in self.columns and src_col_name not in self.parent_dataset.columns:
        raise Exception("Aggregation column {} doesn't exist in this dataset".format(src_col_name))
    agg_col = src_col_name
    if agg_col not in self.grouping_cols and agg_col in self.parent_dataset.columns:
        # Per-group aggregation: SUM(agg_col) alongside the grouping columns.
        agg_node = GroupedAggregationOperator(self.name, agg_col, AggregationFunction.SUM, new_col_name, param)
        self.query_queue.append_node(agg_node)
        self.add_column(new_col_name)
        self.agg_columns.append(new_col_name)
    else:
        # TODO: Don't allow any more operations on the dataset
        # Ungrouped aggregation over the whole dataset. Register the new
        # column with add_column() like max()/count() do, instead of
        # replacing self.columns wholesale (previous behavior was
        # inconsistent with the sibling aggregations).
        agg_node = AggregationOperator(self.name, agg_col, AggregationFunction.SUM, new_col_name, param)
        self.query_queue.append_node(agg_node)
        self.add_column(new_col_name)
        self.agg_columns.append(new_col_name)
    return self
def count(self, src_col_name=None, new_col_name='count', unique=False):
    """
    Count values, optionally per group.

    :param src_col_name: the column whose values are counted; when None,
        the size of the whole result set is counted instead
    :param new_col_name: the name of the new column holding the count
    :param unique: when True, count only distinct values (DISTINCT)
    :return: this dataset (``self``) with ``new_col_name`` added
    :raises Exception: if ``src_col_name`` is given but exists in neither
        this dataset nor its parent dataset
    """
    # A cached dataset delegates to its materialized copy.
    if self.cached:
        return self._cache_dataset().count(src_col_name, new_col_name, unique)

    param = "DISTINCT" if unique else None

    if src_col_name is None:
        # TODO: Don't allow any more operations on the dataset
        # No source column: emit a plain row-count over the result set.
        node = IntegerCountOperator(self.name, new_col_name, param)
    else:
        known = (src_col_name in self.columns
                 or src_col_name in self.parent_dataset.columns)
        if not known:
            raise Exception("Aggregation column {} doesn't exist in this dataset".format(src_col_name))
        if src_col_name not in self.grouping_cols and src_col_name in self.parent_dataset.columns:
            # Per-group aggregation: COUNT(src_col_name) per grouping key.
            node = GroupedAggregationOperator(self.name, src_col_name, AggregationFunction.COUNT, new_col_name, param)
        else:
            # TODO: Don't allow any more operations on the dataset
            # Ungrouped aggregation over the whole dataset.
            node = AggregationOperator(self.name, src_col_name, AggregationFunction.COUNT, new_col_name, param)

    # Shared bookkeeping: enqueue the operator and register the new column.
    self.query_queue.append_node(node)
    self.add_column(new_col_name)
    self.agg_columns.append(new_col_name)
    return self