Exemplo n.º 1
0
    def max(self, src_col_name, new_col_name='max'):
        """
        Queue a MAX aggregation over ``src_col_name``.

        :param src_col_name: the column whose maximum value is computed; must be one of ``self.columns``
        :param new_col_name: the name of the column that will hold the maximum
        :return: this dataset (``self``) after the aggregation node has been queued. When the dataset is cached,
            the call is forwarded to the dataset returned by ``_cache_dataset()`` and its result is returned.
        :raises Exception: if ``src_col_name`` is not in ``self.columns``
        """
        if self.cached:
            return self._cache_dataset().max(src_col_name, new_col_name)

        if src_col_name not in self.columns:
            raise Exception("Aggregation column {} doesn't exist in this dataset".format(src_col_name))

        # A grouping column, or a column unknown to the parent dataset, gets the plain
        # aggregation operator; any other column gets the grouped one.
        # TODO: Generate a subquery when running an aggregation over aggregated column after groupby
        # TODO: Don't allow any more operations on the dataset after a plain aggregation
        use_plain = src_col_name in self.grouping_cols or src_col_name not in self.parent_dataset.columns
        operator_cls = AggregationOperator if use_plain else GroupedAggregationOperator

        node = operator_cls(self.name, src_col_name, AggregationFunction.MAX, new_col_name, None)
        self.query_queue.append_node(node)
        self.add_column(new_col_name)
        self.agg_columns.append(new_col_name)
        return self
Exemplo n.º 2
0
    def sum(self, src_col_name, new_col_name='sum'):
        """
        Queue a SUM aggregation over ``src_col_name``.

        :param src_col_name: the column whose values are summed; must be one of ``self.columns``
        :param new_col_name: the name of the column that will hold the sum
        :return: this dataset (``self``) after the aggregation node has been queued. When the dataset is cached,
            the call is forwarded to the dataset returned by ``_cache_dataset()`` and its result is returned.
        :raises Exception: if ``src_col_name`` is not in ``self.columns``
        """
        if self.cached:
            return self._cache_dataset().sum(src_col_name, new_col_name)

        if src_col_name not in self.columns:
            raise Exception("Aggregation column {} doesn't exist in this dataset".format(src_col_name))

        # A grouping column, or a column unknown to the parent dataset, gets the plain
        # aggregation operator; any other column gets the grouped one.
        use_plain = src_col_name in self.grouping_cols or src_col_name not in self.parent_dataset.columns
        if use_plain:
            node = AggregationOperator(self.name, src_col_name, AggregationFunction.SUM, new_col_name, None)
            self.query_queue.append_node(node)
            # NOTE(review): this replaces the whole column list rather than calling
            # add_column() like the grouped branch does -- confirm this is intentional.
            self.columns = [src_col_name, new_col_name]
        else:
            node = GroupedAggregationOperator(self.name, src_col_name, AggregationFunction.SUM, new_col_name, None)
            self.query_queue.append_node(node)
            self.add_column(new_col_name)
        self.agg_columns.append(new_col_name)
        return self
Exemplo n.º 3
0
    def count(self, src_col_name=None, new_col_name='count', unique=False):
        """
        Queue a COUNT aggregation, either over one column or over the whole result.

        :param src_col_name: the column whose values are counted; when None an ``IntegerCountOperator`` is queued
            instead of a column aggregation
        :param new_col_name: the name of the column that will hold the count
        :param unique: if True the operator is created with the "DISTINCT" parameter, otherwise with None
        :return: this dataset (``self``) after the aggregation node has been queued. When the dataset is cached,
            the call is forwarded to the dataset returned by ``_cache_dataset()`` and its result is returned.
        :raises Exception: if ``src_col_name`` is given but is neither in ``self.columns`` nor in
            ``self.parent_dataset.columns``
        """
        if self.cached:
            return self._cache_dataset().count(src_col_name, new_col_name, unique)

        modifier = "DISTINCT" if unique else None

        # TODO: Don't allow any more operations on the dataset after a non-grouped count
        if src_col_name is None:
            node = IntegerCountOperator(self.name, new_col_name, modifier)
        else:
            if not (src_col_name in self.columns or src_col_name in self.parent_dataset.columns):
                raise Exception("Aggregation column {} doesn't exist in this dataset".format(src_col_name))
            # A grouping column, or a column unknown to the parent dataset, gets the plain
            # aggregation operator; any other column gets the grouped one.
            use_plain = src_col_name in self.grouping_cols or src_col_name not in self.parent_dataset.columns
            operator_cls = AggregationOperator if use_plain else GroupedAggregationOperator
            node = operator_cls(self.name, src_col_name, AggregationFunction.COUNT, new_col_name, modifier)

        self.query_queue.append_node(node)
        self.add_column(new_col_name)
        self.agg_columns.append(new_col_name)
        return self