def _update_aggregate_dataset(self, formula, new_dframe, name, groups,
                              agg_dataset):
    """Update the aggregated dataset built for `self` with `calculation`.

    Proceed with the following steps:

        - delete the rows in this dataset from the parent
        - recalculate aggregated dataframe from aggregation
        - update aggregated dataset with new dataframe and add parent id
        - recur on all merged datasets descending from the aggregated
          dataset

    :param formula: The formula to execute.
    :param new_dframe: The DataFrame to aggregate on.
    :param name: The name of the aggregation.
    :param groups: A column or columns to group on.
    :type groups: String, list of strings, or None.
    :param agg_dataset: The DataSet to store the aggregation in.
    """
    # parse the aggregation formula and build the new column arguments
    aggregation, new_columns = self.make_columns(formula, name, new_dframe)

    aggregator = Aggregator(self.dataset, self.dframe, groups, aggregation,
                            name)
    updated_agg_dframe = aggregator.update(agg_dataset, self, formula,
                                           new_columns)

    # serialize the refreshed aggregate frame once and reuse it for every
    # merged child below
    updated_data = updated_agg_dframe.to_jsondict()

    for child_dataset in agg_dataset.merged_datasets:
        # drop the stale rows this aggregate contributed to the merged child
        child_dataset.remove_parent_observations(agg_dataset.dataset_id)

        # recompute the child's calculations asynchronously from the new data
        child_calculator = Calculator(child_dataset)
        call_async(child_calculator.calculate_updates, child_calculator,
                   updated_data, parent_dataset_id=agg_dataset.dataset_id)
def calculate_column(self, formula, name, groups=None):
    """Calculate a new column based on `formula` store as `name`.

    The new column is joined to `dframe` and stored in `self.dataset`.
    The `group_str` is only applicable to aggregations and groups for
    aggregations.

    .. note::

        This can result in race-conditions when:

        - deleting ``controllers.Datasets.DELETE``
        - updating ``controllers.Datasets.POST([dataset_id])``

        Therefore, perform these actions asychronously.

    :param formula: The formula parsed by `self.parser` and applied to
        `self.dframe`.
    :param name: The name of the new column or aggregate column.
    :param groups: A list of columns to group on for aggregate
        calculations.
    """
    self._ensure_dframe()

    aggregation, new_columns = self.make_columns(formula, name)

    if not aggregation:
        # plain calculation: join the single new column onto the current
        # dframe and persist the result
        joined = self.dframe.join(new_columns[0])
        self.dataset.replace_observations(joined)
    else:
        # aggregate calculation: delegate persistence to the Aggregator
        aggregator = Aggregator(self.dataset, self.dataset.dframe(),
                                groups, aggregation, name)
        aggregator.save(new_columns)

    # propagate calculation to any merged child datasets
    for child_dataset in self.dataset.merged_datasets:
        Calculator(child_dataset).propagate_column(self.dataset)
def __create_aggregator(dataset, formula, name, groups, dframe=None):
    """Build an Aggregator for `formula` over `dataset`.

    Parses the aggregation and its column arguments, fetches a dframe
    restricted to only the columns the aggregation needs, and returns an
    Aggregator wired up with them.

    :param dataset: The dataset to aggregate over.
    :param formula: The aggregation formula to parse.
    :param name: The name for the aggregated column.
    :param groups: Columns to group on.
    :param dframe: Optional dframe used when parsing columns.
    """
    # TODO this should work with index eventually
    columns = parse_columns(dataset, formula, name, dframe, no_index=True)

    dependent_columns = Parser.dependent_columns(formula, dataset)
    aggregation = Parser.parse_aggregation(formula)

    # restrict the fetched dframe to the group and dependent columns only
    projection = combine_dicts(
        {group: 1 for group in groups},
        {column: 1 for column in dependent_columns})

    # ensure at least one column (MONGO_ID) for the count aggregation
    query_args = QueryArgs(select=projection or {MONGO_ID: 1})
    dframe = dataset.dframe(query_args=query_args,
                            keep_mongo_keys=not projection)

    return Aggregator(dframe, groups, aggregation, name, columns)