def summarize(dataset, dframe, groups, no_cache, update=False):
    """Summarize dframe, optionally grouped, using the dataset's stats cache.

    Raises a ColumnTypeError if grouping on a non-dimensional column.

    :param dataset: The dataset the dframe belongs to; holds the stats cache.
    :param dframe: The dframe to summarize.
    :param groups: Columns to group on (may be empty).
    :param no_cache: If True, ignore and do not write the cached stats.
    :param update: If True, recompute and merge into any cached stats.
    """
    # Grouping is only meaningful on dimensional (factor) columns; reject
    # the first numeric one found, preserving iteration order.
    non_factors = [g for g in groups if not dataset.is_factor(g)]
    if non_factors:
        raise ColumnTypeError(
            "group: '%s' is not a dimension." % non_factors[0])

    group_str = dataset.join_groups(groups) or dataset.ALL

    # Reuse cached stats when allowed; otherwise (re)compute them.
    stats = dataset.stats
    group_stats = stats.get(group_str)

    if no_cache or update or not group_stats:
        if groups:
            group_stats = summarize_with_groups(dframe, groups, dataset)
        else:
            group_stats = summarize_df(dframe, dataset)

        if not no_cache:
            if update:
                # Merge the freshly computed stats over whatever was cached.
                group_stats = combine_dicts(
                    stats.get(group_str, {}), group_stats)
            stats.update({group_str: group_stats})
            dataset.update({dataset.STATS: dict_for_mongo(stats)})

    stats_dict = dict_from_mongo(group_stats)

    # Grouped summaries are keyed by the joined group string.
    return {group_str: stats_dict} if groups else stats_dict
def summarize(dataset, dframe, groups, no_cache, update=False):
    """Return summary stats for dframe, cached on the dataset per group key.

    Raises a ColumnTypeError if grouping on a non-dimensional column.

    :param dataset: Dataset owning the stats cache.
    :param dframe: The dframe to summarize.
    :param groups: Columns to group on (may be empty).
    :param no_cache: If True, bypass the cache for both read and write.
    :param update: If True, recompute and merge into the cached entry.
    """
    for column in groups:
        # Numeric (non-factor) columns cannot be grouped on.
        if not dataset.is_factor(column):
            raise ColumnTypeError(
                "group: '%s' is not a dimension." % column)

    key = dataset.join_groups(groups) or dataset.ALL

    # Consult the cached stats for this group key and refresh as needed.
    cached = dataset.stats
    result = cached.get(key)

    if no_cache or not result or update:
        result = (summarize_with_groups(dframe, groups, dataset)
                  if groups else summarize_df(dframe, dataset))

        if not no_cache:
            if update:
                # Layer the new stats on top of the previously cached ones.
                previous = cached.get(key, {})
                result = combine_dicts(previous, result)
            cached.update({key: result})
            dataset.update({dataset.STATS: dict_for_mongo(cached)})

    summary = dict_from_mongo(result)
    if groups:
        summary = {key: summary}

    return summary
def encode(self, encoding, query):
    """Encode query, order_by, and select given an encoding.

    The query will be combined with the existing query.

    :param encoding: A dict to encode the QueryArgs fields with.
    :param query: An additional dict to combine with the existing query.
    """
    merged_query = combine_dicts(self.query, query)
    self.query = replace_keys(merged_query, encoding)

    # Only re-encode order_by/select when they are set; falsy values are
    # left exactly as they were.
    if self.order_by:
        self.order_by = replace_keys(dict(self.order_by), encoding).items()

    if self.select:
        self.select = replace_keys(self.select, encoding)
def __create_aggregator(dataset, formula, name, groups, dframe=None):
    """Build an Aggregator for formula over the dataset's grouped columns."""
    # TODO this should work with index eventually
    columns = parse_columns(dataset, formula, name, dframe, no_index=True)
    dependent_columns = Parser.dependent_columns(formula, dataset)
    aggregation = Parser.parse_aggregation(formula)

    # Restrict the fetched dframe to only the columns the aggregation needs:
    # the grouping columns plus the formula's dependent columns.
    select = {}
    for group in groups:
        select[group] = 1
    for column in dependent_columns:
        select[column] = 1

    # Ensure at least one column (MONGO_ID) for the count aggregation.
    query_args = QueryArgs(select=select or {MONGO_ID: 1})
    dframe = dataset.dframe(query_args=query_args,
                            keep_mongo_keys=not select)

    return Aggregator(dframe, groups, aggregation, name, columns)
def update(cls, dataset, index, record):
    """Update a dataset row by index.

    The record dictionary will update, not replace, the data in the row at
    index.

    :param dataset: The dataset to update a row for.
    :param index: The index of the row to update.
    :param record: The dictionary to update the row with.
    """
    existing = cls.find_one(dataset, index).record
    existing.pop(MONGO_ID)

    # Merge the new values over the existing row, re-run calculations,
    # then encode the merged record for storage.
    merged = combine_dicts(existing, record)
    merged = update_calculations(merged, dataset)
    merged = cls.encode(merged, dataset=dataset)

    # Replace the stored row: delete the old one, save the merged one.
    cls.delete(dataset, index)
    super(cls, cls()).save(merged)
def summarize(self, dframe, groups=None, no_cache=False, update=False,
              flat=False):
    """Build and return a summary of the data in this dataset.

    Return a summary of dframe grouped by `groups`, or the overall summary
    if no groups are specified.

    :param dframe: dframe to summarize
    :param groups: A list of columns to group on (default: no grouping).
    :param no_cache: Do not fetch a cached summary.
    :param update: Recompute and merge into any cached summary.
    :param flat: Return a flattened list of groups.

    :returns: A summary of the dataset as a dict. Numeric columns will be
        summarized by the arithmetic mean, standard deviation, and
        percentiles. Dimensional columns will be summarized by counts.
    """
    # Fix: the original used a mutable default argument (groups=[]).
    # Use a None sentinel instead; behavior for all callers is unchanged.
    if groups is None:
        groups = []

    self.reload()
    summary = summarize(self, dframe, groups, no_cache, update=update)

    if flat:
        # Flatten {group_str: {group_values: data}} into a list of dicts,
        # one per group-value combination.
        flat_summary = []

        for cols, v in summary.iteritems():
            cols = self.split_groups(cols)

            for k, data in v.iteritems():
                col_values = self.split_groups(k)
                # Strip the delimiter pattern and surrounding characters
                # from each encoded group value.
                col_values = [
                    strip_pattern.sub(',', i)[1:-1] for i in col_values
                ]
                flat_summary.append(
                    combine_dicts(dict(zip(cols, col_values)), data))

        summary = flat_summary

    return summary
def summarize(self, dframe, groups=None, no_cache=False, update=False,
              flat=False):
    """Build and return a summary of the data in this dataset.

    Return a summary of dframe grouped by `groups`, or the overall summary
    if no groups are specified.

    :param dframe: dframe to summarize
    :param groups: A list of columns to group on (default: no grouping).
    :param no_cache: Do not fetch a cached summary.
    :param update: Recompute and merge into any cached summary.
    :param flat: Return a flattened list of groups.

    :returns: A summary of the dataset as a dict. Numeric columns will be
        summarized by the arithmetic mean, standard deviation, and
        percentiles. Dimensional columns will be summarized by counts.
    """
    # Fix: avoid the mutable default argument (was groups=[]); a None
    # sentinel gives identical behavior without sharing state across calls.
    groups = groups if groups is not None else []

    self.reload()
    summary = summarize(self, dframe, groups, no_cache, update=update)

    if flat:
        # Turn the nested {group_str: {group_values: data}} mapping into a
        # flat list with one dict per group-value combination.
        flat_summary = []

        for cols, v in summary.iteritems():
            cols = self.split_groups(cols)

            for k, data in v.iteritems():
                col_values = self.split_groups(k)
                # Strip the delimiter pattern and surrounding characters
                # from each encoded group value.
                col_values = [strip_pattern.sub(',', i)[1:-1]
                              for i in col_values]
                flat_summary.append(
                    combine_dicts(dict(zip(cols, col_values)), data))

        summary = flat_summary

    return summary
def test_parse_formula_dependent_columns(self):
    """Dependent-column detection covers aggregate and plain calculations."""
    all_formulas = combine_dicts(AGG_CALCS_TO_DEPS, CALCS_TO_DEPS)

    for formula, expected_deps in all_formulas.iteritems():
        actual = Parser.dependent_columns(formula, self.dataset)
        self.assertEqual(set(expected_deps), actual)
def test_parse_formula_dependent_columns(self):
    """Each known formula parses to exactly its expected dependent columns."""
    all_formulas = combine_dicts(AGG_CALCS_TO_DEPS, CALCS_TO_DEPS)

    for formula, expected_deps in all_formulas.iteritems():
        functions, dependent_columns = self.parser.parse_formula(formula)
        self.assertEqual(set(expected_deps), dependent_columns)