def category_index(datasets): """ Get a list of categories by count of datasets """ # Get the dataset ids in the current list of datasets ds_ids = [d.id for d in datasets] if len(ds_ids): # If we have dataset ids we count the dataset by category q = db.select( [Dataset.category, db.func.count(Dataset.id)], Dataset.id.in_(ds_ids), group_by=Dataset.category, order_by=db.func.count(Dataset.id).desc()) # Execute the queery to the the list of categories categories = db.session.bind.execute(q).fetchall() # Return a list of categories as dicts with category, count, url # and label return [{ 'category': category, 'count': count, 'url': h.url_for(controller='dataset', action='index', category=category), 'label': CATEGORIES.get(category, category) } for (category, count) in categories if category is not None] # We return an empty string if no datasets found return []
def dataset_counts(cls, datasets): ds_ids = [d.id for d in datasets] q = db.select([cls.code, db.func.count(cls.dataset_id)], cls.dataset_id.in_(ds_ids), group_by=cls.code, order_by=db.func.count(cls.dataset_id).desc()) return db.session.bind.execute(q).fetchall()
def times(self, attribute='year'): """ Get all distinct times mentioned in the dataset. """ # TODO: make this a more generic distinct_attribute function field = self['time'][attribute].column_alias query = db.select([field.label(attribute)], self['time'].alias, distinct=True) rp = self.bind.execute(query) return sorted([r[attribute] for r in rp.fetchall()])
def dataset_counts(cls, datasets): ds_ids = [d.id for d in datasets] if not len(ds_ids): return [] q = db.select([cls.code, db.func.count(cls.dataset_id)], cls.dataset_id.in_(ds_ids), group_by=cls.code, order_by=db.func.count(cls.dataset_id).desc()) return db.session.bind.execute(q).fetchall()
def num_entries(self, conditions="1=1"): """ Return the count of entries on the dataset fact table having the dimension set to a value matching the filter given by ``conditions``. """ query = db.select([db.func.count(db.func.distinct(self.column_alias))], conditions) rp = self.dataset.bind.execute(query) return rp.fetchone()[0]
def entries(self, conditions="1=1", order_by=None, limit=None, offset=0, step=10000): """ Generate a fully denormalized view of the entries on this table. This view is nested so that each dimension will be a hash of its attributes. This is somewhat similar to the entries collection in the fully denormalized schema before OpenSpending 0.11 (MongoDB). """ if not self.is_generated: return joins = self.alias for d in self.dimensions: joins = d.join(joins) selects = [f.selectable for f in self.fields] + [self.alias.c.id] # enforce stable sorting: if order_by is None: order_by = [self.alias.c.id.asc()] for i in count(): qoffset = offset + (step * i) qlimit = step if limit is not None: qlimit = min(limit - (step * i), step) if qlimit <= 0: break query = db.select(selects, conditions, joins, order_by=order_by, use_labels=True, limit=qlimit, offset=qoffset) rp = self.bind.execute(query) first_row = True while True: row = rp.fetchone() if row is None: if first_row: return break first_row = False result = {} for k, v in row.items(): field, attr = k.split('_', 1) field = field.replace(ALIAS_PLACEHOLDER, '_') if field == 'entry': result[attr] = v else: if not field in result: result[field] = dict() # TODO: backwards-compat? if isinstance(self[field], CompoundDimension): result[field]['taxonomy'] = \ self[field].taxonomy result[field][attr] = v yield result
def entries(self, conditions="1=1", order_by=None, limit=None, offset=0, step=10000, fields=None): """ Generate a fully denormalized view of the entries on this table. This view is nested so that each dimension will be a hash of its attributes. This is somewhat similar to the entries collection in the fully denormalized schema before OpenSpending 0.11 (MongoDB). """ if not self.is_generated: return if fields is None: fields = self.fields joins = self.alias for d in self.dimensions: if d in fields: joins = d.join(joins) selects = [f.selectable for f in fields] + [self.alias.c.id] # enforce stable sorting: if order_by is None: order_by = [self.alias.c.id.asc()] for i in count(): qoffset = offset + (step * i) qlimit = step if limit is not None: qlimit = min(limit - (step * i), step) if qlimit <= 0: break query = db.select(selects, conditions, joins, order_by=order_by, use_labels=True, limit=qlimit, offset=qoffset) rp = self.bind.execute(query) first_row = True while True: row = rp.fetchone() if row is None: if first_row: return break first_row = False yield decode_row(row, self)
def members(self, conditions="1=1", limit=None, offset=0): """ Get a listing of all the members of the dimension (i.e. all the distinct values) matching the filter in ``conditions``. """ query = db.select([self.column_alias], conditions, limit=limit, offset=offset, distinct=True) rp = self.dataset.bind.execute(query) while True: row = rp.fetchone() if row is None: break yield row[0]
def territories(self): q = db.select([DatasetTerritory.code, db.func.count(DatasetTerritory.dataset_id)], group_by=DatasetTerritory.code, order_by=db.func.count(DatasetTerritory.dataset_id).desc()) result = {} for territory, count in db.session.bind.execute(q).fetchall(): result[territory] = {'count': count, 'label': h.COUNTRIES[territory], 'url': h.url_for(controller='dataset', action='index', territories=territory)} return to_jsonp(result)
def members(self, conditions="1=1", limit=0, offset=0): """ Get a listing of all the members of the dimension (i.e. all the distinct values) matching the filter in ``conditions``. This can also be used to find a single individual member, e.g. a dimension value identified by its name. """ query = db.select([self.alias], conditions, limit=limit, offset=offset) rp = self.dataset.bind.execute(query) while True: row = rp.fetchone() if row is None: break member = dict(row.items()) member['taxonomy'] = self.taxonomy yield member
def members(self, conditions="1=1", limit=None, offset=0): """ Get a listing of all the members of the dimension (i.e. all the distinct values) matching the filter in ``conditions``. This can also be used to find a single individual member, e.g. a dimension value identified by its name. """ query = db.select([self.alias], conditions, limit=limit, offset=offset) rp = self.dataset.bind.execute(query) while True: row = rp.fetchone() if row is None: break member = dict(row.items()) member['taxonomy'] = self.taxonomy yield member
def category_index(datasets): """ Get a list of categories by count of datasets """ # Get the dataset ids in the current list of datasets ds_ids = [d.id for d in datasets] if len(ds_ids): # If we have dataset ids we count the dataset by category q = db.select([Dataset.category, db.func.count(Dataset.id)], Dataset.id.in_(ds_ids), group_by=Dataset.category, order_by=db.func.count(Dataset.id).desc()) # Execute the queery to the the list of categories categories = db.session.bind.execute(q).fetchall() # Return a list of categories as dicts with category, count, url # and label return [{'category': category, 'count': count, 'url': h.url_for(controller='dataset', action='index', category=category), 'label': CATEGORIES.get(category, category)} for (category, count) in categories if category is not None] # We return an empty string if no datasets found return []
def aggregate(self, measures=['amount'], drilldowns=[], cuts=[], page=1, pagesize=10000, order=[]): """ Query the dataset for a subset of cells based on cuts and drilldowns. It returns a structure with a list of drilldown items and a summary about the slice cutted by the query. ``measures`` The numeric units to be aggregated over, defaults to [``amount``]. (type: `list`) ``drilldowns`` Dimensions to drill down to. (type: `list`) ``cuts`` Specification what to cut from the cube. This is a `list` of `two-tuples` where the first item is the dimension and the second item is the value to cut from. It is turned into a query where multible cuts for the same dimension are combined to an *OR* query and then the queries for the different dimensions are combined to an *AND* query. ``page`` Page the drilldown result and return page number *page*. type: `int` ``pagesize`` Page the drilldown result into page of size *pagesize*. type: `int` ``order`` Sort the result based on the dimension *sort_dimension*. This may be `None` (*default*) or a `list` of two-`tuples` where the first element is the *dimension* and the second element is the order (`False` for ascending, `True` for descending). Type: `list` of two-`tuples`. Raises: :exc:`ValueError` If a cube is not yet computed. Call :meth:`compute` to compute the cube. :exc:`KeyError` If a drilldown, cut or order dimension is not part of this cube or the order dimensions are not a subset of the drilldown dimensions. Returns: A `dict` containing the drilldown and the summary: {"drilldown": [ {"num_entries": 5545, "amount": 41087379002.0, "cofog1": {"description": "", "label": "Economic affairs"}}, ... ] "summary": {"amount": 7353306450299.0, "num_entries": 133612}} """ # Get the joins (aka alias) and the dataset joins = alias = self.alias dataset = self # Aggregation fields are all of the measures, so we create individual # summary fields with the sum function of SQLAlchemy fields = [db.func.sum(alias.c[m]).label(m) for m in measures] # We append an aggregation field that counts the number of entries fields.append(db.func.count(alias.c.id).label("entries")) # Create a copy of the statistics fields (for later) stats_fields = list(fields) # Create label map for time columns (year and month) for lookup # since they are found under the time attribute labels = { 'year': dataset['time']['year'].column_alias.label('year'), 'month': dataset['time']['yearmonth'].column_alias.label('month'), } # Get the dimensions we're interested in. These would be the drilldowns # and the cuts. For compound dimensions we are only interested in the # most significant one (e.g. for from.name we're interested in from) dimensions = drilldowns + [k for k, v in cuts] dimensions = [d.split('.')[0] for d in dimensions] # Loop over the dimensions as a set (to avoid multiple occurances) for dimension in set(dimensions): # If the dimension is year or month we're interested in 'time' if dimension in labels: dimension = 'time' # If the dimension table isn't in the automatic joins we add it if dimension not in [c.table.name for c in joins.columns]: joins = dataset[dimension].join(joins) # Drilldowns are performed using group_by SQL functions group_by = [] for key in drilldowns: # If drilldown is in labels we append its mapped column to fields if key in labels: column = labels[key] group_by.append(column) fields.append(column) else: # Get the column from the dataset column = dataset.key(key) # If the drilldown is a compound dimension or the columns table # is in the joins we're already fetching the column so we just # append it to fields and the group_by if '.' in key or column.table == alias: fields.append(column) group_by.append(column) else: # If not we add the column table to the fields and add all # of that tables columns to the group_by fields.append(column.table) for col in column.table.columns: group_by.append(col) # Cuts are managed using AND statements and we use a dict with set as # the default value to create the filters (cut on various values) conditions = db.and_() filters = defaultdict(set) for key, value in cuts: # If the key is in labels (year or month) we get the mapped column # else we get the column from the dataset if key in labels: column = labels[key] else: column = dataset.key(key) # We add the value to the set for that particular column filters[column].add(value) # Loop over the columns in the filter and add that to the conditions # For every value in the set we create and OR statement so we get e.g. # year=2007 AND (from.who == 'me' OR from.who == 'you') for attr, values in filters.items(): conditions.append(db.or_(*[attr == v for v in values])) # Ordering can be set by a parameter or ordered by measures by default order_by = [] # If no order is defined we default to order of the measures in the # order they occur (furthest to the left is most significant) if order is None or not len(order): order = [(m, True) for m in measures] # We loop through the order list to add the columns themselves for key, direction in order: # If it's a part of the measures we have to order by the # aggregated values (the sum of the measure) if key in measures: column = db.func.sum(alias.c[key]).label(key) # If it's in the labels we have to get the mapped column elif key in labels: column = labels[key] # ...if not we just get the column from the dataset else: column = dataset.key(key) # We append the column and set the direction (True == descending) order_by.append(column.desc() if direction else column.asc()) # query 1: get overall sums. # Here we use the stats_field we saved earlier query = db.select(stats_fields, conditions, joins) rp = dataset.bind.execute(query) # Execute the query and turn them to a list so we can pop the # entry count and then zip the measurements and the totals together stats = list(rp.fetchone()) num_entries = stats.pop() total = zip(measures, stats) # query 2: get total count of drilldowns if len(group_by): # Select 1 for each group in the group_by and count them query = db.select(['1'], conditions, joins, group_by=group_by) query = db.select([db.func.count('1')], '1=1', query.alias('q')) rp = dataset.bind.execute(query) num_drilldowns, = rp.fetchone() else: # If there are no drilldowns we still have to do one num_drilldowns = 1 # The drilldown result list drilldown = [] # The offset in the db, based on the page and pagesize (we have to # modify it since page counts starts from 1 but we count from 0 offset = int((page - 1) * pagesize) # query 3: get the actual data query = db.select(fields, conditions, joins, order_by=order_by, group_by=group_by, use_labels=True, limit=pagesize, offset=offset) rp = dataset.bind.execute(query) while True: # Get each row in the db result and append it, decoded, to the # drilldown result. The decoded version is a json represenation row = rp.fetchone() if row is None: break result = decode_row(row, dataset) drilldown.append(result) # Create the summary based on the stats_fields and other things # First we add a the total for each measurement in the root of the # summary (watch out!) and then we add various other, self-explanatory # statistics such as page, number of entries. The currency value is # strange since it's redundant for multiple measures but is left as is # for backwards compatibility summary = {key: value for (key, value) in total} summary.update({ 'num_entries': num_entries, 'currency': {m: dataset.currency for m in measures}, 'num_drilldowns': num_drilldowns, 'page': page, 'pages': int(math.ceil(num_drilldowns / float(pagesize))), 'pagesize': pagesize }) return {'drilldown': drilldown, 'summary': summary}
def aggregate(self, measure='amount', drilldowns=None, cuts=None, page=1, pagesize=10000, order=None): """ Query the dataset for a subset of cells based on cuts and drilldowns. It returns a structure with a list of drilldown items and a summary about the slice cutted by the query. ``measure`` The numeric unit to be aggregated over, defaults to ``amount``. ``drilldowns`` Dimensions to drill down to. (type: `list`) ``cuts`` Specification what to cut from the cube. This is a `list` of `two-tuples` where the first item is the dimension and the second item is the value to cut from. It is turned into a query where multible cuts for the same dimension are combined to an *OR* query and then the queries for the different dimensions are combined to an *AND* query. ``page`` Page the drilldown result and return page number *page*. type: `int` ``pagesize`` Page the drilldown result into page of size *pagesize*. type: `int` ``order`` Sort the result based on the dimension *sort_dimension*. This may be `None` (*default*) or a `list` of two-`tuples` where the first element is the *dimension* and the second element is the order (`False` for ascending, `True` for descending). Type: `list` of two-`tuples`. Raises: :exc:`ValueError` If a cube is not yet computed. Call :meth:`compute` to compute the cube. :exc:`KeyError` If a drilldown, cut or order dimension is not part of this cube or the order dimensions are not a subset of the drilldown dimensions. Returns: A `dict` containing the drilldown and the summary:: {"drilldown": [ {"num_entries": 5545, "amount": 41087379002.0, "cofog1": {"description": "", "label": "Economic affairs"}}, ... ] "summary": {"amount": 7353306450299.0, "num_entries": 133612}} """ cuts = cuts or [] drilldowns = drilldowns or [] order = order or [] joins = alias = self.alias dataset = self fields = [ db.func.sum(alias.c[measure]).label(measure), db.func.count(alias.c.id).label("entries") ] stats_fields = list(fields) labels = { 'year': dataset['time']['year'].column_alias.label('year'), 'month': dataset['time']['yearmonth'].column_alias.label('month'), } dimensions = drilldowns + [k for k, v in cuts] dimensions = [d.split('.')[0] for d in dimensions] for dimension in set(dimensions): if dimension in labels: dimension = 'time' if dimension not in [c.table.name for c in joins.columns]: joins = dataset[dimension].join(joins) group_by = [] for key in drilldowns: if key in labels: column = labels[key] group_by.append(column) fields.append(column) else: column = dataset.key(key) if '.' in key or column.table == alias: fields.append(column) group_by.append(column) else: fields.append(column.table) for col in column.table.columns: group_by.append(col) conditions = db.and_() filters = defaultdict(set) for key, value in cuts: if key in labels: column = labels[key] else: column = dataset.key(key) filters[column].add(value) for attr, values in filters.items(): conditions.append(db.or_(*[attr == v for v in values])) order_by = [] if order is None or not len(order): order = [(measure, True)] for key, direction in order: if key == measure: column = db.func.sum(alias.c[measure]).label(measure) elif key in labels: column = labels[key] else: column = dataset.key(key) order_by.append(column.desc() if direction else column.asc()) # query 1: get overall sums. query = db.select(stats_fields, conditions, joins) rp = dataset.bind.execute(query) total, num_entries = rp.fetchone() # query 2: get total count of drilldowns if len(group_by): query = db.select(['1'], conditions, joins, group_by=group_by) query = db.select([db.func.count('1')], '1=1', query.alias('q')) rp = dataset.bind.execute(query) num_drilldowns, = rp.fetchone() else: num_drilldowns = 1 drilldown = [] offset = int((page - 1) * pagesize) # query 3: get the actual data query = db.select(fields, conditions, joins, order_by=order_by, group_by=group_by, use_labels=True, limit=pagesize, offset=offset) rp = dataset.bind.execute(query) while True: row = rp.fetchone() if row is None: break result = decode_row(row, dataset) drilldown.append(result) return { 'drilldown': drilldown, 'summary': { measure: total, 'num_entries': num_entries, 'currency': { measure: dataset.currency }, 'num_drilldowns': num_drilldowns, 'page': page, 'pages': int(math.ceil(num_drilldowns / float(pagesize))), 'pagesize': pagesize } }
def aggregate(self, measure='amount', drilldowns=None, cuts=None, page=1, pagesize=10000, order=None): """ Query the dataset for a subset of cells based on cuts and drilldowns. It returns a structure with a list of drilldown items and a summary about the slice cutted by the query. ``measure`` The numeric unit to be aggregated over, defaults to ``amount``. ``drilldowns`` Dimensions to drill down to. (type: `list`) ``cuts`` Specification what to cut from the cube. This is a `list` of `two-tuples` where the first item is the dimension and the second item is the value to cut from. It is turned into a query where multible cuts for the same dimension are combined to an *OR* query and then the queries for the different dimensions are combined to an *AND* query. ``page`` Page the drilldown result and return page number *page*. type: `int` ``pagesize`` Page the drilldown result into page of size *pagesize*. type: `int` ``order`` Sort the result based on the dimension *sort_dimension*. This may be `None` (*default*) or a `list` of two-`tuples` where the first element is the *dimension* and the second element is the order (`False` for ascending, `True` for descending). Type: `list` of two-`tuples`. Raises: :exc:`ValueError` If a cube is not yet computed. Call :meth:`compute` to compute the cube. :exc:`KeyError` If a drilldown, cut or order dimension is not part of this cube or the order dimensions are not a subset of the drilldown dimensions. Returns: A `dict` containing the drilldown and the summary:: {"drilldown": [ {"num_entries": 5545, "amount": 41087379002.0, "cofog1": {"description": "", "label": "Economic affairs"}}, ... ] "summary": {"amount": 7353306450299.0, "num_entries": 133612}} """ cuts = cuts or [] drilldowns = drilldowns or [] order = order or [] joins = self.alias fields = [db.func.sum(self.alias.c[measure]).label(measure), db.func.count(self.alias.c.id).label("entries")] labels = { 'year': self['time']['year'].column_alias.label('year'), 'month': self['time']['yearmonth'].column_alias.label('month'), } dimensions = set(drilldowns + [k for k,v in cuts] + [o[0] for o in order]) for dimension in dimensions: if dimension in labels: _name = 'time' else: _name = dimension.split('.')[0] if _name not in [c.table.name for c in joins.columns]: joins = self[_name].join(joins) group_by = [] for key in dimensions: if key in labels: column = labels[key] group_by.append(column) fields.append(column) else: column = self.key(key) if '.' in key or column.table == self.alias: fields.append(column) group_by.append(column) else: fields.append(column.table) for col in column.table.columns: group_by.append(col) conditions = db.and_() filters = defaultdict(set) for key, value in cuts: if key in labels: column = labels[key] else: column = self.key(key) filters[column].add(value) for attr, values in filters.items(): conditions.append(db.or_(*[attr==v for v in values])) order_by = [] for key, direction in order: if key in labels: column = labels[key] else: column = self.key(key) order_by.append(column.desc() if direction else column.asc()) query = db.select(fields, conditions, joins, order_by=order_by or [measure + ' desc'], group_by=group_by, use_labels=True) summary = {measure: 0.0, 'num_entries': 0} drilldown = [] rp = self.bind.execute(query) while True: row = rp.fetchone() if row is None: break result = {} for key, value in row.items(): if key == measure: summary[measure] += value or 0 if key == 'entries': summary['num_entries'] += value or 0 if '_' in key: dimension, attribute = key.split('_', 1) dimension = dimension.replace(ALIAS_PLACEHOLDER, '_') if dimension == 'entry': result[attribute] = value else: if not dimension in result: result[dimension] = {} # TODO: backwards-compat? if isinstance(self[dimension], CompoundDimension): result[dimension]['taxonomy'] = \ self[dimension].taxonomy result[dimension][attribute] = value else: if key == 'entries': key = 'num_entries' result[key] = value drilldown.append(result) offset = ((page-1)*pagesize) # do we really need all this: summary['num_drilldowns'] = len(drilldown) summary['page'] = page summary['pages'] = int(math.ceil(len(drilldown)/float(pagesize))) summary['pagesize'] = pagesize return {'drilldown': drilldown[offset:offset+pagesize], 'summary': summary}
def entries(self, conditions="1=1", order_by=None, limit=None, offset=0, step=10000): """ Generate a fully denormalized view of the entries on this table. This view is nested so that each dimension will be a hash of its attributes. This is somewhat similar to the entries collection in the fully denormalized schema before OpenSpending 0.11 (MongoDB). """ if not self.is_generated: return joins = self.alias for d in self.dimensions: joins = d.join(joins) selects = [f.selectable for f in self.fields] + [self.alias.c.id] # enforce stable sorting: if order_by is None: order_by = [self.alias.c.id.asc()] for i in count(): qoffset = offset + (step * i) qlimit = step if limit is not None: qlimit = min(limit - (step * i), step) if qlimit <= 0: break query = db.select(selects, conditions, joins, order_by=order_by, use_labels=True, limit=qlimit, offset=qoffset) rp = self.bind.execute(query) first_row = True while True: row = rp.fetchone() if row is None: if first_row: return break first_row = False result = {} for k, v in row.items(): field, attr = k.split('_', 1) field = field.replace(ALIAS_PLACEHOLDER, '_') if field == 'entry': result[attr] = v else: if not field in result: result[field] = dict() # TODO: backwards-compat? if isinstance(self[field], CompoundDimension): result[field]['taxonomy'] = self[ field].taxonomy result[field][attr] = v yield result
def index(self, format="html"): c.query = request.params.items() c.add_filter = lambda f, v: "?" + urlencode(c.query + [(f, v)] if (f, v) not in c.query else c.query) c.del_filter = lambda f, v: "?" + urlencode([(k, x) for k, x in c.query if (k, x) != (f, v)]) c.results = c.datasets for language in request.params.getall("languages"): l = db.aliased(DatasetLanguage) c.results = c.results.join(l, Dataset._languages) c.results = c.results.filter(l.code == language) for territory in request.params.getall("territories"): t = db.aliased(DatasetTerritory) c.results = c.results.join(t, Dataset._territories) c.results = c.results.filter(t.code == territory) category = request.params.get("category") if category: c.results = c.results.filter(Dataset.category == category) c.results = list(c.results) c.territory_options = [ { "code": code, "count": count, "url": h.url_for(controller="dataset", action="index", territories=code), "label": COUNTRIES.get(code, code), } for (code, count) in DatasetTerritory.dataset_counts(c.results) ] c.language_options = [ { "code": code, "count": count, "url": h.url_for(controller="dataset", action="index", languages=code), "label": LANGUAGES.get(code, code), } for (code, count) in DatasetLanguage.dataset_counts(c.results) ] # TODO: figure out where to put this: ds_ids = [d.id for d in c.results] if len(ds_ids): q = db.select( [Dataset.category, db.func.count(Dataset.id)], Dataset.id.in_(ds_ids), group_by=Dataset.category, order_by=db.func.count(Dataset.id).desc(), ) c.category_options = [ { "category": category, "count": count, "url": h.url_for(controller="dataset", action="index", category=category), "label": CATEGORIES.get(category, category), } for (category, count) in db.session.bind.execute(q).fetchall() if category is not None ] else: c.category_options = [] c._must_revalidate = True if len(c.results): dt = max([r.updated_at for r in c.results]) etag_cache_keygen(dt) if format == "json": results = map(lambda d: d.as_dict(), c.results) results = [dataset_apply_links(r) for r in results] return to_jsonp( { "datasets": results, "categories": c.category_options, "territories": c.territory_options, "languages": c.language_options, } ) elif format == "csv": results = map(lambda d: d.as_dict(), c.results) return write_csv(results, response) return render("dataset/index.html")
def aggregate(self, measure='amount', drilldowns=None, cuts=None, page=1, pagesize=10000, order=None): """ Query the dataset for a subset of cells based on cuts and drilldowns. It returns a structure with a list of drilldown items and a summary about the slice cutted by the query. ``measure`` The numeric unit to be aggregated over, defaults to ``amount``. ``drilldowns`` Dimensions to drill down to. (type: `list`) ``cuts`` Specification what to cut from the cube. This is a `list` of `two-tuples` where the first item is the dimension and the second item is the value to cut from. It is turned into a query where multible cuts for the same dimension are combined to an *OR* query and then the queries for the different dimensions are combined to an *AND* query. ``page`` Page the drilldown result and return page number *page*. type: `int` ``pagesize`` Page the drilldown result into page of size *pagesize*. type: `int` ``order`` Sort the result based on the dimension *sort_dimension*. This may be `None` (*default*) or a `list` of two-`tuples` where the first element is the *dimension* and the second element is the order (`False` for ascending, `True` for descending). Type: `list` of two-`tuples`. Raises: :exc:`ValueError` If a cube is not yet computed. Call :meth:`compute` to compute the cube. :exc:`KeyError` If a drilldown, cut or order dimension is not part of this cube or the order dimensions are not a subset of the drilldown dimensions. Returns: A `dict` containing the drilldown and the summary:: {"drilldown": [ {"num_entries": 5545, "amount": 41087379002.0, "cofog1": {"description": "", "label": "Economic affairs"}}, ... ] "summary": {"amount": 7353306450299.0, "num_entries": 133612}} """ cuts = cuts or [] drilldowns = drilldowns or [] order = order or [] joins = self.alias fields = [ db.func.sum(self.alias.c[measure]).label(measure), db.func.count(self.alias.c.id).label("entries") ] labels = { 'year': self['time']['year'].column_alias.label('year'), 'month': self['time']['yearmonth'].column_alias.label('month'), } dimensions = set(drilldowns + [k for k, v in cuts] + [o[0] for o in order]) for dimension in dimensions: if dimension in labels: _name = 'time' else: _name = dimension.split('.')[0] if _name not in [c.table.name for c in joins.columns]: joins = self[_name].join(joins) group_by = [] for key in dimensions: if key in labels: column = labels[key] group_by.append(column) fields.append(column) else: column = self.key(key) if '.' in key or column.table == self.alias: fields.append(column) group_by.append(column) else: fields.append(column.table) for col in column.table.columns: group_by.append(col) conditions = db.and_() filters = defaultdict(set) for key, value in cuts: if key in labels: column = labels[key] else: column = self.key(key) filters[column].add(value) for attr, values in filters.items(): conditions.append(db.or_(*[attr == v for v in values])) order_by = [] for key, direction in order: if key in labels: column = labels[key] else: column = self.key(key) order_by.append(column.desc() if direction else column.asc()) query = db.select(fields, conditions, joins, order_by=order_by or [measure + ' desc'], group_by=group_by, use_labels=True) summary = {measure: 0.0, 'num_entries': 0} drilldown = [] rp = self.bind.execute(query) while True: row = rp.fetchone() if row is None: break result = {} for key, value in row.items(): if key == measure: summary[measure] += value or 0 if key == 'entries': summary['num_entries'] += value or 0 if '_' in key: dimension, attribute = key.split('_', 1) dimension = dimension.replace(ALIAS_PLACEHOLDER, '_') if dimension == 'entry': result[attribute] = value else: if not dimension in result: result[dimension] = {} # TODO: backwards-compat? if isinstance(self[dimension], CompoundDimension): result[dimension]['taxonomy'] = \ self[dimension].taxonomy result[dimension][attribute] = value else: if key == 'entries': key = 'num_entries' result[key] = value drilldown.append(result) offset = ((page - 1) * pagesize) # do we really need all this: summary['num_drilldowns'] = len(drilldown) summary['page'] = page summary['pages'] = int(math.ceil(len(drilldown) / float(pagesize))) summary['pagesize'] = pagesize return { 'drilldown': drilldown[offset:offset + pagesize], 'summary': summary }
def aggregate(self, measures=['amount'], drilldowns=[], cuts=[], page=1, pagesize=10000, order=[]): """ Query the dataset for a subset of cells based on cuts and drilldowns. It returns a structure with a list of drilldown items and a summary about the slice cutted by the query. ``measures`` The numeric units to be aggregated over, defaults to [``amount``]. (type: `list`) ``drilldowns`` Dimensions to drill down to. (type: `list`) ``cuts`` Specification what to cut from the cube. This is a `list` of `two-tuples` where the first item is the dimension and the second item is the value to cut from. It is turned into a query where multible cuts for the same dimension are combined to an *OR* query and then the queries for the different dimensions are combined to an *AND* query. ``page`` Page the drilldown result and return page number *page*. type: `int` ``pagesize`` Page the drilldown result into page of size *pagesize*. type: `int` ``order`` Sort the result based on the dimension *sort_dimension*. This may be `None` (*default*) or a `list` of two-`tuples` where the first element is the *dimension* and the second element is the order (`False` for ascending, `True` for descending). Type: `list` of two-`tuples`. Raises: :exc:`ValueError` If a cube is not yet computed. Call :meth:`compute` to compute the cube. :exc:`KeyError` If a drilldown, cut or order dimension is not part of this cube or the order dimensions are not a subset of the drilldown dimensions. Returns: A `dict` containing the drilldown and the summary: {"drilldown": [ {"num_entries": 5545, "amount": 41087379002.0, "cofog1": {"description": "", "label": "Economic affairs"}}, ... ] "summary": {"amount": 7353306450299.0, "num_entries": 133612}} """ # Get the joins (aka alias) and the dataset joins = alias = self.alias dataset = self # Aggregation fields are all of the measures, so we create individual # summary fields with the sum function of SQLAlchemy fields = [db.func.sum(alias.c[m]).label(m) for m in measures] # We append an aggregation field that counts the number of entries fields.append(db.func.count(alias.c.id).label("entries")) # Create a copy of the statistics fields (for later) stats_fields = list(fields) # Create label map for time columns (year and month) for lookup # since they are found under the time attribute labels = { 'year': dataset['time']['year'].column_alias.label('year'), 'month': dataset['time']['yearmonth'].column_alias.label('month'), } # Get the dimensions we're interested in. These would be the drilldowns # and the cuts. For compound dimensions we are only interested in the # most significant one (e.g. for from.name we're interested in from) dimensions = drilldowns + [k for k, v in cuts] dimensions = [d.split('.')[0] for d in dimensions] # Loop over the dimensions as a set (to avoid multiple occurances) for dimension in set(dimensions): # If the dimension is year or month we're interested in 'time' if dimension in labels: dimension = 'time' # If the dimension table isn't in the automatic joins we add it if dimension not in [c.table.name for c in joins.columns]: joins = dataset[dimension].join(joins) # Drilldowns are performed using group_by SQL functions group_by = [] for key in drilldowns: # If drilldown is in labels we append its mapped column to fields if key in labels: column = labels[key] group_by.append(column) fields.append(column) else: # Get the column from the dataset column = dataset.key(key) # If the drilldown is a compound dimension or the columns table # is in the joins we're already fetching the column so we just # append it to fields and the group_by if '.' in key or column.table == alias: fields.append(column) group_by.append(column) else: # If not we add the column table to the fields and add all # of that tables columns to the group_by fields.append(column.table) for col in column.table.columns: group_by.append(col) # Cuts are managed using AND statements and we use a dict with set as # the default value to create the filters (cut on various values) conditions = db.and_() filters = defaultdict(set) for key, value in cuts: # If the key is in labels (year or month) we get the mapped column # else we get the column from the dataset if key in labels: column = labels[key] else: column = dataset.key(key) # We add the value to the set for that particular column filters[column].add(value) # Loop over the columns in the filter and add that to the conditions # For every value in the set we create and OR statement so we get e.g. # year=2007 AND (from.who == 'me' OR from.who == 'you') for attr, values in filters.items(): conditions.append(db.or_(*[attr == v for v in values])) # Ordering can be set by a parameter or ordered by measures by default order_by = [] # If no order is defined we default to order of the measures in the # order they occur (furthest to the left is most significant) if order is None or not len(order): order = [(m, True) for m in measures] # We loop through the order list to add the columns themselves for key, direction in order: # If it's a part of the measures we have to order by the # aggregated values (the sum of the measure) if key in measures: column = db.func.sum(alias.c[key]).label(key) # If it's in the labels we have to get the mapped column elif key in labels: column = labels[key] # ...if not we just get the column from the dataset else: column = dataset.key(key) # We append the column and set the direction (True == descending) order_by.append(column.desc() if direction else column.asc()) # query 1: get overall sums. # Here we use the stats_field we saved earlier query = db.select(stats_fields, conditions, joins) rp = dataset.bind.execute(query) # Execute the query and turn them to a list so we can pop the # entry count and then zip the measurements and the totals together stats = list(rp.fetchone()) num_entries = stats.pop() total = zip(measures, stats) # query 2: get total count of drilldowns if len(group_by): # Select 1 for each group in the group_by and count them query = db.select(['1'], conditions, joins, group_by=group_by) query = db.select([db.func.count('1')], '1=1', query.alias('q')) rp = dataset.bind.execute(query) num_drilldowns, = rp.fetchone() else: # If there are no drilldowns we still have to do one num_drilldowns = 1 # The drilldown result list drilldown = [] # The offset in the db, based on the page and pagesize (we have to # modify it since page counts starts from 1 but we count from 0 offset = int((page - 1) * pagesize) # query 3: get the actual data query = db.select(fields, conditions, joins, order_by=order_by, group_by=group_by, use_labels=True, limit=pagesize, offset=offset) rp = dataset.bind.execute(query) while True: # Get each row in the db result and append it, decoded, to the # drilldown result. The decoded version is a json represenation row = rp.fetchone() if row is None: break result = decode_row(row, dataset) drilldown.append(result) # Create the summary based on the stats_fields and other things # First we add a the total for each measurement in the root of the # summary (watch out!) and then we add various other, self-explanatory # statistics such as page, number of entries. The currency value is # strange since it's redundant for multiple measures but is left as is # for backwards compatibility summary = {key: value for (key,value) in total} summary.update({ 'num_entries': num_entries, 'currency': {m: dataset.currency for m in measures}, 'num_drilldowns': num_drilldowns, 'page': page, 'pages': int(math.ceil(num_drilldowns / float(pagesize))), 'pagesize': pagesize }) return { 'drilldown': drilldown, 'summary': summary }
def index(self, format='html'): c.query = request.params.items() c.add_filter = lambda f, v: '?' + urlencode(c.query + [(f, v)] if ( f, v) not in c.query else c.query) c.del_filter = lambda f, v: '?' + urlencode([(k, x) for k, x in c.query if (k, x) != (f, v)]) c.results = c.datasets for language in request.params.getall('languages'): l = db.aliased(DatasetLanguage) c.results = c.results.join(l, Dataset._languages) c.results = c.results.filter(l.code == language) for territory in request.params.getall('territories'): t = db.aliased(DatasetTerritory) c.results = c.results.join(t, Dataset._territories) c.results = c.results.filter(t.code == territory) category = request.params.get('category') if category: c.results = c.results.filter(Dataset.category == category) c.results = list(c.results) c.territory_options = [{'code': code, 'count': count, 'url': h.url_for(controller='dataset', action='index', territories=code), 'label': COUNTRIES.get(code, code)} \ for (code, count) in DatasetTerritory.dataset_counts(c.results)] c.language_options = [{'code': code, 'count': count, 'url': h.url_for(controller='dataset', action='index', languages=code), 'label': LANGUAGES.get(code, code)} \ for (code, count) in DatasetLanguage.dataset_counts(c.results)] # TODO: figure out where to put this: ds_ids = [d.id for d in c.results] if len(ds_ids): q = db.select( [Dataset.category, db.func.count(Dataset.id)], Dataset.id.in_(ds_ids), group_by=Dataset.category, order_by=db.func.count(Dataset.id).desc()) c.category_options = [{'category': category, 'count': count, 'url': h.url_for(controller='dataset', action='index', category=category), 'label': CATEGORIES.get(category, category)} \ for (category, count) in db.session.bind.execute(q).fetchall() \ if category is not None] else: c.category_options = [] c._must_revalidate = True if len(c.results): dt = max([r.updated_at for r in c.results]) etag_cache_keygen(dt) if format == 'json': results = map(lambda d: d.as_dict(), c.results) results = [dataset_apply_links(r) for r in results] return to_jsonp({ 'datasets': results, 'categories': c.category_options, 'territories': c.territory_options, 'languages': c.language_options }) elif format == 'csv': results = map(lambda d: d.as_dict(), c.results) return write_csv(results, response) c.show_rss = True return templating.render('dataset/index.html')
def aggregate(self, measure='amount', drilldowns=None, cuts=None, page=1, pagesize=10000, order=None): """ Query the dataset for a subset of cells based on cuts and drilldowns. It returns a structure with a list of drilldown items and a summary about the slice cutted by the query. ``measure`` The numeric unit to be aggregated over, defaults to ``amount``. ``drilldowns`` Dimensions to drill down to. (type: `list`) ``cuts`` Specification what to cut from the cube. This is a `list` of `two-tuples` where the first item is the dimension and the second item is the value to cut from. It is turned into a query where multible cuts for the same dimension are combined to an *OR* query and then the queries for the different dimensions are combined to an *AND* query. ``page`` Page the drilldown result and return page number *page*. type: `int` ``pagesize`` Page the drilldown result into page of size *pagesize*. type: `int` ``order`` Sort the result based on the dimension *sort_dimension*. This may be `None` (*default*) or a `list` of two-`tuples` where the first element is the *dimension* and the second element is the order (`False` for ascending, `True` for descending). Type: `list` of two-`tuples`. Raises: :exc:`ValueError` If a cube is not yet computed. Call :meth:`compute` to compute the cube. :exc:`KeyError` If a drilldown, cut or order dimension is not part of this cube or the order dimensions are not a subset of the drilldown dimensions. Returns: A `dict` containing the drilldown and the summary:: {"drilldown": [ {"num_entries": 5545, "amount": 41087379002.0, "cofog1": {"description": "", "label": "Economic affairs"}}, ... ] "summary": {"amount": 7353306450299.0, "num_entries": 133612}} """ cuts = cuts or [] drilldowns = drilldowns or [] order = order or [] joins = alias = self.alias dataset = self fields = [db.func.sum(alias.c[measure]).label(measure), db.func.count(alias.c.id).label("entries")] stats_fields = list(fields) labels = { 'year': dataset['time']['year'].column_alias.label('year'), 'month': dataset['time']['yearmonth'].column_alias.label('month'), } dimensions = drilldowns + [k for k, v in cuts] dimensions = [d.split('.')[0] for d in dimensions] for dimension in set(dimensions): if dimension in labels: dimension = 'time' if dimension not in [c.table.name for c in joins.columns]: joins = dataset[dimension].join(joins) group_by = [] for key in drilldowns: if key in labels: column = labels[key] group_by.append(column) fields.append(column) else: column = dataset.key(key) if '.' in key or column.table == alias: fields.append(column) group_by.append(column) else: fields.append(column.table) for col in column.table.columns: group_by.append(col) conditions = db.and_() filters = defaultdict(set) for key, value in cuts: if key in labels: column = labels[key] else: column = dataset.key(key) filters[column].add(value) for attr, values in filters.items(): conditions.append(db.or_(*[attr == v for v in values])) order_by = [] if order is None or not len(order): order = [(measure, True)] for key, direction in order: if key == measure: column = db.func.sum(alias.c[measure]).label(measure) elif key in labels: column = labels[key] else: column = dataset.key(key) order_by.append(column.desc() if direction else column.asc()) # query 1: get overall sums. query = db.select(stats_fields, conditions, joins) rp = dataset.bind.execute(query) total, num_entries = rp.fetchone() # query 2: get total count of drilldowns if len(group_by): query = db.select(['1'], conditions, joins, group_by=group_by) query = db.select([db.func.count('1')], '1=1', query.alias('q')) rp = dataset.bind.execute(query) num_drilldowns, = rp.fetchone() else: num_drilldowns = 1 drilldown = [] offset = int((page - 1) * pagesize) # query 3: get the actual data query = db.select(fields, conditions, joins, order_by=order_by, group_by=group_by, use_labels=True, limit=pagesize, offset=offset) rp = dataset.bind.execute(query) while True: row = rp.fetchone() if row is None: break result = decode_row(row, dataset) drilldown.append(result) return { 'drilldown': drilldown, 'summary': { measure: total, 'num_entries': num_entries, 'currency': {measure: dataset.currency}, 'num_drilldowns': num_drilldowns, 'page': page, 'pages': int(math.ceil(num_drilldowns / float(pagesize))), 'pagesize': pagesize } }
def index(self, format='html'): c.query = request.params.items() c.add_filter = lambda f, v: '?' + urlencode(c.query + [(f, v)] if (f, v) not in c.query else c.query) c.del_filter = lambda f, v: '?' + urlencode([(k, x) for k, x in c.query if (k, x) != (f, v)]) c.results = c.datasets for language in request.params.getall('languages'): l = db.aliased(DatasetLanguage) c.results = c.results.join(l, Dataset._languages) c.results = c.results.filter(l.code == language) for territory in request.params.getall('territories'): t = db.aliased(DatasetTerritory) c.results = c.results.join(t, Dataset._territories) c.results = c.results.filter(t.code == territory) category = request.params.get('category') if category: c.results = c.results.filter(Dataset.category == category) c.results = list(c.results) c.territory_options = [{'code': code, 'count': count, 'url': h.url_for(controller='dataset', action='index', territories=code), 'label': COUNTRIES.get(code, code)} \ for (code, count) in DatasetTerritory.dataset_counts(c.results)] c.language_options = [{'code': code, 'count': count, 'url': h.url_for(controller='dataset', action='index', languages=code), 'label': LANGUAGES.get(code, code)} \ for (code, count) in DatasetLanguage.dataset_counts(c.results)] # TODO: figure out where to put this: ds_ids = [d.id for d in c.results] if len(ds_ids): q = db.select([Dataset.category, db.func.count(Dataset.id)], Dataset.id.in_(ds_ids), group_by=Dataset.category, order_by=db.func.count(Dataset.id).desc()) c.category_options = [{'category': category, 'count': count, 'url': h.url_for(controller='dataset', action='index', category=category), 'label': CATEGORIES.get(category, category)} \ for (category, count) in db.session.bind.execute(q).fetchall() \ if category is not None] else: c.category_options = [] c._must_revalidate = True if len(c.results): dt = max([r.updated_at for r in c.results]) etag_cache_keygen(dt) if format == 'json': results = map(lambda d: d.as_dict(), c.results) results = [dataset_apply_links(r) for r in results] return to_jsonp({ 'datasets': results, 'categories': c.category_options, 'territories': c.territory_options, 'languages': c.language_options }) elif format == 'csv': results = map(lambda d: d.as_dict(), c.results) return write_csv(results, response) c.show_rss = True return templating.render('dataset/index.html')