class Account(db.Model):
    __tablename__ = 'account'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode(255), unique=True)
    fullname = db.Column(db.Unicode(2000))
    email = db.Column(db.Unicode(2000))
    password = db.Column(db.Unicode(2000))
    api_key = db.Column(db.Unicode(2000), default=make_uuid)
    admin = db.Column(db.Boolean, default=False)

    datasets = db.relationship(Dataset,
                               secondary=account_dataset_table,
                               backref=db.backref('managers', lazy='dynamic'))

    def __init__(self):
        pass

    @classmethod
    def by_name(cls, name):
        return db.session.query(cls).filter_by(name=name).first()

    @classmethod
    def by_api_key(cls, api_key):
        return db.session.query(cls).filter_by(api_key=api_key).first()

    def as_dict(self):
        return {
            'name': self.name,
            'fullname': self.fullname,
            'email': self.email,
            'admin': self.admin
        }
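
# Usage sketch (illustrative, not part of the original module): resolving
# the account behind an incoming request by its API key. Assumes an
# application context with a configured `db` session; `request_api_key`
# is a hypothetical variable holding the client-supplied key.
def account_for_key(request_api_key):
    account = Account.by_api_key(request_api_key)
    return account.as_dict() if account else None
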
class Run(db.Model):
    """ A run is a generic grouping object for background operations
    that perform logging to the frontend. """

    __tablename__ = 'run'

    STATUS_RUNNING = 'running'
    STATUS_COMPLETE = 'complete'
    STATUS_FAILED = 'failed'

    id = db.Column(db.Integer, primary_key=True)
    operation = db.Column(db.Unicode(2000))
    status = db.Column(db.Unicode(2000))
    time_start = db.Column(db.DateTime, default=datetime.utcnow)
    time_end = db.Column(db.DateTime)
    dataset_id = db.Column(db.Integer, db.ForeignKey('dataset.id'),
                           nullable=True)
    source_id = db.Column(db.Integer, db.ForeignKey('source.id'),
                          nullable=True)

    dataset = db.relationship(Dataset, backref=db.backref(
        'runs', order_by='Run.time_start.desc()', lazy='dynamic'))
    source = db.relationship(Source, backref=db.backref(
        'runs', order_by='Run.time_start.desc()', lazy='dynamic'))

    def __init__(self, operation, status, dataset, source):
        self.operation = operation
        self.status = status
        self.dataset = dataset
        self.source = source

    @classmethod
    def by_id(cls, id):
        return db.session.query(cls).filter_by(id=id).first()

    def __repr__(self):
        return "<Run(%s,%s)>" % (self.source.id, self.id)
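
# Illustrative sketch (not from the original code): a Run brackets a
# background operation, moving from STATUS_RUNNING to STATUS_COMPLETE or
# STATUS_FAILED. Assumes existing `dataset` and `source` objects and an
# active `db.session`; the `do_work` callable is hypothetical.
def tracked_operation(operation, dataset, source, do_work):
    run = Run(operation, Run.STATUS_RUNNING, dataset, source)
    db.session.add(run)
    db.session.commit()
    try:
        do_work()
        run.status = Run.STATUS_COMPLETE
    except Exception:
        run.status = Run.STATUS_FAILED
        raise
    finally:
        run.time_end = datetime.utcnow()
        db.session.commit()
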
class Account(db.Model):
    __tablename__ = 'account'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode(255), unique=True)
    fullname = db.Column(db.Unicode(2000))
    email = db.Column(db.Unicode(2000))
    password = db.Column(db.Unicode(2000))
    api_key = db.Column(db.Unicode(2000), default=make_uuid)
    admin = db.Column(db.Boolean, default=False)
    script_root = db.Column(db.Unicode(2000))
    terms = db.Column(db.Boolean, default=False)

    datasets = db.relationship(Dataset,
                               secondary=account_dataset_table,
                               backref=db.backref('managers', lazy='dynamic'))

    def __init__(self):
        pass

    @property
    def display_name(self):
        return self.fullname or self.name

    @property
    def token(self):
        h = hmac.new('')
        h.update(self.api_key)
        if self.password:
            h.update(self.password)
        return h.hexdigest()

    @classmethod
    def by_name(cls, name):
        return db.session.query(cls).filter_by(name=name).first()

    @classmethod
    def by_email(cls, email):
        return db.session.query(cls).filter_by(email=email).first()

    @classmethod
    def by_api_key(cls, api_key):
        return db.session.query(cls).filter_by(api_key=api_key).first()

    def as_dict(self):
        return {
            'name': self.name,
            'fullname': self.fullname,
            'email': self.email,
            'admin': self.admin
        }
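
# A minimal sketch of how the ``token`` property might be used, e.g. to
# build a verification link for account e-mails. The route name and URL
# layout here are assumptions for illustration, not taken from the
# original code.
def verification_url(account, base_url='https://example.org'):
    # The token is an HMAC over the API key (and password, if set), so
    # it changes whenever either credential changes.
    return '%s/account/%s/verify?token=%s' % (
        base_url, account.name, account.token)
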
class Dataset(TableHandler, db.Model):
    """ The dataset is the core entity of any access to data. All
    requests to the actual data store are routed through it, as well
    as data loading and model generation.

    The dataset keeps an in-memory representation of the data model
    (including all dimensions and measures) which can be used to
    generate necessary queries.
    """
    __tablename__ = 'dataset'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode(255), unique=True)
    label = db.Column(db.Unicode(2000))
    description = db.Column(db.Unicode())
    currency = db.Column(db.Unicode())
    default_time = db.Column(db.Unicode())
    schema_version = db.Column(db.Unicode())
    entry_custom_html = db.Column(db.Unicode())
    ckan_uri = db.Column(db.Unicode())
    category = db.Column(db.Unicode())
    serp_title = db.Column(db.Unicode(), nullable=True)
    serp_teaser = db.Column(db.Unicode(), nullable=True)
    private = db.Column(db.Boolean, default=False)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow,
                           onupdate=datetime.utcnow)
    data = db.Column(MutableDict.as_mutable(JSONType), default=dict)

    languages = db.association_proxy('_languages', 'code')
    territories = db.association_proxy('_territories', 'code')

    def __init__(self, data):
        self.data = data.copy()
        dataset = self.data['dataset']
        del self.data['dataset']
        self.label = dataset.get('label')
        self.name = dataset.get('name')
        self.description = dataset.get('description')
        self.currency = dataset.get('currency')
        self.category = dataset.get('category')
        self.serp_title = dataset.get('serp_title')
        self.serp_teaser = dataset.get('serp_teaser')
        self.default_time = dataset.get('default_time')
        self.entry_custom_html = dataset.get('entry_custom_html')
        self.languages = dataset.get('languages', [])
        self.territories = dataset.get('territories', [])
        self.ckan_uri = dataset.get('ckan_uri')
        self._load_model()

    @property
    def model(self):
        model = self.data.copy()
        model['dataset'] = self.as_dict()
        return model

    @property
    def mapping(self):
        return self.data.get('mapping', {})

    @db.reconstructor
    def _load_model(self):
        """ Construct the in-memory object representation of this
        dataset's dimension and measures model.

        This is called upon initialization and deserialization of the
        dataset from the SQLAlchemy store.
        """
        self.dimensions = []
        self.measures = []
        for dim, data in self.mapping.items():
            if data.get('type') == 'measure' or dim == 'amount':
                self.measures.append(Measure(self, dim, data))
                continue
            elif data.get('type') == 'date' or \
                    (dim == 'time' and data.get('datatype') == 'date'):
                dimension = DateDimension(self, dim, data)
            elif data.get('type') in ['value', 'attribute']:
                dimension = AttributeDimension(self, dim, data)
            else:
                dimension = CompoundDimension(self, dim, data)
            self.dimensions.append(dimension)
        self.init()
        self._is_generated = None

    def __getitem__(self, name):
        """ Access a field (dimension or measure) by name. """
        for field in self.fields:
            if field.name == name:
                return field
        raise KeyError()

    def __contains__(self, name):
        try:
            self[name]
            return True
        except KeyError:
            return False

    @property
    def fields(self):
        """ Both the dimensions and measures in this dataset. """
        return self.dimensions + self.measures

    @property
    def compounds(self):
        """ Return only compound dimensions. """
        return filter(lambda d: isinstance(d, CompoundDimension),
                      self.dimensions)

    @property
    def facet_dimensions(self):
        return [d for d in self.dimensions if d.facet]

    def init(self):
        """ Create a SQLAlchemy model for the current dataset model,
        without creating the tables and columns. This needs to be
        called both for access to the data and in order to generate
        the model physically. """
        self.bind = db.engine
        self.meta = db.MetaData()
        # self.tx = self.bind.begin()
        self.meta.bind = db.engine
        self._init_table(self.meta, self.name, 'entry',
                         id_type=db.Unicode(42))
        for field in self.fields:
            field.column = field.init(self.meta, self.table)
        self.alias = self.table.alias('entry')

    def generate(self):
        """ Create the tables and columns necessary for this dataset
        to keep data. """
        for field in self.fields:
            field.generate(self.meta, self.table)
        for dim in self.dimensions:
            if isinstance(dim, CompoundDimension):
                self.table.append_constraint(
                    ForeignKeyConstraint(
                        [dim.name + '_id'],
                        [dim.table.name + '.id'],
                        # use_alter=True,
                        name='fk_' + self.name + '_' + dim.name))
        self._generate_table()
        self._is_generated = True

    @property
    def is_generated(self):
        if self._is_generated is None:
            self._is_generated = self.table.exists()
        return self._is_generated

    @property
    def has_badges(self):
        """ Property that returns True if the dataset has been given
        any badges. """
        # Cast the badge count as a boolean and return it
        return bool(self.badges.count())

    def commit(self):
        pass
        # self.tx.commit()
        # self.tx = self.bind.begin()

    def _make_key(self, data):
        """ Generate a unique identifier for an entry. This is better
        than SQL auto-increment because it is stable across multiple
        loads and thus creates stable URIs for entries. """
        uniques = [self.name]
        for field in self.fields:
            if not field.key:
                continue
            obj = data.get(field.name)
            if isinstance(obj, dict):
                obj = obj.get('name', obj.get('id'))
            uniques.append(obj)
        return hash_values(uniques)

    def load(self, data):
        """ Handle a single entry of data in the mapping source format,
        i.e. with all needed columns. This will propagate to all
        dimensions and set values as appropriate. """
        entry = dict()
        for field in self.fields:
            field_data = data[field.name]
            entry.update(field.load(self.bind, field_data))
        entry['id'] = self._make_key(data)
        self._upsert(self.bind, entry, ['id'])

    def flush(self):
        """ Delete all data from the dataset tables but leave the
        table structure intact. """
        for dimension in self.dimensions:
            dimension.flush(self.bind)
        self._flush(self.bind)

    def drop(self):
        """ Drop all tables created as part of this dataset, i.e. by
        calling ``generate()``. This will of course also delete the
        data itself. """
        self._drop(self.bind)
        for dimension in self.dimensions:
            dimension.drop(self.bind)
        self._is_generated = False

    def key(self, key):
        """ For a given ``key``, find a column to identify it in a
        query. A ``key`` is either the name of a simple attribute
        (e.g. ``time``) or of an attribute of a complex dimension
        (e.g. ``to.label``). The returned key is using an alias, so
        it can be used in a query directly. """
        attr = None
        if '.' in key:
            key, attr = key.split('.', 1)
        dimension = self[key]
        if hasattr(dimension, 'alias'):
            attr_name = dimension[attr].column.name if attr else 'name'
            return dimension.alias.c[attr_name]
        return self.alias.c[dimension.column.name]

    def entries(self, conditions="1=1", order_by=None, limit=None,
                offset=0, step=10000, fields=None):
        """ Generate a fully denormalized view of the entries on this
        table. This view is nested so that each dimension will be a
        hash of its attributes.

        This is somewhat similar to the entries collection in the
        fully denormalized schema before OpenSpending 0.11 (MongoDB).
        """
        if not self.is_generated:
            return

        if fields is None:
            fields = self.fields

        joins = self.alias
        for d in self.dimensions:
            if d in fields:
                joins = d.join(joins)
        selects = [f.selectable for f in fields] + [self.alias.c.id]

        # enforce stable sorting:
        if order_by is None:
            order_by = [self.alias.c.id.asc()]

        for i in count():
            qoffset = offset + (step * i)
            qlimit = step
            if limit is not None:
                qlimit = min(limit - (step * i), step)
            if qlimit <= 0:
                break

            query = db.select(selects, conditions, joins,
                              order_by=order_by, use_labels=True,
                              limit=qlimit, offset=qoffset)
            rp = self.bind.execute(query)

            first_row = True
            while True:
                row = rp.fetchone()
                if row is None:
                    if first_row:
                        return
                    break
                first_row = False
                yield decode_row(row, self)

    def aggregate(self, measures=['amount'], drilldowns=[], cuts=[],
                  page=1, pagesize=10000, order=[]):
        """ Query the dataset for a subset of cells based on cuts and
        drilldowns. It returns a structure with a list of drilldown
        items and a summary about the slice cut by the query.

        ``measures``
            The numeric units to be aggregated over, defaults to
            [``amount``]. (type: `list`)
        ``drilldowns``
            Dimensions to drill down to. (type: `list`)
        ``cuts``
            Specification what to cut from the cube. This is a `list`
            of `two-tuples` where the first item is the dimension and
            the second item is the value to cut from. It is turned
            into a query where multiple cuts for the same dimension
            are combined into an *OR* query and then the queries for
            the different dimensions are combined into an *AND* query.
        ``page``
            Page the drilldown result and return page number *page*.
            type: `int`
        ``pagesize``
            Page the drilldown result into pages of size *pagesize*.
            type: `int`
        ``order``
            Sort the result based on the dimension *sort_dimension*.
            This may be `None` (*default*) or a `list` of two-`tuples`
            where the first element is the *dimension* and the second
            element is the order (`False` for ascending, `True` for
            descending).
            Type: `list` of two-`tuples`.

        Raises:

        :exc:`ValueError`
            If a cube is not yet computed. Call :meth:`compute` to
            compute the cube.
        :exc:`KeyError`
            If a drilldown, cut or order dimension is not part of this
            cube or the order dimensions are not a subset of the
            drilldown dimensions.

        Returns: A `dict` containing the drilldown and the summary::

          {"drilldown": [
              {"num_entries": 5545,
               "amount": 41087379002.0,
               "cofog1": {"description": "", "label": "Economic affairs"}},
              ...
            ],
           "summary": {"amount": 7353306450299.0,
                       "num_entries": 133612}}
        """
        # Get the joins (aka alias) and the dataset
        joins = alias = self.alias
        dataset = self

        # Aggregation fields are all of the measures, so we create
        # individual summary fields with the sum function of SQLAlchemy
        fields = [db.func.sum(alias.c[m]).label(m) for m in measures]
        # We append an aggregation field that counts the number of entries
        fields.append(db.func.count(alias.c.id).label("entries"))
        # Create a copy of the statistics fields (for later)
        stats_fields = list(fields)

        # Create label map for time columns (year and month) for lookup
        # since they are found under the time attribute
        labels = {
            'year': dataset['time']['year'].column_alias.label('year'),
            'month': dataset['time']['yearmonth'].column_alias.label('month'),
        }

        # Get the dimensions we're interested in. These would be the
        # drilldowns and the cuts. For compound dimensions we are only
        # interested in the most significant one (e.g. for from.name
        # we're interested in from)
        dimensions = drilldowns + [k for k, v in cuts]
        dimensions = [d.split('.')[0] for d in dimensions]

        # Loop over the dimensions as a set (to avoid multiple occurrences)
        for dimension in set(dimensions):
            # If the dimension is year or month we're interested in 'time'
            if dimension in labels:
                dimension = 'time'
            # If the dimension table isn't in the automatic joins we add it
            if dimension not in [c.table.name for c in joins.columns]:
                joins = dataset[dimension].join(joins)

        # Drilldowns are performed using group_by SQL functions
        group_by = []
        for key in drilldowns:
            # If drilldown is in labels we append its mapped column to fields
            if key in labels:
                column = labels[key]
                group_by.append(column)
                fields.append(column)
            else:
                # Get the column from the dataset
                column = dataset.key(key)
                # If the drilldown is a compound dimension or the column's
                # table is in the joins we're already fetching the column
                # so we just append it to fields and the group_by
                if '.' in key or column.table == alias:
                    fields.append(column)
                    group_by.append(column)
                else:
                    # If not, we add the column's table to the fields and
                    # add all of that table's columns to the group_by
                    fields.append(column.table)
                    for col in column.table.columns:
                        group_by.append(col)

        # Cuts are managed using AND statements and we use a dict with a
        # set as the default value to create the filters (cut on various
        # values)
        conditions = db.and_()
        filters = defaultdict(set)

        for key, value in cuts:
            # If the key is in labels (year or month) we get the mapped
            # column, else we get the column from the dataset
            if key in labels:
                column = labels[key]
            else:
                column = dataset.key(key)
            # We add the value to the set for that particular column
            filters[column].add(value)

        # Loop over the columns in the filter and add them to the
        # conditions. For every value in the set we create an OR statement
        # so we get e.g. year=2007 AND (from.who == 'me' OR from.who == 'you')
        for attr, values in filters.items():
            conditions.append(db.or_(*[attr == v for v in values]))

        # Ordering can be set by a parameter or ordered by measures by
        # default
        order_by = []
        # If no order is defined we default to the order of the measures
        # in the order they occur (furthest to the left is most
        # significant)
        if order is None or not len(order):
            order = [(m, True) for m in measures]

        # We loop through the order list to add the columns themselves
        for key, direction in order:
            # If it's a part of the measures we have to order by the
            # aggregated values (the sum of the measure)
            if key in measures:
                column = db.func.sum(alias.c[key]).label(key)
            # If it's in the labels we have to get the mapped column
            elif key in labels:
                column = labels[key]
            # ...if not we just get the column from the dataset
            else:
                column = dataset.key(key)
            # We append the column and set the direction
            # (True == descending)
            order_by.append(column.desc() if direction else column.asc())

        # query 1: get overall sums.
        # Here we use the stats_fields we saved earlier
        query = db.select(stats_fields, conditions, joins)
        rp = dataset.bind.execute(query)
        # Execute the query and turn the result into a list so we can pop
        # the entry count and then zip the measures and the totals together
        stats = list(rp.fetchone())
        num_entries = stats.pop()
        total = zip(measures, stats)

        # query 2: get the total count of drilldowns
        if len(group_by):
            # Select 1 for each group in the group_by and count them
            query = db.select(['1'], conditions, joins, group_by=group_by)
            query = db.select([db.func.count('1')], '1=1', query.alias('q'))
            rp = dataset.bind.execute(query)
            num_drilldowns, = rp.fetchone()
        else:
            # If there are no drilldowns we still have to do one
            num_drilldowns = 1

        # The drilldown result list
        drilldown = []
        # The offset in the db, based on the page and pagesize (we have
        # to modify it since page counts start from 1 but we count from 0)
        offset = int((page - 1) * pagesize)

        # query 3: get the actual data
        query = db.select(fields, conditions, joins, order_by=order_by,
                          group_by=group_by, use_labels=True,
                          limit=pagesize, offset=offset)
        rp = dataset.bind.execute(query)

        while True:
            # Get each row in the db result and append it, decoded, to
            # the drilldown result. The decoded version is a json
            # representation
            row = rp.fetchone()
            if row is None:
                break
            result = decode_row(row, dataset)
            drilldown.append(result)

        # Create the summary based on the stats_fields and other things.
        # First we add the total for each measure to the root of the
        # summary (watch out!) and then we add various other,
        # self-explanatory statistics such as page and number of entries.
        # The currency value is strange since it's redundant for multiple
        # measures but is left as is for backwards compatibility
        summary = {key: value for (key, value) in total}
        summary.update({
            'num_entries': num_entries,
            'currency': {m: dataset.currency for m in measures},
            'num_drilldowns': num_drilldowns,
            'page': page,
            'pages': int(math.ceil(num_drilldowns / float(pagesize))),
            'pagesize': pagesize
        })

        return {'drilldown': drilldown, 'summary': summary}

    def timerange(self):
        """ Get the timerange of the dataset (based on the time
        attribute). Returns a tuple of (first timestamp, last
        timestamp) where timestamp is a datetime object. """
        try:
            # Get the time column
            time = self.key('time')
            # We use SQL's min and max functions to get the timestamps
            query = db.session.query(db.func.min(time), db.func.max(time))
            # We just need one result to get min and max time
            return [
                datetime.strptime(date, '%Y-%m-%d') if date else None
                for date in query.one()
            ]
        except Exception:
            return (None, None)

    def __repr__(self):
        return "<Dataset(%s:%s:%s)>" % (self.name, self.dimensions,
                                        self.measures)

    def __len__(self):
        if not self.is_generated:
            return 0
        rp = self.bind.execute(self.alias.count())
        return rp.fetchone()[0]

    def as_dict(self):
        return {
            'label': self.label,
            'name': self.name,
            'description': self.description,
            'default_time': self.default_time,
            'schema_version': self.schema_version,
            'currency': self.currency,
            'category': self.category,
            'serp_title': self.serp_title,
            'serp_teaser': self.serp_teaser,
            'timestamps': {
                'created': self.created_at,
                'last_modified': self.updated_at
            },
            'languages': list(self.languages),
            'territories': list(self.territories),
            'badges': [b.as_dict(short=True) for b in self.badges]
        }

    @classmethod
    def all_by_account(cls, account):
        """ Query available datasets based on dataset visibility. """
        criteria = [cls.private == false()]
        if account is not None:
            criteria += [
                "1=1" if account.admin else "1=2",
                cls.managers.any(type(account).id == account.id)
            ]
        q = db.session.query(cls).filter(db.or_(*criteria))
        q = q.order_by(cls.label.asc())
        return q

    @classmethod
    def by_name(cls, name):
        return db.session.query(cls).filter_by(name=name).first()
class View(db.Model):
    """ A view stores a specific configuration of a visualisation
    widget. """

    __tablename__ = 'view'

    id = db.Column(db.Integer, primary_key=True)
    widget = db.Column(db.Unicode(2000))
    name = db.Column(db.Unicode(2000))
    label = db.Column(db.Unicode(2000))
    description = db.Column(db.Unicode())
    state = db.Column(MutableDict.as_mutable(JSONType), default=dict)
    public = db.Column(db.Boolean, default=False)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, onupdate=datetime.utcnow)

    dataset_id = db.Column(db.Integer, db.ForeignKey('dataset.id'))
    account_id = db.Column(db.Integer, db.ForeignKey('account.id'),
                           nullable=True)

    dataset = db.relationship(Dataset, backref=db.backref(
        'views', cascade='all,delete,delete-orphan', lazy='dynamic'))
    account = db.relationship(Account, backref=db.backref(
        'views', cascade='all,delete,delete-orphan', lazy='dynamic'))

    def __init__(self):
        pass

    @classmethod
    def by_id(cls, id):
        return db.session.query(cls).filter_by(id=id).first()

    @classmethod
    def by_name(cls, dataset, name):
        q = db.session.query(cls).filter_by(name=name)
        return q.filter_by(dataset=dataset).first()

    @classmethod
    def all_by_dataset(cls, dataset):
        return db.session.query(cls).filter_by(dataset=dataset)

    def as_dict(self):
        return {
            'id': self.id,
            'widget': self.widget,
            'name': self.name,
            'label': self.label,
            'description': self.description,
            'state': self.state,
            'public': self.public,
            'dataset': self.dataset.name,
            'account': self.account.name if self.account else None
        }

    def __repr__(self):
        return "<View(%s,%s)>" % (self.dataset.name, self.name)
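
# Illustrative only (not in the original module): fetching a named view
# for a dataset and serialising it for an API response. Assumes `dataset`
# is a Dataset instance as defined above.
def view_json(dataset, view_name):
    view = View.by_name(dataset, view_name)
    return view.as_dict() if view else None
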
class Dataset(TableHandler, db.Model):
    """ The dataset is the core entity of any access to data. All
    requests to the actual data store are routed through it, as well
    as data loading and model generation.

    The dataset keeps an in-memory representation of the data model
    (including all dimensions and measures) which can be used to
    generate necessary queries.
    """
    __tablename__ = 'dataset'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode(255), unique=True)
    label = db.Column(db.Unicode(2000))
    description = db.Column(db.Unicode())
    currency = db.Column(db.Unicode())
    default_time = db.Column(db.Unicode())
    schema_version = db.Column(db.Unicode())
    entry_custom_html = db.Column(db.Unicode())
    ckan_uri = db.Column(db.Unicode())
    private = db.Column(db.Boolean, default=False)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, onupdate=datetime.utcnow)
    data = db.Column(JSONType, default=dict)

    languages = db.association_proxy('_languages', 'code')
    territories = db.association_proxy('_territories', 'code')

    def __init__(self, data):
        self.data = data.copy()
        dataset = self.data['dataset']
        del self.data['dataset']
        self.label = dataset.get('label')
        self.name = dataset.get('name')
        self.description = dataset.get('description')
        self.currency = dataset.get('currency')
        self.default_time = dataset.get('default_time')
        self.entry_custom_html = dataset.get('entry_custom_html')
        self.languages = dataset.get('languages', [])
        self.territories = dataset.get('territories', [])
        self.ckan_uri = dataset.get('ckan_uri')
        self._load_model()

    @property
    def model(self):
        model = self.data.copy()
        model['dataset'] = self.as_dict()
        return model

    @property
    def mapping(self):
        return self.data.get('mapping', {})

    @db.reconstructor
    def _load_model(self):
        """ Construct the in-memory object representation of this
        dataset's dimension and measures model.

        This is called upon initialization and deserialization of the
        dataset from the SQLAlchemy store.
        """
        self.dimensions = []
        self.measures = []
        for dim, data in self.mapping.items():
            if data.get('type') == 'measure' or dim == 'amount':
                self.measures.append(Measure(self, dim, data))
                continue
            elif data.get('type') == 'date' or \
                    (dim == 'time' and data.get('datatype') == 'date'):
                dimension = DateDimension(self, dim, data)
            elif data.get('type') in ['value', 'attribute']:
                dimension = AttributeDimension(self, dim, data)
            else:
                dimension = CompoundDimension(self, dim, data)
            self.dimensions.append(dimension)
        self.init()
        self._is_generated = None

    def __getitem__(self, name):
        """ Access a field (dimension or measure) by name. """
        for field in self.fields:
            if field.name == name:
                return field
        raise KeyError()

    @property
    def fields(self):
        """ Both the dimensions and measures in this dataset. """
        return self.dimensions + self.measures

    @property
    def compounds(self):
        """ Return only compound dimensions. """
        return filter(lambda d: isinstance(d, CompoundDimension),
                      self.dimensions)

    def init(self):
        """ Create a SQLAlchemy model for the current dataset model,
        without creating the tables and columns. This needs to be
        called both for access to the data and in order to generate
        the model physically. """
        self.bind = db.engine  # .connect()
        self.meta = db.MetaData()
        # self.tx = self.bind.begin()
        self.meta.bind = db.engine
        self._init_table(self.meta, self.name, 'entry',
                         id_type=db.Unicode(42))
        for field in self.fields:
            field.init(self.meta, self.table)
        self.alias = self.table.alias('entry')

    def generate(self):
        """ Create the tables and columns necessary for this dataset
        to keep data. """
        for field in self.fields:
            field.generate(self.meta, self.table)
        self._generate_table()
        self._is_generated = True

    @property
    def is_generated(self):
        if self._is_generated is None:
            self._is_generated = self.table.exists()
        return self._is_generated

    def commit(self):
        pass
        # self.tx.commit()
        # self.tx = self.bind.begin()

    def _make_key(self, data):
        """ Generate a unique identifier for an entry. This is better
        than SQL auto-increment because it is stable across multiple
        loads and thus creates stable URIs for entries. """
        uniques = [self.name]
        for field in self.fields:
            if not field.key:
                continue
            obj = data.get(field.name)
            if isinstance(obj, dict):
                obj = obj.get('name', obj.get('id'))
            uniques.append(obj)
        return hash_values(uniques)

    def load(self, data):
        """ Handle a single entry of data in the mapping source format,
        i.e. with all needed columns. This will propagate to all
        dimensions and set values as appropriate. """
        entry = dict()
        for field in self.fields:
            field_data = data[field.name]
            entry.update(field.load(self.bind, field_data))
        entry['id'] = self._make_key(data)
        self._upsert(self.bind, entry, ['id'])

    def flush(self):
        """ Delete all data from the dataset tables but leave the
        table structure intact. """
        for dimension in self.dimensions:
            dimension.flush(self.bind)
        self._flush(self.bind)

    def drop(self):
        """ Drop all tables created as part of this dataset, i.e. by
        calling ``generate()``. This will of course also delete the
        data itself. """
        for dimension in self.dimensions:
            dimension.drop(self.bind)
        self._drop(self.bind)

    def key(self, key):
        """ For a given ``key``, find a column to identify it in a
        query. A ``key`` is either the name of a simple attribute
        (e.g. ``time``) or of an attribute of a complex dimension
        (e.g. ``to.label``). The returned key is using an alias, so
        it can be used in a query directly. """
        attr = None
        if '.' in key:
            key, attr = key.split('.', 1)
        dimension = self[key]
        if hasattr(dimension, 'alias'):
            attr_name = dimension[attr].column.name if attr else 'name'
            return dimension.alias.c[attr_name]
        return self.alias.c[dimension.column.name]

    def entries(self, conditions="1=1", order_by=None, limit=None,
                offset=0, step=10000):
        """ Generate a fully denormalized view of the entries on this
        table. This view is nested so that each dimension will be a
        hash of its attributes.

        This is somewhat similar to the entries collection in the
        fully denormalized schema before OpenSpending 0.11 (MongoDB).
        """
        if not self.is_generated:
            return

        joins = self.alias
        for d in self.dimensions:
            joins = d.join(joins)
        selects = [f.selectable for f in self.fields] + [self.alias.c.id]

        # enforce stable sorting:
        if order_by is None:
            order_by = [self.alias.c.id.asc()]

        for i in count():
            qoffset = offset + (step * i)
            qlimit = step
            if limit is not None:
                qlimit = min(limit - (step * i), step)
            if qlimit <= 0:
                break

            query = db.select(selects, conditions, joins,
                              order_by=order_by, use_labels=True,
                              limit=qlimit, offset=qoffset)
            rp = self.bind.execute(query)

            first_row = True
            while True:
                row = rp.fetchone()
                if row is None:
                    if first_row:
                        return
                    break
                first_row = False

                result = {}
                for k, v in row.items():
                    field, attr = k.split('_', 1)
                    field = field.replace(ALIAS_PLACEHOLDER, '_')
                    if field == 'entry':
                        result[attr] = v
                    else:
                        if field not in result:
                            result[field] = dict()
                            # TODO: backwards-compat?
                            if isinstance(self[field], CompoundDimension):
                                result[field]['taxonomy'] = \
                                    self[field].taxonomy
                        result[field][attr] = v
                yield result

    def aggregate(self, measure='amount', drilldowns=None, cuts=None,
                  page=1, pagesize=10000, order=None):
        """ Query the dataset for a subset of cells based on cuts and
        drilldowns. It returns a structure with a list of drilldown
        items and a summary about the slice cut by the query.

        ``measure``
            The numeric unit to be aggregated over, defaults to
            ``amount``.
        ``drilldowns``
            Dimensions to drill down to. (type: `list`)
        ``cuts``
            Specification what to cut from the cube. This is a `list`
            of `two-tuples` where the first item is the dimension and
            the second item is the value to cut from. It is turned
            into a query where multiple cuts for the same dimension
            are combined into an *OR* query and then the queries for
            the different dimensions are combined into an *AND* query.
        ``page``
            Page the drilldown result and return page number *page*.
            type: `int`
        ``pagesize``
            Page the drilldown result into pages of size *pagesize*.
            type: `int`
        ``order``
            Sort the result based on the dimension *sort_dimension*.
            This may be `None` (*default*) or a `list` of two-`tuples`
            where the first element is the *dimension* and the second
            element is the order (`False` for ascending, `True` for
            descending).
            Type: `list` of two-`tuples`.

        Raises:

        :exc:`ValueError`
            If a cube is not yet computed. Call :meth:`compute` to
            compute the cube.
        :exc:`KeyError`
            If a drilldown, cut or order dimension is not part of this
            cube or the order dimensions are not a subset of the
            drilldown dimensions.

        Returns: A `dict` containing the drilldown and the summary::

          {"drilldown": [
              {"num_entries": 5545,
               "amount": 41087379002.0,
               "cofog1": {"description": "", "label": "Economic affairs"}},
              ...
            ],
           "summary": {"amount": 7353306450299.0,
                       "num_entries": 133612}}
        """
        cuts = cuts or []
        drilldowns = drilldowns or []
        order = order or []
        joins = self.alias

        fields = [
            db.func.sum(self.alias.c[measure]).label(measure),
            db.func.count(self.alias.c.id).label("entries")
        ]
        labels = {
            'year': self['time']['year'].column_alias.label('year'),
            'month': self['time']['yearmonth'].column_alias.label('month'),
        }

        dimensions = set(drilldowns + [k for k, v in cuts] +
                         [o[0] for o in order])
        for dimension in dimensions:
            if dimension in labels:
                _name = 'time'
            else:
                _name = dimension.split('.')[0]
            if _name not in [c.table.name for c in joins.columns]:
                joins = self[_name].join(joins)

        group_by = []
        for key in dimensions:
            if key in labels:
                column = labels[key]
                group_by.append(column)
                fields.append(column)
            else:
                column = self.key(key)
                if '.' in key or column.table == self.alias:
                    fields.append(column)
                    group_by.append(column)
                else:
                    fields.append(column.table)
                    for col in column.table.columns:
                        group_by.append(col)

        conditions = db.and_()
        filters = defaultdict(set)
        for key, value in cuts:
            if key in labels:
                column = labels[key]
            else:
                column = self.key(key)
            filters[column].add(value)
        for attr, values in filters.items():
            conditions.append(db.or_(*[attr == v for v in values]))

        order_by = []
        for key, direction in order:
            if key in labels:
                column = labels[key]
            else:
                column = self.key(key)
            order_by.append(column.desc() if direction else column.asc())

        query = db.select(fields, conditions, joins,
                          order_by=order_by or [measure + ' desc'],
                          group_by=group_by, use_labels=True)

        summary = {measure: 0.0, 'num_entries': 0}
        drilldown = []
        rp = self.bind.execute(query)
        while True:
            row = rp.fetchone()
            if row is None:
                break
            result = {}
            for key, value in row.items():
                if key == measure:
                    summary[measure] += value or 0
                if key == 'entries':
                    summary['num_entries'] += value or 0
                if '_' in key:
                    dimension, attribute = key.split('_', 1)
                    dimension = dimension.replace(ALIAS_PLACEHOLDER, '_')
                    if dimension == 'entry':
                        result[attribute] = value
                    else:
                        if dimension not in result:
                            result[dimension] = {}
                            # TODO: backwards-compat?
                            if isinstance(self[dimension], CompoundDimension):
                                result[dimension]['taxonomy'] = \
                                    self[dimension].taxonomy
                        result[dimension][attribute] = value
                else:
                    if key == 'entries':
                        key = 'num_entries'
                    result[key] = value
            drilldown.append(result)

        offset = ((page - 1) * pagesize)

        # do we really need all this:
        summary['num_drilldowns'] = len(drilldown)
        summary['page'] = page
        summary['pages'] = int(math.ceil(len(drilldown) / float(pagesize)))
        summary['pagesize'] = pagesize

        return {
            'drilldown': drilldown[offset:offset + pagesize],
            'summary': summary
        }

    def __repr__(self):
        return "<Dataset(%s:%s:%s)>" % (self.name, self.dimensions,
                                        self.measures)

    def times(self, attribute='year'):
        """ Get all distinct times mentioned in the dataset. """
        # TODO: make this a more generic distinct_attribute function
        field = self['time'][attribute].column_alias
        query = db.select([field.label(attribute)],
                          self['time'].alias, distinct=True)
        rp = self.bind.execute(query)
        return sorted([r[attribute] for r in rp.fetchall()])

    def __len__(self):
        if not self.is_generated:
            return 0
        rp = self.bind.execute(self.alias.count())
        return rp.fetchone()[0]

    def as_dict(self):
        return {
            'label': self.label,
            'name': self.name,
            'description': self.description,
            'default_time': self.default_time,
            'schema_version': self.schema_version,
            'currency': self.currency,
            'languages': list(self.languages),
            'territories': list(self.territories)
        }

    @classmethod
    def all_by_account(cls, account):
        """ Query available datasets based on dataset visibility. """
        criteria = [cls.private == False]
        if account is not None:
            criteria += [
                "1=1" if account.admin else "1=2",
                cls.managers.any(type(account).id == account.id)
            ]
        q = db.session.query(cls).filter(db.or_(*criteria))
        q = q.order_by(cls.label.asc())
        return q

    @classmethod
    def by_name(cls, name):
        return db.session.query(cls).filter_by(name=name).first()
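
# Hedged sketch of streaming denormalised entries from this older model
# version; the dataset name 'my_dataset' is a placeholder. `entries()`
# pages through the table in `step`-sized windows, so it is safe to use
# on large datasets.
ds = Dataset.by_name('my_dataset')
if ds is not None:
    for entry in ds.entries(limit=100):
        pass  # each `entry` is a nested dict, one hash per dimension
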
class Run(db.Model):
    """ A run is a generic grouping object for background operations
    that perform logging to the frontend. """

    __tablename__ = 'run'

    # Status values
    STATUS_RUNNING = 'running'
    STATUS_COMPLETE = 'complete'
    STATUS_FAILED = 'failed'
    STATUS_REMOVED = 'removed'

    # Operation values for database, two operations possible
    OPERATION_SAMPLE = 'sample'
    OPERATION_IMPORT = 'import'

    id = db.Column(db.Integer, primary_key=True)
    operation = db.Column(db.Unicode(2000))
    status = db.Column(db.Unicode(2000))
    time_start = db.Column(db.DateTime, default=datetime.utcnow)
    time_end = db.Column(db.DateTime)
    dataset_id = db.Column(db.Integer, db.ForeignKey('dataset.id'),
                           nullable=True)
    source_id = db.Column(db.Integer, db.ForeignKey('source.id'),
                          nullable=True)

    dataset = db.relationship(Dataset, backref=db.backref(
        'runs', order_by='Run.time_start.desc()', lazy='dynamic'))
    source = db.relationship(Source, backref=db.backref(
        'runs', order_by='Run.time_start.desc()', lazy='dynamic'))

    def __init__(self, operation, status, dataset, source):
        self.operation = operation
        self.status = status
        self.dataset = dataset
        self.source = source

    @property
    def successful_sample(self):
        """ Returns True if the run was a sample operation (not a full
        import) and ran without failures. """
        return self.operation == self.OPERATION_SAMPLE and \
            self.status == self.STATUS_COMPLETE

    @property
    def successful_load(self):
        """ Returns True if the run was an import operation (not a
        sample) and ran without failures. """
        return self.operation == self.OPERATION_IMPORT and \
            self.status == self.STATUS_COMPLETE

    @property
    def is_running(self):
        """ Returns True if the run is currently running. """
        return self.status == self.STATUS_RUNNING

    @classmethod
    def by_id(cls, id):
        return db.session.query(cls).filter_by(id=id).first()

    def __repr__(self):
        return "<Run(%s,%s)>" % (self.source.id, self.id)
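
# Illustrative check (not from the original code): deciding whether a
# source has ever been fully imported, using the Run properties above.
# Assumes `source.runs` is the dynamic backref defined on Run.
def has_successful_import(source):
    return any(run.successful_load for run in source.runs)
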
class Account(db.Model):
    __tablename__ = 'account'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode(255), unique=True)
    fullname = db.Column(db.Unicode(2000))
    email = db.Column(db.Unicode(2000))
    public_email = db.Column(db.Boolean, default=False)
    twitter_handle = db.Column(db.Unicode(140))
    public_twitter = db.Column(db.Boolean, default=False)
    password = db.Column(db.Unicode(2000))
    api_key = db.Column(db.Unicode(2000), default=make_uuid)
    admin = db.Column(db.Boolean, default=False)
    script_root = db.Column(db.Unicode(2000))
    terms = db.Column(db.Boolean, default=False)

    datasets = db.relationship(Dataset,
                               secondary=account_dataset_table,
                               backref=db.backref('managers', lazy='dynamic'))

    def __init__(self):
        pass

    @property
    def display_name(self):
        return self.fullname or self.name

    @property
    def token(self):
        h = hmac.new('')
        h.update(self.api_key)
        if self.password:
            h.update(self.password)
        return h.hexdigest()

    @classmethod
    def by_name(cls, name):
        return db.session.query(cls).filter_by(name=name).first()

    @classmethod
    def by_email(cls, email):
        return db.session.query(cls).filter_by(email=email).first()

    @classmethod
    def by_api_key(cls, api_key):
        return db.session.query(cls).filter_by(api_key=api_key).first()

    def as_dict(self):
        """ Return the dictionary representation of the account. """
        # Dictionary will include name, fullname, email and the admin bit
        account_dict = {
            'name': self.name,
            'fullname': self.fullname,
            'email': self.email,
            'admin': self.admin
        }

        # If the user has a twitter handle we add it
        if self.twitter_handle is not None:
            account_dict['twitter'] = self.twitter_handle

        # Return the dictionary representation
        return account_dict
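
# Hedged sketch: a public-profile variant that honours the visibility
# flags above. This helper is an assumption for illustration; the stock
# ``as_dict`` always includes the e-mail address regardless of the
# ``public_email`` flag.
def public_profile(account):
    profile = {'name': account.name, 'fullname': account.fullname}
    if account.public_email:
        profile['email'] = account.email
    if account.public_twitter and account.twitter_handle is not None:
        profile['twitter'] = account.twitter_handle
    return profile
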