예제 #1
0
class Account(db.Model):
    """ A user account, persisted in the ``account`` table. """
    __tablename__ = 'account'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode(255), unique=True)
    fullname = db.Column(db.Unicode(2000))
    email = db.Column(db.Unicode(2000))
    password = db.Column(db.Unicode(2000))
    api_key = db.Column(db.Unicode(2000), default=make_uuid)
    admin = db.Column(db.Boolean, default=False)

    # Many-to-many link to datasets; the reverse side is exposed on the
    # dataset as the dynamic ``managers`` collection.
    datasets = db.relationship(Dataset,
                               secondary=account_dataset_table,
                               backref=db.backref('managers', lazy='dynamic'))

    def __init__(self):
        """ Create an empty account; no attributes are set here. """

    @classmethod
    def by_name(cls, name):
        """ Return the first account whose ``name`` matches, or None. """
        accounts = db.session.query(cls)
        return accounts.filter(cls.name == name).first()

    @classmethod
    def by_api_key(cls, api_key):
        """ Return the first account with the given API key, or None. """
        accounts = db.session.query(cls)
        return accounts.filter(cls.api_key == api_key).first()

    def as_dict(self):
        """ Return name, fullname, email and admin flag as a dict.

        The password and the API key are not part of the result.
        """
        exported = ('name', 'fullname', 'email', 'admin')
        return dict((attr, getattr(self, attr)) for attr in exported)
예제 #2
0
class Run(db.Model):
    """ A run is a generic grouping object for background operations
    that perform logging to the frontend. """

    __tablename__ = 'run'

    # Values intended for the ``status`` column.
    STATUS_RUNNING = 'running'
    STATUS_COMPLETE = 'complete'
    STATUS_FAILED = 'failed'

    id = db.Column(db.Integer, primary_key=True)
    operation = db.Column(db.Unicode(2000))
    status = db.Column(db.Unicode(2000))
    time_start = db.Column(db.DateTime, default=datetime.utcnow)
    time_end = db.Column(db.DateTime)
    # Both foreign keys are nullable: a run is not required to be tied
    # to a dataset or to a source.
    dataset_id = db.Column(db.Integer,
                           db.ForeignKey('dataset.id'),
                           nullable=True)
    source_id = db.Column(db.Integer,
                          db.ForeignKey('source.id'),
                          nullable=True)

    # Each relationship adds a dynamic ``runs`` collection on the other
    # side, ordered with the most recently started run first.
    dataset = db.relationship(Dataset,
                              backref=db.backref(
                                  'runs',
                                  order_by='Run.time_start.desc()',
                                  lazy='dynamic'))
    source = db.relationship(Source,
                             backref=db.backref(
                                 'runs',
                                 order_by='Run.time_start.desc()',
                                 lazy='dynamic'))

    def __init__(self, operation, status, dataset, source):
        """ Create a run for ``operation`` in the given ``status``.

        ``dataset`` and ``source`` are the related objects; the
        corresponding columns are nullable.
        """
        self.operation = operation
        self.status = status
        self.dataset = dataset
        self.source = source

    @classmethod
    def by_id(cls, id):
        """ Return the run with the given primary key, or None. """
        return db.session.query(cls).filter_by(id=id).first()

    def __repr__(self):
        # ``source`` is optional, so it must not be dereferenced blindly:
        # a repr that raises makes logging and debugging harder.
        source = self.source
        source_id = source.id if source is not None else None
        return "<Run(%s,%s)>" % (source_id, self.id)
예제 #3
0
    def init(self):
        """ Set up the in-memory SQLAlchemy description of this dataset.

        Binds the dataset to the application engine with a fresh
        ``MetaData``, declares the ``entry`` table through
        ``_init_table`` and lets every field initialise its column.
        Per the original contract nothing is created in the database
        here; this has to run both before the data is accessed and
        before the model is physically generated.
        """
        self.bind = engine = db.engine
        self.meta = metadata = db.MetaData()
        metadata.bind = engine

        self._init_table(metadata, self.name, 'entry',
                         id_type=db.Unicode(42))
        for field in self.fields:
            field.column = field.init(metadata, self.table)
        self.alias = self.table.alias('entry')
예제 #4
0
class Account(db.Model):
    """ A user account, persisted in the ``account`` table. """
    __tablename__ = 'account'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode(255), unique=True)
    fullname = db.Column(db.Unicode(2000))
    email = db.Column(db.Unicode(2000))
    password = db.Column(db.Unicode(2000))
    api_key = db.Column(db.Unicode(2000), default=make_uuid)
    admin = db.Column(db.Boolean, default=False)
    script_root = db.Column(db.Unicode(2000))
    terms = db.Column(db.Boolean, default=False)

    # Many-to-many link to datasets; the reverse side is exposed on the
    # dataset as the dynamic ``managers`` collection.
    datasets = db.relationship(Dataset,
                               secondary=account_dataset_table,
                               backref=db.backref('managers', lazy='dynamic'))

    def __init__(self):
        """ Create an empty account; no attributes are set here. """

    @property
    def display_name(self):
        """ The full name when it is set, otherwise the account name. """
        if self.fullname:
            return self.fullname
        return self.name

    @property
    def token(self):
        """ Hex digest derived from the API key and, if set, the password.

        The digest changes whenever either of the two values changes.
        """
        # NOTE(review): a ``str`` key without ``digestmod`` only works on
        # Python 2 -- confirm the target version before porting.
        digest = hmac.new('')
        digest.update(self.api_key)
        secret = self.password
        if secret:
            digest.update(secret)
        return digest.hexdigest()

    @classmethod
    def by_name(cls, name):
        """ Return the first account whose ``name`` matches, or None. """
        accounts = db.session.query(cls)
        return accounts.filter(cls.name == name).first()

    @classmethod
    def by_email(cls, email):
        """ Return the first account with the given e-mail, or None. """
        accounts = db.session.query(cls)
        return accounts.filter(cls.email == email).first()

    @classmethod
    def by_api_key(cls, api_key):
        """ Return the first account with the given API key, or None. """
        accounts = db.session.query(cls)
        return accounts.filter(cls.api_key == api_key).first()

    def as_dict(self):
        """ Return name, fullname, email and admin flag as a dict.

        The password and the API key are not part of the result.
        """
        exported = ('name', 'fullname', 'email', 'admin')
        return dict((attr, getattr(self, attr)) for attr in exported)
예제 #5
0
class Dataset(TableHandler, db.Model):
    """ The dataset is the core entity of any access to data. All
    requests to the actual data store are routed through it, as well
    as data loading and model generation.

    The dataset keeps an in-memory representation of the data model
    (including all dimensions and measures) which can be used to
    generate necessary queries.
    """
    __tablename__ = 'dataset'

    # Descriptive metadata; ``name`` is unique across datasets.
    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode(255), unique=True)
    label = db.Column(db.Unicode(2000))
    description = db.Column(db.Unicode())
    currency = db.Column(db.Unicode())
    default_time = db.Column(db.Unicode())
    schema_version = db.Column(db.Unicode())
    entry_custom_html = db.Column(db.Unicode())
    ckan_uri = db.Column(db.Unicode())
    category = db.Column(db.Unicode())
    serp_title = db.Column(db.Unicode(), nullable=True)
    serp_teaser = db.Column(db.Unicode(), nullable=True)
    private = db.Column(db.Boolean, default=False)
    # ``updated_at`` is filled on insert and refreshed on every update.
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime,
                           default=datetime.utcnow,
                           onupdate=datetime.utcnow)
    # The model definition minus its ``dataset`` section (see __init__);
    # stored as a mutation-tracked JSON dict.
    data = db.Column(MutableDict.as_mutable(JSONType), default=dict)

    # Expose the ``code`` of the related ``_languages`` / ``_territories``
    # objects as plain collections.
    languages = db.association_proxy('_languages', 'code')
    territories = db.association_proxy('_territories', 'code')

    def __init__(self, data):
        """ Build a dataset from a model description.

        ``data`` must contain a ``dataset`` section holding the
        metadata; that section is split off and copied onto the
        attributes, everything else is kept in ``self.data``. The
        caller's dict is not modified (a shallow copy is used).
        """
        remainder = data.copy()
        dataset = remainder.pop('dataset')
        self.data = remainder

        plain_attrs = ('label', 'name', 'description', 'currency',
                       'category', 'serp_title', 'serp_teaser',
                       'default_time', 'entry_custom_html')
        for attr in plain_attrs:
            setattr(self, attr, dataset.get(attr))
        self.languages = dataset.get('languages', [])
        self.territories = dataset.get('territories', [])
        self.ckan_uri = dataset.get('ckan_uri')
        self._load_model()

    @property
    def model(self):
        """ A copy of ``self.data`` with the ``dataset`` section (taken
        from ``as_dict()``) put back in. """
        description = self.data.copy()
        description.update(dataset=self.as_dict())
        return description

    @property
    def mapping(self):
        """ The ``mapping`` section of the model, or an empty dict when
        the model has none. """
        fallback = {}
        return self.data.get('mapping', fallback)

    @db.reconstructor
    def _load_model(self):
        """ Rebuild the in-memory dimension and measure objects from the
        mapping.

        Runs from ``__init__`` and, through the reconstructor hook,
        whenever SQLAlchemy loads the dataset from the store.
        """
        # Attach the lists first so they are already reachable from
        # ``self`` while the individual fields are being constructed.
        self.dimensions = dimensions = []
        self.measures = measures = []

        for name, spec in self.mapping.items():
            kind = spec.get('type')
            if kind == 'measure' or name == 'amount':
                measures.append(Measure(self, name, spec))
            elif kind == 'date' or \
                    (name == 'time' and spec.get('datatype') == 'date'):
                dimensions.append(DateDimension(self, name, spec))
            elif kind in ('value', 'attribute'):
                dimensions.append(AttributeDimension(self, name, spec))
            else:
                # Everything else is treated as a compound dimension.
                dimensions.append(CompoundDimension(self, name, spec))

        self.init()
        # Unknown until ``is_generated`` checks the database.
        self._is_generated = None

    def __getitem__(self, name):
        """ Access a field (dimension or measure) by name.

        Raises ``KeyError`` carrying the requested name when no field
        matches, so the failing lookup is visible in tracebacks.
        """
        for field in self.fields:
            if field.name == name:
                return field
        raise KeyError(name)

    def __contains__(self, name):
        """ Tell whether a field called ``name`` exists, using the same
        lookup as ``self[name]``. """
        try:
            self[name]
        except KeyError:
            return False
        return True

    @property
    def fields(self):
        """ A new list with all dimensions followed by all measures. """
        combined = list(self.dimensions)
        combined.extend(self.measures)
        return combined

    @property
    def compounds(self):
        """ Return only compound dimensions, as a list.

        A list is built explicitly so the result can be indexed,
        measured and iterated more than once on every Python version
        (``filter()`` yields a one-shot iterator on Python 3).
        """
        return [d for d in self.dimensions
                if isinstance(d, CompoundDimension)]

    @property
    def facet_dimensions(self):
        """ The dimensions whose ``facet`` attribute is truthy. """
        faceted = []
        for dimension in self.dimensions:
            if dimension.facet:
                faceted.append(dimension)
        return faceted

    def init(self):
        """ Set up the in-memory SQLAlchemy description of this dataset.

        Binds the dataset to the application engine with a fresh
        ``MetaData``, declares the ``entry`` table through
        ``_init_table`` and lets every field initialise its column.
        Per the original contract nothing is created in the database
        here; this has to run both before the data is accessed and
        before the model is physically generated.
        """
        self.bind = engine = db.engine
        self.meta = metadata = db.MetaData()
        metadata.bind = engine

        self._init_table(metadata, self.name, 'entry',
                         id_type=db.Unicode(42))
        for field in self.fields:
            field.column = field.init(metadata, self.table)
        self.alias = self.table.alias('entry')

    def generate(self):
        """ Physically create what this dataset needs to hold data.

        Every field generates its own part first, then a foreign key
        constraint named ``fk_<dataset>_<dimension>`` is attached to
        the entry table for each compound dimension, and finally the
        entry table itself is generated.
        """
        for field in self.fields:
            field.generate(self.meta, self.table)

        compound_dims = [dim for dim in self.dimensions
                         if isinstance(dim, CompoundDimension)]
        for dim in compound_dims:
            constraint = ForeignKeyConstraint(
                [dim.name + '_id'],
                [dim.table.name + '.id'],
                name='fk_' + self.name + '_' + dim.name)
            self.table.append_constraint(constraint)

        self._generate_table()
        self._is_generated = True

    @property
    def is_generated(self):
        """ Whether the entry table exists.

        The database is only asked while the cached state is None;
        afterwards the cached answer is returned.
        """
        state = self._is_generated
        if state is None:
            state = self._is_generated = self.table.exists()
        return state

    @property
    def has_badges(self):
        """ True when at least one badge is attached to the dataset. """
        badge_total = self.badges.count()
        return bool(badge_total)

    def commit(self):
        """ Do nothing; kept as a no-op so existing callers keep
        working. """
        return None

    def _make_key(self, data):
        """ Derive the identifier of an entry from its key fields.

        The dataset name and the values of all fields flagged as
        ``key`` are hashed together. Unlike an auto-increment id the
        result stays the same across repeated loads, which keeps entry
        URIs stable.
        """
        def identity(value):
            # Dict values are represented by their name, else their id.
            if isinstance(value, dict):
                return value.get('name', value.get('id'))
            return value

        uniques = [self.name]
        uniques.extend(identity(data.get(field.name))
                       for field in self.fields if field.key)
        return hash_values(uniques)

    def load(self, data):
        """ Store one entry given in the mapping source format.

        ``data`` must have a value for every field (a missing one
        raises ``KeyError``). Each field loads its own value and
        contributes columns to the row, which is then upserted under
        the key computed by ``_make_key``.
        """
        row = {}
        for field in self.fields:
            columns = field.load(self.bind, data[field.name])
            row.update(columns)
        row['id'] = self._make_key(data)
        self._upsert(self.bind, row, ['id'])

    def flush(self):
        """ Empty the dataset while keeping its table structure.

        Every dimension is flushed first, then the dataset's own table
        through ``_flush``.
        """
        for dim in self.dimensions:
            dim.flush(self.bind)
        self._flush(self.bind)

    def drop(self):
        """ Remove the tables made by ``generate()``, data included.

        The dataset's own table goes first (``_drop``), then each
        dimension drops its part; afterwards the dataset is marked as
        not generated.
        """
        self._drop(self.bind)
        for dim in self.dimensions:
            dim.drop(self.bind)
        self._is_generated = False

    def key(self, key):
        """ Resolve ``key`` to an aliased column usable in a query.

        ``key`` names either a simple field (e.g. ``time``) or an
        attribute of a dimension (e.g. ``to.label``). For a dimension
        with its own alias the attribute defaults to ``name`` when
        none is given. Unknown fields raise ``KeyError``.
        """
        field_name, _, attr = key.partition('.')
        dimension = self[field_name]
        if not hasattr(dimension, 'alias'):
            # Simple field: the column lives on the entry table itself.
            return self.alias.c[dimension.column.name]
        attr_name = dimension[attr].column.name if attr else 'name'
        return dimension.alias.c[attr_name]

    def entries(self,
                conditions="1=1",
                order_by=None,
                limit=None,
                offset=0,
                step=10000,
                fields=None):
        """ Yield the entries of this dataset, fully denormalised.

        Each row is passed through ``decode_row`` before it is yielded.
        Nothing is yielded when the dataset has not been generated.

        ``conditions``
            Where clause handed to the select.
        ``order_by``
            Ordering; defaults to ascending entry id.
        ``limit``
            Maximum number of entries, or None for all of them.
        ``offset``
            Number of entries to skip at the start.
        ``step``
            Number of entries fetched per query.
        ``fields``
            Fields to select; defaults to all fields of the dataset.
        """
        if not self.is_generated:
            return

        selected = self.fields if fields is None else fields

        # Join in only the dimensions that were actually asked for.
        joins = self.alias
        for dimension in self.dimensions:
            if dimension in selected:
                joins = dimension.join(joins)

        columns = [field.selectable for field in selected]
        columns.append(self.alias.c.id)

        # Ordering by id keeps consecutive windows consistent.
        ordering = [self.alias.c.id.asc()] if order_by is None else order_by

        for window in count():
            consumed = step * window
            window_size = step
            if limit is not None:
                window_size = min(limit - consumed, step)
            if window_size <= 0:
                break

            query = db.select(columns,
                              conditions,
                              joins,
                              order_by=ordering,
                              use_labels=True,
                              limit=window_size,
                              offset=offset + consumed)
            rp = self.bind.execute(query)

            row = rp.fetchone()
            if row is None:
                # An empty window means there is nothing left to read.
                return
            while row is not None:
                yield decode_row(row, self)
                row = rp.fetchone()

    def aggregate(self,
                  measures=['amount'],
                  drilldowns=[],
                  cuts=[],
                  page=1,
                  pagesize=10000,
                  order=[]):
        """ Query the dataset for a subset of cells based on cuts and
        drilldowns. It returns a structure with a list of drilldown items
        and a summary about the slice cut out by the query.

        Three queries are executed: the overall sums, the number of
        drilldown groups (only when there are drilldowns) and the
        requested page of drilldown rows.

        ``measures``
            The numeric units to be aggregated over, defaults to
            [``amount``]. Each one is summed. (type: `list`)
        ``drilldowns``
            Dimensions to drill down to; they become the group-by
            columns. ``year`` and ``month`` are accepted as shortcuts
            for attributes of the ``time`` dimension. (type: `list`)
        ``cuts``
            Specification what to cut from the cube. This is a
            `list` of `two-tuples` where the first item is the dimension
            and the second item is the value to cut from. It is turned into
            a query where multiple cuts for the same dimension are combined
            to an *OR* query and then the queries for the different
            dimensions are combined to an *AND* query.
        ``page``
            Page the drilldown result and return page number *page*
            (counting starts at 1).
            type: `int`
        ``pagesize``
            Page the drilldown result into pages of size *pagesize*.
            type: `int`
        ``order``
            Sort the result. This is a `list` of two-`tuples` where the
            first element is the *dimension* or measure and the second
            element is the order (`False` for ascending, `True` for
            descending). When it is `None` or empty (*default*) the
            result is sorted by each measure, descending.
            Type: `list` of two-`tuples`.

        NOTE(review): the list defaults are shared between calls. This
        method only reads or rebinds them and never mutates them in
        place, so it is safe as written -- keep it that way.

        Raises:

        :exc:`KeyError`
            If a drilldown, cut or order dimension is not a field of
            this dataset. The ``time`` dimension is looked up
            unconditionally, so a dataset without it also raises.

        NOTE(review): earlier documentation also listed
        :exc:`ValueError` for a cube that is not yet computed; nothing
        in this method raises it explicitly.

        Returns: A `dict` containing the drilldown and the summary:

          {"drilldown": [
              {"num_entries": 5545,
               "amount": 41087379002.0,
               "cofog1": {"description": "",
                          "label": "Economic affairs"}},
              ... ]
           "summary": {"amount": 7353306450299.0,
                       "num_entries": 133612}}

        The summary additionally carries ``currency``,
        ``num_drilldowns``, ``page``, ``pages`` and ``pagesize``. The
        exact shape of the drilldown items is produced by
        ``decode_row``.
        """

        # Get the joins (aka alias) and the dataset
        joins = alias = self.alias
        dataset = self

        # Aggregation fields are all of the measures, so we create individual
        # summary fields with the sum function of SQLAlchemy
        fields = [db.func.sum(alias.c[m]).label(m) for m in measures]
        # We append an aggregation field that counts the number of entries
        fields.append(db.func.count(alias.c.id).label("entries"))
        # Create a copy of the statistics fields (for later)
        stats_fields = list(fields)

        # Create label map for time columns (year and month) for lookup
        # since they are found under the time attribute
        labels = {
            'year': dataset['time']['year'].column_alias.label('year'),
            'month': dataset['time']['yearmonth'].column_alias.label('month'),
        }

        # Get the dimensions we're interested in. These would be the drilldowns
        # and the cuts. For compound dimensions we are only interested in the
        # most significant one (e.g. for from.name we're interested in from)
        dimensions = drilldowns + [k for k, v in cuts]
        dimensions = [d.split('.')[0] for d in dimensions]

        # Loop over the dimensions as a set (to avoid multiple occurrences)
        for dimension in set(dimensions):
            # If the dimension is year or month we're interested in 'time'
            if dimension in labels:
                dimension = 'time'
            # If the dimension table isn't in the automatic joins we add it
            if dimension not in [c.table.name for c in joins.columns]:
                joins = dataset[dimension].join(joins)

        # Drilldowns are performed using group_by SQL functions
        group_by = []
        for key in drilldowns:
            # If drilldown is in labels we append its mapped column to fields
            if key in labels:
                column = labels[key]
                group_by.append(column)
                fields.append(column)
            else:
                # Get the column from the dataset
                column = dataset.key(key)
                # If the drilldown is a compound dimension or the column's
                # table is in the joins we're already fetching the column so
                # we just append it to fields and the group_by
                if '.' in key or column.table == alias:
                    fields.append(column)
                    group_by.append(column)
                else:
                    # If not we add the column table to the fields and add all
                    # of that table's columns to the group_by
                    fields.append(column.table)
                    for col in column.table.columns:
                        group_by.append(col)

        # Cuts are managed using AND statements and we use a dict with set as
        # the default value to create the filters (cut on various values)
        conditions = db.and_()
        filters = defaultdict(set)

        for key, value in cuts:
            # If the key is in labels (year or month) we get the mapped column
            # else we get the column from the dataset
            if key in labels:
                column = labels[key]
            else:
                column = dataset.key(key)
            # We add the value to the set for that particular column
            filters[column].add(value)

        # Loop over the columns in the filter and add that to the conditions
        # For every value in the set we create an OR statement so we get e.g.
        # year=2007 AND (from.who == 'me' OR from.who == 'you')
        for attr, values in filters.items():
            conditions.append(db.or_(*[attr == v for v in values]))

        # Ordering can be set by a parameter or ordered by measures by default
        order_by = []
        # If no order is defined we default to order of the measures in the
        # order they occur (furthest to the left is most significant)
        if order is None or not len(order):
            order = [(m, True) for m in measures]

        # We loop through the order list to add the columns themselves
        for key, direction in order:
            # If it's a part of the measures we have to order by the
            # aggregated values (the sum of the measure)
            if key in measures:
                column = db.func.sum(alias.c[key]).label(key)
            # If it's in the labels we have to get the mapped column
            elif key in labels:
                column = labels[key]
            # ...if not we just get the column from the dataset
            else:
                column = dataset.key(key)
            # We append the column and set the direction (True == descending)
            order_by.append(column.desc() if direction else column.asc())

        # query 1: get overall sums.
        # Here we use the stats_fields we saved earlier
        query = db.select(stats_fields, conditions, joins)
        rp = dataset.bind.execute(query)
        # Execute the query and turn the row into a list so we can pop the
        # entry count (the last column) and then zip the measures and the
        # totals together
        stats = list(rp.fetchone())
        num_entries = stats.pop()
        total = zip(measures, stats)

        # query 2: get total count of drilldowns
        if len(group_by):
            # Select 1 for each group in the group_by and count them
            query = db.select(['1'], conditions, joins, group_by=group_by)
            query = db.select([db.func.count('1')], '1=1', query.alias('q'))
            rp = dataset.bind.execute(query)
            num_drilldowns, = rp.fetchone()
        else:
            # Without drilldowns the result is treated as a single group
            num_drilldowns = 1

        # The drilldown result list
        drilldown = []
        # The offset in the db, based on the page and pagesize (we have to
        # modify it since page counts start from 1 but we count from 0)
        offset = int((page - 1) * pagesize)

        # query 3: get the actual data
        query = db.select(fields,
                          conditions,
                          joins,
                          order_by=order_by,
                          group_by=group_by,
                          use_labels=True,
                          limit=pagesize,
                          offset=offset)
        rp = dataset.bind.execute(query)

        while True:
            # Get each row in the db result and append it, decoded, to the
            # drilldown result (decoding is delegated to decode_row)
            row = rp.fetchone()
            if row is None:
                break
            result = decode_row(row, dataset)
            drilldown.append(result)

        # Create the summary based on the stats_fields and other things
        # First we add the total for each measure in the root of the
        # summary (watch out!) and then we add various other, self-explanatory
        # statistics such as page, number of entries. The currency value is
        # strange since it's redundant for multiple measures but is left as is
        # for backwards compatibility
        summary = {key: value for (key, value) in total}
        summary.update({
            'num_entries':
            num_entries,
            'currency': {m: dataset.currency
                         for m in measures},
            'num_drilldowns':
            num_drilldowns,
            'page':
            page,
            # NOTE(review): a pagesize of 0 divides by zero here
            'pages':
            int(math.ceil(num_drilldowns / float(pagesize))),
            'pagesize':
            pagesize
        })

        return {'drilldown': drilldown, 'summary': summary}

    def timerange(self):
        """
        Get the timerange of the dataset (based on the time attribute).
        Returns a tuple of (first timestamp, last timestamp) where timestamp
        is a datetime object, or None when no value is available.

        This is a best-effort lookup: if the range cannot be determined
        (for instance there is no ``time`` field, the query fails or a
        value is not in ``YYYY-MM-DD`` form) ``(None, None)`` is returned
        instead of raising.
        """
        try:
            # Get the time column
            time = self.key('time')
            # We use SQL's min and max functions to get the timestamps
            query = db.session.query(db.func.min(time), db.func.max(time))
            # We just need one result to get min and max time. A tuple is
            # built so both code paths return the same type.
            return tuple(
                datetime.strptime(date, '%Y-%m-%d') if date else None
                for date in query.one())
        except Exception:
            # Deliberately broad, but no longer a bare ``except`` that
            # would also swallow SystemExit and KeyboardInterrupt.
            return (None, None)

    def __repr__(self):
        """ Debug representation: name, dimensions and measures. """
        parts = (self.name, self.dimensions, self.measures)
        return "<Dataset(%s:%s:%s)>" % parts

    def __len__(self):
        """ Number of entries, or 0 when the dataset has not been
        generated yet. """
        if self.is_generated:
            result = self.bind.execute(self.alias.count())
            return result.fetchone()[0]
        return 0

    def as_dict(self):
        """ Return the dataset metadata as a dict.

        Besides the plain attributes the result has the creation and
        modification timestamps, the languages and territories as
        lists and the short dict form of every badge.
        """
        plain_attrs = ('label', 'name', 'description', 'default_time',
                       'schema_version', 'currency', 'category',
                       'serp_title', 'serp_teaser')
        result = dict((attr, getattr(self, attr)) for attr in plain_attrs)
        result['timestamps'] = {
            'created': self.created_at,
            'last_modified': self.updated_at
        }
        result['languages'] = list(self.languages)
        result['territories'] = list(self.territories)
        result['badges'] = [badge.as_dict(short=True)
                            for badge in self.badges]
        return result

    @classmethod
    def all_by_account(cls, account):
        """ Query available datasets based on dataset visibility.

        ``account`` is the requesting account or None for an anonymous
        request. Anonymous requests see datasets that are not private,
        an admin account sees every dataset, and any other account
        sees the non-private datasets plus the ones it manages.

        Returns a query ordered by label, ascending.
        """
        q = db.session.query(cls)
        # The visibility rules are expressed as branches rather than as
        # raw SQL string fragments ("1=1" / "1=2"), which SQLAlchemy
        # does not accept as filter criteria unless wrapped in text().
        if account is None:
            q = q.filter(cls.private == false())
        elif not account.admin:
            q = q.filter(db.or_(
                cls.private == false(),
                cls.managers.any(type(account).id == account.id)))
        # An admin gets no filter at all.
        q = q.order_by(cls.label.asc())
        return q

    @classmethod
    def by_name(cls, name):
        """ Return the first dataset whose ``name`` matches, or None. """
        datasets = db.session.query(cls)
        return datasets.filter(cls.name == name).first()
예제 #6
0
class View(db.Model):
    """ A saved configuration of a visualisation widget, attached to a
    dataset and optionally to an account. """

    __tablename__ = 'view'

    id = db.Column(db.Integer, primary_key=True)
    widget = db.Column(db.Unicode(2000))
    name = db.Column(db.Unicode(2000))
    label = db.Column(db.Unicode(2000))
    description = db.Column(db.Unicode())
    state = db.Column(MutableDict.as_mutable(JSONType), default=dict)
    public = db.Column(db.Boolean, default=False)

    # ``updated_at`` has no insert default; it is only set on updates.
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, onupdate=datetime.utcnow)

    dataset_id = db.Column(db.Integer, db.ForeignKey('dataset.id'))
    account_id = db.Column(db.Integer,
                           db.ForeignKey('account.id'),
                           nullable=True)

    # Both relationships add a dynamic ``views`` collection on the other
    # side; the cascade settings apply to that collection.
    dataset = db.relationship(Dataset,
                              backref=db.backref(
                                  'views',
                                  cascade='all,delete,delete-orphan',
                                  lazy='dynamic'))

    account = db.relationship(Account,
                              backref=db.backref(
                                  'views',
                                  cascade='all,delete,delete-orphan',
                                  lazy='dynamic'))

    def __init__(self):
        """ Create an empty view; no attributes are set here. """

    @classmethod
    def by_id(cls, id):
        """ Return the view with the given primary key, or None. """
        views = db.session.query(cls)
        return views.filter(cls.id == id).first()

    @classmethod
    def by_name(cls, dataset, name):
        """ Return the first view called ``name`` within ``dataset``,
        or None. """
        views = db.session.query(cls)
        views = views.filter_by(name=name).filter_by(dataset=dataset)
        return views.first()

    @classmethod
    def all_by_dataset(cls, dataset):
        """ Return a query over all views of ``dataset``. """
        views = db.session.query(cls)
        return views.filter_by(dataset=dataset)

    def as_dict(self):
        """ Return the view as a dict.

        The dataset and the account are represented by their names;
        ``account`` is None for a view without an account.
        """
        plain_attrs = ('id', 'widget', 'name', 'label', 'description',
                       'state', 'public')
        result = dict((attr, getattr(self, attr)) for attr in plain_attrs)
        result['dataset'] = self.dataset.name
        owner = self.account
        result['account'] = owner.name if owner else None
        return result

    def __repr__(self):
        parts = (self.dataset.name, self.name)
        return "<View(%s,%s)>" % parts
예제 #7
0
class Dataset(TableHandler, db.Model):
    """ The dataset is the core entity of any access to data. All
    requests to the actual data store are routed through it, as well
    as data loading and model generation.

    The dataset keeps an in-memory representation of the data model
    (including all dimensions and measures) which can be used to
    generate necessary queries.
    """
    __tablename__ = 'dataset'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode(255), unique=True)
    label = db.Column(db.Unicode(2000))
    description = db.Column(db.Unicode())
    currency = db.Column(db.Unicode())
    default_time = db.Column(db.Unicode())
    schema_version = db.Column(db.Unicode())
    entry_custom_html = db.Column(db.Unicode())
    ckan_uri = db.Column(db.Unicode())
    private = db.Column(db.Boolean, default=False)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, onupdate=datetime.utcnow)
    # The model description minus its ``dataset`` section, which
    # ``__init__`` unpacks into the columns above.
    data = db.Column(JSONType, default=dict)

    # Proxies exposing the ``code`` attribute of the objects stored under
    # ``_languages`` / ``_territories`` (declared elsewhere).
    languages = db.association_proxy('_languages', 'code')
    territories = db.association_proxy('_territories', 'code')

    def __init__(self, data):
        """ Create a dataset from a model description.

        ``data``
            A `dict` that must contain a ``dataset`` key holding the
            dataset metadata (``name``, ``label``, ...). Everything else
            (e.g. ``mapping``) is kept in ``self.data``. The dict is
            copied shallowly, so the caller's top-level dict is not
            modified.

        Raises :exc:`KeyError` if ``data`` has no ``dataset`` key.
        """
        self.data = data.copy()
        dataset = self.data['dataset']
        del self.data['dataset']
        self.label = dataset.get('label')
        self.name = dataset.get('name')
        self.description = dataset.get('description')
        self.currency = dataset.get('currency')
        self.default_time = dataset.get('default_time')
        self.entry_custom_html = dataset.get('entry_custom_html')
        self.languages = dataset.get('languages', [])
        self.territories = dataset.get('territories', [])
        self.ckan_uri = dataset.get('ckan_uri')
        self._load_model()

    @property
    def model(self):
        """ The stored model description with the ``dataset`` section
        (as produced by :meth:`as_dict`) added back in. """
        model = self.data.copy()
        model['dataset'] = self.as_dict()
        return model

    @property
    def mapping(self):
        """ The ``mapping`` section of the model, or an empty `dict`. """
        return self.data.get('mapping', {})

    @db.reconstructor
    def _load_model(self):
        """ Construct the in-memory object representation of this
        dataset's dimension and measures model.

        This is called upon initialization and deserialization of
        the dataset from the SQLAlchemy store.
        """
        self.dimensions = []
        self.measures = []
        for dim, data in self.mapping.items():
            # A field is a measure if declared so, or if it is called
            # ``amount``; everything else becomes some kind of dimension.
            if data.get('type') == 'measure' or dim == 'amount':
                self.measures.append(Measure(self, dim, data))
                continue
            elif data.get('type') == 'date' or \
                (dim == 'time' and data.get('datatype') == 'date'):
                dimension = DateDimension(self, dim, data)
            elif data.get('type') in ['value', 'attribute']:
                dimension = AttributeDimension(self, dim, data)
            else:
                dimension = CompoundDimension(self, dim, data)
            self.dimensions.append(dimension)
        self.init()
        # ``None`` means "unknown"; resolved lazily by ``is_generated``.
        self._is_generated = None

    def __getitem__(self, name):
        """ Access a field (dimension or measure) by name.

        Raises :exc:`KeyError` (carrying ``name``) if no such field
        exists.
        """
        for field in self.fields:
            if field.name == name:
                return field
        raise KeyError(name)

    @property
    def fields(self):
        """ Both the dimensions and measures in this dataset. """
        return self.dimensions + self.measures

    @property
    def compounds(self):
        """ Return only compound dimensions, as a `list`. """
        # A list comprehension (rather than ``filter``) yields a real,
        # reusable list on both Python 2 and Python 3.
        return [d for d in self.dimensions
                if isinstance(d, CompoundDimension)]

    def init(self):
        """ Create a SQLAlchemy model for the current dataset model,
        without creating the tables and columns. This needs to be
        called both for access to the data and in order to generate
        the model physically. """
        self.bind = db.engine  #.connect()
        self.meta = db.MetaData()
        #self.tx = self.bind.begin()
        self.meta.bind = db.engine

        # NOTE(review): ``_init_table`` is not defined in this class;
        # presumably inherited from ``TableHandler`` and responsible for
        # setting ``self.table`` -- confirm.
        self._init_table(self.meta, self.name, 'entry', id_type=db.Unicode(42))
        for field in self.fields:
            field.init(self.meta, self.table)
        self.alias = self.table.alias('entry')

    def generate(self):
        """ Create the tables and columns necessary for this dataset
        to keep data.
        """
        for field in self.fields:
            field.generate(self.meta, self.table)
        self._generate_table()
        self._is_generated = True

    @property
    def is_generated(self):
        """ Whether the entry table exists. The answer is looked up once
        via ``self.table.exists()`` and cached afterwards. """
        if self._is_generated is None:
            self._is_generated = self.table.exists()
        return self._is_generated

    def commit(self):
        """ Currently a no-op: the transaction handling below is
        commented out. """
        pass
        #self.tx.commit()
        #self.tx = self.bind.begin()

    def _make_key(self, data):
        """ Generate a unique identifier for an entry. This is better
        than SQL auto-increment because it is stable across multiple
        loads and thus creates stable URIs for entries.

        The key is ``hash_values`` applied to the dataset name followed
        by the values of all fields flagged as ``key``; for `dict`
        values the ``name`` (falling back to ``id``) is used.
        """
        uniques = [self.name]
        for field in self.fields:
            if not field.key:
                continue
            obj = data.get(field.name)
            if isinstance(obj, dict):
                obj = obj.get('name', obj.get('id'))
            uniques.append(obj)
        return hash_values(uniques)

    def load(self, data):
        """ Handle a single entry of data in the mapping source format,
        i.e. with all needed columns. This will propagate to all dimensions
        and set values as appropriate.

        Raises :exc:`KeyError` if ``data`` lacks a value for any field.
        """
        entry = dict()
        for field in self.fields:
            field_data = data[field.name]
            entry.update(field.load(self.bind, field_data))
        entry['id'] = self._make_key(data)
        self._upsert(self.bind, entry, ['id'])

    def flush(self):
        """ Delete all data from the dataset tables but leave the table
        structure intact.
        """
        for dimension in self.dimensions:
            dimension.flush(self.bind)
        self._flush(self.bind)

    def drop(self):
        """ Drop all tables created as part of this dataset, i.e. by calling
        ``generate()``. This will of course also delete the data itself.
        """
        for dimension in self.dimensions:
            dimension.drop(self.bind)
        self._drop(self.bind)

    def key(self, key):
        """ For a given ``key``, find a column to identify it in a query.
        A ``key`` is either the name of a simple attribute (e.g. ``time``)
        or of an attribute of a complex dimension (e.g. ``to.label``). The
        returned key is using an alias, so it can be used in a query
        directly.

        Raises :exc:`KeyError` if the part before the dot does not name
        a field of this dataset.
        """
        attr = None
        if '.' in key:
            key, attr = key.split('.', 1)
        dimension = self[key]
        if hasattr(dimension, 'alias'):
            # Dimension with its own aliased table: default to ``name``.
            attr_name = dimension[attr].column.name if attr else 'name'
            return dimension.alias.c[attr_name]
        return self.alias.c[dimension.column.name]

    def entries(self,
                conditions="1=1",
                order_by=None,
                limit=None,
                offset=0,
                step=10000):
        """ Generate a fully denormalized view of the entries on this
        table. This view is nested so that each dimension will be a hash
        of its attributes.

        This is somewhat similar to the entries collection in the fully
        denormalized schema before OpenSpending 0.11 (MongoDB).

        ``conditions``
            Where-clause handed to ``db.select``; defaults to ``"1=1"``.
        ``order_by``
            Ordering for the query; defaults to ascending entry ``id``
            so that paging is stable.
        ``limit``
            Maximum number of entries to yield, or ``None`` for all.
        ``offset``
            Number of entries to skip.
        ``step``
            Number of rows fetched per query.

        Yields one `dict` per entry. Nothing is yielded if the dataset
        tables have not been generated.
        """
        if not self.is_generated:
            return

        joins = self.alias
        for d in self.dimensions:
            joins = d.join(joins)
        selects = [f.selectable for f in self.fields] + [self.alias.c.id]

        # enforce stable sorting:
        if order_by is None:
            order_by = [self.alias.c.id.asc()]

        # Page through the result ``step`` rows at a time.
        for i in count():
            qoffset = offset + (step * i)
            qlimit = step
            if limit is not None:
                qlimit = min(limit - (step * i), step)
            if qlimit <= 0:
                break

            query = db.select(selects,
                              conditions,
                              joins,
                              order_by=order_by,
                              use_labels=True,
                              limit=qlimit,
                              offset=qoffset)
            rp = self.bind.execute(query)

            first_row = True
            while True:
                row = rp.fetchone()
                if row is None:
                    # An empty page means the data is exhausted.
                    if first_row:
                        return
                    break
                first_row = False
                result = {}
                for k, v in row.items():
                    # Labels look like ``<table alias>_<column>``.
                    field, attr = k.split('_', 1)
                    field = field.replace(ALIAS_PLACEHOLDER, '_')
                    if field == 'entry':
                        result[attr] = v
                    else:
                        if field not in result:
                            result[field] = dict()

                            # TODO: backwards-compat?
                            if isinstance(self[field], CompoundDimension):
                                result[field]['taxonomy'] = self[
                                    field].taxonomy
                        result[field][attr] = v
                yield result

    def aggregate(self,
                  measure='amount',
                  drilldowns=None,
                  cuts=None,
                  page=1,
                  pagesize=10000,
                  order=None):
        """ Query the dataset for a subset of cells based on cuts and
        drilldowns. It returns a structure with a list of drilldown items
        and a summary about the slice cut by the query.

        ``measure``
            The numeric unit to be aggregated over, defaults to ``amount``.
        ``drilldowns``
            Dimensions to drill down to. (type: `list`)
        ``cuts``
            Specification what to cut from the cube. This is a
            `list` of `two-tuples` where the first item is the dimension
            and the second item is the value to cut from. It is turned into
            a query where multiple cuts for the same dimension are combined
            to an *OR* query and then the queries for the different
            dimensions are combined to an *AND* query.
        ``page``
            Page the drilldown result and return page number *page*.
            type: `int`
        ``pagesize``
            Page the drilldown result into page of size *pagesize*.
            type: `int`
        ``order``
            Sort the result based on the given dimensions.
            This may be `None` (*default*) or a `list` of two-`tuples`
            where the first element is the *dimension* and the second
            element is the order (`False` for ascending, `True` for
            descending). Without an explicit order the result is sorted
            by the measure, descending.
            Type: `list` of two-`tuples`.

        Besides field names, the pseudo-dimensions ``year`` and ``month``
        (taken from the ``time`` dimension) may be used as drilldown,
        cut or order keys.

        Raises:

        :exc:`KeyError`
            If a drilldown, cut or order key does not name a field of
            this dataset.

        Returns: A `dict` containing the drilldown and the summary::

          {"drilldown": [
              {"num_entries": 5545,
               "amount": 41087379002.0,
               "cofog1": {"description": "",
                          "label": "Economic affairs"}},
              ... ]
           "summary": {"amount": 7353306450299.0,
                       "num_entries": 133612,
                       "num_drilldowns": ...,
                       "page": ..., "pages": ..., "pagesize": ...}}

        The summary totals cover all drilldown rows, not just the
        returned page.
        """
        cuts = cuts or []
        drilldowns = drilldowns or []
        order = order or []
        joins = self.alias
        fields = [
            db.func.sum(self.alias.c[measure]).label(measure),
            db.func.count(self.alias.c.id).label("entries")
        ]
        labels = {
            'year': self['time']['year'].column_alias.label('year'),
            'month': self['time']['yearmonth'].column_alias.label('month'),
        }
        dimensions = set(drilldowns + [k for k, v in cuts] +
                         [o[0] for o in order])
        # Join in the table of every dimension that is referenced and
        # not yet part of the join.
        for dimension in dimensions:
            if dimension in labels:
                _name = 'time'
            else:
                _name = dimension.split('.')[0]
            if _name not in [c.table.name for c in joins.columns]:
                joins = self[_name].join(joins)

        group_by = []
        for key in dimensions:
            if key in labels:
                column = labels[key]
                group_by.append(column)
                fields.append(column)
            else:
                column = self.key(key)
                if '.' in key or column.table == self.alias:
                    fields.append(column)
                    group_by.append(column)
                else:
                    # Whole dimension requested: select and group by
                    # every column of its table.
                    fields.append(column.table)
                    for col in column.table.columns:
                        group_by.append(col)

        conditions = db.and_()
        filters = defaultdict(set)
        for key, value in cuts:
            if key in labels:
                column = labels[key]
            else:
                column = self.key(key)
            filters[column].add(value)
        for attr, values in filters.items():
            conditions.append(db.or_(*[attr == v for v in values]))

        order_by = []
        for key, direction in order:
            if key in labels:
                column = labels[key]
            else:
                column = self.key(key)
            order_by.append(column.desc() if direction else column.asc())

        query = db.select(fields,
                          conditions,
                          joins,
                          order_by=order_by or [measure + ' desc'],
                          group_by=group_by,
                          use_labels=True)
        summary = {measure: 0.0, 'num_entries': 0}
        drilldown = []
        rp = self.bind.execute(query)
        while True:
            row = rp.fetchone()
            if row is None:
                break
            result = {}
            for key, value in row.items():
                # Match the two aggregate columns by their exact label
                # first, so that a measure whose name contains an
                # underscore is not mistaken for a
                # ``<dimension>_<attribute>`` label below.
                if key == measure:
                    summary[measure] += value or 0
                    result[key] = value
                    continue
                if key == 'entries':
                    summary['num_entries'] += value or 0
                    result['num_entries'] = value
                    continue
                if '_' not in key:
                    # Plain label such as ``year`` or ``month``.
                    result[key] = value
                    continue
                dimension, attribute = key.split('_', 1)
                dimension = dimension.replace(ALIAS_PLACEHOLDER, '_')
                if dimension == 'entry':
                    result[attribute] = value
                else:
                    if dimension not in result:
                        result[dimension] = {}

                        # TODO: backwards-compat?
                        if isinstance(self[dimension], CompoundDimension):
                            result[dimension]['taxonomy'] = \
                                    self[dimension].taxonomy
                    result[dimension][attribute] = value
            drilldown.append(result)
        offset = ((page - 1) * pagesize)

        # do we really need all this:
        summary['num_drilldowns'] = len(drilldown)
        summary['page'] = page
        summary['pages'] = int(math.ceil(len(drilldown) / float(pagesize)))
        summary['pagesize'] = pagesize

        return {
            'drilldown': drilldown[offset:offset + pagesize],
            'summary': summary
        }

    def __repr__(self):
        return "<Dataset(%s:%s:%s)>" % (self.name, self.dimensions,
                                        self.measures)

    def times(self, attribute='year'):
        """ Get all distinct times mentioned in the dataset, sorted.

        ``attribute``
            Attribute of the ``time`` dimension to collect, defaults to
            ``year``.
        """
        # TODO: make this a more generic distinct_attribute function
        field = self['time'][attribute].column_alias
        query = db.select([field.label(attribute)],
                          self['time'].alias,
                          distinct=True)
        rp = self.bind.execute(query)
        return sorted([r[attribute] for r in rp.fetchall()])

    def __len__(self):
        """ Number of entries; ``0`` if the tables are not generated. """
        if not self.is_generated:
            return 0
        rp = self.bind.execute(self.alias.count())
        return rp.fetchone()[0]

    def as_dict(self):
        """ Return the dataset metadata as a plain `dict`. """
        return {
            'label': self.label,
            'name': self.name,
            'description': self.description,
            'default_time': self.default_time,
            'schema_version': self.schema_version,
            'currency': self.currency,
            'languages': list(self.languages),
            'territories': list(self.territories)
        }

    @classmethod
    def all_by_account(cls, account):
        """ Query available datasets based on dataset visibility.

        Non-private datasets are always included. When ``account`` is
        given, datasets it manages are included as well, and an admin
        account matches every dataset. Returns a query ordered by label.
        """
        # ``== False`` is intentional: it builds a SQL expression.
        criteria = [cls.private == False]
        if account is not None:
            criteria += [
                # Literal SQL that is always (admin) or never true.
                "1=1" if account.admin else "1=2",
                cls.managers.any(type(account).id == account.id)
            ]
        q = db.session.query(cls).filter(db.or_(*criteria))
        q = q.order_by(cls.label.asc())
        return q

    @classmethod
    def by_name(cls, name):
        """ Return the dataset called ``name``, or ``None``. """
        return db.session.query(cls).filter_by(name=name).first()
예제 #8
0
class Run(db.Model):
    """ A run is a generic grouping object for background operations
    that perform logging to the frontend. """

    __tablename__ = 'run'

    # Status values
    STATUS_RUNNING = 'running'
    STATUS_COMPLETE = 'complete'
    STATUS_FAILED = 'failed'
    STATUS_REMOVED = 'removed'

    # Operation values for database, two operations possible
    OPERATION_SAMPLE = 'sample'
    OPERATION_IMPORT = 'import'

    id = db.Column(db.Integer, primary_key=True)
    operation = db.Column(db.Unicode(2000))
    status = db.Column(db.Unicode(2000))
    time_start = db.Column(db.DateTime, default=datetime.utcnow)
    time_end = db.Column(db.DateTime)
    # Both foreign keys are nullable: a run need not have a dataset or
    # a source attached.
    dataset_id = db.Column(db.Integer,
                           db.ForeignKey('dataset.id'),
                           nullable=True)
    source_id = db.Column(db.Integer,
                          db.ForeignKey('source.id'),
                          nullable=True)

    # Each relationship adds a dynamic ``runs`` backref to its target,
    # ordered with the most recently started run first.
    dataset = db.relationship(Dataset,
                              backref=db.backref(
                                  'runs',
                                  order_by='Run.time_start.desc()',
                                  lazy='dynamic'))
    source = db.relationship(Source,
                             backref=db.backref(
                                 'runs',
                                 order_by='Run.time_start.desc()',
                                 lazy='dynamic'))

    def __init__(self, operation, status, dataset, source):
        """ Create a run.

        ``operation``
            What the run does, e.g. one of the ``OPERATION_*`` values.
        ``status``
            Initial status, e.g. one of the ``STATUS_*`` values.
        ``dataset``
            The dataset the run belongs to (may be ``None``).
        ``source``
            The source the run belongs to (may be ``None``).
        """
        self.operation = operation
        self.status = status
        self.dataset = dataset
        self.source = source

    @property
    def successful_sample(self):
        """
        Returns True if the run was a sample operation (not full import)
        and ran without failures.
        """
        return (self.operation == self.OPERATION_SAMPLE and
                self.status == self.STATUS_COMPLETE)

    @property
    def successful_load(self):
        """
        Returns True if the run was an import operation (not a sample)
        and ran without failures.
        """
        return (self.operation == self.OPERATION_IMPORT and
                self.status == self.STATUS_COMPLETE)

    @property
    def is_running(self):
        """
        Returns True if the run is currently running
        """
        return self.status == self.STATUS_RUNNING

    @classmethod
    def by_id(cls, id):
        """ Return the run with the given ``id``, or ``None``. """
        return db.session.query(cls).filter_by(id=id).first()

    def __repr__(self):
        # ``source_id`` is nullable, so the run may have no source; a
        # repr must not raise in that case.
        source = self.source
        source_id = source.id if source is not None else None
        return "<Run(%s,%s)>" % (source_id, self.id)
예제 #9
0
class Account(db.Model):
    """ A user account, including credentials, profile settings and the
    datasets it manages. """
    __tablename__ = 'account'

    id = db.Column(db.Integer, primary_key=True)
    name = db.Column(db.Unicode(255), unique=True)
    fullname = db.Column(db.Unicode(2000))
    email = db.Column(db.Unicode(2000))
    public_email = db.Column(db.Boolean, default=False)
    twitter_handle = db.Column(db.Unicode(140))
    public_twitter = db.Column(db.Boolean, default=False)
    password = db.Column(db.Unicode(2000))
    api_key = db.Column(db.Unicode(2000), default=make_uuid)
    admin = db.Column(db.Boolean, default=False)
    script_root = db.Column(db.Unicode(2000))
    terms = db.Column(db.Boolean, default=False)

    # Many-to-many link to datasets; the reverse side is available as
    # the dynamic ``managers`` backref on ``Dataset``.
    datasets = db.relationship(Dataset,
                               secondary=account_dataset_table,
                               backref=db.backref('managers', lazy='dynamic'))

    def __init__(self):
        """ Create an empty account; fields are assigned afterwards. """
        pass

    @property
    def display_name(self):
        """ The full name if set, otherwise the account name. """
        return self.fullname or self.name

    @property
    def token(self):
        """ Hex digest derived from the API key and, if set, the
        password.

        The digest is an HMAC-MD5 with an empty key. MD5 used to be the
        implicit default of ``hmac.new``; it is now named explicitly so
        that the same token is produced on Python versions where
        ``digestmod`` is mandatory.
        """
        # Imported locally so this block does not depend on a file-level
        # import being present.
        import hashlib

        def _as_bytes(value):
            # ``hmac`` works on bytes; encode text explicitly instead of
            # relying on an implicit (ASCII-only) conversion.
            if isinstance(value, bytes):
                return value
            return value.encode('utf-8')

        # NOTE(review): an HMAC with an empty key offers no secrecy
        # beyond the inputs themselves. The key is left unchanged because
        # changing it would alter every token -- confirm this is intended.
        h = hmac.new(b'', digestmod=hashlib.md5)
        h.update(_as_bytes(self.api_key))
        if self.password:
            h.update(_as_bytes(self.password))
        return h.hexdigest()

    @classmethod
    def by_name(cls, name):
        """ Return the account called ``name``, or ``None``. """
        return db.session.query(cls).filter_by(name=name).first()

    @classmethod
    def by_email(cls, email):
        """ Return the first account with this ``email``, or ``None``. """
        return db.session.query(cls).filter_by(email=email).first()

    @classmethod
    def by_api_key(cls, api_key):
        """ Return the account owning ``api_key``, or ``None``. """
        return db.session.query(cls).filter_by(api_key=api_key).first()

    def as_dict(self):
        """
        Return the dictionary representation of the account
        """

        # Dictionary will include name, fullname, email and the admin bit
        account_dict = {
            'name': self.name,
            'fullname': self.fullname,
            'email': self.email,
            'admin': self.admin
        }

        # If the user has a twitter handle we add it
        if self.twitter_handle is not None:
            account_dict['twitter'] = self.twitter_handle

        # Return the dictionary representation
        return account_dict