def Column(*args, **kwargs):
    """Wrap the standard Column to allow adding some FormAlchemy options to a
    model field -- basically ``label`` and ``renderer`` -- but all of the values
    are passed to :meth:`~formalchemy.fields.AbstractField.set`::

        >>> from sqlalchemy import Integer
        >>> from sqlalchemy.ext.declarative import declarative_base
        >>> from formalchemy import Column
        >>> Base = declarative_base()
        >>> class MyArticle(Base):
        ...     __tablename__ = 'myarticles'
        ...     id = Column(Integer, primary_key=True, label='My id')
        >>> MyArticle.__table__.c.id.info
        {'label': 'My id'}

    """
    info = kwargs.get('info', {})
    drop = set()
    for k, v in kwargs.items():
        if k in column_options:
            info[k] = v
            drop.add(k)
    for k in drop:
        del kwargs[k]
    if info:
        kwargs['info'] = info
    return SAColumn(*args, **kwargs)
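# Illustrative sketch of how the wrapper merges options into ``info``: any
# kwarg named in ``column_options`` (assumed here to include 'label', as the
# doctest above suggests) is moved into the column's ``info`` dict, merging
# with any ``info`` dict that was passed explicitly.
#
#   >>> col = Column(Integer, info={'readonly': True}, label='My id')
#   >>> sorted(col.info.items())
#   [('label', 'My id'), ('readonly', True)]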
class Remote(Base):

    __tablename__ = 'remote'

    id = SAColumn('rm_id', Integer, primary_key=True)
    short_name = SAColumn('rm_short_name', Text, index=True, unique=True)
    service = SAColumn('rm_service', Text, index=True)  # ambry, s3 or fs
    url = SAColumn('rm_url', Text)
    d_vid = SAColumn('rm_d_vid', String(20), ForeignKey('datasets.d_vid'), index=True)

    username = SAColumn('rm_username', Text, doc='Account username, the ARN for S3')
    access = SAColumn('rm_access', Text, doc='Access key or username')
    secret = SAColumn('rm_secret', Text, doc='Secret key or password')

    # These are deprecated. They are properties of a host, not a remote.
    docker_url = SAColumn('rm_docker_url', Text)

    # These are deprecated, and should be removed when docker support is changed.
    db_name = SAColumn('rm_db_name', Text)
    vol_name = SAColumn('rm_vol_name', Text)
    db_dsn = SAColumn('rm_db_dsn', Text)

    # Base virtual host name, applied to the docker host.
    virtual_host = SAColumn('rm_virtual_host', Text, doc='Virtual host name, for web proxy')

    data = SAColumn('rm_data', MutationDict.as_mutable(JSONEncodedObj))

    # Temp variables, not stored.
    account_accessor = None  # Set externally to allow access to the account credentials
    tr_db_password = None

    @property
    def api_token(self):  # old name
        return self.jwt_secret

    @property
    def access_key(self):  # Synonym, to have the same name as in the account record
        return self.access

    @property
    def is_api(self):
        return self.service in ('ambry', 'docker')

    @property
    def dict(self):
        """A dict that holds key/values for all of the properties in the object.

        :return:

        """
        from collections import OrderedDict

        d = OrderedDict([(p.key, getattr(self, p.key)) for p in self.__mapper__.attrs
                         if p.key not in ('data',)])

        if 'list' in self.data:
            d['bundle_count'] = len(self.data['list'])
        else:
            d['bundle_count'] = None

        if self.data:
            for k, v in self.data.items():
                d[k] = v

        return d

    @property
    def db_password(self):
        from ambry.util import parse_url_to_dict
        d = parse_url_to_dict(self.db_dsn)
        return d['password']

    @property
    def db_host(self):
        from ambry.util import parse_url_to_dict
        d = parse_url_to_dict(self.db_dsn)
        return d['hostname']

    @property
    def admin_pw(self):
        return self.data.get('admin_pw')  # Set in dockr.py in the ambry_admin module

    def _api_client(self):
        from ambry_client import Client
        from ambry.util import set_url_part

        username = '******'

        try:
            account = self.account_accessor(set_url_part(self.url, username=username))
        except KeyError:
            # NOTE: if the account lookup fails, the Client() call below raises an
            # UnboundLocalError, since 'account' is never bound.
            pass

        c = Client(self.url, username, account['secret'])

        return c

    @property
    def api_client(self):
        return self._api_client()

    def update(self):
        """Cache the list into the data section of the record."""
        from ambry.orm.exc import NotFoundError
        from requests.exceptions import ConnectionError, HTTPError
        from boto.exception import S3ResponseError

        d = {}

        try:
            for k, v in self.list(full=True):
                if not v:
                    continue

                d[v['vid']] = {
                    'vid': v['vid'],
                    'vname': v.get('vname'),
                    'id': v.get('id'),
                    'name': v.get('name')
                }

            self.data['list'] = d

        except (NotFoundError, ConnectionError, S3ResponseError, HTTPError) as e:
            raise RemoteAccessError('Failed to update {}: {}'.format(self.short_name, e))

    def list(self, full=False):
        """List all of the bundles in the remote."""
        if self.is_api:
            return self._list_api(full=full)
        else:
            return self._list_fs(full=full)

    def _list_fs(self, full=False):

        assert self.account_accessor

        from fs.errors import ResourceNotFoundError
        from os.path import join
        from json import loads

        remote = self._fs_remote(self.url)

        # HTTP can't list, so we have to use a cached collection of list entries.
        # Use 'ambry remote <remote> update-listing' to create the cache.
        if self.url.startswith('http'):
            try:
                for e in loads(remote.getcontents(os.path.join('_meta', 'list.json'))):
                    if full:
                        yield (e['vname'], e)
                    else:
                        yield e['vname']
                return
            except ResourceNotFoundError:
                return

        try:
            for e in remote.listdir('_meta/vname'):
                if full:
                    r = loads(remote.getcontents(join('_meta/vname', e)))
                    yield (e, r)
                else:
                    yield e
        except ResourceNotFoundError:
            # An old repo that doesn't have the _meta/vname values.
            for fn in remote.walkfiles(wildcard='*.db'):
                this_name = fn.strip('/').replace('/', '.').replace('.db', '')
                # NOTE: the original inverted this test, yielding the bare name
                # when full was True; this matches the other branches.
                if not full:
                    yield this_name
                else:
                    # There isn't any metadata support for these old repos.
                    yield (this_name, None)

    def _update_fs_list(self):
        """Cache the full list for http access. This creates a meta file that can be
        read all at once, rather than requiring a list operation like S3 access does."""
        from json import dumps

        full_list = [e[1] for e in self._list_fs(full=True)]

        remote = self._fs_remote(self.url)

        remote.setcontents(os.path.join('_meta', 'list.json'), dumps(full_list, indent=4))

    def _list_api(self, full=False):
        c = self._api_client()

        for d in c.list():
            if full:
                yield (d.name, d)
            else:
                yield d.name

    def find(self, ref):
        if self.is_api:
            return self._find_api(ref)
        else:
            return self._find_fs(ref)

    def _find_fs(self, ref):
        from fs.errors import ResourceNotFoundError
        from ambry.orm.exc import NotFoundError
        import json

        remote = self._fs_remote(self.url)

        path_parts = ['vname', 'vid', 'name', 'id']

        for p in path_parts:
            path = '/_meta/{}/{}'.format(p, ref)

            try:
                e = remote.getcontents(path)
                return json.loads(e)
            except ResourceNotFoundError:
                pass

        raise NotFoundError("Failed to find bundle for ref '{}' ".format(ref))

    def _find_api(self, ref):
        c = self._api_client()
        return c.dataset(ref)

    def checkin(self, package, no_partitions=False, force=False, cb=None):
        """Check in a bundle package to the remote.

        :param package: A Database, referencing a sqlite database holding the bundle
        :param cb: a two argument progress callback: cb(message, num_records)
        :return:

        """
        from ambry.orm.exc import NotFoundError

        if not os.path.exists(package.path):
            raise NotFoundError("Package path does not exist: '{}' ".format(package.path))

        if self.is_api:
            return self._checkin_api(package, no_partitions=no_partitions,
                                     force=force, cb=cb)
        else:
            return self._checkin_fs(package, no_partitions=no_partitions,
                                    force=force, cb=cb)

    def _checkin_fs(self, package, no_partitions=False, force=False, cb=None):
        from fs.errors import NoPathURLError, NoSysPathError
        from ambry.orm import Partition

        assert self.account_accessor

        remote = self._fs_remote(self.url)

        ds = package.package_dataset

        db_ck = ds.identity.cache_key + '.db'

        if cb:
            def cb_one_arg(n):
                cb('Uploading package', n)
        else:
            def cb_one_arg(n):
                logger.info('Uploading package {} bytes'.format(n))

        with open(package.path) as f:
            remote.makedir(os.path.dirname(db_ck), recursive=True, allow_recreate=True)
            e = remote.setcontents_async(db_ck, f, progress_callback=cb_one_arg)
            e.wait()

        if package.library:
            for p in package.session.query(Partition).filter(
                    Partition.type == Partition.TYPE.UNION).all():
                self._put_partition_fs(remote, p, package.library, force=force, cb=cb)

        self._put_metadata(remote, ds)

        try:
            return remote, remote.getpathurl(db_ck)
        except NoPathURLError:
            pass

        try:
            return remote, remote.getsyspath(db_ck)
        except NoSysPathError:
            pass

        return remote, None

    def _checkin_api(self, package, no_partitions=False, force=False, cb=None):
        c = self._api_client()
        return c.library.checkin(package, force=force, cb=cb)

    @staticmethod
    def _meta_infos(ds):
        import json
        from six import text_type

        identity = ds.identity
        d = identity.dict
        d['summary'] = ds.config.metadata.about.summary
        d['title'] = ds.config.metadata.about.title

        ident = json.dumps(d)

        return ((os.path.join('_meta', 'vid', identity.vid), ident),
                (os.path.join('_meta', 'id', identity.id_), ident),
                (os.path.join('_meta', 'vname', text_type(identity.vname)), ident),
                (os.path.join('_meta', 'name', text_type(identity.name)), ident))

    def _put_metadata(self, fs_remote, ds):
        """Store metadata on a pyfs remote."""
        from fs.errors import ResourceNotFoundError

        meta_stack = self._meta_infos(ds)

        def do_metadata():
            for path, ident in meta_stack:
                fs_remote.setcontents(path, ident)

        try:
            # Assume the directories already exist.
            do_metadata()
        except ResourceNotFoundError:
            # Nope, make them and try again.
            parts = ['vid', 'id', 'vname', 'name']

            for p in parts:
                dirname = os.path.join('_meta', p)
                fs_remote.makedir(dirname, allow_recreate=True, recursive=True)

            do_metadata()

    def put_partition(self, cb=None):
        """Store a partition on the remote."""
        raise NotImplementedError()

    def _put_partition_fs(self, fs_remote, p, library, force=False, cb=None):

        if cb:
            def cb_one_arg(n):
                cb('Uploading partition {}'.format(p.identity.name), n)
        else:
            cb_one_arg = None

        if not library:
            return

        p = library.partition(p.vid)

        with p.datafile.open(mode='rb') as fin:
            fs_remote.makedir(os.path.dirname(p.datafile.path),
                              recursive=True, allow_recreate=True)

            exists = fs_remote.exists(p.datafile.path)

            if force or not exists:
                event = fs_remote.setcontents_async(
                    p.datafile.path, fin, progress_callback=cb_one_arg)
                event.wait()
            else:
                cb('Partition {} already exists on remote'.format(p.vid), 0)

    def _put_partition_api(self, p, cb=None):
        raise NotImplementedError()

    def checkout(self, ref, cb=None):
        """Checkout a bundle from the remote. Returns a file-like object."""
        if self.is_api:
            return self._checkout_api(ref, cb=cb)
        else:
            return self._checkout_fs(ref, cb=cb)

    def _checkout_api(self, ref, cb=None):
        raise NotImplementedError()

    def _checkout_fs(self, ref, cb=None):
        remote = self._fs_remote(self.url)

        d = self._find_fs(ref)

        return remote.open(d['cache_key'] + '.db', 'rb')

    def get_partition(self):
        """Get a partition from the remote."""
        pass

    def remove(self, ref, cb=None):
        """Remove a bundle from the remote."""
        if self.is_api:
            return self._remove_api(ref, cb)
        else:
            return self._remove_fs(ref, cb)

    def _remove_fs(self, ref, cb=None):
        from fs.errors import ResourceNotFoundError
        from os.path import join

        remote = self._fs_remote(self.url)

        def safe_remove(path):
            try:
                remote.remove(path)
                if cb:
                    cb('Removed {}'.format(path))
            except ResourceNotFoundError as e:
                if cb:
                    cb("Failed to remove '{}': {}".format(path, e))

        info = self._find_fs(ref)

        db_ck = info['cache_key'] + '.db'

        if cb:
            cb('Removing {}'.format(db_ck))

        safe_remove(db_ck)

        for dir, files in remote.walk(info['cache_key']):
            for f in files:
                path = join(dir, f)
                safe_remove(path)

        for p in [join('_meta', 'vid', info['vid']),
                  join('_meta', 'id', info['id']),
                  join('_meta', 'vname', info['vname']),
                  join('_meta', 'name', info['name'])]:
            safe_remove(p)

        # FIXME! Doesn't remove partitions.

        return info['vid']

    def _remove_api(self, ref, cb=None):
        info = self._find_api(ref)
        c = self._api_client()
        c.library.remove(ref)

    def _fs_remote(self, url):
        from ambry.util import parse_url_to_dict

        d = parse_url_to_dict(url)

        if d['scheme'] == 's3':
            return self.s3(url, access=self.access, secret=self.secret)
        else:
            from fs.opener import fsopendir
            return fsopendir(url)

    @property
    def fs(self):
        """Return a pyfs object."""
        return self._fs_remote(self.url)

    def s3(self, url, account_accessor=None, access=None, secret=None):
        """Setup an S3 pyfs, with account credentials, fixing an ssl matching problem."""
        from ambry.util.ambrys3 import AmbryS3FS
        from ambry.util import parse_url_to_dict

        pd = parse_url_to_dict(url)

        if account_accessor:
            account = account_accessor(pd['hostname'])
            assert account['account_id'] == pd['hostname']
            # NOTE: the original had a stray trailing comma here, which made
            # aws_access_key a one-element tuple.
            aws_access_key = account['access_key']
            aws_secret_key = account['secret']
        else:
            aws_access_key = access
            aws_secret_key = secret
            assert access, url
            assert secret, url

        s3 = AmbryS3FS(
            bucket=pd['netloc'],
            prefix=pd['path'].strip('/') + '/',
            aws_access_key=aws_access_key,
            aws_secret_key=aws_secret_key,
        )

        return s3

    def __str__(self):
        return '{};{}'.format(self.short_name, self.url)

    @staticmethod
    def before_insert(mapper, conn, target):
        Remote.before_update(mapper, conn, target)

    @staticmethod
    def before_update(mapper, conn, target):
        url = target.url
        if not target.service and url:
            if url.startswith('s3:'):
                target.service = 's3'
            elif url.startswith('http'):
                target.service = 'ambry'
            else:
                target.service = 'fs'
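# Illustrative sketch: 'before_update' infers the service type from the URL
# scheme when none is set, so a Remote built with only a short name and a URL
# classifies itself. Constructing a transient (un-sessioned) instance is an
# assumption made for this example.
#
#   r = Remote(short_name='example', url='s3://my-bucket/library')
#   Remote.before_update(None, None, r)
#   assert r.service == 's3'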
class Column(Base):
    __tablename__ = 'columns'

    _parent_col = 'c_t_vid'

    vid = SAColumn('c_vid', String(18), primary_key=True)
    id = SAColumn('c_id', String(15))  # Probably not necessary
    sequence_id = SAColumn('c_sequence_id', Integer)

    is_primary_key = SAColumn('c_is_primary_key', Boolean, default=False)
    t_vid = SAColumn('c_t_vid', String(15), ForeignKey('tables.t_vid'),
                     nullable=False, index=True)
    d_vid = SAColumn('c_d_vid', String(13), ForeignKey('datasets.d_vid'),
                     nullable=False, index=True)
    t_id = SAColumn('c_t_id', String(12))

    # source_name = SAColumn('c_source_name', Text, index=True)

    name = SAColumn('c_name', Text, index=True)
    altname = SAColumn('c_altname', Text)
    datatype = SAColumn('c_datatype', Text)
    valuetype = SAColumn('c_valuetype', Text)

    start = SAColumn('c_start', Integer,
                     doc='For fixed width files, the starting position of the column')
    size = SAColumn('c_size', Integer,
                    doc='For fixed width files, the ending position of the column')
    width = SAColumn('c_width', Integer,
                     doc='For fixed width files, the width of the column')

    default = SAColumn('c_default', Text)
    illegal_value = SAColumn('c_illegal_value', Text)  # A special value meaning N/A or nan, etc.

    summary = SAColumn('c_summary', Text)
    description = SAColumn('c_description', Text)
    keywords = SAColumn('c_keywords', Text)

    lom = SAColumn('c_lom', String(1),
                   doc='Level of Measurement: n,o,i,r for Nominal, Ordinal, Interval, Ratio')
    # NOTE: the 'role' property defined below shadows this column attribute.
    role = SAColumn('c_role', String(1),
                    doc='Role: key, dimension, measure, error, name')
    scale = SAColumn('c_scale', Float,
                     doc='Number of measure units per natural unit. '
                         'Ie, if 1 == 1000 people, scale = 1000')
    units = SAColumn('c_units', Text)
    universe = SAColumn('c_universe', Text)

    parent = SAColumn('c_parent', Text)
    derivedfrom = SAColumn('c_derivedfrom', Text)

    numerator = SAColumn('c_numerator', String(20))
    denominator = SAColumn('c_denominator', String(20))

    # New column value casters and generators
    _transform = SAColumn('c_transform', Text)

    data = SAColumn('c_data', MutationDict.as_mutable(JSONEncodedObj))

    # This column should really be called 'value labels'
    codes = relationship(Code, backref='column', order_by='asc(Code.key)',
                         cascade='save-update, delete, delete-orphan')

    __table_args__ = (
        UniqueConstraint('c_sequence_id', 'c_t_vid', name='_uc_c_sequence_id'),
        UniqueConstraint('c_name', 'c_t_vid', name='_uc_c_name'),
    )

    # FIXME. These types should be harmonized with SourceColumn.DATATYPE
    DATATYPE_STR = six.binary_type.__name__
    DATATYPE_UNICODE = six.text_type.__name__
    DATATYPE_INTEGER = 'int'
    DATATYPE_INTEGER64 = 'long' if six.PY2 else 'int'
    DATATYPE_FLOAT = 'float'
    DATATYPE_DATE = 'date'
    DATATYPE_TIME = 'time'
    DATATYPE_TIMESTAMP = 'timestamp'
    DATATYPE_DATETIME = 'datetime'
    DATATYPE_BLOB = 'blob'
    DATATYPE_POINT = 'point'  # Spatialite, sqlite extensions for geo
    DATATYPE_LINESTRING = 'linestring'  # Spatialite, sqlite extensions for geo
    DATATYPE_POLYGON = 'polygon'  # Spatialite, sqlite extensions for geo
    DATATYPE_MULTIPOLYGON = 'multipolygon'  # Spatialite, sqlite extensions for geo
    DATATYPE_GEOMETRY = 'geometry'  # Spatialite, sqlite extensions for geo

    types = {
        # Sqlalchemy, Python, Sql
        # Here, 'str' means ascii, 'unicode' means not ascii.
        # FIXME: Change names to DATATYPE_ASCII, DATATYPE_NOT_ASCII because it
        # confuses python2/python3 porting.
        DATATYPE_STR: (sqlalchemy.types.String, six.binary_type, 'VARCHAR'),
        DATATYPE_UNICODE: (sqlalchemy.types.String, six.text_type, 'VARCHAR'),
        DATATYPE_INTEGER: (sqlalchemy.types.Integer, int, 'INTEGER'),
        DATATYPE_INTEGER64: (BigIntegerType, int, 'INTEGER64'),
        DATATYPE_FLOAT: (sqlalchemy.types.Float, float, 'REAL'),
        DATATYPE_DATE: (sqlalchemy.types.Date, datetime.date, 'DATE'),
        DATATYPE_TIME: (sqlalchemy.types.Time, datetime.time, 'TIME'),
        DATATYPE_TIMESTAMP: (sqlalchemy.types.DateTime, datetime.datetime, 'TIMESTAMP'),
        DATATYPE_DATETIME: (sqlalchemy.types.DateTime, datetime.datetime, 'DATETIME'),
        DATATYPE_POINT: (GeometryType, six.binary_type, 'POINT'),
        DATATYPE_LINESTRING: (GeometryType, six.binary_type, 'LINESTRING'),
        DATATYPE_POLYGON: (GeometryType, six.binary_type, 'POLYGON'),
        DATATYPE_MULTIPOLYGON: (GeometryType, six.binary_type, 'MULTIPOLYGON'),
        DATATYPE_GEOMETRY: (GeometryType, six.binary_type, 'GEOMETRY'),
        # 'buffer' only exists on Python 2; memoryview is the closest Python 3 analog.
        DATATYPE_BLOB: (sqlalchemy.types.LargeBinary,
                        buffer if six.PY2 else memoryview, 'BLOB')
    }

    def __init__(self, **kwargs):
        super(Column, self).__init__(**kwargs)

        assert self.sequence_id is not None

        if not self.name:
            self.name = 'column' + str(self.sequence_id)
            # raise ValueError('Column must have a name. Got: {}'.format(kwargs))

        # Don't allow these values to be the empty string
        self.transform = self.transform or None

    @classmethod
    def python_types(cls):
        return [e[1] for e in six.itervalues(cls.types)]

    def type_is_int(self):
        return self.python_type == int

    def type_is_real(self):
        return self.python_type == float

    def type_is_number(self):
        # NOTE: the original returned the bound methods themselves, which are
        # always truthy.
        return self.type_is_real() or self.type_is_int()

    def type_is_text(self):
        return self.datatype == Column.DATATYPE_STR or self.datatype == Column.DATATYPE_UNICODE

    def type_is_geo(self):
        return self.datatype in (Column.DATATYPE_POINT, Column.DATATYPE_LINESTRING,
                                 Column.DATATYPE_POLYGON, Column.DATATYPE_MULTIPOLYGON,
                                 Column.DATATYPE_GEOMETRY)

    def type_is_gvid(self):
        return 'gvid' in self.name

    def type_is_time(self):
        return self.datatype in (Column.DATATYPE_TIME, Column.DATATYPE_TIMESTAMP)

    def type_is_date(self):
        return self.datatype in (Column.DATATYPE_TIMESTAMP, Column.DATATYPE_DATETIME,
                                 Column.DATATYPE_DATE)

    def type_is_builtin(self):
        """Return False if the datatype is not one of the builtin types."""
        return self.datatype in self.types

    @property
    def sqlalchemy_type(self):
        return self.types[self.datatype][0]

    @property
    def valuetype_class(self):
        """Return the valuetype class, if one is defined, or a built-in type if it isn't."""
        from ambry.valuetype import resolve_value_type

        if self.valuetype:
            return resolve_value_type(self.valuetype)
        else:
            return resolve_value_type(self.datatype)

    @property
    def valuetype_description(self):
        """Return the description of the valuetype class."""
        return self.valuetype_class.desc

    @property
    def python_type(self):
        """Return the python type for the row, possibly getting it from a
        valuetype reference."""
        from ambry.valuetype import resolve_value_type

        if self.valuetype and resolve_value_type(self.valuetype):
            return resolve_value_type(self.valuetype)._pythontype
        elif self.datatype:
            try:
                return self.types[self.datatype][1]
            except KeyError:
                return resolve_value_type(self.datatype)._pythontype
        else:
            from ambry.exc import ConfigurationError
            raise ConfigurationError(
                "Can't get python_type: neither datatype nor valuetype is defined")

    @property
    def role(self):
        """Return the code for the role: measure, dimension or error."""
        from ambry.valuetype.core import ROLE

        if not self.valuetype_class:
            return ''

        role = self.valuetype_class.role

        if role == ROLE.UNKNOWN:
            vt_code = self.valuetype_class.vt_code
            if len(vt_code) == 1 or vt_code[1] == '/':
                return vt_code[0]
            else:
                return ''

        return role

    @property
    def is_dimension(self):
        """Return true if the column is a dimension."""
        from ambry.valuetype.core import ROLE
        return self.role == ROLE.DIMENSION

    @property
    def is_measure(self):
        """Return true if the column is a measure."""
        from ambry.valuetype.core import ROLE
        return self.role == ROLE.MEASURE

    @property
    def is_label(self):
        """Return true if the column is a label."""
        from ambry.valuetype.core import ROLE
        return self.role == ROLE.LABEL

    @property
    def is_error(self):
        """Return true if the column is an error margin."""
        from ambry.valuetype.core import ROLE
        return self.role == ROLE.ERROR

    @property
    def role_description(self):
        from ambry.valuetype.core import role_descriptions
        return role_descriptions.get(self.role, '')

    @property
    def has_nulls(self):
        """Return True if the datatype allows for null values
        (it is specified with a '?' at the end)."""
        return self.valuetype.endswith('?')

    @property
    def children(self):
        """Return the table's other columns that have this column as a parent,
        excluding labels."""
        for c in self.table.columns:
            if c.parent == self.name and not c.valuetype_class.is_label():
                yield c

    @property
    def label(self):
        """Return the first child of the column that is marked as a label.
        Returns self if the column is a label."""
        if self.valuetype_class.is_label():
            return self

        for c in self.table.columns:
            if c.parent == self.name and c.valuetype_class.is_label():
                return c

        return None

    @property
    def label_or_self(self):
        """Like label(), but also returns self if there is no label."""
        l = self.label

        if not l:
            return self

        return l

    @property
    def geoid(self):
        """Return the first child of the column, or self, that is marked as a
        geographic identifier."""
        if self.valuetype_class.is_geoid():
            return self

        for c in self.table.columns:
            if c.parent == self.name and c.valuetype_class.is_geoid():
                return c

    def python_cast(self, v):
        """Cast a value to the type of the column.

        Primarily used to check that a value is valid; it will throw an
        exception otherwise.

        """
        if self.type_is_time():
            dt = dateutil.parser.parse(v)

            if self.datatype == Column.DATATYPE_TIME:
                dt = dt.time()
            if not isinstance(dt, self.python_type):
                raise TypeError('{} was parsed to {}, expected {}'.format(
                    v, type(dt), self.python_type))

            return dt
        else:
            # This isn't calling the python_type method -- it's getting a python
            # type, then instantiating it, such as "int(v)"
            return self.python_type(v)

    @property
    def schema_type(self):
        if not self.datatype:
            from .exc import ConfigurationError
            raise ConfigurationError("Column '{}' has no datatype".format(self.name))

        # Let it fail with KeyError if the datatype is unknown.
        pt = self.python_type.__name__
        return self.types[pt][2]

    @classmethod
    def convert_numpy_type(cls, dtype):
        """Convert a numpy dtype into a Column datatype.

        Only handles common types. Implemented as a function to decouple from numpy.

        """
        m = {
            'int64': cls.DATATYPE_INTEGER64,
            'float64': cls.DATATYPE_FLOAT,
            # Hack; pandas makes strings into 'object'. NOTE: the original
            # referenced an undefined DATATYPE_TEXT here.
            'object': cls.DATATYPE_STR
        }

        t = m.get(dtype.name, None)

        if not t:
            raise TypeError("Failed to convert numpy type: '{}' ".format(dtype.name))

        return t

    @classmethod
    def convert_python_type(cls, py_type_in, name=None):

        type_map = {six.text_type: six.binary_type}

        for col_type, (sla_type, py_type, sql_type) in six.iteritems(cls.types):
            if py_type == type_map.get(py_type_in, py_type_in):
                if col_type == 'blob' and name and name.endswith('geometry'):
                    return cls.DATATYPE_GEOMETRY
                elif sla_type != GeometryType:  # Total HACK. FIXME
                    return col_type

        return None

    @property
    def foreign_key(self):
        return self.fk_vid

    @property
    def dest_header(self):
        """Allows destination tables to be used as source tables when creating
        the schema from a 'partition' source."""
        if self.altname:
            return self.altname
        else:
            return self.name

    @property
    def has_codes(self):
        """Allows destination tables to be used as source tables when creating
        the schema from a 'partition' source."""
        return False

    @property
    def dict(self):
        """A dict that holds key/values for all of the properties in the object.

        :return:

        """
        d = {p.key: getattr(self, p.key) for p in self.__mapper__.attrs
             if p.key not in ('table', 'stats', '_codes', 'data')}

        if not d:
            raise Exception(self.__dict__)

        d['schema_type'] = self.schema_type

        if self.data:
            # Copy data fields into the top level dict, but don't overwrite
            # existing values.
            for k, v in six.iteritems(self.data):
                if k not in d and k not in ('table', 'stats', '_codes', 'data'):
                    d[k] = v

        return d

    @property
    def nonull_dict(self):
        """Like dict, but does not hold any null values.

        :return:

        """
        return {k: v for k, v in six.iteritems(self.dict) if v and k != '_codes'}

    @staticmethod
    def mangle_name(name):
        """Mangles a column name to a standard form, removing illegal characters.

        :param name:
        :return:

        """
        import re
        try:
            return re.sub('_+', '_', re.sub(r'[^\w_]', '_', name).lower()).rstrip('_')
        except TypeError:
            raise TypeError('Trying to mangle name with invalid type of: ' + str(type(name)))

    @property
    @memoize
    def reverse_code_map(self):
        """Return a map from a code (usually a string) to the shorter numeric value."""
        return {c.value: (c.ikey if c.ikey else c.key) for c in self.codes}

    @property
    @memoize
    def forward_code_map(self):
        """Return a map from the short code to the full value."""
        return {c.key: c.value for c in self.codes}

    def add_code(self, key, value, description=None, data=None, source=None):
        """Add a code (value label) to the column.

        :param key: The code value that appears in the datasets, either a string or an int
        :param value: The string value the key is mapped to
        :param description: A more detailed description of the code
        :param data: A data dict to add to the ORM record
        :return: the code record

        """
        # Ignore codes we already have, but this will not catch codes added
        # earlier for this same object, since the codes are cached.
        from six import text_type

        for cd in self.codes:
            if cd.key == text_type(key):
                return cd

        def cast_to_int(s):
            try:
                return int(s)
            except (TypeError, ValueError):
                return None

        cd = Code(c_vid=self.vid,
                  t_vid=self.t_vid,
                  key=text_type(key),
                  ikey=cast_to_int(key),
                  value=value,
                  source=source,
                  description=description,
                  data=data)

        self.codes.append(cd)

        return cd

    @property
    def transform(self):
        return self._transform

    @transform.setter
    def transform(self, v):
        self._transform = self.clean_transform(v)

    @staticmethod
    def make_xform_seg(init_=None, datatype=None, transforms=None,
                       exception=None, column=None):
        return {
            'init': init_,
            'transforms': transforms if transforms else [],
            'exception': exception,
            'datatype': datatype,
            'column': column
        }

    @staticmethod
    def _expand_transform(transform):
        from ambry.dbexceptions import ConfigurationError

        if not bool(transform):
            return []

        transform = transform.rstrip('|')

        segments = []

        for i, seg_str in enumerate(transform.split(';')):  # ';' separates pipe stages

            pipes = seg_str.split('|')  # '|' separates pipes in each stage

            d = Column.make_xform_seg()

            for pipe in pipes:

                if not pipe.strip():
                    continue

                if pipe[0] == '^':  # First, the initializer
                    if d['init']:
                        raise ConfigurationError(
                            'Can only have one initializer in a pipeline segment')
                    if i != 0:
                        raise ConfigurationError(
                            'Can only have an initializer in the first pipeline segment')
                    d['init'] = pipe[1:]
                elif pipe[0] == '!':  # Exception handler
                    if d['exception']:
                        raise ConfigurationError(
                            'Can only have one exception handler in a pipeline segment')
                    d['exception'] = pipe[1:]
                else:  # Assume before the datatype
                    d['transforms'].append(pipe)

            segments.append(d)

        return segments

    @property
    def expanded_transform(self):
        """Expands the transform string into segments."""

        segments = self._expand_transform(self.transform)

        if segments:
            segments[0]['datatype'] = self.valuetype_class

            for s in segments:
                s['column'] = self
        else:
            segments = [self.make_xform_seg(datatype=self.valuetype_class, column=self)]

        # If we want to add the final datatype cast to a transform:
        # segments.append(self.make_xform_seg(transforms=['cast_' + self.datatype],
        #                                     column=self))

        return segments

    @staticmethod
    def clean_transform(transform):

        segments = Column._expand_transform(transform)

        def pipeify_seg(seg):
            o = []
            seg['init'] and o.append('^' + seg['init'])
            o += seg['transforms']
            seg['exception'] and o.append('!' + seg['exception'])
            return '|'.join(o)

        return ';'.join(pipeify_seg(seg) for seg in segments)

    @property
    def row(self):
        from collections import OrderedDict

        # Use an OrderedDict to make it friendly to creating CSV files.

        name_map = {'name': 'column'}

        d = OrderedDict(
            [('table', self.table.name)] +
            [(name_map.get(p.key, p.key), getattr(self, p.key))
             for p in self.__mapper__.attrs
             if p.key not in ['codes', 'dataset', 'stats', 'table', 'd_vid', 'vid',
                              't_vid', 'id', 'is_primary_key', 'data']])

        d['transform'] = d['_transform']
        del d['_transform']

        if self.name == 'id':
            t = self.table
            d['description'] = t.description
            data = t.data
        else:
            data = self.data

        for k, v in six.iteritems(data):
            d['d_' + k] = v

        assert 'data' not in d

        return d

    def __repr__(self):
        return '<column: {}, {}>'.format(self.name, self.vid)

    @staticmethod
    def update_number(target):
        ton = ObjectNumber.parse(target.t_vid)
        con = ColumnNumber(ton, target.sequence_id)
        target.id = str(ton.rev(None))
        target.vid = str(con)
        target.id = str(con.rev(None))
        target.d_vid = str(ObjectNumber.parse(target.t_vid).as_dataset)

    @staticmethod
    def before_insert(mapper, conn, target):
        """event.listen method for Sqlalchemy to set the sequence_id for this
        object and create an ObjectNumber value for the id."""

        # from identity import ObjectNumber
        # assert not target.fk_vid or not ObjectNumber.parse(target.fk_vid).revision

        if target.sequence_id is None:
            from ambry.orm.exc import DatabaseError
            raise DatabaseError('Must have sequence_id before insertion')

        # Check that the id column is always sequence id 1
        assert (target.name == 'id') == (target.sequence_id == 1), \
            (target.name, target.sequence_id)

        Column.before_update(mapper, conn, target)

    @staticmethod
    def before_update(mapper, conn, target):
        """Set the column id number based on the table number and the sequence
        id for the column."""
        assert target.datatype or target.valuetype
        target.name = Column.mangle_name(target.name)

        Column.update_number(target)
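# Illustrative sketch of the transform mini-language parsed above: stages are
# separated by ';', pipes within a stage by '|', '^' marks the (first-stage
# only) initializer and '!' an exception handler. 'clean_transform' round-trips
# a string through the parser, normalizing pipe order. The pipe names used here
# ('init', 'upper', 'strip', 'handle_err') are hypothetical.
#
#   >>> Column.clean_transform('^init|upper;!handle_err|strip')
#   '^init|upper;strip|!handle_err'
#
# And 'mangle_name' collapses and strips the underscores it introduces:
#
#   >>> Column.mangle_name('Total Population (2010)')
#   'total_population_2010'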
class Table(Base, DictableMixin):
    __tablename__ = 'tables'

    vid = SAColumn('t_vid', String(15), primary_key=True)
    id = SAColumn('t_id', String(12), primary_key=False)
    d_id = SAColumn('t_d_id', String(10))
    d_vid = SAColumn('t_d_vid', String(13), ForeignKey('datasets.d_vid'), index=True)
    sequence_id = SAColumn('t_sequence_id', Integer, nullable=False)
    name = SAColumn('t_name', String(200), nullable=False)
    altname = SAColumn('t_altname', Text)
    summary = SAColumn('t_summary', Text)
    description = SAColumn('t_description', Text)
    universe = SAColumn('t_universe', String(200))
    keywords = SAColumn('t_keywords', Text)
    type = SAColumn('t_type', String(20))

    # Reference to a column that provides an example of how this table should be used.
    proto_vid = SAColumn('t_proto_vid', String(20), index=True)

    installed = SAColumn('t_installed', String(100))
    data = SAColumn('t_data', MutationDict.as_mutable(JSONEncodedObj))

    c_sequence_id = SAColumn('t_c_sequence_id', Integer, default=1)

    __table_args__ = (
        UniqueConstraint('t_sequence_id', 't_d_vid', name='_uc_tables_1'),
        UniqueConstraint('t_name', 't_d_vid', name='_uc_tables_2'),
    )

    columns = relationship(Column, backref='table',
                           order_by='asc(Column.sequence_id)',
                           cascade='all, delete-orphan', lazy='joined')

    _column_sequence = {}

    @staticmethod
    def mangle_name(name, preserve_case=False):
        import re

        assert name

        try:
            r = re.sub(r'[^\w_]', '_', name.strip())

            if not preserve_case:
                r = r.lower()

            return r
        except TypeError:
            raise TypeError('Not a valid type for name ' + str(type(name)))

    @property
    def primary_columns(self):
        """Iterate over the primary columns, columns which do not have a parent."""
        for c in self.columns:
            if not c.parent:
                yield c

    @property
    def dimensions(self):
        """Iterate over the dimension columns, regardless of parent/child status."""
        from ambry.valuetype.core import ROLE
        for c in self.columns:
            if c.role == ROLE.DIMENSION:
                yield c

    @property
    def primary_dimensions(self):
        """Iterate over the primary dimension columns, columns which do not
        have a parent."""
        from ambry.valuetype.core import ROLE
        for c in self.columns:
            if not c.parent and c.role == ROLE.DIMENSION:
                yield c

    @property
    def primary_measures(self):
        """Iterate over the primary measure columns, columns which do not have a parent.

        Also sets the property partition_stats to the stats collection for the
        partition and column.

        """
        from ambry.valuetype.core import ROLE
        for c in self.columns:
            if not c.parent and c.role == ROLE.MEASURE:
                yield c

    def column(self, ref):
        # AFAIK, all of the columns in the relationship will get loaded if any
        # one is accessed, so iterating over the collection only involves one SELECT.
        from .column import Column

        column_name = Column.mangle_name(str(ref))

        for c in self.columns:
            if str(column_name) == c.name or str(ref) == c.id or str(ref) == c.vid:
                return c

        raise NotFoundError(
            "Failed to find column '{}' in table '{}' for ref: '{}' ".format(
                ref, self.name, ref))

    def add_column(self, name, update_existing=False, **kwargs):
        """Add a column to the table, or update an existing one.

        :param name: Name of the new or existing column.
        :param update_existing: If True, alter existing column values. Defaults to False.
        :param kwargs: Other arguments for the Column() constructor
        :return: a Column object

        """
        from ..identity import ColumnNumber

        try:
            c = self.column(name)
            extant = True

            if not update_existing:
                return c

        except NotFoundError:

            sequence_id = len(self.columns) + 1

            assert sequence_id

            c = Column(t_vid=self.vid,
                       sequence_id=sequence_id,
                       vid=str(ColumnNumber(ObjectNumber.parse(self.vid), sequence_id)),
                       name=name,
                       datatype='str')
            extant = False

        # Update possibly existing data
        c.data = dict((list(c.data.items()) if c.data else []) +
                      list(kwargs.get('data', {}).items()))

        for key, value in list(kwargs.items()):

            if key[0] != '_' and key not in ['t_vid', 'name', 'sequence_id', 'data']:

                # Don't update the type if the user has specified a custom type
                if key == 'datatype' and not c.type_is_builtin():
                    continue

                # Don't change a datatype if the value is set and the new value is unknown
                if key == 'datatype' and value == 'unknown' and c.datatype:
                    continue

                # Don't overwrite a description with an empty value
                if key == 'description' and not value:
                    continue

                try:
                    setattr(c, key, value)
                except AttributeError:
                    raise AttributeError("Column record has no attribute {}".format(key))

            if key == 'is_primary_key' and isinstance(value, str) and len(value) == 0:
                value = False
                setattr(c, key, value)

        # If the id column has a description and the table does not, add it to the table.
        if c.name == 'id' and c.is_primary_key and not self.description:
            self.description = c.description

        if not extant:
            self.columns.append(c)

        return c

    def add_id_column(self, description=None):
        from . import Column
        self.add_column(
            name='id',
            datatype=Column.DATATYPE_INTEGER,
            is_primary_key=True,
            description=self.description if not description else description)

    def is_empty(self):
        """Return True if the table has no columns, or if the only column is the id."""
        if len(self.columns) == 0:
            return True

        if len(self.columns) == 1 and self.columns[0].name == 'id':
            return True

        return False

    @property
    def header(self):
        """Return an array of column names in the same order as the column
        definitions, to be used to zip with a row when reading a CSV file.

        >> row = dict(zip(table.header, row))

        """
        return [c.name for c in self.columns]

    @property
    def dict(self):
        INCLUDE_FIELDS = [
            'id', 'vid', 'd_id', 'd_vid', 'sequence_id', 'name', 'altname', 'vname',
            'description', 'universe', 'keywords', 'installed', 'proto_vid', 'type',
            'codes']

        d = {k: v for k, v in six.iteritems(self.__dict__) if k in INCLUDE_FIELDS}

        if self.data:
            for k in self.data:
                assert k not in d, \
                    "Value '{}' is a table field and should not be in data ".format(k)
                d[k] = self.data[k]

        d['is_geo'] = False

        for c in self.columns:
            # NOTE: the original tested 'c in (...)'; matching on the column
            # name is the evident intent.
            if c.name in ('geometry', 'wkt', 'wkb', 'lat'):
                d['is_geo'] = True

        d['foreign_indexes'] = list(
            set([c.data['index'].split(':')[0]
                 for c in self.columns if c.data.get('index', False)]))

        return d

    def update_from_stats(self, stats):
        """Update columns based on partition statistics."""

        sd = dict(stats)

        for c in self.columns:

            if c not in sd:
                continue

            stat = sd[c]

            if stat.size and stat.size > c.size:
                c.size = stat.size

            c.lom = stat.lom

    def update_id(self, sequence_id=None, force=True):
        """Alter the sequence id, and all of the names and ids derived from it.

        This often needs to be done after an IntegrityError in a multiprocessing run.

        """
        from ..identity import ObjectNumber

        if sequence_id:
            self.sequence_id = sequence_id

        assert self.d_vid

        if self.id is None or force:
            dataset_id = ObjectNumber.parse(self.d_vid).rev(None)
            self.d_id = str(dataset_id)
            self.id = str(TableNumber(dataset_id, self.sequence_id))

        if self.vid is None or force:
            dataset_vid = ObjectNumber.parse(self.d_vid)
            self.vid = str(TableNumber(dataset_vid, self.sequence_id))

    @property
    def transforms(self):
        """Return an array of arrays of column transforms.

        The return value is a list of lists, with each list being a segment of
        column transformations, and each segment having one entry per column.

        """
        tr = []

        for c in self.columns:
            tr.append(c.expanded_transform)

        return six.moves.zip_longest(*tr)

    @property
    def row(self):
        from collections import OrderedDict
        import six

        # Use an OrderedDict to make it friendly to creating CSV files.

        SKIP_KEYS = [
            'id', 'd_id', 'd_vid', 'dataset', 'columns', 'data', 'partitions',
            'sources', 'process_records']

        d = OrderedDict([(p.key, getattr(self, p.key))
                         for p in self.__mapper__.attrs if p.key not in SKIP_KEYS])

        for k, v in six.iteritems(self.data):
            d['d_' + k] = v

        return d

    def __str__(self):
        from tabulate import tabulate

        headers = 'Seq Vid Name Datatype ValueType'.split()
        rows = [(c.sequence_id, c.vid, c.name, c.datatype, c.valuetype)
                for c in self.columns]

        return ('Dest Table: {}\n'.format(self.name)) + tabulate(rows, headers)

    def _repr_html_(self):
        from tabulate import tabulate
        from ambry.util import drop_empty

        def record_gen():
            for i, row in enumerate([c.row for c in self.columns]):
                if i == 0:
                    yield row.keys()
                yield row.values()

        records = list(record_gen())
        records = drop_empty(records)

        return '<h2>{}</h2>'.format(self.name) + \
               tabulate(records[1:], headers=records[0], tablefmt='html')

    @staticmethod
    def before_insert(mapper, conn, target):
        """event.listen method for Sqlalchemy to set the sequence_id for this
        object and create an ObjectNumber value for the id."""

        if target.sequence_id is None:
            from ambry.orm.exc import DatabaseError
            raise DatabaseError('Must have sequence id before insertion')

        Table.before_update(mapper, conn, target)

    @staticmethod
    def before_update(mapper, conn, target):
        """Set the Table ID based on the dataset number and the sequence number
        for the table."""

        target.name = Table.mangle_name(target.name)

        if isinstance(target, Column):
            raise TypeError('Got a column instead of a table')

        target.update_id(target.sequence_id, False)
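# Illustrative sketch: 'Table.mangle_name' only replaces illegal characters and
# lowercases; unlike 'Column.mangle_name' it does not collapse repeated
# underscores or strip trailing ones.
#
#   >>> Table.mangle_name('Total Population (2010)')
#   'total_population__2010_'
#   >>> Table.mangle_name('GeoID', preserve_case=True)
#   'GeoID'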
class Process(Base):
    """Track processes and operations on database objects."""

    __tablename__ = 'processes'

    id = SAColumn('pr_id', Integer, primary_key=True)

    group = SAColumn('pr_group', Integer, ForeignKey('processes.pr_id'),
                     nullable=True, index=True)
    parent = relationship('Process', remote_side=[id], backref='children')

    stage = SAColumn('pr_stage', Integer, default=0)
    phase = SAColumn('pr_phase', Text, doc='Process phase: such as ingest or build')

    hostname = SAColumn('pr_host', Text)
    pid = SAColumn('pr_pid', Integer)

    d_vid = SAColumn('pr_d_vid', String(13), ForeignKey('datasets.d_vid'),
                     nullable=False, index=True)
    dataset = relationship('Dataset', backref='process_records')

    t_vid = SAColumn('pr_t_vid', String(15), ForeignKey('tables.t_vid'),
                     nullable=True, index=True)
    table = relationship('Table', backref='process_records')

    s_vid = SAColumn('pr_s_vid', String(17), ForeignKey('datasources.ds_vid'),
                     nullable=True, index=True)
    source = relationship('DataSource', backref='process_records')

    p_vid = SAColumn('pr_p_vid', String(17), ForeignKey('partitions.p_vid'),
                     nullable=True, index=True)
    partition = relationship('Partition', backref='process_records')

    created = SAColumn('pr_created', Float,
                       doc='Creation date: time in seconds since the epoch as an integer.')
    modified = SAColumn('pr_modified', Float,
                        doc='Modification date: time in seconds since the epoch as an integer.')

    item_type = SAColumn('pr_type', Text,
                         doc='Item type, such as table, source or partition')
    item_count = SAColumn('pr_count', Integer, doc='Number of items processed')
    item_total = SAColumn('pr_items', Integer, doc='Number of items to be processed')

    message = SAColumn('pr_message', Text)

    state = SAColumn('pr_state', Text)
    exception_class = SAColumn('pr_ex_class', Text)
    exception_trace = SAColumn('pr_ex_trace', Text)

    log_action = SAColumn('pr_action', Text)

    data = SAColumn('pr_data', MutationDict.as_mutable(JSONEncodedObj))

    def __repr__(self):
        return '{} {}/{} {}:{} {} {}'.format(
            self.d_vid, self.hostname, self.pid,
            self.phase if self.phase else '?', self.stage,
            self.log_action, self.message)

    def __str__(self):
        return self.__repr__()

    @property
    def log_str(self):
        import platform
        import os

        parts = []

        # This bit only gets executed when records stored in the database from
        # one node or process are read from another. It won't print out in
        # normal logging.
        if self.hostname != platform.node() or self.pid != os.getpid():
            hostpid = '({}@{})'.format(self.pid, self.hostname)
            parts.append(hostpid)

        am = {'start': '>', 'add': '+', 'update': '.', 'done': '<', '': '?', None: '?'}

        phase_str = self.phase if self.phase else '?'

        if self.stage:
            phase_str = phase_str + ':' + str(self.stage)

        parts.append(phase_str)

        action_char = am.get(self.log_action, '')

        if self.state == 'error':
            action_char = '!'

        parts.append(action_char)

        if self.s_vid:
            parts.append(self.s_vid)

        if self.t_vid:
            parts.append(self.t_vid)

        if self.p_vid:
            parts.append(self.p_vid)

        parts.append(self.message if self.message else '')

        if self.item_count:
            ic = 'processed ' + str(self.item_count)

            if self.item_total:
                ic += ' of {}'.format(self.item_total)

            if self.item_type:
                ic += ' ' + self.item_type

            parts.append(ic)

        return ' '.join(parts)

    @property
    def dict(self):
        """A dict that holds key/values for all of the properties in the object.

        :return:

        """
        from collections import OrderedDict
        return OrderedDict(
            (p.key, getattr(self, p.key)) for p in self.__mapper__.attrs
            if p.key not in ('partition', 'source', 'table', 'dataset',
                             'children', 'parent'))

    @staticmethod
    def before_insert(mapper, conn, target):
        from time import time
        target.created = time()
        Process.before_update(mapper, conn, target)

    @staticmethod
    def before_update(mapper, conn, target):
        from time import time
        target.modified = time()
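# Illustrative sketch: 'log_str' renders a compact one-line log record. For a
# record created in the current process (so the host/pid prefix is skipped), a
# 'start' action in the build phase, stage 2, renders as 'build:2 > starting'.
# Constructing a transient instance is an assumption for this example.
#
#   import os, platform
#   pr = Process(d_vid='d000', phase='build', stage=2, log_action='start',
#                message='starting', hostname=platform.node(), pid=os.getpid())
#   assert pr.log_str == 'build:2 > starting'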
class DataSource(DataSourceBase, Base, DictableMixin):
    """A source of data, such as a remote file or bundle."""

    __tablename__ = 'datasources'

    vid = SAColumn('ds_vid', String(17), primary_key=True)
    sequence_id = SAColumn('ds_sequence_id', INTEGER)

    name = SAColumn('ds_name', Text)
    d_vid = SAColumn('ds_d_vid', String(13), ForeignKey('datasets.d_vid'),
                     nullable=False, doc='Dataset vid')

    title = SAColumn('ds_title', Text)

    st_vid = SAColumn('ds_st_vid', String(22), ForeignKey('sourcetables.st_vid'),
                      nullable=True)
    source_table_name = SAColumn('ds_st_name', Text)
    _source_table = relationship(SourceTable, backref='sources')

    t_vid = SAColumn('ds_t_vid', String(15), ForeignKey('tables.t_vid'),
                     nullable=True, doc='Table vid')
    dest_table_name = SAColumn('ds_dt_name', Text)
    _dest_table = relationship(Table, backref='sources')

    stage = SAColumn('ds_stage', INTEGER, default=0)  # Order in which to process sources.
    pipeline = SAColumn('ds_pipeline', Text)

    time = SAColumn('ds_time', Text)
    space = SAColumn('ds_space', Text)
    grain = SAColumn('ds_grain', Text)
    epsg = SAColumn('ds_epsg', INTEGER,
                    doc='EPSG SRID for the reference system of a geographic dataset.')
    segment = SAColumn('ds_segment', Text)
    start_line = SAColumn('ds_start_line', INTEGER)
    end_line = SAColumn('ds_end_line', INTEGER)
    comment_lines = SAColumn('ds_comment_lines', MutationList.as_mutable(JSONEncodedObj))
    header_lines = SAColumn('ds_header_lines', MutationList.as_mutable(JSONEncodedObj))
    description = SAColumn('ds_description', Text)
    file = SAColumn('ds_file', Text)
    filetype = SAColumn('ds_filetype', Text)  # tsv, csv, fixed, partition
    encoding = SAColumn('ds_encoding', Text)
    hash = SAColumn('ds_hash', Text)

    reftype = SAColumn('ds_reftype', Text)  # null, zip, ref, template
    ref = SAColumn('ds_ref', Text)

    state = SAColumn('ds_state', Text)

    account_acessor = None  # [sic] Set externally to allow access to the account credentials

    __table_args__ = (UniqueConstraint('ds_d_vid', 'ds_name', name='_uc_ds_d_vid'),)
class ColumnStat(Base):
    """Table for per column, per partition stats."""

    __tablename__ = 'colstats'

    p_vid = SAColumn('cs_p_vid', String(20), ForeignKey('partitions.p_vid'),
                     primary_key=True, nullable=False, index=True)
    # partition = relationship('Partition', backref='stats')

    c_vid = SAColumn('cs_c_vid', String(20), ForeignKey('columns.c_vid'),
                     primary_key=True, nullable=False, index=True)
    column = relationship('Column', backref='stats')

    d_vid = SAColumn('cs_d_vid', String(20), ForeignKey('datasets.d_vid'),
                     nullable=False, index=True)
    dataset = relationship('Dataset', backref='stats')

    lom = SAColumn('cs_lom', String(1))
    count = SAColumn('cs_count', BigIntegerType)
    mean = SAColumn('cs_mean', Float)
    std = SAColumn('cs_std', Float)
    min = SAColumn('cs_min', Float)
    p25 = SAColumn('cs_p25', Float)
    p50 = SAColumn('cs_p50', Float)
    p75 = SAColumn('cs_p75', Float)
    max = SAColumn('cs_max', Float)

    nuniques = SAColumn('cs_nuniques', BigIntegerType)

    width = SAColumn('cs_width', Integer)

    skewness = SAColumn('cs_skewness', Float)
    kurtosis = SAColumn('cs_kurtosis', Float)

    uvalues = SAColumn('f_uvalues', MutationDict.as_mutable(JSONEncodedObj))
    hist = SAColumn('f_hist', MutationList.as_mutable(JSONEncodedObj))

    text_hist = SAColumn('cs_text_hist', String)

    __table_args__ = (UniqueConstraint('cs_p_vid', 'cs_c_vid', name='u_cols_stats'),)

    @property
    def dict(self):
        # Javascript does not have NaN or +/-inf; the JSON spec maps them all
        # to null.

        def nullify(k, v):
            import math
            if isinstance(v, float) and (math.isnan(v) or math.isinf(v)):
                return None
            else:
                return v

        d = {p.key: nullify(p.key, getattr(self, p.key))
             for p in self.__mapper__.attrs
             if p.key not in ('data', 'column', 'table', 'partition', 'dataset')}

        return d
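# Illustrative sketch: 'dict' nullifies float values JSON can't represent, so a
# stats record with a NaN mean serializes cleanly. Constructing a transient
# instance is an assumption for this example.
#
#   cs = ColumnStat(mean=float('nan'), max=1.5)
#   d = cs.dict
#   assert d['mean'] is None and d['max'] == 1.5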
class SourceColumn(Base):
    __tablename__ = 'sourcecolumns'

    _parent_col = 'sc_st_vid'

    DATATYPE = Constant()
    DATATYPE.INT = int.__name__
    DATATYPE.FLOAT = float.__name__
    DATATYPE.STRING = six.binary_type.__name__
    DATATYPE.UNICODE = six.text_type.__name__
    DATATYPE.DATE = datetime.date.__name__
    DATATYPE.TIME = datetime.time.__name__
    DATATYPE.DATETIME = datetime.datetime.__name__
    DATATYPE.UNKNOWN = unknown.__name__

    type_map = {
        DATATYPE.INT: int,
        DATATYPE.FLOAT: float,
        DATATYPE.STRING: six.binary_type,
        DATATYPE.UNICODE: six.text_type,
        DATATYPE.DATE: datetime.date,
        DATATYPE.TIME: datetime.time,
        DATATYPE.DATETIME: datetime.datetime,
        # NOTE: the original repeated DATATYPE.DATETIME here; UNKNOWN is the
        # evident intent.
        DATATYPE.UNKNOWN: unknown
    }

    column_type_map = {
        # FIXME The Column types should be harmonized with these types
        DATATYPE.INT: Column.DATATYPE_INTEGER,
        DATATYPE.FLOAT: Column.DATATYPE_FLOAT,
        DATATYPE.STRING: Column.DATATYPE_STR,
        DATATYPE.UNICODE: Column.DATATYPE_STR,
        DATATYPE.DATE: Column.DATATYPE_DATE,
        DATATYPE.TIME: Column.DATATYPE_TIME,
        DATATYPE.DATETIME: Column.DATATYPE_DATETIME,
        DATATYPE.UNKNOWN: Column.DATATYPE_STR
    }

    vid = SAColumn('sc_vid', String(21), primary_key=True)

    d_vid = SAColumn('sc_d_vid', String(13), ForeignKey('datasets.d_vid'),
                     nullable=False)
    st_vid = SAColumn('sc_st_vid', String(17), ForeignKey('sourcetables.st_vid'),
                      nullable=False)

    position = SAColumn('sc_position', Integer, doc='Integer position of column')

    source_header = SAColumn('sc_source_header', Text,
                             doc='Column header, after coalescing but before mangling.')
    dest_header = SAColumn('sc_dest_header', Text, doc='Original header, mangled')

    datatype = SAColumn('sc_datatype', Text, doc='Basic data type, usually intuited')
    valuetype = SAColumn('sc_valuetype', Text,
                         doc='Describes the meaning of the value: state, county, '
                             'address, etc.')

    has_codes = SAColumn('sc_has_codes', Boolean, default=False,
                         doc='If True, column also has codes of a different type')

    start = SAColumn('sc_start', Integer,
                     doc='For fixed width, the column starting position')
    width = SAColumn('sc_width', Integer, doc='For fixed width, the field width')
    size = SAColumn('sc_size', Integer,
                    doc='Max size of the column values, after conversion to strings.')

    summary = SAColumn('sc_summary', Text, doc='Short text description')
    description = SAColumn('sc_description', Text, doc='Long text description')

    value_labels = SAColumn('sc_value_labels', MutationDict.as_mutable(JSONEncodedObj))

    _next_column_number = None  # Set in next_config_number()

    __table_args__ = (UniqueConstraint('sc_st_vid', 'sc_source_header',
                                       name='_uc_sourcecolumns'),)

    @property
    def name(self):
        return self.source_header

    @property
    def python_datatype(self):
        return self.type_map[self.datatype]

    @property
    def column_datatype(self):
        """Return the data type using the values defined for the schema."""
        return self.column_type_map[self.datatype]

    @staticmethod
    def mangle_name(name):
        """Mangles a column name to a standard form, removing illegal characters.

        :param name:
        :return:

        """
        import re
        try:
            return re.sub('_+', '_', re.sub(r'[^\w_]', '_', name).lower()).rstrip('_')
        except TypeError:
            raise TypeError('Trying to mangle name with invalid type of: ' + str(type(name)))

    @property
    def row(self):
        from collections import OrderedDict

        # Use an OrderedDict to make it friendly to creating CSV files.
        d = OrderedDict(
            [('table', self.table.name)] +
            [(p.key, getattr(self, p.key)) for p in self.__mapper__.attrs
             if p.key not in ['vid', 'st_vid', 'table', 'dataset', 'ds_id',
                              'd_vid', 'source', 'value_labels']])

        return d

    @property
    def dict(self):
        """A dict that holds key/values for all of the properties in the object.

        :return:

        """
        from collections import OrderedDict

        SKIP_KEYS = ()

        return OrderedDict((p.key, getattr(self, p.key))
                           for p in self.__mapper__.attrs if p.key not in SKIP_KEYS)

    def update(self, **kwargs):

        if 'table' in kwargs:
            # In source_schema.csv, this is the name of the table, not the object.
            del kwargs['table']

        for k, v in list(kwargs.items()):
            if hasattr(self, k):
                if k == 'dest_header':
                    # Don't reset the dest header on updates.
                    if self.dest_header and self.dest_header != self.source_header:
                        continue

                setattr(self, k, v)
class Config(Base):
    __tablename__ = 'config'
    __table_args__ = (UniqueConstraint('co_d_vid', 'co_type', 'co_group', 'co_key',
                                       name='_type_group_key_uc'),)

    id = SAColumn('co_id', String(32), primary_key=True)
    sequence_id = SAColumn('co_sequence_id', Integer, nullable=False, index=True)
    d_vid = SAColumn('co_d_vid', String(16), ForeignKey('datasets.d_vid'),
                     index=True, doc='Dataset vid')
    type = SAColumn('co_type', String(200),
                    doc='Type of the config: metadata, process, sync, etc...')
    group = SAColumn('co_group', String(200),
                     doc='Group of the config: identity, about, etc...')
    key = SAColumn('co_key', String(200), doc='Key of the config')
    value = SAColumn('co_value', JSONAlchemy(Text()), doc='Value of the config key.')
    modified = SAColumn('co_modified', Integer(),
                        doc='Modification date: time in seconds since the epoch as an integer.')

    # Foreign key constraints make it hard to dump all of the configs to a new
    # bundle database in ambry.orm.database.Database#copy_dataset, so I've
    # removed the foreign key constraint.
    # TODO: Write test for that note.
    parent_id = SAColumn(String(32), ForeignKey('config.co_id'), nullable=True,
                         doc='Id of the parent config.')

    parent = relationship('Config', remote_side=[id])
    children = relationship('Config')

    def incver(self):
        """Increment all of the version numbers and return a new object."""
        from . import incver
        return incver(self, ['d_vid', 'id', 'parent_id'])

    @property
    def dict(self):
        return {p.key: getattr(self, p.key) for p in self.__mapper__.attrs}

    def __repr__(self):
        return u('<config: {} {},{},{} = {}>').format(
            self.id, self.d_vid, self.group, self.key, self.value)

    @property
    def dotted_key(self):
        return '{}.{}.{}'.format(self.type, self.group, self.key)

    def update_sequence_id(self, session, dataset):
        assert dataset.vid == self.d_vid
        assert session

        # NOTE: This next_sequence_id uses a different algorithm than
        # dataset.next_sequence_id.
        # FIXME: replace this one with dataset.next_sequence_id
        self.sequence_id = next_sequence_id(session, dataset._sequence_ids,
                                            self.d_vid, Config)

        self.id = str(GeneralNumber1('F', self.d_vid, self.sequence_id))

    @staticmethod
    def before_insert(mapper, conn, target):
        if not target.sequence_id:
            from ambry.orm.exc import DatabaseError
            assert bool(target.d_vid)
            raise DatabaseError('Must set a sequence id before inserting')

        if not target.id:
            target.id = str(GeneralNumber1('F', target.d_vid, target.sequence_id))

        Config.before_update(mapper, conn, target)

    @staticmethod
    def before_update(mapper, conn, target):
        if object_session(target).is_modified(target, include_collections=False):
            target.modified = time()
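# Illustrative sketch: a config record is addressed by (type, group, key), and
# 'dotted_key' joins them for display or lookup. Transient construction is an
# assumption for this example.
#
#   c = Config(type='metadata', group='about', key='title')
#   assert c.dotted_key == 'metadata.about.title'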


class SourceTable(Base):
    __tablename__ = 'sourcetables'

    vid = SAColumn('st_vid', String(22), primary_key=True)
    sequence_id = SAColumn('st_sequence_id', Integer, nullable=False)
    d_vid = SAColumn('st_d_vid', String(16), ForeignKey('datasets.d_vid'), nullable=False)
    name = SAColumn('st_name', String(50), nullable=False)

    columns = relationship(SourceColumn, backref='table',
                           order_by='asc(SourceColumn.position)',
                           cascade='all, delete-orphan', lazy='joined')

    __table_args__ = (
        UniqueConstraint('st_d_vid', 'st_name', name='_uc_sourcetables'),
    )

    def column(self, source_header_or_pos):
        """Return a column by name or position.

        :param source_header_or_pos: If a string, a source header name. If an integer, a column position.
        :return:
        """
        for c in self.columns:
            if c.source_header == source_header_or_pos:
                assert c.st_vid == self.vid
                return c
            elif c.position == source_header_or_pos:
                assert c.st_vid == self.vid
                return c
        return None

    def add_column(self, position, source_header, datatype, **kwargs):
        """Add a column to the source table.

        :param position: Integer position of the column, starting from 1.
        :param source_header: Name of the column, as it exists in the source file.
        :param datatype: Python datatype ( str, int, float, None ) for the column.
        :param kwargs: Other source record args.
        :return:
        """
        from ..identity import GeneralNumber2

        c = self.column(source_header)
        c_by_pos = self.column(position)

        datatype = 'str' if datatype == 'unicode' else datatype

        assert not c or not c_by_pos or c.vid == c_by_pos.vid

        # Convert almost anything to True / False
        if 'has_codes' in kwargs:
            FALSE_VALUES = ['False', 'false', 'F', 'f', '', None, 0, '0']
            kwargs['has_codes'] = False if kwargs['has_codes'] in FALSE_VALUES else True

        if c:
            # Changing the position can result in conflicts
            assert not c_by_pos or c_by_pos.vid == c.vid
            c.update(position=position,
                     datatype=datatype.__name__ if isinstance(datatype, type) else datatype,
                     **kwargs)

        elif c_by_pos:
            # FIXME This feels wrong; there probably should not be any changes to both versions
            # of the table, since then it won't represent the previous source. Maybe all of the
            # sources should get their own tables initially, then afterward the duplicates can
            # be removed.
            assert not c or c_by_pos.vid == c.vid
            c_by_pos.update(source_header=source_header,
                            datatype=datatype.__name__ if isinstance(datatype, type) else datatype,
                            **kwargs)

        else:
            assert not c and not c_by_pos

            # Hacking an id number, since I don't want to create a new Identity ObjectNumber type
            c = SourceColumn(
                vid=str(GeneralNumber2('C', self.d_vid, self.sequence_id, int(position))),
                position=position,
                st_vid=self.vid,
                d_vid=self.d_vid,
                datatype=datatype.__name__ if isinstance(datatype, type) else datatype,
                source_header=source_header,
                **kwargs)

            self.columns.append(c)

        return c

    @property
    def column_map(self):
        return {c.source_header: c.dest_header for c in self.columns}

    @property
    def column_index_map(self):
        return {c.source_header: c.position for c in self.columns}

    @property
    def headers(self):
        return [c.source_header for c in self.columns]

    @property
    def widths(self):
        widths = [c.width for c in self.columns]

        if not all(bool(e) for e in widths):
            from ambry.dbexceptions import ConfigurationError
            raise ConfigurationError(
                'The widths array for source table {} has zero or null entries'.format(self.name))

        return [int(w) for w in widths]

    def update_id(self, sequence_id=None):
        """Alter the sequence id, and all of the names and ids derived from it.

        This often needs to be done after an IntegrityError in a multiprocessing run."""
        from ..identity import GeneralNumber1

        if sequence_id:
            self.sequence_id = sequence_id

        self.vid = str(GeneralNumber1('T', self.d_vid, self.sequence_id))

    def __str__(self):
        from tabulate import tabulate

        headers = 'Pos Source_Header Dest_Header Datatype'.split()
        rows = [(c.position, c.source_header, c.dest_header, c.datatype) for c in self.columns]

        return 'Source Table: {}\n'.format(self.name) + tabulate(rows, headers)
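

# Illustrative usage sketch (not part of the ORM), assuming an open session and a
# Dataset bound to a database. The table name and the dest_header kwarg are
# hypothetical; kwargs are passed through to the SourceColumn record.
def _example_build_source_table(dataset):
    st = dataset.new_source_table('example_table')
    st.add_column(1, 'id', int)
    st.add_column(2, 'name', str, dest_header='full_name')
    # 'has_codes' is coerced to a bool; '0', '', None, 'false', etc. all become False.
    st.add_column(3, 'race', str, has_codes='0')
    return st.column_map  # maps source headers to destination headers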


class Dataset(Base):
    __tablename__ = 'datasets'

    vid = SAColumn('d_vid', String(13), primary_key=True)
    id = SAColumn('d_id', String(10))
    name = SAColumn('d_name', String(200), nullable=False, index=True)
    vname = SAColumn('d_vname', String(200), unique=True, nullable=False, index=True)
    fqname = SAColumn('d_fqname', String(200), unique=True, nullable=False)
    cache_key = SAColumn('d_cache_key', String(200), unique=True, nullable=False, index=True)
    source = SAColumn('d_source', String(200), nullable=False)
    dataset = SAColumn('d_dataset', String(200), nullable=False)
    subset = SAColumn('d_subset', String(200))
    variation = SAColumn('d_variation', String(200))
    btime = SAColumn('d_btime', String(200))
    bspace = SAColumn('d_bspace', String(200))
    revision = SAColumn('d_revision', Integer, nullable=False)
    version = SAColumn('d_version', String(20), nullable=False)

    space_coverage = SAColumn('d_scov', MutationList.as_mutable(JSONEncodedObj))
    time_coverage = SAColumn('d_tcov', MutationList.as_mutable(JSONEncodedObj))
    grain_coverage = SAColumn('d_gcov', MutationList.as_mutable(JSONEncodedObj))

    # Sequence IDs for various child objects. We need records of these IDs to be able to
    # construct objects in multi-process environments. The sequence numbers become part of
    # the VIDs and must be unique.
    p_sequence_id = SAColumn('d_p_sequence_id', Integer, default=1)
    t_sequence_id = SAColumn('d_t_sequence_id', Integer, default=1)
    st_sequence_id = SAColumn('d_st_sequence_id', Integer, default=1)

    state = SAColumn('d_state', String(20),
                     doc='Indicates last operation on the dataset')  # Note! Different from Bundle.state!

    upstream = SAColumn('d_upstream', String(200), doc='The URL of the upstream source')

    data = SAColumn('d_data', MutationDict.as_mutable(JSONEncodedObj))

    # ---- Relationships
    tables = relationship('Table', backref='dataset', cascade='all, delete-orphan')
    partitions = relationship('Partition', backref='dataset', cascade='all, delete-orphan')
    configs = relationship('Config', backref='dataset', cascade='all, delete-orphan')
    files = relationship('File', backref='dataset', cascade='all, delete-orphan')
    source_tables = relationship('SourceTable', backref='dataset', cascade='all, delete-orphan')
    source_columns = relationship('SourceColumn', backref='dataset', cascade='all, delete-orphan')
    sources = relationship('DataSource', backref='dataset', cascade='all, delete-orphan')
    codes = relationship('Code', backref='dataset', cascade='all, delete-orphan')

    path = None  # Set by the Library and other queries.

    _database = None  # Reference to the database, when the dataset is retrieved from a database object

    _sequence_ids = {}  # Cache of sequence numbers (Is this still used?)
    def __init__(self, *args, **kwargs):
        super(Dataset, self).__init__(*args, **kwargs)

        if self.vid and not self.id:
            self.revision = ObjectNumber.parse(self.vid).revision
            self.id = str(ObjectNumber.parse(self.vid).rev(None))

        if not self.id:
            dn = DatasetNumber(None, self.revision)
            self.vid = str(dn)
            self.id = str(dn.rev(None))
        elif not self.vid:
            try:
                self.vid = str(ObjectNumber.parse(self.id).rev(self.revision))
            except ValueError as e:
                raise ValueError('Could not parse id value; ' + e.message)

        if not self.revision:
            self.revision = 1

        if self.cache_key is None:
            self.cache_key = self.identity.name.cache_key

        if not self.name:
            self.name = str(self.identity.name)

        if not self.vname:
            self.vname = str(self.identity.vname)

        if not self.fqname:
            self.fqname = str(self.identity.fqname)

        if not self.version:
            self.version = str(self.identity.version)

        assert self.vid[0] == 'd'

    def incver(self):
        """Increment all of the version numbers"""
        d = {}
        for p in self.__mapper__.attrs:
            if p.key in ['vid', 'vname', 'fqname', 'version', 'cache_key']:
                continue
            if p.key == 'revision':
                d[p.key] = self.revision + 1
            else:
                d[p.key] = getattr(self, p.key)

        return Dataset(**d)

    def commit(self):
        self._database.commit()

    def rollback(self):
        self._database.rollback()

    @property
    def session(self):
        return self._database.session

    def query(self, *args, **kwargs):
        return self.session.query(*args, **kwargs)

    def close(self):
        return self._database.close()

    def close_session(self):
        return self._database.close_session()

    @property
    def identity(self):
        from ..identity import Identity
        return Identity.from_dict(self.dict)

    @property
    def config(self):
        return ConfigAccessor(self)

    def next_sequence_id(self, table_class, force_query=False):
        """Return the next sequence id for an object, identified by the vid of the parent
        object and the database prefix for the child object.

        On the first call, this loads the max sequence number from the database, but
        subsequent calls run in-process, so it isn't suitable for multi-process operation --
        all of the tables in a dataset should be created by one process.

        The child table must have a sequence_id value.
        """
        from . import next_sequence_id
        from sqlalchemy.orm import object_session

        return next_sequence_id(object_session(self), self._sequence_ids, self.vid, table_class,
                                force_query=force_query)

    def new_unique_object(self, table_class, sequence_id=None, force_query=False, **kwargs):
        """Use next_sequence_id to create a new child of the dataset, with a unique id."""
        from sqlalchemy.exc import IntegrityError
        from sqlalchemy.orm.exc import FlushError

        # If a sequence ID was specified, the caller is certain that there is no potential
        # for conflicts, so there is no need to commit here.
        if not sequence_id:
            commit = True
            sequence_id = self.next_sequence_id(table_class, force_query=force_query)
        else:
            commit = False

        o = table_class(d_vid=self.vid, **kwargs)
        o.update_id(sequence_id)

        if commit is False:
            return o

        self.commit()

        if self._database.driver == 'sqlite':
            # The Sqlite database can't have concurrency, so there is no problem.
            self.session.add(o)
            self.commit()
            return o
        else:
            # Postgres. Concurrency is a bitch.
            table_name = table_class.__tablename__
            child_sequence_id = table_class.sequence_id.property.columns[0].name

            try:
                self.session.add(o)
                self.commit()
                return o
            except (IntegrityError, FlushError):
                self.rollback()
                self.session.merge(self)
                print('Failed')
                return None

            # NOTE: The retry loop below is currently unreachable (every path above returns),
            # but it is kept because, per the original comment: this is horrible, but it's the
            # only thing that has worked for both Sqlite and Postgres in both single processes
            # and multiprocesses.
            d_vid = self.vid

            while True:
                try:
                    self.session.add(o)
                    self.commit()
                    return o
                except (IntegrityError, FlushError):
                    self.rollback()
                    self.session.expunge_all()
                    ds = self._database.dataset(d_vid)
                    sequence_id = ds.next_sequence_id(table_class, force_query=True)
                    o.update_id(sequence_id)
                except Exception as e:
                    print('Completely failed to get a new {} sequence_id; {}'.format(table_class, e))
                    self.rollback()
                    import traceback
                    # This bit is helpful in a multiprocessing run.
                    tb = traceback.format_exc()
                    print(tb)
                    raise

    def table(self, ref):
        from .exc import NotFoundError
        from .table import Table

        table_name = Table.mangle_name(str(ref))

        for t in self.tables:
            if table_name == t.name or str(ref) == t.id or str(ref) == t.vid:
                return t

        raise NotFoundError(
            "Failed to find table for ref '{}' in dataset '{}'".format(ref, self.name))

    def new_table(self, name, add_id=True, **kwargs):
        """Add a table to the schema, or update it if it already exists.

        If updating, will only update data.
        """
        from . import Table
        from .exc import NotFoundError

        try:
            table = self.table(name)
            extant = True
        except NotFoundError:
            extant = False

            if 'sequence_id' not in kwargs:
                kwargs['sequence_id'] = self._database.next_sequence_id(Dataset, self.vid, Table)

            table = Table(name=name, d_vid=self.vid, **kwargs)
            table.update_id()

        # Update possibly extant data
        table.data = dict(
            (list(table.data.items()) if table.data else []) + list(kwargs.get('data', {}).items()))

        for key, value in list(kwargs.items()):
            if not key:
                continue
            if key[0] != '_' and key not in ['vid', 'id', 'id_', 'd_id', 'name', 'sequence_id',
                                             'table', 'column', 'data']:
                setattr(table, key, value)

        if add_id:
            table.add_id_column()

        if not extant:
            self.tables.append(table)

        return table

    def new_partition(self, table, **kwargs):
        """Create a new partition and return it.

        Args:
            table (orm.Table):

        Returns:
            orm.Partition
        """
        from . import Partition

        # Create the basic partition record, with a sequence ID.
        if isinstance(table, string_types):
            table = self.table(table)

        if 'sequence_id' in kwargs:
            sequence_id = kwargs['sequence_id']
            del kwargs['sequence_id']
        else:
            sequence_id = self._database.next_sequence_id(Dataset, self.vid, Partition)

        p = Partition(t_vid=table.vid, table_name=table.name, sequence_id=sequence_id,
                      dataset=self, d_vid=self.vid, **kwargs)
        p.update_id()

        return p

    def partition(self, ref=None, **kwargs):
        """Return a partition by ref."""
        from .exc import NotFoundError
        from six import text_type

        if ref:
            for p in self.partitions:
                # This is slow for large datasets, like Census years.
                if (text_type(ref) == text_type(p.name) or text_type(ref) == text_type(p.id)
                        or text_type(ref) == text_type(p.vid)):
                    return p

            raise NotFoundError(
                "Failed to find partition for ref '{}' in dataset '{}'".format(ref, self.name))

        elif kwargs:
            from ..identity import PartitionNameQuery
            pnq = PartitionNameQuery(**kwargs)
            return self._find_orm(pnq)

    def _find_orm(self, pnq):
        """Return a Partition object from the database based on a PartitionId.

        An ORM object is returned, so changes can be persisted.
""" # import sqlalchemy.orm.exc from ..identity import PartitionNameQuery, NameQuery from ambry.orm import Partition as OrmPartition # , Table from sqlalchemy.orm import joinedload # , joinedload_all assert isinstance( pnq, PartitionNameQuery), "Expected PartitionNameQuery, got {}".format( type(pnq)) pnq = pnq.with_none() q = self.bundle.database.session.query(OrmPartition) if pnq.fqname is not NameQuery.ANY: q = q.filter(OrmPartition.fqname == pnq.fqname) elif pnq.vname is not NameQuery.ANY: q = q.filter(OrmPartition.vname == pnq.vname) elif pnq.name is not NameQuery.ANY: q = q.filter(OrmPartition.name == str(pnq.name)) else: if pnq.time is not NameQuery.ANY: q = q.filter(OrmPartition.time == pnq.time) if pnq.space is not NameQuery.ANY: q = q.filter(OrmPartition.space == pnq.space) if pnq.grain is not NameQuery.ANY: q = q.filter(OrmPartition.grain == pnq.grain) if pnq.format is not NameQuery.ANY: q = q.filter(OrmPartition.format == pnq.format) if pnq.segment is not NameQuery.ANY: q = q.filter(OrmPartition.segment == pnq.segment) if pnq.table is not NameQuery.ANY: if pnq.table is None: q = q.filter(OrmPartition.t_id is None) else: tr = self.bundle.schema.table(pnq.table) if not tr: raise ValueError( "Didn't find table named {} in {} bundle path = {}" .format(pnq.table, pnq.vname, self.bundle.database.path)) q = q.filter(OrmPartition.t_id == tr.id_) ds = self.bundle.dataset q = q.filter(OrmPartition.d_vid == ds.vid) q = q.order_by(OrmPartition.vid.asc()).order_by( OrmPartition.segment.asc()) q = q.options(joinedload(OrmPartition.table)) return q def delete_tables_partitions(self): self.t_sequence_id = 1 self.p_sequence_id = 1 return self._database.delete_tables_partitions(self) def delete_partitions(self): self.p_sequence_id = 1 return self._database.delete_partitions(self) def new_source(self, name, **kwargs): from .source import DataSource from ..identity import GeneralNumber1 if 'sequence_id' not in kwargs: kwargs['sequence_id'] = self.next_sequence_id(DataSource) if 'd_vid' not in kwargs: kwargs['d_vid'] = self.vid else: assert kwargs['d_vid'] == self.vid if 'vid' not in kwargs: kwargs['vid'] = str( GeneralNumber1('S', self.vid, int(kwargs['sequence_id']))) source = DataSource(name=name, **kwargs) object_session(self).add(source) return source def source_file(self, name): from .source import DataSource source = object_session(self)\ .query(DataSource)\ .filter(DataSource.name == name)\ .filter(DataSource.d_vid == self.vid)\ .first() if not source: # Try as a source vid source = object_session(self) \ .query(DataSource) \ .filter(DataSource.vid == name) \ .filter(DataSource.d_vid == self.vid) \ .first() if not source: from .exc import NotFoundError raise NotFoundError( "Failed to find source for name : '{}' ".format(name)) return source def new_source_table(self, name, sequence_id=None): from .source_table import SourceTable extant = next(iter(e for e in self.source_tables if e.name == name), None) if extant: return extant if not sequence_id: sequence_id = self._database.next_sequence_id( Dataset, self.vid, SourceTable) assert sequence_id table = SourceTable(name=name, d_vid=self.vid, sequence_id=sequence_id) table.update_id() self.source_tables.append(table) assert table.sequence_id return table def source_table(self, name): for st in self.source_tables: if st.name == name: return st return None def bsfile(self, path): """Return a Build Source file ref, creating a new one if the one requested does not exist""" from sqlalchemy.orm.exc import NoResultFound from ambry.orm.exc import 
        try:
            f = object_session(self)\
                .query(File)\
                .filter(File.d_vid == self.vid)\
                .filter(File.major_type == File.MAJOR_TYPE.BUILDSOURCE)\
                .filter(File.path == path)\
                .one()
            return f
        except NoResultFound:
            raise NotFoundError("Failed to find file for path '{}' ".format(path))

    def new_bsfile(self, file_const, path):
        fr = File(
            d_vid=self.vid,
            major_type=File.MAJOR_TYPE.BUILDSOURCE,
            minor_type=file_const,
            path=path,
            # modified = int(time.time()),  # In case content isn't set, which is where modified is set normally
            source='fs')

        self.files.append(fr)

        object_session(self).add(fr)

        return fr

    def find_or_new_bsfile(self, file_const, path):
        from ambry.orm.exc import NotFoundError

        try:
            return self.bsfile(path)
        except NotFoundError:
            return self.new_bsfile(file_const, path)

    @property
    def dict(self):
        d = {
            'id': self.id,
            'vid': self.vid,
            'name': self.name,
            'vname': self.vname,
            'fqname': self.fqname,
            'cache_key': self.cache_key,
            'source': self.source,
            'dataset': self.dataset,
            'subset': self.subset,
            'variation': self.variation,
            'btime': self.btime,
            'bspace': self.bspace,
            'revision': self.revision,
            'version': self.version,
            'upstream': self.upstream
        }

        if self.data:
            for k in self.data:
                assert k not in d
                d[k] = self.data[k]

        return d

    def row(self, fields):
        """Return a row of values for the given set of fields, for CSV files, pretty
        printing, etc."""
        d = self.dict

        row = [None] * len(fields)

        for i, f in enumerate(fields):
            if f in d:
                row[i] = d[f]

        return row

    def __repr__(self):
        return """<datasets: id={} vid={} name={} source={} ds={} ss={} var={} rev={}>""".format(
            self.id, self.vid, self.name, self.source, self.dataset, self.subset,
            self.variation, self.revision)
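

# Illustrative usage sketch (not part of the ORM): the typical child-object creation
# flow on a Dataset checked out from a Database. `db` and the vid are hypothetical.
def _example_new_partition(db):
    ds = db.dataset('d000example001')     # look up the Dataset record
    t = ds.new_table('demo_table')        # create, or update if it already exists
    p = ds.new_partition(t, time='2010')  # partition record with a fresh sequence id
    ds.commit()
    return p.vid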


class Code(Base):
    """Code entries for variables."""

    __tablename__ = 'codes'

    c_vid = SAColumn('cd_c_vid', String(20), ForeignKey('columns.c_vid'),
                     primary_key=True, index=True, nullable=False)
    d_vid = SAColumn('cd_d_vid', String(20), ForeignKey('datasets.d_vid'),
                     primary_key=True, nullable=False, index=True)

    key = SAColumn('cd_skey', String(20), primary_key=True, nullable=False,
                   index=True)  # String version of the key, the value in the dataset

    ikey = SAColumn('cd_ikey', Integer, index=True)  # Set only if the key is actually an integer value

    value = SAColumn('cd_value', Text, nullable=False)  # The value the key maps to

    description = SAColumn('cd_description', Text)
    source = SAColumn('cd_source', Text)

    data = SAColumn('cd_data', MutationDict.as_mutable(JSONEncodedObj))

    def __init__(self, **kwargs):
        for p in self.__mapper__.attrs:
            if p.key in kwargs:
                setattr(self, p.key, kwargs[p.key])
                del kwargs[p.key]

        if self.data:
            self.data.update(kwargs)

    def __repr__(self):
        return '<code: {}->{} >'.format(self.key, self.value)

    def update(self, f):
        """Copy another file's properties into this one."""
        for p in self.__mapper__.attrs:
            if p.key == 'oid':
                continue
            try:
                setattr(self, p.key, getattr(f, p.key))
            except AttributeError:
                # The dict() method copies data property values into the main dict,
                # and these don't have associated class properties.
                continue

    @property
    def insertable_dict(self):
        d = {('cd_' + k).strip('_'): v for k, v in iteritems(self.dict)}

        # The `key` property is not named after its db column
        d['cd_skey'] = d['cd_key']
        del d['cd_key']

        return d

    @staticmethod
    def before_insert(mapper, conn, target):
        target.d_vid = str(ObjectNumber.parse(target.c_vid).as_dataset)
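

# Illustrative usage sketch (not part of the ORM): a Code row maps a stored key to a
# human-readable value. The c_vid is hypothetical; on flush, before_insert would
# derive d_vid from it.
def _example_code():
    c = Code(c_vid='c00example0101001', key='1', ikey=1, value='White')
    assert repr(c) == '<code: 1->White >'
    return c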


class Account(Base):
    __tablename__ = 'accounts'

    id = SAColumn('ac_id', Integer, primary_key=True)
    d_vid = SAColumn('ac_d_vid', String(20), ForeignKey('datasets.d_vid'), index=True)

    user_id = SAColumn('ac_user_id', Text, index=True, doc='Ambry User')
    organization_id = SAColumn('ac_org_id', Text, index=True, doc='Ambry Organization')

    major_type = SAColumn('ac_major_type', Text,
                          doc='Major type, often name of service or account providing company')
    minor_type = SAColumn('ac_minor_type', Text, doc='Minor type, subtype of the major type')

    # Foreign account identifier, often a bucket name or domain name.
    # The key used to reference the account.
    account_id = SAColumn('ac_account_id', Text, unique=True)

    # NOTE: the 'ac_org' / 'ac_url' column names below appear to be swapped relative to
    # the attribute names; presumably historical and baked into existing databases.
    url = SAColumn('ac_org', Text, doc='URL of service')
    access_key = SAColumn('ac_access', Text, doc='Access token or username')
    encrypted_secret = SAColumn('ac_secret', Text, doc='Symmetrically encrypted secret')
    encrypted_password = SAColumn('ac_password', Text, doc='Asymmetrically encrypted user password')
    name = SAColumn('ac_name', Text, doc="Person's name")
    email = SAColumn('ac_email', Text, doc='Email for foreign account')
    org = SAColumn('ac_url', Text, doc='Organization name')
    comment = SAColumn('ac_comment', Text)
    data = SAColumn('ac_data', MutationDict.as_mutable(JSONEncodedObj))

    __table_args__ = (
        UniqueConstraint('ac_account_id', 'ac_access', name='_uc_account_1'),
    )

    secret_password = None  # Must be set to encrypt or decrypt the secret

    def incver(self):
        """Increment all of the version numbers and return a new object"""
        from . import incver
        return incver(self, ['d_vid'])

    @staticmethod
    def sym_encrypt(password, v):
        return encrypt(password, v).encode('base64')

    @staticmethod
    def sym_decrypt(password, v):
        import binascii

        try:
            return decrypt(password, v.decode('base64'))
        except SC_DecryptionException:
            raise AccountDecryptionError('Wrong password')
        except binascii.Error as e:
            raise AccountDecryptionError('Bad password: {}'.format(e))

    @property
    def secret(self):
        assert self.secret_password  # The encryption password

        if self.encrypted_secret:
            return self.sym_decrypt(self.secret_password, self.encrypted_secret)
        else:
            return None

    @secret.setter
    def secret(self, v):
        assert self.secret_password  # The encryption password
        self.encrypted_secret = self.sym_encrypt(self.secret_password, v)

    def decrypt_secret(self, password=None):
        if not password:
            password = self.secret_password

        if not self.encrypted_secret:
            return None

        if self.major_type == 'user':
            return None  # These can't be decrypted, only tested.
        if password:
            try:
                return self.sym_decrypt(password, self.encrypted_secret)
            except AccountDecryptionError as e:
                raise AccountDecryptionError(
                    "Decryption error for account '{}': {}".format(self.account_id, e))
        else:
            raise MissingPasswordError('Must have a password to get or set the secret')

    def encrypt_secret(self, v, password=None):
        if not password:
            password = self.secret_password

        if password:
            self.encrypted_secret = self.sym_encrypt(password, v)
        else:
            raise MissingPasswordError('Must have a password to get or set the secret')

        return self.encrypted_secret

    @property
    def password(self):
        raise NotImplementedError('Use test()')

    @password.setter
    def password(self, v):
        assert self.secret_password
        self.encrypted_password = self.sym_encrypt(self.secret_password, v)

    def encrypt_password(self, v):
        from passlib.hash import pbkdf2_sha512
        assert v is not None
        self.encrypted_password = pbkdf2_sha512.encrypt(v, rounds=50000, salt_size=16)

    def test(self, v):
        """Test a value against the stored password hash."""
        from passlib.hash import pbkdf2_sha512
        assert self.encrypted_password is not None
        return pbkdf2_sha512.verify(v, self.encrypted_password)

    @staticmethod
    def before_insert(mapper, conn, target):
        Account.before_update(mapper, conn, target)

    @staticmethod
    def before_update(mapper, conn, target):
        pass

    @classmethod
    def prop_map(cls):
        prop_map = {
            'service': 'major_type',
            'host': 'url',
            'organization': 'org',
            'apikey': 'secret',
            'access': 'access_key',
            'access_key': 'access_key',
            'secret': 'secret',
            'name': 'name',
            'org': 'org',
            'url': 'url',
            'email': 'email',
        }

        for p in cls.__mapper__.attrs:
            prop_map[p.key] = p.key

        return prop_map

    @property
    def dict(self):
        """A dict that holds key/values for all of the properties in the object.

        :return:
        """
        d = {p.key: getattr(self, p.key) for p in self.__mapper__.attrs if p.key not in ('data',)}

        d['secret'] = 'not available'

        if self.secret_password:
            try:
                d['secret'] = self.decrypt_secret()
            except AccountDecryptionError:
                pass

        if self.data:
            for k, v in self.data.items():
                d[k] = v

        return d
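

# Illustrative usage sketch (not part of the ORM): the one-way password flow.
# pbkdf2_sha512 stores a salted hash, so a password can be verified with test()
# but never recovered, unlike the symmetrically encrypted secret.
def _example_account_password():
    acct = Account(account_id='example-bucket')
    acct.encrypt_password('hunter2')
    assert acct.test('hunter2') is True
    assert acct.test('wrong') is False
    return acct.encrypted_password  # a '$pbkdf2-sha512$...' hash string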


class Plot(Base):
    """Records of plots; links measures, dimensions and other plot configuration."""

    __tablename__ = 'plots'

    id = SAColumn('f_id', Integer, primary_key=True)

    d_vid = SAColumn('pl_d_vid', String(20), ForeignKey('datasets.d_vid'), nullable=False, index=True)
    dataset = relationship('Dataset', backref='plots')

    p_vid = SAColumn('pl_p_vid', String(20), ForeignKey('partitions.p_vid'), nullable=False, index=True)
    partition = relationship('Partition', backref='plots')

    title = SAColumn('pl_title', Text)
    description = SAColumn('pl_description', Text)

    type = SAColumn('pl_type', String(20))

    measure1 = SAColumn('pl_measure1', String(20), ForeignKey('columns.c_vid'), nullable=True)
    measure2 = SAColumn('pl_measure2', String(20), ForeignKey('columns.c_vid'), nullable=True)

    dimension1 = SAColumn('pl_dimension1', String(20), ForeignKey('columns.c_vid'), nullable=True)
    dimension2 = SAColumn('pl_dimension2', String(20), ForeignKey('columns.c_vid'), nullable=True)
    dimension3 = SAColumn('pl_dimension3', String(20), ForeignKey('columns.c_vid'), nullable=True)

    d1text = SAColumn('pl_d1text', Text)
    d2text = SAColumn('pl_d2text', Text)
    d3text = SAColumn('pl_d3text', Text)

    error1 = SAColumn('pl_error1', String(20), ForeignKey('columns.c_vid'), nullable=True)
    error2 = SAColumn('pl_error2', String(20), ForeignKey('columns.c_vid'), nullable=True)

    multiple1 = SAColumn('pl_multiple1', String(20), ForeignKey('columns.c_vid'), nullable=True)
    multiple2 = SAColumn('pl_multiple2', String(20), ForeignKey('columns.c_vid'), nullable=True)

    data = SAColumn('pl_data', MutationDict.as_mutable(JSONEncodedObj))

    # NOTE: this method relies on `table`, `primary_dimensions` and `analysis` attributes
    # that are not defined on Plot in this module; they appear to be provided by a
    # partition-like host object. The measure/p_dim/s_dim parameters were missing from the
    # signature but are used in the body and documented below, so they have been restored.
    def dataframe(self, measure, p_dim, s_dim=None, filtered_dims=None, unstack=False,
                  df_class=None, add_code=False):
        """Yield rows in a reduced format, with one dimension as an index, one measure
        column per secondary dimension, and all other dimensions filtered.

        :param measure: The column name of a measure.
        :param p_dim: The primary dimension. This will be the index of the dataframe.
        :param s_dim: A secondary dimension. The returned frame will be unstacked on this dimension.
        :param unstack:
        :param filtered_dims: A dict of dimension column names that are filtered, mapped
            to the dimension value to select.
        :param add_code: When substituting a label for a column, also add the code value.
        :return:
        """
        from six import text_type

        filtered_dims = filtered_dims or {}

        measure = self.table.column(measure)
        p_dim = self.table.column(p_dim)

        assert measure
        assert p_dim

        if s_dim:
            s_dim = self.table.column(s_dim)

        def maybe_quote(v):
            from six import string_types
            if isinstance(v, string_types):
                return '"{}"'.format(v)
            else:
                return v

        all_dims = [p_dim.name] + list(filtered_dims.keys())

        if s_dim:
            all_dims.append(s_dim.name)

        all_dims = [text_type(c) for c in all_dims]

        # "primary_dimensions" means something different here: all of the dimensions in the
        # dataset that do not have children.
        primary_dims = [text_type(c.name) for c in self.primary_dimensions]

        if set(all_dims) != set(primary_dims):
            raise ValueError(
                'The primary, secondary and filtered dimensions must cover all dimensions; '
                '{} != {}'.format(sorted(all_dims), sorted(primary_dims)))

        columns = []

        p_dim_label = None
        s_dim_label = None

        if p_dim.label:
            # For geographic datasets, we also need the gvid
            if p_dim.type_is_gvid:
                columns.append(p_dim.name)

            p_dim = p_dim_label = p_dim.label
            columns.append(p_dim_label.name)
        else:
            columns.append(p_dim.name)

        if s_dim:
            if s_dim.label:
                s_dim = s_dim_label = s_dim.label
                columns.append(s_dim_label.name)
            else:
                columns.append(s_dim.name)

        columns.append(measure.name)

        # Create the predicate to filter out the filtered dimensions
        if filtered_dims:
            code = ' and '.join(
                'row.{} == {}'.format(k, maybe_quote(v)) for k, v in filtered_dims.items())
            predicate = eval('lambda row: {}'.format(code))
        else:
            predicate = lambda row: True

        df = self.analysis.dataframe(predicate, columns=columns, df_class=df_class)

        if unstack:
            # Need to set the s_dim in the index to get a hierarchical index, which is
            # required for unstacking. The final df will have only the p_dim as an index.
            if s_dim:
                df = df.set_index([p_dim.name, s_dim.name])
                df = df.unstack()
                df.columns = df.columns.get_level_values(1)
            else:
                # Can't actually unstack without a second dimension.
                df = df.set_index(p_dim.name)

        df.reset_index()  # NOTE: result is discarded; possibly intended as df = df.reset_index()

        return df
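

# Illustrative sketch (not part of the ORM): how Plot.dataframe assembles its
# filtered-dimension predicate. String values are quoted, then the expression is
# compiled into a lambda with eval; the dimension names here are hypothetical.
def _example_plot_predicate():
    from collections import namedtuple
    from six import string_types

    filtered_dims = {'race': 'all', 'year': 2010}
    code = ' and '.join(
        'row.{} == {}'.format(k, '"{}"'.format(v) if isinstance(v, string_types) else v)
        for k, v in filtered_dims.items())
    predicate = eval('lambda row: {}'.format(code))

    Row = namedtuple('Row', 'race year')
    assert predicate(Row(race='all', year=2010)) is True
    assert predicate(Row(race='white', year=2010)) is False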


class Partition(Base):
    __tablename__ = 'partitions'

    STATES = Constant()
    STATES.SYNCED = 'synced'
    STATES.CLEANING = 'cleaning'
    STATES.CLEANED = 'cleaned'
    STATES.PREPARING = 'preparing'
    STATES.PREPARED = 'prepared'
    STATES.BUILDING = 'building'
    STATES.BUILT = 'built'
    STATES.COALESCING = 'coalescing'
    STATES.COALESCED = 'coalesced'
    STATES.ERROR = 'error'
    STATES.FINALIZING = 'finalizing'
    STATES.FINALIZED = 'finalized'
    STATES.INSTALLING = 'installing'
    STATES.INSTALLED = 'installed'

    TYPE = Constant()  # Was `TYPE = Constant`, which would have set attributes on the class itself
    TYPE.SEGMENT = 's'
    TYPE.UNION = 'u'

    sequence_id = SAColumn('p_sequence_id', Integer)
    vid = SAColumn('p_vid', String(16), primary_key=True, nullable=False)
    id = SAColumn('p_id', String(13), nullable=False)
    d_vid = SAColumn('p_d_vid', String(13), ForeignKey('datasets.d_vid'), nullable=False, index=True)
    t_vid = SAColumn('p_t_vid', String(15), ForeignKey('tables.t_vid'), nullable=False, index=True)

    name = SAColumn('p_name', String(200), nullable=False, index=True)
    vname = SAColumn('p_vname', String(200), unique=True, nullable=False, index=True)
    fqname = SAColumn('p_fqname', String(200), unique=True, nullable=False, index=True)

    title = SAColumn('p_title', String())
    description = SAColumn('p_description', String())
    notes = SAColumn('p_notes', String())

    cache_key = SAColumn('p_cache_key', String(200), unique=True, nullable=False, index=True)

    parent_vid = SAColumn('p_p_vid', String(16), ForeignKey('partitions.p_vid'), nullable=True, index=True)

    ref = SAColumn('p_ref', String(16), index=True,
                   doc='VID reference to an earlier version to use instead of this one.')

    type = SAColumn('p_type', String(20), default=TYPE.UNION,
                    doc='u - normal partition, s - segment')

    table_name = SAColumn('p_table_name', String(50))

    time = SAColumn('p_time', String(20))  # FIXME: add helptext
    space = SAColumn('p_space', String(50))
    grain = SAColumn('p_grain', String(50))
    variant = SAColumn('p_variant', String(50))
    format = SAColumn('p_format', String(50))
    segment = SAColumn('p_segment', Integer,
                       doc='Part of a larger partition. segment_id is usually also a source ds_id')
    epsg = SAColumn('p_epsg', Integer,
                    doc='EPSG SRID for the reference system of a geographic dataset.')

    # The partition could hold data that is considered a dimension -- if multiple datasets
    # were joined, that dimension would be a dimension column, but it only has a single
    # value in each partition. That could be part of the name, or it could be declared in
    # a table, with a single value for all of the rows in a partition.

    min_id = SAColumn('p_min_id', BigIntegerType)
    max_id = SAColumn('p_max_id', BigIntegerType)

    count = SAColumn('p_count', Integer)

    state = SAColumn('p_state', String(50))

    data = SAColumn('p_data', MutationDict.as_mutable(JSONEncodedObj))

    space_coverage = SAColumn('p_scov', MutationList.as_mutable(JSONEncodedObj))
    time_coverage = SAColumn('p_tcov', MutationList.as_mutable(JSONEncodedObj))
    grain_coverage = SAColumn('p_gcov', MutationList.as_mutable(JSONEncodedObj))

    installed = SAColumn('p_installed', String(100))
    _location = SAColumn('p_location', String(100))  # Location of the data file

    __table_args__ = (
        # ForeignKeyConstraint([d_vid, d_location], ['datasets.d_vid', 'datasets.d_location']),
        UniqueConstraint('p_sequence_id', 'p_d_vid', name='_uc_partitions_1'),
    )

    # For the primary table for the partition. There is one per partition, but a table
    # can be primary in multiple partitions.
    table = relationship('Table', backref='partitions', foreign_keys='Partition.t_vid')

    stats = relationship(ColumnStat, backref='partition', cascade='all, delete, delete-orphan')

    children = relationship('Partition', backref=backref('parent', remote_side=[vid]), cascade='all')

    _bundle = None  # Set when returned from a bundle.
    _datafile = None  # TODO: Unused variable.
    _datafile_writer = None  # TODO: Unused variable.
    _stats_dict = None

    @property
    def identity(self):
        """Return this partition's information as a PartitionId."""

        if self.dataset is None:
            # The relationship will be null until the object is committed
            s = object_session(self)
            ds = s.query(Dataset).filter(Dataset.id_ == self.d_id).one()
        else:
            ds = self.dataset

        d = {
            'id': self.id,
            'vid': self.vid,
            'name': self.name,
            'vname': self.vname,
            'ref': self.ref,
            'space': self.space,
            'time': self.time,
            'table': self.table_name,
            'grain': self.grain,
            'variant': self.variant,
            'segment': self.segment,
            'format': self.format if self.format else 'db'
        }

        return PartitionIdentity.from_dict(dict(list(ds.dict.items()) + list(d.items())))

    @property
    def display(self):
        """Return an accessor object to get display titles and descriptions."""
        return PartitionDisplay(self)

    @property
    def bundle(self):
        return self._bundle  # Set externally, such as in Bundle.wrap_partition

    @property
    def is_segment(self):
        return self.type == self.TYPE.SEGMENT

    @property
    def headers(self):
        return [c.name for c in self.table.columns]

    def __repr__(self):
        return '<partition: {} {}>'.format(self.vid, self.vname)

    def set_stats(self, stats):
        self.stats[:] = []  # Delete existing stats

        for c in self.table.columns:
            if c.name not in stats:
                continue

            d = stats[c.name].dict
            del d['name']
            del d['flags']

            cs = ColumnStat(p_vid=self.vid, d_vid=self.d_vid, c_vid=c.vid, **d)
            self.stats.append(cs)

    def parse_gvid_or_place(self, gvid_or_place):
        try:
            return parse_to_gvid(gvid_or_place)
        except KeyError:
            places = list(self._bundle._library.search.search_identifiers(gvid_or_place))

            if not places:
                err_msg = "Failed to find space identifier '{}' in full " \
                          "text identifier search for partition '{}'" \
                    .format(gvid_or_place, str(self.identity))
                self._bundle.error(err_msg)
                return None

            return parse_to_gvid(places[0].vid)

    def set_coverage(self, stats):
        """Extract time, space and grain coverage from the stats and store them in the partition."""
        from ambry.util.datestimes import expand_to_years

        scov = set()
        tcov = set()
        grains = set()

        def summarize_maybe(gvid):
            try:
                return parse_to_gvid(gvid).summarize()
            except Exception:
                return None

        def simplifiy_maybe(values, column):
            parsed = []

            for gvid in values:
                # The gvid should not be a string 'None'
                if gvid is None or gvid == 'None':
                    continue
                try:
                    parsed.append(parse_to_gvid(gvid))
                except ValueError as e:
                    if self._bundle:
                        self._bundle.warn('While analyzing geo coverage in final partition stage, '
                                          "failed to parse gvid '{}' in {}.{}: {}"
                                          .format(str(gvid), column.table.name, column.name, e))

            try:
                return isimplify(parsed)
            except Exception:
                return None

        def int_maybe(year):
            try:
                return int(year)
            except Exception:
                return None

        for c in self.table.columns:
            if c.name not in stats:
                continue

            try:
                if stats[c.name].is_gvid or stats[c.name].is_geoid:
                    scov |= set(x for x in simplifiy_maybe(stats[c.name].uniques, c))
                    grains |= set(summarize_maybe(gvid) for gvid in stats[c.name].uniques)
                elif stats[c.name].is_year:
                    tcov |= set(int_maybe(x) for x in stats[c.name].uniques)
                elif stats[c.name].is_date:
                    # The fuzzy=True argument allows ignoring the '-' char in dates produced by .isoformat()
                    try:
                        tcov |= set(parser.parse(x, fuzzy=True).year if isinstance(x, string_types)
                                    else x.year for x in stats[c.name].uniques)
                    except ValueError:
                        pass

            except Exception as e:
                self._bundle.error("Failed to set coverage for column '{}', partition '{}': {}"
                                   .format(c.name, self.identity.vname, e))
                raise

        # Space Coverage

        if 'source_data' in self.data:
            for source_name, source in list(self.data['source_data'].items()):
                scov.add(self.parse_gvid_or_place(source['space']))

        if self.identity.space:  # And from the partition name
            try:
                scov.add(self.parse_gvid_or_place(self.identity.space))
            except ValueError:
                # Couldn't parse the space as a GVid
                pass

        # For geo_coverage, only include the higher-level summary levels: counties, states,
        # places and urban areas.
        self.space_coverage = sorted([str(x) for x in scov
                                      if bool(x) and x.sl in (10, 40, 50, 60, 160, 400)])

        #
        # Time Coverage

        # From the source: if there was a time value in the source that this partition was
        # created from, then add it to the years.
        if 'source_data' in self.data:
            for source_name, source in list(self.data['source_data'].items()):
                if 'time' in source:
                    for year in expand_to_years(source['time']):
                        if year:
                            tcov.add(year)

        # From the partition name
        if self.identity.name.time:
            for year in expand_to_years(self.identity.name.time):
                if year:
                    tcov.add(year)

        self.time_coverage = [t for t in tcov if t]

        #
        # Grains

        if 'source_data' in self.data:
            for source_name, source in list(self.data['source_data'].items()):
                if 'grain' in source:
                    grains.add(source['grain'])

        self.grain_coverage = sorted(str(g) for g in grains if g)

    @property
    def dict(self):
        """A dict that holds key/values for all of the properties in the object.

        :return:
        """
        d = {p.key: getattr(self, p.key) for p in self.__mapper__.attrs
             if p.key not in ('table', 'dataset', '_codes', 'stats', 'data', 'process_records')}

        if self.data:
            # Copy data fields into the top-level dict, but don't overwrite existing values.
            for k, v in six.iteritems(self.data):
                if k not in d and k not in ('table', 'stats', '_codes', 'data'):
                    d[k] = v

        return d

    @property
    def detail_dict(self):
        """A more detailed dict that includes the descriptions, sub descriptions, table and columns."""
        d = self.dict

        def aug_col(c):
            d = c.dict
            d['stats'] = [s.dict for s in c.stats]
            return d

        d['table'] = self.table.dict
        d['table']['columns'] = [aug_col(c) for c in self.table.columns]

        return d

    @property
    def stats_dict(self):

        class Bunch(object):
            """Dict and object access to properties"""

            def __init__(self, o):
                self.__dict__.update(o)

            def __str__(self):
                return str(self.__dict__)

            def __repr__(self):
                return repr(self.__dict__)

            def keys(self):
                return list(self.__dict__.keys())

            def items(self):
                return list(self.__dict__.items())

            def iteritems(self):
                return iter(self.__dict__.items())

            def __getitem__(self, k):
                if k in self.__dict__:
                    return self.__dict__[k]
                else:
                    from . import ColumnStat
                    return ColumnStat(hist=[])

        if not self._stats_dict:
            cols = {s.column.name: Bunch(s.dict) for s in self.stats}
            self._stats_dict = Bunch(cols)

        return self._stats_dict

    def build_sample(self):
        name = self.table.name

        count = int(
            self.database.connection.execute('SELECT count(*) FROM "{}"'.format(name)).fetchone()[0])

        skip = count // 20

        if count > 100:
            sql = 'SELECT * FROM "{}" WHERE id % {} = 0 LIMIT 20'.format(name, skip)
        else:
            sql = 'SELECT * FROM "{}" LIMIT 20'.format(name)

        sample = []

        for j, row in enumerate(self.database.connection.execute(sql)):
            sample.append(list(row.values()))

        self.record.data['sample'] = sample

        s = self.bundle.database.session
        s.merge(self.record)
        s.commit()

    @property
    def row(self):
        # Use an OrderedDict to make it friendly for creating CSV files.

        SKIP_KEYS = [
            'sequence_id', 'vid', 'id', 'd_vid', 't_vid', 'min_key', 'max_key', 'installed',
            'ref', 'count', 'state', 'data', 'space_coverage', 'time_coverage',
            'grain_coverage', 'name', 'vname', 'fqname', 'cache_key'
        ]

        d = OrderedDict(
            [('table', self.table.name)] +
            [(p.key, getattr(self, p.key)) for p in self.__mapper__.attrs if p.key not in SKIP_KEYS])

        return d

    def update(self, **kwargs):
        if 'table' in kwargs:
            del kwargs['table']  # In source_schema.csv, this is the name of the table, not the object

        for k, v in list(kwargs.items()):
            if hasattr(self, k):
                setattr(self, k, v)

    def finalize(self, ps=None):
        self.state = self.STATES.FINALIZING

        # Write the stats for this partition back into the partition
        with self.datafile.writer as w:
            for i, c in enumerate(self.table.columns, 1):
                wc = w.column(i)
                assert wc.pos == c.sequence_id, (c.name, wc.pos, c.sequence_id)
                wc.name = c.name
                wc.description = c.description
                wc.type = c.python_type.__name__

            self.count = w.n_rows
            w.finalize()

        if self.type == self.TYPE.UNION:
            ps.update('Running stats ', state='running')
            stats = self.datafile.run_stats()
            self.set_stats(stats)
            self.set_coverage(stats)

        self._location = 'build'

        self.title = PartitionDisplay(self).title
        self.description = PartitionDisplay(self).description

        self.state = self.STATES.FINALIZED

    # =============
    # These methods are a bit non-cohesive, since they require the _bundle value to be set,
    # which is set externally, when the object is returned from a bundle.

    def clean(self):
        """Remove all built files and return the partition to a newly-created state."""
        if self.datafile:
            self.datafile.remove()

    @property
    def location(self):
        base_location = self._location

        if not base_location:
            return None

        if self._bundle.build_fs.exists(base_location):
            if self._bundle.build_fs.hashsyspath(base_location):
                return self._bundle.build_fs.getsyspath(base_location)

        return base_location

    @location.setter
    def location(self, v):
        self._location = v

    @property
    def datafile(self):
        from ambry.exc import NotFoundError

        if self.is_local:
            # Use the local version, if it exists
            logger.debug('datafile: Using local datafile {}'.format(self.vname))
            return self.local_datafile
        else:
            # If it doesn't, try to get the remote.
            try:
                logger.debug('datafile: Using remote datafile {}'.format(self.vname))
                return self.remote_datafile
            except NotFoundError:
                # If the remote doesn't exist, return the local, so the caller can call
                # exists() on it, get its path, etc.
                return self.local_datafile

    @property
    def local_datafile(self):
        """Return the datafile for this partition, from the build directory, the remote,
        or the warehouse."""
        from ambry_sources import MPRowsFile
        from fs.errors import ResourceNotFoundError
        from ambry.orm.exc import NotFoundError

        try:
            return MPRowsFile(self._bundle.build_fs, self.cache_key)
        except ResourceNotFoundError:
            raise NotFoundError(
                'Could not locate data file for partition {} (local)'.format(self.identity.fqname))

    @property
    def remote(self):
        """Return the remote for this partition.

        :return:
        """
        from ambry.exc import NotFoundError

        ds = self.dataset

        if 'remote_name' not in ds.data:
            raise NotFoundError('Could not determine remote for partition: {}'.format(self.identity.fqname))

        return self._bundle.library.remote(ds.data['remote_name'])

    @property
    def remote_datafile(self):
        from fs.errors import ResourceNotFoundError
        from ambry.exc import NotFoundError
        from boto.exception import S3ResponseError

        try:
            from ambry_sources import MPRowsFile

            remote = self.remote

            datafile = MPRowsFile(remote.fs, self.cache_key)

            if not datafile.exists:
                raise NotFoundError(
                    'Could not locate data file for partition {} from remote {} : file does not exist'
                    .format(self.identity.fqname, remote))

        except ResourceNotFoundError as e:
            raise NotFoundError('Could not locate data file for partition {} (remote): {}'
                                .format(self.identity.fqname, e))
        except S3ResponseError:
            # HACK. It looks like we get the response error with an access problem when
            # we have access to S3, but the file doesn't exist.
            raise NotFoundError("Can't access MPR file for {} in remote {}".format(self.cache_key, remote.fs))

        return datafile

    @property
    def is_local(self):
        """Return True if the partition file is local."""
        from ambry.orm.exc import NotFoundError

        try:
            if self.local_datafile.exists:
                return True
        except NotFoundError:
            pass

        return False

    def localize(self, ps=None):
        """Copy a non-local partition file to the local build directory."""
        from filelock import FileLock
        from ambry.util import ensure_dir_exists
        from ambry_sources import MPRowsFile
        from fs.errors import ResourceNotFoundError

        if self.is_local:
            return

        local = self._bundle.build_fs

        b = self._bundle.library.bundle(self.identity.as_dataset().vid)
        remote = self._bundle.library.remote(b)

        lock_path = local.getsyspath(self.cache_key + '.lock')
        ensure_dir_exists(lock_path)
        lock = FileLock(lock_path)

        if ps:
            ps.add_update(message='Localizing {}'.format(self.identity.name),
                          partition=self, item_type='bytes', state='downloading')

        if ps:
            def progress(bts):
                if ps.rec.item_total is None:
                    ps.rec.item_count = 0

                if not ps.rec.data:
                    ps.rec.data = {}  # Should not need to do this.
                    return self

                item_count = ps.rec.item_count + bts

                ps.rec.data['updates'] = ps.rec.data.get('updates', 0) + 1

                if ps.rec.data['updates'] % 32 == 1:
                    ps.update(message='Localizing {}'.format(self.identity.name),
                              item_count=item_count)
        else:
            from ambry.bundle.process import call_interval

            @call_interval(5)
            def progress(bts):
                self._bundle.log('Localizing {}. {} bytes downloaded'.format(self.vname, bts))

        def exception_cb(e):
            raise e

        with lock:
            # FIXME! This won't work with a remote (http) API, only FS (s3:, file:)
            if self.is_local:
                return self

            try:
                with remote.fs.open(self.cache_key + MPRowsFile.EXTENSION, 'rb') as f:
                    event = local.setcontents_async(self.cache_key + MPRowsFile.EXTENSION, f,
                                                    progress_callback=progress,
                                                    error_callback=exception_cb)
                    event.wait()

                if ps:
                    ps.update_done()

            except ResourceNotFoundError as e:
                from ambry.orm.exc import NotFoundError
                raise NotFoundError("Failed to get MPRfile '{}' from {}: {} "
                                    .format(self.cache_key, remote.fs, e))

        return self

    @property
    def reader(self):
        """The reader for the datafile."""
        from ambry.orm.exc import NotFoundError
        from fs.errors import ResourceNotFoundError

        try:
            return self.datafile.reader
        except ResourceNotFoundError:
            raise NotFoundError("Failed to find partition file, '{}' ".format(self.datafile.path))

    def select(self, predicate=None, headers=None):
        """Select rows from the reader, using a predicate to select rows and an
        itemgetter to return a subset of elements.

        :param predicate: If defined, a callable that is called for each row; if it
            returns true, the row is included in the output.
        :param headers: If defined, a list or tuple of header names to return from each row.
        :return: iterable of results

        WARNING: This routine works from the reader iterator, which returns RowProxy
        objects. RowProxy objects are reused, so if you construct a list directly from the
        output of this method, the list will have multiple copies of a single RowProxy,
        each of which will hold the last result row as its inner row. If you will be
        directly constructing a list, use a getter that extracts the inner row, or one
        that converts the RowProxy to a dict:

            list(s.datafile.select(lambda r: r.stusab == 'CA', lambda r: r.dict))
        """
        # FIXME: in Python 3, use `yield from`
        with self.reader as r:
            for row in r.select(predicate, headers):
                yield row

    def __iter__(self):
        """Iterate over the partition, returning RowProxy objects.

        :return: a generator
        """
        with self.reader as r:
            for row in r:
                yield row

    @property
    def analysis(self):
        """Return an AnalysisPartition proxy, which wraps this partition to provide
        access to dataframes, shapely shapes and other analysis services."""
        if isinstance(self, PartitionProxy):
            return AnalysisPartition(self._obj)
        else:
            return AnalysisPartition(self)

    @property
    def measuredim(self):
        """Return a MeasureDimension proxy, which wraps the partition to provide access
        to columns in terms of measures and dimensions."""
        if isinstance(self, PartitionProxy):
            return MeasureDimensionPartition(self._obj)
        else:
            return MeasureDimensionPartition(self)

    # ============================

    def update_id(self, sequence_id=None):
        """Alter the sequence id, and all of the names and ids derived from it.

        This often needs to be done after an IntegrityError in a multiprocessing run."""
        if sequence_id:
            self.sequence_id = sequence_id

        self._set_ids(force=True)

        if self.dataset:
            self._update_names()

    def _set_ids(self, force=False):
        if not self.sequence_id:
            from .exc import DatabaseError
            raise DatabaseError('Sequence ID must be set before insertion')

        if not self.vid or force:
            assert bool(self.d_vid)
            assert bool(self.sequence_id)

            don = ObjectNumber.parse(self.d_vid)
            assert don.revision

            on = don.as_partition(self.sequence_id)
            self.vid = str(on.rev(don.revision))
            self.id = str(on.rev(None))

        if not self.data:
            self.data = {}

    def _update_names(self):
        """Update the derived names"""
        d = dict(
            table=self.table_name,
            time=self.time,
            space=self.space,
            grain=self.grain,
            variant=self.variant,
            segment=self.segment
        )

        assert self.dataset

        name = PartialPartitionName(**d).promote(self.dataset.identity.name)

        self.name = str(name.name)
        self.vname = str(name.vname)
        self.cache_key = name.cache_key
        self.fqname = str(self.identity.fqname)

    @staticmethod
    def before_insert(mapper, conn, target):
        """event.listen method for SQLAlchemy to set the sequence for this object and
        create an ObjectNumber value for the id_"""
        target._set_ids()

        if target.name and target.vname and target.cache_key and target.fqname and not target.dataset:
            return

        Partition.before_update(mapper, conn, target)

    @staticmethod
    def before_update(mapper, conn, target):
        target._update_names()

    @staticmethod
    def before_delete(mapper, conn, target):
        pass
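

# Illustrative sketch (not part of the ORM): how the before_insert/before_update/
# before_delete hooks are typically registered with SQLAlchemy's event system. The
# actual registration in ambry may live elsewhere; this is the standard pattern.
def _example_wire_partition_events():
    from sqlalchemy import event

    event.listen(Partition, 'before_insert', Partition.before_insert)
    event.listen(Partition, 'before_update', Partition.before_update)
    event.listen(Partition, 'before_delete', Partition.before_delete)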