def bundle_ident(self):
    if not self._bundle:
        raise ConfigurationError(
            "Must assign bundle or bundle_dir to repository before this operation")

    return self._bundle.identity
def install_packages(self, module_name, pip_name, force=False):
    import imp
    import os
    import sys
    from ambry.util.packages import install

    python_dir = self._fs.python()

    if not python_dir:
        raise ConfigurationError(
            "Can't install python requirements without a configuration "
            "item for filesystems.python")

    if not os.path.exists(python_dir):
        os.makedirs(python_dir)

    sys.path.append(python_dir)

    if force:
        self.logger.info('Upgrading required package: {}->{}'.format(module_name, pip_name))
        install(python_dir, module_name, pip_name)
    else:
        try:
            imp.find_module(module_name)
            return  # Required package already installed
        except ImportError:
            self.logger.info('Installing required package: {}->{}'.format(module_name, pip_name))
            install(python_dir, module_name, pip_name)
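# Usage sketch (hypothetical package names; assumes this method is on a
# bundle-like object with a configured filesystems.python directory):
#
#   self.install_packages('requests', 'requests')               # install if missing
#   self.install_packages('requests', 'requests', force=True)   # force upgrade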
def _expand_each(self, each):
    """Generate a set of dicts from the cross product of each of the
    arrays of the 'each' group."""

    # Normalize the each group, particularly for the case where there is
    # only one dimension
    if not isinstance(each, list):
        raise ConfigurationError(
            "The 'each' key must have a list. Got a {} ".format(type(each)))
    elif len(each) == 0:
        each = [[{}]]

    if not isinstance(each[0], list):
        each = [each]

    # Now the top-level arrays of each are dimensions, and we can do a
    # multi-dimensional iteration over them.
    # This is essentially a cross product, where out <- out X dim(i)

    out = []
    for i, dim in enumerate(each):
        if i == 0:
            out = dim
        else:
            o2 = []
            for a in dim:
                for b in out:
                    o2.append(dict(a.items() + b.items()))
            out = o2

    return out
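# Illustration of the same cross-product expansion using the standard
# library (a reference sketch, not part of the source):
#
#   from itertools import product
#
#   each = [[{'color': 'red'}, {'color': 'blue'}], [{'size': 'S'}]]
#   out = [dict(kv for d in combo for kv in d.items()) for combo in product(*each)]
#   # out == [{'color': 'red', 'size': 'S'}, {'color': 'blue', 'size': 'S'}]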
def _get_dependencies(self):
    if not self._bundle:
        raise ConfigurationError(
            "Can't use the dep() method for a library that is not attached to a bundle")

    deps = self._bundle.metadata.dependencies

    if not deps:
        return {}

    out = {}
    for k, v in deps.items():
        ident = self.resolve(v)

        if not ident:
            raise DependencyError("Failed to resolve {} ".format(v))

        if ident.partition:
            out[k] = ident.partition
        else:
            out[k] = ident

    return out
def calling_code(f, f_name=None, raise_for_missing=True):
    """Return the code string for calling a function."""
    import inspect
    from ambry.dbexceptions import ConfigurationError

    if inspect.isclass(f):
        try:
            args = inspect.getargspec(f.__init__).args
        except TypeError as e:
            raise TypeError("Failed to inspect {}: {}".format(f, e))
    else:
        args = inspect.getargspec(f).args

    if len(args) > 1 and args[0] == 'self':
        args = args[1:]

    for a in args:
        # The 'exception' arg is only valid for exception handlers
        if a not in all_args + ('exception',):
            if raise_for_missing:
                raise ConfigurationError(
                    "Caster code {} has unknown argument name: '{}'. "
                    "Must be one of: {} ".format(f, a, ','.join(all_args)))

    arg_map = {e: e for e in var_args}

    args = [arg_map.get(a, a) for a in args]

    return "{}({})".format(f_name if f_name else f.__name__, ','.join(args))
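# Usage sketch (hypothetical caster function; assumes 'row' appears in the
# module-level all_args tuple referenced above):
#
#   def title_case(row):
#       return row.title()
#
#   calling_code(title_case)  # -> "title_case(row)"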
def new_service(config):
    if config['service'] == 'github':
        from github import GitHubService  # @UnresolvedImport
        return GitHubService(**config)
    else:
        from ambry.dbexceptions import ConfigurationError
        raise ConfigurationError(
            'No source service for name {}'.format(config['service']))
def resolve_data_type(value_type):
    from ambry.valuetype import resolve_value_type

    vt_class = resolve_value_type(value_type)

    if not vt_class:
        raise ConfigurationError(
            "Row error: unknown valuetype '{}'".format(value_type))

    return vt_class.python_type().__name__
def bundle_dir(self):
    if not self._bundle and not self._bundle_dir:
        raise ConfigurationError(
            "Must assign bundle or bundle_dir to repository before this operation")

    if self._bundle_dir:
        return self._bundle_dir
    else:
        return self.bundle.bundle_dir
def config_edit(args, l, rc):
    from ambry.dbexceptions import ConfigurationError
    from ambry.util import AttrDict

    edit_args = ' '.join(args.args)

    if args.yaml or args.json:
        if args.yaml:
            import yaml
            v = yaml.load(edit_args)
        elif args.json:
            import json
            v = json.loads(edit_args)

        d = AttrDict()
        d.update(v)
        print d

        rc.config.update_flat(d.flatten())

    else:
        key, value = edit_args.split('=')
        value = value.strip()
        key = key.strip()
        key_parts = key.split('.')

        # Walk down the dotted key path; assign at the last component
        e = rc.config
        for k in key_parts:
            k = k.strip()
            if str(k) == str(key_parts[-1]):
                e[k] = value
            else:
                e = e[k]

    configs = rc.config['loaded']['configs']

    if len(configs) != 1:
        raise ConfigurationError(
            "Configuration was loaded from multiple files; don't know which "
            "to edit: '{}'".format(configs))

    # Don't write the 'accounts' or 'loaded' sections back to the file
    try:
        del rc.config['accounts']
    except KeyError:
        pass

    try:
        del rc.config['loaded']
    except KeyError:
        pass

    with open(configs[0], 'w') as f:
        rc.config.dump(f)
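# Usage sketch (hypothetical CLI invocations; the subcommand name is
# assumed from this handler's name, and flag names from the argument
# handling above):
#
#   ambry config edit library.filesystem_root=/data/ambry
#   ambry config edit --yaml "library: {filesystem_root: /data/ambry}"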
def usgeo(self):
    try:
        usgeo = self.library.dep('usgeo')
    except ConfigurationError:
        raise ConfigurationError(
            "MISSING DEPENDENCY: To use the US geo datasets, the bundle "
            "(or library) must specify a dependency with a set named "
            "'usgeo', in build.dependencies.usgeo")

    return usgeo
def widths(self):
    widths = [c.width for c in self.columns]

    if not all(bool(e) for e in widths):
        from ambry.dbexceptions import ConfigurationError
        raise ConfigurationError(
            'The widths array for source table {} has zero or null entries'
            .format(self.name))

    widths = [int(w) for w in widths]

    return widths
def __init__(self, cache, database, name=None, remotes=None, source_dir=None,
             require_upload=False, doc_cache=None, warehouse_cache=None,
             host=None, port=None, urlhost=None):
    """Libraries are constructed on the root cache name for the library.
    If the cache does not exist, it will be created.

    Args:
        cache: a path name to a directory where bundle files will be stored
        database: the library database
        remotes: URLs of remote libraries, for fallback for get and put.
    """
    from ..util import get_logger

    assert database is not None

    self.name = name
    self.cache = cache
    self._doc_cache = doc_cache
    self._warehouse_cache = warehouse_cache
    self.source_dir = source_dir

    self._database = database
    self._bundle = None  # Set externally in bundle.library()
    self.host = host
    self.port = port
    self.urlhost = urlhost if urlhost else (
        '{}:{}'.format(self.host, self.port) if self.port else self.host)

    self.dep_cb = None  # Callback for dependency resolution
    self.require_upload = require_upload
    self._dependencies = None
    self._remotes = remotes
    self._all_vids = None

    if not self.cache:
        raise ConfigurationError(
            "Must specify library.cache for the library in bundles.yaml")

    self.logger = get_logger(__name__)
    self.logger.setLevel(logging.DEBUG)

    self.needs_update = False

    self.bundles = weakref.WeakValueDictionary()

    self._search = None
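# Construction sketch (hypothetical cache and database objects; the
# concrete types are defined elsewhere in the codebase):
#
#   library = Library(cache=cache, database=db, name='default',
#                     remotes=['http://example.com/remote'],
#                     host='localhost', port=8080)
#   # library.urlhost == 'localhost:8080'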
def _places(self):
    try:
        places = self.library.dep('places').partition
    except ConfigurationError:
        raise ConfigurationError(
            "MISSING DEPENDENCY: To use the US county datasets, the bundle "
            "(or library) must specify a dependency with a set named "
            "'places', in build.dependencies.places. See "
            "https://github.com/clarinova/ambry/wiki/Error-Messages#geoanalysisareasget_analysis_area")

    return places
def new_repository(config):
    from ..service import new_service, GitServiceMarker  # @UnresolvedImport

    if 'account' not in config:
        config['account'] = {'user': None, 'password': None}

    service_config = config['account']
    service_config.update(config)

    service = new_service(service_config)

    if isinstance(service, GitServiceMarker):
        from .git import GitRepository  # @UnresolvedImport
        return GitRepository(service=service, dir=config['dir'])
    else:
        from ambry.dbexceptions import ConfigurationError
        raise ConfigurationError('Unknown service type: {}'.format(type(service)))
def include(self, node):
    if not self.dir:
        return "ConfigurationError: Can't include file: wasn't able to set base directory"

    relpath = self.construct_scalar(node)
    abspath = os.path.join(self.dir, relpath)

    if not os.path.exists(abspath):
        raise ConfigurationError(
            "Can't include file '{}': Does not exist".format(abspath))

    with open(abspath, 'r') as f:
        parts = abspath.split('.')
        ext = parts.pop()

        if ext == 'yaml':
            return yaml.load(f, OrderedDictYAMLLoader)
        else:
            return IncludeFile(abspath, relpath, f.read())
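# Usage sketch: with this constructor registered for an '!include' tag
# (the tag name is an assumption; registration happens elsewhere in the
# loader), a YAML node like
#
#   sources: !include sources.yaml
#
# resolves 'sources.yaml' relative to the loader's base directory; .yaml
# files are parsed recursively, any other extension is wrapped in an
# IncludeFile.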
def record_to_objects(self):
    """Write from the stored file data to the source records"""
    bsfile = self.record

    failures = set()

    # Clear out all of the columns from existing tables. We don't clear out
    # the tables, since they may be referenced by sources
    for row in bsfile.dict_row_reader:
        st = self._dataset.source_table(row['table'])

        if st:
            st.columns[:] = []

    self._dataset.commit()

    for row in bsfile.dict_row_reader:
        st = self._dataset.source_table(row['table'])

        if not st:
            st = self._dataset.new_source_table(row['table'])

        if 'datatype' not in row:
            row['datatype'] = 'unknown'

        del row['table']

        st.add_column(**row)  # Create or update

    if failures:
        raise ConfigurationError(
            'Failed to load source schema, missing sources: {}'.format(failures))

    self._dataset.commit()
def _compose(self, name, args, mkdir=True):
    """Get a named filesystem entry and extend it into a path with
    additional path arguments."""
    from os import makedirs
    from os.path import normpath, join, isdir
    from ambry.dbexceptions import ConfigurationError

    root = p = self._config.filesystem[name].format(root=self._root)

    if args:
        args = [e.strip() for e in args]
        p = join(p, *args)

    if not isdir(p) and mkdir:
        makedirs(p)

    p = normpath(p)

    if not p.startswith(root):
        raise ConfigurationError(
            "Path for name='{}', args={} resolved outside of the defined "
            "filesystem root".format(name, args))

    return p
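# Path-composition sketch (hypothetical config values): with
# filesystem.build = '{root}/build' and a root of '/data/ambry',
#
#   self._compose('build', ['mybundle', 'ingest'])
#
# returns '/data/ambry/build/mybundle/ingest', creating the directory when
# mkdir is True. A path that normalizes outside '/data/ambry' raises
# ConfigurationError.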
def set_api(self):
    import ambry.client.ckan

    repo_group = self.bundle.config.group('datarepo')

    if not repo_group.get(self.repo_name):
        raise ConfigurationError(
            "'datarepo' group in configuration is either nonexistent or "
            "missing the {} sub-group".format(self.repo_name))

    repo_config = repo_group.get(self.repo_name)

    self._api = ambry.client.ckan.Ckan(repo_config.url, repo_config.key)

    # Look for an S3 filestore
    fs_config = repo_config.get('filestore', False)

    if fs_config is not False:
        raise Exception("Deprecated?")
    else:
        self.filestore = None

    return self.remote
def accounts(self):
    """Return a dict of all account references, keyed by account_id.

    If an account password is set, it is used to decrypt each account's
    secret.
    """
    d = {}

    # This guard is currently disabled ('if False'): accounts can be read
    # without an account password, but secrets won't be decrypted.
    if False and not self._account_password:
        from ambry.dbexceptions import ConfigurationError
        raise ConfigurationError(
            "Can't access accounts without setting an account password, "
            "either in the accounts.password config or in the "
            "AMBRY_ACCOUNT_PASSWORD env var.")

    for act in self.database.session.query(Account).all():
        if self._account_password:
            act.secret_password = self._account_password

        e = act.dict
        a_id = e['account_id']
        d[a_id] = e

    return d
def number(self, assignment_class=None, namespace='d'):
    """Return a new number.

    :param assignment_class: Determines the length of the number. Possible
        values are 'authority' (3 characters), 'registered' (5),
        'unregistered' (7) and 'self' (9). Self-assigned numbers are random
        and acquired locally, while the other assignment classes use the
        number server defined in the configuration. If None, look in the
        number server configuration for one of the class keys, starting
        with the longest class and working to the shortest.
    :param namespace: The namespace character, the first character in the
        number. Can be one of 'd', 'x' or 'b'.
    :return:
    """

    if assignment_class == 'self':
        # When 'self' is explicit, don't look for number server config
        return str(DatasetNumber())

    elif assignment_class is None:
        try:
            nsconfig = self.services['numbers']
        except ConfigurationError:
            # A missing configuration is equivalent to 'self'
            self.logger.error(
                'No number server configuration; returning self-assigned number')
            return str(DatasetNumber())

        for assignment_class in ('self', 'unregistered', 'registered', 'authority'):
            if assignment_class + '-key' in nsconfig:
                break

        # For the case where the number configuration references a
        # self-assigned key
        if assignment_class == 'self':
            return str(DatasetNumber())

    else:
        try:
            nsconfig = self.services['numbers']
        except ConfigurationError:
            raise ConfigurationError('No number server configuration')

        if assignment_class + '-key' not in nsconfig:
            raise ConfigurationError(
                'Assignment class {} not in number server config'.format(
                    assignment_class))

    try:
        key = nsconfig[assignment_class + '-key']
        config = {
            'key': key,
            'host': nsconfig['host'],
            'port': nsconfig.get('port', 80)
        }

        ns = NumberServer(**config)
        n = str(next(ns))

        self.logger.info('Got number from number server: {}'.format(n))

    except HTTPError as e:
        self.logger.error(
            'Failed to get number from number server for key {}: {}'.format(
                key, e.message))
        self.logger.error(
            'Using self-generated number. There is no problem with this, '
            'but they are longer than centrally generated numbers.')

        n = str(DatasetNumber())

    return n
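# Usage sketch (hypothetical number server config): with
# services['numbers'] = {'registered-key': 'K', 'host': 'numbers.example.com'},
#
#   self.number()                              # finds 'registered-key', asks the server
#   self.number(assignment_class='self')       # always a local random DatasetNumber
#   self.number(assignment_class='authority')  # raises: no 'authority-key' configured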
def record_to_objects(self):
    """Create config records to match the file metadata"""
    from ambry.orm import Column, Table, Dataset

    def _clean_int(i):
        if i is None:
            return None
        elif isinstance(i, int):
            return i
        elif isinstance(i, string_types):
            if len(i) == 0:
                return None
            return int(i.strip())

    bsfile = self.record

    contents = bsfile.unpacked_contents

    if not contents:
        return

    line_no = 1  # Accounts for file header. Data starts on line 2

    errors = []
    warnings = []

    extant_tables = {t.name: t for t in self._dataset.tables}

    old_types_map = {
        'varchar': Column.DATATYPE_STR,
        'integer': Column.DATATYPE_INTEGER,
        'real': Column.DATATYPE_FLOAT,
    }

    def run_progress_f(line_no):
        self._bundle.log('Loading tables from file. Line #{}'.format(line_no))

    from ambry.bundle.process import CallInterval
    run_progress_f = CallInterval(run_progress_f, 10)

    table_number = self._dataset._database.next_sequence_id(
        Dataset, self._dataset.vid, Table)

    for row in bsfile.dict_row_reader:

        line_no += 1

        run_progress_f(line_no)

        # Skip blank lines
        if not row.get('column', False) and not row.get('table', False):
            continue

        if not row.get('column', False):
            raise ConfigurationError('Row error: no column on line {}'.format(line_no))

        if not row.get('table', False):
            raise ConfigurationError('Row error: no table on line {}'.format(line_no))

        if not row.get('datatype', False) and not row.get('valuetype', False):
            raise ConfigurationError('Row error: no type on line {}'.format(line_no))

        value_type = row.get('valuetype', '').strip() if row.get('valuetype', False) else None
        data_type = row.get('datatype', '').strip() if row.get('datatype', False) else None

        def resolve_data_type(value_type):
            from ambry.valuetype import resolve_value_type

            vt_class = resolve_value_type(value_type)

            if not vt_class:
                raise ConfigurationError(
                    "Row error: unknown valuetype '{}'".format(value_type))

            return vt_class.python_type().__name__

        # If we have a value type field and not the datatype, the value
        # type is as specified, and the data type is derived from it.
        if value_type and not data_type:
            data_type = resolve_data_type(value_type)

        elif data_type and not value_type:
            value_type = data_type
            data_type = resolve_data_type(value_type)

        # There are still some old data types hanging around
        data_type = old_types_map.get(data_type.lower(), data_type)

        table_name = row['table']

        try:
            table = extant_tables[table_name]
        except KeyError:
            table = self._dataset.new_table(
                table_name,
                sequence_id=table_number,
                description=row.get('description') if row['column'] == 'id' else '')

            table_number += 1
            extant_tables[table_name] = table

        data = {k.replace('d_', '', 1): v
                for k, v in list(row.items())
                if k and k.startswith('d_') and v}

        if row['column'] == 'id':
            table.data.update(data)
            data = {}

        table.add_column(
            row['column'],
            fk_vid=row['is_fk'] if row.get('is_fk', False) else None,
            description=(row.get('description', '') or '').strip(),
            datatype=data_type,
            valuetype=value_type,
            parent=row.get('parent'),
            proto_vid=row.get('proto_vid'),
            size=_clean_int(row.get('size', None)),
            width=_clean_int(row.get('width', None)),
            data=data,
            keywords=row.get('keywords'),
            measure=row.get('measure'),
            transform=row.get('transform'),
            derivedfrom=row.get('derivedfrom'),
            units=row.get('units', None),
            universe=row.get('universe'),
            update_existing=True)

    self._dataset.t_sequence_id = table_number

    return warnings, errors
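# Type-derivation sketch (summary of the logic above): a schema row may
# carry either type field, and the other is derived from it:
#
#   valuetype only: data_type = resolve_data_type(value_type)
#   datatype only:  value_type = data_type, data_type re-derived from it,
#                   then legacy names ('varchar', 'integer', 'real')
#                   remapped through old_types_map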
def impl(self):
    if not self._impl:
        raise ConfigurationError(
            "Must assign bundle to repository before this operation")

    return self._impl
def convert(self, table_name, progress_f=None):
    """Convert a spatialite geopartition to a regular partition by
    extracting the geometry and re-projecting it to WGS84.

    :param table_name: name of the new table to create
    :param progress_f: optional progress callback, called with the row count
    """
    import subprocess
    import csv
    from ambry.orm import Column
    from ambry.dbexceptions import ConfigurationError

    #
    # Duplicate the geo table for the new partition, then make the new
    # partition
    #
    t = self.bundle.schema.add_table(table_name)

    ot = self.table

    for c in ot.columns:
        self.bundle.schema.add_column(t, c.name, datatype=c.datatype)

    #
    # Open a connection to spatialite and run the query to extract CSV.
    #
    # It would be a lot more efficient to connect to the spatialite
    # process, attach the new database, then copy the records in SQL.
    #

    try:
        subprocess.check_output('spatialite -version', shell=True)
    except subprocess.CalledProcessError:
        raise ConfigurationError(
            'Did not find spatialite on path. Install spatialite')

    # Check the type of geometry
    p = subprocess.Popen(
        ('spatialite {file} "select GeometryType(geometry) FROM {table} LIMIT 1;"'
         .format(file=self.database.path, table=self.identity.table)),
        stdout=subprocess.PIPE, shell=True)

    out, _ = p.communicate()
    out = out.strip()

    if out == 'POINT':
        self.bundle.schema.add_column(t, '_db_lon', datatype=Column.DATATYPE_REAL)
        self.bundle.schema.add_column(t, '_db_lat', datatype=Column.DATATYPE_REAL)

        command_template = """spatialite -csv -header {file} "select *,
        X(Transform(geometry, 4326)) AS _db_lon,
        Y(Transform(geometry, 4326)) AS _db_lat FROM {table}" """
    else:
        self.bundle.schema.add_column(t, '_wkb', datatype=Column.DATATYPE_TEXT)

        command_template = """spatialite -csv -header {file} "select *,
        AsBinary(Transform(geometry, 4326)) AS _wkb FROM {table}" """

    self.bundle.database.commit()

    pid = self.identity
    pid.table = table_name

    partition = self.bundle.partitions.new_partition(pid)
    partition.create_with_tables()

    #
    # Now extract the data into a new database.
    #
    command = command_template.format(file=self.database.path,
                                      table=self.identity.table)

    self.bundle.log("Running: {}".format(command))

    p = subprocess.Popen(command, stdout=subprocess.PIPE, shell=True)
    stdout, _ = p.communicate()

    #
    # Finally we can copy the data.
    #
    rdr = csv.reader(stdout.decode('ascii').splitlines())

    header = next(rdr)  # Consume the header line

    if not progress_f:
        progress_f = lambda x: x

    with partition.database.inserter(table_name) as ins:
        for i, line in enumerate(rdr):
            ins.insert(line)
            progress_f(i)
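# Command sketch (hypothetical paths): for a POINT-geometry table the
# extraction command expands to roughly
#
#   spatialite -csv -header /path/to/geo.db "select *,
#     X(Transform(geometry, 4326)) AS _db_lon,
#     Y(Transform(geometry, 4326)) AS _db_lat FROM mytable"
#
# i.e. points become lon/lat columns, while any other geometry type is
# exported as WKB in a single '_wkb' column.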