def test_identity(self):
    from databundles.identity import new_identity

    self.assertEqual('source', self.bundle.identity.source)
    self.assertEqual('dataset', self.bundle.identity.dataset)
    self.assertEqual('subset', self.bundle.identity.subset)
    self.assertEqual('variation', self.bundle.identity.variation)
    self.assertEqual('creator', self.bundle.identity.creator)
    self.assertEqual(1, int(self.bundle.identity.revision))
    self.assertEqual('source-dataset-subset-variation-ca0d', self.bundle.identity.name)

    pid = self.bundle.identity
    d = pid.to_dict()

    self.assertEqual('source-dataset-subset-variation-ca0d', str(new_identity(pid.to_dict())))

    # An unparseable id must be rejected.
    d['id'] = 'foobar'
    self.assertRaises(ValueError, new_identity, d)

    # Without an id, the identity is reconstructed from the name parts.
    del d['id']
    self.assertEqual('source-dataset-subset-variation-ca0d', str(new_identity(d)))

    # A grain can come from the dict or be merged in from a bundle.
    self.assertEqual('source-dataset-subset-variation-ca0d.grain',
                     str(new_identity({'grain': 'grain'}, bundle=self.bundle)))

    d['grain'] = 'grain'
    self.assertEqual('source-dataset-subset-variation-ca0d.grain', str(new_identity(d)))
def _get_remote_partition(self, bundle, partition):
    from databundles.identity import new_identity

    identity = new_identity(partition.to_dict(), bundle=bundle)

    # Get partition information from the bundle.
    p = bundle.partitions.get(identity.id_)

    if not p:
        from databundles.dbexceptions import NotFoundError
        raise NotFoundError("Failed to find partition {} in bundle {}"
                            .format(identity.name, bundle.identity.name))

    p_database_path = p.database.path

    r = self.remote.get_partition(bundle.identity.id_, p.identity.id_)

    # Store it in the local cache.
    p_abs_path = self.cache.put(r, p.identity.cache_key)

    if os.path.realpath(p_database_path) != os.path.realpath(p_abs_path):
        m = ("Path mismatch in downloading partition: {} != {}"
             .format(os.path.realpath(p_database_path),
                     os.path.realpath(p_abs_path)))
        self.logger.error(m)
        raise Exception(m)

    # Ensure the file is in the local library.
    self.database.add_file(p_abs_path, self.cache.repo_id, bundle.identity.id_, 'pulled')

    return p_abs_path, p
def post_partition(did, pid, library):
    from databundles.identity import new_identity

    did = did.replace('|', '/')
    pid = pid.replace('|', '/')

    b = library.get(did)

    if not b:
        raise exc.NotFound("No bundle found for id {}".format(did))

    payload = request.json
    identity = new_identity(payload['identity'])

    p = b.partitions.get(pid)

    if not p:
        raise exc.NotFound("No partition for {} in dataset {}".format(pid, did))

    if pid not in (identity.id_, identity.vid):
        raise exc.Conflict("Partition address '{}' doesn't match payload id '{}'"
                           .format(pid, identity.vid))

    library.database.add_remote_file(identity)

    return identity.to_dict()
def post_dataset(did, library):
    '''Accept a payload that describes a bundle in the remote.
    Download the bundle and install it.'''
    from databundles.identity import new_identity
    from databundles.util import md5_for_file

    did = did.replace('|', '/')

    payload = request.json
    identity = new_identity(payload['identity'])

    if did not in (identity.id_, identity.vid):
        raise exc.Conflict("Dataset address '{}' doesn't match payload id '{}'"
                           .format(did, identity.vid))

    # Need to go directly to the remote, not library.get(), because the
    # dataset hasn't been loaded yet.
    db_path = library.load(identity.cache_key)

    if not db_path:
        logger.error("Failed to get {} from cache while posting dataset".format(identity.cache_key))
        logger.error("  cache  = {}".format(library.cache))
        logger.error("  remote = {}".format(library.remote))
        raise exc.NotFound("Didn't get bundle file for cache key {}".format(identity.cache_key))

    logger.debug("Loading {} for identity {}".format(db_path, identity))

    #b = DbBundle(db_path, logger=logger)

    md5 = md5_for_file(db_path)

    if md5 != payload['md5']:
        logger.debug('MD5 mismatch: {} != {}'.format(md5, payload['md5']))

        # First, try deleting the cached copy and re-fetching,
        # but don't delete it unless there is an intervening cache.
        #if library.remote.path(identity.cache_key).startswith('http'):
        #    raise exc.Conflict("MD5 Mismatch (a)")

        library.remote.remove(identity.cache_key)
        db_path = library.remote.get(identity.cache_key)

        md5 = md5_for_file(db_path)
        if md5 != payload['md5']:
            logger.debug('MD5 mismatch persisting after refetch: {} != {}'.format(md5, payload['md5']))
            raise exc.Conflict("MD5 Mismatch (b)")

    b = DbBundle(db_path)

    if b.identity.cache_key != identity.cache_key:
        logger.debug("Identity mismatch while posting dataset: {} != {}"
                     .format(b.identity.cache_key, identity.cache_key))
        raise exc.Conflict("Identity of downloaded bundle doesn't match request payload")

    library.put(b)

    #library.run_dumper_thread()

    return b.identity.to_dict()
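# A minimal sketch (assumption, not from the source) of the payload that
# post_dataset() above expects: the handler reads request.json['identity']
# and request.json['md5'], so a client could build the body like this.
# Using bundle.database.path as the bundle file location is also an assumption.

def example_post_dataset_payload(bundle):
    from databundles.util import md5_for_file

    return {
        'identity': bundle.identity.to_dict(),      # parsed with new_identity()
        'md5': md5_for_file(bundle.database.path),  # compared against the downloaded file
    }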
def source_info(args, rc, src):
    if not args.term:
        prt("Source dir: {}", rc.sourcerepo.dir)
        for repo in rc.sourcerepo.list:
            prt("Repo      : {}", repo.ident)
    else:
        import databundles.library as library
        from ..identity import new_identity

        l = library.new_library(rc.library(args.library))

        found = False
        for r in l.database.get_file_by_type('source'):
            ident = new_identity(r.data)
            if args.term == ident.name or args.term == ident.vname:
                found = r
                break

        if not found:
            err("Didn't find source for term '{}'. (Maybe need to run 'source sync')", args.term)
        else:
            from ..source.repository import new_repository

            repo = new_repository(rc.sourcerepo(args.name))

            ident = new_identity(found.data)
            repo.bundle_ident = ident

            prt('Name      : {}', ident.vname)
            prt('Id        : {}', ident.vid)
            prt('Dir       : {}', repo.bundle_dir)

            if not repo.bundle.database.exists():
                prt('Exists    : Database does not exist or is empty')
            else:
                d = dict(repo.bundle.db_config.dict)
                process = d['process']

                prt('Created   : {}', process.get('dbcreated', ''))
                prt('Prepared  : {}', process.get('prepared', ''))
                prt('Built     : {}', process.get('built', ''))
                prt('Build time: {}',
                    str(round(float(process['buildtime']), 2)) + 's'
                    if process.get('buildtime', False) else '')
def install_bundle_file(self, identity, bundle_file):
    """Install a bundle in the database, starting from a file that may be
    either a partition or a bundle."""
    from databundles.identity import new_identity

    if isinstance(identity, dict):
        identity = new_identity(identity)

    if identity.is_bundle:
        # DbBundle is assumed to be imported at module scope, as in the
        # other functions that use it.
        bundle = DbBundle(bundle_file)
        self.install_bundle(bundle)
def _pid_or_args_to_pid(self, bundle, pid, args):
    from databundles.identity import Identity, new_identity

    if isinstance(pid, Identity):
        return pid, None
    elif isinstance(pid, basestring):
        return None, pid  # pid is actually the name
    elif args.get('name', False):
        return None, args.get('name', None)
    else:
        return new_identity(args, bundle=bundle), None
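# Illustration (hypothetical values) of the three dispatch forms
# _pid_or_args_to_pid() above accepts:
#
#   self._pid_or_args_to_pid(bundle, identity, {})
#       # -> (identity, None): pid is already an Identity
#   self._pid_or_args_to_pid(bundle, 'source-dataset-subset-variation', {})
#       # -> (None, name): a string pid is treated as a name
#   self._pid_or_args_to_pid(bundle, None, {'grain': 'grain'})
#       # -> (new_identity(args, bundle=bundle), None): built from the args dict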
def put(self, metadata):
    '''Post a dataset or partition metadata payload to the remote library.'''
    import json
    from databundles.identity import new_identity

    metadata['identity'] = json.loads(metadata['identity'])

    identity = new_identity(metadata['identity'])

    if identity.is_bundle:
        r = self.remote.datasets(identity.vid_enc).post(metadata)
        raise_for_status(r)
    else:
        r = (self.remote.datasets(identity.as_dataset.vid_enc)
             .partitions(identity.vid_enc).post(metadata))
        raise_for_status(r)

    return r
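# A sketch (assumption) of how a caller builds the metadata argument for
# put() above: the method json.loads() the 'identity' entry, so it must be
# passed JSON-encoded. Any other keys are whatever the remote expects.

def example_put_metadata(identity):
    import json

    return {'identity': json.dumps(identity.to_dict())}  # decoded by put()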
def put_file(self, identity, file_path, state='new'):
    '''Store a dataset or partition file, without having to open the file
    to determine what it is, by using a separate identity.'''
    from databundles.identity import new_identity

    if isinstance(identity, dict):
        identity = new_identity(identity)

    dst = self.cache.put(file_path, identity.cache_key)

    if self.api and self.sync:
        self.api.put(file_path)

    self.database.add_file(dst, self.cache.repo_id, identity.id_, state)

    if identity.is_bundle:
        self.database.install_bundle_file(identity, file_path)

    return dst, identity.cache_key, self.cache.public_url_f()(identity.cache_key)
def source_sync(args, rc, src):
    '''Synchronize all of the repositories with the local library'''
    import databundles.library as library
    from databundles.identity import new_identity

    l = library.new_library(rc.library(args.library))

    for repo in rc.sourcerepo.list:
        prt('--- Sync with upstream source repository {}', repo.service.ident)
        for e in repo.service.list():
            ident = new_identity(e)

            l.database.add_file(e['clone_url'], repo.service.ident, ident.id_,
                                state='synced', type_='source',
                                source_url=e['clone_url'], data=e)

            prt("Added {:15s} {}", ident.id_, e['clone_url'])
def put_file(self, identity, file_path, state='new'):
    '''Store a dataset or partition file, without having to open the file
    to determine what it is, by using a separate identity.'''
    from databundles.identity import new_identity

    if isinstance(identity, dict):
        identity = new_identity(identity)

    dst = self.cache.put(file_path, identity.cache_key)

    if not os.path.exists(dst):
        raise Exception("cache {}.put() didn't return an existing path. Got: {}"
                        .format(type(self.cache), dst))

    if self.remote and self.sync:
        self.remote.put(identity, file_path)

    self.database.add_file(dst, self.cache.repo_id, identity.id_, state)

    if identity.is_bundle:
        self.database.install_bundle_file(identity, file_path)

    return dst, identity.cache_key, self.cache.public_url_f()(identity.cache_key)
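# Usage sketch for put_file() above (hypothetical file path; the identity
# may be passed either as a dict or an already-constructed Identity):
#
#   dst, cache_key, url = self.put_file(bundle.identity.to_dict(),
#                                       '/tmp/bundle.db', state='new')
#
# The dict is converted with new_identity(), the file lands in the cache
# under identity.cache_key, and bundle files are also installed into the
# library database via install_bundle_file().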
def source_clone(args, rc, src):
    '''Clone one or more registered source packages (via sync) into the source directory'''
    import databundles.library as library
    from ..dbexceptions import ConflictError
    from ..identity import new_identity

    l = library.new_library(rc.library(args.library))

    def get_by_group(group):
        return [f for f in l.database.get_file_by_type('source') if f.group == group]

    for repo in rc.sourcerepo.list:
        prt("--- Cloning sources from: {}", repo.ident)
        for f in get_by_group(repo.ident):
            try:
                ident = new_identity(f.data)
                d = repo.clone(f.path, ident.source_path, repo.dir)
                prt("Cloned {} to {}", f.path, d)
            except ConflictError as e:
                warn("Clone failed for {}: {}".format(f.path, e.message))
def find(self, query):
    '''Find datasets, given a QueryCommand object'''
    from databundles.library import QueryCommand
    from databundles.identity import new_identity

    if isinstance(query, basestring):
        response = self.remote.datasets.find(query).get()
        raise_for_status(response)
        r = [response.object]

    elif isinstance(query, dict):
        # Dict form of a QueryCommand.
        response = self.remote.datasets.find.post(query)
        raise_for_status(response)
        r = response.object

    elif isinstance(query, QueryCommand):
        response = self.remote.datasets.find.post(query.to_dict())
        raise_for_status(response)
        r = response.object

    else:
        raise ValueError("Unknown input type: {}".format(type(query)))

    # Convert the result back to the form we get from the Library query.
    return [new_identity(i) for i in r if i is not False]
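# find() above accepts three query forms; a sketch of each (the name and
# the dict shape are hypothetical examples):
#
#   self.find('source-dataset-subset-variation-ca0d')     # name string
#   self.find({'identity': {'name': 'source-dataset'}})   # dict form of a QueryCommand
#   self.find(QueryCommand().identity(name='source-dataset'))
#
# All three return a list of identities built with new_identity().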
def get_ref(self, bp_id):
    from databundles.identity import (ObjectNumber, DatasetNumber,
                                      PartitionNumber, Identity)

    if isinstance(bp_id, Identity):
        if bp_id.id_:
            bp_id = bp_id.id_
        else:
            bp_id = bp_id.name

    # If dataset is not None, it means the file already is in the cache.
    dataset = None

    try:
        on = ObjectNumber.parse(bp_id)

        if not (isinstance(on, DatasetNumber) or isinstance(on, PartitionNumber)):
            raise ValueError("Object number must be for a Dataset or Partition: {}".format(bp_id))

        dataset, partition = self._get_bundle_path_from_id(bp_id)
    except:
        # Not an object number; fall through and try it as a name.
        pass

    # Try it as a dataset name.
    if not dataset:
        r = self.find(QueryCommand().identity(name=bp_id))

        if len(r) > 1:
            raise Exception("Got more than one result")
        elif len(r) == 0:
            r = None
        else:
            r = r.pop()

        if r:
            dataset, partition = self._get_bundle_path_from_id(r.id_)

    # Try the name as a partition name.
    if not dataset:
        q = self.find(QueryCommand().partition(name=bp_id))

        if q:
            r = q.pop(0)
            if r:
                dataset, partition = self._get_bundle_path_from_id(r.id_)

    # No luck so far, so now try to get it from the remote library.
    if not dataset and self.remote:
        import socket

        try:
            r = self.remote.find(bp_id)
            if r:
                r = r[0]

                if r.is_partition:
                    dataset = r.as_dataset
                    partition = r
                else:
                    dataset = r
                    partition = None
        except socket.error:
            self.logger.error("Connection to remote failed")

    elif dataset:
        from identity import new_identity
        dataset = Identity(**dataset.to_dict())
        partition = new_identity(partition.to_dict()) if partition else None

    if not dataset:
        return False, False

    return dataset, partition
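# Resolution order in get_ref() above, summarized:
#
#   1. Parse bp_id as an ObjectNumber (Dataset or Partition) and look it up locally.
#   2. Failing that, try bp_id as a dataset name, then as a partition name, via find().
#   3. Failing that, and if a remote is configured, try self.remote.find(bp_id).
#
# Returns (dataset, partition), with partition possibly None, or (False, False)
# when nothing matches anywhere.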
def source_new(args, rc, src):
    '''Create a new bundle source directory and populate it with a
    bundle.yaml, bundle.py and supporting files'''
    from ..source.repository import new_repository
    from ..identity import new_identity, DatasetNumber

    repo = new_repository(rc.sourcerepo(args.name))

    ident = new_identity(vars(args))

    bundle_dir = os.path.join(repo.dir, ident.source_path)

    if not os.path.exists(bundle_dir):
        os.makedirs(bundle_dir)
    elif not os.path.isdir(bundle_dir):
        raise IOError("Path exists but is not a directory: " + bundle_dir)

    config = {
        'identity': {
            'id': str(DatasetNumber()),
            'source': args.source,
            'creator': args.creator,
            'dataset': args.dataset,
            'subset': args.subset,
            'variation': args.variation,
            'revision': args.revision
        },
        'about': {
            'author': "Author's email address",
            'description': "**include**",  # Can't get YAML to write this properly
            'groups': ['group1', 'group2'],
            'homepage': "https://civicknowledge.org",
            'license': "other-open",
            'maintainer': "Maintainer's email address",
            'tags': ['tag1', 'tag2'],
            'title': "Bundle title"
        }
    }

    os.makedirs(os.path.join(bundle_dir, 'meta'))

    file_ = os.path.join(bundle_dir, 'bundle.yaml-in')

    yaml.dump(config, file(file_, 'w'), indent=4, default_flow_style=False)

    # Need to edit the YAML file because the !include line is special metadata
    # that is hard (or impossible) to write through serialization.
    with file(file_, 'r') as f_in:
        with file(os.path.join(bundle_dir, 'bundle.yaml'), 'w') as f_out:
            f_out.write(f_in.read().replace("'**include**'",
                                            "!include 'meta/about.description.md'"))

    os.remove(file_)

    p = lambda x: os.path.join(os.path.dirname(__file__), '..', 'support', x)

    shutil.copy(p('bundle.py'), bundle_dir)
    shutil.copy(p('README.md'), bundle_dir)
    shutil.copy(p('schema.csv'), os.path.join(bundle_dir, 'meta'))
    shutil.copy(p('about.description.md'), os.path.join(bundle_dir, 'meta'))

    prt("CREATED: {}", bundle_dir)