Example #1
0
    def test_identity(self):
        """Check that the bundle identity fields and new_identity() dict
        round-trips produce the expected names.

        Uses assertEqual throughout; assertEquals is a deprecated alias.
        """
        from databundles.identity import new_identity

        # The individual identity fields configured for the test bundle.
        self.assertEqual('source', self.bundle.identity.source)
        self.assertEqual('dataset', self.bundle.identity.dataset)
        self.assertEqual('subset', self.bundle.identity.subset)
        self.assertEqual('variation', self.bundle.identity.variation)
        self.assertEqual('creator', self.bundle.identity.creator)
        self.assertEqual(1, int(self.bundle.identity.revision))
        self.assertEqual('source-dataset-subset-variation-ca0d',
                         self.bundle.identity.name)

        pid = self.bundle.identity
        d = pid.to_dict()

        # A dict round-trip through new_identity() preserves the name.
        self.assertEqual('source-dataset-subset-variation-ca0d', str(new_identity(pid.to_dict())))

        # An unparseable id must be rejected.
        d['id'] = 'foobar'
        self.assertRaises(ValueError, new_identity, (d))

        del d['id']

        # Without an id, the identity is still constructed from the fields.
        self.assertEqual('source-dataset-subset-variation-ca0d', str(new_identity(d)))
        # A grain, combined with the bundle's identity, is appended to the name.
        self.assertEqual('source-dataset-subset-variation-ca0d.grain', str(new_identity({'grain': 'grain'}, bundle=self.bundle)))

        d['grain'] = 'grain'
        self.assertEqual('source-dataset-subset-variation-ca0d.grain', str(new_identity(d)))
Example #2
0
    def _get_remote_partition(self, bundle, partition):
        """Fetch a partition's database file from the remote and store it in
        the local cache.

        Returns a (absolute path of the cached file, partition object) tuple.

        Raises NotFoundError when the partition isn't in the bundle.
        """
        # PartitionIdentity was imported here but never used; dropped.
        from databundles.identity import new_identity

        identity = new_identity(partition.to_dict(), bundle=bundle)

        # Get partition information from the bundle.
        p = bundle.partitions.get(identity.id_)

        if not p:
            from databundles.dbexceptions import NotFoundError
            raise NotFoundError("Failed to find partition {} in bundle {}"
                                .format(identity.name, bundle.identity.name))

        p_database_path = p.database.path

        r = self.remote.get_partition(bundle.identity.id_, p.identity.id_)

        # Store it in the local cache.
        p_abs_path = self.cache.put(r, p.identity.cache_key)

        # The cache must have placed the file where the partition expects its
        # database to live; a mismatch indicates a configuration problem.
        if os.path.realpath(p_database_path) != os.path.realpath(p_abs_path):
            m = ("Path mismatch in downloading partition: {} != {}"
                 .format(os.path.realpath(p_database_path),
                         os.path.realpath(p_abs_path)))
            self.logger.error(m)
            raise Exception(m)

        # Ensure the file is in the local library.
        self.database.add_file(p_abs_path, self.cache.repo_id, bundle.identity.id_, 'pulled')

        return p_abs_path, p
Example #3
0
File: main.py  Project: kball/databundles
def post_partition(did, pid, library):
    '''Accept a payload identifying a partition of an installed bundle and
    register it as a remote file in the library.

    Raises exc.NotFound when the bundle or partition doesn't exist, and
    exc.Conflict when the partition address doesn't match the payload.
    '''
    # Identity and md5_for_file were imported here but never used; dropped.
    from databundles.identity import new_identity

    # Object ids use '/' internally but arrive URL-encoded with '|'.
    did = did.replace('|', '/')
    pid = pid.replace('|', '/')

    b = library.get(did)

    if not b:
        raise exc.NotFound("No bundle found for id {}".format(did))

    payload = request.json
    identity = new_identity(payload['identity'])

    p = b.partitions.get(pid)

    if not p:
        raise exc.NotFound("No partition for {} in dataset {}".format(pid, did))

    # The address must refer to the same partition as the payload identity.
    if pid not in (identity.id_, identity.vid):
        raise exc.Conflict("Partition address '{}' doesn't match payload id '{}'".format(pid, identity.vid))

    library.database.add_remote_file(identity)

    return identity.to_dict()
Example #4
0
File: main.py  Project: kball/databundles
def post_dataset(did, library):
    '''Accept a payload that describes a bundle in the remote. Download the
    bundle and install it.

    Raises exc.Conflict on id or checksum mismatches and exc.NotFound when
    the bundle file can't be retrieved.
    '''

    did = did.replace('|', '/')  # ids arrive URL-encoded with '|'

    # Identity was imported here but never used; dropped.
    from databundles.identity import new_identity
    from databundles.util import md5_for_file

    payload = request.json
    identity = new_identity(payload['identity'])

    # The address must refer to the same dataset as the payload identity.
    if did not in (identity.id_, identity.vid):
        raise exc.Conflict("Dataset address '{}' doesn't match payload id '{}'".format(did, identity.vid))

    # need to go directly to remote, not library.get() because the
    # dataset hasn't been loaded yet.
    db_path = library.load(identity.cache_key)

    if not db_path:
        logger.error("Failed to get {} from cache while posting dataset".format(identity.cache_key))
        logger.error("  cache =  {}".format(library.cache))
        logger.error("  remote = {}".format(library.remote))
        raise exc.NotFound("Didn't  get bundle file for cache key {} ".format(identity.cache_key))

    logger.debug("Loading {} for identity {} ".format(db_path, identity))

    md5 = md5_for_file(db_path)

    if md5 != payload['md5']:
        logger.debug('MD5 Mismatch: {} != {} '.format(md5, payload['md5']))
        # First, try deleting the cached copy and re-fetching
        # but don't delete it unless there is an intervening cache

        library.remote.remove(identity.cache_key)
        db_path = library.remote.get(identity.cache_key)

        md5 = md5_for_file(db_path)
        if md5 != payload['md5']:
            # (typo fix: was 'persiting')
            logger.debug('MD5 Mismatch, persisting after refetch: {} != {} '.format(md5, payload['md5']))
            raise exc.Conflict("MD5 Mismatch (b)")

    b = DbBundle(db_path)

    if b.identity.cache_key != identity.cache_key:
        logger.debug("Identity mismatch while posting dataset: {} != {}".format(b.identity.cache_key, identity.cache_key))
        raise exc.Conflict("Identity of downloaded bundle doesn't match request payload")

    library.put(b)

    return b.identity.to_dict()
Example #5
0
def source_info(args, rc, src):
    """Print information about the source directory, or about the single
    source bundle that matches args.term."""
    if not args.term:
        # No term given: list the source directory and known repositories.
        prt("Source dir: {}", rc.sourcerepo.dir)
        for repo in rc.sourcerepo.list:
            prt("Repo      : {}", repo.ident)
        return

    import databundles.library as library
    from ..identity import new_identity

    l = library.new_library(rc.library(args.library))

    # Look for a synced 'source' record whose name matches the term.
    found = False
    for record in l.database.get_file_by_type('source'):
        candidate = new_identity(record.data)
        if args.term in (candidate.name, candidate.vname):
            found = record
            break

    if not found:
        err("Didn't find source for term '{}'. (Maybe need to run 'source sync')", args.term)
        return

    from ..source.repository import new_repository
    repo = new_repository(rc.sourcerepo(args.name))
    ident = new_identity(found.data)
    repo.bundle_ident = ident

    prt('Name      : {}', ident.vname)
    prt('Id        : {}', ident.vid)
    prt('Dir       : {}', repo.bundle_dir)

    if not repo.bundle.database.exists():
        prt('Exists    : Database does not exist or is empty')
        return

    # Build/process metadata recorded in the bundle's database config.
    process = dict(repo.bundle.db_config.dict)['process']

    prt('Created   : {}', process.get('dbcreated', ''))
    prt('Prepared  : {}', process.get('prepared', ''))
    prt('Built     : {}', process.get('built', ''))
    prt('Build time: {}', str(round(float(process['buildtime']), 2)) + 's' if process.get('buildtime', False) else '')
Example #6
0
    def install_bundle_file(self, identity, bundle_file):
        """Install a bundle in the database, starting from a file that may
        be a partition or a bundle"""

        # A plain dict is promoted to a full identity object first.
        if isinstance(identity, dict):
            identity = new_identity(identity)

        # Partition files are not installed here; only full bundles are.
        if not identity.is_bundle:
            return

        self.install_bundle(DbBundle(bundle_file))
Example #7
0
    def _pid_or_args_to_pid(self, bundle, pid, args):
        """Normalize *pid* / *args* into an (identity, name) pair, exactly
        one of which is set."""
        from databundles.identity import Identity, new_identity

        # Already a full identity: nothing to resolve.
        if isinstance(pid, Identity):
            return pid, None

        # A string pid is actually a partition name.
        if isinstance(pid, basestring):
            return None, pid

        # Fall back to a name supplied in the args dict.
        if args.get('name', False):
            return None, args.get('name', None)

        # Otherwise build a fresh identity from the args.
        return new_identity(args, bundle=bundle), None
Example #8
0
File: rest.py  Project: kball/databundles
    def put(self, metadata):
        """Post bundle or partition metadata to the remote REST endpoint.

        NOTE: mutates metadata['identity'] in place, replacing the JSON
        string with the parsed dict.
        """
        import json
        from databundles.identity import new_identity

        metadata['identity'] = json.loads(metadata['identity'])

        identity = new_identity(metadata['identity'])

        if identity.is_bundle:
            # Bundles post directly to their dataset resource.
            r = self.remote.datasets(identity.vid_enc).post(metadata)
        else:
            # Partitions post under their parent dataset's resource.
            parent = identity.as_dataset.vid_enc
            r = self.remote.datasets(parent).partitions(identity.vid_enc).post(metadata)

        raise_for_status(r)

        return r
Example #9
0
    def put_file(self, identity, file_path, state='new'):
        """Store a dataset or partition file without opening it, relying on
        the separately supplied identity to classify it."""

        # A plain dict is promoted to a full identity object.
        if isinstance(identity, dict):
            identity = new_identity(identity)

        cache_key = identity.cache_key
        dst = self.cache.put(file_path, cache_key)

        # Optionally push the file to the upstream API as well.
        if self.api and self.sync:
            self.api.put(file_path)

        self.database.add_file(dst, self.cache.repo_id, identity.id_, state)

        # Full bundles also get installed into the database.
        if identity.is_bundle:
            self.database.install_bundle_file(identity, file_path)

        return dst, cache_key, self.cache.public_url_f()(cache_key)
Example #10
0
def source_sync(args, rc, src):
    '''Synchronize all of the repositories with the local library'''
    import databundles.library as library
    from databundles.identity import new_identity

    l = library.new_library(rc.library(args.library))

    for repo in rc.sourcerepo.list:
        prt('--- Sync with upstream source repository {}', repo.service.ident)

        for entry in repo.service.list():
            ident = new_identity(entry)
            clone_url = entry['clone_url']

            # Record the remote bundle as a 'source' file in the library db.
            l.database.add_file(clone_url, repo.service.ident, ident.id_,
                                state='synced', type_='source',
                                source_url=clone_url, data=entry)

            prt("Added {:15s} {}", ident.id_, clone_url)
Example #11
0
    def put_file(self, identity, file_path, state='new'):
        """Store a dataset or partition file without opening it, relying on
        the separately supplied identity to classify it."""

        if isinstance(identity, dict):
            # Promote a plain dict to a full identity object.
            identity = new_identity(identity)

        cache_key = identity.cache_key
        dst = self.cache.put(file_path, cache_key)

        # The cache must hand back a real path; anything else is a cache bug.
        if not os.path.exists(dst):
            raise Exception("cache {}.put() didn't return an existent path. got: {}".format(type(self.cache), dst))

        # Optionally mirror the file to the remote.
        if self.remote and self.sync:
            self.remote.put(identity, file_path)

        self.database.add_file(dst, self.cache.repo_id, identity.id_, state)

        # Full bundles also get installed into the database.
        if identity.is_bundle:
            self.database.install_bundle_file(identity, file_path)

        return dst, cache_key, self.cache.public_url_f()(cache_key)
Example #12
0
def source_clone(args, rc, src):
    '''Clone one or more registered source packages ( via sync ) into the source directory '''
    import databundles.library as library
    from ..dbexceptions import ConflictError
    from ..identity import new_identity

    l = library.new_library(rc.library(args.library))

    for repo in rc.sourcerepo.list:
        prt("--- Cloning sources from: {}", repo.ident)

        # All synced 'source' records that belong to this repository.
        records = [f for f in l.database.get_file_by_type('source')
                   if f.group == repo.ident]

        for record in records:
            try:
                ident = new_identity(record.data)
                dest = repo.clone(record.path, ident.source_path, repo.dir)
                prt("Cloned {} to {}", record.path, dest)
            except ConflictError as e:
                # A pre-existing clone is not fatal; report and continue.
                warn("Clone failed for {}: {}".format(record.path, e.message))
Example #13
0
    def find(self, query):
        '''Find datasets on the remote, given a QueryCommand object, the dict
        form of one, or a bare search string.

        Returns a list of identity objects built with new_identity().

        Raises ValueError for unsupported query types.
        '''
        # Identity and PartitionIdentity were imported but never used, and
        # the Ref1/Ref2 namedtuples were dead code; all dropped.
        from databundles.library import QueryCommand
        from databundles.identity import new_identity

        if isinstance(query, basestring):
            # Bare string: GET-style search.
            response = self.remote.datasets.find(query).get()
            raise_for_status(response)
            r = [response.object]

        elif isinstance(query, dict):
            # Dict form of QueryCommand.
            response = self.remote.datasets.find.post(query)
            raise_for_status(response)
            r = response.object

        elif isinstance(query, QueryCommand):
            response = self.remote.datasets.find.post(query.to_dict())
            raise_for_status(response)
            r = response.object

        else:
            raise ValueError("Unknown input type: {} ".format(type(query)))

        # Convert the result back to the form we get from the Library query.
        return [new_identity(i) for i in r if i is not False]
Example #14
0
    def get_ref(self, bp_id):
        """Resolve *bp_id* -- an Identity, object number, dataset name or
        partition name -- to a (dataset, partition) identity pair.

        Returns (False, False) when nothing matches; *partition* may be None
        when the reference is to a whole dataset.
        """
        # PartitionIdentity was imported but never used; dropped.
        from databundles.identity import ObjectNumber, DatasetNumber, PartitionNumber, Identity

        # Reduce a full Identity to its id (preferred) or name.
        if isinstance(bp_id, Identity):
            if bp_id.id_:
                bp_id = bp_id.id_
            else:
                bp_id = bp_id.name

        # If dataset is not None, it means the file already is in the cache.
        dataset = None

        try:
            on = ObjectNumber.parse(bp_id)

            if not (isinstance(on, DatasetNumber) or isinstance(on, PartitionNumber)):
                raise ValueError("Object number must be for a Dataset or Partition: {} ".format(bp_id))

            dataset, partition = self._get_bundle_path_from_id(bp_id)  # @UnusedVariable
        except Exception:
            # bp_id isn't a usable object number; fall through to name
            # lookups. (Was a bare except:, which also swallowed
            # KeyboardInterrupt/SystemExit.)
            pass

        # Try it as a dataset name.
        if not dataset:
            r = self.find(QueryCommand().identity(name=bp_id))

            if len(r) > 1:
                raise Exception("Got more than one result")
            elif len(r) == 0:
                r = None
            else:
                r = r.pop()

            if r:
                dataset, partition = self._get_bundle_path_from_id(r.id_)

        # Try the name as a partition name.
        if not dataset:
            q = self.find(QueryCommand().partition(name=bp_id))

            if q:
                r = q.pop(0)
                if r:
                    dataset, partition = self._get_bundle_path_from_id(r.id_)

        # No luck so far, so now try to get it from the remote library.
        if not dataset and self.remote:
            import socket

            try:
                r = self.remote.find(bp_id)

                if r:
                    r = r[0]

                    if r.is_partition:
                        dataset = r.as_dataset
                        partition = r
                    else:
                        dataset = r
                        partition = None

            except socket.error:
                self.logger.error("Connection to remote failed")
        elif dataset:
            # NOTE(review): this branch runs only when the local lookups above
            # succeeded (or there is no remote); it normalizes the results to
            # identity objects. Confirm that skipping it for remote results is
            # intentional.
            from identity import new_identity
            dataset = Identity(**dataset.to_dict())
            partition = new_identity(partition.to_dict()) if partition else None

        if not dataset:
            return False, False

        return dataset, partition
Example #15
0
def source_new(args, rc, src):
    '''Create a new bundle source directory, writing a bundle.yaml built from
    the command-line arguments and copying in the skeleton support files.

    (Previous docstring was copied from source_clone and described the wrong
    command.)
    '''
    from ..source.repository import new_repository
    from ..identity import new_identity, DatasetNumber

    repo = new_repository(rc.sourcerepo(args.name))

    ident = new_identity(vars(args))

    bundle_dir = os.path.join(repo.dir, ident.source_path)

    if not os.path.exists(bundle_dir):
        os.makedirs(bundle_dir)
    elif not os.path.isdir(bundle_dir):
        raise IOError("Directory already exists: " + bundle_dir)

    config = {
        'identity': {
            'id': str(DatasetNumber()),
            'source': args.source,
            'creator': args.creator,
            'dataset': args.dataset,
            'subset': args.subset,
            'variation': args.variation,
            'revision': args.revision
        },
        'about': {
            'author': "Author's email address",
            'description': "**include**",  # Can't get YAML to write this properly
            'groups': ['group1', 'group2'],
            'homepage': "https://civicknowledge.org",
            'license': "other-open",
            'maintainer': "Maintainers email address",
            'tags': ['tag1', 'tag2'],
            'title': "Bundle title"
        }
    }

    os.makedirs(os.path.join(bundle_dir, 'meta'))

    file_ = os.path.join(bundle_dir, 'bundle.yaml-in')

    # Use open() with context managers (rather than the Python-2-only file()
    # builtin) so the handles are closed even on error.
    with open(file_, 'w') as f:
        yaml.dump(config, f, indent=4, default_flow_style=False)

    # Need to edit the YAML file because the !include line is special metadata
    # that is hard ( or impossible ) to write through serialization
    with open(file_, 'r') as f_in:
        with open(os.path.join(bundle_dir, 'bundle.yaml'), 'w') as f_out:
            f_out.write(f_in.read().replace("'**include**'", "!include 'meta/about.description.md'"))

    os.remove(file_)

    # Resolve a path in the package's support directory.
    p = lambda x: os.path.join(os.path.dirname(__file__), '..', 'support', x)

    shutil.copy(p('bundle.py'), bundle_dir)
    shutil.copy(p('README.md'), bundle_dir)
    shutil.copy(p('schema.csv'), os.path.join(bundle_dir, 'meta'))
    shutil.copy(p('about.description.md'), os.path.join(bundle_dir, 'meta'))

    prt("CREATED: {}", bundle_dir)