示例#1
0
 def children(self, quick=False):
     """Returns list of the Collection's Entity objects.
     
     >>> c = Collection.from_json('/tmp/ddr-testing-123')
     >>> c.children()
     [<Entity ddr-testing-123-1>, <Entity ddr-testing-123-2>, ...]
     
     TODO use util.find_meta_files()
     
     @param quick: Boolean List only titles and IDs
     @param dicts: Boolean List only titles and IDs (dicts)
     @returns: list of Entities or ListEntity
     """
     # Collect paths of immediate child dirs under the collection's files/ dir.
     entity_paths = []
     if os.path.exists(self.files_path):
         # TODO use cached list if available
         for eid in os.listdir(self.files_path):
             path = os.path.join(self.files_path, eid)
             entity_paths.append(path)
     entity_paths = util.natural_sort(entity_paths)
     entities = []
     for path in entity_paths:
         if quick:
             # fake Entity with just enough info for lists
             entity_json_path = os.path.join(path, 'entity.json')
             if os.path.exists(entity_json_path):
                 e = ListEntity()
                 e.identifier = Identifier(path=path)
                 e.id = e.identifier.id
                 # Scan raw JSON text line-by-line instead of parsing the
                 # whole file; assumes each key sits on its own line --
                 # TODO confirm entity.json is always pretty-printed that way.
                 for line in fileio.read_text(entity_json_path).split('\n'):
                     if '"title":' in line:
                         e.title = json.loads('{%s}' % line)['title']
                     elif '"signature_id":' in line:
                         e.signature_id = json.loads('{%s}' %
                                                     line)['signature_id']
                         e.signature_abs = common.signature_abs(
                             e, self.identifier.basepath)
                     # NOTE(review): assumes ListEntity predefines .title and
                     # .signature_id; otherwise this check raises
                     # AttributeError before both keys are seen -- confirm.
                     if e.title and e.signature_id:
                         # stop once we have what we need so we don't waste time
                         # and have entity.children as separate ghost entities
                         break
                 entities.append(e)
         else:
             # Full object load; copy the title field onto the Entity.
             entity = Entity.from_identifier(Identifier(path=path))
             for lv in entity.labels_values():
                 if lv['label'] == 'title':
                     entity.title = lv['value']
             entities.append(entity)
     return entities
示例#2
0
 def delete(self, document_id, recursive=False):
     """Delete a document and optionally its children.
     
     @param document_id: str
     @param recursive: bool Also delete documents beneath this one.
     @returns: Elasticsearch response dict, or None if a TransportError
         was swallowed
     @raises: ValueError on an unrecognized model (recursive only)
     """
     identifier = Identifier(id=document_id)
     if recursive:
         # map each model to the doc_types swept beneath it
         doc_types_by_model = {
             'collection': 'collection,entity,file',
             'entity': 'entity,file',
             'file': 'file',
         }
         doc_type = doc_types_by_model.get(identifier.model)
         if doc_type is None:
             # BUG FIX: an unknown model previously left doc_type unbound,
             # crashing below with NameError; fail explicitly instead.
             raise ValueError('Unrecognized model: %r' % identifier.model)
         query = 'id:"%s"' % identifier.id
         try:
             return self.es.delete_by_query(index=self.indexname,
                                            doc_type=doc_type,
                                            q=query)
         except TransportError:
             # best-effort delete: swallow transport errors, return None
             pass
     else:
         try:
             return self.es.delete(index=self.indexname,
                                   doc_type=identifier.model,
                                   id=identifier.id)
         except TransportError:
             # best-effort delete: swallow transport errors, return None
             pass
示例#3
0
 def file_name(entity, path_abs, role, sha1=None):
     """Generate a new name for the specified file; Use only when ingesting a file!
     
     Renames files to standard names on ingest:
     {entity_id}-{role}-{sha1[:10]}.{ext}
     example: ddr-testing-56-101-master-fb73f9de29.jpg
     
     SHA1 is optional so it can be passed in by a calling process that has
     already generated it.
     
     @param entity: Entity object (only .idparts is used)
     @param path_abs: str Absolute path to the file.
     @param role: str File role (e.g. 'master', 'mezzanine')
     @param sha1: str SHA1 hash (optional)
     @returns: str new filename, or None if file missing/unreadable
     """
     # BUG FIX: os.path.exists was referenced without being called, making
     # the existence check always truthy; call it with the path.
     if os.path.exists(path_abs) and os.access(path_abs, os.R_OK):
         ext = os.path.splitext(path_abs)[1]
         if not sha1:
             sha1 = util.file_hash(path_abs, 'sha1')
         if sha1:
             idparts = list(entity.idparts)
             idparts.append(role)
             idparts.append(sha1[:10])
             name = '{}{}'.format(Identifier(parts=idparts).id, ext)
             return name
     return None
示例#4
0
def check_file(json_path, verbose=False):
    """Verify that a File's binary matches its recorded checksums.
    
    Compares MD5/SHA1/SHA256 hashes of the file on disk against the values
    recorded in the File metadata, plus the SHA256 embedded in the
    git-annex target filename.
    
    @param json_path: str Path to the file's JSON metadata
    @param verbose: bool (currently unused)
    @returns: list of mismatch labels with json_path appended, or
        ['missing', path] if the binary is absent; empty list if all match
    """
    fi = Identifier(json_path)
    f = models.File.from_identifier(fi)

    if not os.path.exists(f.path_abs):
        result = ['missing', f.path_abs]
        print(result)
        return result
    
    mismatches = []
    # BUG FIX: mismatches.append['...'] subscripted the bound method
    # (TypeError at runtime); these must be calls.
    md5 = util.file_hash(f.path_abs, 'md5')
    if not (md5 == f.md5):
        mismatches.append('md5')
    sha1 = util.file_hash(f.path_abs, 'sha1')
    if not (sha1 == f.sha1):
        mismatches.append('sha1')
    sha256 = util.file_hash(f.path_abs, 'sha256')
    if not (sha256 == f.sha256):
        mismatches.append('sha256')
    # SHA256 hash from the git-annex filename
    # NOTE(review): assumes path_abs resolves to an annex key name that
    # contains '--'; otherwise this raises IndexError -- confirm.
    annex_sha256 = os.path.basename(
        os.path.realpath(f.path_abs)
    ).split('--')[1]
    if not (sha256 == annex_sha256):
        mismatches.append('annex_sha256')
    
    if mismatches:
        mismatches.append(json_path)
        print(mismatches)
    
    return mismatches
示例#5
0
File: entity.py  Project: raux/ddr-cmdln
    def ddrpublic_template_key(self):
        """Assemble the key ddr-public uses to pick a display template.
        
        Used by ddrindex when publishing to Elasticsearch.
        Looks for the Entity's signature file, falling back to the first
        mezzanine file of this entity or of its first child that has one.
        The key combines the Entity format with the file mimetype family.
        
        @returns: signature,key
        """
        def _first_mezzanine(ent):
            # earliest-sorted file in the entity's 'mezzanine' group, if any
            for group in ent.file_groups:
                if group['role'] != 'mezzanine':
                    continue
                ordered = sorted(group['files'], key=lambda fd: fd['sort'])
                if ordered:
                    return ordered[0]
            return None

        entity = self
        try:
            signature = Identifier(entity.signature_id, config.MEDIA_BASE).object()
        except:
            signature = None

        # VH entities may not have a valid signature
        if not signature:
            # use child entity if exists and has mezzanine file
            if entity.children_meta:
                for meta in entity.children_meta:
                    child = Identifier(meta['id'], config.MEDIA_BASE).object()
                    if _first_mezzanine(child):
                        entity = child
                        break
            # get signature image
            mezz = _first_mezzanine(entity)
            if mezz:
                signature = Identifier(mezz['id'], config.MEDIA_BASE).object()

        # prepare decision table key
        key = None
        if signature:
            key = ':'.join([
                entity.format, signature.mimetype.split('/')[0]
            ])
        return signature,key
示例#6
0
 def children(self, quick=False):
     """List the Collection's Entity objects.
     
     >>> c = Collection.from_json('/tmp/ddr-testing-123')
     >>> c.children()
     [<Entity ddr-testing-123-1>, <Entity ddr-testing-123-2>, ...]
     
     TODO use util.find_meta_files()
     
     @param quick: Boolean List only titles and IDs
     @param dicts: Boolean List only titles and IDs (dicts)
     @returns: list of Entities or ListEntity
     """
     # candidate child directories under files/
     paths = []
     if os.path.exists(self.files_path):
         # TODO use cached list if available
         paths = [
             os.path.join(self.files_path, eid)
             for eid in os.listdir(self.files_path)
         ]
     entities = []
     for path in util.natural_sort(paths):
         if not quick:
             # full object load; copy the title field onto the Entity
             entity = Entity.from_identifier(Identifier(path=path))
             for lv in entity.labels_values():
                 if lv['label'] == 'title':
                     entity.title = lv['value']
             entities.append(entity)
             continue
         # fake Entity with just enough info for lists
         entity_json_path = os.path.join(path, 'entity.json')
         if not os.path.exists(entity_json_path):
             continue
         with open(entity_json_path, 'r') as f:
             data = json.loads(f.read())
         e = ListEntity()
         e.identifier = Identifier(path=path)
         e.id = e.identifier.id
         # first element of the JSON doc is a header; scan the rest
         for field in data[1:]:
             if 'title' in field:
                 e.title = field['title']
             elif 'signature_id' in field:
                 e.signature_id = field['signature_id']
         e.signature_abs = common.signature_abs(
             e, self.identifier.basepath)
         entities.append(e)
     return entities
示例#7
0
File: entity.py  Project: raux/ddr-cmdln
 def get_role(f):
     """Extract the file role from a File object or a file dict.
     
     @param f: File object or dict with 'role' or 'path_rel' keys
     @returns: str role, or None if it cannot be determined
     """
     if isinstance(f, File):
         return f.role
     if isinstance(f, dict):
         role = f.get('role')
         if role:
             return role
         path_rel = f.get('path_rel')
         if path_rel:
             # derive the file ID from the filename, then read its role part
             stem = os.path.splitext(path_rel)[0]
             return Identifier(id=os.path.basename(stem)).idparts['role']
     return None
示例#8
0
 def create(path_abs, identifier=None):
     """Make a new Collection populated with defaults from module.FIELDS.
     
     @param path_abs: str Absolute path; must end in valid DDR id.
     @param identifier: [optional] Identifier
     @returns: Collection object
     """
     i = identifier if identifier else Identifier(path=path_abs)
     return common.create_object(i)
示例#9
0
 def _repo_org(self, path, doctype, remove=False):
     """Index (or remove) a repository/organization document.
     
     seealso DDR.models.common.DDRObject.to_esobject
     
     @param path: str Path to JSON data file
     @param doctype: str Elasticsearch doc_type
     @param remove: bool Delete the document instead of indexing
     @returns: Elasticsearch response
     """
     # get and validate file
     data = load_json(path)
     if (not (data.get('id') and data.get('repo'))):
         raise Exception('Data file is not well-formed.')
     oi = Identifier(id=data['id'])
     d = OrderedDict()
     d['id'] = oi.id
     d['model'] = oi.model
     d['parent_id'] = oi.parent_id(stubs=1)
     # links
     d['links_html'] = oi.id
     d['links_json'] = oi.id
     d['links_img'] = '%s/logo.png' % oi.id
     d['links_thumb'] = '%s/logo.png' % oi.id
     d['links_parent'] = oi.parent_id(stubs=1)
     d['links_children'] = oi.id
     # title,description
     d['title'] = data['title']
     d['description'] = data['description']
     d['url'] = data['url']
     # ID components (repo, org, cid, ...) as separate fields
     idparts = deepcopy(oi.idparts)
     idparts.pop('model')
     for k in ID_COMPONENTS:
         d[k] = ''  # ensure all fields present
     # BUG FIX: dict.iteritems() does not exist in Python 3; use items().
     for k, v in idparts.items():
         d[k] = v
     # add/update
     if remove and self.exists(doctype, oi):
         results = self.es.delete(index=self.indexname,
                                  doc_type=doctype,
                                  id=oi.id)
     else:
         results = self.es.index(index=self.indexname,
                                 doc_type=doctype,
                                 id=oi.id,
                                 body=d)
     return results
示例#10
0
 def _repo_org(self, path, doctype, remove=False):
     """Index (or remove) a repository/organization document.
     
     seealso DDR.models.common.DDRObject.to_esobject
     
     @param path: str Path to JSON data file
     @param doctype: str Elasticsearch doc_type
     @param remove: bool Delete the document instead of indexing
     @returns: Elasticsearch response
     """
     # get and validate file
     data = load_json(path)
     if (not (data.get('id') and data.get('repo'))):
         raise Exception('Data file is not well-formed.')
     oi = Identifier(id=data['id'])
     d = OrderedDict()
     d['id'] = oi.id
     d['model'] = oi.model
     d['parent_id'] = oi.parent_id(stubs=1)
     # links
     d['links_html'] = oi.id
     d['links_json'] = oi.id
     d['links_img'] = '%s/logo.png' % oi.id
     d['links_thumb'] = '%s/logo.png' % oi.id
     d['links_parent'] = oi.parent_id(stubs=1)
     d['links_children'] = oi.id
     # title,description
     d['title'] = data['title']
     d['description'] = data['description']
     d['url'] = data['url']
     # ID components (repo, org, cid, ...) as separate fields
     idparts = deepcopy(oi.idparts)
     idparts.pop('model')
     for k in ID_COMPONENTS:
         d[k] = '' # ensure all fields present
     # BUG FIX: dict.iteritems() does not exist in Python 3; use items().
     for k,v in idparts.items():
         d[k] = v
     # add/update
     if remove and self.exists(doctype, oi):
         results = self.es.delete(
             index=self.indexname, doc_type=doctype, id=oi.id
         )
     else:
         results = self.es.index(
             index=self.indexname, doc_type=doctype, id=oi.id, body=d
         )
     return results
示例#11
0
 def _children_paths(self):
     """Scan the filesystem for direct childrens' .json files.
     
     @returns: list of paths, naturally sorted
     """
     if not os.path.exists(self.files_path):
         return []
     candidates = util.find_meta_files(self.files_path, recursive=True)
     # only direct children, no descendants
     direct = [p for p in candidates if Identifier(p).parent_id() == self.id]
     return natsorted(direct)
示例#12
0
 def delete(self, document_id, recursive=False):
     """Delete a document and optionally its children.
     
     TODO refactor after upgrading Elasticsearch past 2.4.
     delete_by_query was removed sometime during elasticsearch-py 2.*
     I think it was added back in a later version so the code stays for now.
     
     For now, instead of deleting based on document_id, we start with
     document_id, find all paths beneath it in the filesystem,
     and curl DELETE url each individual document from Elasticsearch.
     
     @param document_id:
     @param recursive: True or False
     """
     logger.debug('delete(%s, %s)' % (document_id, recursive))
     root = Identifier(document_id, config.MEDIA_BASE)
     if recursive:
         paths = util.find_meta_files(
             root.path_abs(), recursive=recursive, files_first=1)
     else:
         paths = [root.path_abs()]
     identifiers = [Identifier(p) for p in paths]
     num = len(identifiers)
     for n, oi in enumerate(identifiers):
         # TODO hard-coded models here!
         model = 'entity' if oi.model == 'segment' else oi.model
         try:
             result = self.es.delete(index=self.index_name(model), id=oi.id)
             print(
                 f'{n}/{num} DELETE {self.index_name(model)} {oi.id} -> {result["result"]}'
             )
         except docstore.NotFoundError:
             print(
                 f'{n}/{num} DELETE {self.index_name(model)} {oi.id} -> 404 Not Found'
             )
示例#13
0
File: files.py  Project: densho/ddr-cmdln
    def new(identifier, git_name, git_mail, agent='cmdln'):
        """Creates new File (metadata only!), writes to filesystem, performs initial commit
        
        @param identifier: Identifier
        @param git_name: str
        @param git_mail: str
        @param agent: str
        @returns: exit,status int,str
        """
        parent = identifier.parent().object()
        if not parent:
            raise Exception('Parent for %s does not exist.' % identifier)
        file_ = File.create(identifier)
        file_.write_json()
        
        # NOTE(review): 'request' and 'collection' are not defined in this
        # function or its parameters -- executing this line would raise
        # NameError; confirm against the web-UI caller this appears to be
        # copied from.
        entity_file_edit(request, collection, file_, git_name, git_mail)

        # NOTE(review): 'entity' is referenced here but only assigned below
        # -- another NameError if executed; looks like entity-creation code
        # pasted into a File factory. TODO confirm and repair upstream.
        exit,status = commands.entity_create(
            git_name, git_mail,
            collection, entity.identifier,
            [collection.json_path_rel, collection.ead_path_rel],
            [config.TEMPLATE_EJSON, config.TEMPLATE_METS],
            agent=agent
        )
        if exit:
            raise Exception('Could not create new Entity: %s, %s' % (exit, status))
        # load Entity object, inherit values from parent, write back to file
        entity = Identifier(identifier).object()
        entity.inherit(collection)
        entity.write_json()
        updated_files = [entity.json_path]
        exit,status = commands.entity_update(
            git_name, git_mail,
            collection, entity,
            updated_files,
            agent=agent
        )
        return exit,status
示例#14
0
    def _repo_org(self, path, doctype, remove=False):
        """Index (or remove) a repository/organization document.
        
        seealso DDR.models.common.DDRObject.to_esobject
        """
        # get and validate file
        data = load_json(path)
        if (not (data.get('id') and data.get('repo'))):
            raise Exception('Data file is not well-formed.')
        oi = Identifier(id=data['id'])

        # build the Elasticsearch document for this model
        ES_Class = ELASTICSEARCH_CLASSES_BY_MODEL[doctype]
        doc = ES_Class(id=oi.id)
        doc.meta.id = oi.id
        doc.model = oi.model
        doc.parent_id = oi.parent_id(stubs=1)
        # links
        doc.links_html = oi.id
        doc.links_json = oi.id
        doc.links_img = '%s/logo.png' % oi.id
        doc.links_thumb = '%s/logo.png' % oi.id
        doc.links_parent = oi.parent_id(stubs=1)
        doc.links_children = oi.id
        # title,description
        doc.title = data['title']
        doc.description = data['description']
        doc.url = data['url']
        # ID components (repo, org, cid, ...) as separate fields
        idparts = deepcopy(oi.idparts)
        idparts.pop('model')
        for key, val in idparts.items():
            setattr(doc, key, val)
        # add/update
        if remove and self.exists(doctype, oi):
            return doc.delete(index=self.index_name(doctype), using=self.es)
        return doc.save(index=self.index_name(doctype), using=self.es)
示例#15
0
def signature_abs(obj, basepath):
    """Absolute path to signature image file, if signature_id present.
    
    Expects obj.signature_id to be either a valid file ID
    or a special interview signature image
    (ex. "denshovh-aart-03", "denshovh-hlarry_g-02")
    
    @returns: str absolute path to signature img, or None
    """
    if isinstance(obj, dict):
        sid = obj.get('signature_id')
    else:
        sid = getattr(obj, 'signature_id', None)
    if not sid:
        return None
    # interview signature IDs are not file IDs; ignore them
    if INTERVIEW_SIG_REGEX.match(sid):
        return None
    try:
        oi = Identifier(sid, basepath)
    except:
        return None
    if oi.model == 'file':
        return oi.path_abs('access')
    return None
示例#16
0
File: common.py  Project: raux/ddr-cmdln
def signature_abs(obj, basepath):
    """Absolute path to signature image file, if signature_id present.
    
    Expects obj.signature_id to be either a valid file ID
    or a special interview signature image
    (ex. "denshovh-aart-03", "denshovh-hlarry_g-02")
    
    @returns: str absolute path to signature img, or None
    """
    sid = obj.get('signature_id') if isinstance(obj, dict) \
        else getattr(obj, 'signature_id', None)
    # interview signature IDs are ignored; only real file IDs resolve
    if sid and not INTERVIEW_SIG_REGEX.match(sid):
        oi = None
        try:
            oi = Identifier(sid, basepath)
        except:
            pass
        if oi and oi.model == 'file':
            return oi.path_abs('access')
    return None
示例#17
0
 def identifiers(self, model=None, force_read=False):
     """Lists Identifiers for all or subset of Collection's descendents.
     
     TODO how is this different from children?
     
     >>> c = Collection.from_json('/tmp/ddr-testing-123')
     >>> c.descendants()
     [<Entity ddr-testing-123-1>, <Entity ddr-testing-123-2>, ...]
     
     @param model: str Restrict list to model.
     @returns: list of Identifiers
     """
     meta_paths = util.find_meta_files(
         self.path, recursive=1, model=model, force_read=force_read)
     return [Identifier(p) for p in meta_paths]
示例#18
0
def _publishable(paths, parents, force=False):
    """Determines which paths represent publishable paths and which do not.
    
    @param paths
    @param parents
    @param force: boolean Just publish the damn collection already.
    @returns list of dicts, e.g. [{'path':'/PATH/TO/OBJECT', 'action':'publish'}]
    """
    path_dicts = []
    for path in paths:
        d = {
            'path': path,
            'identifier': Identifier(path=path),
            'action': 'UNSPECIFIED',
            'note': '',
        }

        if force:
            d['action'] = 'POST'
            path_dicts.append(d)
            continue

        # see if item incomplete or nonpublic

        # see if item's parents are incomplete or nonpublic
        # TODO Bad! Bad! Generalize this...
        UNPUBLISHABLE = []
        for parent_id in _file_parent_ids(d['identifier']):
            parent = parents.get(parent_id, {})
            for x in parent.itervalues():
                if (x not in STATUS_OK) and (x not in PUBLIC_OK):
                    if parent_id not in UNPUBLISHABLE:
                        UNPUBLISHABLE.append(parent_id)
        if UNPUBLISHABLE:
            d['action'] = 'SKIP'
            d['note'] = 'parent unpublishable'
            path_dicts.append(d)
            continue

        if path and d['identifier'].model:
            d['action'] = 'POST'
        path_dicts.append(d)

    return path_dicts
示例#19
0
 def child_field_values(self, model, fieldname):
     """Collect values of fieldname from all children of the given model.
     
     @param model str
     @param fieldname str
     @returns: list of [id, fieldname, value] rows (falsy values skipped)
     """
     found = []
     for json_path in util.find_meta_files(self.path_abs,
                                           model=model,
                                           recursive=True):
         obj = Identifier(json_path).object()
         value = getattr(obj, fieldname)
         if value:
             found.append([obj.id, fieldname, value])
     return found
示例#20
0
    def ddrpublic_template_key(self):
        """Combine factors for ddrpublic template selection into key
        
        For use in ddrindex publish to Elasticsearch.
        Generates a key which ddr-public will use to choose a template.
        Finds Entity's signature file, or the first mezzanine file,
        or the Entity's first child's first mezzanine file, etc, etc
        Matches Entity format and file mimetype to template
        
        @returns: signature,key
        """
        def _first_mezzanine(ent):
            # first of the entity's mezzanine children, if any
            for f in ent.children(role='mezzanine'):
                return f
            return None

        entity = self
        try:
            signature = Identifier(entity.signature_id,
                                   config.MEDIA_BASE).object()
        except:
            signature = None

        if not signature:
            # VH entities may not have a valid signature;
            # use child entity if exists and has mezzanine file
            if entity.children(models=['entity', 'segment']):
                for child in entity.children(models=['entity', 'segment']):
                    if _first_mezzanine(child):
                        entity = child
                        break
            # get signature image
            signature = _first_mezzanine(entity)

        # prepare decision table key
        key = None
        if signature:
            key = ':'.join([
                getattr(entity, 'format', ''),
                signature.mimetype.split('/')[0]
            ])
        return signature, key
示例#21
0
File: common.py  Project: raux/ddr-cmdln
def sort_file_paths(json_paths, rank='role-eid-sort'):
    """Sort file JSON paths in human-friendly order.
    
    TODO this belongs in DDR.identifier
    
    NOTE: consumes (empties) the json_paths list passed in, and yields
    results in reverse natural-sort key order -- both behaviors preserved
    from the original implementation; TODO confirm they are intended.
    
    @param json_paths: 
    @param rank: 'role-eid-sort' or 'eid-sort-role'
    """
    key_to_path = {}
    keys = []
    while json_paths:
        path = json_paths.pop()
        identifier = Identifier(path=path)
        eid = identifier.parts.get('eid', None)
        role = identifier.parts.get('role', None)
        sha1 = identifier.parts.get('sha1', None)
        # pull the 'sort' value out of the raw JSON text
        sort = 0
        with open(path, 'r') as f:
            for line in f.readlines():
                if 'sort' in line:
                    sort = line.split(':')[1].replace('"', '').strip()
        eid = str(eid)
        sha1 = str(sha1)
        sort = str(sort)
        if rank == 'eid-sort-role':
            key = '-'.join([str(eid), sort, role, sha1])
        elif rank == 'role-eid-sort':
            key = '-'.join([role, eid, sort, sha1])
        key_to_path[key] = path
        keys.append(key)
    ordered = []
    # iterate sorted keys from the end, as the original pop() loop did
    for key in reversed(util.natural_sort(keys)):
        path = key_to_path.pop(key, None)
        if path:
            ordered.append(path)
    return ordered
示例#22
0
 def children(self, models=None, role=None, quick=None, force_read=False):
     """List Entity's child objects,files; optionally regenerate list
     
     @param model: list Restrict to specified model(s)
     @param role: str Restrict list to specified File role
     @param quick: bool Not used
     @param force_read: bool Scan entity dir for file jsons
     @returns: list of File objects, sorted
     """
     if force_read or not self._children_objects:
         # (re)read objects from filesystem
         self._children_objects = _sort_children(
             [Identifier(path).object() for path in self._children_paths()])
     objects = self._children_objects
     if models:
         return [o for o in objects if o.identifier.model in models]
     if role:
         return [o for o in objects if hasattr(o, 'role') and o.role == role]
     return objects
示例#23
0
    def new(identifier, git_name, git_mail, agent='cmdln'):
        """Creates new File (metadata only!), writes to filesystem, performs initial commit
        
        @param identifier: Identifier
        @param git_name: str
        @param git_mail: str
        @param agent: str
        @returns: exit,status int,str
        """
        parent = identifier.parent().object()
        if not parent:
            raise Exception('Parent for %s does not exist.' % identifier)
        file_ = File.create(identifier.path_abs(), identifier)
        file_.write_json()

        # NOTE(review): 'request' and 'collection' are not defined in this
        # function or its parameters -- executing this line would raise
        # NameError; confirm against the web-UI caller this appears to be
        # copied from.
        entity_file_edit(request, collection, file_, git_name, git_mail)

        # NOTE(review): 'entity' is referenced here but only assigned below
        # -- another NameError if executed; looks like entity-creation code
        # pasted into a File factory. TODO confirm and repair upstream.
        exit, status = commands.entity_create(
            git_name,
            git_mail,
            collection,
            entity.identifier,
            [collection.json_path_rel, collection.ead_path_rel],
            [config.TEMPLATE_EJSON, config.TEMPLATE_METS],
            agent=agent)
        if exit:
            raise Exception('Could not create new Entity: %s, %s' %
                            (exit, status))
        # load Entity object, inherit values from parent, write back to file
        entity = Identifier(identifier).object()
        entity.inherit(collection)
        entity.write_json()
        updated_files = [entity.json_path]
        exit, status = commands.entity_update(git_name,
                                              git_mail,
                                              collection,
                                              entity,
                                              updated_files,
                                              agent=agent)
        return exit, status
示例#24
0
def index(hosts, index, path, recursive=False, public=True):
    """(Re)index with data from the specified directory.
    
    After receiving a list of metadata files, index() iterates through the
    list several times.  The first pass weeds out paths to objects that can
    not be published (e.g. object or its parent is unpublished).
    
    The second pass goes through the files and assigns a signature file to
    each entity or collection ID.  There is some logic that tries to pick
    the first file of the first entity to be the collection signature, and
    so on.  Mezzanine files are preferred over master files.
    
    In the final pass, a list of public/publishable fields is chosen based
    on the model.  Additional fields not in the model (e.g. parent ID,
    parent organization/collection/entity ID, the signature file) are
    packaged.  Then everything is sent off to post().

    @param hosts: list of dicts containing host information.
    @param index: Name of the target index.
    @param path: Absolute path to directory containing object metadata files.
    @param recursive: Whether or not to recurse into subdirectories.
    @param public: For publication (fields not marked public will be omitted).
    @returns: dict with total count, successful count, and bad paths
    """
    logger.debug('index(%s, %s, %s)' % (hosts, index, path))
    
    publicfields = public_fields()
    
    # process a single file if requested
    if os.path.isfile(path):
        paths = [path]
    else:
        # files listed first, then entities, then collections
        paths = util.find_meta_files(path, recursive, files_first=1)
    
    # Store value of public,status for each collection,entity.
    # Values will be used by entities and files to inherit these values
    # from their parent.
    parents = _parents_status(paths)
    
    # Determine if paths are publishable or not
    successful_paths,bad_paths = _publishable_or_not(paths, parents)
    
    # iterate through paths, storing signature_url for each collection, entity
    # paths listed files first, then entities, then collections
    signature_files = _choose_signatures(successful_paths)
    print('Signature files')
    # BUG FIX: dict.keys() returns a view in Python 3 and has no .sort();
    # iterate a sorted copy instead.
    for key in sorted(signature_files.keys()):
        print(key, signature_files[key])
    
    successful = 0
    for path in successful_paths:
        identifier = Identifier(path=path)
        parent_id = identifier.parent_id()
        
        document_pub_fields = []
        if public and identifier.model:
            document_pub_fields = publicfields[identifier.model]
        
        # parent IDs under model-specific field names
        additional_fields = {'parent_id': parent_id}
        if identifier.model == 'collection': additional_fields['organization_id'] = parent_id
        if identifier.model == 'entity': additional_fields['collection_id'] = parent_id
        if identifier.model == 'file': additional_fields['entity_id'] = parent_id
        if identifier.model in ['collection', 'entity']:
            additional_fields['signature_file'] = signature_files.get(identifier.id, '')
        
        # HERE WE GO!
        document = load_document_json(path, identifier.model, identifier.id)
        try:
            existing = get(hosts, index, identifier.model, identifier.id, fields=[])
        except Exception:
            # best-effort lookup: treat any failure as "does not exist yet"
            existing = None
        result = post(hosts, index, document, document_pub_fields, additional_fields)
        # success: created, or version number incremented
        if result.get('_id', None):
            existing_version = None
            if existing:
                existing_version = existing.get('version', None)
                if not existing_version:
                    existing_version = existing.get('_version', None)
            result_version = result.get('version', None)
            if not result_version:
                result_version = result.get('_version', None)
            if result['created'] or (existing_version and (result_version > existing_version)):
                successful += 1
        else:
            bad_paths.append((path, result['status'], result['response']))
    logger.debug('INDEXING COMPLETED')
    return {'total':len(paths), 'successful':successful, 'bad':bad_paths}
示例#25
0
def identifier_from_path(path: Path) -> Identifier:
    """Build an Identifier for the object whose metadata lives at *path*."""
    oid = oid_from_path(path)
    return Identifier(oid)
示例#26
0
File: files.py  Project: densho/ddr-cmdln
 def parent( self ):
     """Return this object's parent, resolved via an Identifier built from parent_id."""
     return Identifier(
         id=self.parent_id, base_path=self.identifier.basepath
     ).object()
示例#27
0
 def parent(self):
     """Instantiate and return this object's parent object."""
     parent_identifier = Identifier(
         id=self.parent_id,
         base_path=self.identifier.basepath,
     )
     return parent_identifier.object()
示例#28
0
    def __init__(self, path_abs, id=None, identifier=None):
        """Initialize identity and filesystem-path attributes from a path.
        
        >>> c = Collection('/tmp/ddr-testing-123')
        >>> c.id
        'ddr-testing-123'
        >>> c.ead_path_rel
        'ead.xml'
        >>> c.ead_path
        '/tmp/ddr-testing-123/ead.xml'
        >>> c.json_path_rel
        'collection.json'
        >>> c.json_path
        '/tmp/ddr-testing-123/collection.json'
        """
        path_abs = os.path.normpath(path_abs)
        i = identifier if identifier else Identifier(path=path_abs)
        self.identifier = i

        self.id = i.id
        self.idparts = i.parts.values()

        self.path_abs = path_abs
        self.path = path_abs
        self.root = os.path.split(self.path)[0]

        # absolute paths to the collection's standard metadata files
        for name in [
                'json', 'git', 'gitignore', 'annex', 'changelog',
                'control', 'ead', 'lock', 'files']:
            setattr(self, '%s_path' % name, i.path_abs(name))

        # paths relative to the collection root (note: no lock_path_rel)
        self.path_rel = i.path_rel()
        for name in [
                'json', 'git', 'gitignore', 'annex', 'changelog',
                'control', 'ead', 'files']:
            setattr(self, '%s_path_rel' % name, i.path_rel(name))

        self.git_url = '{}:{}'.format(config.GITOLITE, self.id)
示例#29
0
def _publishable(paths, parents, force=False):
    """Determines which paths represent publishable paths and which do not.
    
    @param paths: list of absolute paths to object metadata files
    @param parents: dict mapping parent object IDs to {field: value} dicts
    @param force: boolean Just publish the damn collection already.
    @returns list of dicts, e.g. [{'path':'/PATH/TO/OBJECT', 'action':'publish'}]
    """
    path_dicts = []
    for path in paths:
        d = {
            'path': path,
            'identifier': Identifier(path=path),
            'action': 'UNSPECIFIED',
            'note': '',
        }

        # force bypasses all the publishability checks below
        if force:
            d['action'] = 'POST'
            path_dicts.append(d)
            continue

        # see if item's parents are incomplete or nonpublic
        # TODO Bad! Bad! Generalize this...
        UNPUBLISHABLE = []
        for parent_id in _file_parent_ids(d['identifier']):
            parent = parents.get(parent_id, {})
            # .itervalues() was Python 2-only; .values() works on Python 3
            for x in parent.values():
                if (x not in STATUS_OK) and (x not in PUBLIC_OK):
                    if parent_id not in UNPUBLISHABLE:
                        UNPUBLISHABLE.append(parent_id)
        if UNPUBLISHABLE:
            d['action'] = 'SKIP'
            d['note'] = 'parent unpublishable'
            path_dicts.append(d)
            continue

        # see if item itself is incomplete or nonpublic
        # TODO knows way too much about JSON data format
        public = None
        status = None
        jsonpath = d['identifier'].path_abs('json')
        document = load_json(jsonpath)
        for field in document:
            # .iteritems() was Python 2-only; .items() works on Python 3
            for k, v in field.items():
                if k == 'public':
                    public = v
                if k == 'status':
                    status = v
        if public and (public not in PUBLIC_OK):
            d['action'] = 'SKIP'
            d['note'] = 'not public'
            path_dicts.append(d)
            continue
        elif status and (status not in STATUS_OK):
            d['action'] = 'SKIP'
            d['note'] = 'status'
            path_dicts.append(d)
            continue

        if path and d['identifier'].model:
            d['action'] = 'POST'
        path_dicts.append(d)

    return path_dicts
示例#30
0
 def __init__( self, path_abs, id=None, identifier=None ):
     """Initialize identity and filesystem-path attributes from a path."""
     path_abs = os.path.normpath(path_abs)
     i = identifier if identifier else Identifier(path=path_abs)
     self.identifier = i

     self.id = i.id
     self.idparts = i.parts.values()

     self.collection_id = i.collection_id()
     self.parent_id = i.parent_id()

     self.path_abs = path_abs
     self.path = path_abs
     self.collection_path = i.collection_path()
     self.parent_path = i.parent_path()
     self.root = os.path.dirname(self.parent_path)

     # absolute paths to standard metadata files
     for name in ['json', 'changelog', 'control', 'mets', 'lock', 'files']:
         setattr(self, '%s_path' % name, i.path_abs(name))

     # paths relative to the collection root (note: no lock_path_rel)
     self.path_rel = i.path_rel()
     for name in ['json', 'changelog', 'control', 'mets', 'files']:
         setattr(self, '%s_path_rel' % name, i.path_rel(name))
示例#31
0
    def __init__(self, path_abs, id=None, identifier=None):
        """Set up identity and filesystem-path attributes for this object.
        
        @param path_abs: str Absolute path to the object directory
        @param id: str (unused; identity comes from path or identifier)
        @param identifier: Identifier (optional; built from path_abs if absent)
        """
        path_abs = os.path.normpath(path_abs)
        ident = identifier if identifier else Identifier(path=path_abs)
        self.identifier = ident

        self.id = ident.id
        self.idparts = list(ident.parts.values())

        self.collection_id = ident.collection_id()
        self.parent_id = ident.parent_id()

        self.path_abs = path_abs
        self.path = path_abs
        self.collection_path = ident.collection_path()
        self.parent_path = ident.parent_path()
        self.root = os.path.dirname(self.parent_path)

        # absolute paths to standard metadata files
        self.__dict__.update({
            '%s_path' % key: ident.path_abs(key)
            for key in ('json', 'changelog', 'control', 'mets', 'lock', 'files')
        })

        # paths relative to the collection root (note: no lock_path_rel)
        self.path_rel = ident.path_rel()
        self.__dict__.update({
            '%s_path_rel' % key: ident.path_rel(key)
            for key in ('json', 'changelog', 'control', 'mets', 'files')
        })
示例#32
0
    def post_multi(self, path, recursive=False, force=False, backblaze=None):
        """Publish (index) specified document and (optionally) its children.
        
        After receiving a list of metadata files, index() iterates through the
        list several times.  The first pass weeds out paths to objects that can
        not be published (e.g. object or its parent is unpublished).
        
        In the final pass, a list of public/publishable fields is chosen based
        on the model.  Additional fields not in the model (e.g. parent ID, parent
        organization/collection/entity ID) are packaged.  Then everything is sent
        off to post().
        
        @param path: Absolute path to directory containing object metadata files.
        @param recursive: Whether or not to recurse into subdirectories.
        @param force: boolean Just publish the damn collection already.
        @param backblaze: storage.Backblaze object Look in b2sync tmpdir and mark
                   files uploaded to Backblaze.
        @returns: number successful,list of paths that didn't work out
        """
        logger.debug(f'post_multi({path}, {recursive}, {force}, {backblaze})')
        # Check that path points inside a collection.
        # A bare `except:` here also swallowed KeyboardInterrupt/SystemExit;
        # catch Exception and chain the cause so the original error is visible.
        try:
            ci = Identifier(path).collection()
        except Exception as err:
            raise Exception(
                'Docstore.post_multi path must point to a collection or subdirectory.'
            ) from err
        ci_path = Path(ci.id)

        publicfields = _public_fields()

        # process a single file if requested
        if os.path.isfile(path):
            paths = [path]
        else:
            # files listed first, then entities, then collections
            logger.debug(f'Finding files in {path}')
            paths = util.find_meta_files(path, recursive, files_first=1)

        # Determine if paths are publishable or not
        logger.debug('Checking for publishability')
        identifiers = [Identifier(path) for path in paths]
        parents = {
            oid: oi.object()
            for oid, oi in _all_parents(identifiers).items()
        }
        paths = publishable(identifiers, parents, force=force)

        # list files in b2 bucket
        # TODO do this in parallel with util.find_meta_files?
        b2_files = []
        if backblaze:
            logger.debug(
                f'Checking Backblaze for uploaded files ({backblaze.bucketname})'
            )
            b2_files = backblaze.list_files(folder=ci.id)
            logger.debug(f'{len(b2_files)} files')

        skipped = 0
        successful = 0
        bad_paths = []

        num = len(paths)
        for n, path in enumerate(paths):
            # each `path` is a dict from publishable(): path/identifier/action/note
            oi = path.get('identifier')
            if not oi:
                path['note'] = 'No identifier'
                bad_paths.append(path)
                continue
            try:
                document = oi.object()
            except Exception as err:
                path['note'] = f'Could not instantiate: {err}'
                bad_paths.append(path)
                continue
            if not document:
                path['note'] = 'No document'
                bad_paths.append(path)
                continue

            # see if file uploaded to Backblaze
            b2_synced = False
            b2str = ''
            if (oi.model == 'file') and b2_files:
                dir_filename = str(ci_path / Path(document.path).name)
                if dir_filename in b2_files:
                    b2_synced = True
                    b2str = '(b2)'
                    b2_files.remove(dir_filename)

            # TODO write logs instead of print
            now = datetime.now(config.TZ)
            action = path['action']
            path_note = path['note'].strip()
            print(f'{now} | {n+1}/{num} {action} {oi.id} {path_note}{b2str}')

            # see if document exists already; remember its version so we can
            # tell a create from an update after posting
            existing_v = None
            d = self.get(model=oi.model,
                         es_class=ELASTICSEARCH_CLASSES_BY_MODEL[oi.model],
                         document_id=oi.id)
            if d:
                existing_v = d.meta.version

            # post document
            if path['action'] == 'POST':
                try:
                    self.post(document,
                              parents=parents,
                              b2=b2_synced,
                              force=True)
                except Exception as err:
                    # best-effort: log the failure and let the version check
                    # below classify this path as "not created"
                    traceback.print_exc()
                # force=True bypasses publishable in post() function
            # delete previously published items now marked incomplete/private
            elif existing_v and (path['action'] == 'SKIP'):
                print('%s | %s/%s DELETE' %
                      (datetime.now(config.TZ), n + 1, num))
                self.delete(oi.id)

            if path['action'] == 'SKIP':
                skipped += 1
                continue

            # version is incremented with each updated
            posted_v = None
            # for e.g. segment the ES doc_type will be 'entity' but oi.model is 'segment'
            d = self.get(model=oi.model,
                         es_class=ELASTICSEARCH_CLASSES_BY_MODEL[oi.model],
                         document_id=oi.id)
            if d:
                posted_v = d.meta.version

            # success: created, or version number incremented
            status = 'ERROR - unspecified'
            if posted_v and not existing_v:
                status = 'CREATED'
                successful += 1
            elif (existing_v and posted_v) and (existing_v < posted_v):
                status = 'UPDATED'
                successful += 1
            elif not posted_v:
                status = 'ERROR: not created'
                bad_paths.append(path)
                print(status)

        logger.debug('INDEXING COMPLETED')
        return {
            'total': len(paths),
            'skipped': skipped,
            'successful': successful,
            'bad': bad_paths
        }