def object_metadata(module, repo_path):
    """Metadata for the ddrlocal/ddrcmdln and models definitions used.

    Populates config.APP_METADATA on first call; later calls return the
    cached dict unchanged.

    @param module: collection, entity, files model definitions module
    @param repo_path: Absolute path to root of object's repo
    @returns: dict
    """
    if not config.APP_METADATA:
        repo = dvcs.repository(repo_path)
        defs_path = modules.Module(module).path
        config.APP_METADATA.update({
            'git_version': '; '.join([
                dvcs.git_version(repo),
                dvcs.annex_version(repo)
            ]),
            # ddr-cmdln
            'application': 'https://github.com/densho/ddr-cmdln.git',
            'app_path': config.INSTALL_PATH,
            'app_commit': dvcs.latest_commit(config.INSTALL_PATH),
            'app_release': VERSION,
            # ddr-defs
            'defs_path': defs_path,
            'defs_commit': dvcs.latest_commit(defs_path),
        })
    return config.APP_METADATA
def test_Module_is_valid():
    """Module.is_valid() flags missing repo, missing FIELDS, bad FIELDS type."""
    class TestModule0(object):
        __name__ = 'TestModule0'
        __file__ = ''
    class TestModule1(object):
        __name__ = 'TestModule1'
        __file__ = 'ddr/repo_models'
    class TestModule2(object):
        __name__ = 'TestModule2'
        __file__ = 'ddr/repo_models'
        FIELDS = 'not a list'
    class TestModule3(object):
        __name__ = 'TestModule3'
        __file__ = 'ddr/repo_models'
        FIELDS = ['fake fields']
    cases = [
        (TestModule0(), (False, "TestModule0 not in 'ddr' Repository repo.")),
        (TestModule1(), (False, 'TestModule1 has no FIELDS variable.')),
        (TestModule2(), (False, 'TestModule2.FIELDS is not a list.')),
        (TestModule3(), (True, 'ok')),
    ]
    for instance, expected in cases:
        assert modules.Module(instance).is_valid() == expected
def test_Module_document_commit():
    """document_commit() extracts the commit hash from object_metadata."""
    module = TestModule()
    # metadata present: the leading hash is parsed out of the commit line
    with_commit = TestDocument()
    with_commit.object_metadata = {
        "models_commit":
        "20dd4e2096e6f9a9eb7c2db52907b094f41f58de 2015-10-13 17:08:43 -0700",
    }
    result = modules.Module(module).document_commit(with_commit)
    assert result == '20dd4e2096e6f9a9eb7c2db52907b094f41f58de'
    # metadata empty: no commit to report
    without_commit = TestDocument()
    without_commit.object_metadata = {}
    assert modules.Module(module).document_commit(without_commit) is None
def load_csv(self, rowd):
    """Populate File data from CSV-formatted row data.

    @param rowd: dict Headers/row cells for one line of a CSV file.
    @returns: list of changed fields
    """
    # remove 'id' from rowd because files.FIELDS has no 'id' field
    # TODO files.FIELDS really should have an ID field...
    if 'id' in rowd:
        rowd.pop('id')
    module = modules.Module(self.identifier.fields_module())
    modified = common.load_csv(self, module, rowd)
    # fill in the blanks
    if self.access_rel:
        access_abs = os.path.join(self.entity_files_path, self.access_rel)
        if os.path.exists(access_abs):
            self.access_abs = access_abs
    # Identifier does not know file extension
    def add_extension(path, ext):
        # Append ext only if path does not already have an extension.
        # BUG FIX: the original shadowed the ext parameter with splitext's
        # result (always '' here), so `path + ext` appended nothing and the
        # extension was never added.
        existing = os.path.splitext(path)[1]
        if not existing:
            return path + ext
        return path
    self.ext = os.path.splitext(self.basename_orig)[1]
    self.path_abs = add_extension(self.path_abs, self.ext)
    self.path_rel = add_extension(self.path_rel, self.ext)
    self.basename = add_extension(self.basename, self.ext)
    # fix access_rel
    self.access_rel = os.path.join(
        os.path.dirname(self.path_rel),
        os.path.basename(self.access_abs)
    )
    return modified
def dump_csv(self, fields=[]):
    """Dump Entity data to CSV-ready values.

    @param fields: list Optional subset of field names to export.
    @returns: output of common.prep_csv for this object
    """
    fields_module = self.identifier.fields_module()
    wrapped = modules.Module(fields_module)
    return common.prep_csv(self, wrapped, fields=fields)
def choices(self, field_name):
    """Returns controlled-vocab choices for specified field, if any

    @param field_name: str
    @returns: list or None
    """
    fields_module = self.identifier.fields_module()
    wrapped = modules.Module(fields_module)
    return wrapped.field_choices(field_name)
def load_csv(self, rowd):
    """Populate Entity data from CSV-formatted row data.

    @param rowd: dict Headers/row cells for one line of a CSV file.
    @returns: list of changed fields
    """
    module = modules.Module(self.identifier.fields_module())
    modified = common.load_csv(self, module, rowd)
    # special cases
    if not hasattr(self, 'record_created'):
        # first import of this record: stamp creation time
        self.record_created = datetime.now(config.TZ)
    if modified and hasattr(self, 'record_lastmod'):
        # something changed: bump last-modified timestamp
        self.record_lastmod = datetime.now(config.TZ)
    self.rm_file_duplicates()
    return modified
def test_Module_labels_values():
    """labels_values() pairs each field value with its human-readable label."""
    module = TestModule()
    document = TestDocument()
    rows = [
        {'id': 'ddr-test-123'},
        {'modified': '2015-10-20T15:42:26'},
        {'title': 'labels_values'},
    ]
    models.common.load_json(document, module, json.dumps(rows))
    expected = [
        {'label': 'Object ID', 'value': u'ddr-test-123'},
        {'label': 'Last Modified', 'value': u'2015-10-20T15:42:26'},
        {'label': 'Title', 'value': u'labels_values'},
    ]
    assert modules.Module(module).labels_values(document) == expected
def test_Module_path():
    """Module.path maps a compiled .pyc back to its .py source path."""
    class FakeModule(object):
        pass
    fake = FakeModule()
    fake.__file__ = '/var/www/media/base/ddr/repo_models/testmodule.pyc'
    expected = '/var/www/media/base/ddr/repo_models/testmodule.py'
    assert modules.Module(fake).path == expected
def test_Module_function():
    """Module.function() looks up a named function and calls it with the value."""
    class FakeModule(object):
        def hello(self, text):
            return 'hello %s' % text
    fake = FakeModule()
    fake.__file__ = 'ddr/repo_models'
    result = modules.Module(fake).function('hello', 'world')
    assert result == 'hello world'
def object_metadata(module, repo_path):
    """Metadata for the ddrlocal/ddrcmdln and models definitions used.

    @param module: collection, entity, files model definitions module
    @param repo_path: Absolute path to root of object's repo
    @returns: dict
    """
    repo = dvcs.repository(repo_path)
    defs_path = modules.Module(module).path
    return {
        'application': 'https://github.com/densho/ddr-cmdln.git',
        'app_commit': dvcs.latest_commit(config.INSTALL_PATH),
        'app_release': VERSION,
        'defs_path': defs_path,
        'models_commit': dvcs.latest_commit(defs_path),
        'git_version': '; '.join(
            [dvcs.git_version(repo), dvcs.annex_version(repo)]
        ),
    }
def dump_csv(self, fields=None):
    """Dump File data to list of values suitable for CSV.

    @param fields: list Optional subset of field names to export.
    @returns: list of values
    """
    # BUG FIX: the original signature used a mutable default (fields=[]) and
    # mutated it with fields.insert(0, 'id'), which also mutated any list the
    # caller passed in.  Copy instead; result values are unchanged.
    fields = list(fields) if fields else []
    # make sure we export 'id' if it's not in model FIELDS (ahem, files)
    if 'id' not in fields:
        fields.insert(0, 'id')
    module = modules.Module(self.identifier.fields_module())
    if self.basename and not self.mimetype:
        self.mimetype = self.get_mimetype(force=True)
    return common.prep_csv(self, module, fields=fields)
def _filter_fields(i, data):
    """Run index_* functions on data

    @param i: Identifier
    @param data: dict
    @returns: dict data (modified in place)
    """
    module = i.fields_module()
    # hoisted: wrapping the module is loop-invariant; no need to rebuild
    # a modules.Module per field
    wrapped = modules.Module(module)
    for field in module.FIELDS:
        fieldname = field['name']
        # run index_* functions on field data if present
        # NOTE(review): assumes data has a key for every module.FIELDS name;
        # a missing key raises KeyError, as in the original
        data[fieldname] = wrapped.function(
            'index_%s' % fieldname, data[fieldname])
    return data
def load_json(document, module, json_text):
    """Populates object from JSON-formatted text; applies jsonload_{field} functions.

    Goes through module.FIELDS turning data in the JSON file into
    object attributes.
    TODO content fields really should into OBJECT.data OrderedDict or subobject.

    @param document: Collection/Entity/File object.
    @param module: collection/entity/file module from 'ddr' repo.
    @param json_text: JSON-formatted text
    @returns: dict The parsed JSON data (or placeholder error data).
    """
    try:
        json_data = json.loads(json_text)
    except ValueError:
        # unreadable JSON: substitute placeholder rows so the error is visible
        # in the loaded object instead of crashing
        json_data = [
            {'title': 'ERROR: COULD NOT READ DATA (.JSON) FILE!'},
            {'_error': 'Error: ValueError during read load_json.'},
        ]
    # software and commit metadata
    # (the first matching dict in the list describes the app state when the
    # file was written; it is attached whole, not split into fields)
    for field in json_data:
        if is_object_metadata(field):
            setattr(document, 'object_metadata', field)
            break
    # field values from JSON
    # NOTE: Python 2 idiom -- f.keys()[0] assumes each item in json_data is a
    # single-key dict of the form {fieldname: value}
    for mf in module.FIELDS:
        for f in json_data:
            if hasattr(f, 'keys') and (f.keys()[0] == mf['name']):
                fieldname = f.keys()[0]
                # run jsonload_* functions on field data if present
                field_data = modules.Module(module).function(
                    'jsonload_%s' % fieldname, f.values()[0])
                if isinstance(field_data, basestring):
                    field_data = field_data.strip()
                setattr(document, fieldname, field_data)
    # Fill in missing fields with default values from module.FIELDS.
    # Note: should not replace fields that are just empty.
    for mf in module.FIELDS:
        if not hasattr(document, mf['name']):
            setattr(document, mf['name'], mf.get('default', None))
    # Add timezone to fields if not present
    apply_timezone(document, module)
    return json_data
def dump_json(obj, module, template=False,
              template_passthru=['id', 'record_created', 'record_lastmod'],
              exceptions=[]):
    """Arranges object data in list-of-dicts format before serialization.

    DDR keeps data in Git is to take advantage of versioning.  Python
    dicts store data in random order which makes it impossible to
    meaningfully compare diffs of the data over time.  DDR thus stores
    data as an alphabetically arranged list of dicts, with several
    exceptions.

    The first dict in the list is not part of the object itself but
    contains metadata about the state of the DDR application at the time
    the file was last written: the Git commit of the app, the release
    number, and the versions of Git and git-annex used.

    Python data types that cannot be represented in JSON (e.g. datetime)
    are converted into strings.

    NOTE: template_passthru and exceptions are defaults that are only read,
    never mutated.

    @param obj: Collection/Entity/File object.
    @param module: modules.Module
    @param template: Boolean True if object to be used as blank template.
    @param template_passthru: list
    @param exceptions: list Field names to omit from the output.
    @returns: list of dicts
    """
    data = []
    # hoisted: the Module wrapper is loop-invariant
    wrapped = modules.Module(module)
    for mf in module.FIELDS:
        item = {}
        fieldname = mf['name']
        field_data = ''
        # BUG FIX: mf is a dict, so the original hasattr(mf, 'form') was
        # always False and template defaults were never written; test dict
        # membership instead (consistent with f.get('form', None) in
        # form_prep/form_post).
        if template and (fieldname not in template_passthru) and mf.get('form'):
            # write default values
            field_data = mf['form']['initial']
        elif hasattr(obj, fieldname):
            # run jsondump_* functions on field data if present
            field_data = wrapped.function(
                'jsondump_%s' % fieldname, getattr(obj, fieldname))
        item[fieldname] = field_data
        if fieldname not in exceptions:
            data.append(item)
    return data
def test_Module_cmp_model_definition_fields():
    """cmp_model_definition_fields() reports fields added to / removed from a document."""
    module = TestModule()
    module.FIELDS = [
        {'name': 'id'},
        {'name': 'modified'},
        {'name': 'title'},
    ]
    m = modules.Module(module)
    data = [
        {},  # object_metadata
        {'id': 'ddr-test-123'},
        {'modified': '2015-10-20T15:42:26'},
        {'title': 'labels_values'},
    ]
    # document matches model exactly
    out0 = m.cmp_model_definition_fields(json.dumps(data))
    # document has a field the model lacks
    data.append({'new': 'new field'})
    out1 = m.cmp_model_definition_fields(json.dumps(data))
    # model has a field the document lacks
    data.pop()  # rm new
    data.pop()  # rm title
    out2 = m.cmp_model_definition_fields(json.dumps(data))
    assert out0 == {'removed': [], 'added': []}
    assert out1 == {'removed': [], 'added': ['new']}
    assert out2 == {'removed': ['title'], 'added': []}
def form_prep(document, module):
    """Apply formprep_{field} functions to prep data dict to pass into DDRForm object.

    Certain fields require special processing.  Data may need to be
    massaged and prepared for insertion into particular Django form
    objects.  If a "formprep_{field}" function is present in the
    collectionmodule it will be executed.

    @param document: Collection, Entity, File document object
    @param module: collection, entity, files model definitions module
    @returns data: dict object as used by Django Form object.
    """
    data = {}
    for f in module.FIELDS:
        fieldname = f['name']
        # skip fields the document lacks or that have no form definition
        if not (hasattr(document, fieldname) and f.get('form', None)):
            continue
        # run formprep_* functions on field data if present
        data[fieldname] = modules.Module(module).function(
            'formprep_%s' % fieldname,
            getattr(document, fieldname)
        )
    return data
def form_post(document, module, cleaned_data):
    """Apply formpost_{field} functions to process cleaned_data from CollectionForm

    Certain fields require special processing.  If a "formpost_{field}"
    function is present in the entitymodule it will be executed.
    NOTE: cleaned_data must contain items for all module.FIELDS.

    @param document: Collection, Entity, File document object
    @param module: collection, entity, files model definitions module
    @param cleaned_data: dict cleaned_data from DDRForm
    """
    for f in module.FIELDS:
        fieldname = f['name']
        # skip fields the document lacks or that have no form definition
        if not (hasattr(document, fieldname) and f.get('form', None)):
            continue
        # run formpost_* functions on field data if present
        field_data = modules.Module(module).function(
            'formpost_%s' % fieldname,
            cleaned_data[fieldname]
        )
        setattr(document, fieldname, field_data)
    # update record_lastmod
    if hasattr(document, 'record_lastmod'):
        document.record_lastmod = datetime.now(config.TZ)
def labels_values(self):
    """Apply display_{field} functions to prep object data for the UI.

    @returns: output of modules.Module.labels_values for this object
    """
    fields_module = self.identifier.fields_module()
    return modules.Module(fields_module).labels_values(self)
def post(self, document, public_fields=[], additional_fields={}, parents={}, force=False):
    """Add a new document to an index or update an existing one.

    This function can produce ElasticSearch documents in two formats:
    - old-style list-of-dicts used in the DDR JSON files.
    - normal dicts used by ddr-public.

    DDR metadata JSON files are structured as a list of fieldname:value dicts.
    This is done so that the fields are always in the same order, making it
    possible to easily see the difference between versions of a file.
    [IMPORTANT: documents MUST contain an 'id' field!]

    In ElasticSearch, documents are structured in a normal dict so that faceting
    works properly.

    curl -XPUT 'http://localhost:9200/ddr/collection/ddr-testing-141' -d '{ ... }'

    NOTE(review): public_fields and additional_fields are accepted but never
    used in this body -- confirm whether callers still rely on them.

    @param document: Collection,Entity,File The object to post.
    @param public_fields: list
    @param additional_fields: dict
    @param parents: dict Basic metadata for parent documents.
    @param force: boolean Bypass status and public checks.
    @returns: JSON dict with status code and response
    """
    logger.debug('post(%s, %s, %s)' % (self.indexname, document, force))
    if force:
        publishable = True
    else:
        # determine publishability from the status of the document's parents
        if not parents:
            parents = _parents_status([document.identifier.path_abs()])
        publishable = _publishable([document.identifier.path_abs()], parents)
    if not publishable:
        return {'status': 403, 'response': 'object not publishable'}
    # instantiate appropriate subclass of ESObject / DocType
    # TODO Devil's advocate: why are we doing this? We already have the object.
    ES_Class = ELASTICSEARCH_CLASSES_BY_MODEL[document.identifier.model]
    d = ES_Class()
    fields_module = document.identifier.fields_module()
    d.meta.id = document.identifier.id
    for fieldname in doctype_fields(ES_Class):
        # index_* for complex fields
        if hasattr(fields_module, 'index_%s' % fieldname):
            field_data = modules.Module(fields_module).function(
                'index_%s' % fieldname,
                getattr(document, fieldname),
            )
        # everything else
        else:
            try:
                field_data = getattr(document, fieldname)
            except AttributeError as err:
                # document simply lacks this field; skip it below
                field_data = None
        if field_data:
            setattr(d, fieldname, field_data)
    # Add parts of id (e.g. repo, org, cid) to document as separate fields.
    for key in ['repo', 'org', 'cid', 'eid', 'sid', 'role', 'sha1']:
        setattr(d, key, document.identifier.parts.get(key, ''))
    d.collection_id = document.identifier.collection_id()
    if d.collection_id and (d.collection_id != document.identifier.id):
        # we don't want file-role (a stub) as parent
        d.parent_id = document.identifier.parent_id(stubs=0)
    else:
        # but we do want repository,organization (both stubs)
        d.parent_id = document.identifier.parent_id(stubs=1)
    logger.debug('saving')
    status = d.save(using=self.es, index=self.indexname)
    logger.debug(str(status))
    return status
def test_Module_parse_commit():
    """_parse_commit() returns just the hash from a 'hash date tz' line."""
    module = TestModule()
    commit_line = '95a3a0ed3232990ee8fbbc3065a11316bccd0b35 2015-03-26 15:49:58 -0700'
    parsed = modules.Module(module)._parse_commit(commit_line)
    assert parsed == '95a3a0ed3232990ee8fbbc3065a11316bccd0b35'
def to_esobject(self, public_fields=[], public=True):
    """Returns an Elasticsearch DSL version of the object

    @param public_fields: list
    @param public: boolean If True, hide fields not marked public.
    @returns: subclass of repo_models.elastic.ESObject
    """
    # instantiate appropriate subclass of ESObject / DocType
    # TODO Devil's advocate: why are we doing this? We already have the object.
    ES_Class = ELASTICSEARCH_CLASSES_BY_MODEL[self.identifier.model]
    fields_module = self.identifier.fields_module()
    if not public_fields:
        # default to the fields marked public in the model definitions
        public_fields = [
            f['name']
            for f in fields_module.FIELDS
            if f['elasticsearch']['public']
        ]
    # pick the image path shown in search results, in priority order:
    # HTML transcript, access file, collection signature
    img_path = ''
    if hasattr(self, 'mimetype') and (self.mimetype == 'text/html'):
        # TODO knows too much!!!
        img_path = os.path.join(
            self.identifier.collection_id(),
            '%s.htm' % self.id,
        )
    elif hasattr(self, 'access_rel'):
        img_path = os.path.join(
            self.identifier.collection_id(),
            os.path.basename(self.access_rel),
        )
    elif self.signature_id:
        img_path = os.path.join(
            self.identifier.collection_id(),
            access_filename(self.signature_id),
        )
    # only files get a download link
    download_path = ''
    if (self.identifier.model in ['file']):
        download_path = os.path.join(
            self.identifier.collection_id(),
            '%s%s' % (self.id, self.ext),
        )
    d = ES_Class()
    d.meta.id = self.identifier.id
    d.id = self.identifier.id
    d.model = self.identifier.model
    if self.identifier.collection_id() != self.identifier.id:
        # we don't want file-role (a stub) as parent
        d.parent_id = self.identifier.parent_id(stubs=0)
    else:
        # but we do want repository,organization (both stubs)
        d.parent_id = self.identifier.parent_id(stubs=1)
    d.organization_id = self.identifier.organization_id()
    d.collection_id = self.identifier.collection_id()
    d.signature_id = self.signature_id
    if hasattr(self, 'ddrpublic_template_key'):
        signature, template_key = self.ddrpublic_template_key()
        if template_key:
            d.template = template_key
    # ID components (repo, org, cid, ...) as separate fields
    idparts = deepcopy(self.identifier.idparts)
    idparts.pop('model')
    for k in ID_COMPONENTS:
        setattr(d, k, '')  # ensure all fields present
    # Python 2 dict iteration
    for k, v in idparts.iteritems():
        setattr(d, k, v)
    # links
    d.links_html = self.identifier.id
    d.links_json = self.identifier.id
    d.links_parent = self.identifier.parent_id(stubs=True)
    d.links_children = self.identifier.id
    d.links_img = img_path
    d.links_thumb = img_path
    # title,description
    # (objects without a title fall back to their label)
    if hasattr(self, 'title'):
        d.title = self.title
    else:
        d.title = self.label
    if hasattr(self, 'description'):
        d.description = self.description
    else:
        d.description = ''
    # breadcrumbs
    d.lineage = [
        {
            'id': i.id,
            'model': i.model,
            'idpart': str(MODELS_IDPARTS[i.model][-1][-1]),
            'label': str(i.idparts[MODELS_IDPARTS[i.model][-1][-1]]),
        }
        for i in self.identifier.lineage(stubs=0)
    ]
    # module-specific fields
    if hasattr(ES_Class, 'list_fields'):
        setattr(d, '_fields', ES_Class.list_fields())
    # module-specific fields
    for fieldname in docstore.doctype_fields(ES_Class):
        # hide non-public fields if this is public
        if public and (fieldname not in public_fields):
            continue
        # complex fields use repo_models.MODEL.index_FIELD if present
        if hasattr(fields_module, 'index_%s' % fieldname):
            field_data = modules.Module(fields_module).function(
                'index_%s' % fieldname,
                getattr(self, fieldname),
            )
        else:
            try:
                field_data = getattr(self, fieldname)
            except AttributeError as err:
                # object simply lacks this field; skip it below
                field_data = None
        if field_data:
            setattr(d, fieldname, field_data)
    # "special" fields
    if (self.identifier.model in ['entity', 'segment']):
        # TODO find a way to search on creators.id
        # narrator_id
        # NOTE(review): bare except deliberately swallows malformed creators
        # entries (best-effort); only the last creator with an 'id' wins
        for c in self.creators:
            try:
                d.narrator_id = c['id']
            except:
                pass
        # topics & facility are too hard to search as nested objects
        # so attach extra 'topics_id' and 'facility_id' fields
        d.topics_id = [item['id'] for item in self.topics]
        d.facility_id = [item['id'] for item in self.facility]
    if (self.identifier.model in ['segment']):
        d.ia_meta = archivedotorg.download_segment_meta(self.identifier.id)
    if (self.identifier.model in ['file']):
        if download_path:
            d.links_download = download_path
    return d