def create(self, directory):
    """
    Generate a single DatasetRecord from the given directory, provided it
    conforms to one of the configured templates: rootDirectory + subDirs.

    :param directory: absolute path of a dataset directory under rootDirectory
    :return: a DatasetRecord, or None if the directory does not exist,
             lies outside rootDirectory, or matches no sub-directory template
    """
    if os.path.isdir(directory):
        if directory.startswith(self.rootDirectory):
            # strip the rootDirectory prefix only once, from the front
            # (replace() would remove EVERY occurrence of the root string,
            # corrupting paths that happen to repeat it), then split the
            # remaining relative path into sub-directories
            directory = directory[len(self.rootDirectory):]
            if directory.startswith("/"):
                directory = directory[1:]
            parts = directory.split(os.sep)
            for subDirs in self.subDirs:
                if len(parts) == len(subDirs):
                    print('Parsing directory: %s' % directory)
                    # dataset-level metadata: start from the fixed fields,
                    # let configured parsers override them
                    metadata = self.fields.copy()
                    dirpath = os.path.join(self.rootDirectory, directory)
                    for parser in self.metadataParsers:
                        met = parser.parseMetadata(dirpath)
                        metadata.update(met)  # NOTE: met items override metadata items
                    # dataset-level access URLs
                    urls = generateUrls(self.baseUrls, self.rootDirectory, directory)
                    if len(urls) > 0:
                        metadata["url"] = urls
                    # build Dataset id, title from sub-directory structure
                    title = ""
                    identifier = self.rootId
                    for subDir in subDirs:
                        if len(title) > 0:
                            title += ", "
                        title += "%s=%s" % (subDir.capitalize(), metadata[subDir][0])
                        identifier += ".%s" % metadata[subDir][0]
                    # build 'id', 'instance_id', 'master_id'
                    # ('datasetId' rather than 'id', which shadows the builtin)
                    datasetId = generateId(identifier, metadata, addVersion=self.addVersion)
                    # optional mapping of metadata values, in place
                    if self.metadataMapper is not None:
                        for key, values in metadata.items():
                            for i, value in enumerate(values):
                                values[i] = self.metadataMapper.mappit(key, value)
                    # create and return one Dataset record
                    return DatasetRecord(datasetId, title, metadata)
    # no Dataset record created - return None
    print("Directory %s does NOT match any sub-directory template" % directory)
    return None
def create(self, datasetRecord, filepath):
    """
    Generate a FileRecord for the given file, as a child of the given
    DatasetRecord.

    :param datasetRecord: parent DatasetRecord - must expose fields[MASTER_ID]
    :param filepath: absolute path of the file to publish
    :return: a FileRecord, or None if the filename does not match the
             configured pattern or the file is older than maxDaysPast days
    :raise Exception: if filepath is not a regular file
    """
    # guard clause: fail fast on non-files (original nested everything
    # under the if and raised in a trailing else)
    if not os.path.isfile(filepath):
        raise Exception("%s is not a file" % filepath)
    # 'directory' rather than 'dir', which shadows the builtin
    directory, filename = os.path.split(filepath)
    if self._matches(filename):
        # optional limit on file age
        lastModDateTime = dt.datetime.fromtimestamp(os.path.getmtime(filepath))
        if self.maxDaysPast < 0 or (dt.datetime.now() - lastModDateTime).days <= self.maxDaysPast:
            name, extension = os.path.splitext(filename)
            ext = extension[1:]  # remove '.' from file extension
            fields = {}
            fields['format'] = [ext]
            isImage = False
            for subtype, values in FILE_SUBTYPES.items():
                if ext in values:
                    fields['subtype'] = [subtype]
                    if subtype == SUBTYPE_IMAGE:
                        isImage = True
            # create image thumbnail ?
            if self.generateThumbnails and isImage:
                thumbnailPath = os.path.join(directory, "%s.%s" % (name, THUMBNAIL_EXT))
                self._generateThumbnail(filepath, thumbnailPath)
            # add file access URLs
            urls = generateUrls(self.baseUrls, self.rootDirectory, filepath, isImage=isImage)
            if len(urls) > 0:
                fields["url"] = urls
            # file-level metadata: fixed fields, then file fields, then
            # parser output (later sources override earlier ones)
            metadata = self.fields.copy()
            metadata.update(fields)
            for parser in self.metadataParsers:
                met = parser.parseMetadata(filepath)
                metadata.update(met)  # NOTE: met items override metadata items
            # build 'id', 'instance_id', 'master_id';
            # start from dataset 'master_id' since it carries no
            # version/data_node information
            identifier = '.'.join([datasetRecord.fields[MASTER_ID][0], filename])
            fileId = generateId(identifier, metadata)
            # rename a 'title' global attribute to 'file title' if found;
            # the record title itself is always the filename
            try:
                metadata['file title'] = [metadata['title'][0]]
                del metadata['title']
            except KeyError:
                pass
            title = filename
            # file size
            metadata[SIZE] = [os.path.getsize(filepath)]
            # create Md5 checksum ?
            if self.generateChecksum:
                logging.debug('Computing Md5 checksum for file: %s ...' % filepath)
                md5 = md5_for_file(filepath, hr=True)
                logging.debug('...Md5 checksum=%s' % md5)
                metadata[CHECKSUM] = [md5]
                metadata[CHECKSUM_TYPE] = ['MD5']
            # generate tracking ID ?
            if self.generateTrackingId:
                metadata[TRACKING_ID] = [str(uuid4())]
            # optional mapping of metadata values, in place
            if self.metadataMapper is not None:
                for key, values in metadata.items():
                    for i, value in enumerate(values):
                        values[i] = self.metadataMapper.mappit(key, value)
            return FileRecord(datasetRecord, fileId, title, metadata)
    # no record created
    return None