def generate_datafile(path, dataset, content=None, size=-1,
                      verify=True, verified=True):
    '''Generates a datafile AND a replica to hold its contents'''
    from tardis.tardis_portal.models import Dataset_File, Replica, Location
    saved = settings.REQUIRE_DATAFILE_CHECKSUMS
    settings.REQUIRE_DATAFILE_CHECKSUMS = False
    try:
        datafile = Dataset_File()
        if content:
            datafile.size = str(len(content))
        else:
            datafile.size = str(size)
        # Normally we use any old string for the datafile path, but some
        # tests require the path to be the same as what 'staging' would use
        if path is None:
            datafile.dataset_id = dataset.id
            datafile.save()
            path = "%s/%s/%s" % (dataset.get_first_experiment().id,
                                 dataset.id,
                                 datafile.id)
        filepath = os.path.normpath(FILE_STORE_PATH + '/' + path)
        if content:
            try:
                os.makedirs(os.path.dirname(filepath))
                os.remove(filepath)
            except OSError:
                # The directory may already exist, or there may be no
                # stale file to remove; either way we can carry on.
                pass
            gen_file = open(filepath, 'wb+')
            gen_file.write(content)
            gen_file.close()
        datafile.mimetype = "application/unspecified"
        datafile.filename = os.path.basename(filepath)
        datafile.dataset_id = dataset.id
        datafile.save()
        location = _infer_location(path)
        replica = Replica(datafile=datafile, url=path, protocol='',
                          location=location)
        if verify and content:
            if not replica.verify(allowEmptyChecksums=True):
                raise RuntimeError('verify failed!?!')
        else:
            replica.verified = verified
        replica.save()
        return (datafile, replica)
    finally:
        settings.REQUIRE_DATAFILE_CHECKSUMS = saved
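# The save/restore dance around REQUIRE_DATAFILE_CHECKSUMS above could be
# factored into a small context manager; this is a sketch only, assuming
# plain attribute assignment on `settings` is acceptable here (Django's
# django.test.utils.override_settings is the usual alternative in tests).
from contextlib import contextmanager
from django.conf import settings

@contextmanager
def _checksums_not_required():
    saved = settings.REQUIRE_DATAFILE_CHECKSUMS
    settings.REQUIRE_DATAFILE_CHECKSUMS = False
    try:
        yield
    finally:
        # Restore the original setting even if the body raises
        settings.REQUIRE_DATAFILE_CHECKSUMS = saved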
def generate_datafile(path, dataset, content=None, size=-1,
                      verify=True, verified=True,
                      verify_checksums_req=False):
    '''Generates a datafile AND a replica to hold its contents'''
    from tardis.tardis_portal.models import Dataset_File, Replica, Location
    saved = settings.REQUIRE_DATAFILE_CHECKSUMS
    settings.REQUIRE_DATAFILE_CHECKSUMS = False
    try:
        datafile = Dataset_File()
        if content:
            datafile.size = str(len(content))
        else:
            datafile.size = str(size)
        # Normally we use any old string for the datafile path, but some
        # tests require the path to be the same as what 'staging' would use
        if path is None:
            datafile.dataset_id = dataset.id
            datafile.save()
            path = "%s/%s/%s" % (dataset.get_first_experiment().id,
                                 dataset.id,
                                 datafile.id)
        filepath = os.path.normpath(settings.FILE_STORE_PATH + '/' + path)
        if content:
            try:
                os.makedirs(os.path.dirname(filepath))
                os.remove(filepath)
            except OSError:
                # The directory may already exist, or there may be no
                # stale file to remove; either way we can carry on.
                pass
            gen_file = open(filepath, 'wb+')
            gen_file.write(content)
            gen_file.close()
        datafile.mimetype = "application/unspecified"
        datafile.filename = os.path.basename(filepath)
        datafile.dataset_id = dataset.id
        datafile.save()
        settings.REQUIRE_DATAFILE_CHECKSUMS = verify_checksums_req
        location = _infer_location(path)
        replica = Replica(datafile=datafile, url=path, protocol='',
                          location=location)
        if verify and content:
            if not replica.verify():
                raise RuntimeError('verify failed!?!')
        replica.save()
        replica.verified = verified
        replica.save(update_fields=['verified'])  # force no verification
        return (datafile, replica)
    finally:
        settings.REQUIRE_DATAFILE_CHECKSUMS = saved
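# A minimal usage sketch for the helper above, as a test might call it.
# The `dataset` argument is assumed to be an existing fixture with at
# least one experiment attached; everything else here is illustrative.
def _example_generate_datafile_usage(dataset):
    # path=None makes the helper derive a staging-style
    # "<experiment>/<dataset>/<datafile>" path itself
    datafile, replica = generate_datafile(None, dataset,
                                          content=b'hello world',
                                          verify=True)
    assert replica.verified  # verify() ran against the written content
    return datafile, replica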
def add_staged_file_to_dataset(rel_filepath, dataset_id, username,
                               mimetype="application/octet-stream"):
    """
    Add a file in the user's staging path to a dataset.

    May be replaced by main code functions; this is a quick and dirty
    hack to get it working.
    """
    originfilepath = os.path.join(get_full_staging_path(username),
                                  rel_filepath)
    dataset = Dataset.objects.get(pk=dataset_id)
    newDatafile = Dataset_File()
    newDatafile.dataset = dataset
    newDatafile.size = os.path.getsize(originfilepath)
    newDatafile.protocol = "tardis"
    newDatafile.mimetype = mimetype
    file_dir = "/" + str(dataset.experiment.id) + "/" + str(dataset.id) + "/"
    file_path = file_dir + rel_filepath
    prelim_full_file_path = settings.FILE_STORE_PATH + file_path
    full_file_path = duplicate_file_check_rename(prelim_full_file_path)
    newDatafile.filename = os.path.basename(full_file_path)
    newDatafile.url = "%s://%s" % (
        newDatafile.protocol,
        full_file_path[len(settings.FILE_STORE_PATH) + len(file_dir):])
    if not os.path.exists(os.path.dirname(full_file_path)):
        os.makedirs(os.path.dirname(full_file_path))
    shutil.move(originfilepath, full_file_path)
    newDatafile.save()
def testRemoteFile(self):
    content = urandom(1024)
    with NamedTemporaryFile() as f:
        # Create new Datafile
        datafile = Dataset_File(dataset=self.dataset)
        datafile.filename = 'background_task_testfile'
        datafile.size = len(content)
        datafile.sha512sum = hashlib.sha512(content).hexdigest()
        datafile.url = 'file://' + path.abspath(f.name)
        datafile.save()

        def get_datafile(datafile):
            return Dataset_File.objects.get(id=datafile.id)

        # Check that it won't verify as it stands
        expect(get_datafile(datafile).verified).to_be(False)
        verify_files()
        expect(get_datafile(datafile).verified).to_be(False)
        expect(get_datafile(datafile).is_local()).to_be(False)

        # Fill in the content
        f.write(content)
        f.flush()

        # Check it now verifies
        verify_files()
        expect(get_datafile(datafile).verified).to_be(True)
        expect(get_datafile(datafile).is_local()).to_be(True)
def testLocalFile(self):
    content = urandom(1024)
    cf = ContentFile(content, 'background_task_testfile')

    # Create new Datafile
    datafile = Dataset_File(dataset=self.dataset)
    datafile.filename = cf.name
    datafile.size = len(content)
    datafile.sha512sum = hashlib.sha512(content).hexdigest()
    datafile.save()
    replica = Replica(datafile=datafile,
                      url=write_uploaded_file_to_dataset(self.dataset, cf),
                      location=Location.get_default_location())
    replica.save()

    def get_replica(datafile):
        return Replica.objects.get(datafile=datafile)

    # undo auto-verify:
    replica.verified = False
    replica.save(update_fields=['verified'])

    # Check that it's not currently verified
    expect(get_replica(datafile).verified).to_be(False)

    # Check it verifies
    verify_files()
    expect(get_replica(datafile).verified).to_be(True)
def process_enclosure(self, dataset, enclosure):
    filename = getattr(enclosure, 'title', basename(enclosure.href))
    datafile = Dataset_File(filename=filename, dataset=dataset)
    try:
        datafile.mimetype = enclosure.mime
    except AttributeError:
        pass
    try:
        datafile.size = enclosure.length
    except AttributeError:
        pass
    try:
        hash = enclosure.hash
        # Split on white space, then ':' to get tuples to feed into dict
        hashdict = dict([s.partition(':')[::2] for s in hash.split()])
        # Set SHA-512 sum
        datafile.sha512sum = hashdict['sha-512']
    except AttributeError:
        pass
    datafile.save()
    url = enclosure.href
    # This means we will allow the atom feed to feed us any enclosure
    # URL that matches a registered location. Maybe we should restrict
    # this to a specific location.
    location = Location.get_location_for_url(url)
    if not location:
        logger.error('Rejected ingestion for unknown location %s' % url)
        return
    replica = Replica(datafile=datafile, url=url, location=location)
    replica.protocol = enclosure.href.partition('://')[0]
    replica.save()
    self.make_local_copy(replica)
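# The hash-parsing one-liner above is dense; this standalone sketch shows
# what it does with a typical enclosure hash attribute (the sample values
# are made up):
def _parse_enclosure_hashes(hash_attr):
    # 'md5:abc123 sha-512:def456' -> {'md5': 'abc123', 'sha-512': 'def456'}
    # partition(':') yields (algorithm, ':', digest); [::2] drops the ':'
    return dict(s.partition(':')[::2] for s in hash_attr.split())

# e.g. _parse_enclosure_hashes('md5:abc123 sha-512:def456')['sha-512']
#      == 'def456'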
def generate_datafile(path, dataset, content=None, size=-1,
                      verify=True, verified=True):
    from tardis.tardis_portal.models import Dataset_File
    datafile = Dataset_File()
    # Normally we use any old string for the datafile path, but some
    # tests require the path to be the same as what 'staging' would use
    if path is None:
        datafile.dataset_id = dataset.id
        datafile.save()
        path = "%s/%s/%s" % (dataset.get_first_experiment().id,
                             dataset.id,
                             datafile.id)
    filepath = os.path.normpath(FILE_STORE_PATH + '/' + path)
    if content:
        try:
            os.makedirs(os.path.dirname(filepath))
            os.remove(filepath)
        except OSError:
            # The directory may already exist, or there may be no stale
            # file to remove; either way we can carry on.
            pass
        gen_file = open(filepath, 'wb+')
        gen_file.write(content)
        gen_file.close()
    datafile.url = path
    datafile.mimetype = "application/unspecified"
    datafile.filename = os.path.basename(filepath)
    datafile.dataset_id = dataset.id
    if content:
        datafile.size = str(len(content))
    else:
        datafile.size = str(size)
    if verify and content:
        if not datafile.verify(allowEmptyChecksums=True):
            raise RuntimeError('verify failed!?!')
    else:
        datafile.verified = verified
    datafile.save()
    return datafile
def _make_data_file(dataset, filename, content):
    # TODO: create datasetfile
    f = mktemp()
    print "Inside make data file ", f
    open(f, "w+b").write(content)
    df = Dataset_File()
    df.dataset = dataset
    df.filename = filename
    df.url = 'file://' + f
    df.protocol = "staging"
    df.size = len(content)
    df.verify(allowEmptyChecksums=True)
    df.save()
    print "Df ---", df
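# mktemp() above is race-prone (the name can be claimed by another process
# between the call and the open); a safer variant of the same idea,
# sketched with NamedTemporaryFile. delete=False keeps the file on disk so
# a 'file://' URL can point at it; the caller must clean it up afterwards.
from tempfile import NamedTemporaryFile

def _make_temp_content_file(content):
    tmp = NamedTemporaryFile(delete=False)
    tmp.write(content)
    tmp.close()
    return tmp.name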
def testRemoteFile(self):
    content = urandom(1024)
    with NamedTemporaryFile() as f:
        # Create new Datafile
        datafile = Dataset_File(dataset=self.dataset)
        datafile.filename = 'background_task_testfile'
        datafile.size = len(content)
        datafile.sha512sum = hashlib.sha512(content).hexdigest()
        datafile.save()
        url = 'file://' + path.abspath(f.name)
        base_url = 'file://' + path.dirname(path.abspath(f.name))
        location = self._get_or_create_local_location(
            'test-staging-xxx', base_url, 'external', 10)
        replica = Replica(datafile=datafile, location=location, url=url)
        replica.save()

        def get_replica(replica):
            try:
                return Replica.objects.get(id=replica.id)
            except Replica.DoesNotExist:
                return None

        def get_new_replica(datafile):
            location = Location.get_default_location()
            return Replica.objects.get(datafile=datafile.id,
                                       location=location)

        # Check that it won't verify as it stands
        expect(get_replica(replica).verified).to_be(False)
        verify_files()
        expect(get_replica(replica).verified).to_be(False)
        expect(get_replica(replica).is_local()).to_be(False)

        # Fill in the content
        f.write(content)
        f.flush()

        # Check it now verifies
        verify_files()
        expect(get_replica(replica).id).to_be(get_new_replica(datafile).id)
        expect(get_new_replica(datafile).verified).to_be(True)
        expect(get_new_replica(datafile).is_local()).to_be(True)
def testLocalFile(self):
    content = urandom(1024)
    cf = ContentFile(content, 'background_task_testfile')

    # Create new Datafile
    datafile = Dataset_File(dataset=self.dataset)
    datafile.filename = cf.name
    datafile.size = len(content)
    datafile.sha512sum = hashlib.sha512(content).hexdigest()
    datafile.url = write_uploaded_file_to_dataset(self.dataset, cf)
    datafile.save()

    def get_datafile(datafile):
        return Dataset_File.objects.get(id=datafile.id)

    # Check that it's not currently verified
    expect(get_datafile(datafile).verified).to_be(False)

    # Check it verifies
    verify_files()
    expect(get_datafile(datafile).verified).to_be(True)
def _create_datafile():
    user = User.objects.create_user("testuser", "*****@*****.**", "pwd")
    user.save()
    UserProfile(user=user).save()

    full_access = Experiment.PUBLIC_ACCESS_FULL
    experiment = Experiment.objects.create(title="IIIF Test",
                                           created_by=user,
                                           public_access=full_access)
    experiment.save()
    ExperimentACL(
        experiment=experiment,
        pluginId="django_user",
        entityId=str(user.id),
        isOwner=True,
        canRead=True,
        canWrite=True,
        canDelete=True,
        aclOwnershipType=ExperimentACL.OWNER_OWNED,
    ).save()
    dataset = Dataset()
    dataset.save()
    dataset.experiments.add(experiment)
    dataset.save()

    # Create new Datafile
    tempfile = TemporaryUploadedFile("iiif_stored_file", None, None, None)
    with Image(filename="magick:rose") as img:
        img.format = "tiff"
        img.save(file=tempfile.file)
        tempfile.file.flush()
    datafile = Dataset_File(dataset=dataset)
    datafile.size = os.path.getsize(tempfile.file.name)
    # os.remove(tempfilename)
    datafile.filename = "iiif_named_file"
    datafile.url = write_uploaded_file_to_dataset(dataset, tempfile)
    datafile.verify(allowEmptyChecksums=True)
    datafile.save()
    return datafile
def _create_datafile():
    user = User.objects.create_user('testuser', '*****@*****.**', 'pwd')
    user.save()
    UserProfile(user=user).save()

    full_access = Experiment.PUBLIC_ACCESS_FULL
    experiment = Experiment.objects.create(title="IIIF Test",
                                           created_by=user,
                                           public_access=full_access)
    experiment.save()
    ExperimentACL(experiment=experiment,
                  pluginId='django_user',
                  entityId=str(user.id),
                  isOwner=True,
                  canRead=True,
                  canWrite=True,
                  canDelete=True,
                  aclOwnershipType=ExperimentACL.OWNER_OWNED).save()
    dataset = Dataset()
    dataset.save()
    dataset.experiments.add(experiment)
    dataset.save()

    # Create new Datafile
    tempfile = TemporaryUploadedFile('iiif_stored_file', None, None, None)
    with Image(filename='magick:rose') as img:
        img.format = 'tiff'
        img.save(file=tempfile.file)
        tempfile.file.flush()
    datafile = Dataset_File(dataset=dataset)
    datafile.size = os.path.getsize(tempfile.file.name)
    # os.remove(tempfilename)
    datafile.filename = 'iiif_named_file'
    datafile.url = write_uploaded_file_to_dataset(dataset, tempfile)
    datafile.verify(allowEmptyChecksums=True)
    datafile.save()
    return datafile
def process_enclosure(self, dataset, enclosure):
    filename = getattr(enclosure, 'title', basename(enclosure.href))
    datafile = Dataset_File(url=enclosure.href,
                            filename=filename,
                            dataset=dataset)
    datafile.protocol = enclosure.href.partition('://')[0]
    try:
        datafile.mimetype = enclosure.mime
    except AttributeError:
        pass
    try:
        datafile.size = enclosure.length
    except AttributeError:
        pass
    try:
        hash = enclosure.hash
        # Split on white space, then ':' to get tuples to feed into dict
        hashdict = dict([s.partition(':')[::2] for s in hash.split()])
        # Set SHA-512 sum
        datafile.sha512sum = hashdict['sha-512']
    except AttributeError:
        pass
    datafile.save()
    self.make_local_copy(datafile)
def process_enclosure(self, dataset, enclosure):
    '''
    Examines one "enclosure" from an entry, representing a datafile.
    Determines whether to process it, and if so, starts the transfer.
    '''
    # TODO tjdett: This method needs a clean-up, as it's doing many more
    # things than was originally intended. It now contains more code about
    # deciding whether to process the enclosure than it does about actually
    # processing it. That decision, or the influencing factors, should be
    # refactored into separate methods.
    # Python has built-in time deltas and Django has time formatting
    # functions, both of which would clean this code up considerably.

    def _get_enclosure_url(enclosure):
        '''Optionally manipulate datafile URL, eg:
        http://foo.edu/bar.txt -> file:////fooserver/bar.txt'''
        if IngestOptions.USE_LOCAL_TRANSFERS:
            return enclosure.href.replace(IngestOptions.URL_BASE_TO_REPLACE,
                                          IngestOptions.LOCAL_SOURCE_PATH)
        else:
            return enclosure.href

    filename = getattr(enclosure, 'title', basename(enclosure.href))
    # Check if we were provided a full path, and hence a subdirectory
    # for the file
    if (IngestOptions.DATAFILE_DIRECTORY_DEPTH >= 1 and
            getattr(enclosure, "path", "") != "" and
            enclosure.path.split("/")[IngestOptions.DATAFILE_DIRECTORY_DEPTH:] != ""):
        filename = "/".join(
            enclosure.path.split("/")[IngestOptions.DATAFILE_DIRECTORY_DEPTH:])

    datafiles = dataset.dataset_file_set.filter(filename=filename)

    def fromunix1000(tstr):
        return datetime.datetime.utcfromtimestamp(float(tstr) / 1000)

    if datafiles.count() > 0:
        datafile = datafiles[0]
        from django.db.models import Max
        newest = datafiles.aggregate(
            Max('modification_time'))['modification_time__max']
        if not newest:  # datafile.modification_time: ### rethink this!
            return  # We have this file, it has no time/date, let's skip it.

        def total_seconds(td):  # exists on datetime.timedelta in Python 2.7
            return (td.microseconds +
                    (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6

        timediff = total_seconds(fromunix1000(enclosure.modified) - newest)

        if timediff == 0:
            return  # We have this file already, same time/date.
        elif timediff < 0:
            logging.getLogger(__name__).warn(
                "Skipping datafile. File to ingest '{0}' is {1} *older* "
                "than stored file. Are the system clocks correct?".format(
                    enclosure.href, self.human_time(-timediff)))
            return
        else:
            if not IngestOptions.ALLOW_UPDATING_DATAFILES:
                logging.getLogger(__name__).warn(
                    "Skipping datafile. ALLOW_UPDATING_DATAFILES is "
                    "disabled, and '{0}' is {1} newer than stored "
                    "file.".format(enclosure.href,
                                   self.human_time(timediff)))
                return
            logging.getLogger(__name__).info(
                "Ingesting updated datafile. File to ingest '{0}' is {1} "
                "newer than stored file. This will create an additional "
                "copy.".format(enclosure.href, self.human_time(timediff)))
            if IngestOptions.HIDE_REPLACED_DATAFILES:
                # Mark all older versions of file as hidden. (!)
                try:
                    from tardis.microtardis.models import Dataset_Hidden
                    Dataset_Hidden.objects.filter(
                        datafile__dataset=dataset).update(hidden=True)
                except ImportError:
                    logger.warn(("The MicroTardis app must be installed in "
                                 "order to use the HIDE_REPLACED_DATAFILES "
                                 "option. Existing version of datafile {0} "
                                 "will not be hidden.").format(
                                     datafile.filename))
    else:  # no local copy already.
        logging.getLogger(__name__).info(
            "Ingesting datafile: '{0}'".format(enclosure.href))

    # Create a record and start transferring.
    datafile = Dataset_File(dataset=dataset,
                            url=_get_enclosure_url(enclosure),
                            filename=filename,
                            created_time=fromunix1000(enclosure.created),
                            modification_time=fromunix1000(enclosure.modified))
    datafile.protocol = enclosure.href.partition('://')[0]
    datafile.mimetype = getattr(enclosure, "mime", datafile.mimetype)
    datafile.size = getattr(enclosure, "length", datafile.size)
    try:
        hash = enclosure.hash
        # Split on white space, then ':' to get tuples to feed into dict
        hashdict = dict([s.partition(':')[::2] for s in hash.split()])
        # Set SHA-512 sum
        datafile.sha512sum = hashdict['sha-512']
    except AttributeError:
        pass
    datafile.save()
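# The feed timestamps handled above are millisecond unix times; a worked
# sketch of the conversion and age comparison (the sample values below
# are illustrative, not from the original code):
import datetime

def _age_difference_seconds(feed_modified_ms, stored_modification_time):
    '''Positive result means the feed's copy is newer than the stored one.'''
    feed_time = datetime.datetime.utcfromtimestamp(
        float(feed_modified_ms) / 1000)
    delta = feed_time - stored_modification_time
    # Equivalent to timedelta.total_seconds(), available from Python 2.7
    return (delta.microseconds +
            (delta.seconds + delta.days * 24 * 3600) * 10**6) / 10**6

# e.g. _age_difference_seconds('1300000060000',
#                              datetime.datetime(2011, 3, 13, 7, 6, 40))
# returns 60: the feed copy is one minute newer than the stored datafile.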