def testRemoteFile(self):
    """Verification of a remote (file://) datafile: it must stay
    unverified while the target file is empty, and verify once the
    expected bytes have been written."""
    content = urandom(1024)
    with NamedTemporaryFile() as tmp:
        # Register a datafile whose URL points at the (still empty) temp file.
        datafile = Dataset_File(
            dataset=self.dataset,
            filename='background_task_testfile',
            size=len(content),
            sha512sum=hashlib.sha512(content).hexdigest(),
            url='file://' + path.abspath(tmp.name))
        datafile.save()

        def reload(df):
            # Re-read from the DB so we observe the background task's writes.
            return Dataset_File.objects.get(id=df.id)

        # An empty file cannot match the checksum: verification must fail.
        expect(reload(datafile).verified).to_be(False)
        verify_files()
        expect(reload(datafile).verified).to_be(False)
        expect(reload(datafile).is_local()).to_be(False)

        # Supply the real content, after which verification should succeed.
        tmp.write(content)
        tmp.flush()
        verify_files()
        expect(reload(datafile).verified).to_be(True)
        expect(reload(datafile).is_local()).to_be(True)
def process_enclosure(self, dataset, enclosure):
    """Ingest one feed enclosure: create a Dataset_File, attach a
    Replica at the registered location for the enclosure URL, then
    start transferring a local copy.

    Optional enclosure attributes (``mime``, ``length``, ``hash``) are
    used when present. ``hash`` is whitespace-separated "algo:value"
    pairs and must include a 'sha-512' entry when supplied.
    Ingestion is rejected (with an error log) when no registered
    Location matches the enclosure URL.
    """
    filename = getattr(enclosure, 'title', basename(enclosure.href))
    datafile = Dataset_File(filename=filename, dataset=dataset)
    # Optional metadata: keep the model defaults when the feed omits them.
    datafile.mimetype = getattr(enclosure, 'mime', datafile.mimetype)
    datafile.size = getattr(enclosure, 'length', datafile.size)
    try:
        # Split on whitespace, then ':' to get (algo, value) pairs.
        # Avoid shadowing the builtin `hash`.
        checksums = dict(s.partition(':')[::2]
                         for s in enclosure.hash.split())
        datafile.sha512sum = checksums['sha-512']
    except AttributeError:
        # The enclosure carried no hash at all.
        pass
    datafile.save()
    url = enclosure.href
    # This means we will allow the atom feed to feed us any enclosure
    # URL that matches a registered location. Maybe we should restrict
    # this to a specific location.
    location = Location.get_location_for_url(url)
    if not location:
        # Lazy %-args: the message is only formatted if the record is emitted.
        logger.error('Rejected ingestion for unknown location %s', url)
        return
    replica = Replica(datafile=datafile, url=url, location=location)
    replica.protocol = url.partition('://')[0]
    replica.save()
    self.make_local_copy(replica)
def process_enclosure(self, dataset, enclosure):
    """Record a feed enclosure as a new Dataset_File plus a Replica at
    the location registered for its URL, then pull a local copy."""
    filename = getattr(enclosure, 'title', basename(enclosure.href))
    datafile = Dataset_File(filename=filename, dataset=dataset)
    # Copy optional metadata when the enclosure provides it.
    for src_attr, dest_field in (('mime', 'mimetype'), ('length', 'size')):
        try:
            setattr(datafile, dest_field, getattr(enclosure, src_attr))
        except AttributeError:
            pass
    try:
        # Whitespace-separated "algo:value" pairs -> {algo: value}
        digests = dict([pair.partition(':')[::2]
                        for pair in enclosure.hash.split()])
        datafile.sha512sum = digests['sha-512']
    except AttributeError:
        pass
    datafile.save()
    url = enclosure.href
    # This means we will allow the atom feed to feed us any enclosure
    # URL that matches a registered location. Maybe we should restrict
    # this to a specific location.
    location = Location.get_location_for_url(url)
    if not location:
        logger.error('Rejected ingestion for unknown location %s' % url)
        return
    replica = Replica(datafile=datafile, url=url, location=location)
    replica.protocol = enclosure.href.partition('://')[0]
    replica.save()
    self.make_local_copy(replica)
def testRemoteFile(self):
    """Check verify_files() against a file:// datafile: the record stays
    unverified until the backing file actually holds the expected
    content."""
    content = urandom(1024)
    with NamedTemporaryFile() as f:
        # New datafile record; the temp file it points at is empty for now.
        datafile = Dataset_File(dataset=self.dataset)
        for attr, value in (
                ('filename', 'background_task_testfile'),
                ('size', len(content)),
                ('sha512sum', hashlib.sha512(content).hexdigest()),
                ('url', 'file://' + path.abspath(f.name))):
            setattr(datafile, attr, value)
        datafile.save()

        def fetch(df):
            # Always re-read from the DB to observe the task's writes.
            return Dataset_File.objects.get(id=df.id)

        # Nothing to verify against yet, so before and after the task
        # runs the record must remain unverified and non-local.
        expect(fetch(datafile).verified).to_be(False)
        verify_files()
        expect(fetch(datafile).verified).to_be(False)
        expect(fetch(datafile).is_local()).to_be(False)

        # Write the promised bytes; verification should now pass.
        f.write(content)
        f.flush()
        verify_files()
        expect(fetch(datafile).verified).to_be(True)
        expect(fetch(datafile).is_local()).to_be(True)
def testLocalFile(self):
    """A replica of a locally-written file, with its auto-verification
    undone, is verified by verify_files()."""
    content = urandom(1024)
    cf = ContentFile(content, 'background_task_testfile')

    # Create the datafile record first, then attach a replica at the
    # default location pointing at the stored copy.
    datafile = Dataset_File(dataset=self.dataset)
    datafile.filename = cf.name
    datafile.size = len(content)
    datafile.sha512sum = hashlib.sha512(content).hexdigest()
    datafile.save()

    stored_url = write_uploaded_file_to_dataset(self.dataset, cf)
    replica = Replica(datafile=datafile,
                      url=stored_url,
                      location=Location.get_default_location())
    replica.save()

    def lookup(df):
        return Replica.objects.get(datafile=df)

    # Saving auto-verifies; reset the flag so we exercise the task itself.
    replica.verified = False
    replica.save(update_fields=['verified'])

    # Unverified before the task runs...
    expect(lookup(datafile).verified).to_be(False)
    # ...verified afterwards, since the stored content matches the checksum.
    verify_files()
    expect(lookup(datafile).verified).to_be(True)
def testRemoteFile(self):
    """A replica at an external location stays unverified while its
    backing file is empty; once content arrives, verification succeeds
    and the verified replica sits at the default location."""
    content = urandom(1024)
    with NamedTemporaryFile() as f:
        # Create new Datafile
        datafile = Dataset_File(dataset=self.dataset)
        datafile.filename = 'background_task_testfile'
        datafile.size = len(content)
        datafile.sha512sum = hashlib.sha512(content).hexdigest()
        datafile.save()

        # External location whose base URL is the temp file's directory.
        full_path = path.abspath(f.name)
        url = 'file://' + full_path
        base_url = 'file://' + path.dirname(full_path)
        location = self._get_or_create_local_location(
            'test-staging-xxx', base_url, 'external', 10)
        replica = Replica(datafile=datafile, location=location, url=url)
        replica.save()

        def get_replica(rep):
            # None if the replica record has disappeared.
            try:
                return Replica.objects.get(id=rep.id)
            except Replica.DoesNotExist:
                return None

        def get_new_replica(df):
            return Replica.objects.get(
                datafile=df.id, location=Location.get_default_location())

        # Empty backing file: the task must leave the replica unverified.
        expect(get_replica(replica).verified).to_be(False)
        verify_files()
        expect(get_replica(replica).verified).to_be(False)
        expect(get_replica(replica).is_local()).to_be(False)

        # Fill in the content
        f.write(content)
        f.flush()

        # Verification now succeeds: the same replica record ends up as
        # the verified, local copy at the default location.
        verify_files()
        expect(get_replica(replica).id).to_be(
            get_new_replica(datafile).id)
        expect(get_new_replica(datafile).verified).to_be(True)
        expect(get_new_replica(datafile).is_local()).to_be(True)
def testLocalFile(self):
    """A datafile stored via write_uploaded_file_to_dataset verifies on
    the first verify_files() pass."""
    content = urandom(1024)
    uploaded = ContentFile(content, 'background_task_testfile')

    # Record the file's metadata and its stored location.
    datafile = Dataset_File(dataset=self.dataset)
    datafile.filename = uploaded.name
    datafile.size = len(content)
    datafile.sha512sum = hashlib.sha512(content).hexdigest()
    datafile.url = write_uploaded_file_to_dataset(self.dataset, uploaded)
    datafile.save()

    def current(df):
        # Re-read from the DB to see the background task's updates.
        return Dataset_File.objects.get(id=df.id)

    # Freshly created record starts unverified...
    expect(current(datafile).verified).to_be(False)
    # ...and the task verifies it, since the stored content matches.
    verify_files()
    expect(current(datafile).verified).to_be(True)
def testLocalFile(self):
    """verify_files() marks a locally-stored datafile as verified."""
    content = urandom(1024)
    cf = ContentFile(content, 'background_task_testfile')

    # Create new Datafile, populating metadata up front.
    datafile = Dataset_File(
        dataset=self.dataset,
        filename=cf.name,
        size=len(content),
        sha512sum=hashlib.sha512(content).hexdigest())
    datafile.url = write_uploaded_file_to_dataset(self.dataset, cf)
    datafile.save()

    def refreshed(df):
        return Dataset_File.objects.get(id=df.id)

    # Not verified yet; the background task should flip the flag.
    expect(refreshed(datafile).verified).to_be(False)
    verify_files()
    expect(refreshed(datafile).verified).to_be(True)
def process_enclosure(self, dataset, enclosure):
    """Create and save a Dataset_File for one feed enclosure, then
    schedule a local copy of its content.

    Optional enclosure attributes (``mime``, ``length``, ``hash``) are
    read when present. ``hash`` is whitespace-separated "algo:value"
    pairs and must include a 'sha-512' entry when supplied.
    """
    filename = getattr(enclosure, 'title', basename(enclosure.href))
    # Implicit continuation inside parentheses instead of backslashes.
    datafile = Dataset_File(url=enclosure.href,
                            filename=filename,
                            dataset=dataset)
    # URL scheme, e.g. 'http' or 'file'.
    datafile.protocol = enclosure.href.partition('://')[0]
    # Optional metadata: keep the model defaults when the feed omits them.
    datafile.mimetype = getattr(enclosure, 'mime', datafile.mimetype)
    datafile.size = getattr(enclosure, 'length', datafile.size)
    try:
        # Split on whitespace, then ':' to get (algo, value) pairs.
        # Avoid shadowing the builtin `hash`.
        checksums = dict(s.partition(':')[::2]
                         for s in enclosure.hash.split())
        datafile.sha512sum = checksums['sha-512']
    except AttributeError:
        # The enclosure carried no hash at all.
        pass
    datafile.save()
    self.make_local_copy(datafile)
def process_enclosure(self, dataset, enclosure):
    '''
    Examines one "enclosure" from an entry, representing a datafile.
    Determines whether to process it, and if so, starts the transfer.
    '''
    # TODO tjdett: This method needs a clean-up, as it's doing many more
    # things than was originally intended. It now contains more more code
    # about deciding whether to process the enclosure than it does about
    # actually processing it. That decision, or the influencing factors,
    # should be refactored into separate methods. Python has built-in time
    # deltas and Django has time formatting functions, both of which would
    # clean this code up considerably.

    def _get_enclosure_url(enclosure):
        ''' Optionally manipulate datafile URL, eg:
        http://foo.edu/bar.txt -> file:////fooserver/bar.txt'''
        if IngestOptions.USE_LOCAL_TRANSFERS:
            return enclosure.href.replace(IngestOptions.URL_BASE_TO_REPLACE,
                                          IngestOptions.LOCAL_SOURCE_PATH)
        else:
            return enclosure.href

    filename = getattr(enclosure, 'title', basename(enclosure.href))
    # check if we were provided a full path, and hence a subdirectory for
    # the file (strip the first DATAFILE_DIRECTORY_DEPTH path components)
    if (IngestOptions.DATAFILE_DIRECTORY_DEPTH >= 1 and
            getattr(enclosure, "path", "") != "" and
            enclosure.path.split("/")[IngestOptions.DATAFILE_DIRECTORY_DEPTH:] != ""):
        filename = "/".join(enclosure.path.split("/")[IngestOptions.DATAFILE_DIRECTORY_DEPTH:])

    # Any existing copies of this file in the dataset, matched by filename.
    datafiles = dataset.dataset_file_set.filter(filename=filename)

    # Feed timestamps are milliseconds since the Unix epoch (hence /1000).
    def fromunix1000 (tstr):
        return datetime.datetime.utcfromtimestamp(float(tstr)/1000)

    if datafiles.count() > 0:
        datafile = datafiles[0]
        from django.db.models import Max
        # Most recent modification time among the stored copies.
        newest = datafiles.aggregate(Max('modification_time'))['modification_time__max']
        if not newest:  # datafile.modification_time: ### rethink this!
            return  # We have this file, it has no time/date, let's skip it.

        def total_seconds(td):  # exists on datetime.timedelta in Python 2.7
            return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6

        # Positive timediff => the incoming file is newer than what we hold.
        timediff = total_seconds(fromunix1000(enclosure.modified) - newest)

        if timediff == 0:
            return  # We have this file already, same time/date.
        elif timediff < 0:
            logging.getLogger(__name__).warn(
                "Skipping datafile. File to ingest '{0}' is {1} *older* than stored file. Are the system clocks correct?".
                format(enclosure.href, self.human_time(-timediff)))
            return
        else:
            if not IngestOptions.ALLOW_UPDATING_DATAFILES:
                logging.getLogger(__name__).warn(
                    "Skipping datafile. ALLOW_UPDATING_DATAFILES is disabled, and '{0}' is {1}newer than stored file.".
                    format(enclosure.href, self.human_time(timediff)))
                return
            logging.getLogger(__name__).info(
                "Ingesting updated datafile. File to ingest '{0}' is {1} newer than stored file. This will create an additional copy.".
                format(enclosure.href, self.human_time(timediff)))
            if IngestOptions.HIDE_REPLACED_DATAFILES:
                # Mark all older versions of file as hidden. (!)
                # NOTE(review): this hides every datafile in the dataset, not
                # just older versions of this one — confirm that is intended.
                try:
                    from tardis.microtardis.models import Dataset_Hidden
                    Dataset_Hidden.objects.filter(datafile__dataset=dataset).update(hidden=True)
                except ImportError:
                    # NOTE(review): `.format` binds only to the second string
                    # literal here, so '{0}' is never substituted — bug.
                    logger.warn("The MicroTardis app must be installed in order to use the HIDE_REPLACED_DATAFILES option. Existing version of datafile {0} " +
                                "will not be hidden.".format(datafile.filename))
    else:  # no local copy already.
        logging.getLogger(__name__).info("Ingesting datafile: '{0}'".format(enclosure.href))

    # Create a record and start transferring.
    datafile = Dataset_File(dataset=dataset,
                            url=_get_enclosure_url(enclosure),
                            filename=filename,
                            created_time=fromunix1000(enclosure.created),
                            modification_time=fromunix1000(enclosure.modified))
    datafile.protocol = enclosure.href.partition('://')[0]
    # Optional metadata: keep the model defaults when the feed omits them.
    datafile.mimetype = getattr(enclosure, "mime", datafile.mimetype)
    datafile.size = getattr(enclosure, "length", datafile.size)
    try:
        hash = enclosure.hash
        # Split on white space, then ':' to get tuples to feed into dict
        hashdict = dict([s.partition(':')[::2] for s in hash.split()])
        # Set SHA-512 sum
        datafile.sha512sum = hashdict['sha-512']
    except AttributeError:
        pass
    datafile.save()