def test_create_existing_task(self):
    """
    Creates a task with a name that does exist.
    """
    task = ProcessTask(
        name="ESSArch_Core.WorkflowEngine.tests.tasks.First",
        responsible=self.user,
    )
    task.full_clean()
def test_running_non_eagerly(self):
    settings.CELERY_ALWAYS_EAGER = False

    foo = 123

    task = ProcessTask(
        name="ESSArch_Core.WorkflowEngine.tests.tasks.First",
        params={"foo": foo},
    )

    res = task.run().get().get(task.pk)
    self.assertEqual(res, foo)
def test_create_nonexistent_task(self):
    """
    Creates a task with a name that doesn't exist.
    """
    with self.assertRaises(ValidationError):
        task = ProcessTask(
            name="nonexistent task",
            responsible=self.user,
        )
        task.full_clean()
def run(self, ip=None, xmlfile=None, validate_fileformat=True, validate_integrity=True, rootdir=None):
    step = ProcessStep.objects.create(
        name="Validate Files",
        parallel=True,
        parent_step_id=self.step,
    )

    if any([validate_fileformat, validate_integrity]):
        if rootdir is None:
            rootdir = InformationPackage.objects.values_list(
                'object_path', flat=True
            ).get(pk=ip)

        tasks = []

        for f in find_files(xmlfile, rootdir):
            if validate_fileformat and f.format is not None:
                tasks.append(ProcessTask(
                    name=self.fileformat_task,
                    params={
                        "filename": os.path.join(rootdir, f.path),
                        "format_name": f.format,
                    },
                    information_package_id=ip,
                    responsible_id=self.responsible,
                    processstep=step,
                ))

            if validate_integrity and f.checksum is not None and f.checksum_type is not None:
                tasks.append(ProcessTask(
                    name=self.checksum_task,
                    params={
                        "filename": os.path.join(rootdir, f.path),
                        "checksum": f.checksum,
                        "algorithm": f.checksum_type,
                    },
                    information_package_id=ip,
                    responsible_id=self.responsible,
                    processstep=step,
                ))

        ProcessTask.objects.bulk_create(tasks)

    with allow_join_result():
        return step.run().get()
def local(self, src, dst, block_size=65536, step=None):
    step = ProcessStep.objects.create(
        name="Copy %s to %s" % (src, dst),
        parent_step_id=step,
    )

    fsize = os.stat(src).st_size
    idx = 0
    tasks = []

    directory = os.path.dirname(dst)
    try:
        os.makedirs(directory)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    open(dst, 'w').close()  # remove content of destination if it exists

    while idx * block_size <= fsize:
        tasks.append(ProcessTask(
            name="ESSArch_Core.tasks.CopyChunk",
            args=[src, dst, idx * block_size, self.task_id],
            params={'block_size': block_size},
            processstep=step,
            processstep_pos=idx,
        ))
        idx += 1

    ProcessTask.objects.bulk_create(tasks, 1000)
    step.run().get()
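# The tasks created above delegate the byte copying to
# "ESSArch_Core.tasks.CopyChunk", which is invoked once per block_size offset.
# A minimal, hypothetical sketch of copying one such block (illustrative only,
# not the real CopyChunk implementation):
def copy_chunk_sketch(src, dst, offset, block_size=65536):
    # Read one block from the source at the given offset...
    with open(src, 'rb') as srcf:
        srcf.seek(offset)
        chunk = srcf.read(block_size)

    # ...and write it at the same offset in the destination file.
    with open(dst, 'r+b') as dstf:
        dstf.seek(offset)
        dstf.write(chunk)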
def test_on_failure(self):
    """
    Runs an incorrect task and checks that the result is empty and that the
    traceback is nonempty.
    """
    foo = 123

    try:
        task = ProcessTask(
            name="ESSArch_Core.WorkflowEngine.tests.tasks.First",
            params={"bar": foo},
            information_package=InformationPackage.objects.create(),
        )
        task.run()
    except TypeError:
        tb = traceback.format_exc()
        self.assertEqual(tb, task.traceback)
        self.assertIsNone(task.result)
        self.assertIsNotNone(task.traceback)
def test_create_physical_model(self):
    ip = InformationPackage.objects.create(Label="ip1")
    prepare_path = Path.objects.get(entity="path_preingest_prepare").value
    path = os.path.join(prepare_path, unicode(ip.pk))

    task = ProcessTask(
        name="preingest.tasks.CreatePhysicalModel",
        params={
            "structure": [
                {"name": "dir1", "type": "folder"},
                {"name": "dir2", "type": "folder"},
                {"name": "file1", "type": "file"},
            ]
        },
        information_package=ip,
    )

    task.run()

    self.assertTrue(os.path.isdir(os.path.join(path, 'dir1')))
    self.assertTrue(os.path.isdir(os.path.join(path, 'dir2')))
    self.assertFalse(os.path.isfile(os.path.join(path, 'file1')))

    task.undo()

    self.assertFalse(os.path.isdir(os.path.join(path, 'dir1')))
    self.assertFalse(os.path.isdir(os.path.join(path, 'dir2')))
def test_create_zip(self):
    # create directory
    prepare_path = Path.objects.get(entity="path_preingest_prepare").value
    dirname = os.path.join(prepare_path, "zipdir")
    os.makedirs(dirname)

    # create empty file
    filename = os.path.join(dirname, "file.txt")
    open(filename, "a").close()

    zipname = dirname + ".zip"

    task = ProcessTask(
        name="preingest.tasks.CreateZIP",
        params={
            "dirname": dirname,
            "zipname": zipname,
        },
    )

    task.run()

    self.assertTrue(os.path.isdir(dirname))
    self.assertTrue(os.path.isfile(filename))
    self.assertTrue(os.path.isfile(zipname))

    shutil.rmtree(dirname)

    task.undo()

    self.assertTrue(os.path.isdir(dirname))
    self.assertTrue(os.path.isfile(filename))
    self.assertFalse(os.path.isfile(zipname))
def test_receive_sip(self):
    ip = InformationPackage.objects.create()

    srctar = os.path.join(self.ingest_reception, "%s.tar" % ip.pk)
    srcxml = os.path.join(self.ingest_reception, "%s.xml" % ip.pk)
    open(srctar, "a").close()
    open(srcxml, "a").close()

    ip.ObjectPath = os.path.join(self.ingest_reception, str(ip.pk) + ".tar")
    ip.save()

    task = ProcessTask(
        name="preingest.tasks.ReceiveSIP",
        params={"ip": ip},
    )

    task.run()

    self.assertTrue(os.path.isfile(os.path.join(self.ingest_work, str(ip.pk) + ".tar")))
    self.assertTrue(os.path.isfile(os.path.join(self.ingest_work, str(ip.pk) + ".xml")))

    task.undo()

    self.assertFalse(os.path.isfile(os.path.join(self.ingest_work, str(ip.pk) + ".tar")))
    self.assertFalse(os.path.isfile(os.path.join(self.ingest_work, str(ip.pk) + ".xml")))
def create(self, validated_data):
    tasks = []

    for ip in validated_data['information_packages']:
        storage_methods = ip.get_migratable_storage_methods()
        if not storage_methods.exists():
            raise ValueError('No storage methods available for migration')

        for storage_method in storage_methods:
            t = ProcessTask(
                name='ESSArch_Core.storage.tasks.StorageMigration',
                label='Migrate to {}'.format(storage_method),
                args=[str(storage_method.pk), validated_data['temp_path']],
                information_package=ip,
                responsible=self.context['request'].user,
                eager=False,
            )
            tasks.append(t)

    ProcessTask.objects.bulk_create(tasks, 100)

    for t in tasks:
        t.run()

    return ProcessTask.objects.filter(pk__in=[t.pk for t in tasks])
def _create_on_error_tasks(errors, ip=None, responsible=None, eager=False, status=celery_states.PENDING):
    for on_error_idx, on_error in enumerate(errors):
        args = on_error.get('args', [])
        params = on_error.get('params', {})
        result_params = on_error.get('result_params', {})

        yield ProcessTask(
            name=on_error['name'],
            reference=on_error.get('reference', None),
            label=on_error.get('label'),
            hidden=on_error.get('hidden', False),
            args=args,
            params=params,
            result_params=result_params,
            eager=eager,
            information_package=ip,
            responsible=responsible,
            processstep_pos=on_error_idx,
            status=status,
        )
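# _create_on_error_tasks is a generator that only yields unsaved ProcessTask
# instances; the caller is expected to persist them. A minimal usage sketch,
# assuming `ip` and `user` come from the surrounding workflow code (the task
# name is borrowed from the test tasks above for illustration):
on_error_tasks = list(_create_on_error_tasks(
    [{'name': 'ESSArch_Core.WorkflowEngine.tests.tasks.First', 'params': {'foo': 123}}],
    ip=ip,
    responsible=user,
))
ProcessTask.objects.bulk_create(on_error_tasks)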
def identify_ip(self, request):
    fname = request.data.get('label')
    spec_data = request.data.get('specification_data', {})

    uip = Path.objects.get(entity="path_ingest_unidentified").value
    container_file = os.path.join(uip, fname)

    if not os.path.isfile(container_file):
        return Response(
            {'status': '%s does not exist' % container_file},
            status=status.HTTP_400_BAD_REQUEST,
        )

    with open(os.path.join(settings.BASE_DIR, 'templates/SDTemplate.json')) as f:
        spec = json.load(f)

    ip_id = uuid.uuid4()

    spec_data['_OBJID'] = unicode(ip_id)
    spec_data['_OBJLABEL'] = spec_data.pop('LABEL')
    spec_data['_IP_CREATEDATE'] = timestamp_to_datetime(
        creation_date(container_file)
    ).isoformat()

    infoxml = u'%s.xml' % unicode(ip_id)
    infoxml = os.path.join(uip, infoxml)

    ProcessTask(
        name='preingest.tasks.GenerateXML',
        params={
            'info': spec_data,
            'filesToCreate': {
                infoxml: spec,
            },
            'folderToParse': container_file,
        },
    ).run_eagerly()

    return Response({'status': 'Identified IP, created %s' % infoxml})
def test_create_ip_root_dir(self):
    ip = InformationPackage.objects.create(Label="ip1")

    prepare_path = Path.objects.get(entity="path_preingest_prepare").value
    prepare_path = os.path.join(prepare_path, unicode(ip.pk))

    task = ProcessTask(
        name="preingest.tasks.CreateIPRootDir",
        params={
            "information_package": ip,
        },
    )

    task.run()

    self.assertTrue(os.path.isdir(prepare_path))

    task.undo()

    self.assertFalse(os.path.isdir(prepare_path))
def test_prepare_ip(self):
    label = "ip1"
    user = User.objects.create(username="******")

    task = ProcessTask(
        name="preingest.tasks.PrepareIP",
        params={
            "label": label,
            "responsible": user,
        },
        responsible=user,
    )

    task.run()

    self.assertTrue(InformationPackage.objects.filter(Label=label).exists())

    task.undo()

    self.assertFalse(InformationPackage.objects.filter(Label=label).exists())
def test_submit_sip(self):
    ip = InformationPackage.objects.create(Label="ip1")

    srctar = os.path.join(self.preingest_reception, "%s.tar" % ip.pk)
    srcxml = os.path.join(self.preingest_reception, "%s.xml" % ip.pk)
    dsttar = os.path.join(self.ingest_reception, "%s.tar" % ip.pk)
    dstxml = os.path.join(self.ingest_reception, "%s.xml" % ip.pk)

    open(srctar, "a").close()
    open(srcxml, "a").close()

    task = ProcessTask(
        name="preingest.tasks.SubmitSIP",
        params={"ip": ip},
    )

    task.run()

    self.assertTrue(os.path.isfile(dsttar))
    self.assertTrue(os.path.isfile(dstxml))

    task.undo()

    self.assertFalse(os.path.isfile(dsttar))
    self.assertFalse(os.path.isfile(dstxml))
def generate(self, folderToParse=None, algorithm='SHA-256'):
    files = []

    # Reset the mimetype maps and load definitions from the configured file.
    mimetypes.suffix_map = {}
    mimetypes.encodings_map = {}
    mimetypes.types_map = {}
    mimetypes.common_types = {}
    mimetypes_file = Path.objects.get(entity="path_mimetypes_definitionfile").value
    mimetypes.init(files=[mimetypes_file])
    mtypes = mimetypes.types_map

    responsible = None

    if folderToParse:
        folderToParse = folderToParse.rstrip('/')

        step = ProcessStep.objects.create(
            name="File operations for %s" % (os.path.basename(folderToParse)),
            parallel=True,
        )

        tasks = []

        if self.task is not None and self.task.step is not None:
            responsible = self.task.responsible
            step.parent_step_id = self.task.step
            step.save()

        folderToParse = unicode(folderToParse)

        # Generate pointer XML files for external directories, then parse them.
        external = self.find_external_dirs()

        for ext_file, ext_dir, ext_spec in external:
            ext_sub_dirs = next(walk(os.path.join(folderToParse, ext_dir)))[1]
            for sub_dir in ext_sub_dirs:
                ptr_file_path = os.path.join(ext_dir, sub_dir, ext_file)

                ext_info = self.info
                ext_info['_EXT'] = sub_dir
                ext_info['_EXT_HREF'] = ptr_file_path

                external_gen = XMLGenerator(
                    filesToCreate={
                        os.path.join(folderToParse, ptr_file_path): ext_spec,
                    },
                    info=ext_info,
                    task=self.task,
                )
                external_gen.generate(os.path.join(folderToParse, ext_dir, sub_dir))

                tasks.append(ProcessTask(
                    name="ESSArch_Core.tasks.ParseFile",
                    params={
                        'filepath': os.path.join(folderToParse, ptr_file_path),
                        'mimetype': self.get_mimetype(mtypes, ptr_file_path),
                        'relpath': ptr_file_path,
                        'algorithm': algorithm,
                        'rootdir': sub_dir,
                    },
                    responsible_id=responsible,
                    processstep=step,
                ))

        if os.path.isfile(folderToParse):
            tasks.append(ProcessTask(
                name="ESSArch_Core.tasks.ParseFile",
                params={
                    'filepath': folderToParse,
                    'mimetype': self.get_mimetype(mtypes, folderToParse),
                    'relpath': os.path.basename(folderToParse),
                    'algorithm': algorithm,
                },
                processstep=step,
                responsible_id=responsible,
            ))
        elif os.path.isdir(folderToParse):
            # Parse every file in the folder, skipping the external directories
            # that were handled above.
            for root, dirnames, filenames in walk(folderToParse):
                dirnames[:] = [d for d in dirnames if d not in [e[1] for e in external]]

                for fname in filenames:
                    filepath = os.path.join(root, fname)
                    relpath = os.path.relpath(filepath, folderToParse)

                    tasks.append(ProcessTask(
                        name="ESSArch_Core.tasks.ParseFile",
                        params={
                            'filepath': filepath,
                            'mimetype': self.get_mimetype(mtypes, filepath),
                            'relpath': relpath,
                            'algorithm': algorithm,
                        },
                        responsible_id=responsible,
                        processstep=step,
                    ))

        ProcessTask.objects.bulk_create(tasks, 1000)

        with allow_join_result():
            for fileinfo in step.chunk():
                files.append(fileinfo)

    for idx, f in enumerate(self.toCreate):
        fname = f['file']
        rootEl = f['root']

        self.info['_XML_FILENAME'] = os.path.basename(fname)

        tree = etree.ElementTree(
            rootEl.createLXMLElement(
                self.info,
                files=files,
                folderToParse=folderToParse,
                task=self.task,
            )
        )
        tree.write(fname, pretty_print=True, xml_declaration=True, encoding='UTF-8')

        try:
            relpath = os.path.relpath(fname, folderToParse)
        except Exception:
            relpath = fname

        # Parse every generated XML file except the last one, so that
        # subsequently generated files can reference it.
        if idx < len(self.toCreate) - 1:
            parsefile_task = ProcessTask.objects.create(
                name="ESSArch_Core.tasks.ParseFile",
                params={
                    'filepath': fname,
                    'mimetype': self.get_mimetype(mtypes, fname),
                    'relpath': relpath,
                    'algorithm': algorithm,
                },
                responsible_id=responsible,
                processstep_id=self.task.step if self.task else None,
            )

            with allow_join_result():
                files.append(parsefile_task.run().get())
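# A minimal sketch of how generate() appears to be driven, modelled on the
# recursive call it makes for external directories. The paths, spec and info
# values below are placeholders, not a real profile:
generator = XMLGenerator(
    filesToCreate={'/tmp/mets.xml': spec},  # spec: parsed XML specification dict
    info={'_OBJID': 'example-id'},          # template variables used by the spec
    task=None,
)
generator.generate(folderToParse='/tmp/ip_content', algorithm='SHA-256')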
def remote(self, src, dst, requests_session=None, block_size=65536, step=None):
    step = ProcessStep.objects.create(
        name="Copy %s to %s" % (src, dst),
        parent_step_id=step,
    )

    file_size = os.stat(src).st_size
    idx = 0
    tasks = []

    t = ProcessTask.objects.create(
        name="ESSArch_Core.tasks.CopyChunk",
        args=[src, dst, idx * block_size],
        params={
            'requests_session': requests_session,
            'file_size': file_size,
            'block_size': block_size,
        },
        processstep=step,
        processstep_pos=idx,
    )
    upload_id = t.run().get()
    idx += 1

    while idx * block_size <= file_size:
        tasks.append(ProcessTask(
            name="ESSArch_Core.tasks.CopyChunk",
            args=[src, dst, idx * block_size],
            params={
                'requests_session': requests_session,
                'file_size': file_size,
                'block_size': block_size,
                'upload_id': upload_id,
            },
            processstep=step,
            processstep_pos=idx,
        ))
        idx += 1

    ProcessTask.objects.bulk_create(tasks, 1000)
    step.resume().get()

    md5 = ProcessTask.objects.create(
        name="ESSArch_Core.tasks.CalculateChecksum",
        params={
            "filename": src,
            "block_size": block_size,
            "algorithm": 'MD5',
        },
        information_package_id=self.ip,
        responsible_id=self.responsible,
    ).run().get()

    completion_url = dst.rstrip('/') + '_complete/'

    m = MultipartEncoder(fields={
        'path': os.path.basename(src),
        'upload_id': upload_id,
        'md5': md5,
    })
    headers = {'Content-Type': m.content_type}

    response = requests_session.post(completion_url, data=m, headers=headers)
    response.raise_for_status()
def run(self, filepath=None, mimetype=None, relpath=None, algorithm='SHA-256', rootdir=''):
    if not relpath:
        relpath = filepath

    relpath = win_to_posix(relpath)

    timestamp = creation_date(filepath)
    createdate = timestamp_to_datetime(timestamp)

    checksum_task = ProcessTask(
        name="ESSArch_Core.tasks.CalculateChecksum",
        params={
            "filename": filepath,
            "algorithm": algorithm,
        },
        processstep_id=self.step,
        responsible_id=self.responsible,
        information_package_id=self.ip,
    )

    fileformat_task = ProcessTask(
        name="ESSArch_Core.tasks.IdentifyFileFormat",
        params={
            "filename": filepath,
        },
        processstep_id=self.step,
        responsible_id=self.responsible,
        information_package_id=self.ip,
    )

    ProcessTask.objects.bulk_create([checksum_task, fileformat_task])

    checksum = checksum_task.run().get()
    self.set_progress(50, total=100)
    (format_name, format_version, format_registry_key) = fileformat_task.run().get()

    fileinfo = {
        'FName': os.path.basename(relpath),
        'FDir': rootdir,
        'FChecksum': checksum,
        'FID': str(uuid.uuid4()),
        'daotype': "borndigital",
        'href': relpath,
        'FMimetype': mimetype,
        'FCreated': createdate.isoformat(),
        'FFormatName': format_name,
        'FFormatVersion': format_version,
        'FFormatRegistryKey': format_registry_key,
        'FSize': str(os.path.getsize(filepath)),
        'FUse': 'Datafile',
        'FChecksumType': algorithm,
        'FLoctype': 'URL',
        'FLinkType': 'simple',
        'FChecksumLib': 'hashlib',
        'FLocationType': 'URI',
        'FIDType': 'UUID',
    }

    return fileinfo