def perform_create(self, serializer):
    """Create a resource."""
    process = serializer.validated_data.get('process')
    if not process.is_active:
        raise exceptions.ParseError(
            'Process retired (id: {}, slug: {}/{}).'.format(
                process.id, process.slug, process.version))

    with transaction.atomic():
        instance = serializer.save()
        assign_contributor_permissions(instance)

        # Entity is added to the collection only when it is
        # created - when it only contains 1 Data object.
        entities = Entity.objects.annotate(num_data=Count('data')).filter(
            data=instance, num_data=1)

        # Assign data object to all specified collections.
        collection_pks = self.request.data.get('collections', [])
        for collection in Collection.objects.filter(pk__in=collection_pks):
            collection.data.add(instance)
            copy_permissions(collection, instance)

            # Add entities to which data belongs to the collection.
            for entity in entities:
                entity.collections.add(collection)
                copy_permissions(collection, entity)
def create(self, subprocess_parent=None, **kwargs):
    """Create new object with the given kwargs."""
    obj = super().create(**kwargs)

    # Data dependencies
    obj.save_dependencies(obj.input, obj.process.input_schema)
    if subprocess_parent:
        DataDependency.objects.create(
            parent=subprocess_parent,
            child=obj,
            kind=DataDependency.KIND_SUBPROCESS,
        )
        # Data was spawned from a workflow / subprocess.
        if not obj.in_container():
            copy_permissions(subprocess_parent, obj)

    # Entity, Collection assignment
    entity_operation = self._handle_entity(obj)
    self._handle_collection(obj, entity_operation=entity_operation)

    # Assign contributor permission only if Data is not in the container.
    if not obj.in_container():
        assign_contributor_permissions(obj)

    return obj
def test_copy_different_ctype(self):
    assign_perm('view_collection', self.contributor, self.collection)
    assign_perm('add_collection', self.contributor, self.collection)

    copy_permissions(self.collection, self.dst_process)

    # Only 'view' is copied as process has no 'add' permission.
    self.assertEqual(UserObjectPermission.objects.count(), 3)
def move_to_collection(self, destination_collection):
    """Move data object to collection."""
    self.validate_change_collection(destination_collection)
    self.collection = destination_collection
    if destination_collection:
        self.tags = destination_collection.tags
        copy_permissions(destination_collection, self)
    self.save()
def duplicate(self, contributor=None, inherit_entity=False, inherit_collection=False):
    """Duplicate (make a copy)."""
    if self.status not in [self.STATUS_DONE, self.STATUS_ERROR]:
        raise ValidationError('Data object must have done or error status to be duplicated')

    duplicate = Data.objects.get(id=self.id)
    duplicate.pk = None
    duplicate.slug = None
    duplicate.name = 'Copy of {}'.format(self.name)
    duplicate.duplicated = now()
    if contributor:
        duplicate.contributor = contributor

    duplicate.entity = None
    if inherit_entity:
        if not contributor.has_perm('add_entity', self.entity):
            raise ValidationError("You do not have add permission on entity {}.".format(self.entity))
        duplicate.entity = self.entity

    duplicate.collection = None
    if inherit_collection:
        if not contributor.has_perm('add_collection', self.collection):
            raise ValidationError("You do not have add permission on collection {}.".format(self.collection))
        duplicate.collection = self.collection

    duplicate._perform_save(force_insert=True)  # pylint: disable=protected-access

    # Override fields that are automatically set on create.
    duplicate.created = self.created
    duplicate._perform_save()  # pylint: disable=protected-access

    if self.location:
        self.location.data.add(duplicate)  # pylint: disable=no-member

    duplicate.storages.set(self.storages.all())  # pylint: disable=no-member

    for migration in self.migration_history.order_by('created'):  # pylint: disable=no-member
        migration.pk = None
        migration.data = duplicate
        migration.save(force_insert=True)

    # Inherit existing child dependencies.
    DataDependency.objects.bulk_create([
        DataDependency(child=duplicate, parent=dependency.parent, kind=dependency.kind)
        for dependency in DataDependency.objects.filter(child=self)
    ])
    # Inherit existing parent dependencies.
    DataDependency.objects.bulk_create([
        DataDependency(child=dependency.child, parent=duplicate, kind=dependency.kind)
        for dependency in DataDependency.objects.filter(parent=self)
    ])

    # Permissions
    assign_contributor_permissions(duplicate)
    copy_permissions(duplicate.entity, duplicate)
    copy_permissions(duplicate.collection, duplicate)

    return duplicate
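# A minimal usage sketch for the ``duplicate`` method above; ``data`` and
# ``user`` are hypothetical, and ``user`` is assumed to hold the required
# add permissions on the source entity and collection.
copy = data.duplicate(
    contributor=user,
    inherit_entity=True,
    inherit_collection=True,
)
assert copy.pk != data.pk
assert copy.name == 'Copy of {}'.format(data.name)
assert copy.created == data.created  # creation date is preserved on purpose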
def test_copy_permissions(self):
    self.src_process.set_permission(Permission.VIEW, self.contributor)
    self.src_process.set_permission(Permission.VIEW, self.group)

    copy_permissions(self.src_process, self.dst_process)
    self.assertTrue(
        self.contributor.has_perm(Permission.VIEW, self.dst_process))
    # User inherits permission from group.
    self.assertTrue(self.user.has_perm(Permission.VIEW, self.dst_process))
def test_copy_permissions(self):
    assign_perm('view_process', self.contributor, self.src_process)
    assign_perm('view_process', self.group, self.src_process)

    copy_permissions(self.src_process, self.dst_process)

    self.assertEqual(GroupObjectPermission.objects.count(), 2)
    self.assertEqual(UserObjectPermission.objects.count(), 2)
    self.assertTrue(self.contributor.has_perm('flow.view_process', self.dst_process))
    # User inherits permission from group.
    self.assertTrue(self.user.has_perm('flow.view_process', self.dst_process))
def register_descriptors(self, descriptor_schemas, user, force=False, verbosity=1):
    """Read and register descriptors."""
    log_descriptors = []

    for descriptor_schema in descriptor_schemas:
        for schema, _, _ in iterate_schema({}, descriptor_schema.get('schema', {})):
            if not schema['type'][-1].endswith(':'):
                schema['type'] += ':'

        if 'schema' not in descriptor_schema:
            descriptor_schema['schema'] = []

        if not self.valid(descriptor_schema, DESCRIPTOR_SCHEMA):
            continue

        slug = descriptor_schema['slug']
        version = descriptor_schema.get('version', '0.0.0')
        int_version = convert_version_string_to_int(version, VERSION_NUMBER_BITS)

        # `latest version` is returned as `int` so it has to be compared to `int_version`
        latest_version = DescriptorSchema.objects.filter(slug=slug).aggregate(Max('version'))['version__max']
        if latest_version is not None and latest_version > int_version:
            self.stderr.write("Skip descriptor schema {}: newer version installed".format(slug))
            continue

        previous_descriptor_qs = DescriptorSchema.objects.filter(slug=slug)
        if previous_descriptor_qs.exists():
            previous_descriptor = previous_descriptor_qs.latest()
        else:
            previous_descriptor = None

        descriptor_query = DescriptorSchema.objects.filter(slug=slug, version=version)
        if descriptor_query.exists():
            if not force:
                if verbosity > 0:
                    self.stdout.write("Skip descriptor schema {}: same version installed".format(slug))
                continue

            descriptor_query.update(**descriptor_schema)
            log_descriptors.append("Updated {}".format(slug))
        else:
            descriptor = DescriptorSchema.objects.create(contributor=user, **descriptor_schema)
            assign_contributor_permissions(descriptor)
            if previous_descriptor:
                copy_permissions(previous_descriptor, descriptor)
            log_descriptors.append("Inserted {}".format(slug))

    if log_descriptors and verbosity > 0:
        self.stdout.write("Descriptor schemas Updates:")
        for log in log_descriptors:
            self.stdout.write("  {}".format(log))
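# Why versions are converted to integers above: packing "major.minor.patch"
# into one number makes DB-level Max() comparisons meaningful. A minimal,
# self-contained sketch of the idea (the bit widths are an assumption and
# need not match VERSION_NUMBER_BITS):
def version_string_to_int(version, bits=(8, 10, 14)):
    """Pack a dotted version string into a single comparable integer."""
    result = 0
    for part, width in zip(version.split('.'), bits):
        number = int(part)
        if number >= 2 ** width:
            raise ValueError("Version component {} exceeds {} bits".format(number, width))
        # Shift previous components left and append the new one.
        result = (result << width) + number
    return result

assert version_string_to_int('1.0.0') > version_string_to_int('0.9.9')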
def move_to_collection(self, source_collection, destination_collection):
    """Move entity to destination collection."""
    if source_collection == destination_collection:
        return

    self.collection = destination_collection
    if destination_collection:
        self.tags = destination_collection.tags
        copy_permissions(destination_collection, self)
    self.save()

    for datum in self.data.all():
        datum.collection = destination_collection
        if destination_collection:
            datum.tags = destination_collection.tags
            copy_permissions(destination_collection, datum)
        datum.save()
def _handle_collection(obj, entity_operation=None):
    """Correctly assign Collection to Data and its Entity.

    There are 2 x 4 possible scenarios for handling collection
    assignment. One dimension in the "decision matrix" is Data.collection:

        1.x Data.collection = None
        2.x Data.collection != None

    Second dimension is about Data.entity:

        x.1 Data.entity is None
        x.2 Data.entity was just created
        x.3 Data.entity already exists and Data.entity.collection = None
        x.4 Data.entity already exists and Data.entity.collection != None
    """
    # 1.2 and 1.3 require no action.

    # 1.1 and 2.1:
    if not obj.entity:
        return
    if entity_operation == HandleEntityOperation.ADD and obj.collection:
        # 2.3
        if not obj.entity.collection:
            raise ValueError(
                "Created Data has collection {} assigned, but it is added to entity {} that is not "
                "inside this collection.".format(obj.collection, obj.entity))
        # 2.4
        assert obj.collection == obj.entity.collection

    # 1.4
    if not obj.collection and obj.entity and obj.entity.collection:
        obj.collection = obj.entity.collection
        obj.tags = obj.entity.collection.tags
        obj.save()
    # 2.2
    if entity_operation == HandleEntityOperation.CREATE and obj.collection:
        obj.entity.collection = obj.collection
        obj.entity.tags = obj.collection.tags
        obj.entity.save()
        copy_permissions(obj.collection, obj.entity)
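# A hedged illustration of case 2.2 from the decision matrix above (all
# names are hypothetical): when _handle_entity() has just created an entity
# for a Data object that already has a collection, _handle_collection()
# pulls the new entity into that same collection and copies its permissions.
data = Data.objects.create(
    contributor=user,            # hypothetical user
    process=sample_process,      # hypothetical process with entity.type set
    collection=collection,       # hypothetical collection
    input={},
)
assert data.entity.collection == collection
assert data.entity.tags == collection.tags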
def create_entity(self):
    """Create entity if `flow_collection` is defined in process.

    The following rules apply for adding `Data` object to `Entity`:

    * Only add `Data object` to `Entity` if process has defined
      `flow_collection` field
    * Add object to existing `Entity`, if all parents that are part
      of it (but not necessarily all parents), are part of the same
      `Entity`
    * If parents belong to different `Entities` or do not belong to
      any `Entity`, create new `Entity`

    """
    ds_slug = self.process.flow_collection  # pylint: disable=no-member
    if ds_slug:
        entity_query = Entity.objects.filter(
            data__in=self.parents.all()).distinct()  # pylint: disable=no-member

        if entity_query.count() == 1:
            entity = entity_query.first()
            copy_permissions(entity, self)
        else:
            descriptor_schema = DescriptorSchema.objects.filter(
                slug=ds_slug).latest()
            entity = Entity.objects.create(
                contributor=self.contributor,
                descriptor_schema=descriptor_schema,
                name=self.name,
                tags=self.tags,
            )
            assign_contributor_permissions(entity)

        entity.data.add(self)

        # Inherit collections from entity.
        for collection in entity.collections.all():
            collection.data.add(self)
def perform_create(self, serializer):
    """Create a resource."""
    with transaction.atomic():
        instance = serializer.save()
        assign_contributor_permissions(instance)

        # Entity is added to the collection only when it is
        # created - when it only contains 1 Data object.
        entities = Entity.objects.annotate(num_data=Count('data')).filter(
            data=instance, num_data=1)

        # Assign data object to all specified collections.
        collection_pks = self.request.data.get('collections', [])
        for collection in Collection.objects.filter(pk__in=collection_pks):
            collection.data.add(instance)
            copy_permissions(collection, instance)

            # Add entities to which data belongs to the collection.
            for entity in entities:
                entity.collections.add(collection)
                copy_permissions(collection, entity)
def duplicate(self, contributor=None, inherit_collection=False):
    """Duplicate (make a copy)."""
    duplicate = Entity.objects.get(id=self.id)
    duplicate.pk = None
    duplicate.slug = None
    duplicate.name = "Copy of {}".format(self.name)
    duplicate.duplicated = now()
    if contributor:
        duplicate.contributor = contributor

    duplicate.collection = None
    if inherit_collection:
        if not contributor.has_perm("edit_collection", self.collection):
            raise ValidationError(
                "You do not have edit permission on collection {}.".format(
                    self.collection))
        duplicate.collection = self.collection

    duplicate.save(force_insert=True)
    assign_contributor_permissions(duplicate)

    # Override fields that are automatically set on create.
    duplicate.created = self.created
    duplicate.save()

    # Duplicate entity's data objects.
    data = get_objects_for_user(contributor, "view_data", self.data.all())
    duplicated_data = data.duplicate(contributor, inherit_collection=inherit_collection)
    duplicate.data.add(*duplicated_data)

    # Permissions
    assign_contributor_permissions(duplicate)
    copy_permissions(duplicate.collection, duplicate)

    return duplicate
def duplicate(self, contributor=None, inherit_collections=False):
    """Duplicate (make a copy)."""
    duplicate = Entity.objects.get(id=self.id)
    duplicate.pk = None
    duplicate.slug = None
    duplicate.name = 'Copy of {}'.format(self.name)
    duplicate.duplicated = now()
    if contributor:
        duplicate.contributor = contributor

    duplicate.save(force_insert=True)
    assign_contributor_permissions(duplicate)

    # Override fields that are automatically set on create.
    duplicate.created = self.created
    duplicate.save()

    # Duplicate entity's data objects.
    data = get_objects_for_user(contributor, 'view_data', self.data.all())  # pylint: disable=no-member
    duplicated_data = data.duplicate(contributor)
    duplicate.data.add(*duplicated_data)

    if inherit_collections:
        collections = get_objects_for_user(
            contributor, 'add_collection', self.collections.all()  # pylint: disable=no-member
        )
        for collection in collections:
            collection.entity_set.add(duplicate)
            copy_permissions(collection, duplicate)

            collection.data.add(*duplicated_data)
            for datum in duplicated_data:
                copy_permissions(collection, datum)

    return duplicate
def _handle_entity(obj):
    """Create entity if `entity.type` is defined in process.

    The following rules apply for adding `Data` object to `Entity`:

    * Only add `Data object` to `Entity` if process has defined
      `entity.type` field
    * Create new entity if parents do not belong to any `Entity`
    * Add object to existing `Entity`, if all parents that are part
      of it (but not necessarily all parents), are part of the same
      `Entity`
    * If parents belong to different `Entities`, don't do anything

    """
    entity_type = obj.process.entity_type
    entity_descriptor_schema = obj.process.entity_descriptor_schema
    entity_input = obj.process.entity_input
    entity_always_create = obj.process.entity_always_create
    operation = HandleEntityOperation.PASS

    if entity_type:
        data_filter = {}
        if entity_input:
            input_id = dict_dot(obj.input, entity_input, default=lambda: None)
            if input_id is None:
                logger.warning("Skipping creation of entity due to missing input.")
                return
            if isinstance(input_id, int):
                data_filter["data__pk"] = input_id
            elif isinstance(input_id, list):
                data_filter["data__pk__in"] = input_id
            else:
                raise ValueError(
                    "Cannot create entity due to invalid value of field {}.".format(entity_input))
        else:
            data_filter["data__in"] = obj.parents.all()

        entity_query = Entity.objects.filter(type=entity_type, **data_filter).distinct()
        entity_count = entity_query.count()

        if entity_count == 0 or entity_always_create:
            descriptor_schema = DescriptorSchema.objects.filter(
                slug=entity_descriptor_schema).latest()
            entity = Entity.objects.create(
                contributor=obj.contributor,
                descriptor_schema=descriptor_schema,
                type=entity_type,
                name=obj.name,
                tags=obj.tags,
            )
            assign_contributor_permissions(entity)
            operation = HandleEntityOperation.CREATE

        elif entity_count == 1:
            entity = entity_query.first()
            obj.tags = entity.tags
            copy_permissions(entity, obj)
            operation = HandleEntityOperation.ADD

        else:
            logger.info("Skipping creation of entity due to multiple entities found.")
            entity = None

        if entity:
            obj.entity = entity
            obj.save()

    return operation
def register_processes(self, process_schemas, user, force=False, verbosity=1):
    """Read and register processors."""
    log_processors = []
    log_templates = []

    for p in process_schemas:
        # TODO: Remove this when all processes are migrated to the
        # new syntax.
        if 'flow_collection' in p:
            if 'entity' in p:
                self.stderr.write(
                    "Skip processor {}: only one of 'flow_collection' and 'entity' fields "
                    "allowed".format(p['slug'])
                )
                continue

            p['entity'] = {'type': p.pop('flow_collection')}

        if p['type'][-1] != ':':
            p['type'] += ':'

        if 'category' in p and not p['category'].endswith(':'):
            p['category'] += ':'

        for field in ['input', 'output']:
            for schema, _, _ in iterate_schema({}, p[field] if field in p else {}):
                if not schema['type'][-1].endswith(':'):
                    schema['type'] += ':'

        # TODO: Check if schemas validate with our JSON meta schema and Processor model docs.
        if not self.valid(p, PROCESSOR_SCHEMA):
            continue

        if 'entity' in p:
            if 'type' not in p['entity']:
                self.stderr.write(
                    "Skip process {}: 'entity.type' required if 'entity' defined".format(p['slug'])
                )
                continue

            p['entity_type'] = p['entity']['type']
            p['entity_descriptor_schema'] = p['entity'].get('descriptor_schema', p['entity_type'])
            p['entity_input'] = p['entity'].get('input', None)
            p.pop('entity')

            if not DescriptorSchema.objects.filter(slug=p['entity_descriptor_schema']).exists():
                self.stderr.write(
                    "Skip processor {}: Unknown descriptor schema '{}' used in 'entity' "
                    "field.".format(p['slug'], p['entity_descriptor_schema'])
                )
                continue

        if 'persistence' in p:
            persistence_mapping = {
                'RAW': Process.PERSISTENCE_RAW,
                'CACHED': Process.PERSISTENCE_CACHED,
                'TEMP': Process.PERSISTENCE_TEMP,
            }

            p['persistence'] = persistence_mapping[p['persistence']]

        if 'scheduling_class' in p:
            scheduling_class_mapping = {
                'interactive': Process.SCHEDULING_CLASS_INTERACTIVE,
                'batch': Process.SCHEDULING_CLASS_BATCH
            }

            p['scheduling_class'] = scheduling_class_mapping[p['scheduling_class']]

        if 'input' in p:
            p['input_schema'] = p.pop('input')

        if 'output' in p:
            p['output_schema'] = p.pop('output')

        slug = p['slug']

        if 'run' in p:
            # Set default language to 'bash' if not set.
            p['run'].setdefault('language', 'bash')

            # Transform output schema using the execution engine.
            try:
                execution_engine = manager.get_execution_engine(p['run']['language'])
                extra_output_schema = execution_engine.get_output_schema(p)
                if extra_output_schema:
                    p.setdefault('output_schema', []).extend(extra_output_schema)
            except InvalidEngineError:
                self.stderr.write("Skip processor {}: execution engine '{}' not supported".format(
                    slug, p['run']['language']
                ))
                continue

        # Validate if container image is allowed based on the configured pattern.
        # NOTE: This validation happens here and is not deferred to executors because the idea
        #       is that this will be moved to a "container" requirement independent of the
        #       executor.
        if hasattr(settings, 'FLOW_CONTAINER_VALIDATE_IMAGE'):
            try:
                container_image = dict_dot(p, 'requirements.executor.docker.image')
                if not re.match(settings.FLOW_CONTAINER_VALIDATE_IMAGE, container_image):
                    self.stderr.write("Skip processor {}: container image does not match '{}'".format(
                        slug, settings.FLOW_CONTAINER_VALIDATE_IMAGE,
                    ))
                    continue
            except KeyError:
                pass

        version = p['version']
        int_version = convert_version_string_to_int(version, VERSION_NUMBER_BITS)

        # `latest version` is returned as `int` so it has to be compared to `int_version`
        latest_version = Process.objects.filter(slug=slug).aggregate(Max('version'))['version__max']
        if latest_version is not None and latest_version > int_version:
            self.stderr.write("Skip processor {}: newer version installed".format(slug))
            continue

        previous_process_qs = Process.objects.filter(slug=slug)
        if previous_process_qs.exists():
            previous_process = previous_process_qs.latest()
        else:
            previous_process = None

        process_query = Process.objects.filter(slug=slug, version=version)
        if process_query.exists():
            if not force:
                if verbosity > 0:
                    self.stdout.write("Skip processor {}: same version installed".format(slug))
                continue

            process_query.update(**p)
            log_processors.append("Updated {}".format(slug))
        else:
            process = Process.objects.create(contributor=user, **p)
            assign_contributor_permissions(process)
            if previous_process:
                copy_permissions(previous_process, process)
            log_processors.append("Inserted {}".format(slug))

    if verbosity > 0:
        if log_processors:
            self.stdout.write("Processor Updates:")
            for log in log_processors:
                self.stdout.write("  {}".format(log))

        if log_templates:
            self.stdout.write("Default Template Updates:")
            for log in log_templates:
                self.stdout.write("  {}".format(log))
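# For reference, a hedged sketch of the ``entity`` block that this
# registration code unpacks from a process schema; the slug, type, and
# field values are illustrative, not taken from any real process:
process_schema = {
    'slug': 'my-process',
    'type': 'data:example:',
    'version': '1.0.0',
    'entity': {
        'type': 'sample',               # becomes entity_type (required)
        'descriptor_schema': 'sample',  # defaults to entity.type when omitted
        'input': 'src',                 # becomes entity_input (optional)
    },
}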
def create_entity(self):
    """Create entity if `flow_collection` is defined in process.

    The following rules apply for adding `Data` object to `Entity`:

    * Only add `Data object` to `Entity` if process has defined
      `flow_collection` field
    * Add object to existing `Entity`, if all parents that are part
      of it (but not necessarily all parents), are part of the same
      `Entity`
    * If parents belong to different `Entities` or do not belong to
      any `Entity`, create new `Entity`

    """
    entity_type = self.process.entity_type  # pylint: disable=no-member
    entity_descriptor_schema = self.process.entity_descriptor_schema  # pylint: disable=no-member
    entity_input = self.process.entity_input  # pylint: disable=no-member

    if entity_type:
        data_filter = {}
        if entity_input:
            input_id = dict_dot(self.input, entity_input, default=lambda: None)
            if input_id is None:
                logger.warning("Skipping creation of entity due to missing input.")
                return
            if isinstance(input_id, int):
                data_filter['data__pk'] = input_id
            elif isinstance(input_id, list):
                data_filter['data__pk__in'] = input_id
            else:
                raise ValueError(
                    "Cannot create entity due to invalid value of field {}.".format(entity_input))
        else:
            data_filter['data__in'] = self.parents.all()  # pylint: disable=no-member

        entity_query = Entity.objects.filter(type=entity_type, **data_filter).distinct()
        entity_count = entity_query.count()

        if entity_count == 0:
            descriptor_schema = DescriptorSchema.objects.filter(
                slug=entity_descriptor_schema).latest()
            entity = Entity.objects.create(
                contributor=self.contributor,
                descriptor_schema=descriptor_schema,
                type=entity_type,
                name=self.name,
                tags=self.tags,
            )
            assign_contributor_permissions(entity)

        elif entity_count == 1:
            entity = entity_query.first()
            copy_permissions(entity, self)

        else:
            logger.info("Skipping creation of entity due to multiple entities found.")
            entity = None

        if entity:
            entity.data.add(self)

            # Inherit collections from entity.
            for collection in entity.collections.all():
                collection.data.add(self)
def register_processes(self, process_schemas, user, force=False, verbosity=1):
    """Read and register processors."""
    log_processors = []
    log_templates = []

    for p in process_schemas:
        if p['type'][-1] != ':':
            p['type'] += ':'

        if 'category' in p and not p['category'].endswith(':'):
            p['category'] += ':'

        for field in ['input', 'output']:
            for schema, _, _ in iterate_schema({}, p[field] if field in p else {}):
                if not schema['type'][-1].endswith(':'):
                    schema['type'] += ':'

        # TODO: Check if schemas validate with our JSON meta schema and Processor model docs.
        if not self.valid(p, PROCESSOR_SCHEMA):
            continue

        if 'persistence' in p:
            persistence_mapping = {
                'RAW': Process.PERSISTENCE_RAW,
                'CACHED': Process.PERSISTENCE_CACHED,
                'TEMP': Process.PERSISTENCE_TEMP,
            }

            p['persistence'] = persistence_mapping[p['persistence']]

        if 'scheduling_class' in p:
            scheduling_class_mapping = {
                'interactive': Process.SCHEDULING_CLASS_INTERACTIVE,
                'batch': Process.SCHEDULING_CLASS_BATCH
            }

            p['scheduling_class'] = scheduling_class_mapping[p['scheduling_class']]

        if 'input' in p:
            p['input_schema'] = p.pop('input')

        if 'output' in p:
            p['output_schema'] = p.pop('output')

        slug = p['slug']

        if 'run' in p:
            # Set default language to 'bash' if not set.
            p['run'].setdefault('language', 'bash')

            # Transform output schema using the execution engine.
            try:
                execution_engine = manager.get_execution_engine(p['run']['language'])
                extra_output_schema = execution_engine.get_output_schema(p)
                if extra_output_schema:
                    p.setdefault('output_schema', []).extend(extra_output_schema)
            except InvalidEngineError:
                self.stderr.write(
                    "Skip processor {}: execution engine '{}' not supported".format(
                        slug, p['run']['language']))
                continue

        # Validate if container image is allowed based on the configured pattern.
        # NOTE: This validation happens here and is not deferred to executors because the idea
        #       is that this will be moved to a "container" requirement independent of the
        #       executor.
        if hasattr(settings, 'FLOW_CONTAINER_VALIDATE_IMAGE'):
            try:
                container_image = dict_dot(p, 'requirements.executor.docker.image')
                if not re.match(settings.FLOW_CONTAINER_VALIDATE_IMAGE, container_image):
                    self.stderr.write(
                        "Skip processor {}: container image does not match '{}'".format(
                            slug, settings.FLOW_CONTAINER_VALIDATE_IMAGE))
                    continue
            except KeyError:
                pass

        version = p['version']
        int_version = convert_version_string_to_int(version, VERSION_NUMBER_BITS)

        # `latest version` is returned as `int` so it has to be compared to `int_version`
        latest_version = Process.objects.filter(slug=slug).aggregate(Max('version'))['version__max']
        if latest_version is not None and latest_version > int_version:
            self.stderr.write("Skip processor {}: newer version installed".format(slug))
            continue

        previous_process_qs = Process.objects.filter(slug=slug)
        if previous_process_qs.exists():
            previous_process = previous_process_qs.latest()
        else:
            previous_process = None

        process_query = Process.objects.filter(slug=slug, version=version)
        if process_query.exists():
            if not force:
                if verbosity > 0:
                    self.stdout.write("Skip processor {}: same version installed".format(slug))
                continue

            process_query.update(**p)
            log_processors.append("Updated {}".format(slug))
        else:
            process = Process.objects.create(contributor=user, **p)
            assign_contributor_permissions(process)
            if previous_process:
                copy_permissions(previous_process, process)
            log_processors.append("Inserted {}".format(slug))

    if verbosity > 0:
        if log_processors:
            self.stdout.write("Processor Updates:")
            for log in log_processors:
                self.stdout.write("  {}".format(log))

        if log_templates:
            self.stdout.write("Default Template Updates:")
            for log in log_templates:
                self.stdout.write("  {}".format(log))
def copy_objects(objects, contributor, name_prefix, obj_processor=None):
    """Make a copy of given queryset.

    Shallow copy given queryset and set contributor to the given value,
    prepend name with the prefix, set slug to a unique value, and set
    ``duplicated`` date to the current time. Special attention is paid
    to keep the ``created`` date at its original value.

    If ``obj_processor`` function is given, each object is passed to it
    and the return value is used in its place.

    :param objects: A queryset to be copied.
    :type objects: `~resolwe.flow.models.base.BaseQuerySet`

    :param contributor: A Django user that will be assigned to copied
        objects as contributor.
    :type contributor: `~django.contrib.auth.models.User`

    :param str name_prefix: A prefix that will be prepended to the name
        of copied objects.
    """
    first = objects.first()
    if not first:
        return objects

    name_max_length = first._meta.get_field("name").max_length
    model = first._meta.model

    new_objects = []
    for obj in objects:
        new_obj = deepcopy(obj)
        new_obj.pk = None
        new_obj.slug = None
        new_obj.contributor = contributor
        new_obj.name = "{} {}".format(name_prefix, obj.name)
        new_obj._container_attributes = dict()

        if len(new_obj.name) > name_max_length:
            new_obj.name = "{}...".format(new_obj.name[:name_max_length - 3])

        if obj_processor:
            new_obj = obj_processor(new_obj)

        new_objects.append(new_obj)

    try:
        # Add another atomic block to avoid corrupting the main one.
        with transaction.atomic():
            model.objects.bulk_create(new_objects)
            # Send the bulk create custom signal, avoid circular import.
            from resolwe.flow.signals import post_duplicate
            post_duplicate.send(sender=model, instances=new_objects, old_instances=objects)
    except IntegrityError:
        # Probably a slug collision occurred, try to create objects one by one.
        for obj in new_objects:
            obj.slug = None
            # Call the parent method to skip pre-processing and validation.
            models.Model.save(obj)

    object_permission_group = dict()
    not_in_container = list()

    for old, new in zip(objects, new_objects):
        new.created = old.created
        new.duplicated = timezone.now()
        # Deal with permissions. When object is in container fix the pointer
        # to permission_group object.
        # When object is not in container a new PermissionGroup proxy object
        # must be created, assigned to the new object, and permissions copied
        # from the old object to the new one.
        if getattr(new, "collection_id", None) or getattr(new, "entity_id", None):
            new.permission_group = new.topmost_container.permission_group
        else:
            not_in_container.append((new, old))
            object_permission_group[new] = PermissionGroup()

    PermissionGroup.objects.bulk_create(object_permission_group.values())
    for new, old in not_in_container:
        new.permission_group = object_permission_group[new]
        copy_permissions(old, new)
        assign_contributor_permissions(new, contributor)

    model.objects.bulk_update(new_objects, ["created", "duplicated", "permission_group"])

    return new_objects
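# A hedged usage sketch for ``copy_objects`` above; the queryset, the user,
# and ``selected_ids`` are illustrative. The optional ``obj_processor``
# callback can adjust each shallow copy before it is bulk-created, e.g. to
# reset a relation:
def _detach_collection(obj):
    """Detach the copy from its collection before it is saved."""
    obj.collection = None
    return obj

copies = copy_objects(
    Data.objects.filter(pk__in=selected_ids),  # `selected_ids` is an assumption
    contributor=request.user,
    name_prefix="Copy of",
    obj_processor=_detach_collection,
)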
def register_descriptors(self, descriptor_schemas, user, force=False, verbosity=1):
    """Read and register descriptors."""
    log_descriptors = []

    for descriptor_schema in descriptor_schemas:
        for schema, _, _ in iterate_schema({}, descriptor_schema.get("schema", {})):
            if not schema["type"][-1].endswith(":"):
                schema["type"] += ":"

        if "schema" not in descriptor_schema:
            descriptor_schema["schema"] = []

        if not self.valid(descriptor_schema, DESCRIPTOR_SCHEMA):
            continue

        slug = descriptor_schema["slug"]
        version = descriptor_schema.get("version", "0.0.0")
        int_version = convert_version_string_to_int(version, VERSION_NUMBER_BITS)

        # `latest version` is returned as `int` so it has to be compared to `int_version`
        latest_version = DescriptorSchema.objects.filter(slug=slug).aggregate(Max("version"))["version__max"]
        if latest_version is not None and latest_version > int_version:
            self.stderr.write("Skip descriptor schema {}: newer version installed".format(slug))
            continue

        previous_descriptor_qs = DescriptorSchema.objects.filter(slug=slug)
        if previous_descriptor_qs.exists():
            previous_descriptor = previous_descriptor_qs.latest()
        else:
            previous_descriptor = None

        descriptor_query = DescriptorSchema.objects.filter(slug=slug, version=version)
        if descriptor_query.exists():
            if not force:
                if verbosity > 0:
                    self.stdout.write("Skip descriptor schema {}: same version installed".format(slug))
                continue

            descriptor_query.update(**descriptor_schema)
            log_descriptors.append("Updated {}".format(slug))
        else:
            descriptor = DescriptorSchema.objects.create(contributor=user, **descriptor_schema)
            assign_contributor_permissions(descriptor)
            if previous_descriptor:
                copy_permissions(previous_descriptor, descriptor)
            log_descriptors.append("Inserted {}".format(slug))

    if log_descriptors and verbosity > 0:
        self.stdout.write("Descriptor schemas Updates:")
        for log in log_descriptors:
            self.stdout.write("  {}".format(log))
def register_processes(self, process_schemas, user, force=False, verbosity=1):
    """Read and register processors."""
    log_processors = []
    log_templates = []

    for p in process_schemas:
        # TODO: Remove this when all processes are migrated to the
        # new syntax.
        if "flow_collection" in p:
            if "entity" in p:
                self.stderr.write(
                    "Skip processor {}: only one of 'flow_collection' and 'entity' fields "
                    "allowed".format(p["slug"]))
                continue

            p["entity"] = {"type": p.pop("flow_collection")}

        if p["type"][-1] != ":":
            p["type"] += ":"

        if "category" in p and not p["category"].endswith(":"):
            p["category"] += ":"

        for field in ["input", "output"]:
            for schema, _, _ in iterate_schema({}, p[field] if field in p else {}):
                if not schema["type"][-1].endswith(":"):
                    schema["type"] += ":"

        # TODO: Check if schemas validate with our JSON meta schema and Processor model docs.
        if not self.valid(p, PROCESSOR_SCHEMA):
            continue

        if "entity" in p:
            if "type" not in p["entity"]:
                self.stderr.write(
                    "Skip process {}: 'entity.type' required if 'entity' defined".format(p["slug"]))
                continue
            if "input" in p["entity"] and p["entity"].get("always_create", False):
                self.stderr.write(
                    "Skip process {}: 'entity.input' will not be considered if 'entity.always_create' "
                    "is set to true.".format(p["slug"]))
                continue

            p["entity_type"] = p["entity"]["type"]
            p["entity_descriptor_schema"] = p["entity"].get("descriptor_schema", p["entity_type"])
            p["entity_input"] = p["entity"].get("input", None)
            p["entity_always_create"] = p["entity"].get("always_create", False)
            p.pop("entity")

            if not DescriptorSchema.objects.filter(slug=p["entity_descriptor_schema"]).exists():
                self.stderr.write(
                    "Skip processor {}: Unknown descriptor schema '{}' used in 'entity' "
                    "field.".format(p["slug"], p["entity_descriptor_schema"]))
                continue

        if "persistence" in p:
            persistence_mapping = {
                "RAW": Process.PERSISTENCE_RAW,
                "CACHED": Process.PERSISTENCE_CACHED,
                "TEMP": Process.PERSISTENCE_TEMP,
            }

            p["persistence"] = persistence_mapping[p["persistence"]]

        if "scheduling_class" in p:
            scheduling_class_mapping = {
                "interactive": Process.SCHEDULING_CLASS_INTERACTIVE,
                "batch": Process.SCHEDULING_CLASS_BATCH,
            }

            p["scheduling_class"] = scheduling_class_mapping[p["scheduling_class"]]

        if "input" in p:
            p["input_schema"] = p.pop("input")

        if "output" in p:
            p["output_schema"] = p.pop("output")

        slug = p["slug"]

        if "run" in p:
            # Set default language to 'bash' if not set.
            p["run"].setdefault("language", "bash")

            # Transform output schema using the execution engine.
            try:
                execution_engine = manager.get_execution_engine(p["run"]["language"])
                extra_output_schema = execution_engine.get_output_schema(p)
                if extra_output_schema:
                    p.setdefault("output_schema", []).extend(extra_output_schema)
            except InvalidEngineError:
                self.stderr.write(
                    "Skip processor {}: execution engine '{}' not supported".format(
                        slug, p["run"]["language"]))
                continue

        # Validate if container image is allowed based on the configured pattern.
        # NOTE: This validation happens here and is not deferred to executors because the idea
        #       is that this will be moved to a "container" requirement independent of the
        #       executor.
        if hasattr(settings, "FLOW_CONTAINER_VALIDATE_IMAGE"):
            try:
                container_image = dict_dot(p, "requirements.executor.docker.image")
                if not re.match(settings.FLOW_CONTAINER_VALIDATE_IMAGE, container_image):
                    self.stderr.write(
                        "Skip processor {}: container image does not match '{}'".format(
                            slug, settings.FLOW_CONTAINER_VALIDATE_IMAGE))
                    continue
            except KeyError:
                pass

        version = p["version"]
        int_version = convert_version_string_to_int(version, VERSION_NUMBER_BITS)

        # `latest version` is returned as `int` so it has to be compared to `int_version`
        latest_version = Process.objects.filter(slug=slug).aggregate(Max("version"))["version__max"]
        if latest_version is not None and latest_version > int_version:
            self.stderr.write("Skip processor {}: newer version installed".format(slug))
            continue

        previous_process_qs = Process.objects.filter(slug=slug)
        if previous_process_qs.exists():
            previous_process = previous_process_qs.latest()
        else:
            previous_process = None

        process_query = Process.objects.filter(slug=slug, version=version)
        if process_query.exists():
            if not force:
                if verbosity > 0:
                    self.stdout.write("Skip processor {}: same version installed".format(slug))
                continue

            process_query.update(**p)
            log_processors.append("Updated {}".format(slug))
        else:
            process = Process.objects.create(contributor=user, **p)
            assign_contributor_permissions(process)
            if previous_process:
                copy_permissions(previous_process, process)
            log_processors.append("Inserted {}".format(slug))

    if verbosity > 0:
        if log_processors:
            self.stdout.write("Processor Updates:")
            for log in log_processors:
                self.stdout.write("  {}".format(log))

        if log_templates:
            self.stdout.write("Default Template Updates:")
            for log in log_templates:
                self.stdout.write("  {}".format(log))
def evaluate(self, data):
    """Evaluate the code needed to compute a given Data object."""
    expression_engine = data.process.requirements.get('expression-engine', None)
    if expression_engine is not None:
        expression_engine = self.get_expression_engine(expression_engine)

    # Parse steps.
    steps = data.process.run.get('program', None)
    if steps is None:
        return

    if not isinstance(steps, list):
        raise ExecutionError('Workflow program must be a list of steps.')

    # Expression engine evaluation context.
    context = {
        'input': data.input,
        'steps': collections.OrderedDict(),
    }

    for index, step in enumerate(steps):
        try:
            step_id = step['id']
            step_slug = step['run']
        except KeyError as error:
            raise ExecutionError('Incorrect definition of step "{}", missing property "{}".'.format(
                step.get('id', index), error
            ))

        # Fetch target process.
        process = Process.objects.filter(slug=step_slug).order_by('-version').first()
        if not process:
            raise ExecutionError('Incorrect definition of step "{}", invalid process "{}".'.format(
                step_id, step_slug
            ))

        # Process all input variables.
        step_input = step.get('input', {})
        if not isinstance(step_input, dict):
            raise ExecutionError('Incorrect definition of step "{}", input must be a dictionary.'.format(
                step_id
            ))

        data_input = self._evaluate_expressions(expression_engine, step_id, step_input, context)

        # Create the data object.
        data_object = Data.objects.create(
            process=process,
            contributor=data.contributor,
            tags=data.tags,
            input=data_input,
        )
        DataDependency.objects.create(
            parent=data,
            child=data_object,
            kind=DataDependency.KIND_SUBPROCESS,
        )

        # Copy permissions.
        copy_permissions(data, data_object)

        # Copy collections.
        for collection in data.collection_set.all():
            collection.data.add(data_object)

        context['steps'][step_id] = data_object.pk

    # Immediately set our status to done and output all data object identifiers.
    data.output = {
        'steps': list(context['steps'].values()),
    }
    data.status = Data.STATUS_DONE
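# For reference, a minimal sketch of the ``run.program`` structure this
# evaluator expects: a list of steps, each with an ``id``, a process slug
# in ``run``, and an optional ``input`` dictionary whose values may refer
# to earlier steps through the expression engine. The slugs and the
# ``steps.*`` template syntax here are illustrative assumptions:
program = [
    {
        'id': 'align',
        'run': 'alignment-process',
        'input': {'reads': '{{ input.reads }}'},
    },
    {
        'id': 'quantify',
        'run': 'quantification-process',
        'input': {'alignment': '{{ steps.align }}'},
    },
]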
def register_descriptors(self, descriptor_schemas, user, force=False, verbosity=1):
    """Read and register descriptors."""
    log_descriptors = []

    for descriptor_schema in descriptor_schemas:
        for field in ['var', 'schema']:
            for schema, _, _ in iterate_schema({}, descriptor_schema.get(field, {})):
                if not schema['type'][-1].endswith(':'):
                    schema['type'] += ':'

        # support backward compatibility
        # TODO: update .yml files and remove
        if 'slug' not in descriptor_schema:
            descriptor_schema['slug'] = slugify(
                descriptor_schema.pop('name').replace(':', '-'))
            descriptor_schema['name'] = descriptor_schema.pop('label')

        if 'schema' not in descriptor_schema:
            descriptor_schema['schema'] = []

        if 'static' in descriptor_schema:
            descriptor_schema['schema'].extend(descriptor_schema.pop('static'))
        if 'var' in descriptor_schema:
            descriptor_schema['schema'].extend(descriptor_schema.pop('var'))

        if not self.valid(descriptor_schema, DESCRIPTOR_SCHEMA):
            continue

        slug = descriptor_schema['slug']
        version = descriptor_schema.get('version', '0.0.0')
        int_version = convert_version_string_to_int(version, VERSION_NUMBER_BITS)

        # `latest version` is returned as `int` so it has to be compared to `int_version`
        latest_version = DescriptorSchema.objects.filter(slug=slug).aggregate(Max('version'))['version__max']
        if latest_version is not None and latest_version > int_version:
            self.stderr.write("Skip descriptor schema {}: newer version installed".format(slug))
            continue

        previous_descriptor_qs = DescriptorSchema.objects.filter(slug=slug)
        if previous_descriptor_qs.exists():
            previous_descriptor = previous_descriptor_qs.latest()
        else:
            previous_descriptor = None

        descriptor_query = DescriptorSchema.objects.filter(slug=slug, version=version)
        if descriptor_query.exists():
            if not force:
                if verbosity > 0:
                    self.stdout.write("Skip descriptor schema {}: same version installed".format(slug))
                continue

            descriptor_query.update(**descriptor_schema)
            log_descriptors.append("Updated {}".format(slug))
        else:
            descriptor = DescriptorSchema.objects.create(contributor=user, **descriptor_schema)
            if previous_descriptor:
                copy_permissions(previous_descriptor, descriptor)
            log_descriptors.append("Inserted {}".format(slug))

    if len(log_descriptors) > 0 and verbosity > 0:
        self.stdout.write("Descriptor schemas Updates:")
        for log in log_descriptors:
            self.stdout.write("  {}".format(log))
def register_processes(self, process_schemas, user, force=False, verbosity=1):
    """Read and register processors."""
    log_processors = []
    log_templates = []

    for p in process_schemas:
        if p['type'][-1] != ':':
            p['type'] += ':'

        if 'category' in p and not p['category'].endswith(':'):
            p['category'] += ':'

        # get `data_name` from `static`
        if 'static' in p:
            for schema, _, _ in iterate_schema({}, p['static']):
                if schema['name'] == 'name' and 'default' in schema:
                    p['data_name'] = schema['default']

        # support backward compatibility
        # TODO: update .yml files and remove
        if 'slug' not in p:
            p['slug'] = slugify(p.pop('name').replace(':', '-'))
            p['name'] = p.pop('label')

            p.pop('var', None)
            p.pop('static', None)

        for field in ['input', 'output', 'var', 'static']:
            for schema, _, _ in iterate_schema({}, p[field] if field in p else {}):
                if not schema['type'][-1].endswith(':'):
                    schema['type'] += ':'

        # TODO: Check if schemas validate with our JSON meta schema and Processor model docs.
        if not self.valid(p, PROCESSOR_SCHEMA):
            continue

        if 'persistence' in p:
            persistence_mapping = {
                'RAW': Process.PERSISTENCE_RAW,
                'CACHED': Process.PERSISTENCE_CACHED,
                'TEMP': Process.PERSISTENCE_TEMP,
            }

            p['persistence'] = persistence_mapping[p['persistence']]

        if 'input' in p:
            p['input_schema'] = p.pop('input')

        if 'output' in p:
            p['output_schema'] = p.pop('output')

        slug = p['slug']

        if 'run' in p:
            # Set default language to 'bash' if not set.
            p['run'].setdefault('language', 'bash')

            # Transform output schema using the execution engine.
            try:
                execution_engine = manager.get_execution_engine(p['run']['language'])
                extra_output_schema = execution_engine.get_output_schema(p)
                if extra_output_schema:
                    p.setdefault('output_schema', []).extend(extra_output_schema)
            except InvalidEngineError:
                self.stderr.write(
                    "Skip processor {}: execution engine '{}' not supported".format(
                        slug, p['run']['language']))
                continue

        version = p['version']
        int_version = convert_version_string_to_int(version, VERSION_NUMBER_BITS)

        # `latest version` is returned as `int` so it has to be compared to `int_version`
        latest_version = Process.objects.filter(slug=slug).aggregate(Max('version'))['version__max']
        if latest_version is not None and latest_version > int_version:
            self.stderr.write("Skip processor {}: newer version installed".format(slug))
            continue

        previous_process_qs = Process.objects.filter(slug=slug)
        if previous_process_qs.exists():
            previous_process = previous_process_qs.latest()
        else:
            previous_process = None

        process_query = Process.objects.filter(slug=slug, version=version)
        if process_query.exists():
            if not force:
                if verbosity > 0:
                    self.stdout.write("Skip processor {}: same version installed".format(slug))
                continue

            process_query.update(**p)
            log_processors.append("Updated {}".format(slug))
        else:
            process = Process.objects.create(contributor=user, **p)
            if previous_process:
                copy_permissions(previous_process, process)
            log_processors.append("Inserted {}".format(slug))

    if verbosity > 0:
        if len(log_processors) > 0:
            self.stdout.write("Processor Updates:")
            for log in log_processors:
                self.stdout.write("  {}".format(log))

        if len(log_templates) > 0:
            self.stdout.write("Default Template Updates:")
            for log in log_templates:
                self.stdout.write("  {}".format(log))
def create_entity(self):
    """Create entity if `flow_collection` is defined in process.

    The following rules apply for adding `Data` object to `Entity`:

    * Only add `Data object` to `Entity` if process has defined
      `flow_collection` field
    * Add object to existing `Entity`, if all parents that are part
      of it (but not necessarily all parents), are part of the same
      `Entity`
    * If parents belong to different `Entities` or do not belong to
      any `Entity`, create new `Entity`

    """
    entity_type = self.process.entity_type  # pylint: disable=no-member
    entity_descriptor_schema = self.process.entity_descriptor_schema  # pylint: disable=no-member
    entity_input = self.process.entity_input  # pylint: disable=no-member

    if entity_type:
        data_filter = {}
        if entity_input:
            input_id = dict_dot(self.input, entity_input, default=lambda: None)
            if input_id is None:
                logger.warning("Skipping creation of entity due to missing input.")
                return
            if isinstance(input_id, int):
                data_filter['data__pk'] = input_id
            elif isinstance(input_id, list):
                data_filter['data__pk__in'] = input_id
            else:
                raise ValueError(
                    "Cannot create entity due to invalid value of field {}.".format(entity_input)
                )
        else:
            data_filter['data__in'] = self.parents.all()  # pylint: disable=no-member

        entity_query = Entity.objects.filter(type=entity_type, **data_filter).distinct()
        entity_count = entity_query.count()

        if entity_count == 0:
            descriptor_schema = DescriptorSchema.objects.filter(
                slug=entity_descriptor_schema
            ).latest()
            entity = Entity.objects.create(
                contributor=self.contributor,
                descriptor_schema=descriptor_schema,
                type=entity_type,
                name=self.name,
                tags=self.tags,
            )
            assign_contributor_permissions(entity)

        elif entity_count == 1:
            entity = entity_query.first()
            copy_permissions(entity, self)

        else:
            logger.info("Skipping creation of entity due to multiple entities found.")
            entity = None

        if entity:
            entity.data.add(self)

            # Inherit collections from entity.
            for collection in entity.collections.all():
                collection.data.add(self)
def test_copy_perms_wrong_ctype(self):
    with self.assertRaises(AssertionError):
        copy_permissions(self.src_process, self.collection)
def evaluate(self, data):
    """Evaluate the code needed to compute a given Data object."""
    expression_engine = data.process.requirements.get('expression-engine', None)
    if expression_engine is not None:
        expression_engine = self.get_expression_engine(expression_engine)

    # Parse steps.
    steps = data.process.run.get('program', None)
    if steps is None:
        return

    if not isinstance(steps, list):
        raise ExecutionError('Workflow program must be a list of steps.')

    # Expression engine evaluation context.
    context = {
        'input': data.input,
        'steps': collections.OrderedDict(),
    }

    for index, step in enumerate(steps):
        try:
            step_id = step['id']
            step_slug = step['run']
        except KeyError as error:
            raise ExecutionError('Incorrect definition of step "{}", missing property "{}".'.format(
                step.get('id', index), error
            ))

        # Fetch target process.
        process = Process.objects.filter(slug=step_slug).order_by('-version').first()
        if not process:
            raise ExecutionError('Incorrect definition of step "{}", invalid process "{}".'.format(
                step_id, step_slug
            ))

        # Process all input variables.
        step_input = step.get('input', {})
        if not isinstance(step_input, dict):
            raise ExecutionError('Incorrect definition of step "{}", input must be a dictionary.'.format(
                step_id
            ))

        data_input = self._evaluate_expressions(expression_engine, step_id, step_input, context)

        # Create the data object.
        data_object = Data.objects.create(
            process=process,
            contributor=data.contributor,
            input=data_input,
        )
        DataDependency.objects.create(
            parent=data,
            child=data_object,
            kind=DataDependency.KIND_SUBPROCESS,
        )

        # Copy permissions.
        copy_permissions(data, data_object)

        # Copy collections.
        for collection in data.collection_set.all():
            collection.data.add(data_object)

        context['steps'][step_id] = data_object.pk

    # Immediately set our status to done and output all data object identifiers.
    data.output = {
        'steps': list(context['steps'].values()),
    }
    data.status = Data.STATUS_DONE
def run(self, data_id, script, verbosity=1):
    """Execute the script and save results."""
    if verbosity >= 1:
        print('RUN: {} {}'.format(data_id, script))

    self.data_id = data_id
    self.process_failed = False

    # Fetch data instance to get any executor requirements.
    self.process = Data.objects.get(pk=data_id).process
    requirements = self.process.requirements
    self.requirements = requirements.get('executor', {}).get(self.name, {})
    self.resources = requirements.get('resources', {})

    data_dir = settings.FLOW_EXECUTOR['DATA_DIR']
    dir_mode = getattr(settings, 'FLOW_EXECUTOR', {}).get('DATA_DIR_MODE', 0o755)

    output_path = os.path.join(data_dir, str(data_id))

    os.mkdir(output_path)
    # os.mkdir is not guaranteed to set the given mode
    os.chmod(output_path, dir_mode)
    os.chdir(output_path)

    log_file = open('stdout.txt', 'w+')
    json_file = open('jsonout.txt', 'w+')

    proc_pid = self.start()

    self.update_data_status(
        status=Data.STATUS_PROCESSING,
        started=now(),
        process_pid=proc_pid
    )

    # Run processor and handle intermediate results
    self.run_script(script)
    spawn_processors = []
    output = {}
    process_error, process_warning, process_info = [], [], []
    process_progress, process_rc = 0, 0

    # read processor output
    try:
        stdout = self.get_stdout()
        while True:
            line = stdout.readline()
            if not line:
                break

            try:
                if line.strip().startswith('run'):
                    # Save processor and spawn if no errors
                    log_file.write(line)
                    log_file.flush()

                    for obj in iterjson(line[3:].strip()):
                        spawn_processors.append(obj)
                elif line.strip().startswith('export'):
                    file_name = line[6:].strip()

                    export_folder = settings.FLOW_EXECUTOR['UPLOAD_DIR']
                    unique_name = 'export_{}'.format(uuid.uuid4().hex)
                    export_path = os.path.join(export_folder, unique_name)

                    self.exported_files_mapper[self.data_id][file_name] = unique_name

                    shutil.move(file_name, export_path)
                else:
                    # If JSON, save to MongoDB
                    updates = {}
                    for obj in iterjson(line):
                        for key, val in six.iteritems(obj):
                            if key.startswith('proc.'):
                                if key == 'proc.error':
                                    process_error.append(val)
                                    if not process_rc:
                                        process_rc = 1
                                        updates['process_rc'] = process_rc
                                    updates['process_error'] = process_error
                                    updates['status'] = Data.STATUS_ERROR
                                elif key == 'proc.warning':
                                    process_warning.append(val)
                                    updates['process_warning'] = process_warning
                                elif key == 'proc.info':
                                    process_info.append(val)
                                    updates['process_info'] = process_info
                                elif key == 'proc.rc':
                                    process_rc = int(val)
                                    updates['process_rc'] = process_rc
                                    if process_rc != 0:
                                        updates['status'] = Data.STATUS_ERROR
                                elif key == 'proc.progress':
                                    process_progress = int(float(val) * 100)
                                    updates['process_progress'] = process_progress
                            else:
                                dict_dot(output, key, val)
                                updates['output'] = output

                    if updates:
                        updates['modified'] = now()
                        self.update_data_status(**updates)

                    if process_rc > 0:
                        log_file.close()
                        json_file.close()
                        os.chdir(CWD)
                        return

                    # Debug output
                    # Not referenced in Data object
                    json_file.write(line)
                    json_file.flush()
            except ValueError as ex:
                # Ignore if not JSON
                log_file.write(line)
                log_file.flush()
    except MemoryError as ex:
        logger.error(__("Out of memory: {}", ex))
    except IOError as ex:
        # TODO: if ex.errno == 28: no more free space
        raise ex
    finally:
        # Store results
        log_file.close()
        json_file.close()
        os.chdir(CWD)

    return_code = self.end()

    if process_rc < return_code:
        process_rc = return_code

    # This transaction is needed to make sure that processing of
    # current data object is finished before manager for spawned
    # processes is triggered.
    with transaction.atomic():
        if spawn_processors and process_rc == 0:
            parent_data = Data.objects.get(pk=self.data_id)

            # Spawn processors
            for d in spawn_processors:
                d['contributor'] = parent_data.contributor
                d['process'] = Process.objects.filter(slug=d['process']).latest()

                for field_schema, fields in iterate_fields(d.get('input', {}), d['process'].input_schema):
                    type_ = field_schema['type']
                    name = field_schema['name']
                    value = fields[name]

                    if type_ == 'basic:file:':
                        fields[name] = self.hydrate_spawned_files(value, data_id)
                    elif type_ == 'list:basic:file:':
                        fields[name] = [self.hydrate_spawned_files(fn, data_id) for fn in value]

                with transaction.atomic():
                    d = Data.objects.create(**d)
                    DataDependency.objects.create(
                        parent=parent_data,
                        child=d,
                        kind=DataDependency.KIND_SUBPROCESS,
                    )

                    # Copy permissions.
                    copy_permissions(parent_data, d)

                    # Entity is added to the collection only when it is
                    # created - when it only contains 1 Data object.
                    entities = Entity.objects.filter(data=d).annotate(
                        num_data=Count('data')).filter(num_data=1)

                    # Copy collections.
                    for collection in parent_data.collection_set.all():
                        collection.data.add(d)

                        # Add entities to which data belongs to the collection.
                        for entity in entities:
                            entity.collections.add(collection)

    if process_rc == 0 and not self.process_failed:
        self.update_data_status(
            status=Data.STATUS_DONE,
            process_progress=100,
            finished=now()
        )
    else:
        self.update_data_status(
            status=Data.STATUS_ERROR,
            process_progress=100,
            process_rc=process_rc,
            finished=now()
        )

    try:
        # Cleanup after processor
        data_purge(data_ids=[data_id], delete=True, verbosity=verbosity)
    except:  # pylint: disable=bare-except
        logger.error(__("Purge error:\n\n{}", traceback.format_exc()))
def handle_finish(self, obj):
    """Handle an incoming ``Data`` finished processing request.

    :param obj: The Channels message object. Command object format:

        .. code-block:: none

            {
                'command': 'finish',
                'data_id': [id of the :class:`~resolwe.flow.models.Data`
                           object this command changes],
                'process_rc': [exit status of the processing]
                'spawn_processes': [optional; list of spawn dictionaries],
                'exported_files_mapper': [if spawn_processes present]
            }
    """
    data_id = obj[ExecutorProtocol.DATA_ID]
    logger.debug(
        __("Finishing Data with id {} (handle_finish).", data_id),
        extra={
            'data_id': data_id,
            'packet': obj
        }
    )

    with transaction.atomic():
        # Spawn any new jobs in the request.
        spawned = False
        if ExecutorProtocol.FINISH_SPAWN_PROCESSES in obj:
            if is_testing():
                # NOTE: This is a work-around for Django issue #10827
                # (https://code.djangoproject.com/ticket/10827), same as in
                # TestCaseHelpers._pre_setup(). Because the listener is running
                # independently, it must clear the cache on its own.
                ContentType.objects.clear_cache()

            spawned = True
            exported_files_mapper = obj[ExecutorProtocol.FINISH_EXPORTED_FILES]
            logger.debug(
                __("Spawning new Data objects for Data with id {} (handle_finish).", data_id),
                extra={'data_id': data_id}
            )

            try:
                # This transaction is needed because we're running
                # asynchronously with respect to the main Django code
                # here; the manager can get nudged from elsewhere.
                with transaction.atomic():
                    parent_data = Data.objects.get(pk=data_id)

                    # Spawn processes.
                    for d in obj[ExecutorProtocol.FINISH_SPAWN_PROCESSES]:
                        d['contributor'] = parent_data.contributor
                        d['process'] = Process.objects.filter(slug=d['process']).latest()

                        for field_schema, fields in iterate_fields(d.get('input', {}), d['process'].input_schema):
                            type_ = field_schema['type']
                            name = field_schema['name']
                            value = fields[name]

                            if type_ == 'basic:file:':
                                fields[name] = self.hydrate_spawned_files(
                                    exported_files_mapper, value, data_id
                                )
                            elif type_ == 'list:basic:file:':
                                fields[name] = [
                                    self.hydrate_spawned_files(exported_files_mapper, fn, data_id)
                                    for fn in value
                                ]

                        with transaction.atomic():
                            d = Data.objects.create(**d)
                            DataDependency.objects.create(
                                parent=parent_data,
                                child=d,
                                kind=DataDependency.KIND_SUBPROCESS,
                            )

                            # Copy permissions.
                            copy_permissions(parent_data, d)

                            # Entity is added to the collection only when it is
                            # created - when it only contains 1 Data object.
                            entities = Entity.objects.filter(data=d).annotate(
                                num_data=Count('data')).filter(num_data=1)

                            # Copy collections.
                            for collection in parent_data.collection_set.all():
                                collection.data.add(d)

                                # Add entities to which data belongs to the collection.
                                for entity in entities:
                                    entity.collections.add(collection)
            except Exception:  # pylint: disable=broad-except
                logger.error(
                    __(
                        "Error while preparing spawned Data objects of process '{}' (handle_finish):\n\n{}",
                        parent_data.process.slug,
                        traceback.format_exc()
                    ),
                    extra={'data_id': data_id}
                )

        # Data wrap up happens last, so that any triggered signals
        # already see the spawned children. What the children themselves
        # see is guaranteed by the transaction we're in.
        if ExecutorProtocol.FINISH_PROCESS_RC in obj:
            process_rc = obj[ExecutorProtocol.FINISH_PROCESS_RC]

            try:
                d = Data.objects.get(pk=data_id)
            except Data.DoesNotExist:
                logger.warning(
                    "Data object does not exist (handle_finish).",
                    extra={
                        'data_id': data_id,
                    }
                )
                async_to_sync(self._send_reply)(obj, {ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_ERROR})
                return

            if process_rc == 0 and not d.status == Data.STATUS_ERROR:
                changeset = {
                    'status': Data.STATUS_DONE,
                    'process_progress': 100,
                    'finished': now()
                }
            else:
                changeset = {
                    'status': Data.STATUS_ERROR,
                    'process_progress': 100,
                    'process_rc': process_rc,
                    'finished': now()
                }

            obj[ExecutorProtocol.UPDATE_CHANGESET] = changeset
            self.handle_update(obj, internal_call=True)

            if not getattr(settings, 'FLOW_MANAGER_KEEP_DATA', False):
                try:
                    # Clean up after process
                    data_purge(data_ids=[data_id], delete=True, verbosity=self._verbosity)
                except Exception:  # pylint: disable=broad-except
                    logger.error(
                        __("Purge error:\n\n{}", traceback.format_exc()),
                        extra={'data_id': data_id}
                    )

    # Notify the executor that we're done.
    async_to_sync(self._send_reply)(obj, {ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_OK})

    # Now nudge the main manager to perform final cleanup. This is
    # needed even if there was no spawn baggage, since the manager
    # may need to know when executors have finished, to keep count
    # of them and manage synchronization.
    async_to_sync(consumer.send_event)({
        WorkerProtocol.COMMAND: WorkerProtocol.FINISH,
        WorkerProtocol.DATA_ID: data_id,
        WorkerProtocol.FINISH_SPAWNED: spawned,
        WorkerProtocol.FINISH_COMMUNICATE_EXTRA: {
            'executor': getattr(settings, 'FLOW_EXECUTOR', {}).get('NAME', 'resolwe.flow.executors.local'),
        },
    })
def handle_finish(self, obj):
    """Handle an incoming ``Data`` finished processing request.

    :param obj: The Channels message object. Command object format:

        .. code-block:: none

            {
                'command': 'finish',
                'data_id': [id of the :class:`~resolwe.flow.models.Data`
                           object this command changes],
                'process_rc': [exit status of the processing]
                'spawn_processes': [optional; list of spawn dictionaries],
                'exported_files_mapper': [if spawn_processes present]
            }
    """
    data_id = obj[ExecutorProtocol.DATA_ID]
    logger.debug(
        __("Finishing Data with id {} (handle_finish).", data_id),
        extra={
            'data_id': data_id,
            'packet': obj
        }
    )

    spawning_failed = False
    with transaction.atomic():
        # Spawn any new jobs in the request.
        spawned = False
        if ExecutorProtocol.FINISH_SPAWN_PROCESSES in obj:
            if is_testing():
                # NOTE: This is a work-around for Django issue #10827
                # (https://code.djangoproject.com/ticket/10827), same as in
                # TestCaseHelpers._pre_setup(). Because the listener is running
                # independently, it must clear the cache on its own.
                ContentType.objects.clear_cache()

            spawned = True
            exported_files_mapper = obj[ExecutorProtocol.FINISH_EXPORTED_FILES]
            logger.debug(
                __("Spawning new Data objects for Data with id {} (handle_finish).", data_id),
                extra={'data_id': data_id}
            )

            try:
                # This transaction is needed because we're running
                # asynchronously with respect to the main Django code
                # here; the manager can get nudged from elsewhere.
                with transaction.atomic():
                    parent_data = Data.objects.get(pk=data_id)

                    # Spawn processes.
                    for d in obj[ExecutorProtocol.FINISH_SPAWN_PROCESSES]:
                        d['contributor'] = parent_data.contributor
                        d['process'] = Process.objects.filter(slug=d['process']).latest()
                        d['tags'] = parent_data.tags

                        for field_schema, fields in iterate_fields(d.get('input', {}), d['process'].input_schema):
                            type_ = field_schema['type']
                            name = field_schema['name']
                            value = fields[name]

                            if type_ == 'basic:file:':
                                fields[name] = self.hydrate_spawned_files(
                                    exported_files_mapper, value, data_id
                                )
                            elif type_ == 'list:basic:file:':
                                fields[name] = [
                                    self.hydrate_spawned_files(exported_files_mapper, fn, data_id)
                                    for fn in value
                                ]

                        with transaction.atomic():
                            d = Data.objects.create(**d)
                            DataDependency.objects.create(
                                parent=parent_data,
                                child=d,
                                kind=DataDependency.KIND_SUBPROCESS,
                            )

                            # Copy permissions.
                            copy_permissions(parent_data, d)

                            # Entity is added to the collection only when it is
                            # created - when it only contains 1 Data object.
                            entities = Entity.objects.filter(data=d).annotate(
                                num_data=Count('data')).filter(num_data=1)

                            # Copy collections.
                            for collection in parent_data.collection_set.all():
                                collection.data.add(d)

                                # Add entities to which data belongs to the collection.
                                for entity in entities:
                                    entity.collections.add(collection)
            except Exception:  # pylint: disable=broad-except
                logger.error(
                    __(
                        "Error while preparing spawned Data objects of process '{}' (handle_finish):\n\n{}",
                        parent_data.process.slug,
                        traceback.format_exc()
                    ),
                    extra={'data_id': data_id}
                )
                spawning_failed = True

        # Data wrap up happens last, so that any triggered signals
        # already see the spawned children. What the children themselves
        # see is guaranteed by the transaction we're in.
        if ExecutorProtocol.FINISH_PROCESS_RC in obj:
            process_rc = obj[ExecutorProtocol.FINISH_PROCESS_RC]

            try:
                d = Data.objects.get(pk=data_id)
            except Data.DoesNotExist:
                logger.warning(
                    "Data object does not exist (handle_finish).",
                    extra={
                        'data_id': data_id,
                    }
                )
                async_to_sync(self._send_reply)(obj, {ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_ERROR})
                return

            changeset = {
                'process_progress': 100,
                'finished': now(),
            }

            if spawning_failed:
                changeset['status'] = Data.STATUS_ERROR
                changeset['process_error'] = ["Error while preparing spawned Data objects"]
            elif process_rc == 0 and not d.status == Data.STATUS_ERROR:
                changeset['status'] = Data.STATUS_DONE
            else:
                changeset['status'] = Data.STATUS_ERROR
                changeset['process_rc'] = process_rc

            obj[ExecutorProtocol.UPDATE_CHANGESET] = changeset
            self.handle_update(obj, internal_call=True)

            if not getattr(settings, 'FLOW_MANAGER_KEEP_DATA', False):
                # Purge worker is not running in test runner, so we should skip triggering it.
                if not is_testing():
                    channel_layer = get_channel_layer()
                    try:
                        async_to_sync(channel_layer.send)(
                            CHANNEL_PURGE_WORKER,
                            {
                                'type': TYPE_PURGE_RUN,
                                'location_id': d.location.id,
                                'verbosity': self._verbosity,
                            }
                        )
                    except ChannelFull:
                        logger.warning(
                            "Cannot trigger purge because channel is full.",
                            extra={'data_id': data_id}
                        )

    # Notify the executor that we're done.
    async_to_sync(self._send_reply)(obj, {ExecutorProtocol.RESULT: ExecutorProtocol.RESULT_OK})

    # Now nudge the main manager to perform final cleanup. This is
    # needed even if there was no spawn baggage, since the manager
    # may need to know when executors have finished, to keep count
    # of them and manage synchronization.
    async_to_sync(consumer.send_event)({
        WorkerProtocol.COMMAND: WorkerProtocol.FINISH,
        WorkerProtocol.DATA_ID: data_id,
        WorkerProtocol.FINISH_SPAWNED: spawned,
        WorkerProtocol.FINISH_COMMUNICATE_EXTRA: {
            'executor': getattr(settings, 'FLOW_EXECUTOR', {}).get('NAME', 'resolwe.flow.executors.local'),
        },
    })