def dimensions_and_resize(max_dim: int, vault: Vault, sub_path: str, image_to_write: Bean) -> Optional[str]:
    try:
        im = PIL_Image.open(vault.path_to(sub_path))
    except DecompressionBombError:
        return "Image too large: %s" % sub_path
    image_to_write.width = im.size[0]
    image_to_write.height = im.size[1]
    # Generate a thumbnail if image is too large
    if (im.size[0] > max_dim) or (im.size[1] > max_dim):
        im.thumbnail((max_dim, max_dim))
        if im.mode == 'P':
            # (8-bit pixels, mapped to any other mode using a color palette)
            # from https://pillow.readthedocs.io/en/latest/handbook/concepts.html#modes
            # Tested using a PNG with palette
            im = im.convert("RGB")
        thumb_relative_path, thumb_full_path = vault.thumbnail_paths(image_to_write.imgid)
        im.save(thumb_full_path)
        image_to_write.thumb_file_name = thumb_relative_path
        image_to_write.thumb_width = im.size[0]
        image_to_write.thumb_height = im.size[1]
    else:
        # Close the PIL image, when resized it was done during im.save
        # Otherwise there is a FD exhaustion on PyPy
        im.close()
        # Need empty fields for bulk insert
        image_to_write.thumb_file_name = None
        image_to_write.thumb_width = None
        image_to_write.thumb_height = None
    return None
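# Context sketch (not from the codebase) for the except clause above: Pillow raises
# DecompressionBombError from Image.open() when an image holds more than twice
# Image.MAX_IMAGE_PIXELS pixels (and only warns above the limit itself). The limit is
# lowered here just to trigger the error on a small, fabricated demo file.
import tempfile
from PIL import Image as PIL_Image

tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
tmp.close()
PIL_Image.new("RGB", (300, 300)).save(tmp.name)   # 90 000 pixels

PIL_Image.MAX_IMAGE_PIXELS = 10_000               # demo limit, far below Pillow's default
try:
    PIL_Image.open(tmp.name)
except PIL_Image.DecompressionBombError as e:
    print("Image too large: %s" % e)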
def dimensions_and_resize(max_dim: int, vault: Vault, sub_path: str, image_to_write: Bean) -> Optional[str]:
    """
        Get dimensions from given image, return a string with the error in case of issue.
        It is assumed that the image is valid, i.e. did not throw an exception in above validate()
    """
    im = PIL_Image.open(vault.path_to(sub_path))
    image_to_write.width = im.size[0]
    image_to_write.height = im.size[1]
    # Generate a thumbnail if image is too large
    if (im.size[0] > max_dim) or (im.size[1] > max_dim):
        if im.mode == 'P' or im.mode[0] == 'I':
            # (8-bit pixels, mapped to any other mode using a color palette)
            # from https://pillow.readthedocs.io/en/latest/handbook/concepts.html#modes
            # Tested using a PNG with palette
            im = im.convert("RGB")
        im.thumbnail((max_dim, max_dim))
        thumb_relative_path, thumb_full_path = vault.thumbnail_paths(image_to_write.imgid)
        im.save(thumb_full_path)
        image_to_write.thumb_file_name = thumb_relative_path
        image_to_write.thumb_width = im.size[0]
        image_to_write.thumb_height = im.size[1]
    else:
        # Close the PIL image, when resized it was done during im.save
        # Otherwise there is a FD exhaustion on PyPy
        im.close()
        # Need empty fields for bulk insert
        image_to_write.thumb_file_name = None
        image_to_write.thumb_width = None
        image_to_write.thumb_height = None
    return None
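# Self-contained Pillow sketch of the resize step above: thumbnail() shrinks in place while
# keeping the aspect ratio, and palette ('P') or integer ('I*') images are converted to RGB
# first, mirroring the mode check in the function above. Sizes and the 2-colour palette are
# purely illustrative.
from PIL import Image as PIL_Image

im = PIL_Image.new("P", (2000, 1500))        # stand-in for a palette PNG from the vault
im.putpalette([0, 0, 0, 255, 255, 255])      # minimal palette so convert() has data to map
if im.mode == 'P' or im.mode[0] == 'I':
    im = im.convert("RGB")
im.thumbnail((640, 640))                     # in-place, longest side capped at 640
print(im.mode, im.size)                      # -> RGB (640, 480)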
def __init__(self, prj_id: int, req: SubsetReq):
    super().__init__(prj_id, req.task_id)
    # Load the destination project
    dest_prj = self.session.query(Project).get(req.dest_prj_id)
    assert dest_prj is not None
    self.dest_prj: Project = dest_prj
    self.req = req
    # Work vars
    self.to_clone: EnumeratedObjectSet = EnumeratedObjectSet(self.session, [])
    self.vault = Vault(join(self.link_src, 'vault'))
    self.first_query = True
def config() -> EcoTaxaConfig:
    # Setup
    link.INI_DIR = HERE
    link.INI_FILE = TEST_CONFIG
    conf = EcoTaxaConfig()
    # Inject low values for coverage, even with the small test dataset
    DBWriter.SEQUENCE_CACHE_SIZE = 5
    TSVFile.REPORT_EVERY = 5
    # Empty Vault
    vault = Vault(join(link.read_link(), 'vault'))
    shutil.rmtree(vault.sub_path("0000"), ignore_errors=True)
    yield conf
    # Teardown
    conf.cleanup()
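# Hedged sketch (assumption): the generator above follows the pytest setup/yield/teardown
# pattern, so a test module would typically register and consume it as a fixture like this.
# The fixture scope and the test body are illustrative, not taken from the test suite.
import pytest

config = pytest.fixture(scope="module")(config)   # same as decorating config() with @pytest.fixture

def test_uses_config(config):
    assert config is not None                     # placeholder check on the yielded EcoTaxaConfig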
def __init__(self, prj_id: int, req: Union[ImportPrepReq, ImportRealReq, SimpleImportReq]):
    super().__init__(prj_id, req.task_id)
    # Received from parameters
    """ The project ID to import into """
    self.source_dir_or_zip: str = req.source_path
    """ The source file or directory """
    self.req = req
    # From legacy code, vault and temptask are in src directory
    self.vault = Vault(join(self.link_src, 'vault'))
class ImageManagerService(Service):
    def __init__(self):
        super().__init__()
        self.vault = Vault(join(self.link_src, 'vault'))

    @staticmethod
    def compute_md5(fname):
        hash_md5 = hashlib.md5()
        with open(fname, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.digest()

    def do_digests(self, current_user_id: UserIDT, prj_id: Optional[ProjectIDT], max_digests: int) -> str:
        """
            Pick some images without checksum and compute it.
        """
        _user = RightsBO.user_has_role(self.session, current_user_id, Role.APP_ADMINISTRATOR)
        qry: Query = self.session.query(Image, ImageFile)
        if prj_id is not None:
            qry = qry.join(ObjectHeader).join(Acquisition).join(Sample).join(Project)
        qry = qry.outerjoin(ImageFile, Image.file_name == ImageFile.path)
        qry = qry.filter(ImageFile.path.is_(None))
        if prj_id is not None:
            qry = qry.filter(Project.projid == prj_id)
        qry = qry.limit(max_digests)
        cnt = 0
        for an_img, img_file in qry.all():
            cnt += 1
            if img_file is None:
                # No image_file line, add it
                img_file = ImageFile(path=an_img.file_name)
                self.session.add(img_file)
            img_file_path = self.vault.sub_path(an_img.file_name)
            try:
                md5 = self.compute_md5(img_file_path)
                img_file.digest = md5
                img_file.digest_type = '5'
                img_file.state = ImageFileStateEnum.OK.value
            except FileNotFoundError:
                img_file.state = ImageFileStateEnum.MISSING.value
            except Exception as e:
                logger.exception(e)
                img_file.state = ImageFileStateEnum.ERROR.value
        self.session.commit()
        return "Digest for %d images done." % cnt
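# Self-contained sketch of the chunked MD5 computation used by compute_md5() above, run
# against a throwaway temporary file; the 4096-byte chunk size matches the service code.
import hashlib
import tempfile

with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
    tmp.write(b"not really an image, just bytes to hash")
    tmp_name = tmp.name

hash_md5 = hashlib.md5()
with open(tmp_name, "rb") as f:
    for chunk in iter(lambda: f.read(4096), b""):
        hash_md5.update(chunk)
print(hash_md5.hexdigest())   # the service stores the raw digest() bytes, not the hex form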
def add_images(self, nb_files_to_add, start_progress: int, end_progress: int):
    # Add image files, linked to the TSV content
    self.update_progress(start_progress, "Start Image export")
    progress_range = end_progress - start_progress
    logger.info("Appending to zip file %s" % self.out_file_name)
    produced_path = self.out_path / self.out_file_name
    zfile = zipfile.ZipFile(produced_path, 'a', allowZip64=True, compression=zipfile.ZIP_DEFLATED)
    nb_files_added = 0
    vault = Vault(join(self.link_src, 'vault'))
    temp_img_file = self.out_path / "images.csv"
    with open(temp_img_file, "r") as temp_images_csv_fd:
        for r in csv.DictReader(temp_images_csv_fd, delimiter='\t', quotechar='"', lineterminator='\n'):
            img_file_path = vault.path_to(r["src_path"])
            path_in_zip = r["dst_path"]
            try:
                zfile.write(img_file_path, arcname=path_in_zip)
            except FileNotFoundError:
                logger.error("Not found image: %s", img_file_path)
                continue
            logger.info("Added file %s as %s", img_file_path, path_in_zip)
            nb_files_added += 1
            if nb_files_added % self.IMAGES_REPORT_EVERY == 0:
                msg = "Added %d files" % nb_files_added
                logger.info(msg)
                progress = int(start_progress + progress_range / nb_files_to_add * nb_files_added)
                self.update_progress(progress, msg)
    zfile.close()
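# Minimal stdlib sketch of the zip-append pattern used in add_images(): open the archive in
# 'a' mode with ZIP64 enabled and store each file under the destination path from the CSV.
# The archive name and both paths are fabricated for illustration.
import zipfile

with zipfile.ZipFile("export.zip", 'a', allowZip64=True, compression=zipfile.ZIP_DEFLATED) as zfile:
    try:
        zfile.write("/vault/0000/1234.png", arcname="images/obj1234.png")
    except FileNotFoundError:
        print("Not found image: /vault/0000/1234.png")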
class SubsetServiceOnProject(JobServiceOnProjectBase):
    """
        A task doing the subset operation.
    """
    JOB_TYPE = "Subset"

    # Fetch this number of objects at a time, and write them, in a DB session
    CHUNK_SIZE = 100

    def __init__(self, prj_id: int, req: SubsetReq):
        super().__init__(prj_id)
        # Load the destination project
        dest_prj = self.session.query(Project).get(req.dest_prj_id)
        assert dest_prj is not None
        self.dest_prj: Project = dest_prj
        self.req = req
        # Work vars
        self.to_clone: EnumeratedObjectSet = EnumeratedObjectSet(self.session, [])
        self.vault = Vault(join(self.link_src, 'vault'))
        self.first_query = True

    def init_args(self, args: Dict) -> Dict:
        super().init_args(args)
        args["req"] = self.req.dict()
        return args

    @staticmethod
    def deser_args(json_args: Dict):
        json_args["req"] = SubsetReq(**json_args["req"])

    def run(self, current_user_id: int) -> SubsetRsp:
        """
            Initial run, basically just create the job.
        """
        # Security check
        RightsBO.user_wants(self.session, current_user_id, Action.READ, self.prj_id)
        RightsBO.user_wants(self.session, current_user_id, Action.ADMINISTRATE, self.dest_prj.projid)
        # OK, go background straight away
        self.create_job(self.JOB_TYPE, current_user_id)
        ret = SubsetRsp(job_id=self.job_id)
        return ret

    def do_background(self):
        """
            Background part of the job.
        """
        with LogsSwitcher(self):
            return self.do_run()

    def do_run(self) -> None:
        # OK
        logger.info("Starting subset of '%s'", self.prj.title)
        self.update_progress(5, "Determining objects to clone")
        self._find_what_to_clone()
        logger.info("Matched %s objects", len(self.to_clone))
        if len(self.to_clone) == 0:
            errors = ["No object found to clone into subset."]
            self.set_job_result(errors=errors, infos={"infos": ""})
            return
        self._do_clone()
        self.session.commit()
        # Recompute stats and so on
        ProjectBO.do_after_load(self.session, self.dest_prj.projid)
        self.session.commit()
        self.set_job_result(errors=[], infos={"rowcount": len(self.to_clone)})

    def _do_clone(self):
        """
            Cloning operation itself. Assumes that @see self.to_clone was populated before.
        """
        # Get the mappings in source project, in order to determine the useful columns
        custom_mapping = ProjectMapping().load_from_project(self.prj)
        obj_mapping = custom_mapping.object_mappings
        used_columns = set(obj_mapping.real_cols_to_tsv.keys())
        used_columns.add("orig_id")  # By safety
        # Create a DB writer
        writer = DBWriter(self.session)
        # Narrow the writes in ObjectFields thanks to mappings of original project
        writer.generators({"obj_field": used_columns})
        # Use import helpers
        dest_prj_id = self.dest_prj.projid
        import_how = ImportHow(prj_id=dest_prj_id, update_mode="No",
                               custom_mapping=ProjectMapping(),
                               skip_object_duplicates=False, loaded_files=[])
        # Get parent (enclosing) Sample, Acquisition, Process. There should be 0 in this context...
        import_how.existing_parents = InBundle.fetch_existing_parents(self.session, prj_id=dest_prj_id)
        self._clone_all(import_how, writer)
        # Copy mappings to destination. We could narrow them to the minimum?
        custom_mapping.write_to_project(self.dest_prj)

    def _db_fetch(self, object_ids: ObjectIDListT) -> List[DBObjectTupleT]:
        """
            Do a DB read of given objects, with auxiliary objects.
            :param object_ids: The list of IDs
            :return:
        """
        # TODO: Depending on filter, the joins could be plain (not outer)
        # E.g. if asked for a set of samples
        ret: Query = self.ro_session.query(ObjectHeader)
        ret = ret.join(ObjectHeader.acquisition).join(Acquisition.process).join(Acquisition.sample)
        ret = ret.outerjoin(Image, ObjectHeader.all_images).outerjoin(ObjectCNNFeature).join(ObjectFields)
        ret = ret.filter(ObjectHeader.objid == any_(object_ids))
        ret = ret.order_by(ObjectHeader.objid, Image.imgid)
        ret = ret.with_entities(ObjectHeader, ObjectFields, ObjectCNNFeature, Image, Sample, Acquisition, Process)
        if self.first_query:
            logger.info("Query: %s", str(ret))
            self.first_query = False
        return ret.all()

    def _clone_all(self, import_how, writer):
        # Bean counting init
        nb_objects = 0
        total_objects = len(self.to_clone)
        # Pick chunks of object ids
        for a_chunk in self.to_clone.get_objectid_chunks(self.CHUNK_SIZE):
            # Fetch them using SQLAlchemy
            db_tuples = self._db_fetch(a_chunk)
            # Send each 'line'
            for a_db_tuple in db_tuples:
                self._send_to_writer(import_how, writer, a_db_tuple)
            # Bean counting and reporting
            nb_objects += len(a_chunk)
            # Save
            writer.do_bulk_save()
            # Commit (it expires SQLAlchemy session-linked objects)
            self.session.commit()
            progress = int(90 * nb_objects / total_objects)
            self.update_progress(10 + progress, "Subset creation in progress")

    def _send_to_writer(self, import_how: ImportHow, writer: DBWriter, db_tuple: DBObjectTupleT):
        """
            Send a single tuple from DB to DB
            :param import_how:
            :param writer:
            :param db_tuple:
            :return:
        """
        obj_orm, fields_orm, cnn_features_orm, image_orm, sample_orm, acquisition_orm, process_orm = db_tuple
        # Transform all to key-less beans so they can be absorbed by DBWriter
        obj, fields, cnn_features, image, sample, acquisition, process = \
            bean_of(obj_orm), bean_of(fields_orm), bean_of(cnn_features_orm), \
            bean_of(image_orm), bean_of(sample_orm), \
            bean_of(acquisition_orm), bean_of(process_orm)
        assert obj is not None and fields is not None
        # A few fields need adjustment
        obj.img0id = None
        # Cut images if asked so
        if not self.req.do_images:
            image = None
        # Write parent entities
        assert sample and acquisition and process
        dict_of_parents = {Sample.__tablename__: sample,
                           Acquisition.__tablename__: acquisition,
                           Process.__tablename__: process}
        TSVFile.add_parent_objects(import_how, self.session, obj, dict_of_parents)
        # Write object and children
        new_records = TSVFile.create_or_link_slaves(how=import_how,
                                                    session=self.session,
                                                    object_head_to_write=obj,
                                                    object_fields_to_write=fields,
                                                    image_to_write=image)
        writer.add_db_entities(obj, fields, image, new_records)
        # Keep track of existing objects
        if new_records > 1:
            # We now have an Id from sequences, so ref. it.
            import_how.existing_objects[obj.orig_id] = obj.objid
        if cnn_features is not None:
            writer.add_cnn_features(obj, cnn_features)
        # Do images
        if new_records > 0 and self.req.do_images and image and image.file_name is not None:
            # We have an image, with a new imgid but old paths have been copied
            old_imgpath = Path(self.vault.path_to(image.file_name))
            image.file_name = None  # In case, don't reference a non-existing file
            try:
                sub_path = self.vault.store_image(old_imgpath, image.imgid)
                image.file_name = sub_path
            except FileNotFoundError:
                pass
            # Proceed to thumbnail if any
            if image.thumb_file_name is not None:
                old_thumbnail_path = self.vault.path_to(image.thumb_file_name)
                thumb_relative_path, thumb_full_path = self.vault.thumbnail_paths(image.imgid)
                image.thumb_file_name = None  # In case, don't reference a non-existing file
                try:
                    # TODO: Call a primitive in Vault instead
                    shutil.copyfile(old_thumbnail_path, thumb_full_path)
                    image.thumb_file_name = thumb_relative_path
                except FileNotFoundError:
                    pass

    def _find_what_to_clone(self):
        """
            Determine the objects to clone.
        """
        req = self.req
        # From required subsetting method...
        if req.limit_type == LimitMethods.constant:
            rank_function = 'rank'
        elif req.limit_type == LimitMethods.percent:
            rank_function = '100*percent_rank'
        else:
            rank_function = 'FunctionError'
        # And repartition key
        if req.group_type == GroupDefinitions.categories:
            part_key = "obh.classif_id"
        elif req.group_type == GroupDefinitions.samples:
            part_key = "sam.sampleid"
        elif req.group_type == GroupDefinitions.acquisitions:
            part_key = "acq.acquisid"
        else:
            part_key = "???"
        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(self.session, self.prj_id, self.req.filters)
        from_, where, params = object_set.get_sql(self._get_owner_id())
        # noinspection SqlResolve
        sql = """
            SELECT objid FROM (
                SELECT """ + rank_function + """() OVER (PARTITION BY """ + part_key + """ ORDER BY RANDOM()) rang,
                       obh.objid
                  FROM """ + from_.get_sql() + """
                """ + where.get_sql() + """
                ) sr
            WHERE rang <= :ranklimit """
        params['ranklimit'] = self.req.limit_value
        logger.info("SQL=%s", sql)
        logger.info("SQLParam=%s", params)
        res: Result = self.ro_session.execute(sql, params)
        ids = [r for r, in res]
        logger.info("There are %d IDs", len(ids))
        self.to_clone = EnumeratedObjectSet(self.session, ids)
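# Hedged usage sketch (not from the codebase): driving the subset job defined above. The
# request values, user id and project ids are placeholders; the field names shown are the
# ones the code reads (dest_prj_id, group_type, limit_type, limit_value, do_images, filters),
# and the request model may require more fields than these.
req = SubsetReq(dest_prj_id=124,
                group_type=GroupDefinitions.samples,   # PARTITION BY sam.sampleid
                limit_type=LimitMethods.percent,       # 100*percent_rank() per partition
                limit_value=10,                        # keep roughly 10% of each sample
                do_images=True,
                filters={})                            # empty filter set, i.e. whole project
rsp = SubsetServiceOnProject(prj_id=123, req=req).run(current_user_id=1)
print(rsp.job_id)                                      # the background part does the actual cloning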
def __init__(self):
    super().__init__()
    self.vault = Vault(join(self.link_src, 'vault'))
def __init__(self, prj_id: int, req: Union[ImportReq, SimpleImportReq]):
    super().__init__(prj_id)
    """ The project ID to import into """
    self.req = req
    # From legacy code, vault and temptask are in src directory
    self.vault = Vault(join(self.link_src, 'vault'))
class ImageManagerService(Service):
    def __init__(self):
        super().__init__()
        self.vault = Vault(join(self.link_src, 'vault'))

    @staticmethod
    def compute_md5(fname):
        hash_md5 = hashlib.md5()
        with open(fname, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.digest()

    def do_digests(self, current_user_id: UserIDT, prj_id: Optional[ProjectIDT], max_digests: int) -> str:
        """
            Pick some images without checksum and compute it.
        """
        _user = RightsBO.user_has_role(self.ro_session, current_user_id, Role.APP_ADMINISTRATOR)
        qry: Query = self.ro_session.query(Image.file_name)
        if prj_id is not None:
            # Find missing images in a project
            qry = qry.join(ObjectHeader).join(Acquisition).join(Sample).join(Project)
            qry = qry.filter(Project.projid == prj_id)
        else:
            # Find missing images globally
            pass
        qry = qry.outerjoin(ImageFile, Image.file_name == ImageFile.path)
        qry = qry.filter(ImageFile.path.is_(None))
        qry = qry.limit(max_digests)
        cnt = 0
        with CodeTimer("Files without md5, query '%s':" % str(qry), logger):
            files_without_md5 = [file_name for file_name, in qry.all()]
        for an_img_file_name in files_without_md5:
            cnt += 1
            img_file = ImageFile(path=an_img_file_name)
            self.session.add(img_file)
            self._md5_on_record(img_file)
        self.session.commit()
        # Eventually we can still satisfy the constraint while doing a few missing md5s
        left_for_unknown = max_digests - cnt
        if left_for_unknown > 0:
            # Also do unknown image file lines
            miss_qry: Query = self.session.query(ImageFile)
            miss_qry = miss_qry.filter(and_(ImageFile.state == ImageFileStateEnum.UNKNOWN.value,
                                            ImageFile.digest_type == '?'))
            if prj_id is not None:
                # Find unknown images in a project
                miss_qry = miss_qry.outerjoin(Image, Image.file_name == ImageFile.path)
                miss_qry = miss_qry.join(ObjectHeader).join(Acquisition).join(Sample).join(Project)
                miss_qry = miss_qry.filter(Project.projid == prj_id)
            # On purpose, no "order by" clause. Results are random, but sorting takes a while on lots of images
            miss_qry = miss_qry.limit(left_for_unknown)
            with CodeTimer("Files with unknown state, query '%s':" % str(miss_qry), logger):
                missing_ones = [an_img_file for an_img_file in miss_qry.all()]
            for a_missing in missing_ones:
                cnt += 1
                self._md5_on_record(a_missing)
            self.session.commit()
        return "Digest for %d images done." % cnt

    def _md5_on_record(self, img_file: ImageFile):
        img_file_path = self.vault.sub_path(img_file.path)
        try:
            md5 = self.compute_md5(img_file_path)
            img_file.digest = md5
            img_file.digest_type = '5'
            img_file.state = ImageFileStateEnum.OK.value
        except FileNotFoundError:
            img_file.state = ImageFileStateEnum.MISSING.value
        except Exception as e:
            logger.exception(e)
            img_file.state = ImageFileStateEnum.ERROR.value

    def do_cleanup_dup_same_obj(self, current_user_id: UserIDT, prj_id: ProjectIDT, max_deletes: int) -> str:
        """
            Simplest duplication pattern. Inside the same object there are several identical images.
        """
        _user = RightsBO.user_has_role(self.ro_session, current_user_id, Role.APP_ADMINISTRATOR)
        orig_img = aliased(Image, name="orig")
        orig_file = aliased(ImageFile, name="orig_file")
        qry: Query = self.session.query(orig_img.file_name, orig_img.imgid, Image, ImageFile)
        # Select what to delete
        qry = qry.join(ObjectHeader, ObjectHeader.objid == Image.objid).join(Acquisition).join(Sample).join(Project)
        # We consider that original image is the oldest one, so others have a superior ID
        qry = qry.join(orig_img,
                       and_(orig_img.objid == Image.objid,
                            orig_img.orig_file_name == Image.orig_file_name,
                            orig_img.width == Image.width,
                            orig_img.height == Image.height,
                            orig_img.imgid < Image.imgid))
        # Must have a checksum, with the same state (sane)
        qry = qry.join(ImageFile, and_(ImageFile.path == Image.file_name,
                                       ImageFile.state == ImageFileStateEnum.OK.value))
        qry = qry.join(orig_file, and_(orig_file.path == orig_img.file_name,
                                       orig_file.state == ImageFileStateEnum.OK.value))
        # and the same value of course
        qry = qry.filter(and_(ImageFile.digest_type == orig_file.digest_type,
                              ImageFile.digest == orig_file.digest))
        qry = qry.filter(Project.projid == prj_id)
        qry = qry.order_by(Image.objid, orig_img.imgid, Image.imgid)
        qry = qry.limit(max_deletes)
        with CodeTimer("Dups same objs inside %d, query '%s':" % (prj_id, str(qry)), logger):
            to_do = [(orig_file_name, orig_img_id, an_image, an_image_file)
                     for orig_file_name, orig_img_id, an_image, an_image_file in qry.all()]
        ko_not_same = 0
        ko_except = 0
        # Prepare & start a remover thread that will run in // with DB queries
        remover = VaultRemover(self.link_src, logger).do_start()
        filecmp.clear_cache()
        deleted_imgids: Set[int] = set()
        for orig_file_name, orig_img_id, an_image, an_image_file in to_do:
            # The query returns multiple rows if there are more than 2 duplicates
            if orig_img_id in deleted_imgids:
                continue
            # Even if MD5s match, be paranoid and compare files
            orig_path = self.vault.sub_path(orig_file_name)
            dup_path = self.vault.sub_path(an_image.file_name)
            assert orig_path != dup_path
            orig_exists = exists(orig_path)
            dup_exists = exists(dup_path)
            if orig_exists:
                if dup_exists:
                    try:
                        same = filecmp.cmp(orig_path, dup_path, False)
                    except Exception as exc:
                        logger.info("Exception while comparing orig:%s and dup:%s: %s",
                                    orig_path, dup_path, str(exc))
                        ko_except += 1
                        continue
                    if not same:
                        ko_not_same += 1
                        continue
                else:
                    # Duplicate is gone already
                    pass
            else:
                # DB record of physical file is wrong
                # TODO
                continue
            # Do the cleanup
            deleted_imgids.add(an_image.imgid)
            if dup_exists:
                remover.add_files([an_image.file_name])
            self.session.delete(an_image)
            self.session.delete(an_image_file)
        # Wait for the files handled
        self.session.commit()
        remover.wait_for_done()
        return ("Dupl remover for %s dup images done but %d problems %d false file comp"
                % (len(deleted_imgids), ko_except, ko_not_same))