Example #1
def dimensions_and_resize(max_dim: int, vault: Vault, sub_path: str,
                          image_to_write: Bean) -> Optional[str]:
    try:
        im = PIL_Image.open(vault.path_to(sub_path))
    except DecompressionBombError:
        return "Image too large: %s" % sub_path
    image_to_write.width = im.size[0]
    image_to_write.height = im.size[1]
    # Generate a thumbnail if the image is too large
    if (im.size[0] > max_dim) or (im.size[1] > max_dim):
        im.thumbnail((max_dim, max_dim))
        if im.mode == 'P':
            # 'P' means 8-bit pixels, mapped to any other mode using a color palette,
            # from https://pillow.readthedocs.io/en/latest/handbook/concepts.html#modes
            # Tested using a PNG with palette
            im = im.convert("RGB")
        thumb_relative_path, thumb_full_path = vault.thumbnail_paths(
            image_to_write.imgid)
        im.save(thumb_full_path)
        image_to_write.thumb_file_name = thumb_relative_path
        image_to_write.thumb_width = im.size[0]
        image_to_write.thumb_height = im.size[1]
    else:
        # Close the PIL image; when resized, the close was done during im.save.
        # Otherwise file descriptors get exhausted on PyPy.
        im.close()
        # Bulk insert needs the fields present, even if empty
        image_to_write.thumb_file_name = None
        image_to_write.thumb_width = None
        image_to_write.thumb_height = None
    return None
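
A minimal usage sketch, assuming an existing Vault instance and treating Bean as a plain attribute bag (SimpleNamespace stands in for it here; the path and the 400-pixel limit are illustrative only):

import logging
from types import SimpleNamespace

logger = logging.getLogger(__name__)
# Hypothetical bean; the real Bean type comes from the surrounding codebase.
image_bean = SimpleNamespace(imgid=1234, width=None, height=None,
                             thumb_file_name=None, thumb_width=None,
                             thumb_height=None)
# 'vault' is assumed to be an existing Vault instance (see examples above).
err = dimensions_and_resize(400, vault, "0012/4567.png", image_bean)
if err is not None:
    logger.error(err)  # e.g. "Image too large: 0012/4567.png"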
Example #2
def dimensions_and_resize(max_dim: int, vault: Vault, sub_path: str, image_to_write: Bean) -> Optional[str]:
    """
        Get dimensions from the given image; return an error string in case of a problem.
        It is assumed that the image is valid, i.e. it did not throw an exception in validate() above.
    """
    im = PIL_Image.open(vault.path_to(sub_path))
    image_to_write.width = im.size[0]
    image_to_write.height = im.size[1]
    # Generate a thumbnail if the image is too large
    if (im.size[0] > max_dim) or (im.size[1] > max_dim):
        if im.mode == 'P' or im.mode[0] == 'I':
            # 'P' means 8-bit pixels, mapped to any other mode using a color palette,
            # from https://pillow.readthedocs.io/en/latest/handbook/concepts.html#modes
            # Tested using a PNG with palette
            im = im.convert("RGB")
        im.thumbnail((max_dim, max_dim))
        thumb_relative_path, thumb_full_path = vault.thumbnail_paths(image_to_write.imgid)
        im.save(thumb_full_path)
        image_to_write.thumb_file_name = thumb_relative_path
        image_to_write.thumb_width = im.size[0]
        image_to_write.thumb_height = im.size[1]
    else:
        # Close the PIL image; when resized, the close was done during im.save.
        # Otherwise file descriptors get exhausted on PyPy.
        im.close()
        # Bulk insert needs the fields present, even if empty
        image_to_write.thumb_file_name = None
        image_to_write.thumb_width = None
        image_to_write.thumb_height = None
    return None
Example #3
    def __init__(self, prj_id: int, req: SubsetReq):

        super().__init__(prj_id, req.task_id)
        # Load the destination project
        dest_prj = self.session.query(Project).get(req.dest_prj_id)
        assert dest_prj is not None
        self.dest_prj: Project = dest_prj
        self.req = req
        # Work vars
        self.to_clone: EnumeratedObjectSet = EnumeratedObjectSet(self.session, [])
        self.vault = Vault(join(self.link_src, 'vault'))
        self.first_query = True
Example #4
def config() -> EcoTaxaConfig:
    # Setup
    link.INI_DIR = HERE
    link.INI_FILE = TEST_CONFIG
    conf = EcoTaxaConfig()
    # Inject low values for coverage, even with the small test dataset
    DBWriter.SEQUENCE_CACHE_SIZE = 5
    TSVFile.REPORT_EVERY = 5
    # Empty Vault
    vault = Vault(join(link.read_link(), 'vault'))
    shutil.rmtree(vault.sub_path("0000"), ignore_errors=True)
    yield conf
    # Teardown
    conf.cleanup()
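
The yield followed by teardown reads like a pytest fixture; the decorator was likely stripped from the excerpt. A sketch of how a test might consume it, assuming @pytest.fixture was present on config() in the original module:

# Hypothetical test; relies on the fixture above being registered with pytest.
def test_with_clean_vault(config):
    # 'config' is the EcoTaxaConfig yielded before teardown (conf.cleanup) runs.
    assert isinstance(config, EcoTaxaConfig)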
Example #5
def __init__(self, prj_id: int, req: Union[ImportPrepReq, ImportRealReq, SimpleImportReq]):
    super().__init__(prj_id, req.task_id)
    # Received from parameters
    """ The project ID to import into """
    self.source_dir_or_zip: str = req.source_path
    """ The source file or directory """
    self.req = req
    # From legacy code, vault and temptask are in src directory
    self.vault = Vault(join(self.link_src, 'vault'))
Example #6
class ImageManagerService(Service):
    def __init__(self):
        super().__init__()
        self.vault = Vault(join(self.link_src, 'vault'))

    @staticmethod
    def compute_md5(fname):
        hash_md5 = hashlib.md5()
        with open(fname, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.digest()

    def do_digests(self, current_user_id: UserIDT,
                   prj_id: Optional[ProjectIDT], max_digests: int) -> str:
        """
            Pick some images without a checksum and compute it.
        """
        _user = RightsBO.user_has_role(self.session, current_user_id,
                                       Role.APP_ADMINISTRATOR)
        qry: Query = self.session.query(Image, ImageFile)
        if prj_id is not None:
            qry = qry.join(ObjectHeader).join(Acquisition).join(Sample).join(
                Project)
        qry = qry.outerjoin(ImageFile, Image.file_name == ImageFile.path)
        qry = qry.filter(ImageFile.path.is_(None))
        if prj_id is not None:
            qry = qry.filter(Project.projid == prj_id)
        qry = qry.limit(max_digests)
        cnt = 0
        for an_img, img_file in qry.all():
            cnt += 1
            if img_file is None:
                # No image_file line, add it
                img_file = ImageFile(path=an_img.file_name)
                self.session.add(img_file)
            img_file_path = self.vault.sub_path(an_img.file_name)
            try:
                md5 = self.compute_md5(img_file_path)
                img_file.digest = md5
                img_file.digest_type = '5'
                img_file.state = ImageFileStateEnum.OK.value
            except FileNotFoundError:
                img_file.state = ImageFileStateEnum.MISSING.value
            except Exception as e:
                logger.exception(e)
                img_file.state = ImageFileStateEnum.ERROR.value
        self.session.commit()
        return "Digest for %d images done." % cnt
Example #7
    def add_images(self, nb_files_to_add, start_progress: int,
                   end_progress: int):
        # Add image files, linked to the TSV content
        self.update_progress(start_progress, "Start Image export")
        progress_range = end_progress - start_progress
        logger.info("Appending to zip file %s" % self.out_file_name)
        produced_path = self.out_path / self.out_file_name
        zfile = zipfile.ZipFile(produced_path,
                                'a',
                                allowZip64=True,
                                compression=zipfile.ZIP_DEFLATED)

        nb_files_added = 0
        vault = Vault(join(self.link_src, 'vault'))
        temp_img_file = self.out_path / "images.csv"
        with open(temp_img_file, "r") as temp_images_csv_fd:
            for r in csv.DictReader(temp_images_csv_fd,
                                    delimiter='\t',
                                    quotechar='"',
                                    lineterminator='\n'):
                img_file_path = vault.path_to(r["src_path"])
                path_in_zip = r["dst_path"]
                try:
                    zfile.write(img_file_path, arcname=path_in_zip)
                except FileNotFoundError:
                    logger.error("Not found image: %s", img_file_path)
                    continue
                logger.info("Added file %s as %s", img_file_path, path_in_zip)
                nb_files_added += 1
                if nb_files_added % self.IMAGES_REPORT_EVERY == 0:
                    msg = "Added %d files" % nb_files_added
                    logger.info(msg)
                    progress = int(start_progress + progress_range /
                                   nb_files_to_add * nb_files_added)
                    self.update_progress(progress, msg)
            zfile.close()
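
The explicit zfile.close() only runs if the loop completes; a context manager would close the archive even when an exception escapes. A sketch of the same append logic under that idiom (the loop body is unchanged from the example above):

# Sketch: same appending behavior, with a guaranteed close on error.
with zipfile.ZipFile(produced_path, 'a', allowZip64=True,
                     compression=zipfile.ZIP_DEFLATED) as zfile:
    ...  # iterate over the images.csv rows and zfile.write() each image, as above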
Example #8
class SubsetServiceOnProject(JobServiceOnProjectBase):
    """
        A task doing the subset operation.
    """
    JOB_TYPE = "Subset"

    # Fetch this number of objects at a time, and write them, in a DB session
    CHUNK_SIZE = 100

    def __init__(self, prj_id: int, req: SubsetReq):

        super().__init__(prj_id)
        # Load the destination project
        dest_prj = self.session.query(Project).get(req.dest_prj_id)
        assert dest_prj is not None
        self.dest_prj: Project = dest_prj
        self.req = req
        # Work vars
        self.to_clone: EnumeratedObjectSet = EnumeratedObjectSet(
            self.session, [])
        self.vault = Vault(join(self.link_src, 'vault'))
        self.first_query = True

    def init_args(self, args: Dict) -> Dict:
        super().init_args(args)
        args["req"] = self.req.dict()
        return args

    @staticmethod
    def deser_args(json_args: Dict):
        json_args["req"] = SubsetReq(**json_args["req"])

    def run(self, current_user_id: int) -> SubsetRsp:
        """
            Initial run, basically just create the job.
        """
        # Security check
        RightsBO.user_wants(self.session, current_user_id, Action.READ,
                            self.prj_id)
        RightsBO.user_wants(self.session, current_user_id, Action.ADMINISTRATE,
                            self.dest_prj.projid)
        # OK, go background straight away
        self.create_job(self.JOB_TYPE, current_user_id)
        ret = SubsetRsp(job_id=self.job_id)
        return ret

    def do_background(self):
        """
            Background part of the job.
        """
        with LogsSwitcher(self):
            return self.do_run()

    def do_run(self) -> None:
        # OK
        logger.info("Starting subset of '%s'", self.prj.title)

        self.update_progress(5, "Determining objects to clone")
        self._find_what_to_clone()

        logger.info("Matched %s objects", len(self.to_clone))
        if len(self.to_clone) == 0:
            errors = ["No object found to clone into subset."]
            self.set_job_result(errors=errors, infos={"infos": ""})
            return

        self._do_clone()
        self.session.commit()

        # Recompute stats and so on
        ProjectBO.do_after_load(self.session, self.dest_prj.projid)
        self.session.commit()

        self.set_job_result(errors=[], infos={"rowcount": len(self.to_clone)})

    def _do_clone(self):
        """
            Cloning operation itself. Assumes that @see self.to_clone was populated before.
        """
        # Get the mappings in the source project, in order to determine the useful columns
        custom_mapping = ProjectMapping().load_from_project(self.prj)
        obj_mapping = custom_mapping.object_mappings
        used_columns = set(obj_mapping.real_cols_to_tsv.keys())
        used_columns.add("orig_id")  # For safety
        # Create a DB writer
        writer = DBWriter(self.session)
        # Narrow the writes in ObjectFields thanks to mappings of original project
        writer.generators({"obj_field": used_columns})
        # Use import helpers
        dest_prj_id = self.dest_prj.projid
        import_how = ImportHow(prj_id=dest_prj_id,
                               update_mode="No",
                               custom_mapping=ProjectMapping(),
                               skip_object_duplicates=False,
                               loaded_files=[])
        # Get parent (enclosing) Sample, Acquisition, Process. There should be 0 in this context...
        import_how.existing_parents = InBundle.fetch_existing_parents(
            self.session, prj_id=dest_prj_id)

        self._clone_all(import_how, writer)
        # Copy mappings to destination. We could narrow them to the minimum?
        custom_mapping.write_to_project(self.dest_prj)

    def _db_fetch(self, object_ids: ObjectIDListT) -> List[DBObjectTupleT]:
        """
            Do a DB read of given objects, with auxiliary objects.
            :param object_ids: The list of IDs
            :return:
        """
        # TODO: Depending on filter, the joins could be plain (not outer)
        # E.g. if asked for a set of samples
        ret: Query = self.ro_session.query(ObjectHeader)
        ret = ret.join(ObjectHeader.acquisition).join(
            Acquisition.process).join(Acquisition.sample)
        ret = ret.outerjoin(Image, ObjectHeader.all_images).outerjoin(
            ObjectCNNFeature).join(ObjectFields)
        ret = ret.filter(ObjectHeader.objid == any_(object_ids))
        ret = ret.order_by(ObjectHeader.objid, Image.imgid)
        ret = ret.with_entities(ObjectHeader, ObjectFields, ObjectCNNFeature,
                                Image, Sample, Acquisition, Process)

        if self.first_query:
            logger.info("Query: %s", str(ret))
            self.first_query = False

        return ret.all()

    def _clone_all(self, import_how, writer):

        # Bean counting init
        nb_objects = 0
        total_objects = len(self.to_clone)
        # Pick chunks of object ids
        for a_chunk in self.to_clone.get_objectid_chunks(self.CHUNK_SIZE):
            # Fetch them using SQLAlchemy
            db_tuples = self._db_fetch(a_chunk)
            # Send each 'line'
            for a_db_tuple in db_tuples:
                self._send_to_writer(import_how, writer, a_db_tuple)
            # Bean counting and reporting
            nb_objects += len(a_chunk)
            # Save
            writer.do_bulk_save()
            # Commit (it expires SQLAlchemy session-linked objects)
            self.session.commit()
            progress = int(90 * nb_objects / total_objects)
            self.update_progress(10 + progress, "Subset creation in progress")

    def _send_to_writer(self, import_how: ImportHow, writer: DBWriter,
                        db_tuple: DBObjectTupleT):
        """
            Send a single tuple from DB to DB
        :param import_how:
        :param writer:
        :param db_tuple:
        :return:
        """
        obj_orm, fields_orm, cnn_features_orm, image_orm, sample_orm, acquisition_orm, process_orm = db_tuple
        # Transform all to key-less beans so they can be absorbed by DBWriter
        obj, fields, cnn_features, image, sample, acquisition, process = \
            bean_of(obj_orm), bean_of(fields_orm), bean_of(cnn_features_orm), \
            bean_of(image_orm), bean_of(sample_orm), \
            bean_of(acquisition_orm), bean_of(process_orm)
        assert obj is not None and fields is not None
        # A few fields need adjustment
        obj.img0id = None
        # Drop images if asked to
        if not self.req.do_images:
            image = None
        # Write parent entities
        assert sample and acquisition and process
        dict_of_parents = {
            Sample.__tablename__: sample,
            Acquisition.__tablename__: acquisition,
            Process.__tablename__: process
        }
        TSVFile.add_parent_objects(import_how, self.session, obj,
                                   dict_of_parents)
        # Write object and children
        new_records = TSVFile.create_or_link_slaves(
            how=import_how,
            session=self.session,
            object_head_to_write=obj,
            object_fields_to_write=fields,
            image_to_write=image)
        writer.add_db_entities(obj, fields, image, new_records)
        # Keep track of existing objects
        if new_records > 1:
            # We now have an Id from sequences, so ref. it.
            import_how.existing_objects[obj.orig_id] = obj.objid
            if cnn_features is not None:
                writer.add_cnn_features(obj, cnn_features)
        # Do images
        if new_records > 0 and self.req.do_images and image and image.file_name is not None:
            # We have an image, with a new imgid but old paths have been copied
            old_imgpath = Path(self.vault.path_to(image.file_name))
            image.file_name = None  # Just in case, don't reference a non-existing file
            try:
                sub_path = self.vault.store_image(old_imgpath, image.imgid)
                image.file_name = sub_path
            except FileNotFoundError:
                pass
            # Proceed to thumbnail if any
            if image.thumb_file_name is not None:
                old_thumbnail_path = self.vault.path_to(image.thumb_file_name)
                thumb_relative_path, thumb_full_path = self.vault.thumbnail_paths(
                    image.imgid)
                image.thumb_file_name = None  # Just in case, don't reference a non-existing file
                try:
                    # TODO: Call a primitive in Vault instead
                    shutil.copyfile(old_thumbnail_path, thumb_full_path)
                    image.thumb_file_name = thumb_relative_path
                except FileNotFoundError:
                    pass

    def _find_what_to_clone(self):
        """
            Determine the objects to clone.
        """
        req = self.req
        # From required subsetting method...
        if req.limit_type == LimitMethods.constant:
            rank_function = 'rank'
        elif req.limit_type == LimitMethods.percent:
            rank_function = '100*percent_rank'
        else:
            rank_function = 'FunctionError'
        # And repartition key
        if req.group_type == GroupDefinitions.categories:
            part_key = "obh.classif_id"
        elif req.group_type == GroupDefinitions.samples:
            part_key = "sam.sampleid"
        elif req.group_type == GroupDefinitions.acquisitions:
            part_key = "acq.acquisid"
        else:
            part_key = "???"

        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(
            self.session, self.prj_id, self.req.filters)
        from_, where, params = object_set.get_sql(self._get_owner_id())

        # noinspection SqlResolve
        sql = """
            SELECT objid FROM (
                SELECT """ + rank_function + """() OVER (PARTITION BY """ + part_key + """ ORDER BY RANDOM()) rang,
                       obh.objid
                  FROM """ + from_.get_sql() + """
                """ + where.get_sql() + """ ) sr
            WHERE rang <= :ranklimit """
        params['ranklimit'] = self.req.limit_value

        logger.info("SQL=%s", sql)
        logger.info("SQLParam=%s", params)

        res: Result = self.ro_session.execute(sql, params)
        ids = [r for r, in res]
        logger.info("There are %d IDs", len(ids))

        self.to_clone = EnumeratedObjectSet(self.session, ids)
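
For illustration, with limit_type == constant and group_type == samples, the assembled statement would look roughly like the following; the FROM and WHERE parts come from the DescribedObjectSet filter and depend on the request:

SELECT objid FROM (
    SELECT rank() OVER (PARTITION BY sam.sampleid ORDER BY RANDOM()) rang,
           obh.objid
      FROM ...   -- from_.get_sql()
     ...         -- where.get_sql()
    ) sr
WHERE rang <= :ranklimit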
Example #9
def __init__(self):
    super().__init__()
    self.vault = Vault(join(self.link_src, 'vault'))
Example #10
def __init__(self, prj_id: int, req: Union[ImportReq, SimpleImportReq]):
    super().__init__(prj_id)
    """ The project ID to import into """
    self.req = req
    # From legacy code, vault and temptask are in src directory
    self.vault = Vault(join(self.link_src, 'vault'))
Example #11
class ImageManagerService(Service):
    def __init__(self):
        super().__init__()
        self.vault = Vault(join(self.link_src, 'vault'))

    @staticmethod
    def compute_md5(fname):
        hash_md5 = hashlib.md5()
        with open(fname, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                hash_md5.update(chunk)
        return hash_md5.digest()

    def do_digests(self, current_user_id: UserIDT,
                   prj_id: Optional[ProjectIDT], max_digests: int) -> str:
        """
            Pick some images without a checksum and compute it.
        """
        _user = RightsBO.user_has_role(self.ro_session, current_user_id,
                                       Role.APP_ADMINISTRATOR)
        qry: Query = self.ro_session.query(Image.file_name)
        if prj_id is not None:
            # Find missing images in a project
            qry = qry.join(ObjectHeader).join(Acquisition).join(Sample).join(
                Project)
            qry = qry.filter(Project.projid == prj_id)
        else:
            # Find missing images globally
            pass
        qry = qry.outerjoin(ImageFile, Image.file_name == ImageFile.path)
        qry = qry.filter(ImageFile.path.is_(None))
        qry = qry.limit(max_digests)
        cnt = 0
        with CodeTimer("Files without md5, query '%s':" % str(qry), logger):
            files_without_md5 = [file_name for file_name, in qry.all()]
        for an_img_file_name in files_without_md5:
            cnt += 1
            img_file = ImageFile(path=an_img_file_name)
            self.session.add(img_file)
            self._md5_on_record(img_file)
        self.session.commit()
        # We may still have budget left, under max_digests, to handle a few files in unknown state
        left_for_unknown = max_digests - cnt
        if left_for_unknown > 0:
            # Also do unknown image file lines
            miss_qry: Query = self.session.query(ImageFile)
            miss_qry = miss_qry.filter(
                and_(ImageFile.state == ImageFileStateEnum.UNKNOWN.value,
                     ImageFile.digest_type == '?'))
            if prj_id is not None:
                # Find unknown images in a project
                miss_qry = miss_qry.outerjoin(
                    Image, Image.file_name == ImageFile.path)
                miss_qry = miss_qry.join(ObjectHeader).join(Acquisition).join(
                    Sample).join(Project)
                miss_qry = miss_qry.filter(Project.projid == prj_id)
            # On purpose, no "order by" clause. Results are random, but sorting takes a while on lots of images
            miss_qry = miss_qry.limit(left_for_unknown)
            with CodeTimer(
                    "Files with unknown state, query '%s':" % str(miss_qry),
                    logger):
                missing_ones = [an_img_file for an_img_file in miss_qry.all()]
            for a_missing in missing_ones:
                cnt += 1
                self._md5_on_record(a_missing)
            self.session.commit()
        return "Digest for %d images done." % cnt

    def _md5_on_record(self, img_file: ImageFile):
        img_file_path = self.vault.sub_path(img_file.path)
        try:
            md5 = self.compute_md5(img_file_path)
            img_file.digest = md5
            img_file.digest_type = '5'
            img_file.state = ImageFileStateEnum.OK.value
        except FileNotFoundError:
            img_file.state = ImageFileStateEnum.MISSING.value
        except Exception as e:
            logger.exception(e)
            img_file.state = ImageFileStateEnum.ERROR.value

    def do_cleanup_dup_same_obj(self, current_user_id: UserIDT,
                                prj_id: ProjectIDT, max_deletes: int) -> str:
        """
            Simplest duplication pattern. Inside the same object there are several identical images.
        """
        _user = RightsBO.user_has_role(self.ro_session, current_user_id,
                                       Role.APP_ADMINISTRATOR)
        orig_img = aliased(Image, name="orig")
        orig_file = aliased(ImageFile, name="orig_file")
        qry: Query = self.session.query(orig_img.file_name, orig_img.imgid,
                                        Image,
                                        ImageFile)  # Select what to delete
        qry = qry.join(ObjectHeader, ObjectHeader.objid == Image.objid).join(
            Acquisition).join(Sample).join(Project)
        # We consider that the original image is the oldest one, so duplicates have a higher ID
        qry = qry.join(
            orig_img,
            and_(orig_img.objid == Image.objid,
                 orig_img.orig_file_name == Image.orig_file_name,
                 orig_img.width == Image.width,
                 orig_img.height == Image.height,
                 orig_img.imgid < Image.imgid))
        # Must have a checksum, with the same state (sane)
        qry = qry.join(
            ImageFile,
            and_(ImageFile.path == Image.file_name,
                 ImageFile.state == ImageFileStateEnum.OK.value))
        qry = qry.join(
            orig_file,
            and_(orig_file.path == orig_img.file_name,
                 orig_file.state == ImageFileStateEnum.OK.value))
        # and the same value of course
        qry = qry.filter(
            and_(ImageFile.digest_type == orig_file.digest_type,
                 ImageFile.digest == orig_file.digest))
        qry = qry.filter(Project.projid == prj_id)
        qry = qry.order_by(Image.objid, orig_img.imgid, Image.imgid)
        qry = qry.limit(max_deletes)
        with CodeTimer(
                "Dups same objs inside %d, query '%s':" % (prj_id, str(qry)),
                logger):
            to_do = [(orig_file_name, orig_img_id, an_image, an_image_file)
                     for orig_file_name, orig_img_id, an_image, an_image_file
                     in qry.all()]
        ko_not_same = 0
        ko_except = 0
        # Prepare & start a remover thread that will run in // with DB queries
        remover = VaultRemover(self.link_src, logger).do_start()
        filecmp.clear_cache()
        deleted_imgids: Set[int] = set()
        for orig_file_name, orig_img_id, an_image, an_image_file in to_do:
            # The query returns multiple rows if there are more than 2 duplicates
            if orig_img_id in deleted_imgids:
                continue
            # Even if MD5s match, be paranoid and compare files
            orig_path = self.vault.sub_path(orig_file_name)
            dup_path = self.vault.sub_path(an_image.file_name)
            assert orig_path != dup_path
            orig_exists = exists(orig_path)
            dup_exists = exists(dup_path)
            if orig_exists:
                if dup_exists:
                    try:
                        same = filecmp.cmp(orig_path, dup_path, False)
                    except Exception as exc:
                        logger.info(
                            "Exception while comparing orig:%s and dup:%s: %s",
                            orig_path, dup_path, str(exc))
                        ko_except += 1
                        continue
                    if not same:
                        ko_not_same += 1
                        continue
                else:
                    # Duplicate is gone already
                    pass
            else:
                # DB record of physical file is wrong
                # TODO
                continue
            # Do the cleanup
            deleted_imgids.add(an_image.imgid)
            if dup_exists:
                remover.add_files([an_image.file_name])
            self.session.delete(an_image)
            self.session.delete(an_image_file)
        # Wait until the queued file deletions are handled
        self.session.commit()
        remover.wait_for_done()
        return (
            "Dupl remover for %s dup images done but %d problems %d false file comp"
            % (len(deleted_imgids), ko_except, ko_not_same))
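
A note on the comparison primitive used above: filecmp.cmp with shallow=False (the positional False) compares file contents byte by byte rather than just os.stat() signatures, and filecmp memoizes its results, which is why clear_cache() is called before the loop. A minimal standalone sketch (paths are illustrative):

import filecmp

filecmp.clear_cache()  # drop memoized comparison results from earlier runs
# shallow=False forces a content comparison instead of an os.stat() check
same = filecmp.cmp("/vault/0000/1.png", "/vault/0000/2.png", shallow=False)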