Example No. 1
    def _delete_chunk(session: Session,
                      a_chunk: ObjectIDListT) -> Tuple[int, int, List[str]]:
        """
            Delete a chunk from self's object list.
            Technical Note: We use SQLA Core as we don't want to fetch the rows
        """
        # Start with images, which are not deleted via a CASCADE on the DB side.
        # This is probably due to the relationship cycle between ObjectHeader and Image; see the comment in the Image class.
        img_del_qry: Delete = Image.__table__.delete()
        img_del_qry = img_del_qry.where(Image.objid == any_(a_chunk))
        img_del_qry = img_del_qry.returning(Image.file_name,
                                            Image.thumb_file_name)
        with CodeTimer("DELETE for %d images: " % len(a_chunk), logger):
            files_res = session.execute(img_del_qry)
            img_files = []
            nb_img_rows = 0
            for a_file_tuple in files_res:
                # We have the main file and, optionally, the thumbnail
                for a_file in a_file_tuple:
                    if a_file:
                        img_files.append(a_file)
                nb_img_rows += 1
            logger.info("Removed: %d rows, to remove: %d files", nb_img_rows,
                        len(img_files))

        obj_del_qry: Delete = ObjectHeader.__table__.delete()
        obj_del_qry = obj_del_qry.where(ObjectHeader.objid == any_(a_chunk))
        with CodeTimer("DELETE for %d objs: " % len(a_chunk), logger):
            nb_objs = session.execute(obj_del_qry).rowcount

        session.commit()
        # TODO: Cache delete
        return nb_objs, nb_img_rows, img_files
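
The technical note in the docstring is the point of this snippet: a Core-level delete() combined with returning() removes the rows and hands back the image file names in a single statement, without loading ORM objects. For contrast, here is a minimal ORM-style sketch (the names are reused from the example above, it is not part of it); it needs two statements because the classic Query.delete() cannot return the deleted columns:

    # Sketch only: an ORM-level alternative to the Core delete above.
    def _delete_chunk_orm(session, a_chunk):
        # First fetch the file names, as a bulk ORM delete cannot RETURNING them
        rows = (session.query(Image.file_name, Image.thumb_file_name)
                .filter(Image.objid.in_(a_chunk))
                .all())
        img_files = [a_file for a_row in rows for a_file in a_row if a_file]
        # Then bulk-delete without loading the Image objects into the session
        nb_img_rows = (session.query(Image)
                       .filter(Image.objid.in_(a_chunk))
                       .delete(synchronize_session=False))
        return nb_img_rows, img_files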
Example No. 2
 def __init__(self, session: Session, prj_ids: ProjectIDListT, public: bool = False):
     # Query the projects and load the related entities (privileges, users) as well
     qry: Query = session.query(Project, ProjectPrivilege)
     qry = qry.outerjoin(ProjectPrivilege, Project.privs_for_members).options(
         contains_eager(Project.privs_for_members))
     qry = qry.outerjoin(User, ProjectPrivilege.user).options(
         contains_eager(ProjectPrivilege.user))
     qry = qry.filter(Project.projid == any_(prj_ids))
     self.projects = []
     done = set()
     with CodeTimer("%s BO projects query & init:" % len(prj_ids), logger):
         for a_proj, a_pp in qry.all():
             # The query yields duplicates so we need to filter
             if a_proj.projid not in done:
                 if public:
                     self.projects.append(ProjectBO(a_proj))
                 else:
                     self.projects.append(ProjectBO(a_proj).enrich())
                 done.add(a_proj.projid)
     # Add instruments
     with CodeTimer("%s set instruments:" % len(prj_ids), logger):
         instruments = DescribedInstrumentSet(session, prj_ids)
         for a_project in self.projects:
             instrums = instruments.by_project.get(a_project.projid)
             if instrums is not None:
                 a_project.instrument = ",".join(instrums)
Example No. 3
 def do_digests(self, current_user_id: UserIDT,
                prj_id: Optional[ProjectIDT], max_digests: int) -> str:
     """
         Pick some images without checksum and compute it.
     """
     _user = RightsBO.user_has_role(self.ro_session, current_user_id,
                                    Role.APP_ADMINISTRATOR)
     qry: Query = self.ro_session.query(Image.file_name)
     if prj_id is not None:
         # Find missing images in a project
         qry = qry.join(ObjectHeader).join(Acquisition).join(Sample).join(
             Project)
         qry = qry.filter(Project.projid == prj_id)
     else:
         # Find missing images globally
         pass
     qry = qry.outerjoin(ImageFile, Image.file_name == ImageFile.path)
     qry = qry.filter(ImageFile.path.is_(None))
     qry = qry.limit(max_digests)
     cnt = 0
     with CodeTimer("Files without md5, query '%s':" % str(qry), logger):
         files_without_md5 = [file_name for file_name, in qry.all()]
     for an_img_file_name in files_without_md5:
         cnt += 1
         img_file = ImageFile(path=an_img_file_name)
         self.session.add(img_file)
         self._md5_on_record(img_file)
     self.session.commit()
      # There may still be budget left under max_digests after computing the few missing md5s
     left_for_unknown = max_digests - cnt
     if left_for_unknown > 0:
         # Also do unknown image file lines
         miss_qry: Query = self.session.query(ImageFile)
         miss_qry = miss_qry.filter(
             and_(ImageFile.state == ImageFileStateEnum.UNKNOWN.value,
                  ImageFile.digest_type == '?'))
         if prj_id is not None:
             # Find unknown images in a project
             miss_qry = miss_qry.outerjoin(
                 Image, Image.file_name == ImageFile.path)
             miss_qry = miss_qry.join(ObjectHeader).join(Acquisition).join(
                 Sample).join(Project)
             miss_qry = miss_qry.filter(Project.projid == prj_id)
         # On purpose, no "order by" clause. Results are random, but sorting takes a while on lots of images
         miss_qry = miss_qry.limit(left_for_unknown)
         with CodeTimer(
                 "Files with unknown state, query '%s':" % str(miss_qry),
                 logger):
             missing_ones = [an_img_file for an_img_file in miss_qry.all()]
         for a_missing in missing_ones:
             cnt += 1
             self._md5_on_record(a_missing)
         self.session.commit()
     return "Digest for %d images done." % cnt
Example No. 4
 def do_intra_step_1(self, loaded_files):
      # The mapping to custom columns, either empty or from previous import operations on the same project.
     custom_mapping = ProjectMapping().load_from_project(self.prj)
     # Source bundle construction
     source_bundle = InBundle(
         self.source_dir_or_zip,
         Path(self.temp_for_task.data_dir_for(self.task_id)))
     # Configure the validation to come, directives.
     import_how = ImportHow(self.prj_id, self.req.update_mode,
                            custom_mapping, self.req.skip_existing_objects,
                            loaded_files)
     if self.req.skip_loaded_files:
         import_how.compute_skipped(source_bundle, logger)
     # A structure to collect validation result
     import_diag = ImportDiagnostic()
     if not self.req.skip_existing_objects:
         with CodeTimer(
                 "do_intra_step_1: Existing images for %d: " % self.prj_id,
                 logger):
             import_diag.existing_objects_and_image = Image.fetch_existing_images(
                 self.session, self.prj_id)
     import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
     # Do the bulk job of validation
     nb_rows = source_bundle.validate_import(import_how, import_diag,
                                             self.session,
                                             self.report_progress)
     return import_how, import_diag, nb_rows
Example No. 5
    def _db_fetch(self, objids: List[int]) -> List[DBObjectTupleT]:
        """
            Do a DB read of given objects, with auxiliary objects.
            Thanks to 'contains_eager' calls, the objects are loaded into SQLAlchemy session.
            :param objids:
            :return:
        """
        ret: Query = self.session.query(Project, Sample, Acquisition, Process, ObjectHeader, ObjectFields, Image)
        ret = ret.join(Sample, Project.all_samples).options(contains_eager(Project.all_samples))
        ret = ret.join(Acquisition, Sample.all_acquisitions)
        ret = ret.join(Process, Acquisition.process)
        ret = ret.join(ObjectHeader, Acquisition.all_objects)
        # Natural joins
        ret = ret.join(ObjectFields)
        ret = ret.join(Image, ObjectHeader.all_images).options(contains_eager(ObjectHeader.all_images))
        ret = ret.filter(ObjectHeader.objid == any_(objids))
        ret = ret.order_by(ObjectHeader.objid)
        ret = ret.order_by(Image.imgrank)

        if self.first_query:
            logger.info("Query: %s", str(ret))
            self.first_query = False

        with CodeTimer("Get Objects:", logger):
            objs = [an_obj for an_obj in ret.all()]

        # We get as many lines as images
        logger.info("NB ROWS JOIN=%d", len(objs))

        return objs
Example No. 6
 def _collect_existing_and_validate(self, source_dir_or_zip, loaded_files) \
         -> Tuple[ImportHow, ImportDiagnostic, int]:
     """
         Prepare the import by checking what's inside the project and scanning files to input.
     """
     # The mapping to TSV custom columns, either empty or from previous import operations on same project.
     mapping = ProjectMapping().load_from_project(self.prj)
     # Source bundle construction
     bundle_temp_dir = Path(self.temp_for_jobs.data_dir_for(self.job_id))
     source_bundle = InBundle(source_dir_or_zip, bundle_temp_dir)
     # Configure the validation to come, directives.
     import_how = ImportHow(self.prj_id, self.req.update_mode, mapping,
                            self.req.skip_existing_objects, loaded_files)
     if self.req.skip_loaded_files:
         import_how.compute_skipped(source_bundle, logger)
     # A structure to collect validation result
     import_diag = ImportDiagnostic()
     if not self.req.skip_existing_objects:
         with CodeTimer(
                 "collect_existing: Existing images for %d: " % self.prj_id,
                 logger):
             import_diag.existing_objects_and_image = Image.fetch_existing_images(
                 self.session, self.prj_id)
     import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
     # Do the bulk job of validation
     nb_rows = source_bundle.validate_import(
         import_how, import_diag, self.session,
         self.report_validation_progress)
     return import_how, import_diag, nb_rows
Example No. 7
 def read_taxo_stats(session: Session,
                     prj_ids: ProjectIDListT,
                     taxa_ids: Union[str, ClassifIDListT]) -> List[ProjectTaxoStats]:
     sql = """
     SELECT pts.projid, ARRAY_AGG(pts.id) as ids, 
            SUM(CASE WHEN pts.id = -1 THEN pts.nbr ELSE 0 END) as nb_u, 
            SUM(pts.nbr_v) as nb_v, SUM(pts.nbr_d) as nb_d, SUM(pts.nbr_p) as nb_p
       FROM projects_taxo_stat pts
      WHERE pts.projid = ANY(:ids)"""
     params: Dict[str, Any] = {'ids': prj_ids}
     if len(taxa_ids) > 0:
         if taxa_ids == 'all':
             pass
         else:
             sql += " AND pts.id = ANY(:tids)"
             params["tids"] = taxa_ids
     sql += """
     GROUP BY pts.projid"""
     if len(taxa_ids) > 0:
         sql += ", pts.id"
     res: Result = session.execute(text(sql), params)
     with CodeTimer("stats for %d projects:" % len(prj_ids), logger):
         ret = [ProjectTaxoStats(rec) for rec in res.fetchall()]
     for a_stat in ret:
         a_stat.used_taxa.sort()
     return ret
Example No. 8
 def wait_for_done(self):
     """
          Signal the thread that we have no more files, and wait until the job is done.
     """
     self.logger.info("Approximately %d files in deletion queue",
                      self.files_queue.qsize())
     self.files_queue.put(None)
     with CodeTimer("Wait for files removal: ", self.logger):
         self.join()
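
Every snippet here wraps its timed section in CodeTimer(label, logger). The helper itself is not shown in these examples; a minimal context-manager sketch consistent with how it is called (a label plus a logger, with the elapsed time logged on exit) could look like the following — the project's real implementation may differ:

    import time

    class CodeTimer:
        """Sketch only: log the wall-clock duration of a 'with' block."""

        def __init__(self, label: str, a_logger):
            self.label = label
            self.logger = a_logger

        def __enter__(self):
            self.start = time.perf_counter()
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            elapsed = time.perf_counter() - self.start
            self.logger.info("%s %.3fs", self.label, elapsed)
            return False  # let exceptions propagate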
Example No. 9
    def do_run(self, current_user_id: int) -> ImportRealRsp:
        """
            Do the real job using injected parameters.
            :return:
        """
        # Security check
        RightsBO.user_wants(self.session, current_user_id, Action.ADMINISTRATE,
                            self.prj_id)
        # OK
        loaded_files = none_to_empty(self.prj.fileloaded).splitlines()
        logger.info("Previously loaded files: %s", loaded_files)

        # Save mappings straight away
        self.save_mapping(self.custom_mapping)

        source_bundle = InBundle(
            self.req.source_path,
            Path(self.temp_for_task.data_dir_for(self.task_id)))
        # Configure the import to come, destination
        db_writer = DBWriter(self.session)
        import_where = ImportWhere(
            db_writer, self.vault,
            self.temp_for_task.base_dir_for(self.task_id))
        # Configure the import to come, directives
        import_how = ImportHow(self.prj_id, self.req.update_mode,
                               self.custom_mapping,
                               self.req.skip_existing_objects, loaded_files)
        import_how.taxo_mapping = self.req.taxo_mappings
        import_how.taxo_found = self.req.found_taxa
        import_how.found_users = self.req.found_users
        if self.req.skip_loaded_files:
            import_how.compute_skipped(source_bundle, logger)
        if not self.req.skip_existing_objects:
            with CodeTimer("run: Existing images for %d: " % self.prj_id,
                           logger):
                import_how.objects_and_images_to_skip = Image.fetch_existing_images(
                    self.session, self.prj_id)
        import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))

        # Do the bulk job of import
        row_count = source_bundle.do_import(import_where, import_how,
                                            self.req.rowcount,
                                            self.report_progress)

        # Update loaded files in DB, removing duplicates
        self.prj.fileloaded = "\n".join(set(import_how.loaded_files))
        self.session.commit()

        # Recompute stats
        ProjectBO.do_after_load(self.session, self.prj_id)
        self.session.commit()

        logger.info("Total of %d rows loaded" % row_count)

        # Prepare response
        ret = ImportRealRsp()
        return ret
Example No. 10
 def _safe_update(self, upd_sql: str, params: Dict):
     file_name = ObjectCache.file_name(self.projid)
     conn = SQLite3.get_conn(file_name, "rw")  # No create!
     try:
         with CodeTimer("SQLite update using '%s':" % upd_sql, logger):
             conn.execute(upd_sql, params)
             conn.commit()
     finally:
         conn.close()
Example No. 11
 def get_projects_ids(self) -> ProjectIDListT:
     """
         Return the project IDs for the held sample IDs.
     """
     qry: Query = self.session.query(Project.projid).distinct(Project.projid)
     qry = qry.join(Sample, Project.all_samples)
     qry = qry.filter(Sample.sampleid == any_(self.ids))
     with CodeTimer("Prjs for %d samples: " % len(self.ids), logger):
         return [an_id[0] for an_id in qry.all()]
Example No. 12
    def do_real(self) -> None:
        """
            Do the real job, i.e. write everywhere (DB/filesystem)
        """
        loaded_files = none_to_empty(self.prj.fileloaded).splitlines()
        logger.info("Previously loaded files: %s", loaded_files)

        found_users, taxo_found, col_mapping_dict, \
        nb_rows, source_path = self._load_vars_from_state(self.STATE_KEYS)

        # Save mappings straight away
        col_mapping = ProjectMapping().load_from_dict(col_mapping_dict)
        col_mapping.write_to_project(self.prj)
        self.session.commit()

        # TODO: Duplicated code
        source_bundle = InBundle(
            source_path, Path(self.temp_for_jobs.data_dir_for(self.job_id)))
        # Configure the import to come, destination
        db_writer = DBWriter(self.session)
        import_where = ImportWhere(
            db_writer, self.vault,
            self.temp_for_jobs.base_dir_for(self.job_id))
        # Configure the import to come, directives
        import_how = ImportHow(self.prj_id, self.req.update_mode, col_mapping,
                               self.req.skip_existing_objects, loaded_files)
        import_how.taxo_mapping = self.req.taxo_mappings
        import_how.found_taxa = taxo_found
        import_how.found_users = found_users
        if self.req.skip_loaded_files:
            import_how.compute_skipped(source_bundle, logger)
        if self.req.skip_existing_objects:
            # If we must skip existing objects then take an inventory of what's already in the project
            with CodeTimer("run: Existing images for %d: " % self.prj_id,
                           logger):
                import_how.objects_and_images_to_skip = Image.fetch_existing_images(
                    self.session, self.prj_id)
        import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))

        # Do the bulk job of import
        rowcount_from_validate = nb_rows
        row_count = source_bundle.do_import(import_where, import_how,
                                            rowcount_from_validate,
                                            self.report_progress)

        # Update loaded files in DB, removing duplicates
        self.prj.fileloaded = "\n".join(set(import_how.loaded_files))
        self.session.commit()

        # Recompute stats
        ProjectBO.do_after_load(self.session, self.prj_id)
        self.session.commit()

        msg = "Total of %d rows loaded" % row_count
        logger.info(msg)
        self.set_job_result(errors=[], infos={"rowcount": row_count})
Example No. 13
 def get_projects_ids(self) -> ProjectIDListT:
     """
          Return the project IDs for the owned object IDs.
     """
     qry: Query = self.session.query(Project.projid).distinct(
         Project.projid)
     qry = qry.join(Sample)
     qry = qry.join(Acquisition)
     qry = qry.join(ObjectHeader)
     qry = qry.filter(ObjectHeader.objid == any_(self.object_ids))
     with CodeTimer("Prjs for %d objs: " % len(self.object_ids), logger):
         return [an_id for an_id, in qry.all()]
Example No. 14
    def _get_last_classif_history(self, from_user_id: Optional[int], but_not_from_user_id: Optional[int]) \
            -> List[HistoricalLastClassif]:
        """
            Query for last classification history on all objects of self, mixed with present state in order
            to have restore-able lines.
        """
        # Get the histo entries
        subqry: Query = self.session.query(
            ObjectsClassifHisto,
            func.rank().over(
                partition_by=ObjectsClassifHisto.objid,
                order_by=ObjectsClassifHisto.classif_date.desc()).label("rnk"))
        if from_user_id:
            subqry = subqry.filter(
                ObjectsClassifHisto.classif_who == from_user_id)
        if but_not_from_user_id:
            subqry = subqry.filter(
                ObjectsClassifHisto.classif_who != but_not_from_user_id)
        subqry = subqry.filter(ObjectsClassifHisto.classif_type == "M")
        subq_alias: Alias = subqry.filter(
            ObjectsClassifHisto.objid == any_(self.object_ids)).subquery()

        # Also get some fields from ObjectHeader for referencing, info, and fallback
        qry = self.session.query(
            ObjectHeader.objid, ObjectHeader.classif_id,
            func.coalesce(subq_alias.c.classif_date,
                          ObjectHeader.classif_auto_when),
            subq_alias.c.classif_type,
            func.coalesce(subq_alias.c.classif_id,
                          ObjectHeader.classif_auto_id).label("h_classif_id"),
            func.coalesce(
                subq_alias.c.classif_qual,
                case([(ObjectHeader.classif_auto_id.isnot(None), 'P')])),
            subq_alias.c.classif_who)
        qry = qry.join(subq_alias,
                       ObjectHeader.objid == subq_alias.c.objid,
                       isouter=(from_user_id is None))
        if from_user_id is not None:
            # If taking history from a user, don't apply it to objects for which he/she
            # is already the last (current) classifier.
            qry = qry.filter(ObjectHeader.classif_who != from_user_id)
            qry = qry.filter(subq_alias.c.rnk == 1)
        else:
            # Taking any history, including nothing, so emit blank history (see isouter above)
            qry = qry.filter(ObjectHeader.objid == any_(self.object_ids))
            qry = qry.filter(
                or_(subq_alias.c.rnk == 1, subq_alias.c.rnk.is_(None)))
        logger.info("_get_last_classif_history qry:%s", str(qry))
        with CodeTimer("HISTORY for %d objs: " % len(self.object_ids), logger):
            ret = [HistoricalLastClassif(rec) for rec in qry.all()]
        logger.info("_get_last_classif_history qry: %d rows", len(ret))
        return ret
Example No. 15
 def _count(self) -> Optional[int]:
     # noinspection SqlResolve
     where_sql = self.cache_where.get_sql()
     select_sql = re.sub("objf?id", "COUNT(1)", self._from(), 1)
     read_sql = select_sql + where_sql
     try:
         with CodeTimer("SQLite count using '%s':" % read_sql, logger):
             assert self.conn
             res: Cursor = self.conn.execute(read_sql, self.where_params)
             cnt, = res.fetchone()
             res.close()
         return cnt
     except (OperationalError, ProgrammingError) as e:
         logger.error("In %s : %s", read_sql, str(e))
     except Exception as ae:
         logger.error(ae.__class__)
     return None
Example No. 16
 def read_taxo_stats(self) -> List[SampleTaxoStats]:
     sql = text("""
     SELECT sam.sampleid,
            ARRAY_AGG(DISTINCT COALESCE(obh.classif_id, -1)) as ids,
            SUM(CASE WHEN obh.classif_id <> -1 THEN 0 ELSE 1 END) as nb_u,
            COUNT(CASE WHEN obh.classif_qual = 'V' THEN 1 END) nbr_v,
            COUNT(CASE WHEN obh.classif_qual = 'D' THEN 1 END) nbr_d, 
            COUNT(CASE WHEN obh.classif_qual = 'P' THEN 1 END) nbr_p
       FROM obj_head obh
       JOIN acquisitions acq ON acq.acquisid = obh.acquisid 
       JOIN samples sam ON sam.sampleid = acq.acq_sample_id
      WHERE sam.sampleid = ANY(:ids)
      GROUP BY sam.sampleid;""")
     with CodeTimer("Stats for %d samples: " % len(self.ids), logger):
         res = self.session.execute(sql, {'ids': self.ids})
         ret = [SampleTaxoStats(rec) for rec in res]
     return ret
Example No. 17
 def _fetch(self):
     # noinspection SqlResolve
     where_sql = self.cache_where.get_sql()
     read_sql = self._from() + " %s %s LIMIT %d OFFSET %d" % (
         where_sql, self.pg_order.get_sql(), self.pg_window_size, self.pg_window_start)
     try:
         with CodeTimer("SQLite read using '%s':" % read_sql, logger):
             assert self.conn
             res: Cursor = self.conn.execute(read_sql, self.where_params)
             # TODO: try fetchmany
             objid_list = [objid for objid, in res]
             res.close()
         return objid_list
     except (OperationalError, ProgrammingError) as e:
         logger.error("In %s : %s", read_sql, str(e))
     except Exception as ae:
         logger.error(ae.__class__)
     return None
Example No. 18
    def validate_import(self, how: ImportHow, diag: ImportDiagnostic, session: Session, report_def: Callable) -> int:
        """
            Validate the full bundle, i.e. every contained file.
            :return:
        """
        with CodeTimer("validate_import: Existing images for %d: " % how.prj_id, logger):
            how.objects_and_images_to_skip = Image.fetch_existing_images(session, how.prj_id)

        total_row_count = self.validate_each_file(how, diag, report_def)

        if total_row_count == 0:
            # Try to be explicit in messages
            nb_found = len(self.possible_files)
            nb_skipped = len(diag.skipped_files)
            err_msg = ["No object to import."]
            if nb_found == 0:
                err_msg.append("* No .txt or .tsv file was found, of which name starts with 'ecotaxa'.")
            else:
                nb_validated = nb_found - nb_skipped
                if nb_skipped > 0:
                    if nb_validated == 0:
                        err_msg.append("* 'SKIP TSV' option was set and all TSV files were imported before.")
                    else:
                        err_msg.append("* 'SKIP TSV' option was set and new TSV file(s) are not compliant.")
                if nb_validated > 0:
                    err_msg.append("*  TSV file(s) might be empty.")
                if how.skip_object_duplicates:
                    err_msg.append("*  'SKIP OBJECTS' option was set and all objects might be in already.")
            diag.error("<br>".join(err_msg))

        if len(diag.classif_id_seen) > 0:
            self.check_classif(session, diag, diag.classif_id_seen)

        logger.info("Taxo Found = %s", how.taxo_found)
        logger.info("Users Found = %s", how.found_users)
        not_seen_fields = how.custom_mapping.all_fields.keys() - diag.cols_seen
        logger.info("For Information, not seen fields %s", not_seen_fields)
        if len(not_seen_fields) > 0:
            diag.warn("Some fields configured in the project are not seen in this import {0} "
                      .format(", ".join(not_seen_fields)))
        if diag.nb_objects_without_gps > 0:
            diag.warn("{0} object(s) don't have GPS information."
                      .format(diag.nb_objects_without_gps))
        return total_row_count
Example No. 19
    def summary(self, current_user_id: Optional[UserIDT], proj_id: ProjectIDT, filters: ProjectFilters,
                only_total: bool) -> Tuple[int, Optional[int], Optional[int], Optional[int]]:
        """
            Query the given project with given filters, return classification summary, or just grand total if
            only_total is set.
        """
        # Security check
        if current_user_id is None:
            RightsBO.anonymous_wants(self.session, Action.READ, proj_id)
            # Anonymous can only see validated objects
            # TODO: Dup code
            # noinspection PyTypeHints
            filters.statusfilter = "V"  # type:ignore
            user_id = -1
        else:
            user, _project = RightsBO.user_wants(self.session, current_user_id, Action.READ, proj_id)
            user_id = user.id

        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(self.session, proj_id, filters)
        from_, where, params = object_set.get_sql(user_id)
        sql = """
    SET LOCAL enable_seqscan=FALSE;
    SELECT COUNT(*) nbr"""
        if only_total:
            sql += """, NULL nbr_v, NULL nbr_d, NULL nbr_p"""
        else:
            sql += """, 
           COUNT(CASE WHEN obh.classif_qual = 'V' THEN 1 END) nbr_v,
           COUNT(CASE WHEN obh.classif_qual = 'D' THEN 1 END) nbr_d, 
           COUNT(CASE WHEN obh.classif_qual = 'P' THEN 1 END) nbr_p"""
        sql += """
      FROM """ + from_.get_sql() + " " + where.get_sql()

        with CodeTimer("summary: V/D/P for %d using %s " % (proj_id, sql), logger):
            res: ResultProxy = self.session.execute(sql, params)

        nbr: int
        nbr_v: Optional[int]
        nbr_d: Optional[int]
        nbr_p: Optional[int]
        nbr, nbr_v, nbr_d, nbr_p = res.first()  # type:ignore
        return nbr, nbr_v, nbr_d, nbr_p
Example No. 20
    def _find_what_to_dump(self) -> None:
        """
            Determine the objects to dump.
        """
        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(self.session, self.prj.projid, self.filters)
        from_, where, params = object_set.get_sql(self.requester_id)

        sql = """ SELECT objid FROM """ + from_.get_sql() + where.get_sql()

        logger.info("SQL=%s", sql)
        logger.info("SQLParam=%s", params)

        with CodeTimer("Get IDs:", logger):
            res: ResultProxy = self.session.execute(sql, params)
        ids = [r['objid'] for r in res]

        logger.info("NB OBJIDS=%d", len(ids))

        self.ids_to_dump = ids
Example No. 21
 def do_cleanup_dup_same_obj(self, current_user_id: UserIDT,
                             prj_id: ProjectIDT, max_deletes: int) -> str:
     """
         Simplest duplication pattern. Inside the same object there are several identical images.
     """
     _user = RightsBO.user_has_role(self.ro_session, current_user_id,
                                    Role.APP_ADMINISTRATOR)
     orig_img = aliased(Image, name="orig")
     orig_file = aliased(ImageFile, name="orig_file")
     qry: Query = self.session.query(orig_img.file_name, orig_img.imgid,
                                     Image,
                                     ImageFile)  # Select what to delete
     qry = qry.join(ObjectHeader, ObjectHeader.objid == Image.objid).join(
         Acquisition).join(Sample).join(Project)
     # We consider that the original image is the oldest one, so duplicates have a higher ID
     qry = qry.join(
         orig_img,
         and_(orig_img.objid == Image.objid,
              orig_img.orig_file_name == Image.orig_file_name,
              orig_img.width == Image.width,
              orig_img.height == Image.height,
              orig_img.imgid < Image.imgid))
     # Must have a checksum, with the same state (sane)
     qry = qry.join(
         ImageFile,
         and_(ImageFile.path == Image.file_name,
              ImageFile.state == ImageFileStateEnum.OK.value))
     qry = qry.join(
         orig_file,
         and_(orig_file.path == orig_img.file_name,
              orig_file.state == ImageFileStateEnum.OK.value))
     # and the same value of course
     qry = qry.filter(
         and_(ImageFile.digest_type == orig_file.digest_type,
              ImageFile.digest == orig_file.digest))
     qry = qry.filter(Project.projid == prj_id)
     qry = qry.order_by(Image.objid, orig_img.imgid, Image.imgid)
     qry = qry.limit(max_deletes)
     with CodeTimer(
             "Dups same objs inside %d, query '%s':" % (prj_id, str(qry)),
             logger):
         to_do = [(orig_file_name, orig_img_id, an_image, an_image_file)
                  for orig_file_name, orig_img_id, an_image, an_image_file
                  in qry.all()]
     ko_not_same = 0
     ko_except = 0
     # Prepare & start a remover thread that will run in // with DB queries
     remover = VaultRemover(self.link_src, logger).do_start()
     filecmp.clear_cache()
     deleted_imgids: Set[int] = set()
     for orig_file_name, orig_img_id, an_image, an_image_file in to_do:
         # The query returns multiple rows if there are more than 2 duplicates
         if orig_img_id in deleted_imgids:
             continue
         # Even if MD5s match, be paranoid and compare files
         orig_path = self.vault.sub_path(orig_file_name)
         dup_path = self.vault.sub_path(an_image.file_name)
         assert orig_path != dup_path
         orig_exists = exists(orig_path)
         dup_exists = exists(dup_path)
         if orig_exists:
             if dup_exists:
                 try:
                     same = filecmp.cmp(orig_path, dup_path, False)
                 except Exception as exc:
                     logger.info(
                         "Exception while comparing orig:%s and dup:%s: %s",
                         orig_path, dup_path, str(exc))
                     ko_except += 1
                     continue
                 if not same:
                     ko_not_same += 1
                     continue
             else:
                 # Duplicate is gone already
                 pass
         else:
             # DB record of physical file is wrong
             # TODO
             continue
         # Do the cleanup
         deleted_imgids.add(an_image.imgid)
         if dup_exists:
             remover.add_files([an_image.file_name])
         self.session.delete(an_image)
         self.session.delete(an_image_file)
     # Wait for the files handled
     self.session.commit()
     remover.wait_for_done()
     return (
         "Duplicate remover done for %s duplicate images, but %d comparison exceptions and %d content mismatches"
         % (len(deleted_imgids), ko_except, ko_not_same))
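
The byte-level check above relies on the standard library: filecmp.cmp(a, b, False) (shallow=False) compares file contents rather than just os.stat() signatures, and filecmp.clear_cache() drops memoized results so an earlier comparison cannot mask a later change. A tiny standalone illustration:

    import filecmp
    import os
    import tempfile

    with tempfile.TemporaryDirectory() as tmp:
        path_a = os.path.join(tmp, "a.bin")
        path_b = os.path.join(tmp, "b.bin")
        for a_path in (path_a, path_b):
            with open(a_path, "wb") as f:
                f.write(b"same bytes")
        filecmp.clear_cache()  # forget any previously memoized comparison
        assert filecmp.cmp(path_a, path_b, shallow=False)  # True: contents match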
Example No. 22
    def projects_for_user(session: Session, user: User,
                          for_managing: bool = False,
                          not_granted: bool = False,
                          title_filter: str = '',
                          instrument_filter: str = '',
                          filter_subset: bool = False) -> List[ProjectIDT]:
        """
        :param session:
        :param user: The user for which the list is needed.
        :param for_managing: If set, list the projects that the user can manage.
        :param not_granted: If set, list (only) the projects on which given user has no right, so user can
                                request access to them.
        :param title_filter: If set, filter out the projects with title not matching the required string,
                                or if set to a number, filter out the projects of which ID does not match.
        :param instrument_filter: If set, filter out the projects which do not have given instrument in at least
                                     one sample.
        :param filter_subset: If set, filter out any project of which title contains 'subset'.
        :return: The project IDs
        """
        sql_params: Dict[str, Any] = {"user_id": user.id}

        # Default query: all projects, possibly with first-manager information
        # noinspection SqlResolve
        sql = """SELECT p.projid
                       FROM projects p
                       LEFT JOIN ( """ + ProjectPrivilegeBO.first_manager_by_project() + """ ) fpm 
                         ON fpm.projid = p.projid """
        if not_granted:
            # Add the projects for which no entry is found in ProjectPrivilege
            sql += """
                       LEFT JOIN projectspriv pp ON p.projid = pp.projid AND pp.member = :user_id
                      WHERE pp.member is null """
            if for_managing:
                sql += " AND False "
        else:
            if not user.has_role(Role.APP_ADMINISTRATOR):
                # Not an admin, so restrict to projects which current user can work on, or view
                sql += """
                            JOIN projectspriv pp 
                              ON p.projid = pp.projid 
                             AND pp.member = :user_id """
                if for_managing:
                    sql += """
                             AND pp.privilege = '%s' """ % ProjectPrivilegeBO.MANAGE
            sql += " WHERE 1 = 1 "

        if title_filter != '':
            sql += """ 
                        AND ( title ILIKE '%%'|| :title ||'%%'
                              OR TO_CHAR(p.projid,'999999') LIKE '%%'|| :title ) """
            sql_params["title"] = title_filter

        if instrument_filter != '':
            sql += """
                         AND p.projid IN (SELECT DISTINCT sam.projid FROM samples sam, acquisitions acq
                                           WHERE acq.acq_sample_id = sam.sampleid
                                             AND acq.instrument ILIKE '%%'|| :instrum ||'%%' ) """
            sql_params["instrum"] = instrument_filter

        if filter_subset:
            sql += """
                         AND NOT title ILIKE '%%subset%%'  """

        with CodeTimer("Projects query:", logger):
            res: Result = session.execute(text(sql), sql_params)
            # single-element tuple :( DBAPI
            ret = [an_id for an_id, in res.fetchall()]
        return ret  # type:ignore
Example No. 23
 def read_user_stats(session: Session, prj_ids: ProjectIDListT) -> List[ProjectUserStats]:
     """
         Read the users (annotators) involved in each project.
         Also compute a summary of their activity. This can only be an estimate since, e.g.,
         imported data contains exactly the same information as data produced by live actions.
     """
     # Activity count: Count 1 for present classification for a user per object.
     #  Of course, the classification date is the latest for the user.
     pqry: Query = session.query(Project.projid, User.id, User.name,
                                 func.count(ObjectHeader.objid),
                                 func.max(ObjectHeader.classif_when))
     pqry = pqry.join(Sample).join(Acquisition).join(ObjectHeader)
     pqry = pqry.join(User, User.id == ObjectHeader.classif_who)
     pqry = pqry.filter(Project.projid == any_(prj_ids))
     pqry = pqry.filter(ObjectHeader.classif_who == User.id)
     pqry = pqry.group_by(Project.projid, User.id)
     pqry = pqry.order_by(Project.projid, User.name)
     ret = []
     user_activities: Dict[UserIDT, UserActivity] = {}
     user_activities_per_project = {}
     stats_per_project = {}
     with CodeTimer("user present stats for %d projects, qry: %s:" % (len(prj_ids), str(pqry)), logger):
         last_prj = None
         for projid, user_id, user_name, cnt, last_date in pqry.all():
             last_date_str = last_date.replace(microsecond=0).isoformat()
             if projid != last_prj:
                 last_prj = projid
                 prj_stat = ProjectUserStats((projid, [], []))
                 ret.append(prj_stat)
                 user_activities = {}
                 # Store for second pass with history
                 stats_per_project[projid] = prj_stat
                 user_activities_per_project[projid] = user_activities
             prj_stat.annotators.append(MinimalUserBO((user_id, user_name)))
             user_activity = UserActivity((user_id, cnt, last_date_str))
             prj_stat.activities.append(user_activity)
             # Store for second pass
             user_activities[user_id] = user_activity
     # Activity count update: Add 1 for each entry in history for each user.
      # The dates in history are ignored, except for users who do not appear in the first result set.
     hqry: Query = session.query(Project.projid, User.id, User.name,
                                 func.count(ObjectsClassifHisto.objid),
                                 func.max(ObjectsClassifHisto.classif_date))
     hqry = hqry.join(Sample).join(Acquisition).join(ObjectHeader).join(ObjectsClassifHisto)
     hqry = hqry.join(User, User.id == ObjectsClassifHisto.classif_who)
     hqry = hqry.filter(Project.projid == any_(prj_ids))
     hqry = hqry.group_by(Project.projid, User.id)
     hqry = hqry.order_by(Project.projid, User.name)
     with CodeTimer("user history stats for %d projects, qry: %s:" % (len(prj_ids), str(hqry)), logger):
         last_prj = None
         for projid, user_id, user_name, cnt, last_date in hqry.all():
             last_date_str = last_date.replace(microsecond=0).isoformat()
             if projid != last_prj:
                 last_prj = projid
                 # Just in case
                 if projid not in user_activities_per_project:
                     continue
                 # Get stored data for the project
                 user_activities = user_activities_per_project[projid]
                 prj_stat = stats_per_project[projid]
             already_there = user_activities.get(user_id)
             if already_there is not None:
                 # A user in both history and present classification
                 already_there.nb_actions += cnt
             else:
                 # A user _only_ in history
                 prj_stat.annotators.append(MinimalUserBO((user_id, user_name)))
                 user_activity = UserActivity((user_id, cnt, last_date_str))
                 prj_stat.activities.append(user_activity)
                 user_activities[user_id] = user_activity
     return ret
Example No. 24
    def aggregate_for_sample(
            self, sample: Sample) -> Dict[ClassifIDT, AggregForTaxon]:
        """
            Do the aggregations for the sample for each taxon and return them, they will become emofs
                - 'Abundance' -> CountOfBiologicalEntity -> count of objects group by taxon
                - 'Concentration' -> AbundancePerUnitVolumeOfTheWaterBody
                    -> sum(individual_concentration) group by taxon
                        with individual_concentration = 1 / subsample_coef / total_water_volume
                - 'Biovolume' -> BiovolumeOfBiologicalEntity -> sum(individual_biovolume) group by taxon
                    with individual_biovolume = individual_volume / subsample_coef / total_water_volume
            The abundance can always be computed. The other two depend on the availability of values
            for the project and on the configuration variable.
        """
        # We return all per taxon.
        ret: Dict[ClassifIDT, EMODnetExport.AggregForTaxon] = {}

        count_per_taxon_per_acquis: Dict[AcquisitionIDT, Dict[ClassifIDT,
                                                              int]] = {}

        # Start with abundances: a simple count, which also provides the keys of the returned dict.
        acquis_for_sample = SampleBO.get_acquisitions(self.session, sample)
        for an_acquis in acquis_for_sample:
            # Get counts for acquisition (subsample)
            count_per_taxon_for_acquis: Dict[
                ClassifIDT,
                int] = AcquisitionBO.get_sums_by_taxon(self.session,
                                                       an_acquis.acquisid)
            if self.auto_morpho:
                self.add_morpho_counts(count_per_taxon_for_acquis)
            count_per_taxon_per_acquis[
                an_acquis.acquisid] = count_per_taxon_for_acquis
            for an_id, count_4_acquis in count_per_taxon_for_acquis.items():
                aggreg_for_taxon = ret.get(an_id)
                if aggreg_for_taxon is None:
                    ret[an_id] = self.AggregForTaxon(count_4_acquis, None,
                                                     None)
                else:
                    aggreg_for_taxon.abundance += count_4_acquis

        if not self.with_computations:
            return ret

        # Enrich with concentrations
        subsampling_coeff_per_acquis: Dict[AcquisitionIDT, float] = {}
        try:
            # Fetch calculation data at sample level
            sample_volume = SampleBO.get_computed_var(
                sample, DefaultVars.volume_sampled)
        except TypeError as e:
            self.warnings.append(
                "Could not compute volume sampled from sample %s (%s),"
                " no concentration or biovolume will be computed." %
                (sample.orig_id, str(e)))
            sample_volume = -1
        if sample_volume > 0:
            # Cumulate for subsamples AKA acquisitions
            for an_acquis in acquis_for_sample:
                try:
                    subsampling_coefficient = AcquisitionBO.get_computed_var(
                        an_acquis, DefaultVars.subsample_coeff)
                    subsampling_coeff_per_acquis[
                        an_acquis.acquisid] = subsampling_coefficient
                except TypeError as e:
                    self.warnings.append(
                        "Could not compute subsampling coefficient from acquisition %s (%s),"
                        " no concentration or biovolume will be computed" %
                        (an_acquis.orig_id, str(e)))
                    logger.info(
                        "concentrations: no subsample coeff for '%s' (%s)",
                        an_acquis.orig_id, str(e))
                    continue
                # Get counts for acquisition (sub-sample)
                logger.info("computing concentrations for '%s'",
                            an_acquis.orig_id)
                count_per_taxon_for_acquis = count_per_taxon_per_acquis[
                    an_acquis.acquisid]
                for an_id, count_4_acquis in count_per_taxon_for_acquis.items(
                ):
                    aggreg_for_taxon = ret[an_id]
                    concentration_for_taxon = count_4_acquis / subsampling_coefficient / sample_volume
                    if aggreg_for_taxon.concentration is None:
                        aggreg_for_taxon.concentration = 0
                    aggreg_for_taxon.concentration += concentration_for_taxon

        # Enrich with biovolumes. This needs a computation for each object, so it's likely to be slow.
        if sample_volume > 0:
            # Mappings are constant for the sample
            # noinspection PyTypeChecker
            mapping = ProjectMapping().load_from_project(sample.project)
            # Cumulate for subsamples AKA acquisitions
            for an_acquis in acquis_for_sample:
                subsampling_coefficient = subsampling_coeff_per_acquis.get(
                    an_acquis.acquisid)
                if subsampling_coefficient is None:
                    logger.info("biovolumes: no subsample coeff for '%s'",
                                an_acquis.orig_id)
                    continue
                # Get the pixel size from the associated process; it is a constant for the individual biovolume computations
                try:
                    pixel_size, = ProcessBO.get_free_fields(
                        an_acquis.process, ["particle_pixel_size_mm"], [float],
                        [None])
                except TypeError as _e:
                    logger.info("biovolumes: no pixel size for '%s'",
                                an_acquis.orig_id)
                    continue
                constants = {"pixel_size": pixel_size}
                # Get all objects for the acquisition. The filter on classif_id is useless for now.
                with CodeTimer("Objects IDs for '%s': " % an_acquis.orig_id,
                               logger):
                    acq_object_ids = AcquisitionBO.get_all_object_ids(
                        session=self.session,
                        acquis_id=an_acquis.acquisid,
                        classif_ids=list(ret.keys()))
                with CodeTimer("Objects for '%s': " % an_acquis.orig_id,
                               logger):
                    objects = ObjectBOSet(self.ro_session, acq_object_ids,
                                          mapping.object_mappings)
                nb_biovols = 0
                for an_obj in objects.all:
                    # Compute a biovol if possible
                    try:
                        biovol = ObjectBO.get_computed_var(
                            an_obj, DefaultVars.equivalent_ellipsoidal_volume,
                            mapping, constants)
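                        # Note: the ellipsoidal result just computed is overwritten below,
                        # so the spherical fallback is always attempted.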
                        biovol = -1
                    except TypeError as _e:
                        biovol = -1
                    if biovol == -1:
                        try:
                            biovol = ObjectBO.get_computed_var(
                                an_obj,
                                DefaultVars.equivalent_spherical_volume,
                                mapping, constants)
                        except TypeError as _e:
                            continue
                    # Aggregate by category/taxon
                    aggreg_for_taxon = ret[an_obj.classif_id]
                    individual_biovolume = biovol / subsampling_coefficient / sample_volume
                    if aggreg_for_taxon.biovolume is None:
                        aggreg_for_taxon.biovolume = 0
                    aggreg_for_taxon.biovolume += individual_biovolume
                    # Update stats
                    nb_biovols += 1
                # A bit of display
                logger.info(
                    "%d biovolumes computed for '%s' out of %d objects",
                    nb_biovols, an_acquis.orig_id, len(acq_object_ids))

        return ret
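
As a quick sanity check of the docstring's formulas, a worked example with invented numbers (they are not taken from the code above):

    # Invented numbers, illustration only
    subsample_coef = 4.0        # one analysed object stands for 4 sampled ones
    total_water_volume = 0.5    # volume of water sampled
    count_for_taxon = 10        # objects of the taxon counted in the acquisition

    individual_concentration = 1 / subsample_coef / total_water_volume      # 0.5
    concentration = count_for_taxon * individual_concentration              # 5.0

    individual_volume = 0.002   # per-object biovolume from morphometry
    individual_biovolume = individual_volume / subsample_coef / total_water_volume  # 0.001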
Example No. 25
 def fetch_existing_objects(session, prj_id):
     """
         Get existing object IDs (orig_id AKA object_id in TSV) from the project
     """
     with CodeTimer("Existing objects for %d: " % prj_id, logger):
         return ObjectHeader.fetch_existing_objects(session, prj_id)
Example No. 26
    def query(self, current_user_id: Optional[UserIDT], proj_id: ProjectIDT,
              filters: ProjectFilters,
              order_field: Optional[str] = None,
              window_start: Optional[int] = None,
              window_size: Optional[int] = None) \
            -> Tuple[ObjectIDWithParentsListT, int]:
        """
            Query the given project with given filters, return all IDs.
            If order_field is provided, the result is sorted by this field.
            Ties are resolved in a way that is stable across calls.
            window_start and window_size allow selecting a window of data in the result.
        """
        # Security check
        if current_user_id is None:
            RightsBO.anonymous_wants(self.session, Action.READ, proj_id)
            # Anonymous can only see validated objects
            # noinspection PyTypeHints
            filters.statusfilter = "V"  # type:ignore
            user_id = -1
        else:
            user, _project = RightsBO.user_wants(self.session, current_user_id, Action.READ, proj_id)
            user_id = user.id

        # The order field has an impact on the query
        order_clause = self.cook_order_clause(order_field)

        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(self.session, proj_id, filters)

        from_, where, params = object_set.get_sql(user_id, order_clause)

        if "obf." in where.get_sql():
            # If the filter needs obj_field data it's more efficient to count with a window function
            # than issuing a second query.
            extra_col = ", COUNT(objid) OVER() AS total"
        else:
            # Otherwise, no need for obj_field in count, less DB buffers
            extra_col = ", 0 AS total"

        # The following hint is needed until we sort out why, from time to time, there is a full table scan on obj_head
        sql = """
    SET LOCAL enable_seqscan=FALSE;
    SELECT obh.objid, acq.acquisid, sam.sampleid %s
      FROM """ % extra_col + from_.get_sql() + " " + where.get_sql()

        # Add order & window if relevant
        if order_clause is not None:
            sql += order_clause.get_sql()
        if window_start is not None:
            sql += " OFFSET %d" % window_start
        if window_size is not None:
            sql += " LIMIT %d" % window_size

        with CodeTimer("query: for %d using %s " % (proj_id, sql), logger):
            res: ResultProxy = self.session.execute(sql, params)
        ids = []
        total = 0
        objid: int
        acquisid: int
        sampleid: int
        for objid, acquisid, sampleid, total in res:  # type:ignore
            ids.append((objid, acquisid, sampleid, proj_id))

        if total == 0:
            # Total was not computed, or was left at 0
            total, _nbr_v, _nbr_d, _nbr_p = self.summary(current_user_id, proj_id, filters, True)

        return ids, total
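
The COUNT(objid) OVER() AS total trick above makes every returned row carry the grand total computed before LIMIT/OFFSET is applied, so a single query yields both the page of IDs and the overall count. A minimal sketch of the pattern with made-up table and column names:

    # Made-up schema, sketch only: the window count is evaluated over the whole
    # filtered set, then LIMIT/OFFSET trims the rows that are actually returned.
    page_sql = """
        SELECT obj.id, COUNT(obj.id) OVER () AS total
          FROM some_table obj
         WHERE obj.status = :status
         ORDER BY obj.id
         LIMIT 100 OFFSET 200"""
    rows = session.execute(page_sql, {"status": "V"}).fetchall()
    ids = [an_id for an_id, _total in rows]
    total = rows[0][1] if rows else 0  # fall back to a separate count if the page is empty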