def _delete_chunk(session: Session, a_chunk: ObjectIDListT) -> Tuple[int, int, List[str]]:
    """
        Delete a chunk from self's object list.
        Technical Note: We use SQLA Core as we don't want to fetch the rows
    """
    # Start with images which are not deleted via a CASCADE on DB side
    # This is maybe due to relationship cycle b/w ObjectHeader and Images @See comment in Image class
    img_del_qry: Delete = Image.__table__.delete()
    img_del_qry = img_del_qry.where(Image.objid == any_(a_chunk))
    img_del_qry = img_del_qry.returning(Image.file_name, Image.thumb_file_name)
    with CodeTimer("DELETE for %d images: " % len(a_chunk), logger):
        files_res = session.execute(img_del_qry)
        img_files = []
        nb_img_rows = 0
        for a_file_tuple in files_res:
            # We have main file and optionally the thumbnail one
            for a_file in a_file_tuple:
                if a_file:
                    img_files.append(a_file)
            nb_img_rows += 1
        logger.info("Removed: %d rows, to remove: %d files", nb_img_rows, len(img_files))
    obj_del_qry: Delete = ObjectHeader.__table__.delete()
    obj_del_qry = obj_del_qry.where(ObjectHeader.objid == any_(a_chunk))
    with CodeTimer("DELETE for %d objs: " % len(a_chunk), logger):
        nb_objs = session.execute(obj_del_qry).rowcount
    session.commit()
    # TODO: Cache delete
    return nb_objs, nb_img_rows, img_files

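# --- Illustrative sketch (not part of the original code) ---------------------
# How a caller might drive _delete_chunk over a large ID list, splitting it into
# fixed-size chunks so each DELETE statement stays bounded. The chunk size and
# the free-standing call to _delete_chunk (it may well live on a BO class) are
# assumptions for illustration only.
def _delete_in_chunks_example(session: Session, object_ids: List[int],
                              chunk_size: int = 400) -> Tuple[int, int, List[str]]:
    nb_objs_total, nb_img_rows_total = 0, 0
    all_img_files: List[str] = []
    for start in range(0, len(object_ids), chunk_size):
        a_chunk = object_ids[start:start + chunk_size]
        nb_objs, nb_img_rows, img_files = _delete_chunk(session, a_chunk)
        nb_objs_total += nb_objs
        nb_img_rows_total += nb_img_rows
        all_img_files.extend(img_files)
    return nb_objs_total, nb_img_rows_total, all_img_files
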
def __init__(self, session: Session, prj_ids: ProjectIDListT, public: bool = False):
    # Query the project and load neighbours as well
    qry: Query = session.query(Project, ProjectPrivilege)
    qry = qry.outerjoin(ProjectPrivilege, Project.privs_for_members).options(
        contains_eager(Project.privs_for_members))
    qry = qry.outerjoin(User, ProjectPrivilege.user).options(
        contains_eager(ProjectPrivilege.user))
    qry = qry.filter(Project.projid == any_(prj_ids))
    self.projects = []
    done = set()
    with CodeTimer("%s BO projects query & init:" % len(prj_ids), logger):
        for a_proj, a_pp in qry.all():
            # The query yields duplicates so we need to filter
            if a_proj.projid not in done:
                if public:
                    self.projects.append(ProjectBO(a_proj))
                else:
                    self.projects.append(ProjectBO(a_proj).enrich())
                done.add(a_proj.projid)
    # Add instruments
    with CodeTimer("%s set instruments:" % len(prj_ids), logger):
        instruments = DescribedInstrumentSet(session, prj_ids)
        for a_project in self.projects:
            instrums = instruments.by_project.get(a_project.projid)
            if instrums is not None:
                a_project.instrument = ",".join(instrums)

def do_digests(self, current_user_id: UserIDT, prj_id: Optional[ProjectIDT], max_digests: int) -> str:
    """
        Pick some images without a checksum and compute it.
    """
    _user = RightsBO.user_has_role(self.ro_session, current_user_id, Role.APP_ADMINISTRATOR)
    qry: Query = self.ro_session.query(Image.file_name)
    if prj_id is not None:
        # Find missing images in a project
        qry = qry.join(ObjectHeader).join(Acquisition).join(Sample).join(Project)
        qry = qry.filter(Project.projid == prj_id)
    else:
        # Find missing images globally
        pass
    qry = qry.outerjoin(ImageFile, Image.file_name == ImageFile.path)
    qry = qry.filter(ImageFile.path.is_(None))
    qry = qry.limit(max_digests)
    cnt = 0
    with CodeTimer("Files without md5, query '%s':" % str(qry), logger):
        files_without_md5 = [file_name for file_name, in qry.all()]
    for an_img_file_name in files_without_md5:
        cnt += 1
        img_file = ImageFile(path=an_img_file_name)
        self.session.add(img_file)
        self._md5_on_record(img_file)
    self.session.commit()
    # We might not have reached max_digests yet, so spend the remaining budget on UNKNOWN-state files
    left_for_unknown = max_digests - cnt
    if left_for_unknown > 0:
        # Also do unknown image file lines
        miss_qry: Query = self.session.query(ImageFile)
        miss_qry = miss_qry.filter(
            and_(ImageFile.state == ImageFileStateEnum.UNKNOWN.value,
                 ImageFile.digest_type == '?'))
        if prj_id is not None:
            # Find unknown images in a project
            miss_qry = miss_qry.outerjoin(Image, Image.file_name == ImageFile.path)
            miss_qry = miss_qry.join(ObjectHeader).join(Acquisition).join(Sample).join(Project)
            miss_qry = miss_qry.filter(Project.projid == prj_id)
        # On purpose, no "order by" clause. Results are random, but sorting takes a while on lots of images
        miss_qry = miss_qry.limit(left_for_unknown)
        with CodeTimer("Files with unknown state, query '%s':" % str(miss_qry), logger):
            missing_ones = [an_img_file for an_img_file in miss_qry.all()]
        for a_missing in missing_ones:
            cnt += 1
            self._md5_on_record(a_missing)
        self.session.commit()
    return "Digest for %d images done." % cnt

def do_intra_step_1(self, loaded_files):
    # The mapping to custom columns, either empty or from previous import operations on the same project.
    custom_mapping = ProjectMapping().load_from_project(self.prj)
    # Source bundle construction
    source_bundle = InBundle(self.source_dir_or_zip,
                             Path(self.temp_for_task.data_dir_for(self.task_id)))
    # Configure the validation to come, directives.
    import_how = ImportHow(self.prj_id, self.req.update_mode, custom_mapping,
                           self.req.skip_existing_objects, loaded_files)
    if self.req.skip_loaded_files:
        import_how.compute_skipped(source_bundle, logger)
    # A structure to collect validation result
    import_diag = ImportDiagnostic()
    if not self.req.skip_existing_objects:
        with CodeTimer("do_intra_step_1: Existing images for %d: " % self.prj_id, logger):
            import_diag.existing_objects_and_image = Image.fetch_existing_images(
                self.session, self.prj_id)
    import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
    # Do the bulk job of validation
    nb_rows = source_bundle.validate_import(import_how, import_diag, self.session,
                                            self.report_progress)
    return import_how, import_diag, nb_rows

def _db_fetch(self, objids: List[int]) -> List[DBObjectTupleT]:
    """
        Do a DB read of given objects, with auxiliary objects.
        Thanks to 'contains_eager' calls, the objects are loaded into SQLAlchemy session.
        :param objids:
        :return:
    """
    ret: Query = self.session.query(Project, Sample, Acquisition, Process,
                                    ObjectHeader, ObjectFields, Image)
    ret = ret.join(Sample, Project.all_samples).options(contains_eager(Project.all_samples))
    ret = ret.join(Acquisition, Sample.all_acquisitions)
    ret = ret.join(Process, Acquisition.process)
    ret = ret.join(ObjectHeader, Acquisition.all_objects)
    # Natural joins
    ret = ret.join(ObjectFields)
    ret = ret.join(Image, ObjectHeader.all_images).options(contains_eager(ObjectHeader.all_images))
    ret = ret.filter(ObjectHeader.objid == any_(objids))
    ret = ret.order_by(ObjectHeader.objid)
    ret = ret.order_by(Image.imgrank)
    if self.first_query:
        logger.info("Query: %s", str(ret))
        self.first_query = False
    with CodeTimer("Get Objects:", logger):
        objs = [an_obj for an_obj in ret.all()]
    # We get as many lines as images
    logger.info("NB ROWS JOIN=%d", len(objs))
    return objs

def _collect_existing_and_validate(self, source_dir_or_zip, loaded_files) \
        -> Tuple[ImportHow, ImportDiagnostic, int]:
    """
        Prepare the import by checking what's inside the project and scanning files to input.
    """
    # The mapping to TSV custom columns, either empty or from previous import operations on same project.
    mapping = ProjectMapping().load_from_project(self.prj)
    # Source bundle construction
    bundle_temp_dir = Path(self.temp_for_jobs.data_dir_for(self.job_id))
    source_bundle = InBundle(source_dir_or_zip, bundle_temp_dir)
    # Configure the validation to come, directives.
    import_how = ImportHow(self.prj_id, self.req.update_mode, mapping,
                           self.req.skip_existing_objects, loaded_files)
    if self.req.skip_loaded_files:
        import_how.compute_skipped(source_bundle, logger)
    # A structure to collect validation result
    import_diag = ImportDiagnostic()
    if not self.req.skip_existing_objects:
        with CodeTimer("collect_existing: Existing images for %d: " % self.prj_id, logger):
            import_diag.existing_objects_and_image = Image.fetch_existing_images(
                self.session, self.prj_id)
    import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
    # Do the bulk job of validation
    nb_rows = source_bundle.validate_import(import_how, import_diag, self.session,
                                            self.report_validation_progress)
    return import_how, import_diag, nb_rows

def read_taxo_stats(session: Session, prj_ids: ProjectIDListT,
                    taxa_ids: Union[str, ClassifIDListT]) -> List[ProjectTaxoStats]:
    sql = """
    SELECT pts.projid, ARRAY_AGG(pts.id) as ids,
           SUM(CASE WHEN pts.id = -1 THEN pts.nbr ELSE 0 END) as nb_u,
           SUM(pts.nbr_v) as nb_v, SUM(pts.nbr_d) as nb_d, SUM(pts.nbr_p) as nb_p
      FROM projects_taxo_stat pts
     WHERE pts.projid = ANY(:ids)"""
    params: Dict[str, Any] = {'ids': prj_ids}
    if len(taxa_ids) > 0 and taxa_ids != 'all':
        sql += " AND pts.id = ANY(:tids)"
        params["tids"] = taxa_ids
    sql += """
    GROUP BY pts.projid"""
    if len(taxa_ids) > 0:
        sql += ", pts.id"
    res: Result = session.execute(text(sql), params)
    with CodeTimer("stats for %d projects:" % len(prj_ids), logger):
        ret = [ProjectTaxoStats(rec) for rec in res.fetchall()]
    for a_stat in ret:
        a_stat.used_taxa.sort()
    return ret

def wait_for_done(self):
    """
        Signal the thread that we have no more files, and wait until the job is done.
    """
    self.logger.info("Approximately %d files in deletion queue", self.files_queue.qsize())
    self.files_queue.put(None)
    with CodeTimer("Wait for files removal: ", self.logger):
        self.join()

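# --- Illustrative sketch (not part of the original code) ---------------------
# The None pushed above acts as an end-of-work sentinel for the remover thread:
# the worker drains files_queue and stops when it reads None, which is what makes
# the join() above return. The method and attribute names below (run, _remove_files)
# are assumptions sketching that consumer side.
def run(self):
    while True:
        an_entry = self.files_queue.get()
        if an_entry is None:
            # Sentinel pushed by wait_for_done(): no more files will come
            break
        self._remove_files(an_entry)
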
def do_run(self, current_user_id: int) -> ImportRealRsp:
    """
        Do the real job using injected parameters.
        :return:
    """
    # Security check
    RightsBO.user_wants(self.session, current_user_id, Action.ADMINISTRATE, self.prj_id)
    # OK
    loaded_files = none_to_empty(self.prj.fileloaded).splitlines()
    logger.info("Previously loaded files: %s", loaded_files)
    # Save mappings straight away
    self.save_mapping(self.custom_mapping)
    source_bundle = InBundle(self.req.source_path,
                             Path(self.temp_for_task.data_dir_for(self.task_id)))
    # Configure the import to come, destination
    db_writer = DBWriter(self.session)
    import_where = ImportWhere(db_writer, self.vault,
                               self.temp_for_task.base_dir_for(self.task_id))
    # Configure the import to come, directives
    import_how = ImportHow(self.prj_id, self.req.update_mode, self.custom_mapping,
                           self.req.skip_existing_objects, loaded_files)
    import_how.taxo_mapping = self.req.taxo_mappings
    import_how.taxo_found = self.req.found_taxa
    import_how.found_users = self.req.found_users
    if self.req.skip_loaded_files:
        import_how.compute_skipped(source_bundle, logger)
    if not self.req.skip_existing_objects:
        with CodeTimer("run: Existing images for %d: " % self.prj_id, logger):
            import_how.objects_and_images_to_skip = Image.fetch_existing_images(
                self.session, self.prj_id)
    import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))
    # Do the bulk job of import
    row_count = source_bundle.do_import(import_where, import_how, self.req.rowcount,
                                        self.report_progress)
    # Update loaded files in DB, removing duplicates
    self.prj.fileloaded = "\n".join(set(import_how.loaded_files))
    self.session.commit()
    # Recompute stats
    ProjectBO.do_after_load(self.session, self.prj_id)
    self.session.commit()
    logger.info("Total of %d rows loaded" % row_count)
    # Prepare response
    ret = ImportRealRsp()
    return ret

def _safe_update(self, upd_sql: str, params: Dict):
    file_name = ObjectCache.file_name(self.projid)
    conn = SQLite3.get_conn(file_name, "rw")  # No create!
    try:
        with CodeTimer("SQLite update using '%s':" % upd_sql, logger):
            conn.execute(upd_sql, params)
        conn.commit()
    finally:
        conn.close()

def get_projects_ids(self) -> ProjectIDListT:
    """
        Return the project IDs for the held sample IDs.
    """
    qry: Query = self.session.query(Project.projid).distinct(Project.projid)
    qry = qry.join(Sample, Project.all_samples)
    qry = qry.filter(Sample.sampleid == any_(self.ids))
    with CodeTimer("Prjs for %d samples: " % len(self.ids), logger):
        return [an_id[0] for an_id in qry.all()]

def do_real(self) -> None:
    """
        Do the real job, i.e. write everywhere (DB/filesystem)
    """
    loaded_files = none_to_empty(self.prj.fileloaded).splitlines()
    logger.info("Previously loaded files: %s", loaded_files)
    found_users, taxo_found, col_mapping_dict, \
        nb_rows, source_path = self._load_vars_from_state(self.STATE_KEYS)
    # Save mappings straight away
    col_mapping = ProjectMapping().load_from_dict(col_mapping_dict)
    col_mapping.write_to_project(self.prj)
    self.session.commit()
    # TODO: Duplicated code
    source_bundle = InBundle(source_path,
                             Path(self.temp_for_jobs.data_dir_for(self.job_id)))
    # Configure the import to come, destination
    db_writer = DBWriter(self.session)
    import_where = ImportWhere(db_writer, self.vault,
                               self.temp_for_jobs.base_dir_for(self.job_id))
    # Configure the import to come, directives
    import_how = ImportHow(self.prj_id, self.req.update_mode, col_mapping,
                           self.req.skip_existing_objects, loaded_files)
    import_how.taxo_mapping = self.req.taxo_mappings
    import_how.found_taxa = taxo_found
    import_how.found_users = found_users
    if self.req.skip_loaded_files:
        import_how.compute_skipped(source_bundle, logger)
    if self.req.skip_existing_objects:
        # If we must skip existing objects then take an inventory of what's already in the project
        with CodeTimer("run: Existing images for %d: " % self.prj_id, logger):
            import_how.objects_and_images_to_skip = Image.fetch_existing_images(
                self.session, self.prj_id)
    import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))
    # Do the bulk job of import
    rowcount_from_validate = nb_rows
    row_count = source_bundle.do_import(import_where, import_how, rowcount_from_validate,
                                        self.report_progress)
    # Update loaded files in DB, removing duplicates
    self.prj.fileloaded = "\n".join(set(import_how.loaded_files))
    self.session.commit()
    # Recompute stats
    ProjectBO.do_after_load(self.session, self.prj_id)
    self.session.commit()
    msg = "Total of %d rows loaded" % row_count
    logger.info(msg)
    self.set_job_result(errors=[], infos={"rowcount": row_count})

def get_projects_ids(self) -> ProjectIDListT:
    """
        Return the project IDs for the owned object IDs.
    """
    qry: Query = self.session.query(Project.projid).distinct(Project.projid)
    qry = qry.join(Sample)
    qry = qry.join(Acquisition)
    qry = qry.join(ObjectHeader)
    qry = qry.filter(ObjectHeader.objid == any_(self.object_ids))
    with CodeTimer("Prjs for %d objs: " % len(self.object_ids), logger):
        return [an_id for an_id, in qry.all()]

def _get_last_classif_history(self, from_user_id: Optional[int],
                              but_not_from_user_id: Optional[int]) \
        -> List[HistoricalLastClassif]:
    """
        Query the last classification history on all objects of self, mixed with present state,
        in order to have restore-able lines.
    """
    # Get the histo entries
    subqry: Query = self.session.query(
        ObjectsClassifHisto,
        func.rank().over(partition_by=ObjectsClassifHisto.objid,
                         order_by=ObjectsClassifHisto.classif_date.desc()).label("rnk"))
    if from_user_id:
        subqry = subqry.filter(ObjectsClassifHisto.classif_who == from_user_id)
    if but_not_from_user_id:
        subqry = subqry.filter(ObjectsClassifHisto.classif_who != but_not_from_user_id)
    subqry = subqry.filter(ObjectsClassifHisto.classif_type == "M")
    subq_alias: Alias = subqry.filter(
        ObjectsClassifHisto.objid == any_(self.object_ids)).subquery()
    # Also get some fields from ObjectHeader for referencing, info, and fallback
    qry = self.session.query(
        ObjectHeader.objid, ObjectHeader.classif_id,
        func.coalesce(subq_alias.c.classif_date, ObjectHeader.classif_auto_when),
        subq_alias.c.classif_type,
        func.coalesce(subq_alias.c.classif_id, ObjectHeader.classif_auto_id).label("h_classif_id"),
        func.coalesce(subq_alias.c.classif_qual,
                      case([(ObjectHeader.classif_auto_id.isnot(None), 'P')])),
        subq_alias.c.classif_who)
    qry = qry.join(subq_alias, ObjectHeader.objid == subq_alias.c.objid,
                   isouter=(from_user_id is None))
    if from_user_id is not None:
        # When taking history from a given user, don't apply to objects whose last
        # classification was already done by that user.
        qry = qry.filter(ObjectHeader.classif_who != from_user_id)
        qry = qry.filter(subq_alias.c.rnk == 1)
    else:
        # Taking any history, including nothing, so emit blank history (see isouter above)
        qry = qry.filter(ObjectHeader.objid == any_(self.object_ids))
        qry = qry.filter(or_(subq_alias.c.rnk == 1, subq_alias.c.rnk.is_(None)))
    logger.info("_get_last_classif_history qry:%s", str(qry))
    with CodeTimer("HISTORY for %d objs: " % len(self.object_ids), logger):
        ret = [HistoricalLastClassif(rec) for rec in qry.all()]
    logger.info("_get_last_classif_history qry: %d rows", len(ret))
    return ret

def _count(self) -> Optional[int]:
    # noinspection SqlResolve
    where_sql = self.cache_where.get_sql()
    select_sql = re.sub("objf?id", "COUNT(1)", self._from(), 1)
    read_sql = select_sql + where_sql
    try:
        with CodeTimer("SQLite count using '%s':" % read_sql, logger):
            assert self.conn
            res: Cursor = self.conn.execute(read_sql, self.where_params)
            cnt, = res.fetchone()
            res.close()
            return cnt
    except (OperationalError, ProgrammingError) as e:
        logger.error("In %s : %s", read_sql, str(e))
    except Exception as ae:
        logger.error(ae.__class__)
    return None

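# --- Illustrative sketch (not part of the original code) ---------------------
# What the re.sub above does: rewrite the single selected column of the cached
# SELECT into COUNT(1), so the same cached WHERE clause and parameters can be
# reused for counting. The table/column names below are assumptions.
import re
assert re.sub("objf?id", "COUNT(1)", "SELECT objid FROM obj", 1) == "SELECT COUNT(1) FROM obj"
assert re.sub("objf?id", "COUNT(1)", "SELECT objfid FROM obj", 1) == "SELECT COUNT(1) FROM obj"
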
def read_taxo_stats(self) -> List[SampleTaxoStats]:
    sql = text("""
        SELECT sam.sampleid,
               ARRAY_AGG(DISTINCT COALESCE(obh.classif_id, -1)) as ids,
               SUM(CASE WHEN obh.classif_id <> -1 THEN 0 ELSE 1 END) as nb_u,
               COUNT(CASE WHEN obh.classif_qual = 'V' THEN 1 END) nbr_v,
               COUNT(CASE WHEN obh.classif_qual = 'D' THEN 1 END) nbr_d,
               COUNT(CASE WHEN obh.classif_qual = 'P' THEN 1 END) nbr_p
          FROM obj_head obh
          JOIN acquisitions acq ON acq.acquisid = obh.acquisid
          JOIN samples sam ON sam.sampleid = acq.acq_sample_id
         WHERE sam.sampleid = ANY(:ids)
         GROUP BY sam.sampleid;""")
    with CodeTimer("Stats for %d samples: " % len(self.ids), logger):
        res = self.session.execute(sql, {'ids': self.ids})
        ret = [SampleTaxoStats(rec) for rec in res]
    return ret

def _fetch(self):
    # noinspection SqlResolve
    where_sql = self.cache_where.get_sql()
    read_sql = self._from() + " %s %s LIMIT %d OFFSET %d" % (
        where_sql, self.pg_order.get_sql(), self.pg_window_size, self.pg_window_start)
    try:
        with CodeTimer("SQLite read using '%s':" % read_sql, logger):
            assert self.conn
            res: Cursor = self.conn.execute(read_sql, self.where_params)
            # TODO: try fetchmany
            objid_list = [objid for objid, in res]
            res.close()
            return objid_list
    except (OperationalError, ProgrammingError) as e:
        logger.error("In %s : %s", read_sql, str(e))
    except Exception as ae:
        logger.error(ae.__class__)
    return None

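# --- Illustrative sketch (not part of the original code) ---------------------
# _fetch reads one window of object IDs from the SQLite cache, via the LIMIT/OFFSET
# built from pg_window_size and pg_window_start. A caller paginating through the
# whole cache could advance the window as below; direct mutation of the window
# attributes is an assumption for illustration only.
def _fetch_all_pages_example(self) -> List[int]:
    all_ids: List[int] = []
    self.pg_window_start = 0
    while True:
        a_page = self._fetch()
        if not a_page:  # None (error) or empty page: stop
            break
        all_ids.extend(a_page)
        self.pg_window_start += self.pg_window_size
    return all_ids
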
def validate_import(self, how: ImportHow, diag: ImportDiagnostic, session: Session,
                    report_def: Callable) -> int:
    """
        Validate the full bundle, i.e. every contained file.
        :return:
    """
    with CodeTimer("validate_import: Existing images for %d: " % how.prj_id, logger):
        how.objects_and_images_to_skip = Image.fetch_existing_images(session, how.prj_id)
    total_row_count = self.validate_each_file(how, diag, report_def)
    if total_row_count == 0:
        # Try to be explicit in messages
        nb_found = len(self.possible_files)
        nb_skipped = len(diag.skipped_files)
        err_msg = ["No object to import."]
        if nb_found == 0:
            err_msg.append("* No .txt or .tsv file was found, of which name starts with 'ecotaxa'.")
        else:
            nb_validated = nb_found - nb_skipped
            if nb_skipped > 0:
                if nb_validated == 0:
                    err_msg.append("* 'SKIP TSV' option was set and all TSV files were imported before.")
                else:
                    err_msg.append("* 'SKIP TSV' option was set and new TSV file(s) are not compliant.")
            if nb_validated > 0:
                err_msg.append("* TSV file(s) might be empty.")
            if how.skip_object_duplicates:
                err_msg.append("* 'SKIP OBJECTS' option was set and all objects might be in already.")
        diag.error("<br>".join(err_msg))
    if len(diag.classif_id_seen) > 0:
        self.check_classif(session, diag, diag.classif_id_seen)
    logger.info("Taxo Found = %s", how.taxo_found)
    logger.info("Users Found = %s", how.found_users)
    not_seen_fields = how.custom_mapping.all_fields.keys() - diag.cols_seen
    logger.info("For Information, not seen fields %s", not_seen_fields)
    if len(not_seen_fields) > 0:
        diag.warn("Some fields configured in the project are not seen in this import {0} "
                  .format(", ".join(not_seen_fields)))
    if diag.nb_objects_without_gps > 0:
        diag.warn("{0} object(s) don't have GPS information."
                  .format(diag.nb_objects_without_gps))
    return total_row_count

def summary(self, current_user_id: Optional[UserIDT], proj_id: ProjectIDT, filters: ProjectFilters,
            only_total: bool) -> Tuple[int, Optional[int], Optional[int], Optional[int]]:
    """
        Query the given project with given filters, return classification summary,
        or just grand total if only_total is set.
    """
    # Security check
    if current_user_id is None:
        RightsBO.anonymous_wants(self.session, Action.READ, proj_id)
        # Anonymous can only see validated objects
        # TODO: Dup code
        # noinspection PyTypeHints
        filters.statusfilter = "V"  # type:ignore
        user_id = -1
    else:
        user, _project = RightsBO.user_wants(self.session, current_user_id, Action.READ, proj_id)
        user_id = user.id
    # Prepare a where clause and parameters from filter
    object_set: DescribedObjectSet = DescribedObjectSet(self.session, proj_id, filters)
    from_, where, params = object_set.get_sql(user_id)
    sql = """
    SET LOCAL enable_seqscan=FALSE;
    SELECT COUNT(*) nbr"""
    if only_total:
        sql += """, NULL nbr_v, NULL nbr_d, NULL nbr_p"""
    else:
        sql += """,
           COUNT(CASE WHEN obh.classif_qual = 'V' THEN 1 END) nbr_v,
           COUNT(CASE WHEN obh.classif_qual = 'D' THEN 1 END) nbr_d,
           COUNT(CASE WHEN obh.classif_qual = 'P' THEN 1 END) nbr_p"""
    sql += """
      FROM """ + from_.get_sql() + " " + where.get_sql()
    with CodeTimer("summary: V/D/P for %d using %s " % (proj_id, sql), logger):
        res: ResultProxy = self.session.execute(sql, params)
        nbr: int
        nbr_v: Optional[int]
        nbr_d: Optional[int]
        nbr_p: Optional[int]
        nbr, nbr_v, nbr_d, nbr_p = res.first()  # type:ignore
    return nbr, nbr_v, nbr_d, nbr_p

def _find_what_to_dump(self) -> None:
    """
        Determine the objects to dump.
    """
    # Prepare a where clause and parameters from filter
    object_set: DescribedObjectSet = DescribedObjectSet(self.session, self.prj.projid, self.filters)
    from_, where, params = object_set.get_sql(self.requester_id)
    sql = """ SELECT objid FROM """ + from_.get_sql() + where.get_sql()
    logger.info("SQL=%s", sql)
    logger.info("SQLParam=%s", params)
    with CodeTimer("Get IDs:", logger):
        res: ResultProxy = self.session.execute(sql, params)
        ids = [r['objid'] for r in res]
    logger.info("NB OBJIDS=%d", len(ids))
    self.ids_to_dump = ids

def do_cleanup_dup_same_obj(self, current_user_id: UserIDT, prj_id: ProjectIDT,
                            max_deletes: int) -> str:
    """
        Simplest duplication pattern: inside the same object there are several identical images.
    """
    _user = RightsBO.user_has_role(self.ro_session, current_user_id, Role.APP_ADMINISTRATOR)
    orig_img = aliased(Image, name="orig")
    orig_file = aliased(ImageFile, name="orig_file")
    qry: Query = self.session.query(orig_img.file_name, orig_img.imgid, Image, ImageFile)
    # Select what to delete
    qry = qry.join(ObjectHeader, ObjectHeader.objid == Image.objid).join(
        Acquisition).join(Sample).join(Project)
    # We consider that the original image is the oldest one, so the others have a higher ID
    qry = qry.join(orig_img,
                   and_(orig_img.objid == Image.objid,
                        orig_img.orig_file_name == Image.orig_file_name,
                        orig_img.width == Image.width,
                        orig_img.height == Image.height,
                        orig_img.imgid < Image.imgid))
    # Must have a checksum, with the same state (sane)
    qry = qry.join(ImageFile,
                   and_(ImageFile.path == Image.file_name,
                        ImageFile.state == ImageFileStateEnum.OK.value))
    qry = qry.join(orig_file,
                   and_(orig_file.path == orig_img.file_name,
                        orig_file.state == ImageFileStateEnum.OK.value))
    # and the same value of course
    qry = qry.filter(and_(ImageFile.digest_type == orig_file.digest_type,
                          ImageFile.digest == orig_file.digest))
    qry = qry.filter(Project.projid == prj_id)
    qry = qry.order_by(Image.objid, orig_img.imgid, Image.imgid)
    qry = qry.limit(max_deletes)
    with CodeTimer("Dups same objs inside %d, query '%s':" % (prj_id, str(qry)), logger):
        to_do = [(orig_file_name, orig_img_id, an_image, an_image_file)
                 for orig_file_name, orig_img_id, an_image, an_image_file in qry.all()]
    ko_not_same = 0
    ko_except = 0
    # Prepare & start a remover thread that will run in // with DB queries
    remover = VaultRemover(self.link_src, logger).do_start()
    filecmp.clear_cache()
    deleted_imgids: Set[int] = set()
    for orig_file_name, orig_img_id, an_image, an_image_file in to_do:
        # The query returns multiple rows if there are more than 2 duplicates
        if orig_img_id in deleted_imgids:
            continue
        # Even if MD5s match, be paranoid and compare files
        orig_path = self.vault.sub_path(orig_file_name)
        dup_path = self.vault.sub_path(an_image.file_name)
        assert orig_path != dup_path
        orig_exists = exists(orig_path)
        dup_exists = exists(dup_path)
        if orig_exists:
            if dup_exists:
                try:
                    same = filecmp.cmp(orig_path, dup_path, False)
                except Exception as exc:
                    logger.info("Exception while comparing orig:%s and dup:%s: %s",
                                orig_path, dup_path, str(exc))
                    ko_except += 1
                    continue
                if not same:
                    ko_not_same += 1
                    continue
            else:
                # Duplicate is gone already
                pass
        else:
            # DB record of physical file is wrong
            # TODO
            continue
        # Do the cleanup
        deleted_imgids.add(an_image.imgid)
        if dup_exists:
            remover.add_files([an_image.file_name])
        self.session.delete(an_image)
        self.session.delete(an_image_file)
    # Wait for the files handled
    self.session.commit()
    remover.wait_for_done()
    return ("Dupl remover for %s dup images done but %d problems %d false file comp"
            % (len(deleted_imgids), ko_except, ko_not_same))

def projects_for_user(session: Session, user: User,
                      for_managing: bool = False,
                      not_granted: bool = False,
                      title_filter: str = '',
                      instrument_filter: str = '',
                      filter_subset: bool = False) -> List[ProjectIDT]:
    """
    :param session:
    :param user: The user for which the list is needed.
    :param for_managing: If set, list the projects that the user can manage.
    :param not_granted: If set, list (only) the projects on which the given user has no right,
                        so the user can request access to them.
    :param title_filter: If set, filter out the projects with title not matching the required string,
                         or if set to a number, filter out the projects of which ID does not match.
    :param instrument_filter: If set, filter out the projects which do not have the given instrument
                              in at least one sample.
    :param filter_subset: If set, filter out any project of which title contains 'subset'.
    :return: The project IDs
    """
    sql_params: Dict[str, Any] = {"user_id": user.id}
    # Default query: all projects, possibly with first manager information
    # noinspection SqlResolve
    sql = """SELECT p.projid
               FROM projects p
               LEFT JOIN ( """ + ProjectPrivilegeBO.first_manager_by_project() + """ ) fpm
                 ON fpm.projid = p.projid """
    if not_granted:
        # Add the projects for which no entry is found in ProjectPrivilege
        sql += """
               LEFT JOIN projectspriv pp ON p.projid = pp.projid AND pp.member = :user_id
              WHERE pp.member is null """
        if for_managing:
            sql += " AND False "
    else:
        if not user.has_role(Role.APP_ADMINISTRATOR):
            # Not an admin, so restrict to projects which current user can work on, or view
            sql += """
               JOIN projectspriv pp
                 ON p.projid = pp.projid
                AND pp.member = :user_id """
            if for_managing:
                sql += """
                AND pp.privilege = '%s' """ % ProjectPrivilegeBO.MANAGE
        sql += " WHERE 1 = 1 "
    if title_filter != '':
        sql += """
        AND ( title ILIKE '%%'|| :title ||'%%'
              OR TO_CHAR(p.projid,'999999') LIKE '%%'|| :title ) """
        sql_params["title"] = title_filter
    if instrument_filter != '':
        sql += """
         AND p.projid IN (SELECT DISTINCT sam.projid
                            FROM samples sam, acquisitions acq
                           WHERE acq.acq_sample_id = sam.sampleid
                             AND acq.instrument ILIKE '%%'|| :instrum ||'%%' ) """
        sql_params["instrum"] = instrument_filter
    if filter_subset:
        sql += """
         AND NOT title ILIKE '%%subset%%' """
    with CodeTimer("Projects query:", logger):
        res: Result = session.execute(text(sql), sql_params)
        # single-element tuple :( DBAPI
        ret = [an_id for an_id, in res.fetchall()]
    return ret  # type:ignore

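# --- Illustrative usage sketch (not part of the original code) ---------------
# e.g. the projects a user can manage, restricted by a title fragment. The hosting
# class (ProjectBO here) and the variable names are assumptions; the function has
# no self, so it is presumably exposed as a @staticmethod somewhere.
def _manageable_projects_example(a_session: Session, a_user: User) -> List[ProjectIDT]:
    return ProjectBO.projects_for_user(a_session, a_user,
                                       for_managing=True,
                                       title_filter='Tara')
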
def read_user_stats(session: Session, prj_ids: ProjectIDListT) -> List[ProjectUserStats]:
    """
        Read the users (annotators) involved in each project.
        Also compute a summary of their activity. This can only be an estimate since,
        e.g. imported data contains exact same data as the one obtained from live actions.
    """
    # Activity count: Count 1 for present classification for a user per object.
    # Of course, the classification date is the latest for the user.
    pqry: Query = session.query(Project.projid, User.id, User.name,
                                func.count(ObjectHeader.objid),
                                func.max(ObjectHeader.classif_when))
    pqry = pqry.join(Sample).join(Acquisition).join(ObjectHeader)
    pqry = pqry.join(User, User.id == ObjectHeader.classif_who)
    pqry = pqry.filter(Project.projid == any_(prj_ids))
    pqry = pqry.filter(ObjectHeader.classif_who == User.id)
    pqry = pqry.group_by(Project.projid, User.id)
    pqry = pqry.order_by(Project.projid, User.name)
    ret = []
    user_activities: Dict[UserIDT, UserActivity] = {}
    user_activities_per_project = {}
    stats_per_project = {}
    with CodeTimer("user present stats for %d projects, qry: %s:" % (len(prj_ids), str(pqry)), logger):
        last_prj = None
        for projid, user_id, user_name, cnt, last_date in pqry.all():
            last_date_str = last_date.replace(microsecond=0).isoformat()
            if projid != last_prj:
                last_prj = projid
                prj_stat = ProjectUserStats((projid, [], []))
                ret.append(prj_stat)
                user_activities = {}
                # Store for second pass with history
                stats_per_project[projid] = prj_stat
                user_activities_per_project[projid] = user_activities
            prj_stat.annotators.append(MinimalUserBO((user_id, user_name)))
            user_activity = UserActivity((user_id, cnt, last_date_str))
            prj_stat.activities.append(user_activity)
            # Store for second pass
            user_activities[user_id] = user_activity
    # Activity count update: Add 1 for each entry in history for each user.
    # The dates in history are ignored, except for users which do not appear in first resultset.
    hqry: Query = session.query(Project.projid, User.id, User.name,
                                func.count(ObjectsClassifHisto.objid),
                                func.max(ObjectsClassifHisto.classif_date))
    hqry = hqry.join(Sample).join(Acquisition).join(ObjectHeader).join(ObjectsClassifHisto)
    hqry = hqry.join(User, User.id == ObjectsClassifHisto.classif_who)
    hqry = hqry.filter(Project.projid == any_(prj_ids))
    hqry = hqry.group_by(Project.projid, User.id)
    hqry = hqry.order_by(Project.projid, User.name)
    with CodeTimer("user history stats for %d projects, qry: %s:" % (len(prj_ids), str(hqry)), logger):
        last_prj = None
        for projid, user_id, user_name, cnt, last_date in hqry.all():
            last_date_str = last_date.replace(microsecond=0).isoformat()
            if projid != last_prj:
                last_prj = projid
                # Just in case
                if projid not in user_activities_per_project:
                    continue
                # Get stored data for the project
                user_activities = user_activities_per_project[projid]
                prj_stat = stats_per_project[projid]
            already_there = user_activities.get(user_id)
            if already_there is not None:
                # A user in both history and present classification
                already_there.nb_actions += cnt
            else:
                # A user _only_ in history
                prj_stat.annotators.append(MinimalUserBO((user_id, user_name)))
                user_activity = UserActivity((user_id, cnt, last_date_str))
                prj_stat.activities.append(user_activity)
                user_activities[user_id] = user_activity
    return ret

def aggregate_for_sample(self, sample: Sample) -> Dict[ClassifIDT, AggregForTaxon]:
    """
        Do the aggregations for the sample, for each taxon, and return them; they will become EMOFs:
        - 'Abundance' -> CountOfBiologicalEntity
            -> count of objects, group by taxon
        - 'Concentration' -> AbundancePerUnitVolumeOfTheWaterBody
            -> sum(individual_concentration), group by taxon,
               with individual_concentration = 1 / subsample_coef / total_water_volume
        - 'Biovolume' -> BiovolumeOfBiologicalEntity
            -> sum(individual_biovolume), group by taxon,
               with individual_biovolume = individual_volume / subsample_coef / total_water_volume
        The abundance can always be computed. The 2 other ones depend on availability of values
        for the project and the configuration variable.
    """
    # We return all per taxon.
    ret: Dict[ClassifIDT, EMODnetExport.AggregForTaxon] = {}
    count_per_taxon_per_acquis: Dict[AcquisitionIDT, Dict[ClassifIDT, int]] = {}
    # Start with abundances, simple count and giving its keys to the returned dict.
    acquis_for_sample = SampleBO.get_acquisitions(self.session, sample)
    for an_acquis in acquis_for_sample:
        # Get counts for acquisition (subsample)
        count_per_taxon_for_acquis: Dict[ClassifIDT, int] = AcquisitionBO.get_sums_by_taxon(
            self.session, an_acquis.acquisid)
        if self.auto_morpho:
            self.add_morpho_counts(count_per_taxon_for_acquis)
        count_per_taxon_per_acquis[an_acquis.acquisid] = count_per_taxon_for_acquis
        for an_id, count_4_acquis in count_per_taxon_for_acquis.items():
            aggreg_for_taxon = ret.get(an_id)
            if aggreg_for_taxon is None:
                ret[an_id] = self.AggregForTaxon(count_4_acquis, None, None)
            else:
                aggreg_for_taxon.abundance += count_4_acquis
    if not self.with_computations:
        return ret
    # Enrich with concentrations
    subsampling_coeff_per_acquis: Dict[AcquisitionIDT, float] = {}
    try:
        # Fetch calculation data at sample level
        sample_volume = SampleBO.get_computed_var(sample, DefaultVars.volume_sampled)
    except TypeError as e:
        self.warnings.append("Could not compute volume sampled from sample %s (%s),"
                             " no concentration or biovolume will be computed."
                             % (sample.orig_id, str(e)))
        sample_volume = -1
    if sample_volume > 0:
        # Cumulate for subsamples AKA acquisitions
        for an_acquis in acquis_for_sample:
            try:
                subsampling_coefficient = AcquisitionBO.get_computed_var(
                    an_acquis, DefaultVars.subsample_coeff)
                subsampling_coeff_per_acquis[an_acquis.acquisid] = subsampling_coefficient
            except TypeError as e:
                self.warnings.append("Could not compute subsampling coefficient from acquisition %s (%s),"
                                     " no concentration or biovolume will be computed"
                                     % (an_acquis.orig_id, str(e)))
                logger.info("concentrations: no subsample coeff for '%s' (%s)",
                            an_acquis.orig_id, str(e))
                continue
            # Get counts for acquisition (sub-sample)
            logger.info("computing concentrations for '%s'", an_acquis.orig_id)
            count_per_taxon_for_acquis = count_per_taxon_per_acquis[an_acquis.acquisid]
            for an_id, count_4_acquis in count_per_taxon_for_acquis.items():
                aggreg_for_taxon = ret[an_id]
                concentration_for_taxon = count_4_acquis / subsampling_coefficient / sample_volume
                if aggreg_for_taxon.concentration is None:
                    aggreg_for_taxon.concentration = 0
                aggreg_for_taxon.concentration += concentration_for_taxon
    # Enrich with biovolumes. This needs a computation for each object, so it's likely to be slow.
    if sample_volume > 0:
        # Mappings are constant for the sample
        # noinspection PyTypeChecker
        mapping = ProjectMapping().load_from_project(sample.project)
        # Cumulate for subsamples AKA acquisitions
        for an_acquis in acquis_for_sample:
            subsampling_coefficient = subsampling_coeff_per_acquis.get(an_acquis.acquisid)
            if subsampling_coefficient is None:
                logger.info("biovolumes: no subsample coeff for '%s'", an_acquis.orig_id)
                continue
            # Get pixel size from the associated process, it's a constant for individual biovolume computations
            try:
                pixel_size, = ProcessBO.get_free_fields(an_acquis.process,
                                                        ["particle_pixel_size_mm"],
                                                        [float],
                                                        [None])
            except TypeError as _e:
                logger.info("biovolumes: no pixel size for '%s'", an_acquis.orig_id)
                continue
            constants = {"pixel_size": pixel_size}
            # Get all objects for the acquisition. The filter on classif_id is useless for now.
            with CodeTimer("Objects IDs for '%s': " % an_acquis.orig_id, logger):
                acq_object_ids = AcquisitionBO.get_all_object_ids(session=self.session,
                                                                  acquis_id=an_acquis.acquisid,
                                                                  classif_ids=list(ret.keys()))
            with CodeTimer("Objects for '%s': " % an_acquis.orig_id, logger):
                objects = ObjectBOSet(self.ro_session, acq_object_ids, mapping.object_mappings)
            nb_biovols = 0
            for an_obj in objects.all:
                # Compute a biovolume if possible, ellipsoidal first, then spherical as a fallback
                try:
                    biovol = ObjectBO.get_computed_var(an_obj,
                                                       DefaultVars.equivalent_ellipsoidal_volume,
                                                       mapping, constants)
                except TypeError as _e:
                    biovol = -1
                if biovol == -1:
                    try:
                        biovol = ObjectBO.get_computed_var(an_obj,
                                                           DefaultVars.equivalent_spherical_volume,
                                                           mapping, constants)
                    except TypeError as _e:
                        continue
                # Aggregate by category/taxon
                aggreg_for_taxon = ret[an_obj.classif_id]
                individual_biovolume = biovol / subsampling_coefficient / sample_volume
                if aggreg_for_taxon.biovolume is None:
                    aggreg_for_taxon.biovolume = 0
                aggreg_for_taxon.biovolume += individual_biovolume
                # Update stats
                nb_biovols += 1
            # A bit of display
            logger.info("%d biovolumes computed for '%s' out of %d objects",
                        nb_biovols, an_acquis.orig_id, len(acq_object_ids))
    return ret

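# --- Illustrative numeric example (not part of the original code) ------------
# The per-individual formulas from the docstring of aggregate_for_sample, with
# made-up values: subsample_coef = 0.25 and total_water_volume = 2.0 mean one
# counted object adds 1 / 0.25 / 2.0 = 2.0 to the concentration, and an object
# of individual volume 0.003 adds 0.003 / 0.25 / 2.0 = 0.006 to the biovolume.
import math

def _individual_concentration_example(subsample_coef: float, total_water_volume: float) -> float:
    return 1.0 / subsample_coef / total_water_volume

def _individual_biovolume_example(individual_volume: float, subsample_coef: float,
                                  total_water_volume: float) -> float:
    return individual_volume / subsample_coef / total_water_volume

assert math.isclose(_individual_concentration_example(0.25, 2.0), 2.0)
assert math.isclose(_individual_biovolume_example(0.003, 0.25, 2.0), 0.006)
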
def fetch_existing_objects(session, prj_id):
    """
        Get existing object IDs (orig_id AKA object_id in TSV) from the project
    """
    with CodeTimer("Existing objects for %d: " % prj_id, logger):
        return ObjectHeader.fetch_existing_objects(session, prj_id)

def query(self, current_user_id: Optional[UserIDT], proj_id: ProjectIDT, filters: ProjectFilters,
          order_field: Optional[str] = None,
          window_start: Optional[int] = None,
          window_size: Optional[int] = None) \
        -> Tuple[ObjectIDWithParentsListT, int]:
    """
        Query the given project with given filters, return all IDs.
        If order_field is provided, the result is sorted by this field.
        Ambiguity is solved in a stable (over calls) way.
        window_start and window_size allow to select a window of data in the result.
    """
    # Security check
    if current_user_id is None:
        RightsBO.anonymous_wants(self.session, Action.READ, proj_id)
        # Anonymous can only see validated objects
        # noinspection PyTypeHints
        filters.statusfilter = "V"  # type:ignore
        user_id = -1
    else:
        user, _project = RightsBO.user_wants(self.session, current_user_id, Action.READ, proj_id)
        user_id = user.id
    # The order field has an impact on the query
    order_clause = self.cook_order_clause(order_field)
    # Prepare a where clause and parameters from filter
    object_set: DescribedObjectSet = DescribedObjectSet(self.session, proj_id, filters)
    from_, where, params = object_set.get_sql(user_id, order_clause)
    if "obf." in where.get_sql():
        # If the filter needs obj_field data it's more efficient to count with a window function
        # than issuing a second query.
        extra_col = ", COUNT(objid) OVER() AS total"
    else:
        # Otherwise, no need for obj_field in count, less DB buffers
        extra_col = ", 0 AS total"
    # The following hint is needed until we sort out why, from time to time, there is a FTS on obj_head
    sql = """
    SET LOCAL enable_seqscan=FALSE;
    SELECT obh.objid, acq.acquisid, sam.sampleid %s
      FROM """ % extra_col + from_.get_sql() + " " + where.get_sql()
    # Add order & window if relevant
    if order_clause is not None:
        sql += order_clause.get_sql()
    if window_start is not None:
        sql += " OFFSET %d" % window_start
    if window_size is not None:
        sql += " LIMIT %d" % window_size
    with CodeTimer("query: for %d using %s " % (proj_id, sql), logger):
        res: ResultProxy = self.session.execute(sql, params)
    ids = []
    total = 0
    objid: int
    acquisid: int
    sampleid: int
    for objid, acquisid, sampleid, total in res:  # type:ignore
        ids.append((objid, acquisid, sampleid, proj_id))
    if total == 0:
        # Total was not computed or left to 0
        total, _nbr_v, _nbr_d, _nbr_p = self.summary(current_user_id, proj_id, filters, True)
    return ids, total

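# --- Illustrative usage sketch (not part of the original code) ---------------
# Fetching one page of a sorted object listing via the window parameters above.
# The service holder (obj_service), the order field name and the page size are
# assumptions for illustration only.
def _query_one_page_example(obj_service, a_user_id: UserIDT, a_project_id: ProjectIDT,
                            a_filter: ProjectFilters, page: int,
                            page_size: int = 100) -> Tuple[ObjectIDWithParentsListT, int]:
    return obj_service.query(current_user_id=a_user_id,
                             proj_id=a_project_id,
                             filters=a_filter,
                             order_field='objid',
                             window_start=page * page_size,
                             window_size=page_size)
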