Example No. 1
    def _do_clone(self):
        """
            Cloning operation itself. Assumes that @see self.to_clone was populated before.
        """
        # Get the mappings in the source project, in order to determine the useful columns
        custom_mapping = ProjectMapping().load_from_project(self.prj)
        obj_mapping = custom_mapping.object_mappings
        used_columns = set(obj_mapping.real_cols_to_tsv.keys())
        used_columns.add("orig_id")  # For safety
        # Create a DB writer
        writer = DBWriter(self.session)
        # Narrow the writes to ObjectFields using the mappings of the original project
        writer.generators({"obj_field": used_columns})
        # Use import helpers
        dest_prj_id = self.dest_prj.projid
        import_how = ImportHow(prj_id=dest_prj_id,
                               update_mode="No",
                               custom_mapping=ProjectMapping(),
                               skip_object_duplicates=False,
                               loaded_files=[])
        # Get parent (enclosing) Sample, Acquisition, Process. There should be 0 in this context...
        import_how.existing_parents = InBundle.fetch_existing_parents(
            self.session, prj_id=dest_prj_id)

        self._clone_all(import_how, writer)
        # Copy mappings to destination. We could narrow them to the minimum?
        custom_mapping.write_to_project(self.dest_prj)
Example No. 2
    def _verify_possible(self, dest_prj: Project,
                         src_prj: Project) -> List[str]:
        """
            Verify that the merge would not mean a loss of information.
                The mappings of src project should be preserved and copied into dest project.
                Augmented mappings should fit in the allowed maximum size for each entity.
            :param dest_prj:
            :param src_prj:
            :return: a list of problems, empty means we can proceed.
        """
        ret = []
        dest_mappings = ProjectMapping().load_from_project(dest_prj)
        src_mappings = ProjectMapping().load_from_project(src_prj)
        a_tbl: MappedTableTypeT
        for a_tbl in MAPPED_TABLES:
            mappings_for_dest_tbl = dest_mappings.by_table[a_tbl]
            mappings_for_src_tbl = src_mappings.by_table[a_tbl]
            # Compute the new mapping and the transformations, if any, needed to get there
            aug, remaps, errs = mappings_for_dest_tbl.augmented_with(
                mappings_for_src_tbl)
            ret.extend(errs)
            if len(remaps) > 0:
                self.remap_operations[a_tbl] = remaps
            # Load future mapping
            self.dest_augmented_mappings.by_table[a_tbl].load_from(aug)

        # Also check for consistency problems with unique orig_ids
        dest_parents = InBundle.fetch_existing_parents(self.ro_session,
                                                       prj_id=self.prj_id)
        src_parents = InBundle.fetch_existing_parents(self.ro_session,
                                                      prj_id=self.src_prj_id)

        for an_orig_id_container in [
                Sample.__tablename__, Acquisition.__tablename__
        ]:
            # key=orig_id value, value=full record
            dest_orig_ids = dest_parents[an_orig_id_container]
            src_orig_ids = src_parents[an_orig_id_container]
            common_orig_ids = set(dest_orig_ids.keys()).intersection(
                src_orig_ids.keys())
            if len(common_orig_ids) != 0:
                logger.info("Common %s orig_ids: %s", an_orig_id_container,
                            common_orig_ids)
            for common_orig_id in common_orig_ids:
                orm_diff = orm_equals(dest_orig_ids[common_orig_id],
                                      src_orig_ids[common_orig_id])
                if orm_diff:
                    msg = (
                        "Data conflict: %s record with orig_id '%s' is different in destination project: %s"
                        %
                        (an_orig_id_container, common_orig_id, str(orm_diff)))
                    # TODO: Should be an error?
                    logger.warning(msg)
        return ret
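
The orig_id consistency check above reduces to a key intersection between the two parent dictionaries, followed by a record-by-record comparison (orm_equals() in the real code). A minimal, self-contained sketch of that logic, using plain dicts and ordinary equality in place of ORM records:

# Toy data, roughly the shape suggested by the comments above: orig_id -> record
dest_orig_ids = {"stn1": {"latitude": 43.2}, "stn2": {"latitude": 44.0}}
src_orig_ids = {"stn2": {"latitude": 44.5}, "stn3": {"latitude": 45.1}}
common_orig_ids = set(dest_orig_ids.keys()).intersection(src_orig_ids.keys())
# Same orig_id but different data means a potential conflict during the merge
conflicts = [oid for oid in common_orig_ids
             if dest_orig_ids[oid] != src_orig_ids[oid]]
print(conflicts)  # ['stn2']
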
Example No. 3
 def get_computed_var(
         cls,
         mapped: Any,  # TODO: Should be 'MappedEntity'
         var: ProjectVar,
         mapping: Optional[ProjectMapping] = None,
         constants: Optional[Dict] = None) -> Any:
     """
          For the given mapped entity, return the result of evaluating the variable's formula.
     """
     if mapping is None:
         mapping = ProjectMapping().load_from_project(
             cls.PROJECT_ACCESSOR(mapped))
     # Filter what is known in mapping.
     mapped_cols = getattr(mapping, cls.MAPPING_IN_PROJECT).find_tsv_cols(
         var.variable_names)
     types = [float] * len(mapped_cols)
     absent = [None] * len(mapped_cols)
     # Data extraction and formula evaluation
     _errs, var_vals = cls._get_values(mapped,
                                       mapped_cols,
                                       field_types=types,
                                       field_absent_vals=absent)
     if constants is not None:
         var_vals.update(constants)
     try:
         import math
         ret = eval(var.code, {"math": math}, var_vals)
         if not var.is_valid(ret):
             raise TypeError("Not valid %s: %s" % (var.formula, str(ret)))
     except Exception as e:
         # Basically anything can happen here
         raise TypeError(str(e))
     return ret
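
get_computed_var() evaluates the variable's formula with eval(), exposing only the math module as globals and the extracted column values (plus optional constants) as locals. A stand-alone sketch of that restricted-eval pattern, with a made-up formula and value (not an actual ProjectVar):

import math

formula = "4.0 / 3.0 * math.pi * (area / math.pi) ** 1.5"  # hypothetical var.code
var_vals = {"area": 250.0}  # what _get_values() would have extracted from the entity
# Only math and the extracted variables are visible to the formula
result = eval(formula, {"math": math}, var_vals)
print(round(result, 2))
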
Example No. 4
 def get_free_fields(
         cls,
         mapped: Any,  # TODO: Should be 'MappedEntity'
         field_list: List[str],
         field_types: List[Any],
         field_absent_vals: List[Any]) -> List[Any]:
     """
         Get free fields _values_ for the mapped entity, inside the project
         :param field_list: The names of the free columns for which value is returned
          :param field_types: The basic types, e.g. int or float, used as converters/verifiers, e.g. float()
          :param field_absent_vals: The values used as markers for 'absent', e.g. 'nan' for float.
     """
     assert len(field_list) == len(field_types) == len(field_absent_vals)
     mapping = ProjectMapping().load_from_project(
         cls.PROJECT_ACCESSOR(mapped))
     mapped_cols = getattr(mapping,
                           cls.MAPPING_IN_PROJECT).find_tsv_cols(field_list)
     if len(mapped_cols) != len(field_list):
          raise TypeError("at least one of %s free columns is absent" %
                         field_list)
     # OK we have the real columns, get the values
     errs, ret = cls._get_values(mapped, mapped_cols, field_types,
                                 field_absent_vals)
     if len(errs) > 0:
         raise TypeError(",".join(errs))
     return list(ret.values())
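
The three parallel lists documented above pair each requested free column with a converter type and a value meaning "absent". A toy, self-contained illustration of that convention (this is not the _get_values() implementation, just the idea):

field_list = ["area", "count"]
field_types = [float, int]
field_absent_vals = [float("nan"), None]
raw_db_values = ["3.14", None]  # hypothetical raw values read for one entity

values = []
for raw, conv, absent in zip(raw_db_values, field_types, field_absent_vals):
    values.append(conv(raw) if raw is not None else absent)
print(values)  # [3.14, None]
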
Example No. 5
 def do_intra_step_1(self, loaded_files):
      # The mapping to custom columns, either empty or from previous import operations on the same project.
     custom_mapping = ProjectMapping().load_from_project(self.prj)
     # Source bundle construction
     source_bundle = InBundle(
         self.source_dir_or_zip,
         Path(self.temp_for_task.data_dir_for(self.task_id)))
     # Configure the validation to come, directives.
     import_how = ImportHow(self.prj_id, self.req.update_mode,
                            custom_mapping, self.req.skip_existing_objects,
                            loaded_files)
     if self.req.skip_loaded_files:
         import_how.compute_skipped(source_bundle, logger)
     # A structure to collect validation result
     import_diag = ImportDiagnostic()
     if not self.req.skip_existing_objects:
         with CodeTimer(
                 "do_intra_step_1: Existing images for %d: " % self.prj_id,
                 logger):
             import_diag.existing_objects_and_image = Image.fetch_existing_images(
                 self.session, self.prj_id)
     import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
     # Do the bulk job of validation
     nb_rows = source_bundle.validate_import(import_how, import_diag,
                                             self.session,
                                             self.report_progress)
     return import_how, import_diag, nb_rows
Example No. 6
 def _collect_existing_and_validate(self, source_dir_or_zip, loaded_files) \
         -> Tuple[ImportHow, ImportDiagnostic, int]:
     """
          Prepare the import by checking what's inside the project and scanning the files to import.
     """
     # The mapping to TSV custom columns, either empty or from previous import operations on same project.
     mapping = ProjectMapping().load_from_project(self.prj)
     # Source bundle construction
     bundle_temp_dir = Path(self.temp_for_jobs.data_dir_for(self.job_id))
     source_bundle = InBundle(source_dir_or_zip, bundle_temp_dir)
     # Configure the validation to come, directives.
     import_how = ImportHow(self.prj_id, self.req.update_mode, mapping,
                            self.req.skip_existing_objects, loaded_files)
     if self.req.skip_loaded_files:
         import_how.compute_skipped(source_bundle, logger)
     # A structure to collect validation result
     import_diag = ImportDiagnostic()
     if not self.req.skip_existing_objects:
         with CodeTimer(
                 "collect_existing: Existing images for %d: " % self.prj_id,
                 logger):
             import_diag.existing_objects_and_image = Image.fetch_existing_images(
                 self.session, self.prj_id)
     import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
     # Do the bulk job of validation
     nb_rows = source_bundle.validate_import(
         import_how, import_diag, self.session,
         self.report_validation_progress)
     return import_how, import_diag, nb_rows
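
Both import preparation methods time the potentially slow image inventory with a CodeTimer context manager. The real CodeTimer lives in the EcoTaxa codebase; below is a generic stand-in showing the pattern (a label plus a logger, with the duration logged on exit):

import logging
import time
from contextlib import contextmanager

@contextmanager
def code_timer(label, log):
    # Log the elapsed time when the block exits, even on error
    start = time.perf_counter()
    try:
        yield
    finally:
        log.info("%s%.3fs", label, time.perf_counter() - start)

logging.basicConfig(level=logging.INFO)
with code_timer("collect_existing: Existing images for 1: ", logging.getLogger(__name__)):
    time.sleep(0.01)
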
Example No. 7
 def enrich(self):
     """
         Add DB fields and relations as (hopefully more) meaningful attributes
     """
      # Decode mappings to avoid exposing internal fields
     mappings = ProjectMapping().load_from_project(self._project)
     self.obj_free_cols = mappings.object_mappings.tsv_cols_to_real
     self.sample_free_cols = mappings.sample_mappings.tsv_cols_to_real
     self.acquisition_free_cols = mappings.acquisition_mappings.tsv_cols_to_real
     self.process_free_cols = mappings.process_mappings.tsv_cols_to_real
      # Decode the text list into numerical classification IDs
     db_list = self._project.initclassiflist
     db_list = db_list if db_list else ""
     self.init_classif_list = [int(x) for x in db_list.split(",") if x.isdigit()]
     # Dispatch members by right
     by_right = {ProjectPrivilegeBO.MANAGE: self.managers,
                 ProjectPrivilegeBO.ANNOTATE: self.annotators,
                 ProjectPrivilegeBO.VIEW: self.viewers}
     a_priv: ProjectPrivilege
     # noinspection PyTypeChecker
     for a_priv in self._project.privs_for_members:
         if a_priv.user is None:  # TODO: There is a line with NULL somewhere in DB
             continue
         if not a_priv.user.active:
             continue
         # noinspection PyTypeChecker
         by_right[a_priv.privilege].append(a_priv.user)
         if 'C' == a_priv.extra:
             self.contact = a_priv.user
     return self
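
The member dispatch above is a simple dict-of-lists keyed by privilege, with the 'C' extra flag electing the project contact. A toy version with plain strings instead of ORM rows (names and values are made up):

MANAGE, ANNOTATE, VIEW = "Manage", "Annotate", "View"
managers, annotators, viewers = [], [], []
by_right = {MANAGE: managers, ANNOTATE: annotators, VIEW: viewers}
contact = None
privileges = [("alice", MANAGE, "C"), ("bob", VIEW, None)]  # (user, privilege, extra)
for user, privilege, extra in privileges:
    by_right[privilege].append(user)
    if extra == "C":
        contact = user
print(managers, viewers, contact)  # ['alice'] ['bob'] alice
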
Example No. 8
 def get_free_fields(cls, acquis: Acquisition,
                     fields_list: List[str]) -> List[Any]:
     """ Get free fields _value_ for the acquisition. """
     mapping = ProjectMapping().load_from_project(acquis.sample.project)
     real_cols = mapping.acquisition_mappings.find_tsv_cols(fields_list)
     if len(real_cols) != len(fields_list):
         raise TypeError("free column not found")
     return [getattr(acquis, real_col) for real_col in real_cols]
Example No. 9
 def __init__(self, prj_id: int, src_prj_id: int, dry_run: bool):
     super().__init__()
     # params
     self.prj_id = prj_id
     self.src_prj_id = src_prj_id
     self.dry_run = dry_run
     # work vars
     self.remap_operations: Dict[MappedTableTypeT, List[RemapOp]] = {}
     self.dest_augmented_mappings = ProjectMapping()
Example No. 10
    def do_real(self) -> None:
        """
            Do the real job, i.e. write everywhere (DB/filesystem)
        """
        loaded_files = none_to_empty(self.prj.fileloaded).splitlines()
        logger.info("Previously loaded files: %s", loaded_files)

        found_users, taxo_found, col_mapping_dict, \
        nb_rows, source_path = self._load_vars_from_state(self.STATE_KEYS)

        # Save mappings straight away
        col_mapping = ProjectMapping().load_from_dict(col_mapping_dict)
        col_mapping.write_to_project(self.prj)
        self.session.commit()

        # TODO: Duplicated code
        source_bundle = InBundle(
            source_path, Path(self.temp_for_jobs.data_dir_for(self.job_id)))
        # Configure the import to come, destination
        db_writer = DBWriter(self.session)
        import_where = ImportWhere(
            db_writer, self.vault,
            self.temp_for_jobs.base_dir_for(self.job_id))
        # Configure the import to come, directives
        import_how = ImportHow(self.prj_id, self.req.update_mode, col_mapping,
                               self.req.skip_existing_objects, loaded_files)
        import_how.taxo_mapping = self.req.taxo_mappings
        import_how.found_taxa = taxo_found
        import_how.found_users = found_users
        if self.req.skip_loaded_files:
            import_how.compute_skipped(source_bundle, logger)
        if self.req.skip_existing_objects:
            # If we must skip existing objects then take an inventory of what's already in the DB
            with CodeTimer("run: Existing images for %d: " % self.prj_id,
                           logger):
                import_how.objects_and_images_to_skip = Image.fetch_existing_images(
                    self.session, self.prj_id)
        import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))

        # Do the bulk job of import
        rowcount_from_validate = nb_rows
        row_count = source_bundle.do_import(import_where, import_how,
                                            rowcount_from_validate,
                                            self.report_progress)

        # Update loaded files in DB, removing duplicates
        self.prj.fileloaded = "\n".join(set(import_how.loaded_files))
        self.session.commit()

        # Recompute stats
        ProjectBO.do_after_load(self.session, self.prj_id)
        self.session.commit()

        msg = "Total of %d rows loaded" % row_count
        logger.info(msg)
        self.set_job_result(errors=[], infos={"rowcount": row_count})
Example No. 11
 def _apply_on_all(self, clazz: MappedTableTypeT, project: Project, updates: List[ColUpdate]) -> int:
     """
         Apply all updates on all impacted rows by their ID.
     """
     prj_mappings = ProjectMapping().load_from_project(project)
     tbl_mappings = prj_mappings.by_table[clazz].tsv_cols_to_real
     clean_updates = self._sanitize_updates(clazz, tbl_mappings, updates)
      # Possibly nothing is left after filtering
     if len(clean_updates) == 0:
         return 0
     return self._do_updates(clazz, clean_updates)
Example No. 12
 def query(self, current_user_id: Optional[int], acquisition_id: AcquisitionIDT) -> Optional[AcquisitionBO]:
     ret = AcquisitionBO(self.ro_session, acquisition_id)
     if not ret.exists():
         return None
     assert ret.acquis is not None
     # Security check
     if current_user_id is None:
         project = RightsBO.anonymous_wants(self.ro_session, Action.READ, ret.acquis.sample.projid)
     else:
         _user, project = RightsBO.user_wants(self.session, current_user_id, Action.READ, ret.acquis.sample.projid)
     mappings = ProjectMapping().load_from_project(project)
     ret.map_free_columns(mappings.acquisition_mappings)
     return ret
Example No. 13
    def do_import(self):
        """
            Do the real job, i.e. copy files while creating records.
        """
        errors = []
        self.manage_uploaded()
        self.unzip_if_needed()
        # Use a Bundle
        source_bundle = InBundle(
            self.source_dir_or_zip,
            Path(self.temp_for_task.data_dir_for(self.task_id)))
        # Clean it, in case the ZIP contains TSV files
        source_bundle.remove_all_tsvs()
        images = source_bundle.list_image_files()
        # Configure the import to come, destination
        db_writer = DBWriter(self.session)
        import_where = ImportWhere(
            db_writer, self.vault,
            self.temp_for_task.base_dir_for(self.task_id))
        # Configure the import to come, directives
        import_how = ImportHow(prj_id=self.prj_id,
                               update_mode="",
                               custom_mapping=ProjectMapping(),
                               skip_object_duplicates=False,
                               loaded_files=[])
        import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))
        # Generate TSV
        req_values = self.req.values
        if req_values.get(SimpleImportFields.userlb, ""):
            import_how.found_users["user"] = {
                "id": req_values.get(SimpleImportFields.userlb)
            }
            req_values[SimpleImportFields.userlb] = "user"
        if req_values.get(SimpleImportFields.status, ""):
            req_values[SimpleImportFields.status] = classif_qual.get(
                req_values[SimpleImportFields.status], "")
        self.make_tsv(source_bundle, images)
        # Import
        nb_image_files = len(images)
        nb_images = source_bundle.do_import(import_where, import_how,
                                            nb_image_files,
                                            self.report_progress)
        self.session.commit()

        # Recompute stats and so on
        ProjectBO.do_after_load(self.session, self.prj_id)
        self.session.commit()

        ret = SimpleImportRsp(errors=errors, nb_images=nb_images)
        return ret
Example No. 14
 def __init__(self, current_user: int, prj_id: int, filters: ProjectFilters):
     super().__init__()
     self.requester_id = current_user
     self.filters = filters
     prj = self.session.query(Project).get(prj_id)
     # We don't check for existence
     self.prj: Project = prj
     # Work vars
     self.mapping = ProjectMapping()
     if self.prj:
         self.mapping.load_from_project(self.prj)
     self.ids_to_dump: List[int] = []
     self.already_dumped: Set = set()
     self.first_query = True
Example No. 15
 def query(self, current_user_id: Optional[int], object_id: ObjectIDT) -> Optional[ObjectBO]:
     ret = ObjectBO(self.ro_session, object_id)
     if not ret.exists():
         return None
     # Security check
     projid = ret.header.acquisition.sample.projid
     if current_user_id is None:
         project = RightsBO.anonymous_wants(self.session, Action.READ, projid)
     else:
         _user, project = RightsBO.user_wants(self.session, current_user_id, Action.READ, projid)
     assert project is not None
     mappings = ProjectMapping().load_from_project(project)
     ret.map_free_columns(mappings.object_mappings)
     return ret
Example No. 16
 def query(self, current_user_id: Optional[UserIDT], sample_id: SampleIDT) -> Optional[SampleBO]:
     ret = SampleBO(self.ro_session, sample_id)
     if not ret.exists():
         return None
     assert ret.sample is not None
     assert ret.sample.projid is not None  # TODO: Why need this?
     # Security check
     if current_user_id is None:
         project = RightsBO.anonymous_wants(self.ro_session, Action.READ, ret.sample.projid)
     else:
         _user, project = RightsBO.user_wants(self.session, current_user_id, Action.READ, ret.sample.projid)
     mappings = ProjectMapping().load_from_project(project)
     ret.map_free_columns(mappings.sample_mappings)
     return ret
Example No. 17
 def get_all_num_columns_values(self, session: Session):
     """
          Get all numerical free-field values for all objects in a project.
     """
     from DB.helpers.ORM import MetaData
     metadata = MetaData(bind=session.get_bind())
     # TODO: Cache in a member
     mappings = ProjectMapping().load_from_project(self._project)
     num_fields_cols = set([col for col in mappings.object_mappings.tsv_cols_to_real.values()
                            if col[0] == 'n'])
     obj_fields_tbl = minimal_table_of(metadata, ObjectFields, num_fields_cols, exact_floats=True)
     qry: Query = session.query(Project)
     qry = qry.join(Project.all_samples).join(Sample.all_acquisitions).join(Acquisition.all_objects)
     qry = qry.join(obj_fields_tbl, ObjectHeader.objid == obj_fields_tbl.c.objfid)
     qry = qry.filter(Project.projid == self._project.projid)
     qry = qry.order_by(Acquisition.acquisid)
     qry = qry.with_entities(Acquisition.acquisid, Acquisition.orig_id, obj_fields_tbl)
     return qry.all()
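
The col[0] == 'n' filter relies on the naming of the physical free columns: the values of tsv_cols_to_real are DB column names, and the numeric ones evidently start with 'n' (text ones presumably use another prefix). A toy illustration of the filtering, with an assumed mapping:

# Assumed shape of mappings.object_mappings.tsv_cols_to_real: tsv name -> DB column
tsv_cols_to_real = {"area": "n01", "perim.": "n02", "label": "t01"}
num_fields_cols = set(col for col in tsv_cols_to_real.values() if col[0] == 'n')
print(sorted(num_fields_cols))  # ['n01', 'n02']
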
Example No. 18
    def create_tsv(self, src_project: Project,
                   end_progress: int) -> Tuple[int, int]:
        """
            Create the TSV file.
        """
        req = self.req
        proj_id = src_project.projid
        self.update_progress(1, "Start TSV export")
        progress_range = end_progress - 1

        # Get a fast count of the maximum of what to do
        count_sql = "SELECT SUM(nbr) AS cnt FROM projects_taxo_stat WHERE projid = :prj"
        res = self.ro_session.execute(text(count_sql), {"prj": proj_id})
        obj_count = res.first()[0]

        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(
            self.ro_session, proj_id, self.filters)

        # Backup or not, the column names are taken from the common mapping
        # @See Mapping.py
        # TSV column order
        # field_order = ["object_id", "object_lat", "object_lon", "object_date", "object_time", "object_depth_max",
        #                "object_annotation_status", "object_annotation_person_name", "object_annotation_person_email",
        #                "object_annotation_date", "object_annotation_time", "object_annotation_category"]
        # formats = {"object_date": "TO_CHAR({0},'YYYYMMDD')",
        #            "object_time": "TO_CHAR({0},'HH24MISS')",
        #            "object_annotation_date": "TO_CHAR({0},'YYYYMMDD')",
        #            "object_annotation_time": "TO_CHAR({0},'HH24MISS')",
        #            "object_annotation_status": """
        #                  CASE {0}
        #                     WHEN 'V' then 'validated'
        #                     WHEN 'P' then 'predicted'
        #                     WHEN 'D' then 'dubious'
        #                     ELSE {0}
        #                  END
        #            """
        #            }
        # prefices = {ObjectHeader.__tablename__: "obh",
        #             }
        # for a_fld in field_order:
        #     mpg = GlobalMapping.PREDEFINED_FIELDS[a_fld]
        #     mpg[""]
        #     assert a_fld in GlobalMapping.PREDEFINED_FIELDS, "%s is not a mapped column" % a_fld
        date_fmt, time_fmt = "YYYYMMDD", "HH24MISS"
        if req.format_dates_times:
            date_fmt, time_fmt = "YYYY-MM-DD", "HH24:MI:SS"

        select_clause = "select "

        if req.with_images or (req.exp_type == ExportTypeEnum.backup):
            select_clause += "img.orig_file_name AS img_file_name, img.imgrank AS img_rank"
            if req.with_images:
                select_clause += ", img.file_name AS img_src_path"
            select_clause += ",\n"

        select_clause += """obh.orig_id AS object_id, obh.latitude AS object_lat, obh.longitude AS object_lon,
                         TO_CHAR(obh.objdate,'{0}') AS object_date,
                         TO_CHAR(obh.objtime,'{1}') AS object_time,
                         obh.object_link, obh.depth_min AS object_depth_min, obh.depth_max AS object_depth_max,
                         CASE obh.classif_qual 
                            WHEN 'V' then 'validated' 
                            WHEN 'P' then 'predicted' 
                            WHEN 'D' then 'dubious' 
                            ELSE obh.classif_qual 
                         END AS object_annotation_status,                
                         usr.name AS object_annotation_person_name, usr.email AS object_annotation_person_email,
                         TO_CHAR(obh.classif_when,'{0}') AS object_annotation_date,
                         TO_CHAR(obh.classif_when,'{1}') AS object_annotation_time,                
                         txo.display_name AS object_annotation_category 
                    """.format(date_fmt, time_fmt)
        if req.exp_type == ExportTypeEnum.backup:
            select_clause += ", txo.id AS object_annotation_category_id"
        else:
            select_clause += "," + TaxonomyBO.parents_sql(
                "obh.classif_id") + " AS object_annotation_hierarchy"

        if 'C' in req.tsv_entities:
            select_clause += "\n, obh.complement_info"

        # Deal with mappings, the goal is to emit SQL which will reconstitute the TSV structure
        src_mappings = ProjectMapping().load_from_project(src_project)
        if 'O' in req.tsv_entities:
            select_clause += "\n " + src_mappings.object_mappings.as_select_list(
                "obf")

        if 'S' in req.tsv_entities:
            select_clause += "\n, sam.orig_id AS sample_id, sam.dataportal_descriptor AS sample_dataportal_descriptor "
            select_clause += src_mappings.sample_mappings.as_select_list("sam")

        if 'P' in req.tsv_entities:
            select_clause += "\n, prc.orig_id AS process_id "
            select_clause += src_mappings.process_mappings.as_select_list(
                "prc")

        if 'A' in req.tsv_entities:
            select_clause += "\n, acq.orig_id AS acq_id, acq.instrument AS acq_instrument "
            select_clause += src_mappings.acquisition_mappings.as_select_list(
                "acq")

        if req.exp_type == ExportTypeEnum.dig_obj_ident:
            select_clause += "\n, obh.objid"

        if req.with_internal_ids:
            select_clause += """\n, obh.objid, 
                    obh.acquisid AS processid_internal, obh.acquisid AS acq_id_internal, 
                    sam.sampleid AS sample_id_internal, 
                    obh.classif_id, obh.classif_who, obh.classif_auto_id, txp.name classif_auto_name, 
                    obh.classif_auto_score, obh.classif_auto_when,
                    obh.random_value object_random_value, obh.sunpos object_sunpos """
            if 'S' in req.tsv_entities:
                select_clause += "\n, sam.latitude sample_lat, sam.longitude sample_long "

        # TODO: The condition on o.projid=1 in historical code below prevents any data production
        # if 'H' in req.tsv_entities:
        #     sql1 += " , oh.classif_date AS histoclassif_date, classif_type AS histoclassif_type, " \
        #             "to3.name histoclassif_name, oh.classif_qual histoclassif_qual,uo3.name histoclassif_who, " \
        #             "classif_score histoclassif_score"
        #     sql2 += """ LEFT JOIN (select o.objid, classif_date, classif_type, och.classif_id,
        #                                   och.classif_qual, och.classif_who, classif_score
        #                              from objectsclassifhisto och
        #                              join objects o on o.objid=och.objid and o.projid=1 {0}
        #                            union all
        #                            select o.objid, o.classif_when classif_date, 'C' classif_type, classif_id,
        #                                   classif_qual, classif_who, NULL
        #                              from objects o {0} where o.projid=1
        #                           ) oh on o.objid=oh.objid
        #                 LEFT JOIN taxonomy to3 on oh.classif_id=to3.id
        #                 LEFT JOIN users uo3 on oh.classif_who=uo3.id
        #             """.format(samplefilter)

        order_clause = OrderClause()
        if req.split_by == "sample":
            order_clause.add_expression("sam", "orig_id")
            split_field = "sample_id"  # AKA sam.orig_id, but renamed in select list
        elif req.split_by == "taxo":
            select_clause += "\n, txo.display_name AS taxo_parent_child "
            order_clause.add_expression(None, "taxo_parent_child")
            split_field = "taxo_parent_child"
        else:
            order_clause.add_expression("sam", "orig_id")
            split_field = "object_id"  # cette valeur permet d'éviter des erreurs plus loin dans r[split_field]
        order_clause.add_expression("obh", "objid")

        if req.with_images or (req.exp_type == ExportTypeEnum.backup):
            order_clause.add_expression(None, "img_rank")

        # Base SQL comes from filters
        from_, where, params = object_set.get_sql(
            self._get_owner_id(),
            order_clause,
            select_clause,
            all_images=not req.only_first_image)
        sql = select_clause + " FROM " + from_.get_sql() + where.get_sql(
        ) + order_clause.get_sql()
        logger.info("Execute SQL : %s" % sql)
        logger.info("Params : %s" % params)

        res = self.ro_session.execute(text(sql), params)

        now_txt = DateTime.now_time().strftime("%Y%m%d_%H%M")
        self.out_file_name = "export_{0:d}_{1:s}.{2}".format(
            proj_id, now_txt, "zip")

        produced_path = self.out_path / self.out_file_name
        zfile = zipfile.ZipFile(produced_path,
                                'w',
                                allowZip64=True,
                                compression=zipfile.ZIP_DEFLATED)

        splitcsv = (req.split_by != "")
        csv_filename = 'data.tsv'  # Just a temp name as there is a rename while filling up the Zip
        if splitcsv:
            # Always produce into the same temp file; at zipping time the name in the archive will vary
            prev_value = "NotAssigned"  # To trigger a sequence change immediately
        else:
            # The zip will contain a single TSV with same base name as the zip
            prev_value = self.out_file_name.replace('.zip', '')

        csv_path: Path = self.out_path / csv_filename  # Constant path to a (sometimes) changing file
        csv_fd: Optional[IO] = None
        csv_wtr = None

        # Store the images to save in a separate CSV. Useless if not exporting images but who cares.
        temp_img_file = self.out_path / "images.csv"
        img_file_fd = open(temp_img_file, 'w')
        img_wtr = csv.DictWriter(img_file_fd, ["src_path", "dst_path"],
                                 delimiter='\t',
                                 quotechar='"',
                                 lineterminator='\n')
        img_wtr.writeheader()

        # Prepare TSV structure
        col_descs = [
            a_desc for a_desc in res.cursor.description
            if a_desc.name != "img_src_path"
        ]
        # read latitude column to get float DB type
        for a_desc in col_descs:
            if a_desc.name == "object_lat":
                db_float_type = a_desc.type_code
                break
        else:
            raise AssertionError("object_lat column not found in SQL result")
        float_cols = set()
        # Prepare float separator conversion, if not required the set will just be empty
        if req.coma_as_separator:
            for a_desc in col_descs:
                if a_desc.type_code == db_float_type:
                    float_cols.add(a_desc.name)

        tsv_cols = [a_desc.name for a_desc in col_descs]
        tsv_types_line = {
            name: ('[f]' if a_desc.type_code == db_float_type else '[t]')
            for name, a_desc in zip(tsv_cols, col_descs)
        }
        nb_rows = 0
        nb_images = 0
        used_dst_pathes = set()
        for r in res:
            # Rows from SQLAlchemy are not mutable, so we need a clone for arranging values
            a_row = dict(r)
            if ((splitcsv and (prev_value != a_row[split_field])
                 )  # When the value of the split column changes
                    or (nb_rows == 0)):  # And anyway for the first row
                # Start of a sequence, possibly also the end of the previous one
                if csv_fd:
                    csv_fd.close()  # Close previous file
                    self.store_csv_into_zip(zfile, prev_value, csv_path)
                if splitcsv:
                    prev_value = a_row[split_field]
                logger.info("Writing into file %s", csv_path)
                csv_fd = open(csv_path, 'w', encoding='latin_1')
                csv_wtr = csv.DictWriter(csv_fd,
                                         tsv_cols,
                                         delimiter='\t',
                                         quotechar='"',
                                         lineterminator='\n',
                                         quoting=csv.QUOTE_NONNUMERIC)
                csv_wtr.writeheader()
                if req.exp_type == ExportTypeEnum.backup:
                    # Write types line for backup type
                    csv_wtr.writerow(tsv_types_line)
            if req.with_images:
                copy_op = {"src_path": a_row.pop("img_src_path")}
                if req.exp_type == ExportTypeEnum.dig_obj_ident:
                    # Images will be stored in a per-category directory, but there is a single TSV at the Zip root
                    categ = a_row['object_annotation_category']
                    # Not all names can directly become directories
                    a_row['img_file_name'] = self.get_DOI_imgfile_name(
                        a_row['objid'], a_row['img_rank'], categ,
                        a_row['img_file_name'])
                    copy_op["dst_path"] = a_row['img_file_name']
                else:  # It's a backup
                    # Images are stored in the Zip subdirectory per sample/taxo, i.e. at the same place as
                    # their referring TSV
                    dst_path = "{0}/{1}".format(prev_value,
                                                a_row['img_file_name'])
                    if dst_path in used_dst_pathes:
                        # Avoid duplicates in zip as only the last entry will be present during unzip
                        # root cause: for UVP6 bundles, the vignette and original image are both stored
                        # with the same name.
                        img_with_rank = "{0}/{1}".format(
                            a_row['img_rank'], a_row['img_file_name'])
                        a_row[
                            'img_file_name'] = img_with_rank  # write into TSV the corrected path
                        dst_path = prev_value + "/" + img_with_rank
                    used_dst_pathes.add(dst_path)
                    copy_op["dst_path"] = dst_path
                img_wtr.writerow(copy_op)
                nb_images += 1
            # Remove CR from comments
            if 'C' in req.tsv_entities and a_row['complement_info']:
                a_row['complement_info'] = ' '.join(
                    a_row['complement_info'].splitlines())
            # Replace decimal separator
            for cname in float_cols:
                if a_row[cname] is not None:
                    a_row[cname] = str(a_row[cname]).replace('.', ',')
            assert csv_wtr is not None
            # Produce the row in the TSV
            csv_wtr.writerow(a_row)
            nb_rows += 1
            if nb_rows % self.ROWS_REPORT_EVERY == 0:
                msg = "Row %d of max %d" % (nb_rows, obj_count)
                logger.info(msg)
                self.update_progress(1 + progress_range / obj_count * nb_rows,
                                     msg)
        if csv_fd:
            csv_fd.close()  # Close last file
            self.store_csv_into_zip(zfile, prev_value, csv_path)
        logger.info("Extracted %d rows", nb_rows)
        img_file_fd.close()
        if zfile:
            zfile.close()
        return nb_rows, nb_images
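
Stripped of the project-specific SQL, the output side of create_tsv() is a csv.DictWriter configured for tab-separated output, writing into a temporary file that is then stored in the Zip under its final name. A minimal, self-contained sketch of that pattern (file and column names here are arbitrary):

import csv
import zipfile
from pathlib import Path

rows = [{"object_id": "obj1", "object_lat": "43,21"},
        {"object_id": "obj2", "object_lat": "43,42"}]
csv_path = Path("data.tsv")  # temp name, renamed when added to the archive
with open(csv_path, 'w', encoding='latin_1', newline='') as csv_fd:
    csv_wtr = csv.DictWriter(csv_fd, ["object_id", "object_lat"],
                             delimiter='\t', quotechar='"',
                             lineterminator='\n',
                             quoting=csv.QUOTE_NONNUMERIC)
    csv_wtr.writeheader()
    csv_wtr.writerows(rows)
with zipfile.ZipFile("export_1_test.zip", 'w', allowZip64=True,
                     compression=zipfile.ZIP_DEFLATED) as zfile:
    zfile.write(csv_path, arcname="sample1.tsv")
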
Example No. 19
 def __init__(self, prj_id: int, req: ImportRealReq):
     super().__init__(prj_id, req)
     # Transcode from serialized form
     self.custom_mapping = ProjectMapping().load_from_dict(req.mappings)
Example No. 20
    def aggregate_for_sample(
            self, sample: Sample) -> Dict[ClassifIDT, AggregForTaxon]:
        """
            Do the aggregations for each taxon in the sample and return them; they will become eMoFs
                - 'Abundance' -> CountOfBiologicalEntity -> count of objects group by taxon
                - 'Concentration' -> AbundancePerUnitVolumeOfTheWaterBody
                    -> sum(individual_concentration) group by taxon
                        with individual_concentration = 1 / subsample_coef / total_water_volume
                - 'Biovolume' -> BiovolumeOfBiologicalEntity -> sum(individual_biovolume) group by taxon
                    with individual_biovolume = individual_volume / subsample_coef / total_water_volume
            The abundance can always be computed. The other two depend on the availability of values
            for the project and on the configuration variable.
        """
        # We return all per taxon.
        ret: Dict[ClassifIDT, EMODnetExport.AggregForTaxon] = {}

        count_per_taxon_per_acquis: Dict[AcquisitionIDT, Dict[ClassifIDT,
                                                              int]] = {}

        # Start with abundances: a simple count, which also provides the keys of the returned dict.
        acquis_for_sample = SampleBO.get_acquisitions(self.session, sample)
        for an_acquis in acquis_for_sample:
            # Get counts for acquisition (subsample)
            count_per_taxon_for_acquis: Dict[
                ClassifIDT,
                int] = AcquisitionBO.get_sums_by_taxon(self.session,
                                                       an_acquis.acquisid)
            if self.auto_morpho:
                self.add_morpho_counts(count_per_taxon_for_acquis)
            count_per_taxon_per_acquis[
                an_acquis.acquisid] = count_per_taxon_for_acquis
            for an_id, count_4_acquis in count_per_taxon_for_acquis.items():
                aggreg_for_taxon = ret.get(an_id)
                if aggreg_for_taxon is None:
                    ret[an_id] = self.AggregForTaxon(count_4_acquis, None,
                                                     None)
                else:
                    aggreg_for_taxon.abundance += count_4_acquis

        if not self.with_computations:
            return ret

        # Enrich with concentrations
        subsampling_coeff_per_acquis: Dict[AcquisitionIDT, float] = {}
        try:
            # Fetch calculation data at sample level
            sample_volume = SampleBO.get_computed_var(
                sample, DefaultVars.volume_sampled)
        except TypeError as e:
            self.warnings.append(
                "Could not compute volume sampled from sample %s (%s),"
                " no concentration or biovolume will be computed." %
                (sample.orig_id, str(e)))
            sample_volume = -1
        if sample_volume > 0:
            # Cumulate for subsamples AKA acquisitions
            for an_acquis in acquis_for_sample:
                try:
                    subsampling_coefficient = AcquisitionBO.get_computed_var(
                        an_acquis, DefaultVars.subsample_coeff)
                    subsampling_coeff_per_acquis[
                        an_acquis.acquisid] = subsampling_coefficient
                except TypeError as e:
                    self.warnings.append(
                        "Could not compute subsampling coefficient from acquisition %s (%s),"
                        " no concentration or biovolume will be computed" %
                        (an_acquis.orig_id, str(e)))
                    logger.info(
                        "concentrations: no subsample coeff for '%s' (%s)",
                        an_acquis.orig_id, str(e))
                    continue
                # Get counts for acquisition (sub-sample)
                logger.info("computing concentrations for '%s'",
                            an_acquis.orig_id)
                count_per_taxon_for_acquis = count_per_taxon_per_acquis[
                    an_acquis.acquisid]
                for an_id, count_4_acquis in count_per_taxon_for_acquis.items(
                ):
                    aggreg_for_taxon = ret[an_id]
                    concentration_for_taxon = count_4_acquis / subsampling_coefficient / sample_volume
                    if aggreg_for_taxon.concentration is None:
                        aggreg_for_taxon.concentration = 0
                    aggreg_for_taxon.concentration += concentration_for_taxon

        # Enrich with biovolumes. This needs a computation for each object, so it's likely to be slow.
        if sample_volume > 0:
            # Mappings are constant for the sample
            # noinspection PyTypeChecker
            mapping = ProjectMapping().load_from_project(sample.project)
            # Cumulate for subsamples AKA acquisitions
            for an_acquis in acquis_for_sample:
                subsampling_coefficient = subsampling_coeff_per_acquis.get(
                    an_acquis.acquisid)
                if subsampling_coefficient is None:
                    logger.info("biovolumes: no subsample coeff for '%s'",
                                an_acquis.orig_id)
                    continue
                # Get pixel size from the associated process; it's a constant for individual biovolume computations
                try:
                    pixel_size, = ProcessBO.get_free_fields(
                        an_acquis.process, ["particle_pixel_size_mm"], [float],
                        [None])
                except TypeError as _e:
                    logger.info("biovolumes: no pixel size for '%s'",
                                an_acquis.orig_id)
                    continue
                constants = {"pixel_size": pixel_size}
                # Get all objects for the acquisition. The filter on classif_id is useless for now.
                with CodeTimer("Objects IDs for '%s': " % an_acquis.orig_id,
                               logger):
                    acq_object_ids = AcquisitionBO.get_all_object_ids(
                        session=self.session,
                        acquis_id=an_acquis.acquisid,
                        classif_ids=list(ret.keys()))
                with CodeTimer("Objects for '%s': " % an_acquis.orig_id,
                               logger):
                    objects = ObjectBOSet(self.ro_session, acq_object_ids,
                                          mapping.object_mappings)
                nb_biovols = 0
                for an_obj in objects.all:
                    # Compute a biovol if possible
                    try:
                        biovol = ObjectBO.get_computed_var(
                            an_obj, DefaultVars.equivalent_ellipsoidal_volume,
                            mapping, constants)
                    except TypeError as _e:
                        biovol = -1
                    if biovol == -1:
                        try:
                            biovol = ObjectBO.get_computed_var(
                                an_obj,
                                DefaultVars.equivalent_spherical_volume,
                                mapping, constants)
                        except TypeError as _e:
                            continue
                    # Aggregate by category/taxon
                    aggreg_for_taxon = ret[an_obj.classif_id]
                    individual_biovolume = biovol / subsampling_coefficient / sample_volume
                    if aggreg_for_taxon.biovolume is None:
                        aggreg_for_taxon.biovolume = 0
                    aggreg_for_taxon.biovolume += individual_biovolume
                    # Update stats
                    nb_biovols += 1
                # A bit of display
                logger.info(
                    "%d biovolumes computed for '%s' out of %d objects",
                    nb_biovols, an_acquis.orig_id, len(acq_object_ids))

        return ret
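
The concentration computed above follows the docstring formula: each object contributes 1 / subsample_coef / total_water_volume, so a per-taxon count is divided by both. A worked example with made-up numbers:

count_4_acquis = 120            # objects of one taxon in one acquisition (subsample)
subsampling_coefficient = 2.0   # what AcquisitionBO.get_computed_var() would return
sample_volume = 0.5             # sampled water volume from SampleBO.get_computed_var()

concentration_for_taxon = count_4_acquis / subsampling_coefficient / sample_volume
print(concentration_for_taxon)  # 120.0 individuals per unit volume of the water body
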