Example #1
0
    def _do_clone(self):
        """
            Cloning operation itself. Assumes that @see self.to_clone was populated before.
        """
        # Get the mappings in the source project, in order to determine the useful columns
        custom_mapping = ProjectMapping().load_from_project(self.prj)
        obj_mapping = custom_mapping.object_mappings
        used_columns = set(obj_mapping.real_cols_to_tsv.keys())
        used_columns.add("orig_id")  # By safety
        # Create a DB writer
        writer = DBWriter(self.session)
        # Narrow the writes in ObjectFields thanks to mappings of original project
        writer.generators({"obj_field": used_columns})
        # Use import helpers
        dest_prj_id = self.dest_prj.projid
        import_how = ImportHow(prj_id=dest_prj_id,
                               update_mode="No",
                               custom_mapping=ProjectMapping(),
                               skip_object_duplicates=False,
                               loaded_files=[])
        # Get parent (enclosing) Sample, Acquisition, Process. There should be 0 in this context...
        import_how.existing_parents = InBundle.fetch_existing_parents(
            self.session, prj_id=dest_prj_id)

        self._clone_all(import_how, writer)
        # Copy mappings to destination. We could narrow them to the minimum?
        custom_mapping.write_to_project(self.dest_prj)
Example #2
0
 def __init__(self, prj_id: int, src_prj_id: int, dry_run: bool):
     super().__init__()
     # params
     self.prj_id = prj_id
     self.src_prj_id = src_prj_id
     self.dry_run = dry_run
     # work vars
     self.remap_operations: Dict[MappedTableTypeT, List[RemapOp]] = {}
     self.dest_augmented_mappings = ProjectMapping()
Example #3
0
    def do_real(self) -> None:
        """
            Do the real job, i.e. write everywhere (DB/filesystem)
        """
        loaded_files = none_to_empty(self.prj.fileloaded).splitlines()
        logger.info("Previously loaded files: %s", loaded_files)

        found_users, taxo_found, col_mapping_dict, \
        nb_rows, source_path = self._load_vars_from_state(self.STATE_KEYS)

        # Save mappings straight away
        col_mapping = ProjectMapping().load_from_dict(col_mapping_dict)
        col_mapping.write_to_project(self.prj)
        self.session.commit()

        # TODO: Duplicated code
        source_bundle = InBundle(
            source_path, Path(self.temp_for_jobs.data_dir_for(self.job_id)))
        # Configure the import to come, destination
        db_writer = DBWriter(self.session)
        import_where = ImportWhere(
            db_writer, self.vault,
            self.temp_for_jobs.base_dir_for(self.job_id))
        # Configure the import to come, directives
        import_how = ImportHow(self.prj_id, self.req.update_mode, col_mapping,
                               self.req.skip_existing_objects, loaded_files)
        import_how.taxo_mapping = self.req.taxo_mappings
        import_how.found_taxa = taxo_found
        import_how.found_users = found_users
        if self.req.skip_loaded_files:
            import_how.compute_skipped(source_bundle, logger)
        if self.req.skip_existing_objects:
            # If we must skip existing objects, take an inventory of what's already there
            with CodeTimer("run: Existing images for %d: " % self.prj_id,
                           logger):
                import_how.objects_and_images_to_skip = Image.fetch_existing_images(
                    self.session, self.prj_id)
        import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))

        # Do the bulk job of import
        rowcount_from_validate = nb_rows
        row_count = source_bundle.do_import(import_where, import_how,
                                            rowcount_from_validate,
                                            self.report_progress)

        # Update loaded files in DB, removing duplicates
        self.prj.fileloaded = "\n".join(set(import_how.loaded_files))
        self.session.commit()

        # Recompute stats
        ProjectBO.do_after_load(self.session, self.prj_id)
        self.session.commit()

        msg = "Total of %d rows loaded" % row_count
        logger.info(msg)
        self.set_job_result(errors=[], infos={"rowcount": row_count})
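
# A minimal, self-contained sketch of the 'fileloaded' bookkeeping used above:
# the DB column holds newline-separated file names, written back de-duplicated
# via set(). File names are illustrative; note that set() drops the original order.
previously_loaded = "a.tsv\nb.tsv"
loaded_files = previously_loaded.splitlines() + ["c.tsv", "a.tsv"]
stored = "\n".join(set(loaded_files))
assert set(stored.splitlines()) == {"a.tsv", "b.tsv", "c.tsv"}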
Example #4
0
    def _verify_possible(self, dest_prj: Project,
                         src_prj: Project) -> List[str]:
        """
            Verify that the merge would not mean a loss of information.
                The mappings of src project should be preserved and copied into dest project.
                Augmented mappings should fit in the allowed maximum size for each entity.
            :param dest_prj: the destination project.
            :param src_prj: the source project, to be merged into the destination.
            :return: a list of problems, empty means we can proceed.
        """
        ret = []
        dest_mappings = ProjectMapping().load_from_project(dest_prj)
        src_mappings = ProjectMapping().load_from_project(src_prj)
        a_tbl: MappedTableTypeT
        for a_tbl in MAPPED_TABLES:
            mappings_for_dest_tbl = dest_mappings.by_table[a_tbl]
            mappings_for_src_tbl = src_mappings.by_table[a_tbl]
            # Compute the new mapping and any transformations needed to get there
            aug, remaps, errs = mappings_for_dest_tbl.augmented_with(
                mappings_for_src_tbl)
            ret.extend(errs)
            if len(remaps) > 0:
                self.remap_operations[a_tbl] = remaps
            # Load future mapping
            self.dest_augmented_mappings.by_table[a_tbl].load_from(aug)

        # Also check for consistency problems with unique orig_ids
        dest_parents = InBundle.fetch_existing_parents(self.ro_session,
                                                       prj_id=self.prj_id)
        src_parents = InBundle.fetch_existing_parents(self.ro_session,
                                                      prj_id=self.src_prj_id)

        for an_orig_id_container in [
                Sample.__tablename__, Acquisition.__tablename__
        ]:
            # key=orig_id value, value=full record
            dest_orig_ids = dest_parents[an_orig_id_container]
            src_orig_ids = src_parents[an_orig_id_container]
            common_orig_ids = set(dest_orig_ids.keys()).intersection(
                src_orig_ids.keys())
            if len(common_orig_ids) != 0:
                logger.info("Common %s orig_ids: %s", an_orig_id_container,
                            common_orig_ids)
            for common_orig_id in common_orig_ids:
                orm_diff = orm_equals(dest_orig_ids[common_orig_id],
                                      src_orig_ids[common_orig_id])
                if orm_diff:
                    msg = (
                        "Data conflict: %s record with orig_id '%s' is different in destination project: %s"
                        %
                        (an_orig_id_container, common_orig_id, str(orm_diff)))
                    # TODO: Should be an error?
                    logger.warning(msg)
        return ret
Example #5
0
 def __init__(self, current_user: int, prj_id: int, filters: ProjectFilters):
     super().__init__()
     self.requester_id = current_user
     self.filters = filters
     prj = self.session.query(Project).get(prj_id)
     # We don't check for existence
     self.prj: Project = prj
     # Work vars
     self.mapping = ProjectMapping()
     if self.prj:
         self.mapping.load_from_project(self.prj)
     self.ids_to_dump: List[int] = []
     self.already_dumped: Set = set()
     self.first_query = True
Example #6
0
 def _collect_existing_and_validate(self, source_dir_or_zip, loaded_files) \
         -> Tuple[ImportHow, ImportDiagnostic, int]:
     """
          Prepare the import by checking what's inside the project and scanning the files to import.
     """
     # The mapping to TSV custom columns, either empty or from previous import operations on same project.
     mapping = ProjectMapping().load_from_project(self.prj)
     # Source bundle construction
     bundle_temp_dir = Path(self.temp_for_jobs.data_dir_for(self.job_id))
     source_bundle = InBundle(source_dir_or_zip, bundle_temp_dir)
     # Configure the validation to come, directives.
     import_how = ImportHow(self.prj_id, self.req.update_mode, mapping,
                            self.req.skip_existing_objects, loaded_files)
     if self.req.skip_loaded_files:
         import_how.compute_skipped(source_bundle, logger)
     # A structure to collect validation result
     import_diag = ImportDiagnostic()
     if not self.req.skip_existing_objects:
         with CodeTimer(
                 "collect_existing: Existing images for %d: " % self.prj_id,
                 logger):
             import_diag.existing_objects_and_image = Image.fetch_existing_images(
                 self.session, self.prj_id)
     import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
     # Do the bulk job of validation
     nb_rows = source_bundle.validate_import(
         import_how, import_diag, self.session,
         self.report_validation_progress)
     return import_how, import_diag, nb_rows
Example #7
0
 def get_free_fields(
         cls,
         mapped: Any,  # TODO: Should be 'MappedEntity'
         field_list: List[str],
         field_types: List[Any],
         field_absent_vals: List[Any]) -> List[Any]:
     """
         Get free fields _values_ for the mapped entity, inside the project
         :param field_list: The names of the free columns for which value is returned
         :param field_types: The basic types e.g. int or float. Used as converters/verifiers e.g: float()
         :param field_absent_vals: The values used as marker for 'absent'. 'nan' for float.
     """
     assert len(field_list) == len(field_types) == len(field_absent_vals)
     mapping = ProjectMapping().load_from_project(
         cls.PROJECT_ACCESSOR(mapped))
     mapped_cols = getattr(mapping,
                           cls.MAPPING_IN_PROJECT).find_tsv_cols(field_list)
     if len(mapped_cols) != len(field_list):
         raise TypeError("at least one of %s free column is absent" %
                         field_list)
     # OK we have the real columns, get the values
     errs, ret = cls._get_values(mapped, mapped_cols, field_types,
                                 field_absent_vals)
     if len(errs) > 0:
         raise TypeError(",".join(errs))
     return list(ret.values())
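
# Hedged usage sketch: the call below mirrors the real one visible in the
# biovolume computation of Example #22; it requires a mapped entity from the
# application, so it is shown commented out rather than as runnable code.
#   pixel_size, = ProcessBO.get_free_fields(
#       an_acquis.process, ["particle_pixel_size_mm"], [float], [None])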
Example #8
0
 def get_computed_var(
         cls,
         mapped: Any,  # TODO: Should be 'MappedEntity'
         var: ProjectVar,
         mapping: Optional[ProjectMapping] = None,
         constants: Optional[Dict] = None) -> Any:
     """
         For given mapped entity, return the result of evaluating the formula (which returns a variable).
     """
     if mapping is None:
         mapping = ProjectMapping().load_from_project(
             cls.PROJECT_ACCESSOR(mapped))
     # Filter what is known in mapping.
     mapped_cols = getattr(mapping, cls.MAPPING_IN_PROJECT).find_tsv_cols(
         var.variable_names)
     types = [float] * len(mapped_cols)
     absent = [None] * len(mapped_cols)
     # Data extraction and formula evaluation
     _errs, var_vals = cls._get_values(mapped,
                                       mapped_cols,
                                       field_types=types,
                                       field_absent_vals=absent)
     if constants is not None:
         var_vals.update(constants)
     try:
         import math
         ret = eval(var.code, {"math": math}, var_vals)
         if not var.is_valid(ret):
             raise TypeError("Not valid %s: %s" % (var.formula, str(ret)))
     except Exception as e:
         # Basically anything can happen here
         raise TypeError(str(e))
     return ret
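
# Self-contained sketch of the evaluation mechanism above: the formula runs with
# globals limited to the 'math' module and locals made of the extracted values
# plus constants. The formula and values here are illustrative, not a real ProjectVar.
import math

formula = "4.0 / 3.0 * math.pi * (esd / 2) ** 3"
var_vals = {"esd": 0.25, "pixel_size": 0.01}  # extracted values + constants
biovolume = eval(formula, {"math": math}, var_vals)
assert biovolume > 0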
Example #9
0
 def do_intra_step_1(self, loaded_files):
      # The mapping to custom columns, either empty or from previous import operations on the same project.
     custom_mapping = ProjectMapping().load_from_project(self.prj)
     # Source bundle construction
     source_bundle = InBundle(
         self.source_dir_or_zip,
         Path(self.temp_for_task.data_dir_for(self.task_id)))
     # Configure the validation to come, directives.
     import_how = ImportHow(self.prj_id, self.req.update_mode,
                            custom_mapping, self.req.skip_existing_objects,
                            loaded_files)
     if self.req.skip_loaded_files:
         import_how.compute_skipped(source_bundle, logger)
     # A structure to collect validation result
     import_diag = ImportDiagnostic()
     if not self.req.skip_existing_objects:
         with CodeTimer(
                 "do_intra_step_1: Existing images for %d: " % self.prj_id,
                 logger):
             import_diag.existing_objects_and_image = Image.fetch_existing_images(
                 self.session, self.prj_id)
     import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
     # Do the bulk job of validation
     nb_rows = source_bundle.validate_import(import_how, import_diag,
                                             self.session,
                                             self.report_progress)
     return import_how, import_diag, nb_rows
Example #10
0
 def enrich(self):
     """
         Add DB fields and relations as (hopefully more) meaningful attributes
     """
      # Decode mappings to avoid exposing internal fields
     mappings = ProjectMapping().load_from_project(self._project)
     self.obj_free_cols = mappings.object_mappings.tsv_cols_to_real
     self.sample_free_cols = mappings.sample_mappings.tsv_cols_to_real
     self.acquisition_free_cols = mappings.acquisition_mappings.tsv_cols_to_real
     self.process_free_cols = mappings.process_mappings.tsv_cols_to_real
      # Decode the text list into numerical classification IDs
     db_list = self._project.initclassiflist
     db_list = db_list if db_list else ""
     self.init_classif_list = [int(x) for x in db_list.split(",") if x.isdigit()]
     # Dispatch members by right
     by_right = {ProjectPrivilegeBO.MANAGE: self.managers,
                 ProjectPrivilegeBO.ANNOTATE: self.annotators,
                 ProjectPrivilegeBO.VIEW: self.viewers}
     a_priv: ProjectPrivilege
     # noinspection PyTypeChecker
     for a_priv in self._project.privs_for_members:
         if a_priv.user is None:  # TODO: There is a line with NULL somewhere in DB
             continue
         if not a_priv.user.active:
             continue
         # noinspection PyTypeChecker
         by_right[a_priv.privilege].append(a_priv.user)
         if 'C' == a_priv.extra:
             self.contact = a_priv.user
     return self
Example #11
0
 def get_free_fields(cls, acquis: Acquisition,
                     fields_list: List[str]) -> List[Any]:
     """ Get free fields _value_ for the acquisition. """
     mapping = ProjectMapping().load_from_project(acquis.sample.project)
     real_cols = mapping.acquisition_mappings.find_tsv_cols(fields_list)
     if len(real_cols) != len(fields_list):
         raise TypeError("free column not found")
     return [getattr(acquis, real_col) for real_col in real_cols]
Example #12
0
 def _apply_on_all(self, clazz: MappedTableTypeT, project: Project, updates: List[ColUpdate]) -> int:
     """
         Apply all updates on all impacted rows by their ID.
     """
     prj_mappings = ProjectMapping().load_from_project(project)
     tbl_mappings = prj_mappings.by_table[clazz].tsv_cols_to_real
     clean_updates = self._sanitize_updates(clazz, tbl_mappings, updates)
      # Possibly nothing is left after filtering
     if len(clean_updates) == 0:
         return 0
     return self._do_updates(clazz, clean_updates)
Example #13
0
 def query(self, current_user_id: Optional[int], acquisition_id: AcquisitionIDT) -> Optional[AcquisitionBO]:
     ret = AcquisitionBO(self.ro_session, acquisition_id)
     if not ret.exists():
         return None
     assert ret.acquis is not None
     # Security check
     if current_user_id is None:
         project = RightsBO.anonymous_wants(self.ro_session, Action.READ, ret.acquis.sample.projid)
     else:
         _user, project = RightsBO.user_wants(self.session, current_user_id, Action.READ, ret.acquis.sample.projid)
     mappings = ProjectMapping().load_from_project(project)
     ret.map_free_columns(mappings.acquisition_mappings)
     return ret
Example #14
0
 def query(self, current_user_id: Optional[int], object_id: ObjectIDT) -> Optional[ObjectBO]:
     ret = ObjectBO(self.ro_session, object_id)
     if not ret.exists():
         return None
     # Security check
     projid = ret.header.acquisition.sample.projid
     if current_user_id is None:
         project = RightsBO.anonymous_wants(self.session, Action.READ, projid)
     else:
         _user, project = RightsBO.user_wants(self.session, current_user_id, Action.READ, projid)
     assert project is not None
     mappings = ProjectMapping().load_from_project(project)
     ret.map_free_columns(mappings.object_mappings)
     return ret
Example #15
0
 def query(self, current_user_id: Optional[UserIDT], sample_id: SampleIDT) -> Optional[SampleBO]:
     ret = SampleBO(self.ro_session, sample_id)
     if not ret.exists():
         return None
     assert ret.sample is not None
      assert ret.sample.projid is not None  # TODO: Why is this needed?
     # Security check
     if current_user_id is None:
         project = RightsBO.anonymous_wants(self.ro_session, Action.READ, ret.sample.projid)
     else:
         _user, project = RightsBO.user_wants(self.session, current_user_id, Action.READ, ret.sample.projid)
     mappings = ProjectMapping().load_from_project(project)
     ret.map_free_columns(mappings.sample_mappings)
     return ret
Example #16
0
    def do_import(self):
        """
            Do the real job, i.e. copy files while creating records.
        """
        errors = []
        self.manage_uploaded()
        self.unzip_if_needed()
        # Use a Bundle
        source_bundle = InBundle(
            self.source_dir_or_zip,
            Path(self.temp_for_task.data_dir_for(self.task_id)))
        # Clean it, in case the ZIP contains TSVs
        source_bundle.remove_all_tsvs()
        images = source_bundle.list_image_files()
        # Configure the import to come, destination
        db_writer = DBWriter(self.session)
        import_where = ImportWhere(
            db_writer, self.vault,
            self.temp_for_task.base_dir_for(self.task_id))
        # Configure the import to come, directives
        import_how = ImportHow(prj_id=self.prj_id,
                               update_mode="",
                               custom_mapping=ProjectMapping(),
                               skip_object_duplicates=False,
                               loaded_files=[])
        import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))
        # Generate TSV
        req_values = self.req.values
        if req_values.get(SimpleImportFields.userlb, ""):
            import_how.found_users["user"] = {
                "id": req_values.get(SimpleImportFields.userlb)
            }
            req_values[SimpleImportFields.userlb] = "user"
        if req_values.get(SimpleImportFields.status, ""):
            req_values[SimpleImportFields.status] = classif_qual.get(
                req_values[SimpleImportFields.status], "")
        self.make_tsv(source_bundle, images)
        # Import
        nb_image_files = len(images)
        nb_images = source_bundle.do_import(import_where, import_how,
                                            nb_image_files,
                                            self.report_progress)
        self.session.commit()

        # Recompute stats and so on
        ProjectBO.do_after_load(self.session, self.prj_id)
        self.session.commit()

        ret = SimpleImportRsp(errors=errors, nb_images=nb_images)
        return ret
Example #17
0
 def get_all_num_columns_values(self, session: Session):
     """
         Get all numerical free fields values for all objects in a project.
     """
     from DB.helpers.ORM import MetaData
     metadata = MetaData(bind=session.get_bind())
     # TODO: Cache in a member
     mappings = ProjectMapping().load_from_project(self._project)
      num_fields_cols = {col for col in mappings.object_mappings.tsv_cols_to_real.values()
                         if col[0] == 'n'}
     obj_fields_tbl = minimal_table_of(metadata, ObjectFields, num_fields_cols, exact_floats=True)
     qry: Query = session.query(Project)
     qry = qry.join(Project.all_samples).join(Sample.all_acquisitions).join(Acquisition.all_objects)
     qry = qry.join(obj_fields_tbl, ObjectHeader.objid == obj_fields_tbl.c.objfid)
     qry = qry.filter(Project.projid == self._project.projid)
     qry = qry.order_by(Acquisition.acquisid)
     qry = qry.with_entities(Acquisition.acquisid, Acquisition.orig_id, obj_fields_tbl)
     return qry.all()
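
# The physical free columns follow a naming convention that the filter above
# relies on: numerical ones start with 'n' (e.g. 'n01'). The mapping values
# below are illustrative only; other prefixes are assumed to denote non-numerical columns.
tsv_cols_to_real = {"area": "n01", "comment": "t02", "esd": "n03"}
num_cols = {col for col in tsv_cols_to_real.values() if col[0] == 'n'}
assert num_cols == {"n01", "n03"}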
Example #18
0
 def dispatch_fields_by_table(custom_mapping: ProjectMapping,
                              field_set: Set) -> Dict[str, Set]:
     """
         Build a set of target DB columns by target table.
      :param custom_mapping: the project's custom mapping, used for non-predefined (free) fields.
      :param field_set: the TSV field names to dispatch.
      :return: a dict of target table alias -> set of target DB column names.
     """
     ret: Dict[str, Set] = {
         alias: set()
         for alias in GlobalMapping.TARGET_CLASSES.keys()
     }
     for a_field in field_set:
         mapping = GlobalMapping.PREDEFINED_FIELDS.get(a_field)
         if not mapping:
             mapping = custom_mapping.search_field(a_field)
             assert mapping is not None
         target_tbl = mapping["table"]
         target_fld = mapping["field"]
         ret[target_tbl].add(target_fld)
     return ret
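
# Self-contained illustration of the dispatch above, with hypothetical predefined
# and custom mappings standing in for GlobalMapping.PREDEFINED_FIELDS and the
# project's ProjectMapping. Table and column names are illustrative.
PREDEFINED_FIELDS = {"object_lat": {"table": "obj_head", "field": "latitude"}}
CUSTOM_FIELDS = {"area": {"table": "obj_field", "field": "n01"}}

def dispatch(field_set):
    ret = {"obj_head": set(), "obj_field": set()}
    for a_field in field_set:
        a_mapping = PREDEFINED_FIELDS.get(a_field) or CUSTOM_FIELDS.get(a_field)
        assert a_mapping is not None, a_field
        ret[a_mapping["table"]].add(a_mapping["field"])
    return ret

assert dispatch({"object_lat", "area"}) == {"obj_head": {"latitude"},
                                            "obj_field": {"n01"}}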
Example #19
0
    def create_tsv(self, src_project: Project,
                   end_progress: int) -> Tuple[int, int]:
        """
            Create the TSV file.
        """
        req = self.req
        proj_id = src_project.projid
        self.update_progress(1, "Start TSV export")
        progress_range = end_progress - 1

        # Get a fast upper-bound count of the rows to process
        count_sql = "SELECT SUM(nbr) AS cnt FROM projects_taxo_stat WHERE projid = :prj"
        res = self.ro_session.execute(text(count_sql), {"prj": proj_id})
        obj_count = res.first()[0]

        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(
            self.ro_session, proj_id, self.filters)

        # Backup or not, the column names are taken from the common mapping
        # @See Mapping.py
        # TSV column order
        # field_order = ["object_id", "object_lat", "object_lon", "object_date", "object_time", "object_depth_max",
        #                "object_annotation_status", "object_annotation_person_name", "object_annotation_person_email",
        #                "object_annotation_date", "object_annotation_time", "object_annotation_category"]
        # formats = {"object_date": "TO_CHAR({0},'YYYYMMDD')",
        #            "object_time": "TO_CHAR({0},'HH24MISS')",
        #            "object_annotation_date": "TO_CHAR({0},'YYYYMMDD')",
        #            "object_annotation_time": "TO_CHAR({0},'HH24MISS')",
        #            "object_annotation_status": """
        #                  CASE {0}
        #                     WHEN 'V' then 'validated'
        #                     WHEN 'P' then 'predicted'
        #                     WHEN 'D' then 'dubious'
        #                     ELSE {0}
        #                  END
        #            """
        #            }
        # prefices = {ObjectHeader.__tablename__: "obh",
        #             }
        # for a_fld in field_order:
        #     mpg = GlobalMapping.PREDEFINED_FIELDS[a_fld]
        #     mpg[""]
        #     assert a_fld in GlobalMapping.PREDEFINED_FIELDS, "%s is not a mapped column" % a_fld
        date_fmt, time_fmt = "YYYYMMDD", "HH24MISS"
        if req.format_dates_times:
            date_fmt, time_fmt = "YYYY-MM-DD", "HH24:MI:SS"

        select_clause = "select "

        if req.with_images or (req.exp_type == ExportTypeEnum.backup):
            select_clause += "img.orig_file_name AS img_file_name, img.imgrank AS img_rank"
            if req.with_images:
                select_clause += ", img.file_name AS img_src_path"
            select_clause += ",\n"

        select_clause += """obh.orig_id AS object_id, obh.latitude AS object_lat, obh.longitude AS object_lon,
                         TO_CHAR(obh.objdate,'{0}') AS object_date,
                         TO_CHAR(obh.objtime,'{1}') AS object_time,
                         obh.object_link, obh.depth_min AS object_depth_min, obh.depth_max AS object_depth_max,
                         CASE obh.classif_qual 
                            WHEN 'V' then 'validated' 
                            WHEN 'P' then 'predicted' 
                            WHEN 'D' then 'dubious' 
                            ELSE obh.classif_qual 
                         END AS object_annotation_status,                
                         usr.name AS object_annotation_person_name, usr.email AS object_annotation_person_email,
                         TO_CHAR(obh.classif_when,'{0}') AS object_annotation_date,
                         TO_CHAR(obh.classif_when,'{1}') AS object_annotation_time,                
                         txo.display_name AS object_annotation_category 
                    """.format(date_fmt, time_fmt)
        if req.exp_type == ExportTypeEnum.backup:
            select_clause += ", txo.id AS object_annotation_category_id"
        else:
            select_clause += "," + TaxonomyBO.parents_sql(
                "obh.classif_id") + " AS object_annotation_hierarchy"

        if 'C' in req.tsv_entities:
            select_clause += "\n, obh.complement_info"

        # Deal with mappings, the goal is to emit SQL which will reconstitute the TSV structure
        src_mappings = ProjectMapping().load_from_project(src_project)
        if 'O' in req.tsv_entities:
            select_clause += "\n " + src_mappings.object_mappings.as_select_list(
                "obf")

        if 'S' in req.tsv_entities:
            select_clause += "\n, sam.orig_id AS sample_id, sam.dataportal_descriptor AS sample_dataportal_descriptor "
            select_clause += src_mappings.sample_mappings.as_select_list("sam")

        if 'P' in req.tsv_entities:
            select_clause += "\n, prc.orig_id AS process_id "
            select_clause += src_mappings.process_mappings.as_select_list(
                "prc")

        if 'A' in req.tsv_entities:
            select_clause += "\n, acq.orig_id AS acq_id, acq.instrument AS acq_instrument "
            select_clause += src_mappings.acquisition_mappings.as_select_list(
                "acq")

        if req.exp_type == ExportTypeEnum.dig_obj_ident:
            select_clause += "\n, obh.objid"

        if req.with_internal_ids:
            select_clause += """\n, obh.objid, 
                    obh.acquisid AS processid_internal, obh.acquisid AS acq_id_internal, 
                    sam.sampleid AS sample_id_internal, 
                    obh.classif_id, obh.classif_who, obh.classif_auto_id, txp.name classif_auto_name, 
                    obh.classif_auto_score, obh.classif_auto_when,
                    obh.random_value object_random_value, obh.sunpos object_sunpos """
            if 'S' in req.tsv_entities:
                select_clause += "\n, sam.latitude sample_lat, sam.longitude sample_long "

        # TODO: The condition on o.projid=1 in historical code below prevents any data production
        # if 'H' in req.tsv_entities:
        #     sql1 += " , oh.classif_date AS histoclassif_date, classif_type AS histoclassif_type, " \
        #             "to3.name histoclassif_name, oh.classif_qual histoclassif_qual,uo3.name histoclassif_who, " \
        #             "classif_score histoclassif_score"
        #     sql2 += """ LEFT JOIN (select o.objid, classif_date, classif_type, och.classif_id,
        #                                   och.classif_qual, och.classif_who, classif_score
        #                              from objectsclassifhisto och
        #                              join objects o on o.objid=och.objid and o.projid=1 {0}
        #                            union all
        #                            select o.objid, o.classif_when classif_date, 'C' classif_type, classif_id,
        #                                   classif_qual, classif_who, NULL
        #                              from objects o {0} where o.projid=1
        #                           ) oh on o.objid=oh.objid
        #                 LEFT JOIN taxonomy to3 on oh.classif_id=to3.id
        #                 LEFT JOIN users uo3 on oh.classif_who=uo3.id
        #             """.format(samplefilter)

        order_clause = OrderClause()
        if req.split_by == "sample":
            order_clause.add_expression("sam", "orig_id")
            split_field = "sample_id"  # AKA sam.orig_id, but renamed in select list
        elif req.split_by == "taxo":
            select_clause += "\n, txo.display_name AS taxo_parent_child "
            order_clause.add_expression(None, "taxo_parent_child")
            split_field = "taxo_parent_child"
        else:
            order_clause.add_expression("sam", "orig_id")
            split_field = "object_id"  # cette valeur permet d'éviter des erreurs plus loin dans r[split_field]
        order_clause.add_expression("obh", "objid")

        if req.with_images or (req.exp_type == ExportTypeEnum.backup):
            order_clause.add_expression(None, "img_rank")

        # Base SQL comes from filters
        from_, where, params = object_set.get_sql(
            self._get_owner_id(),
            order_clause,
            select_clause,
            all_images=not req.only_first_image)
        sql = select_clause + " FROM " + from_.get_sql() + where.get_sql(
        ) + order_clause.get_sql()
        logger.info("Execute SQL : %s" % sql)
        logger.info("Params : %s" % params)

        res = self.ro_session.execute(text(sql), params)

        now_txt = DateTime.now_time().strftime("%Y%m%d_%H%M")
        self.out_file_name = "export_{0:d}_{1:s}.{2}".format(
            proj_id, now_txt, "zip")

        produced_path = self.out_path / self.out_file_name
        zfile = zipfile.ZipFile(produced_path,
                                'w',
                                allowZip64=True,
                                compression=zipfile.ZIP_DEFLATED)

        splitcsv = (req.split_by != "")
        csv_filename = 'data.tsv'  # Just a temp name as there is a rename while filling up the Zip
        if splitcsv:
            # Always produce into the same temp file; at zipping time the name in the archive will vary
            prev_value = "NotAssigned"  # To trigger a sequence change immediately
        else:
            # The zip will contain a single TSV with same base name as the zip
            prev_value = self.out_file_name.replace('.zip', '')

        csv_path: Path = self.out_path / csv_filename  # Constant path to a (sometimes) changing file
        csv_fd: Optional[IO] = None
        csv_wtr = None

        # Store the images to save in a separate CSV. Useless if not exporting images but who cares.
        temp_img_file = self.out_path / "images.csv"
        img_file_fd = open(temp_img_file, 'w')
        img_wtr = csv.DictWriter(img_file_fd, ["src_path", "dst_path"],
                                 delimiter='\t',
                                 quotechar='"',
                                 lineterminator='\n')
        img_wtr.writeheader()

        # Prepare TSV structure
        col_descs = [
            a_desc for a_desc in res.cursor.description
            if a_desc.name != "img_src_path"
        ]
        # Read the latitude column to get the DB type code used for floats
        for a_desc in col_descs:
            if a_desc.name == "object_lat":
                db_float_type = a_desc.type_code
                break
        else:
            raise RuntimeError("No object_lat column found, cannot determine the DB float type")
        float_cols = set()
        # Prepare float separator conversion, if not required the set will just be empty
        if req.coma_as_separator:
            for a_desc in col_descs:
                if a_desc.type_code == db_float_type:
                    float_cols.add(a_desc.name)

        tsv_cols = [a_desc.name for a_desc in col_descs]
        tsv_types_line = {
            name: ('[f]' if a_desc.type_code == db_float_type else '[t]')
            for name, a_desc in zip(tsv_cols, col_descs)
        }
        nb_rows = 0
        nb_images = 0
        used_dst_pathes = set()
        for r in res:
            # Rows from SQLAlchemy are not mutable, so we need a clone for arranging values
            a_row = dict(r)
            if ((splitcsv and (prev_value != a_row[split_field])
                 )  # At each split column values change
                    or (nb_rows == 0)):  # And anyway for the first row
                # Start of a sequence, and possibly the end of the previous one
                if csv_fd:
                    csv_fd.close()  # Close previous file
                    self.store_csv_into_zip(zfile, prev_value, csv_path)
                if splitcsv:
                    prev_value = a_row[split_field]
                logger.info("Writing into file %s", csv_path)
                csv_fd = open(csv_path, 'w', encoding='latin_1')
                csv_wtr = csv.DictWriter(csv_fd,
                                         tsv_cols,
                                         delimiter='\t',
                                         quotechar='"',
                                         lineterminator='\n',
                                         quoting=csv.QUOTE_NONNUMERIC)
                csv_wtr.writeheader()
                if req.exp_type == ExportTypeEnum.backup:
                    # Write types line for backup type
                    csv_wtr.writerow(tsv_types_line)
            if req.with_images:
                copy_op = {"src_path": a_row.pop("img_src_path")}
                if req.exp_type == ExportTypeEnum.dig_obj_ident:
                    # Images will be stored in a per-category directory, but there is a single TSV at the Zip root
                    categ = a_row['object_annotation_category']
                    # Not all names can directly become directory names
                    a_row['img_file_name'] = self.get_DOI_imgfile_name(
                        a_row['objid'], a_row['img_rank'], categ,
                        a_row['img_file_name'])
                    copy_op["dst_path"] = a_row['img_file_name']
                else:  # It's a backup
                    # Images are stored in the Zip subdirectory per sample/taxo, i.e. at the same place as
                    # their referring TSV
                    dst_path = "{0}/{1}".format(prev_value,
                                                a_row['img_file_name'])
                    if dst_path in used_dst_pathes:
                        # Avoid duplicates in zip as only the last entry will be present during unzip
                        # root cause: for UVP6 bundles, the vignette and original image are both stored
                        # with the same name.
                        img_with_rank = "{0}/{1}".format(
                            a_row['img_rank'], a_row['img_file_name'])
                        a_row['img_file_name'] = img_with_rank  # Write the corrected path into the TSV
                        dst_path = prev_value + "/" + img_with_rank
                    used_dst_pathes.add(dst_path)
                    copy_op["dst_path"] = dst_path
                img_wtr.writerow(copy_op)
                nb_images += 1
            # Remove CR from comments
            if 'C' in req.tsv_entities and a_row['complement_info']:
                a_row['complement_info'] = ' '.join(
                    a_row['complement_info'].splitlines())
            # Replace decimal separator
            for cname in float_cols:
                if a_row[cname] is not None:
                    a_row[cname] = str(a_row[cname]).replace('.', ',')
            assert csv_wtr is not None
            # Produce the row in the TSV
            csv_wtr.writerow(a_row)
            nb_rows += 1
            if nb_rows % self.ROWS_REPORT_EVERY == 0:
                msg = "Row %d of max %d" % (nb_rows, obj_count)
                logger.info(msg)
                self.update_progress(1 + progress_range / obj_count * nb_rows,
                                     msg)
        if csv_fd:
            csv_fd.close()  # Close last file
            self.store_csv_into_zip(zfile, prev_value, csv_path)
        logger.info("Extracted %d rows", nb_rows)
        img_file_fd.close()
        if zfile:
            zfile.close()
        return nb_rows, nb_images
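
# Stand-alone sketch of the control-break pattern used above for splitting TSVs:
# rows arrive sorted on the split field and a new output is started whenever its
# value changes. File handling is simplified to in-memory buffers; rows are illustrative.
import csv
import io

rows = [{"sample_id": "s1", "object_id": "o1"},
        {"sample_id": "s1", "object_id": "o2"},
        {"sample_id": "s2", "object_id": "o3"}]
chunks = {}
prev_value, wtr = "NotAssigned", None
for a_row in rows:
    if prev_value != a_row["sample_id"]:
        prev_value = a_row["sample_id"]
        buf = chunks.setdefault(prev_value, io.StringIO())
        wtr = csv.DictWriter(buf, ["sample_id", "object_id"], delimiter='\t')
        wtr.writeheader()
    wtr.writerow(a_row)
assert sorted(chunks) == ["s1", "s2"]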
Example #20
0
class JsonDumper(Service):
    """
        Dump a project's content in JSON form, with filters applied.
            Mapped columns are rendered with their user-visible (TSV) name.
            Fields are reduced to names of at most 4 letters, to save bandwidth and for independence from DB field names.
        TODO as an option:
            No numeric primary or foreign key is present so the output can be diff-ed.
            The sub-entities are ordered by their 'natural' key, e.g. sample_id for samples.
    """

    def __init__(self, current_user: int, prj_id: int, filters: ProjectFilters):
        super().__init__()
        self.requester_id = current_user
        self.filters = filters
        prj = self.session.query(Project).get(prj_id)
        # We don't check for existence
        self.prj: Project = prj
        # Work vars
        self.mapping = ProjectMapping()
        if self.prj:
            self.mapping.load_from_project(self.prj)
        self.ids_to_dump: List[int] = []
        self.already_dumped: Set = set()
        self.first_query = True

    def run(self, out_stream: IO):
        """
            Produce the json into given stream.
        """
        if self.prj is None:
            to_stream = {}
        else:
            self._find_what_to_dump()
            # TODO: The result seems unneeded so far, we just need the stuff loaded in SQLA session
            _to_dump = self._db_fetch(self.ids_to_dump)
            # prj = to_dump[0][0]
            if len(self.ids_to_dump) == 0 or len(_to_dump) == 0:
                to_stream = {}
            else:
                to_stream = self.dump_row(out_stream, self.prj)
        json.dump(obj=to_stream, fp=out_stream, indent="  ")

    def dump_row(self, out_stream: IO, a_row: Model) -> Dict[str, Any]:
        """
            Dump inside returned value the fields and contained/linked entities from a_row.
        """
        ret: Dict = {}
        self._dump_into_dict(out_stream, a_row, ret)
        return ret

    def _dump_into_dict(self, out_stream: IO, a_row: Model, tgt_dict: Dict):
        """
            Dump inside the tgt_dict all fields and contained/linked entities from a_row.
        """
        # Ensure there is no infinite loop
        assert a_row not in self.already_dumped
        self.already_dumped.add(a_row)
        # Dump using instructions
        row_class = type(a_row)
        desc: JSONDesc = JSON_FIELDS[row_class]
        for a_field_or_relation, how in desc.items():
            fld_name = a_field_or_relation.key
            # This is where SQLAlchemy does all its magic when it's a relation
            attr = getattr(a_row, fld_name)
            if isinstance(attr, list):
                # Serialize the list of child entities, ordinary relationship
                children: List[Dict] = []
                tgt_dict[how] = children
                for a_child_row in sorted(attr):
                    child_obj = self.dump_row(out_stream, a_child_row)
                    children.append(child_obj)
            elif isinstance(attr, Model):
                # A twin object AKA 'uselist=False' relationship
                if isinstance(attr, Process):
                    # Keep process in single-entity list for now
                    child_obj = self.dump_row(out_stream, attr)
                    tgt_dict[how] = [child_obj]
                else:
                    # Dump into same output dict.
                    self._dump_into_dict(out_stream, attr, tgt_dict)
            else:
                # Ordinary field
                if attr is not None:
                    tgt_dict[how] = attr
        # Dump mapped fields if any
        tbl_mapping = self.mapping.by_table_name.get(row_class.__tablename__)
        if tbl_mapping is not None:
            for a_db_col, a_tsv_col in tbl_mapping.real_cols_to_tsv.items():
                attr = getattr(a_row, a_db_col)
                if attr is not None:
                    tgt_dict[a_tsv_col] = attr

    def _find_what_to_dump(self) -> None:
        """
            Determine the objects to dump.
        """
        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(self.session, self.prj.projid, self.filters)
        from_, where, params = object_set.get_sql(self.requester_id)

        sql = """ SELECT objid FROM """ + from_.get_sql() + where.get_sql()

        logger.info("SQL=%s", sql)
        logger.info("SQLParam=%s", params)

        with CodeTimer("Get IDs:", logger):
            res: ResultProxy = self.session.execute(sql, params)
        ids = [r['objid'] for r in res]

        logger.info("NB OBJIDS=%d", len(ids))

        self.ids_to_dump = ids

    def _db_fetch(self, objids: List[int]) -> List[DBObjectTupleT]:
        """
            Do a DB read of given objects, with auxiliary objects.
            Thanks to 'contains_eager' calls, the objects are loaded into SQLAlchemy session.
            :param objids: the IDs of the objects to read.
            :return: one tuple per object and image, i.e. as many rows as images.
        """
        ret: Query = self.session.query(Project, Sample, Acquisition, Process, ObjectHeader, ObjectFields, Image)
        ret = ret.join(Sample, Project.all_samples).options(contains_eager(Project.all_samples))
        ret = ret.join(Acquisition, Sample.all_acquisitions)
        ret = ret.join(Process, Acquisition.process)
        ret = ret.join(ObjectHeader, Acquisition.all_objects)
        # Natural joins
        ret = ret.join(ObjectFields)
        ret = ret.join(Image, ObjectHeader.all_images).options(contains_eager(ObjectHeader.all_images))
        ret = ret.filter(ObjectHeader.objid == any_(objids))
        ret = ret.order_by(ObjectHeader.objid)
        ret = ret.order_by(Image.imgrank)

        if self.first_query:
            logger.info("Query: %s", str(ret))
            self.first_query = False

        with CodeTimer("Get Objects:", logger):
            objs = ret.all()

        # We get as many lines as images
        logger.info("NB ROWS JOIN=%d", len(objs))

        return objs
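
# Hedged usage sketch: JsonDumper is a Service, so it needs the application's
# DB session and an existing project; the values below are placeholders and the
# call is therefore left commented out.
#   with open("project_dump.json", "w") as fd:
#       JsonDumper(current_user=a_user_id, prj_id=a_prj_id,
#                  filters=a_project_filters).run(fd)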
Example #21
0
 def __init__(self, prj_id: int, req: ImportRealReq):
     super().__init__(prj_id, req)
     # Transcode from serialized form
     self.custom_mapping = ProjectMapping().load_from_dict(req.mappings)
Example #22
0
    def aggregate_for_sample(
            self, sample: Sample) -> Dict[ClassifIDT, AggregForTaxon]:
        """
            Do the aggregations for the sample for each taxon and return them, they will become emofs
                - 'Abundance' -> CountOfBiologicalEntity -> count of objects group by taxon
                - 'Concentration' -> AbundancePerUnitVolumeOfTheWaterBody
                    -> sum(individual_concentration) group by taxon
                        with individual_concentration = 1 / subsample_coef / total_water_volume
                - 'Biovolume' -> BiovolumeOfBiologicalEntity -> sum(individual_biovolume) group by taxon
                    with individual_biovolume = individual_volume / subsample_coef / total_water_volume
            The abundance can always be computed. The other two depend on the availability of values
            in the project and on the configuration variable.
        """
        # We return all per taxon.
        ret: Dict[ClassifIDT, EMODnetExport.AggregForTaxon] = {}

        count_per_taxon_per_acquis: Dict[AcquisitionIDT, Dict[ClassifIDT,
                                                              int]] = {}

        # Start with abundances: a simple count, which also provides the keys of the returned dict.
        acquis_for_sample = SampleBO.get_acquisitions(self.session, sample)
        for an_acquis in acquis_for_sample:
            # Get counts for acquisition (subsample)
            count_per_taxon_for_acquis: Dict[
                ClassifIDT,
                int] = AcquisitionBO.get_sums_by_taxon(self.session,
                                                       an_acquis.acquisid)
            if self.auto_morpho:
                self.add_morpho_counts(count_per_taxon_for_acquis)
            count_per_taxon_per_acquis[
                an_acquis.acquisid] = count_per_taxon_for_acquis
            for an_id, count_4_acquis in count_per_taxon_for_acquis.items():
                aggreg_for_taxon = ret.get(an_id)
                if aggreg_for_taxon is None:
                    ret[an_id] = self.AggregForTaxon(count_4_acquis, None,
                                                     None)
                else:
                    aggreg_for_taxon.abundance += count_4_acquis

        if not self.with_computations:
            return ret

        # Enrich with concentrations
        subsampling_coeff_per_acquis: Dict[AcquisitionIDT, float] = {}
        try:
            # Fetch calculation data at sample level
            sample_volume = SampleBO.get_computed_var(
                sample, DefaultVars.volume_sampled)
        except TypeError as e:
            self.warnings.append(
                "Could not compute volume sampled from sample %s (%s),"
                " no concentration or biovolume will be computed." %
                (sample.orig_id, str(e)))
            sample_volume = -1
        if sample_volume > 0:
            # Cumulate for subsamples AKA acquisitions
            for an_acquis in acquis_for_sample:
                try:
                    subsampling_coefficient = AcquisitionBO.get_computed_var(
                        an_acquis, DefaultVars.subsample_coeff)
                    subsampling_coeff_per_acquis[
                        an_acquis.acquisid] = subsampling_coefficient
                except TypeError as e:
                    self.warnings.append(
                        "Could not compute subsampling coefficient from acquisition %s (%s),"
                        " no concentration or biovolume will be computed" %
                        (an_acquis.orig_id, str(e)))
                    logger.info(
                        "concentrations: no subsample coeff for '%s' (%s)",
                        an_acquis.orig_id, str(e))
                    continue
                # Get counts for acquisition (sub-sample)
                logger.info("computing concentrations for '%s'",
                            an_acquis.orig_id)
                count_per_taxon_for_acquis = count_per_taxon_per_acquis[
                    an_acquis.acquisid]
                for an_id, count_4_acquis in count_per_taxon_for_acquis.items(
                ):
                    aggreg_for_taxon = ret[an_id]
                    concentration_for_taxon = count_4_acquis / subsampling_coefficient / sample_volume
                    if aggreg_for_taxon.concentration is None:
                        aggreg_for_taxon.concentration = 0
                    aggreg_for_taxon.concentration += concentration_for_taxon

        # Enrich with biovolumes. This needs a computation for each object, so it's likely to be slow.
        if sample_volume > 0:
            # Mappings are constant for the sample
            # noinspection PyTypeChecker
            mapping = ProjectMapping().load_from_project(sample.project)
            # Cumulate for subsamples AKA acquisitions
            for an_acquis in acquis_for_sample:
                subsampling_coefficient = subsampling_coeff_per_acquis.get(
                    an_acquis.acquisid)
                if subsampling_coefficient is None:
                    logger.info("biovolumes: no subsample coeff for '%s'",
                                an_acquis.orig_id)
                    continue
                # Get the pixel size from the associated process; it is a constant across individual biovolume computations
                try:
                    pixel_size, = ProcessBO.get_free_fields(
                        an_acquis.process, ["particle_pixel_size_mm"], [float],
                        [None])
                except TypeError as _e:
                    logger.info("biovolumes: no pixel size for '%s'",
                                an_acquis.orig_id)
                    continue
                constants = {"pixel_size": pixel_size}
                # Get all objects for the acquisition. The filter on classif_id is useless for now.
                with CodeTimer("Objects IDs for '%s': " % an_acquis.orig_id,
                               logger):
                    acq_object_ids = AcquisitionBO.get_all_object_ids(
                        session=self.session,
                        acquis_id=an_acquis.acquisid,
                        classif_ids=list(ret.keys()))
                with CodeTimer("Objects for '%s': " % an_acquis.orig_id,
                               logger):
                    objects = ObjectBOSet(self.ro_session, acq_object_ids,
                                          mapping.object_mappings)
                nb_biovols = 0
                for an_obj in objects.all:
                    # Compute a biovolume if possible: try the ellipsoidal formula,
                    # and fall back to the spherical one below if it cannot be computed
                    try:
                        biovol = ObjectBO.get_computed_var(
                            an_obj, DefaultVars.equivalent_ellipsoidal_volume,
                            mapping, constants)
                    except TypeError as _e:
                        biovol = -1
                    if biovol == -1:
                        try:
                            biovol = ObjectBO.get_computed_var(
                                an_obj,
                                DefaultVars.equivalent_spherical_volume,
                                mapping, constants)
                        except TypeError as _e:
                            continue
                    # Aggregate by category/taxon
                    aggreg_for_taxon = ret[an_obj.classif_id]
                    individual_biovolume = biovol / subsampling_coefficient / sample_volume
                    if aggreg_for_taxon.biovolume is None:
                        aggreg_for_taxon.biovolume = 0
                    aggreg_for_taxon.biovolume += individual_biovolume
                    # Update stats
                    nb_biovols += 1
                # A bit of display
                logger.info(
                    "%d biovolumes computed for '%s' out of %d objects",
                    nb_biovols, an_acquis.orig_id, len(acq_object_ids))

        return ret
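
# AggregForTaxon itself is not part of this excerpt; judging from how it is
# built and updated above (3-argument constructor, then abundance/concentration/
# biovolume attributes), it presumably resembles this hypothetical dataclass:
from dataclasses import dataclass
from typing import Optional

@dataclass
class AggregForTaxon:
    abundance: int
    concentration: Optional[float] = None
    biovolume: Optional[float] = None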
Example #23
0
class MergeService(Service, LogEmitter):
    """
        Merge operation, move everything from source into destination project.
    """
    def __init__(self, prj_id: int, src_prj_id: int, dry_run: bool):
        super().__init__()
        # params
        self.prj_id = prj_id
        self.src_prj_id = src_prj_id
        self.dry_run = dry_run
        # work vars
        self.remap_operations: Dict[MappedTableTypeT, List[RemapOp]] = {}
        self.dest_augmented_mappings = ProjectMapping()

    def log_file_path(self) -> str:
        return "merge_%d_in_%d.log" % (self.prj_id, self.src_prj_id)

    def run(self, current_user_id: int) -> MergeRsp:
        with LogsSwitcher(self):
            return self.do_run(current_user_id)

    def do_run(self, current_user_id: int) -> MergeRsp:
        """
            Run the service, merge the projects.
        :return: a MergeRsp with the problems found, if any.
        """
        # Security check
        RightsBO.user_wants(self.session, current_user_id, Action.ADMINISTRATE,
                            self.prj_id)
        RightsBO.user_wants(self.session, current_user_id, Action.ADMINISTRATE,
                            self.src_prj_id)
        # OK
        prj = self.session.query(Project).get(self.prj_id)
        assert prj is not None
        src_prj = self.session.query(Project).get(self.src_prj_id)
        assert src_prj is not None

        logger.info("Validating Merge of '%s'", prj.title)
        ret = MergeRsp()
        errs = self._verify_possible(prj, src_prj)
        ret.errors = errs
        # Exit if errors or dry run
        if self.dry_run or len(errs) > 0:
            return ret

        logger.info("Remaps: %s", self.remap_operations)
        # Go for real if not dry run AND len(errs) == 0
        logger.info("Starting Merge of '%s'", prj.title)
        self._do_merge(prj)
        self.session.commit()

        # Recompute stats and so on
        ProjectBO.do_after_load(self.session, prj_id=self.prj_id)
        self.session.commit()
        return ret

    def _verify_possible(self, dest_prj: Project,
                         src_prj: Project) -> List[str]:
        """
            Verify that the merge would not mean a loss of information.
                The mappings of src project should be preserved and copied into dest project.
                Augmented mappings should fit in the allowed maximum size for each entity.
            :param dest_prj: the destination project.
            :param src_prj: the source project, to be merged into the destination.
            :return: a list of problems, empty means we can proceed.
        """
        ret = []
        dest_mappings = ProjectMapping().load_from_project(dest_prj)
        src_mappings = ProjectMapping().load_from_project(src_prj)
        a_tbl: MappedTableTypeT
        for a_tbl in MAPPED_TABLES:
            mappings_for_dest_tbl = dest_mappings.by_table[a_tbl]
            mappings_for_src_tbl = src_mappings.by_table[a_tbl]
            # Compute the new mapping and any transformations needed to get there
            aug, remaps, errs = mappings_for_dest_tbl.augmented_with(
                mappings_for_src_tbl)
            ret.extend(errs)
            if len(remaps) > 0:
                self.remap_operations[a_tbl] = remaps
            # Load future mapping
            self.dest_augmented_mappings.by_table[a_tbl].load_from(aug)

        # Also check for consistency problems with unique orig_ids
        dest_parents = InBundle.fetch_existing_parents(self.ro_session,
                                                       prj_id=self.prj_id)
        src_parents = InBundle.fetch_existing_parents(self.ro_session,
                                                      prj_id=self.src_prj_id)

        for an_orig_id_container in [
                Sample.__tablename__, Acquisition.__tablename__
        ]:
            # key=orig_id value, value=full record
            dest_orig_ids = dest_parents[an_orig_id_container]
            src_orig_ids = src_parents[an_orig_id_container]
            common_orig_ids = set(dest_orig_ids.keys()).intersection(
                src_orig_ids.keys())
            if len(common_orig_ids) != 0:
                logger.info("Common %s orig_ids: %s", an_orig_id_container,
                            common_orig_ids)
            for common_orig_id in common_orig_ids:
                orm_diff = orm_equals(dest_orig_ids[common_orig_id],
                                      src_orig_ids[common_orig_id])
                if orm_diff:
                    msg = (
                        "Data conflict: %s record with orig_id '%s' is different in destination project: %s"
                        %
                        (an_orig_id_container, common_orig_id, str(orm_diff)))
                    # TODO: Should be an error?
                    logger.warning(msg)
        return ret

    def _do_merge(self, dest_prj: Project):
        """
            Real merge operation.
        """
        # Loop over involved tables and remap free columns
        for a_mapped_tbl in MAPPED_TABLES:
            remaps = self.remap_operations.get(a_mapped_tbl)
            # Do the remappings if any
            if remaps is not None:
                logger.info("Doing re-mapping in %s: %s",
                            a_mapped_tbl.__tablename__, remaps)
                ProjectBO.remap(self.session, self.src_prj_id, a_mapped_tbl,
                                remaps)

        # Collect orig_id
        dest_parents = InBundle.fetch_existing_parents(self.ro_session,
                                                       prj_id=self.prj_id)
        src_parents = InBundle.fetch_existing_parents(self.ro_session,
                                                      prj_id=self.src_prj_id)

        # Compute the needed ID mappings in order to keep orig_id uniqueness
        common_samples = self.get_ids_for_common_orig_id(
            Sample, dest_parents, src_parents)
        common_acquisitions = self.get_ids_for_common_orig_id(
            Acquisition, dest_parents, src_parents)

        # Align foreign keys, to Project, Sample and Acquisition
        for a_fk_to_proj_tbl in [
                Sample, Acquisition, ObjectHeader, ParticleProject
        ]:
            upd: Query = self.session.query(a_fk_to_proj_tbl)
            if a_fk_to_proj_tbl == Sample:
                # Move (i.e. change project) samples which are 'new' from merged project,
                #    so take all of them from src project...
                upd = upd.filter(
                    a_fk_to_proj_tbl.projid == self.src_prj_id)  # type: ignore
                # ...but not the ones with same orig_id, which are presumably equal.
                upd = upd.filter(
                    Sample.sampleid != all_(list(common_samples.keys())))
                # And update the column
                upd_values = {'projid': self.prj_id}
            elif a_fk_to_proj_tbl == Acquisition:
                # Acquisitions which were created, in source, under new samples, will 'follow'
                #    them during above move, thanks to the FK on acq_sample_id.
                # BUT some acquisitions were potentially created in source project, inside
                #    forked samples. They need to be attached to the dest (self) corresponding sample.
                if len(common_samples) > 0:
                    # Build a CTE with values for the update
                    smp_cte = values_cte("upd_smp", ("src_id", "dst_id"),
                                         [(k, v)
                                          for k, v in common_samples.items()])
                    smp_subqry = self.session.query(smp_cte.c.column2).filter(
                        smp_cte.c.column1 == Acquisition.acq_sample_id)
                    upd_values = {
                        'acq_sample_id':
                        func.coalesce(
                            smp_subqry.scalar_subquery(),  # type: ignore
                            Acquisition.acq_sample_id)
                    }
                    upd = upd.filter(Acquisition.acq_sample_id == any_(
                        list(common_samples.keys())))
                    # upd = upd.filter(Acquisition.acquisid != all_(list(common_acquisitions.keys())))
                if len(common_samples) == 0:
                    # Nothing to do. There were only new samples, all of them moved to self.
                    continue
            elif a_fk_to_proj_tbl == ObjectHeader:
                # Generated SQL looks like:
                # with upd_acq (src_id, dst_id) as (values (5,6), (7,8))
                # update obj_head
                #    set acquisid = coalesce((select dst_id from upd_acq where acquisid=src_id), acquisid)
                #  where acquisid in (select src_id from upd_acq)
                if len(common_acquisitions) > 0:
                    # Object must follow its acquisition
                    acq_cte = values_cte(
                        "upd_acq", ("src_id", "dst_id"),
                        [(k, v) for k, v in common_acquisitions.items()])
                    acq_subqry = self.session.query(acq_cte.c.column2).filter(
                        acq_cte.c.column1 == ObjectHeader.acquisid)
                    upd_values = {
                        'acquisid':
                        func.coalesce(
                            acq_subqry.scalar_subquery(),  # type:ignore
                            ObjectHeader.acquisid)
                    }
                    upd = upd.filter(ObjectHeader.acquisid == any_(
                        list(common_acquisitions.keys())))
                if len(common_acquisitions) == 0:
                    # Nothing to do. There were only new acquisitions, all of them moved to self.
                    continue
            else:
                # For Particle project
                upd = upd.filter(
                    ParticleProject.projid == self.src_prj_id)  # type: ignore
                upd_values = {'projid': self.prj_id}
            rowcount = upd.update(values=upd_values, synchronize_session=False)
            table_name = a_fk_to_proj_tbl.__tablename__  # type: ignore
            logger.info("Update in %s: %s rows", table_name, rowcount)

        # Acquisition & twin Process have followed their enclosing Sample

        # Remove the parents which are duplicate from orig_id point of view
        for a_fk_to_proj_tbl in [Acquisition, Sample]:
            to_del: Query = self.session.query(a_fk_to_proj_tbl)
            if a_fk_to_proj_tbl == Acquisition:
                # Remove conflicting acquisitions, they should be empty?
                to_del = to_del.filter(Acquisition.acquisid == any_(
                    list(common_acquisitions.keys())))  # type: ignore
            elif a_fk_to_proj_tbl == Sample:
                # Remove conflicting samples
                to_del = to_del.filter(Sample.sampleid == any_(
                    list(common_samples.keys())))  # type: ignore
            rowcount = to_del.delete(synchronize_session=False)
            table_name = a_fk_to_proj_tbl.__tablename__  # type: ignore
            logger.info("Delete in %s: %s rows", table_name, rowcount)

        self.dest_augmented_mappings.write_to_project(dest_prj)

        ProjectPrivilegeBO.generous_merge_into(self.session, self.prj_id,
                                               self.src_prj_id)

        # Completely erase the source project
        ProjectBO.delete(self.session, self.src_prj_id)

    @staticmethod
    def get_ids_for_common_orig_id(a_parent_class, dest_parents, src_parents) -> \
            Dict[Union[SampleIDT, AcquisitionIDT], Union[SampleIDT, AcquisitionIDT]]:
        """
            Return a mapping between IDs for resolving colliding orig_ids.
            E.g. sample 'moose2015_ge_leg2_026' is present in source with ID 15482
                and also in destination with ID 84678
            -> return {15482:84678}, to read 15482->84678
        :param a_parent_class: Sample/Acquisition
        :param dest_parents: existing parents in the destination project, keyed by table name then orig_id
        :param src_parents: existing parents in the source project, same structure
        :return: a dict of source ID -> destination ID
        """
        ret = {}
        dst_orig_ids = dest_parents[a_parent_class.__tablename__]
        src_orig_ids = src_parents[a_parent_class.__tablename__]
        common_orig_ids = set(dst_orig_ids.keys()).intersection(
            src_orig_ids.keys())
        for a_common_orig_id in common_orig_ids:
            src_id = src_orig_ids[a_common_orig_id].pk()
            dst_id = dst_orig_ids[a_common_orig_id].pk()
            ret[src_id] = dst_id
        return ret
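
# Self-contained illustration of the collision mapping above, using a tiny
# stand-in for the ORM records (only .pk() is needed) and an illustrative
# table name; the orig_id and IDs come from the docstring example.
class _Rec:
    def __init__(self, pk_val):
        self._pk = pk_val

    def pk(self):
        return self._pk

dest_parents = {"samples": {"moose2015_ge_leg2_026": _Rec(84678)}}
src_parents = {"samples": {"moose2015_ge_leg2_026": _Rec(15482)}}
common = set(dest_parents["samples"]).intersection(src_parents["samples"])
id_map = {src_parents["samples"][oid].pk(): dest_parents["samples"][oid].pk()
          for oid in common}
assert id_map == {15482: 84678}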