Example #1
    def _find_what_to_clone(self):
        """
            Determine the objects to clone.
        """
        req = self.req
        # From required subsetting method...
        if req.limit_type == LimitMethods.constant:
            rank_function = 'rank'
        elif req.limit_type == LimitMethods.percent:
            rank_function = '100*percent_rank'
        else:
            rank_function = 'FunctionError'

        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(self.session, self.prj_id, self.req.filters)
        from_, where, params = object_set.get_sql(self.task.owner_id)

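        # Rank objects randomly inside each category (classif_id) and keep the first :ranklimit of them,
        # either as an absolute count ('rank') or as a percentage ('100*percent_rank')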
        sql = """
            SELECT objid FROM (
                SELECT """ + rank_function + """() OVER (PARTITION BY obh.classif_id ORDER BY RANDOM()) rang,
                       obh.objid
                  FROM """ + from_.get_sql() + """
                """ + where.get_sql() + """ ) sr
            WHERE rang <= :ranklimit """
        params['ranklimit'] = self.req.limit_value

        logger.info("SQL=%s", sql)
        logger.info("SQLParam=%s", params)

        res: ResultProxy = self.session.execute(sql, params)
        ids = [r[0] for r in res]
        logger.info("There are %d IDs", len(ids))

        self.to_clone = EnumeratedObjectSet(self.session, ids)
Example #2
    def create_summary(self, src_project: Project):
        req = self.req
        proj_id = src_project.projid
        self.update_progress(1, "Start Summary export")

        now_txt = DateTime.now_time().strftime("%Y%m%d_%H%M")
        self.out_file_name = "export_summary_{0:d}_{1:s}.tsv".format(
            src_project.projid, now_txt)
        out_file = self.temp_for_jobs.base_dir_for(
            self.job_id) / self.out_file_name

        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(
            self.ro_session, proj_id, self.filters)

        # By default, select (and group by) unambiguous category name
        sels = ["txo.display_name"]
        if self.req.sum_subtotal == "A":
            sels[:0] = ["acq.orig_id"]
        elif self.req.sum_subtotal == "S":
            sels[:0] = [
                "sam.orig_id", "sam.latitude", "sam.longitude",
                "MAX(obh.objdate) AS date"
            ]
        sels.append("COUNT(*) AS nbr")

        select_clause = "SELECT " + ", ".join(sels)
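        # Plain columns contain no space; aggregated expressions ("... AS ...") are excluded from GROUP BY and ORDER BY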
        not_aggregated = [a_sel for a_sel in sels if " " not in a_sel]
        group_clause = " GROUP BY " + ", ".join(not_aggregated)
        order_clause = OrderClause()
        for a_sel in not_aggregated:
            alias, col = a_sel.split(".")
            order_clause.add_expression(alias, col)

        # Base SQL comes from filters
        from_, where, params = object_set.get_sql(self._get_owner_id(),
                                                  order_clause, select_clause)
        sql = (select_clause + " FROM " + from_.get_sql() + where.get_sql()
               + group_clause + order_clause.get_sql())

        logger.info("Execute SQL : %s", sql)
        logger.info("Params : %s", params)
        res = self.ro_session.execute(text(sql), params)

        msg = "Creating file %s" % out_file
        logger.info(msg)
        self.update_progress(50, msg)
        nb_lines = self.write_result_to_csv(res, out_file)
        msg = "Extracted %d rows" % nb_lines
        logger.info(msg)
        self.update_progress(90, msg)
        return nb_lines
Example #3
    def _find_what_to_clone(self):
        """
            Determine the objects to clone.
        """
        req = self.req
        # From required subsetting method...
        if req.limit_type == LimitMethods.constant:
            rank_function = 'rank'
        elif req.limit_type == LimitMethods.percent:
            rank_function = '100*percent_rank'
        else:
            rank_function = 'FunctionError'
        # And repartition key
        if req.group_type == GroupDefinitions.categories:
            part_key = "obh.classif_id"
        elif req.group_type == GroupDefinitions.samples:
            part_key = "sam.sampleid"
        elif req.group_type == GroupDefinitions.acquisitions:
            part_key = "acq.acquisid"
        else:
            part_key = "???"

        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(
            self.session, self.prj_id, self.req.filters)
        from_, where, params = object_set.get_sql(self._get_owner_id())

        # noinspection SqlResolve
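        # Rank objects randomly inside each partition (category, sample or acquisition)
        # and keep only the first :ranklimit of them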
        sql = """
            SELECT objid FROM (
                SELECT """ + rank_function + """() OVER (PARTITION BY """ + part_key + """ ORDER BY RANDOM()) rang,
                       obh.objid
                  FROM """ + from_.get_sql() + """
                """ + where.get_sql() + """ ) sr
            WHERE rang <= :ranklimit """
        params['ranklimit'] = self.req.limit_value

        logger.info("SQL=%s", sql)
        logger.info("SQLParam=%s", params)

        res: Result = self.ro_session.execute(sql, params)
        ids = [r for r, in res]
        logger.info("There are %d IDs", len(ids))

        self.to_clone = EnumeratedObjectSet(self.session, ids)
Example #4
    def summary(self, current_user_id: Optional[UserIDT], proj_id: ProjectIDT, filters: ProjectFilters,
                only_total: bool) -> Tuple[int, Optional[int], Optional[int], Optional[int]]:
        """
            Query the given project with given filters, return classification summary, or just grand total if
            only_total is set.
        """
        # Security check
        if current_user_id is None:
            RightsBO.anonymous_wants(self.session, Action.READ, proj_id)
            # Anonymous can only see validated objects
            # TODO: Dup code
            # noinspection PyTypeHints
            filters.statusfilter = "V"  # type:ignore
            user_id = -1
        else:
            user, _project = RightsBO.user_wants(self.session, current_user_id, Action.READ, proj_id)
            user_id = user.id

        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(self.session, proj_id, filters)
        from_, where, params = object_set.get_sql(user_id)
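        # Count all objects and, unless only_total, per-state counts (V=validated, D=dubious, P=predicted)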
        sql = """
    SET LOCAL enable_seqscan=FALSE;
    SELECT COUNT(*) nbr"""
        if only_total:
            sql += """, NULL nbr_v, NULL nbr_d, NULL nbr_p"""
        else:
            sql += """, 
           COUNT(CASE WHEN obh.classif_qual = 'V' THEN 1 END) nbr_v,
           COUNT(CASE WHEN obh.classif_qual = 'D' THEN 1 END) nbr_d, 
           COUNT(CASE WHEN obh.classif_qual = 'P' THEN 1 END) nbr_p"""
        sql += """
      FROM """ + from_.get_sql() + " " + where.get_sql()

        with CodeTimer("summary: V/D/P for %d using %s " % (proj_id, sql), logger):
            res: ResultProxy = self.session.execute(sql, params)

        nbr: int
        nbr_v: Optional[int]
        nbr_d: Optional[int]
        nbr_p: Optional[int]
        nbr, nbr_v, nbr_d, nbr_p = res.first()  # type:ignore
        return nbr, nbr_v, nbr_d, nbr_p
Example #5
    def _find_what_to_dump(self) -> None:
        """
            Determine the objects to dump.
        """
        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(self.session, self.prj.projid, self.filters)
        from_, where, params = object_set.get_sql(self.requester_id)

        sql = """ SELECT objid FROM """ + from_.get_sql() + where.get_sql()

        logger.info("SQL=%s", sql)
        logger.info("SQLParam=%s", params)

        with CodeTimer("Get IDs:", logger):
            res: ResultProxy = self.session.execute(sql, params)
        ids = [r['objid'] for r in res]

        logger.info("NB OBJIDS=%d", len(ids))

        self.ids_to_dump = ids
Example #6
    def create_tsv(self, src_project: Project,
                   end_progress: int) -> Tuple[int, int]:
        """
            Create the TSV file.
        """
        req = self.req
        proj_id = src_project.projid
        self.update_progress(1, "Start TSV export")
        progress_range = end_progress - 1

        # Get a fast upper-bound count of the objects to process
        count_sql = "SELECT SUM(nbr) AS cnt FROM projects_taxo_stat WHERE projid = :prj"
        res = self.ro_session.execute(text(count_sql), {"prj": proj_id})
        obj_count = res.first()[0]

        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(
            self.ro_session, proj_id, self.filters)

        # Backup or not, the column names are taken from the common mapping
        # @See Mapping.py
        # TSV column order
        # field_order = ["object_id", "object_lat", "object_lon", "object_date", "object_time", "object_depth_max",
        #                "object_annotation_status", "object_annotation_person_name", "object_annotation_person_email",
        #                "object_annotation_date", "object_annotation_time", "object_annotation_category"]
        # formats = {"object_date": "TO_CHAR({0},'YYYYMMDD')",
        #            "object_time": "TO_CHAR({0},'HH24MISS')",
        #            "object_annotation_date": "TO_CHAR({0},'YYYYMMDD')",
        #            "object_annotation_time": "TO_CHAR({0},'HH24MISS')",
        #            "object_annotation_status": """
        #                  CASE {0}
        #                     WHEN 'V' then 'validated'
        #                     WHEN 'P' then 'predicted'
        #                     WHEN 'D' then 'dubious'
        #                     ELSE {0}
        #                  END
        #            """
        #            }
        # prefices = {ObjectHeader.__tablename__: "obh",
        #             }
        # for a_fld in field_order:
        #     mpg = GlobalMapping.PREDEFINED_FIELDS[a_fld]
        #     mpg[""]
        #     assert a_fld in GlobalMapping.PREDEFINED_FIELDS, "%s is not a mapped column" % a_fld
        date_fmt, time_fmt = "YYYYMMDD", "HH24MISS"
        if req.format_dates_times:
            date_fmt, time_fmt = "YYYY-MM-DD", "HH24:MI:SS"

        select_clause = "select "

        if req.with_images or (req.exp_type == ExportTypeEnum.backup):
            select_clause += "img.orig_file_name AS img_file_name, img.imgrank AS img_rank"
            if req.with_images:
                select_clause += ", img.file_name AS img_src_path"
            select_clause += ",\n"

        select_clause += """obh.orig_id AS object_id, obh.latitude AS object_lat, obh.longitude AS object_lon,
                         TO_CHAR(obh.objdate,'{0}') AS object_date,
                         TO_CHAR(obh.objtime,'{1}') AS object_time,
                         obh.object_link, obh.depth_min AS object_depth_min, obh.depth_max AS object_depth_max,
                         CASE obh.classif_qual 
                            WHEN 'V' then 'validated' 
                            WHEN 'P' then 'predicted' 
                            WHEN 'D' then 'dubious' 
                            ELSE obh.classif_qual 
                         END AS object_annotation_status,                
                         usr.name AS object_annotation_person_name, usr.email AS object_annotation_person_email,
                         TO_CHAR(obh.classif_when,'{0}') AS object_annotation_date,
                         TO_CHAR(obh.classif_when,'{1}') AS object_annotation_time,                
                         txo.display_name AS object_annotation_category 
                    """.format(date_fmt, time_fmt)
        if req.exp_type == ExportTypeEnum.backup:
            select_clause += ", txo.id AS object_annotation_category_id"
        else:
            select_clause += "," + TaxonomyBO.parents_sql(
                "obh.classif_id") + " AS object_annotation_hierarchy"

        if 'C' in req.tsv_entities:
            select_clause += "\n, obh.complement_info"

        # Deal with mappings, the goal is to emit SQL which will reconstitute the TSV structure
        src_mappings = ProjectMapping().load_from_project(src_project)
        if 'O' in req.tsv_entities:
            select_clause += "\n " + src_mappings.object_mappings.as_select_list(
                "obf")

        if 'S' in req.tsv_entities:
            select_clause += "\n, sam.orig_id AS sample_id, sam.dataportal_descriptor AS sample_dataportal_descriptor "
            select_clause += src_mappings.sample_mappings.as_select_list("sam")

        if 'P' in req.tsv_entities:
            select_clause += "\n, prc.orig_id AS process_id "
            select_clause += src_mappings.process_mappings.as_select_list(
                "prc")

        if 'A' in req.tsv_entities:
            select_clause += "\n, acq.orig_id AS acq_id, acq.instrument AS acq_instrument "
            select_clause += src_mappings.acquisition_mappings.as_select_list(
                "acq")

        if req.exp_type == ExportTypeEnum.dig_obj_ident:
            select_clause += "\n, obh.objid"

        if req.with_internal_ids:
            select_clause += """\n, obh.objid, 
                    obh.acquisid AS processid_internal, obh.acquisid AS acq_id_internal, 
                    sam.sampleid AS sample_id_internal, 
                    obh.classif_id, obh.classif_who, obh.classif_auto_id, txp.name classif_auto_name, 
                    obh.classif_auto_score, obh.classif_auto_when,
                    obh.random_value object_random_value, obh.sunpos object_sunpos """
            if 'S' in req.tsv_entities:
                select_clause += "\n, sam.latitude sample_lat, sam.longitude sample_long "

        # TODO: The condition on o.projid=1 in historical code below prevents any data production
        # if 'H' in req.tsv_entities:
        #     sql1 += " , oh.classif_date AS histoclassif_date, classif_type AS histoclassif_type, " \
        #             "to3.name histoclassif_name, oh.classif_qual histoclassif_qual,uo3.name histoclassif_who, " \
        #             "classif_score histoclassif_score"
        #     sql2 += """ LEFT JOIN (select o.objid, classif_date, classif_type, och.classif_id,
        #                                   och.classif_qual, och.classif_who, classif_score
        #                              from objectsclassifhisto och
        #                              join objects o on o.objid=och.objid and o.projid=1 {0}
        #                            union all
        #                            select o.objid, o.classif_when classif_date, 'C' classif_type, classif_id,
        #                                   classif_qual, classif_who, NULL
        #                              from objects o {0} where o.projid=1
        #                           ) oh on o.objid=oh.objid
        #                 LEFT JOIN taxonomy to3 on oh.classif_id=to3.id
        #                 LEFT JOIN users uo3 on oh.classif_who=uo3.id
        #             """.format(samplefilter)

        order_clause = OrderClause()
        if req.split_by == "sample":
            order_clause.add_expression("sam", "orig_id")
            split_field = "sample_id"  # AKA sam.orig_id, but renamed in select list
        elif req.split_by == "taxo":
            select_clause += "\n, txo.display_name AS taxo_parent_child "
            order_clause.add_expression(None, "taxo_parent_child")
            split_field = "taxo_parent_child"
        else:
            order_clause.add_expression("sam", "orig_id")
            split_field = "object_id"  # this value avoids errors further down in r[split_field]
        order_clause.add_expression("obh", "objid")

        if req.with_images or (req.exp_type == ExportTypeEnum.backup):
            order_clause.add_expression(None, "img_rank")

        # Base SQL comes from filters
        from_, where, params = object_set.get_sql(
            self._get_owner_id(),
            order_clause,
            select_clause,
            all_images=not req.only_first_image)
        sql = (select_clause + " FROM " + from_.get_sql() + where.get_sql()
               + order_clause.get_sql())
        logger.info("Execute SQL : %s" % sql)
        logger.info("Params : %s" % params)

        res = self.ro_session.execute(text(sql), params)

        now_txt = DateTime.now_time().strftime("%Y%m%d_%H%M")
        self.out_file_name = "export_{0:d}_{1:s}.{2}".format(
            proj_id, now_txt, "zip")

        produced_path = self.out_path / self.out_file_name
        zfile = zipfile.ZipFile(produced_path,
                                'w',
                                allowZip64=True,
                                compression=zipfile.ZIP_DEFLATED)

        splitcsv = (req.split_by != "")
        csv_filename = 'data.tsv'  # Just a temp name as there is a rename while filling up the Zip
        if splitcsv:
            # Produce into the same temp file all the time, at zipping time the name in archive will vary
            prev_value = "NotAssigned"  # To trigger a sequence change immediately
        else:
            # The zip will contain a single TSV with same base name as the zip
            prev_value = self.out_file_name.replace('.zip', '')

        csv_path: Path = self.out_path / csv_filename  # Constant path to a (sometimes) changing file
        csv_fd: Optional[IO] = None
        csv_wtr = None

        # Store the images to save in a separate CSV. Useless if not exporting images but who cares.
        temp_img_file = self.out_path / "images.csv"
        img_file_fd = open(temp_img_file, 'w')
        img_wtr = csv.DictWriter(img_file_fd, ["src_path", "dst_path"],
                                 delimiter='\t',
                                 quotechar='"',
                                 lineterminator='\n')
        img_wtr.writeheader()

        # Prepare TSV structure
        col_descs = [
            a_desc for a_desc in res.cursor.description
            if a_desc.name != "img_src_path"
        ]
        # read latitude column to get float DB type
        for a_desc in col_descs:
            if a_desc.name == "object_lat":
                db_float_type = a_desc.type_code
                break
        else:
            # object_lat is always in the select list, so this should not happen
            raise
        float_cols = set()
        # Prepare float separator conversion, if not required the set will just be empty
        if req.coma_as_separator:
            for a_desc in col_descs:
                if a_desc.type_code == db_float_type:
                    float_cols.add(a_desc.name)

        tsv_cols = [a_desc.name for a_desc in col_descs]
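        # Second header line for the backup format: tag each column as float ('[f]') or text ('[t]')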
        tsv_types_line = {
            name: ('[f]' if a_desc.type_code == db_float_type else '[t]')
            for name, a_desc in zip(tsv_cols, col_descs)
        }
        nb_rows = 0
        nb_images = 0
        used_dst_pathes = set()
        for r in res:
            # Rows from SQLAlchemy are not mutable, so we need a clone for arranging values
            a_row = dict(r)
            # At each split column value change, and anyway for the first row
            if (splitcsv and prev_value != a_row[split_field]) or nb_rows == 0:
                # Start of a sequence, and possibly the end of the previous one
                if csv_fd:
                    csv_fd.close()  # Close previous file
                    self.store_csv_into_zip(zfile, prev_value, csv_path)
                if splitcsv:
                    prev_value = a_row[split_field]
                logger.info("Writing into file %s", csv_path)
                csv_fd = open(csv_path, 'w', encoding='latin_1')
                csv_wtr = csv.DictWriter(csv_fd,
                                         tsv_cols,
                                         delimiter='\t',
                                         quotechar='"',
                                         lineterminator='\n',
                                         quoting=csv.QUOTE_NONNUMERIC)
                csv_wtr.writeheader()
                if req.exp_type == ExportTypeEnum.backup:
                    # Write types line for backup type
                    csv_wtr.writerow(tsv_types_line)
            if req.with_images:
                copy_op = {"src_path": a_row.pop("img_src_path")}
                if req.exp_type == ExportTypeEnum.dig_obj_ident:
                    # Images will be stored in a per-category directory, but there is a single TSV at the Zip root
                    categ = a_row['object_annotation_category']
                    # All names cannot directly become directories
                    a_row['img_file_name'] = self.get_DOI_imgfile_name(
                        a_row['objid'], a_row['img_rank'], categ,
                        a_row['img_file_name'])
                    copy_op["dst_path"] = a_row['img_file_name']
                else:  # It's a backup
                    # Images are stored in the Zip subdirectory per sample/taxo, i.e. at the same place as
                    # their referring TSV
                    dst_path = "{0}/{1}".format(prev_value,
                                                a_row['img_file_name'])
                    if dst_path in used_dst_pathes:
                        # Avoid duplicates in zip as only the last entry will be present during unzip
                        # root cause: for UVP6 bundles, the vignette and original image are both stored
                        # with the same name.
                        img_with_rank = "{0}/{1}".format(
                            a_row['img_rank'], a_row['img_file_name'])
                        # Write the corrected path into the TSV
                        a_row['img_file_name'] = img_with_rank
                        dst_path = prev_value + "/" + img_with_rank
                    used_dst_pathes.add(dst_path)
                    copy_op["dst_path"] = dst_path
                img_wtr.writerow(copy_op)
                nb_images += 1
            # Remove CR from comments
            if 'C' in req.tsv_entities and a_row['complement_info']:
                a_row['complement_info'] = ' '.join(
                    a_row['complement_info'].splitlines())
            # Replace decimal separator
            for cname in float_cols:
                if a_row[cname] is not None:
                    a_row[cname] = str(a_row[cname]).replace('.', ',')
            assert csv_wtr is not None
            # Produce the row in the TSV
            csv_wtr.writerow(a_row)
            nb_rows += 1
            if nb_rows % self.ROWS_REPORT_EVERY == 0:
                msg = "Row %d of max %d" % (nb_rows, obj_count)
                logger.info(msg)
                self.update_progress(1 + progress_range / obj_count * nb_rows,
                                     msg)
        if csv_fd:
            csv_fd.close()  # Close last file
            self.store_csv_into_zip(zfile, prev_value, csv_path)
        logger.info("Extracted %d rows", nb_rows)
        img_file_fd.close()
        if zfile:
            zfile.close()
        return nb_rows, nb_images
Example #7
    def query(self, current_user_id: Optional[UserIDT], proj_id: ProjectIDT,
              filters: ProjectFilters,
              order_field: Optional[str] = None,
              window_start: Optional[int] = None,
              window_size: Optional[int] = None) \
            -> Tuple[ObjectIDWithParentsListT, int]:
        """
            Query the given project with the given filters, return all IDs.
            If order_field is provided, the result is sorted by this field.
            Ambiguity is resolved in a stable (over calls) way.
            window_start and window_size allow selecting a window of data in the result.
        """
        # Security check
        if current_user_id is None:
            RightsBO.anonymous_wants(self.session, Action.READ, proj_id)
            # Anonymous can only see validated objects
            # noinspection PyTypeHints
            filters.statusfilter = "V"  # type:ignore
            user_id = -1
        else:
            user, _project = RightsBO.user_wants(self.session, current_user_id, Action.READ, proj_id)
            user_id = user.id

        # The order field has an impact on the query
        order_clause = self.cook_order_clause(order_field)

        # Prepare a where clause and parameters from filter
        object_set: DescribedObjectSet = DescribedObjectSet(self.session, proj_id, filters)

        from_, where, params = object_set.get_sql(user_id, order_clause)

        if "obf." in where.get_sql():
            # If the filter needs obj_field data it's more efficient to count with a window function
            # than issuing a second query.
            extra_col = ", COUNT(objid) OVER() AS total"
        else:
            # Otherwise, no need for obj_field in count, less DB buffers
            extra_col = ", 0 AS total"

        # The following hint is needed until we sort out why, from time to time, there is a FTS on obj_head
        sql = """
    SET LOCAL enable_seqscan=FALSE;
    SELECT obh.objid, acq.acquisid, sam.sampleid %s
      FROM """ % extra_col + from_.get_sql() + " " + where.get_sql()

        # Add order & window if relevant
        if order_clause is not None:
            sql += order_clause.get_sql()
        if window_start is not None:
            sql += " OFFSET %d" % window_start
        if window_size is not None:
            sql += " LIMIT %d" % window_size

        with CodeTimer("query: for %d using %s " % (proj_id, sql), logger):
            res: ResultProxy = self.session.execute(sql, params)
        ids = []
        total = 0
        objid: int
        acquisid: int
        sampleid: int
        for objid, acquisid, sampleid, total in res:  # type:ignore
            ids.append((objid, acquisid, sampleid, proj_id))

        if total == 0:
            # Total was not computed or left to 0
            total, _nbr_v, _nbr_d, _nbr_p = self.summary(current_user_id, proj_id, filters, True)

        return ids, total