def _find_what_to_clone(self):
    """
        Determine the objects to clone.
    """
    req = self.req
    # From required subsetting method...
    if req.limit_type == LimitMethods.constant:
        rank_function = 'rank'
    elif req.limit_type == LimitMethods.percent:
        rank_function = '100*percent_rank'
    else:
        rank_function = 'FunctionError'
    # Prepare a where clause and parameters from filter
    object_set: DescribedObjectSet = DescribedObjectSet(self.session, self.prj_id, self.req.filters)
    from_, where, params = object_set.get_sql(self.task.owner_id)

    sql = """
        SELECT objid FROM (
            SELECT """ + rank_function + """() OVER (PARTITION BY obh.classif_id ORDER BY RANDOM()) rang,
                   obh.objid
              FROM """ + from_.get_sql() + """
            """ + where.get_sql() + """ ) sr
        WHERE rang <= :ranklimit """
    params['ranklimit'] = self.req.limit_value
    logger.info("SQL=%s", sql)
    logger.info("SQLParam=%s", params)
    res: ResultProxy = self.session.execute(sql, params)
    ids = [r[0] for r in res]
    logger.info("There are %d IDs", len(ids))
    self.to_clone = EnumeratedObjectSet(self.session, ids)
def create_summary(self, src_project: Project):
    req = self.req
    proj_id = src_project.projid
    self.update_progress(1, "Start Summary export")

    now_txt = DateTime.now_time().strftime("%Y%m%d_%H%M")
    self.out_file_name = "export_summary_{0:d}_{1:s}.tsv".format(src_project.projid, now_txt)
    out_file = self.temp_for_jobs.base_dir_for(self.job_id) / self.out_file_name

    # Prepare a where clause and parameters from filter
    object_set: DescribedObjectSet = DescribedObjectSet(self.ro_session, proj_id, self.filters)

    # By default, select (and group by) unambiguous category name
    sels = ["txo.display_name"]
    if self.req.sum_subtotal == "A":
        sels[:0] = ["acq.orig_id"]
    elif self.req.sum_subtotal == "S":
        sels[:0] = ["sam.orig_id", "sam.latitude", "sam.longitude", "MAX(obh.objdate) AS date"]
    sels.append("COUNT(*) AS nbr")
    select_clause = "SELECT " + ", ".join(sels)

    not_aggregated = [a_sel for a_sel in sels if " " not in a_sel]
    group_clause = " GROUP BY " + ", ".join(not_aggregated)
    order_clause = OrderClause()
    for a_sel in not_aggregated:
        alias, col = a_sel.split(".")
        order_clause.add_expression(alias, col)

    # Base SQL comes from filters
    from_, where, params = object_set.get_sql(self._get_owner_id(), order_clause, select_clause)
    sql = select_clause + " FROM " + from_.get_sql() + where.get_sql() + group_clause + order_clause.get_sql()

    logger.info("Execute SQL : %s", sql)
    logger.info("Params : %s", params)

    res = self.ro_session.execute(text(sql), params)

    msg = "Creating file %s" % out_file
    logger.info(msg)
    self.update_progress(50, msg)
    nb_lines = self.write_result_to_csv(res, out_file)
    msg = "Extracted %d rows" % nb_lines
    logger.info(msg)
    self.update_progress(90, msg)
    return nb_lines
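# A tiny, hypothetical illustration of the clause-building step above: any select expression
# containing a space is an aliased aggregate (e.g. "COUNT(*) AS nbr" or "MAX(obh.objdate) AS date"),
# so only the plain "alias.column" expressions are reused for GROUP BY and ORDER BY.
sels_example = ["sam.orig_id", "sam.latitude", "txo.display_name", "COUNT(*) AS nbr"]
not_aggregated_example = [a_sel for a_sel in sels_example if " " not in a_sel]
assert " GROUP BY " + ", ".join(not_aggregated_example) == " GROUP BY sam.orig_id, sam.latitude, txo.display_name"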
def _find_what_to_clone(self):
    """
        Determine the objects to clone.
    """
    req = self.req
    # From required subsetting method...
    if req.limit_type == LimitMethods.constant:
        rank_function = 'rank'
    elif req.limit_type == LimitMethods.percent:
        rank_function = '100*percent_rank'
    else:
        rank_function = 'FunctionError'
    # And repartition key
    if req.group_type == GroupDefinitions.categories:
        part_key = "obh.classif_id"
    elif req.group_type == GroupDefinitions.samples:
        part_key = "sam.sampleid"
    elif req.group_type == GroupDefinitions.acquisitions:
        part_key = "acq.acquisid"
    else:
        part_key = "???"
    # Prepare a where clause and parameters from filter
    object_set: DescribedObjectSet = DescribedObjectSet(self.session, self.prj_id, self.req.filters)
    from_, where, params = object_set.get_sql(self._get_owner_id())

    # noinspection SqlResolve
    sql = """
        SELECT objid FROM (
            SELECT """ + rank_function + """() OVER (PARTITION BY """ + part_key + """ ORDER BY RANDOM()) rang,
                   obh.objid
              FROM """ + from_.get_sql() + """
            """ + where.get_sql() + """ ) sr
        WHERE rang <= :ranklimit """
    params['ranklimit'] = self.req.limit_value
    logger.info("SQL=%s", sql)
    logger.info("SQLParam=%s", params)
    res: Result = self.ro_session.execute(sql, params)
    ids = [r for r, in res]
    logger.info("There are %d IDs", len(ids))
    self.to_clone = EnumeratedObjectSet(self.session, ids)
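# A minimal, self-contained sketch of the per-group sampling technique used above: a window
# function ranks rows randomly inside each partition, then the outer query keeps the rows whose
# rank (or percentage) stays under the requested limit. It uses sqlite3 purely for illustration;
# the real queries target PostgreSQL through SQLAlchemy, and the table and column names here
# are made up.
import sqlite3


def sample_per_group(rows, limit_value, use_percent=False):
    # rows: iterable of (objid, group_key) tuples
    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE obj (objid INTEGER, grp INTEGER)")
    conn.executemany("INSERT INTO obj VALUES (?, ?)", rows)
    rank_function = '100*percent_rank' if use_percent else 'rank'
    sql = """
        SELECT objid FROM (
            SELECT """ + rank_function + """() OVER (PARTITION BY grp ORDER BY RANDOM()) rang,
                   objid
              FROM obj ) sr
        WHERE rang <= :ranklimit """
    return [objid for objid, in conn.execute(sql, {"ranklimit": limit_value})]


# Keep at most 2 random objects per group, or roughly half of each group
constant_ids = sample_per_group([(i, i % 3) for i in range(30)], 2)
percent_ids = sample_per_group([(i, i % 3) for i in range(30)], 50, use_percent=True)
assert len(constant_ids) == 6  # 3 groups of 10, 2 kept per group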
def summary(self, current_user_id: Optional[UserIDT], proj_id: ProjectIDT, filters: ProjectFilters,
            only_total: bool) -> Tuple[int, Optional[int], Optional[int], Optional[int]]:
    """
        Query the given project with the given filters, return the classification summary,
        or just the grand total if only_total is set.
    """
    # Security check
    if current_user_id is None:
        RightsBO.anonymous_wants(self.session, Action.READ, proj_id)
        # Anonymous can only see validated objects
        # TODO: Dup code
        # noinspection PyTypeHints
        filters.statusfilter = "V"  # type:ignore
        user_id = -1
    else:
        user, _project = RightsBO.user_wants(self.session, current_user_id, Action.READ, proj_id)
        user_id = user.id

    # Prepare a where clause and parameters from filter
    object_set: DescribedObjectSet = DescribedObjectSet(self.session, proj_id, filters)
    from_, where, params = object_set.get_sql(user_id)

    sql = """
    SET LOCAL enable_seqscan=FALSE;
    SELECT COUNT(*) nbr"""
    if only_total:
        sql += """, NULL nbr_v, NULL nbr_d, NULL nbr_p"""
    else:
        sql += """,
               COUNT(CASE WHEN obh.classif_qual = 'V' THEN 1 END) nbr_v,
               COUNT(CASE WHEN obh.classif_qual = 'D' THEN 1 END) nbr_d,
               COUNT(CASE WHEN obh.classif_qual = 'P' THEN 1 END) nbr_p"""
    sql += """
      FROM """ + from_.get_sql() + " " + where.get_sql()

    with CodeTimer("summary: V/D/P for %d using %s " % (proj_id, sql), logger):
        res: ResultProxy = self.session.execute(sql, params)
    nbr: int
    nbr_v: Optional[int]
    nbr_d: Optional[int]
    nbr_p: Optional[int]
    nbr, nbr_v, nbr_d, nbr_p = res.first()  # type:ignore
    return nbr, nbr_v, nbr_d, nbr_p
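# A standalone sketch (sqlite3, illustration only, table content is made up) of the
# conditional-count pattern above: COUNT(CASE WHEN ... THEN 1 END) only counts the rows
# matching the condition, so validated/dubious/predicted totals come back in a single scan.
import sqlite3

_conn = sqlite3.connect(":memory:")
_conn.execute("CREATE TABLE obj_head (objid INTEGER, classif_qual TEXT)")
_conn.executemany("INSERT INTO obj_head VALUES (?, ?)",
                  [(1, 'V'), (2, 'V'), (3, 'D'), (4, 'P'), (5, None)])
_row = _conn.execute("""
    SELECT COUNT(*) nbr,
           COUNT(CASE WHEN classif_qual = 'V' THEN 1 END) nbr_v,
           COUNT(CASE WHEN classif_qual = 'D' THEN 1 END) nbr_d,
           COUNT(CASE WHEN classif_qual = 'P' THEN 1 END) nbr_p
      FROM obj_head""").fetchone()
assert _row == (5, 2, 1, 1)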
def _find_what_to_dump(self) -> None:
    """
        Determine the objects to dump.
    """
    # Prepare a where clause and parameters from filter
    object_set: DescribedObjectSet = DescribedObjectSet(self.session, self.prj.projid, self.filters)
    from_, where, params = object_set.get_sql(self.requester_id)

    sql = """ SELECT objid FROM """ + from_.get_sql() + where.get_sql()

    logger.info("SQL=%s", sql)
    logger.info("SQLParam=%s", params)

    with CodeTimer("Get IDs:", logger):
        res: ResultProxy = self.session.execute(sql, params)
    ids = [r['objid'] for r in res]

    logger.info("NB OBJIDS=%d", len(ids))
    self.ids_to_dump = ids
def create_tsv(self, src_project: Project, end_progress: int) -> Tuple[int, int]:
    """
        Create the TSV file.
    """
    req = self.req
    proj_id = src_project.projid
    self.update_progress(1, "Start TSV export")
    progress_range = end_progress - 1

    # Get a fast count of the maximum of what to do
    count_sql = "SELECT SUM(nbr) AS cnt FROM projects_taxo_stat WHERE projid = :prj"
    res = self.ro_session.execute(text(count_sql), {"prj": proj_id})
    obj_count = res.first()[0]

    # Prepare a where clause and parameters from filter
    object_set: DescribedObjectSet = DescribedObjectSet(self.ro_session, proj_id, self.filters)

    # Backup or not, the column namings are taken from common mapping
    # @See Mapping.py
    # TSV column order
    # field_order = ["object_id", "object_lat", "object_lon", "object_date", "object_time", "object_depth_max",
    #                "object_annotation_status", "object_annotation_person_name", "object_annotation_person_email",
    #                "object_annotation_date", "object_annotation_time", "object_annotation_category"]
    # formats = {"object_date": "TO_CHAR({0},'YYYYMMDD')",
    #            "object_time": "TO_CHAR({0},'HH24MISS')",
    #            "object_annotation_date": "TO_CHAR({0},'YYYYMMDD')",
    #            "object_annotation_time": "TO_CHAR({0},'HH24MISS')",
    #            "object_annotation_status": """
    #                CASE {0}
    #                  WHEN 'V' then 'validated'
    #                  WHEN 'P' then 'predicted'
    #                  WHEN 'D' then 'dubious'
    #                  ELSE {0}
    #                END
    #            """
    #            }
    # prefices = {ObjectHeader.__tablename__: "obh",
    #             }
    # for a_fld in field_order:
    #     mpg = GlobalMapping.PREDEFINED_FIELDS[a_fld]
    #     mpg[""]
    #     assert a_fld in GlobalMapping.PREDEFINED_FIELDS, "%s is not a mapped column" % a_fld
    date_fmt, time_fmt = "YYYYMMDD", "HH24MISS"
    if req.format_dates_times:
        date_fmt, time_fmt = "YYYY-MM-DD", "HH24:MI:SS"

    select_clause = "select "

    if req.with_images or (req.exp_type == ExportTypeEnum.backup):
        select_clause += "img.orig_file_name AS img_file_name, img.imgrank AS img_rank"
        if req.with_images:
            select_clause += ", img.file_name AS img_src_path"
        select_clause += ",\n"

    select_clause += """obh.orig_id AS object_id, obh.latitude AS object_lat, obh.longitude AS object_lon,
                        TO_CHAR(obh.objdate,'{0}') AS object_date,
                        TO_CHAR(obh.objtime,'{1}') AS object_time,
                        obh.object_link, obh.depth_min AS object_depth_min, obh.depth_max AS object_depth_max,
                        CASE obh.classif_qual
                          WHEN 'V' then 'validated'
                          WHEN 'P' then 'predicted'
                          WHEN 'D' then 'dubious'
                          ELSE obh.classif_qual
                        END AS object_annotation_status,
                        usr.name AS object_annotation_person_name, usr.email AS object_annotation_person_email,
                        TO_CHAR(obh.classif_when,'{0}') AS object_annotation_date,
                        TO_CHAR(obh.classif_when,'{1}') AS object_annotation_time,
                        txo.display_name AS object_annotation_category
                    """.format(date_fmt, time_fmt)
    if req.exp_type == ExportTypeEnum.backup:
        select_clause += ", txo.id AS object_annotation_category_id"
    else:
        select_clause += "," + TaxonomyBO.parents_sql("obh.classif_id") + " AS object_annotation_hierarchy"

    if 'C' in req.tsv_entities:
        select_clause += "\n, obh.complement_info"

    # Deal with mappings, the goal is to emit SQL which will reconstitute the TSV structure
    src_mappings = ProjectMapping().load_from_project(src_project)
    if 'O' in req.tsv_entities:
        select_clause += "\n " + src_mappings.object_mappings.as_select_list("obf")

    if 'S' in req.tsv_entities:
        select_clause += "\n, sam.orig_id AS sample_id, sam.dataportal_descriptor AS sample_dataportal_descriptor "
        select_clause += src_mappings.sample_mappings.as_select_list("sam")

    if 'P' in req.tsv_entities:
        select_clause += "\n, prc.orig_id AS process_id "
        select_clause += src_mappings.process_mappings.as_select_list("prc")
    if 'A' in req.tsv_entities:
        select_clause += "\n, acq.orig_id AS acq_id, acq.instrument AS acq_instrument "
        select_clause += src_mappings.acquisition_mappings.as_select_list("acq")

    if req.exp_type == ExportTypeEnum.dig_obj_ident:
        select_clause += "\n, obh.objid"

    if req.with_internal_ids:
        select_clause += """\n, obh.objid,
                obh.acquisid AS processid_internal, obh.acquisid AS acq_id_internal,
                sam.sampleid AS sample_id_internal,
                obh.classif_id, obh.classif_who, obh.classif_auto_id, txp.name classif_auto_name,
                obh.classif_auto_score, obh.classif_auto_when,
                obh.random_value object_random_value, obh.sunpos object_sunpos """
        if 'S' in req.tsv_entities:
            select_clause += "\n, sam.latitude sample_lat, sam.longitude sample_long "

    # TODO: The condition on o.projid=1 in historical code below prevents any data production
    # if 'H' in req.tsv_entities:
    #     sql1 += " , oh.classif_date AS histoclassif_date, classif_type AS histoclassif_type, " \
    #             "to3.name histoclassif_name, oh.classif_qual histoclassif_qual, uo3.name histoclassif_who, " \
    #             "classif_score histoclassif_score"
    #     sql2 += """ LEFT JOIN (select o.objid, classif_date, classif_type, och.classif_id,
    #                                   och.classif_qual, och.classif_who, classif_score
    #                              from objectsclassifhisto och
    #                              join objects o on o.objid=och.objid and o.projid=1 {0}
    #                            union all
    #                            select o.objid, o.classif_when classif_date, 'C' classif_type, classif_id,
    #                                   classif_qual, classif_who, NULL
    #                              from objects o {0} where o.projid=1
    #                           ) oh on o.objid=oh.objid
    #                 LEFT JOIN taxonomy to3 on oh.classif_id=to3.id
    #                 LEFT JOIN users uo3 on oh.classif_who=uo3.id
    #             """.format(samplefilter)

    order_clause = OrderClause()

    if req.split_by == "sample":
        order_clause.add_expression("sam", "orig_id")
        split_field = "sample_id"  # AKA sam.orig_id, but renamed in select list
    elif req.split_by == "taxo":
        select_clause += "\n, txo.display_name AS taxo_parent_child "
        order_clause.add_expression(None, "taxo_parent_child")
        split_field = "taxo_parent_child"
    else:
        order_clause.add_expression("sam", "orig_id")
        split_field = "object_id"  # This value avoids errors further down, in r[split_field]

    order_clause.add_expression("obh", "objid")

    if req.with_images or (req.exp_type == ExportTypeEnum.backup):
        order_clause.add_expression(None, "img_rank")

    # Base SQL comes from filters
    from_, where, params = object_set.get_sql(self._get_owner_id(), order_clause, select_clause,
                                              all_images=not req.only_first_image)
    sql = select_clause + " FROM " + from_.get_sql() + where.get_sql() + order_clause.get_sql()

    logger.info("Execute SQL : %s" % sql)
    logger.info("Params : %s" % params)

    res = self.ro_session.execute(text(sql), params)

    now_txt = DateTime.now_time().strftime("%Y%m%d_%H%M")
    self.out_file_name = "export_{0:d}_{1:s}.{2}".format(proj_id, now_txt, "zip")

    produced_path = self.out_path / self.out_file_name
    zfile = zipfile.ZipFile(produced_path, 'w', allowZip64=True, compression=zipfile.ZIP_DEFLATED)

    splitcsv = (req.split_by != "")
    csv_filename = 'data.tsv'  # Just a temp name as there is a rename while filling up the Zip
    if splitcsv:
        # Produce into the same temp file all the time, at zipping time the name in archive will vary
        prev_value = "NotAssigned"  # To trigger a sequence change immediately
    else:
        # The zip will contain a single TSV with same base name as the zip
        prev_value = self.out_file_name.replace('.zip', '')

    csv_path: Path = self.out_path / csv_filename  # Constant path to a (sometimes) changing file
    csv_fd: Optional[IO] = None
    csv_wtr = None
    # Store the images to save in a separate CSV. Useless if not exporting images, but who cares.
    temp_img_file = self.out_path / "images.csv"
    img_file_fd = open(temp_img_file, 'w')
    img_wtr = csv.DictWriter(img_file_fd, ["src_path", "dst_path"],
                             delimiter='\t', quotechar='"', lineterminator='\n')
    img_wtr.writeheader()

    # Prepare TSV structure
    col_descs = [a_desc for a_desc in res.cursor.description
                 if a_desc.name != "img_src_path"]
    # Read latitude column to get float DB type
    for a_desc in col_descs:
        if a_desc.name == "object_lat":
            db_float_type = a_desc.type_code
            break
    else:
        raise
    float_cols = set()
    # Prepare float separator conversion, if not required the set will just be empty
    if req.coma_as_separator:
        for a_desc in col_descs:
            if a_desc.type_code == db_float_type:
                float_cols.add(a_desc.name)

    tsv_cols = [a_desc.name for a_desc in col_descs]
    tsv_types_line = {name: ('[f]' if a_desc.type_code == db_float_type else '[t]')
                      for name, a_desc in zip(tsv_cols, col_descs)}
    nb_rows = 0
    nb_images = 0
    used_dst_pathes = set()
    for r in res:
        # Rows from SQLAlchemy are not mutable, so we need a clone for arranging values
        a_row = dict(r)
        if ((splitcsv and (prev_value != a_row[split_field]))  # At each split column value change
                or (nb_rows == 0)):  # And anyway for the first row
            # Start of a sequence, and possibly end of the previous one
            if csv_fd:
                csv_fd.close()  # Close previous file
                self.store_csv_into_zip(zfile, prev_value, csv_path)
            if splitcsv:
                prev_value = a_row[split_field]
            logger.info("Writing into file %s", csv_path)
            csv_fd = open(csv_path, 'w', encoding='latin_1')
            csv_wtr = csv.DictWriter(csv_fd, tsv_cols, delimiter='\t', quotechar='"',
                                     lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC)
            csv_wtr.writeheader()
            if req.exp_type == ExportTypeEnum.backup:
                # Write types line for backup type
                csv_wtr.writerow(tsv_types_line)
        if req.with_images:
            copy_op = {"src_path": a_row.pop("img_src_path")}
            if req.exp_type == ExportTypeEnum.dig_obj_ident:
                # Images will be stored in a per-category directory, but there is a single TSV at the Zip root
                categ = a_row['object_annotation_category']
                # All names cannot directly become directories
                a_row['img_file_name'] = self.get_DOI_imgfile_name(a_row['objid'], a_row['img_rank'],
                                                                   categ, a_row['img_file_name'])
                copy_op["dst_path"] = a_row['img_file_name']
            else:  # It's a backup
                # Images are stored in the Zip subdirectory per sample/taxo, i.e. at the same place as
                # their referring TSV
                dst_path = "{0}/{1}".format(prev_value, a_row['img_file_name'])
                if dst_path in used_dst_pathes:
                    # Avoid duplicates in zip as only the last entry will be present during unzip
                    # root cause: for UVP6 bundles, the vignette and original image are both stored
                    # with the same name.
                    img_with_rank = "{0}/{1}".format(a_row['img_rank'], a_row['img_file_name'])
                    a_row['img_file_name'] = img_with_rank  # Write the corrected path into the TSV
                    dst_path = prev_value + "/" + img_with_rank
                used_dst_pathes.add(dst_path)
                copy_op["dst_path"] = dst_path
            img_wtr.writerow(copy_op)
            nb_images += 1
        # Remove CR from comments
        if 'C' in req.tsv_entities and a_row['complement_info']:
            a_row['complement_info'] = ' '.join(a_row['complement_info'].splitlines())
        # Replace decimal separator
        for cname in float_cols:
            if a_row[cname] is not None:
                a_row[cname] = str(a_row[cname]).replace('.', ',')
        assert csv_wtr is not None
        # Produce the row in the TSV
        csv_wtr.writerow(a_row)
        nb_rows += 1
        if nb_rows % self.ROWS_REPORT_EVERY == 0:
            msg = "Row %d of max %d" % (nb_rows, obj_count)
            logger.info(msg)
            self.update_progress(1 + progress_range / obj_count * nb_rows, msg)
    if csv_fd:
        csv_fd.close()  # Close last file
        self.store_csv_into_zip(zfile, prev_value, csv_path)
    logger.info("Extracted %d rows", nb_rows)
    img_file_fd.close()
    if zfile:
        zfile.close()
    return nb_rows, nb_images
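# A minimal, self-contained sketch (hypothetical data, not part of the exporter) of the
# "break on key change" pattern used in the loop above: rows arrive sorted on the split
# field, and a new TSV is started each time that field's value changes.
import csv


def write_split_tsvs(rows, split_field, open_sink):
    # rows: iterable of dicts, already sorted on split_field
    # open_sink: callable returning a writable text file for a given split value
    prev_value = "NotAssigned"  # To trigger a sequence change immediately
    fd, wtr = None, None
    for a_row in rows:
        if a_row[split_field] != prev_value:
            if fd:
                fd.close()  # End of previous sequence
            prev_value = a_row[split_field]
            fd = open_sink(prev_value)
            wtr = csv.DictWriter(fd, list(a_row.keys()), delimiter='\t', lineterminator='\n')
            wtr.writeheader()
        wtr.writerow(a_row)
    if fd:
        fd.close()


# e.g. one TSV per sample, named after the sample
example_rows = [{"sample_id": "s1", "object_id": "o1"},
                {"sample_id": "s1", "object_id": "o2"},
                {"sample_id": "s2", "object_id": "o3"}]
write_split_tsvs(example_rows, "sample_id", lambda v: open("%s.tsv" % v, "w", newline=''))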
def query(self, current_user_id: Optional[UserIDT], proj_id: ProjectIDT, filters: ProjectFilters,
          order_field: Optional[str] = None,
          window_start: Optional[int] = None,
          window_size: Optional[int] = None) \
        -> Tuple[ObjectIDWithParentsListT, int]:
    """
        Query the given project with the given filters, return all IDs.
        If order_field is provided, the result is sorted by this field.
        Ambiguity is solved in a stable (over calls) way.
        window_start and window_size allow selecting a window of data in the result.
    """
    # Security check
    if current_user_id is None:
        RightsBO.anonymous_wants(self.session, Action.READ, proj_id)
        # Anonymous can only see validated objects
        # noinspection PyTypeHints
        filters.statusfilter = "V"  # type:ignore
        user_id = -1
    else:
        user, _project = RightsBO.user_wants(self.session, current_user_id, Action.READ, proj_id)
        user_id = user.id

    # The order field has an impact on the query
    order_clause = self.cook_order_clause(order_field)

    # Prepare a where clause and parameters from filter
    object_set: DescribedObjectSet = DescribedObjectSet(self.session, proj_id, filters)
    from_, where, params = object_set.get_sql(user_id, order_clause)

    if "obf." in where.get_sql():
        # If the filter needs obj_field data it's more efficient to count with a window function
        # than issuing a second query.
        extra_col = ", COUNT(objid) OVER() AS total"
    else:
        # Otherwise, no need for obj_field in count, less DB buffers
        extra_col = ", 0 AS total"

    # The following hint is needed until we sort out why, from time to time, there is a FTS on obj_head
    sql = """
    SET LOCAL enable_seqscan=FALSE;
    SELECT obh.objid, acq.acquisid, sam.sampleid %s
      FROM """ % extra_col + from_.get_sql() + " " + where.get_sql()

    # Add order & window if relevant
    if order_clause is not None:
        sql += order_clause.get_sql()
    if window_start is not None:
        sql += " OFFSET %d" % window_start
    if window_size is not None:
        sql += " LIMIT %d" % window_size

    with CodeTimer("query: for %d using %s " % (proj_id, sql), logger):
        res: ResultProxy = self.session.execute(sql, params)
    ids = []
    total = 0
    objid: int
    acquisid: int
    sampleid: int
    for objid, acquisid, sampleid, total in res:  # type:ignore
        ids.append((objid, acquisid, sampleid, proj_id))

    if total == 0:
        # Total was not computed, or left to 0
        total, _nbr_v, _nbr_d, _nbr_p = self.summary(current_user_id, proj_id, filters, True)
    return ids, total
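# A standalone illustration (sqlite3, made-up table, for illustration only; any engine with
# window functions behaves the same) of the counting trick above: COUNT(...) OVER () attaches
# the total row count to every returned row, so a paged query does not need a second counting
# query. The window is evaluated before LIMIT/OFFSET, hence the full count comes back even
# when only one page of rows is fetched.
import sqlite3

_conn2 = sqlite3.connect(":memory:")
_conn2.execute("CREATE TABLE obj_head (objid INTEGER PRIMARY KEY)")
_conn2.executemany("INSERT INTO obj_head VALUES (?)", [(i,) for i in range(100)])
_page = _conn2.execute("""
    SELECT objid, COUNT(objid) OVER () AS total
      FROM obj_head
     ORDER BY objid
     LIMIT 10 OFFSET 20""").fetchall()
assert len(_page) == 10 and _page[0] == (20, 100)  # 10 rows of the window, full count alongside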