def do_import(self, where: ImportWhere, how: ImportHow, rowcount: int, report_def: Callable) -> int:
    """
    Import the full bundle, i.e. every contained file.
    :param where: The destinations of the import, i.e. DB writer, vault and temporary directory.
    :param how: The directives of the import.
    :param rowcount: Total row count, from the preparation step.
    :param report_def: A function to call at certain points for reporting progress.
    :return: The total number of rows.
    """
    random.seed()
    stats = ImportStats(rowcount, report_def)
    # Borrow the session from the writer
    session = where.db_writer.session
    # Get the parent (enclosing) Sample, Acquisition and Process, if any
    how.existing_parents = self.fetch_existing_parents(session, prj_id=how.prj_id)
    for alias, _clazz in GlobalMapping.PARENT_CLASSES.items():
        log_line = {v.orig_id: v.pk() for v in how.existing_parents[alias].values()}
        logger.info("existing %s = %s", alias, log_line)
    # The created objects (unicity from object_id in TSV, orig_id in model)
    how.existing_objects = self.fetch_existing_objects(session, prj_id=how.prj_id)
    # The stored images (unicity for object ID + rank)
    how.image_ranks_per_obj = self.fetch_existing_ranks(session, prj_id=how.prj_id)
    ret = self.import_each_file(where, how, stats)
    return ret
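# Usage sketch for do_import, mirroring the callers further down in this
# section. This is hypothetical driver code, not part of the bundle API:
# `session`, `vault`, `temp_dir`, `prj_id`, `mapping`, `loaded_files`,
# `validated_rows` (from the validation step) and `progress_cb` are assumed
# to exist in the caller's scope.
#
#   where = ImportWhere(DBWriter(session), vault, temp_dir)
#   how = ImportHow(prj_id, update_mode="No", custom_mapping=mapping,
#                   skip_object_duplicates=False, loaded_files=loaded_files)
#   nb_rows = bundle.do_import(where, how, rowcount=validated_rows,
#                              report_def=progress_cb)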
def _collect_existing_and_validate(self, source_dir_or_zip, loaded_files) \
        -> Tuple[ImportHow, ImportDiagnostic, int]:
    """
    Prepare the import by checking what's inside the project and scanning the files to input.
    """
    # The mapping to TSV custom columns, either empty or from previous import operations on the same project
    mapping = ProjectMapping().load_from_project(self.prj)
    # Source bundle construction
    bundle_temp_dir = Path(self.temp_for_jobs.data_dir_for(self.job_id))
    source_bundle = InBundle(source_dir_or_zip, bundle_temp_dir)
    # Configure the validation to come, i.e. the directives
    import_how = ImportHow(self.prj_id, self.req.update_mode, mapping,
                           self.req.skip_existing_objects, loaded_files)
    if self.req.skip_loaded_files:
        import_how.compute_skipped(source_bundle, logger)
    # A structure to collect the validation result
    import_diag = ImportDiagnostic()
    if not self.req.skip_existing_objects:
        with CodeTimer("collect_existing: Existing images for %d: " % self.prj_id, logger):
            import_diag.existing_objects_and_image = Image.fetch_existing_images(
                self.session, self.prj_id)
    import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
    # Do the bulk job of validation
    nb_rows = source_bundle.validate_import(import_how, import_diag, self.session,
                                            self.report_validation_progress)
    return import_how, import_diag, nb_rows
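# For reference, a minimal stand-in for the CodeTimer helper used above,
# assuming it simply logs the elapsed wall-clock time when the block exits.
# This is a sketch; the project's real implementation may differ.
import time
from contextlib import contextmanager

@contextmanager
def code_timer(prefix: str, a_logger):
    # Log "<prefix><elapsed>s" once the wrapped block completes
    start = time.perf_counter()
    try:
        yield
    finally:
        a_logger.info("%s%.2fs", prefix, time.perf_counter() - start)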
def _do_clone(self):
    """
    The cloning operation itself. Assumes that @see self.to_clone was populated before.
    """
    # Get the mappings of the source project, in order to determine the useful columns
    custom_mapping = ProjectMapping().load_from_project(self.prj)
    obj_mapping = custom_mapping.object_mappings
    used_columns = set(obj_mapping.real_cols_to_tsv.keys())
    used_columns.add("orig_id")  # For safety
    # Create a DB writer
    writer = DBWriter(self.session)
    # Narrow the writes into ObjectFields, thanks to the mappings of the original project
    writer.generators({"obj_field": used_columns})
    # Use import helpers
    dest_prj_id = self.dest_prj.projid
    import_how = ImportHow(prj_id=dest_prj_id, update_mode="No",
                           custom_mapping=ProjectMapping(),
                           skip_object_duplicates=False, loaded_files=[])
    # Get the parent (enclosing) Sample, Acquisition and Process. There should be 0 in this context...
    import_how.existing_parents = InBundle.fetch_existing_parents(
        self.session, prj_id=dest_prj_id)
    self._clone_all(import_how, writer)
    # Copy mappings to the destination. We could narrow them to the minimum?
    custom_mapping.write_to_project(self.dest_prj)
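# Illustration of the narrowing above, with hypothetical mapping content:
# real_cols_to_tsv maps the physical free-column names of ObjectFields to the
# TSV column names they were imported from, e.g. something like
#     {"n01": "area", "n02": "mean", "t01": "label"}
# so used_columns would become {"n01", "n02", "t01", "orig_id"} and only those
# columns get generated in the writes for "obj_field".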
def do_intra_step_1(self, loaded_files):
    # The mapping to custom columns, either empty or from previous import operations on the same project
    custom_mapping = ProjectMapping().load_from_project(self.prj)
    # Source bundle construction
    source_bundle = InBundle(
        self.source_dir_or_zip,
        Path(self.temp_for_task.data_dir_for(self.task_id)))
    # Configure the validation to come, i.e. the directives
    import_how = ImportHow(self.prj_id, self.req.update_mode, custom_mapping,
                           self.req.skip_existing_objects, loaded_files)
    if self.req.skip_loaded_files:
        import_how.compute_skipped(source_bundle, logger)
    # A structure to collect the validation result
    import_diag = ImportDiagnostic()
    if not self.req.skip_existing_objects:
        with CodeTimer("do_intra_step_1: Existing images for %d: " % self.prj_id, logger):
            import_diag.existing_objects_and_image = Image.fetch_existing_images(
                self.session, self.prj_id)
    import_diag.topology.read_from_db(self.session, prj_id=self.prj_id)
    # Do the bulk job of validation
    nb_rows = source_bundle.validate_import(import_how, import_diag, self.session,
                                            self.report_progress)
    return import_how, import_diag, nb_rows
def before_import(self, how: ImportHow):
    how.vignette_maker = None
    # Pick the vignetting config file from the zipped directory
    potential_config = self.path / self.VIGNETTE_CONFIG
    if potential_config.exists():
        vignette_maker_cfg = configparser.ConfigParser()
        vignette_maker_cfg.read(potential_config.as_posix())
        how.vignette_maker = VignetteMaker(vignette_maker_cfg, self.path, self.TEMP_VIGNETTE)
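# For context, a sketch of reading such a vignette configuration, assuming the
# file is plain INI syntax as implied by the configparser usage above. The file
# name, section and option names below are illustrative, not the real schema.
import configparser

cfg = configparser.ConfigParser()
parsed = cfg.read("compute_vignette.txt")  # returns the list of files actually parsed, [] if absent
scale = cfg.getfloat("vignette", "scale", fallback=1.0)  # fallback covers a missing section/option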
def do_run(self, current_user_id: int) -> ImportRealRsp:
    """
    Do the real job using the injected parameters.
    :return: The import response.
    """
    # Security check
    RightsBO.user_wants(self.session, current_user_id, Action.ADMINISTRATE, self.prj_id)
    # OK
    loaded_files = none_to_empty(self.prj.fileloaded).splitlines()
    logger.info("Previously loaded files: %s", loaded_files)
    # Save mappings straight away
    self.save_mapping(self.custom_mapping)
    source_bundle = InBundle(
        self.req.source_path,
        Path(self.temp_for_task.data_dir_for(self.task_id)))
    # Configure the import to come, i.e. the destination
    db_writer = DBWriter(self.session)
    import_where = ImportWhere(
        db_writer, self.vault, self.temp_for_task.base_dir_for(self.task_id))
    # Configure the import to come, i.e. the directives
    import_how = ImportHow(self.prj_id, self.req.update_mode, self.custom_mapping,
                           self.req.skip_existing_objects, loaded_files)
    import_how.taxo_mapping = self.req.taxo_mappings
    import_how.taxo_found = self.req.found_taxa
    import_how.found_users = self.req.found_users
    if self.req.skip_loaded_files:
        import_how.compute_skipped(source_bundle, logger)
    if not self.req.skip_existing_objects:
        with CodeTimer("run: Existing images for %d: " % self.prj_id, logger):
            import_how.objects_and_images_to_skip = Image.fetch_existing_images(
                self.session, self.prj_id)
    import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))
    # Do the bulk job of import
    row_count = source_bundle.do_import(import_where, import_how,
                                        self.req.rowcount, self.report_progress)
    # Update the loaded files in DB, removing duplicates
    self.prj.fileloaded = "\n".join(set(import_how.loaded_files))
    self.session.commit()
    # Recompute stats
    ProjectBO.do_after_load(self.session, self.prj_id)
    self.session.commit()
    logger.info("Total of %d rows loaded", row_count)
    # Prepare the response
    ret = ImportRealRsp()
    return ret
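# Note on the "removing duplicates" step above: "\n".join(set(...)) deduplicates
# but does not preserve the original file order. If ordering mattered, an
# order-preserving variant (sketch) would be:
#     self.prj.fileloaded = "\n".join(dict.fromkeys(import_how.loaded_files))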
def do_real(self) -> None:
    """
    Do the real job, i.e. write everywhere (DB/filesystem).
    """
    loaded_files = none_to_empty(self.prj.fileloaded).splitlines()
    logger.info("Previously loaded files: %s", loaded_files)
    found_users, taxo_found, col_mapping_dict, \
        nb_rows, source_path = self._load_vars_from_state(self.STATE_KEYS)
    # Save mappings straight away
    col_mapping = ProjectMapping().load_from_dict(col_mapping_dict)
    col_mapping.write_to_project(self.prj)
    self.session.commit()
    # TODO: Duplicated code
    source_bundle = InBundle(
        source_path,
        Path(self.temp_for_jobs.data_dir_for(self.job_id)))
    # Configure the import to come, i.e. the destination
    db_writer = DBWriter(self.session)
    import_where = ImportWhere(
        db_writer, self.vault, self.temp_for_jobs.base_dir_for(self.job_id))
    # Configure the import to come, i.e. the directives
    import_how = ImportHow(self.prj_id, self.req.update_mode, col_mapping,
                           self.req.skip_existing_objects, loaded_files)
    import_how.taxo_mapping = self.req.taxo_mappings
    import_how.found_taxa = taxo_found
    import_how.found_users = found_users
    if self.req.skip_loaded_files:
        import_how.compute_skipped(source_bundle, logger)
    if self.req.skip_existing_objects:
        # If we must skip existing objects, then take an inventory of what's in there already
        with CodeTimer("run: Existing images for %d: " % self.prj_id, logger):
            import_how.objects_and_images_to_skip = Image.fetch_existing_images(
                self.session, self.prj_id)
    import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))
    # Do the bulk job of import
    rowcount_from_validate = nb_rows
    row_count = source_bundle.do_import(import_where, import_how,
                                        rowcount_from_validate, self.report_progress)
    # Update the loaded files in DB, removing duplicates
    self.prj.fileloaded = "\n".join(set(import_how.loaded_files))
    self.session.commit()
    # Recompute stats
    ProjectBO.do_after_load(self.session, self.prj_id)
    self.session.commit()
    msg = "Total of %d rows loaded" % row_count
    logger.info(msg)
    self.set_job_result(errors=[], infos={"rowcount": row_count})
def do_import(self):
    """
    Do the real job, i.e. copy files while creating records.
    """
    errors = []
    self.manage_uploaded()
    self.unzip_if_needed()
    # Use a Bundle
    source_bundle = InBundle(
        self.source_dir_or_zip,
        Path(self.temp_for_task.data_dir_for(self.task_id)))
    # Clean it, in case the ZIP contains TSV files
    source_bundle.remove_all_tsvs()
    images = source_bundle.list_image_files()
    # Configure the import to come, i.e. the destination
    db_writer = DBWriter(self.session)
    import_where = ImportWhere(
        db_writer, self.vault, self.temp_for_task.base_dir_for(self.task_id))
    # Configure the import to come, i.e. the directives
    import_how = ImportHow(prj_id=self.prj_id, update_mode="",
                           custom_mapping=ProjectMapping(),
                           skip_object_duplicates=False, loaded_files=[])
    import_how.do_thumbnail_above(int(self.config['THUMBSIZELIMIT']))
    # Generate the TSV
    req_values = self.req.values
    if req_values.get(SimpleImportFields.userlb, ""):
        import_how.found_users["user"] = {
            "id": req_values.get(SimpleImportFields.userlb)
        }
        req_values[SimpleImportFields.userlb] = "user"
    if req_values.get(SimpleImportFields.status, ""):
        req_values[SimpleImportFields.status] = classif_qual.get(
            req_values[SimpleImportFields.status], "")
    self.make_tsv(source_bundle, images)
    # Import
    nb_image_files = len(images)
    nb_images = source_bundle.do_import(import_where, import_how,
                                        nb_image_files, self.report_progress)
    self.session.commit()
    # Recompute stats and so on
    ProjectBO.do_after_load(self.session, self.prj_id)
    self.session.commit()
    ret = SimpleImportRsp(errors=errors, nb_images=nb_images)
    return ret
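# Illustration of the status translation above, with assumed mapping content
# (not verified against the real classif_qual constant): something like
#     classif_qual = {"predicted": "P", "dubious": "D", "validated": "V"}
# so any unknown status label falls back to "" through .get().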
def validate_import(self, how: ImportHow, diag: ImportDiagnostic, session: Session,
                    report_def: Callable) -> int:
    """
    Validate the full bundle, i.e. every contained file.
    :return: The total number of rows seen during validation.
    """
    with CodeTimer("validate_import: Existing images for %d: " % how.prj_id, logger):
        how.objects_and_images_to_skip = Image.fetch_existing_images(session, how.prj_id)
    total_row_count = self.validate_each_file(how, diag, report_def)
    if total_row_count == 0:
        # Try to be explicit in the messages
        nb_found = len(self.possible_files)
        nb_skipped = len(diag.skipped_files)
        err_msg = ["No object to import."]
        if nb_found == 0:
            err_msg.append("* No .txt or .tsv file whose name starts with 'ecotaxa' was found.")
        else:
            nb_validated = nb_found - nb_skipped
            if nb_skipped > 0:
                if nb_validated == 0:
                    err_msg.append("* The 'SKIP TSV' option was set and all TSV files were imported before.")
                else:
                    err_msg.append("* The 'SKIP TSV' option was set and the new TSV file(s) are not compliant.")
            if nb_validated > 0:
                err_msg.append("* The TSV file(s) might be empty.")
            if how.skip_object_duplicates:
                err_msg.append("* The 'SKIP OBJECTS' option was set and all objects might be in the project already.")
        diag.error("<br>".join(err_msg))
    if len(diag.classif_id_seen) > 0:
        self.check_classif(session, diag, diag.classif_id_seen)
    logger.info("Taxo found = %s", how.taxo_found)
    logger.info("Users found = %s", how.found_users)
    not_seen_fields = how.custom_mapping.all_fields.keys() - diag.cols_seen
    logger.info("For information, fields not seen in this import: %s", not_seen_fields)
    if len(not_seen_fields) > 0:
        diag.warn("Some fields configured in the project are not present in this import: {0}"
                  .format(", ".join(not_seen_fields)))
    if diag.nb_objects_without_gps > 0:
        diag.warn("{0} object(s) don't have GPS information."
                  .format(diag.nb_objects_without_gps))
    return total_row_count
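# The "not seen fields" computation above is a plain set difference between the
# project's configured mapping and the columns met in the TSV files. A minimal,
# self-contained illustration with hypothetical field names:
configured_fields = {"object_area", "object_mean", "object_depth"}
cols_seen_in_tsv = {"object_area", "object_mean"}
assert configured_fields - cols_seen_in_tsv == {"object_depth"}  # would trigger the warning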
@staticmethod
def after_import(how: ImportHow):
    """
    Reset the vignette maker once the whole bundle is imported.
    """
    how.vignette_maker = None