def main():
    """Entry point: sync file information for a project or analysis from ENA into EVAPRO.

    Exactly one of --project_accession / --analysis_accession should be given;
    otherwise a warning is logged and nothing is done.
    """
    argparse = ArgumentParser(
        description='Retrieve file information from ENA and add them to EVAPRO. '
                    'Remove extra vcf and index files in EVAPRO if they are not in ENA')
    argparse.add_argument('--project_accession', required=False, type=str,
                          help='Specify the project accession for which the retrieval should be done. '
                               'This will apply to the whole project')
    argparse.add_argument('--analysis_accession', required=False, type=str,
                          help='Specify the analysis accession for which the retrieval should be done.')
    # Bug fix: args.debug was read below but the --debug option was never declared,
    # so every invocation raised AttributeError.
    argparse.add_argument('--debug', action='store_true', default=False,
                          help='Set the script to output logging information at debug level')
    log_cfg.add_stdout_handler()
    args = argparse.parse_args()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Load the config_file from default location
    load_config()
    # Analysis accession takes precedence over project accession when both are given
    if args.analysis_accession:
        populate_files_info_from_ena(args.analysis_accession)
    elif args.project_accession:
        populate_files_info_from_ena(args.project_accession)
    else:
        logger.warning('You need to provide a project or analysis accession to use.')
def main():
    """Check that the sample names in the VCF files agree with the metadata spreadsheet."""
    parser = argparse.ArgumentParser(
        description='Compare the sample name in the VCF file and the one specified in the metadata.'
    )
    parser.add_argument('--metadata-file', required=True, dest='metadata_file',
                        help='EVA Submission Metadata Excel sheet')
    parser.add_argument('--vcf-dir', required=True, dest='vcf_dir',
                        help='Path to the directory in which submitted files can be found')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    opts = parser.parse_args()
    log_cfg.add_stdout_handler()
    if opts.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Configuration is read from its default location
    load_config()
    compare_spreadsheet_and_vcf(opts.metadata_file, opts.vcf_dir)
def main():
    """Migrate an in-progress ELOAD submission to the current cluster."""
    parser = ArgumentParser(description='Migrate an in-progress submission to the current cluster')
    parser.add_argument('--eload', required=True, type=int,
                        help='The ELOAD number of the submission to migrate')
    parser.add_argument('--project', required=False, type=str,
                        help='Optional associated project accession')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    opts = parser.parse_args()
    log_cfg.add_stdout_handler()
    if opts.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Configuration is read from its default location
    load_config()
    with EloadMigration(opts.eload) as eload:
        eload.migrate(opts.project)
def main():
    """Accession and ingest a submission's data into EVA."""
    parser = ArgumentParser(description='Accession and ingest submission data into EVA')
    parser.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission.')
    parser.add_argument('--instance', required=False, type=int, choices=range(1, 13), default=1,
                        help='The instance id to use for accessioning. Only needed if running accessioning.')
    parser.add_argument('--tasks', required=False, type=str, nargs='+',
                        default=EloadIngestion.all_tasks, choices=EloadIngestion.all_tasks,
                        help='Task or set of tasks to perform during ingestion.')
    parser.add_argument('--vep_cache_assembly_name', required=False, type=str,
                        help='The assembly name used in the VEP cache to help the script to find the correct cache '
                             'to use. This should be only used rarely when the script cannot find the VEP cache but '
                             'we know it exists.')
    parser.add_argument('--resume', action='store_true', default=False,
                        help='Whether to resume an existing Nextflow process within ingestion.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level.')
    opts = parser.parse_args()
    log_cfg.add_stdout_handler()
    if opts.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Configuration is read from its default location
    load_config()
    with EloadIngestion(opts.eload) as ingestion:
        ingestion.upgrade_config_if_needed()
        ingestion.ingest(
            instance_id=opts.instance,
            tasks=opts.tasks,
            vep_cache_assembly_name=opts.vep_cache_assembly_name,
            resume=opts.resume,
        )
def main():
    """Refresh EVAPRO metadata after a study has been ingested."""
    parser = ArgumentParser(description='Update metadata after study has been ingested')
    parser.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level.')
    opts = parser.parse_args()
    log_cfg.add_stdout_handler()
    if opts.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Configuration is read from its default location
    load_config()
    with EloadIngestion(opts.eload) as ingestion:
        ingestion.upgrade_config_if_needed()
        # Each call refreshes one piece of the post-ingestion metadata, in order
        ingestion.update_assembly_set_in_analysis()
        ingestion.insert_browsable_files()
        ingestion.update_browsable_files_with_date()
        ingestion.update_files_with_ftp_path()
        ingestion.refresh_study_browser()
        ingestion.update_loaded_assembly_in_browsable_files()
        ingestion.check_assembly_set_id_coherence()
def main():
    """Scan an FTP box for new submissions and report on what is found."""
    parser = ArgumentParser(description='Inspect FTP boxes to detect new submission. '
                                        'Provide a report that specify the project title')
    parser.add_argument('--ftp_box', required=True, type=int, choices=range(1, 21),
                        help='box number where the data should have been uploaded')
    parser.add_argument('--submitter', required=False, type=str,
                        help='the name of the directory for that submitter.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    opts = parser.parse_args()
    log_cfg.add_stdout_handler()
    if opts.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Configuration is read from its default location
    load_config()
    # A named submitter restricts the scan to that one directory
    if opts.submitter:
        inspect_one_user(opts.ftp_box, opts.submitter)
    else:
        inspect_all_users(opts.ftp_box)
def main():
    """Bring an ELOAD configuration file up to the format expected by current automation."""
    parser = ArgumentParser(
        description='Upgrade ELOAD config to a format compatible with current automation')
    parser.add_argument('--eload', required=True, type=int,
                        help='The ELOAD number for this submission')
    parser.add_argument('--analysis_alias', required=False, type=str,
                        help='Analysis alias to use')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    opts = parser.parse_args()
    log_cfg.add_stdout_handler()
    if opts.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Configuration is read from its default location
    load_config()
    with Eload(opts.eload) as eload:
        eload.upgrade_config_if_needed(opts.analysis_alias)
def main():
    """Download a reference assembly and log where the FASTA and report files were stored.

    Returns 0 on success and 1 when retrieval fails (the exception is logged).
    """
    parser = argparse.ArgumentParser(description='Download and store a reference sequence or assembly.')
    parser.add_argument("-a", "--assembly-accession", required=True,
                        help="Assembly for which the process has to be run, e.g. GCA_000002285.2")
    parser.add_argument("-s", "--species", required=True,
                        help="Species scientific name under which this accession should be stored. "
                             "This is only used to create the directory")
    parser.add_argument("-o", "--output-directory",
                        help="Base directory under which all species assemblies are stored. "
                             "Will use the one defined in config file if omitted")
    parser.add_argument("-c", "--clear", action='store_true',
                        help="Flag to clear existing data in FASTA file and starting from scratch")
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    opts = parser.parse_args()
    log_cfg.add_stdout_handler()
    if opts.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Configuration is read from its default location
    load_config()
    try:
        fasta_path, report_path = get_reference_fasta_and_report(
            opts.species, opts.assembly_accession, opts.output_directory, opts.clear
        )
        logger.info('FASTA: ' + fasta_path)
        logger.info('REPORT: ' + report_path)
    except Exception as ex:
        logger.exception(ex)
        return 1
    return 0
def setUp(self) -> None:
    """Load the test submission config and build the validation fixture."""
    load_config(os.path.join(self.resources_folder, 'submission_config.yml'))
    # Relative paths inside the config resolve from the repository root
    os.chdir(ROOT_DIR)
    self.validation = EloadValidation(2)
def main():
    """Prepare a backlog study for processing, then run the basic VCF validations."""
    parser = ArgumentParser(description='Prepare to process backlog study and validate VCFs.')
    parser.add_argument('--eload', required=True, type=int,
                        help='The ELOAD number for this submission')
    parser.add_argument('--force_config', action='store_true', default=False,
                        help='Overwrite the configuration file after backing it up.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    opts = parser.parse_args()
    log_cfg.add_stdout_handler()
    if opts.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Configuration is read from its default location
    load_config()
    preparation = EloadBacklog(opts.eload)
    preparation.fill_in_config(opts.force_config)
    preparation.report()
    # Only the file-level checks are run for backlog studies
    validation = EloadValidation(opts.eload)
    validation.validate(['assembly_check', 'vcf_check'])
    logger.info('Preparation complete, if files are valid please run ingestion as normal.')
def main():
    """Broker a validated ELOAD submission to BioSamples and ENA, then print a report."""
    argparse = ArgumentParser(description='Broker validated ELOAD to BioSamples and ENA')
    argparse.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission')
    argparse.add_argument('--debug', action='store_true', default=False,
                          help='Set the script to output logging information at debug level')
    argparse.add_argument('--vcf_files', required=False, type=str, nargs='+',
                          help='VCF files to use in the brokering')
    # Bug fix: the help text was copy-pasted from --vcf_files and wrongly described
    # this option as "VCF files".
    argparse.add_argument('--metadata_file', required=False, type=str,
                          help='Metadata file to use in the brokering')
    argparse.add_argument('--force', required=False, type=str, nargs='+', default=[],
                          choices=EloadBrokering.all_brokering_tasks,
                          help='When not set, the script only performs the tasks that were not successful. Can be '
                               'set to specify one or several tasks to force during the brokering regardless of '
                               'previous status')
    argparse.add_argument('--report', action='store_true', default=False,
                          help='Set the script to only report the results based on previously run brokering.')
    args = argparse.parse_args()
    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Load the config_file from default location
    load_config()
    # Optionally Set the valid VCF and metadata file
    brokering = EloadBrokering(args.eload, args.vcf_files, args.metadata_file)
    brokering.upgrade_config_if_needed()
    if not args.report:
        brokering.broker(brokering_tasks_to_force=args.force)
    brokering.report()
def setUp(self):
    """Load the test submission config and build the backlog fixture."""
    load_config(os.path.join(self.resources_folder, 'submission_config.yml'))
    # Relative paths inside the config resolve from the top directory
    os.chdir(self.top_dir)
    self.eload = EloadBacklog(44)
def main():
    """Optionally pull submission data from the FTP box, then initialise the ELOAD config."""
    parser = ArgumentParser(
        description='Copies data from the ftp (if specified) and search for VCF and metadata files.'
                    'then create a config file storing information about the eload')
    parser.add_argument('--ftp_box', required=False, type=int, choices=range(1, 21),
                        help='box number where the data should have been uploaded. Required to copy the data from the FTP')
    parser.add_argument('--submitter', required=False, type=str,
                        help='the name of the directory for that submitter. Required to copy the data from the FTP')
    parser.add_argument('--eload', required=True, type=int,
                        help='The ELOAD number for this submission')
    parser.add_argument('--taxid', required=False, type=str,
                        help='Override and replace the taxonomy id provided in the metadata spreadsheet.')
    parser.add_argument('--reference', required=False, type=str,
                        help='Override and replace the reference sequence accession provided in the metadata '
                             'spreadsheet.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    opts = parser.parse_args()
    log_cfg.add_stdout_handler()
    if opts.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Configuration is read from its default location
    load_config()
    eload = EloadPreparation(opts.eload)
    # Copy from the FTP only when both the box number and submitter directory are known
    if opts.ftp_box and opts.submitter:
        eload.copy_from_ftp(opts.ftp_box, opts.submitter)
    eload.detect_all(opts.taxid, opts.reference)
def main():
    """Validate an ELOAD's data and metadata, then print the validation report."""
    parser = ArgumentParser(description='Validate an ELOAD by checking the data and metadata format and semantics.')
    parser.add_argument('--eload', required=True, type=int,
                        help='The ELOAD number for this submission')
    parser.add_argument('--validation_tasks', required=False, type=str, nargs='+',
                        default=EloadValidation.all_validation_tasks,
                        choices=EloadValidation.all_validation_tasks,
                        help='task or set of tasks to perform during validation')
    parser.add_argument('--set_as_valid', action='store_true', default=False,
                        help='Set the script to consider all validation tasks performed as valid in the final '
                             'evaluation. This does not affect the actual report but only change the final '
                             'evaluation')
    parser.add_argument('--merge_per_analysis', action='store_true', default=False,
                        help='Whether to merge vcf files per analysis if possible.')
    parser.add_argument('--report', action='store_true', default=False,
                        help='Set the script to only report the results based on previously run validation.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    opts = parser.parse_args()
    log_cfg.add_stdout_handler()
    if opts.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Configuration is read from its default location
    load_config()
    with EloadValidation(opts.eload) as eload:
        eload.upgrade_config_if_needed()
        if not opts.report:
            eload.validate(opts.validation_tasks, opts.set_as_valid, opts.merge_per_analysis)
        eload.report()
def setUp(self) -> None:
    """Load the test config, move to the repo root and build the validation fixture."""
    load_config(os.path.join(self.resources_folder, 'submission_config.yml'))
    # Relative paths inside the config resolve from the repository root
    os.chdir(ROOT_DIR)
    self.validation = EloadValidation(2)
    # Snapshot the config so each test can restore it afterwards
    self.original_cfg = deepcopy(self.validation.eload_cfg.content)
def setUp(self):
    """Load the test config and build the migration fixture."""
    load_config(os.path.join(self.resources_folder, 'submission_config.yml'))
    # Relative paths inside the config resolve from the top directory
    os.chdir(self.top_dir)
    self.eload = EloadMigration(66)
    # Snapshot the config so each test can restore it afterwards
    self.original_config = deepcopy(self.eload.eload_cfg.content)
def setUp(self):
    """Load the test config and create an ingestion fixture with the mongo lookup patched out."""
    load_config(os.path.join(self.resources_folder, 'submission_config.yml'))
    # Relative paths inside the config resolve from the top directory
    os.chdir(self.top_dir)
    # Patch out the profile lookup while the ingestion object is constructed
    with patch('eva_submission.eload_ingestion.get_mongo_uri_for_eva_profile', autospec=True):
        self.eload = EloadIngestion(33)
    # Snapshot the config so each test can restore it afterwards
    self.original_cfg = deepcopy(self.eload.eload_cfg.content)
def setUp(self) -> None:
    """Load the brokering test resources and build synchronous and asynchronous ENA uploaders."""
    resources = os.path.join(ROOT_DIR, 'tests', 'resources')
    brokering = os.path.join(resources, 'brokering')
    load_config(os.path.join(resources, 'submission_config.yml'))
    metadata = os.path.join(brokering, 'metadata_sheet.xlsx')
    self.uploader = ENAUploader('ELOAD_1', metadata, brokering)
    self.uploader_async = ENAUploaderAsync('ELOAD_1', metadata, brokering)
def setUp(self) -> None:
    """Load the test config and build brokering fixtures in a known-clean state."""
    load_config(os.path.join(self.resources_folder, 'submission_config.yml'))
    # Relative paths inside the config resolve from the top directory
    os.chdir(self.top_dir)
    self.eload = EloadBrokering(3)
    # Ensure we've cleared any past brokering status
    self.eload.eload_cfg.pop('brokering', 'Biosamples')
    self.existing_eload = EloadBrokering(4)
def setUp(self):
    """Load the test config and lay out the VEP cache and installation directories."""
    load_config(os.path.join(self.resources_folder, 'submission_config.yml'))
    # Relative paths inside the config resolve from the top directory
    os.chdir(self.top_dir)
    # Create the VEP cache plus two fake VEP release installations
    os.makedirs(cfg['vep_cache_path'], exist_ok=True)
    for release_dir in ('ensembl-vep-release-104/vep', 'ensembl-vep-release-97/vep'):
        os.makedirs(os.path.join(cfg['vep_path'], release_dir), exist_ok=True)
def setUp(self):
    """Load the test config and prepare the eload and updated-config fixtures."""
    load_config(os.path.join(self.resources_folder, 'submission_config.yml'))
    # Relative paths inside the config resolve from the repository root
    os.chdir(ROOT_DIR)
    self.eload = Eload(55)
    self.updated_config = EloadConfig(os.path.join(self.eload.eload_dir, 'updated_config.yml'))
    # Snapshot both configs so each test can restore them afterwards
    self.original_cfg = deepcopy(self.eload.eload_cfg.content)
    self.original_updated_cfg = deepcopy(self.updated_config.content)
    self.updated_config.set('version', value=__version__)
def setUp(self):
    """Load the test config and wire up original/updated config fixtures plus the log file path."""
    load_config(os.path.join(self.resources_folder, 'submission_config.yml'))
    # Relative paths inside the config resolve from the repository root
    os.chdir(ROOT_DIR)
    self.eload = Eload(55)
    self.original_config = EloadConfig(os.path.join(self.eload.eload_dir, 'original_config.yml'))
    self.updated_config = EloadConfig(os.path.join(self.eload.eload_dir, 'updated_config.yml'))
    # Start the eload from a copy of the original config
    self.eload.eload_cfg.content = deepcopy(self.original_config.content)
    self.original_updated_cfg = deepcopy(self.updated_config.content)
    self.updated_config.set('version', value=__version__)
    # Get the log file name
    self.logfile_name = os.path.join(self.eload.eload_dir, str(self.eload.eload) + "_submission.log")
def main():
    """Create the named variant-warehouse database if it does not already exist."""
    argparse = ArgumentParser(
        description='Create a database with the provided name if it does not exist already')
    # Bug fix: the database name is a string; type=int rejected every real database name.
    argparse.add_argument('--database_name', required=True, type=str,
                          help='The database name')
    argparse.add_argument('--debug', action='store_true', default=False,
                          help='Set the script to output logging information at debug level')
    log_cfg.add_stdout_handler()
    args = argparse.parse_args()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Load the config_file from default location
    load_config()
    provision_new_database_for_variant_warehouse(args.database_name)
def setUp(self):
    """Point the global configuration at the test submission config."""
    load_config(os.path.join(self.resources_folder, 'submission_config.yml'))
def main():
    """Prepare a backlog study and validate it, sharing config state between the two stages."""
    default_tasks = ['aggregation_check', 'assembly_check', 'vcf_check']
    # Checks that are not run for backlog studies are marked as forced instead
    forced_validation_tasks = ['metadata_check', 'sample_check']
    parser = ArgumentParser(description='Prepare to process backlog study and validate VCFs.')
    parser.add_argument('--eload', required=True, type=int,
                        help='The ELOAD number for this submission')
    parser.add_argument('--project_accession', required=False, type=str,
                        help='Set this project instead of the one associated with this eload. '
                             'Useful when the association is not set in the database. '
                             'The project needs to exists in the DB.')
    parser.add_argument('--analysis_accessions', required=False, type=str, nargs='+',
                        help='Set these analysis instead of the ones associated with the project. '
                             'Useful when wanting to use a subset of the analysis. '
                             'The analyses need to exists in the DB.')
    parser.add_argument('--force_config', action='store_true', default=False,
                        help='Overwrite the configuration file after backing it up.')
    parser.add_argument('--keep_config', action='store_true', default=False,
                        help='Keep the configuration file as it is and only run the validation on it.')
    parser.add_argument('--validation_tasks', required=False, type=str, nargs='+',
                        default=default_tasks, choices=default_tasks,
                        help='task or set of tasks to perform during validation')
    parser.add_argument('--merge_per_analysis', action='store_true', default=False,
                        help='Whether to merge vcf files per analysis if possible.')
    parser.add_argument('--set_as_valid', action='store_true', default=False,
                        help='Set the script to consider all validation tasks performed as valid in the final '
                             'evaluation. This does not affect the actual report but only change the final '
                             'evaluation')
    parser.add_argument('--report', action='store_true', default=False,
                        help='Set the script to only report the results based on previously run preparation.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    opts = parser.parse_args()
    log_cfg.add_stdout_handler()
    if opts.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Configuration is read from its default location
    load_config()
    with EloadBacklog(opts.eload, project_accession=opts.project_accession,
                      analysis_accessions=opts.analysis_accessions) as preparation:
        # Pass the eload config object to validation so that the two objects share the same state
        with EloadValidation(opts.eload, preparation.eload_cfg) as validation:
            if not opts.report and not opts.keep_config:
                preparation.fill_in_config(opts.force_config)
            if not opts.report:
                validation.validate(opts.validation_tasks)
                # Also mark the other validation tasks as force so they are all passable
                if opts.set_as_valid:
                    forced_validation_tasks = validation.all_validation_tasks
                for forced_task in forced_validation_tasks:
                    validation.eload_cfg.set('validation', forced_task, 'forced', value=True)
                validation.mark_valid_files_and_metadata(opts.merge_per_analysis)
                if opts.merge_per_analysis:
                    preparation.copy_valid_config_to_brokering_after_merge()
            preparation.report()
            validation.report()
    logger.info('Preparation complete, if files are valid please run ingestion as normal.')
def main():
    """Accession and ingest submission data into EVA."""
    parser = ArgumentParser(description='Accession and ingest submission data into EVA')
    parser.add_argument('--eload', required=True, type=int,
                        help='The ELOAD number for this submission.')
    parser.add_argument('--instance', required=False, type=int, choices=range(1, 13),
                        help='The instance id to use for accessioning. Only needed if running accessioning.')
    # TODO infer aggregation from vcf files, VEP version & cache version from species
    parser.add_argument('--aggregation', required=False, type=str.lower, choices=['basic', 'none'],
                        help='The aggregation type (case insensitive).')
    parser.add_argument('--vep_version', required=False, type=int,
                        help='VEP version to use for annotation. Only needed if running variant load.')
    parser.add_argument('--vep_cache_version', required=False, type=int,
                        help='VEP cache version to use for annotation. Only needed if running variant load.')
    parser.add_argument('--db_name', required=False, type=str,
                        help='Name of existing variant database in MongoDB. Only needed if adding a new database.')
    parser.add_argument('--tasks', required=False, type=str, nargs='+',
                        default=EloadIngestion.all_tasks, choices=EloadIngestion.all_tasks,
                        help='Task or set of tasks to perform during ingestion.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level.')
    opts = parser.parse_args()
    log_cfg.add_stdout_handler()
    if opts.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Configuration is read from its default location
    load_config()
    ingestion = EloadIngestion(opts.eload)
    ingestion.ingest(aggregation=opts.aggregation, instance_id=opts.instance,
                     vep_version=opts.vep_version, vep_cache_version=opts.vep_cache_version,
                     db_name=opts.db_name, tasks=opts.tasks)
def main():
    """Accession and ingest submission data into EVA, validating the VEP/annotation options.

    Raises ArgumentError when --skip_annotation is combined with explicit VEP versions,
    or when only one of --vep_version / --vep_cache_version is supplied.
    """
    parser = ArgumentParser(description='Accession and ingest submission data into EVA')
    parser.add_argument('--eload', required=True, type=int,
                        help='The ELOAD number for this submission.')
    parser.add_argument('--instance', required=False, type=int, choices=range(1, 13),
                        help='The instance id to use for accessioning. Only needed if running accessioning.')
    # TODO infer aggregation from vcf files, VEP version & cache version from species
    parser.add_argument('--aggregation', required=False, type=str.lower, choices=['basic', 'none'],
                        help='The aggregation type (case insensitive).')
    # Keep handles on these actions so the ArgumentError raises below can reference them
    vep_version_action = parser.add_argument(
        '--vep_version', required=False, type=int,
        help='VEP version to use for annotation. Only needed if running variant load.')
    parser.add_argument('--vep_cache_version', required=False, type=int,
                        help='VEP cache version to use for annotation. Only needed if running variant load.')
    parser.add_argument('--db_name', required=False, type=str,
                        help='Name of an existing variant database in MongoDB. Submission should have a single '
                             'assembly accession. Only needed if adding a new database. ex: db_name')
    parser.add_argument('--db_name_mapping', required=False, type=str, nargs='+',
                        help='List with the mapping for assembly accession and existing variant database in MongoDB.'
                             'Only needed if adding a new databases.'
                             'ex: GCA_000000001.1,db_name1 GCA_000000002.2,db_name2')
    parser.add_argument('--tasks', required=False, type=str, nargs='+',
                        default=EloadIngestion.all_tasks, choices=EloadIngestion.all_tasks,
                        help='Task or set of tasks to perform during ingestion.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level.')
    skip_annotation_action = parser.add_argument(
        '--skip_annotation', action='store_true', default=False,
        help='Flag to skip VEP annotation running variant load.')
    opts = parser.parse_args()
    log_cfg.add_stdout_handler()
    if opts.debug:
        log_cfg.set_log_level(logging.DEBUG)
    # Skipping annotation is incompatible with requesting explicit VEP versions
    if opts.skip_annotation is True and (opts.vep_version is not None or opts.vep_cache_version is not None):
        raise ArgumentError(
            skip_annotation_action,
            "Can't provide both \"--skip_annotation\" and \"--vep_version and --vep_cache_version\". Remove VEP/Cache versions or the skip flag and try again."
        )
    # The two VEP version options must be given together, or both omitted for auto-detection
    if (opts.vep_version is None) != (opts.vep_cache_version is None):
        raise ArgumentError(
            vep_version_action,
            "Both \"--vep_version and --vep_cache_version\" should be specified together. Skip both arguments for auto-detection of these versions."
        )
    # Configuration is read from its default location
    load_config()
    ingestion = EloadIngestion(opts.eload)
    ingestion.upgrade_config_if_needed()
    ingestion.ingest(aggregation=opts.aggregation, instance_id=opts.instance,
                     vep_version=opts.vep_version, vep_cache_version=opts.vep_cache_version,
                     skip_annotation=opts.skip_annotation, db_name=opts.db_name,
                     db_name_mapping=opts.db_name_mapping, tasks=opts.tasks)