def main():
    """Prepare a backlog study for processing and run VCF validation on it."""
    parser = ArgumentParser(description='Prepare to process backlog study and validate VCFs.')
    parser.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission')
    parser.add_argument('--force_config', action='store_true', default=False,
                        help='Overwrite the configuration file after backing it up.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    # Fill in the eload configuration from the database, then report what was found
    preparation = EloadBacklog(args.eload)
    preparation.fill_in_config(args.force_config)
    preparation.report()

    # Run only the data checks (metadata was already brokered for backlog studies)
    validation = EloadValidation(args.eload)
    validation_tasks = ['assembly_check', 'vcf_check']
    validation.validate(validation_tasks)

    logger.info('Preparation complete, if files are valid please run ingestion as normal.')
def main():
    """Accession and ingest submission data into EVA."""
    parser = ArgumentParser(description='Accession and ingest submission data into EVA')
    parser.add_argument('--eload', required=True, type=int,
                        help='The ELOAD number for this submission.')
    parser.add_argument('--instance', required=False, type=int, choices=range(1, 13), default=1,
                        help='The instance id to use for accessioning. Only needed if running accessioning.')
    parser.add_argument('--tasks', required=False, type=str, nargs='+',
                        default=EloadIngestion.all_tasks, choices=EloadIngestion.all_tasks,
                        help='Task or set of tasks to perform during ingestion.')
    parser.add_argument('--vep_cache_assembly_name', required=False, type=str,
                        help='The assembly name used in the VEP cache to help the script to find the correct cache '
                             'to use. This should be only used rarely when the script cannot find the VEP cache but '
                             'we know it exists.')
    parser.add_argument('--resume', action='store_true', default=False,
                        help='Whether to resume an existing Nextflow process within ingestion.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level.')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    # The context manager persists the eload config on exit
    with EloadIngestion(args.eload) as ingestion:
        ingestion.upgrade_config_if_needed()
        ingestion.ingest(
            instance_id=args.instance,
            tasks=args.tasks,
            vep_cache_assembly_name=args.vep_cache_assembly_name,
            resume=args.resume
        )
def main():
    """Populate EVAPRO file records from ENA for a project or an analysis.

    Exactly one of --project_accession or --analysis_accession should be
    provided; the analysis accession takes precedence when both are set.
    """
    parser = ArgumentParser(
        description='Retrieve file information from ENA and add them to EVAPRO. '
                    'Remove extra vcf and index files in EVAPRO if they are not in ENA')
    parser.add_argument('--project_accession', required=False, type=str,
                        help='Specify the project accession for which the retrieval should be done. '
                             'This will apply to the whole project')
    parser.add_argument('--analysis_accession', required=False, type=str,
                        help='Specify the analysis accession for which the retrieval should be done.')
    # Bug fix: args.debug was checked below but --debug was never declared,
    # which made every run crash with an AttributeError.
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    log_cfg.add_stdout_handler()
    args = parser.parse_args()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    # The analysis accession is more specific, so it takes precedence
    if args.analysis_accession:
        populate_files_info_from_ena(args.analysis_accession)
    elif args.project_accession:
        populate_files_info_from_ena(args.project_accession)
    else:
        logger.warning('You need to provide a project or analysis accession to use.')
def main():
    """Run the post-ingestion metadata update steps for a submission."""
    parser = ArgumentParser(description='Update metadata after study has been ingested')
    parser.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level.')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    # Apply every metadata refresh step in order; the context manager saves the config on exit
    with EloadIngestion(args.eload) as ingestion:
        ingestion.upgrade_config_if_needed()
        ingestion.update_assembly_set_in_analysis()
        ingestion.insert_browsable_files()
        ingestion.update_browsable_files_with_date()
        ingestion.update_files_with_ftp_path()
        ingestion.refresh_study_browser()
        ingestion.update_loaded_assembly_in_browsable_files()
        ingestion.check_assembly_set_id_coherence()
def main():
    """Upgrade an ELOAD configuration file to the current automation format."""
    parser = ArgumentParser(description='Upgrade ELOAD config to a format compatible with current automation')
    parser.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission')
    parser.add_argument('--analysis_alias', required=False, type=str, help='Analysis alias to use')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    with Eload(args.eload) as eload:
        eload.upgrade_config_if_needed(args.analysis_alias)
def main():
    """Optionally copy submitted data from the FTP, then build the eload config."""
    parser = ArgumentParser(description='Copies data from the ftp (if specified) and search for VCF and metadata files.'
                                        'then create a config file storing information about the eload')
    parser.add_argument('--ftp_box', required=False, type=int, choices=range(1, 21),
                        help='box number where the data should have been uploaded. Required to copy the data from the FTP')
    parser.add_argument('--submitter', required=False, type=str,
                        help='the name of the directory for that submitter. Required to copy the data from the FTP')
    parser.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission')
    parser.add_argument('--taxid', required=False, type=str,
                        help='Override and replace the taxonomy id provided in the metadata spreadsheet.')
    parser.add_argument('--reference', required=False, type=str,
                        help='Override and replace the reference sequence accession provided in the metadata '
                             'spreadsheet.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    eload = EloadPreparation(args.eload)
    # Only copy from the FTP when both the box number and submitter directory are known
    if args.ftp_box and args.submitter:
        eload.copy_from_ftp(args.ftp_box, args.submitter)
    eload.detect_all(args.taxid, args.reference)
def main():
    """Migrate an in-progress submission to the current cluster."""
    parser = ArgumentParser(description='Migrate an in-progress submission to the current cluster')
    parser.add_argument('--eload', required=True, type=int,
                        help='The ELOAD number of the submission to migrate')
    parser.add_argument('--project', required=False, type=str,
                        help='Optional associated project accession')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    with EloadMigration(args.eload) as eload:
        eload.migrate(args.project)
def main():
    """Create and populate the clustering and release tracking table.

    Tasks:
      - ``create_and_fill_table``: create the table, then fill it from the
        previous release inventory and from remapping information.
      - ``fill_rs_count``: update the rs counts for one taxonomy
        (requires --taxonomy).
    """
    parser = argparse.ArgumentParser(
        description='Create and load the clustering and release tracking table', add_help=False)
    parser.add_argument("--private-config-xml-file", help="ex: /path/to/eva-maven-settings.xml", required=True)
    parser.add_argument("--release-version", help="version of the release", type=int, required=True)
    parser.add_argument("--reference-directory",
                        help="Directory where the reference genomes exists or should be downloaded", required=True)
    parser.add_argument("--taxonomy", help="taxonomy id for which rs count needs to be updated",
                        type=int, required=False)
    parser.add_argument('--tasks', required=False, type=str, nargs='+', default=all_tasks, choices=all_tasks,
                        help='Task or set of tasks to perform.')
    parser.add_argument('--help', action='help', help='Show this help message and exit')
    args = parser.parse_args()
    logging_config.add_stdout_handler()

    # Dead-code fix: the previous "if not args.tasks: args.tasks = all_tasks" fallback was
    # unreachable because argparse already applies default=all_tasks and nargs='+' rejects
    # an empty explicit list.
    if 'create_and_fill_table' in args.tasks:
        create_table(args.private_config_xml_file)
        fill_in_from_previous_inventory(args.private_config_xml_file, args.release_version)
        fill_in_table_from_remapping(args.private_config_xml_file, args.release_version,
                                     args.reference_directory)
    if 'fill_rs_count' in args.tasks:
        if not args.taxonomy:
            raise Exception("For running task 'fill_rs_count', it is mandatory to provide taxonomy arguments")
        mongo_source_uri = get_mongo_uri_for_eva_profile('production', args.private_config_xml_file)
        mongo_source = MongoDatabase(uri=mongo_source_uri, db_name="eva_accession_sharded")
        fill_num_rs_id_for_taxonomy_and_assembly(mongo_source, args.private_config_xml_file,
                                                 args.release_version, args.taxonomy,
                                                 args.reference_directory)
def main():
    """Compare sample names between submitted VCF files and the metadata sheet."""
    parser = argparse.ArgumentParser(
        description='Compare the sample name in the VCF file and the one specified in the metadata.')
    parser.add_argument('--metadata-file', required=True, dest='metadata_file',
                        help='EVA Submission Metadata Excel sheet')
    parser.add_argument('--vcf-dir', required=True, dest='vcf_dir',
                        help='Path to the directory in which submitted files can be found')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    compare_spreadsheet_and_vcf(args.metadata_file, args.vcf_dir)
def main():
    """Validate an ELOAD's data and metadata, then print the validation report."""
    parser = ArgumentParser(description='Validate an ELOAD by checking the data and metadata format and semantics.')
    parser.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission')
    parser.add_argument('--validation_tasks', required=False, type=str, nargs='+',
                        default=EloadValidation.all_validation_tasks,
                        choices=EloadValidation.all_validation_tasks,
                        help='task or set of tasks to perform during validation')
    parser.add_argument('--set_as_valid', action='store_true', default=False,
                        help='Set the script to consider all validation tasks performed as valid in the final '
                             'evaluation. This does not affect the actual report but only change the final '
                             'evaluation')
    parser.add_argument('--merge_per_analysis', action='store_true', default=False,
                        help='Whether to merge vcf files per analysis if possible.')
    parser.add_argument('--report', action='store_true', default=False,
                        help='Set the script to only report the results based on previously run validation.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    with EloadValidation(args.eload) as eload:
        eload.upgrade_config_if_needed()
        # With --report we skip the validation itself and only show previous results
        if not args.report:
            eload.validate(args.validation_tasks, args.set_as_valid, args.merge_per_analysis)
        eload.report()
def main():
    """Broker a validated ELOAD to BioSamples and ENA, then print the report."""
    parser = ArgumentParser(description='Broker validated ELOAD to BioSamples and ENA')
    parser.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    parser.add_argument('--vcf_files', required=False, type=str, nargs='+',
                        help='VCF files to use in the brokering')
    # Bug fix: the help text was copy-pasted from --vcf_files and wrongly
    # described this option as "VCF files".
    parser.add_argument('--metadata_file', required=False, type=str,
                        help='Metadata file to use in the brokering')
    parser.add_argument('--force', required=False, type=str, nargs='+', default=[],
                        choices=EloadBrokering.all_brokering_tasks,
                        help='When not set, the script only performs the tasks that were not successful. Can be '
                             'set to specify one or several tasks to force during the brokering regardless of '
                             'previous status')
    parser.add_argument('--report', action='store_true', default=False,
                        help='Set the script to only report the results based on previously run brokering.')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    # Optionally Set the valid VCF and metadata file
    brokering = EloadBrokering(args.eload, args.vcf_files, args.metadata_file)
    brokering.upgrade_config_if_needed()
    if not args.report:
        brokering.broker(brokering_tasks_to_force=args.force)
    brokering.report()
def main():
    """Run the variant remapping pipeline, or list the assemblies awaiting it."""
    parser = ArgumentParser(
        description='Run entire variant remapping pipeline for a given assembly and taxonomy.')
    parser.add_argument('--assembly', help='Assembly to be process')
    parser.add_argument('--taxonomy_id', help='Taxonomy id to be process')
    parser.add_argument('--list_jobs', help='Display the list of jobs to be run.',
                        action='store_true', default=False)
    parser.add_argument('--resume', help='If a process has been run already this will resume it.',
                        action='store_true', default=False)
    args = parser.parse_args()
    load_config()
    if args.list_jobs:
        RemappingJob().list_assemblies_to_process()
    elif args.assembly and args.taxonomy_id:
        logging_config.add_stdout_handler()
        RemappingJob().process_one_assembly(args.assembly, args.taxonomy_id, args.resume)
    else:
        # Bug fix: argparse.ArgumentError requires (argument, message); calling it with a
        # single positional raised a TypeError instead of the intended error. parser.error()
        # reports the problem through argparse's standard usage/exit mechanism.
        parser.error('One of (--assembly and --taxonomy_id) or --list_jobs options is required')
def main():
    """Generate a custom assembly report and FASTA for the given assembly."""
    parser = argparse.ArgumentParser(description='Generate custom assembly report for a given assembly',
                                     add_help=False)
    parser.add_argument("-a", "--assembly-accession",
                        help="Assembly for which the process has to be run, e.g. GCA_000002315.3",
                        required=True)
    parser.add_argument("-f", "--fasta-file",
                        help="Path to the fasta file containing the assembly", required=True)
    parser.add_argument("-r", "--report-file",
                        help="Path to the assembly report file containing the assembly", required=True)
    parser.add_argument('--help', action='help', help='Show this help message and exit')
    args = parser.parse_args()

    load_config()
    logging_config.add_stdout_handler()

    assembly = CustomAssemblyFromDatabase(args.assembly_accession, args.fasta_file, args.report_file)
    assembly.generate_assembly_report()
    assembly.generate_fasta()
def main():
    """Inspect an FTP box (optionally a single submitter) for new submissions."""
    parser = ArgumentParser(description='Inspect FTP boxes to detect new submission. '
                                        'Provide a report that specify the project title')
    parser.add_argument('--ftp_box', required=True, type=int, choices=range(1, 21),
                        help='box number where the data should have been uploaded')
    parser.add_argument('--submitter', required=False, type=str,
                        help='the name of the directory for that submitter.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    # Either inspect one submitter's directory or the whole box
    if args.submitter:
        inspect_one_user(args.ftp_box, args.submitter)
    else:
        inspect_all_users(args.ftp_box)
def main():
    """Download a reference assembly and log where its FASTA and report live.

    Returns 0 on success, 1 on failure (usable as a process exit code).
    """
    parser = argparse.ArgumentParser(description='Download and store a reference sequence or assembly.')
    parser.add_argument("-a", "--assembly-accession",
                        help="Assembly for which the process has to be run, e.g. GCA_000002285.2",
                        required=True)
    parser.add_argument("-s", "--species",
                        help="Species scientific name under which this accession should be stored. "
                             "This is only used to create the directory",
                        required=True)
    parser.add_argument("-o", "--output-directory",
                        help="Base directory under which all species assemblies are stored. "
                             "Will use the one defined in config file if omitted")
    parser.add_argument("-c", "--clear",
                        help="Flag to clear existing data in FASTA file and starting from scratch",
                        action='store_true')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    try:
        assembly_fasta_path, assembly_report_path = get_reference_fasta_and_report(
            args.species, args.assembly_accession, args.output_directory, args.clear)
    except Exception as ex:
        logger.exception(ex)
        return 1
    logger.info('FASTA: ' + assembly_fasta_path)
    logger.info('REPORT: ' + assembly_report_path)
    return 0
def main():
    """Deprecate the given contigs for a study within an assembly."""
    parser = ArgumentParser()
    parser.add_argument('--settings_xml_file', required=False,
                        help='File containing the connection to the database')
    parser.add_argument('--study', required=True,
                        help='The study in the assembly to correct')
    parser.add_argument('--assembly', required=True,
                        help='The assembly accession of the entities that needs to be changed')
    parser.add_argument('--contigs', nargs='+',
                        help='The contigs to modify. they should be provided as they appeared in the record')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    deprecate(args.settings_xml_file, args.study, args.assembly, args.contigs)
    logger.info("Finished successfully.")
def main():
    """Deprecate the given contigs in a variant warehouse database."""
    parser = ArgumentParser()
    parser.add_argument('--settings_xml_file', required=False,
                        help='File containing the connection to the database')
    parser.add_argument('--database_name', required=True,
                        help='The name of the database from the variant warehouse')
    parser.add_argument('--contigs', nargs='+',
                        help='The contigs to modify. they should be provided as they appeared in the record')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    deprecate(args.settings_xml_file, args.database_name, args.contigs)
    logger.info("Finished successfully.")
def main():
    """Build the file listing already-processed Covid-19 DP analyses."""
    parser = argparse.ArgumentParser(
        description='Download analyses for processing from the Covid-19 DP project',
        formatter_class=argparse.RawTextHelpFormatter, add_help=False)
    parser.add_argument("--project", default='PRJEB45554', required=False,
                        help="project from which analyses needs to be downloaded")
    parser.add_argument("--batch-size", default=100000, required=False,
                        help="batch size of ENA analyses download")
    parser.add_argument("--processed-file-directory", required=True,
                        help="full path to the directory where all the processed files are present")
    parser.add_argument("--target-file", required=True,
                        help="full path to the target file that will be created")
    parser.add_argument("--field", choices=['run_ref', 'analysis_accession'], required=True,
                        help="field whose names has been used as file name and should be used for lookup")
    args = parser.parse_args()

    logging_config.add_stdout_handler()
    prepare_processed_analyses_file(args.project, args.batch_size, args.processed_file_directory,
                                    args.target_file, args.field)
def main():
    """Warn about EVA/dbSNP contigs that are not genbank accessions in the report.

    Returns 0 (warnings are only logged, never fatal).
    """
    parser = ArgumentParser()
    parser.add_argument("--private-config-xml-file", help="ex: /path/to/eva-maven-settings.xml", required=True)
    parser.add_argument("--assembly_accession", help="GCA_000003205.1", required=True)
    parser.add_argument("--assembly_report_path", help="path to the report to check contigs against", required=True)
    args = parser.parse_args()

    # Genbank accessions present in the assembly report, keyed by accession
    genbank_to_row = get_contig_genbank(args.assembly_report_path)
    log_cfg.add_stdout_handler()

    with psycopg2.connect(get_pg_metadata_uri_for_eva_profile("development", args.private_config_xml_file),
                          user="******") as pg_conn:
        eva_contigs, dbSNP_contigs = get_contigs_accessions_for(pg_conn, args.assembly_accession)
        for contig in eva_contigs:
            if contig not in genbank_to_row:
                logger.warning(
                    'For assembly {} contig {} found in EVA is not genbank in the report {}'.format(
                        args.assembly_accession, contig, args.assembly_report_path))
        for contig in dbSNP_contigs:
            if contig not in genbank_to_row:
                logger.warning(
                    'For assembly {} contig {} found in dbSNP is not genbank in the report {}'.format(
                        args.assembly_accession, contig, args.assembly_report_path))
    return 0
def main():
    """Provision a new variant warehouse database if it does not already exist."""
    parser = ArgumentParser(
        description='Create a database with the provided name if it does not exist already')
    # Bug fix: a database name is a string, not an integer; type=int made argparse
    # reject every real database name.
    parser.add_argument('--database_name', required=True, type=str, help='The database name')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    log_cfg.add_stdout_handler()
    args = parser.parse_args()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    provision_new_database_for_variant_warehouse(args.database_name)
def main():
    """Replace assembly GCA_015227675.1 entities with GCA_015227675.2 in Mongo."""
    parser = argparse.ArgumentParser(
        description='Correct assembly error in assembly GCA_015227675.1 by replacing it GCA_015227675.2',
        add_help=False)
    parser.add_argument("--mongo-source-uri",
                        help="Mongo Source URI (ex: mongodb://user:@mongos-source-host:27017/admin)",
                        required=True)
    parser.add_argument("--mongo-source-secrets-file",
                        help="Full path to the Mongo Source secrets file (ex: /path/to/mongo/source/secret)",
                        required=True)
    parser.add_argument("--batch-size", help="number of document processed at once",
                        required=False, type=int, default=1000)
    parser.add_argument("--debug", help="Set the script to output debug message",
                        default=False, action='store_true')
    args = parser.parse_args()

    if args.debug:
        logging_config.add_stdout_handler(logging.DEBUG)
    else:
        logging_config.add_stdout_handler()

    mongo_source = MongoDatabase(uri=args.mongo_source_uri,
                                 secrets_file=args.mongo_source_secrets_file,
                                 db_name="eva_accession_sharded")
    replace_variant_entities(mongo_source, batch_size=int(args.batch_size))
    # Drop the reference once the work is done
    del mongo_source
def main():
    """Prepare a backlog study, fill in its configuration and validate its VCFs."""
    validation_tasks = ['aggregation_check', 'assembly_check', 'vcf_check']
    forced_validation_tasks = ['metadata_check', 'sample_check']
    parser = ArgumentParser(description='Prepare to process backlog study and validate VCFs.')
    parser.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission')
    parser.add_argument('--project_accession', required=False, type=str,
                        help='Set this project instead of the one associated with this eload. '
                             'Useful when the association is not set in the database. '
                             'The project needs to exists in the DB.')
    parser.add_argument('--analysis_accessions', required=False, type=str, nargs='+',
                        help='Set these analysis instead of the ones associated with the project. '
                             'Useful when wanting to use a subset of the analysis. '
                             'The analyses need to exists in the DB.')
    parser.add_argument('--force_config', action='store_true', default=False,
                        help='Overwrite the configuration file after backing it up.')
    parser.add_argument('--keep_config', action='store_true', default=False,
                        help='Keep the configuration file as it is and only run the validation on it.')
    parser.add_argument('--validation_tasks', required=False, type=str, nargs='+',
                        default=validation_tasks, choices=validation_tasks,
                        help='task or set of tasks to perform during validation')
    parser.add_argument('--merge_per_analysis', action='store_true', default=False,
                        help='Whether to merge vcf files per analysis if possible.')
    parser.add_argument('--set_as_valid', action='store_true', default=False,
                        help='Set the script to consider all validation tasks performed as valid in the final '
                             'evaluation. This does not affect the actual report but only change the final '
                             'evaluation')
    parser.add_argument('--report', action='store_true', default=False,
                        help='Set the script to only report the results based on previously run preparation.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    with EloadBacklog(args.eload, project_accession=args.project_accession,
                      analysis_accessions=args.analysis_accessions) as preparation:
        # Pass the eload config object to validation so that the two objects share the same state
        with EloadValidation(args.eload, preparation.eload_cfg) as validation:
            if not args.report and not args.keep_config:
                preparation.fill_in_config(args.force_config)
            if not args.report:
                validation.validate(args.validation_tasks)
                # Also mark the other validation tasks as force so they are all passable
                if args.set_as_valid:
                    forced_validation_tasks = validation.all_validation_tasks
                for validation_task in forced_validation_tasks:
                    validation.eload_cfg.set('validation', validation_task, 'forced', value=True)
                validation.mark_valid_files_and_metadata(args.merge_per_analysis)
                if args.merge_per_analysis:
                    preparation.copy_valid_config_to_brokering_after_merge()
            preparation.report()
            validation.report()
    logger.info('Preparation complete, if files are valid please run ingestion as normal.')
def main():
    """Accession and ingest submission data into EVA."""
    parser = ArgumentParser(description='Accession and ingest submission data into EVA')
    parser.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission.')
    parser.add_argument('--instance', required=False, type=int, choices=range(1, 13),
                        help='The instance id to use for accessioning. Only needed if running accessioning.')
    # TODO infer aggregation from vcf files, VEP version & cache version from species
    parser.add_argument('--aggregation', required=False, type=str.lower, choices=['basic', 'none'],
                        help='The aggregation type (case insensitive).')
    # Keep the action objects so they can be referenced in ArgumentError below
    action_vep_version = parser.add_argument(
        '--vep_version', required=False, type=int,
        help='VEP version to use for annotation. Only needed if running variant load.')
    parser.add_argument('--vep_cache_version', required=False, type=int,
                        help='VEP cache version to use for annotation. Only needed if running variant load.')
    parser.add_argument('--db_name', required=False, type=str,
                        help='Name of an existing variant database in MongoDB. Submission should have a single '
                             'assembly accession. Only needed if adding a new database. ex: db_name')
    parser.add_argument('--db_name_mapping', required=False, type=str, nargs='+',
                        help='List with the mapping for assembly accession and existing variant database in MongoDB.'
                             'Only needed if adding a new databases.'
                             'ex: GCA_000000001.1,db_name1 GCA_000000002.2,db_name2')
    parser.add_argument('--tasks', required=False, type=str, nargs='+', default=EloadIngestion.all_tasks,
                        choices=EloadIngestion.all_tasks,
                        help='Task or set of tasks to perform during ingestion.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level.')
    action_skip_annotation = parser.add_argument(
        '--skip_annotation', action='store_true', default=False,
        help='Flag to skip VEP annotation running variant load.')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # --skip_annotation and explicit VEP versions are mutually exclusive
    if args.skip_annotation is True and (args.vep_version is not None or args.vep_cache_version is not None):
        raise ArgumentError(
            action_skip_annotation,
            "Can't provide both \"--skip_annotation\" and \"--vep_version and --vep_cache_version\". Remove VEP/Cache versions or the skip flag and try again.")
    # VEP version and cache version must be given together (or both omitted for auto-detection)
    if (args.vep_version is None and args.vep_cache_version is not None) or (
            args.vep_version is not None and args.vep_cache_version is None):
        raise ArgumentError(
            action_vep_version,
            "Both \"--vep_version and --vep_cache_version\" should be specified together. Skip both arguments for auto-detection of these versions.")

    # Load the config_file from default location
    load_config()

    ingestion = EloadIngestion(args.eload)
    ingestion.upgrade_config_if_needed()
    ingestion.ingest(aggregation=args.aggregation,
                     instance_id=args.instance,
                     vep_version=args.vep_version,
                     vep_cache_version=args.vep_cache_version,
                     skip_annotation=args.skip_annotation,
                     db_name=args.db_name,
                     db_name_mapping=args.db_name_mapping,
                     tasks=args.tasks)
def main():
    """Accession and ingest submission data into EVA."""
    parser = ArgumentParser(description='Accession and ingest submission data into EVA')
    parser.add_argument('--eload', required=True, type=int, help='The ELOAD number for this submission.')
    parser.add_argument('--instance', required=False, type=int, choices=range(1, 13),
                        help='The instance id to use for accessioning. Only needed if running accessioning.')
    # TODO infer aggregation from vcf files, VEP version & cache version from species
    parser.add_argument('--aggregation', required=False, type=str.lower, choices=['basic', 'none'],
                        help='The aggregation type (case insensitive).')
    parser.add_argument('--vep_version', required=False, type=int,
                        help='VEP version to use for annotation. Only needed if running variant load.')
    parser.add_argument('--vep_cache_version', required=False, type=int,
                        help='VEP cache version to use for annotation. Only needed if running variant load.')
    parser.add_argument('--db_name', required=False, type=str,
                        help='Name of existing variant database in MongoDB. Only needed if adding a new database.')
    parser.add_argument('--tasks', required=False, type=str, nargs='+', default=EloadIngestion.all_tasks,
                        choices=EloadIngestion.all_tasks,
                        help='Task or set of tasks to perform during ingestion.')
    parser.add_argument('--debug', action='store_true', default=False,
                        help='Set the script to output logging information at debug level.')
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(logging.DEBUG)

    # Load the config_file from default location
    load_config()

    ingestion = EloadIngestion(args.eload)
    ingestion.ingest(aggregation=args.aggregation,
                     instance_id=args.instance,
                     vep_version=args.vep_version,
                     vep_cache_version=args.vep_cache_version,
                     db_name=args.db_name,
                     tasks=args.tasks)
import argparse import hashlib import traceback import pymongo from ebi_eva_common_pyutils.logger import logging_config from ebi_eva_common_pyutils.mongodb import MongoDatabase from pymongo import WriteConcern from pymongo.read_concern import ReadConcern logger = logging_config.get_logger(__name__) logging_config.add_stdout_handler() def get_SHA1(variant_rec): """Calculate the SHA1 digest from the seq, study, contig, start, ref, and alt attributes of the variant""" h = hashlib.sha1() keys = ['seq', 'study', 'contig', 'start', 'ref', 'alt'] h.update('_'.join([str(variant_rec[key]) for key in keys]).encode()) return h.hexdigest().upper() def get_contig_equivalents(): return { '1_random.1': 'AABR07046142.1', '1_random.15': 'KL567881.1', '1_random.2': 'KL567884.1', '1_random.21': 'KL567886.1', '1_random.23': 'KL567887.1', '1_random.27': 'KL567889.1', '1_random.4': 'KL567892.1',
def main():
    """Aggregate the list of species and assemblies to process for the release."""
    parser = ArgumentParser()
    parser.add_argument('--input', required=True,
                        help='Path to the file containing the taxonomies and assemblies')
    parser.add_argument('--properties_dir', required=True,
                        help='Path to the directory where the release1 application.properties are stored')
    parser.add_argument('--assembly_dirs', required=True, nargs='+',
                        help='Path to the directory containing pre-downloaded species assemblies')
    parser.add_argument('--download_dir', required=True,
                        help='Path to the temporary directory where additional species assemblies will be downloaded')
    parser.add_argument('--release2_reference_folder', required=True,
                        help='Path to the directory where selected fasta and report will be copied')
    parser.add_argument('--output_assemblies_tsv', required=True,
                        help='Path to the tsv file that will contain the list of assemblies to process')
    parser.add_argument('--output_taxonomy_tsv', required=True,
                        help='Path to the tsv file that will contain the list of species to process')
    parser.add_argument('--eva_accession_path',
                        help='path to the directory that contain eva-accession code and private json file.')
    parser.add_argument("--private_config_xml_file", help="ex: /path/to/eva-maven-settings.xml", required=True)
    parser.add_argument('--debug', help='Set login level to debug', action='store_true', default=False)
    args = parser.parse_args()

    log_cfg.add_stdout_handler()
    if args.debug:
        log_cfg.set_log_level(level=logging.DEBUG)

    # Allow the command line to override the module-level eva-accession location
    global eva_accession_path
    if args.eva_accession_path:
        eva_accession_path = args.eva_accession_path

    aggregate_list_of_species(args.input, args.properties_dir, args.assembly_dirs, args.download_dir,
                              args.release2_reference_folder, args.output_assemblies_tsv,
                              args.output_taxonomy_tsv, args.private_config_xml_file)