def _default_min_gene_depth(mean_sample_coverage):
    """Return one third of the mean sample coverage, but never less than 15."""
    one_third = mean_sample_coverage / 3
    return one_third if one_third > 15 else 15


def main():
    """Command-line entry point of patho_typing.

    Parses the command line, maps the reads against the species reference
    (``mapping_reads``), optionally runs the trueCoverage checkpoint through
    ReMatCh, then runs ReMatCh on the typing targets and hands the per-gene
    results to ``typing.typing``.

    The process is terminated with ``sys.exit`` (after cleaning the output
    folder and reporting the run time) whenever a step reports failure, or
    when the trueCoverage requirements are not met and ``--noCheckPoint``
    was not given.
    """
    parser = argparse.ArgumentParser(prog='patho_typing.py',
                                     description='In silico pathogenic typing directly from raw Illumina reads',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--version', help='Version information', action='version',
                        version='{prog} v{version}'.format(prog=parser.prog, version=__version__))

    parser_required = parser.add_argument_group('Required options')
    parser_required.add_argument('-f', '--fastq', nargs='+', action=utils.required_length((1, 2), '--fastq'),
                                 type=argparse.FileType('r'), metavar=('/path/to/input/file.fq.gz'),
                                 help='Path to single OR paired-end fastq files. If two files are passed, they will be'
                                      ' assumed as being the paired fastq files',
                                 required=True)
    parser_required.add_argument('-s', '--species', nargs=2, type=str, metavar=('Yersinia', 'enterocolitica'),
                                 help='Species name', required=True)

    parser_optional_general = parser.add_argument_group('General facultative options')
    parser_optional_general.add_argument('-o', '--outdir', type=str, metavar='/path/to/output/directory/',
                                         help='Path to the directory where the information will be stored',
                                         required=False, default='.')
    parser_optional_general.add_argument('-j', '--threads', type=int, metavar='N', help='Number of threads to use',
                                         required=False, default=1)
    parser_optional_general.add_argument('--trueCoverage', action='store_true',
                                         help='Assess true coverage before continue typing')
    parser_optional_general.add_argument('--noCheckPoint', action='store_true',
                                         help='Ignore the true coverage checking point')
    parser_optional_general.add_argument('--minGeneCoverage', type=int, metavar='N',
                                         help='Minimum typing percentage of target reference gene sequence covered to'
                                              ' consider a gene to be present (value between [0, 100])',
                                         required=False)
    parser_optional_general.add_argument('--minGeneIdentity', type=int, metavar='N',
                                         help='Minimum typing percentage of identity of reference gene sequence covered'
                                              ' to consider a gene to be present (value between [0, 100]). One INDEL'
                                              ' will be considered as one difference',
                                         required=False)
    parser_optional_general.add_argument('--minGeneDepth', type=int, metavar='N',
                                         help='Minimum typing gene average coverage depth of present positions to'
                                              ' consider a gene to be present (default is 1/3 of average sample'
                                              ' coverage or 15x)',
                                         required=False)
    parser_optional_general.add_argument('--doNotRemoveConsensus', action='store_true',
                                         help='Do not remove ReMatCh consensus sequences')
    parser_optional_general.add_argument('--debug', action='store_true',
                                         help='DeBug Mode: do not remove temporary files')

    args = parser.parse_args()

    if args.minGeneCoverage is not None and (args.minGeneCoverage < 0 or args.minGeneCoverage > 100):
        parser.error('--minGeneCoverage should be a value between [0, 100]')
    if args.minGeneIdentity is not None and (args.minGeneIdentity < 0 or args.minGeneIdentity > 100):
        parser.error('--minGeneIdentity should be a value between [0, 100]')

    start_time = time.time()

    args.outdir = os.path.abspath(args.outdir)
    if not os.path.isdir(args.outdir):
        os.makedirs(args.outdir)

    # Start logger
    logfile, time_str = utils.start_logger(args.outdir)

    script_path = utils.general_information(logfile, __version__, args.outdir, time_str)
    print('\n')

    rematch = include_rematch_dependencies_path()

    # argparse.FileType opened every FASTQ file, but only the file names are
    # used from here on: keep the names and release the open handles instead
    # of leaking them for the whole run.
    fastq_handles = args.fastq
    args.fastq = [fastq.name for fastq in fastq_handles]
    for fastq in fastq_handles:
        if fastq is not sys.stdin:
            fastq.close()

    reference_file, trueCoverage_file, trueCoverage_sequences, trueCoverage_headers, trueCoverage_config, typing_file, \
        typing_sequences, typing_headers, typing_rules, typing_config = \
        set_reference(args.species, args.outdir, script_path, args.trueCoverage)
    # Keep the initial reference path: reference_file is re-pointed below,
    # while the cleanup calls always receive the original one.
    original_reference_file = str(reference_file)

    confirm_genes_fasta_rules(typing_headers, typing_rules)

    run_successfully, bam_file = mapping_reads(args.fastq, reference_file, args.threads, args.outdir, False, 1)
    if run_successfully:
        rematch_dir = os.path.join(args.outdir, 'rematch', '')
        if not os.path.isdir(rematch_dir):
            os.makedirs(rematch_dir)

        if args.trueCoverage:
            if trueCoverage_file is not None:
                trueCoverage_dir = os.path.join(rematch_dir, 'trueCoverage', '')
                if not os.path.isdir(trueCoverage_dir):
                    os.makedirs(trueCoverage_dir)

                print('\n')
                run_successfully, trueCoverage_bam = split_bam(bam_file, trueCoverage_headers, trueCoverage_dir,
                                                               args.threads)
                if run_successfully:
                    run_successfully = indexAlignment(trueCoverage_bam)
                    if run_successfully:
                        reference_file = os.path.join(trueCoverage_dir, 'reference.fasta')
                        write_sequeces(reference_file, trueCoverage_sequences)
                        index_fasta_samtools(reference_file, None, None, True)
                        config = parse_config(trueCoverage_config)
                        runtime, run_successfully, sample_data_general, data_by_gene = \
                            run_rematch.run_rematch(rematch, trueCoverage_dir, reference_file, trueCoverage_bam,
                                                    args.threads, config['length_extra_seq'],
                                                    config['minimum_depth_presence'], config['minimum_depth_call'],
                                                    config['minimum_depth_frequency_dominant_allele'],
                                                    config['minimum_gene_coverage'], config['minimum_gene_identity'],
                                                    args.debug, args.doNotRemoveConsensus)

                        if run_successfully and sample_data_general['mean_sample_coverage'] is not None and \
                                sample_data_general['number_absent_genes'] is not None and \
                                sample_data_general['number_genes_multiple_alleles'] is not None:
                            if args.minGeneDepth is None:
                                args.minGeneDepth = _default_min_gene_depth(
                                    sample_data_general['mean_sample_coverage'])

                            # Collect every trueCoverage requirement that is not met.
                            exit_info = []
                            if sample_data_general['mean_sample_coverage'] < config['minimum_read_coverage']:
                                exit_info.append('Sample coverage ({mean}) lower than the minimum'
                                                 ' required ({minimum})'
                                                 ''.format(mean=sample_data_general['mean_sample_coverage'],
                                                           minimum=config['minimum_read_coverage']))
                            if sample_data_general['number_absent_genes'] > config['maximum_number_absent_genes']:
                                exit_info.append('Number of absent genes ({number}) higher than the'
                                                 ' maximum allowed ({maximum})'
                                                 ''.format(number=sample_data_general['number_absent_genes'],
                                                           maximum=config['maximum_number_absent_genes']))
                            if sample_data_general['number_genes_multiple_alleles'] > \
                                    config['maximum_number_genes_multiple_alleles']:
                                exit_info.append('Number of genes with multiple alleles'
                                                 ' ({number}) higher than the maximum'
                                                 ' allowed ({maximum})'
                                                 ''.format(number=sample_data_general['number_genes_multiple_alleles'],
                                                           maximum=config['maximum_number_genes_multiple_alleles']))

                            if len(exit_info) > 0:
                                print('\n' + '\n'.join(exit_info) + '\n')
                                e = 'TrueCoverage requirements not fulfilled'
                                print('\n' + e + '\n')
                                # With --noCheckPoint the failure is only reported.
                                if not args.noCheckPoint:
                                    clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                                    _ = utils.runTime(start_time)
                                    sys.exit(e)
                        else:
                            e = 'TrueCoverage module did not run successfully'
                            print('\n' + e + '\n')
                            if not args.noCheckPoint:
                                clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                                _ = utils.runTime(start_time)
                                sys.exit(e)

                        # Restrict the alignment to the typing targets and
                        # point ReMatCh at the typing directory.
                        print('\n')
                        typing_dir = os.path.join(rematch_dir, 'typing', '')
                        if not os.path.isdir(typing_dir):
                            os.makedirs(typing_dir)
                        run_successfully, bam_file = split_bam(bam_file, typing_headers, typing_dir, args.threads)
                        if run_successfully:
                            run_successfully = indexAlignment(bam_file)
                            if run_successfully:
                                reference_file = os.path.join(typing_dir, 'reference.fasta')
                                write_sequeces(reference_file, typing_sequences)
                                index_fasta_samtools(reference_file, None, None, True)
                                rematch_dir = str(typing_dir)
                if not run_successfully:
                    # NOTE(review): this exits only when --noCheckPoint is set;
                    # without it the failure is reported by the generic
                    # 'Something did not run successfully' branch below.
                    # Confirm the condition is not meant to be negated.
                    if args.noCheckPoint:
                        clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                        _ = utils.runTime(start_time)
                        sys.exit('Something in the required TrueCoverage analysis went wrong')
            else:
                print('\n'
                      'WARNING: it was not found trueCoverage target files. trueCoverage will not run.'
                      '\n')

        if run_successfully:
            config = parse_config(typing_config)
            # Command-line thresholds take precedence over the typing config.
            if args.minGeneCoverage is not None:
                config['minimum_gene_coverage'] = args.minGeneCoverage
            if args.minGeneIdentity is not None:
                config['minimum_gene_identity'] = args.minGeneIdentity

            runtime, run_successfully, sample_data_general, data_by_gene = \
                run_rematch.run_rematch(rematch, rematch_dir, reference_file, bam_file, args.threads,
                                        config['length_extra_seq'], config['minimum_depth_presence'],
                                        config['minimum_depth_call'],
                                        config['minimum_depth_frequency_dominant_allele'],
                                        config['minimum_gene_coverage'], config['minimum_gene_identity'],
                                        args.debug, args.doNotRemoveConsensus)
            if run_successfully and data_by_gene is not None:
                if args.minGeneDepth is None:
                    args.minGeneDepth = _default_min_gene_depth(sample_data_general['mean_sample_coverage'])

                _, _, _ = typing.typing(data_by_gene, typing_rules, config['minimum_gene_coverage'],
                                        config['minimum_gene_identity'], args.minGeneDepth, args.outdir)
            else:
                clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
                _ = utils.runTime(start_time)
                sys.exit('ReMatCh run for pathotyping did not run successfully')
        else:
            clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)
            _ = utils.runTime(start_time)
            sys.exit('Something did not run successfully')

    clean_pathotyping_folder(args.outdir, original_reference_file, args.debug)

    print('\n')
    _ = utils.runTime(start_time)
def _replace_symlink(target, link_path):
    """Point ``link_path`` at ``target``, removing a symlink already there."""
    if os.path.islink(link_path):
        os.unlink(link_path)
    os.symlink(target, link_path)


def reads_subcommand(args):
    """Run the reads subcommand: prepare the references and run ReMatCh.

    Parameters
    ----------
    args : argparse.Namespace
        Parsed options. Read here: ``minGeneCoverage``, ``minGeneIdentity``,
        ``fastq`` (objects exposing ``.name``), ``outdir``, ``reference``,
        ``org``, ``resume``, ``threads``, ``extraSeq``, ``minCovPresence``,
        ``minCovCall``, ``minFrequencyDominantAllele``, ``debug``,
        ``doNotRemoveConsensus`` and ``bowtieAlgo``. ``fastq`` and
        ``reference`` are overwritten with the paths actually used; when
        ``reference`` is None, ``extraSeq``, ``minCovPresence``,
        ``minCovCall``, ``minGeneCoverage`` and ``typeSeparator`` are
        overwritten from the configuration returned for ``org``.

    Returns
    -------
    tuple
        ``(folders_2_remove, references_results, args.reference,
        references_headers)``.

    Raises
    ------
    SystemExit
        On out-of-range ``minGeneCoverage``/``minGeneIdentity`` (through
        ``ArgumentParser.error``), when a reference has neither a fasta file,
        index files nor a pickle file, or when the fasta file cannot be
        rebuilt from the Bowtie2 index.
    """
    msg = []
    # if args.reference is None and args.org is None:
    #     argparse.ArgumentParser.error('--reference or --org must be provided')
    if args.minGeneCoverage < 0 or args.minGeneCoverage > 100:
        msg.append('--minGeneCoverage should be a value between [0, 100]')
    if args.minGeneIdentity < 0 or args.minGeneIdentity > 100:
        msg.append('--minGeneIdentity should be a value between [0, 100]')

    if len(msg) > 0:
        # Report under the reads subcommand (it was labelled 'assembly').
        argparse.ArgumentParser(prog='reads subcommand options').error(
            '\n'.join(msg))

    rematch_script = include_rematch_dependencies_path()

    utils.required_programs({'rematch.py': ['--version', '>=', '4.0.1']})

    args.fastq = [os.path.abspath(fastq.name) for fastq in args.fastq]

    folders_2_remove = []

    references_dir = os.path.join(args.outdir, 'references', '')
    if not os.path.isdir(references_dir):
        os.makedirs(references_dir)
    folders_2_remove.append(references_dir)

    references_headers = {}
    clean_run_rematch = True
    if args.reference is not None:
        clean_run_rematch = False

        args.reference = [
            os.path.abspath(reference) for reference in args.reference
        ]

        # First pass: make sure every reference has something usable before
        # touching the output folder.
        reference_files = {}
        for reference in args.reference:
            fasta_file, index_files, pickle_file = check_reference_exist(
                reference)
            if not fasta_file and len(index_files) == 0 and not pickle_file:
                sys.exit(
                    'Missing reference fasta file, Bowtie2 index or pickle file for {}'
                    .format(reference))
            reference_files[reference] = fasta_file, index_files, pickle_file

        references_to_use = []
        for reference in args.reference:
            reference_files_found = reference_files[reference]
            reference_file = reference
            # Create symlink to pickle file
            if reference_files_found[2]:
                symlink = os.path.join(
                    references_dir, os.path.basename(reference_file + '.pkl'))
                _replace_symlink(reference_file + '.pkl', symlink)
                # NOTE(review): this loads a pickle that sits next to a
                # user-supplied path (presumably via the pickle module);
                # only use references from trusted sources.
                header_gene_list, _ = utils.extractVariableFromPickle(
                    reference_file + '.pkl')
            else:
                new_reference_file, header_gene_list, seq_reference_dict = \
                    run_rematch.clean_headers_reference_file(reference_file=reference_file,
                                                             outdir=references_dir)
                if new_reference_file != reference_file:
                    utils.Bcolors_print(
                        'WARNING: Sequences headers were renamed for {}'.
                        format(reference), 'WARNING')
                    reference_file = new_reference_file
                pickle_file = os.path.join(
                    references_dir, os.path.basename(reference_file) + '.pkl')
                utils.saveVariableToPickle(
                    (header_gene_list, seq_reference_dict), pickle_file)

            if reference_file == reference:
                # Create symlinks to reference file
                if reference_files_found[0]:
                    symlink = os.path.join(references_dir,
                                           os.path.basename(reference_file))
                    _replace_symlink(reference_file, symlink)
                    reference_file = symlink
                else:
                    # Create reference file if does not exist
                    # From index files
                    if len(reference_files_found[1]) > 0:
                        run_successfully, reference_file = \
                            run_rematch.run_bowtie_inspect(index_without_sufix=reference_file,
                                                           outdir=references_dir)
                        if not run_successfully:
                            sys.exit(
                                'Something went wrong while creating the reference fasta file from Bowtie2'
                                ' index {}'.format(reference_file))
                    # From pickle file
                    elif reference_files_found[2]:
                        _, seq_reference_dict = utils.extractVariableFromPickle(
                            reference_file + '.pkl')
                        reference_file = os.path.join(
                            references_dir, os.path.basename(reference_file))
                        write_seq_from_sequence_dict(seq_reference_dict,
                                                     reference_file)

                # Create symlinks to index files
                if len(reference_files_found[1]) > 0:
                    for index_file in reference_files_found[1]:
                        symlink = os.path.join(references_dir,
                                               os.path.basename(index_file))
                        _replace_symlink(index_file, symlink)
            references_to_use.append(reference_file)
            references_headers[reference_file] = dict(header_gene_list)
        args.reference = references_to_use
    else:
        args.reference, config = get_fasta_config(args.org)
        references_to_use = []
        for reference in args.reference:
            new_reference_file, header_gene_list, seq_reference_dict = \
                run_rematch.clean_headers_reference_file(reference_file=reference,
                                                         outdir=references_dir)
            if new_reference_file != reference:
                utils.Bcolors_print(
                    'WARNING: Sequences headers were renamed for {}'.format(
                        reference), 'WARNING')
                references_to_use.append(new_reference_file)
                references_headers[new_reference_file] = dict(header_gene_list)
                pickle_file = os.path.join(
                    references_dir,
                    os.path.basename(new_reference_file) + '.pkl')
            else:
                symlink = os.path.join(references_dir,
                                       os.path.basename(reference))
                _replace_symlink(reference, symlink)
                references_to_use.append(symlink)
                references_headers[symlink] = dict(header_gene_list)
                pickle_file = os.path.join(references_dir,
                                           os.path.basename(symlink) + '.pkl')
            utils.saveVariableToPickle((header_gene_list, seq_reference_dict),
                                       pickle_file)
        args.reference = references_to_use

        # The organism configuration replaces the command-line settings.
        config = parse_config(config)
        args.extraSeq = config['length_extra_seq']
        args.minCovPresence = config['minimum_depth_presence']
        args.minCovCall = config['minimum_depth_call']
        args.minGeneCoverage = config['minimum_gene_coverage']
        args.typeSeparator = '_'

        print('\n'
              'Settings that will be used:\n'
              ' reference: {reference}\n'
              ' extraSeq: {extraSeq}\n'
              ' minCovPresence: {minCovPresence}\n'
              ' minCovCall: {minCovCall}\n'
              ' minGeneCoverage: {minGeneCoverage}\n'
              ' Type separator character: {typeSeparator}'
              '\n'.format(reference=args.reference,
                          extraSeq=args.extraSeq,
                          minCovPresence=args.minCovPresence,
                          minCovCall=args.minCovCall,
                          minGeneCoverage=args.minGeneCoverage,
                          typeSeparator=args.typeSeparator))

    pickles_folder = os.path.join(args.outdir, 'pickles', '')

    # Run ReMatCh
    pickle_file = os.path.join(pickles_folder, 'rematch_module.pkl')

    if args.resume and os.path.isfile(pickle_file):
        # Reuse the results saved by a previous run instead of re-running.
        print('ReMatCh module already run')
        references_results, module_dir = utils.extractVariableFromPickle(
            pickle_file)
        folders_2_remove.append(module_dir)
    else:
        _, references_results, module_dir = run_rematch.run_rematch(
            rematch_script,
            args.outdir,
            args.reference,
            args.fastq,
            args.threads,
            args.extraSeq,
            args.minCovPresence,
            args.minCovCall,
            args.minFrequencyDominantAllele,
            args.minGeneCoverage,
            args.minGeneIdentity,
            args.debug,
            args.doNotRemoveConsensus,
            args.bowtieAlgo,
            clean_run_rematch=clean_run_rematch)
        folders_2_remove.append(module_dir)
        utils.saveVariableToPickle([references_results, module_dir],
                                   pickle_file)

    return folders_2_remove, references_results, args.reference, references_headers