def parse_command_line():
    """Process command-line arguments for the aTRAM script.

    Builds the argument parser, parses the command line (arguments may also
    be read from a file given as ``@/path/to/args.txt``), and then validates
    and adjusts the parsed values using the blast and assembly helpers.

    Returns:
        dict: The parsed (and adjusted) arguments keyed by destination name.
    """
    description = """
        This is the aTRAM script. It takes a query sequence and a blast
        database built with the atram_preprocessor.py script and builds an
        assembly.

        If you specify more than one query sequence and/or more than one blast
        database then aTRAM will build one assembly for each query/blast
        DB pair.

        NOTE: You may use a text file to hold the command-line arguments
        like: @/path/to/args.txt. This is particularly useful when specifying
        multiple blast databases or multiple query sequences.
        """
    parser = argparse.ArgumentParser(
        fromfile_prefix_chars='@',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(description))

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(db.ATRAM_VERSION))

    group = parser.add_argument_group('required arguments')

    group.add_argument('-b', '--blast-db', '--sra', '--db', '--database',
                       required=True, metavar='DB', nargs='+',
                       help="""This needs to match the DB prefix you
                            entered for atram_preprocessor.py. You may repeat
                            this argument to run the --query sequence(s)
                            against multiple blast databases.""")

    group.add_argument('-q', '--query', '--target', '--probe',
                       required=False, nargs='+',
                       help="""The path to the fasta file with sequences of
                            interest. You may repeat this argument. If you do
                            then Each --query sequence file will be run
                            against every --blast-db.""")

    group.add_argument('-Q', '--query-split', '--target-split',
                       required=False, nargs='+',
                       help="""The path to the fasta file with multiple
                            sequences of interest. This will take every
                            sequence in the fasta file and treat it as if it
                            were its own --query argument. So every sequence
                            in --query-split will be run against every
                            --blast-db.""")

    group.add_argument('-o', '--output-prefix', required=True,
                       help="""This is the prefix of all of the output files.
                            So you can identify different blast output file
                            sets. You may include a directory as part of the
                            prefix. aTRAM will add suffixes to differentiate
                            output files.""")

    group.add_argument('-a', '--assembler', default='none',
                       choices=['abyss', 'trinity', 'velvet', 'spades',
                                'none'],
                       help="""Which assembler to use. Choosing "none" (the
                            default) will do a single blast run and stop
                            before any assembly.""")

    group.add_argument('-i', '--iterations', type=int, default=5, metavar='N',
                       help="""The number of pipeline iterations.
                            The default is "5".""")

    group.add_argument('-p', '--protein', action='store_true',
                       help="""Are the query sequences protein? aTRAM will
                            guess if you skip this argument.""")

    group.add_argument('--fraction', type=float, default=1.0,
                       help="""Use only the specified fraction of the aTRAM
                            database. The default is 1.0.""")

    # os.cpu_count() returns None when the count cannot be determined, so
    # fall back to a single CPU instead of failing on the comparison below.
    cpu_total = os.cpu_count() or 1
    # Leave 4 CPUs free on larger machines and never default to more than 10.
    cpus = min(10, cpu_total - 4 if cpu_total > 4 else 1)
    group.add_argument('--cpus', '--processes', '--max-processes',
                       type=int, default=cpus,
                       help="""Number of CPU processors to use. This will
                            also be used for the assemblers when possible.
                            We will use {} out of {} cpus.""".format(
                                cpus, cpu_total))

    group.add_argument('--log-file', help="""Log file (full path).""")

    group.add_argument('--path',
                       help="""If the assembler or blast you want to use is
                            not in your $PATH then use this to prepend
                            directories to your path.""")

    group.add_argument('-t', '--temp-dir', metavar='DIR',
                       help="""Place temporary files in this directory. All
                            files will be deleted after aTRAM completes. The
                            directory must exist.""")

    group.add_argument('--keep-temp-dir', action='store_true',
                       help="""This flag will keep the temporary files in
                            the --temp-dir around for debugging.""")

    group.add_argument('-T', '--timeout', metavar='SECONDS', default=300,
                       type=int,
                       help="""How many seconds to wait for an assembler
                            before stopping the run. To wait forever set
                            this to 0. The default is "300" (5 minutes).""")

    group = parser.add_argument_group(
        'optional values for blast-filtering contigs')

    group.add_argument('--no-filter', action='store_true',
                       help="""Do not filter the assembled contigs. This
                            will: set both the --bit-score and
                            --contig-length to 0""")

    group.add_argument('--bit-score', type=float, default=70.0,
                       metavar='SCORE',
                       help="""Remove contigs that have a value less than
                            this. The default is "70.0". This is turned off
                            by the --no-filter argument.""")

    group.add_argument('--contig-length', '--length', type=int, default=100,
                       help="""Remove blast hits that are shorter than this
                            length. The default is "100". This is turned off
                            by the --no-filter argument.""")

    # The blast and assembly modules register their own arguments.
    blast.command_line_args(parser)
    assembly.command_line_args(parser)

    args = vars(parser.parse_args())

    check_query_args(args)
    blast.check_args(args)

    # Set defaults and adjust arguments based on other arguments
    args['cov_cutoff'] = assembly.default_cov_cutoff(args['cov_cutoff'])
    args['blast_db'] = blast.touchup_blast_db_names(args['blast_db'])
    args['kmer'] = assembly.default_kmer(args['kmer'], args['assembler'])
    args['max_target_seqs'] = blast.default_max_target_seqs(
        args['max_target_seqs'], args['blast_db'], args['max_memory'])

    setup_blast_args(args)
    set_protein_arg(args)
    setup_path_arg(args)
    find_programs(args)
    util.temp_dir_exists(args['temp_dir'], args.get('debug_dir'))
    util.set_blast_batch_size(args['batch_size'])

    return args
def parse_command_line():
    """Process command-line arguments for the aTRAM preprocessor.

    Builds the argument parser, parses the command line, optionally prepends
    ``--path`` to the ``PATH`` environment variable, fills in the shard
    count via ``blast.default_shard_count`` and runs the blast/util set-up
    checks.

    Returns:
        dict: The parsed (and adjusted) arguments keyed by destination name.
    """
    description = """
        This script prepares data for use by the atram.py
        script. It takes fasta or fastq files of paired-end (or
        single-end) sequence reads and creates a set of atram
        databases.

        You need to prepare the sequence read archive files so that the
        header lines contain only a sequence ID with the optional
        paired-end suffix at the end of the header line. The separator
        for the optional trailing paired-end suffix may be a space,
        a slash "/", a dot ".", or an underscore "_".

        For example:

            >DBRHHJN1:427:H9YYAADXX:1:1101:10001:77019/1
            GATTAA...
            >DBRHHJN1:427:H9YYAADXX:1:1101:10001:77019/2
            ATAGCC...
            >DBRHHJN1:427:H9YYAADXX:1:1101:10006:63769/2
            CGAAAA...
        """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(description))

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(db.ATRAM_VERSION))

    parser.add_argument(
        '--end-1', '-1', metavar='FASTA_or_FASTQ', nargs='+',
        help="""Sequence read archive files that have only end 1 sequences.
            The sequence names do not need an end suffix, we will assume
            the suffix is always 1. The files are in fasta or fastq format.
            You may enter more than one file or you may use wildcards.""")

    parser.add_argument(
        '--end-2', '-2', metavar='FASTA_or_FASTQ', nargs='+',
        help="""Sequence read archive files that have only end 2 sequences.
            The sequence names do not need an end suffix, we will assume
            the suffix is always 2. The files are in fasta or fastq format.
            You may enter more than one file or you may use wildcards.""")

    parser.add_argument(
        '--mixed-ends', '-m', metavar='FASTA_or_FASTQ', nargs='+',
        help="""Sequence read archive files that have a mix of both end 1
            and end 2 sequences (or single ends). The files are in fasta
            or fastq format. You may enter more than one file or you may
            use wildcards.""")

    parser.add_argument(
        '--single-ends', '-0', metavar='FASTA_or_FASTQ', nargs='+',
        help="""Sequence read archive files that have only unpaired
            sequences. Any sequence suffix will be ignored. The files are
            in fasta or fastq format. You may enter more than one file or
            you may use wildcards.""")

    group = parser.add_argument_group('preprocessor arguments')

    # Default DB prefix is dated, e.g. "./atram_2020-01-31".
    blast_db = join('.', 'atram_' + date.today().isoformat())
    group.add_argument(
        '-b', '--blast-db', '--output', '--db', default=blast_db,
        metavar='DB',
        help="""This is the prefix of all of the blast database files. So
            you can identify different blast database sets. You may include
            a directory as part of the prefix. The default is "{}".
            """.format(blast_db))

    # os.cpu_count() returns None when the count cannot be determined, so
    # fall back to a single CPU instead of failing on the comparison below.
    cpu_total = os.cpu_count() or 1
    # Leave 4 CPUs free on larger machines and never default to more than 10.
    cpus = min(10, cpu_total - 4 if cpu_total > 4 else 1)
    group.add_argument(
        '--cpus', '--processes', '--max-processes', type=int, default=cpus,
        help="""Number of CPU threads to use. On this machine
            the default is ("{}")""".format(cpus))

    group.add_argument(
        '-t', '--temp-dir', metavar='DIR',
        help="""Place temporary files in this directory. All files will be
            deleted after aTRAM completes. The directory must exist.""")

    group.add_argument(
        '--keep-temp-dir', action='store_true',
        help="""This flag will keep the temporary files in the --temp-dir
            around for debugging.""")

    group.add_argument(
        '-l', '--log-file',
        help="""Log file (full path). The default is to use the DB and
            program name to come up with a name like
            "<DB>_atram_preprocessor.log".""")

    group.add_argument(
        '-s', '--shards', '--number', type=int, metavar='SHARDS',
        dest='shard_count',
        help="""Number of blast DB shards to create. The default is to have
            each shard contain roughly 250MB of sequence data.""")

    group.add_argument(
        '--path',
        help="""If blast or makeblastdb is not in your $PATH then use this
            to prepend directories to your path.""")

    group.add_argument(
        '--fasta', action='store_true',
        help="""Are these fasta files? If you do not specify either --fasta
            or --fastq then aTRAM will guess the file type by looking at
            the last character of the file name.""")

    group.add_argument(
        '--fastq', action='store_true',
        help="""Are these fastq files? If you do not specify either --fasta
            or --fastq then aTRAM will guess the file type by looking at
            the last character of the file name.""")

    group.add_argument(
        '--gzip', action='store_true', help="""Are these gzip files?""")

    group.add_argument(
        '--bzip', action='store_true', help="""Are these bzip files?""")

    args = vars(parser.parse_args())

    # Prepend to PATH environment variable if requested. os.pathsep keeps
    # this correct on platforms whose separator is not ':'.
    if args['path']:
        os.environ['PATH'] = os.pathsep.join(
            [args['path'], os.environ['PATH']])

    # Gather every input file so the default shard count can be derived
    # from all of them.
    all_files = []
    for ends in ['mixed_ends', 'end_1', 'end_2', 'single_ends']:
        if args.get(ends):
            all_files.extend(args[ends])

    args['shard_count'] = blast.default_shard_count(
        args['shard_count'], all_files)

    blast.make_blast_output_dir(args['blast_db'])

    blast.find_program('makeblastdb')

    util.temp_dir_exists(args['temp_dir'])

    return args
def parse_command_line():
    """Process command-line arguments for the aTRAM preprocessor.

    Builds the argument parser, parses the command line (arguments may also
    be read from a file given as ``@/path/to/args.txt``), optionally
    prepends ``--path`` to the ``PATH`` environment variable, expands
    wildcards in the input file arguments, fills in the shard count via
    ``blast.default_shard_count`` and runs the blast/util set-up checks.

    Returns:
        dict: The parsed (and adjusted) arguments keyed by destination name.
            Each given input-file argument is replaced by the sorted list of
            paths matched by its patterns.
    """
    description = """
        This script prepares data for use by the atram.py
        script. It takes fasta or fastq files of paired-end (or
        single-end) sequence reads and creates a set of atram
        databases.

        You need to prepare the sequence read archive files so that the
        header lines contain only a sequence ID with the optional
        paired-end suffix at the end of the header line. The separator
        for the optional trailing paired-end suffix may be a space,
        a slash "/", a dot ".", or an underscore "_".

        For example:

            >DBRHHJN1:427:H9YYAADXX:1:1101:10001:77019/1
            GATTAA...
            >DBRHHJN1:427:H9YYAADXX:1:1101:10001:77019/2
            ATAGCC...
            >DBRHHJN1:427:H9YYAADXX:1:1101:10006:63769/2
            CGAAAA...
        """
    parser = argparse.ArgumentParser(
        fromfile_prefix_chars='@',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(description))

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(db.ATRAM_VERSION))

    parser.add_argument(
        '--end-1', '-1', metavar='FASTA/Q', action='append',
        help="""Sequence read archive files that have only end 1 sequences.
            The sequence names do not need an end suffix, we will assume
            the suffix is always 1. The files are in fasta or fastq format.
            You may repeat this argument or use wildcards.""")

    parser.add_argument(
        '--end-2', '-2', metavar='FASTA/Q', action='append',
        help="""Sequence read archive files that have only end 2 sequences.
            The sequence names do not need an end suffix, we will assume
            the suffix is always 2. The files are in fasta or fastq format.
            You may repeat this argument or use wildcards.""")

    parser.add_argument(
        '--mixed-ends', '-m', metavar='FASTA/Q', action='append',
        help="""Sequence read archive files that have a mix of both end 1
            and end 2 sequences (or single ends). The files are in fasta
            or fastq format. You may repeat this argument or use
            wildcards.""")

    parser.add_argument(
        '--single-ends', '-0', metavar='FASTA/Q', action='append',
        help="""Sequence read archive files that have only unpaired
            sequences. Any sequence suffix will be ignored. The files are
            in fasta or fastq format. You may repeat this argument or use
            wildcards.""")

    group = parser.add_argument_group('preprocessor arguments')

    # Default DB prefix is dated, e.g. "./atram_2020-01-31".
    blast_db = join('.', 'atram_' + date.today().isoformat())
    group.add_argument(
        '-b', '--blast-db', '--db', default=blast_db, metavar='DB',
        help="""This is the prefix of all of the blast database files. So
            you can identify different blast database sets. You may include
            a directory as part of the prefix. The default is "{}".
            """.format(blast_db))

    # os.cpu_count() returns None when the count cannot be determined, so
    # fall back to a single CPU instead of failing on the comparison below.
    cpu_total = os.cpu_count() or 1
    # Leave 4 CPUs free on larger machines and never default to more than 10.
    cpus = min(10, cpu_total - 4 if cpu_total > 4 else 1)
    group.add_argument(
        '--cpus', '--processes', '--max-processes', type=int, default=cpus,
        help="""Number of CPU threads to use. On this machine
            the default is ("{}")""".format(cpus))

    group.add_argument(
        '-t', '--temp-dir', metavar='DIR',
        help="""Place temporary files in this directory. All files will be
            deleted after aTRAM completes. The directory must exist.""")

    group.add_argument(
        '--keep-temp-dir', action='store_true',
        help="""This flag will keep the temporary files in the --temp-dir
            around for debugging.""")

    group.add_argument(
        '-l', '--log-file',
        help="""Log file (full path). The default is to use the DB and
            program name to come up with a name like
            "<DB>_atram_preprocessor.log".""")

    group.add_argument(
        '--log-level', choices=['debug', 'info', 'error'], default='info',
        help="""Log messages of the given level (or above). 'debug' shows
            the most messages and 'error' shows the least.
            The default is 'info'""")

    group.add_argument(
        '-s', '--shards', '--number', type=int, metavar='SHARDS',
        dest='shard_count',
        help="""Number of blast DB shards to create. The default is to have
            each shard contain roughly 250MB of sequence data.""")

    group.add_argument(
        '--path',
        help="""If makeblastdb is not in your $PATH then use this to
            prepend directories to your path.""")

    group.add_argument(
        '--fasta', action='store_true',
        help="""Are these fasta files? If you do not specify either --fasta
            or --fastq then aTRAM will guess the file type by looking at
            the last character of the file name.""")

    group.add_argument(
        '--fastq', action='store_true',
        help="""Are these fastq files? If you do not specify either --fasta
            or --fastq then aTRAM will guess the file type by looking at
            the last character of the file name.""")

    group.add_argument('--gzip', action='store_true',
                       help="""Are these gzip files?""")

    group.add_argument('--bzip', action='store_true',
                       help="""Are these bzip files?""")

    args = vars(parser.parse_args())

    # Prepend to PATH environment variable if requested. os.pathsep keeps
    # this correct on platforms whose separator is not ':'.
    if args['path']:
        os.environ['PATH'] = os.pathsep.join(
            [args['path'], os.environ['PATH']])

    all_files = []
    for ends in ['mixed_ends', 'end_1', 'end_2', 'single_ends']:
        if args.get(ends):
            # Expand wildcards ourselves; a pattern that matches nothing
            # contributes no files.
            end_files = sorted(
                chain.from_iterable(glob(p) for p in args[ends]))
            args[ends] = end_files
            all_files.extend(end_files)

    args['shard_count'] = blast.default_shard_count(args, all_files)

    blast.make_blast_output_dir(args['blast_db'])

    blast.find_program('makeblastdb')

    util.temp_dir_exists(args['temp_dir'])

    return args
def parse_command_line():
    """Process command-line arguments for the aTRAM stitcher.

    Builds the argument parser, parses the command line (arguments may also
    be read from a file given as ``@/path/to/args.txt``), checks the
    temporary directory via ``util.temp_dir_exists``, and fills in defaults
    for the output prefix and the log file.

    Returns:
        argparse.Namespace: The parsed arguments, with ``output_prefix`` and
            ``log_file`` always set.
    """
    description = """
        This program will find and stitch together exons from targeted
        assemblies using amino acid targets and DNA assemblies.
        """

    parser = argparse.ArgumentParser(
        fromfile_prefix_chars='@',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(description))

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(db.ATRAM_VERSION))

    parser.add_argument('-T', '--taxa', metavar='TAXA', required=True,
                        help="""A text file of all of your taxon names.""")

    parser.add_argument(
        '-r', '--reference-genes', '--refs', metavar='FASTA', required=True,
        help="""Reference amino acid sequences in a FASTA file.""")

    parser.add_argument('-a', '--assemblies-dir', metavar='PATH',
                        required=True,
                        help="""The path to the DNA contigs.""")

    parser.add_argument(
        '-O', '--overlap', type=int, default=10,
        help="""Contigs must overlap by this many codons before it is
            considered a real overlap.""")

    parser.add_argument(
        '-t', '--temp-dir', metavar='DIR',
        help="""Place temporary files in this directory. All files will be
            deleted after aTRAM completes. The directory must exist.""")

    parser.add_argument(
        '--keep-temp-dir', action='store_true',
        help="""This flag will keep the temporary files in the --temp-dir
            around for debugging.""")

    parser.add_argument('-l', '--log-file',
                        help="""Log file (full path). The default is
                            "atram_stitcher_<date>.log".""")

    parser.add_argument(
        '--log-level', choices=['debug', 'info', 'error'], default='info',
        help="""Log messages of the given level (or above). 'debug' shows
            the most messages and 'error' shows the least.
            The default is 'info'""")

    parser.add_argument(
        '-i', '--iterations', type=int, default=2, metavar='N',
        help="""The number of times to run the main stitcher loop. This
            must be either 1 or 2, the default is 2.""")

    parser.add_argument(
        '-o', '--output-prefix',
        help="""This is the prefix of all of the output files. So you can
            identify different stitcher output file sets. You may include
            a directory as part of the prefix. The stitcher will add
            suffixes to differentiate output files.""")

    parser.add_argument(
        '-f', '--file-filter', default='*.fasta',
        help="""Use this to filter files in the assemblies directory. For
            example '*filtered*.fasta' will select all fasta files in the
            assemblies directory with the word filtered in them. The
            default is to select all fasta files in the assemblies
            directory '*.fasta'.""")

    parser.add_argument(
        '--reference-name', action='store_true',
        help="""Prepend the reference name to the final assembled gene
            name? if false the gene name in the reference file will just be
            the <taxon-name> if you select this then the assembled gene
            name will be <reference-name>.<taxon-name>.""")

    args = parser.parse_args()

    util.temp_dir_exists(args.temp_dir)

    if not args.output_prefix:
        args.output_prefix = join(
            '.', 'atram_stitcher_' + date.today().isoformat())

    # Only derive a log file name when the user did not supply one; an
    # explicit --log-file must never be overwritten.
    if not args.log_file:
        if args.output_prefix.endswith('/'):
            # The prefix is a directory, so put a dated log file inside it.
            args.log_file = join(
                args.output_prefix,
                'atram_stitcher_' + date.today().isoformat() + '.log')
        else:
            args.log_file = args.output_prefix + '.log'

    # A chained "1 > n > 2" can never be true; test membership instead so
    # that out-of-range values are actually reported.
    if args.iterations not in (1, 2):
        log.fatal('The iterations must be either 1 or 2.')

    return args
def parse_command_line():
    """Process command-line arguments for the aTRAM script.

    Builds the argument parser, parses the command line (arguments may also
    be read from a file given as ``@/path/to/args.txt``), and then validates
    and adjusts the parsed values using the blast and assembly helpers.

    Returns:
        dict: The parsed (and adjusted) arguments keyed by destination name.
    """
    description = """
        This is the aTRAM script. It takes a query sequence and a blast
        database built with the atram_preprocessor.py script and builds an
        assembly.

        If you specify more than one query sequence and/or more than one blast
        database then aTRAM will build one assembly for each query/blast
        DB pair.

        NOTE: You may use a text file to hold the command-line arguments
        like: @/path/to/args.txt. This is particularly useful when specifying
        multiple blast databases or multiple query sequences.
        """
    parser = argparse.ArgumentParser(
        fromfile_prefix_chars='@',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(description))

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(db.ATRAM_VERSION))

    group = parser.add_argument_group('required arguments')

    group.add_argument('-b', '--blast-db', '--sra', '--db', '--database',
                       required=True, metavar='DB', nargs='+',
                       help="""This needs to match the DB prefix you
                            entered for atram_preprocessor.py. You may repeat
                            this argument to run the --query sequence(s)
                            against multiple blast databases.""")

    group.add_argument('-q', '--query', '--target', '--probe',
                       required=False, nargs='+',
                       help="""The path to the fasta file with sequences of
                            interest. You may repeat this argument. If you do
                            then Each --query sequence file will be run
                            against every --blast-db.""")

    group.add_argument('-Q', '--query-split', '--target-split',
                       required=False, nargs='+',
                       help="""The path to the fasta file with multiple
                            sequences of interest. This will take every
                            sequence in the fasta file and treat it as if it
                            were its own --query argument. So every sequence
                            in --query-split will be run against every
                            --blast-db.""")

    group.add_argument('-o', '--output-prefix', required=True,
                       help="""This is the prefix of all of the output files.
                            So you can identify different blast output file
                            sets. You may include a directory as part of the
                            prefix. aTRAM will add suffixes to differentiate
                            output files.""")

    group.add_argument('-a', '--assembler', default='none',
                       choices=['abyss', 'trinity', 'velvet', 'spades',
                                'none'],
                       help="""Which assembler to use. Choosing "none" (the
                            default) will do a single blast run and stop
                            before any assembly.""")

    group.add_argument('-i', '--iterations', type=int, default=5, metavar='N',
                       help="""The number of pipeline iterations.
                            The default is "5".""")

    group.add_argument('-p', '--protein', action='store_true',
                       help="""Are the query sequences protein? aTRAM will
                            guess if you skip this argument.""")

    group.add_argument('--fraction', type=float, default=1.0,
                       help="""Use only the specified fraction of the aTRAM
                            database. The default is 1.0.""")

    # os.cpu_count() returns None when the count cannot be determined, so
    # fall back to a single CPU instead of failing on the comparison below.
    cpu_total = os.cpu_count() or 1
    # Leave 4 CPUs free on larger machines and never default to more than 10.
    cpus = min(10, cpu_total - 4 if cpu_total > 4 else 1)
    group.add_argument('--cpus', '--processes', '--max-processes',
                       type=int, default=cpus,
                       help="""Number of CPU processors to use. This will
                            also be used for the assemblers when possible.
                            We will use {} out of {} cpus.""".format(
                                cpus, cpu_total))

    group.add_argument('--log-file', help="""Log file (full path).""")

    group.add_argument('--path',
                       help="""If the assembler or blast you want to use is
                            not in your $PATH then use this to prepend
                            directories to your path.""")

    group.add_argument('-t', '--temp-dir', metavar='DIR',
                       help="""Place temporary files in this directory. All
                            files will be deleted after aTRAM completes. The
                            directory must exist.""")

    group.add_argument('--keep-temp-dir', action='store_true',
                       help="""This flag will keep the temporary files in
                            the --temp-dir around for debugging.""")

    group.add_argument('-T', '--timeout', metavar='SECONDS', default=300,
                       type=int,
                       help="""How many seconds to wait for an assembler
                            before stopping the run. To wait forever set
                            this to 0. The default is "300" (5 minutes).""")

    group = parser.add_argument_group(
        'optional values for blast-filtering contigs')

    group.add_argument('--no-filter', action='store_true',
                       help="""Do not filter the assembled contigs. This
                            will: set both the --bit-score and
                            --contig-length to 0""")

    group.add_argument('--bit-score', type=float, default=70.0,
                       metavar='SCORE',
                       help="""Remove contigs that have a value less than
                            this. The default is "70.0". This is turned off
                            by the --no-filter argument.""")

    group.add_argument('--contig-length', '--length', type=int, default=100,
                       help="""Remove blast hits that are shorter than this
                            length. The default is "100". This is turned off
                            by the --no-filter argument.""")

    # The blast and assembly modules register their own arguments.
    blast.command_line_args(parser)
    assembly.command_line_args(parser)

    args = vars(parser.parse_args())

    check_query_args(args)
    blast.check_args(args)

    # Set defaults and adjust arguments based on other arguments
    args['cov_cutoff'] = assembly.default_cov_cutoff(args['cov_cutoff'])
    args['blast_db'] = blast.touchup_blast_db_names(args['blast_db'])
    args['kmer'] = assembly.default_kmer(args['kmer'], args['assembler'])
    args['max_target_seqs'] = blast.default_max_target_seqs(
        args['max_target_seqs'], args['blast_db'], args['max_memory'])

    setup_blast_args(args)
    set_protein_arg(args)
    setup_path_arg(args)
    find_programs(args)
    util.temp_dir_exists(args['temp_dir'], args.get('debug_dir'))
    util.set_blast_batch_size(args['batch_size'])

    return args
def parse_command_line():
    """Parse the framer's command line and return the resulting namespace.

    The options are declared in a table and registered in order. After
    parsing, the temporary directory is checked with
    ``util.temp_dir_exists`` and a dated default is substituted when no
    output prefix was given.

    Returns:
        argparse.Namespace: The parsed arguments, with ``output_prefix``
            always set.
    """
    summary = """
        This program will align contigs to a reference sequence and put them
        into the correct reading frame.
        """

    # (option strings, add_argument keyword arguments), in display order.
    option_table = [
        (('--version',),
         dict(action='version',
              version='%(prog)s {}'.format(db.ATRAM_VERSION))),

        (('-T', '--taxa'),
         dict(metavar='TAXA', required=True,
              help="""A text file of all of your taxon names.""")),

        (('-r', '--reference-genes', '--refs'),
         dict(metavar='FASTA', required=True,
              help="""Reference amino acid sequences in a FASTA file.""")),

        (('-a', '--assemblies-dir'),
         dict(metavar='PATH', required=True,
              help="""The path to the DNA contigs.""")),

        (('-m', '--min-length'),
         dict(metavar='LENGTH', default=100, type=int,
              help="""Remove contigs that are less than this length.
                  (default %(default)s)""")),

        (('-t', '--temp-dir'),
         dict(metavar='DIR',
              help="""Place temporary files in this directory. All files
                  will be deleted after aTRAM completes. The directory must
                  exist.""")),

        (('--keep-temp-dir',),
         dict(action='store_true',
              help="""This flag will keep the temporary files in the
                  --temp-dir around for debugging.""")),

        (('-l', '--log-file'),
         dict(help="""Log file (full path).""")),

        (('--log-level',),
         dict(choices=['debug', 'info', 'error', 'fatal'], default='info',
              help="""Log messages of the given level (or above). 'debug'
                  shows the most messages and 'fatal' shows the least.
                  (default %(default)s)""")),

        (('-o', '--output-prefix'),
         dict(help="""This is the prefix of all of the output files. So you
                  can identify different stitcher output file sets. You may
                  include a directory as part of the prefix. The stitcher
                  will add suffixes to differentiate output files.""")),

        (('-f', '--file-filter'),
         dict(default='*.fasta',
              help="""Use this to filter files in the assemblies directory.
                  For example '*filtered*.fasta' will select all fasta files
                  in the assemblies directory with the word filtered in
                  them. The default is to select all fasta files in the
                  assemblies directory. (default %(default)s)""")),

        (('--reference-name',),
         dict(action='store_true',
              help="""Prepend the reference name to the final assembled gene
                  name? if false the gene name in the reference file with
                  just be the <taxon-name> if you select this then the
                  assembled gene name will be
                  <reference-name>.<taxon-name>.""")),

        (('--long-contig',),
         dict(type=float, default=0.7,
              help="""A long contig is considered to be this fraction [0-1]
                  of the longest contig assembled by exonerate.
                  (default %(default)s)""")),
    ]

    cli = argparse.ArgumentParser(
        description=textwrap.dedent(summary),
        fromfile_prefix_chars='@')

    for option_strings, settings in option_table:
        cli.add_argument(*option_strings, **settings)

    parsed = cli.parse_args()

    util.temp_dir_exists(parsed.temp_dir)

    # Fall back to a dated prefix in the current directory.
    if not parsed.output_prefix:
        default_name = 'atram_framer_' + date.today().isoformat()
        parsed.output_prefix = join('.', default_name)

    return parsed