예제 #1
0
파일: atram.py 프로젝트: adsweet/aTRAM
def parse_command_line():
    """Process command-line arguments for the aTRAM script.

    Builds the argument parser, parses the command line (arguments may also
    be read from a file given as ``@/path/to/args.txt``), runs the query and
    blast argument checks, and then fills in the values that are derived
    from other arguments.

    Returns:
        dict: The parsed arguments keyed by destination name, after the
        derived defaults (``cov_cutoff``, ``blast_db``, ``kmer`` and
        ``max_target_seqs``) have been filled in and the setup helpers
        have run.
    """
    description = """
        This is the aTRAM script. It takes a query sequence and a blast
        database built with the atram_preprocessor.py script and builds an
        assembly.

        If you specify more than one query sequence and/or more than one blast
        database then aTRAM will build one assembly for each query/blast
        DB pair.

        NOTE: You may use a text file to hold the command-line arguments
        like: @/path/to/args.txt. This is particularly useful when specifying
        multiple blast databases or multiple query sequences.
        """
    parser = argparse.ArgumentParser(
        fromfile_prefix_chars='@',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(description))

    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {}'.format(db.ATRAM_VERSION))

    group = parser.add_argument_group('required arguments')

    group.add_argument('-b',
                       '--blast-db',
                       '--sra',
                       '--db',
                       '--database',
                       required=True,
                       metavar='DB',
                       nargs='+',
                       help="""This needs to match the DB prefix you
                            entered for atram_preprocessor.py. You may repeat
                            this argument to run the --query sequence(s)
                            against multiple blast databases.""")

    group.add_argument('-q',
                       '--query',
                       '--target',
                       '--probe',
                       required=False,
                       nargs='+',
                       help="""The path to the fasta file with sequences of
                            interest. You may repeat this argument. If you do
                            then Each --query sequence  file will be run
                            against every --blast-db.""")

    group.add_argument('-Q',
                       '--query-split',
                       '--target-split',
                       required=False,
                       nargs='+',
                       help="""The path to the fasta file with multiple
                            sequences of interest. This will take every
                            sequence in the fasta file and treat it as if it
                            were its own --query argument. So every sequence in
                            --query-split will be run against every --blast-db.
                            """)

    group.add_argument('-o',
                       '--output-prefix',
                       required=True,
                       help="""This is the prefix of all of the output files.
                            So you can identify different blast output file
                            sets. You may include a directory as part of the
                            prefix. aTRAM will add suffixes to differentiate
                            ouput files.""")

    group.add_argument(
        '-a',
        '--assembler',
        default='none',
        choices=['abyss', 'trinity', 'velvet', 'spades', 'none'],
        help="""Which assembler to use. Choosing "none" (the
                            default) will do a single blast run and stop before
                            any assembly.""")

    group.add_argument('-i',
                       '--iterations',
                       type=int,
                       default=5,
                       metavar='N',
                       help="""The number of pipeline iterations.
                            The default is "5".""")

    group.add_argument('-p',
                       '--protein',
                       action='store_true',
                       help="""Are the query sequences protein?
                            aTRAM will guess if you skip this argument.""")

    group.add_argument('--fraction',
                       type=float,
                       default=1.0,
                       help="""Use only the specified fraction of the aTRAM
                            database. The default is 1.0.""")

    # os.cpu_count() returns None when the count cannot be determined, which
    # would make the comparison below raise a TypeError. Fall back to 1.
    cpu_count = os.cpu_count() or 1
    # Leave 4 CPUs free on bigger machines and never default to more than 10.
    cpus = min(10, cpu_count - 4 if cpu_count > 4 else 1)
    group.add_argument('--cpus',
                       '--processes',
                       '--max-processes',
                       type=int,
                       default=cpus,
                       help="""Number of CPU processors to use. This will also
                            be used for the assemblers when possible. We will
                            use {} out of {} cpus.""".format(
                           cpus, cpu_count))

    group.add_argument('--log-file', help="""Log file (full path)".""")

    group.add_argument('--path',
                       help="""If the assembler or blast you want to use is not
                            in your $PATH then use this to prepend
                            directories to your path.""")

    group.add_argument('-t',
                       '--temp-dir',
                       metavar='DIR',
                       help="""Place temporary files in this directory. All
                            files will be deleted after aTRAM completes. The
                            directory must exist.""")

    group.add_argument('--keep-temp-dir',
                       action='store_true',
                       help="""This flag will keep the temporary files in the
                            --temp-dir around for debugging.""")

    group.add_argument('-T',
                       '--timeout',
                       metavar='SECONDS',
                       default=300,
                       type=int,
                       help="""How many seconds to wait for an assembler before
                            stopping the run. To wait forever set this to 0.
                            The default is "300" (5 minutes).""")

    group = parser.add_argument_group(
        'optional values for blast-filtering contigs')

    group.add_argument('--no-filter',
                       action='store_true',
                       help="""Do not filter the assembled contigs. This will:
                            set both the --bit-score and --contig-length
                            to 0""")

    group.add_argument('--bit-score',
                       type=float,
                       default=70.0,
                       metavar='SCORE',
                       help="""Remove contigs that have a value less than this.
                            The default is "70.0". This is turned off by the
                            --no-filter argument.""")

    group.add_argument('--contig-length',
                       '--length',
                       type=int,
                       default=100,
                       help="""Remove blast hits that are shorter than this
                            length. The default is "100". This is turned
                            off by the --no-filter argument.""")

    # The blast and assembly modules register their own options on the parser.
    blast.command_line_args(parser)
    assembly.command_line_args(parser)

    args = vars(parser.parse_args())

    check_query_args(args)
    blast.check_args(args)

    # Set defaults and adjust arguments based on other arguments
    args['cov_cutoff'] = assembly.default_cov_cutoff(args['cov_cutoff'])
    args['blast_db'] = blast.touchup_blast_db_names(args['blast_db'])
    args['kmer'] = assembly.default_kmer(args['kmer'], args['assembler'])
    args['max_target_seqs'] = blast.default_max_target_seqs(
        args['max_target_seqs'], args['blast_db'], args['max_memory'])

    setup_blast_args(args)
    set_protein_arg(args)
    setup_path_arg(args)
    find_programs(args)
    util.temp_dir_exists(args['temp_dir'], args.get('debug_dir'))
    util.set_blast_batch_size(args['batch_size'])

    return args
예제 #2
0
def parse_command_line():
    """Process command-line arguments for the aTRAM preprocessor.

    Builds the argument parser, parses the command line, optionally prepends
    ``--path`` to the PATH environment variable, gathers every input file
    given through the end/mixed/single options and uses that list to set the
    shard count.

    Returns:
        dict: The parsed arguments keyed by destination name, with
        ``shard_count`` replaced by the value returned from
        ``blast.default_shard_count``.
    """
    description = """
        This script prepares data for use by the atram.py
        script. It takes fasta or fastq files of paired-end (or
        single-end) sequence reads and creates a set of atram
        databases.

        You need to prepare the sequence read archive files so that the
        header lines contain only a sequence ID with the optional
        paired-end suffix at the end of the header line. The separator
        for the optional trailing paired-end suffix may be a space,
        a slash "/", a dot ".", or an underscore "_".

        For example:

            >DBRHHJN1:427:H9YYAADXX:1:1101:10001:77019/1
            GATTAA...
            >DBRHHJN1:427:H9YYAADXX:1:1101:10001:77019/2
            ATAGCC...
            >DBRHHJN1:427:H9YYAADXX:1:1101:10006:63769/2
            CGAAAA...
        """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(description))

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(db.ATRAM_VERSION))

    parser.add_argument(
        '--end-1', '-1', metavar='FASTA_or_FASTQ', nargs='+',
        help="""Sequence read archive files that have only end 1 sequences. The
            sequence names do not need an end suffix, we will assume the suffix
            is always 1. The files are in fasta or fastq format. You may enter
            more than one file or you may use wildcards.
            """)

    parser.add_argument(
        '--end-2', '-2', metavar='FASTA_or_FASTQ', nargs='+',
        help="""Sequence read archive files that have only end 2 sequences.
            The sequence names do not need an end suffix, we will assume the
            suffix is always 2. The files are in fasta or fastq format. You may
            enter more than one file or you may use wildcards.
            """)

    parser.add_argument(
        '--mixed-ends', '-m', metavar='FASTA_or_FASTQ', nargs='+',
        help="""Sequence read archive files that have a mix of both end 1 and
            end 2 sequences (or single ends). The files are in fasta or fastq
            format. You may enter more than one file or you may use wildcards.
            """)

    parser.add_argument(
        '--single-ends', '-0', metavar='FASTA_or_FASTQ', nargs='+',
        help="""Sequence read archive files that have only unpaired sequences.
            Any sequence suffix will be ignored. The files are in fasta or
            fastq format. You may enter more than one file or you may use
            wildcards.
            """)

    group = parser.add_argument_group('preprocessor arguments')

    # Default DB prefix is dated so that separate runs do not collide.
    blast_db = join('.', 'atram_' + date.today().isoformat())
    group.add_argument(
        '-b', '--blast-db', '--output', '--db', default=blast_db, metavar='DB',
        help="""This is the prefix of all of the blast database files. So you
            can identify different blast database sets. You may include a
            directory as part of the prefix. The default is "{}".
            """.format(blast_db))

    # os.cpu_count() returns None when the count cannot be determined, which
    # would make the comparison below raise a TypeError. Fall back to 1.
    cpu_count = os.cpu_count() or 1
    # Leave 4 CPUs free on bigger machines and never default to more than 10.
    cpus = min(10, cpu_count - 4 if cpu_count > 4 else 1)
    group.add_argument(
        '--cpus', '--processes', '--max-processes', type=int, default=cpus,
        help="""Number of CPU threads to use. On this machine the default is
        ("{}")""".format(cpus))

    group.add_argument(
        '-t', '--temp-dir', metavar='DIR',
        help="""Place temporary files in this directory. All files will be
            deleted after aTRAM completes. The directory must exist.""")

    group.add_argument(
        '--keep-temp-dir', action='store_true',
        help="""This flag will keep the temporary files in the --temp-dir
        around for debugging.""")

    group.add_argument(
        '-l', '--log-file',
        help="""Log file (full path). The default is to use the DB and program
            name to come up with a name like "<DB>_atram_preprocessor.log".""")

    group.add_argument(
        '-s', '--shards', '--number', type=int, metavar='SHARDS',
        dest='shard_count',
        help="""Number of blast DB shards to create. The default is to have
            each shard contain roughly 250MB of sequence data.""")

    group.add_argument(
        '--path',
        help="""If blast or makeblastdb is not in your $PATH then use this to
            prepend directories to your path.""")

    group.add_argument(
        '--fasta', action='store_true',
        help="""Are these fasta files? If you do not specify either --fasta or
            --fastq then aTRAM will guess the file type by looking at the last
            character of the file name.""")

    group.add_argument(
        '--fastq', action='store_true',
        help="""Are these fastq files? If you do not specify either --fasta or
            --fastq then aTRAM will guess the file type by looking at the last
            character of the file name.""")

    group.add_argument(
        '--gzip', action='store_true',
        help="""Are these gzip files?""")

    # The help text used to be a copy of the --gzip text.
    group.add_argument(
        '--bzip', action='store_true',
        help="""Are these bzip files?""")

    args = vars(parser.parse_args())

    # Prepend to PATH environment variable if requested. os.pathsep keeps
    # this correct on platforms where the separator is not ":".
    if args['path']:
        os.environ['PATH'] = os.pathsep.join(
            [args['path'], os.environ['PATH']])

    # Collect every input file so the shard count can be based on all of them.
    all_files = []
    for ends in ['mixed_ends', 'end_1', 'end_2', 'single_ends']:
        if args.get(ends):
            all_files.extend(args[ends])

    args['shard_count'] = blast.default_shard_count(
        args['shard_count'], all_files)

    blast.make_blast_output_dir(args['blast_db'])

    blast.find_program('makeblastdb')

    util.temp_dir_exists(args['temp_dir'])

    return args
예제 #3
0
def parse_command_line():
    """Process command-line arguments for the aTRAM preprocessor.

    Builds the argument parser, parses the command line (arguments may also
    be read from a file given as ``@/path/to/args.txt``), optionally prepends
    ``--path`` to the PATH environment variable, expands the wildcards in
    every input-file option and uses the combined file list to set the
    shard count.

    Returns:
        dict: The parsed arguments keyed by destination name. Each of
        ``mixed_ends``, ``end_1``, ``end_2`` and ``single_ends`` that was
        given is replaced by the sorted list of files matching its patterns,
        and ``shard_count`` is replaced by the value returned from
        ``blast.default_shard_count``.
    """
    description = """
        This script prepares data for use by the atram.py
        script. It takes fasta or fastq files of paired-end (or
        single-end) sequence reads and creates a set of atram
        databases.

        You need to prepare the sequence read archive files so that the
        header lines contain only a sequence ID with the optional
        paired-end suffix at the end of the header line. The separator
        for the optional trailing paired-end suffix may be a space,
        a slash "/", a dot ".", or an underscore "_".

        For example:

            >DBRHHJN1:427:H9YYAADXX:1:1101:10001:77019/1
            GATTAA...
            >DBRHHJN1:427:H9YYAADXX:1:1101:10001:77019/2
            ATAGCC...
            >DBRHHJN1:427:H9YYAADXX:1:1101:10006:63769/2
            CGAAAA...
        """

    parser = argparse.ArgumentParser(
        fromfile_prefix_chars='@',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(description))

    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {}'.format(db.ATRAM_VERSION))

    parser.add_argument(
        '--end-1',
        '-1',
        metavar='FASTA/Q',
        action='append',
        help="""Sequence read archive files that have only end 1 sequences. The
            sequence names do not need an end suffix, we will assume the suffix
            is always 1. The files are in fasta or fastq format. You may
            repeat this argument or use wildcards.
            """)

    parser.add_argument(
        '--end-2',
        '-2',
        metavar='FASTA/Q',
        action='append',
        help="""Sequence read archive files that have only end 2 sequences.
            The sequence names do not need an end suffix, we will assume the
            suffix is always 2. The files are in fasta or fastq format. You
            may repeat this argument or use wildcards.
            """)

    parser.add_argument(
        '--mixed-ends',
        '-m',
        metavar='FASTA/Q',
        action='append',
        help="""Sequence read archive files that have a mix of both end 1 and
            end 2 sequences (or single ends). The files are in fasta or fastq
            format. You may repeat this argument or use wildcards.
            """)

    parser.add_argument(
        '--single-ends',
        '-0',
        metavar='FASTA/Q',
        action='append',
        help="""Sequence read archive files that have only unpaired sequences.
            Any sequence suffix will be ignored. The files are in fasta or
            fastq format. You may repeat this argument or use wildcards.
            """)

    group = parser.add_argument_group('preprocessor arguments')

    # Default DB prefix is dated so that separate runs do not collide.
    blast_db = join('.', 'atram_' + date.today().isoformat())
    group.add_argument(
        '-b',
        '--blast-db',
        '--db',
        default=blast_db,
        metavar='DB',
        help="""This is the prefix of all of the blast database files. So you
            can identify different blast database sets. You may include a
            directory as part of the prefix. The default is "{}".
            """.format(blast_db))

    # os.cpu_count() returns None when the count cannot be determined, which
    # would make the comparison below raise a TypeError. Fall back to 1.
    cpu_count = os.cpu_count() or 1
    # Leave 4 CPUs free on bigger machines and never default to more than 10.
    cpus = min(10, cpu_count - 4 if cpu_count > 4 else 1)
    group.add_argument(
        '--cpus',
        '--processes',
        '--max-processes',
        type=int,
        default=cpus,
        help="""Number of CPU threads to use. On this machine the default is
        ("{}")""".format(cpus))

    group.add_argument(
        '-t',
        '--temp-dir',
        metavar='DIR',
        help="""Place temporary files in this directory. All files will be
            deleted after aTRAM completes. The directory must exist.""")

    group.add_argument(
        '--keep-temp-dir',
        action='store_true',
        help="""This flag will keep the temporary files in the --temp-dir
        around for debugging.""")

    group.add_argument(
        '-l',
        '--log-file',
        help="""Log file (full path). The default is to use the DB and program
            name to come up with a name like "<DB>_atram_preprocessor.log".""")
    group.add_argument(
        '--log-level',
        choices=['debug', 'info', 'error'],
        default='info',
        help="""Log messages of the given level (or above). 'debug' shows the
            most messages and 'error' shows the least. The default is
            'info'""")

    group.add_argument(
        '-s',
        '--shards',
        '--number',
        type=int,
        metavar='SHARDS',
        dest='shard_count',
        help="""Number of blast DB shards to create. The default is to have
            each shard contain roughly 250MB of sequence data.""")

    group.add_argument(
        '--path',
        help="""If makeblastdb is not in your $PATH then use this to prepend
            directories to your path.""")

    group.add_argument(
        '--fasta',
        action='store_true',
        help="""Are these fasta files? If you do not specify either --fasta or
            --fastq then aTRAM will guess the file type by looking at the last
            character of the file name.""")

    group.add_argument(
        '--fastq',
        action='store_true',
        help="""Are these fastq files? If you do not specify either --fasta or
            --fastq then aTRAM will guess the file type by looking at the last
            character of the file name.""")

    group.add_argument('--gzip',
                       action='store_true',
                       help="""Are these gzip files?""")

    group.add_argument('--bzip',
                       action='store_true',
                       help="""Are these bzip files?""")

    args = vars(parser.parse_args())

    # Prepend to PATH environment variable if requested. os.pathsep keeps
    # this correct on platforms where the separator is not ":".
    if args['path']:
        os.environ['PATH'] = os.pathsep.join(
            [args['path'], os.environ['PATH']])

    # Expand wildcards ourselves so quoted patterns and patterns read from an
    # @args file are handled as well as shell-expanded ones.
    all_files = []
    for ends in ['mixed_ends', 'end_1', 'end_2', 'single_ends']:
        if args.get(ends):
            end_files = sorted(
                chain.from_iterable(glob(p) for p in args[ends]))
            args[ends] = end_files
            all_files.extend(end_files)

    args['shard_count'] = blast.default_shard_count(args, all_files)

    blast.make_blast_output_dir(args['blast_db'])

    blast.find_program('makeblastdb')

    util.temp_dir_exists(args['temp_dir'])

    return args
예제 #4
0
def parse_command_line():
    """Process command-line arguments for the aTRAM stitcher.

    Builds the argument parser, parses the command line (arguments may also
    be read from a file given as ``@/path/to/args.txt``), checks the
    temporary directory, fills in the default output prefix and log file,
    and validates the iteration count.

    Returns:
        argparse.Namespace: The parsed arguments. ``output_prefix`` defaults
        to ``./atram_stitcher_<date>`` and ``log_file`` defaults to a name
        derived from ``output_prefix`` when they are not given.
    """
    description = """
        This program will find and stitch together exons from targeted
        assemblies using amino acid targets and DNA assemblies.
        """

    parser = argparse.ArgumentParser(
        fromfile_prefix_chars='@',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(description))

    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {}'.format(db.ATRAM_VERSION))

    parser.add_argument('-T',
                        '--taxa',
                        metavar='TAXA',
                        required=True,
                        help="""A text file of all of your taxon names.""")

    parser.add_argument(
        '-r',
        '--reference-genes',
        '--refs',
        metavar='FASTA',
        required=True,
        help="""Reference amino acid sequences in a FASTA file.""")

    parser.add_argument('-a',
                        '--assemblies-dir',
                        metavar='PATH',
                        required=True,
                        help="""The path to the DNA contigs.""")

    parser.add_argument(
        '-O',
        '--overlap',
        type=int,
        default=10,
        help="""Contigs must overlap by this many codons before it is
            considered a real overlap.""")

    parser.add_argument(
        '-t',
        '--temp-dir',
        metavar='DIR',
        help="""Place temporary files in this directory. All files will be
            deleted after aTRAM completes. The directory must exist.""")

    parser.add_argument(
        '--keep-temp-dir',
        action='store_true',
        help="""This flag will keep the temporary files in the --temp-dir
        around for debugging.""")

    parser.add_argument('-l',
                        '--log-file',
                        help="""Log file (full path). The default is
            "atram_stitcher_<date>.log".""")
    parser.add_argument(
        '--log-level',
        choices=['debug', 'info', 'error'],
        default='info',
        help="""Log messages of the given level (or above). 'debug' shows the
            most messages and 'error' shows the least. The default is
            'info'""")

    parser.add_argument(
        '-i',
        '--iterations',
        type=int,
        default=2,
        metavar='N',
        help="""The number of times to run the main stitcher loop. This
            must be either 1 or 2, the default is 2.""")

    parser.add_argument(
        '-o',
        '--output-prefix',
        help="""This is the prefix of all of the output files. So you can
            identify different stitcher output file sets. You may include a
            directory as part of the prefix. The stitcher will add suffixes to
            differentiate output files.""")

    parser.add_argument(
        '-f',
        '--file-filter',
        default='*.fasta',
        help="""Use this to filter files in the assemblies directory. For
            example '*filtered*.fasta' will select all fasta files in the
            assemblies directory with the word filtered in them. The default
            is to select all fasta files in the assemblies directory
            '*.fasta'.""")

    parser.add_argument(
        '--reference-name',
        action='store_true',
        help="""Prepend the reference name to the final assembled gene name?
            if false the gene name in the reference file with just be the
            <taxon-name> if you select this then the assembled gene name
            will be <reference-name>.<taxon-name>.""")

    args = parser.parse_args()

    util.temp_dir_exists(args.temp_dir)

    if not args.output_prefix:
        args.output_prefix = join('.',
                                  'atram_stitcher_' + date.today().isoformat())

    # Only derive a log file name when the user did not supply one. The
    # previous if/else overwrote an explicit --log-file with "<prefix>.log".
    if not args.log_file:
        if args.output_prefix.endswith('/'):
            # The prefix is a directory, so put a dated log file inside it.
            args.log_file = join(
                args.output_prefix,
                'atram_stitcher_' + date.today().isoformat() + '.log')
        else:
            args.log_file = args.output_prefix + '.log'

    # The old test "1 > iterations > 2" is a chained comparison meaning
    # "iterations < 1 and iterations > 2", which can never be true.
    if args.iterations not in (1, 2):
        log.fatal('The iterations must be either 1 or 2.')

    return args
예제 #5
0
파일: atram.py 프로젝트: juliema/aTRAM
def parse_command_line():
    """Process command-line arguments for the aTRAM script.

    Builds the argument parser, parses the command line (arguments may also
    be read from a file given as ``@/path/to/args.txt``), runs the query and
    blast argument checks, and then fills in the values that are derived
    from other arguments.

    Returns:
        dict: The parsed arguments keyed by destination name, after the
        derived defaults (``cov_cutoff``, ``blast_db``, ``kmer`` and
        ``max_target_seqs``) have been filled in and the setup helpers
        have run.
    """
    description = """
        This is the aTRAM script. It takes a query sequence and a blast
        database built with the atram_preprocessor.py script and builds an
        assembly.

        If you specify more than one query sequence and/or more than one blast
        database then aTRAM will build one assembly for each query/blast
        DB pair.

        NOTE: You may use a text file to hold the command-line arguments
        like: @/path/to/args.txt. This is particularly useful when specifying
        multiple blast databases or multiple query sequences.
        """
    parser = argparse.ArgumentParser(
        fromfile_prefix_chars='@',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=textwrap.dedent(description))

    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(db.ATRAM_VERSION))

    group = parser.add_argument_group('required arguments')

    group.add_argument('-b', '--blast-db', '--sra', '--db', '--database',
                       required=True, metavar='DB', nargs='+',
                       help="""This needs to match the DB prefix you
                            entered for atram_preprocessor.py. You may repeat
                            this argument to run the --query sequence(s)
                            against multiple blast databases.""")

    group.add_argument('-q', '--query', '--target', '--probe',
                       required=False, nargs='+',
                       help="""The path to the fasta file with sequences of
                            interest. You may repeat this argument. If you do
                            then Each --query sequence  file will be run
                            against every --blast-db.""")

    group.add_argument('-Q', '--query-split', '--target-split',
                       required=False, nargs='+',
                       help="""The path to the fasta file with multiple
                            sequences of interest. This will take every
                            sequence in the fasta file and treat it as if it
                            were its own --query argument. So every sequence in
                            --query-split will be run against every --blast-db.
                            """)

    group.add_argument('-o', '--output-prefix', required=True,
                       help="""This is the prefix of all of the output files.
                            So you can identify different blast output file
                            sets. You may include a directory as part of the
                            prefix. aTRAM will add suffixes to differentiate
                            ouput files.""")

    group.add_argument('-a', '--assembler', default='none',
                       choices=['abyss', 'trinity', 'velvet', 'spades',
                                'none'],
                       help="""Which assembler to use. Choosing "none" (the
                            default) will do a single blast run and stop before
                            any assembly.""")

    group.add_argument('-i', '--iterations', type=int, default=5, metavar='N',
                       help="""The number of pipeline iterations.
                            The default is "5".""")

    group.add_argument('-p', '--protein', action='store_true',
                       help="""Are the query sequences protein?
                            aTRAM will guess if you skip this argument.""")

    group.add_argument('--fraction', type=float, default=1.0,
                       help="""Use only the specified fraction of the aTRAM
                            database. The default is 1.0.""")

    # os.cpu_count() returns None when the count cannot be determined, which
    # would make the comparison below raise a TypeError. Fall back to 1.
    cpu_count = os.cpu_count() or 1
    # Leave 4 CPUs free on bigger machines and never default to more than 10.
    cpus = min(10, cpu_count - 4 if cpu_count > 4 else 1)
    group.add_argument('--cpus', '--processes', '--max-processes',
                       type=int, default=cpus,
                       help="""Number of CPU processors to use. This will also
                            be used for the assemblers when possible. We will
                            use {} out of {} cpus.""".format(
                                cpus, cpu_count))

    group.add_argument('--log-file', help="""Log file (full path)".""")

    group.add_argument('--path',
                       help="""If the assembler or blast you want to use is not
                            in your $PATH then use this to prepend
                            directories to your path.""")

    group.add_argument('-t', '--temp-dir', metavar='DIR',
                       help="""Place temporary files in this directory. All
                            files will be deleted after aTRAM completes. The
                            directory must exist.""")

    group.add_argument('--keep-temp-dir', action='store_true',
                       help="""This flag will keep the temporary files in the
                            --temp-dir around for debugging.""")

    group.add_argument('-T', '--timeout', metavar='SECONDS', default=300,
                       type=int,
                       help="""How many seconds to wait for an assembler before
                            stopping the run. To wait forever set this to 0.
                            The default is "300" (5 minutes).""")

    group = parser.add_argument_group(
        'optional values for blast-filtering contigs')

    group.add_argument('--no-filter', action='store_true',
                       help="""Do not filter the assembled contigs. This will:
                            set both the --bit-score and --contig-length
                            to 0""")

    group.add_argument('--bit-score', type=float, default=70.0,
                       metavar='SCORE',
                       help="""Remove contigs that have a value less than this.
                            The default is "70.0". This is turned off by the
                            --no-filter argument.""")

    group.add_argument('--contig-length', '--length', type=int, default=100,
                       help="""Remove blast hits that are shorter than this
                            length. The default is "100". This is turned
                            off by the --no-filter argument.""")

    # The blast and assembly modules register their own options on the parser.
    blast.command_line_args(parser)
    assembly.command_line_args(parser)

    args = vars(parser.parse_args())

    check_query_args(args)
    blast.check_args(args)

    # Set defaults and adjust arguments based on other arguments
    args['cov_cutoff'] = assembly.default_cov_cutoff(args['cov_cutoff'])
    args['blast_db'] = blast.touchup_blast_db_names(args['blast_db'])
    args['kmer'] = assembly.default_kmer(args['kmer'], args['assembler'])
    args['max_target_seqs'] = blast.default_max_target_seqs(
        args['max_target_seqs'], args['blast_db'], args['max_memory'])

    setup_blast_args(args)
    set_protein_arg(args)
    setup_path_arg(args)
    find_programs(args)
    util.temp_dir_exists(args['temp_dir'], args.get('debug_dir'))
    util.set_blast_batch_size(args['batch_size'])

    return args
예제 #6
0
def parse_command_line():
    """Build the framer's argument parser and return the parsed arguments.

    Arguments may also be read from a file given as ``@/path/to/args.txt``.
    After parsing, the temporary directory is checked and a dated default
    output prefix is filled in when none was given.

    Returns:
        argparse.Namespace: The parsed arguments.
    """
    summary = """
        This program will align contigs to a reference sequence and put them
        into the correct reading frame.
        """

    cli = argparse.ArgumentParser(
        fromfile_prefix_chars='@',
        description=textwrap.dedent(summary),
    )
    add = cli.add_argument

    add('--version', action='version',
        version='%(prog)s {}'.format(db.ATRAM_VERSION))

    # Required inputs.
    add('-T', '--taxa', metavar='TAXA', required=True,
        help="""A text file of all of your taxon names.""")

    add('-r', '--reference-genes', '--refs', metavar='FASTA', required=True,
        help="""Reference amino acid sequences in a FASTA file.""")

    add('-a', '--assemblies-dir', metavar='PATH', required=True,
        help="""The path to the DNA contigs.""")

    # Optional tuning, temporary files and logging.
    add('-m', '--min-length', metavar='LENGTH', type=int, default=100,
        help="""Remove contigs that are less than this length.
            (default %(default)s)""")

    add('-t', '--temp-dir', metavar='DIR',
        help="""Place temporary files in this directory. All files will be
            deleted after aTRAM completes. The directory must exist.""")

    add('--keep-temp-dir', action='store_true',
        help="""This flag will keep the temporary files in the --temp-dir
        around for debugging.""")

    add('-l', '--log-file', help="""Log file (full path).""")

    add('--log-level', default='info',
        choices=['debug', 'info', 'error', 'fatal'],
        help="""Log messages of the given level (or above). 'debug' shows the
            most messages and 'fatal' shows the least.
            (default %(default)s)""")

    # Output naming and input selection.
    add('-o', '--output-prefix',
        help="""This is the prefix of all of the output files. So you can
            identify different stitcher output file sets. You may include a
            directory as part of the prefix. The stitcher will add suffixes to
            differentiate output files.""")

    add('-f', '--file-filter', default='*.fasta',
        help="""Use this to filter files in the assemblies directory. For
            example '*filtered*.fasta' will select all fasta files in the
            assemblies directory with the word filtered in them. The default
            is to select all fasta files in the assemblies directory.
           (default %(default)s)""")

    add('--reference-name', action='store_true',
        help="""Prepend the reference name to the final assembled gene name?
            if false the gene name in the reference file with just be the
            <taxon-name> if you select this then the assembled gene name
            will be <reference-name>.<taxon-name>.""")

    add('--long-contig', type=float, default=0.7,
        help="""A long contig is considered to be this fraction [0-1] of the
            longest contig assembled by exonerate. (default %(default)s)""")

    parsed = cli.parse_args()

    util.temp_dir_exists(parsed.temp_dir)

    # Fall back to a dated prefix in the current directory.
    if not parsed.output_prefix:
        dated_name = 'atram_framer_' + date.today().isoformat()
        parsed.output_prefix = join('.', dated_name)

    return parsed