Example #1
    def test_split_fasta_diff_num_seqs_per_file_alt(self):
        """split_fasta funcs always catches all seqs
        """
        # start with 59 seqs (b/c it's prime, so should make more
        # confusing splits)
        in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
        infile = in_seqs.toFasta().split('\n')

        # test seqs_per_file from 1 to 1000
        for i in range(1, 1000):
            fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                          prefix='split_fasta_tests',
                                          suffix='')
            close(fd)

            actual = split_fasta(infile, i, filename_prefix)

            actual_seqs = []
            for fp in actual:
                actual_seqs += list(open(fp))
            # remove the files now, so if the test fails they still get
            # cleaned up
            remove_files(actual)

            # building seq collections from infile and the split files result in
            # equivalent seq collections
            self.assertEqual(
                LoadSeqs(data=infile, aligned=False),
                LoadSeqs(data=actual_seqs, aligned=False))
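
These tests only pin down split_fasta's external contract: given an iterable of FASTA lines, it writes at most seqs_per_file sequences to each of a series of files named '<prefix>.<i>.fasta' and returns those paths in order. A minimal sketch that satisfies that contract (illustrative only, not the project's actual implementation; the name split_fasta_sketch is made up) could look like:

def split_fasta_sketch(fasta_lines, seqs_per_file, filename_prefix):
    # Illustrative re-implementation of the behavior the tests above expect.
    # Assumes the input starts with a '>' header line, as FASTA always does.
    out_paths = []
    current_f = None
    seq_count = 0
    for line in fasta_lines:
        if line.startswith('>'):
            # start a new output file every seqs_per_file sequences
            if seq_count % seqs_per_file == 0:
                if current_f is not None:
                    current_f.close()
                out_path = '%s.%d.fasta' % (filename_prefix,
                                            seq_count // seqs_per_file)
                out_paths.append(out_path)
                current_f = open(out_path, 'w')
            seq_count += 1
        current_f.write(line.rstrip('\n') + '\n')
    if current_f is not None:
        current_f.close()
    return out_paths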
Example #2
    def test_split_fasta_equal_num_seqs_per_file(self):
        """split_fasta funcs as expected when equal num seqs go to each file
        """
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        infile = [
            '>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA', '>seq3',
            'CCTT--AA'
        ]

        actual = split_fasta(infile, 1, filename_prefix)
        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

        self.assertEqual(actual, expected)
        self.assertEqual(
            SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
            SequenceCollection.from_fasta_records(parse_fasta(actual_seqs),
                                                  DNA))
Example #3
    def test_split_fasta_diff_num_seqs_per_file(self):
        """split_fasta funcs as expected when diff num seqs go to each file
        """
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
                  '>seq3', 'CCTT--AA']

        actual = split_fasta(infile, 2, filename_prefix)

        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(
            LoadSeqs(data=infile, aligned=False),
            LoadSeqs(data=actual_seqs, aligned=False))
Example #4
    def test_split_fasta_diff_num_seqs_per_file(self):
        """split_fasta funcs as expected when diff num seqs go to each file
        """
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        infile = [
            '>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA', '>seq3',
            'CCTT--AA'
        ]

        actual = split_fasta(infile, 2, filename_prefix)

        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(LoadSeqs(data=infile, aligned=False),
                         LoadSeqs(data=actual_seqs, aligned=False))
Example #5
    def test_split_fasta_diff_num_seqs_per_file_alt(self):
        """split_fasta funcs always catches all seqs
        """
        # start with 59 seqs (b/c it's prime, so should make more
        # confusing splits)
        in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
        infile = in_seqs.toFasta().split('\n')

        # test seqs_per_file from 1 to 1000
        for i in range(1, 1000):
            fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                          prefix='split_fasta_tests',
                                          suffix='')
            close(fd)

            actual = split_fasta(infile, i, filename_prefix)

            actual_seqs = []
            for fp in actual:
                actual_seqs += list(open(fp))
            # remove the files now, so if the test fails they still get
            # cleaned up
            remove_files(actual)

            # building seq collections from infile and the split files result in
            # equivalent seq collections
            self.assertEqual(LoadSeqs(data=infile, aligned=False),
                             LoadSeqs(data=actual_seqs, aligned=False))
Example #6
    def test_split_fasta_equal_num_seqs_per_file(self):
        """split_fasta funcs as expected when equal num seqs go to each file
        """
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                      prefix='split_fasta_tests',
                                      suffix='')
        close(fd)
        infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
                  '>seq3', 'CCTT--AA']

        actual = split_fasta(infile, 1, filename_prefix)
        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

        self.assertEqual(actual, expected)
        self.assertEqual(
            LoadSeqs(data=infile, aligned=False),
            LoadSeqs(data=actual_seqs, aligned=False))
Example #7
    def test_split_fasta_equal_num_seqs_per_file(self):
        """split_fasta funcs as expected when equal num seqs go to each file
        """
        fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                                     prefix='split_fasta_tests',
                                     suffix='')
        close(fd)
        infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
                  '>seq3', 'CCTT--AA']

        actual = split_fasta(infile, 1, filename_prefix)
        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

        self.assertEqual(actual, expected)
        self.assertEqual(
            SequenceCollection.from_fasta_records(parse_fasta(infile), DNA),
            SequenceCollection.from_fasta_records(parse_fasta(actual_seqs), DNA))
Example #8
    def test_split_fasta_equal_num_seqs_per_file(self):
        """split_fasta functions as expected when equal num seqs go to each file
        """
        filename_prefix = get_tmp_filename(tmp_dir=get_qiime_temp_dir(),
                                           prefix='split_fasta_tests',
                                           suffix='',
                                           result_constructor=str)
        infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
                  '>seq3', 'CCTT--AA']

        actual = split_fasta(infile, 1, filename_prefix)
        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

        self.assertEqual(actual, expected)
        self.assertEqual(
            LoadSeqs(data=infile, aligned=False),
            LoadSeqs(data=actual_seqs, aligned=False))
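
Example #8 is an older variant of Examples #6 and #7 that still builds the filename prefix with the get_tmp_filename helper. Under the pattern used by the other examples on this page, the equivalent setup would obtain the prefix via the standard-library mkstemp and close the descriptor immediately (a sketch, assuming the usual tempfile/os imports):

from os import close
from tempfile import mkstemp

fd, filename_prefix = mkstemp(dir=get_qiime_temp_dir(),
                              prefix='split_fasta_tests',
                              suffix='')
close(fd)  # only the path is needed as a filename prefix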
Example #9
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method
    similarity = opts.similarity
    sortmerna_coverage = opts.sortmerna_coverage
    sortmerna_db = opts.sortmerna_db

    if assignment_method == 'sortmerna':
        # similarity must be between (0,1]
        if not 0 < similarity <= 1:
            option_parser.error('--similarity must be between (0,1].')
        # coverage must be between (0,1]
        if not 0 < sortmerna_coverage <= 1:
            option_parser.error('--sortmerna_coverage must be '
                                'between (0,1].')
        # check ID to taxonomy filepath
        if not opts.id_to_taxonomy_fp:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with sortmerna.')
        # check reference sequences filepath
        if not opts.reference_seqs_fp:
            option_parser.error(
                'sortmerna always requires --reference_seqs_fp '
                '(with or without sortmerna_db)')
        # check indexed database, if provided (not mandatory)
        elif sortmerna_db:
            if isfile(sortmerna_db + '.stats') is False:
                option_parser.error('%s does not exist, make sure you have '
                                    'indexed the database using indexdb_rna' %
                                    (sortmerna_db + '.stats'))

    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error('Either a blast db (via -b) or a collection '
                                'of reference sequences (via -r) must be '
                                'passed to assign taxonomy using blast.')

    if assignment_method == 'rdp':
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error('A filepath for an id to taxonomy map must be '
                                'specified (via -t) along with the reference '
                                'sequences fp to train the Rdp Classifier.')
        else:
            pass

    if assignment_method == 'uclust':
        if opts.id_to_taxonomy_fp is None:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with uclust.')
        if opts.reference_seqs_fp is None:
            option_parser.error('--reference_seqs_fp is required when '
                                'assigning with uclust.')

    if assignment_method == 'rtax':
        if opts.id_to_taxonomy_fp is None or opts.reference_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')
        if opts.read_1_seqs_fp is None:  # or opts.read_2_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires the FASTA files '
                'produced by split_illumina_fastq.py for both reads, '
                'in addition to the cluster representatives.  Pass '
                'these via --read_1_seqs_fp and --read_2_seqs_fp.')

    if assignment_method == 'mothur':
        if None in [opts.id_to_taxonomy_fp, opts.reference_seqs_fp]:
            option_parser.error(
                'Mothur classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')

    taxon_assigner_constructor =\
        assignment_method_constructors[assignment_method]
    input_sequences_filepath = opts.input_fasta_fp

    try:
        id_to_taxonomy_fp = opts.id_to_taxonomy_fp
        params = {'id_to_taxonomy_filepath': id_to_taxonomy_fp}
    except IndexError:
        params = {}

    # Build the output filenames
    output_dir = opts.output_dir or assignment_method + '_assigned_taxonomy'
    try:
        mkdir(output_dir)
    except OSError:
        # output_dir already exists
        pass

    fpath, ext = splitext(input_sequences_filepath)
    input_dir, fname = split(fpath)
    result_path = output_dir + '/' + fname + '_tax_assignments.txt'
    log_path = output_dir + '/' + fname + '_tax_assignments.log'

    if assignment_method == 'blast':
        # one of these must have a value, otherwise we'd have
        # an optparse error
        if opts.blast_db:
            params['blast_db'] = opts.blast_db
        else:
            params['reference_seqs_filepath'] = opts.reference_seqs_fp
        params['Max E value'] = opts.blast_e_value

    elif assignment_method == 'mothur':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp

    elif assignment_method == 'uclust':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['min_consensus_fraction'] = opts.min_consensus_fraction
        params['similarity'] = similarity
        params['max_accepts'] = opts.uclust_max_accepts

    elif assignment_method == 'sortmerna':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['sortmerna_db'] = sortmerna_db
        params['min_consensus_fraction'] = opts.min_consensus_fraction
        params['min_percent_id'] = float(similarity * 100.0)
        params['min_percent_cov'] = float(sortmerna_coverage * 100.0)
        params['best_N_alignments'] = opts.sortmerna_best_N_alignments
        params['e_value'] = opts.sortmerna_e_value
        params['threads'] = opts.sortmerna_threads

    elif assignment_method == 'rdp':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['training_data_properties_fp'] = opts.training_data_properties_fp
        params['max_memory'] = "%sM" % opts.rdp_max_memory

    elif assignment_method == 'rtax':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['read_1_seqs_fp'] = opts.read_1_seqs_fp
        params['read_2_seqs_fp'] = opts.read_2_seqs_fp
        params['single_ok'] = opts.single_ok
        params['no_single_ok_generic'] = opts.no_single_ok_generic
        params['header_id_regex'] = opts.header_id_regex
        params['read_id_regex'] = opts.read_id_regex
        params['amplicon_id_regex'] = opts.amplicon_id_regex

    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)
    fd, temp_result_path = mkstemp(prefix='assign-tax')
    close(fd)
    taxon_assigner = taxon_assigner_constructor(params)
    if assignment_method == "sortmerna":
        taxon_assigner(input_sequences_filepath,
                       result_path=result_path,
                       log_path=log_path)
    else:
        taxon_assigner(input_sequences_filepath,
                       result_path=temp_result_path,
                       log_path=log_path)

        # This is an ugly hack, and needs to be pushed upstream to
        # the taxon assigners (except for sortmerna, which already outputs
        # only the first field for all headers in the Blast tabular output).
        # The output taxonomy maps that are returned by the taxon assigners
        # contain the full sequence headers as the first field (so including
        # "comment" text in the fasta headers), but for consistency with the
        # input taxonomy maps, should only contain the sequence identifier.
        # This modifies those entries to contain only the sequence identifier,
        # discarding any comment information. The formatting of these result
        # files needs to be centralized, and at that stage this processing
        # should happen there rather than here.
        result_f = open(result_path, 'w')
        for line in open(temp_result_path, 'U'):
            fields = line.strip().split('\t')
            seq_id = fields[0].split()[0]
            result_f.write('%s\t%s\n' % (seq_id, '\t'.join(fields[1:])))
        result_f.close()
        remove_files([temp_result_path])
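
The loop at the end of this example is the "ugly hack" described in the comment above it: each tab-separated assignment line is rewritten so that its first field keeps only the bare sequence identifier. A standalone illustration of that transformation (the sample line below is made up):

line = 'seq1 some header comment\tk__Bacteria; p__Firmicutes\t0.97\n'
fields = line.strip().split('\t')
seq_id = fields[0].split()[0]  # drop the header comment, keep 'seq1'
print('%s\t%s' % (seq_id, '\t'.join(fields[1:])))
# -> 'seq1\tk__Bacteria; p__Firmicutes\t0.97'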
Example #10
    def tearDown(self):
        remove_files(self._paths_to_clean_up)
Example #11
    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)
    temp_result_path = get_tmp_filename(prefix='assign-tax')
    taxon_assigner = taxon_assigner_constructor(params)
    taxon_assigner(input_sequences_filepath,
                   result_path=temp_result_path,
                   log_path=log_path)

    # This is an ugly hack, and needs to be pushed upstream to
    # the taxon assigners. The output taxonomy maps that are returned by the
    # taxon assigners contain the full sequence headers as the first field
    # (so including "comment" text in the fasta headers), but for consistency
    # with the input taxonomy maps, should only contain the sequence identifier.
    # This modifies those entries to contain only the sequence identifier,
    # discarding any comment information. The formatting of these result files
    # needs to be centralized, and at that stage this processing should
    # happen there rather than here.
    result_f = open(result_path, 'w')
    for line in open(temp_result_path, 'U'):
        fields = line.strip().split('\t')
        seq_id = fields[0].split()[0]
        result_f.write('%s\t%s\n' % (seq_id, '\t'.join(fields[1:])))
    result_f.close()
    remove_files([temp_result_path])


if __name__ == "__main__":
    main()
Example #12
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method

    if assignment_method == "blast":
        if not opts.id_to_taxonomy_fp:
            option_parser.error("Option --id_to_taxonomy_fp is required when " "assigning with blast.")
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error(
                "Either a blast db (via -b) or a collection of "
                "reference sequences (via -r) must be passed to "
                "assign taxonomy using blast."
            )

    if assignment_method == "rdp":
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    "A filepath for reference sequences must be "
                    "specified (via -r) along with the id_to_taxonomy "
                    "file to train the Rdp Classifier."
                )
        elif opts.reference_seqs_fp is not None:
            option_parser.error(
                "A filepath for an id to taxonomy map must be "
                "specified (via -t) along with the reference "
                "sequences fp to train the Rdp Classifier."
            )
        else:
            pass

    if assignment_method == "uclust":
        if opts.id_to_taxonomy_fp is None:
            option_parser.error("Option --id_to_taxonomy_fp is required when " "assigning with uclust.")
        if opts.reference_seqs_fp is None:
            option_parser.error("Option --reference_seqs_fp is required when " "assigning with uclust.")

    if assignment_method == "rtax":
        if opts.id_to_taxonomy_fp is None or opts.reference_seqs_fp is None:
            option_parser.error(
                "RTAX classification requires both a filepath for "
                "reference sequences (via -r) and an id_to_taxonomy "
                "file (via -t)."
            )
        if opts.read_1_seqs_fp is None:  # or opts.read_2_seqs_fp is None:
            option_parser.error(
                "RTAX classification requires the FASTA files "
                "produced by split_illumina_fastq.py for both reads, "
                "in addition to the cluster representatives.  Pass "
                "these via --read_1_seqs_fp and --read_2_seqs_fp."
            )

    if assignment_method == "mothur":
        if None in [opts.id_to_taxonomy_fp, opts.reference_seqs_fp]:
            option_parser.error(
                "Mothur classification requires both a filepath for "
                "reference sequences (via -r) and an id_to_taxonomy "
                "file (via -t)."
            )

    if assignment_method == "tax2tree":
        if opts.tree_fp is None:
            option_parser.error(
                "Tax2Tree classification requires a "
                "filepath to a prebuilt tree (via --tree_fp) containing "
                "both the representative and reference sequences. Check "
                "Tax2Tree documentation for help building a tree."
            )
        if opts.id_to_taxonomy_fp is None:
            option_parser.error("Tax2Tree classification requires a " "filepath for an id_to_taxonomy file (via -t).")

    taxon_assigner_constructor = assignment_method_constructors[assignment_method]
    input_sequences_filepath = opts.input_fasta_fp

    try:
        id_to_taxonomy_fp = opts.id_to_taxonomy_fp
        params = {"id_to_taxonomy_filepath": id_to_taxonomy_fp}
    except IndexError:
        params = {}

    # Build the output filenames
    output_dir = opts.output_dir or assignment_method + "_assigned_taxonomy"
    try:
        mkdir(output_dir)
    except OSError:
        # output_dir already exists
        pass

    fpath, ext = splitext(input_sequences_filepath)
    input_dir, fname = split(fpath)
    result_path = output_dir + "/" + fname + "_tax_assignments.txt"
    log_path = output_dir + "/" + fname + "_tax_assignments.log"

    if assignment_method == "blast":
        # one of these must have a value, otherwise we'd have
        # an optparse error
        if opts.blast_db:
            params["blast_db"] = opts.blast_db
        else:
            params["reference_seqs_filepath"] = opts.reference_seqs_fp
        params["Max E value"] = opts.e_value

    elif assignment_method == "mothur":
        params["Confidence"] = opts.confidence
        params["id_to_taxonomy_fp"] = opts.id_to_taxonomy_fp
        params["reference_sequences_fp"] = opts.reference_seqs_fp

    elif assignment_method == "uclust":
        params["id_to_taxonomy_fp"] = opts.id_to_taxonomy_fp
        params["reference_sequences_fp"] = opts.reference_seqs_fp
        params["min_consensus_fraction"] = opts.uclust_min_consensus_fraction
        params["similarity"] = opts.uclust_similarity
        params["max_accepts"] = opts.uclust_max_accepts

    elif assignment_method == "rdp":
        params["Confidence"] = opts.confidence
        params["id_to_taxonomy_fp"] = opts.id_to_taxonomy_fp
        params["reference_sequences_fp"] = opts.reference_seqs_fp
        params["training_data_properties_fp"] = opts.training_data_properties_fp
        params["max_memory"] = "%sM" % opts.rdp_max_memory

    elif assignment_method == "rtax":
        params["id_to_taxonomy_fp"] = opts.id_to_taxonomy_fp
        params["reference_sequences_fp"] = opts.reference_seqs_fp
        params["read_1_seqs_fp"] = opts.read_1_seqs_fp
        params["read_2_seqs_fp"] = opts.read_2_seqs_fp
        params["single_ok"] = opts.single_ok
        params["no_single_ok_generic"] = opts.no_single_ok_generic
        params["header_id_regex"] = opts.header_id_regex
        params["read_id_regex"] = opts.read_id_regex
        params["amplicon_id_regex"] = opts.amplicon_id_regex

    elif assignment_method == "tax2tree":
        params["id_to_taxonomy_fp"] = opts.id_to_taxonomy_fp
        params["tree_fp"] = opts.tree_fp

    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)
    fd, temp_result_path = mkstemp(prefix="assign-tax")
    close(fd)
    taxon_assigner = taxon_assigner_constructor(params)
    taxon_assigner(input_sequences_filepath, result_path=temp_result_path, log_path=log_path)

    # This is an ugly hack, and needs to be pushed upstream to
    # the taxon assigners. The output taxonomy maps that are returned by the
    # taxon assigners contain the full sequence headers as the first field
    # (so including "comment" text in the fasta headers), but for consistency
    # with the input taxonomy maps, should only contain the sequence identifier.
    # This modifies those entries to contain only the sequence identifier,
    # discarding any comment information. The formatting of these result files
    # needs to be centralized, and at that stage this processing should
    # happen there rather than here.
    result_f = open(result_path, "w")
    for line in open(temp_result_path, "U"):
        fields = line.strip().split("\t")
        seq_id = fields[0].split()[0]
        result_f.write("%s\t%s\n" % (seq_id, "\t".join(fields[1:])))
    result_f.close()
    remove_files([temp_result_path])
Example #13
    def tearDown(self):
        remove_files(self._paths_to_clean_up)
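
This tearDown assumes a companion setUp that records every path the tests create; a minimal version of that pattern (illustrative, with the attribute name taken from the snippet above) would be:

    def setUp(self):
        # every file a test creates gets appended here so tearDown removes it
        self._paths_to_clean_up = []

    def tearDown(self):
        remove_files(self._paths_to_clean_up)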
Example #14
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method
    similarity = opts.similarity
    sortmerna_coverage = opts.sortmerna_coverage
    sortmerna_db = opts.sortmerna_db

    if assignment_method == 'sortmerna':
        # similarity must be between (0,1]
        if not 0 < similarity <= 1:
            option_parser.error('--similarity must be between (0,1].')
        # coverage must be between (0,1]
        if not 0 < sortmerna_coverage <= 1:
            option_parser.error('--sortmerna_coverage must be '
                                'between (0,1].')
        # check ID to taxonomy filepath
        if not opts.id_to_taxonomy_fp:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with sortmerna.')
        # check reference sequences filepath
        if not opts.reference_seqs_fp:
            option_parser.error('sortmerna always requires --reference_seqs_fp '
                                '(with or without sortmerna_db)')
        # check indexed database, if provided (not mandatory)
        elif sortmerna_db:
            if isfile(sortmerna_db + '.stats') is False:
                option_parser.error('%s does not exist, make sure you have '
                                    'indexed the database using indexdb_rna' %
                                    (sortmerna_db + '.stats'))

    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error('Either a blast db (via -b) or a collection '
                                'of reference sequences (via -r) must be '
                                'passed to assign taxonomy using blast.')

    if assignment_method == 'rdp':
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error(
                'A filepath for an id to taxonomy map must be '
                'specified (via -t) along with the reference '
                'sequences fp to train the Rdp Classifier.')
        else:
            pass

    if assignment_method == 'uclust':
        if opts.id_to_taxonomy_fp is None:
            option_parser.error('--id_to_taxonomy_fp is required when '
                                'assigning with uclust.')
        if opts.reference_seqs_fp is None:
            option_parser.error('--reference_seqs_fp is required when '
                                'assigning with uclust.')

    if assignment_method == 'rtax':
        if opts.id_to_taxonomy_fp is None or opts.reference_seqs_fp is None:
            option_parser.error('RTAX classification requires both a filepath for '
                                'reference sequences (via -r) and an id_to_taxonomy '
                                'file (via -t).')
        if opts.read_1_seqs_fp is None:  # or opts.read_2_seqs_fp is None:
            option_parser.error('RTAX classification requires the FASTA files '
                                'produced by split_illumina_fastq.py for both reads, '
                                'in addition to the cluster representatives.  Pass '
                                'these via --read_1_seqs_fp and --read_2_seqs_fp.')

    if assignment_method == 'mothur':
        if None in [opts.id_to_taxonomy_fp, opts.reference_seqs_fp]:
            option_parser.error(
                'Mothur classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')

    taxon_assigner_constructor =\
        assignment_method_constructors[assignment_method]
    input_sequences_filepath = opts.input_fasta_fp

    try:
        id_to_taxonomy_fp = opts.id_to_taxonomy_fp
        params = {'id_to_taxonomy_filepath': id_to_taxonomy_fp}
    except IndexError:
        params = {}

    # Build the output filenames
    output_dir = opts.output_dir or assignment_method + '_assigned_taxonomy'
    try:
        mkdir(output_dir)
    except OSError:
        # output_dir already exists
        pass

    fpath, ext = splitext(input_sequences_filepath)
    input_dir, fname = split(fpath)
    result_path = output_dir + '/' + fname + '_tax_assignments.txt'
    log_path = output_dir + '/' + fname + '_tax_assignments.log'

    if assignment_method == 'blast':
        # one of these must have a value, otherwise we'd have
        # an optparse error
        if opts.blast_db:
            params['blast_db'] = opts.blast_db
        else:
            params['reference_seqs_filepath'] = opts.reference_seqs_fp
        params['Max E value'] = opts.blast_e_value

    elif assignment_method == 'mothur':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp

    elif assignment_method == 'uclust':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['min_consensus_fraction'] = opts.min_consensus_fraction
        params['similarity'] = similarity
        params['max_accepts'] = opts.uclust_max_accepts

    elif assignment_method == 'sortmerna':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['sortmerna_db'] = sortmerna_db
        params['min_consensus_fraction'] = opts.min_consensus_fraction
        params['min_percent_id'] = float(similarity * 100.0)
        params['min_percent_cov'] = float(sortmerna_coverage * 100.0)
        params['best_N_alignments'] = opts.sortmerna_best_N_alignments
        params['e_value'] = opts.sortmerna_e_value
        params['threads'] = opts.sortmerna_threads

    elif assignment_method == 'rdp':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['training_data_properties_fp'] = opts.training_data_properties_fp
        params['max_memory'] = "%sM" % opts.rdp_max_memory

    elif assignment_method == 'rtax':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['read_1_seqs_fp'] = opts.read_1_seqs_fp
        params['read_2_seqs_fp'] = opts.read_2_seqs_fp
        params['single_ok'] = opts.single_ok
        params['no_single_ok_generic'] = opts.no_single_ok_generic
        params['header_id_regex'] = opts.header_id_regex
        params['read_id_regex'] = opts.read_id_regex
        params['amplicon_id_regex'] = opts.amplicon_id_regex

    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)
    fd, temp_result_path = mkstemp(prefix='assign-tax')
    close(fd)
    taxon_assigner = taxon_assigner_constructor(params)
    if assignment_method == "sortmerna":
        taxon_assigner(input_sequences_filepath,
                       result_path=result_path,
                       log_path=log_path)
    else:
        taxon_assigner(input_sequences_filepath,
                       result_path=temp_result_path,
                       log_path=log_path)

        # This is an ugly hack, and needs to be pushed upstream to
        # the taxon assigners (except for sortmerna, which already outputs
        # only the first field for all headers in the Blast tabular output).
        # The output taxonomy maps that are returned by the taxon assigners
        # contain the full sequence headers as the first field (so including
        # "comment" text in the fasta headers), but for consistency with the
        # input taxonomy maps, should only contain the sequence identifier.
        # This modifies those entries to contain only the sequence identifier,
        # discarding any comment information. The formatting of these result
        # files needs to be centralized, and at that stage this processing
        # should happen there rather than here.
        result_f = open(result_path, 'w')
        for line in open(temp_result_path, 'U'):
            fields = line.strip().split('\t')
            seq_id = fields[0].split()[0]
            result_f.write('%s\t%s\n' % (seq_id, '\t'.join(fields[1:])))
        result_f.close()
        remove_files([temp_result_path])
Example #15
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method

    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error(
                'Either a blast db (via -b) or a collection of '
                'reference sequences (via -r) must be passed to '
                'assign taxonomy using blast.')

    if assignment_method == 'rdp':
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error('A filepath for an id to taxonomy map must be '
                                'specified (via -t) along with the reference '
                                'sequences fp to train the Rdp Classifier.')
        else:
            pass

    if assignment_method == 'uclust':
        if opts.id_to_taxonomy_fp is None:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with uclust.')
        if opts.reference_seqs_fp is None:
            option_parser.error('Option --reference_seqs_fp is required when '
                                'assigning with uclust.')

    if assignment_method == 'rtax':
        if opts.id_to_taxonomy_fp is None or opts.reference_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')
        if opts.read_1_seqs_fp is None:  # or opts.read_2_seqs_fp is None:
            option_parser.error(
                'RTAX classification requires the FASTA files '
                'produced by split_illumina_fastq.py for both reads, '
                'in addition to the cluster representatives.  Pass '
                'these via --read_1_seqs_fp and --read_2_seqs_fp.')

    if assignment_method == 'mothur':
        if None in [opts.id_to_taxonomy_fp, opts.reference_seqs_fp]:
            option_parser.error(
                'Mothur classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')

    if assignment_method == 'tax2tree':
        if opts.tree_fp is None:
            option_parser.error(
                'Tax2Tree classification requires a '
                'filepath to a prebuilt tree (via --tree_fp) containing '
                'both the representative and reference sequences. Check '
                'Tax2Tree documentation for help building a tree.')
        if opts.id_to_taxonomy_fp is None:
            option_parser.error(
                'Tax2Tree classification requires a '
                'filepath for an id_to_taxonomy file (via -t).')

    taxon_assigner_constructor =\
        assignment_method_constructors[assignment_method]
    input_sequences_filepath = opts.input_fasta_fp

    try:
        id_to_taxonomy_fp = opts.id_to_taxonomy_fp
        params = {'id_to_taxonomy_filepath': id_to_taxonomy_fp}
    except IndexError:
        params = {}

    # Build the output filenames
    output_dir = opts.output_dir or assignment_method + '_assigned_taxonomy'
    try:
        mkdir(output_dir)
    except OSError:
        # output_dir already exists
        pass

    fpath, ext = splitext(input_sequences_filepath)
    input_dir, fname = split(fpath)
    result_path = output_dir + '/' + fname + '_tax_assignments.txt'
    log_path = output_dir + '/' + fname + '_tax_assignments.log'

    if assignment_method == 'blast':
        # one of these must have a value, otherwise we'd have
        # an optparse error
        if opts.blast_db:
            params['blast_db'] = opts.blast_db
        else:
            params['reference_seqs_filepath'] = opts.reference_seqs_fp
        params['Max E value'] = opts.e_value

    elif assignment_method == 'mothur':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp

    elif assignment_method == 'uclust':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['min_consensus_fraction'] = opts.uclust_min_consensus_fraction
        params['similarity'] = opts.uclust_similarity
        params['max_accepts'] = opts.uclust_max_accepts

    elif assignment_method == 'rdp':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['training_data_properties_fp'] = opts.training_data_properties_fp
        params['max_memory'] = "%sM" % opts.rdp_max_memory

    elif assignment_method == 'rtax':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['read_1_seqs_fp'] = opts.read_1_seqs_fp
        params['read_2_seqs_fp'] = opts.read_2_seqs_fp
        params['single_ok'] = opts.single_ok
        params['no_single_ok_generic'] = opts.no_single_ok_generic
        params['header_id_regex'] = opts.header_id_regex
        params['read_id_regex'] = opts.read_id_regex
        params['amplicon_id_regex'] = opts.amplicon_id_regex

    elif assignment_method == 'tax2tree':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['tree_fp'] = opts.tree_fp

    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)
    temp_result_path = get_tmp_filename(prefix='assign-tax')
    taxon_assigner = taxon_assigner_constructor(params)
    taxon_assigner(input_sequences_filepath,
                   result_path=temp_result_path,
                   log_path=log_path)

    # This is an ugly hack, and needs to be pushed upstream to
    # the taxon assigners. The output taxonomy maps that are returned by the
    # taxon assigners contain the full sequence headers as the first field
    # (so including "comment" text in the fasta headers), but for consistency
    # with the input taxonomy maps, should only contain the sequence identifier.
    # This modifies those entries to contain only the sequence identifier,
    # discarding any comment information. The formatting of these result files
    # needs to be centralized, and at that stage this processing should
    # happen there rather than here.
    result_f = open(result_path, 'w')
    for line in open(temp_result_path, 'U'):
        fields = line.strip().split('\t')
        seq_id = fields[0].split()[0]
        result_f.write('%s\t%s\n' % (seq_id, '\t'.join(fields[1:])))
    result_f.close()
    remove_files([temp_result_path])
Example #16
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)
    temp_result_path = get_tmp_filename(prefix='assign-tax')
    taxon_assigner = taxon_assigner_constructor(params)
    taxon_assigner(input_sequences_filepath,
                   result_path=temp_result_path,
                   log_path=log_path)
    
    # This is an ugly hack, and needs to be pushed upstream to
    # the taxon assigners. The output taxonomy maps that are returned by the
    # taxon assigners contain the full sequence headers as the first field
    # (so including "comment" text in the fasta headers), but for consistency
    # with the input taxonomy maps, should only contain the sequence identifier.
    # This modifies those entries to contain only the sequence identifier,
    # discarding any comment information. The formatting of these result files
    # needs to be centralized, and at that stage this processing should
    # happen there rather than here.
    result_f = open(result_path, 'w')
    for line in open(temp_result_path, 'U'):
        fields = line.strip().split('\t')
        seq_id = fields[0].split()[0]
        result_f.write('%s\t%s\n' % (seq_id, '\t'.join(fields[1:])))
    result_f.close()
    remove_files([temp_result_path])


if __name__ == "__main__":
    main()
Example #17
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
    assignment_method = opts.assignment_method

    if assignment_method == 'blast':
        if not opts.id_to_taxonomy_fp:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with blast.')
        if not (opts.reference_seqs_fp or opts.blast_db):
            option_parser.error('Either a blast db (via -b) or a collection of '
                                'reference sequences (via -r) must be passed to '
                                'assign taxonomy using blast.')

    if assignment_method == 'rdp':
        try:
            validate_rdp_version()
        except RuntimeError as e:
            option_parser.error(e)

        if opts.id_to_taxonomy_fp is not None:
            if opts.reference_seqs_fp is None:
                option_parser.error(
                    'A filepath for reference sequences must be '
                    'specified (via -r) along with the id_to_taxonomy '
                    'file to train the Rdp Classifier.')
        elif opts.reference_seqs_fp is not None:
            option_parser.error(
                'A filepath for an id to taxonomy map must be '
                'specified (via -t) along with the reference '
                'sequences fp to train the Rdp Classifier.')
        else:
            pass

    if assignment_method == 'uclust':
        if opts.id_to_taxonomy_fp is None:
            option_parser.error('Option --id_to_taxonomy_fp is required when '
                                'assigning with uclust.')
        if opts.reference_seqs_fp is None:
            option_parser.error('Option --reference_seqs_fp is required when '
                                'assigning with uclust.')

    if assignment_method == 'rtax':
        if opts.id_to_taxonomy_fp is None or opts.reference_seqs_fp is None:
            option_parser.error('RTAX classification requires both a filepath for '
                                'reference sequences (via -r) and an id_to_taxonomy '
                                'file (via -t).')
        if opts.read_1_seqs_fp is None:  # or opts.read_2_seqs_fp is None:
            option_parser.error('RTAX classification requires the FASTA files '
                                'produced by split_illumina_fastq.py for both reads, '
                                'in addition to the cluster representatives.  Pass '
                                'these via --read_1_seqs_fp and --read_2_seqs_fp.')

    if assignment_method == 'mothur':
        if None in [opts.id_to_taxonomy_fp, opts.reference_seqs_fp]:
            option_parser.error(
                'Mothur classification requires both a filepath for '
                'reference sequences (via -r) and an id_to_taxonomy '
                'file (via -t).')

    if assignment_method == 'tax2tree':
        if opts.tree_fp is None:
            option_parser.error('Tax2Tree classification requires a '
                                'filepath to a prebuilt tree (via --tree_fp) containing '
                                'both the representative and reference sequences. Check '
                                'Tax2Tree documentation for help building a tree.')
        if opts.id_to_taxonomy_fp is None:
            option_parser.error('Tax2Tree classification requires a '
                                'filepath for an id_to_taxonomy file (via -t).')

    taxon_assigner_constructor =\
        assignment_method_constructors[assignment_method]
    input_sequences_filepath = opts.input_fasta_fp

    try:
        id_to_taxonomy_fp = opts.id_to_taxonomy_fp
        params = {'id_to_taxonomy_filepath': id_to_taxonomy_fp}
    except IndexError:
        params = {}

    # Build the output filenames
    output_dir = opts.output_dir or assignment_method + '_assigned_taxonomy'
    try:
        mkdir(output_dir)
    except OSError:
        # output_dir already exists
        pass

    fpath, ext = splitext(input_sequences_filepath)
    input_dir, fname = split(fpath)
    result_path = output_dir + '/' + fname + '_tax_assignments.txt'
    log_path = output_dir + '/' + fname + '_tax_assignments.log'

    if assignment_method == 'blast':
        # one of these must have a value, otherwise we'd have
        # an optparse error
        if opts.blast_db:
            params['blast_db'] = opts.blast_db
        else:
            params['reference_seqs_filepath'] = opts.reference_seqs_fp
        params['Max E value'] = opts.e_value

    elif assignment_method == 'mothur':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp

    elif assignment_method == 'uclust':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['min_consensus_fraction'] = opts.uclust_min_consensus_fraction
        params['similarity'] = opts.uclust_similarity
        params['max_accepts'] = opts.uclust_max_accepts

    elif assignment_method == 'rdp':
        params['Confidence'] = opts.confidence
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['training_data_properties_fp'] = opts.training_data_properties_fp
        params['max_memory'] = "%sM" % opts.rdp_max_memory

    elif assignment_method == 'rtax':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['reference_sequences_fp'] = opts.reference_seqs_fp
        params['read_1_seqs_fp'] = opts.read_1_seqs_fp
        params['read_2_seqs_fp'] = opts.read_2_seqs_fp
        params['single_ok'] = opts.single_ok
        params['no_single_ok_generic'] = opts.no_single_ok_generic
        params['header_id_regex'] = opts.header_id_regex
        params['read_id_regex'] = opts.read_id_regex
        params['amplicon_id_regex'] = opts.amplicon_id_regex

    elif assignment_method == 'tax2tree':
        params['id_to_taxonomy_fp'] = opts.id_to_taxonomy_fp
        params['tree_fp'] = opts.tree_fp

    else:
        # should not be able to get here as an unknown classifier would
        # have raised an optparse error
        exit(1)
    temp_result_path = get_tmp_filename(prefix='assign-tax')
    taxon_assigner = taxon_assigner_constructor(params)
    taxon_assigner(input_sequences_filepath,
                   result_path=temp_result_path,
                   log_path=log_path)

    # This is an ugly hack, and needs to be pushed upstream to
    # the taxon assigners. The output taxonomy maps that are returned by the
    # taxon assigners contain the full sequence headers as the first field
    # (so including "comment" text in the fasta headers), but for consistency
    # with the input taxonomy maps, should only contain the sequence identifier.
    # This modifies those entries to contain only the sequence identifier,
    # discarding any comment information. The formatting of these result files
    # needs to be centralized, and at that stage this processing should
    # happen there rather than here.
    result_f = open(result_path, 'w')
    for line in open(temp_result_path, 'U'):
        fields = line.strip().split('\t')
        seq_id = fields[0].split()[0]
        result_f.write('%s\t%s\n' % (seq_id, '\t'.join(fields[1:])))
    result_f.close()
    remove_files([temp_result_path])