예제 #1
0
def load_tree_files(tree_dir):
    """Load every tree file found in a directory

    Hidden files (names starting with '.') are ignored.  Before loading,
    each file name is run through parse_rarefaction_fname; if the names
    yield more than one distinct base name the user is warned, since the
    trees may have been generated in different manners.  If any name
    fails to parse (ValueError) that check is skipped.

    tree_dir: path of the directory containing the tree files

    returns a list with one TreeNode per file, in os.listdir order; each
    tree gets a ``filepath`` attribute holding the file name (not the
    full path)

    raises a RuntimeError if no trees are loaded
    an IOError while loading a tree writes a message to stderr and exits
    the process with status 1
    """
    # ignore invisible files like .DS_Store
    tree_file_names = [
        fname for fname in os.listdir(tree_dir) if not fname.startswith('.')
    ]

    # try to warn user if using multiple types of trees {
    try:
        base_names = [
            parse_rarefaction_fname(fname)[0] for fname in tree_file_names
        ]
    except ValueError:
        # best effort only: unparseable names just disable the warning
        pass
    else:
        distinct_names = set(base_names)
        if len(distinct_names) > 1:
            warnstr = """
warning: trees are named differently, please be sure you're not
comparing trees generated in different manners, unless you're quite sure
that's what you intend to do.  types: """ + str(distinct_names) + """
continuing anyway..."""
            warn(warnstr)
    # }

    trees = []
    for fname in tree_file_names:
        try:
            # the with-block closes the handle even when parsing fails.
            # NOTE: 'U' (universal newlines) is kept for Python 2
            # compatibility; Python 3.11+ no longer accepts this mode.
            with open(os.path.join(tree_dir, fname), 'U') as f:
                tree = TreeNode.from_newick(f)
        except IOError:
            sys.stderr.write('error loading tree ' + fname + '\n')
            # sys.exit rather than the site-provided exit() builtin, which
            # is not guaranteed to exist (e.g. python -S)
            sys.exit(1)
        tree.filepath = fname
        trees.append(tree)

    if not trees:
        raise RuntimeError('Error: no trees loaded' +
                           ', check that tree directory has has valid trees')
    return trees
예제 #2
0
def load_tree_files(tree_dir):
    """Load every tree file found in a directory

    Hidden files (names starting with '.') are ignored.  Before loading,
    each file name is run through parse_rarefaction_fname; if the names
    yield more than one distinct base name the user is warned, since the
    trees may have been generated in different manners.  If any name
    fails to parse (ValueError) that check is skipped.

    tree_dir: path of the directory containing the tree files

    returns a list with one TreeNode per file, in os.listdir order; each
    tree gets a ``filepath`` attribute holding the file name (not the
    full path)

    raises a RuntimeError if no trees are loaded
    an IOError while loading a tree writes a message to stderr and exits
    the process with status 1
    """
    # ignore invisible files like .DS_Store
    visible_names = [
        fname for fname in os.listdir(tree_dir) if not fname.startswith('.')
    ]

    # try to warn user if using multiple types of trees {
    try:
        base_names = [
            parse_rarefaction_fname(fname)[0] for fname in visible_names
        ]
    except ValueError:
        # best effort only: unparseable names just disable the warning
        pass
    else:
        distinct_names = set(base_names)
        if len(distinct_names) > 1:
            warnstr = """
warning: trees are named differently, please be sure you're not
comparing trees generated in different manners, unless you're quite sure
that's what you intend to do.  types: """ + str(distinct_names) + """
continuing anyway..."""
            warn(warnstr)
    # }

    trees = []
    for fname in visible_names:
        try:
            # the with-block closes the handle even when parsing fails.
            # NOTE: 'U' (universal newlines) is kept for Python 2
            # compatibility; Python 3.11+ no longer accepts this mode.
            with open(os.path.join(tree_dir, fname), 'U') as f:
                tree = TreeNode.from_newick(f)
        except IOError:
            sys.stderr.write('error loading tree ' + fname + '\n')
            # sys.exit rather than the site-provided exit() builtin, which
            # is not guaranteed to exist (e.g. python -S)
            sys.exit(1)
        tree.filepath = fname
        trees.append(tree)

    if not trees:
        raise RuntimeError('Error: no trees loaded' +
                           ', check that tree directory has has valid trees')
    return trees
예제 #3
0
    def test_run_pick_de_novo_otus_muscle(self):
        """run_pick_de_novo_otus w muscle generates expected results

        Runs the workflow (parallel=False) with 'muscle' as the alignment
        method, then checks the OTU map, taxonomy assignments, alignment,
        tree, OTU table and log file found under self.test_out.
        """
        self.params['assign_taxonomy'] = {
            'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
            'reference_seqs_fp': self.test_data['refseqs'][0],
        }
        self.params['align_seqs'] = {'alignment_method': 'muscle'}
        self.params['filter_alignment'] = {
            'suppress_lane_mask_filter': None,
            'entropy_threshold': '0.10',
        }

        run_pick_de_novo_otus(self.test_data['seqs'][0],
                              self.test_out,
                              call_commands_serially,
                              self.params,
                              self.qiime_config,
                              parallel=False,
                              status_update_callback=no_status_updates)

        # expected output locations, derived from the input file's base name
        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        alignment_fp = join(self.test_out, 'muscle_aligned_seqs',
                            '%s_rep_set_aligned.fasta' % input_file_basename)
        taxonomy_assignments_fp = join(
            self.test_out, 'uclust_assigned_taxonomy',
            '%s_rep_set_tax_assignments.txt' % input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        tree_fp = join(self.test_out, 'rep_set.tre')

        # Number of OTUs (one per OTU map line) matches the manually
        # confirmed count
        with open(otu_map_fp) as otu_map_f:
            otu_map_lines = list(otu_map_f)
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [line.split()[0] for line in otu_map_lines]
        self.assertEqual(num_otus, 14)

        # all otus get taxonomy assignments
        with open(taxonomy_assignments_fp) as tax_f:
            taxonomy_assignment_lines = list(tax_f)
        self.assertEqual(len(taxonomy_assignment_lines), num_otus)

        # all OTUs align
        self.assertEqual(count_seqs(alignment_fp)[0], num_otus)

        # all OTUs in tree
        with open(tree_fp) as f:
            tree = TreeNode.from_newick(f)
        self.assertEqual(len(list(tree.tips())), num_otus)

        # check that the two final output files have non-zero size
        self.assertGreater(getsize(tree_fp), 0)
        self.assertGreater(getsize(otu_table_fp), 0)

        # Check that the log file is created and has size > 0; assert on
        # the glob result so a missing log is a failure, not an IndexError
        log_fps = glob(join(self.test_out, 'log*.txt'))
        self.assertTrue(log_fps)
        self.assertGreater(getsize(log_fps[0]), 0)

        # parse the otu table
        otu_table = load_table(otu_table_fp)
        expected_sample_ids = [
            'f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2', 'not16S.1'
        ]
        # sample IDs are as expected
        self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
        # expected OTUs
        self.assertItemsEqual(otu_table.ids(axis='observation'),
                              otu_map_otu_ids)
        # number of sequences in the full otu table equals the number of
        # input sequences
        number_seqs_in_otu_table = sum(
            v.sum() for v in otu_table.iter_data())
        self.assertEqual(number_seqs_in_otu_table,
                         count_seqs(self.test_data['seqs'][0])[0])
예제 #4
0
    def test_run_pick_de_novo_otus_parallel(self):
        """run_pick_de_novo_otus generates expected results in parallel

        Runs the workflow with parallel=True, using the template alignment
        and lane mask from self.test_data, then checks the returned paths,
        OTU map, taxonomy assignments, alignment/failures, tree, OTU table
        and log file found under self.test_out.
        """
        self.params['assign_taxonomy'] = {
            'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
            'reference_seqs_fp': self.test_data['refseqs'][0],
        }
        self.params['align_seqs'] = {
            'template_fp': self.test_data['refseqs_aligned'][0],
        }
        self.params['filter_alignment'] = {
            'lane_mask_fp': self.test_data['refseqs_aligned_lanemask'][0],
        }
        actual_tree_fp, actual_otu_table_fp = run_pick_de_novo_otus(
            self.test_data['seqs'][0],
            self.test_out,
            call_commands_serially,
            self.params,
            self.qiime_config,
            parallel=True,
            status_update_callback=no_status_updates)

        # expected output locations, derived from the input file's base name
        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        alignment_fp = join(self.test_out, 'pynast_aligned_seqs',
                            '%s_rep_set_aligned.fasta' % input_file_basename)
        failures_fp = join(self.test_out, 'pynast_aligned_seqs',
                           '%s_rep_set_failures.fasta' % input_file_basename)
        taxonomy_assignments_fp = join(
            self.test_out, 'uclust_assigned_taxonomy',
            '%s_rep_set_tax_assignments.txt' % input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        tree_fp = join(self.test_out, 'rep_set.tre')

        # the workflow returns the tree and otu table paths
        self.assertEqual(actual_tree_fp, tree_fp)
        self.assertEqual(actual_otu_table_fp, otu_table_fp)

        # Number of OTUs (one per OTU map line) matches the manually
        # confirmed count
        with open(otu_map_fp) as otu_map_f:
            otu_map_lines = list(otu_map_f)
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [line.split()[0] for line in otu_map_lines]
        self.assertEqual(num_otus, 14)

        # all otus get taxonomy assignments
        with open(taxonomy_assignments_fp) as tax_f:
            taxonomy_assignment_lines = list(tax_f)
        self.assertEqual(len(taxonomy_assignment_lines), num_otus)

        # number of seqs which aligned + num of seqs which failed to
        # align sum to the number of OTUs
        num_aligned = count_seqs(alignment_fp)[0]
        self.assertEqual(num_aligned + count_seqs(failures_fp)[0], num_otus)

        # number of tips in the tree equals the number of sequences that
        # aligned
        with open(tree_fp) as f:
            tree = TreeNode.from_newick(f)
        self.assertEqual(len(list(tree.tips())), count_seqs(alignment_fp)[0])

        # parse the otu table
        otu_table = load_table(otu_table_fp)
        expected_sample_ids = [
            'f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2', 'not16S.1'
        ]
        # sample IDs are as expected
        self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
        # otu ids are as expected
        self.assertItemsEqual(otu_table.ids(axis='observation'),
                              otu_map_otu_ids)
        # number of sequences in the full otu table equals the number of
        # input sequences
        number_seqs_in_otu_table = sum(
            v.sum() for v in otu_table.iter_data())
        self.assertEqual(number_seqs_in_otu_table,
                         count_seqs(self.test_data['seqs'][0])[0])

        # Check that the log file is created and has size > 0; assert on
        # the glob result so a missing log is a failure, not an IndexError
        log_fps = glob(join(self.test_out, 'log*.txt'))
        self.assertTrue(log_fps)
        self.assertGreater(getsize(log_fps[0]), 0)
예제 #5
0
    def test_run_pick_de_novo_otus_muscle(self):
        """run_pick_de_novo_otus w muscle generates expected results

        Runs the workflow (parallel=False) with 'muscle' as the alignment
        method, then checks the OTU map, taxonomy assignments, alignment,
        tree, OTU table and log file found under self.test_out.
        """
        self.params['assign_taxonomy'] = {
            'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
            'reference_seqs_fp': self.test_data['refseqs'][0],
        }
        self.params['align_seqs'] = {'alignment_method': 'muscle'}
        self.params['filter_alignment'] = {
            'suppress_lane_mask_filter': None,
            'entropy_threshold': '0.10',
        }

        run_pick_de_novo_otus(
            self.test_data['seqs'][0],
            self.test_out,
            call_commands_serially,
            self.params,
            self.qiime_config,
            parallel=False,
            status_update_callback=no_status_updates)

        # expected output locations, derived from the input file's base name
        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        alignment_fp = join(self.test_out, 'muscle_aligned_seqs',
                            '%s_rep_set_aligned.fasta' % input_file_basename)
        taxonomy_assignments_fp = join(
            self.test_out, 'uclust_assigned_taxonomy',
            '%s_rep_set_tax_assignments.txt' % input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        tree_fp = join(self.test_out, 'rep_set.tre')

        # Number of OTUs (one per OTU map line) matches the manually
        # confirmed count
        with open(otu_map_fp) as otu_map_f:
            otu_map_lines = list(otu_map_f)
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [line.split()[0] for line in otu_map_lines]
        self.assertEqual(num_otus, 14)

        # all otus get taxonomy assignments
        with open(taxonomy_assignments_fp) as tax_f:
            taxonomy_assignment_lines = list(tax_f)
        self.assertEqual(len(taxonomy_assignment_lines), num_otus)

        # all OTUs align
        self.assertEqual(count_seqs(alignment_fp)[0], num_otus)

        # all OTUs in tree
        with open(tree_fp) as f:
            tree = TreeNode.from_newick(f)
        self.assertEqual(len(list(tree.tips())), num_otus)

        # check that the two final output files have non-zero size
        self.assertGreater(getsize(tree_fp), 0)
        self.assertGreater(getsize(otu_table_fp), 0)

        # Check that the log file is created and has size > 0; assert on
        # the glob result so a missing log is a failure, not an IndexError
        log_fps = glob(join(self.test_out, 'log*.txt'))
        self.assertTrue(log_fps)
        self.assertGreater(getsize(log_fps[0]), 0)

        # parse the otu table
        otu_table = load_table(otu_table_fp)
        expected_sample_ids = [
            'f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2', 'not16S.1',
        ]
        # sample IDs are as expected
        self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
        # expected OTUs
        self.assertItemsEqual(otu_table.ids(axis='observation'),
                              otu_map_otu_ids)
        # number of sequences in the full otu table equals the number of
        # input sequences
        number_seqs_in_otu_table = sum(
            v.sum() for v in otu_table.iter_data())
        self.assertEqual(
            number_seqs_in_otu_table, count_seqs(self.test_data['seqs'][0])[0])
예제 #6
0
    def test_run_pick_de_novo_otus_parallel(self):
        """run_pick_de_novo_otus generates expected results in parallel

        Runs the workflow with parallel=True, using the template alignment
        and lane mask from self.test_data, then checks the returned paths,
        OTU map, taxonomy assignments, alignment/failures, tree, OTU table
        and log file found under self.test_out.
        """
        self.params['assign_taxonomy'] = {
            'id_to_taxonomy_fp': self.test_data['refseqs_tax'][0],
            'reference_seqs_fp': self.test_data['refseqs'][0],
        }
        self.params['align_seqs'] = {
            'template_fp': self.test_data['refseqs_aligned'][0],
        }
        self.params['filter_alignment'] = {
            'lane_mask_fp': self.test_data['refseqs_aligned_lanemask'][0],
        }
        actual_tree_fp, actual_otu_table_fp = run_pick_de_novo_otus(
            self.test_data['seqs'][0],
            self.test_out,
            call_commands_serially,
            self.params,
            self.qiime_config,
            parallel=True,
            status_update_callback=no_status_updates)

        # expected output locations, derived from the input file's base name
        input_file_basename = splitext(split(self.test_data['seqs'][0])[1])[0]
        otu_map_fp = join(self.test_out, 'uclust_picked_otus',
                          '%s_otus.txt' % input_file_basename)
        alignment_fp = join(self.test_out, 'pynast_aligned_seqs',
                            '%s_rep_set_aligned.fasta' % input_file_basename)
        failures_fp = join(self.test_out, 'pynast_aligned_seqs',
                           '%s_rep_set_failures.fasta' % input_file_basename)
        taxonomy_assignments_fp = join(
            self.test_out, 'uclust_assigned_taxonomy',
            '%s_rep_set_tax_assignments.txt' % input_file_basename)
        otu_table_fp = join(self.test_out, 'otu_table.biom')
        tree_fp = join(self.test_out, 'rep_set.tre')

        # the workflow returns the tree and otu table paths
        self.assertEqual(actual_tree_fp, tree_fp)
        self.assertEqual(actual_otu_table_fp, otu_table_fp)

        # Number of OTUs (one per OTU map line) matches the manually
        # confirmed count
        with open(otu_map_fp) as otu_map_f:
            otu_map_lines = list(otu_map_f)
        num_otus = len(otu_map_lines)
        otu_map_otu_ids = [line.split()[0] for line in otu_map_lines]
        self.assertEqual(num_otus, 14)

        # all otus get taxonomy assignments
        with open(taxonomy_assignments_fp) as tax_f:
            taxonomy_assignment_lines = list(tax_f)
        self.assertEqual(len(taxonomy_assignment_lines), num_otus)

        # number of seqs which aligned + num of seqs which failed to
        # align sum to the number of OTUs
        num_aligned = count_seqs(alignment_fp)[0]
        self.assertEqual(num_aligned + count_seqs(failures_fp)[0], num_otus)

        # number of tips in the tree equals the number of sequences that
        # aligned
        with open(tree_fp) as f:
            tree = TreeNode.from_newick(f)
        self.assertEqual(len(list(tree.tips())), count_seqs(alignment_fp)[0])

        # parse the otu table
        otu_table = load_table(otu_table_fp)
        expected_sample_ids = [
            'f1', 'f2', 'f3', 'f4', 'p1', 'p2', 't1', 't2', 'not16S.1',
        ]
        # sample IDs are as expected
        self.assertItemsEqual(otu_table.ids(), expected_sample_ids)
        # otu ids are as expected
        self.assertItemsEqual(otu_table.ids(axis='observation'),
                              otu_map_otu_ids)
        # number of sequences in the full otu table equals the number of
        # input sequences
        number_seqs_in_otu_table = sum(
            v.sum() for v in otu_table.iter_data())
        self.assertEqual(
            number_seqs_in_otu_table, count_seqs(self.test_data['seqs'][0])[0])

        # Check that the log file is created and has size > 0; assert on
        # the glob result so a missing log is a failure, not an IndexError
        log_fps = glob(join(self.test_out, 'log*.txt'))
        self.assertTrue(log_fps)
        self.assertGreater(getsize(log_fps[0]), 0)