Пример #1
0
 def test_cdhit_clusters_from_seqs(self):
     """cdhit_clusters_from_seqs yields the expected clusters for dna_seqs"""
     # Every sequence ends up in its own cluster except seqs 6 and 8,
     # which are expected to be grouped together.
     expected_clusters = [
         ['cdhit_test_seqs_0'],
         ['cdhit_test_seqs_1'],
         ['cdhit_test_seqs_2'],
         ['cdhit_test_seqs_3'],
         ['cdhit_test_seqs_4'],
         ['cdhit_test_seqs_5'],
         ['cdhit_test_seqs_6', 'cdhit_test_seqs_8'],
         ['cdhit_test_seqs_7'],
         ['cdhit_test_seqs_9'],
     ]
     observed_clusters = cdhit_clusters_from_seqs(dna_seqs, DNA)
     self.assertEqual(observed_clusters, expected_clusters)
Пример #2
0
 def test_cdhit_clusters_from_seqs(self):
     """cdhit_clusters_from_seqs yields the expected clusters for dna_seqs"""
     # Every sequence ends up in its own cluster except seqs 6 and 8,
     # which are expected to be grouped together.
     expected_clusters = [
         ['cdhit_test_seqs_0'],
         ['cdhit_test_seqs_1'],
         ['cdhit_test_seqs_2'],
         ['cdhit_test_seqs_3'],
         ['cdhit_test_seqs_4'],
         ['cdhit_test_seqs_5'],
         ['cdhit_test_seqs_6', 'cdhit_test_seqs_8'],
         ['cdhit_test_seqs_7'],
         ['cdhit_test_seqs_9'],
     ]
     observed_clusters = cdhit_clusters_from_seqs(dna_seqs, DNA)
     self.assertEqual(observed_clusters, expected_clusters)
Пример #3
0
    def __call__(self,
                 seq_path,
                 result_path=None,
                 log_path=None,
                 id_len=0,
                 prefix_prefilter_length=None,
                 trie_prefilter=False):
        """Cluster the sequences in seq_path into OTUs with cd-hit-est.

        Returns a dict mapping {otu_id:[seq_ids]} for each otu, or None
        when result_path is given (the clusters are then written to that
        file instead of being returned).

        Parameters:
        seq_path: path to file of sequences
        result_path: path to file of results. If specified,
         dumps the result to the desired path (one tab-separated line
         per cluster: otu_id followed by its seq ids) instead of
         returning it.
        log_path: path to log, which includes dump of params.
        id_len: if set, truncates ids to n chars (you don't want this!)
        prefix_prefilter_length: prefilters the sequence collection so
         sequences whose first prefix_prefilter_length characters are
         identical will automatically be grouped into the same OTU [off by
         default, 100 is typically a good value if this filtering is
         desired] -- useful for large sequence collections, when cdhit
         doesn't scale well
        trie_prefilter: prefilter the sequence collection such that all
         sequences which are a prefix of another sequence are clustered
         with the other sequence. Together with cd-hit this is a
         non-heuristic filter that reduces run time a lot. Still a bit
         slower than the prefix_prefilter toggled with
         prefix_prefilter_length. Ignored (and reported in the log) when
         prefix_prefilter_length is also given.
        """
        moltype = DNA
        log_lines = []

        # create the params dict to pass to cd-hit-est -- IS THERE A
        # BETTER WAY TO MAKE self.Params INTO THE params DICT TO PASS
        # TO THE APPLICATION CONTROLLERS?
        cd_hit_params = copy(self.Params)
        # These two entries describe the picker itself and are not
        # cd-hit-est options, so they are removed before the call.
        del cd_hit_params['Application']
        del cd_hit_params['Algorithm']
        cd_hit_params['-d'] = id_len  # 0 (the default) turns off id truncation
        # -g 1: cd-hit's "accurate" mode (assign each sequence to its most
        # similar cluster rather than the first one passing the threshold).
        cd_hit_params['-g'] = "1"

        # The two prefilters are mutually exclusive; the prefix-based one
        # takes precedence.
        if prefix_prefilter_length is not None and trie_prefilter:
            log_lines.append(
                "Both prefilters selected. Deactivate trie_prefilter")
            trie_prefilter = False

        if prefix_prefilter_length is not None:
            log_lines.append(
                'Prefix-based prefiltering, prefix length: %d'
                % prefix_prefilter_length)
            # Close the input handle once prefiltering is done. The result
            # supports len(), so it is assumed to be fully materialized by
            # the time _prefilter_exact_prefixes returns.
            with open(seq_path) as seq_file:
                seqs, filter_map = self._prefilter_exact_prefixes(
                    MinimalFastaParser(seq_file), prefix_prefilter_length)
            log_lines.append(
                'Prefix-based prefiltering, post-filter num seqs: %d'
                % len(seqs))

        elif trie_prefilter:
            log_lines.append('Trie-based prefiltering')
            seqs, filter_map = self._prefilter_with_trie(seq_path)

            log_lines.append(
                'Trie-based prefiltering, post-filter num seqs: %d'
                % len(seqs))

        else:
            log_lines.append('No prefix-based prefiltering.')
            # Load the seq path. Right now, cdhit_clusters_from_seqs
            # doesn't support being passed a file path even though the
            # seqs do get written to a fasta file before being passed
            # to cd-hit-est. We may want to change that in the future
            # to avoid the overhead of loading large sequence collections
            # during this step.
            seqs = LoadSeqs(seq_path,
                            moltype=moltype,
                            aligned=False,
                            label_to_name=lambda x: x.split()[0])

        # Get the clusters by running cd-hit-est against the
        # sequence collection
        clusters = cdhit_clusters_from_seqs(
            seqs=seqs, moltype=moltype, params=cd_hit_params)

        # If a prefilter ran, cd-hit only saw the filtered sequences;
        # expand each cluster back to the full set of original seq ids.
        if prefix_prefilter_length is not None or trie_prefilter:
            clusters = self._map_filtered_clusters_to_full_clusters(
                clusters, filter_map)

        if result_path:
            # if the user provided a result_path, write the
            # results to file with one tab-separated line per
            # cluster
            with open(result_path, 'w') as result_file:
                for i, cluster in enumerate(clusters):
                    result_file.write('%s\t%s\n' % (i, '\t'.join(cluster)))
            result = None
            log_lines.append('Result path: %s' % result_path)
        else:
            # if the user did not provide a result_path, store
            # the clusters in a dict of {otu_id:[seq_ids]}, where
            # otu_id is arbitrary
            result = dict(enumerate(clusters))
            log_lines.append('Result path: None, returned as dict.')

        if log_path:
            # if the user provided a log file path, log the run; the
            # context manager guarantees the log is flushed and closed
            log_lines = [str(self)] + log_lines
            with open(log_path, 'w') as log_file:
                log_file.write('\n'.join(log_lines))

        # return the result (note this is None if the data was
        # written to file)
        return result
Пример #4
0
    def __call__(
        self, seq_path, result_path=None, log_path=None, id_len=0, prefix_prefilter_length=None, trie_prefilter=False
    ):
        """Cluster the sequences in seq_path into OTUs with cd-hit-est.

        Returns a dict mapping {otu_id:[seq_ids]} for each otu, or None
        when result_path is given (the clusters are then written to that
        file instead of being returned).

        Parameters:
        seq_path: path to file of sequences
        result_path: path to file of results. If specified,
         dumps the result to the desired path (one tab-separated line
         per cluster: otu_id followed by its seq ids) instead of
         returning it.
        log_path: path to log, which includes dump of params.
        id_len: if set, truncates ids to n chars (you don't want this!)
        prefix_prefilter_length: prefilters the sequence collection so
         sequences whose first prefix_prefilter_length characters are
         identical will automatically be grouped into the same OTU [off by
         default, 100 is typically a good value if this filtering is
         desired] -- useful for large sequence collections, when cdhit
         doesn't scale well
        trie_prefilter: prefilter the sequence collection such that all
         sequences which are a prefix of another sequence are clustered
         with the other sequence. Together with cd-hit this is a
         non-heuristic filter that reduces run time a lot. Still a bit
         slower than the prefix_prefilter toggled with
         prefix_prefilter_length. Ignored (and reported in the log) when
         prefix_prefilter_length is also given.
        """
        moltype = DNA
        log_lines = []

        # create the params dict to pass to cd-hit-est -- IS THERE A
        # BETTER WAY TO MAKE self.Params INTO THE params DICT TO PASS
        # TO THE APPLICATION CONTROLLERS?
        cd_hit_params = copy(self.Params)
        # These two entries describe the picker itself and are not
        # cd-hit-est options, so they are removed before the call.
        del cd_hit_params["Application"]
        del cd_hit_params["Algorithm"]
        cd_hit_params["-d"] = id_len  # 0 (the default) turns off id truncation
        # -g 1: cd-hit's "accurate" mode (assign each sequence to its most
        # similar cluster rather than the first one passing the threshold).
        cd_hit_params["-g"] = "1"

        # The two prefilters are mutually exclusive; the prefix-based one
        # takes precedence.
        if prefix_prefilter_length is not None and trie_prefilter:
            log_lines.append("Both prefilters selected. Deactivate trie_prefilter")
            trie_prefilter = False

        if prefix_prefilter_length is not None:
            log_lines.append("Prefix-based prefiltering, prefix length: %d" % prefix_prefilter_length)
            # Close the input handle once prefiltering is done. The result
            # supports len(), so it is assumed to be fully materialized by
            # the time _prefilter_exact_prefixes returns.
            with open(seq_path) as seq_file:
                seqs, filter_map = self._prefilter_exact_prefixes(
                    MinimalFastaParser(seq_file), prefix_prefilter_length
                )
            log_lines.append("Prefix-based prefiltering, post-filter num seqs: %d" % len(seqs))

        elif trie_prefilter:
            log_lines.append("Trie-based prefiltering")
            seqs, filter_map = self._prefilter_with_trie(seq_path)

            log_lines.append("Trie-based prefiltering, post-filter num seqs: %d" % len(seqs))

        else:
            log_lines.append("No prefix-based prefiltering.")
            # Load the seq path. Right now, cdhit_clusters_from_seqs
            # doesn't support being passed a file path even though the
            # seqs do get written to a fasta file before being passed
            # to cd-hit-est. We may want to change that in the future
            # to avoid the overhead of loading large sequence collections
            # during this step.
            seqs = LoadSeqs(seq_path, moltype=moltype, aligned=False, label_to_name=lambda x: x.split()[0])

        # Get the clusters by running cd-hit-est against the
        # sequence collection
        clusters = cdhit_clusters_from_seqs(seqs=seqs, moltype=moltype, params=cd_hit_params)

        # If a prefilter ran, cd-hit only saw the filtered sequences;
        # expand each cluster back to the full set of original seq ids.
        if prefix_prefilter_length is not None or trie_prefilter:
            clusters = self._map_filtered_clusters_to_full_clusters(clusters, filter_map)

        if result_path:
            # if the user provided a result_path, write the
            # results to file with one tab-separated line per
            # cluster
            with open(result_path, "w") as result_file:
                for i, cluster in enumerate(clusters):
                    result_file.write("%s\t%s\n" % (i, "\t".join(cluster)))
            result = None
            log_lines.append("Result path: %s" % result_path)
        else:
            # if the user did not provide a result_path, store
            # the clusters in a dict of {otu_id:[seq_ids]}, where
            # otu_id is arbitrary
            result = dict(enumerate(clusters))
            log_lines.append("Result path: None, returned as dict.")

        if log_path:
            # if the user provided a log file path, log the run; the
            # context manager guarantees the log is flushed and closed
            log_lines = [str(self)] + log_lines
            with open(log_path, "w") as log_file:
                log_file.write("\n".join(log_lines))

        # return the result (note this is None if the data was
        # written to file)
        return result