Exemplo n.º 1
0
    def test_clusters_from_uc_file(self):
        """ clusters_from_uc_file functions as expected """

        observed = clusters_from_uc_file(self.uc_lines1)
        # (clusters, failures, new_seeds)
        expected = ({'s2': ['s2', 's3']}, ['s1'], ['s2'])
        self.assertEqual(observed, expected)
Exemplo n.º 2
0
    def test_clusters_from_uc_file(self):
        """ clusters_from_uc_file functions as expected """

        expected = (
            {"s2": ["s2", "s3"]},  # clusters keyed by seed
            ["s1"],                # failures
            ["s2"],                # new seeds
        )
        self.assertEqual(clusters_from_uc_file(self.uc_lines1), expected)
Exemplo n.º 3
0
    def test_clusters_from_uc_file_multiple_hits(self):
        """ clusters_from_uc_file handles error_on_multiple_hits correctly
        """
        lines = self.uc_lines_w_multiple_hits_per_query

        # A query hitting more than one seed raises when
        # error_on_multiple_hits=True.
        self.assertRaises(
            UclustParseError,
            clusters_from_uc_file,
            lines,
            error_on_multiple_hits=True,
        )

        # With error_on_multiple_hits=False the query shows up in every
        # cluster it hit.
        observed = clusters_from_uc_file(lines, error_on_multiple_hits=False)
        self.assertEqual(
            observed,
            ({"s2": ["s2", "s3"], "s4": ["s4", "s3"]}, ["s1"], ["s2", "s4"]),
        )
Exemplo n.º 4
0
 def test_clusters_from_uc_file_multiple_hits(self):
     """ clusters_from_uc_file handles error_on_multiple_hits correctly
     """
     # error_on_multiple_hits=True: a query matching several seeds is fatal
     self.assertRaises(UclustParseError,
                       clusters_from_uc_file,
                       self.uc_lines_w_multiple_hits_per_query,
                       error_on_multiple_hits=True)

     # error_on_multiple_hits=False: the query joins every matching cluster
     observed = clusters_from_uc_file(self.uc_lines_w_multiple_hits_per_query,
                                      error_on_multiple_hits=False)
     expected = ({'s2': ['s2', 's3'], 's4': ['s4', 's3']},
                 ['s1'],
                 ['s2', 's4'])
     self.assertEqual(observed, expected)
Exemplo n.º 5
0
def cluster_seqs(seqspath, simm, folderout='/tmp', gapopen=None, gapext=None):
    """Cluster the sequences in a FASTA file with uclust.

    Parameters:
        seqspath: path to the FASTA file of sequences to cluster.
        simm: similarity threshold; coerced to float for uclust's --id.
        folderout: directory where clusters.uc / clusters.log are written.
        gapopen, gapext: optional uclust gap penalties, passed through only
            when not None.

    Returns:
        dict mapping "cluster_<n>" to a list of (full_header, seq) tuples,
        one entry per cluster reported by uclust.
    """
    # Normalize the output directory so filenames can be appended directly.
    # endswith() also tolerates an empty string, which the original
    # folderout[-1] indexing would crash on.
    if not folderout.endswith("/"):
        folderout += "/"

    params = {
        '--usersort': True,
        '--id': float(simm),
        '--maxaccepts': 20,
        '--maxrejects': 500,
        '--stepwords': 20,
        '--hsp': 0,
        '--match': 1,
        '--mismatch': -1,
    }
    # Gap penalties are optional flags: only pass them when supplied.
    if gapopen is not None:
        params['--gapopen'] = gapopen
    if gapext is not None:
        params['--gapext'] = gapext
    uclust = Uclust(params, WorkingDir='/tmp')
    input_data = {
        '--input': seqspath,
        '--uc': folderout + "clusters.uc",
        '--log': folderout + "clusters.log"
    }
    result = uclust(input_data)
    clusters, failures, newseeds = clusters_from_uc_file(result['ClusterFile'])

    seqs = LoadSeqs(seqspath, aligned=False)
    # Dictionary to convert shortened headers back to full headers.
    convheader = {header.split()[0]: header for header in seqs.getSeqNames()}
    # Match headers in each cluster to seqs to build cluster tuple lists.
    clusterseqs = {}
    for num, cluster in enumerate(clusters):
        clusterseqs["cluster_" + str(num)] = [
            (convheader[header], seqs.getSeq(convheader[header]))
            for header in clusters[cluster]
        ]

    return clusterseqs
Exemplo n.º 6
0
def cluster_seqs(seqspath, simm, folderout='/tmp', gapopen=None, gapext=None):
    """Run uclust over the sequences in *seqspath* and group them.

    Returns a dict mapping "cluster_<n>" to a list of
    (full_header, sequence) tuples for that cluster's members.
    """
    # Make sure the output directory path ends in a separator.
    if folderout[-1] != "/":
        folderout += "/"

    params = {
        '--usersort': True,
        '--id': float(simm),
        '--maxaccepts': 20,
        '--maxrejects': 500,
        '--stepwords': 20,
        '--hsp': 0,
        '--match': 1,
        '--mismatch': -1,
    }
    # Optional gap penalties: forward only the ones the caller supplied.
    for flag, value in (('--gapopen', gapopen), ('--gapext', gapext)):
        if value is not None:
            params[flag] = value
    uclust = Uclust(params, WorkingDir='/tmp')
    input_data = {
        '--input': seqspath,
        '--uc': folderout + "clusters.uc",
        '--log': folderout + "clusters.log"
    }
    result = uclust(input_data)
    clusters, failures, newseeds = clusters_from_uc_file(result['ClusterFile'])

    seqs = LoadSeqs(seqspath, aligned=False)
    # Map shortened headers (first whitespace-delimited token) back to the
    # full headers stored in the alignment.
    convheader = {}
    for name in seqs.getSeqNames():
        convheader[name.split()[0]] = name
    # Build the (full_header, sequence) tuple list for each cluster.
    clusterseqs = {}
    for num, seed in enumerate(clusters):
        members = []
        for short in clusters[seed]:
            full = convheader[short]
            members.append((full, seqs.getSeq(full)))
        clusterseqs["cluster_" + str(num)] = members

    return clusterseqs
def create_clusters(fastain, folderout, simmilarity, minseqs=0):
    """Cluster *fastain* with uclust and write OTUs above a size cutoff.

    Parameters:
        fastain: path to the input FASTA file.
        folderout: output path prefix; "<simmilarity>_clusters.*" files are
            appended to it.
        simmilarity: similarity threshold; coerced to float for --id.
        minseqs: minimum total member count (parsed from the "_<count>"
            suffix of each member header) for a cluster to be kept.

    Returns:
        list of OTUs, each a list of (full_header, seq) tuples.
    """
    params = {
        '--log': folderout + str(simmilarity) + "_clusters.log",
        '--usersort': False,
        '--id': float(simmilarity),
        '--maxaccepts': 20,
        '--maxrejects': 500,
        '--stepwords': 20,
        '--w': 12,
        '--gapopen': '10.0',
        '--gapext': '10.0',
    }
    uclust = Uclust(params, WorkingDir='/tmp')
    input_data = {
        '--input': fastain,
        '--uc': folderout + str(simmilarity) + "_clusters.uc"
    }
    result = uclust(input_data)
    clusters, failures, new_seeds = clusters_from_uc_file(result['ClusterFile'])
    # read in headers to rebuild full headers
    # (with-block fixes the leaked file handle in the original)
    headers = {}
    with open(fastain) as fasta:
        for header, seq in MinimalFastaParser(fasta):
            headers[header.split()[0]] = (header, seq)
    otus = []
    with open(folderout + str(simmilarity) + "_clusters.txt", 'w') as otusout:
        for group, cluster in enumerate(clusters):
            # BUG FIX: sum counts over the cluster's member headers.  The
            # original iterated the seed header string character by
            # character ("for seq in cluster"), which cannot yield a
            # "name_count" token.
            count = sum(int(seq.split("_")[1]) for seq in clusters[cluster])
            if count >= minseqs:
                otusout.write(str(group) + "\t")
                otus.append([])
                # map headers back to orignal ones with counts
                for header in clusters[cluster]:
                    # BUG FIX: append to the OTU just created.  Indexing
                    # with 'group' (the enumerate counter) runs past the
                    # end of 'otus' once any cluster is filtered out.
                    otus[-1].append(headers[header])
                    otusout.write(headers[header][0] + "\t")
                otusout.write("\n")
    return otus
Exemplo n.º 8
0
     '--maxaccepts': 20,
     '--maxrejects': 500,
     '--stepwords': 20,
     '--w': 12,
     '--gapopen': '1.0/*TI',
     '--gapext': '1.0',
     '--hsp': 0
 }
 uclust = Uclust(params, WorkingDir='/tmp')
 input_data = {
     '--input': argv[1],
     '--uc': argv[2] + argv[3] + "_clusters.uc",
     '--log': argv[2] + argv[3] + "_clusters.log"
 }
 result = uclust(input_data)
 clusters, failures, new_seeds = clusters_from_uc_file(result['ClusterFile'])
 print "RESULTS: ", len(clusters)
 headers = {}
 for header, seq in MinimalFastaParser(open(argv[1])):
     headers[header.split()[0]] = header
 otusout = open(argv[2] + argv[3] + "_clusters.txt", 'w')
 for group, cluster in enumerate(clusters):
     otusout.write(str(group) + "\t")
     #map headers back to orignal ones with counts
     for header in clusters[cluster]:
         otusout.write(headers[header] + "\t")
     otusout.write("\n")
 otusout.close()
 #log = open(argv[2] + argv[3] + "_clusters.log", 'w')
 #log.write('\n'.join(input_data) + '\n'.join(params) +
 #    "clusters: " + str(len(clusters)) + "\n")