Exemplo n.º 1
0
    def test_matrix_parse(self):
        """Verify which objects treecluster accepts as its data matrix.

        Two well-formed matrices (a numpy array and a list of lists) must be
        clustered without raising; a ragged matrix, matrices containing
        non-numeric cells or rows, and objects that are not matrices at all
        must each make treecluster raise TypeError.
        """
        if TestCluster.module == 'Bio.Cluster':
            from Bio.Cluster import treecluster
        elif TestCluster.module == 'Pycluster':
            from Pycluster import treecluster

        # Well-formed inputs, each paired with the message reported when
        # treecluster refuses it.
        accepted = (
            # Normal matrix, no errors
            (numpy.array([[1.1, 1.2], [1.4, 1.3], [1.1, 1.5], [2.0, 1.5],
                          [1.7, 1.9], [1.7, 1.9], [5.7, 5.9], [5.7, 5.9],
                          [3.1, 3.3], [5.4, 5.3], [5.1, 5.5], [5.0, 5.5],
                          [5.1, 5.2]]),
             "treecluster failed to accept matrix data1"),
            # Another normal matrix, no errors; written as a list
            ([[1.1, 2.2, 3.3, 4.4, 5.5], [3.1, 3.2, 1.3, 2.4, 1.5],
              [4.1, 2.2, 0.3, 5.4, 0.5], [12.1, 2.0, 0.0, 5.0, 0.0]],
             "treecluster failed to accept matrix data2"),
        )

        # Malformed inputs, in the order they are checked.
        rejected = (
            # Ragged matrix
            [[91.1, 92.2, 93.3, 94.4, 95.5], [93.1, 93.2, 91.3, 92.4],
             [94.1, 92.2, 90.3], [12.1, 92.0, 90.0, 95.0, 90.0]],
            # Matrix with bad cells
            [[7.1, 7.2, 7.3, 7.4, 7.5], [7.1, 7.2, 7.3, 7.4, 'snoopy'],
             [7.1, 7.2, 7.3, None, None]],
            # Matrix with a bad row
            [[23.1, 23.2, 23.3, 23.4, 23.5], None,
             [23.1, 23.0, 23.0, 23.0, 23.0]],
            # Various references that don't point to matrices at all
            "snoopy",
            {'a': [[2.3, 1.2], [3.3, 5.6]]},
            [],
            [None],
        )

        for matrix, complaint in accepted:
            try:
                treecluster(matrix)
            except Exception:  # TODO - Which exceptions?
                self.fail(complaint)

        for bad_input in rejected:
            self.assertRaises(TypeError, treecluster, bad_input)
Exemplo n.º 2
0
def get_clusters_from_seqlist(seqlist, dist_threshold=0.05):
    """Cluster a list of sequences by a distance identity threshold

    Every pair of sequences is locally aligned and the pairwise distance is
    taken as ``1 - score / len(longer sequence)``. The distance matrix is
    clustered hierarchically, and the tree is cut into one more cluster than
    there are tree nodes whose distance exceeds ``dist_threshold``.

    Parameters
    ----------
    seqlist : list
        list of sequences as str
    dist_threshold : float
        Max distance value to retain, branches above this length in the
        hierarchical clustering tree will be cut.

    Returns
    -------
    list
        list of lists - input sequences now grouped by cluster; an empty
        list when ``seqlist`` is empty
    list
        list of int - cluster memberships of the originally input list
    """
    if len(seqlist) == 0:
        # Nothing to align or cluster; avoid handing treecluster an empty
        # distance matrix
        return ([], [])
    if len(seqlist) == 1:
        # Skip alignment if there is only one sequence
        return ([seqlist], [0])

    aligner = PairwiseAligner()
    aligner.mode = "local"

    # Convert sequence list to distance matrix
    seqlens = [len(seq) for seq in seqlist]
    distmatrix = []
    for seq1, len1 in zip(seqlist, seqlens):
        row = []
        for seq2, len2 in zip(seqlist, seqlens):
            # Take percentage identity of pairwise alignment score (match base
            # +1, all other operations +0) over the longer sequence in pair
            idval = aligner.align(seq1, seq2).score / max(len1, len2)
            row.append(1 - idval)  # convert to distance fraction
        distmatrix.append(row)

    # Hierarchical clustering from the distance matrix
    htree = treecluster(data=None, distancematrix=array(distmatrix))
    # Find number of branches with length longer than threshold, and add 1
    # to get number of cuts
    cuts = 1 + sum(1 for i in range(len(htree))
                   if htree[i].distance > dist_threshold)
    clust_ids = list(htree.cut(cuts))

    # Group the sequences by cluster id, keeping first-seen cluster order
    clust_seqs_dict = defaultdict(list)
    for seq, clust_id in zip(seqlist, clust_ids):
        clust_seqs_dict[clust_id].append(seq)
    # Convert dict of lists to list of lists
    clust_seqs = list(clust_seqs_dict.values())
    return (clust_seqs, clust_ids)
Exemplo n.º 3
0
    def test_matrix_parse(self):
        """Check which inputs treecluster accepts as a data matrix.

        data1 and data2 are well-formed and must be clustered without an
        exception; data3 through data9 are malformed in different ways and
        must each make treecluster raise TypeError.
        """
        if TestCluster.module == 'Bio.Cluster':
            from Bio.Cluster import treecluster
        elif TestCluster.module == 'Pycluster':
            from Pycluster import treecluster

        # Normal matrix, no errors
        data1 = numpy.array([[1.1, 1.2],
                             [1.4, 1.3],
                             [1.1, 1.5],
                             [2.0, 1.5],
                             [1.7, 1.9],
                             [1.7, 1.9],
                             [5.7, 5.9],
                             [5.7, 5.9],
                             [3.1, 3.3],
                             [5.4, 5.3],
                             [5.1, 5.5],
                             [5.0, 5.5],
                             [5.1, 5.2]])

        # Another normal matrix, no errors; written as a list
        data2 = [[1.1, 2.2, 3.3, 4.4, 5.5],
                 [3.1, 3.2, 1.3, 2.4, 1.5],
                 [4.1, 2.2, 0.3, 5.4, 0.5],
                 [12.1, 2.0, 0.0, 5.0, 0.0]]

        # Ragged matrix
        data3 = [[91.1, 92.2, 93.3, 94.4, 95.5],
                 [93.1, 93.2, 91.3, 92.4],
                 [94.1, 92.2, 90.3],
                 [12.1, 92.0, 90.0, 95.0, 90.0]]

        # Matrix with bad cells
        data4 = [[7.1, 7.2, 7.3, 7.4, 7.5],
                 [7.1, 7.2, 7.3, 7.4, 'snoopy'],
                 [7.1, 7.2, 7.3, None, None]]

        # Matrix with a bad row
        data5 = [[23.1, 23.2, 23.3, 23.4, 23.5],
                 None,
                 [23.1, 23.0, 23.0, 23.0, 23.0]]

        # Various references that don't point to matrices at all
        data6 = "snoopy"
        data7 = {'a': [[2.3, 1.2], [3.3, 5.6]]}
        data8 = []
        data9 = [None]

        # Catch Exception rather than everything, so that KeyboardInterrupt
        # and SystemExit are not reported as a rejected matrix.
        try:
            treecluster(data1)
        except Exception:
            self.fail("treecluster failed to accept matrix data1")

        try:
            treecluster(data2)
        except Exception:
            self.fail("treecluster failed to accept matrix data2")

        # assertRaises calls treecluster itself; no lambda wrapper is needed.
        self.assertRaises(TypeError, treecluster, data3)
        self.assertRaises(TypeError, treecluster, data4)
        self.assertRaises(TypeError, treecluster, data5)
        self.assertRaises(TypeError, treecluster, data6)
        self.assertRaises(TypeError, treecluster, data7)
        self.assertRaises(TypeError, treecluster, data8)
        self.assertRaises(TypeError, treecluster, data9)
Exemplo n.º 4
0
    def test_treecluster(self):
        """Compare treecluster's output with stored trees for two data sets.

        Two calls with invalid arguments must raise ValueError. Then, for
        each data set, a tree is built with dist='e' and each of the linkage
        methods 'a', 's', 'c' and 'm'; the tree length and every node's
        left, right and distance (to 3 decimal places) are checked against
        hard-coded expected values.
        """
        if TestCluster.module == 'Bio.Cluster':
            from Bio.Cluster import treecluster
        elif TestCluster.module == 'Pycluster':
            from Pycluster import treecluster

        def check_linkages(data, mask, weight, expectations):
            # expectations holds (method, [(left, right, distance), ...])
            # pairs; nodes are checked in tree order, one field at a time.
            for method, nodes in expectations:
                tree = treecluster(data=data, mask=mask, weight=weight,
                                   transpose=0, method=method, dist='e')
                self.assertEqual(len(tree), len(data) - 1)
                for index, (left, right, distance) in enumerate(nodes):
                    self.assertEqual(tree[index].left, left)
                    self.assertEqual(tree[index].right, right)
                    self.assertAlmostEqual(tree[index].distance, distance,
                                           places=3)

        # First data set
        weight1 = [1, 1, 1, 1, 1]
        data1 = numpy.array([[1.1, 2.2, 3.3, 4.4, 5.5],
                             [3.1, 3.2, 1.3, 2.4, 1.5],
                             [4.1, 2.2, 0.3, 5.4, 0.5],
                             [12.1, 2.0, 0.0, 5.0, 0.0]])
        mask1 = numpy.ones((4, 5), int)

        # TODO - Use a context manager here once we drop Python 2.6
        # Method should be one letter:
        self.assertRaises(ValueError, treecluster,
                          data=data1, mask=mask1, weight=weight1,
                          transpose=0, method="any", dist="e")

        # Distance should be one letter:
        # NOTE(review): method is "any" here as well, so this call may not
        # isolate the check on dist - confirm whether that is intended.
        self.assertRaises(ValueError, treecluster,
                          data=data1, mask=mask1, weight=weight1,
                          transpose=0, method="any", dist="euclidean")

        # Test first data set
        check_linkages(data1, mask1, weight1, (
            # Pairwise average-linkage clustering
            ('a', [(2, 1, 2.600),
                   (-1, 0, 7.300),
                   (3, -2, 21.348)]),
            # Pairwise single-linkage clustering
            ('s', [(1, 2, 2.600),
                   (0, -1, 5.800),
                   (-2, 3, 12.908)]),
            # Pairwise centroid-linkage clustering
            ('c', [(1, 2, 2.600),
                   (0, -1, 6.650),
                   (-2, 3, 19.437)]),
            # Pairwise maximum-linkage clustering
            ('m', [(2, 1, 2.600),
                   (-1, 0, 8.800),
                   (3, -2, 32.508)]),
        ))

        # Second data set
        weight2 = [1, 1]
        data2 = numpy.array([[0.8223, 0.9295],
                             [1.4365, 1.3223],
                             [1.1623, 1.5364],
                             [2.1826, 1.1934],
                             [1.7763, 1.9352],
                             [1.7215, 1.9912],
                             [2.1812, 5.9935],
                             [5.3290, 5.9452],
                             [3.1491, 3.3454],
                             [5.1923, 5.3156],
                             [4.7735, 5.4012],
                             [5.1297, 5.5645],
                             [5.3934, 5.1823]])
        mask2 = numpy.ones((13, 2), int)

        # Test second data set
        check_linkages(data2, mask2, weight2, (
            # Pairwise average-linkage clustering
            ('a', [(5, 4, 0.003),
                   (9, 12, 0.029),
                   (2, 1, 0.061),
                   (11, -2, 0.070),
                   (-4, 10, 0.128),
                   (7, -5, 0.224),
                   (-3, 0, 0.254),
                   (-1, 3, 0.391),
                   (-8, -7, 0.532),
                   (8, -9, 3.234),
                   (-6, 6, 4.636),
                   (-11, -10, 12.741)]),
            # Pairwise single-linkage clustering
            ('s', [(4, 5, 0.003),
                   (9, 12, 0.029),
                   (11, -2, 0.033),
                   (1, 2, 0.061),
                   (10, -3, 0.077),
                   (7, -5, 0.092),
                   (0, -4, 0.242),
                   (-7, -1, 0.246),
                   (3, -8, 0.287),
                   (-9, 8, 1.936),
                   (-10, -6, 3.432),
                   (6, -11, 3.535)]),
            # Pairwise centroid-linkage clustering
            ('c', [(4, 5, 0.003),
                   (12, 9, 0.029),
                   (1, 2, 0.061),
                   (-2, 11, 0.063),
                   (10, -4, 0.109),
                   (-5, 7, 0.189),
                   (0, -3, 0.239),
                   (3, -1, 0.390),
                   (-7, -8, 0.382),
                   (-9, 8, 3.063),
                   (6, -6, 4.578),
                   (-10, -11, 11.536)]),
            # Pairwise maximum-linkage clustering
            ('m', [(5, 4, 0.003),
                   (9, 12, 0.029),
                   (2, 1, 0.061),
                   (11, 10, 0.077),
                   (-2, -4, 0.216),
                   (-3, 0, 0.266),
                   (-5, 7, 0.302),
                   (-1, 3, 0.425),
                   (-8, -6, 0.968),
                   (8, 6, 3.975),
                   (-10, -7, 5.755),
                   (-11, -9, 22.734)]),
        ))
# @Date:   2019-05-27T10:15:26+08:00
# @Email:  [email protected]
# @Filename: BioPy_1730416009_0527.py
# @Last modified time: 2019-05-27T14:31:37+08:00
import pandas as pd
from Bio.Cluster import treecluster

# Load the expression table from the Excel workbook
dfrm = pd.read_excel('./ExpressionData.xlsx')
# Drop the 'ID' column and keep the remaining cells as a plain array
data_array = dfrm.drop('ID', axis=1).values
# Hierarchical clustering with transpose=0 (genes/proteins),
# method 's', distance 'e'
tree_gene = treecluster(data_array, transpose=0, method='s', dist='e')
# Hierarchical clustering with transpose=1 (experiment conditions),
# method 'm', distance 'e'
# NOTE(review): this step was previously described as single-linkage, but
# it passes method='m' while the gene tree passes 's' - confirm which
# linkage is intended here.
tree_exp = treecluster(data_array, transpose=1, method='m', dist='e')
# Report sections in output order: heading line, then the tree it labels
report_sections = (
    ("# Cluster Tree of Exp Condiction\n", tree_exp),
    ("# Cluster Tree of Gene\n", tree_gene),
)
# Write each heading followed by the string form of its tree
with open('./Results.txt', 'wt') as outFile:
    for heading, tree in report_sections:
        outFile.write(heading)
        outFile.write(str(tree) + '\n')
Exemplo n.º 6
0
def test_matrix_parse(module):
    """Print whether treecluster accepts or refuses a set of matrices.

    ``module`` selects the implementation: 'Bio.Cluster' or 'Pycluster'.
    data1 and data2 are well-formed and should be accepted; data3 through
    data10 are malformed and should be refused. One line is printed per
    candidate, followed by a blank line.

    Raises ValueError if ``module`` is neither of the two known names.
    """
    if module == 'Bio.Cluster':
        from Bio.Cluster import treecluster
    elif module == 'Pycluster':
        from Pycluster import treecluster
    else:
        # String exceptions are not valid; raise a real exception instead.
        raise ValueError('Unknown module name: %s' % module)
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("test_matrix_parse:")
    # Normal matrix, no errors
    data1 = array([[1.1, 1.2], [1.4, 1.3], [1.1, 1.5], [2.0, 1.5], [1.7, 1.9],
                   [1.7, 1.9], [5.7, 5.9], [5.7, 5.9], [3.1, 3.3], [5.4, 5.3],
                   [5.1, 5.5], [5.0, 5.5], [5.1, 5.2]])

    # Another normal matrix, no errors; written as a list
    data2 = [[1.1, 2.2, 3.3, 4.4, 5.5], [3.1, 3.2, 1.3, 2.4, 1.5],
             [4.1, 2.2, 0.3, 5.4, 0.5], [12.1, 2.0, 0.0, 5.0, 0.0]]

    # Ragged matrix
    data3 = [[91.1, 92.2, 93.3, 94.4, 95.5], [93.1, 93.2, 91.3, 92.4],
             [94.1, 92.2, 90.3], [12.1, 92.0, 90.0, 95.0, 90.0]]

    # Matrix with bad cells
    data4 = [[7.1, 7.2, 7.3, 7.4, 7.5], [7.1, 7.2, 7.3, 7.4, 'snoopy'],
             [7.1, 7.2, 7.3, None, None]]

    # Matrix with a bad row
    data5 = [[23.1, 23.2, 23.3, 23.4, 23.5], None,
             [23.1, 23.0, 23.0, 23.0, 23.0]]

    # Various references that don't point to matrices at all
    data6 = "snoopy"
    data7 = {'a': [[2.3, 1.2], [3.3, 5.6]]}
    data8 = []
    data9 = [None]
    data10 = [[None]]

    # Matrices that must be accepted; a refusal is now actually reported
    # (the message used to be a bare string expression that printed nothing).
    for name, matrix in (("data1", data1), ("data2", data2)):
        try:
            treecluster(matrix)
        except Exception:
            print("Error: treecluster failed to accept matrix %s" % name)
        else:
            print("Read %s (correct)" % name)

    # Matrices that must be refused.
    malformed = (("data3", data3), ("data4", data4), ("data5", data5),
                 ("data6", data6), ("data7", data7), ("data8", data8),
                 ("data9", data9), ("data10", data10))
    for name, matrix in malformed:
        try:
            treecluster(matrix)
        except Exception:
            print("Refused incorrect matrix %s" % name)
        else:
            print("Error: treecluster incorrectly accepted %s" % name)
    print("")
Exemplo n.º 7
0
def _print_linkage_trees(treecluster, data, mask, weight):
    """Cluster data with each linkage method and print the resulting nodes."""
    linkages = (
        ('a', "Pairwise average-linkage clustering"),
        ('s', "Pairwise single-linkage clustering"),
        ('c', "Pairwise centroid-linkage clustering"),
        ('m', "Pairwise maximum-linkage clustering"),
    )
    expected = len(data) - 1
    for method, title in linkages:
        print(title)
        # NOTE(review): unpacking two values assumes a treecluster that
        # returns (nodes, link distances) - confirm against the installed
        # Bio.Cluster / Pycluster version.
        result, linkdist = treecluster(data=data,
                                       mask=mask,
                                       weight=weight,
                                       transpose=0,
                                       method=method,
                                       dist='e')
        print("Number of nodes is %d (should be %d)" % (len(result),
                                                        expected))
        print("Number of link distances is %d (should be %d)" % (
            len(linkdist), expected))
        for i, node in enumerate(result):
            print("Node %3d joins node %3d with node %3d; link distance is %7.3f" % (
                i, node[0], node[1], linkdist[i]))


def test_treecluster(module):
    """Print the trees treecluster builds for two small data sets.

    ``module`` selects the implementation: 'Bio.Cluster' or 'Pycluster'.
    Each data set is clustered with dist='e' and the linkage methods 'a',
    's', 'c' and 'm' in turn; the node count, link-distance count and every
    node are printed. The first data set is also shown with print_matrix.

    Raises ValueError if ``module`` is neither of the two known names.
    """
    if module == 'Bio.Cluster':
        from Bio.Cluster import treecluster
    elif module == 'Pycluster':
        from Pycluster import treecluster
    else:
        # String exceptions are not valid; raise a real exception instead.
        raise ValueError('Unknown module name: %s' % module)
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print("test_treecluster:")
    # First data set
    weight1 = [1, 1, 1, 1, 1]
    data1 = array([[1.1, 2.2, 3.3, 4.4, 5.5], [3.1, 3.2, 1.3, 2.4, 1.5],
                   [4.1, 2.2, 0.3, 5.4, 0.5], [12.1, 2.0, 0.0, 5.0, 0.0]])
    mask1 = array([[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1],
                   [1, 1, 1, 1, 1]])

    # Second data set
    weight2 = [1, 1]
    data2 = array([[0.8223, 0.9295], [1.4365, 1.3223], [1.1623, 1.5364],
                   [2.1826, 1.1934], [1.7763, 1.9352], [1.7215, 1.9912],
                   [2.1812, 5.9935], [5.3290, 5.9452], [3.1491, 3.3454],
                   [5.1923, 5.3156], [4.7735, 5.4012], [5.1297, 5.5645],
                   [5.3934, 5.1823]])
    mask2 = array([[1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1],
                   [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1]])

    # test first data set
    print("First data set:")
    print_matrix(data1, mask1)
    _print_linkage_trees(treecluster, data1, mask1, weight1)

    # Test second data set
    print("Second data set:")
    _print_linkage_trees(treecluster, data2, mask2, weight2)
    print("")
Exemplo n.º 8
0
    def test_treecluster(self):
        """Check the nodes returned by treecluster for two small data sets.

        Each data set is clustered with the average ("a"), single ("s"),
        centroid ("c") and maximum ("m") pairwise linkage methods using
        dist="e".  For every node of the resulting tree the ``left`` and
        ``right`` members are compared exactly and ``distance`` is compared
        to three decimal places.
        """
        if TestCluster.module == "Bio.Cluster":
            from Bio.Cluster import treecluster
        elif TestCluster.module == "Pycluster":
            from Pycluster import treecluster

        # First data set: 4 rows, 5 columns, nothing masked, unit weights.
        data1 = numpy.array(
            [
                [1.1, 2.2, 3.3, 4.4, 5.5],
                [3.1, 3.2, 1.3, 2.4, 1.5],
                [4.1, 2.2, 0.3, 5.4, 0.5],
                [12.1, 2.0, 0.0, 5.0, 0.0],
            ]
        )
        mask1 = numpy.ones(data1.shape, int)
        weight1 = [1] * 5

        # Expected (left, right, distance) per node, keyed by linkage method.
        expected1 = (
            # Pairwise average-linkage clustering
            ("a", [(2, 1, 2.600), (-1, 0, 7.300), (3, -2, 21.348)]),
            # Pairwise single-linkage clustering
            ("s", [(1, 2, 2.600), (0, -1, 5.800), (-2, 3, 12.908)]),
            # Pairwise centroid-linkage clustering
            ("c", [(1, 2, 2.600), (0, -1, 6.650), (-2, 3, 19.437)]),
            # Pairwise maximum-linkage clustering
            ("m", [(2, 1, 2.600), (-1, 0, 8.800), (3, -2, 32.508)]),
        )

        # Second data set: 13 rows, 2 columns, nothing masked, unit weights.
        data2 = numpy.array(
            [
                [0.8223, 0.9295],
                [1.4365, 1.3223],
                [1.1623, 1.5364],
                [2.1826, 1.1934],
                [1.7763, 1.9352],
                [1.7215, 1.9912],
                [2.1812, 5.9935],
                [5.3290, 5.9452],
                [3.1491, 3.3454],
                [5.1923, 5.3156],
                [4.7735, 5.4012],
                [5.1297, 5.5645],
                [5.3934, 5.1823],
            ]
        )
        mask2 = numpy.ones(data2.shape, int)
        weight2 = [1] * 2

        expected2 = (
            # Pairwise average-linkage clustering
            (
                "a",
                [
                    (5, 4, 0.003),
                    (9, 12, 0.029),
                    (2, 1, 0.061),
                    (11, -2, 0.070),
                    (-4, 10, 0.128),
                    (7, -5, 0.224),
                    (-3, 0, 0.254),
                    (-1, 3, 0.391),
                    (-8, -7, 0.532),
                    (8, -9, 3.234),
                    (-6, 6, 4.636),
                    (-11, -10, 12.741),
                ],
            ),
            # Pairwise single-linkage clustering
            (
                "s",
                [
                    (4, 5, 0.003),
                    (9, 12, 0.029),
                    (11, -2, 0.033),
                    (1, 2, 0.061),
                    (10, -3, 0.077),
                    (7, -5, 0.092),
                    (0, -4, 0.242),
                    (-7, -1, 0.246),
                    (3, -8, 0.287),
                    (-9, 8, 1.936),
                    (-10, -6, 3.432),
                    (6, -11, 3.535),
                ],
            ),
            # Pairwise centroid-linkage clustering
            (
                "c",
                [
                    (4, 5, 0.003),
                    (12, 9, 0.029),
                    (1, 2, 0.061),
                    (-2, 11, 0.063),
                    (10, -4, 0.109),
                    (-5, 7, 0.189),
                    (0, -3, 0.239),
                    (3, -1, 0.390),
                    (-7, -8, 0.382),
                    (-9, 8, 3.063),
                    (6, -6, 4.578),
                    (-10, -11, 11.536),
                ],
            ),
            # Pairwise maximum-linkage clustering
            (
                "m",
                [
                    (5, 4, 0.003),
                    (9, 12, 0.029),
                    (2, 1, 0.061),
                    (11, 10, 0.077),
                    (-2, -4, 0.216),
                    (-3, 0, 0.266),
                    (-5, 7, 0.302),
                    (-1, 3, 0.425),
                    (-8, -6, 0.968),
                    (8, 6, 3.975),
                    (-10, -7, 5.755),
                    (-11, -9, 22.734),
                ],
            ),
        )

        scenarios = (
            (data1, mask1, weight1, expected1),
            (data2, mask2, weight2, expected2),
        )
        for data, mask, weight, expectations in scenarios:
            for method, nodes in expectations:
                tree = treecluster(data=data, mask=mask, weight=weight, transpose=0, method=method, dist="e")
                # One node is expected per merge, i.e. one fewer than the rows.
                self.assertEqual(len(tree), len(data) - 1)
                for index, (left, right, distance) in enumerate(nodes):
                    self.assertEqual(tree[index].left, left)
                    self.assertEqual(tree[index].right, right)
                    self.assertAlmostEqual(tree[index].distance, distance, 3)
Exemplo n.º 9
0
def do_treecluster_images():
    """Plot how the number of features affects hierarchical clustering.

    For each feature count ``topN`` the word counts are reduced with ``TC``
    and, for every ``k`` in ``range(2, 50)``, clustered in two ways: by
    cutting a ``treecluster`` tree (method 'm', dist 'e') into ``k``
    clusters, and by fitting ``AgglomerativeClustering`` with Ward linkage.
    The silhouette score and the size of the largest cluster are plotted
    against ``k`` in two stacked subplots.  Each figure is saved under
    ``./treecluster_images/`` and then shown.
    """
    outDir = '/Users/brobear/PycharmProjects/TextClusteringAnalysis/txt2'
    txt_dict = getWordCount(outDir)

    feature_counts = [300, 600]
    cluster_counts = range(2, 50)
    for topN in feature_counts:
        data, _text_names = TC(txt_dict, topN=topN)[:2]

        # treecluster method: 's' minimum distance, 'm' maximum distance,
        # 'c' centroid, 'a' group average.
        # treecluster dist: 'e' Euclidean distance, 'u' cosine distance.
        tree = treecluster(data=data, method='m', dist='e')

        # One series per clustering approach: index 0 is the treecluster cut,
        # index 1 is AgglomerativeClustering.  Append further series here to
        # compare other treecluster methods ('s', 'a', 'c').
        silhouettes = [[], []]  # silhouette coefficient per k
        largest = [[], []]  # size of the largest cluster per k
        for k in cluster_counts:
            clusterid = tree.cut(nclusters=k)
            silhouettes[0].append(silhouette_score(data, clusterid, metric='euclidean'))
            largest[0].append(max(size_of_cluster(clusterid)))

            # Other linkage options: 'ward', 'complete', 'average'.
            clustering = AgglomerativeClustering(linkage='ward', n_clusters=k)
            clustering.fit(data)
            silhouettes[1].append(silhouette_score(data, clustering.labels_, metric='euclidean'))
            largest[1].append(max(size_of_cluster(clustering.labels_)))

        # A fresh figure per feature count.  The figure is not re-selected by
        # number, so an earlier figure that is still open is never drawn over.
        fig = plt.figure(figsize=(6, 6))

        # Upper panel: silhouette coefficient against k.
        ax1 = fig.add_subplot(211)
        for series in silhouettes:
            ax1.plot(cluster_counts, series, marker='o')
        ax1.legend(range(len(silhouettes)))
        # Call the label setters; assigning to plt.xlabel/plt.ylabel would
        # only overwrite the pyplot functions and leave the axes unlabeled.
        ax1.set_xlabel('k')
        ax1.set_ylabel('silhouette')
        ax1.set_title('feature number=%d by TC' % topN)

        # Lower panel: size of the largest cluster against k.
        ax2 = fig.add_subplot(212)
        for series in largest:
            ax2.plot(cluster_counts, series, marker='o')
        ax2.legend(range(len(largest)))
        ax2.set_xlabel('k')
        ax2.set_ylabel('MAXcluster')
        ax2.set_title("max size of clusters")

        # Keep the upper x label from colliding with the lower title.
        fig.tight_layout()
        fig.savefig('./treecluster_images/feature number=%d by TC 1<k<50' % topN)
        plt.show()
        # Release the figure so repeated iterations do not accumulate them.
        plt.close(fig)
Exemplo n.º 10
0
    def test_matrix_parse(self):
        """Check which inputs treecluster accepts as a data matrix.

        Well-formed matrices (arrays, nested lists, non-contiguous views,
        float32 and int32 arrays) must be accepted without raising; malformed
        inputs must raise the exception type listed for them below.
        """
        if TestCluster.module == 'Bio.Cluster':
            from Bio.Cluster import treecluster
        elif TestCluster.module == 'Pycluster':
            from Pycluster import treecluster

        # Normal matrix, no errors
        data1 = numpy.array([
            [1.1, 1.2], [1.4, 1.3], [1.1, 1.5], [2.0, 1.5], [1.7, 1.9],
            [1.7, 1.9], [5.7, 5.9], [5.7, 5.9], [3.1, 3.3], [5.4, 5.3],
            [5.1, 5.5], [5.0, 5.5], [5.1, 5.2],
        ])

        # Another normal matrix, no errors; written as a list
        data2 = [
            [1.1, 2.2, 3.3, 4.4, 5.5],
            [3.1, 3.2, 1.3, 2.4, 1.5],
            [4.1, 2.2, 0.3, 5.4, 0.5],
            [2.1, 2.0, 0.0, 5.0, 0.0],
        ]

        # Rows are not contiguous
        data3 = data1[::2, :]

        # Columns are not contiguous
        data4 = numpy.array(data2)[:, ::2]

        # Matrix using float32
        data5 = numpy.array(
            [
                [1.1, 2.2, 3.3, 4.4, 5.5],
                [3.1, 3.2, 1.3, 2.4, 1.5],
                [4.1, 2.2, 0.3, 5.4, 0.5],
                [2.1, 2.0, 0.0, 5.0, 0.0],
            ],
            numpy.float32,
        )

        # Matrix using int
        data6 = numpy.array(
            [[1, 2, 3, 4, 5], [3, 3, 1, 2, 1], [4, 2, 0, 5, 0], [2, 2, 0, 5, 0]],
            numpy.int32,
        )

        # Every well-formed matrix must be accepted; any exception is
        # reported as a test failure carrying the matching message.
        accepted = (
            (data1, "treecluster failed to accept matrix data1"),
            (data2, "treecluster failed to accept matrix data2"),
            (data3, "treecluster failed to accept matrix data3"),
            (data4, "treecluster failed to accept matrix data4"),
            (data5, "treecluster failed to accept matrix data5"),
            (data6, "treecluster failed to accept matrix data6"),
        )
        for matrix, failure_message in accepted:
            try:
                treecluster(matrix)
            except Exception:
                self.fail(failure_message)

        # Ragged matrix
        data7 = [
            [91.1, 92.2, 93.3, 94.4, 95.5],
            [93.1, 93.2, 91.3, 92.4],
            [94.1, 92.2, 90.3],
            [12.1, 92.0, 90.0, 95.0, 90.0],
        ]

        # Matrix with bad cells
        data8 = [
            [7.1, 7.2, 7.3, 7.4, 7.5],
            [7.1, 7.2, 7.3, 7.4, 'snoopy'],
            [7.1, 7.2, 7.3, None, None],
        ]

        # Matrix with a bad row
        data9 = [
            [23.1, 23.2, 23.3, 23.4, 23.5],
            None,
            [23.1, 23.0, 23.0, 23.0, 23.0],
        ]

        # Various references that don't point to matrices at all
        data10 = "snoopy"
        data11 = {'a': [[2.3, 1.2], [3.3, 5.6]]}
        data12 = []
        data13 = [None]

        # Array of incorrect rank
        data14 = numpy.array([
            [[1.1, 1.2], [2.3, 1.2], [3.4, 1.6]],
            [[1.4, 1.3], [3.2, 4.5], [9.8, 4.9]],
            [[1.1, 1.5], [1.1, 2.3], [6.5, 0.4]],
        ])

        # Array with non-numerical values
        data15 = numpy.array([['a', 'b', 'c'], ['e', 'f', 'g']], 'c')

        # Empty array
        data16 = numpy.array([[]], 'd')

        # Each malformed input paired with the exception it must raise.
        rejected = (
            (ValueError, data7),
            (ValueError, data8),
            (ValueError, data9),
            (ValueError, data10),
            (TypeError, data11),
            (ValueError, data12),
            (ValueError, data13),
            (ValueError, data14),
            (ValueError, data15),
            (ValueError, data16),
        )
        for expected_error, malformed in rejected:
            self.assertRaises(expected_error, treecluster, malformed)
Exemplo n.º 11
0
    def test_treecluster(self):
        if TestCluster.module == 'Bio.Cluster':
            from Bio.Cluster import treecluster
        elif TestCluster.module == 'Pycluster':
            from Pycluster import treecluster

        # First data set
        weight1 = [1, 1, 1, 1, 1]
        data1 = numpy.array([[1.1, 2.2, 3.3, 4.4, 5.5],
                             [3.1, 3.2, 1.3, 2.4, 1.5],
                             [4.1, 2.2, 0.3, 5.4, 0.5],
                             [9.7, 2.0, 0.0, 5.0, 0.0]])
        mask1 = numpy.array([[1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1],
                             [1, 1, 1, 1, 1]], int)

        # test first data set
        # Pairwise average-linkage clustering
        tree = treecluster(data=data1,
                           mask=mask1,
                           weight=weight1,
                           transpose=0,
                           method='a',
                           dist='e')
        self.assertEqual(len(tree), len(data1) - 1)
        self.assertEqual(tree[0].left, 2)
        self.assertEqual(tree[0].right, 1)
        self.assertAlmostEqual(tree[0].distance, 2.600, places=3)
        self.assertEqual(tree[1].left, -1)
        self.assertEqual(tree[1].right, 0)
        self.assertAlmostEqual(tree[1].distance, 7.300, places=3)
        self.assertEqual(tree[2].left, 3)
        self.assertEqual(tree[2].right, -2)
        self.assertAlmostEqual(tree[2].distance, 13.540, places=3)
        indices = tree.sort([0, 1, 2, 3])
        self.assertEqual(len(indices), len(data1))
        self.assertEqual(indices[0], 0)
        self.assertEqual(indices[1], 1)
        self.assertEqual(indices[2], 2)
        self.assertEqual(indices[3], 3)
        indices = tree.sort([0, 3, 2, 1])
        self.assertEqual(len(indices), len(data1))
        self.assertEqual(indices[0], 3)
        self.assertEqual(indices[1], 0)
        self.assertEqual(indices[2], 2)
        self.assertEqual(indices[3], 1)

        # Pairwise single-linkage clustering
        tree = treecluster(data=data1,
                           mask=mask1,
                           weight=weight1,
                           transpose=0,
                           method='s',
                           dist='e')
        self.assertEqual(len(tree), len(data1) - 1)
        self.assertEqual(tree[0].left, 1)
        self.assertEqual(tree[0].right, 2)
        self.assertAlmostEqual(tree[0].distance, 2.600, places=3)
        self.assertEqual(tree[1].left, 0)
        self.assertEqual(tree[1].right, -1)
        self.assertAlmostEqual(tree[1].distance, 5.800, places=3)
        self.assertEqual(tree[2].left, -2)
        self.assertEqual(tree[2].right, 3)
        self.assertAlmostEqual(tree[2].distance, 6.380, places=3)
        indices = tree.sort([0, 1, 2, 3])
        self.assertEqual(len(indices), len(data1))
        self.assertEqual(indices[0], 0)
        self.assertEqual(indices[1], 1)
        self.assertEqual(indices[2], 2)
        self.assertEqual(indices[3], 3)
        indices = tree.sort([0, 3, 2, 1])
        self.assertEqual(len(indices), len(data1))
        self.assertEqual(indices[0], 3)
        self.assertEqual(indices[1], 0)
        self.assertEqual(indices[2], 2)
        self.assertEqual(indices[3], 1)

        # Pairwise centroid-linkage clustering
        tree = treecluster(data=data1,
                           mask=mask1,
                           weight=weight1,
                           transpose=0,
                           method='c',
                           dist='e')
        self.assertEqual(len(tree), len(data1) - 1)
        self.assertEqual(tree[0].left, 1)
        self.assertEqual(tree[0].right, 2)
        self.assertAlmostEqual(tree[0].distance, 2.600, places=3)
        self.assertEqual(tree[1].left, 0)
        self.assertEqual(tree[1].right, -1)
        self.assertAlmostEqual(tree[1].distance, 6.650, places=3)
        self.assertEqual(tree[2].left, -2)
        self.assertEqual(tree[2].right, 3)
        self.assertAlmostEqual(tree[2].distance, 11.629, places=3)
        indices = tree.sort([0, 1, 2, 3])
        self.assertEqual(len(indices), len(data1))
        self.assertEqual(indices[0], 0)
        self.assertEqual(indices[1], 1)
        self.assertEqual(indices[2], 2)
        self.assertEqual(indices[3], 3)
        indices = tree.sort([0, 3, 2, 1])
        self.assertEqual(len(indices), len(data1))
        self.assertEqual(indices[0], 3)
        self.assertEqual(indices[1], 0)
        self.assertEqual(indices[2], 2)
        self.assertEqual(indices[3], 1)

        # Pairwise maximum-linkage clustering
        tree = treecluster(data=data1,
                           mask=mask1,
                           weight=weight1,
                           transpose=0,
                           method='m',
                           dist='e')
        self.assertEqual(len(tree), len(data1) - 1)
        self.assertEqual(tree[0].left, 2)
        self.assertEqual(tree[0].right, 1)
        self.assertAlmostEqual(tree[0].distance, 2.600, places=3)
        self.assertEqual(tree[1].left, -1)
        self.assertEqual(tree[1].right, 0)
        self.assertAlmostEqual(tree[1].distance, 8.800, places=3)
        self.assertEqual(tree[2].left, 3)
        self.assertEqual(tree[2].right, -2)
        self.assertAlmostEqual(tree[2].distance, 23.100, places=3)
        indices = tree.sort([0, 1, 2, 3])
        self.assertEqual(len(indices), len(data1))
        self.assertEqual(indices[0], 0)
        self.assertEqual(indices[1], 1)
        self.assertEqual(indices[2], 2)
        self.assertEqual(indices[3], 3)
        indices = tree.sort([0, 3, 2, 1])
        self.assertEqual(len(indices), len(data1))
        self.assertEqual(indices[0], 3)
        self.assertEqual(indices[1], 0)
        self.assertEqual(indices[2], 2)
        self.assertEqual(indices[3], 1)

        # Second data set
        weight2 = [1, 1]
        data2 = numpy.array([[0.8223, 0.9295], [1.4365, 1.3223],
                             [1.1623, 1.5364], [2.1826, 1.1934],
                             [1.7763, 1.9352], [1.7215, 1.9912],
                             [2.1812, 5.9935], [5.3290, 5.9452],
                             [3.1491, 3.3454], [5.1923, 5.3156],
                             [4.7735, 5.4012], [5.1297, 5.5645],
                             [5.3934, 5.1823]])
        mask2 = numpy.array(
            [[1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1],
             [1, 1], [1, 1], [1, 1], [1, 1], [1, 1]], int)

        # Test second data set
        # Pairwise average-linkage clustering
        tree = treecluster(data=data2,
                           mask=mask2,
                           weight=weight2,
                           transpose=0,
                           method='a',
                           dist='e')
        self.assertEqual(len(tree), len(data2) - 1)
        self.assertEqual(tree[0].left, 5)
        self.assertEqual(tree[0].right, 4)
        self.assertAlmostEqual(tree[0].distance, 0.003, places=3)
        self.assertEqual(tree[1].left, 9)
        self.assertEqual(tree[1].right, 12)
        self.assertAlmostEqual(tree[1].distance, 0.029, places=3)
        self.assertEqual(tree[2].left, 2)
        self.assertEqual(tree[2].right, 1)
        self.assertAlmostEqual(tree[2].distance, 0.061, places=3)
        self.assertEqual(tree[3].left, 11)
        self.assertEqual(tree[3].right, -2)
        self.assertAlmostEqual(tree[3].distance, 0.070, places=3)
        self.assertEqual(tree[4].left, -4)
        self.assertEqual(tree[4].right, 10)
        self.assertAlmostEqual(tree[4].distance, 0.128, places=3)
        self.assertEqual(tree[5].left, 7)
        self.assertEqual(tree[5].right, -5)
        self.assertAlmostEqual(tree[5].distance, 0.224, places=3)
        self.assertEqual(tree[6].left, -3)
        self.assertEqual(tree[6].right, 0)
        self.assertAlmostEqual(tree[6].distance, 0.254, places=3)
        self.assertEqual(tree[7].left, -1)
        self.assertEqual(tree[7].right, 3)
        self.assertAlmostEqual(tree[7].distance, 0.391, places=3)
        self.assertEqual(tree[8].left, -8)
        self.assertEqual(tree[8].right, -7)
        self.assertAlmostEqual(tree[8].distance, 0.532, places=3)
        self.assertEqual(tree[9].left, 8)
        self.assertEqual(tree[9].right, -9)
        self.assertAlmostEqual(tree[9].distance, 3.234, places=3)
        self.assertEqual(tree[10].left, -6)
        self.assertEqual(tree[10].right, 6)
        self.assertAlmostEqual(tree[10].distance, 4.636, places=3)
        self.assertEqual(tree[11].left, -11)
        self.assertEqual(tree[11].right, -10)
        self.assertAlmostEqual(tree[11].distance, 12.741, places=3)
        indices = tree.sort()
        self.assertEqual(len(indices), len(data2))
        self.assertEqual(indices[0], 7)
        self.assertEqual(indices[1], 11)
        self.assertEqual(indices[2], 9)
        self.assertEqual(indices[3], 12)
        self.assertEqual(indices[4], 10)
        self.assertEqual(indices[5], 6)
        self.assertEqual(indices[6], 8)
        self.assertEqual(indices[7], 5)
        self.assertEqual(indices[8], 4)
        self.assertEqual(indices[9], 3)
        self.assertEqual(indices[10], 2)
        self.assertEqual(indices[11], 1)
        self.assertEqual(indices[12], 0)

        # Pairwise single-linkage clustering
        tree = treecluster(data=data2,
                           mask=mask2,
                           weight=weight2,
                           transpose=0,
                           method='s',
                           dist='e')
        self.assertEqual(len(tree), len(data2) - 1)
        self.assertEqual(tree[0].left, 4)
        self.assertEqual(tree[0].right, 5)
        self.assertAlmostEqual(tree[0].distance, 0.003, places=3)
        self.assertEqual(tree[1].left, 9)
        self.assertEqual(tree[1].right, 12)
        self.assertAlmostEqual(tree[1].distance, 0.029, places=3)
        self.assertEqual(tree[2].left, 11)
        self.assertEqual(tree[2].right, -2)
        self.assertAlmostEqual(tree[2].distance, 0.033, places=3)
        self.assertEqual(tree[3].left, 1)
        self.assertEqual(tree[3].right, 2)
        self.assertAlmostEqual(tree[3].distance, 0.061, places=3)
        self.assertEqual(tree[4].left, 10)
        self.assertEqual(tree[4].right, -3)
        self.assertAlmostEqual(tree[4].distance, 0.077, places=3)
        self.assertEqual(tree[5].left, 7)
        self.assertEqual(tree[5].right, -5)
        self.assertAlmostEqual(tree[5].distance, 0.092, places=3)
        self.assertEqual(tree[6].left, 0)
        self.assertEqual(tree[6].right, -4)
        self.assertAlmostEqual(tree[6].distance, 0.242, places=3)
        self.assertEqual(tree[7].left, -7)
        self.assertEqual(tree[7].right, -1)
        self.assertAlmostEqual(tree[7].distance, 0.246, places=3)
        self.assertEqual(tree[8].left, 3)
        self.assertEqual(tree[8].right, -8)
        self.assertAlmostEqual(tree[8].distance, 0.287, places=3)
        self.assertEqual(tree[9].left, -9)
        self.assertEqual(tree[9].right, 8)
        self.assertAlmostEqual(tree[9].distance, 1.936, places=3)
        self.assertEqual(tree[10].left, -10)
        self.assertEqual(tree[10].right, -6)
        self.assertAlmostEqual(tree[10].distance, 3.432, places=3)
        self.assertEqual(tree[11].left, 6)
        self.assertEqual(tree[11].right, -11)
        self.assertAlmostEqual(tree[11].distance, 3.535, places=3)
        indices = tree.sort()
        self.assertEqual(len(indices), len(data2))
        self.assertEqual(indices[0], 6)
        self.assertEqual(indices[1], 3)
        self.assertEqual(indices[2], 0)
        self.assertEqual(indices[3], 1)
        self.assertEqual(indices[4], 2)
        self.assertEqual(indices[5], 4)
        self.assertEqual(indices[6], 5)
        self.assertEqual(indices[7], 8)
        self.assertEqual(indices[8], 7)
        self.assertEqual(indices[9], 10)
        self.assertEqual(indices[10], 11)
        self.assertEqual(indices[11], 9)
        self.assertEqual(indices[12], 12)

        # Pairwise centroid-linkage clustering
        tree = treecluster(data=data2,
                           mask=mask2,
                           weight=weight2,
                           transpose=0,
                           method='c',
                           dist='e')
        self.assertEqual(len(tree), len(data2) - 1)
        self.assertEqual(tree[0].left, 4)
        self.assertEqual(tree[0].right, 5)
        self.assertAlmostEqual(tree[0].distance, 0.003, places=3)
        self.assertEqual(tree[1].left, 12)
        self.assertEqual(tree[1].right, 9)
        self.assertAlmostEqual(tree[1].distance, 0.029, places=3)
        self.assertEqual(tree[2].left, 1)
        self.assertEqual(tree[2].right, 2)
        self.assertAlmostEqual(tree[2].distance, 0.061, places=3)
        self.assertEqual(tree[3].left, -2)
        self.assertEqual(tree[3].right, 11)
        self.assertAlmostEqual(tree[3].distance, 0.063, places=3)
        self.assertEqual(tree[4].left, 10)
        self.assertEqual(tree[4].right, -4)
        self.assertAlmostEqual(tree[4].distance, 0.109, places=3)
        self.assertEqual(tree[5].left, -5)
        self.assertEqual(tree[5].right, 7)
        self.assertAlmostEqual(tree[5].distance, 0.189, places=3)
        self.assertEqual(tree[6].left, 0)
        self.assertEqual(tree[6].right, -3)
        self.assertAlmostEqual(tree[6].distance, 0.239, places=3)
        self.assertEqual(tree[7].left, 3)
        self.assertEqual(tree[7].right, -1)
        self.assertAlmostEqual(tree[7].distance, 0.390, places=3)
        self.assertEqual(tree[8].left, -7)
        self.assertEqual(tree[8].right, -8)
        self.assertAlmostEqual(tree[8].distance, 0.382, places=3)
        self.assertEqual(tree[9].left, -9)
        self.assertEqual(tree[9].right, 8)
        self.assertAlmostEqual(tree[9].distance, 3.063, places=3)
        self.assertEqual(tree[10].left, 6)
        self.assertEqual(tree[10].right, -6)
        self.assertAlmostEqual(tree[10].distance, 4.578, places=3)
        self.assertEqual(tree[11].left, -10)
        self.assertEqual(tree[11].right, -11)
        self.assertAlmostEqual(tree[11].distance, 11.536, places=3)
        indices = tree.sort()
        self.assertEqual(len(indices), len(data2))
        self.assertEqual(indices[0], 0)
        self.assertEqual(indices[1], 1)
        self.assertEqual(indices[2], 2)
        self.assertEqual(indices[3], 3)
        self.assertEqual(indices[4], 4)
        self.assertEqual(indices[5], 5)
        self.assertEqual(indices[6], 8)
        self.assertEqual(indices[7], 6)
        self.assertEqual(indices[8], 10)
        self.assertEqual(indices[9], 12)
        self.assertEqual(indices[10], 9)
        self.assertEqual(indices[11], 11)
        self.assertEqual(indices[12], 7)

        # Pairwise maximum-linkage clustering
        tree = treecluster(data=data2,
                           mask=mask2,
                           weight=weight2,
                           transpose=0,
                           method='m',
                           dist='e')
        self.assertEqual(len(tree), len(data2) - 1)
        self.assertEqual(tree[0].left, 5)
        self.assertEqual(tree[0].right, 4)
        self.assertAlmostEqual(tree[0].distance, 0.003, places=3)
        self.assertEqual(tree[1].left, 9)
        self.assertEqual(tree[1].right, 12)
        self.assertAlmostEqual(tree[1].distance, 0.029, places=3)
        self.assertEqual(tree[2].left, 2)
        self.assertEqual(tree[2].right, 1)
        self.assertAlmostEqual(tree[2].distance, 0.061, places=3)
        self.assertEqual(tree[3].left, 11)
        self.assertEqual(tree[3].right, 10)
        self.assertAlmostEqual(tree[3].distance, 0.077, places=3)
        self.assertEqual(tree[4].left, -2)
        self.assertEqual(tree[4].right, -4)
        self.assertAlmostEqual(tree[4].distance, 0.216, places=3)
        self.assertEqual(tree[5].left, -3)
        self.assertEqual(tree[5].right, 0)
        self.assertAlmostEqual(tree[5].distance, 0.266, places=3)
        self.assertEqual(tree[6].left, -5)
        self.assertEqual(tree[6].right, 7)
        self.assertAlmostEqual(tree[6].distance, 0.302, places=3)
        self.assertEqual(tree[7].left, -1)
        self.assertEqual(tree[7].right, 3)
        self.assertAlmostEqual(tree[7].distance, 0.425, places=3)
        self.assertEqual(tree[8].left, -8)
        self.assertEqual(tree[8].right, -6)
        self.assertAlmostEqual(tree[8].distance, 0.968, places=3)
        self.assertEqual(tree[9].left, 8)
        self.assertEqual(tree[9].right, 6)
        self.assertAlmostEqual(tree[9].distance, 3.975, places=3)
        self.assertEqual(tree[10].left, -10)
        self.assertEqual(tree[10].right, -7)
        self.assertAlmostEqual(tree[10].distance, 5.755, places=3)
        self.assertEqual(tree[11].left, -11)
        self.assertEqual(tree[11].right, -9)
        self.assertAlmostEqual(tree[11].distance, 22.734, places=3)
        indices = tree.sort()
        self.assertEqual(len(indices), len(data2))
        self.assertEqual(indices[0], 8)
        self.assertEqual(indices[1], 6)
        self.assertEqual(indices[2], 9)
        self.assertEqual(indices[3], 12)
        self.assertEqual(indices[4], 11)
        self.assertEqual(indices[5], 10)
        self.assertEqual(indices[6], 7)
        self.assertEqual(indices[7], 5)
        self.assertEqual(indices[8], 4)
        self.assertEqual(indices[9], 3)
        self.assertEqual(indices[10], 2)
        self.assertEqual(indices[11], 1)
        self.assertEqual(indices[12], 0)
Exemplo n.º 12
0
    def test_mask_parse(self):
        """Check which ``mask`` arguments ``treecluster`` accepts or rejects.

        A 4x5 data matrix is clustered with a series of masks.  Masks
        mask1..mask6 must be accepted without raising; masks mask7..mask18
        are malformed in different ways and must raise the exception type
        asserted at the end of this method.
        """
        if TestCluster.module == 'Bio.Cluster':
            from Bio.Cluster import treecluster
        elif TestCluster.module == 'Pycluster':
            from Pycluster import treecluster

        # data matrix
        data = numpy.array([[1.1, 2.2, 3.3, 4.4, 5.5],
                            [3.1, 3.2, 1.3, 2.4, 1.5],
                            [4.1, 2.2, 0.3, 5.4, 0.5],
                            [2.1, 2.0, 0.0, 5.0, 0.0]])

        # Normal mask, no errors
        mask1 = numpy.array([[1, 1, 0, 1, 0], [1, 1, 1, 0, 0], [1, 1, 0, 1, 1],
                             [1, 0, 1, 1, 0]])

        # Same mask, no errors; written as a list
        mask2 = [[1, 1, 0, 1, 0], [1, 1, 1, 0, 0], [1, 1, 0, 1, 1],
                 [1, 0, 1, 1, 0]]

        # Rows are not contiguous: every other row of an 8-row array
        mask3 = numpy.array([[1, 1, 0, 1, 0], [1, 1, 1, 0, 0], [1, 1, 1, 0, 0],
                             [1, 1, 0, 1, 1], [1, 1, 1, 0, 0], [1, 1, 0, 1, 1],
                             [1, 1, 0, 1, 1], [1, 0, 1, 1, 0]])
        mask3 = mask3[::2, :]

        # Columns are not contiguous: every other column of a 10-column array
        mask4 = numpy.array([[1, 1, 0, 1, 0, 1, 0, 0, 1, 1],
                             [1, 1, 1, 0, 0, 1, 1, 0, 0, 1],
                             [1, 1, 0, 1, 1, 1, 0, 1, 1, 0],
                             [1, 0, 1, 1, 0, 1, 0, 0, 1, 1]])
        mask4 = mask4[:, ::2]

        # Matrix using int16
        mask5 = numpy.array([[1, 1, 0, 1, 0], [1, 1, 1, 0, 0], [1, 1, 1, 0, 0],
                             [1, 1, 0, 1, 1]], numpy.int16)

        # Matrix using float.  The builtin ``float`` is used as the dtype
        # because the ``numpy.float`` alias has been removed from NumPy.
        mask6 = numpy.array(
            [[1.0, 2.2, 3.1, 4.8, 5.1], [3.3, 3.3, 1.4, 2.4, 1.2],
             [4.1, 2.2, 0.6, 5.5, 0.6], [2.7, 2.5, 0.4, 5.7, 0.2]],
            float)

        # Every well-formed mask must be accepted; the failure message names
        # the offending mask.
        accepted_masks = (
            ("mask1", mask1),
            ("mask2", mask2),
            ("mask3", mask3),
            ("mask4", mask4),
            ("mask5", mask5),
            ("mask6", mask6),
        )
        for name, mask in accepted_masks:
            try:
                treecluster(data, mask)
            except Exception:
                self.fail("treecluster failed to accept matrix %s" % name)

        # Ragged mask
        mask7 = [[1, 1, 0, 1], [1, 1, 1, 0, 0], [1, 1, 0, 1, 1], [1, 1, 0]]

        # Mask with incorrect number of rows
        mask8 = numpy.array([[1, 1, 0, 1, 0], [1, 1, 1, 0, 0], [1, 1, 0, 1, 1],
                             [0, 1, 1, 0, 1], [1, 0, 1, 1, 0]])

        # Mask with incorrect number of columns
        mask9 = numpy.array([[1, 1, 0, 1, 0, 1], [1, 1, 1, 0, 0, 0],
                             [0, 1, 1, 0, 1, 1], [1, 0, 1, 1, 0, 1]])

        # Matrix with bad cells
        mask10 = [[1, 1, 0, 1, 0], [1, 1, 1, 0, 'snoopy'], [1, 1, 0, 1, 1],
                  [1, 0, 1, 1, 0]]

        # Matrix with a bad row
        mask11 = [[1, 1, 0, 1, 0], None, [1, 1, 0, 1, 1], [1, 0, 1, 1, 0]]

        # Array with non-numerical values
        mask12 = numpy.array([['a', 'b', 'c'], ['e', 'f', 'g']], 'c')

        # Empty arrays
        mask13 = numpy.array([[]], 'd')
        mask14 = []

        # Array of incorrect rank
        mask15 = numpy.array([[[1, 1], [0, 1], [1, 1]],
                              [[1, 1], [0, 1], [1, 1]],
                              [[1, 1], [1, 1], [1, 0]]])

        # References that cannot be converted to a matrix of int
        mask16 = "snoopy"
        mask17 = {'a': [[1, 0], [1, 1]]}
        mask18 = [None]

        # Kept as one assertion per mask so a failure's line number
        # identifies which malformed mask was wrongly accepted.
        self.assertRaises(ValueError, treecluster, data, mask7)
        self.assertRaises(ValueError, treecluster, data, mask8)
        self.assertRaises(ValueError, treecluster, data, mask9)
        self.assertRaises(ValueError, treecluster, data, mask10)
        self.assertRaises(ValueError, treecluster, data, mask11)
        self.assertRaises(ValueError, treecluster, data, mask12)
        self.assertRaises(ValueError, treecluster, data, mask13)
        self.assertRaises(ValueError, treecluster, data, mask14)
        self.assertRaises(ValueError, treecluster, data, mask15)
        self.assertRaises(ValueError, treecluster, data, mask16)
        self.assertRaises(TypeError, treecluster, data, mask17)
        self.assertRaises(TypeError, treecluster, data, mask18)
Exemplo n.º 13
0
def test_matrix_parse(module):
    """Print whether ``treecluster`` accepts or refuses sample data matrices.

    ``module`` names the package ``treecluster`` is imported from: either
    'Bio.Cluster' or 'Pycluster'.  data1 and data2 are well-formed and are
    expected to be read; data3..data10 are malformed and are expected to be
    refused.  The outcome of every call is reported on standard output; the
    function returns nothing.

    Raises ValueError if ``module`` is not one of the two supported names.
    """
    if module == 'Bio.Cluster':
        from Bio.Cluster import treecluster
    elif module == 'Pycluster':
        from Pycluster import treecluster
    else:
        # A real exception object is required; string exceptions are not
        # valid in current Python versions.
        raise ValueError('Unknown module name: %s' % module)
    print("test_matrix_parse:")
    # Normal matrix, no errors
    data1 = array([[1.1, 1.2],
                   [1.4, 1.3],
                   [1.1, 1.5],
                   [2.0, 1.5],
                   [1.7, 1.9],
                   [1.7, 1.9],
                   [5.7, 5.9],
                   [5.7, 5.9],
                   [3.1, 3.3],
                   [5.4, 5.3],
                   [5.1, 5.5],
                   [5.0, 5.5],
                   [5.1, 5.2]])

    # Another normal matrix, no errors; written as a list
    data2 = [[1.1, 2.2, 3.3, 4.4, 5.5],
             [3.1, 3.2, 1.3, 2.4, 1.5],
             [4.1, 2.2, 0.3, 5.4, 0.5],
             [12.1, 2.0, 0.0, 5.0, 0.0]]

    # Ragged matrix
    data3 = [[91.1, 92.2, 93.3, 94.4, 95.5],
             [93.1, 93.2, 91.3, 92.4],
             [94.1, 92.2, 90.3],
             [12.1, 92.0, 90.0, 95.0, 90.0]]

    # Matrix with bad cells
    data4 = [[7.1, 7.2, 7.3, 7.4, 7.5],
             [7.1, 7.2, 7.3, 7.4, 'snoopy'],
             [7.1, 7.2, 7.3, None, None]]

    # Matrix with a bad row
    data5 = [[23.1, 23.2, 23.3, 23.4, 23.5],
             None,
             [23.1, 23.0, 23.0, 23.0, 23.0]]

    # Various references that don't point to matrices at all
    data6 = "snoopy"
    data7 = {'a': [[2.3, 1.2], [3.3, 5.6]]}
    data8 = []
    data9 = [None]
    data10 = [[None]]

    # Well-formed input: a raised exception is the error case and must be
    # reported rather than silently dropped.
    well_formed = (("data1", data1), ("data2", data2))
    for name, matrix in well_formed:
        try:
            treecluster(matrix)
        except Exception:
            print("Error: treecluster failed to accept matrix %s" % name)
        else:
            print("Read %s (correct)" % name)

    # Malformed input: here a raised exception is the expected outcome.
    malformed = (
        ("data3", data3),
        ("data4", data4),
        ("data5", data5),
        ("data6", data6),
        ("data7", data7),
        ("data8", data8),
        ("data9", data9),
        ("data10", data10),
    )
    for name, matrix in malformed:
        try:
            treecluster(matrix)
        except Exception:
            print("Refused incorrect matrix %s" % name)
        else:
            print("Error: treecluster incorrectly accepted %s" % name)
    # Trailing blank line; print("") emits just a newline on Python 2 and 3.
    print("")
Exemplo n.º 14
0
def test_treecluster(module):
    """Print hierarchical clustering results for two sample data sets.

    ``module`` names the package ``treecluster`` is imported from: either
    'Bio.Cluster' or 'Pycluster'.  Each data set is clustered with the
    methods 'a', 's', 'c' and 'm' using dist='e', and the node count, link
    distance count and every node are written to standard output.  The
    function returns nothing.

    Raises ValueError if ``module`` is not one of the two supported names.

    NOTE(review): this function unpacks the return value of ``treecluster``
    as a ``(result, linkdist)`` pair, while other tests in this file index
    the return value as a tree of nodes with ``.left``/``.right``/
    ``.distance`` -- confirm which library version this function targets.
    """
    if module == 'Bio.Cluster':
        from Bio.Cluster import treecluster
    elif module == 'Pycluster':
        from Pycluster import treecluster
    else:
        # A real exception object is required; string exceptions are not
        # valid in current Python versions.
        raise ValueError('Unknown module name: %s' % module)
    print("test_treecluster:")
    # First data set
    weight1 = [1, 1, 1, 1, 1]
    data1 = array([[1.1, 2.2, 3.3, 4.4, 5.5],
                   [3.1, 3.2, 1.3, 2.4, 1.5],
                   [4.1, 2.2, 0.3, 5.4, 0.5],
                   [12.1, 2.0, 0.0, 5.0, 0.0]])
    mask1 = array([[1, 1, 1, 1, 1],
                   [1, 1, 1, 1, 1],
                   [1, 1, 1, 1, 1],
                   [1, 1, 1, 1, 1]])

    # Second data set
    weight2 = [1, 1]
    data2 = array([[0.8223, 0.9295],
                   [1.4365, 1.3223],
                   [1.1623, 1.5364],
                   [2.1826, 1.1934],
                   [1.7763, 1.9352],
                   [1.7215, 1.9912],
                   [2.1812, 5.9935],
                   [5.3290, 5.9452],
                   [3.1491, 3.3454],
                   [5.1923, 5.3156],
                   [4.7735, 5.4012],
                   [5.1297, 5.5645],
                   [5.3934, 5.1823]])
    mask2 = array([[1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1],
                   [1, 1]])

    # (method code, heading) pairs, in the order the results are printed.
    linkages = (
        ('a', "Pairwise average-linkage clustering"),
        ('s', "Pairwise single-linkage clustering"),
        ('c', "Pairwise centroid-linkage clustering"),
        ('m', "Pairwise maximum-linkage clustering"),
    )

    def report(data, mask, weight):
        """Cluster ``data`` with every linkage method and print the result."""
        expected = len(data) - 1
        for method, heading in linkages:
            print(heading)
            result, linkdist = treecluster(data=data, mask=mask,
                                           weight=weight, transpose=0,
                                           method=method, dist='e')
            print("Number of nodes is %d (should be %d)"
                  % (len(result), expected))
            print("Number of link distances is %d (should be %d)"
                  % (len(linkdist), expected))
            for i, node in enumerate(result):
                print("Node %3d joins node %3d with node %3d; "
                      "link distance is %7.3f"
                      % (i, node[0], node[1], linkdist[i]))

    # test first data set
    print("First data set:")
    print_matrix(data1, mask1)
    report(data1, mask1, weight1)

    # Test second data set
    print("Second data set:")
    report(data2, mask2, weight2)
    # Trailing blank line; print("") emits just a newline on Python 2 and 3.
    print("")
from Bio.Cluster import treecluster
import numpy as np
from Bio.Cluster import distancematrix
# There is no example data available, so a small matrix is written out here.
data = np.array([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
    [0, 1, 2, 3],
])

# A tree can be built directly from an array like this.
tree = treecluster(data=data)
print(tree)

# Other options can be passed as well.
tree = treecluster(data=data, dist="b", distancematrix=None)
print(tree)

# The distance matrix can also be computed in advance and used to build the
# tree.  One of data and distancematrix must be None; otherwise the call
# fails with:
#   ValueError: use either data or distancematrix; do not use both
distances = distancematrix(data)
tree = treecluster(data=None, distancematrix=distances)
print(tree)