示例#1
0
def test_hierarchical():
    """Check that hierarchical clustering emits the expected cluster labels.

    Only the first 100 ligands are clustered, because the algorithm is very
    slow on larger inputs.
    """
    # Clusterer built with 5: labels 1 through 5 must all appear.
    five_way = algs.HierarchicalClustering(5)
    five_way_labels = five_way.cluster(ligands[:100])
    assert all(label in five_way_labels for label in (1, 2, 3, 4, 5))

    # NOTE(review): this run passes 10 (plus a seed) but only labels 1 and 2
    # are checked -- confirm whether the argument or the assertion is intended.
    seeded = algs.HierarchicalClustering(10, seed=6)
    seeded_labels = seeded.cluster(ligands[:100])
    assert all(label in seeded_labels for label in (1, 2))
示例#2
0
def test_similarity():
    """Check that both clusterers score above 0.80 on two separated 2-D blobs.

    The fixture holds seven points with coordinates below 1 and five points
    with coordinates above 2; each clusterer is asked for two clusters and
    its result is scored with ``algs.SilhouetteScore``.
    """
    low_blob = [[3.08232755e-01, 7.31276243e-01],
                [1.38059574e-01, 5.96831094e-01],
                [7.17477934e-01, 6.92660634e-01],
                [1.04842083e-01, 5.81815300e-01],
                [2.63517862e-01, 8.56987831e-01],
                [6.82660482e-01, 7.65745298e-01],
                [3.30899459e-01, 1.27005643e-01]]
    high_blob = [[2.15388524e+00, 2.76495447e+00],
                 [2.02847470e+00, 2.17510569e+00],
                 [2.81339552e+00, 2.92175026e+00],
                 [2.11079023e+00, 2.70619934e+00],
                 [2.51975852e+00, 2.72664963e+00]]
    points = np.array(low_blob + high_blob)

    # Partition-based clustering.
    partitioner = algs.PartitionClustering(rawdata=points,
                                           n_clusters=2,
                                           max_iteration=100)
    partitioner.runClustering()
    partition_score = algs.SilhouetteScore(partitioner)
    assert partition_score > 0.80

    # Hierarchical clustering on the same points.
    hierarchy = algs.HierarchicalClustering(rawdata=points, n_clusters=2)
    hierarchy.runClustering()
    hierarchy_score = algs.SilhouetteScore(hierarchy)
    assert hierarchy_score > 0.80
示例#3
0
def test_Hierarchical(ligand_test):
    """Check the number of distinct values in the first ten dendogram rows.

    A single-linkage hierarchical clusterer is run on ``ligand_test``; row
    ``i`` of its ``dendogram`` must then contain exactly ``10 - i`` distinct
    values for ``i`` in 0..9.
    """
    # presumably ligand_test holds ten ligands -- verify against the fixture
    clusterer = algs.HierarchicalClustering('single-linkage', 1)
    clusterer.get_data(ligand_test)
    clusterer.cluster()
    for row in range(10):
        distinct_labels = numpy.unique(clusterer.dendogram[row, :])
        assert len(distinct_labels) == (10 - row)
示例#4
0
def test_hierarchical():
    """Cluster the test ligands into two groups and check the ID ordering.

    The ligand IDs, read cluster by cluster, must be five 0s followed by
    five 1s.
    """
    ligands = read_test_ligands('ligand_information.csv')
    distanceMatrix = algs.makeDistanceMatrix(ligands)
    clusterer = algs.HierarchicalClustering()
    found_clusters = clusterer.cluster(ligands, distanceMatrix, 2)

    # Flatten the clusters into one list of ligand IDs, preserving order.
    flattened_ids = [
        member.ligandID
        for group in found_clusters
        for member in group.ligands
    ]
    expected_ids = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
    assert flattened_ids == expected_ids, 'Hierarchical Clustering Test Failed :('
    print('Hierarchical Clustering Test Passed')
示例#5
0
def test_quality_metric():
    """Compare ``cluster_quality`` against a pre-computed silhouette score.

    Seven binary rows are clustered into two groups with ward linkage; the
    resulting quality, rounded to two decimals, must equal 0.51.
    """
    fingerprints = np.array([
        [1, 1, 1, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 1, 0, 0],
        [0, 0, 0, 0, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 0, 1, 0],
        [1, 0, 1, 1, 0, 0, 0, 0, 0],
        [0, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 0, 0],
    ])
    clusterer = algs.HierarchicalClustering(num_clusters=2, linkage='ward')
    assigned = list(clusterer.cluster(fingerprints))
    quality = clusterer.cluster_quality(fingerprints, assigned)
    rounded_quality = np.around(quality, decimals=2)
    assert rounded_quality == 0.51
def test_hierarchical():
    """Check single-linkage merging against a hand-built proximity matrix.

    A clusterer is created in testing mode, then its proximity matrix,
    cluster count and element lookup are overwritten with a known 5x5
    control so that only the merging step is exercised. The merge result is
    checked for a target of one cluster and for a target of three clusters.
    """
    ligand_dict = algs.read_in_ligands("ligand_information.csv")
    names = ["a", "b", "c", "d", "e"]
    ctrl = pd.DataFrame(
        {
            'a': [np.nan, 17, 21, 31, 23],
            'b': [17, np.nan, 30, 34, 21],
            'c': [21, 30, np.nan, 28, 39],
            'd': [31, 34, 28, np.nan, 43],
            'e': [23, 21, 39, 43, np.nan]
        },
        index=names)
    ctr1_dict = {name: position for position, name in enumerate(names)}

    def run_with_target(target_k):
        # Build the clusterer, then manually overwrite the state under test.
        # NOTE(review): both runs receive the very same ``ctrl`` and
        # ``ctr1_dict`` objects -- confirm ``cluster()`` does not mutate them.
        clusterer = algs.HierarchicalClustering(ligand_dict,
                                                linkage_metric="single",
                                                distance_metric="euclidean",
                                                desired_k=target_k,
                                                testing=True)
        clusterer.proximity_matrix = ctrl
        clusterer.Nclusters = 5
        clusterer.element_dict = ctr1_dict
        return clusterer.cluster()

    merged_to_one = run_with_target(1)
    correct_one = [['d', ' e', ' c', ' a', ' b']]
    assert merged_to_one.cluster_results == correct_one, "Failing to merge columns correctly -- one column"

    merged_to_three = run_with_target(3)
    correct_three = [['d'], ['e'], ['c', ' a', ' b']]
    assert merged_to_three.cluster_results == correct_three, "Failing to merge columns correctly -- three column"
示例#7
0
def test_hierarchical():
    """Check the labels produced for a five-row input at threshold 0.42.

    The clusterer is built from the data and the threshold; ``cluster()``
    must return the label array ``[0, 1, 2, 2, 2]``.
    """
    cutoff = 0.42
    samples = np.array([
        [0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 1., 0., 1., 1.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0.],
    ])
    expected_labels = np.array([0, 1, 2, 2, 2])

    clusterer = algs.HierarchicalClustering(samples, cutoff)
    predicted_labels = clusterer.cluster()

    np.testing.assert_array_equal(predicted_labels, expected_labels)
示例#8
0
def test_hierarchical():
    """Check ward-linkage clustering of seven binary rows into two groups.

    The result must use exactly two distinct labels and must group rows
    0, 4, 5, 6 together and rows 1, 2, 3 together; either labelling of the
    two groups is accepted.
    """
    fingerprints = np.array([
        [1, 1, 1, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 1, 0, 0],
        [0, 0, 0, 0, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 0, 1, 0],
        [1, 0, 1, 1, 0, 0, 0, 0, 0],
        [0, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 0, 0],
    ])
    clusterer = algs.HierarchicalClustering(num_clusters=2, linkage='ward')
    assigned = list(clusterer.cluster(fingerprints))

    # The proper number of clusters is created.
    distinct_labels = np.unique(assigned)
    assert len(distinct_labels) == 2

    # The grouping is correct, whichever group received label 0.
    acceptable = ([0, 1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1, 1])
    assert assigned in acceptable
示例#9
0
def test_hierarchical():
    """Check hierarchical clustering of two separated 2-D blobs.

    After ``runClustering()`` the distance matrix must be square with one
    row per point, exactly two clusters must exist, and their members must
    be indices 0-6 and indices 7-11.
    """
    low_blob = [[3.08232755e-01, 7.31276243e-01],
                [1.38059574e-01, 5.96831094e-01],
                [7.17477934e-01, 6.92660634e-01],
                [1.04842083e-01, 5.81815300e-01],
                [2.63517862e-01, 8.56987831e-01],
                [6.82660482e-01, 7.65745298e-01],
                [3.30899459e-01, 1.27005643e-01]]
    high_blob = [[2.15388524e+00, 2.76495447e+00],
                 [2.02847470e+00, 2.17510569e+00],
                 [2.81339552e+00, 2.92175026e+00],
                 [2.11079023e+00, 2.70619934e+00],
                 [2.51975852e+00, 2.72664963e+00]]
    points = np.array(low_blob + high_blob)

    clusterer = algs.HierarchicalClustering(rawdata=points, n_clusters=2)
    clusterer.runClustering()

    n_points = len(points)
    assert clusterer.DistanceMatrix.shape == (n_points, n_points)
    assert len(clusterer.clusters) == 2

    # Sort inside each cluster and across clusters so ordering is irrelevant.
    normalised = sorted([sorted(members) for members in clusterer.clusters])
    assert normalised == [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11]]
示例#10
0
def main():
    """Run the ligand clustering analysis end to end.

    Steps, in order:

    1. Read ``../ligand_information.csv`` into a ``Ligand`` object and call
       ``OnbitToLong()`` on it.
    2. Question 2: embed the first 2000 ligands with UMAP, write the two
       embedding columns to ``UmapDimensionalSpace.txt``, and plot them.
    3. Questions 3+4: compute ``algs.SilhouetteScore`` for partition
       clustering with 1..9 clusters, then plot the 6-cluster assignment.
    4. Questions 5+6: compute ``algs.SilhouetteScore`` for hierarchical
       clustering with 1..9 clusters, then plot the 4-cluster assignment.
    5. Question 7: print a pairwise-distance sanity check, then print
       ``algs.TanimotoCoeff`` between the partition and hierarchical
       clusters for 4 and 6 clusters.

    Results are printed and shown as matplotlib figures; nothing is returned.
    """
    LigandInformation = pd.read_csv("../ligand_information.csv", sep=",")
    LigandData = Ligand.Ligand(LigandID=LigandInformation['LigandID'],
                               score=LigandInformation['Score'],
                               SMILES=LigandInformation['SMILES'],
                               OnBits=LigandInformation['OnBits'])
    LigandData.OnbitToLong()

    # Question 2: UMAP embedding of the ligands.
    fit = umap.UMAP()
    u = fit.fit_transform(LigandData.long[0:2000])

    # Persist the embedding so later sections can reload it without
    # recomputing UMAP. It is stored as two rows (one per embedding column),
    # so after loadtxt the coordinates are u[0] and u[1]; keep the loadtxt
    # call so indexing stays consistent with the reloads further down.
    np.savetxt('UmapDimensionalSpace.txt', [u[:, 0], u[:, 1]])
    u = np.loadtxt("UmapDimensionalSpace.txt")

    plt.scatter(u[0], u[1])
    plt.title('UMAP embedding of Ligands')
    plt.show()

    # Colour used for each cluster label when plotting assignments.
    LABEL_COLOR_MAP = {1: 'r', 2: 'b', 3: 'g', 4: 'y', 5: 'm', 6: 'c'}

    # Question 3+4: silhouette sweep for partition clustering.
    score = []
    for i in range(1, 10):
        print(i)
        PT = algs.PartitionClustering(LigandData.long[0:2000],
                                      n_clusters=i,
                                      max_iteration=100)
        PT.runClustering()
        score.append(algs.SilhouetteScore(PT))
        del PT
    print(score)

    # Rerun a single test at the highest silhouette score to get plot data.
    # Best score was found when K=6, see Guardado_Miguel_BMI203_HW2_WriteUp.pdf for more info.
    PT_k6 = algs.PartitionClustering(LigandData.long[0:2000],
                                     n_clusters=6,
                                     max_iteration=100)
    PT_k6.runClustering()
    print(PT_k6.clusterassignment)
    label_color = [LABEL_COLOR_MAP[l] for l in PT_k6.clusterassignment]
    print(np.unique(PT_k6.clusterassignment))
    u = np.loadtxt("UmapDimensionalSpace.txt")

    plt.figure(figsize=(20, 10))
    plt.scatter(u[0], u[1], c=label_color)
    plt.title('UMAP embedding of Ligands,2000 Ligands, 6 clusters')
    plt.show()

    # Question 5+6: silhouette sweep for hierarchical clustering.
    # The sweep must use the hierarchical clusterer so that these scores
    # describe the algorithm whose k is chosen below.
    score = []
    for i in range(1, 10):
        print(i)
        HC = algs.HierarchicalClustering(LigandData.long[0:2000],
                                         n_clusters=i)
        HC.runClustering()
        score.append(algs.SilhouetteScore(HC))
        print(score)
        del HC
    print(score)
    HC_k4 = algs.HierarchicalClustering(LigandData.long[0:2000], n_clusters=4)
    HC_k4.runClustering()
    print(HC_k4.clusterassignment)
    label_color = [LABEL_COLOR_MAP[l] for l in HC_k4.clusterassignment]
    print(np.unique(HC_k4.clusterassignment))
    u = np.loadtxt("UmapDimensionalSpace.txt")

    plt.figure(figsize=(20, 10))
    plt.scatter(u[0], u[1], c=label_color)
    plt.title('UMAP embedding of Ligands,2000 Ligands, 4 clusters')
    plt.show()

    # Question 7: sanity-check the distance helper on two fixed vectors,
    # then compare partition and hierarchical clusterings.
    arr1 = np.array(
        [0.2035, 0.0933, 0.18810, 0.0485, 0.396001, 0.22705, 0.08660, 0.29346])
    arr2 = np.array([
        0.0953, 0.08660, 0.26153, 0.081803, 0.163898, 0.233848, 0.09873,
        -0.167866
    ])
    print(np.sum(arr1 - arr2))
    print(algs.CalculatePairWiseDistance(arr1, arr2))

    k = [4, 6]
    for n_cluster in k:
        PT = algs.PartitionClustering(LigandData.long[0:2000],
                                      n_clusters=n_cluster,
                                      max_iteration=100)
        PT.runClustering()
        HC = algs.HierarchicalClustering(LigandData.long[0:2000],
                                         n_clusters=n_cluster)
        HC.runClustering()
        print(algs.TanimotoCoeff(PT.clusters, HC.clusters))