Exemplo n.º 1
0
def test_similarity():
    """Check the silhouette score of both clusterers on two separated groups.

    The fixture holds twelve 2-D points: seven whose coordinates are all
    below 1 and five whose coordinates are all above 2.  Each clustering
    algorithm is asked for two clusters, and the silhouette score of the
    result must exceed 0.80.
    """
    points = np.array([
        [3.08232755e-01, 7.31276243e-01],
        [1.38059574e-01, 5.96831094e-01],
        [7.17477934e-01, 6.92660634e-01],
        [1.04842083e-01, 5.81815300e-01],
        [2.63517862e-01, 8.56987831e-01],
        [6.82660482e-01, 7.65745298e-01],
        [3.30899459e-01, 1.27005643e-01],
        [2.15388524e+00, 2.76495447e+00],
        [2.02847470e+00, 2.17510569e+00],
        [2.81339552e+00, 2.92175026e+00],
        [2.11079023e+00, 2.70619934e+00],
        [2.51975852e+00, 2.72664963e+00],
    ])

    # Partition clustering first.
    partition = algs.PartitionClustering(
        rawdata=points, n_clusters=2, max_iteration=100)
    partition.runClustering()
    partition_score = algs.SilhouetteScore(partition)
    assert partition_score > 0.80

    # Then hierarchical clustering on the same points.
    hierarchical = algs.HierarchicalClustering(rawdata=points, n_clusters=2)
    hierarchical.runClustering()
    hierarchical_score = algs.SilhouetteScore(hierarchical)
    assert hierarchical_score > 0.80
Exemplo n.º 2
0
def test_Partitioning(ligand_test):
    """Check that partition clustering yields exactly k distinct labels.

    The ligands in ``ligand_test`` (presumably a pytest fixture) are loaded
    once, then clustered for every k from 1 through 9; after each run the
    number of unique cluster assignments must equal k.
    """
    clusterer = algs.PartitionClustering(10, 1024)
    clusterer.get_data(ligand_test)
    for k in range(1, 10):
        clusterer.update_cluster_number(k)
        clusterer.cluster()
        distinct_labels = numpy.unique(clusterer.cluster_assignments)
        assert len(distinct_labels) == k
Exemplo n.º 3
0
def test_silhouette_coeff(ligand_test):
    """Check that silhouette scores lie strictly between -1 and 1.

    The ligands are split into two clusters, a distance matrix is derived
    from the clusterer's similarity matrix, and ``algs.silhouette_score`` is
    evaluated with each index 0..9 as its third argument.
    """
    clusterer = algs.PartitionClustering(2, 1024)
    clusterer.get_data(ligand_test)
    clusterer.cluster()
    distances = algs.euclid_distance(clusterer.similarity_matrix)
    for index in range(10):
        # First row of the transposed assignments is what gets scored.
        labels = numpy.transpose(clusterer.cluster_assignments)[0]
        score = algs.silhouette_score(distances, labels, index)

        assert -1 < score < 1
Exemplo n.º 4
0
def test_Jaccard_Index(ligand_test):
    """Check the two extremes of the Jaccard index.

    Comparing a clustering with itself must give 1; comparing it with a
    labelling in which every item carries its own unique label must give 0.
    """
    clusterer = algs.PartitionClustering(2, 1024)
    clusterer.get_data(ligand_test)
    clusterer.cluster()
    assignments = clusterer.cluster_assignments

    # Ten distinct labels: no two items share a cluster.
    all_different = numpy.array([20, 21, 22, 23, 24, 25, 26, 27, 28, 39])

    same_vs_same = algs.Jaccard_Index(assignments, assignments)
    same_vs_different = algs.Jaccard_Index(assignments, all_different)

    assert same_vs_same == 1
    assert same_vs_different == 0
Exemplo n.º 5
0
def test_partitioning():
    """Check label coverage and nearest-centroid consistency.

    Uses the first 500 entries of the module-level ``ligands``.  First the
    labels produced by a 5-cluster run must include 1 through 5, and the
    labels of a 10-cluster run must include 1 and 2.  Then, for a fresh
    10-cluster run, every ligand's label must be the (1-based) index of the
    centroid with the smallest Tanimoto distance to it.
    """
    five_way = algs.PartitionClustering(5, seed=2)
    five_way_labels = five_way.cluster(ligands[:500])
    assert all(label in five_way_labels for label in range(1, 6))

    ten_way = algs.PartitionClustering(10, seed=6)
    ten_way_labels = ten_way.cluster(ligands[:500])
    assert 1 in ten_way_labels and 2 in ten_way_labels

    # Cluster again with the same settings, then verify each assignment
    # against the centroids of that run.
    ten_way = algs.PartitionClustering(10, seed=6)
    ten_way_labels = ten_way.cluster(ligands[:500])
    for position, ligand in enumerate(ligands[:500]):
        distances = [
            algs.tanimoto_distance(ligand.bit_vector, cluster.centroid)
            for cluster in ten_way.clusters
        ]
        # Labels are 1-based while argmin is 0-based.
        nearest = np.argmin(np.array(distances)) + 1
        assert nearest == ten_way_labels[position]
Exemplo n.º 6
0
def test_partitioning():
    """Check that two-way partition clustering recovers the expected split.

    Ligands are read from ``ligand_information.csv`` and clustered into two
    groups.  Walking the clusters in order and collecting each member's
    ``ligandID`` must give five 0s followed by five 1s, or the reverse.
    """
    ligands = read_test_ligands('ligand_information.csv')
    distanceMatrix = algs.makeDistanceMatrix(ligands)
    clusterer = algs.PartitionClustering()
    found_clusters = clusterer.cluster(ligands, distanceMatrix, 2)

    member_ids = [
        member.ligandID
        for group in found_clusters
        for member in group.ligands
    ]

    # Cluster order is arbitrary, so both orderings are accepted.
    accepted = (
        [0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
    )
    assert member_ids in accepted, 'Partition Clustering Test Failed'
    print('Partition Clustering Test Passed')
Exemplo n.º 7
0
def test_partitioning():
    """Check that a single-cluster run labels every point 0.

    Five 5-bit rows are clustered with one cluster requested; the third
    value returned by ``cluster()`` must be an all-zero label array.
    """
    # Input rows and the labels they are expected to receive.
    points = np.array([
        [0.0, 0.0, 1.0, 0.0, 0.0],
        [1.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 1.0, 1.0],
        [0.0, 1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 1.0, 0.0],
    ])
    expected_labels = np.zeros(5, dtype=int)
    cluster_count = 1

    # Only the third returned value is inspected by this test.
    model = algs.PartitionClustering(points, cluster_count)
    _, _, point_labels = model.cluster()

    np.testing.assert_array_equal(point_labels, expected_labels)
Exemplo n.º 8
0
def test_partitioning():
    """Check cluster count and grouping on a small bit-vector example.

    Partition clustering is not deterministic, so the test only checks that
    two distinct labels come back and that the rows are grouped as expected,
    whichever of the two labels each group receives.
    """
    # Rows 0, 4, 5 and 6 only have bits set in the first four columns;
    # rows 1, 2 and 3 only have bits set in columns 4-7.
    fingerprints = np.array([
        [1, 1, 1, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 1, 0, 0],
        [0, 0, 0, 0, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 0, 1, 0],
        [1, 0, 1, 1, 0, 0, 0, 0, 0],
        [0, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 0, 0],
    ])
    clusterer = algs.PartitionClustering(num_clusters=2, max_iter=500)
    labels = list(clusterer.cluster(fingerprints))

    # The requested number of clusters must actually be produced.
    assert len(np.unique(labels)) == 2

    # Label numbering is arbitrary, so either naming of the groups passes.
    accepted = ([0, 1, 1, 1, 0, 0, 0], [1, 0, 0, 0, 1, 1, 1])
    assert labels in accepted
Exemplo n.º 9
0
def test_partitioning():
    """Check an easy two-cluster case with Euclidean distance.

    Six ligands are placed at x=1 ("a", "b", "c") and x=10 ("d", "e", "f").
    With ``desired_k=2`` and ``seed=1`` the first three must be labelled 0
    and the last three 1.
    """
    coordinates = {
        "a": [1, 2],
        "b": [1, 4],
        "c": [1, 0],
        "d": [10, 2],
        "e": [10, 4],
        "f": [10, 0],
    }
    X = {
        name: algs.Ligand(name, 0, "whatever", np.array(position))
        for name, position in coordinates.items()
    }

    clusterer = algs.PartitionClustering(
        X, distance_metric="euclidean", desired_k=2)
    tester = clusterer.cluster(seed=1)

    expected_labels = {'a': 0, 'b': 0, 'c': 0, 'd': 1, 'e': 1, 'f': 1}
    assert tester.labels == expected_labels, "Failing kmeans easy test"
Exemplo n.º 10
0
def test_partitioning():
    """Check centroids and membership on two separated groups of points.

    Twelve 2-D points (rows 0-6 with coordinates below 1, rows 7-11 with
    coordinates above 2) are split into two clusters.  The run must produce
    two centroids and two clusters, the sorted centroids must be [0, 10],
    and the sorted clusters must be rows 0-6 and rows 7-11.
    """
    points = np.array([
        [3.08232755e-01, 7.31276243e-01],
        [1.38059574e-01, 5.96831094e-01],
        [7.17477934e-01, 6.92660634e-01],
        [1.04842083e-01, 5.81815300e-01],
        [2.63517862e-01, 8.56987831e-01],
        [6.82660482e-01, 7.65745298e-01],
        [3.30899459e-01, 1.27005643e-01],
        [2.15388524e+00, 2.76495447e+00],
        [2.02847470e+00, 2.17510569e+00],
        [2.81339552e+00, 2.92175026e+00],
        [2.11079023e+00, 2.70619934e+00],
        [2.51975852e+00, 2.72664963e+00],
    ])

    clusterer = algs.PartitionClustering(
        rawdata=points, max_iteration=100, n_clusters=2)
    clusterer.runClustering()

    # Counts first, then the exact contents.
    assert len(clusterer.centroids) == 2
    assert len(clusterer.clusters) == 2

    expected_centroids = [0, 10]
    expected_members = [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11]]
    assert sorted(clusterer.centroids) == expected_centroids
    assert sorted(clusterer.clusters) == expected_members
Exemplo n.º 11
0
def main():
    """Run the ligand clustering analysis (questions 2 through 7).

    Steps, in order:

    * Read ``../ligand_information.csv``, wrap its columns in a
      ``Ligand.Ligand`` object and call ``OnbitToLong()``; ``LigandData.long``
      is the input for everything below, restricted to the first 2000 rows.
    * Question 2: compute a UMAP embedding, save it to
      ``UmapDimensionalSpace.txt`` and plot it.
    * Questions 3+4: silhouette scores for partition clustering with
      k = 1..9, then a k=6 run plotted on the UMAP embedding.
    * Questions 5+6: silhouette scores for hierarchical clustering with
      k = 1..9, then a k=4 run plotted on the UMAP embedding.
    * Question 7: compare partition and hierarchical clusterings for
      k = 4 and k = 6 with ``algs.TanimotoCoeff``.

    All results are printed or shown as matplotlib figures; nothing is
    returned.
    """
    LigandInformation = pd.read_csv("../ligand_information.csv", sep=",")
    LigandData = Ligand.Ligand(LigandID=LigandInformation['LigandID'],
                               score=LigandInformation['Score'],
                               SMILES=LigandInformation['SMILES'],
                               OnBits=LigandInformation['OnBits'])
    LigandData.OnbitToLong()

    # Question 2: UMAP embedding of the ligands.
    fit = umap.UMAP()
    u = fit.fit_transform(LigandData.long[0:2000])

    # Persist the embedding as two rows (first and second UMAP coordinate)
    # and read it back, so that from here on ``u`` is always the 2-row array
    # produced by loadtxt and ``u[0]`` / ``u[1]`` index it consistently. If
    # the UMAP step above is ever skipped, keep the loadtxt call.
    np.savetxt('UmapDimensionalSpace.txt', [u[:, 0], u[:, 1]])
    u = np.loadtxt("UmapDimensionalSpace.txt")

    plt.scatter(u[0], u[1])
    plt.title('UMAP embedding of Ligands')
    plt.show()

    # Plot colour for each cluster label; keys 1-6 cover the largest k plotted.
    LABEL_COLOR_MAP = {1: 'r', 2: 'b', 3: 'g', 4: 'y', 5: 'm', 6: 'c'}

    # Questions 3 + 4: silhouette score of partition clustering for k = 1..9.
    score = []
    for i in range(1, 10):
        print(i)
        PT = algs.PartitionClustering(LigandData.long[0:2000],
                                      n_clusters=i,
                                      max_iteration=100)
        PT.runClustering()
        score.append(algs.SilhouetteScore(PT))
        del PT
    print(score)

    # Re-run a single clustering at the best-scoring k to get data for plots.
    # Best score was found when K=6, see
    # Guardado_Miguel_BMI203_HW2_WriteUp.pdf for more info.
    PT_k6 = algs.PartitionClustering(LigandData.long[0:2000],
                                     n_clusters=6,
                                     max_iteration=100)
    PT_k6.runClustering()
    print(PT_k6.clusterassignment)
    label_color = [LABEL_COLOR_MAP[label] for label in PT_k6.clusterassignment]
    print(np.unique(PT_k6.clusterassignment))
    u = np.loadtxt("UmapDimensionalSpace.txt")

    plt.figure(figsize=(20, 10))
    plt.scatter(u[0], u[1], c=label_color)
    plt.title('UMAP embedding of Ligands,2000 Ligands, 6 clusters')
    plt.show()

    # Questions 5 + 6: silhouette score of hierarchical clustering for
    # k = 1..9. This sweep must build HierarchicalClustering objects;
    # building PartitionClustering here would only repeat the sweep above.
    score = []
    for i in range(1, 10):
        print(i)
        HC = algs.HierarchicalClustering(LigandData.long[0:2000],
                                         n_clusters=i)
        HC.runClustering()
        score.append(algs.SilhouetteScore(HC))
        # Print the running scores after every k as a progress indicator.
        print(score)
        del HC
    print(score)
    HC_k4 = algs.HierarchicalClustering(LigandData.long[0:2000], n_clusters=4)
    HC_k4.runClustering()
    print(HC_k4.clusterassignment)
    label_color = [LABEL_COLOR_MAP[label] for label in HC_k4.clusterassignment]
    print(np.unique(HC_k4.clusterassignment))
    u = np.loadtxt("UmapDimensionalSpace.txt")

    plt.figure(figsize=(20, 10))
    plt.scatter(u[0], u[1], c=label_color)
    plt.title('UMAP embedding of Ligands,2000 Ligands, 4 clusters')
    plt.show()

    # Question 7: compare two fixed score vectors, then compare the
    # partition and hierarchical clusterings directly.
    arr1 = np.array(
        [0.2035, 0.0933, 0.18810, 0.0485, 0.396001, 0.22705, 0.08660, 0.29346])
    arr2 = np.array([
        0.0953, 0.08660, 0.26153, 0.081803, 0.163898, 0.233848, 0.09873,
        -0.167866
    ])
    print(np.sum(arr1 - arr2))
    print(algs.CalculatePairWiseDistance(arr1, arr2))

    k = [4, 6]
    for n_cluster in k:
        PT = algs.PartitionClustering(LigandData.long[0:2000],
                                      n_clusters=n_cluster,
                                      max_iteration=100)
        PT.runClustering()
        HC = algs.HierarchicalClustering(LigandData.long[0:2000],
                                         n_clusters=n_cluster)
        HC.runClustering()
        print(algs.TanimotoCoeff(PT.clusters, HC.clusters))
Exemplo n.º 12
0
    f.close()
    
## Partition clustering is fast enough to operate on the whole data set, so
## load the entire ligand file here.
with open('ligand_information.csv') as f:
    ligands_txt = f.readlines()
    
all_ligands = []
# Start at index 1 so the first line of the file is skipped
# (presumably the CSV header row -- TODO confirm).
for i in range(1,len(ligands_txt)):
    all_ligands.append(algs.ligand(ligands_txt[i]))
    

    
## Cluster all the ligands with k-means, blindly assuming 10 clusters to start.
## This is kept even after K optimization, because the tSNE coordinates below
## are calculated from the similarity matrix held by this clustering object.
kmeans_cluster_all = algs.PartitionClustering(10,1024)
kmeans_cluster_all.get_data(all_ligands)
kmeans_cluster_all.cluster()    

## Cluster the condensed version of the ligand set in the same way.
## NOTE(review): all_ligands_short is not defined in this section; it is
## presumably built earlier in the file.
kmeans_cluster_short = algs.PartitionClustering(10,1024)
kmeans_cluster_short.get_data(all_ligands_short)
kmeans_cluster_short.cluster()    


## Plot the tSNE dimension reduction for question 2.
# Generate the tSNE coordinates: the same TSNE object is fitted twice, once on
# each similarity matrix, giving one 2-component embedding per ligand set.
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results_all = tsne.fit_transform(kmeans_cluster_all.similarity_matrix)
tsne_results_short = tsne.fit_transform(kmeans_cluster_short.similarity_matrix)
Exemplo n.º 13
0
cluster_scores = [[], [], [], [], []]

for ligand, label in zip(ligands, partition_labels):
    cluster_scores[label-1].append(ligand.score)

for i, scores in enumerate(cluster_scores):
    plt.hist(scores)
    plt.xlabel("Vina Score")
    plt.title("Distribution of Vina Scores for cluster "+ str(i+1))
    plt.show()
'''

################## QUESTION 10 ##################
# Run partition clustering on the ligands with 5 clusters.
partition_cluster = algs.PartitionClustering(num_clusters=5,
                                             seed=6,
                                             max_iterations=25)
partition_labels = partition_cluster.cluster(ligands)

# Group the ligands (not just their scores) by assigned cluster.
# One empty bucket per cluster; the count has to match num_clusters above.
clusters = [[], [], [], [], []]

# ``label - 1`` is used as the bucket index, so labels are assumed to be
# 1-based (1..5) -- TODO confirm against PartitionClustering.cluster.
for ligand, label in zip(ligands, partition_labels):
    clusters[label - 1].append(ligand)

# find the top scoring ligand in each cluster and print
for i, cluster in enumerate(clusters):
    max_score = float("-inf")
    for ligand in cluster:
        if ligand.score > max_score: