def test_partition_clustering():
    """Partition clustering handles a tractable subset plus degenerate inputs."""
    # tractable subset
    pdb_ids = [276, 39299, 38031]
    # Load each active site from its PDB file under data/.
    active_sites = [
        io.read_active_site(os.path.join("data", f"{pdb_id}.pdb"))
        for pdb_id in pdb_ids
    ]
    # The original assigned this to an unused local; keep the call in case
    # cluster.similarity_matrix has caching side effects — TODO confirm.
    cluster.similarity_matrix(active_sites)
    # An empty input must produce an empty clustering...
    assert cluster.cluster_by_partitioning([], {}) == []
    # ...and a single site must form one singleton cluster.
    assert cluster.cluster_by_partitioning([active_sites[0]], {}) == [[active_sites[0]]]
def test_partition_clustering():
    """Partitioning returns the requested number of clusters for k = 2 and 3."""
    # tractable subset
    # NOTE(review): 10701 appears twice — presumably unintentional, but
    # preserved since removing it would change the clustered input.
    pdb_ids = [276, 4629, 10701, 10701, 10814, 13052, 14181, 15813]
    # Load each active site from its PDB file under data/.
    active_sites = [
        io.read_active_site(os.path.join("data", f"{pdb_id}.pdb"))
        for pdb_id in pdb_ids
    ]
    # Presumably precomputes the residue ordering used by the similarity
    # metric — TODO confirm against cluster.get_order_residues.
    cluster.get_order_residues(active_sites)
    # The first element of the result is expected to hold exactly k clusters.
    assert len(cluster.cluster_by_partitioning(active_sites, 2)[0]) == 2
    assert len(cluster.cluster_by_partitioning(active_sites, 3)[0]) == 3
def test_partition_clustering():
    """Three dissimilar sites each form their own cluster; None passes through."""
    # tractable subset
    pdb_ids = [276, 4629, 10701]
    # Load each active site from its PDB file under data/.
    active_sites = [
        io.read_active_site(os.path.join("data", f"{pdb_id}.pdb"))
        for pdb_id in pdb_ids
    ]
    # Each site is expected to form a singleton cluster, keyed by name string.
    assert cluster.cluster_by_partitioning(active_sites) == [["276"], ["4629"], ["10701"]]
    # check empty active sites doesn't crash
    assert cluster.cluster_by_partitioning(None) is None
def test_partition_clustering():
    """Sanity-check k-means cluster assignments and centroid distances."""
    # tractable subset
    pdb_ids = [276, 4629, 10701]
    # Load each active site from its PDB file under data/.
    active_sites = [
        io.read_active_site(os.path.join("data", f"{pdb_id}.pdb"))
        for pdb_id in pdb_ids
    ]
    # Slice out clusters that have an assignment.
    # NOTE(review): "non-empty" here actually means len > 1, so singleton
    # clusters are excluded too — preserved from the original logic.
    clusters_out = cluster.cluster_by_partitioning(active_sites)
    non_empty_clusters = [c for c in clusters_out if len(c) > 1]
    distances_from_centroids = [c[1][1] for c in non_empty_clusters]
    print('non-empty clusters for k-means is', non_empty_clusters)
    print('non-empty cluster distance from centroids are', distances_from_centroids)
    # Check that the length of the non-empty cluster list is <= 3 (in case
    # e.g. two or three residues get assigned to the same cluster).
    assert len(non_empty_clusters) <= 3
    # For this variant of k-means, the distance between a cluster residue
    # and its centroid must satisfy non-negativity.
    for dist in distances_from_centroids:
        assert dist >= 0
def test_compareTwoMethods():
    """Compare average quality of hierarchical vs. partition clustering
    over repeated random samples of PDB files from data/.
    """
    # Collect every available PDB id (filenames are "<id>.pdb").
    allPossible = [int(filename.split(".")[0]) for filename in os.listdir("data")]
    hierTotal = 0
    partTotal = 0
    iterations = 100
    numPDBs = 15
    for _ in range(iterations):
        # Sample ids directly instead of sampling indices and re-indexing.
        pdb_ids = random.sample(allPossible, numPDBs)
        active_sites = [
            io.read_active_site(os.path.join("data", f"{pdb_id}.pdb"))
            for pdb_id in pdb_ids
        ]
        # Score both clusterings on the same sampled sites.
        hierTotal += cluster.qualityMetric(cluster.cluster_hierarchically(active_sites))
        partTotal += cluster.qualityMetric(cluster.cluster_by_partitioning(active_sites))
    # Report the mean quality score for each method.
    print("hierScoreAverage: ", hierTotal / float(iterations))
    print("partScoreAverage: ", partTotal / float(iterations))
def test_hierarchical_clustering():
    """Partitioning yields no empty clusters and the expected cluster counts.

    NOTE(review): despite its name, this test exercises
    cluster.cluster_by_partitioning, not the hierarchical method — confirm
    which function was meant to be under test.
    """
    def load_sites(pdb_ids):
        # Load each active site from its PDB file under data/.
        return [
            io.read_active_site(os.path.join("data", f"{pdb_id}.pdb"))
            for pdb_id in pdb_ids
        ]

    # tractable subset
    # pdb_ids = [276, 4629, 10701]
    active_sites = load_sites([276, 1806, 3458, 3733, 10814, 4629, 10701])
    # No cluster in the result may be empty...
    assert [] not in cluster.cluster_by_partitioning(active_sites)
    # ...and these seven sites are expected to split into three clusters.
    assert len(cluster.cluster_by_partitioning(active_sites)) == 3
    # A single site must yield exactly one cluster.
    active_sites = load_sites([276])
    assert len(cluster.cluster_by_partitioning(active_sites)) == 1
def test_partition_clustering():
    """Partitioning groups the more similar sites together and honors k."""
    # tractable subset
    pdb_ids = [276, 4629, 10701]
    # Load each active site from its PDB file under data/.
    active_sites = [
        io.read_active_site(os.path.join("data", f"{pdb_id}.pdb"))
        for pdb_id in pdb_ids
    ]
    clustering = cluster.cluster_by_partitioning(active_sites, [2])
    # Clusters more similar sites together: with k=2 each returned cluster
    # must be either the similar pair {276, 4629} or the lone {10701}.
    assert get_names(flatten(clustering[0])) in [['276', '4629'], ['10701']]
    assert get_names(flatten(clustering[1])) in [['276', '4629'], ['10701']]
    # The number of unique cluster labels must equal the requested k.
    active_sites = read_active_sites("data")
    assert len(cluster.cluster_by_partitioning(active_sites, [2])) == 2
    assert len(cluster.cluster_by_partitioning(active_sites, [3])) == 3
def test_partition_clustering():
    """Three sites should partition into at least two clusters."""
    # tractable subset
    pdb_ids = [276, 4629, 10701]
    # Load each active site from its PDB file under data/.
    active_sites = [
        io.read_active_site(os.path.join("data", f"{pdb_id}.pdb"))
        for pdb_id in pdb_ids
    ]
    # The clustering appears to be returned as a mapping; len() on the
    # mapping counts its keys directly (idiomatic form of len(...keys())).
    assert len(cluster.cluster_by_partitioning(active_sites)) >= 2
def test_partition_clustering():
    """Pin the exact index assignment produced for the tractable subset."""
    # tractable subset
    pdb_ids = [276, 4629, 10701]
    # Load each active site from its PDB file under data/.
    active_sites = [
        io.read_active_site(os.path.join("data", f"{pdb_id}.pdb"))
        for pdb_id in pdb_ids
    ]
    # Expected assignment by input index: 10701 (index 2) lands alone,
    # 276 and 4629 (indices 0, 1) share a cluster, one cluster stays empty.
    assert cluster.cluster_by_partitioning(active_sites) == [[2], [], [0, 1]]
def test_partition_clustering():
    """k=2 partitioning splits the sites into [276] and [4629, 10701]."""
    # tractable subset
    pdb_ids = [276, 4629, 10701]
    # Load each active site from its PDB file under data/.
    active_sites = [
        io.read_active_site(os.path.join("data", f"{pdb_id}.pdb"))
        for pdb_id in pdb_ids
    ]
    # test_format_part presumably normalizes the clustering into lists of
    # integer ids for comparison — TODO confirm its contract.
    assert cluster.test_format_part(
        cluster.cluster_by_partitioning(active_sites, 2)) == [[276], [4629, 10701]]
def test_partition_clustering():
    """With k=1, all three sites land in the single cluster, in input order."""
    # tractable subset
    pdb_ids = [276, 4629, 10701]
    # Load each active site from its PDB file under data/.
    active_sites = [
        io.read_active_site(os.path.join("data", f"{pdb_id}.pdb"))
        for pdb_id in pdb_ids
    ]
    # Take the single (index 0) cluster and compare member names to the ids.
    clusters = cluster.cluster_by_partitioning(active_sites, 1)[0]
    assert [int(c.name) for c in clusters] == [276, 4629, 10701]
def test_partition_clustering():
    """Partitioning produces k final labels (k is fixed at 3 in the code)."""
    # tractable subset
    pdb_ids = [276, 4629, 10701]
    # Load each active site from its PDB file under data/.
    active_sites = [
        io.read_active_site(os.path.join("data", f"{pdb_id}.pdb"))
        for pdb_id in pdb_ids
    ]
    # Clusters produce k number of final labels (k = 3 in my code).
    assert len(cluster.cluster_by_partitioning(active_sites)) == 3
def test_partition_clustering():
    """The three sites receive the expected labels for k=2."""
    # tractable subset
    pdb_ids = [276, 4629, 10701]
    # Load each active site from its PDB file under data/.
    active_sites = [
        io.read_active_site(os.path.join("data", f"{pdb_id}.pdb"))
        for pdb_id in pdb_ids
    ]
    # Checking that the three sites cluster as expected: 276 and 4629 get
    # label 1 while 10701 gets label 0.
    assert np.array_equal(
        cluster.cluster_by_partitioning(active_sites, 2)[0], [1, 1, 0])
def test_partition_clustering():
    """With a fixed seed, k=2 partitioning assigns labels [0, 0, 1]."""
    # Fix the RNG so the (randomized) partitioning is reproducible.
    random.seed(40)
    # tractable subset
    pdb_ids = [276, 4629, 10701]
    # Load each active site from its PDB file under data/.
    active_sites = [
        io.read_active_site(os.path.join("data", f"{pdb_id}.pdb"))
        for pdb_id in pdb_ids
    ]
    label = cluster.cluster_by_partitioning(active_sites, 2)
    # label[1] holds the per-site assignments: 276 and 4629 share cluster 0,
    # 10701 gets cluster 1.
    assert all(label[1] == [0, 0, 1])
# Script: compute silhouette scores for partition, hierarchical, and random
# clusterings of all active sites, for k = 2..9.
from matplotlib import pyplot as plt
import pandas as pd

# Load every active site from the data directory.
active_sites = io.read_active_sites('./data')
number_clusters = []
p_sil_scores = []  # silhouette scores for partition clustering
h_sil_scores = []  # silhouette scores for hierarchical clustering
r_sil_scores = []  # silhouette scores for random labels (baseline)
for i in range(2,10):
    # clustering all sites by partition
    print('Finding %d clusters by partitioning'% i)
    p_clusters, p_distances = cluster.cluster_by_partitioning(active_sites, i)
    # clustering all sites hierarchically
    print('Finding %d clusters hierarchically'% i)
    h_clusters, h_distances = cluster.cluster_hierarchically(active_sites, i)
    # generating random clusters labels
    # NOTE(review): 136 is presumably the number of active sites in data/ —
    # confirm it matches len(active_sites).
    print('Generating %d random cluster labels'% i)
    r_clusters = np.random.choice(range(0,i), 136)
    # to evaluate the clusters I will get the silhouette score for each method of clustering
    # as well as randomly generated cluster labels
    p_sil_scores.append(silhouette_score(p_distances, p_clusters))
    h_sil_scores.append(silhouette_score(h_distances, h_clusters))
    # random labels are scored against the partition distance matrix
    r_sil_scores.append(silhouette_score(p_distances, r_clusters))
#from .utils import Atom, Residue, ActiveSite from hw2skeleton import io from hw2skeleton import cluster import matplotlib.pyplot as plt import numpy as np active_sites = io.read_active_sites( "C:\\Users\Zoë\Documents\GitHub\hw2-skeleton\data") #site1 = active_sites[5] #site2 = active_sites[7] #print('site1: ', site1.categories) #print('site2: ', site2.categories) #sim = cluster.compute_similarity(site1,site2) # Run for one clustering by kmeans Pclusters, PmaxDistance = cluster.cluster_by_partitioning(active_sites) ##for i in clusters: ## print(i.toStr()) io.write_clustering("clusterPk=10", Pclusters) # Run for just one clustering by agglomerative clustering Hclusters, distH = cluster.cluster_hierarchically(active_sites) io.write_clustering("clusterHcutoff=0.3", Hclusters) ## Run for one clustering by agglomerative clustering #Hclusters, HmaxDist, Hclusterings = cluster.cluster_hierarchically(active_sites) #io.write_mult_clusterings("clusteringsH1", Hclusterings) #%% ## Clusterings of multiple k values in kmeans #kvals = [2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100,120,130,136]