import sys

import pandas as pd
import scipy.cluster.hierarchy as sch
import scipy.spatial.distance as ssd


def process_hierarchy(inf, h, method):
    df = pd.read_csv(inf, header=0, index_col=0)
    df = df.fillna(0)
    strains = df.index
    df = 1 - (df / 100)  # convert percent identity to a distance in [0, 1]
    # flatten the square matrix to a condensed distance vector
    df_v = ssd.squareform(df, force='tovector', checks=False)
    if method == 'single':
        li = sch.single(df_v)
    elif method == 'complete':
        li = sch.complete(df_v)
    elif method == 'average':
        li = sch.average(df_v)
    elif method == 'weighted':
        li = sch.weighted(df_v)
    else:
        print('\nERROR: Please enter a valid clustering method\n')
        sys.exit()
    # cut the dendrogram at the given height (e.g. percent ID as a decimal) to cluster OFUs
    hclus = sch.cut_tree(li, height=h)
    hclus = pd.DataFrame(hclus, index=strains)
    # cut_tree names the first cluster "0"; bump all IDs by 1
    hclus.iloc[:, 0] += 1
    return hclus
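A minimal usage sketch (hypothetical data): the file name 'ani_matrix.csv' and the values below are made up; the call cuts the average-linkage dendrogram at distance 0.1, i.e. 90% identity.

if __name__ == '__main__':
    sim = pd.DataFrame(
        [[100.0, 95.0, 60.0], [95.0, 100.0, 62.0], [60.0, 62.0, 100.0]],
        index=['a', 'b', 'c'], columns=['a', 'b', 'c'])
    sim.to_csv('ani_matrix.csv')  # hypothetical input file
    print(process_hierarchy('ani_matrix.csv', 0.1, 'average'))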
Example #2
import numpy as np
from scipy.cluster import hierarchy
from scipy.spatial.distance import pdist
from sklearn.cluster import KMeans


def get_centroids(train_pack):
    # unpack the training bundle: samples, threshold, and algorithm name
    x_train = train_pack[0]
    distance_threshold = train_pack[1]
    clustering_type = train_pack[2]

    if clustering_type == 'Agglomerative':
        dist_mat = pdist(x_train, metric='euclidean')
        Z = hierarchy.weighted(dist_mat)
        dn = hierarchy.dendrogram(Z)  # draws the dendrogram for inspection (needs matplotlib)
        labels = hierarchy.fcluster(Z, t=distance_threshold, criterion='distance')

        # accumulate the mean of each cluster's members
        n_clusters = max(labels)
        total_number = np.zeros(n_clusters, dtype=int)
        centroids = np.zeros((n_clusters, len(x_train[0])))
        for j in range(len(x_train)):
            centroids[labels[j] - 1] += x_train[j]
            total_number[labels[j] - 1] += 1
        centroids = centroids / total_number[:, np.newaxis]

    elif clustering_type == 'Agg_Var':
        if len(x_train) > 0:
            # greedy online clustering: assign each point to the nearest
            # centroid within distance_threshold, else start a new cluster
            centroids = [x_train[0]]
            total_num = [1]
            for i in range(1, len(x_train)):
                distances = []
                indices = []
                for j in range(len(centroids)):
                    # find_distance and distance_metric are assumed to be
                    # defined elsewhere in the module
                    d = find_distance(x_train[i], centroids[j], distance_metric)
                    if d < distance_threshold:
                        distances.append(d)
                        indices.append(j)
                if len(distances) == 0:
                    centroids.append(x_train[i])
                    total_num.append(1)
                else:
                    # running-mean update of the closest centroid
                    min_d = np.argmin(distances)
                    k = indices[min_d]
                    centroids[k] = np.add(np.multiply(total_num[k], centroids[k]), x_train[i])
                    total_num[k] += 1
                    centroids[k] = np.divide(centroids[k], total_num[k])
        else:
            centroids = []

    elif clustering_type == 'k_means':
        # distance_threshold is reused here as the number of clusters
        kmeans = KMeans(n_clusters=distance_threshold, random_state=0).fit(x_train)
        centroids = kmeans.cluster_centers_
    elif clustering_type == 'NCM':
        # nearest-class-mean: a single centroid at the global average
        centroids = [np.average(x_train, axis=0)]
    return centroids
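A minimal usage sketch (hypothetical data): 100 random 2-D points clustered agglomeratively with the dendrogram cut at distance 0.5; the tuple is just the (data, threshold, algorithm) triple the function expects.

if __name__ == '__main__':
    rng = np.random.RandomState(0)
    x = rng.rand(100, 2)
    cents = get_centroids((x, 0.5, 'Agglomerative'))
    print(len(cents), 'centroids')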
Example #3
import numpy as np
import pandas as pd
import scipy.spatial.distance as ssd
from scipy.cluster.hierarchy import weighted
from skbio.tree import TreeNode


def write_tree():
    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    # square the distances, then condense the matrix before linkage (the
    # original passed the square matrix straight to weighted(), which scipy
    # would misread as raw observations)
    triu = np.square(dmx.values)
    dist_array = ssd.squareform(triu, checks=False)
    hclust = weighted(dist_array)
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = str(t).replace("'", "")
    with open("bsr_matrix.tree", "w") as outfile:
        outfile.write(nw)
Example #4
import pickle

from scipy.cluster import hierarchy


def detectHierarchical(G, numClusters, sites, unipartite, fast):
    # getStartingWeights is assumed to be defined elsewhere in the module
    numNodes = G.number_of_nodes()

    # load cached pairwise weights if available, otherwise compute and cache them
    if unipartite:
        if fast:
            W = pickle.load(open("weightsUnipartite.p", "rb"))
        else:
            W = getStartingWeights(G, numNodes, True)
            pickle.dump(W, open("weightsUnipartite.p", "wb"))
    else:
        if fast:
            W = pickle.load(open("weightsBipartite.p", "rb"))
        else:
            W = getStartingWeights(G, numNodes, False)
            pickle.dump(W, open("weightsBipartite.p", "wb"))

    # WPGMA linkage on the weights, then cut into numClusters flat clusters
    Z = hierarchy.weighted(W)
    membership = list(hierarchy.fcluster(Z, numClusters, 'maxclust'))

    # keep only the nodes of interest
    clusters = {}
    for i in range(len(membership)):
        if i in sites:
            clusters[i] = membership[i]
    return clusters
Example #5
def __apply_cluster_alg(cluster_data=[], alg="kmean", prior_cluster_num=2, t=0.155):
    """Cluster the rows of cluster_data with the chosen algorithm."""
    if alg == "kmean":
        from scipy.cluster.vq import whiten, kmeans, vq

        cluster_data = whiten(cluster_data)
        centroids, _ = kmeans(cluster_data, prior_cluster_num, iter=250)
        idx, dist = vq(cluster_data, centroids)
        return idx, prior_cluster_num
    elif alg == "spec":
        from sklearn import cluster
        from sklearn.preprocessing import StandardScaler

        X = StandardScaler().fit_transform(cluster_data)
        spectral = cluster.SpectralClustering(n_clusters=prior_cluster_num, eigen_solver="arpack")
        spectral.fit(X)
        idx = spectral.labels_.astype(int)
        return idx, prior_cluster_num
    else:
        # hierarchical clustering:
        # http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html
        import scipy.cluster.hierarchy as hcluster
        # needs a condensed distance matrix:
        # http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.spatial.distance.pdist.html
        import scipy.spatial.distance as dist
        import numpy as N

        distmat = dist.pdist(cluster_data, "minkowski")
        if alg == "hflat":
            link = hcluster.linkage(distmat)  # defaults to single linkage
        elif alg == "hcomp":
            link = hcluster.complete(distmat)
        elif alg == "hweight":
            link = hcluster.weighted(distmat)
        elif alg == "havg":
            link = hcluster.average(distmat)
        else:
            raise ValueError("unknown hierarchical method: %s" % alg)
        idx = hcluster.fcluster(link, t=t, criterion="distance")

        post_cluster_num = len(N.unique(idx))
        print("# of channels established:", post_cluster_num)
        assert post_cluster_num < 64, "number of clusters too large to be biologically meaningful"
        return idx, post_cluster_num
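A minimal usage sketch (hypothetical data): 50 random 3-D points through the weighted hierarchical branch, cut at distance 0.5.

if __name__ == "__main__":
    import numpy as np
    idx, n = __apply_cluster_alg(np.random.rand(50, 3), alg="hweight", t=0.5)
    print(n, "clusters")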
Example #6
import sys

import numpy as np
import pandas as pd
import scipy.spatial.distance as ssd
from scipy.cluster.hierarchy import average, weighted
from skbio.tree import TreeNode


def write_tree(cluster_method):
    dmx = pd.read_csv("distance_matrix", index_col=0, sep="\t")
    ids = dmx.index.tolist()
    # square the distances, then condense the matrix for linkage
    triu = np.square(dmx.values)
    distArray = ssd.squareform(triu)
    if cluster_method == "average":
        hclust = average(distArray)
    elif cluster_method == "weighted":
        hclust = weighted(distArray)
    else:
        print("invalid cluster method chosen")
        sys.exit()
    t = TreeNode.from_linkage_matrix(hclust, ids)
    nw = str(t).replace("'", "")
    with open("bsr_matrix.tree", "w") as outfile:
        outfile.write(nw)
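A usage sketch: the function expects a tab-separated file named distance_matrix in the working directory and writes the Newick tree to bsr_matrix.tree.

write_tree("weighted")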
Example #7
    def get_tree(self):
        from ete3 import Tree
        from ete3.coretype.tree import TreeError
        import numpy as np
        import scipy.spatial.distance as ssd
        from skbio.tree import TreeNode
        from scipy.cluster.hierarchy import weighted

        ids = self.dmx.index.tolist()
        # condense the square distance matrix before linkage (the original
        # passed np.triu output straight to weighted(), which scipy would
        # misread as raw observations)
        dist_array = ssd.squareform(self.dmx.values, checks=False)
        hclust = weighted(dist_array)
        t = TreeNode.from_linkage_matrix(hclust, ids)
        nw = str(t).replace("'", "")
        self.tree = Tree(nw)
        try:
            # midpoint root the tree
            self.tree.set_outgroup(self.tree.get_midpoint_outgroup())
        except TreeError:
            self.log.error("Unable to midpoint root tree")
        self.tree.write(outfile=self.nw_path)
Example #9
    def get_tree(self):
        # TODO: use a decorator instead of this if statement
        if self.tree_complete is False:
            from ete3 import Tree
            from ete3.coretype.tree import TreeError
            import numpy as np
            import scipy.spatial.distance as ssd
            from skbio.tree import TreeNode
            from scipy.cluster.hierarchy import weighted

            ids = ['{}.fasta'.format(i) for i in self.dmx.index.tolist()]
            # condense the square distance matrix before linkage (the original
            # passed np.triu output straight to weighted(), which scipy would
            # misread as raw observations)
            dist_array = ssd.squareform(self.dmx.values, checks=False)
            hclust = weighted(dist_array)
            t = TreeNode.from_linkage_matrix(hclust, ids)
            nw = str(t).replace("'", "")
            self.tree = Tree(nw)
            # midpoint root the tree
            try:
                self.tree.set_outgroup(self.tree.get_midpoint_outgroup())
            except TreeError:
                self.log.exception("Unable to midpoint root tree")
            self.tree.write(outfile=self.nw_path)
Example #10
    def CalculateClusterTree(self):
        # single, average, complete, weighted, and to_tree are assumed to be
        # imported from scipy.cluster.hierarchy at module level
        fullMatrix = self.GenerateFullMatrix(self.results)
        dissMatrix = []
        labels = list(fullMatrix.keys())
        # flatten the upper triangle into a condensed dissimilarity list
        for i in range(0, len(labels)):
            sampleNameI = labels[i]
            for j in range(i + 1, len(labels)):
                sampleNameJ = labels[j]
                dissMatrix.append(fullMatrix[sampleNameI][sampleNameJ])

        # calculate hierarchical cluster tree (one radio button is always set)
        if self.radioSingleLinkage.GetValue():
            linkageMatrix = single(dissMatrix)
        elif self.radioUPGMA.GetValue():
            linkageMatrix = average(dissMatrix)
        elif self.radioCompleteLinkage.GetValue():
            linkageMatrix = complete(dissMatrix)
        elif self.radioWeighted.GetValue():
            linkageMatrix = weighted(dissMatrix)

        root = to_tree(linkageMatrix)

        # create Newick string
        return self.CreateNewickString(root, labels) + ';'
Example #12
def testSelfRecruitment(pathWork, pathRai, numSpecies, sizeChop, numDB, numSeq):
    import os
    import random
    import numpy as np
    import scipy.cluster.vq as scv
    import scipy.cluster.hierarchy as sch
    import scipy.spatial.distance as ssd

    # ensureDir, chopRandom, rai2Numpy, and newWhiten are assumed to be
    # defined elsewhere in this module
    if pathWork[-1] != '/':
        pathWork = pathWork + '/'
    if pathRai[-1] != '/':
        pathRai = pathRai + '/'

    allList = os.listdir(pathWork)
    genomeList = []
    for file in allList:
        if file[-4:] == ".fna":
            genomeList.append(file)

    subset = random.sample(genomeList, numSpecies)

    for file in subset:
        # make sequences to be matched
        ensureDir(pathWork + "Sequences/")
        chopRandom(pathWork + file, pathWork + "Sequences/", sizeChop, sizeChop // 10, numSeq)
        # make sequences to be matched against
        ensureDir(pathWork + "DataBase/")
        chopRandom(pathWork + file, pathWork + "DataBase/", sizeChop, sizeChop // 10, numDB)

    # Make RAI databases
    os.system("{!s}raiphy -e .fna -m 2 -I {!s}Sequences/ -d {!s}seqs".format(pathRai, pathWork, pathWork))
    os.system("{!s}raiphy -e .fna -m 2 -I {!s}DataBase/ -d {!s}db".format(pathRai, pathWork, pathWork))

    # Data sets for further evaluation
    namesSq, RaiSq = rai2Numpy(pathWork + "seqs")
    namesDb, RaiDb = rai2Numpy(pathWork + "db")

    RaiAll = np.concatenate([RaiSq, RaiDb])

    # the original wrote to an undefined 'pathIn'; pathWork is the evident intent
    csvout = open("{!s}seqs.csv".format(pathWork), 'w')
    for r in RaiAll:
        csvout.write(",".join(str(x) for x in r) + "\n")
    csvout.close()

    # Run RAIphy
    os.system("{!s}raiphy -e .fna -m 0 -I {!s}Sequences/ -d {!s}db -o {!s}output".format(pathRai, pathWork, pathWork, pathWork))
    # Evaluate RAIphy results
    raiDict = {}
    for k in namesDb:
        raiDict[k] = [k]
    raiRes = open("{!s}output".format(pathWork), 'r')
    raiRes.readline()
    buf = raiRes.readline().rstrip()
    while buf:
        key = buf[3:]
        buf = raiRes.readline().rstrip()
        sq = buf[1:]
        raiDict[key].append(sq)
        buf = raiRes.readline().rstrip()
    raiRes.close()
    raiList = []
    for k in raiDict:
        raiList.append(raiDict[k])

    print("RAIphy cluster list: {!s}".format(raiList))

    os.system("rm -r {!s}Sequences/".format(pathWork))
    os.system("rm -r {!s}seqs".format(pathWork))
    os.system("rm -r {!s}DataBase/".format(pathWork))
    os.system("rm -r {!s}db".format(pathWork))
    os.system("rm -r {!s}output".format(pathWork))

    print("Files removed")

    # K-means
    whitened = newWhiten(RaiAll)
    centroidsNoSeeds, _ = scv.kmeans(whitened, numSpecies)
    idsNoSeeds, _ = scv.vq(whitened, centroidsNoSeeds)
    # TODO: k-means with initial seeds; may require a custom implementation.

    # Compute the pairwise distance matrix for the hierarchy-based clusterings
    D = np.zeros((len(RaiAll), len(RaiAll)), dtype=float)
    for i in range(len(RaiAll)):
        for j in range(i):
            dist = np.linalg.norm(RaiAll[i] - RaiAll[j])
            D[i][j] = dist
            D[j][i] = dist

    # hierarchy-based clusterings: condense D before linkage (the original
    # passed the square matrix directly, which scipy misreads as observations)
    WeightedLink = sch.weighted(ssd.squareform(D))
    clustWeightedLink = sch.fcluster(WeightedLink, numSpecies, criterion='maxclust')
    hist, bins = np.histogram(clustWeightedLink, bins=numSpecies)
    print(hist)
Example #13
import scipy.spatial.distance as ssd
from scipy.cluster.hierarchy import (linkage, single, complete, average,
                                     weighted, dendrogram)
import matplotlib.pyplot as plt

# 'matrix' and 'labels' are assumed to be defined earlier; the snippet is
# truncated in the source
distArray = ssd.squareform(matrix)
print(distArray)

# compare the linkage methods on the same condensed distance array
z = linkage(distArray)  # defaults: single linkage, euclidean
print(z)
x = single(distArray)
print(x)
y = complete(distArray)
print(y)
a = average(distArray)
print(a)
b = weighted(distArray)
print(b)
"""
c = centroid(distArray)
print(c)
m = median(distArray)
print(m)
w = ward(distArray)
print(w)
"""

# the original called dendrogram(w, ...), but w is only defined inside the
# commented-out block above; plot the weighted linkage instead
d2 = dendrogram(b, labels=labels)
plt.figure(1)
plt.title(
# (snippet truncated here in the source)
Example #14
    # fragment: 'data', 'niks', 'worker', 'start', 'pdist', and 'hierarchy'
    # are assumed to be defined earlier; the snippet is truncated in the source
    euclid_data = pdist(data, 'euclidean')
    logging.info("Time: %s" % (time.time() - start))

    logging.info("Clustering start")
    start = time.time()
    Z = hierarchy.complete(euclid_data)
    worker.hierarchy_draw(Z, niks, 'study_complete_euclid', 0.4)
    logging.info("Time complete: %s" % (time.time() - start))

    start = time.time()
    Z = hierarchy.average(euclid_data)
    worker.hierarchy_draw(Z, niks, 'study_average_euclid', 0.25)
    logging.info("Time average: %s" % (time.time() - start))

    start = time.time()
    Z = hierarchy.weighted(euclid_data)
    worker.hierarchy_draw(Z, niks, 'study_weighted_euclid', 0.25)
    logging.info("Time weighted: %s" % (time.time() - start))

    logging.info("\nSecondStep")

    logging.info("Distance other")
    start = time.time()
    sqeuclid_data = pdist(data, 'sqeuclidean')
    cityblock_data = pdist(data, 'cityblock')
    logging.info("Time: %s" % (time.time() - start))

    logging.info("Clustering start")