def check_maxRstat_one_cluster_linkage(self, i):
    # Tests maxRstat(Z, R, i) on linkage with one cluster.
    #
    # `i` is the column of the inconsistency matrix R that the statistic is
    # taken from.  It is forwarded to both maxRstat and the reference helper
    # so that every column requested by the caller is really exercised; a
    # hard-coded column index would silently ignore the parameter.
    Z = np.asarray([[0, 1, 0.3, 4]], dtype=np.double)
    R = np.asarray([[0, 0, 0, 0.3]], dtype=np.double)
    MD = maxRstat(Z, R, i)
    expectedMD = calculate_maximum_inconsistencies(Z, R, i)
    assert_allclose(MD, expectedMD, atol=1e-15)
def check_maxRstat_Q_linkage(self, method, i):
    # Tests maxRstat(Z, R, i) on the Q data set
    #
    # `method` is the linkage method used to build Z; `i` is the column of
    # the inconsistency matrix R that the statistic is taken from.  `i` is
    # passed to both maxRstat and the reference helper so the column asked
    # for by the caller is the one actually compared.
    X = hierarchy_test_data.Q_X
    Z = linkage(X, method)
    R = inconsistent(Z)
    MD = maxRstat(Z, R, i)
    expectedMD = calculate_maximum_inconsistencies(Z, R, i)
    assert_allclose(MD, expectedMD, atol=1e-15)
def check_maxRstat_Q_linkage(self, method, i):
    # Tests maxRstat(Z, R, i) on the Q data set
    #
    # `method` is the linkage method used to build Z; `i` is the column of
    # the inconsistency matrix R that the statistic is taken from.  `i` is
    # passed to both maxRstat and the reference helper so the column asked
    # for by the caller is the one actually compared.
    X = eo['Q-X']
    # The linkage is built straight from the observations; the condensed
    # distance vector that used to be computed here was never used.
    Z = linkage(X, method)
    R = inconsistent(Z)
    MD = maxRstat(Z, R, i)
    expectedMD = calculate_maximum_inconsistencies(Z, R, i)
    assert_allclose(MD, expectedMD, atol=1e-15)
def process(self, **kwargs) -> Dict[str, Dict[np.ndarray, str]]:
    """
    Compute the maximum of one inconsistency statistic over a linkage.

    Reads the linkage matrix from ``kwargs['Linkage']`` and the inconsistency
    matrix from ``kwargs['IncM']['R']``, looks up which statistic is selected
    in ``self.ctrls['Stat']`` and passes the matching column index to
    ``hierarchy.maxRstat``.

    :param kwargs: must contain ``'Linkage'`` and ``'IncM'`` (a mapping with
                   an ``'R'`` entry)
    :return: ``{'MaxStat': {'MR': <maxRstat result>, 'stat': <selected text>}}``
    :raises ValueError: if the selected text is not one of ``'mean'``,
                        ``'stdev'``, ``'num_links'`` or ``'inc_coef'``

    NOTE(review): the declared return annotation does not describe the
    dictionary built below (its inner keys are strings); it is left as-is so
    the signature stays unchanged.
    """
    Z = kwargs['Linkage']
    IncM = kwargs['IncM']
    R = IncM['R']

    stat = self.ctrls['Stat'].currentText()

    # Selected text -> column index handed to maxRstat.
    stat_columns = {'mean': 0, 'stdev': 1, 'num_links': 2, 'inc_coef': 3}
    if stat not in stat_columns:
        # Fail with a clear message instead of hitting an unbound local below.
        raise ValueError(
            "Unknown statistic %r; expected one of %s"
            % (stat, ', '.join(sorted(stat_columns)))
        )
    i = stat_columns[stat]

    MR = hierarchy.maxRstat(Z, R, i)

    return {'MaxStat': {'MR': MR, 'stat': stat}}
def silhouette_score(dendroMatrix, distance_metric, linkage_method, labels):
    """
    Generate the silhouette score for a hierarchical clustering of the files.

    The clustering criterion and the threshold are read from the submitted
    form (``request.form['criterion']`` and ``request.form['threshold']``).

    Args:
        dendroMatrix: list, occurrence of words in different files
        distance_metric: string, distance metric used for the pairwise distances
        linkage_method: string, linkage method used to build the hierarchy
        labels: list, file names

    Returns:
        silhouetteScore: string, message containing the silhouette score
        silhouetteAnnotation: string, explanation shown with the score
        score: float, silhouette score (a string message when it is invalid)
        inconsistentMax: float, upper bound of the threshold for the
                         'inconsistent' criterion
        maxclustMax: integer, upper bound of the threshold for the 'maxclust'
                     criterion
        distanceMax: float, upper bound of the threshold for the 'distance'
                     criterion
        distanceMin: float, lower bound of the threshold for the 'distance'
                     criterion
        monocritMax: float, upper bound of the threshold for the 'monocrit'
                     criterion
        monocritMin: float, lower bound of the threshold for the 'monocrit'
                     criterion
        threshold: float/integer, threshold (t) actually used; a blank or
                   out-of-range entry is replaced by the criterion's default

        All bounds and the threshold are the string 'N/A' when the score is
        invalid (too few files, or every file ends up in one cluster).

    Raises:
        ValueError: if the submitted criterion is not 'maxclust', 'distance',
                    'inconsistent' or 'monocrit'.
    """

    def _cut_to_two_decimals(value):
        # Cut str(value) to the length of its '%.2f' rendering and parse it
        # back, so extra digits are dropped rather than rounded.
        width = len('%.*f' % (2, value))
        return float(str(value)[:width])

    activeFiles = len(labels) - 1

    # Guard: the silhouette score needs more than 2 files to be meaningful.
    if activeFiles <= 2:
        silhouetteScore = "Silhouette Score: invalid for less than or equal to 2 files."
        silhouetteAnnotation = ""
        score = 'invalid for less than or equal to 2 files.'
        threshold = inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = 'N/A'
        return silhouetteScore, silhouetteAnnotation, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold

    Y = metrics.pairwise.pairwise_distances(dendroMatrix, metric=distance_metric)
    # NOTE(review): Y is a square distance matrix, but it is handed to
    # linkage() as-is; confirm this matches how the dendrogram itself is built
    # before changing it.
    Z = hierarchy.linkage(Y, method=linkage_method)

    monocrit = None

    # 'maxclust' range
    maxclustMax = len(labels) - 1

    # 'inconsistent' range
    R = hierarchy.inconsistent(Z, 2)
    inconsistentMax = _cut_to_two_decimals(R[-1][-1])

    # 'distance' range
    d = hierarchy.cophenet(Z)
    distanceMax = _cut_to_two_decimals(d.max())
    distanceMin = _cut_to_two_decimals(d.min() + 0.01)

    # 'monocrit' range
    MR = hierarchy.maxRstat(Z, R, 0)
    monocritMax = _cut_to_two_decimals(MR.max())
    monocritMin = _cut_to_two_decimals(MR.min() + 0.01)

    # A blank field stays '' so each criterion below can apply its default.
    threshold = request.form['threshold']
    if threshold != '':
        threshold = float(threshold)

    criterion = request.form['criterion']
    if criterion == 'maxclust':
        if (threshold == '') or (threshold > maxclustMax):
            threshold = len(labels) - 1
        else:
            threshold = round(float(threshold))
    elif criterion == 'distance':
        if (threshold == '') or (threshold > distanceMax) or (threshold < distanceMin):
            threshold = distanceMax
    elif criterion == 'inconsistent':
        if (threshold == '') or (threshold > inconsistentMax):
            threshold = inconsistentMax
    elif criterion == 'monocrit':
        monocrit = MR
        if (threshold == '') or (threshold > monocritMax) or (threshold < monocritMin):
            threshold = monocritMax
    else:
        # Previously this fell through and failed later on an unbound name.
        raise ValueError("Unsupported clustering criterion: %r" % (criterion,))

    scoreLabel = hierarchy.fcluster(Z, t=threshold, criterion=criterion, monocrit=monocrit)

    if len(set(scoreLabel)) <= 1:
        # All the files were put into a single cluster.
        silhouetteScore = "Silhouette Score: invalid for only 1 cluster."
        silhouetteAnnotation = "because your file are too similar to each other, program classify all of them in the same cluster"
        score = 'invalid for only 1 cluster'
        inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = threshold = 'N/A'
    else:
        score = metrics.silhouette_score(Y, labels=scoreLabel, metric='precomputed')
        score = round(score, constants.ROUND_DIGIT)
        # Unicode escape instead of decoding a byte string: same character,
        # and it also works where str has no .decode().
        inequality = u'\u2264'
        silhouetteScore = "Silhouette Score: " + str(score) + "\n(-1 " + inequality + " Silhouette Score " + inequality + " 1)"
        silhouetteAnnotation = (
            "The best value is 1 and the worst value is -1. "
            "Values near 0 indicate overlapping clusters. "
            "Negative values generally indicate that a sample has been assigned to the wrong cluster, "
            "as a different cluster is more similar."
        )

    return silhouetteScore, silhouetteAnnotation, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold
def silhouette_score(dendroMatrix, distance_metric, linkage_method, labels):
    """
    Generate the silhouette score for a hierarchical clustering of the files.

    The clustering criterion and the threshold are read from the submitted
    form (``request.form['criterion']`` and ``request.form['threshold']``).

    Args:
        dendroMatrix: list, occurrence of words in different files
        distance_metric: string, distance metric used for the pairwise distances
        linkage_method: string, linkage method used to build the hierarchy
        labels: list, file names

    Returns:
        silhouetteScore: string, message containing the silhouette score
        silhouetteAnnotation: string, explanation shown with the score
        score: float, silhouette score (a string message when it is invalid)
        inconsistentMax: float, upper bound of the threshold for the
                         'inconsistent' criterion
        maxclustMax: integer, upper bound of the threshold for the 'maxclust'
                     criterion
        distanceMax: float, upper bound of the threshold for the 'distance'
                     criterion
        distanceMin: float, lower bound of the threshold for the 'distance'
                     criterion
        monocritMax: float, upper bound of the threshold for the 'monocrit'
                     criterion
        monocritMin: float, lower bound of the threshold for the 'monocrit'
                     criterion
        threshold: float/integer, threshold (t) actually used; a blank or
                   out-of-range entry is replaced by the criterion's default

        All bounds and the threshold are the string 'N/A' when the score is
        invalid (too few files, or every file ends up in one cluster).

    Raises:
        ValueError: if the submitted criterion is not 'maxclust', 'distance',
                    'inconsistent' or 'monocrit'.
    """

    def _cut_to_two_decimals(value):
        # Cut str(value) to the length of its '%.2f' rendering and parse it
        # back, so extra digits are dropped rather than rounded.
        width = len('%.*f' % (2, value))
        return float(str(value)[:width])

    activeFiles = len(labels) - 1

    # Guard: the silhouette score needs more than 2 files to be meaningful.
    if activeFiles <= 2:
        silhouetteScore = "Silhouette Score: invalid for less than or equal to 2 files."
        silhouetteAnnotation = ""
        score = 'invalid for less than or equal to 2 files.'
        threshold = inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = 'N/A'
        return silhouetteScore, silhouetteAnnotation, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold

    Y = metrics.pairwise.pairwise_distances(dendroMatrix, metric=distance_metric)
    # NOTE(review): Y is a square distance matrix, but it is handed to
    # linkage() as-is; confirm this matches how the dendrogram itself is built
    # before changing it.
    Z = hierarchy.linkage(Y, method=linkage_method)

    monocrit = None

    # 'maxclust' range
    maxclustMax = len(labels) - 1

    # 'inconsistent' range
    R = hierarchy.inconsistent(Z, 2)
    inconsistentMax = _cut_to_two_decimals(R[-1][-1])

    # 'distance' range
    d = hierarchy.cophenet(Z)
    distanceMax = _cut_to_two_decimals(d.max())
    distanceMin = _cut_to_two_decimals(d.min() + 0.01)

    # 'monocrit' range
    MR = hierarchy.maxRstat(Z, R, 0)
    monocritMax = _cut_to_two_decimals(MR.max())
    monocritMin = _cut_to_two_decimals(MR.min() + 0.01)

    # A blank field stays '' so each criterion below can apply its default.
    threshold = request.form['threshold']
    if threshold != '':
        threshold = float(threshold)

    criterion = request.form['criterion']
    if criterion == 'maxclust':
        if (threshold == '') or (threshold > maxclustMax):
            threshold = len(labels) - 1
        else:
            threshold = round(float(threshold))
    elif criterion == 'distance':
        if (threshold == '') or (threshold > distanceMax) or (threshold < distanceMin):
            threshold = distanceMax
    elif criterion == 'inconsistent':
        if (threshold == '') or (threshold > inconsistentMax):
            threshold = inconsistentMax
    elif criterion == 'monocrit':
        monocrit = MR
        if (threshold == '') or (threshold > monocritMax) or (threshold < monocritMin):
            threshold = monocritMax
    else:
        # Previously this fell through and failed later on an unbound name.
        raise ValueError("Unsupported clustering criterion: %r" % (criterion,))

    scoreLabel = hierarchy.fcluster(Z, t=threshold, criterion=criterion, monocrit=monocrit)

    if len(set(scoreLabel)) <= 1:
        # All the files were put into a single cluster.
        silhouetteScore = "Silhouette Score: invalid for only 1 cluster."
        silhouetteAnnotation = "because your file are too similar to each other, program classify all of them in the same cluster"
        score = 'invalid for only 1 cluster'
        inconsistentMax = maxclustMax = distanceMax = distanceMin = monocritMax = monocritMin = threshold = 'N/A'
    else:
        score = metrics.silhouette_score(Y, labels=scoreLabel, metric='precomputed')
        score = round(score, constants.ROUND_DIGIT)
        # Unicode escape instead of decoding a byte string: same character,
        # and it also works where str has no .decode().
        inequality = u'\u2264'
        silhouetteScore = "Silhouette Score: " + str(score) + "\n(-1 " + inequality + " Silhouette Score " + inequality + " 1)"
        silhouetteAnnotation = (
            "The best value is 1 and the worst value is -1. "
            "Values near 0 indicate overlapping clusters. "
            "Negative values generally indicate that a sample has been assigned to the wrong cluster, "
            "as a different cluster is more similar."
        )

    return silhouetteScore, silhouetteAnnotation, score, inconsistentMax, maxclustMax, distanceMax, distanceMin, monocritMax, monocritMin, threshold
def get_clusters_Hierarchy_clustering(x, hier_dict):
    """
    Build a hierarchical linkage of ``x`` and cut it into flat clusters.

    Every option is read from ``hier_dict``; missing keys use these defaults:

    - 'L_metric' ('euclidean'): distance metric passed to ``linkage``.
    - 'L_method' ('single'): linkage method.  For 'centroid', 'median' and
      'ward' a non-Euclidean metric is replaced by 'euclidean' and a note is
      printed.
    - 'optimal_ordering' (False): forwarded to ``linkage``.
    - 'criterionH' ('inconsistent'): criterion passed to ``fcluster``.
    - 'depth' (2): depth used for the inconsistency calculation.
    - 't': threshold passed to ``fcluster``; defaults to 20 for 'maxclust'
      and 'maxclust_monocrit', otherwise to 0.9.
    - 'R' (None): inconsistency matrix; computed from the linkage when it is
      needed and not supplied.

    Returns whatever ``fcluster`` returns for the chosen criterion.
    """
    # Linkage methods for which the metric is forced to Euclidean.
    euclidean_only_methods = ('centroid', 'median', 'ward')
    # Column of R used as the statistic for the 'monocrit' criterion.
    stat_column = 3

    metric = hier_dict.get('L_metric', 'euclidean')
    method = hier_dict.get('L_method', 'single')
    if method in euclidean_only_methods and metric != 'euclidean':
        metric = 'euclidean'
        print('\n')
        print('*************Note:**************')
        print('Method ' + str(method) + ' requires the distance metric to be Euclidean')

    Z = linkage(
        x,
        method=method,
        metric=metric,
        optimal_ordering=hier_dict.get('optimal_ordering', False),
    )

    criterion = hier_dict.get('criterionH', 'inconsistent')
    depth = hier_dict.get('depth', 2)

    if 't' in hier_dict:
        t = hier_dict['t']
    elif criterion in ('maxclust_monocrit', 'maxclust'):
        # For these criteria t is the maximum number of clusters requested.
        t = 20
    else:
        t = 0.9

    R = hier_dict.get('R')
    if criterion in ('inconsistent', 'maxclust_monocrit'):
        # Use the supplied inconsistency matrix, or compute it when absent.
        R = inconsistent(Z, d=depth) if R is None else np.asarray(R, order='c')

    if criterion == 'monocrit':
        if R is None:
            R = inconsistent(Z, d=depth)
        return fcluster(Z, criterion='monocrit', t=t,
                        monocrit=maxRstat(Z, R, stat_column))

    if criterion == 'maxclust_monocrit':
        return fcluster(Z, criterion='maxclust_monocrit', t=t,
                        monocrit=maxinconsts(Z, R))

    return fcluster(Z, criterion=criterion, depth=depth, R=R, t=t)
def _cluster_by_monocrit(linkage_table: numpy.ndarray, cutoff: float, inconsistent: pandas.DataFrame) -> numpy.ndarray: MR = hierarchy.maxRstat(linkage_table, inconsistent.values, 1) clusters = hierarchy.fcluster(linkage_table, t = cutoff, criterion = 'monocrit', monocrit = MR) return clusters