def cluster_reps(reps, threshold=1.0, plot=1):
    """Cluster the repetitions in `reps` by their mutual chi-squared distance.

    The condensed chi-squared distance matrix of `reps` is fed to
    complete-linkage hierarchical clustering, and the resulting linkage
    is split into clusters with `filter_with_linkage`.

    Parameters
    ----------
    reps
        Stack of repetitions, indexed along the first axis.
    threshold
        Chi-squared threshold used to discriminate between clusters.
    plot
        If true, plot the clustering result with `plot_clustering`.

    Returns
    -------
    tuple
        ``(indices, cdm, links)`` where `indices` is the first entry of
        the cluster list (documented as the largest cluster found),
        `cdm` is the condensed distance matrix and `links` is the
        cluster linkage.
    """
    distances = chi2cdm(reps)
    linkage = hc.linkage(distances, method='complete')
    clusters = filter_with_linkage(linkage, threshold)
    print("Clusters: %s" % str(clusters))

    # Nothing to draw: hand back the first cluster right away.
    if not plot:
        return (clusters[0], distances, linkage)

    # Curves for the plot: first repetition, mean of everything, and the
    # mean over the selected cluster only.
    first_rep = reps[0, ...]
    mean_all = mean_stack(reps)
    selected = clusters[0]
    mean_selected = mean_stack(reps[selected, ...])
    plot_clustering(mean_selected, first_rep, mean_all, selected,
                    distances, linkage, threshold)
    return (selected, distances, linkage)
def filter_outliers(reps, threshold=1.0, plot=1):
    """Drop repetitions whose mutual chi-squared exceeds `threshold`.

    The condensed chi-squared distance matrix of `reps` is expanded to
    square form and passed to `filter_distmat`, which selects the
    repetitions to keep.

    The removal is documented to proceed iteratively: at each step the
    repetition responsible for the largest number of above-threshold
    entries (outliers) in the chi-squared distance matrix is removed.
    When two repetitions cause the same number of outliers, the one
    with the highest chi-squared distance to a non-outlier point of the
    distance matrix is removed.

    Parameters
    ----------
    reps
        Stack of repetitions, indexed along the first axis.
    threshold
        Chi-squared threshold above which a distance counts as an outlier.
    plot
        If true, plot the filtering result with `plot_outliers`.

    Returns
    -------
    tuple
        ``(incinds, cdm)``: the included indices and the condensed
        distance matrix.
    """
    condensed = chi2cdm(reps)
    kept = filter_distmat(squareform(condensed), threshold)

    # Without plotting there is nothing more to do.
    if not plot:
        return kept, condensed

    # Curves for the plot: first repetition, mean of everything, and the
    # mean over the retained repetitions only.
    first_rep = reps[0, ...]
    mean_all = mean_stack(reps)
    mean_kept = mean_stack(reps[kept, ...])
    plot_outliers(mean_kept, first_rep, mean_all, kept, condensed, threshold)
    return kept, condensed