def maximize_Krzanowski_Lai_index(self):
    # Krzanowski-Lai index
    self.W = pl.zeros(len(self.Kvals))
    p = self._nfeat
    for j, K in enumerate(self.Kvals):
        if self.verbose:
            print(f"Running with K={K} clusters")
        self.clusters = AgglomerativeClustering(
            n_clusters=K,
            affinity="precomputed",
            linkage="average",
            connectivity=self.connectivity,
        )
        self.clusters.fit_predict(self._Affinity)
        # estimate WCSS for the samples
        self.W[j] = self.get_WCSS(K, self.clusters.labels_, self._distance_matr)
    # see eq. 3.1 of Krzanowski and Lai 1988, Biometrics
    DIFF = pl.array([
        self.Kvals[1:],
        (self.W[:-1] * self.Kvals[:-1]**(2 / p)) -
        (self.W[1:] * self.Kvals[1:]**(2 / p)),
    ])
    # the KL index is undefined for K=1; see eq. 3.2
    self.KL = pl.array(
        [self.Kvals[1:-1], pl.fabs(DIFF[1, :-1] / DIFF[1, 1:])])
    maxindex = self.KL[1, :].argmax()
    return pl.int_(self.KL[0, maxindex])
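# For reference, eqs. 3.1-3.2 of Krzanowski & Lai (1988) can be exercised in
# isolation. A minimal sketch, assuming a synthetic WCSS curve (the wcss
# values and p=2 below are illustrative, not from the real data):
import numpy as np

def kl_index(Kvals, W, p):
    """Krzanowski-Lai index: |DIFF(K)| / |DIFF(K+1)|, with
    DIFF(K) = W(K-1)*(K-1)**(2/p) - W(K)*K**(2/p)  (eq. 3.1)."""
    Kvals = np.asarray(Kvals, dtype=float)
    W = np.asarray(W, dtype=float)
    diff = W[:-1] * Kvals[:-1]**(2 / p) - W[1:] * Kvals[1:]**(2 / p)
    return Kvals[1:-1], np.abs(diff[:-1] / diff[1:])  # eq. 3.2

# toy, monotonically decreasing WCSS with an elbow at K=4
Ks, KL = kl_index(np.arange(1, 9),
                  [100., 60., 35., 20., 18., 16.5, 15.5, 15.], p=2)
print(Ks[KL.argmax()])  # -> 4.0, the K maximizing the KL index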
def meanvariable(month, variable):
    # month can be either 'Label' or 'Month'; variable can be any of the 3 variables
    listofmonth = list(df.apply(set)[month])  # all the unique months (either 12 or 227)
    averages = []  # empty list for the averages to go into
    for i in listofmonth:
        dd = df[df[month] == i]  # subset with only the data for one individual month
        # plots look the same using median or mean; no real outliers anyway
        averages.append(np.mean(dd[variable]))
    x = pl.int_(listofmonth)
    plt.scatter(x, averages)  # plot the mean against the month
    plt.xlabel('Months beginning July 2011')
    plt.ylabel(variable)
    plt.xlim(0, 22)
    plt.show()
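# A minimal usage sketch for meanvariable, assuming a pandas DataFrame df in
# scope; the column names below ('Month', 'Rainfall') are illustrative, not
# from the original dataset:
import numpy as np
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt

df = pd.DataFrame({
    'Month': np.tile(np.arange(1, 13), 10),  # hypothetical month labels
    'Rainfall': np.random.rand(120) * 50,    # hypothetical variable
})
meanvariable('Month', 'Rainfall')  # scatter of the mean Rainfall per month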
def minimize_davies_bouldin_score(self):
    self.DB = pl.zeros(len(self.Kvals))
    for j, K in enumerate(self.Kvals):
        if self.verbose:
            print(f"Running with K={K} clusters")
        clusters = AgglomerativeClustering(
            n_clusters=K,
            affinity="precomputed",
            linkage="average",
            connectivity=self.connectivity,
        )
        clusters.fit_predict(self._Affinity)
        self.DB[j] = davies_bouldin_score(X=self._X, labels=clusters.labels_)
    return pl.int_(self.Kvals[self.DB.argmin()])
def maximize_calinski_harabasz_score(self):
    self.CH = pl.zeros(len(self.Kvals))
    for j, K in enumerate(self.Kvals):
        if self.verbose:
            print(f"Running with K={K} clusters")
        clusters = AgglomerativeClustering(
            n_clusters=K,
            affinity="precomputed",
            linkage="average",
            connectivity=self.connectivity,
        )
        clusters.fit_predict(self._Affinity)
        self.CH[j] = calinski_harabasz_score(X=self._X, labels=clusters.labels_)
    return pl.int_(self.Kvals[self.CH.argmax()])
def maximize_silhouette_score(self):
    self.silhouette = pl.zeros(len(self.Kvals))
    for j, K in enumerate(self.Kvals):
        if self.verbose:
            print(f"Running with K={K} clusters")
        clusters = AgglomerativeClustering(
            n_clusters=K,
            affinity="precomputed",
            linkage="average",
            connectivity=self.connectivity,
        )
        clusters.fit_predict(self._Affinity)
        self.silhouette[j] = silhouette_score(X=self._distance_matr,
                                              labels=clusters.labels_,
                                              metric="precomputed")
    maxindex = self.silhouette.argmax()
    return pl.int_(self.Kvals[maxindex])
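# The three selectors above share one loop and differ only in the score being
# tracked and in whether it is maximized or minimized. A self-contained sketch
# on synthetic blobs, independent of the class internals assumed above:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs
from sklearn.metrics import (calinski_harabasz_score, davies_bouldin_score,
                             silhouette_score)

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
Kvals = list(range(2, 9))
sil, CH, DB = [], [], []
for K in Kvals:
    labels = AgglomerativeClustering(n_clusters=K,
                                     linkage="average").fit_predict(X)
    sil.append(silhouette_score(X, labels))
    CH.append(calinski_harabasz_score(X, labels))
    DB.append(davies_bouldin_score(X, labels))

# silhouette and Calinski-Harabasz are maximized, Davies-Bouldin is minimized
print(Kvals[np.argmax(sil)], Kvals[np.argmax(CH)], Kvals[np.argmin(DB)])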
print "array is:", array print "array[:-1] is:", array[:-1] #same as list print "array[2:] is:", array[2:] #same as list print "array[2:-1] is:", array[2:-1] #same as list print "array[2:7] is:", array[2:7] #same as list print "array[::2] is:", array[::2] #same as list print "array[1::2] is:", array[1::2] #same as list array[::2] = array[::2] * 10 print "array[::2] = array[::2]*10:", array #for pylab (numpy) array! array[::2] = 10 print "array[::2] = 10:", array #for pylab (numpy) array! array[::2] = pylab.array([13, 14, 15, 16, 17]) print "array[::2] = pylab.array([13,14,15,16,17]):", array #for pylab (numpy) array! print "*******************Search and replace with index array*******************" randomArr = pylab.random(20) * 20 randomArr = pylab.int_(randomArr) addrArr = pylab.arange(20) print "this is a random arr:", randomArr print "find all values > 10" indicies = randomArr > 10 print "indicies = randomArr > 10:", indicies #boolean array! print "randomArr[indicies] is:", randomArr[indicies] print "*******************" print "find all values < 10" indicies = randomArr < 10 print "indicies = randomArr < 10:", indicies #boolean array! print "randomArr[indicies] is:", randomArr[indicies] print "*******************" print "find all values < 10 in one step" print "randomArr[randomArr < 10] is:", randomArr[randomArr < 10] print "*******************"
print "array is:", array print "array[:-1] is:", array[:-1] #same as list print "array[2:] is:", array[2:] #same as list print "array[2:-1] is:", array[2:-1] #same as list print "array[2:7] is:", array[2:7] #same as list print "array[::2] is:", array[::2] #same as list print "array[1::2] is:", array[1::2] #same as list array[::2] = array[::2]*10 print "array[::2] = array[::2]*10:", array #for pylab (numpy) array! array[::2] = 10 print "array[::2] = 10:", array #for pylab (numpy) array! array[::2] = pylab.array([13,14,15,16,17]) print "array[::2] = pylab.array([13,14,15,16,17]):", array #for pylab (numpy) array! print "*******************Search and replace with index array*******************" randomArr = pylab.random(20)*20 randomArr = pylab.int_(randomArr) addrArr = pylab.arange(20) print "this is a random arr:", randomArr print "find all values > 10" indicies = randomArr > 10 print "indicies = randomArr > 10:", indicies #boolean array! print "randomArr[indicies] is:", randomArr[indicies] print "*******************" print "find all values < 10" indicies = randomArr < 10 print "indicies = randomArr < 10:", indicies #boolean array! print "randomArr[indicies] is:", randomArr[indicies] print "*******************" print "find all values < 10 in one step" print "randomArr[randomArr < 10] is:", randomArr[randomArr < 10] print "*******************"
def main(args):
    comm = MPI.COMM_WORLD
    workdir = os.getcwd()
    rank = comm.Get_rank()
    nprocs = comm.Get_size()
    nside = args.nside
    string = args.spectral_parameter
    sky = get_sky(nside, 's1d1')
    if string == 'Bs':
        param = sky.components[0].pl_index
    elif string == 'Bd':
        param = sky.components[1].mbb_index
    elif string == 'Td':
        param = sky.components[1].mbb_temperature.value
    sigmaparam = hp.read_map(args.parameter_uncertainties, verbose=False)
    sigmaparam = cu.check_nside(nsideout=nside, mapin=sigmaparam)
    # regularize pixels with vanishing uncertainties
    mask = np.ma.masked_less(sigmaparam, 1e-7).mask
    sigmaparam[mask] = param[mask] * .0005
    if rank == 0:
        hp.mollview(sigmaparam, norm='hist', sub=122)
        hp.mollview(param, norm='hist', sub=121)
        pl.show()
    if bool(args.KS_weight):
        if path.exists(
                f'{workdir}/affinities/KS_distance_{string}_{args.nside}.npz'):
            Q = np.load(
                f'{workdir}/affinities/KS_distance_{string}_{args.nside}.npz'
            )['affinity']
        else:
            Q = build_adjacency_from_KS_distance(nside=nside,
                                                 comm=comm,
                                                 X=param,
                                                 sigmaX=sigmaparam,
                                                 ntests=5,
                                                 nresample=100)
            if rank == 0:
                np.savez(
                    f'{workdir}/affinities/KS_distance_{string}_{args.nside}.npz',
                    affinity=Q)
    else:
        Q = None
    A = build_adjacency_from_heat_kernel(nside,
                                         comm,
                                         KS_weighted=bool(args.KS_weight),
                                         Q=Q,
                                         alpha=args.KS_weight)
    if rank == 0:
        pl.subplot(121)
        pl.title('Heat kernel matrix')
        pl.imshow(np.log(A))
        pl.subplot(122)
        pl.title('KS distance matrix')
        pl.imshow(np.log(Q))
        pl.show()
    L = estimate_Laplacian_matrix(A, kind='unnormalized')
    lmax = nside - 1
    Nmax = np.int_(from_ell_to_index(lmax)[1])
    if rank == 0:
        print(f"Estimating eigenvalues up to lmax={lmax}, "
              f"i.e. the first {Nmax} eigenvectors of the Laplacian")
    l, W = estimate_Ritz_eigenpairs(L, n_eig=Nmax)
    # drop the first (constant) eigenvector
    E = build_distance_matrix_from_eigenvectors(W[:, 1:], comm=comm)
    if rank == 0:
        np.savez(
            f'{workdir}/affinities/{string}_euclidean_distance_eigenvectors_{args.KS_weight:.2f}_{args.nside}.npz',
            distance=E,
            eigenvectors=W[:, 1:],
            eigenvalues=l[1:])
    clusters = AgglomerativeClustering(
        distance_threshold=args.distance_threshold,
        affinity='precomputed',
        linkage='average',
        compute_full_tree=True,
        n_clusters=None).fit(E)
    patches = pl.int_(clusters.labels_)
    if rank == 0:
        hp.mollview(patches, cmap=pl.cm.tab20)
        pl.show()
        fmap = f'{workdir}/clusterpatches/{string}_clusters_spectralclus_{args.KS_weight:.2f}_{args.nside}.fits'
        hp.write_map(fmap, patches, overwrite=True)
    comm.Barrier()  # synchronize ranks before exit (Disconnect is not valid on COMM_WORLD)
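# A hypothetical entry point for this script; the argument names mirror the
# attributes accessed in main() above, and the defaults are illustrative only:
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Spectral clustering of a parameter map (sketch)')
    parser.add_argument('--nside', type=int, default=32)
    parser.add_argument('--spectral-parameter', dest='spectral_parameter',
                        choices=['Bs', 'Bd', 'Td'], default='Bd')
    parser.add_argument('--parameter-uncertainties',
                        dest='parameter_uncertainties', required=True)
    parser.add_argument('--KS-weight', dest='KS_weight', type=float, default=0.)
    parser.add_argument('--distance-threshold', dest='distance_threshold',
                        type=float, default=1.)
    main(parser.parse_args())
    # run under MPI with e.g.: mpirun -np 4 python thisscript.py --nside 32 ...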
def estimate_Gap_statistics(self, nrefs):
    masknans = pl.ma.masked_not_equal(self._X[:, 0], 0).mask
    minvals = self._X[masknans, :].min(axis=0)
    maxvals = self._X[masknans, :].max(axis=0)
    meanvals = self._X[masknans, :].mean(axis=0)
    stdvals = self._X[masknans, :].std(axis=0)
    ref_Affinity = []
    Dref = []
    # Compute a random reference distribution of features;
    # precompute its distances and affinities.
    for i in range(nrefs):
        random_X = pl.ones_like(self._X)
        # random_X[:, 0] = np.random.uniform(low=minvals[0], high=maxvals[0], size=pl.int_(self._X.shape[0] / 10))
        random_X[:, 1] = np.random.uniform(
            low=pl.quantile(q=0.16, a=self._X[masknans, 1]),
            high=pl.quantile(q=0.84, a=self._X[masknans, 1]),  # 16th-84th percentile range of the observed feature
            size=pl.int_(self._X.shape[0]),
        )
        random_X[:, 0] = np.random.normal(loc=meanvals[0],
                                          scale=stdvals[0],
                                          size=pl.int_(self._X.shape[0]))
        ref_D = self._metric.pairwise(random_X)
        ref_D = pl.ma.fix_invalid(ref_D, fill_value=1.0).data
        Dref.append(ref_D)
        ref_Affinity.append(pairwise_kernels(ref_D, metric="precomputed"))
    self.Gaps = pl.zeros(len(self.Kvals))
    self.sd = self.Gaps * 0.0
    self.W = self.Gaps * 0.0  # WCSS per K (also used by the KL index)
    for j, K in enumerate(self.Kvals):
        if self.verbose:
            print(f"Running with K={K} clusters")
        self.clusters = AgglomerativeClustering(
            n_clusters=K,
            affinity="precomputed",
            linkage="average",
            connectivity=self.connectivity,
        )
        self.clusters.fit_predict(self._Affinity)
        # estimate WCSS for the samples
        W = self.get_WCSS(K, self.clusters.labels_, self._distance_matr)
        self.W[j] = W
        # estimate WCSS for the reference samples
        ref_W = pl.zeros(nrefs)
        for i in range(nrefs):
            ref_clusters = AgglomerativeClustering(
                n_clusters=K,
                affinity="precomputed",
                linkage="average",
                connectivity=self.connectivity,
            )
            ref_clusters.fit_predict(ref_Affinity[i])
            ref_W[i] = self.get_WCSS(K, ref_clusters.labels_, Dref[i])
        self.sd[j] = np.std(np.log(ref_W)) * np.sqrt(1 + 1.0 / nrefs)
        self.Gaps[j] = np.mean(np.log(ref_W)) - np.log(W)
    # see section 4 of Tibshirani et al., http://web.stanford.edu/~hastie/Papers/gap.pdf
    gaps_criterion = pl.array(
        [self.Kvals[:-1], self.Gaps[:-1] - self.Gaps[1:] + self.sd[1:]])
    mask = pl.array(gaps_criterion[1, :] >= 0)
    return pl.int_(gaps_criterion[0, mask][0])
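# The return statement implements the one-standard-error rule of Tibshirani
# et al., sec. 4: pick the smallest K with Gap(K) >= Gap(K+1) - sd(K+1).
# A standalone sketch with made-up Gap and sd values:
import numpy as np

def choose_K_by_gap(Kvals, Gaps, sd):
    criterion = Gaps[:-1] - Gaps[1:] + sd[1:]  # Gap(K) - Gap(K+1) + sd(K+1)
    return int(np.asarray(Kvals)[:-1][criterion >= 0][0])

Kvals = np.arange(2, 8)
Gaps = np.array([0.20, 0.50, 0.52, 0.53, 0.53, 0.54])  # flattens after K=3
sd = np.full_like(Gaps, 0.05)
print(choose_K_by_gap(Kvals, Gaps, sd))  # -> 3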
def main(args):
    angles = args.haversine_distance
    affinity = args.affinity
    string = args.spectral_parameter
    nside = args.nside
    sky = get_sky(nside, 's1d1')
    if string == 'Bs':
        param = sky.components[0].pl_index
    elif string == 'Bd':
        param = sky.components[1].mbb_index
    elif string == 'Td':
        param = sky.components[1].mbb_temperature.value
    sigmaparam = hp.read_map(args.parameter_uncertainties, verbose=False)
    Galmask = hp.read_map(args.galmask, verbose=False)
    Galmask = cu.check_nside(nsideout=nside, mapin=Galmask)
    sigmaparam = cu.check_nside(nsideout=nside, mapin=sigmaparam)
    Galmask = pl.ma.masked_not_equal(Galmask, 0).mask
    ones = pl.ma.masked_greater(Galmask, 0).mask
    fsky = Galmask[ones].shape[0] / Galmask.shape[0]
    if args.verbose:
        print(f'Clustering on fsky = {fsky:.2%}')
    # param[~Galmask] = 0
    # sigmaparam[~Galmask] = 0
    noisestring = ''
    if args.add_noise:
        noise_param = np.random.normal(loc=pl.zeros_like(param),
                                       scale=sigmaparam / 10.)
        param += noise_param
        param = hp.smoothing(param,
                             fwhm=pl.radians(args.parameter_resolution),
                             verbose=False)
        noisestring = '_noise'
    save_affinity = True
    anglestring = ''
    if angles:
        anglestring = '_haversine'
    fmap = ('/Users/peppe/work/adaptive_compsep/clusterpatches/' +
            f'clusters_{affinity}{anglestring}_galmask_{string}_{nside}{noisestring}_{args.optimization}.fits')
    file_affinity = ('/Users/peppe/work/adaptive_compsep/affinities/' +
                     f'{affinity}{anglestring}_galmask_{string}_{nside}{noisestring}.npy')
    Cluster = ClusterData([param, sigmaparam],
                          nfeatures=2,
                          nside=nside,
                          affinity=affinity,
                          file_affinity=file_affinity,
                          include_haversine=angles,
                          verbose=args.verbose,
                          save_affinity=save_affinity,
                          scaler=None,
                          feature_weights=[1, 1],
                          galactic_mask=Galmask)
    Kmin = 2
    Kmax = 200
    Cluster(nvals=args.num_cluster_evaluation,
            Kmax=Kmax - 1,
            Kmin=Kmin,
            minimize=args.optimization,
            parameter_string=string)
    if args.optimization == 'partition':
        label1 = 'Under partition'
        label2 = 'Over partition'
        ylabel = 'Partition measure'
    elif args.optimization == 'residuals':
        label1 = 'Syst. residuals'
        label2 = 'Stat. residuals'
        ylabel = r'Residuals [$\mu K^2$]'
    pl.title(string)
    pl.plot(Cluster.Kvals, Cluster.Vu, '.', label=label1)
    pl.plot(Cluster.Kvals, Cluster.Vo, '.', label=label2)
    pl.plot(Cluster.Kvals,
            pl.sqrt(Cluster.Vo**2 + Cluster.Vu**2),
            '-',
            label='Root squared sum')
    pl.legend()
    pl.xlabel('K', fontsize=15)
    pl.ylabel(ylabel, fontsize=15)
    pl.show()
    patches = pl.zeros_like(param, dtype=pl.int_)
    patches[Galmask] = pl.int_(Cluster.clusters.labels_) + 1
    hp.mollview(patches, cmap=pl.cm.tab20)
    pl.show()
    hp.write_map(fmap, patches, overwrite=True)
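# The written FITS map can be inspected afterwards with the standard healpy
# calls; a short sketch, reusing the fmap path constructed in main() above
# (patch 0 is the masked Galactic region, patches >= 1 are clusters):
import healpy as hp
import pylab as pl

patches = hp.read_map(fmap)
print(f'{pl.int_(patches.max())} clusters outside the Galactic mask')
hp.mollview(patches, cmap=pl.cm.tab20, title='Cluster patches')
pl.show()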