Example #1
    def maximize_Krzanowski_Lai_index(self):
        # Krzanowski-Lai (KL) index
        self.W = pl.zeros(len(self.Kvals))

        p = self._nfeat
        for j, K in enumerate(self.Kvals):
            if self.verbose:
                print(f"Running with K={K} clusters")

            self.clusters = AgglomerativeClustering(
                n_clusters=K,
                affinity="precomputed",
                linkage="average",
                connectivity=self.connectivity,
            )
            self.clusters.fit_predict(self._Affinity)
            # estimate WCSS for the samples
            self.W[j] = self.get_WCSS(K, self.clusters.labels_,
                                      self._distance_matr)
        # see eq. 3.1 of Krzanowski and Lai 1988 Biometrics
        DIFF = pl.array([
            self.Kvals[1:],
            (self.W[:-1] * self.Kvals[:-1]**(2 / p)) -
            (self.W[1:] * self.Kvals[1:]**(2 / p)),
        ])

        # for k=1, KL index is undefined

        self.KL = pl.array(
            [self.Kvals[1:-1],
             pl.fabs(DIFF[1, :-1] / DIFF[1, 1:])])  # see eq. 3.2
        maxindex = self.KL[1, :].argmax()

        return pl.int_(self.KL[0, maxindex])
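For reference, the two quantities computed above are eqs. 3.1 and 3.2 of Krzanowski & Lai (1988), with p the number of features and W_K the within-cluster sum of squares for K clusters:

    DIFF(K) = (K-1)^(2/p) * W_(K-1) - K^(2/p) * W_K
    KL(K)   = | DIFF(K) / DIFF(K+1) |

The method returns the K that maximizes KL(K); since KL is undefined at K=1, the first entry of Kvals is dropped.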
Example #2
def meanvariable(month, variable):  # month can be either 'Label' or 'Month'; variable can be any of the 3 variables
    listofmonth = list(df.apply(set)[month])  # get all the unique months (this will be either 12 or 227)
    averages = []  # empty list for the averages to go into
    for i in listofmonth:
        dd = df[df[month] == i]  # subset containing only the data for this individual month
        averages.append(np.mean(dd[variable]))  # plots look the same using median or mean; no real outliers anyway
    x = pl.int_(listofmonth)
    plt.scatter(x, averages)  #plot the mean against the month
    plt.xlabel('Months beginning July 2011')
    plt.ylabel(variable)
    plt.xlim(0,22)
    plt.show()
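A hedged usage sketch: meanvariable relies on module-level globals (a pandas DataFrame df, plus np, pl and plt); the DataFrame below is an illustrative assumption, not the original data.

import numpy as np
import pandas as pd
import pylab as pl
import matplotlib.pyplot as plt

df = pd.DataFrame({'Month': [1, 1, 2, 2, 3],
                   'Rainfall': [10.0, 12.0, 8.0, 9.0, 11.0]})
meanvariable('Month', 'Rainfall')  # scatter of mean Rainfall per Month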
Example #3
    def minimize_davies_bouldin_score(self):
        self.DB = pl.zeros(len(self.Kvals))
        for j, K in enumerate(self.Kvals):
            if self.verbose:
                print(f"Running with K={K} clusters")
            clusters = AgglomerativeClustering(
                n_clusters=K,
                affinity="precomputed",
                linkage="average",
                connectivity=self.connectivity,
            )
            clusters.fit_predict(self._Affinity)
            self.DB[j] = davies_bouldin_score(X=self._X,
                                              labels=clusters.labels_)
        return pl.int_(self.Kvals[self.DB.argmin()])
Example #4
    def maximize_calinski_harabasz_score(self):
        self.CH = pl.zeros(len(self.Kvals))
        for j, K in enumerate(self.Kvals):
            if self.verbose:
                print(f"Running with K={K} clusters")
            clusters = AgglomerativeClustering(
                n_clusters=K,
                affinity="precomputed",
                linkage="average",
                connectivity=self.connectivity,
            )
            clusters.fit_predict(self._Affinity)
            self.CH[j] = calinski_harabasz_score(X=self._X,
                                                 labels=clusters.labels_)
        return pl.int_(self.Kvals[self.CH.argmax()])
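A minimal, self-contained sketch of the selection loops in Examples #3 and #4, on synthetic data (the blob data and plain Euclidean clustering here are assumptions; the class above clusters a precomputed affinity instead):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.datasets import make_blobs
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score

X, _ = make_blobs(n_samples=300, centers=4, random_state=0)
Kvals = np.arange(2, 10)
DB, CH = [], []
for K in Kvals:
    labels = AgglomerativeClustering(n_clusters=int(K)).fit(X).labels_
    DB.append(davies_bouldin_score(X, labels))      # lower is better
    CH.append(calinski_harabasz_score(X, labels))   # higher is better
print("K by Davies-Bouldin:", Kvals[np.argmin(DB)])
print("K by Calinski-Harabasz:", Kvals[np.argmax(CH)])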
Example #5
    def maximize_silhouette_score(self):
        self.silhouette = pl.zeros(len(self.Kvals))
        for j, K in enumerate(self.Kvals):
            if self.verbose:
                print(f"Running with K={K} clusters")
            clusters = AgglomerativeClustering(
                n_clusters=K,
                affinity="precomputed",
                linkage="average",
                connectivity=self.connectivity,
            )
            clusters.fit_predict(self._Affinity)
            self.silhouette[j] = silhouette_score(X=self._distance_matr,
                                                  labels=clusters.labels_,
                                                  metric="precomputed")
        maxindex = self.silhouette.argmax()
        return pl.int_(self.Kvals[maxindex])
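The silhouette variant scores directly on the precomputed distance matrix; a sketch with assumed random data (note that recent scikit-learn releases rename the affinity argument of AgglomerativeClustering to metric):

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score, pairwise_distances

X = np.random.default_rng(0).normal(size=(200, 3))
D = pairwise_distances(X)  # precomputed pairwise distances
Kvals = range(2, 8)
scores = []
for K in Kvals:
    labels = AgglomerativeClustering(n_clusters=K, affinity="precomputed",
                                     linkage="average").fit(D).labels_
    scores.append(silhouette_score(D, labels, metric="precomputed"))
print("best K:", list(Kvals)[int(np.argmax(scores))])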
Example #7
print "array is:", array
print "array[:-1] is:", array[:-1]  #same as list
print "array[2:] is:", array[2:]  #same as list
print "array[2:-1] is:", array[2:-1]  #same as list
print "array[2:7] is:", array[2:7]  #same as list
print "array[::2] is:", array[::2]  #same as list
print "array[1::2] is:", array[1::2]  #same as list
array[::2] = array[::2] * 10
print "array[::2] = array[::2]*10:", array  #for pylab (numpy) array!
array[::2] = 10
print "array[::2] = 10:", array  #for pylab (numpy) array!
array[::2] = pylab.array([13, 14, 15, 16, 17])
print "array[::2] = pylab.array([13,14,15,16,17]):", array  #for pylab (numpy) array!
print "*******************Search and replace with index array*******************"
randomArr = pylab.random(20) * 20
randomArr = pylab.int_(randomArr)
addrArr = pylab.arange(20)
print "this is a random arr:", randomArr
print "find all values > 10"
indices = randomArr > 10
print "indices = randomArr > 10:", indices  #boolean array!
print "randomArr[indices] is:", randomArr[indices]
print "*******************"
print "find all values < 10"
indices = randomArr < 10
print "indices = randomArr < 10:", indices  #boolean array!
print "randomArr[indices] is:", randomArr[indices]
print "*******************"
print "find all values < 10 in one step"
print "randomArr[randomArr < 10] is:", randomArr[randomArr < 10]
print "*******************"
print "array is:", array
print "array[:-1] is:", array[:-1] #same as list
print "array[2:] is:", array[2:] #same as list
print "array[2:-1] is:", array[2:-1] #same as list
print "array[2:7] is:", array[2:7] #same as list
print "array[::2] is:", array[::2] #same as list
print "array[1::2] is:", array[1::2] #same as list
array[::2] = array[::2]*10
print "array[::2] = array[::2]*10:", array #for pylab (numpy) array!
array[::2] = 10
print "array[::2] = 10:", array #for pylab (numpy) array!
array[::2] = pylab.array([13,14,15,16,17])
print "array[::2] = pylab.array([13,14,15,16,17]):", array #for pylab (numpy) array!
print "*******************Search and replace with index array*******************"
randomArr = pylab.random(20)*20
randomArr = pylab.int_(randomArr)
addrArr = pylab.arange(20)
print "this is a random arr:", randomArr
print "find all values > 10"
indicies = randomArr > 10
print "indicies = randomArr > 10:", indicies #boolean array!
print "randomArr[indicies] is:", randomArr[indicies]
print "*******************"
print "find all values < 10"
indicies = randomArr < 10
print "indicies = randomArr < 10:", indicies #boolean array!
print "randomArr[indicies] is:", randomArr[indicies]
print "*******************"
print "find all values < 10 in one step"
print "randomArr[randomArr < 10] is:", randomArr[randomArr < 10]
print "*******************"
Example #9
def main(args):
    comm = MPI.COMM_WORLD
    workdir = os.getcwd()
    rank = comm.Get_rank()
    nprocs = comm.Get_size()
    nside = args.nside

    string = args.spectral_parameter
    sky = get_sky(nside, 's1d1')

    if string == 'Bs':
        param = sky.components[0].pl_index
    elif string == 'Bd':
        param = sky.components[1].mbb_index
    elif string == 'Td':
        param = sky.components[1].mbb_temperature.value

    sigmaparam = hp.read_map(args.parameter_uncertainties, verbose=False)
    sigmaparam = cu.check_nside(nsideout=nside, mapin=sigmaparam)
    mask = np.ma.masked_less(sigmaparam, 1e-7).mask
    sigmaparam[mask] = param[mask] * .0005
    if rank == 0:
        hp.mollview(sigmaparam, norm='hist', sub=122)
        hp.mollview(param, norm='hist', sub=121)
        pl.show()

    if bool(args.KS_weight):
        if path.exists(
                f'{workdir}/affinities/KS_distance_{string}_{args.nside}.npz'):
            Q = np.load(
                f'{workdir}/affinities/KS_distance_{string}_{args.nside}.npz'
            )['affinity']
        else:
            Q = build_adjacency_from_KS_distance(nside=nside,
                                                 comm=comm,
                                                 X=param,
                                                 sigmaX=sigmaparam,
                                                 ntests=5,
                                                 nresample=100)
            if rank == 0:
                np.savez(
                    f'{workdir}/affinities/KS_distance_{string}_{args.nside}.npz',
                    affinity=Q)

    else:
        Q = None

    A = build_adjacency_from_heat_kernel(nside,
                                         comm,
                                         KS_weighted=bool(args.KS_weight),
                                         Q=Q,
                                         alpha=args.KS_weight)
    if rank == 0:
        pl.subplot(121)
        pl.title('Heat Kernel matrix')
        pl.imshow(np.log(A))
        if Q is not None:  # Q is only built when the KS weighting is enabled
            pl.subplot(122)
            pl.title('KS distance matrix')
            pl.imshow(np.log(Q))
        pl.show()
    L = estimate_Laplacian_matrix(A, kind='unnormalized')
    lmax = nside - 1
    Nmax = np.int_(from_ell_to_index(lmax)[1])
    if rank == 0:
        print(f"Estimating eigenvalues up to lmax={lmax}, "
              f"i.e. the first {Nmax} eigenvectors of the Laplacian")
    l, W = estimate_Ritz_eigenpairs(L, n_eig=Nmax)

    E = build_distance_matrix_from_eigenvectors(W[:, 1:], comm=comm)
    if rank == 0:
        np.savez(
            f'{workdir}/affinities/{string}_euclidean_distance_eigenvectors_{args.KS_weight:.2f}_{args.nside}.npz',
            distance=E,
            eigenvectors=W[:, 1:],
            eigenvalues=l[1:])

    clusters = AgglomerativeClustering(
        distance_threshold=args.distance_threshold,
        affinity='precomputed',
        linkage='average',
        compute_full_tree=True,
        n_clusters=None).fit(E)

    patches = pl.int_(clusters.labels_)
    if rank == 0:
        hp.mollview(patches, cmap=pl.cm.tab20)
        pl.show()
        fmap = f'{workdir}/clusterpatches/{string}_clusters_spectralclus_{args.KS_weight:.2f}_{args.nside}.fits'
        hp.write_map(fmap, patches, overwrite=True)

    # MPI.COMM_WORLD needs no explicit Disconnect; it is finalized automatically at exit
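The Laplacian and eigenvector steps above go through project helpers (estimate_Laplacian_matrix, estimate_Ritz_eigenpairs, build_distance_matrix_from_eigenvectors). A minimal single-process sketch of the same spectral pipeline with numpy and scikit-learn, on an assumed toy adjacency:

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances

rng = np.random.default_rng(0)
A = rng.random((100, 100))
A = 0.5 * (A + A.T)                    # symmetric toy adjacency matrix
L = np.diag(A.sum(axis=1)) - A         # unnormalized graph Laplacian
eigvals, eigvecs = np.linalg.eigh(L)   # eigenvalues in ascending order
W = eigvecs[:, 1:10]                   # drop the constant mode, keep the next few
E = pairwise_distances(W)              # Euclidean distances between embedded points
labels = AgglomerativeClustering(distance_threshold=1.0, n_clusters=None,
                                 affinity='precomputed', linkage='average',
                                 compute_full_tree=True).fit(E).labels_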
Example #10
    def estimate_Gap_statistics(self, nrefs):
        masknans = pl.ma.masked_not_equal(self._X[:, 0], 0).mask
        minvals = self._X[masknans, :].min(axis=0)
        maxvals = self._X[masknans, :].max(axis=0)
        meanvals = self._X[masknans, :].mean(axis=0)
        stdvals = self._X[masknans, :].std(axis=0)
        ref_Affinity = []
        Dref = []

        # Compute a random uniform reference distribution of features
        # precompute Distances and affinities.
        for i in range(nrefs):

            random_X = pl.ones_like(self._X)
            # random_X [:,0 ] =np.random.uniform (low = minvals[0] , high=maxvals[0], size=pl.int_( self._X.shape[0]/10 ) )
            random_X[:, 1] = np.random.uniform(
                low=pl.quantile(q=0.16, a=self._X[masknans, 1]),
                high=pl.quantile(q=0.84, a=self._X[masknans, 1]),
                size=pl.int_(self._X.shape[0]),
            )  # draw uniformly between the 16th and 84th percentiles
            random_X[:, 0] = np.random.normal(loc=meanvals[0],
                                              scale=stdvals[0],
                                              size=pl.int_(self._X.shape[0]))
            ref_D = self._metric.pairwise(random_X)
            ref_D = pl.ma.fix_invalid(ref_D, fill_value=1.0).data

            Dref.append(ref_D)

            ref_Affinity.append(pairwise_kernels(ref_D, metric="precomputed"))

        self.Gaps = pl.zeros(len(self.Kvals))
        self.sd = self.Gaps * 0.0
        self.W = self.Gaps * 0.0  # within-cluster sum of squares, reused for the KL index
        p = self._nfeat
        for j, K in enumerate(self.Kvals):
            if self.verbose:
                print(f"Running with K={K} clusters")
            self.clusters = AgglomerativeClustering(
                n_clusters=K,
                affinity="precomputed",
                linkage="average",
                connectivity=self.connectivity,
            )
            self.clusters.fit_predict(self._Affinity)
            # estimate WCSS for the samples
            W = self.get_WCSS(K, self.clusters.labels_, self._distance_matr)
            self.W[j] = W
            # estimate WCSS for random samples
            ref_W = pl.zeros(nrefs)

            for i in range(nrefs):
                ref_clusters = AgglomerativeClustering(
                    n_clusters=K,
                    affinity="precomputed",
                    linkage="average",
                    connectivity=self.connectivity,
                )
                ref_clusters.fit_predict(ref_Affinity[i])
                ref_W[i] = self.get_WCSS(K, ref_clusters.labels_, Dref[i])

            self.sd[j] = np.std(np.log(ref_W)) * np.sqrt(1 + 1.0 / nrefs)
            self.Gaps[j] = np.mean(np.log(ref_W)) - np.log(W)

        ## see section 4 of Tibshirani et al., http://web.stanford.edu/~hastie/Papers/gap.pdf

        gaps_criterion = pl.array(
            [self.Kvals[:-1], self.Gaps[:-1] - self.Gaps[1:] + self.sd[1:]])
        mask = pl.array(gaps_criterion[1, :] >= 0)
        return pl.int_(gaps_criterion[0, mask][0])
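For reference, the quantities implemented above, from section 4 of the linked Tibshirani et al. paper: with B reference datasets,

    Gap(K) = (1/B) * sum_b log(W*_Kb) - log(W_K)
    s_K    = sd_K * sqrt(1 + 1/B)

and the chosen K is the smallest one with Gap(K) >= Gap(K+1) - s_(K+1), i.e. the first K for which the gaps_criterion row above is non-negative.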
Example #11
def main(args):

    angles = args.haversine_distance

    affinity = args.affinity

    string = args.spectral_parameter
    nside = args.nside
    sky = get_sky(nside, 's1d1')
    if string == 'Bs':
        param = sky.components[0].pl_index
    elif string == 'Bd':
        param = sky.components[1].mbb_index
    elif string == 'Td':
        param = sky.components[1].mbb_temperature.value

    sigmaparam = hp.read_map(args.parameter_uncertainties, verbose=False)
    Galmask = hp.read_map(args.galmask, verbose=False)
    Galmask = cu.check_nside(nsideout=nside, mapin=Galmask)
    sigmaparam = cu.check_nside(nsideout=nside, mapin=sigmaparam)

    Galmask = pl.ma.masked_not_equal(Galmask, 0).mask

    ones = pl.ma.masked_greater(Galmask, 0).mask
    fsky = (Galmask[ones].shape[0] / Galmask.shape[0])
    if args.verbose: print(f'Clustering on fsky = {fsky:.2%}')

    #param[~Galmask]=0
    #sigmaparam [~Galmask ] =0
    noisestring = ''
    if args.add_noise:
        noise_param = np.random.normal(loc=pl.zeros_like(param),
                                       scale=sigmaparam / 10.)
        param += noise_param
        param = hp.smoothing(param,
                             fwhm=pl.radians(args.parameter_resolution),
                             verbose=False)

        noisestring = '_noise'

    save_affinity = True

    anglestring = ''
    if angles:
        anglestring = '_haversine'

    fmap = (
        '/Users/peppe/work/adaptive_compsep/clusterpatches/' +
        f'clusters_{affinity}{anglestring}_galmask_{string}_{nside}{noisestring}_{args.optimization}.fits'
    )
    file_affinity = (
        f'/Users/peppe/work/adaptive_compsep/affinities/' +
        f'{affinity}{anglestring}_galmask_{string}_{nside}{noisestring}.npy')

    Cluster = ClusterData([param, sigmaparam],
                          nfeatures=2,
                          nside=nside,
                          affinity=affinity,
                          file_affinity=file_affinity,
                          include_haversine=angles,
                          verbose=args.verbose,
                          save_affinity=save_affinity,
                          scaler=None,
                          feature_weights=[1, 1],
                          galactic_mask=Galmask)

    Kmin = 2
    Kmax = 200
    Cluster(nvals=args.num_cluster_evaluation,
            Kmax=Kmax - 1,
            Kmin=Kmin,
            minimize=args.optimization,
            parameter_string=string)

    if args.optimization == 'partition':
        label1 = r'Under partition  '
        label2 = r'Over partition '
        ylabel = 'Partition measure '

    elif args.optimization == 'residuals':
        label1 = r'Syst. residuals'
        label2 = r'Stat. residuals'
        ylabel = r'Residuals [ $\mu K^2$ ]  '

    pl.title(string)
    pl.plot(Cluster.Kvals, Cluster.Vu, '.', label=label1)
    pl.plot(Cluster.Kvals, Cluster.Vo, '.', label=label2)
    pl.plot(Cluster.Kvals,
            pl.sqrt(Cluster.Vo**2 + Cluster.Vu**2),
            '-',
            label=r'Root Squared Sum ')
    pl.legend()
    pl.xlabel('K', fontsize=15)

    pl.ylabel(ylabel, fontsize=15)
    pl.show()

    patches = pl.zeros_like(param, dtype=pl.int_)

    patches[Galmask] = pl.int_(Cluster.clusters.labels_) + 1

    hp.mollview(patches, cmap=pl.cm.tab20)
    pl.show()

    hp.write_map(fmap, patches, overwrite=True)
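The +1 offset above keeps label 0 free for pixels outside the Galactic mask; a toy sketch of the same masked assignment (all sizes assumed):

import numpy as np

param = np.zeros(8)
Galmask = np.array([0, 1, 1, 1, 0, 1, 1, 0], dtype=bool)
labels = np.array([0, 1, 1, 2, 0])            # one cluster label per unmasked pixel
patches = np.zeros_like(param, dtype=np.int_)
patches[Galmask] = labels + 1                 # pixels outside the mask stay 0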