Example #1
    def run(self, data_y, q_max=10, show=False, tol=1e-4):
        """
        Reduce the dimensionality of the data. Each iteration runs over all
        points; the new representation is stored in the attribute 'data_x'.

        Parameters
        ----------
        data_y : numpy.array
            Array with the original data.
        q_max : int (default = 10)
            Maximum number of iterations. Each iteration runs over all points
            in 'data_y'.
        show : boolean (default = False)
            If True, plots the stress curve over the iterations.
        tol : float (default = 1e-4)
            Tolerance for the stopping criterion.
        
        Returns
        -------
        data_x : numpy.array
            New data representation.
        """
        self.data_y = data_y
        n = len(data_y)
        triu = np.triu_indices(n, 1)
        dist_y = squareform(pdist(data_y))  # full n x n matrix (pdist alone returns the condensed form)
        data_x = PCA(self.p).fit_transform(data_y)
        stress = np.zeros(q_max)
        print("Progress: 0.0%", end='\r')
        for q in range(q_max):
            alpha = max(0.001, self.alpha / (1 + q))
            lmbda = max(0.1, self.lmbd / (1 + q))
            for i in range(n):
                dist_x = cdist(data_x[i].reshape(1, -1), data_x)
                dy = np.delete(dist_y[i], i, 0)
                dx = np.delete(dist_x, i, 1)
                delta_x = (alpha * (lmbda > dx) * (dy - dx) / dx).reshape(
                    (-1, 1)) * (data_x[i] - np.delete(data_x, i, 0))
                delta_x = np.insert(delta_x, i, 0, axis=0)
                data_x -= delta_x
            dist_x = squareform(pdist(data_x))  # square form so dist_x[triu] matches dist_y[triu]
            stress[q] = self._stress(dist_y[triu], dist_x[triu], lmbda)
            if stress[q] < tol:
                print("Progress: 100.00%")
                print(f"Tol achieved in iteration {q}")
                break
            print(f"Progress: {round((q+1)*100/q_max,2)}%  ", end='\r')
        if show:
            plt.plot(np.arange(q_max), stress, marker='.', c='black')
            plt.xlabel("Iteration")
            plt.ylabel("Stress")
            plt.show()
        print()
        self.data_x = data_x
        return data_x
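
A minimal sketch (separate from the class above) of the relation the squareform fix relies on: SciPy's pdist returns a condensed vector of n*(n-1)/2 pairwise distances, while the per-point updates in run() index rows of a full n x n matrix, which squareform provides. The array sizes below are illustrative.

import numpy as np
from scipy.spatial.distance import pdist, squareform

X = np.random.rand(5, 3)            # 5 toy points in 3-D
condensed = pdist(X)                # shape (10,): the 5*4/2 pairwise distances
square = squareform(condensed)      # shape (5, 5): symmetric matrix with zero diagonal
triu = np.triu_indices(5, 1)
assert np.allclose(square[triu], condensed)  # the upper triangle is exactly the condensed vector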
Example #2
def estimate_med_dist(features, num_slices=5000, percentile=50):
    """Return the given percentile (default: the median) of the pairwise Euclidean
    distances over at most the first num_slices rows of `features`, ignoring
    near-zero distances and flooring the result at 0.05."""
    pdists = pdist(features.values[:num_slices, :],
                   metric='euclidean').reshape(-1, 1)  # to handle sparse data
    med_dist = np.percentile(pdists[pdists > np.finfo(float).eps * 10],
                             percentile)
    med_dist = np.max((0.05, med_dist))
    return med_dist
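
A hypothetical usage sketch (made-up data; estimate_med_dist only needs an object whose .values attribute is an (n_samples, n_features) array, such as a pandas DataFrame):

import numpy as np
import pandas as pd

features = pd.DataFrame(np.random.randn(1000, 16))
sigma = estimate_med_dist(features, num_slices=500, percentile=50)
print(sigma)  # roughly the median pairwise Euclidean distance, never below 0.05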
Example #3
def get_IBM_from_pairwise_dist(teX_mag, trX_mag, IBM, K, metric, P=None):
    teX_feed = teX_mag
    trX_feed = trX_mag
    if P is not None and metric == 'hamming':
        teX_feed, _ = WTA(teX_mag, P)
        trX_feed, _ = WTA(trX_mag, P)
    D = cdist(teX_feed.T, trX_feed.T, metric)  # cross-set distances need cdist; scipy's pdist takes a single set
    F, T = teX_mag.shape
    return get_IBM_med_mean(D, IBM, K, T)
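
A minimal sketch of the cross-set distance call used above; WTA and the magnitude spectrograms come from the original repository, so random binary codes stand in for them here.

import numpy as np
from scipy.spatial.distance import cdist

teX = (np.random.rand(64, 100) > 0.5).astype(int)  # F x T_test codes
trX = (np.random.rand(64, 500) > 0.5).astype(int)  # F x T_train codes
D = cdist(teX.T, trX.T, 'hamming')                 # shape (100, 500): fraction of differing entries per frame pair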
Example #4
def get_sim_matrix(trX_mag, metric, errmetric, P=None):
    trX_feed = trX_mag
    if metric == 'hamming' and P is not None:
        trX_Pidx, _ = WTA(trX_mag, P)
        trX_feed = trX_Pidx
    sim = pdist(trX_feed.T, metric=metric)
    if errmetric == 'xent':
        return 1 - sim
    return sim
Example #5
def cluster_2point0(df):
    '''
    Compute the median of each cluster and merge two clusters whenever the
    distance between their medians is less than `thr` km.
    PARAMETERS: df -> dataframe holding all points, with columns X, Y, Z, dbz
                eps, min_pts are the DBSCAN parameters for the X, Y, Z values
                eps2, min_pts2 are the parameters for the dbz level of clustering
                thr is the threshold below which cloud clusters are merged
                (eps, min_pts, eps2, min_pts2 and thr are taken from the enclosing scope)
    '''
    df = plot_dbscan(df, eps, min_pts, eps2=eps2, min_pts2=min_pts2)  # adds the columns 'label level 0' and 'label level 1'
    # ignore the noise points of label level 1 and take the median of each cluster
    df_median = df[~df['label level 1'].str.contains('-1_')].groupby('label level 1').median()[['X', 'Y', 'Z']]
    # square distance matrix between medians (needs scipy.spatial.distance.squareform);
    # the diagonal is set to inf so a cluster is never compared with itself
    pdistance = squareform(pdist(df_median))
    np.fill_diagonal(pdistance, np.inf)
    while np.amin(pdistance) <= thr:  # entry condition: some pair of medians closer than thr
        df_median = df.groupby('label level 1').median()[['X', 'Y', 'Z']]  # recompute the medians per label
        pdistance = squareform(pdist(df_median))
        np.fill_diagonal(pdistance, np.inf)
        # the index of the minimum distance gives the two 'label level 1' labels to merge
        idx = np.argwhere(pdistance == np.amin(pdistance))[0]
        # replace the second cluster's label with the first cluster's label
        df['label level 1'].replace({df_median.index[idx[1]]: df_median.index[idx[0]]}, inplace=True)
    return df  # COLUMNS: X, Y, Z, dbz, label level 0, label level 1
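
A standalone sketch (toy data, independent of the dataframe above) of the closest-pair machinery used by the merge loop:

import numpy as np
from scipy.spatial.distance import pdist, squareform

medians = np.random.rand(6, 3)          # 6 cluster medians with X, Y, Z coordinates
d = squareform(pdist(medians))          # square distance matrix between medians
np.fill_diagonal(d, np.inf)             # a cluster is never compared with itself
i, j = np.argwhere(d == np.amin(d))[0]  # row indices of the closest pair of medians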
Example #6
def getdist(f):
    fout = f.split("/")[-1].replace(".txt", ".meg")
    mat = pd.read_table(f, index_col=0)
    # scipy's pdist has no n_jobs option and returns a condensed vector, so wrap it
    # in squareform to get the square matrix expected below
    dm = squareform(pdist(mat.T, metric='euclidean'))
    dm = pd.DataFrame(dm, index=mat.columns, columns=mat.columns)
    dm = dm.mask(np.triu(np.ones(dm.shape)).astype(bool))  # keep only the lower triangle (np.bool is removed in modern NumPy)
    dm.to_csv(fout, sep="\t", index=False, header=False)
    s = open(fout).read()
    cs = ["#%s" % c for c in mat.columns]
    cs = "\n".join(cs)
    ss = "#mega\n!TITLE  Genetic distance data;\n!Format DataType=distance;\n!Description\n    CYQ try;\n"
    ss = ss + cs + "\n" + s  # newline so the first data row does not run into the last column name
    with open(fout, "w") as f2:
        f2.write(ss)
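
A short sketch (a random matrix instead of the expression table read above) of the lower-triangle masking that the MEGA distance format expects:

import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

mat = pd.DataFrame(np.random.rand(10, 4), columns=list("ABCD"))  # rows: features, columns: samples
dm = pd.DataFrame(squareform(pdist(mat.T)), index=mat.columns, columns=mat.columns)
lower = dm.mask(np.triu(np.ones(dm.shape, dtype=bool)))          # NaN on and above the diagonal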
Example #7
    def calc_loss(self, gap, feature):
        loss = torch.zeros(1)
        if self.flag_calc_loss:
            # calculate the median distance between all pairs of points
            med_dist = np.median(
                pdist(gap.detach().cpu().numpy(), metric='euclidean'))
            # current kernel bandwidth: moving average of the previous sigma and the median distance
            sigma_gap = np.maximum(
                self.decay_factor * self.sigma_gap +
                (1 - self.decay_factor) * med_dist, 0.005)
            # penalize only the latent representation, not the external features
            gap = gap[:, :self.activation_size]
            hsic_features = HSIC(gap,
                                 feature,
                                 kernelX='Gaussian',
                                 kernelY='Gaussian',
                                 sigmaX=sigma_gap,
                                 sigmaY=self.external_feature_std,
                                 device=self.device)
            loss = self.lambda_hsic * hsic_features
        return loss
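
A sketch of the median-heuristic bandwidth update performed above, pulled out of the class; HSIC itself is from the original repository and is not reproduced, and the parameter names decay and floor are illustrative.

import numpy as np
from scipy.spatial.distance import pdist

def update_sigma(sigma_prev, batch, decay=0.9, floor=0.005):
    # blend the previous bandwidth with the median pairwise distance of the current batch
    med = np.median(pdist(batch, metric='euclidean'))
    return np.maximum(decay * sigma_prev + (1 - decay) * med, floor)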
Example #8
    def gen_inits_WH(self, init='random', seed=1, H_ortho=True):
        ''' Initialize the factors W, H for nonnegative matrix factorization.
        There are several options:
            1. random ------  generate W, H randomly
            2. kmeans ------  generate H from the cluster assignments obtained by KMeans,
                              then W = data_mat * H (since H is orthogonal)
            3. nmf    ------  run sklearn's NMF on the data matrix first to get W, H for initialization
            4. kmeans++ ----  use the kmeans++ heuristic to get a cluster assignment,
                              which is used to build H, and W = data_mat * H

        Args:
            init (string): the name of the method used to generate the initializations
            seed (int): the seed for the random generator
            H_ortho (bool): whether H should be made orthogonal (H * H^T = I)
        Returns:
            numpy matrices W and H
        '''
        ortho = 'ortho' if H_ortho else ''
        data_name = self.data_kind + str(self.data_num)

        initW_path = os.path.join(self.root_dir, 'inits', data_name,
                                  'W' + str(seed) + '.csv')
        initH_path = os.path.join(self.root_dir, 'inits', data_name,
                                  'H' + '_' + ortho + str(seed) + '.csv')
        if os.path.exists(initW_path) and os.path.exists(initH_path):
            if seed < 100:
                W_init = self.read_data_from_csvfile(initW_path)
            H_init = self.read_data_from_csvfile(initH_path)
        else:
            m, n = self.data_mat.shape  # size of the data matrix to be decomposed

            np.random.seed(seed)
            if init == 'random':
                abs_mat = np.absolute(self.data_mat)
                #print np.any(abs_mat < 0)
                avg = np.sqrt(abs_mat.mean() / self.num_of_cls)
                print('mean: ' + str(abs_mat.mean()))
                print('rank: ' + str(self.num_of_cls))
                print('avg: ' + str(avg))
                W_init = np.asmatrix(avg * np.random.random(
                    (m, self.num_of_cls)))
                H_init = np.asmatrix(avg * np.random.random(
                    (n, self.num_of_cls)))
            elif init == 'kmeans':
                km = sklearn_KMeans(n_clusters=self.num_of_cls).fit(
                    self.data_mat.transpose())
                clusters = km.predict(self.data_mat.transpose())
                H_init = np.asmatrix(np.zeros((n, self.num_of_cls)))
                for i in range(len(clusters)):
                    H_init[i, clusters[i]] = 1
                H_init = H_init * np.diag(
                    np.diag(H_init.transpose() * H_init)**(-0.5))
                W_init = self.data_mat * H_init
            elif init == 'nmf':
                model = sklearn_NMF(n_components=self.num_of_cls,
                                    init='nndsvd',
                                    random_state=0)
                W = model.fit_transform(self.data_mat.transpose())
                H = model.components_
                H_init = np.asmatrix(W)
                W_init = np.asmatrix(H).transpose()
            elif init == 'kmeans++':
                print('using k++ initialization....')
                data_mat = self.data_mat.transpose()
                initial_centroids = np.ones((self.num_of_cls, m)) * (-1)
                ind_list = []
                idx = np.random.choice(n)
                ind_list.append(idx)
                initial_centroids[0, :] = data_mat[idx, :]
                while len(ind_list) < self.num_of_cls:  # one centroid per cluster
                    cent = initial_centroids[0:len(ind_list), :]
                    D2 = np.array([
                        min([LA.norm(x - c)**2 for c in cent])
                        for x in data_mat
                    ])
                    probs = D2 / D2.sum()
                    cumprobs = probs.cumsum()
                    #r = random.random()
                    r = np.random.random()
                    idx = np.where(cumprobs >= r)[0][0]
                    ind_list.append(idx)
                    initial_centroids[len(ind_list) - 1, :] = data_mat[idx, :]
                print(ind_list)

                W_init = np.asmatrix(initial_centroids).transpose()
                distances = np.ones((n, self.num_of_cls)) * (-1)  # one row per data point (data_mat here is n x m)
                for centroid_idx in range(self.num_of_cls):
                    for data_idx in range(n):
                        distances[data_idx, centroid_idx] = LA.norm(
                            data_mat[data_idx, :] -
                            initial_centroids[centroid_idx, :])

                cluster_assignments = np.argmin(distances, axis=1)
                temp_H = np.asmatrix(np.zeros((n, self.num_of_cls)))
                for j in range(n):
                    temp_H[j, cluster_assignments[j]] = 1

                #temp_H = np.diag(np.diag(temp_H * temp_H.transpose()) ** (-0.5)) * temp_H
                H_init = np.asmatrix(temp_H)

            else:
                raise ValueError(
                    "Error: invalid parameter 'init' (expected one of: random, kmeans, nmf, kmeans++)!"
                )

            H_init = np.asmatrix(H_init.transpose())

            if H_ortho:
                #H_init = np.asmatrix(H_init.transpose())
                (ha, hb) = H_init.shape
                ortho = LA.norm(
                    H_init * H_init.transpose() - np.asmatrix(np.eye(ha)),
                    'fro')
                print(H_init * H_init.transpose())
                if ortho > 1e-6:
                    H = np.zeros((ha, hb))
                    ind = np.asarray(np.argmax(H_init, 0))[0, :]
                    for j in range(hb):
                        H[ind[j], j] = 1
                    H = np.asmatrix(H)
                    temp = np.diag(H * H.transpose())
                    if np.any(temp == 0):
                        print(temp)
                        raise ValueError("some rows of H are zeros!!!")
                    H = np.asmatrix(np.diag(temp**(-0.5))) * H
                    H_init = H

        if seed >= 100:
            np.random.seed(seed)
            (m, n) = self.data_mat.shape

            # find centers from the smallest clusters
            cls_idx, cls_sizes = np.unique(self.true_labels,
                                           return_counts=True)
            s_id = cls_idx[np.argmax(cls_sizes)]
            id_list = np.where(self.true_labels == s_id)[0]
            print(s_id)
            print(id_list)

            # condensed vector of pairwise distances between the columns of data_mat
            dis_mat = pdist(self.data_mat.transpose())
            print(np.argmin(dis_mat))
            print(np.unravel_index(dis_mat.argmin(), dis_mat.shape))
            print(np.where(dis_mat == np.min(dis_mat[np.nonzero(dis_mat)])))
            print('select initial points -----')
            select_idx = [997, 998, 999]
            print(select_idx)
            #print id_list
            #select_idx = np.random.choice(id_list, self.num_of_cls, replace = False)

            W_init = self.data_mat[:, select_idx]
            #raise ValueError('TTEST!')
            W_init = np.asmatrix(W_init)
            print(W_init.shape)

            # save generated initializations
            f_manager = FileManager(self.root_dir)
            f_manager.add_file(initW_path)
            np.savetxt(initW_path, np.asmatrix(W_init), delimiter=',')
            f_manager.add_file(initH_path)
            np.savetxt(initH_path, np.asmatrix(H_init), delimiter=',')

        return np.asmatrix(W_init), np.asmatrix(H_init)
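
A standalone sketch of the kmeans branch (option 2) on random data, outside the class: H is a one-hot cluster-indicator matrix whose columns are rescaled so that H^T H = I, and W = data * H then aggregates each cluster (the centroids up to a scale factor).

import numpy as np
from sklearn.cluster import KMeans

data = np.random.rand(50, 200)                  # m features x n samples
k = 4
labels = KMeans(n_clusters=k, n_init=10).fit_predict(data.T)
H = np.zeros((data.shape[1], k))
H[np.arange(data.shape[1]), labels] = 1         # one-hot cluster indicators
H /= np.sqrt(H.sum(axis=0, keepdims=True))      # column-orthonormal: H.T @ H == identity
W = data @ H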
Example #9
def generate_metrics(hashes, labels, hamming_N=500, hamming_R=2):
    dists = pdist(hashes, metric="hamming") * hashes.shape[1]  # scale the hamming fraction up to a bit count
    mAP = mean_average_precision(dists, labels)
    precision_at_N = precision_at_sample(dists, labels, hamming_N)
    hamming_rank = hamming_radius(dists, labels, hamming_R)
    return mAP, precision_at_N, hamming_rank
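
A sketch of the distance step above on made-up codes (mean_average_precision and the other helpers are from the original repository and are not reproduced): with binary codes, the 'hamming' metric returns the fraction of differing bits, so multiplying by the code length recovers integer-valued Hamming distances.

import numpy as np
from scipy.spatial.distance import pdist, squareform

hashes = (np.random.rand(100, 48) > 0.5).astype(np.uint8)  # 100 binary codes of 48 bits
dists = squareform(pdist(hashes, metric="hamming") * hashes.shape[1])
print(dists[0, 1])  # number of bits on which codes 0 and 1 differ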