Example #1
def test_emd2_multi():

    from ot.datasets import get_1D_gauss as gauss

    n = 1000  # nb bins

    # bin positions
    x = np.arange(n, dtype=np.float64)

    # Gaussian distributions
    a = gauss(n, m=20, s=5)  # m= mean, s= std

    ls = np.arange(20, 1000, 20)
    nb = len(ls)
    b = np.zeros((n, nb))
    for i in range(nb):
        b[:, i] = gauss(n, m=ls[i], s=10)

    # loss matrix
    M = ot.dist(x.reshape((n, 1)), x.reshape((n, 1)))
    # M/=M.max()

    print('Computing {} EMD '.format(nb))

    # emd loss 1 proc
    ot.tic()
    emd1 = ot.emd2(a, b, M, 1)
    ot.toc('1 proc : {} s')

    # emd loss multiprocessing
    ot.tic()
    emdn = ot.emd2(a, b, M)
    ot.toc('multi proc : {} s')

    np.testing.assert_allclose(emd1, emdn)
Example #2
File: test_ot.py Project: eddardd/POT
def test_emd_emd2_devices_tf():
    if not tf:
        return
    nx = ot.backend.TensorflowBackend()

    n_samples = 100
    n_features = 2
    rng = np.random.RandomState(0)
    x = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples, n_features)
    a = ot.utils.unif(n_samples)
    M = ot.dist(x, y)

    # Check that everything stays on the CPU
    with tf.device("/CPU:0"):
        ab, Mb = nx.from_numpy(a, M)
        Gb = ot.emd(ab, ab, Mb)
        w = ot.emd2(ab, ab, Mb)
        nx.assert_same_dtype_device(Mb, Gb)
        nx.assert_same_dtype_device(Mb, w)

    if len(tf.config.list_physical_devices('GPU')) > 0:
        # Check that everything happens on the GPU
        ab, Mb = nx.from_numpy(a, M)
        Gb = ot.emd(ab, ab, Mb)
        w = ot.emd2(ab, ab, Mb)
        nx.assert_same_dtype_device(Mb, Gb)
        nx.assert_same_dtype_device(Mb, w)
        assert nx.dtype_device(Gb)[1].startswith("GPU")
def double_wasserstein1(X_train_smote):

    n, m, r, _ = X_train_smote.shape

    # uniform measures at points clouds of card m
    a2 = np.ones(m) / m
    b2 = np.ones(m) / m

    # uniform measures at points of card r
    a1 = np.ones(r) / r
    b1 = np.ones(r) / r

    # 1st level distance matrix of size m x m
    M1 = np.zeros((m, m))

    # M1 loop
    for i in range(m):
        for j in range(i + 1, m):

            # pairwise squared Euclidean distances as the ground metric
            M0_ij = ot.dist(X_train_smote[0, i],
                            X_train_smote[1, j],
                            metric="sqeuclidean")

            # 2-Wasserstein distance btw point clouds, take square root
            M1[i, j] = ot.emd2(a1, b1, M0_ij)**0.5

    # 1st level symmetrize
    M1 = M1 + M1.T
    np.fill_diagonal(M1, 1e9)

    # 1-Wasserstein distance btw collections of point clouds
    W1 = ot.emd2(a2, b2, M1)

    return W1
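A minimal usage sketch for `double_wasserstein1` above, assuming `numpy` and `ot` are imported as in the other examples. The 4-D input is made-up toy data of shape (n, m, r, d): n collections of m point clouds with r points in d dimensions (the function compares collections 0 and 1).

import numpy as np

# hypothetical toy input: 2 collections, 5 point clouds each, 30 points in 3-D
rng = np.random.RandomState(0)
X_toy = rng.rand(2, 5, 30, 3)

# 1-Wasserstein distance between the two collections, using the 2-Wasserstein
# distance between individual point clouds as the ground metric
print(double_wasserstein1(X_toy))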
Example #4
File: distances.py Project: wibruce/HOTT
def wmd(p, q, C, truncate=None):
    """ Word mover's distance between distributions p and q with cost M."""
    if truncate is None:
        return ot.emd2(p, q, C)
    id_p = np.argsort(p)[-truncate:]
    id_q = np.argsort(q)[-truncate:]
    C_reduced = C[id_p][:, id_q]
    return ot.emd2(p[id_p], q[id_q], C_reduced)
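A usage sketch for `wmd` above; the vocabulary size, embeddings, and distributions are made-up toy values, and `ot`/`numpy` are assumed to be imported as in the surrounding examples.

import numpy as np
import ot

rng = np.random.RandomState(0)
n_words = 50

# toy ground cost: pairwise squared Euclidean distances between word embeddings
emb = rng.randn(n_words, 5)
C = ot.dist(emb, emb)

# two normalized bag-of-words distributions over the shared vocabulary
p = rng.rand(n_words)
p /= p.sum()
q = rng.rand(n_words)
q /= q.sum()

print(wmd(p, q, C))  # exact word mover's distance
# wmd(p, q, C, truncate=20) keeps only the 20 heaviest words per side; note that
# the truncated marginals are not renormalized by the function above.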
Example #5
def w2(k):
    print(k)
    # result arrays; w2p holds the PCA-based distances and is returned below
    w2p = np.zeros((2, 10, 10, 500))
    w2z = np.zeros((2, 10, 10, 500))
    #w2b = np.zeros((10,10,500))
    for j in range(1, 11):
        print('   j: ' + str(j))
        for i in range(1):
            try:
                #Y = np.load('/home/archithpc/data/batch-variable-genes-3/sim-cluster-' + str(j) + '-' + str(i) + '.npy')[k,j-1,i-1]
                #Y = np.loadtxt('/home/archithpc/data/batch-effect-variable-genes-3/' + str(k) + '/sim-cluster-' + str(j) + '-' + str(i) + '.csv')
                dist_obs = squareform(pdist(Y))
                dist_obs_norm = normalize(dist_obs, norm='l1')

                #path = './'+ folder + '/matern/' + init + '/'  + str(k) + '/' + str(j) + '/' + str(i) + '/model-output-' + iters + '.hdf5'
                #fit = h5py.File(path, 'r')
                #zfit = fit['x_mean']

                #dist_z = squareform(pdist(zfit))
                #dist_z_norm = normalize(dist_z, norm = 'l1')

                ols = LinearRegression(fit_intercept=False)
                ols.fit(batch_var, Y)
                Yres = Y - ols.predict(batch_var)
                pca = PCA(n_components=3)
                zpca = pca.fit_transform(Yres)

                dist_pca = squareform(pdist(zpca))
                dist_pca_norm = normalize(dist_pca, norm='l1')

                #seurat = pd.read_csv('./'+folder+'/results/seurat-results/sim-' + str(i) + '.csv')
                #dist_seurat = squareform(pdist(seurat.values))
                #dist_s_norm = normalize(dist_seurat, norm = 'l1')

                #mnn = pd.read_csv('./'+folder+'/mnn-results/' + str(k) + '/sim-' + str(j) + '-' + str(i) + '.csv')
                #dist_m = squareform(pdist(mnn.values))
                #dist_m_norm = normalize(dist_m, norm = 'l1')

                for n in range(0, 500):
                    #w2z[0,j-1,i-1,n] = ot.emd2(dist_z_norm[j,:],dist_norm[j,:],M)
                    #w2z[1,j-1,i-1,n] = ot.emd2(dist_z_norm[j,:],dist_obs_norm[j,:],M)

                    w2p[0, j - 1, i - 1, n] = ot.emd2(dist_pca_norm[j, :],
                                                      dist_norm[j, :], M)
                    w2p[1, j - 1, i - 1, n] = ot.emd2(dist_pca_norm[j, :],
                                                      dist_obs_norm[j, :], M)

                    #w2s[0,k,j-1,i-1,n] = ot.emd2(dist_s_norm[j,:],dist_norm[j,:],M)
                    #w2s[1,k,j-1,i-1,n] = ot.emd2(dist_s_norm[j,:],dist_obs_norm[j,:],M)

                    #w2m[0,j-1,i-1,n] = ot.emd2(dist_m_norm[j,:],dist_norm[j,:],M)
                    #w2m[1,j-1,i-1,n] = ot.emd2(dist_m_norm[j,:],dist_obs_norm[j,:],M)

                    #w2b[j-1,i-1,n] = ot.emd2(dist_norm[j,:],dist_obs_norm[j,:],M)
            except Exception:
                w2z[0, j - 1, i - 1, :] = -1 * np.ones(500)
                w2z[1, j - 1, i - 1, :] = -1 * np.ones(500)
    return w2p
Example #6
def test_periodic_phi(gdim, M):
    events = np.random.rand(nev, M, 1 + gdim)
    for phi_col in range(1, gdim + 1):
        emds1 = emd.emds_pot(events, R=1.0, gdim=gdim, n_jobs=1, verbose=0)
        events_c = np.copy(events)
        events_c[:, :, phi_col] += 2 * np.pi * np.random.randint(
            -10, 10, size=(nev, M))
        emds2 = emd.emds_pot(events_c,
                             R=1.0,
                             gdim=gdim,
                             periodic_phi=True,
                             phi_col=phi_col,
                             n_jobs=1,
                             verbose=0)
        assert epsilon_diff(emds1, emds2, 10**-12)

        ev1 = np.random.rand(10, 1 + gdim) * 4 * np.pi
        ev2 = np.random.rand(20, 1 + gdim) * 4 * np.pi
        thetaw = np.zeros((len(ev1), len(ev2)))
        thetar = np.zeros((len(ev1), len(ev2)))
        for i, p1 in enumerate(ev1):
            for j, p2 in enumerate(ev2):
                dw, dr = 0., 0.
                for m, (k1, k2) in enumerate(zip(p1, p2)):
                    if m == 0:
                        continue
                    elif m == phi_col:
                        dw += (k1 - k2)**2
                        dr += np.min([
                            abs(k1 - (k2 + 2 * np.pi * n))
                            for n in range(-3, 3)
                        ])**2
                    else:
                        dw += (k1 - k2)**2
                        dr += (k1 - k2)**2
                thetaw[i, j] = np.sqrt(dw)
                thetar[i, j] = np.sqrt(dr)

        zs1 = np.ascontiguousarray(ev1[:, 0] / np.sum(ev1[:, 0]))
        zs2 = np.ascontiguousarray(ev2[:, 0] / np.sum(ev2[:, 0]))
        ot_w, ot_r = ot.emd2(zs1, zs2, thetaw), ot.emd2(zs1, zs2, thetar)

        ef_w = emd.emd_pot(ev1,
                           ev2,
                           norm=True,
                           gdim=gdim,
                           periodic_phi=False,
                           phi_col=phi_col)
        ef_r = emd.emd_pot(ev1,
                           ev2,
                           norm=True,
                           gdim=gdim,
                           periodic_phi=True,
                           phi_col=phi_col)

        assert epsilon_diff(ot_w, ef_w, 10**-14)
        assert epsilon_diff(ot_r, ef_r, 10**-14)
Example #7
File: test_ot.py Project: ylyslzx/POT
def test_emd2_multi():
    n = 500  # nb bins

    # bin positions
    x = np.arange(n, dtype=np.float64)

    # Gaussian distributions
    a = gauss(n, m=20, s=5)  # m= mean, s= std

    ls = np.arange(20, 500, 20)
    nb = len(ls)
    b = np.zeros((n, nb))
    for i in range(nb):
        b[:, i] = gauss(n, m=ls[i], s=10)

    # loss matrix
    M = ot.dist(x.reshape((n, 1)), x.reshape((n, 1)))
    # M/=M.max()

    print('Computing {} EMD '.format(nb))

    # emd loss 1 proc
    ot.tic()
    emd1 = ot.emd2(a, b, M, 1)
    ot.toc('1 proc : {} s')

    # emd loss multiprocessing
    ot.tic()
    emdn = ot.emd2(a, b, M)
    ot.toc('multi proc : {} s')

    ot.tic()
    emdn2 = ot.emd2(a, b, M, dense=False)
    ot.toc('multi proc : {} s')

    np.testing.assert_allclose(emd1, emdn)
    np.testing.assert_allclose(emd1, emdn2, rtol=1e-6)

    # emd loss multiprocessing with log
    ot.tic()
    emdn = ot.emd2(a, b, M, log=True, return_matrix=True)
    ot.toc('multi proc : {} s')

    for i in range(len(emdn)):
        emd = emdn[i]
        log = emd[1]
        cost = emd[0]
        check_duality_gap(a, b[:, i], M, log['G'], log['u'], log['v'], cost)
        emdn[i] = cost

    emdn = np.array(emdn)
    np.testing.assert_allclose(emd1, emdn)
Example #8
File: test_ot.py Project: eddardd/POT
def test_emd2_gradients():
    n_samples = 100
    n_features = 2
    rng = np.random.RandomState(0)

    x = rng.randn(n_samples, n_features)
    y = rng.randn(n_samples, n_features)
    a = ot.utils.unif(n_samples)

    M = ot.dist(x, y)

    if torch:

        a1 = torch.tensor(a, requires_grad=True)
        b1 = torch.tensor(a, requires_grad=True)
        M1 = torch.tensor(M, requires_grad=True)

        val, log = ot.emd2(a1, b1, M1, log=True)

        val.backward()

        assert a1.shape == a1.grad.shape
        assert b1.shape == b1.grad.shape
        assert M1.shape == M1.grad.shape

        assert np.allclose(
            a1.grad.cpu().detach().numpy(), log['u'].cpu().detach().numpy() -
            log['u'].cpu().detach().numpy().mean())

        assert np.allclose(
            b1.grad.cpu().detach().numpy(), log['v'].cpu().detach().numpy() -
            log['v'].cpu().detach().numpy().mean())

        # Testing for bug #309, checking for scaling of gradient
        a2 = torch.tensor(a, requires_grad=True)
        b2 = torch.tensor(a, requires_grad=True)
        M2 = torch.tensor(M, requires_grad=True)

        val = 10.0 * ot.emd2(a2, b2, M2)

        val.backward()

        assert np.allclose(10.0 * a1.grad.cpu().detach().numpy(),
                           a2.grad.cpu().detach().numpy())
        assert np.allclose(10.0 * b1.grad.cpu().detach().numpy(),
                           b2.grad.cpu().detach().numpy())
        assert np.allclose(10.0 * M1.grad.cpu().detach().numpy(),
                           M2.grad.cpu().detach().numpy())
Example #9
def wasserstein_l1(base_dist, new_dist, normalise=False):
    """ Calculate the Wasserstein distance between two sample-based distributions using an l1 cost"""

    # Calculate the Wasserstein distance between base_dist and new_dist with the l1 distance as the cost
    N = base_dist.shape[0]

    base_dist = np.copy(base_dist)
    new_dist = np.copy(new_dist)

    # Sanitise the distributions (ndarray.clip returns a copy, so re-assign)
    base_dist = base_dist.clip(0, 1)
    new_dist = new_dist.clip(0, 1)

    base_vector = np.ones(N) / N
    new_vector = np.ones(N) / N
    cost_matrix = np.zeros(shape=(N, N))

    for i in range(N):
        row_cost = np.abs(base_dist[i] - new_dist).sum(axis=1)
        cost_matrix[i] = row_cost

    if normalise:
        mean_phi = np.mean(new_dist, axis=0)
        cost_to_base = np.abs(mean_phi - base_dist).sum(axis=1) + 0.0000001
        cost_matrix = cost_matrix / cost_to_base[:, np.newaxis]

    w_results = emd2(base_vector, new_vector, cost_matrix)

    return w_results
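A small sketch of how `wasserstein_l1` might be called, assuming `numpy` and `emd2` are imported as the function above expects; both inputs are made-up sample matrices with rows in [0, 1] and the same number of rows.

import numpy as np

rng = np.random.RandomState(0)
base = rng.dirichlet(np.ones(5), size=200)       # 200 samples of a 5-dim distribution
new = rng.dirichlet(2.0 * np.ones(5), size=200)  # 200 samples of a different one

print(wasserstein_l1(base, new))
print(wasserstein_l1(base, new, normalise=True))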
Example #10
def pot_wasserstein_mapper(net1, net2, metric_space, p=None, q=None):
    """ Computes vanilla EMD (over Hausdorff dist) for mapper graphs

    Parameters
    ----------
    net1 : lightweight_mapper.Network
        Mapper graph
    net2 : lightweight_mapper.Network
        Mapper graph
    metric_space : np.array
        Pairwise distance matrix
    p : np.array - nx1
        Distribution over nodes corresponding to net1
    q : np.array - nx1
        Distribution over nodes corresponding to net2

    Returns
    -------
    EMD (Cost = Hausdorff dist)
    """
    C3 = network_merge_distance(net1, net2, metric_space)
    if p is None or q is None:
        p = np.diag(net1.adjacency_matrix.toarray())
        p = p / p.sum()

        q = np.diag(net2.adjacency_matrix.toarray())
        q = q / q.sum()

    gw_dist = ot.emd2(p, q, C3)
    params = ot.emd(p, q, C3)
    return gw_dist, params
Example #11
File: main.py Project: kenta1984/wrd
def main(args):
    # Input the sentences which you want to get the similarity
    s1 = '大坂なおみ 逆転で2年ぶり2度目の全米OP優勝。3度目のグランドスラム制覇'
    s2 = '大坂なおみが2年ぶり2回目のV 4大大会3勝目 全米テニス'
    # s2 = '大相撲秋場所 八角理事長「横綱不在 申し訳ない」'

    mt = MeCab.Tagger(
        '-d {} -Owakati'.format(args.mecab_dict_path)
    ) if args.mecab_dict_path is not None else MeCab.Tagger('-Owakati')
    wv = KeyedVectors.load_word2vec_format(
        os.path.dirname(os.path.abspath(__file__)) +
        '/vecs/jawiki.word_vectors.200d.txt')

    w1 = get_w(s1, mt, wv)
    w2 = get_w(s2, mt, wv)

    z1 = get_z(w1)
    z2 = get_z(w2)

    m1 = [np.linalg.norm(w1_i) / z1 for w1_i in w1]
    m2 = [np.linalg.norm(w2_i) / z2 for w2_i in w2]

    # Compute cost matrix C
    c = []
    for w1_i in w1:
        c.append([1 - cos_sim(np.array(w1_i), np.array(w2_j)) for w2_j in w2])

    # Show the result
    print(s1)
    print(s2)
    print("{:.2f}".format(ot.emd2(m1, m2, c)))
Example #12
def calculate_path_length(device, args, model, data, end_time, n_pts=10000):
    """ Calculates the total length of the path from time 0 to timepoint
    """
    # z_samples = torch.tensor(data.get_data()).type(torch.float32).to(device)
    z_samples = data.base_sample()(n_pts, *data.get_shape()).to(device)
    model.eval()
    n = 1001
    with torch.no_grad():
        integration_times = (torch.tensor(np.linspace(0, end_time, n)).type(
            torch.float32).to(device))
        # z, _ = model(z_samples, torch.zeros_like(z_samples), integration_times=integration_times, reverse=False)
        z, _ = model(
            z_samples,
            torch.zeros_like(z_samples),
            integration_times=integration_times,
            reverse=True,
        )
        z = z.cpu().numpy()
        z_diff = np.diff(z, axis=0)
        z_lengths = np.sum(np.linalg.norm(z_diff, axis=-1), axis=0)
        total_length = np.mean(z_lengths)
        import ot as pot
        from scipy.spatial.distance import cdist

        emd = pot.emd2(
            np.ones(n_pts) / n_pts,
            np.ones(n_pts) / n_pts,
            cdist(z[-1, :, :], data.get_data()),
        )
        print(total_length, emd)
        plt.scatter(z[-1, :, 0], z[-1, :, 1])
        plt.savefig("test.png")
        plt.close()
Example #13
    def basket_dist_EMD(self, baskets):
        basket1 = baskets[0]
        basket2 = baskets[1]
        dictionary = np.unique(list(basket1) + list(basket2))
        vocab_len_ = len(dictionary)
        product2ind = dict(zip(dictionary, np.arange(vocab_len_)))

        # Compute distance matrix.
        dictionary_vecs = self.model.wv.vectors[[x for x in dictionary]]
        distance_matrix = squareform(pdist(dictionary_vecs))

        if np.sum(distance_matrix) == 0.0:
            # `emd` gets stuck if the distance matrix contains only zeros.
            return float('inf')

        def nbow(document):
            bow = np.zeros(vocab_len_, dtype=float)
            for d in document:
                bow[product2ind[d]] += 1.
            return bow / len(document)

        # Compute nBOW representation of documents.
        d1 = nbow(basket1)
        d2 = nbow(basket2)

        # Compute WMD.
        return ot.emd2(d1, d2, distance_matrix)
Example #14
File: ot.py Project: ICB-DCM/pyABC
    def __call__(
        self,
        x: dict,
        x_0: dict,
        t: int = None,
        par: dict = None,
    ) -> float:
        # compute summary statistics, shape (n, dim), (n0, dim)
        s, s0 = self.sumstat(x), self.sumstat(x_0)
        n, n0 = s.shape[0], s0.shape[0]

        # pairwise cost matrix, shape (n, n0)
        m = self.dist(XA=s, XB=s0)

        # weights (could also be passed/learned?)
        w, w0 = np.ones((n, )) / n, np.ones((n0, )) / n0

        # optimal transport ("earth mover's") cost value
        cost = ot.emd2(a=w, b=w0, M=m, **self.emd_args, log=False)

        # take root to match Wasserstein distance definition
        if self.p < np.inf:
            cost = cost**(1 / self.p)

        return cost
 def evaluation(trainer):
     num_sample = 10000
     xp = gen.xp
     #xs = []
     """
     for i in range(0, num_sample, batchsize):
         z = Variable(xp.asarray(gen.make_hidden(batchsize)))
         with chainer.using_config('train', False), chainer.using_config('enable_backprop', False):
             x = gen(z)
         x = chainer.cuda.to_cpu(x.data)
         xs.append(x)
     xs = np.asarray(xs)
     """
     z = Variable(xp.asarray(gen.make_hidden(num_sample)))
     with chainer.using_config('train', False), chainer.using_config('enable_backprop', False):
         x = gen(z)
     xs = chainer.cuda.to_cpu(x.data)
     real_data = GmmDataset(num_sample, 123, num_cluster=8, std=0.02, scale=2)._data
     a, b = np.ones((num_sample,)) / num_sample, np.ones((num_sample,)) / num_sample
     #print(xs)
     #print(real_data)
     M = ot.dist(xs, real_data)
     M /= M.max()
     distance = ot.emd2(a, b, M)
     
     del xs
     gc.collect()
     del real_data
     gc.collect()
     
     #print(distance)
     chainer.reporter.report({
         'EMD': distance,
     })
Example #16
    def compute_batch_images_emd(b_img1, b_img2, eps=0):
        """Compute the earth mover's distance between two batches of single-channel images.

        :param b_img1: batch of images, shape (b, h, w, 1)
        :param b_img2: batch of images, shape (b, h, w, 1)
        :param eps: small clipping value so the EMD does not degenerate to zero
        :return: array of shape (b, 1) with one EMD value per image pair
        """
        import ot
        assert b_img1.shape[-1] == 1 and b_img2.shape[-1] == 1

        b_img1 = b_img1.astype(np.float64)
        b_img2 = b_img2.astype(np.float64)
        # eps: avoid emd be zero
        b_img1 = np.clip(b_img1, a_min=eps, a_max=255)
        b_img1 /= b_img1.sum(axis=(1, 2, 3), keepdims=True)
        b_img2 = np.clip(b_img2, a_min=eps, a_max=255)
        b_img2 /= b_img2.sum(axis=(1, 2, 3), keepdims=True)

        b, h, w, c = b_img1.shape
        b_img1 = b_img1.reshape((b, -1))
        b_img2 = b_img2.reshape((b, -1))

        xx, yy = np.meshgrid(np.arange(h), np.arange(w))
        xy = np.hstack((xx.reshape(-1, 1), yy.reshape(-1, 1)))
        M = ot.dist(xy, xy)
        emd = np.zeros((b, 1))

        for idx in range(b):
            xapp1 = b_img1[idx]
            xapp2 = b_img2[idx]
            dist = ot.emd2(xapp1, xapp2, M)
            assert dist > 0
            emd[idx] = dist

        return emd
Example #17
    def distFun_raw(self, x1, x2):
        if self.distType == "Euclidean":
            #normalize data CHECK IF IT WORKS BETTER
            x1 = np.divide(x1, np.linalg.norm(x1, 1, 0))
            x2 = np.divide(x2, np.linalg.norm(x2, 1, 0))

            tmp = np.zeros((x1.shape[1], x2.shape[1]))
            for k in range(x1.shape[1]):
                # slice with k:k + 1 so the column stays 2-D and broadcasts against x2
                tmp[k, :] = np.linalg.norm(x1[:, k:k + 1] - x2, None, 0)
            if x1.shape[1] == 1 and x2.shape[1] == 1:
                tmp = tmp.item()
            return tmp
        elif self.distType == "Wasserstein":
            if x1.shape[1] == 1 and x2.shape[1] == 1:
                x1 = np.array(np.divide(x1, np.sum(x1))).flatten()
                x2 = np.array(np.divide(x2, np.sum(x2))).flatten()
                #                print((x1.shape,x2.shape,self.TransportCostImg.shape))
                return ot.emd2(x1, x2, self.TransportCostImg)
            else:
                tmp = np.zeros((x1.shape[1], x2.shape[1]))
                for k in range(x1.shape[1]):
                    for kk in range(x2.shape[1]):
                        if kk >= k:
                            # pass 2-D column slices so the recursive call sees shape (d, 1)
                            tmp[k, kk] = self.distFun_raw(x1[:, k:k + 1], x2[:, kk:kk + 1])
                        else:
                            tmp[k, kk] = tmp[kk, k]
                return tmp
Example #18
def wasserstein_distance_1d(histograms, centers, normalize=True, **kwargs):
    # handle args
    if not isinstance(centers[0], np.ndarray):
        centers = (centers, centers)

    metric = kwargs.pop("metric", "euclidean")

    # checks
    assert len(histograms) == 2
    assert histograms[0].ndim == 1
    assert histograms[1].ndim == 1

    assert len(centers) == 2
    assert centers[0].ndim == 1
    assert centers[1].ndim == 1

    # point distance
    centers0 = centers[0][..., np.newaxis]
    centers1 = centers[1][..., np.newaxis]
    M = cdist(centers0, centers1, metric=metric)

    # normalize histograms
    if normalize:
        hist0 = histograms[0] / histograms[0].sum()
        hist1 = histograms[1] / histograms[1].sum()
    else:
        hist0 = histograms[0]
        hist1 = histograms[1]

    # wasserstein distance
    distance = ot.emd2(hist0, hist1, M)
    return distance
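A usage sketch for `wasserstein_distance_1d`, assuming `ot` and `scipy.spatial.distance.cdist` are imported as the function above expects; the two histograms and bin centers are toy values.

import numpy as np

centers = np.linspace(0.0, 1.0, 10)
hist_a = np.array([0, 1, 3, 5, 4, 2, 1, 0, 0, 0], dtype=float)
hist_b = np.array([0, 0, 1, 2, 4, 5, 3, 1, 0, 0], dtype=float)

# histograms are normalized internally because normalize defaults to True
print(wasserstein_distance_1d((hist_a, hist_b), centers))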
def loader(data_path,
           embeddings_path,
           p=1,
           K_lda=70,
           glove_embeddings=True,
           stemming=True):
    """ Load dataset and embeddings from data path."""
    # Load dataset from data_path
    vocab, embed_vocab, bow_data, y = load_wmd_data(data_path)
    y = y - 1

    # Use GLOVE word embeddings
    if glove_embeddings:
        vocab, embed_vocab, bow_data = change_embeddings(
            vocab, bow_data, embeddings_path)
    # Reduce vocabulary by removing short words, stop words, and stemming
    if stemming:
        vocab, embed_vocab, bow_data = reduce_vocab(bow_data,
                                                    vocab,
                                                    embed_vocab,
                                                    embed_aggregate='mean')

    # Get embedded documents
    embed_data = get_embedded_data(
        bow_data, embed_vocab, vocab
    )  # list of length n_docs; each entry is an np.array of shape (embedding_size, n_words_in_doc)
    # Matrix of word embeddings
    embeddings = np.array([embed_vocab[w] for w in vocab
                           ])  # shape (vocab_size, embedding_features)

    topics, lda_centers, topic_proportions = fit_topics(
        bow_data, embeddings, vocab, K_lda)

    cost_embeddings = euclidean_distances(
        embeddings, embeddings
    )**p  # pairwise distances between word embeddings raised to power p (vocabulary-level base cost)
    cost_topics = np.zeros((topics.shape[0], topics.shape[0]))
    for i in range(cost_topics.shape[0]):
        print("i :", i)
        for j in range(i + 1, cost_topics.shape[1]):
            cost_topics[i, j] = ot.emd2(topics[i] / (topics[i].sum()),
                                        topics[j] / (topics[j].sum()),
                                        cost_embeddings)
    cost_topics = cost_topics + cost_topics.T
    out = {
        'X': bow_data,
        'y': y,
        'embeddings': embeddings,
        'topics': topics,
        'proportions': topic_proportions,
        'cost_E': cost_embeddings,
        'cost_T': cost_topics
    }

    print("Processing save for" + data_path)

    save_preprocessing(out, data_path, K_lda)

    print("Save done for" + data_path)
    return out
Example #20
File: nbd.py Project: tlarock/netrd
    def dist(self, G1, G2, topk='automatic', batch=100, tol=1e-5):
        """NBD between two graphs.

        Params
        ------

        G1, G2 (nx.Graph): The graphs to compare.

        topk (int or 'automatic'): The number of eigenvalues to compute.
        If 'automatic' (default), use only the eigenvalues that are larger
        than the square root of the largest eigenvalue.  Note this may
        yield different number of eigenvalues for each graph.

        batch (int): If topk is 'automatic', this is the number of
        eigenvalues to compute each time until the condition is met.
        Default 100.

        tol (float): Numerical tolerance when computing eigenvalues.

        """
        vals1 = nbvals(G1, topk, batch, tol)
        vals2 = nbvals(G2, topk, batch, tol)
        mass = lambda num: np.ones(num) / num
        vals_dist = distance_matrix(vals1, vals2)
        dist = emd2(mass(vals1.shape[0]), mass(vals2.shape[0]), vals_dist)
        self.results['vals'] = (vals1, vals2)
        return dist
Example #21
def compute_divergence_from_cluster_labels(embeds1, embeds2, labels1, labels2,
                                           threshold):
    labels_all = list(np.concatenate((labels1, labels2)))
    counts1 = Counter(labels1)
    counts2 = Counter(labels2)
    n_senses = list(set(labels_all))
    #print("Clusters:", len(n_senses))
    t1 = []
    t2 = []
    label_list = []
    for i in n_senses:
        if counts1[i] + counts2[i] > threshold:
            t1.append(counts1[i])
            t2.append(counts2[i])
            label_list.append(i)
    t1 = np.array(t1)
    t2 = np.array(t2)

    emb1_means = np.array(
        [np.mean(embeds1[labels1 == clust], 0) for clust in label_list])
    emb2_means = np.array(
        [np.mean(embeds2[labels2 == clust], 0) for clust in label_list])
    M = np.nan_to_num(np.array(
        [cdist(emb1_means, emb2_means, metric='cosine')])[0],
                      nan=1)
    t1_dist = t1 / t1.sum()
    t2_dist = t2 / t2.sum()
    wass = ot.emd2(t1_dist, t2_dist, M)
    jsd = compute_jsd(t1_dist, t2_dist)
    return jsd, wass
Example #22
def _compute_wasserstein_distance(label_sequences, sinkhorn=False, 
                                    categorical=False, sinkhorn_lambda=1e-2):
    '''
    Generate the Wasserstein distance matrix for the graphs embedded 
    in label_sequences
    '''
    # Get the iteration number from the embedding file
    n = len(label_sequences)
    
    M = np.zeros((n,n))
    # Iterate over pairs of graphs
    for graph_index_1, graph_1 in enumerate(label_sequences):
        # Only keep the embeddings for the first h iterations
        labels_1 = label_sequences[graph_index_1]
        for graph_index_2, graph_2 in enumerate(label_sequences[graph_index_1:]):
            labels_2 = label_sequences[graph_index_2 + graph_index_1]
            # Get cost matrix
            ground_distance = 'hamming' if categorical else 'euclidean'
            costs = ot.dist(labels_1, labels_2, metric=ground_distance)

            if sinkhorn:
                mat = ot.sinkhorn(np.ones(len(labels_1))/len(labels_1), 
                                    np.ones(len(labels_2))/len(labels_2), costs, sinkhorn_lambda, 
                                    numItermax=50)
                M[graph_index_1, graph_index_2 + graph_index_1] = np.sum(np.multiply(mat, costs))
            else:
                M[graph_index_1, graph_index_2 + graph_index_1] = \
                    ot.emd2([], [], costs)
                    
    M = (M + M.T)
    return M
Example #23
def Wdist(X, Y, reg=0., p=2.):
    '''
    param X, Y: (n x 2) and (m x 2) numpy array (points of persistence diagrams)
    param reg: regularization parameters for entropic smoothing. If 0., exact computation.
    param p: exponent for Wasserstein;
    return: float, estimation of the Wasserstein distance between two diagrams (exact if reg = 0.).
    '''
    M = build_dist_matrix(X, Y, p=p)
    n = len(X)
    m = len(Y)
    a = 1.0 / (n + m) * np.ones(
        n)  # weight vector of the input diagram. Uniform here.
    hat_a = np.append(
        a,
        m / (n + m))  # so that we have a probability measure, required by POT
    b = 1.0 / (n + m) * np.ones(
        m)  # weight vector of the input diagram. Uniform here.
    hat_b = np.append(
        b,
        n / (m + n))  # so that we have a probability measure, required by POT
    if reg > 0:
        ot_cost = (n + m) * ot.bregman.sinkhorn2(hat_a, hat_b, M, reg=reg)
    else:
        ot_cost = (n + m) * ot.emd2(hat_a, hat_b, M)
    return np.power(ot_cost, 1. / p)
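A sketch of calling `Wdist` on two toy persistence diagrams, assuming the helper `build_dist_matrix` used above (not shown here) and `ot`/`numpy` are available in the same module.

import numpy as np

# toy persistence diagrams as (birth, death) pairs
X = np.array([[0.0, 1.0], [0.5, 2.0], [1.0, 1.5]])
Y = np.array([[0.1, 1.2], [0.4, 1.9]])

print(Wdist(X, Y, reg=0.0, p=2.0))   # exact computation via ot.emd2
print(Wdist(X, Y, reg=0.01, p=2.0))  # entropically smoothed via ot.bregman.sinkhorn2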
Example #24
def get_dwasserstein(model, market_baskets):
    # make distance matrix d between purchase histories of customer c and d
    basket_X = market_baskets[0]
    basket_Y = market_baskets[1]
    list_basketX = list(basket_X)
    list_basketY = list(basket_Y)
    dictionary = np.unique(list_basketX + list_basketY)
    dictionary_len = len(dictionary)
    product2index = dict(zip(dictionary, np.arange(dictionary_len)))

    dictionary_vectors = model.wv.vectors[[word for word in dictionary]]
    distance_matrix = squareform(pdist(dictionary_vectors))

    if np.sum(distance_matrix) == 0.0:
        return float('inf')

    def bag_of_words(document):
        bow = np.zeros(dictionary_len, dtype=float)
        for d in document:
            bow[product2index[d]] += 1.
        return bow / len(document)

    bow_X = bag_of_words(basket_X)
    bow_Y = bag_of_words(basket_Y)
    # Finally we compute the Wasserstein metric using both baskets and the distance metrics.
    dw = ot.emd2(bow_X, bow_Y, distance_matrix)
    return dw
Example #25
def _edge_curvature(
    edge,
    measures,
    geodesic_distances,
    measure_cutoff=1e-6,
    sinkhorn_regularisation=0,
    weighted_curvature=False,
):
    """Compute curvature for an edge."""
    node_x, node_y = edge
    m_x, m_y = measures[node_x], measures[node_y]

    Nx = np.where(m_x >= measure_cutoff * np.max(m_x))[0]
    Ny = np.where(m_y >= measure_cutoff * np.max(m_y))[0]

    m_x, m_y = m_x[Nx], m_y[Ny]
    m_x /= m_x.sum()
    m_y /= m_y.sum()

    distances_xy = geodesic_distances[np.ix_(Nx, Ny)]

    if sinkhorn_regularisation > 0:
        wasserstein_distance = ot.sinkhorn2(m_x, m_y, distances_xy, sinkhorn_regularisation)[0]
    else:
        wasserstein_distance = ot.emd2(m_x, m_y, distances_xy)

    if weighted_curvature:
        return geodesic_distances[node_x, node_y] - wasserstein_distance
    return 1.0 - wasserstein_distance / geodesic_distances[node_x, node_y]
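A toy sketch of `_edge_curvature` on a 3-node path graph, with made-up per-node measures and geodesic distances; `numpy` and `ot` are assumed to be imported as above.

import numpy as np

# geodesic distances on a path graph 0 - 1 - 2
geodesic = np.array([[0.0, 1.0, 2.0],
                     [1.0, 0.0, 1.0],
                     [2.0, 1.0, 0.0]])

# one probability measure per node (each row sums to 1), made-up values
measures = np.array([[0.6, 0.4, 0.0],
                     [0.2, 0.6, 0.2],
                     [0.0, 0.4, 0.6]])

# Ollivier-Ricci-style curvature of the edge (0, 1)
print(_edge_curvature((0, 1), measures, geodesic))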
Example #26
def wasserstein(M, sqrt):
    """Calculate earth mover's distance."""
    if sqrt:
        M = M.abs().sqrt()
    emd = ot.emd2([], [], M.numpy())

    return emd
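The helper above relies on POT's convention that passing empty lists for both marginals makes `ot.emd2` assume uniform weights. A small sketch with a made-up torch cost matrix:

import torch
import ot

# toy cost matrix between two point clouds
x = torch.randn(40, 3)
y = torch.randn(60, 3)
M = torch.cdist(x, y) ** 2

print(wasserstein(M, sqrt=True))   # uses |M|^(1/2) as the ground cost
print(ot.emd2([], [], M.numpy()))  # same uniform-weight convention, cost used as-is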
Example #27
    def __call__(self, y_true_proba, y_proba):
        scores = []

        mask = ~np.any(np.isnan(y_proba), axis=1)
        y_proba = y_proba[mask]
        y_true_proba = y_true_proba[mask]

        for this_y_true, this_y_proba in zip(y_true_proba, y_proba):
            this_y_true_max = this_y_true.max()
            this_y_proba_max = this_y_proba.max()

            # special treatment for the all zero cases
            if (this_y_true_max * this_y_proba_max) == 0:
                if this_y_true_max or this_y_proba_max:
                    scores.append(1.)  # as ground_metric max is 1
                else:
                    scores.append(0.)
                continue

            this_y_true = this_y_true.astype(np.float64) / this_y_true.sum()
            this_y_proba = this_y_proba.astype(np.float64) / this_y_proba.sum()

            score = emd2(this_y_true, this_y_proba, self.ground_metric)
            scores.append(score)

        assert len(scores) == len(y_true_proba)
        assert len(y_proba) == len(y_true_proba)
        return np.mean(scores)
Example #28
def earth_mover_distance(cloud1, cloud2, eigenvals):
    """
    Returns the earth mover's distance between two point clouds

    Parameters
    ----------
    cloud1 : 2-D array
        First point cloud
    cloud2 : 2-D array
        Second point cloud
    eigenvals : 2-D array or None
        If provided, both clouds are projected onto this matrix (via a dot product) first

    Returns
    -------
    distance : float
        The distance between the two point clouds
    """
    cloud1 = cloud1.toarray() if scipy.sparse.isspmatrix(cloud1) else cloud1
    cloud2 = cloud2.toarray() if scipy.sparse.isspmatrix(cloud2) else cloud2
    if eigenvals is not None:
        cloud1 = cloud1.dot(eigenvals)
        cloud2 = cloud2.dot(eigenvals)
    p = np.ones(len(cloud1)) / len(cloud1)
    q = np.ones(len(cloud2)) / len(cloud2)
    pairwise_dist = sklearn.metrics.pairwise.pairwise_distances(
        cloud1, Y=cloud2, metric='sqeuclidean')
    return np.sqrt(pot.emd2(p, q, pairwise_dist, numItermax=1e7))
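A usage sketch for `earth_mover_distance`, assuming its module-level imports (`numpy`, `scipy`, `sklearn`, and `ot` as `pot`) are available; the clouds are toy 2-D samples and `eigenvals=None` keeps the raw coordinates.

import numpy as np

rng = np.random.RandomState(0)
cloud1 = rng.randn(100, 2)
cloud2 = rng.randn(150, 2) + 1.0

print(earth_mover_distance(cloud1, cloud2, eigenvals=None))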
Example #29
    def basket_dist_Decomp(self, bskts):
        bskt1 = bskts[0]
        bskt2 = bskts[1]
        dct = nmpy.unique(list(bskt1) + list(bskt2))
        vocab_len_ = len(dct)
        product2ind = dict(zip(dct, nmpy.arange(vocab_len_)))

        # Here distance matrix is calculated.
        dict_vectors = self.model.wv.vectors[[x for x in dct]]
        dist_matrix = squareform(pdist(dict_vectors))

        if nmpy.sum(dist_matrix) == 0.0:
            # There will be issues if 'EMD' has 0s in it.
            return float('inf')

        def no_bow(doc):
            # normalized bag-of-words representation of a document
            bow = nmpy.zeros(vocab_len_, dtype=float)
            for e in doc:
                bow[product2ind[e]] += 1.
            return bow / len(doc)

        # 'no_bow' is used to represent the baskets as normalized bag-of-words documents.
        dist_1 = no_bow(bskt1)
        dist_2 = no_bow(bskt2)

        # Here we obtain Wasserstein Minimim Distance.
        return ot.emd2(dist_1, dist_2, dist_matrix)
Example #30
File: test_ot.py Project: HelenLiGit/POT
def test_emd2_multi():
    n = 1000  # nb bins

    # bin positions
    x = np.arange(n, dtype=np.float64)

    # Gaussian distributions
    a = gauss(n, m=20, s=5)  # m= mean, s= std

    ls = np.arange(20, 1000, 20)
    nb = len(ls)
    b = np.zeros((n, nb))
    for i in range(nb):
        b[:, i] = gauss(n, m=ls[i], s=10)

    # loss matrix
    M = ot.dist(x.reshape((n, 1)), x.reshape((n, 1)))
    # M/=M.max()

    print('Computing {} EMD '.format(nb))

    # emd loss 1 proc
    ot.tic()
    emd1 = ot.emd2(a, b, M, 1)
    ot.toc('1 proc : {} s')

    # emd loss multiprocessing
    ot.tic()
    emdn = ot.emd2(a, b, M)
    ot.toc('multi proc : {} s')

    np.testing.assert_allclose(emd1, emdn)

    # emd loss multiprocessing with log
    ot.tic()
    emdn = ot.emd2(a, b, M, log=True, return_matrix=True)
    ot.toc('multi proc : {} s')

    for i in range(len(emdn)):
        emd = emdn[i]
        log = emd[1]
        cost = emd[0]
        check_duality_gap(a, b[:, i], M, log['G'], log['u'], log['v'], cost)
        emdn[i] = cost

    emdn = np.array(emdn)
    np.testing.assert_allclose(emd1, emdn)
Example #31
def create_prototypes(dataset_name):
    all_files = glob.glob(dataset_name + "/json_format/*")
    all_files.sort(key=lambda x: int((x.strip().split('/')[-1]).split('.')[0]))

    all_clases = {}

    for file in all_files:
        name = (file.strip().split('/')[-1]).split('.')[0]
        with open(file, "r") as f1:
            graph = json.load(f1)

        if graph["target"] not in all_clases.keys():
            all_clases[graph["target"]] = {}

        if len(graph["labels"]) == 1:
            raise ValueError("Only one node")

        G = nx.Graph()
        G.add_edges_from(graph["edges"])
        N = normalized_laplacian_matrix(G).todense()
        eigvals = scipy.linalg.eigvals(N)
        eigvals = eigvals.real.round(decimals=5)
        if type(eigvals) == int:
            raise TypeError("Type is int rather than list")

        all_clases[graph["target"]][int(name)] = eigvals

    class_prototype_dict = {}

    for class_, class_graphs in all_clases.items():
        current_class_eigvals = []

        for num in sorted(class_graphs):
            # print(num)
            current_class_eigvals.append(class_graphs[num])

        all_dist = []
        for i in range(len(current_class_eigvals)):
            current_dist = []
            for j in range(len(current_class_eigvals)):
                a = current_class_eigvals[i]
                b = current_class_eigvals[j]
                cost = ot.utils.dist(np.reshape(a, (a.shape[0], 1)),
                                     np.reshape(b, (b.shape[0], 1)))
                # Uniform distribution has been assumed over the spectra for faster implementation. One can first use density estimation
                # to approximate the distribution which can provide better results.
                loss = ot.emd2([], [], cost)
                current_dist.append(loss)
            all_dist.append(current_dist)
        all_dist = np.array(all_dist)
        current_prot_index = np.argmin(np.sum(all_dist, axis=1))
        # print(list(class_graphs.keys()))
        sorted_keys = list(class_graphs.keys())
        sorted_keys.sort()
        class_prototype_dict[str(class_)] = sorted_keys[current_prot_index]

    print(class_prototype_dict)
    with open(dataset_name + "/class_prototype_numbers.json", 'w') as f:
        json.dump(class_prototype_dict, f)
Example #32
File: test_ot.py Project: HelenLiGit/POT
def test_emd_empty():
    # test emd and emd2 for simple identity
    n = 100
    rng = np.random.RandomState(0)

    x = rng.randn(n, 2)
    u = ot.utils.unif(n)

    M = ot.dist(x, x)

    G = ot.emd([], [], M)

    # check G is identity
    np.testing.assert_allclose(G, np.eye(n) / n)
    # check constraints
    np.testing.assert_allclose(u, G.sum(1))  # cf convergence sinkhorn
    np.testing.assert_allclose(u, G.sum(0))  # cf convergence sinkhorn

    w = ot.emd2([], [], M)
    # check loss=0
    np.testing.assert_allclose(w, 0)
Example #33
def wasserstein(M, sqrt):
    if sqrt:
        M = M.abs().sqrt()
    emd = ot.emd2([], [], M.numpy())

    return emd