示例#1
0
def test_knn_memory():
    """Check that an estimator built with ``memory=<dir>`` reuses its results.

    After a first ``fit_transform``, a ``transform`` on the same bags must
    emit no INFO-level records on the ``skl_groups.divergences.knn`` logger,
    and a second ``fit_transform`` must not log a message starting with
    "Getting divergences". Both must return values equal to the first result.

    The temporary cache directory is removed when the test finishes, whether
    it passes or fails.
    """
    import shutil

    if not have_flann:
        raise SkipTest("No flann, so skipping knn tests.")

    dim = 3
    n = 20
    np.random.seed(47)
    bags = Features([np.random.randn(np.random.randint(30, 100), dim)
                     for _ in xrange(n)])

    tdir = tempfile.mkdtemp()
    try:
        div_funcs = ('kl', 'js', 'renyi:.9', 'l2', 'tsallis:.8')
        Ks = (3, 4)
        est = KNNDivergenceEstimator(div_funcs=div_funcs, Ks=Ks, memory=tdir)
        res1 = est.fit_transform(bags)

        # Transforming the bags that were just fitted should log nothing.
        with LogCapture('skl_groups.divergences.knn',
                        level=logging.INFO) as log:
            res2 = est.transform(bags)
            assert len(log.records) == 0
        assert np.all(res1 == res2)

        # Refitting may log, but must not announce a divergence computation.
        with LogCapture('skl_groups.divergences.knn',
                        level=logging.INFO) as log:
            res3 = est.fit_transform(bags)
            for r in log.records:
                assert not r.message.startswith("Getting divergences")
        assert np.all(res1 == res3)
    finally:
        # mkdtemp() never cleans up after itself; do not leak the cache dir.
        shutil.rmtree(tdir, ignore_errors=True)
示例#2
0
def test_knn_memory():
    """Verify that results are reused when the estimator has a ``memory`` dir.

    The test fits once, then asserts that (a) ``transform`` on the same bags
    produces no INFO records on the ``skl_groups.divergences.knn`` logger and
    (b) a repeated ``fit_transform`` logs nothing starting with
    "Getting divergences". All three result arrays must be equal.

    The temporary directory created for the cache is deleted on exit.
    """
    import shutil

    if not have_flann:
        raise SkipTest("No flann, so skipping knn tests.")

    dim = 3
    n = 20
    np.random.seed(47)
    bags = Features(
        [np.random.randn(np.random.randint(30, 100), dim) for _ in xrange(n)])

    logger_name = 'skl_groups.divergences.knn'
    tdir = tempfile.mkdtemp()
    try:
        div_funcs = ('kl', 'js', 'renyi:.9', 'l2', 'tsallis:.8')
        Ks = (3, 4)
        est = KNNDivergenceEstimator(div_funcs=div_funcs, Ks=Ks, memory=tdir)
        res1 = est.fit_transform(bags)

        with LogCapture(logger_name, level=logging.INFO) as log:
            res2 = est.transform(bags)
            assert len(log.records) == 0
        assert np.all(res1 == res2)

        with LogCapture(logger_name, level=logging.INFO) as log:
            res3 = est.fit_transform(bags)
            for r in log.records:
                assert not r.message.startswith("Getting divergences")
        assert np.all(res1 == res3)
    finally:
        # The directory from mkdtemp() is ours to remove; avoid leaking it.
        shutil.rmtree(tdir, ignore_errors=True)
示例#3
0
def test_knn_kl():
    """Compare the K=2 KL estimate on two tiny 1-D bags with hand-computed values."""
    if not have_flann:
        raise SkipTest("No flann, so skipping knn tests.")

    # verified by hand
    # Dhat(P||Q) = \log m/(n-1) + d / n  \sum_{i=1}^n \log \nu_k(i)/rho_k(i)
    x = np.reshape([0., 1, 3], (3, 1))
    y = np.reshape([.2, 1.2, 3.2, 7.2], (4, 1))

    n = x.shape[0]
    m = y.shape[0]

    # Sums of the per-point log ratios for each direction.
    x_log_ratios = np.log(1.2 / 3) + np.log(.8 / 2) + np.log(1.8 / 3)
    y_log_ratios = (np.log(.8 / 3) + np.log(1.2 / 2) + np.log(2.2 / 3) +
                    np.log(6.2 / 6))

    x_to_y = np.log(m / (n - 1)) + 1 / n * x_log_ratios
    y_to_x = np.log(n / (m - 1)) + 1 / m * y_log_ratios

    msg = "got {}, expected {}"
    estimator = KNNDivergenceEstimator(div_funcs=['kl'], Ks=[2], clamp=False)
    res = estimator.fit_transform([x, y]).squeeze()

    # A bag's divergence to itself is exactly zero.
    assert res[0, 0] == 0
    assert res[1, 1] == 0
    for got, expected in ((res[0, 1], x_to_y), (res[1, 0], y_to_x)):
        assert np.allclose(got, expected), msg.format(got, expected)
示例#4
0
def divergence_gen(gen, gt_db, batch=1000, metric='kl', normalize=False,
                   n_bins=100, whitening=True, classes=None, **kwargs):
    """Measure the discrepancy between generated samples and ground truth.

    Samples are drawn from the generator ``gen`` (the model trying to
    approximate the distribution behind ``gt_db``) and compared with
    ``gt_db`` using the requested metric. The result is also sent to
    ``chainer.reporter``.

    Parameters
    ----------
    gen : generator passed through to ``gen_images`` /
        ``gen_images_with_condition``.
    gt_db : array whose first axis indexes the ground-truth samples; it is
        flattened to 2-D if needed.
    batch : ignored. Kept for interface compatibility; the number of generated
        samples is always ``gt_db.shape[0]``.
    metric : ``'ndb'`` for the NDB score, otherwise a divergence name handed
        to ``KNNDivergenceEstimator`` (e.g. ``'kl'``).
    normalize : if true, the generated samples are mapped through
        ``x / 127.5 - 1`` before the k-NN estimate. The NDB branch always
        receives the un-normalized samples.
    n_bins, whitening : forwarded to ``NDB`` the first time it is built.
    classes : optional sequence of conditions; when given, samples are
        generated per class and concatenated.
    **kwargs : accepted and ignored.

    Returns
    -------
    The ``'NDB'`` entry of the NDB evaluation when ``metric == 'ndb'``,
    otherwise the estimated divergence from ``gt_db`` to the generated
    samples.
    """
    # Always generate exactly as many samples as there are ground-truth ones.
    batch = gt_db.shape[0]
    if classes is None:
        gen_samples = gen_images(gen, n=batch, batchsize=batch)
    else:
        # Conditional case: over-generate a little per class, then truncate.
        n_ms = int(batch // len(classes) + 10)
        gen_csamples = [
            gen_images_with_condition(gen, n=n_ms, c=cl, batchsize=n_ms)
            for cl in classes
        ]
        gen_samples = np.concatenate(gen_csamples, 0)[:gt_db.shape[0]]
    if len(gt_db.shape) != 2:
        gt_db = gt_db.reshape((batch, -1))
    if len(gen_samples.shape) != 2:
        gen_samples = gen_samples.reshape((batch, -1))
    if gen_samples.dtype == np.uint8:
        gen_samples = gen_samples.astype(np.float32)

    if metric == 'ndb':
        # The NDB object is built once from the first call's ground truth and
        # then reused through the module-level ``ndb`` variable.
        global ndb
        if ndb is None:
            ndb = NDB(training_data=gt_db, number_of_bins=n_bins,
                      whitening=whitening)
        metric_val = ndb.evaluate(gen_samples)
        chainer.reporter.report({'ndb': metric_val['NDB']})
        chainer.reporter.report({'JS': metric_val['JS']})
        diver = metric_val['NDB']
    else:
        if normalize:
            # gen_images output is assumed to be in [0, 255]; bring it to
            # [-1, 1] for the KNN. (Previously this value was computed and
            # then discarded, so ``normalize`` had no effect.)
            knn_samples = gen_samples / 127.5 - 1
        else:
            knn_samples = gen_samples
        # Define an estimator (e.g. KL divergence), fit and read the result.
        est = KNNDivergenceEstimator(div_funcs=[metric], Ks=[3], clamp=False)
        res_diver = est.fit_transform([gt_db, knn_samples])
        try:
            diver = res_diver[0, 1]
        except (IndexError, TypeError):
            # Result carries leading (div_func, K) axes: take the first of
            # each, then the gt -> generated entry.
            diver = res_diver[0][0][0, 1]
        # NOTE: reported under the key 'kl' whatever ``metric`` is.
        chainer.reporter.report({'kl': diver})
    return diver
示例#5
0
def kNNdiv_Kernel(X_white,
                  kernel,
                  Knn=3,
                  div_func='renyi:.5',
                  Nref=None,
                  compwise=True,
                  njobs=1,
                  W_ica_inv=None):
    """k-NN ``div_func`` divergence between ``X_white`` and kernel samples.

    A reference "bag" of ``Nref`` points is sampled from ``kernel`` and the
    divergence from ``X_white`` to that bag is estimated.

    Parameters
    ----------
    X_white : 2-D array of data points (rows) by components (columns).
    kernel : object with a ``sample(n)`` method, or, when ``compwise`` is
        true, a sequence of such objects with one per column of ``X_white``.
        ``sample`` may return the samples directly or a tuple whose first
        item is the samples.
    Knn : int, or sequence of ints, number(s) of nearest neighbours.
    div_func : divergence name understood by ``KNNDivergenceEstimator``.
    Nref : number of reference points to sample (required if ``compwise``).
    compwise : sample every component independently from its own kernel.
    njobs : forwarded as ``n_jobs``.
    W_ica_inv : optional matrix; when given the reference bag is replaced by
        ``ref_dist.dot(W_ica_inv.T)``.

    Returns
    -------
    A scalar when a single ``Knn`` is used, else a float array with one
    estimate per entry of ``Knn``.

    Raises
    ------
    ValueError
        If ``compwise`` is true and the number of kernels differs from the
        number of columns, or ``Nref`` is missing.
    """
    # Accept plain/numpy ints as well as any sequence (list, tuple, array).
    if isinstance(Knn, (int, np.integer)):
        Knns = [Knn]
    else:
        Knns = list(Knn)

    # Construct the reference "bag".
    if compwise:
        n_comp = X_white.shape[1]
        # Component-wise sampling needs exactly one kernel per component.
        if n_comp != len(kernel):
            raise ValueError(
                'expected one kernel per component (%d), got %d' %
                (n_comp, len(kernel)))
        if Nref is None:
            raise ValueError('Nref must be given when compwise is True')
        ref_dist = np.zeros((Nref, n_comp))
        for icomp in range(n_comp):
            samp = kernel[icomp].sample(Nref)
            if isinstance(samp, tuple):
                samp = samp[0]
            ref_dist[:, icomp] = samp.flatten()
    else:
        samp = kernel.sample(Nref)
        ref_dist = samp[0] if isinstance(samp, tuple) else samp
    if W_ica_inv is not None:
        ref_dist = np.dot(ref_dist, W_ica_inv.T)

    # Estimate the divergence.
    kNN = KNNDivergenceEstimator(div_funcs=[div_func],
                                 Ks=Knns,
                                 version='slow',
                                 clamp=False,
                                 n_jobs=njobs)
    feat = Features([X_white, ref_dist])
    div_knn = kNN.fit_transform(feat)

    # Indexing is [div_func][K][from bag][to bag]; we want X_white -> ref.
    if len(Knns) == 1:
        return div_knn[0][0][0][1]
    return np.array([div_knn[0][i][0][1] for i in range(len(Knns))],
                    dtype=float)
示例#6
0
def test_knn_js():
    """Compare the K=2 Jensen-Shannon estimate with a hand-computed value."""
    if not have_flann:
        raise SkipTest("No flann, so skipping knn tests.")

    # verified by hand
    x = np.reshape([0., 1, 3, 6], (4, 1))
    y = np.reshape([.2, 1.2, 3.2, 6.2, 10.2], (5, 1))
    n, m = 4, 5

    M = 2

    # Terms summed over the points of x.
    # x weight is 1/7, y weight is 4/35, quantile 1/4
    x_neighbor_terms = (
        np.log(.2) - psi(1)     # 0 => .2(y), 1(x)
        + np.log(.8) - psi(2)   # 1 => 1.2(y), .2(y)
        + np.log(1.8) - psi(2)  # 3 => 3.2(y), 1.2(y)
        + np.log(2.8) - psi(2)  # 6 => 6.2(y), 3.2(y)
    )
    # Terms summed over the points of y.
    # x weight is 5/36, y weight is 1/9, quantile 1/4
    y_neighbor_terms = (
        np.log(.2) - psi(1)     # .2 => 0(x)
        + np.log(1) - psi(2)    # 1.2 => 1(x), .2(y)
        + np.log(2) - psi(2)    # 3.2 => 3(x), 1.2(y)
        + np.log(3) - psi(2)    # 6.2 => 6(x), 3.2(y)
        + np.log(4.2) - psi(2)  # 10.2 => 6.2(y), 6(x)
    )
    x_log_sum = np.log(3) + np.log(2) + np.log(3) + np.log(5)
    y_log_sum = np.log(3) + np.log(2) + np.log(3) + np.log(4) + np.log(7)

    right_js = (
        np.log(n + m - 1) + psi(M)
        + 1/(2*n) * x_neighbor_terms
        + 1/(2*m) * y_neighbor_terms
        - 1/2 * np.log(n-1) - 1/(2*n) * x_log_sum
        - 1/2 * np.log(m-1) - 1/(2*m) * y_log_sum
    )

    msg = "got {}, expected {}"
    est = KNNDivergenceEstimator(div_funcs=['js'], Ks=[2], clamp=False)
    divs = est.fit([x]).transform([y])
    assert divs.shape == (1, 1, 1, 1)
    js_hat = divs[0, 0, 0, 0]
    assert np.allclose(js_hat, right_js, atol=1e-6), \
        msg.format(js_hat, right_js)
示例#7
0
def distribution_divergence(X_s, X_l, k=10, max_dim=50, n_jobs=4):
    """
    Compute l2 and js divergences from samples of two distributions.

    The implementation uses `skl-groups`, which implements non-parametric
    estimation of divergences. The estimator is fitted on ``X_s`` and then
    applied to ``X_l``.

    Args:
        + X_s: a numpy array (points x dimensions) containing the point cloud
          in state space
        + X_l: a numpy array containing the point cloud in latent space
        + k: number of nearest neighbours used by the estimator
        + max_dim: inputs whose ``X_s.shape[1]`` exceeds this are not
          evaluated (default 50)
        + n_jobs: forwarded to ``KNNDivergenceEstimator`` (default 4)

    Returns:
        A dict with keys 'l2_divergence' and 'js_divergence'. Both values are
        -1. when the dimensionality check rejects the input.
    """

    # We discard cases with too large dimensions
    if X_s.shape[1] > max_dim:
        return {'l2_divergence': -1., 'js_divergence': -1.}

    # We instantiate the divergence object
    div = KNNDivergenceEstimator(div_funcs=['l2', 'js'],
                                 Ks=[k],
                                 n_jobs=n_jobs,
                                 clamp=True)

    # We turn both data to float32
    X_s = X_s.astype(np.float32)
    X_l = X_l.astype(np.float32)

    # We generate Features: each point cloud is a single bag
    f_s = Features(X_s, n_pts=[X_s.shape[0]])
    f_l = Features(X_l, n_pts=[X_l.shape[0]])

    # We create the knn graph
    div.fit(X=f_s)

    # We compute the divergences; squeezing leaves one value per div func
    l2, js = div.transform(X=f_l).squeeze()

    # We construct the returned dictionary
    return {'l2_divergence': l2, 'js_divergence': js}
示例#8
0
def kNNdiv_gauss(X_white,
                 cov_X,
                 Knn=3,
                 div_func='renyi:.5',
                 gauss=None,
                 Nref=None,
                 njobs=1):
    """k-NN ``div_func`` divergence between ``X_white`` and a Gaussian.

    The reference is a zero-mean Gaussian with covariance ``cov_X``. Either a
    ready-made sample ``gauss`` is supplied, or ``Nref`` points are drawn
    with ``np.random.multivariate_normal``.

    Parameters
    ----------
    X_white : 2-D array of data points (rows) by dimensions (columns).
    cov_X : covariance matrix of the reference Gaussian (only used when
        ``gauss`` is None).
    Knn : int, or sequence of ints, number(s) of nearest neighbours.
    div_func : divergence name understood by ``KNNDivergenceEstimator``.
    gauss : optional pre-drawn reference sample with as many columns as
        ``X_white``.
    Nref : number of reference points to draw when ``gauss`` is None.
    njobs : forwarded as ``n_jobs``.

    Returns
    -------
    A scalar when a single ``Knn`` is used, else a float array with one
    estimate per entry of ``Knn``.

    Raises
    ------
    ValueError
        If neither ``gauss`` nor ``Nref`` is given, or the reference sample
        and ``X_white`` have a different number of columns.
    """
    if gauss is None:
        if Nref is None:
            raise ValueError(
                'either gauss or Nref must be provided to build the '
                'Gaussian reference distribution')
        # Gaussian reference distribution
        gauss = np.random.multivariate_normal(
            np.zeros(X_white.shape[1]), cov_X, size=Nref)
    if gauss.shape[1] != X_white.shape[1]:
        raise ValueError(
            'dimension between X_white and Gaussian reference distribution do not match'
        )

    # Accept plain/numpy ints as well as any sequence (list, tuple, array).
    if isinstance(Knn, (int, np.integer)):
        Knns = [Knn]
    else:
        Knns = list(Knn)

    kNN = KNNDivergenceEstimator(div_funcs=[div_func],
                                 Ks=Knns,
                                 version='slow',
                                 clamp=False,
                                 n_jobs=njobs)
    feat = Features([X_white, gauss])
    div_knn = kNN.fit_transform(feat)

    # Indexing is [div_func][K][from bag][to bag]; we want X_white -> gauss.
    if len(Knns) == 1:
        return div_knn[0][0][0][1]
    return np.array([div_knn[0][i][0][1] for i in range(len(Knns))],
                    dtype=float)
示例#9
0
def test_knn_js():
    """Check the K=2 Jensen-Shannon estimate against a value worked out by hand."""
    if not have_flann:
        raise SkipTest("No flann, so skipping knn tests.")

    # verified by hand
    n = 4
    x = np.reshape([0., 1, 3, 6], (n, 1))

    m = 5
    y = np.reshape([.2, 1.2, 3.2, 6.2, 10.2], (m, 1))

    M = 2

    # Contribution of the x points.
    # x weight is 1/7, y weight is 4/35, quantile 1/4
    from_x = (
        np.log(.2) - psi(1)  # 0 => .2(y), 1(x)
        + np.log(.8) - psi(2)  # 1 => 1.2(y), .2(y)
        + np.log(1.8) - psi(2)  # 3 => 3.2(y), 1.2(y)
        + np.log(2.8) - psi(2)  # 6 => 6.2(y), 3.2(y)
    )
    # Contribution of the y points.
    # x weight is 5/36, y weight is 1/9, quantile 1/4
    from_y = (
        np.log(.2) - psi(1)  # .2 => 0(x)
        + np.log(1) - psi(2)  # 1.2 => 1(x), .2(y)
        + np.log(2) - psi(2)  # 3.2 => 3(x), 1.2(y)
        + np.log(3) - psi(2)  # 6.2 => 6(x), 3.2(y)
        + np.log(4.2) - psi(2)  # 10.2 => 6.2(y), 6(x)
    )
    within_x = np.log(3) + np.log(2) + np.log(3) + np.log(5)
    within_y = np.log(3) + np.log(2) + np.log(3) + np.log(4) + np.log(7)

    right_js = (
        np.log(n + m - 1) + psi(M) + 1 / (2 * n) * from_x +
        1 / (2 * m) * from_y - 1 / 2 * np.log(n - 1) -
        1 / (2 * n) * within_x - 1 / 2 * np.log(m - 1) -
        1 / (2 * m) * within_y)

    msg = "got {}, expected {}"
    est = KNNDivergenceEstimator(div_funcs=['js'], Ks=[2], clamp=False)
    est.fit([x])
    out = est.transform([y])
    assert out.shape == (1, 1, 1, 1)
    estimate = out[0, 0, 0, 0]
    assert np.allclose(estimate, right_js,
                       atol=1e-6), msg.format(estimate, right_js)
示例#10
0
def test_knn_kl():
    """Check the K=2 KL estimates between two small 1-D bags against hand values."""
    if not have_flann:
        raise SkipTest("No flann, so skipping knn tests.")

    # verified by hand
    # Dhat(P||Q) = \log m/(n-1) + d / n  \sum_{i=1}^n \log \nu_k(i)/rho_k(i)
    x = np.reshape([0., 1, 3], (3, 1))
    y = np.reshape([.2, 1.2, 3.2, 7.2], (4, 1))
    n, m = x.shape[0], y.shape[0]

    sum_x = np.log(1.2 / 3) + np.log(.8 / 2) + np.log(1.8 / 3)
    sum_y = (
        np.log(.8 / 3) + np.log(1.2 / 2) + np.log(2.2 / 3) + np.log(6.2 / 6))
    x_to_y = np.log(m / (n-1)) + 1/n * sum_x
    y_to_x = np.log(n / (m-1)) + 1/m * sum_y

    msg = "got {}, expected {}"
    est = KNNDivergenceEstimator(div_funcs=['kl'], Ks=[2], clamp=False)
    divs = est.fit_transform([x, y])
    divs = divs.squeeze()

    # Diagonal entries compare a bag with itself.
    assert divs[0, 0] == 0
    assert divs[1, 1] == 0

    got_xy = divs[0, 1]
    assert np.allclose(got_xy, x_to_y), msg.format(got_xy, x_to_y)
    got_yx = divs[1, 0]
    assert np.allclose(got_yx, y_to_x), msg.format(got_yx, y_to_x)
示例#11
0
def test_knn_sanity_slow():
    """Smoke-test KNNDivergenceEstimator on random bags and its error paths."""
    if not have_flann:
        raise SkipTest("No flann, so skipping knn tests.")

    dim, n = 3, 20
    np.random.seed(47)
    arrays = []
    for _ in xrange(n):
        # Draw the bag size first, then the points (same RNG order as a
        # single nested call).
        n_pts = np.random.randint(30, 100)
        arrays.append(np.random.randn(n_pts, dim))
    bags = Features(arrays)

    # just make sure it runs
    div_funcs = ('kl', 'js', 'renyi:.9', 'l2', 'tsallis:.8')
    Ks = (3, 4)
    est = KNNDivergenceEstimator(div_funcs=div_funcs, Ks=Ks)
    res = est.fit_transform(bags)
    expected_shape = (len(div_funcs), len(Ks), n, n)
    assert res.shape == expected_shape
    assert np.all(np.isfinite(res))

    # test that JS blows up when there's a huge difference in bag sizes
    # (so that K is too low)
    lopsided = bags + [np.random.randn(1000, dim)]
    assert_raises(ValueError, partial(est.fit_transform, lopsided))

    # test fit() and then transform() with JS, with different-sized test bags
    est = KNNDivergenceEstimator(div_funcs=('js', ), Ks=(5, ))
    est.fit(bags, get_rhos=True)
    with LogCapture('skl_groups.divergences.knn',
                    level=logging.WARNING) as captured:
        res = est.transform([np.random.randn(300, dim)])
        assert res.shape == (1, 1, 1, len(bags))
        assert len(captured.records) == 1
        first_message = captured.records[0].message
        assert first_message.startswith('Y_rhos had a lower max_K')

    # test that passing div func more than once raises
    def fit_with_duplicate(df):
        return KNNDivergenceEstimator(div_funcs=[df, df]).fit(bags)

    for df in ('kl', 'renyi:.8', 'l2'):
        assert_raises(ValueError, partial(fit_with_duplicate, df))
示例#12
0
def kNNdiv_general(
    X,
    Y,
    Knn=3,
    div_func='kl',
    alpha=None,
    njobs=1,
):  #renyi:.5
    """
    kNN divergence estimate for samples drawn from any two arbitrary distributions.

    Parameters
    ----------
    X, Y : 2-D arrays of samples (rows) with the same number of columns.
    Knn : int, or sequence of ints, number(s) of nearest neighbours.
    div_func : divergence name understood by ``KNNDivergenceEstimator``.
    alpha : optional parameter appended to ``div_func`` as ``':<alpha>'``
        (e.g. ``div_func='renyi', alpha=.5`` gives ``'renyi:0.5'``).
    njobs : forwarded as ``n_jobs``.

    Returns
    -------
    The estimate from ``X`` to ``Y``: a scalar when a single ``Knn`` is used,
    else a float array with one estimate per entry of ``Knn``.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` have a different number of columns.
    """
    if Y.shape[1] != X.shape[1]:
        raise ValueError(
            'dimensions of X (%d) and Y (%d) do not match' %
            (X.shape[1], Y.shape[1]))

    # Accept plain/numpy ints as well as any sequence (list, tuple, array).
    if isinstance(Knn, (int, np.integer)):
        Knns = [Knn]
    else:
        Knns = list(Knn)

    if alpha is not None:
        div_func = div_func + ':%s' % alpha

    kNN = KNNDivergenceEstimator(div_funcs=[div_func],
                                 Ks=Knns,
                                 version='slow',
                                 clamp=False,
                                 n_jobs=njobs)
    feat = Features([X, Y])
    div_knn = kNN.fit_transform(feat)

    # Indexing is [div_func][K][from bag][to bag]; we want X -> Y.
    if len(Knns) == 1:
        return div_knn[0][0][0][1]
    return np.array([div_knn[0][i][0][1] for i in range(len(Knns))],
                    dtype=float)
def computePairwiseSimilarities2(patients, y):
    """
    Compute the pairwise similarity between bags using Dougal's code.

    Inputs:
    - patients: the collection of patient features (one bag per patient)
    - y: labels (number of abnormal nodes) for each patient; attached to the
         Features object for convenience

    Returns:
    - sims: the output of the divergence -> pick -> symmetrize -> rbf ->
            project pipeline applied to the bags
    * Note: sims is a NxN symmetric matrix, where N is the number of patients
    """
    # Wrap the bags; the labels ride along but the steps below do not need them.
    bags = Features(patients, labels=y)

    # Step 1: KL divergence estimated from the 3 nearest neighbours.
    divergence_step = KNNDivergenceEstimator(div_funcs=['kl'],
                                             Ks=[3],
                                             n_jobs=-1,
                                             version='fast')

    # Remaining steps: pick index (0, 0) from the estimator output
    # (presumably the first div func and first K -- TODO confirm), make the
    # matrix symmetric, apply a median-scaled RBF, project onto PSD matrices.
    steps = [
        ('divs', divergence_step),
        ('pick', PairwisePicker((0, 0))),
        ('symmetrize', Symmetrize()),
        ('rbf', RBFize(gamma=1, scale_by_median=True)),
        ('project', ProjectPSD()),
    ]

    return Pipeline(steps).fit_transform(bags)
示例#14
0
def test_knn_sanity_slow():
    """Run the estimator end to end on random bags and check the failure modes."""
    if not have_flann:
        raise SkipTest("No flann, so skipping knn tests.")

    dim = 3
    n = 20
    np.random.seed(47)

    def random_bag():
        # Size is drawn before the points, matching the nested-call order.
        size = np.random.randint(30, 100)
        return np.random.randn(size, dim)

    bags = Features([random_bag() for _ in xrange(n)])

    # just make sure it runs
    div_funcs = ('kl', 'js', 'renyi:.9', 'l2', 'tsallis:.8')
    Ks = (3, 4)
    est = KNNDivergenceEstimator(div_funcs=div_funcs, Ks=Ks)
    res = est.fit_transform(bags)
    assert res.shape == (len(div_funcs), len(Ks), n, n)
    assert np.all(np.isfinite(res))

    # test that JS blows up when there's a huge difference in bag sizes
    # (so that K is too low)
    with_huge_bag = bags + [np.random.randn(1000, dim)]
    fit_huge = partial(est.fit_transform, with_huge_bag)
    assert_raises(ValueError, fit_huge)

    # test fit() and then transform() with JS, with different-sized test bags
    est = KNNDivergenceEstimator(div_funcs=('js',), Ks=(5,))
    est.fit(bags, get_rhos=True)
    with LogCapture('skl_groups.divergences.knn',
                    level=logging.WARNING) as log:
        test_bag = np.random.randn(300, dim)
        res = est.transform([test_bag])
        assert res.shape == (1, 1, 1, len(bags))
        assert len(log.records) == 1
        assert log.records[0].message.startswith('Y_rhos had a lower max_K')

    # test that passing div func more than once raises
    def blah(df):
        duplicated = [df, df]
        return KNNDivergenceEstimator(div_funcs=duplicated).fit(bags)

    for name in ('kl', 'renyi:.8', 'l2'):
        assert_raises(ValueError, partial(blah, name))
示例#15
0
 def blah(df):
     # Fit an estimator that lists the same divergence function twice.
     # ``bags`` is a free variable defined outside this snippet.
     duplicated = [df, df]
     return KNNDivergenceEstimator(div_funcs=duplicated).fit(bags)
示例#16
0
def _knnDivergenceSims(feats, divFunc, numNeighbors):
    """Return the symmetrized pairwise k-NN ``divFunc`` divergences of ``feats``."""
    distEstModel = Pipeline([
        ('divs',
         KNNDivergenceEstimator(div_funcs=[divFunc],
                                Ks=[numNeighbors],
                                n_jobs=-1,
                                version='fast')),
        # Picks index (0, 0) of the estimator output -- presumably the first
        # div func and first K; TODO confirm against PairwisePicker docs.
        ('pick', PairwisePicker((0, 0))),
        ('symmetrize', Symmetrize()),
    ])
    return distEstModel.fit_transform(feats)


def computeSubjSubjKernel(subjects, div='KL', numNeighbors=3):
    """
    Build a subject-by-subject kernel from pairwise divergences.

    For 'KL' and 'HE' the pairwise k-NN divergences are symmetrized, passed
    through an RBF and projected onto a positive semi-definite space. Only the
    'KL' similarities are divided by their median before the RBF. For 'MMD'
    the squared RBF-MMD distances are symmetrized and exponentiated.

    Inputs:
    - subjects: the collection of patient features
    - div: which divergence to use. Options are
            - 'KL': Kullback-Leibler divergence, 'kl' in the estimator (default)
            - 'HE': Hellinger divergence, 'hellinger' in the estimator
            - 'MMD': Maximum Mean Discrepancy, computed with mmd.rbf_mmd
    - numNeighbors: how many neighbors to look at. Default is 3

    Returns:
    - kernel: the kernel calculated using the pairwise similarities between
              each subject, or -1 (after printing an error) when `div` is not
              one of the options above
    * Note: kernel is a NxN symmetric matrix, where N is the number of subjects
    """
    # Reject unknown divergences before doing any work.
    if div not in ('KL', 'HE', 'MMD'):
        print("Error: divergence entered is not valid.")
        return -1

    # pass the features to scikit-learn Features
    feats = Features(subjects)  # directly from Dougal

    if div in ('KL', 'HE'):
        divFunc = 'kl' if div == 'KL' else 'hellinger'
        sims = _knnDivergenceSims(feats, divFunc, numNeighbors)

        if div == 'KL':
            # Scale by the median by hand (upper triangle incl. diagonal)
            # instead of RBFize's scale_by_median. HE is left unscaled.
            simsMedian = np.median(sims[np.triu_indices_from(sims)])
            sims = sims / simsMedian

        rbf = RBFize(gamma=1, scale_by_median=False)
        rbfedSims = rbf.fit_transform(sims)

        # Final step in building the kernel: project the rbf'ed similarities
        #   onto a positive semi-definite space
        psd = ProjectPSD()
        kernel = psd.fit_transform(rbfedSims)

    else:  # div == 'MMD'
        # start by getting the median pairwise squared distance between
        #   points (at most 2000, sampled without replacement), used as a
        #   heuristic for choosing the bandwidth of the inner RBF kernel
        subset = np.vstack(feats)
        subset = subset[np.random.choice(subset.shape[0],
                                         min(2000, subset.shape[0]),
                                         replace=False)]
        subsetSquaredDists = euclidean_distances(subset, squared=True)
        # NOTE(review): numNeighbors is used as the diagonal offset here and
        #   below -- confirm that this is intended.
        featsMedianSquaredDist = np.median(
            subsetSquaredDists[np.triu_indices_from(subsetSquaredDists,
                                                    k=numNeighbors)],
            overwrite_input=True)

        # now we need to determine gamma (scaling factor, inverse of sigma)
        #   This was initially done in the library, but Kayhan believes there's
        #   a multiplication instead of a division, so it's being done by hand
        firstGamma = 1 / featsMedianSquaredDist

        # calculate the mmds (the returned diagonals are not used)
        mmds, _mmkDiagonals = mmd.rbf_mmd(feats,
                                          gammas=firstGamma,
                                          squared=True,
                                          ret_X_diag=True)

        # now let's turn the squared MMD distances into a kernel
        # symmetrize it
        sym = Symmetrize()
        mmds = sym.fit_transform(mmds)
        # get the median squared MMD distance
        mmdMedianSquaredDist = np.median(mmds[np.triu_indices_from(
            mmds, k=numNeighbors)])
        kernel = np.exp(np.multiply(mmds, -1 / mmdMedianSquaredDist))

    return kernel
示例#17
0
def train_KNNDivergence(divergence, X_tr, y_tr, X_ts, y_ts, k=5, C=1, name=''):
    """Fit a divergence-kernel SVM on bags and score it on train and test.

    The pipeline estimates pairwise k-NN divergences between bags, turns
    them into a PSD kernel (pick, symmetrize, median-scaled RBF, PSD
    projection) and feeds that to an SVM with a precomputed kernel. An SVC is
    used when the first label is a ``str`` or boolean, otherwise an SVR.

    Side effects: warnings are silenced process-wide, and the test
    predictions and labels are written to ``name + '.csv'``.

    Parameters
    ----------
    divergence: string,
        Divergence used to estimate the distance among distributions, as
        understood by ``KNNDivergenceEstimator`` (e.g. 'kl', 'renyi:.8',
        'tsallis:.8', 'hellinger', 'l2', 'js').
    X_tr: array-like
        Training data
    y_tr: array-like
        Training output; ``y_tr[0][0]`` decides classification vs regression.
    X_ts: array-like
        Test data
    y_ts: array-like
        Test output
    k: int, optional default=5
        Number of k-nearest neighbours to use for the estimation of the
        distances.
    C: float, optional default=1
        Regularization parameter for SVM.
    name: string, optional default=''
        Prefix of the CSV file the test predictions are written to.

    Returns
    -------
    (train_score, test_score): accuracy for classification, mean squared
        error for regression.
    """
    from skl_groups.divergences import KNNDivergenceEstimator
    from skl_groups.kernels import PairwisePicker, Symmetrize, RBFize, ProjectPSD

    # NOTE: this changes the global warning filter and is never restored.
    warnings.simplefilter('ignore')

    pipeline = [
        ('divs', KNNDivergenceEstimator(div_funcs=[divergence], Ks=[k])),
        ('pick', PairwisePicker((0, 0))),
        ('symmetrize', Symmetrize()),
        ('rbf', RBFize(gamma=1, scale_by_median=True)),
        ('project', ProjectPSD()),
    ]
    classification = isinstance(y_tr[0][0], (str, bool, np.bool_))
    if classification:
        svm = SVC(C=C, kernel='precomputed')
    else:
        svm = SVR(C=C, kernel='precomputed')
    pipeline.append(('svm', svm))
    model = Pipeline(pipeline)

    # Materialize the inputs as plain lists of bags / labels.
    X_tr = list(X_tr)
    y_tr = list(y_tr)
    X_ts = list(X_ts)
    y_ts = list(y_ts)

    model.fit(X_tr, y_tr)
    preds = model.predict(X_ts)
    pd.DataFrame.from_dict({
        'preds': preds,
        'labels': np.array(y_ts).flatten()
    }).to_csv(name + '.csv')

    train_preds = model.predict(X_tr)
    if classification:
        train_score = accuracy_score(np.array(y_tr).flatten(), train_preds)
        test_score = accuracy_score(np.array(y_ts).flatten(), preds)
    else:
        train_score = mean_squared_error(y_tr, train_preds)
        # Reuse the test predictions instead of predicting a second time.
        test_score = mean_squared_error(y_ts, preds)
    return train_score, test_score
示例#18
0
 def blah(df):
     # Build an estimator whose div_funcs repeats ``df`` and fit it on the
     # free variable ``bags`` (defined outside this snippet).
     estimator = KNNDivergenceEstimator(div_funcs=[df] * 2)
     fitted = estimator.fit(bags)
     return fitted