Exemplo n.º 1
0
Arquivo: pca.py Projeto: irakov/kaggle
def main():
    """Project a train/test dataset pair with ProbabilisticPCA and save both.

    Positional command-line arguments, read from ``argv``:

    1. number of components to keep (parsed with ``int``)
    2. path of the training data, read with ``load``
    3. path of the test data, read with ``load``
    4. destination of the transformed training data, written with ``save``
    5. destination of the transformed test data, written with ``save``

    With any other number of arguments the usage text is printed and the
    process exits with status 1.  The model is fitted on the training data
    only; the test data is merely transformed with the fitted model.
    """
    if len(argv) != 6:
        print_usage(basename(argv[0]))
        exit(1)

    # Progress/timing messages are always on; flip to silence them.
    print_log = True

    _, components_arg, train_in, test_in, train_out, test_out = argv
    n_components = int(components_arg)
    input_train_data = load(train_in)
    input_test_data = load(test_in)

    # Fit on the training set and project it in one step, timing the call.
    started = time()
    pca = ProbabilisticPCA(n_components=n_components)
    output_train_data = pca.fit_transform(input_train_data)
    elapsed = time() - started
    if print_log:
        # Whole minutes and leftover whole seconds for the log line.
        minutes, seconds = (int(part) for part in divmod(elapsed, 60))
        print('PCA fitting to {0} with {1} components to save (with transformation): {2[0]:d}m {2[1]:02d}s'.format(
            input_train_data.shape, n_components, (minutes, seconds)))
        print('New shape: {}'.format(output_train_data.shape))

    # Project the test set with the already-fitted model, timing the call.
    started = time()
    output_test_data = pca.transform(input_test_data)
    elapsed = time() - started
    if print_log:
        minutes, seconds = (int(part) for part in divmod(elapsed, 60))
        print('PCA transformation of {0} with {1} components to save: {2[0]:d}m {2[1]:02d}s'.format(
            input_test_data.shape, n_components, (minutes, seconds)))
        print('New shape: {}'.format(output_test_data.shape))

    save(train_out, output_train_data)
    save(test_out, output_test_data)
Exemplo n.º 2
0
def test_probabilistic_pca_vs_pca():
    """Check that PCA and ProbabilisticPCA give matching per-sample scores.

    Both estimators are fitted with two components on the same data.
    ProbabilisticPCA is fitted with its default arguments, which this test
    treats as the ``homoscedastic=True`` case.
    """
    n_samples, n_features = 100, 3
    random_state = np.random.RandomState(0)
    # Gaussian noise of scale 0.1 around the point (3, 4, 5).
    center = np.array([3, 4, 5])
    data = random_state.randn(n_samples, n_features) * .1 + center

    plain_model = PCA(n_components=2).fit(data)
    probabilistic_model = ProbabilisticPCA(n_components=2).fit(data)

    plain_scores = plain_model.score_samples(data)
    probabilistic_scores = probabilistic_model.score(data)
    assert_array_almost_equal(plain_scores, probabilistic_scores)
Exemplo n.º 3
0
def test_probabilistic_pca_vs_pca():
    """Check that PCA and ProbabilisticPCA give matching per-sample scores.

    Both estimators are fitted with two components on the same data.
    ProbabilisticPCA is fitted with its default arguments, which this test
    treats as the ``homoscedastic=True`` case.
    """
    n_samples, n_features = 100, 3
    random_state = np.random.RandomState(0)
    # Gaussian noise of scale 0.1 around the point (3, 4, 5).
    center = np.array([3, 4, 5])
    data = random_state.randn(n_samples, n_features) * .1 + center

    plain_model = PCA(n_components=2).fit(data)
    probabilistic_model = ProbabilisticPCA(n_components=2).fit(data)

    plain_scores = plain_model.score_samples(data)
    probabilistic_scores = probabilistic_model.score(data)
    assert_array_almost_equal(plain_scores, probabilistic_scores)
Exemplo n.º 4
0
def test_probabilistic_pca_2():
    """Check that probabilistic PCA tells apart two different datasets.

    The model is fitted on noise of scale 0.1 around (3, 4, 5); the data it
    was fitted on must get a higher mean score than a fresh sample drawn
    with twice the noise scale around the same point.
    """
    n_samples, n_features = 100, 3
    random_state = np.random.RandomState(0)
    center = np.array([3, 4, 5])
    training = random_state.randn(n_samples, n_features) * .1 + center

    model = ProbabilisticPCA(n_components=2)
    model.fit(training)
    score_on_training = model.score(training)

    # Drawn only after the first scoring, so the random stream is consumed
    # in the same order as before.
    wider = random_state.randn(n_samples, n_features) * .2 + center
    score_on_wider = model.score(wider)

    assert_greater(score_on_training.mean(), score_on_wider.mean())
Exemplo n.º 5
0
def test_probabilistic_pca_1():
    """Check that probabilistic PCA yields a reasonable score.

    The mean score on the training data is compared with
    ``-0.5 * log(2 * pi * e * 0.1 ** 2) * p``, i.e. the negative differential
    entropy of a ``p``-dimensional isotropic Gaussian with standard
    deviation 0.1, which is how the data is generated.  The ratio only has
    to equal 1 to zero decimal places.
    """
    n_samples, n_features = 1000, 3
    random_state = np.random.RandomState(0)
    center = np.array([3, 4, 5])
    data = random_state.randn(n_samples, n_features) * .1 + center

    model = ProbabilisticPCA(n_components=2)
    model.fit(data)
    scores = model.score(data)

    reference = (
        -0.5 * np.log(2 * np.pi * np.exp(1) * 0.1 ** 2) * n_features
    )
    ratio = scores.mean() / reference
    np.testing.assert_almost_equal(ratio, 1, decimal=0)
Exemplo n.º 6
0
def test_probabilistic_pca_1():
    """Check that probabilistic PCA yields a reasonable score.

    The mean score on the training data is compared with
    ``-0.5 * log(2 * pi * e * 0.1 ** 2) * p``, i.e. the negative differential
    entropy of a ``p``-dimensional isotropic Gaussian with standard
    deviation 0.1, which is how the data is generated.  The ratio only has
    to equal 1 to zero decimal places.
    """
    n_samples, n_features = 1000, 3
    random_state = np.random.RandomState(0)
    center = np.array([3, 4, 5])
    data = random_state.randn(n_samples, n_features) * .1 + center

    model = ProbabilisticPCA(n_components=2)
    model.fit(data)
    scores = model.score(data)

    reference = (
        -0.5 * np.log(2 * np.pi * np.exp(1) * 0.1 ** 2) * n_features
    )
    ratio = scores.mean() / reference
    np.testing.assert_almost_equal(ratio, 1, decimal=0)
Exemplo n.º 7
0
def test_probabilistic_pca_4():
    """Check that ppca selects the right model.

    Learning and held-out sets are both generated as unit Gaussian noise
    plus a single random factor along (3, 4, 5), shifted by (1, 0, 7).
    Models with 0, 1 and 2 components are fitted on the learning set, and
    the one-component model must get the best mean held-out score.
    """
    n_samples, n_features = 200, 3
    random_state = np.random.RandomState(0)

    def draw_sample():
        # Noise is drawn before the latent factor; keep this order so the
        # random stream is consumed exactly as in the inline expression.
        noise = random_state.randn(n_samples, n_features)
        factor = random_state.randn(n_samples, 1)
        return noise + factor * np.array([3, 4, 5]) + np.array([1, 0, 7])

    learning_set = draw_sample()
    held_out_set = draw_sample()

    mean_scores = np.zeros(n_features)
    for n_components in range(n_features):
        model = ProbabilisticPCA(n_components=n_components)
        model.fit(learning_set)
        mean_scores[n_components] = model.score(held_out_set).mean()

    assert_true(mean_scores.argmax() == 1)
Exemplo n.º 8
0
def test_probabilistic_pca_4():
    """Check that ppca selects the right model.

    Learning and held-out sets are both generated as unit Gaussian noise
    plus a single random factor along (3, 4, 5), shifted by (1, 0, 7).
    Models with 0, 1 and 2 components are fitted on the learning set, and
    the one-component model must get the best mean held-out score.
    """
    n_samples, n_features = 200, 3
    random_state = np.random.RandomState(0)

    def draw_sample():
        # Noise is drawn before the latent factor; keep this order so the
        # random stream is consumed exactly as in the inline expression.
        noise = random_state.randn(n_samples, n_features)
        factor = random_state.randn(n_samples, 1)
        return noise + factor * np.array([3, 4, 5]) + np.array([1, 0, 7])

    learning_set = draw_sample()
    held_out_set = draw_sample()

    mean_scores = np.zeros(n_features)
    for n_components in range(n_features):
        model = ProbabilisticPCA(n_components=n_components)
        model.fit(learning_set)
        mean_scores[n_components] = model.score(held_out_set).mean()

    assert_true(mean_scores.argmax() == 1)
def pcaImgWrdMat(highDim, lowDim):
    """Reduce every category's image-word matrix with five PCA variants.

    For each category of the dataset named on the command line
    (``options.dataset``), the positive and negative image-word matrices
    of dimension ``highDim`` are loaded, stacked (positives first), and
    reduced to ``lowDim`` components with PCA, ProbabilisticPCA,
    RandomizedPCA, KernelPCA and MiniBatchSparsePCA in turn.  A label
    column (1 for positive rows, 0 for negative rows) is appended as the
    last column of each reduced matrix, which is then written as
    space-delimited text next to the others, using the extensions
    ``.pca``, ``.ppca``, ``.rpca``, ``.kpca`` and ``.spca``.

    Parameters
    ----------
    highDim : int
        Dimension of the input matrices; part of the input file names.
    lowDim : int
        Number of components to keep; part of the output file names.
    """
    (options, args) = parser.parse_args(sys.argv[1:])  #@UnusedVariable
    dataset = options.dataset

    # Acquire the category list.
    catmap = getCatMap(dataset)

    # (name, factory) pairs, in processing order.  The name gives both the
    # progress message ("<name>...") and the output extension (".<name>").
    # Factories keep estimator construction lazy, so each estimator is only
    # built right before it is fitted.
    reducers = (
        ('pca', lambda: PCA(n_components=lowDim)),
        ('ppca', lambda: ProbabilisticPCA(n_components=lowDim)),
        ('rpca', lambda: RandomizedPCA(n_components=lowDim)),
        ('kpca', lambda: KernelPCA(n_components=lowDim)),
        ('spca', lambda: MiniBatchSparsePCA(n_components=lowDim,
                                            n_iter=100)),
    )

    for catName in catmap.keys():
        print('%s : %d : %d\n' % (catName, highDim, lowDim))

        # The builtin ``int`` replaces the ``np.int`` alias (same type),
        # which newer NumPy releases no longer provide.
        catPosFileName = (rootDir + dataset + iwmDir + catName +
                          str(highDim) + iwmext)
        catPosData = np.loadtxt(catPosFileName, dtype=int, delimiter=' ')
        nPosImages = catPosData.shape[0]

        catNegFileName = (rootDir + dataset + iwmDir + 'NEG' + catName +
                          str(highDim) + iwmext)
        catNegData = np.loadtxt(catNegFileName, dtype=int, delimiter=' ')
        nNegImages = catNegData.shape[0]

        # Positives first, then negatives; labels follow the same order.
        catData = np.vstack((catPosData, catNegData))
        labels = np.vstack((np.ones((nPosImages, 1), int),
                            np.zeros((nNegImages, 1), int)))

        outputBase = (rootDir + dataset + outputDir + catName +
                      str(highDim) + str(lowDim))
        for name, makeReducer in reducers:
            print(name + '...')
            reduced = makeReducer().fit(catData).transform(catData)
            reduced = np.hstack((reduced, labels))
            np.savetxt(outputBase + '.' + name, reduced, fmt='%f',
                       delimiter=' ')
Exemplo n.º 10
0
def test_probabilistic_pca_3():
    """Exercise the homoscedastic and heteroscedastic fits on the same data.

    The stated expectation is that the homoscedastic model works slightly
    worse than the heteroscedastic one in an over-fitting condition.

    NOTE: the two scores are computed but never compared, so this test only
    checks that both fit/score paths run without raising.
    """
    n_samples, n_features = 100, 3
    random_state = np.random.RandomState(0)
    center = np.array([3, 4, 5])
    data = random_state.randn(n_samples, n_features) * .1 + center

    model = ProbabilisticPCA(n_components=2)

    # First pass: default fit arguments.
    model.fit(data)
    ll1 = model.score(data)

    # Second pass: the same estimator refitted as heteroscedastic.
    model.fit(data, homoscedastic=False)
    ll2 = model.score(data)
Exemplo n.º 11
0
def test_probabilistic_pca_3():
    """Exercise the homoscedastic and heteroscedastic fits on the same data.

    The stated expectation is that the homoscedastic model works slightly
    worse than the heteroscedastic one in an over-fitting condition.

    NOTE: the two scores are computed but never compared, so this test only
    checks that both fit/score paths run without raising.
    """
    n_samples, n_features = 100, 3
    random_state = np.random.RandomState(0)
    center = np.array([3, 4, 5])
    data = random_state.randn(n_samples, n_features) * .1 + center

    model = ProbabilisticPCA(n_components=2)

    # First pass: default fit arguments.
    model.fit(data)
    ll1 = model.score(data)

    # Second pass: the same estimator refitted as heteroscedastic.
    model.fit(data, homoscedastic=False)
    ll2 = model.score(data)
Exemplo n.º 12
0
def test_probabilistic_pca_2():
    """Check that probabilistic PCA tells apart two different datasets.

    The model is fitted on noise of scale 0.1 around (3, 4, 5); the data it
    was fitted on must get a higher mean score than a fresh sample drawn
    with twice the noise scale around the same point.
    """
    n_samples, n_features = 100, 3
    random_state = np.random.RandomState(0)
    center = np.array([3, 4, 5])
    training = random_state.randn(n_samples, n_features) * .1 + center

    model = ProbabilisticPCA(n_components=2)
    model.fit(training)
    score_on_training = model.score(training)

    # Drawn only after the first scoring, so the random stream is consumed
    # in the same order as before.
    wider = random_state.randn(n_samples, n_features) * .2 + center
    score_on_wider = model.score(wider)

    assert_greater(score_on_training.mean(), score_on_wider.mean())