Example #1
def generate_kpca_compression(X, n_components=16):
    """
    Compresses the data using sklearn KernelPCA implementation.

    :param X: Data (n_samples, n_features)
    :param n_components: Number of dimensions for PCA to keep

    :return: X_prime (the compressed representation), pca
    """

    kpca = KernelPCA(n_components=n_components, kernel='rbf', eigen_solver='arpack', fit_inverse_transform=False)
    kpca.fit(X)

    return kpca.transform(X), kpca
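A minimal usage sketch for the helper above (the 200×64 random matrix and the names below are illustrative only; it assumes the function and its KernelPCA import live in the same module):

import numpy as np
from sklearn.decomposition import KernelPCA

# Illustrative data: 200 samples with 64 features each.
X = np.random.RandomState(0).rand(200, 64)
X_prime, kpca = generate_kpca_compression(X, n_components=16)
print(X_prime.shape)  # expected: (200, 16)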
Example #2
def main():

	#set the timer
	start = time.time()

	#load the data
	trainX = np.load('trainX.npy')
	testX = np.load('testX.npy')
	trainY = np.load('trainY.npy')
	testY = np.load('testY.npy')
	print('\n!!! Data Loading Completed !!!\n')

	#get the 1st digit zero and plot it
	zero = trainX[14].reshape(28, 28)
	plt.imshow(zero, cmap=cm.Greys_r)
	plt.savefig("original"+str(trainY[14])+".png")
	#plt.show()

	#apply kpca
	kpca = KernelPCA(kernel='rbf', gamma=1, fit_inverse_transform=True)
	kpca.fit(trainX[0:3000])
	trainX_kpca = kpca.transform(trainX)
	testX_kpca = kpca.transform(testX)

	#do inverse transform and plot the result
	orig = kpca.inverse_transform(trainX_kpca)
	img = orig[14].reshape(28, 28)
	plt.imshow(img, cmap=cm.Greys_r)
	plt.savefig("reconstructed"+str(trainY[14])+".png")
	#plt.show()

	selector = SelectPercentile(f_classif, percentile=5)
	selector.fit(trainX_kpca, trainY)
	trainX = selector.transform(trainX_kpca)
	testX = selector.transform(testX_kpca)

	#fit a classifier
	parameters = {'n_neighbors' : list(np.arange(15)+1)}
	clf = GridSearchCV(KNeighborsClassifier(weights='distance', n_jobs=-1), parameters)
	clf.fit(trainX, trainY)

	pred = clf.predict(testX)
	print(accuracy_score(testY, pred))
	print(confusion_matrix(testY, pred))
	#print(clf.best_params_)
	print('total : %d, correct : %d, incorrect : %d\n' %(len(pred), np.sum(pred == testY), np.sum(pred != testY)))

	print('Test Time : %f Minutes\n' %((time.time()-start)/60))
Example #3
def gogo_kpca(fxpath, mpath):
    
    kpca_params = {'n_components':256,
                   'kernel':'rbf',
                   'gamma':None,
                   'degree':3,
                   'coef0':1,
                   'kernel_params':None,
                   'alpha':1.0,
                   'fit_inverse_transform':False,
                   'eigen_solver':'auto',
                   'tol':0,
                   'max_iter':None,
                   'remove_zero_eig':True}

    kpca_fname = '%s/kpca_rbf_{0}_{1}.pkl' % mpath

    for i in range(7):
        if i < 5:
            nbreed = 1
            sbreed = 'dog'
            nsubject = i+1
        else:
            nbreed = 2
            sbreed = 'human'
            nsubject = 1 + abs(5-i)

        print 'breed%d.subject%d..' % ( nbreed, nsubject )

        X_ictal = load_features( fxpath, nbreed, nsubject, 1 )
        X_inter = load_features( fxpath, nbreed, nsubject, 2 )

        X = vstack((X_inter, X_ictal))
        del X_inter, X_ictal; gc.collect()

        X_test = load_features( fxpath, nbreed, nsubject, 3 )
    
        X = vstack((X, X_test))
        del X_test; gc.collect()
    
        kpca = KernelPCA(**kpca_params)
        skip_interval = get_skip_interval(X)
        X = kpca_preprocess_features(X)
        kpca.fit(X[::skip_interval])
        with open(kpca_fname.format(sbreed,nsubject),'wb') as f:
            cPickle.dump(kpca,f)

        del X, kpca; gc.collect()
Example #4
def test_kernel_conditioning():
    """Check that ``_check_psd_eigenvalues`` is correctly called in kPCA

    Non-regression test for issue #12140 (PR #12145).
    """

    # create a pathological X leading to small non-zero eigenvalue
    X = [[5, 1], [5 + 1e-8, 1e-8], [5 + 1e-8, 0]]
    kpca = KernelPCA(kernel="linear",
                     n_components=2,
                     fit_inverse_transform=True)
    kpca.fit(X)

    # check that the small non-zero eigenvalue was correctly set to zero
    assert kpca.lambdas_.min() == 0
    assert np.all(kpca.lambdas_ == _check_psd_eigenvalues(kpca.lambdas_))
Example #5
def test_kernel_pca():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("auto", "dense", "arpack"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver,
                             fit_inverse_transform=True)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel
            assert_not_equal(X_fit_transformed.size, 0)

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])

            # inverse transform
            X_pred2 = kpca.inverse_transform(X_pred_transformed)
            assert_equal(X_pred2.shape, X_pred.shape)
Example #6
def test_kernel_pca():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    def histogram(x, y, **kwargs):
        # Histogram kernel implemented as a callable.
        assert_equal(kwargs, {})  # no kernel_params that we didn't ask for
        return np.minimum(x, y).sum()

    for eigen_solver in ("auto", "dense", "arpack"):
        for kernel in ("linear", "rbf", "poly", histogram):
            # histogram kernel produces singular matrix inside linalg.solve
            # XXX use a least-squares approximation?
            inv = not callable(kernel)

            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver, fit_inverse_transform=inv)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed), np.abs(X_fit_transformed2))

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel
            assert_not_equal(X_fit_transformed.size, 0)

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1], X_fit_transformed.shape[1])

            # inverse transform
            if inv:
                X_pred2 = kpca.inverse_transform(X_pred_transformed)
                assert_equal(X_pred2.shape, X_pred.shape)
Example #7
def test_kernel_pca_sparse():
    """Test that kPCA works on a sparse data input.

    Same test as ``test_kernel_pca`` except for ``inverse_transform``, which is
    not implemented for sparse matrices.
    """
    rng = np.random.RandomState(0)
    X_fit = sp.csr_matrix(rng.random_sample((5, 4)))
    X_pred = sp.csr_matrix(rng.random_sample((2, 4)))

    for eigen_solver in ("auto", "arpack", "randomized"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(
                4,
                kernel=kernel,
                eigen_solver=eigen_solver,
                fit_inverse_transform=False,
                random_state=0,
            )
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert X_pred_transformed.shape[1] == X_fit_transformed.shape[1]

            # inverse transform: not available for sparse matrices
            # XXX: should we raise another exception type here? For instance:
            # NotImplementedError.
            with pytest.raises(NotFittedError):
                kpca.inverse_transform(X_pred_transformed)
Example #8
def test_kernel_pca():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("auto", "dense", "arpack"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(4,
                             kernel=kernel,
                             eigen_solver=eigen_solver,
                             fit_inverse_transform=True)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])

            # inverse transform
            X_pred2 = kpca.inverse_transform(X_pred_transformed)
            assert_equal(X_pred2.shape, X_pred.shape)
Example #9
def test_leave_zero_eig():
    """Non-regression test for issue #12141 (PR #12143)

    This test checks that fit().transform() returns the same result as
    fit_transform() in case of non-removed zero eigenvalue.
    """
    X_fit = np.array([[1, 1], [0, 0]])

    # Assert that even with all np warnings on, there is no div by zero warning
    with pytest.warns(None) as record:
        with np.errstate(all="warn"):
            k = KernelPCA(n_components=2,
                          remove_zero_eig=False,
                          eigen_solver="dense")
            # Fit, then transform
            A = k.fit(X_fit).transform(X_fit)
            # Do both at once
            B = k.fit_transform(X_fit)
            # Compare
            assert_array_almost_equal(np.abs(A), np.abs(B))

    for w in record:
        # There might be warnings about the kernel being badly conditioned,
        # but there should not be warnings about division by zero.
        # (Numpy division by zero warning can have many message variants, but
        # at least we know that it is a RuntimeWarning, so let's check only this)
        assert not issubclass(w.category, RuntimeWarning)
Example #10
def test_kernel_pca():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("auto", "dense", "arpack"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(4,
                             kernel=kernel,
                             eigen_solver=eigen_solver,
                             fit_inverse_transform=True)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed),
                                      np.abs(X_fit_transformed2))

            # non-regression test: previously, gamma would be 0 by default,
            # forcing all eigenvalues to 0 under the poly kernel
            assert_not_equal(X_fit_transformed.size, 0)

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1],
                         X_fit_transformed.shape[1])

            # inverse transform
            X_pred2 = kpca.inverse_transform(X_pred_transformed)
            assert_equal(X_pred2.shape, X_pred.shape)
Example #11
def main():

	#set the timer
	start = time.time()

	#load the data
	mnist = fetch_mldata('MNIST original')
	mnist.target = mnist.target.astype(np.int32)

	seed = np.random.randint(1,30000)
	rand = np.random.RandomState(seed)
	items = len(mnist.target)
	indices = rand.randint(items, size = 70000)
	trindex = indices[0:30000]
	tsindex = indices[30000:]

	#scale down features to the range [0, 1]
	mnist.data = mnist.data/255.0
	mnist.data = mnist.data.astype(np.float32)

	trainX = mnist.data[trindex]
	testX = mnist.data[tsindex]
	trainY = mnist.target[trindex]
	testY = mnist.target[tsindex]

	#extract the features using KPCA
	kpca = KernelPCA(kernel='precomputed')
	kpca_train = arc_cosine(trainX[0:1000], trainX[0:1000])
	#Fit the model from data in X
	kpca.fit(kpca_train)

	kernel_train = arc_cosine(trainX, trainX[0:1000])
	kernel_test = arc_cosine(testX, trainX[0:1000])

	trainX_kpca = kpca.transform(kernel_train)
	testX_kpca = kpca.transform(kernel_test)
	print(testX_kpca.shape)

	#fit the svm model and compute accuracy measure
	clf = svm.SVC(kernel=arc_cosine)
	clf.fit(trainX_kpca, trainY)

	pred = clf.predict(testX_kpca)
	print(accuracy_score(testY, pred))
	print('total : %d, correct : %d, incorrect : %d\n' %(len(pred), np.sum(pred == testY), np.sum(pred != testY)))

	print('Test Time : %f Minutes\n' %((time.time()-start)/60))
Example #12
def PCA_rotate_data(feature_vector, n_points=256, nonlinear=False):
    """
    Rotate the data to align with the principal components
    of our acceleration data.
    
    feature vector is in the form [a_x_0, a_y_0, a_z_0, a_x_1,......]

    The component with the largest eigenvalue becomes the z-axis;
    the second largest becomes the y-axis.
    """

    accXYZ = feature_vector.reshape(n_points,-1)
    if nonlinear:
        pca = KernelPCA(n_components=3, kernel='polynomial')
    else:
        pca = PCA(n_components = 3)
       
    pca.fit(accXYZ)
    
    # pca.explained_variance_ holds the eigenvalues (the importance of each axis);
    # the matching rows of pca.components_ are the eigenvector directions.
    if nonlinear:
        eigVals = pca.lambdas_
        eigVects = pca.alphas_
    else:
        eigVals = pca.explained_variance_
        eigVects = pca.components_

    x_index = np.argmin(eigVals)
    z_index = np.argmax(eigVals)
    y_index = list(set([0,1,2]) - set([x_index, z_index]))[0]

    new_x_hat = eigVects[x_index]/norm(eigVects[x_index])
    new_y_hat = eigVects[y_index]/norm(eigVects[y_index])
    new_z_hat = eigVects[z_index]/norm(eigVects[z_index])

    rotPCAData = []

    for i in range(len(accXYZ)):
        v = accXYZ[i]
        new_x = np.dot(new_x_hat, v )
        new_y = np.dot(new_y_hat, v )
        new_z = np.dot(new_z_hat, v )
        rotPCAData += [new_x, new_y, new_z]

    return rotPCAData, [new_x_hat, new_y_hat, new_z_hat]
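A hedged usage sketch for PCA_rotate_data; it assumes the module also imports numpy, numpy.linalg.norm, and the scikit-learn PCA/KernelPCA used inside the function, and the synthetic accelerometer trace is purely illustrative:

import numpy as np
from numpy.linalg import norm
from sklearn.decomposition import PCA, KernelPCA

# Illustrative flattened accelerometer trace: 256 samples of (a_x, a_y, a_z).
feature_vector = np.random.RandomState(0).randn(256 * 3)
rotated, (x_hat, y_hat, z_hat) = PCA_rotate_data(feature_vector, n_points=256)
print(len(rotated), z_hat)  # 768 rotated values plus the new z-axis direction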
Example #13
def __pca_test(dem, X_tr, X_te, y_train, y_test):
    reg = KernelPCA(kernel='linear', n_components=dem, random_state=1)
    re = reg.fit(np.vstack((X_tr, X_te)))
    X_train = re.transform(X_tr)
    X_test = re.transform(X_te)
    reg, score = __mlp_test(X_train, X_test, y_train, y_test)
    print(score)
    return reg, score
Example #14
def generate_kpca_compression(X, n_components=16):
    """
    Compresses the data using sklearn KernelPCA implementation.

    :param X: Data (n_samples, n_features)
    :param n_components: Number of dimensions for PCA to keep

    :return: X_prime (the compressed representation), pca
    """

    kpca = KernelPCA(n_components=n_components,
                     kernel='rbf',
                     eigen_solver='arpack',
                     fit_inverse_transform=False)
    kpca.fit(X)

    return kpca.transform(X), kpca
Example #15
def reduction(data, params):

    # parse parameters (exec() cannot create function locals in Python 3,
    # so read the keys used below directly from the params dict)
    n_components = params['n_components']
    kernel = params['kernel']

    # apply PCA

    kpca = KernelPCA(n_components=n_components, kernel=kernel)
    kpca.fit(data)
    X = kpca.transform(data)

    return X
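A short usage sketch for reduction(); the parameter dict mirrors the two keys the function reads, and it assumes numpy and KernelPCA are imported in the same module:

import numpy as np
from sklearn.decomposition import KernelPCA

data = np.random.RandomState(0).rand(100, 20)
X_reduced = reduction(data, {'n_components': 2, 'kernel': 'rbf'})
print(X_reduced.shape)  # expected: (100, 2)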
Example #16
def decom_kernel_pca_n20():

        filename = test_bench_path
        data = loadcsv(filename, 1, 4)
        print("data shape : ", data.shape)

        my_pca = KernelPCA(n_components=20, kernel='rbf')
        my_pca.fit(data)
        reduced_data = my_pca.transform(data)
        print("reduced data shape : ", reduced_data.shape)
        '''
        fig = plt.figure()
        ax = Axes3D(fig)
        ax.scatter(reduced_data[:, 0], reduced_data[:, 1], reduced_data[:, 2])
        plt.show()
        '''
        return my_pca
Example #17
def kernel_pca_fit(n_components, train, test, shape, kernel="linear"):
    # Available kernels:
    # "linear", "poly", "rbf", "sigmoid", "cosine", "precomputed"
    # Set and fit KernelPCA
    kpca = KernelPCA(n_components=n_components,
                     kernel=kernel,
                     fit_inverse_transform=True)
    kpca.fit(train)
    # Reduce dimension
    test_reduced = kpca.transform(test)
    # Recover data from the lower dimension
    test_recovered = kpca.inverse_transform(test_reduced)
    # Calculate the MSE
    mse = np.mean((test_recovered - test)**2)
    # Reshape into a matrix
    test_recovered = test_recovered.reshape(shape)
    return kpca, test_recovered, mse
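A usage sketch for kernel_pca_fit with made-up data; the shape argument only controls how the recovered test rows are reshaped (numpy and KernelPCA are assumed to be imported as above):

import numpy as np
from sklearn.decomposition import KernelPCA

rng = np.random.RandomState(0)
train, test = rng.rand(64, 100), rng.rand(16, 100)    # 100 "pixels" per row
kpca, recovered, mse = kernel_pca_fit(8, train, test, shape=(16, 10, 10))
print(recovered.shape, round(mse, 4))                 # (16, 10, 10) and the reconstruction MSE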
Example #18
def main(args):
    df = pd.load(args.df)
    y = integer_labels(df)

    pca = KernelPCA(None, kernel=args.kernel)
    pca.fit(df)
    X = pca.transform(df)

    nonzero_components = X.shape[1]

    seed = int(time.time() * 1000)

    gmm = GMM(4, n_init=10, random_state=seed)
    gmm.fit(X)
    c = gmm.predict(X)

    score, _ = compare_clusters(c, y)

    best = score

    with open(args.out, 'w') as fh:
        fh.write('{} {} {} {}\n'.format(args.kernel, nonzero_components, seed,
                                        best))

    n_comps = list(range(2, 16)) + [
        int(i) for i in np.linspace(16, nonzero_components, 20)]

    for n in n_comps:
        pca = KernelPCA(n, kernel=args.kernel)
        pca.fit(df)
        X = pca.transform(df)

        for i in range(128):
            seed = int(time.time() * 1000)

            gmm = GMM(4, random_state=seed)
            # fit on the kernel-PCA projection computed above, not the raw frame
            gmm.fit(X)
            c = gmm.predict(X)

            score, _ = compare_clusters(c, y)
            if score > best:
                best = score
                with open(args.out, 'a') as fh:
                    fh.write('{} {} {} {}\n'.format(args.kernel, n, seed,
                                                    best))
Example #19
File: letor.py, Project: nournia/python-hw
def doKernelPCA(q, components=40):
	global data

	# load test query
	loadFile('test', q)
	
	# fit model
	kpca = KernelPCA(components, kernel="rbf")
	kpca.fit(data)

	# transform and print test query
	data = kpca.transform(data)
	printFile('test{}'.format(q))

	for kind in ['train', 'vali']:
		loadFile(kind)
		data = kpca.transform(data)
		printFile(kind + str(q))
Example #20
def plot_kernel_pca_variance():
    for dataset in datasets:
        X_train, X_test, y_train, y_test, target_names = dataset.get_data(
            model='KMeans')
        pca = KernelPCA(n_components=2, kernel='rbf', gamma=15)
        pca.fit(X_train)
        X_skernpca = pca.transform(X_train)
        plt.scatter(X_skernpca[y_train == 0, 0],
                    X_skernpca[y_train == 0, 1],
                    color='red',
                    marker='^',
                    alpha=0.5)
        plt.scatter(X_skernpca[y_train == 1, 0],
                    X_skernpca[y_train == 1, 1],
                    color='blue',
                    marker='o',
                    alpha=0.5)
        plt.show()
Example #21
class KernelPCAReduction(AbstractReduction):
    """
    Use kernel PCA to reduce dimensionality

    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html
    """
    def __init__(self, n_components, **kwargs):
        self.pca = KernelPCA(n_components=n_components, **kwargs)
        # stored under a private name so the attribute does not shadow the method below
        self._n_components = n_components

    def n_components(self):
        return self._n_components

    def fit(self, X, Y=None):
        self.pca.fit(X)

    def transform(self, X):
        return self.pca.transform(X)
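A usage sketch for KernelPCAReduction, assuming the AbstractReduction base class and the KernelPCA import from that repository are available; the random matrix is illustrative:

import numpy as np

X = np.random.RandomState(0).rand(50, 10)
reducer = KernelPCAReduction(n_components=3, kernel='rbf')
reducer.fit(X)
print(reducer.transform(X).shape)  # expected: (50, 3)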
Example #22
def testKernel(n_components, kernel, degree):
    print(n_components, kernel)
    kpca = KernelPCA(n_components, kernel=kernel, degree=degree)
    kpca_data = kpca.fit(data).transform(data)
    plt.scatter(kpca_data[:, 0],
                kpca_data[:, 1],
                c=labels,
                cmap='nipy_spectral')
    plt.show()
Example #23
def decom_kernel_pca_n3(stage):
    data = []
    if stage=='assembly':
        data = loadcsv("data/assembly_training.csv", 1, 2)
    else:
        data = loadcsv("data/test_bench_training.csv", 1, 4)

    my_pca = KernelPCA(n_components=3, kernel='rbf')
    my_pca.fit(data)
    reduced_data = my_pca.transform(data)
    print("reduced data shape : ", reduced_data.shape)
    '''
    fig = plt.figure()
    ax = Axes3D(fig)
    ax.scatter(reduced_data[:, 0], reduced_data[:, 1], reduced_data[:, 2])
    plt.show()
    '''
    return my_pca
Example #24
def KPCA10Fold(X, y):
    acc = []
    kf = KFold(X.shape[0], n_folds=10, shuffle=True)
    i = 0
    for train_index, test_index in kf:
        yTest = y[test_index]
        yTrain = y[train_index]
        clf = KernelPCA(kernel="rbf", fit_inverse_transform=True, gamma=10)
        clf.fit(X[train_index])
        newRepTrain = clf.transform(X[train_index])
        newRepTest = clf.transform(X[test_index])
        nclf = neighbors.KNeighborsClassifier(n_neighbors=2)
        nclf.fit(newRepTrain, yTrain)
        XPred = nclf.predict(newRepTest)
        acc.append(np.sum(XPred == yTest) * 1.0 / yTest.shape[0])
        #         print i,":",acc[i]
        i += 1
    return np.mean(acc), np.std(acc)
Example #25
File: letor.py, Project: nournia/python-hw
def doKernelPCA(q, components=40):
    global data

    # load test query
    loadFile('test', q)

    # fit model
    kpca = KernelPCA(components, kernel="rbf")
    kpca.fit(data)

    # transform and print test query
    data = kpca.transform(data)
    printFile('test{}'.format(q))

    for kind in ['train', 'vali']:
        loadFile(kind)
        data = kpca.transform(data)
        printFile(kind + str(q))
Example #26
def rf_assemble():

    assemble_pos, assemble_neg, assemble_X, assemble_Y = read_label_data()
    k_pca = KernelPCA(n_components=3, kernel='rbf')
    k_pca.fit(assemble_X)
    assemble_X_reduced = k_pca.transform(assemble_X)
    rfr = RandomForestClassifier(n_estimators=200,
                                 max_depth=10,
                                 class_weight={0: 80})
    rfr.fit(assemble_X_reduced, assemble_Y)

    validate_pos, validate_neg, validate_X, validate_Y = read_label_validation(
    )
    validate_X_reduced = k_pca.transform(validate_X)
    rfr_res = rfr.predict(validate_X_reduced)
    print('precision : ', precision_score(rfr_res, validate_Y))
    validate_neg_reduced = k_pca.transform(validate_neg)
    print('miss classified : ',
          np.sum(np.abs(rfr.predict(validate_neg_reduced))))
Example #27
def getKPCAcomp(dict_read):
    A = np.arange(10000)
    for key in dict_read.keys():
        if key<=1000:
            [sample_rate,X] = dict_read.get(key)
            # if the song doesn't have 10000 features, pad with zeros at the end (this usually isn't the case)
            if (len(X)<10000):
                dif = 10000 - len(X)
                for i in range(dif):
                    X = np.hstack((X,0.0))
            A = np.vstack((A,X[:10000]))
        else:
            break
    A = np.delete(A, 0, 0)
    A = A.astype(float)
    kpca = KernelPCA(n_components=100, kernel="rbf")
    kpca.fit(A)
    A = kpca.transform(A)
    return A
Example #28
File: pca.py, Project: DaMSL/ddc
def calc_kpca(xyz, kerneltype=None, title=None, n_comp=None):
  n_dim = np.prod(xyz.shape[1:])
  result = []
  if kerneltype is None:
    klist = ['linear', 'poly', 'rbf', 'sigmoid', 'cosine']
  else:
    klist = [kerneltype]
  for ktype in klist:
    kpca = KernelPCA(kernel=ktype, n_components=n_comp)
    st = dt.datetime.now()
    kpca.fit(xyz.reshape(len(xyz), n_dim))
    if title is not None:
      with open('kpca_%s_%s.dat' % (title, ktype), 'wb') as out:
        out.write(pickle.dumps(kpca))
    result.append(kpca)
  if kerneltype is None:
    return result
  else:
    return result[0]
Example #29
class KernelPCAReduction(AbstractReduction):
    """
    Use kernel PCA to reduce dimensionality

    http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.KernelPCA.html
    """

    def __init__(self, n_components, **kwargs):
        self.pca = KernelPCA(n_components=n_components, **kwargs)
        # stored under a private name so the attribute does not shadow the method below
        self._n_components = n_components

    def n_components(self):
        return self._n_components

    def fit(self, X, Y=None):
        self.pca.fit(X)

    def transform(self, X):
        return self.pca.transform(X)
Example #30
File: pca.py, Project: DaMSL/ddc
def calc_kpca(xyz, kerneltype=None, title=None, n_comp=None):
    n_dim = np.prod(xyz.shape[1:])
    result = []
    if kerneltype is None:
        klist = ['linear', 'poly', 'rbf', 'sigmoid', 'cosine']
    else:
        klist = [kerneltype]
    for ktype in klist:
        kpca = KernelPCA(kernel=ktype, n_components=n_comp)
        st = dt.datetime.now()
        kpca.fit(xyz.reshape(len(xyz), n_dim))
        if title is not None:
            with open('kpca_%s_%s.dat' % (title, ktype), 'wb') as out:
                out.write(pickle.dumps(kpca))
        result.append(kpca)
    if kerneltype is None:
        return result
    else:
        return result[0]
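A usage sketch for calc_kpca, assuming its module-level imports (numpy as np, datetime as dt, pickle, and KernelPCA) are in place; passing kerneltype fits a single kernel, and with title=None nothing is written to disk:

import datetime as dt
import numpy as np
from sklearn.decomposition import KernelPCA

# Illustrative trajectory: 60 frames of 10 atoms x 3 coordinates.
xyz = np.random.RandomState(0).rand(60, 10, 3)
kpca = calc_kpca(xyz, kerneltype='rbf', n_comp=5)
print(kpca.transform(xyz.reshape(len(xyz), -1)).shape)  # expected: (60, 5)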
Example #31
def runwSkPCA(train_x, train_y, test_x, test_y):
    dim_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    score_list = []
    for dim in dim_list:
        pca = KernelPCA(n_components=dim, kernel="sigmoid")
        pca.fit(train_x)
        train_x_r = pca.transform(train_x)
        test_x_r = pca.transform(test_x)
        model = Sequential()
        model.add(
            Dense(500, input_shape=(train_x_r.shape[1], ),
                  activation="relu"))  #28*28=784
        model.add(Dropout(0.5))
        model.add(Dense(500, activation="relu"), )
        model.add(Dropout(0.5))
        model.add(Dense(10))
        model.add(Activation("softmax"))
        sgd = SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
        model.compile(loss='categorical_crossentropy',
                      optimizer=sgd,
                      metrics=['accuracy'])

        model.fit(train_x_r,
                  train_y,
                  batch_size=200,
                  epochs=400,
                  shuffle=True,
                  verbose=0,
                  validation_split=0.1)
        print("start test")
        scores = model.evaluate(test_x_r, test_y, batch_size=200, verbose=1)
        print("The test loss is: " + str(scores[0]))
        print('Test accuracy:', str(scores[1]))
        score_list.append(scores[1])
        print("NN with " + str(dim) + "-dim sigmoid kPCA score: " +
              str(scores[1]))

    plt.plot(dim_list, score_list, color='red')
    plt.title("NN Score with sigmoid kernel PCA")
    plt.xlabel("Dimension")
    plt.ylabel("Score")
    plt.show()
Example #32
class KPCA:
    def __init__(self, rfe_cv, *args, **kwargs):
        self.rfe = None
        self.rfe_cv = rfe_cv
        self.model = KernelPCA(*args, **kwargs)

    def fit(self, X, y):
        Z = numpy.concatenate([X, y.reshape(-1, 1)], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        X_, y_ = X[~pandas.isna(Z).any(axis=1), :], y[~pandas.isna(Z).any(
            axis=1)]
        if Z.shape[0] != X.shape[0]:
            print(
                'FIT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            self.model.fit(X_)

    def predict(self, X):
        Z = numpy.concatenate([X], axis=1)
        Z = numpy.array(Z, dtype=numpy.float32)
        Z[Z == numpy.inf] = numpy.nan
        Z[Z == -numpy.inf] = numpy.nan
        nan_mask = ~pandas.isna(Z).any(axis=1)
        X_ = X[nan_mask, :]
        if Z.shape[0] != X.shape[0]:
            print(
                'PREDICT: the sample contains NaNs, they were dropped\tN of dropped NaNs: {0}'
                .format(X.shape[0] - X_.shape[0]))
        if self.rfe_cv:
            raise Exception("PCA could not be processed with RFE_CV")
        else:
            predicted = self.model.transform(X_)
            Z = numpy.full(shape=(X.shape[0], predicted.shape[1]),
                           fill_value=numpy.nan,
                           dtype=numpy.float64)
            Z[nan_mask, :] = predicted
        return Z
Example #33
def getProjectionMatrixKPCA(dim=50):
    """ Kernel PCA : see paper for detailed description"""
    # Create an X for the hierarchy
    X = np.zeros((len(labelDict), len(labelDict)))
    for item in labelDict:
        pars = getPathToRoot(item)
        for par in pars:
            X[labelIndex[item]][labelIndex[par]] = 1
    kpca = KernelPCA(n_components=dim, fit_inverse_transform=True)
    X_kpca = kpca.fit(X)
    return kpca, kpca.alphas_
Example #34
class KPCA(BaseEstimator, TransformerMixin):
    def __init__(self, kernel='linear', is_on=1):
        self.is_on = is_on
        self.kernel = kernel
        self.model = KernelPCA(kernel=self.kernel)

    def fit(self, X, y=None):
        if (self.is_on == 1):
            X = check_array(X)
            self.model.fit(X)
            print("PCA fitted")
        return self

    def transform(self, X, y=None):
        if (self.is_on == 1):
            X_new = self.model.transform(X)
            print("PCA transformed")
            return X_new
        else:
            return X
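A usage sketch for this transformer, assuming its imports (BaseEstimator, TransformerMixin, check_array, KernelPCA) are present in the module; with no n_components set, KernelPCA keeps every non-zero component:

import numpy as np

X = np.random.RandomState(0).rand(40, 6)
step = KPCA(kernel='rbf', is_on=1)
X_new = step.fit(X).transform(X)
print(X_new.shape)  # roughly (40, n_nonzero_components)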
Example #35
def runPCA(X_train, X_test, y_train, y_test, comp_range, Kernel):
    C = SVMmodel.getBestParam(Kernel)
    scores = []
    for n_comp in comp_range:
        print("\nn_comp=%d\n" % (n_comp))
        transformer = KernelPCA(n_components=n_comp,
                                kernel=Kernel,
                                copy_X=True,
                                n_jobs=8)
        transformer.fit(X_train)
        X_train_proj = transformer.transform(X_train)
        X_test_proj = transformer.transform(X_test)
        if n_comp == 2:
            np.save('X_train_proj_2d_' + Kernel, X_train_proj)
            np.save('X_test_proj_2d_' + Kernel, X_test_proj)
        score = SVMmodel.runSVM(X_train_proj, X_test_proj, y_train, y_test, C,
                                Kernel)
        scores.append(score.mean())
        print(scores)
    return scores
Example #36
def DoKPCA(kernel, pcaData, varN=None):
    # do pca
    print('Task KPCA : START TIME:' +
          time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    kpca = KernelPCA(varN, kernel=kernel, fit_inverse_transform=True)
    X_r = kpca.fit(pcaData).transform(pcaData)
    # print('explained variance ratio (first two components): %s' % str(kpca.explained_variance_ratio_))
    print(np.shape(X_r))
    print('Task KPCA : END TIME:' +
          time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
    return X_r
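A usage sketch for DoKPCA with random data, assuming time, numpy, and KernelPCA are imported as the function expects; varN fixes the number of components:

import time
import numpy as np
from sklearn.decomposition import KernelPCA

pcaData = np.random.RandomState(0).rand(80, 12)
X_r = DoKPCA('rbf', pcaData, varN=2)  # prints timing messages and the (80, 2) shape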
Example #37
def test_kernel_pca_n_components():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("dense", "arpack"):
        for c in [1, 2, 4]:
            kpca = KernelPCA(n_components=c, eigen_solver=eigen_solver)
            shape = kpca.fit(X_fit).transform(X_pred).shape

            assert shape == (2, c)
Example #38
def test_kernel_pca_n_components():
    rng = np.random.RandomState(0)
    X_fit = rng.random_sample((5, 4))
    X_pred = rng.random_sample((2, 4))

    for eigen_solver in ("dense", "arpack"):
        for c in [1, 2, 4]:
            kpca = KernelPCA(n_components=c, eigen_solver=eigen_solver)
            shape = kpca.fit(X_fit).transform(X_pred).shape

            assert_equal(shape, (2, c))
Example #39
def create_model(params):


    kpca = KernelPCA(kernel=params['kernel']['ktype'], n_components=params['n_components'])
    print('---------------------------------------')
    print('Kernel:  {}'.format(params['kernel']['ktype']))
    ensemble_kernel.append(params['kernel']['ktype'])
    print('N comp:   {}'.format(params['n_components']))
    ensemble_comp.append(params['n_components'])
    kpca.fit(x_train)
    train_img = kpca.transform(x_train)
    X_kpca = kpca.transform(x_valid)

    # Run Random Forest Classifier and plot result if n < 4
    validation_acc = run_rf(train_img, y_train, X_kpca, y_valid, "KPCA-RF ")
    ensemble_acc.append(validation_acc)
    # save for later
    print('Val. Acc:    {}'.format(validation_acc))

    return {'loss': -validation_acc, 'status': STATUS_OK, 'model': kpca}
Example #40
    def plot_state_space_3d(self,nmax=2000,kernal=None,interpol_n=10):
        from scipy.interpolate import interp1d
        import matplotlib.pyplot as plt
        from mpl_toolkits.mplot3d import Axes3D

        fig = plt.figure(figsize=(6,6))
        ax = fig.add_subplot(111, projection='3d')
        if self.nx>3:
            if kernal is None:
                from sklearn.decomposition import KernelPCA
                kernal = KernelPCA(n_components=3,fit_inverse_transform=True)
            kernal.fit(self.x)
            F = kernal.transform(self.x[:nmax])
        else:
            F = self.x[:nmax]
        f = interp1d(np.arange(F.shape[0]),F.T,kind='quadratic')
        out = f(np.arange(0,F.shape[0]-1,1/interpol_n))
        ax.plot(out[0],out[1],out[2])
        # plt.show()
        return kernal
Example #41
def Compute_var_ratio(HE_MI_train_test, kernel, invTran, degree):
    MyDataSet = HE_MI_train_test
    my_HEtraining = MyDataSet[0]
    my_MItraining = MyDataSet[1]
    my_HEtest = MyDataSet[2]
    my_MItest = MyDataSet[3]

    kpca = KernelPCA(kernel=kernel, fit_inverse_transform=invTran, degree=degree)
    HE_train_kpca = kpca.fit(my_HEtraining)
    HE_train_var = HE_train_kpca.lambdas_

    MI_train_kpca = kpca.fit(my_MItraining)
    MI_train_var = MI_train_kpca.lambdas_

    HE_test_kpca = kpca.fit(my_HEtest)
    HE_test_var = HE_test_kpca.lambdas_

    MI_test_kpca = kpca.fit(my_MItest)
    MI_test_var = MI_test_kpca.lambdas_

    return [HE_train_var, MI_train_var, HE_test_var, MI_test_var]
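A usage sketch for Compute_var_ratio with four illustrative blocks; it assumes an older scikit-learn in which KernelPCA exposes the eigenvalues as lambdas_ (renamed eigenvalues_ in later releases) and the usual numpy/KernelPCA imports:

import numpy as np
from sklearn.decomposition import KernelPCA

rng = np.random.RandomState(0)
HE_MI_train_test = [rng.rand(30, 6) for _ in range(4)]  # HE/MI training and test blocks
variances = Compute_var_ratio(HE_MI_train_test, kernel='rbf', invTran=True, degree=3)
print([v.shape for v in variances])  # one eigenvalue vector per block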
Example #42
def kernel_kpca():
    start_time = time.time()
    data_array_all, sample_number_list = read_csv(data_dir)
    shape = data_array_all.shape
    feature_num = shape[1]
    data_num = shape[0]

    kpca = KernelPCA(n_components=feature_num // 8,
                     kernel="rbf",
                     fit_inverse_transform=True,
                     gamma=10)
    kpca.fit(data_array_all)
    dimension_reduction = kpca.transform(data_array_all)
    save_dir = './dr_results/{1}-kpca-{0}.csv'.format(feature_num // 8, target)
    save_csv_data(dir=save_dir,
                  data=dimension_reduction,
                  sample_number_list=sample_number_list,
                  name='kpca')
    # return dimension_reduction
    print("---kpca for {0} {1} seconds ---".format(target,
                                                   time.time() - start_time))
Example #43
def kpca_run(kernel='linear'):
    pca = KernelPCA(n_components=2, kernel=kernel)

    pca_data = pca.fit(data).transform(data)

    fig, axs = plt.subplots(1, 1)

    axs.scatter(pca_data[:, 0], pca_data[:, 1], c=labels, cmap='rainbow')
    axs.set_xlabel('PC1')
    axs.set_ylabel('PC2')
    axs.set_title(kernel)

    plt.show()
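kpca_run reads the module-level data and labels, so a sketch has to define those first; the random blobs below (and the matplotlib/sklearn imports) are assumptions, not part of the original script:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import KernelPCA

rng = np.random.RandomState(0)
data = rng.rand(150, 8)                 # module-level globals used by kpca_run
labels = rng.randint(0, 3, size=150)
kpca_run(kernel='rbf')                  # scatter plot of the first two kernel PCs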
Example #44
File: pca.py, Project: DaMSL/ddc
class PCAKernel(PCAnalyzer):
  """ Non-linear PCA as wrapper over SciKitLearn Kernels """
  def __init__(self, components, ktype='poly'):
    PCAnalyzer.__init__(self)
    if isinstance(components, int):
      self.n_components = components
    self.pca = KernelPCA(kernel=ktype, n_components=components)
    self.type = 'kernel'

  def solve(self, X):
    self.dim = np.prod(X.shape[1:])
    self.pca.fit(X.reshape(len(X), self.dim))
    self.trainsize = len(X)

  def project(self, X):
    if isinstance(X, list):
      X = np.array(X)
    dimX = np.prod(X.shape[1:])
    if dimX != self.dim:
      logging.error('Projection Error in KPCA: Cannot reshape/project %s size data using PC Vects of size, %s', str(X.shape), str(self.dim))
      return None
    projection = self.pca.transform(X.reshape(len(X), dimX))
    return projection
Example #45
def perform_kpca(input_data):
    '''
    Apply kernel PCA to the outlier-removed data,
    using the scikit-learn KernelPCA implementation.
    '''
    from sklearn.decomposition import KernelPCA
    
    # Specify the kernel function used in the kernel PCA
    KERNEL = raw_input('Enter the kernel for KernelPCA (options: cosine, rbf, linear, sigmoid): ')
    kpca=KernelPCA(n_components=len(input_data.T),kernel=KERNEL)
    # Scale the input dataset
    from sklearn.preprocessing import scale
    scld_input_data= scale(input_data, axis=0, with_mean=True, with_std=True, copy=True )
    kpca.fit(scld_input_data)
    # Transform the dataset on the given PC's
    kpca_input_data=kpca.transform(scld_input_data)
    # Percentage of variance explained by each component
    Kpca_percent=np.array(map(lambda y: (kpca.lambdas_[y]/sum(kpca.lambdas_)),range(len(kpca.lambdas_))))
    Var_explained=np.c_[Kpca_percent.reshape(len(Kpca_percent),1)]
    print '\nVariance explained by eigenvalues of KPCA'
    print (['Kpca'])
    print Var_explained
    return (kpca_input_data)
Example #46
def test_kernel_pca_sparse():
    rng = np.random.RandomState(0)
    X_fit = sp.csr_matrix(rng.random_sample((5, 4)))
    X_pred = sp.csr_matrix(rng.random_sample((2, 4)))

    for eigen_solver in ("auto", "arpack"):
        for kernel in ("linear", "rbf", "poly"):
            # transform fit data
            kpca = KernelPCA(4, kernel=kernel, eigen_solver=eigen_solver, fit_inverse_transform=False)
            X_fit_transformed = kpca.fit_transform(X_fit)
            X_fit_transformed2 = kpca.fit(X_fit).transform(X_fit)
            assert_array_almost_equal(np.abs(X_fit_transformed), np.abs(X_fit_transformed2))

            # transform new data
            X_pred_transformed = kpca.transform(X_pred)
            assert_equal(X_pred_transformed.shape[1], X_fit_transformed.shape[1])
Example #47
def test_leave_zero_eig():
    """This test checks that fit().transform() returns the same result as
    fit_transform() in case of non-removed zero eigenvalue.
    Non-regression test for issue #12141 (PR #12143)"""
    X_fit = np.array([[1, 1], [0, 0]])

    # Assert that even with all np warnings on, there is no div by zero warning
    with pytest.warns(None) as record:
        with np.errstate(all='warn'):
            k = KernelPCA(n_components=2, remove_zero_eig=False,
                          eigen_solver="dense")
            # Fit, then transform
            A = k.fit(X_fit).transform(X_fit)
            # Do both at once
            B = k.fit_transform(X_fit)
            # Compare
            assert_array_almost_equal(np.abs(A), np.abs(B))

    for w in record:
        # There might be warnings about the kernel being badly conditioned,
        # but there should not be warnings about division by zero.
        # (Numpy division by zero warning can have many message variants, but
        # at least we know that it is a RuntimeWarning, so let's check only this)
        assert not issubclass(w.category, RuntimeWarning)
Example #48
def kpca(Y, k, params):
    """KPCA driver.

    Runs KPCA on the input data matrix and UPDATES the KPCA parameters given
    by the user. See

    [1] B. Schoelkopf, A. Smola, and K. R. Muller. "Nonlinear component analysis
        as a kernel eigenvalue problem", Neural Computation, vol. 10,
        pp. 1299-1319, 1998

    for technical details on KPCA.

    Parameters:
    -----------
    Y : numpy array, shape = (N, D)
        Input matrix of D N-dimensional signals.

    k : int
        Compute k KPCA components.

    params : KPCAParam instance
        KPCA parameters.

        Upon completion, the params is updated. The following fields
        are set:

            _data : numpy.array, shape = (N, D) - Original data
            _A : numpy.array, shape = (N, k)    - KPCA weight matrix
            _l : numpy.array, shape = (k,)      - Eigenvalues of kernel matrix

        The following fields need to be set already:

            _kPar : Kernel parameters (depends on the kernel)
            _kFun : Kernel function   (depends on the kernel)

        Since the kernel will be called internally, the kernel parameters
        will also be updated (see kernel documentation).

    Returns:
    --------
    Xhat : numpy array, shape (k, D)
        NLDS state parameters.
    """

    if not isinstance(params, KPCAParam):
        raise ErrorDS('wrong KPCA parameters!')

    if (params._kPar is None or params._kFun is None):
        raise ErrorDS('KPCA not properly configured!')

    # save data
    params._data = Y

    # calls kernel fun
    params._kFun(Y, Y, params._kPar)
    kpcaObj = KernelPCA(kernel="precomputed")
    kpcaObj.fit(params._kPar._kMat)

    params._A = kpcaObj.alphas_[:,0:k]
    params._l = kpcaObj.lambdas_[0:k]

    if np.any(kpcaObj.lambdas_ <= 0):
        dsinfo.warn("some unselected eigenvalues are negative!")
    if np.any(params._l < 0):
        dsinfo.warn("some eigenvalues are negative!")

    # normalize KPCA weight vectors
    normalize(params._A, params._l)
    return params._A.T*params._kPar._kMat
Example #49
File: main.py, Project: andybaoxv/COPDGene
#nmi_0 = nmi_revised(gold,labels_predict)
#nmi_1 = nmi_revised(gold,labels_fs1_predict)
#nmi_2 = nmi_revised(gold,labels_fs2_predict)


print nmi_0,nmi_1,nmi_2


#PCA
gamma = 1.0/(2*sigma_rbf**2)
degree = 3
color = ['b','r','g','m','y','k']

kpca_0 = KernelPCA(n_components=kpca_num,kernel='rbf',gamma=gamma
                   ,degree=degree)
kpca_0.fit(data)
kpca_data = kpca_0.fit_transform(data)
fig = plt.figure(4)
if kpca_num == 2:
    for i in range(len(labels_predict)):
        plt.scatter(kpca_data[i,0],kpca_data[i,1],c=color[labels_predict[i]]
                    ,marker='o')
else:
    ax = fig.add_subplot(111,projection='3d')
    for i in range(len(labels_predict)):
        ax.scatter(kpca_data[i,0],kpca_data[i,1],kpca_data[i,2]
                   ,c=color[labels_predict[i]],marker='o')
       
kpca_1 = KernelPCA(n_components=kpca_num,kernel='rbf',gamma=gamma,degree=degree)
kpca_1.fit(data_fs1)
kpca_data_fs1 = kpca_1.transform(data_fs1)
Example #50
    rbf_svc = svm.SVC(kernel='rbf', gamma=0.00005, C=50).fit(X, y)
    #poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X, y)
    lin_svc = svm.LinearSVC(C=C).fit(X, y)    
    
    for i, clf in enumerate((rbf_svc,lin_svc)):
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max]x[y_min, y_max].
        z = clf.score(X_test, Y_test)
        if clf == rbf_svc :
            print("RBF",z)
        else:
            print("Linear",z)
            
clf = svm.SVC(kernel='rbf', gamma=0.00005, C=50).fit(X_train, Y_train.ravel())
pca = PCA(n_components=20)
pca.fit(X_testing)
print(pca.explained_variance_ratio_)
X_testing = pca.transform(X_testing) 
Y_test = clf.predict(X_testing)

Y_testing = np.zeros((len(Y_test),3))
for i in range (0,len(Y_test)):
    if (Y_test[i] == 0):
        Y_testing[i,0] = 1
    elif(Y_test[i] == 1):
        Y_testing[i,1] = 1
    elif(Y_test[i] == 3):
        Y_testing[i,2] = 1
        
Y_testing
Example #51
    gamma = 1/(2*sigma**2)

if (0):
    #%% K-PCA
    # Calculate accumulated variance
    kpca = KernelPCA(kernel="rbf",gamma=gamma)
    kpca.fit_transform(Xtrain)
    eigenvals = kpca.lambdas_[0:220]

    
    # Calculate classifiation scores for each component
    nComponents = np.linspace(1, 500, 100, endpoint=True).astype(int)
    kpcaScores = np.zeros((5, len(nComponents)))
    
    kpca = KernelPCA(n_components = Ntrain,kernel="rbf",gamma=gamma)
    kpca.fit(Xtrain)
    XtrainT = kpca.transform(Xtrain)
    XtestT = kpca.transform(Xtest)
    

    for i in range(len(nComponents)):   
        kpcaScores[:,i] = util.classify(XtrainT[:,:nComponents[i]],XtestT[:,:nComponents[i]],labelsTrain,labelsTest)

    #%% Plot accuracies for kPCA
    plt.figure()
    for i in range (5):
        plt.plot(nComponents,kpcaScores[i,:],lw=3)

    plt.xlim(1,np.amax(nComponents))
    plt.title('kPCA accuracy')
    plt.xlabel('Number of components')
Example #52
File: main.py, Project: andybaoxv/COPDGene
#        mtr_l[j,i] = mtr_l[i,j]
#eig_val, eig_vec = np.linalg.eig(mtr_l)

clf = SpectralClustering(n_clusters=K,affinity='precomputed')
clf.fit(affinity)
labels_predict = clf.labels_

draw_similarity_matrix(affinity,labels_predict,K)

#PCA
gamma = 1.0/(2*sigma_rbf**2)
degree = 3
color = ['b','r','g','m','y','k']

kpca_2 = KernelPCA(n_components=2,kernel='rbf',gamma=gamma,degree=degree)
kpca_2.fit(data_use)
kpca_2_data = kpca_2.fit_transform(data_use)
fig = plt.figure(1)
for i in range(len(labels_predict)):
    plt.scatter(kpca_2_data[i,0],kpca_2_data[i,1],c=color[labels_predict[i]],\
            marker='o')

kpca_3 = KernelPCA(n_components=3,kernel='rbf',gamma=gamma,degree=degree)
kpca_3.fit(data_use)
kpca_3_data = kpca_3.fit_transform(data_use)
fig = plt.figure(2)
ax = fig.add_subplot(111,projection='3d')
for i in range(len(labels_predict)):
    ax.scatter(kpca_3_data[i,0],kpca_3_data[i,1],kpca_3_data[i,2],\
            c=color[labels_predict[i]],marker='o')
Example #53
File: model.py, Project: zaycev/n7
class FeatureSet(object):

    def __init__(self, index_dir,

                 allowed_terms=None,            # list of allowed terms which will be used (need for testing)
                 disallowed_terms=None,         # list of disallowed terms which will be ignored

                 ft_number_of_words=False,      # use number of regular words as feature
                 ft_number_of_hash_tags=False,  # use number of hash-tags as feature
                 ft_number_of_user_names=False, # use number of twitter user names as feature
                 ft_number_of_bad_words=False,  # use number of bad words as feature
                 ft_number_of_links=False,      #
                 ft_number_of_nes=False,        # use number of named entities as feature
                 ft_number_of_punct=False,      #
                 ft_emoticons=False,            #
                 ft_total_hate_score=False,     # use total hate score as feature
                 ft_terms_binary=False,         # use vector space model with binary function as feature
                 ft_terms_tf=False,             # use vector space model with frequency function as feature
                 ft_terms_tfidf=False,          # use vector space model with tfidf function as feature
                 ft_scale=False,

                 terms_max_df=0.5,              # specifies max document frequency in feature selection (normalized)
                 terms_min_df=50,               # specifies min document frequency in feature selection

                 tfidf_model=None,              #

                 pca=False,                     # apply pca to output vector
                 pca_model=None,                #

                 data_n7_dir=N7_DATA_DIR,       #

                 dtype=np.float32,              #

                 verbose=False):                #

        logging.info("GENERATING MODEL")

        self.tweet_id_index_map = dict()
        self.index_tweet_id_map = dict()

        self.index = TextIndex(index_dir)
        self.full_index = TextIndex(index_dir)
        self.full_index.load_terms(0, 1.0)
        self.searcher = Searcher(self.index, terms_min_df, terms_max_df)
        self.verbose = verbose

        self.allowed_terms = allowed_terms
        self.disallowed_terms = disallowed_terms

        self.ft_number_of_words = ft_number_of_words
        self.ft_number_of_hash_tags = ft_number_of_hash_tags
        self.ft_number_of_user_names = ft_number_of_user_names
        self.ft_number_of_bad_words = ft_number_of_bad_words
        self.ft_number_of_nes = ft_number_of_nes
        self.ft_number_of_links = ft_number_of_links
        self.ft_total_hate_score = ft_total_hate_score
        self.ft_terms_binary = ft_terms_binary
        self.ft_terms_tf = ft_terms_tf
        self.ft_terms_tfidf = ft_terms_tfidf
        self.ft_scale = ft_scale
        self.ft_number_of_punct = ft_number_of_punct
        self.ft_emoticons = ft_emoticons

        self.terms_max_df = terms_max_df
        self.terms_min_df = terms_min_df

        self.data_n7_dir = data_n7_dir

        self.tfidf_model = tfidf_model

        self.pca = pca
        self.pca_model = pca_model

        self.dtype = dtype

        self.twitter = TwitterTextUtil()

        if self.allowed_terms:
            allowed_terms = dict()
            for term in self.allowed_terms:
                if term in self.index.term_id_map:
                    allowed_terms[self.index.term_id_map[term]] = term
            self.allowed_terms = allowed_terms
            if self.verbose:
                logging.info("ALLOWED TERMS: %r" % self.allowed_terms)

        # create <term id> :-> <vector index> map
        if ft_terms_binary or ft_terms_tf or ft_terms_tfidf:
            for term_id in self.index.id_term_map.iterkeys():

                if self.allowed_terms is not None:
                    if term_id not in self.allowed_terms:
                        continue

                if self.disallowed_terms is not None:
                    term = self.index.term_id_map.get(term_id)
                    if term in self.disallowed_terms:
                        continue

                new_index_value = len(self.tweet_id_index_map)
                self.index_tweet_id_map[new_index_value] = term_id
                self.tweet_id_index_map[term_id] = new_index_value
                if self.verbose:
                    print "ADDED: %d as %d" % (term_id, new_index_value)

            if self.verbose:
                print self.tweet_id_index_map

            print "\tMODEL: %d terms" % len(self.tweet_id_index_map)

        loader = Loader(data_n7_dir)

        if self.ft_number_of_bad_words:
            self.bad_words = loader.bad_words(add_hashtags=False)
            print "\tMODEL: %d bad words" % len(self.bad_words)

    def text_to_vector(self, text, allow_pca=True):
        tokens = self.index.tokenize(text)
        if self.verbose:
            print tokens
        return self.terms_to_vector(text, tokens, allow_pca=allow_pca)

    def terms_to_vector(self, text, terms, allow_pca=True):
        term_ids = []
        outputs = []

        # PREPROCESSING

        for term in terms:
            term_id = self.index.term_id_map.get(term)
            if term_id is not None:
                term_ids.append(term_id)
        if self.verbose:
            print term_ids

        if self.allowed_terms:
            term_ids = filter(lambda term_id: term_id in self.allowed_terms, term_ids)
            if self.verbose:
                print term_ids

        # COMPUTING FEATURES

        if self.ft_terms_binary:
            bin_vector = self.__ft_bin_vector__(term_ids, terms, scale=self.ft_scale)
            if self.verbose:
                print "bin_vector", bin_vector
            outputs.append(bin_vector)

        if self.ft_terms_tf:
            tf_vector = self.__ft_tf_vector__(term_ids, terms, scale=self.ft_scale)
            if self.verbose:
                print "tf_vector", tf_vector
            outputs.append(tf_vector)

        if self.ft_terms_tfidf:
            tfifd_vector = self.__ft_tfidf_vector__(term_ids, terms, scale=self.ft_scale)
            if self.verbose:
                print "tfifd_vector", tfifd_vector
            outputs.append(tfifd_vector)

        if self.ft_number_of_words:
            number_of_words = self.__ft_number_of_words__(term_ids, terms, scale=self.ft_scale)
            if self.verbose:
                print "number_of_words", number_of_words
            outputs.append(number_of_words)

        if self.ft_number_of_hash_tags:
            number_of_hash_tags = self.__ft_number_of_hash_tags__(term_ids, terms, scale=self.ft_scale)
            if self.verbose:
                print "number_of_hash_tags", number_of_hash_tags
            outputs.append(number_of_hash_tags)

        if self.ft_number_of_user_names:
            number_of_user_names = self.__ft_number_of_user_names__(term_ids, terms, scale=self.ft_scale)
            if self.verbose:
                print "number_of_user_names", number_of_user_names
            outputs.append(number_of_user_names)

        if self.ft_number_of_links:
            number_of_links = self.__ft_number_of_links__(term_ids, terms, scale=self.ft_scale)
            if self.verbose:
                print "number_of_links", number_of_links
            outputs.append(number_of_links)

        if self.ft_number_of_bad_words:
            number_of_bad_words = self.__ft_number_of_bad_words__(term_ids, terms, scale=self.ft_scale)
            if self.verbose:
                print "number_of_bad_words", number_of_bad_words
            outputs.append(number_of_bad_words)

        if self.ft_number_of_punct:
            number_of_punct = self.__ft_number_of_punct__(term_ids, terms, scale=self.ft_scale)
            if self.verbose:
                print "number_of_punct", number_of_punct
            outputs.append(number_of_punct)

        if self.ft_emoticons:
            emoticons_vector = self.__ft_emoticons_vector__(term_ids, terms, scale=self.ft_scale)
            if self.verbose:
                print "emoticons_vector", emoticons_vector
            outputs.append(emoticons_vector)

        outputs = np.concatenate(outputs)
        
        if allow_pca and self.pca:
            print "PCA IS ALLOWED"
            outputs = np.asarray(self.pca_model.transform(outputs)).reshape(-1)
        
        if self.verbose:
            print outputs

        return outputs

    def __scale_array__(self, array):
        return 1 - 1 / (array + 1)

    def __ft_emoticons_vector__(self, term_ids, terms, scale=False):
        vector = np.zeros(2, dtype=self.dtype)
        for term in terms:
            if Sad_RE.match(term):
                vector[0] += 1
            if Happy_RE.match(term):
                vector[1] += 1  # count happy emoticons in the second slot, separately from sad ones
        return vector

    def __ft_bin_vector__(self, term_ids, terms, scale=False):
        vector = np.zeros(len(self.tweet_id_index_map), dtype=self.dtype)
        for term_id in term_ids:
            term_index = self.tweet_id_index_map.get(term_id)
            if term_index is not None:
                vector[term_index] = 1
        return vector

    def __ft_tf_vector__(self, term_ids, terms, scale=False):
        vector = np.zeros(len(self.tweet_id_index_map), dtype=self.dtype)
        for term_id in term_ids:
            term_index = self.tweet_id_index_map.get(term_id)
            if term_index is not None:
                vector[term_index] += 1
        return vector

    def __ft_tfidf_vector__(self, term_ids, terms, scale=False):
        vector = np.zeros(len(self.tweet_id_index_map), dtype=self.dtype)
        for term_id in term_ids:
            term_index = self.tweet_id_index_map.get(term_id)
            if term_index is not None:
                vector[term_index] += 1
        vector = self.tfidf_model.transform(vector).toarray()[0]
        return vector

    def __ft_number_of_words__(self, term_ids, terms, scale=False):
        nw = np.zeros(1, dtype=self.dtype)
        for term in terms:
            is_word = True
            if self.twitter.is_hashtag(term):
                is_word = False
            if self.twitter.is_link(term):
                is_word = False
            if self.twitter.is_username(term):
                is_word = False
            if self.twitter.is_punct(term):
                is_word = False
            if self.verbose:
                if is_word:
                    print "%s is a word" % term
                else:
                    print "%s is not a word" % term
            if is_word:
                nw[0] += 1
        return self.__scale_array__(nw) if scale else nw

    def __ft_number_of_hash_tags__(self, term_ids, terms, scale=False):
        nw = np.zeros(1, dtype=self.dtype)
        for term in terms:
            if self.twitter.is_hashtag(term):
                nw[0] += 1
        return self.__scale_array__(nw) if scale else nw

    def __ft_number_of_user_names__(self, term_ids, terms, scale=False):
        nw = np.zeros(1, dtype=self.dtype)
        for term in terms:
            if self.twitter.is_username(term):
                nw[0] += 1
        return self.__scale_array__(nw) if scale else nw

    def __ft_number_of_links__(self, term_ids, terms, scale=False):
        nw = np.zeros(1, dtype=self.dtype)
        for term in terms:
            if self.twitter.is_link(term):
                if self.verbose:
                    print "%s is is link" % term
                nw[0] += 1
        return self.__scale_array__(nw) if scale else nw

    def __ft_number_of_punct__(self, term_ids, terms, scale=False):
        nw = np.zeros(1, dtype=self.dtype)
        for term in terms:
            if self.twitter.is_punct(term):
                nw[0] += 1
        return self.__scale_array__(nw) if scale else nw

    def __ft_number_of_bad_words__(self, term_ids, terms, scale=False):
        nw = np.zeros(1, dtype=self.dtype)
        for term in terms:
            for bad_w in self.bad_words:
                if len(bad_w) > 3:
                    if bad_w in term:
                        if self.verbose:
                            print "%s is bad word" % term
                        nw[0] += 1
                        break
                else:
                    if bad_w == term:
                        if self.verbose:
                            print "%s is bad word" % term
                        nw[0] += 1
                        break
        return self.__scale_array__(nw) if scale else nw
        
    def fit_pca(self, X, n_components=64, kernel="sigmoid"):
        self.pca_model = KernelPCA(n_components=n_components, kernel=kernel)
        logging.info("FITTING PCA(%s-%d) MODEL FROM %d EXAMPLES" % (kernel, n_components, X.shape[0]))
        self.pca_model.fit(X)
        logging.info("FITTING DONE")
        
    def fit_pca_from_index(self, training_examples=10, n_components=64, kernel="sigmoid"):
        X = self.fm_from_index(training_examples)
        self.fit_pca(X, n_components, kernel)
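
    # A minimal usage sketch (not part of the original pipeline), assuming an
    # already constructed feature-set object `fs` with a populated index:
    #
    #   fs.fit_pca_from_index(training_examples=1000, n_components=64, kernel="sigmoid")
    #   fs.save_pca_model("model_kpca.pkl")
    #   ...
    #   fs.load_pca_model("model_kpca.pkl")
    #   v = fs.text_to_vector("some tweet text")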
        
    def fit_tfidf(self, X):
        self.tfidf_model = FeatureSet.do_fit_tfidf(X)

    @staticmethod
    def do_fit_tfidf(X):
        tfidf_model = TfidfTransformer()
        logging.info("FITTING TFIDF MODEL FROM %d EXAMPLES" % X.shape[0])
        tfidf_model.fit(X)
        logging.info("FITTING DONE")
        return tfidf_model

    def fit_tfidf_from_index(self, training_examples=10):
        X = self.fm_from_index(training_examples)
        self.fit_tfidf(X)
        
    def save_pca_model(self, file_path=None):
        if file_path is None:
            file_path = "%s/models/model_tfidf.pkl" % self.data_n7_dir
        else:
            file_path = "%s/models/%s" % (self.data_n7_dir, file_path)
        joblib.dump(self.pca_model, file_path, compress=9)

    def load_pca_model(self, file_path=None):
        if file_path is None:
            file_path = "%s/models/model_tfidf.pkl" % self.data_n7_dir
        else:
            file_path = "%s/models/%s" % (self.data_n7_dir, file_path)
        self.pca_model = joblib.load(file_path)
        logging.info("LOADED PCA MODEL %r" % self.pca_model)

    def save_tfidf_model(self, file_path=None):
        if file_path is None:
            file_path = "%s/models/model_tfidf.pkl" % self.data_n7_dir
        else:
            file_path = "%s/models/%s" % (self.data_n7_dir, file_path)
        joblib.dump(self.tfidf_model, file_path, compress=9)

    def load_tfidf_model(self, file_path=None):
        if file_path is None:
            file_path = "%s/models/model_tfidf.pkl" % self.data_n7_dir
        else:
            file_path = "%s/models/%s" % (self.data_n7_dir, file_path)
        self.tfidf_model = joblib.load(file_path)
        logging.info("LOADED TFIDF MODEL %r" % self.tfidf_model)
        
    def fm_from_tokens(self, i_tokens, training_examples=10):
        v_size = len(self.text_to_vector("", allow_pca=False))
        logging.info("INITIALIZING %dx%d MATRIX" % (training_examples, v_size))
        X = np.zeros((training_examples, v_size), dtype=self.dtype)
        # X = matrix((training_examples, v_size), dtype=self.dtype)        
        i = 0
        for tokens in i_tokens:
            f_vect = self.terms_to_vector(None, tokens, allow_pca=False)
            print "EXTRACTED %d/%d" % (i, training_examples)
            X[i,:] = f_vect
            i += 1
            if i >= training_examples:
                break
        if self.pca:
            print "APPLYING PCA %r" % self.pca_model
            X = self.pca_model.transform(X)
        if self.verbose:
            print X
        return X
        
        
    def fm_from_index(self, training_examples=10):
        i_vectors = imap(lambda x: x[1], self.searcher.iterate())
        i_tokens = imap(lambda v: [self.full_index.id_term_map[tid] for tid in v], i_vectors)
        return self.fm_from_tokens(i_tokens, training_examples=training_examples)
    
    @staticmethod
    def save_fm(X, file_path=None, sparse=False):
        if file_path is None:
            file_path = "%s/models/X.pkl" % N7_DATA_DIR
        else:
            file_path = "%s/models/%s" % (N7_DATA_DIR, file_path)
        if sparse:
            logging.info("CONVERTING TO SPARSE REPRESENTATION")
            X = sparse_matrix(X, dtype=X.dtype)
        logging.info("SAVING FEATURE MATRIX %r -> %s" % (X.shape, file_path))
        joblib.dump(X, file_path, compress=9)
        
    @staticmethod
    def load_fm(file_path=None):
        if file_path is None:
            file_path = "%s/models/X.pkl" % N7_DATA_DIR
        else:
            file_path = "%s/models/%s" % (N7_DATA_DIR, file_path)
        X = joblib.load(file_path)
        return X
    
    def load_from_csv(self, file_paths, labeled=True):
        Y = []
        texts = []
        i = 1
        for fl_path in file_paths:
            reader = csv.reader(open(fl_path, "rb"))
            for row in reader:
                text = row[-1]
                texts.append(text)
                if labeled:
                    cl = row[0]
                    if cl == "?":
                        Y.append(0)
                    else:
                        if int(cl) > 0:
                            Y.append(1)
                        else:
                            Y.append(0)
            i += 1

        i_tokens = imap(self.index.tokenize, texts)
        X = self.fm_from_tokens(i_tokens, len(texts))

        if labeled:
            return X, Y
        return X
        

    def info(self):
        pass
예제 #54
0
pca_X_new = pca.fit_transform(X)
print 'pca explained', pca.explained_variance_ratio_
print 'pca explained sum', sum(pca.explained_variance_ratio_)
joblib.dump(pca_model, 'pca_model.pkl')
joblib.dump(pca_X_new, 'pca_X_new.pkl')
print pca_model

sparse_pca = SparsePCA(n_components=50)
sparse_pca_model = sparse_pca.fit(sparse_pca_data)
sparse_pca_X_new = sparse_pca.transform(X)
joblib.dump(sparse_pca_model, 'sparse_pca_model.pkl')
joblib.dump(sparse_pca_X_new, 'sparse_pca_X_new.pkl')
print sparse_pca_model

kernel_pca = KernelPCA(n_components=50)
kernel_pca_model = kernel_pca.fit(kernel_pca_data)
kernel_X_new = kernel_pca.transform(X)
joblib.dump(kernel_pca_model, 'kernel_pca_model.pkl')
joblib.dump(kernel_X_new, 'kernel_X_new.pkl')

fast_ica = FastICA(n_components=None)
fast_ica_start = time.time()
fast_ica_model = fast_ica.fit(fast_ica_data)
fast_ica_end = time.time()
print 'fast_ica fit time', fast_ica_end - fast_ica_start
fast_ica_X_new = fast_ica.transform(X)
joblib.dump(fast_ica_model, 'fast_ica_model.pkl')
joblib.dump(fast_ica_X_new, 'fast_ica_X_new.pkl')
print fast_ica_model

'''
예제 #55
0
#   SYNOPSIS:
#       python cmd_model_index.py <dataset size for kPCA> <input matrix file> <output model file>


import sys
import logging

from n7.model import FSetLoader
from sklearn.decomposition import KernelPCA


if __name__ == "__main__":

    logging.basicConfig(level=logging.INFO)

    kpca_size = int(sys.argv[1]) if len(sys.argv) > 1 else 15000
    input_matrix_name = sys.argv[2] if len(sys.argv) > 2 else "X_tfidf.pkl"
    output_model_name = sys.argv[3] if len(sys.argv) > 3 else "model_kpca.pkl"

    loader = FSetLoader()

    X = loader.load_model(input_matrix_name)[0:kpca_size,:]
    model = KernelPCA(n_components=128, kernel="sigmoid")
    logging.info("FITTING PCA on %dx%d examples" % (X.shape[0], X.shape[1]))
    model.fit(X.toarray())
    logging.info("FITTING DONE: %r" % model)
    loader.save_model(model, output_model_name)
    loader.save_model(model.lambdas_, output_model_name + ".ev")
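
    # A possible follow-up (sketch only), assuming load_model returns the saved
    # object exactly as it was passed to save_model (as it appears to above):
    #
    #   kpca = loader.load_model(output_model_name)
    #   X_rest = loader.load_model(input_matrix_name)[kpca_size:, :]
    #   X_rest_reduced = kpca.transform(X_rest.toarray())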

예제 #56
0
predictions = np.concatenate(predictions, axis=0)
acc = sum([prd == sv for (prd, sv) in zip(predictions, titanic["Survived"])]) / float(len(predictions))
print("[Adaboost: Perceptron-RF-GB-LinearSVM-KNN] {0:.2f}%".format(100*acc))

titanic_test = read_dataset(TEST_PATH)
predictions = adaboost_predict(beta, algs, titanic_test)
submission = pd.DataFrame({"PassengerId": titanic_test["PassengerId"], "Survived": predictions})
submission.to_csv(SUBMISSION_PATH, index=False)


# KernelPCA
titanic_test = read_dataset(TEST_PATH)
p = ["Pclass", "Sex", "Age", "Fare", "FamilySize"]
#pairwise_plot(titanic[p], titanic["Survived"])
kpca = KernelPCA(kernel="poly", tol=1e-3, gamma=100)
T_kpca = kpca.fit(titanic_test[p])
X_kpca = kpca.transform(titanic[p])
print("Found {0} columns".format(len(X_kpca[0])))
N = len(X_kpca)
fN = 10
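# keep only the first fN kernel PCA components and z-score each of them across the N samples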
X_kpca = pd.DataFrame({'kpca'+str(j+1): pd.Series(stats.zscore([X_kpca[i][j] for i in range(N)]), index=range(N)) for j in range(fN)})
#pairwise_plot(X_kpca, titanic["Survived"])

alg = RandomForestClassifier(random_state=1, n_estimators=50, min_samples_split=2, min_samples_leaf=2)
alg.fit(X_kpca, titanic["Survived"])
pred = alg.predict(X_kpca)
acc = sum([prd == sv for (prd, sv) in zip(pred, titanic["Survived"])]) / float(len(pred))
print("[KernelPCA] {0:.2f}%".format(100*acc))
scores = cross_validation.cross_val_score(alg, X_kpca, titanic["Survived"], cv=10)
print("[KernelPCA-CV] {0:.2f}%".format(100*scores.mean()))
예제 #57
0
    print '... loading FOLD %d'%fold_id
    fold = pickle.load( open( DATADIR + '/pkl/fold%d_normed.pkl'%(fold_id), "rb" ) )

    X_train, y_train, id_train = load_X_from_fold_to_3dtensor(fold, 'train', NUM_OUTPUT)
    X_test, y_test, id_test = load_X_from_fold_to_3dtensor(fold, 'test', NUM_OUTPUT)

    X_concat_train = np.reshape(X_train, (X_train.shape[0]*X_train.shape[1], X_train.shape[2]), order='C')
    X_concat_test = np.reshape(X_test, (X_test.shape[0]*X_test.shape[1], X_test.shape[2]), order='C')

    np.random.seed(321)
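    # fit the (kernel) PCA model on a random subset of frames only -- presumably to keep
    # the kernel matrix small enough to handle -- and transform the full data afterwards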
    perm = np.random.permutation(X_concat_train.shape[0])
    subset_ind = perm[0:max_nb_samples]
    X_concat_train_SUBSET = X_concat_train[subset_ind]

    start_time = time.time()
    pca_model = pca.fit(X_concat_train_SUBSET)
    print("--- kPCA fitting: %.2f seconds ---" % (time.time() - start_time))

    start_time = time.time()
    pca_X_concat_train = pca_model.transform(X_concat_train)
    print("--- kPCA transforming TRAIN: %.2f seconds ---" % (time.time() - start_time))

    start_time = time.time()
    pca_X_concat_test = pca_model.transform(X_concat_test)
    print("--- kPCA transforming TEST: %.2f seconds ---" % (time.time() - start_time))

    print 'dims: ', pca_X_concat_train.shape, pca_X_concat_test.shape
    new_dim = pca_X_concat_train.shape[1]
    X_train = np.reshape(pca_X_concat_train, (X_train.shape[0], X_train.shape[1], new_dim), order='C')
    X_test = np.reshape(pca_X_concat_test, (X_test.shape[0], X_test.shape[1], new_dim), order='C')
예제 #58
0
        # standardize test data
        melody_concat_test = np.reshape(melody_test, (melody_test.shape[0]*melody_test.shape[1], melody_test.shape[2]), order='C')
        melody_concat_test_normed, _ = standardize(melody_concat_test, scaler)
        # print concat_test_normed.shape
        melody_test_normed = np.reshape(melody_concat_test_normed, (melody_test.shape[0], melody_test.shape[1], melody_test.shape[2]), order='C')
        del melody_concat_test, melody_concat_test_normed

        # concat with the other features
        X_train = np.concatenate((X_train, melody_train_normed), axis=2)
        X_test = np.concatenate((X_test, melody_test_normed), axis=2)

    if usePCA:
        X_concat_train = np.reshape(X_train, (X_train.shape[0]*X_train.shape[1], X_train.shape[2]), order='C')
        X_concat_test = np.reshape(X_test, (X_test.shape[0]*X_test.shape[1], X_test.shape[2]), order='C')
        pca_model = pca.fit(X_concat_train)
        pca_X_concat_train = pca_model.transform(X_concat_train)
        pca_X_concat_test = pca_model.transform(X_concat_test)
        print 'dims: ', pca_X_concat_train.shape, pca_X_concat_test.shape
        reduced_dim = pca_X_concat_train.shape[1]
        X_train = np.reshape(pca_X_concat_train, (X_train.shape[0], X_train.shape[1], reduced_dim), order='C')
        X_test = np.reshape(pca_X_concat_test, (X_test.shape[0], X_test.shape[1], reduced_dim), order='C')
    # print id_test.shape

    # X_train = X_train[0:100,:,:]
    # y_train = y_train[0:100,:,:]

    # X_train = X_train[:,[10,12,13,17,19,82,83,84,85,89,90,91,103,140,142,146,148,212,214,218,220]]
    # X_test = X_test[:,[10,12,13,17,19,82,83,84,85,89,90,91,103,140,142,146,148,212,214,218,220]]
    # X_train = X_train[:,[13,85,103,142,214]]
    # X_test = X_test[:,[13,85,103,142,214]]
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from plot import plotGraph, plotGraph3D
from sklearn.decomposition import PCA, KernelPCA

train_data = pd.read_csv('datasets/train.csv')
train_pts = train_data.drop('Activity', axis=1)
train_labels = train_data['Activity']

# test_data = pd.read_csv('datasets/test.csv')
# test_pts = test_data.drop('Activity', axis=1)
# test_labels = test_data['Activity']

pca = KernelPCA(n_components=100)
train_pca = pca.fit(train_pts,train_labels)
y = train_pca.lambdas_
x = range(1,101)
plt.plot(x,y)
plt.xlabel("No. of components")
plt.ylabel("Eigen values")
plt.title("Data preserved w.r.t no. of components")
plt.show()
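
# A possible follow-up (a sketch, not in the original script): use the cumulative
# share of the eigenvalues to decide how many components are worth keeping, e.g.
# enough to cover 95% of the total eigenvalue mass.
cum_share = np.cumsum(y) / np.sum(y)
n_keep = int(np.searchsorted(cum_share, 0.95)) + 1
print("components covering 95%% of the eigenvalue mass: %d" % n_keep)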

# comp = []
# for i in range(0,100):
#     comp.append('comp'+str(i))
# pca = KernelPCA(n_components=100, kernel='rbf', gamma=0.1)
# train_pca = pca.fit_transform(train_pts,y=train_labels)
# train_pca = train_pca.tolist()
# print(type(train_pca))