예제 #1
0
 def __init__(self,
              x,
              y,
              z,
              num_bootstrap=99,
              kernel_type='gauss',
              bandwidth=None,
              index=1,
              seed=1,
              numba=True):
     """Precompute the matrices and bookkeeping used by the test.

     Builds a kernel matrix from ``z`` and pairwise distance matrices from
     ``x`` and ``y``, wraps them in a ``CDCStats`` object and initialises
     the result fields (statistic, bootstrap replicates, p-value) to zero.

     Args:
         x: Data passed through ``utils.as_matrix`` and then
             ``utils.compute_distance_matrix``.
         y: Same treatment as ``x``.
         z: Data passed through ``utils.as_matrix`` to
             ``KernelDensityEstimation``.
         num_bootstrap: Length of the ``permuted_cdcov_stats`` buffer;
             stored as ``self.B``.
         kernel_type: ``'gauss'`` or ``'rectangle'``; any other value is
             silently replaced by ``'gauss'``. It is only stored here and
             is not forwarded to any call in this constructor.
         bandwidth: Forwarded to ``KernelDensityEstimation``.
         index: Forwarded to ``utils.compute_distance_matrix``.
         seed: Stored as ``self.seed``.
         numba: Forwarded to ``CDCStats``.

     Raises:
         AssertionError: If the two distance matrices and the kernel
             matrix do not all have the same shape.
     """
     # Unsupported kernel names fall back to the default instead of failing.
     supported = {'gauss', 'rectangle'}
     self.kernel_type = kernel_type if kernel_type in supported else 'gauss'

     # Kernel matrix derived from z.
     z_matrix = utils.as_matrix(z)
     density = KernelDensityEstimation(z_matrix, bandwidth)
     self.kernel = density.compute_kernel_density_estimate()

     # Pairwise distance matrices for x and y.
     x_matrix = utils.as_matrix(x)
     self.dist_x = utils.compute_distance_matrix(x_matrix, index)
     y_matrix = utils.as_matrix(y)
     self.dist_y = utils.compute_distance_matrix(y_matrix, index)

     # All three matrices must line up before they are combined.
     assert self.dist_x.shape == self.dist_y.shape == self.kernel.shape

     self.stats = CDCStats(self.dist_x,
                           self.dist_y,
                           self.kernel,
                           numba=numba)

     # Result placeholders; they keep these zero values until filled in.
     self.cdcov_stats = 0.
     self.B = num_bootstrap
     self.permuted_cdcov_stats = np.zeros(self.B)
     self.seed = seed
     self.p_value = 0.
예제 #2
0
def cross_validation_with_and_without_manifold(X, y, n_neighbors, n_components, k):
    """Compare K-fold SVM accuracy with and without an Isomap embedding.

    For every fold a shortest-path graph kernel is fitted on the training
    split. Two classifiers are then evaluated on the test split:

    1. an SVM using the kernel matrix directly (``kernel='precomputed'``);
    2. a linear SVM on an Isomap embedding of the distance matrix returned
       by ``compute_distance_matrix`` for that kernel matrix.

    Args:
        X: Graph dataset; must support indexing with the index arrays
            produced by ``KFold.split``.
        y: Labels, indexable the same way as ``X``.
        n_neighbors: Number of neighbours for Isomap.
        n_components: Embedding dimension for Isomap.
        k: Number of cross-validation folds.

    Returns:
        tuple: ``(scores, scores2)``, two lists with one accuracy per fold,
        expressed in percent. ``scores`` is the kernel SVM, ``scores2`` the
        SVM on the Isomap embedding.
    """
    # Split indexes according to K-fold with k splits.
    kf = KFold(n_splits=k)

    scores = []
    scores2 = []
    for train_index, test_index in kf.split(X):
        kernel = GraphKernel(kernel={"name": "shortest_path", "with_labels": False}, normalize=True)

        # Split train and test data of this fold.
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Calculate the kernel matrices.
        K_train = kernel.fit_transform(X_train)
        K_test = kernel.transform(X_test)

        # Baseline: SVM directly on the precomputed kernel.
        clf = svm.SVC(kernel='precomputed', C=4)
        clf.fit(K_train, y_train)
        y_pred_kernel = clf.predict(K_test)
        scores.append(accuracy_score(y_test, y_pred_kernel) * 100)

        # Compute distance matrices from the kernel matrices.
        D_train = compute_distance_matrix(K_train)
        D_test = compute_distance_matrix(K_test)

        # Isomap's hyper-parameters are keyword-only in current scikit-learn,
        # so they must be passed by name (as spk_isomap already does).
        embedding = manifold.Isomap(n_neighbors=n_neighbors,
                                    n_components=n_components,
                                    metric="precomputed")
        E_train = embedding.fit_transform(D_train)
        E_test = embedding.transform(D_test)

        # Linear SVM on the embedded coordinates.
        clf2 = svm.SVC(kernel='linear', C=4)
        clf2.fit(E_train, y_train)
        y_pred_embedded = clf2.predict(E_test)
        scores2.append(accuracy_score(y_test, y_pred_embedded) * 100)

    return scores, scores2
예제 #3
0
# Evaluate an evenly spaced thinning of `chain`: compute three quality
# measures for the retained samples and save them to `output_path`.
# `arguments`, `chain`, `gradient`, `make_imq` and `kmat` are defined
# outside this section.
M = arguments.N_KEEP  # number of samples to keep
burnin = arguments.burnin  # samples before this index are discarded
output_path = arguments.output_path

# Covariance and mean of the post-burn-in samples. np.cov treats rows as
# variables by default, hence the transpose — assumes chain rows are
# samples; TODO confirm.
sigma_chain = np.cov(chain[burnin:].T)
mean_chain = np.mean(chain[burnin:], axis=0)

# M evenly spaced integer positions from `burnin` to the last index
# (both endpoints included).
indexes = np.linspace(burnin, len(chain) - 1, M, dtype=int)

# Discrepancy of the thinned samples against all post-burn-in samples,
# given the Cholesky factor of the covariance and the mean.
star_discrepency = utils.discrepency(chain[indexes], chain[burnin:],
                                     np.linalg.cholesky(sigma_chain),
                                     mean_chain)
# NOTE(review): the kernel is built from the full chain, burn-in included —
# confirm this is intended. `pre='med'` is presumably a median-based
# preconditioner; verify against make_imq.
vfk0 = make_imq(chain, gradient, pre='med')

# This message is only emitted after the kernel above has been built too.
print("Star discrepency done")
# KSD: square root of the mean of the matrix returned by kmat on the
# thinned samples and their gradients.
thin_mat = kmat(chain[indexes], gradient[indexes], vfk0)
KSD_thin = np.sqrt(np.mean(thin_mat))
print("KSD done")
# ED: twice the mean thinned-vs-post-burn-in distance minus the mean
# thinned-vs-thinned distance (presumably an energy-distance-style
# quantity — TODO confirm).
ED_thin = 2*np.mean(utils.compute_distance_matrix(chain[indexes], chain[burnin:])) \
- np.mean(utils.compute_distance_matrix(chain[indexes], chain[indexes]))
print("ED done")

# Collect the results. Because `d` is a dict, np.save stores it as a pickled
# object array, so reading it back needs np.load(..., allow_pickle=True).
d = {
    "thinning": indexes,
    "ED": ED_thin,
    "KSD": KSD_thin,
    "star_discrepency": star_discrepency,
    "burnin": burnin
}
np.save(output_path, d, allow_pickle=True)
예제 #4
0
##OLD CODE
# Script-style pipeline: shortest-path graph kernel -> distance matrix ->
# Isomap embedding of the whole dataset X (X is defined outside this section).

# Unlabelled shortest-path kernel with normalisation enabled.
shortestPathKernel = GraphKernel(kernel={
    "name": "shortest_path",
    "with_labels": False
},
                                 normalize=True)

# Calculate the kernel matrix.
K = shortestPathKernel.fit_transform(X)

# True if K contains at least one NaN; not used again in this section.
nan_elements = np.any(np.isnan(K))

# Compute the distance matrix D
D = compute_distance_matrix(K)

# Embed with Isomap (5 neighbours, 10 components); metric="precomputed"
# means D is passed as a distance matrix rather than as feature vectors.
embedding = manifold.Isomap(n_neighbors=5,
                            n_components=10,
                            metric="precomputed")
X_transformed = embedding.fit_transform(D)

# xs = feature_vectors[:, 0]
# ys = feature_vectors[:, 1]
#
# plt.scatter(xs, ys, c=y)
# plt.show()

# print(np.all(np.isfinite(K_train)))

X_train, X_test, y_train, y_test = train_test_split(X_transformed,
def spk_isomap(X, y, k, KNNstart, KNNend, Dstart, Dend, svmC):
    """Grid-search Isomap settings on top of a shortest-path graph kernel.

    For every combination of neighbour count in ``[KNNstart, KNNend]`` and
    embedding dimension in ``[Dstart, Dend]`` (both ranges inclusive), the
    data is embedded with Isomap and a linear SVM is scored with K-fold
    cross-validation. The mean fold accuracy of each combination is stored
    in the returned array, appended to ``accuracy.txt`` and printed.

    Args:
        X: Graph dataset; must support indexing with the index arrays
            produced by ``KFold.split``.
        y: Labels, indexable the same way as ``X``.
        k: Number of cross-validation folds.
        KNNstart: Smallest number of Isomap neighbours to try.
        KNNend: Largest number of Isomap neighbours to try (inclusive).
        Dstart: Smallest embedding dimension to try.
        Dend: Largest embedding dimension to try (inclusive).
        svmC: ``C`` parameter of the linear SVM.

    Returns:
        numpy.ndarray: Array ``Z`` of shape ``(number of dimensions,
        number of neighbour counts)`` where ``Z[d][knn]`` is the mean
        K-fold accuracy for the ``d``-th dimension and ``knn``-th
        neighbour count.
    """
    filename = "accuracy.txt"

    # Inclusive hyper-parameter grids.
    KNN = list(range(KNNstart, KNNend + 1))
    D = list(range(Dstart, Dend + 1))
    KNNrange = len(KNN)
    Drange = len(D)

    Z = np.empty((Drange, KNNrange))

    # The context manager guarantees the file is closed even if a fit fails.
    with open(filename, 'a') as myfile:
        # Add info to file
        myfile.write('SP Isomap accuracy: K = %d-%d, D = %d-%d, C = %d, K-fold = %d\n'
                     % (KNNstart, KNNend, Dstart, Dend, svmC, k))

        # The kernel and distance matrices depend only on the fold, not on
        # the Isomap settings, so compute them once per fold up front.
        kf = KFold(n_splits=k)
        folds = []
        for train_index, test_index in kf.split(X):
            kernel = GraphKernel(kernel={"name": "shortest_path", "with_labels": False}, normalize=True)

            # Calculate the kernel matrices for this fold.
            K_train = kernel.fit_transform(X[train_index])
            K_test = kernel.transform(X[test_index])

            folds.append((compute_distance_matrix(K_train),
                          compute_distance_matrix(K_test),
                          y[train_index],
                          y[test_index]))

        for knn, n_neighbors in enumerate(KNN):
            for d, n_components in enumerate(D):
                # Start from an empty list for every configuration so the
                # mean below covers only this configuration's folds.
                scores = []

                for D_train, D_test, y_train, y_test in folds:
                    # Embed train and test data with Isomap.
                    embedding = manifold.Isomap(n_neighbors=n_neighbors,
                                                n_components=n_components,
                                                metric="precomputed")
                    E_train = embedding.fit_transform(D_train)
                    E_test = embedding.transform(D_test)

                    # Linear SVM on the embedded coordinates.
                    clf2 = svm.SVC(kernel='linear', C=svmC)
                    clf2.fit(E_train, y_train)

                    # Predict and record the fold accuracy.
                    y_pred = clf2.predict(E_test)
                    scores.append(accuracy_score(y_test, y_pred))

                val = np.mean(scores)
                Z[d][knn] = val
                myfile.write("%f " % (val))
                print("knn = ", n_neighbors, "d = ", n_components, " accuracy = ", Z[d][knn])
                print("{0:.2%} done".format((Drange*knn+d+1.0)/(Drange*KNNrange)))
            # One output row per neighbour count.
            myfile.write("\n")
    return Z
#!/usr/bin/env python
import utils
import bmu
import pickle

# Paths to the BIOM OTU table and the mapping file.
biom_fp = "../data/study_550_closed_reference_otu_table.biom"
map_fp = "../data/study_550_mapping_file.txt"  # not used in this section

# Load the table and compute the distance matrix of its data.
data, sample_ids, otus_names = bmu.load_biom(biom_fp)
dist_mat = utils.compute_distance_matrix(data)

# Use a context manager so the pickle file is flushed and closed.
# NOTE(review): the output name says "study_500" while the inputs are
# "study_550" — confirm the intended file name before changing it.
with open("../data/distances_study_500.pkl", "wb") as dist_file:
    pickle.dump(dist_mat, dist_file)