Example No. 1
def __getitem__(self, index):
    # Relies on helpers defined elsewhere: standardize, crop_boundaries,
    # crop_center, and to_categorical (e.g. keras.utils).
    dates = self.get_dates(index)
    # Stack every input variable along a new trailing channel axis,
    # optionally standardizing and zero-filling missing values.
    x = np.concatenate(
        [np.expand_dims(
            (standardize(v.sel(time=dates), v.name) if self.standardize
             else v.sel(time=dates)).fillna(0).values, -1)
         for v in self.variables],
        axis=-1)
    if x.ndim == 5:
        x = x.squeeze(axis=1)
    x = crop_center(crop_boundaries(x),
                    (len(dates), *self.in_size, len(self.varnames)))
    if self.truth_filename:
        # Reverse the second axis (likely a latitude flip) of the truth grid.
        y = self.truth_variable.sel(time=dates)[:, ::-1, :].values
        # One-hot targets carry 5 class channels; otherwise a single channel.
        n_channels = 5 if self.onehot else 1
        y = to_categorical(y, 5) if self.onehot else np.expand_dims(y, axis=-1)
        y = crop_center(crop_boundaries(y),
                        (len(dates), *self.in_size, n_channels))
        return x, y
    return x
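The crop_boundaries and crop_center helpers are not shown in this example. Below is a minimal sketch of what crop_center plausibly does, assuming it slices each axis symmetrically around its midpoint to reach a target shape; the name and signature are taken from the call above, but the implementation is a guess:

import numpy as np

def crop_center(arr, target_shape):
    # Hypothetical sketch: slice every axis symmetrically so the array
    # ends up with target_shape (no padding if an axis is already smaller).
    slices = []
    for dim, want in zip(arr.shape, target_shape):
        start = max((dim - want) // 2, 0)
        slices.append(slice(start, start + want))
    return arr[tuple(slices)]

# e.g. crop an (8, 100, 100, 3) batch down to (8, 64, 64, 3)
patch = crop_center(np.zeros((8, 100, 100, 3)), (8, 64, 64, 3))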
Example No. 2
import numpy as np

import normalize  # local module providing standardize()


def PCA(encodings):
    """
    Performs PCA on the encodings.
    First, it computes the mean of the data set along each dimension and
    centers the data by subtracting that mean from each data point, then
    scales each feature to unit variance; this process is standardization.
    It then computes the eigenvalues/eigenvectors of the covariance matrix
    via eigenvalue decomposition.
    """
    # prefix and capacity are module-level settings defined elsewhere.
    print("Calculating normalization")
    encodings = normalize.standardize(encodings)
    np.save(prefix + str(capacity) + "normalized", encodings)

    print("Calculating covariance")
    # np.cov treats rows as variables (features) by default (rowvar=True).
    cov_mat = np.cov(encodings)
    np.save(prefix + str(capacity) + "cov_matrix", cov_mat)

    print("Calculating eigendecomposition")
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)
    # Sort eigenpairs by descending eigenvalue; eigenvectors are columns.
    idx = eig_vals.argsort()[::-1]
    eig_vals = eig_vals[idx]
    eig_vecs = eig_vecs[:, idx]

    np.save(prefix + str(capacity) + "eig_vals", eig_vals)
    np.save(prefix + str(capacity) + "eig_vecs", eig_vecs)

    return eig_vals, eig_vecs
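Because the covariance matrix is symmetric, np.linalg.eigh is the numerically safer choice for this decomposition: it returns real eigenvalues in ascending order, so a single reversal yields the descending ranking the code above builds by hand. A minimal sketch, assuming encodings is a (features x samples) array as implied by np.cov's default rowvar=True:

import numpy as np

def pca_eigh(encodings):
    # Center each feature (row), then decompose the symmetric covariance
    # matrix with eigh; reverse the ascending order for a descending ranking.
    centered = encodings - encodings.mean(axis=1, keepdims=True)
    eig_vals, eig_vecs = np.linalg.eigh(np.cov(centered))
    order = eig_vals.argsort()[::-1]
    return eig_vals[order], eig_vecs[:, order]

rng = np.random.default_rng(0)
vals, vecs = pca_eigh(rng.normal(size=(16, 200)))
print(vals[:3])  # the three largest variances along principal axes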
Example No. 3
import time
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

from graph_pca_info import plot_pca_info
from normalize import norm, standardize

# import training and testing data #

filePath = "X.dat"
# np.loadtxt accepts a file path directly
allData = np.loadtxt(filePath, delimiter=',')
# first half of the rows is the training split; the labels are in the last column
Xtrain = standardize(allData[0:int(len(allData) / 2), 0:-1])
ytrain = allData[0:int(len(allData) / 2), -1]

# n_components for dimensionality reduction, chosen from the graph_pca_info plots
n_components = 40
# n_neighbors for knn; sklearn's default value is 5
knn_neighbors = 5

# dimensionality reduction; swap in whichever method you choose:
# lle(n_components, neighbors=(n_components * (n_components + 3) / 2) + 1, hessian=True)
# lle(n_components, neighbors=(n_components * (n_components + 3) / 2) + 1)
# componentsPca(n_components)
# kpca(n_components)

# time.clock() was removed in Python 3.8; perf_counter() is its successor.
start_time = time.perf_counter()
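The lle/componentsPca/kpca helpers referenced above are project-local and not shown. As one plausible completion, the sketch below swaps in sklearn's own PCA and runs the kNN classifier that the knn_neighbors setting anticipates; the test-split handling mirrors the training split above:

from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

pca = PCA(n_components=n_components)
Xtrain_reduced = pca.fit_transform(Xtrain)

knn = KNeighborsClassifier(n_neighbors=knn_neighbors)
knn.fit(Xtrain_reduced, ytrain)

# Second half of the file is the test split; ideally standardize() would
# reuse the training-set statistics rather than refit on the test data.
Xtest = standardize(allData[int(len(allData) / 2):, 0:-1])
ytest = allData[int(len(allData) / 2):, -1]
print("kNN accuracy:", knn.score(pca.transform(Xtest), ytest))
print("elapsed:", time.perf_counter() - start_time, "s")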
Example No. 4
    # Fragment of a larger function; assumes numpy as np, sklearn's svm,
    # joblib, and the local normalize module are imported at module level.
    k_eig_vals = eig_vals[:top_k]
    # Eigenvectors are the *columns* of eig_vecs (see Example No. 2), so the
    # top-k selection must slice columns, not rows.
    k_eig_vecs = eig_vecs[:, :top_k]

    # for i in range(len(k_eig_vals)):
    #     print(i, k_eig_vals[i])

    normalized = encodings

    # Project the (features x samples) data onto the top-k principal axes.
    final = np.dot(k_eig_vecs.T, normalized)

    # sklearn expects one sample per row, so pass each encoding row-wise.
    final_t = np.transpose(final)
    clf = svm.SVC()
    clf.fit(final_t, stars)

    # Save our model
    joblib.dump(clf, "pca_clf.pkl")

    print("Training score:", clf.score(final_t, stars))

    # Evaluate on the held-out test set
    test_encodings, test_stars = normalize.read_data("test.json", dictionary, 0, 10000)

    # Standardize and project with the same top-k eigenvectors
    test_encodings = normalize.standardize(test_encodings)
    final_test = np.dot(k_eig_vecs.T, test_encodings)
    final_test_t = np.transpose(final_test)
    print("Testing score:", clf.score(final_test_t, test_stars))