def central_diff(self, f, epsilon, theta):
    """Estimate the gradient of ``f`` at ``theta`` by central differences.

    Each entry of the matrix held by ``theta`` is perturbed by ``+epsilon``
    and ``-epsilon`` in turn. ``f`` is evaluated on a ``DataFrame`` built
    from each perturbed copy, and ``(f(upper) - f(lower)) / (2 * epsilon)``
    is stored at the matching position of the result.

    Args:
        f: callable that accepts a ``DataFrame`` and returns a value that can
            be stored in a float array (presumably a scalar loss).
        epsilon: half-width of the finite-difference step.
        theta: object whose ``get_matrix()`` returns a 1-D or 2-D array.

    Returns:
        A float array with the same shape as the matrix of ``theta``.

    Raises:
        ValueError: if the matrix is neither 1-D nor 2-D.
    """
    # Diagnostic output kept from the original helper; the parenthesised
    # form parses on both Python 2 and Python 3.
    print(epsilon)

    x = theta.get_matrix()
    if x.ndim not in (1, 2):
        raise ValueError(
            "central_diff expects a 1-D or 2-D matrix, got ndim=%d" % x.ndim)

    # An integer (or boolean) matrix would silently truncate ``+= epsilon``
    # back to the original value, producing a meaningless gradient.
    if not np.issubdtype(x.dtype, np.inexact):
        x = x.astype(float)

    g = np.zeros(x.shape)
    # ndindex walks the entries in the same row-major order as nested loops,
    # and covers the 1-D and 2-D cases with a single code path.
    for idx in np.ndindex(*x.shape):
        upper = x.copy()
        upper[idx] += epsilon
        lower = x.copy()
        lower[idx] -= epsilon
        g[idx] = ((f(DataFrame.from_matrix(upper)) -
                   f(DataFrame.from_matrix(lower))) / (2 * epsilon))
    return g
def test_softmax_reg_loss(self):
    """Check SoftmaxRegression's gradient against a central-difference one.

    Random ``X`` and ``theta`` matrices and a label matrix with exactly one
    ``True`` per row are stored in a fresh ``DataFrame``. The value returned
    by ``SoftmaxRegression(...).g()`` must then be close to the numeric
    gradient of ``SoftmaxRegression(...).f()`` produced by ``central_diff``.
    """
    num_classes = 10
    num_samples, num_features = 5, 8
    step = 1e-4
    penalty = 0.0001

    X_key = ("X/", "X/")
    theta_key = ("theta/", "theta/")
    y_key = ("y/", "y/")

    frame = DataFrame()
    frame[X_key] = DataFrame.from_matrix(
        nprand.rand(num_samples, num_features))
    frame[theta_key] = DataFrame.from_matrix(
        nprand.rand(num_classes, num_features))

    # One randomly chosen class is switched on for every sample.
    labels = np.zeros((num_samples, num_classes), dtype=bool)
    for row in range(num_samples):
        labels[row, nprand.randint(num_classes)] = True
    frame[y_key] = DataFrame.from_matrix(labels)

    def loss_at(theta_df):
        # Loss as a function of the parameters only; X and y stay fixed.
        return SoftmaxRegression(
            theta_df, frame[X_key], frame[y_key], penalty).f()

    numeric = self.central_diff(loss_at, step, frame[theta_key])
    analytic = SoftmaxRegression(
        frame[theta_key], frame[X_key], frame[y_key], penalty).g()
    assert np.allclose(numeric, analytic)
def central_diff(self, f, epsilon, theta):
    """Numerically approximate the gradient of ``f`` at ``theta``.

    For every entry of ``theta.get_matrix()`` two copies of the matrix are
    made, one with ``epsilon`` added to that entry and one with ``epsilon``
    subtracted. Both are wrapped with ``DataFrame.from_matrix`` and passed to
    ``f``; the difference of the two results divided by ``2 * epsilon`` is
    written to the same position of the returned array.

    Args:
        f: callable taking a ``DataFrame``; its result must be storable in a
            float array (presumably a scalar loss).
        epsilon: finite-difference step applied in each direction.
        theta: object exposing ``get_matrix()`` that yields a 1-D or 2-D
            array.

    Returns:
        Float array shaped like the matrix of ``theta``.

    Raises:
        ValueError: if the matrix has a dimensionality other than 1 or 2.
    """
    # Diagnostic output kept from the original helper; written with
    # parentheses so it is valid on Python 2 and Python 3 alike.
    print(epsilon)

    x = theta.get_matrix()
    if x.ndim not in (1, 2):
        raise ValueError(
            "central_diff expects a 1-D or 2-D matrix, got ndim=%d" % x.ndim)

    # Adding a small float to an integer/boolean array element is truncated
    # away, so promote such matrices to float before perturbing them.
    if not np.issubdtype(x.dtype, np.inexact):
        x = x.astype(float)

    g = np.zeros(x.shape)
    # One loop handles both supported ranks; the visiting order matches the
    # original nested ``for i ... for j`` loops.
    for position in np.ndindex(*x.shape):
        upper = x.copy()
        upper[position] += epsilon
        lower = x.copy()
        lower[position] -= epsilon
        slope = (f(DataFrame.from_matrix(upper)) -
                 f(DataFrame.from_matrix(lower))) / (2 * epsilon)
        g[position] = slope
    return g
def test_softmax_reg_loss(self):
    """Verify the softmax-regression gradient with a numeric estimate.

    Builds random ``X`` and ``theta`` matrices plus a boolean label matrix
    holding one ``True`` per row, then asserts that
    ``SoftmaxRegression(...).g()`` is close to the central-difference
    gradient of ``SoftmaxRegression(...).f()``.
    """
    frame = DataFrame()
    step = 1e-4
    penalty = 0.0001
    num_classes = 10
    num_samples, num_features = 5, 8

    y_key = ("y/", "y/")
    theta_key = ("theta/", "theta/")
    X_key = ("X/", "X/")

    features = nprand.rand(num_samples, num_features)
    frame[X_key] = DataFrame.from_matrix(features)
    weights = nprand.rand(num_classes, num_features)
    frame[theta_key] = DataFrame.from_matrix(weights)

    # Mark a single random class per sample.
    targets = np.zeros((num_samples, num_classes), dtype=bool)
    for sample in range(num_samples):
        chosen = nprand.randint(num_classes)
        targets[sample, chosen] = True
    frame[y_key] = DataFrame.from_matrix(targets)

    def objective(theta_df):
        # Only theta varies; X and y are read from the shared frame.
        model = SoftmaxRegression(
            theta_df, frame[X_key], frame[y_key], penalty)
        return model.f()

    numeric_grad = self.central_diff(objective, step, frame[theta_key])
    analytic_grad = SoftmaxRegression(
        frame[theta_key], frame[X_key], frame[y_key], penalty).g()
    assert np.allclose(numeric_grad, analytic_grad)
    # NOTE: a further check that averaged per-sample gradients (batch mode)
    # match the numeric gradient was disabled in the original test and
    # remains unimplemented here.
def test_permutation(self):
    """Permute must reorder rows by the permutation it records.

    After assigning ``Permute`` of a random 3x5 matrix, the permutation is
    read back from ``("auto/row1/", "auto/permutation/")`` and the permuted
    matrix must equal the source rows taken in that order.
    """
    frame = DataFrame()
    source_key = ("row1/", "col1/")
    permuted_key = ("row2/", "col1/")

    source = nprand.rand(3, 5)
    frame[source_key] = DataFrame.from_matrix(source)
    frame[permuted_key] = Permute(frame[source_key])

    # The transform publishes the row order it used under the auto directory.
    order = frame["auto/row1/", "auto/permutation/"].get_matrix().ravel()
    permuted = frame[permuted_key].get_matrix()
    assert (permuted == source[order, :]).all()
def test_dot(self):
    """Dot of two stored matrices must equal numpy's matrix product."""
    frame = DataFrame()
    left_key = ("row1/", "col1/")
    right_key = ("row2/", "col2/")
    product_key = ("row1/", "col2/")

    left = nprand.rand(3, 5)
    right = nprand.rand(5, 8)

    # The two operands are stored through the two different write paths.
    frame[left_key] = DataFrame.from_matrix(left)
    frame[right_key].set_matrix(right)

    frame[product_key] = Dot(frame[left_key], frame[right_key])
    result = frame[product_key].get_matrix()
    assert (result == left.dot(right)).all()
def func(self, target_df, X_df):
    """Return the rows of ``X_df`` in a random order.

    A random permutation of the row positions is drawn and stored, as a
    column vector, in the top-level dataframe of ``X_df`` under
    ``(_auto_dir + <row query>, _auto_dir + "permutation/")``. The result is
    a new ``DataFrame`` holding the permuted rows, built with the row and
    column label keys taken from ``X_df``.
    """
    matrix = X_df.get_matrix()
    order = np.random.permutation(matrix.shape[0])
    row_labels = X_df._row_index.keys()
    col_labels = X_df._col_index.keys()

    # Only the row part of the current query is needed to build the path.
    row_query, _ = X_df.pwd()
    permutation_key = (_auto_dir + row_query, _auto_dir + "permutation/")
    X_df._top_df[permutation_key] = DataFrame.from_matrix(order[:, None])

    shuffled = matrix[order, :]
    return DataFrame.from_matrix(shuffled, row_labels, col_labels)
def func(self, target_df, X_df):
    """Shuffle the rows of ``X_df`` and record the permutation used.

    The permutation (one index per row of the input matrix) is written as a
    single-column matrix to ``X_df._top_df`` at
    ``(_auto_dir + <row query>, _auto_dir + "permutation/")``. The returned
    ``DataFrame`` contains the input rows in permuted order and is created
    with the row/column label keys of ``X_df``.
    """
    data = X_df.get_matrix()
    perm = np.random.permutation(data.shape[0])
    rows = X_df._row_index.keys()
    cols = X_df._col_index.keys()

    # pwd() yields (row_query, col_query); the column part is not used.
    location = X_df.pwd()
    row_query = location[0]
    X_df._top_df[_auto_dir + row_query, _auto_dir + "permutation/"] = (
        DataFrame.from_matrix(perm[:, None]))

    return DataFrame.from_matrix(data[perm, :], rows, cols)
def test_linear(self):
    """Linear(a, X, b, Y) must equal ``a * X + b * Y`` element for element."""
    frame = DataFrame()
    first_key = ("row1/", "col1/")
    second_key = ("row2/", "col2/")
    combined_key = ("row1/", "col2/")

    first = nprand.rand(3, 5)
    second = nprand.rand(3, 5)
    frame[first_key] = DataFrame.from_matrix(first)
    frame[second_key].set_matrix(second)

    a, b = 2, -3
    frame[combined_key] = Linear(a, frame[first_key], b, frame[second_key])

    expected = a * first + b * second
    assert (frame[combined_key].get_matrix() == expected).all()
def test_gd(self):
    """Exercise GD on a parameter matrix that later grows by a batch.

    A random 3x5 matrix is stored under ``col1/batch1/`` and a
    ``GD(SquareTest, ...)`` computation starting from zeros is assigned to
    ``("row2/", "col2/")``. After one second the result must be close to the
    input and be visible under ``col2/batch1/``. A second 3x4 batch is then
    added; after another second both sides must have shape (3, 9) and be
    close again. The computation is stopped at the end.
    """
    frame = DataFrame()
    input_key = ("row1/", "col1/")
    output_key = ("row2/", "col2/")
    first_batch_key = ("row1/", "col1/batch1/")
    second_batch_key = ("row1/", "col1/batch2/")

    first_batch = nprand.rand(3, 5)
    frame[first_batch_key].set_matrix(first_batch)

    start = np.zeros((3, 5))
    frame[output_key] = GD(SquareTest, start, frame[input_key], step_size=1)
    sleep(1)  # give the optimiser time to make progress

    fitted = frame[output_key].get_matrix()
    target = frame[input_key].get_matrix()
    assert np.allclose(fitted, target)

    # Assert that the input structure has been replicated
    replicated = frame["row2/", "col2/batch1/"].get_matrix()
    assert (replicated == frame[output_key].get_matrix()).all()

    # Now attempt to extend the parameter matrix
    second_batch = nprand.rand(3, 4)
    frame[second_batch_key].set_matrix(second_batch)
    sleep(1)

    assert frame[output_key].shape == frame[input_key].shape
    assert frame[output_key].shape == (3, 9)
    fitted = frame[output_key].get_matrix()
    target = frame[input_key].get_matrix()
    assert np.allclose(fitted, target)
    frame[output_key].stop()
def test_zero_mean(self):
    """ZeroMean must subtract the per-column mean from a stored matrix."""
    frame = DataFrame()
    source_key = ("row1/", "col1/")
    centred_key = ("row2/", "col2/")

    data = nprand.rand(3, 5)
    column_means = np.mean(data, axis=0)
    expected = data - column_means

    frame[source_key].set_matrix(data)
    frame[centred_key] = ZeroMean(frame[source_key])

    actual = frame[centred_key].get_matrix()
    assert (actual == expected).all()
def func(self,target_df,X_df, num_bases=50):
    """Return up to ``num_bases`` basis vectors from an SVD of centred data.

    The matrix of ``X_df`` is centred by subtracting its per-column mean and
    decomposed with ``la.svd``. The first ``num_bases`` columns of the
    transposed right factor are returned in a ``DataFrame`` whose rows are
    labelled ``"0" .. "<n_features - 1>"``.

    Args:
        target_df: not used by this function.
        X_df: dataframe whose ``get_matrix()`` gives the data matrix.
        num_bases: maximum number of basis vectors to keep.

    Returns:
        ``DataFrame`` with one column per retained basis vector.
    """
    X = X_df.get_matrix()
    X_zm = X - np.mean(X,axis=0)  # X with 0 mean
    _, _, v_T = la.svd(X_zm)

    # Slicing cannot yield more columns than v_T has rows, so fewer than
    # num_bases vectors may come back (e.g. the default of 50 with few
    # features).
    basis = np.real(v_T.T[:,:num_bases])

    row_labels = [str(i) for i in range(X.shape[1])]
    # Label the columns that actually exist; the previous code always
    # produced num_bases labels, which no longer matched the matrix when
    # fewer basis vectors were available.
    col_labels = [str(i) for i in range(basis.shape[1])]
    return DataFrame.from_matrix(basis,row_labels,col_labels)
def func(self, target_df, X_df, num_bases=50):
    """Compute a basis for ``X_df`` from the SVD of its centred matrix.

    The per-column mean is removed from the matrix of ``X_df``, ``la.svd`` is
    applied, and the leading ``num_bases`` columns of the transposed right
    factor are wrapped in a ``DataFrame``. Rows are labelled with the string
    form of the feature index, columns with the string form of the basis
    index.

    Args:
        target_df: not used by this function.
        X_df: dataframe providing the data through ``get_matrix()``.
        num_bases: upper bound on the number of basis vectors returned.

    Returns:
        ``DataFrame`` holding the retained basis vectors as columns.
    """
    X = X_df.get_matrix()
    X_m = np.mean(X, axis=0)  # mean
    X_zm = X - X_m  # X with 0 mean
    _, _, v_T = la.svd(X_zm)

    # The slice is bounded by the number of rows of v_T, so the result can
    # have fewer than num_bases columns (the default is 50).
    basis = np.real(v_T.T[:, :num_bases])

    row_labels = [str(i) for i in range(X.shape[1])]
    # Derive the column labels from the basis itself so that labels and
    # matrix always agree; previously num_bases labels were emitted even
    # when fewer vectors existed.
    col_labels = [str(i) for i in range(basis.shape[1])]
    return DataFrame.from_matrix(basis, row_labels, col_labels)
def test_permutation(self):
    """Check that Permute applies the permutation it stores.

    The row order is fetched from ``("auto/row1/", "auto/permutation/")``
    and used to index the original random matrix; the outcome must match
    the matrix produced by ``Permute``.
    """
    original_key = ("row1/", "col1/")
    shuffled_key = ("row2/", "col1/")
    frame = DataFrame()

    original = nprand.rand(3, 5)
    frame[original_key] = DataFrame.from_matrix(original)
    frame[shuffled_key] = Permute(frame[original_key])

    stored_permutation = frame["auto/row1/", "auto/permutation/"]
    row_order = stored_permutation.get_matrix().ravel()

    shuffled = frame[shuffled_key].get_matrix()
    expected = original[row_order, :]
    assert (shuffled == expected).all()
def test_dot(self):
    """The Dot transform must reproduce ``M1.dot(M2)`` exactly."""
    lhs_key = ("row1/", "col1/")
    rhs_key = ("row2/", "col2/")
    out_key = ("row1/", "col2/")
    frame = DataFrame()

    lhs = nprand.rand(3, 5)
    rhs = nprand.rand(5, 8)
    frame[lhs_key] = DataFrame.from_matrix(lhs)
    frame[rhs_key].set_matrix(rhs)

    frame[out_key] = Dot(frame[lhs_key], frame[rhs_key])

    expected = lhs.dot(rhs)
    matches = frame[out_key].get_matrix() == expected
    assert matches.all()
def test_sgd(self):
    """Run SGD with SquareTest for one second and compare against ``y``.

    ``SGD`` is started from a fixed 8x2 array with ``batch_size=8`` and
    ``step_size=0.5``; after it is stopped, the stored matrix must be close
    to the 8x1 target ``0..7``.
    """
    # Fixed starting point handed to SGD as its second argument.
    start = np.array([[-44.25076083, 38.62854577],
                      [-38.41473092, 36.29945225],
                      [-31.43300105, 30.79620632],
                      [-21.27706071, 24.08638079],
                      [-14.00259076, 6.54438641],
                      [11.52354442, -6.07783327],
                      [48.69374796, -38.64696136],
                      [95.49682071, -84.38906967]])

    frame = DataFrame()
    result_key = ("row/", "col/")
    frame["xrow/", "xcol/"] = DataFrame.from_matrix(
        np.arange(16).reshape(8, 2))
    frame["yrow/", "ycol/"] = DataFrame.from_matrix(
        np.arange(8).reshape(8, 1))

    # The x view is fetched as in the original test even though only y is
    # passed on to SGD.
    X_df = frame["xrow/", "xcol/"]
    y_df = frame["yrow/", "ycol/"]

    frame[result_key] = SGD(
        SquareTest, start, y_df, batch_size=8, step_size=0.5)
    sleep(1)
    frame[result_key].stop()

    fitted = frame[result_key].get_matrix()
    assert np.allclose(fitted, y_df.get_matrix())
def test_tuple_to_query(self):
    """Round-trip queries through their hashable tuple representation.

    Strings map to themselves, a ``slice`` maps to ``(slice, (start, stop,
    step))`` and a ``list`` maps to ``(list, tuple_of_items)``; both
    conversion directions are checked.
    """
    frame = DataFrame()

    # (hashable form, actual query) pairs
    cases = [
        ("randomstring", "randomstring"),
        ((slice, (2, 4, 1)), slice(2, 4, 1)),
        ((list, (1, 2, 3, 4, 5, 6)), [1, 2, 3, 4, 5, 6]),
    ]

    for hashable, query in cases:
        assert frame._tuple_element_to_query(hashable) == query
    for hashable, query in cases:
        assert frame._query_to_tuple_element(query) == hashable
def test_linear(self):
    """Check Linear against the directly computed linear combination."""
    x_key = ("row1/", "col1/")
    y_key = ("row2/", "col2/")
    out_key = ("row1/", "col2/")
    frame = DataFrame()

    x = nprand.rand(3, 5)
    y = nprand.rand(3, 5)
    frame[x_key] = DataFrame.from_matrix(x)
    frame[y_key].set_matrix(y)

    a = 2
    b = -3
    frame[out_key] = Linear(a, frame[x_key], b, frame[y_key])

    combined = frame[out_key].get_matrix()
    assert (combined == a * x + b * y).all()
def func(self,target_df,a,X_df,b,Y_df,row_labels=None,col_labels=None):
    """Fetch matrices from dataframes, and return the resulting linear
    combination in a dataframe.

    Args:
        target_df: not used by this function.
        a: factor applied to the matrix of ``X_df``.
        X_df: first operand; also the source of default labels.
        b: factor applied to the matrix of ``Y_df``.
        Y_df: second operand; must have the same shape as ``X_df``.
        row_labels: labels for the result rows; defaults to the row index
            keys of ``X_df``.
        col_labels: labels for the result columns; defaults to the column
            index keys of ``X_df``.

    Returns:
        ``DataFrame`` holding ``a * x + b * y``.

    Raises:
        ValueError: if the two matrices differ in shape.
    """
    x = X_df.get_matrix()
    y = Y_df.get_matrix()
    # Fail before doing any other work; broadcasting is deliberately not
    # allowed here.
    if x.shape != y.shape:
        raise ValueError(
            "shape mismatch: %s vs %s" % (x.shape, y.shape))
    # Identity comparison: ``== None`` is evaluated element-wise when the
    # caller passes labels as a numpy array and then has no single truth
    # value.
    if row_labels is None:
        row_labels = X_df._row_index.keys()
    if col_labels is None:
        col_labels = X_df._col_index.keys()
    return DataFrame.from_matrix(a*x+b*y,row_labels,col_labels)
def test_sgd(self):
    """SGD with SquareTest must end up close to the target column.

    The optimiser is started from a hard-coded 8x2 array, left running for
    one second, stopped, and its stored matrix is compared with the 8x1
    matrix ``0..7`` using ``np.allclose``.
    """
    frame = DataFrame()
    target_key = ("row/", "col/")

    # Hard-coded initial value supplied to SGD.
    initial = np.array([
        [-44.25076083, 38.62854577],
        [-38.41473092, 36.29945225],
        [-31.43300105, 30.79620632],
        [-21.27706071, 24.08638079],
        [-14.00259076, 6.54438641],
        [11.52354442, -6.07783327],
        [48.69374796, -38.64696136],
        [95.49682071, -84.38906967],
    ])

    x_matrix = np.arange(16).reshape(8, 2)
    y_matrix = np.arange(8).reshape(8, 1)
    frame["xrow/", "xcol/"] = DataFrame.from_matrix(x_matrix)
    frame["yrow/", "ycol/"] = DataFrame.from_matrix(y_matrix)

    # Both views are looked up as in the original; only y feeds the solver.
    X_df = frame["xrow/", "xcol/"]
    y_df = frame["yrow/", "ycol/"]

    frame[target_key] = SGD(
        SquareTest, initial, y_df, batch_size=8, step_size=0.5)
    sleep(1)
    frame[target_key].stop()

    assert np.allclose(frame[target_key].get_matrix(), y_df.get_matrix())
def test_PCA_basis(self):
    """Check PCABasis and PCA against reference decompositions.

    A centred random matrix is stored and ``PCABasis`` is asked for ``d``
    components. The reference basis is taken from the eigenvectors of the
    sample covariance matrix, ordered by decreasing eigenvalue; every
    returned basis vector must match its reference up to sign. The same
    check is then applied to the projection produced by ``PCA`` for a
    second matrix.

    Three rank-``d`` reconstructions of the data (truncated SVD, projection
    on the right singular vectors, projection on the covariance
    eigenvectors) are also compared with each other as a sanity check of
    the reference itself.
    """
    df = DataFrame()
    M1_path = ("row1/", "col1/")
    M2_path = ("row2/", "col2/")
    n = 10
    m = 5
    d = 3
    M1 = nprand.rand(n, m)
    M1 = M1 - np.mean(M1, axis=0)
    df[M1_path].set_matrix(M1)
    df[M2_path] = PCABasis(df[M1_path], d)

    u, s, v_T = numpy.linalg.svd(M1, full_matrices=False)
    # Keep exactly the d leading singular values. The original ``s[d + 1:]``
    # left d + 1 of them in place, so the result was not a rank-d truncation.
    s[d:] = 0
    v = v_T.T[:, :d]
    M1_reconstructed = u.dot(np.diag(s).dot(v_T))
    M1_reconstructed2 = M1.dot(v).dot(v.T)

    covmat = (1. / (n - 1)) * M1.T.dot(M1)
    evs, evmat = scipy.linalg.eig(covmat)
    p = np.argsort(evs)[::-1]  # decreasing eigenvalue order
    evmat_sorted = evmat[:, p][:, :d]
    M1_reconstructed3 = M1.dot(evmat_sorted).dot(evmat_sorted.T)

    # These were computed but never checked before; all three describe the
    # same rank-d approximation of M1.
    assert np.allclose(M1_reconstructed, M1_reconstructed2)
    assert np.allclose(M1_reconstructed2, M1_reconstructed3)

    basis = df[M2_path].get_matrix()
    for i in range(evmat_sorted.shape[1]):
        # The sign of a basis vector is arbitrary, so accept either one.
        assert np.isclose(basis[:, i], evmat_sorted[:, i]).all() or \
            np.isclose(basis[:, i], -evmat_sorted[:, i]).all()

    M3_path = ("row3/", "col3/")
    M3 = nprand.rand(2 * n, m)
    # NOTE(review): M1 was already centred above, so this subtracts a mean
    # that is (numerically) zero - confirm whether the mean of the
    # uncentred data was intended.
    M3 = M3 - np.mean(M1, axis=0)
    df[M3_path].set_matrix(M3)
    pca_path = ("pca/", "pca/")
    df[pca_path] = PCA(df[M1_path], df[M3_path], d)
    pca = df[pca_path].get_matrix()
    proj = M3.dot(evmat_sorted)
    for i in range(pca.shape[1]):
        assert np.isclose(pca[:, i], proj[:, i]).all() or \
            np.isclose(pca[:, i], -proj[:, i]).all()
def test_tuple_to_query(self):
    """Test conversion of hashable elements to their actual queries.

    Covers a plain string (unchanged in both directions), a slice and a
    list, each converted with ``_tuple_element_to_query`` and back with
    ``_query_to_tuple_element``.
    """
    frame = DataFrame()

    text = "randomstring"
    hashed_slice = (slice, (2, 4, 1))
    real_slice = slice(2, 4, 1)
    hashed_list = (list, (1, 2, 3, 4, 5, 6))
    real_list = [1, 2, 3, 4, 5, 6]

    # hashable form -> query
    to_query = frame._tuple_element_to_query
    assert to_query(text) == text
    assert to_query(hashed_slice) == real_slice
    assert to_query(hashed_list) == real_list

    # query -> hashable form
    to_hashable = frame._query_to_tuple_element
    assert to_hashable(text) == text
    assert to_hashable(real_slice) == hashed_slice
    assert to_hashable(real_list) == hashed_list
def test_one_hot_encoding(self):
    """OneHotEncoding must set a single 1 per row at the encoded column.

    The input column consists of 10 random class codes followed by each of
    the 5 class codes once; the expected output has a 1 at
    ``[row, code]`` and 0 elsewhere.
    """
    frame = DataFrame()
    codes_key = ("row1/", "col1/")
    encoded_key = ("row2/", "col2/")
    num_random = 10
    num_classes = 5
    total = num_random + num_classes

    random_codes = nprand.randint(0, num_classes, (num_random, 1))
    every_code = np.arange(num_classes).reshape(num_classes, 1)
    codes = np.vstack([random_codes, every_code])

    # One entry per row, at the column named by that row's code.
    expected = np.zeros((total, num_classes))
    expected[np.arange(total), codes.ravel()] = 1

    frame[codes_key].set_matrix(codes)
    frame[encoded_key] = OneHotEncoding(frame[codes_key])
    assert (frame[encoded_key].get_matrix() == expected).all()
def func(self, target_df, a, X_df, b, Y_df, row_labels=None,
         col_labels=None):
    """Fetch matrices from dataframes, and return the resulting linear
    combination in a dataframe.

    Args:
        target_df: not used by this function.
        a: scalar multiplied with the matrix of ``X_df``.
        X_df: first operand; its index keys provide the default labels.
        b: scalar multiplied with the matrix of ``Y_df``.
        Y_df: second operand; its matrix must match ``X_df`` in shape.
        row_labels: optional row labels for the result.
        col_labels: optional column labels for the result.

    Returns:
        ``DataFrame`` containing ``a * x + b * y``.

    Raises:
        ValueError: if the shapes of the two matrices differ.
    """
    x = X_df.get_matrix()
    y = Y_df.get_matrix()
    # Reject mismatched operands up front instead of relying on
    # broadcasting.
    if x.shape != y.shape:
        raise ValueError(
            "shape mismatch: %s vs %s" % (x.shape, y.shape))
    # ``is None`` rather than ``== None``: the equality form is applied
    # element-wise to numpy label arrays and cannot be used as a condition.
    if row_labels is None:
        row_labels = X_df._row_index.keys()
    if col_labels is None:
        col_labels = X_df._col_index.keys()
    return DataFrame.from_matrix(a * x + b * y, row_labels, col_labels)
def func(self, target_df, X_df):
    """Wrap the matrix of ``X_df`` in a new ``DataFrame``.

    ``target_df`` is not used.
    """
    matrix = X_df.get_matrix()
    return DataFrame.from_matrix(matrix)
def test_simple_query(self):
    """Directory and slice lookups must all count as simple queries."""
    frame = DataFrame.from_matrix(np.arange(6).reshape(2, 3))

    # The frame itself.
    assert frame._is_simple_query()

    # A directory lookup.
    directory = frame["row/", "col/"]
    assert directory._is_simple_query()

    # A directory lookup followed by a full slice.
    everything = frame["row/", "col/"][:, :]
    assert everything._is_simple_query()

    # A directory lookup followed by a bounded slice.
    window = frame["row/", "col/"][0:1, 2:3]
    assert window._is_simple_query()
def func(self, target_df, X_df, Y_df):
    """Return the matrix product of ``X_df`` and ``Y_df`` as a dataframe.

    The result takes its row labels from the row index keys of ``X_df`` and
    its column labels from the column index keys of ``Y_df``. ``target_df``
    is not used.
    """
    left = X_df.get_matrix()
    right = Y_df.get_matrix()
    rows = X_df._row_index.keys()
    cols = Y_df._col_index.keys()
    product = left.dot(right)
    return DataFrame.from_matrix(product, rows, cols)
def func(self, target_df, X_df):
    """Return the matrix of ``X_df`` with its per-column mean removed.

    ``target_df`` is not used. The result is built with
    ``DataFrame.from_matrix`` without passing any labels.
    """
    data = X_df.get_matrix()
    # Mean over axis 0, i.e. one value per column.
    centred = data - np.mean(data, axis=0)
    return DataFrame.from_matrix(centred)
def test_simple_query(self):
    """``_is_simple_query`` must hold for the frame and for sliced views."""
    base = DataFrame.from_matrix(np.arange(6).reshape(2, 3))
    assert base._is_simple_query()

    sub_frame = base["row/", "col/"]
    assert sub_frame._is_simple_query()

    # Full slice on top of a directory lookup.
    full_view = base["row/", "col/"][:, :]
    assert full_view._is_simple_query()

    # Bounded slice on top of a directory lookup.
    partial_view = base["row/", "col/"][0:1, 2:3]
    assert partial_view._is_simple_query()
def test_setitem(self):
    """Exercise ``__setitem__`` with full slices, labels and a scalar.

    Covers assignment to an empty frame without labels, with explicit
    ``rows``/``cols`` labels, through a sub-directory view, and finally a
    scalar assignment that must overwrite every entry.
    """
    everything = (slice(None), slice(None))
    values = np.arange(6).reshape(2, 3)

    # Unlabelled assignment to an empty frame.
    frame = DataFrame()
    frame.__setitem__(everything, values)
    assert (frame.get_matrix() == values).all()

    # Same assignment with explicit labels.
    frame = DataFrame()
    frame.__setitem__(everything, values,
                      rows=list("ab"), cols=list("cde"))
    assert (frame.get_matrix() == values).all()

    # Assignment through a directory view.
    frame = DataFrame()
    frame["x/", "y/"].__setitem__(everything, values,
                                  rows=list("ab"), cols=list("cde"))
    assert (frame.get_matrix() == values).all()
    assert (frame["x/", "y/"].get_matrix() == values).all()

    # Scalar assignment through the same view.
    frame["x/", "y/"].__setitem__(everything, 2)
    assert (frame.get_matrix() == 2).all()
def func(self,target_df,X_df,Y_df):
    """Multiply the matrices of ``X_df`` and ``Y_df``.

    Rows of the returned ``DataFrame`` are labelled with the row index keys
    of ``X_df``, columns with the column index keys of ``Y_df``.
    ``target_df`` is not used.
    """
    lhs = X_df.get_matrix()
    rhs = Y_df.get_matrix()
    result_rows = X_df._row_index.keys()
    result_cols = Y_df._col_index.keys()
    return DataFrame.from_matrix(lhs.dot(rhs),result_rows,result_cols)
def func(self,target_df,X_df):
    """Subtract the per-column mean from the matrix of ``X_df``.

    The centred matrix is returned in a new ``DataFrame`` created without
    explicit labels. ``target_df`` is not used.
    """
    values = X_df.get_matrix()
    column_means = np.mean(values, axis=0)
    return DataFrame.from_matrix(values - column_means)
def test_setitem(self):
    """Check the different ways of calling ``DataFrame.__setitem__``.

    A 2x3 matrix is assigned over the full slice of a fresh frame (with and
    without labels) and through the ``("x/", "y/")`` view; a scalar is then
    assigned through that view and must replace every value.
    """
    full = (slice(None, None, None), slice(None, None, None))
    M = np.arange(6).reshape(2, 3)

    def labels():
        # Fresh label lists for every call, as in the original test.
        return {"rows": ["a", "b"], "cols": ["c", "d", "e"]}

    plain = DataFrame()
    plain.__setitem__(full, M)
    assert (plain.get_matrix() == M).all()

    labelled = DataFrame()
    labelled.__setitem__(full, M, **labels())
    assert (labelled.get_matrix() == M).all()

    nested = DataFrame()
    nested["x/", "y/"].__setitem__(full, M, **labels())
    assert (nested.get_matrix() == M).all()
    assert (nested["x/", "y/"].get_matrix() == M).all()

    nested["x/", "y/"].__setitem__(full, 2)
    assert (nested.get_matrix() == 2).all()
def func(self,target_df, X_df):
    """Build a new ``DataFrame`` from the matrix held by ``X_df``.

    ``target_df`` is not used.
    """
    source_matrix = X_df.get_matrix()
    copy_df = DataFrame.from_matrix(source_matrix)
    return copy_df