def test2_normalize():
    """Check normalize_matrix on the 2x2 identity matrix.

    Each column of X holds one 1 and one 0: mean 0.5, variance (0.5)^2,
    hence standard deviation 0.5.  The expected matrix below assumes
    normalize_matrix centers each column and divides it by that
    (population) standard deviation, which maps 1 -> 1 and 0 -> -1.
    """
    X = np.array([[1, 0],
                  [0, 1]])
    # Var(X[:,0]) = (.5)^2 -> SD(X[:,0]) = .5
    # same for second column
    normalized_X = np.array([[1, -1],
                             [-1, 1]])
    assert_allclose(normalize_matrix(X), normalized_X)
########################
# Our thoughts on this matter go as follows:
# First we'd like to run the methods on data with and without outliers (2 points
# have very high leverage), on data that has and hasn't been scaled and
# centered.
# These will be kept in numpy arrays until they need to be put into pandas
# DataFrame format.

# Since we have outliers, the earlier normalizing is strongly influenced by
# them; the standard deviation of the first column shows the scale involved.
# NOTE(review): the result of this expression is discarded -- it only displays
# a value in an interactive session. Confirm whether it is still wanted.
np.std(X_full[:, 0])
X_full_scaled = normalize_matrix(X_full)

# Identifying outliers (via leverage): for our data it works whether we
# approach it in 1 leverage analysis run or 2.
leverage, X_rank = leverage_make(X_full)

# The two largest leverage values mark the points to drop.  Sort once here
# rather than once per element inside the comprehension.
top_two_leverage = sorted(leverage)[-2:]

# Boolean mask: True for rows to keep.  dtype=bool keeps the mask usable as an
# index even when `leverage` is empty (np.array([]) would default to float).
keepers = np.array([x not in top_two_leverage for x in leverage], dtype=bool)

# Data (and matching names) without the two high-leverage points, plus the
# scaled version of that reduced data set.
X_wo = X_full[keepers, :]
names_wo = list(np.array(names_full)[keepers])
X_wo_scaled = normalize_matrix(X_wo)