Exemplo n.º 1
0
def get_zscore_matrix(M,L):
    """Return the z-scores of M, standardized against the reference matrix L.

    Centering and scaling are delegated to ``scale_and_center`` with
    ``reference_matrix=L`` and ``scale=True``, so the statistics come from
    L rather than from M itself; the intent is that outliers present in M
    do not influence them.

    A value of 0 in M denotes missing data, so whatever z-score was computed
    at such a position is meaningless.  Those positions are overwritten with
    0 in the result to stay consistent with the "0 means missing" convention.

    NOTE(review): the result of ``scale_and_center`` is modified in place and
    returned; this presumes it yields a fresh array with the same shape as M
    -- confirm against its definition.
    """
    zscores = scale_and_center(M, reference_matrix=L, scale=True)

    # Blank out every position where M holds no observation.
    zscores[where(M == 0)] = 0

    return zscores
Exemplo n.º 2
0
    data_matrix = column_stack(vectors)
    if(robust):
        
        if(gamma=="tune"):
            gamma, tol_perc, num_guesses, hi_num_pcs, L, C = increasing_tolerance_search(vectors)
            (weekday, hour) = key
            logMsg("Successfully tuned %s @ %d  after %d guesses : gamma=%f, tol=%f"%(weekday, hour, num_guesses, gamma, tol_perc))
        else:
            O = (data_matrix!=0)*1 # Observation matrix - 1 where we have data, 0 where we do not
             # Use outlier pursuit to get robust low-rank approximation of data
            L,C,term,n_iter = opursuit(data_matrix, O, gamma, tol_perc=tol_perc)
        
        
        #logMsg("PCA")
        # Perform PCA on the low-rank approximation, and estimate the statistics
        centered_L = scale_and_center(L, scale=False)
        pcs, robust_lowdim_data = pca(centered_L, k)
        num_pca_dimensions = pcs.shape[1]
        logMsg("Num eigenvalues : %d" % num_pca_dimensions)
        
        
        centered_corrupt = scale_and_center(L+C, reference_matrix=L, scale=False)
        
        stdout.flush()
        mahals5 = lowdim_mahalanobis_distance(pcs, robust_lowdim_data, centered_corrupt, 5)
        mahals10 = lowdim_mahalanobis_distance(pcs, robust_lowdim_data, centered_corrupt,10)
        mahals20 = lowdim_mahalanobis_distance(pcs, robust_lowdim_data, centered_corrupt, 20)
        mahals50 = lowdim_mahalanobis_distance(pcs, robust_lowdim_data, centered_corrupt, 50)