def softimpute_used(X, X_incomplete, missing_mask, count_miss):
    """Fill the missing entries of ``X_incomplete`` with SoftImpute.

    A ``SoftImpute`` solver is created with ``convergence_threshold=0.0001``
    and ``max_iters=300``; whatever its ``complete`` method returns for
    ``X_incomplete`` is handed back unchanged.

    ``X``, ``missing_mask`` and ``count_miss`` are accepted but never read
    here. They were only referenced by MSE/RMSE reporting (SoftImpute
    without BiScale) that is disabled in this function.
    """
    solver = SoftImpute(convergence_threshold=0.0001, max_iters=300)
    return solver.complete(X_incomplete)
def _band_rmse(y, y_predict, upper, lower=None):
    """Return the RMSE of ``y_predict`` against ``y`` inside one magnitude band.

    An entry takes part when ``abs(y)`` is strictly below ``upper`` and, if
    ``lower`` is given, strictly above ``lower``. Entries whose magnitude is
    exactly equal to a bound are excluded. An empty band yields ``nan``
    instead of dividing by zero.
    """
    magnitude = np.abs(y)
    in_band = magnitude < upper
    if lower is not None:
        in_band &= magnitude > lower

    n_selected = int(np.count_nonzero(in_band))
    if n_selected == 0:
        # RMSE over zero samples is undefined; report NaN rather than crash.
        return np.nan

    squared_error = (y_predict[in_band] - y[in_band]) ** 2
    return np.sqrt(float(squared_error.sum()) / n_selected)


def softimpute_used_for_cv(X, X_incomplete, missing_mask, count_miss,
                           defined_missing_percent, limit1, limit2,
                           percentile):
    """Complete ``X_incomplete`` with SoftImpute and score it per magnitude band.

    The entries selected by ``missing_mask`` are taken from ``X`` (reference
    values) and from the completed matrix (predictions). They are then split
    into three bands by the absolute reference value, using the thresholds
    stored in ``percentile`` under the keys ``10``, ``5`` and ``2``:

    * key ``10``: ``abs(y) < percentile[10]``
    * key ``5``:  ``percentile[10] < abs(y) < percentile[5]``
    * key ``2``:  ``percentile[5] < abs(y) < percentile[2]``

    All comparisons are strict, so a value lying exactly on a threshold is
    not counted in any band.

    ``count_miss``, ``defined_missing_percent``, ``limit1`` and ``limit2``
    are accepted but not read by this function.

    NOTE(review): assumes ``X`` is a NumPy array and ``missing_mask`` a
    boolean array of the same shape, so the masked selection is 1-D.

    Returns:
        A tuple ``(X_filled, rmse_percentile)``. ``rmse_percentile`` is a
        ``defaultdict(float)`` with the RMSE of each band under the keys
        ``10``, ``5`` and ``2``. A band with no entries has the value ``nan``.
    """
    solver = SoftImpute(convergence_threshold=0.0001, max_iters=300)
    X_filled = solver.complete(X_incomplete)

    y = np.asarray(X[missing_mask])
    y_predict = np.asarray(X_filled[missing_mask])

    rmse_percentile = defaultdict(float)
    rmse_percentile[10] = _band_rmse(y, y_predict, percentile[10])
    rmse_percentile[5] = _band_rmse(y, y_predict, percentile[5],
                                    lower=percentile[10])
    rmse_percentile[2] = _band_rmse(y, y_predict, percentile[2],
                                    lower=percentile[5])
    return (X_filled, rmse_percentile)
def Initialize_X_incomplete(X_incomplete, test_filename, train_filename):
    """Complete ``X_incomplete`` with SoftImpute and locate its missing cells.

    ``test_filename`` and ``train_filename`` are accepted but not read by
    this function.

    Returns:
        A tuple ``(X, missing_mask, count_miss)`` where ``X`` is the result
        of ``SoftImpute.complete`` (``convergence_threshold=0.0001``,
        ``max_iters=300``), ``missing_mask`` is a boolean array that is
        ``True`` wherever ``X_incomplete`` is NaN, and ``count_miss`` is the
        number of such cells as a plain ``int``.
    """
    solver = SoftImpute(convergence_threshold=0.0001, max_iters=300)
    X = solver.complete(X_incomplete)

    # Vectorised NaN detection replaces the element-by-element double loop.
    missing_mask = np.isnan(np.asarray(X_incomplete))
    count_miss = int(missing_mask.sum())
    return (X, missing_mask, count_miss)
def fancy_predict(train, test_data_points, max_rank=8, shrinkage_value=0.02, max_iters=50):
    '''
    Predict values at the requested test positions using FancyImpute's
    dense SoftImpute implementation.

    ``train`` is passed through ``fancy_biscale``; zeros in the scaled matrix
    are replaced by NaN before the matrix is completed. ``test_data_points[0]``
    and ``test_data_points[1]`` are paired up as (row, column) indices.
    For each pair a ``(value, row, column)`` tuple is read from the completed
    matrix, and the list of tuples is returned after going through
    ``fancy_remove_biscale`` with the scaling/centering terms from
    ``fancy_biscale``.
    '''
    scaled, rowscale, colscale, rowcenter, colcenter = fancy_biscale(train)

    # Zero entries are handed to the solver as missing values.
    scaled[scaled == 0] = np.nan

    solver = SoftImpute(
        shrinkage_value=shrinkage_value,
        max_rank=max_rank,
        max_iters=max_iters,
        init_fill_method='zero',
        verbose=False,
    )
    completed = solver.complete(scaled)

    positions = zip(test_data_points[0], test_data_points[1])
    predictions = [(completed[r, c], r, c) for r, c in positions]
    return fancy_remove_biscale(predictions, rowscale, colscale, rowcenter, colcenter)
if l_sn != 0: # The entire neighbourhood matrix (for ith seed)is mat: # mat gives individual columns of neighbourhood matrix at every run mat = np.zeros(shape=(l_sn, n)) for k_ in range(l_sn): mat[k_, :] = X[seed_neighbourhoods[i][k_], :] #Thinning for each neighbourhood associated with a seed mat = thinning(mat, seed, p0) # returning what ? indices or entire sub matrix # perform subspace completion to rank r mat[mat == 0] = np.nan obj = SoftImpute(max_rank=r, verbose=False) subspaces[i] = obj.complete(mat) seed_neighbourhoods = [sn for sn in seed_neighbourhoods if len(sn) != 0] subspaces = [s for s in subspaces if len(s) != 0] print "no of subspaces", len(subspaces) # subspace refinement subspaces = subspacesRefine(subspaces, k, n) print "no of subspaces", len(subspaces) # choose only top k subspaces # subspaces = subspaces[:k] # uncomment the line below to complete the matrix using original basis matrices of the k subspaces
def complete_matrix(X):
    """Return ``X`` completed by a ``SoftImpute`` solver built with default arguments."""
    return SoftImpute().complete(X)
# Disabled export of the KNN-filled matrix, kept for reference:
# pd_filled_knn = pd.DataFrame(X_filled_knn, index = experian.index.tolist(), columns = experian.columns.values.tolist())
# pd_filled_knn = pd.concat([account_id, pd_filled_knn], axis = 1)
# pd_filled_knn.to_csv("ppl_experian_filled_k_closestrows.csv")

# SoftImpute does not solve the nuclear norm objective directly; it induces
# sparsity through singular value thresholding instead.
softImpute = SoftImpute()

# BiScaler normalizes the rows and columns of the observed data at the same
# time, which is sometimes useful for low-rank imputation methods.
biscaler = BiScaler()

# Rescale rows and columns to zero mean and unit variance, impute in the
# normalized space, then map the result back to the original scale.
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)
X_filled_softimpute_normalized = softImpute.complete(X_incomplete_normalized)
X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized)

# Both frames below carry the row and column labels of ``experian``.
row_labels = experian.index.tolist()
column_labels = experian.columns.values.tolist()

pd_missing_mask = pd.DataFrame(
    missing_mask,
    index=row_labels,
    columns=column_labels,
)
pd_filled_softimpute = pd.DataFrame(
    X_filled_softimpute,
    index=row_labels,
    columns=column_labels,
)

# Put the account ids in front of the imputed columns and write the CSV.
pd_filled_softimpute = pd.concat([account_id, pd_filled_softimpute], axis=1)
# pd_filled_softimpute = filled.append(pd_filled_softimpute)
pd_filled_softimpute.to_csv("ppl_experian_filled_softimpute.csv", sep=",")
if (l_sn != 0): # The entire neighbourhood matrix (for ith seed)is mat: # mat gives individual columns of neighbourhood matrix at every run mat = np.zeros((n, l_sn)) for k in range(l_sn): mat[:, k] = np.copy(X[:, seed_neighbourhoods[i][k]]) #Thinning for each neighbourhood associated with a seed mat = thinning(mat, s, n, p0) # returning what ? indices or entire sub matrix # perform subspace completion to rank r mat[mat == 0] = np.nan obj = SoftImpute(max_rank=r, verbose=False) mat2 = obj.complete(mat) subspaces[i] = mat2 seed_neighbourhoods = [s for s in seed_neighbourhoods if len(s) != 0] subspaces = [s for s in subspaces if len(s) != 0] print "no of subspaces", len(subspaces) # subspace refinement subspaces = subspacesRefine(subspaces, k, n) print "no of subspaces", len(subspaces) # choose only top k subspaces subspaces = subspaces[:k]
# matrix completion using convex optimization to find low-rank solution # that still matches observed values. Slow! X_filled_nnm = NuclearNormMinimization().complete(X_incomplete) # Instead of solving the nuclear norm objective directly, instead # induce sparsity using singular value thresholding softImpute = SoftImpute() # simultaneously normalizes the rows and columns of your observed data, # sometimes useful for low-rank imputation methods biscaler = BiScaler() # rescale both rows and columns to have zero mean and unit variance X_incomplete_normalized = biscaler.fit_transform(X_incomplete) X_filled_softimpute_normalized = softImpute.complete(X_incomplete_normalized) X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized) X_filled_softimpute_no_biscale = softImpute.complete(X_incomplete) meanfill_mse = ((X_filled_mean[missing_mask] - X[missing_mask]) ** 2).mean() print("meanFill MSE: %f" % meanfill_mse) # print mean squared error for the three imputation methods above nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean() print("Nuclear norm minimization MSE: %f" % nnm_mse) softImpute_mse = ((X_filled_softimpute[missing_mask] - X[missing_mask]) ** 2).mean() print("SoftImpute MSE: %f" % softImpute_mse) softImpute_no_biscale_mse = (