def test_matrix_factorization_with_low_rank_random_matrix():
    solver = MatrixFactorization(rank=3, l1_penalty=0, l2_penalty=0)
    XY_completed = solver.complete(XY_incomplete)
    _, missing_mae = reconstruction_error(XY,
                                          XY_completed,
                                          missing_mask,
                                          name="MatrixFactorization")
    assert missing_mae < 0.01, "Error too high!"
def test_matrix_factorization_with_low_rank_random_matrix():
    solver = MatrixFactorization(learning_rate=0.01,
                                 rank=3,
                                 l2_penalty=0,
                                 min_improvement=1e-6)
    XY_completed = solver.complete(XY_incomplete)
    _, missing_mae = reconstruction_error(XY,
                                          XY_completed,
                                          missing_mask,
                                          name="MatrixFactorization")
    assert missing_mae < 0.1, "Error too high!"
def deal_mar(df):
    """Deal with missing data with a missing-at-random (MAR) pattern."""
    Xy_incomplete = df.values
    # knn
    with NoStdStreams():
        Xy_filled_knn = KNN().fit_transform(Xy_incomplete)
    score_knn = compute_imputation_score(Xy_filled_knn)
    print("Imputation score of knn is {}".format(score_knn))
    # matrix factorization
    with NoStdStreams():
        Xy_filled_mf = MatrixFactorization().fit_transform(Xy_incomplete)
    score_mf = compute_imputation_score(Xy_filled_mf)
    print("Imputation score of matrix factorization is {}".format(score_mf))
    # multiple imputation
    with NoStdStreams():
        Xy_filled_ii = IterativeImputer().fit_transform(Xy_incomplete)
    score_ii = compute_imputation_score(Xy_filled_ii)
    print("Imputation score of multiple imputation is {}".format(score_ii))
    score_dict = {'knn': score_knn,
                  'matrix factorization': score_mf,
                  'multiple imputation': score_ii}
    print("Imputation method with the highest score is {}".format(
        max(score_dict, key=score_dict.get)))
    recommend = max(score_dict, key=score_dict.get)
    return recommend
def MatrixFactorizationImpute(X_incomplete, X_complete, M, p, rank=10, pen=1e-5):
    X_incomplete_new = replace0tonan(X_incomplete, M)
    start_time = time.time()
    X_incomplete_new = np.array(X_incomplete_new)
    X_complete = np.array(X_complete)
    M = np.array(M)
    X_filled_mf = MatrixFactorization(learning_rate=0.01,
                                      rank=rank,
                                      l2_penalty=pen).fit_transform(X_incomplete_new)
    X_filled_mf[np.where(np.isnan(X_filled_mf))] = 0
    mse = np.sum(np.square(X_filled_mf - X_complete)) / np.sum(1 - M)
    print(str(rank) + "_" + str(pen))
    print(mse)
    print("MatrixFactorizationImpute took %4.4f seconds" % (time.time() - start_time))
    print("\n")
    paras = str(p) + "/" + str(rank) + "_" + str(pen)
    path = "mfimpute" + "/" + paras + "/"
    if not os.path.exists(path):
        os.makedirs(path)
    f = open(os.path.join(path, str(mse)), "w")
    f.write(str(mse))
    f.close()
def fit(self, df):
    logging.info("Initializing Pipeline")
    isTraining = True
    self.isTraining = isTraining
    self.adf = df
    self.df = df.copy()
    self.numeric_cols = getNumericColumns(df)
    self.cat_cols = getCategorialColumns(df)
    self.DEPENDENT_VARIABLE = getDependentVariable()
    self.cat_cols_useless = ["encounter_id", "hospital_id", "patient_id", "icu_id"]
    self.cat_cols_minus = [c for c in self.cat_cols
                           if c not in ["clusterId", "hospital_death",
                                        "encounter_id", "hospital_id", "patient_id"]]
    self.cat_cols_minus_useless = [c for c in self.cat_cols
                                   if c not in ["clusterId", "encounter_id", "hospital_id",
                                                "patient_id", "icu_id"]]
    self.cols_to_dummy = [c for c in self.cat_cols_minus_useless if c != "hospital_death"]
    self.num_mean = SimpleImputer(strategy="median")
    self.cat_freq = SimpleImputer(strategy="most_frequent")
    self.rs = RobustScaler()
    self.pt = PowerTransformer()
    self.ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
    self.outlierKNN = KNN()
    self.num_means = [MatrixFactorization() for i in range(4)]
    self.cat_freqs = [SimpleImputer(strategy="most_frequent") for i in range(4)]
    # self.label_encoders = defaultdict(LabelEncoder)
    self.label_encoders = WOEEncoder()
    self.later_num_transform = PowerTransformer()
    self.X = self.df.drop([self.DEPENDENT_VARIABLE], axis=1)
    self.y = self.df[self.DEPENDENT_VARIABLE]
    return self.GetTransformedData(isTraining)
def prep_clfs(feature):
    if feature == 'tmcq':
        log_reg_clf = make_pipeline(LogisticRegression(random_state=56))
        rf_clf = make_pipeline(RandomForestClassifier(n_jobs=-1, random_state=56))
        gb_clf = make_pipeline(GradientBoostingClassifier(random_state=56))
        xgb_clf = make_pipeline(
            XGBClassifier(max_depth=3, learning_rate=0.1, random_state=56, n_jobs=-1))
    else:
        log_reg_clf = make_pipeline(ImputeTransform(strategy=MatrixFactorization()),
                                    LogisticRegression(random_state=56))
        rf_clf = make_pipeline(ImputeTransform(strategy=MatrixFactorization()),
                               RandomForestClassifier(n_jobs=-1, random_state=56))
        gb_clf = make_pipeline(ImputeTransform(strategy=MatrixFactorization()),
                               GradientBoostingClassifier(random_state=56))
        xgb_clf = make_pipeline(
            ImputeTransform(strategy=MatrixFactorization()),
            XGBClassifier(max_depth=3, learning_rate=0.1, random_state=56, n_jobs=-1))
    classifier_dict = {
        'LogReg': {'clf': log_reg_clf},
        'RandomForest': {'clf': rf_clf},
        'GradientBoosting': {'clf': gb_clf},
        'XGB': {'clf': xgb_clf}
    }
    return classifier_dict
def complete(self, data: pd.DataFrame):
    df = data.copy()
    cols = list(df)
    # Cap the factorization rank at the number of columns.
    if len(cols) < self.rank:
        self.rank = len(cols)
    df = pd.DataFrame(
        MatrixFactorization(rank=self.rank, verbose=False).fit_transform(df))
    df.columns = cols
    return df
def clean_missing(df, features, setting):
    """Clean missing values in the dataset.

    Parameters
    ----------
    df : DataFrame
    features : List
        List of feature names.
    setting : str
        Assumed missing mechanism: 'mcar', 'mar' or 'mnar'.

    Returns
    -------
    features_new : List
        List of feature names after cleaning.
    Xy_filled : array-like
        Numpy array where missing values have been cleaned.
    """
    df_preprocessed, features_new = missing_preprocess(df, features)
    if setting == 'mcar':
        recommend = deal_mcar(df_preprocessed)
    elif setting == 'mar':
        recommend = deal_mar(df_preprocessed)
    elif setting == 'mnar':
        recommend = deal_mnar(df_preprocessed)
    else:
        print("Default MAR")
        recommend = deal_mar(df_preprocessed)
    if recommend == 'mean':
        print("Applying mean imputation ...")
        Xy_filled = Imputer(missing_values=np.nan,
                            strategy='mean').fit_transform(df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'mode':
        print("Applying mode imputation ...")
        Xy_filled = Imputer(missing_values=np.nan,
                            strategy='most_frequent').fit_transform(df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'knn':
        print("Applying knn imputation ...")
        with NoStdStreams():
            Xy_filled = KNN().fit_transform(df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'matrix factorization':
        print("Applying matrix factorization ...")
        with NoStdStreams():
            Xy_filled = MatrixFactorization().fit_transform(df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'multiple imputation':
        print("Applying multiple imputation ...")
        with NoStdStreams():
            Xy_filled = IterativeImputer().fit_transform(df_preprocessed.values)
        print("Missing values cleaned!")
    else:
        print("Error: Approach not available!")
    return features_new, Xy_filled
def test_matrix_factorization_with_low_rank_random_matrix():
    initialize_random_seed()  # for reproducibility
    solver = MatrixFactorization(learning_rate=0.01,
                                 rank=3,
                                 l2_penalty=0,
                                 min_improvement=1e-6,
                                 verbose=False)
    XY_completed = solver.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(XY,
                                          XY_completed,
                                          missing_mask,
                                          name="MatrixFactorization")
    assert missing_mae < 0.1, "Error too high!"

    initialize_random_seed()  # for reproducibility
    solver = MatrixFactorization(learning_rate=0.01,
                                 rank=3,
                                 l2_penalty=0,
                                 min_improvement=1e-6,
                                 verbose=False)
    XY_completed = solver.fit(XY_incomplete, missing_mask)
    _, missing_mae = reconstruction_error(XY,
                                          XY_completed,
                                          missing_mask,
                                          name="MatrixFactorization")
    assert missing_mae < 0.1, "Error too high!"

    XY_completed = solver.transform(XY_incomplete, missing_mask)
    _, missing_mae = reconstruction_error(XY,
                                          XY_completed,
                                          missing_mask,
                                          name="MatrixFactorization")
    assert missing_mae < 0.1, "Error too high!"
def prep_data(df, dataset, scale='before', return_complete=False):
    df_complete = df.copy()
    df_complete.loc[:, :] = MatrixFactorization().complete(df)

    if dataset == 'TMCQ':
        cols = ['Y1_P_TMCQ_ACTIVITY', 'Y1_P_TMCQ_AFFIL', 'Y1_P_TMCQ_ANGER',
                'Y1_P_TMCQ_FEAR', 'Y1_P_TMCQ_HIP', 'Y1_P_TMCQ_IMPULS',
                'Y1_P_TMCQ_INHIBIT', 'Y1_P_TMCQ_SAD', 'Y1_P_TMCQ_SHY',
                'Y1_P_TMCQ_SOOTHE', 'Y1_P_TMCQ_ASSERT', 'Y1_P_TMCQ_ATTFOCUS',
                'Y1_P_TMCQ_LIP', 'Y1_P_TMCQ_PERCEPT', 'Y1_P_TMCQ_DISCOMF',
                'Y1_P_TMCQ_OPENNESS', 'DX']
        dataset_all = df_complete[cols]
    elif dataset == 'neuro':
        cols = ['STOP_SSRTAVE_Y1', 'DPRIME1_Y1', 'DPRIME2_Y1',
                'SSBK_NUMCOMPLETE_Y1', 'SSFD_NUMCOMPLETE_Y1', 'V_Y1',
                'Y1_CLWRD_COND1', 'Y1_CLWRD_COND2', 'Y1_DIGITS_BKWD_RS',
                'Y1_DIGITS_FRWD_RS', 'Y1_TRAILS_COND2', 'Y1_TRAILS_COND3',
                'CW_RES', 'TR_RES', 'Y1_TAP_SD_TOT_CLOCK', 'DX']
        scaler = StandardScaler()
        dataset = df_complete[cols]
        if scale == 'before':
            dataset_all = dataset.copy()
            dataset_all.iloc[:, 0:-1] = scaler.fit_transform(dataset.iloc[:, 0:-1])
        else:
            dataset_all = dataset.copy()

    adhd = dataset_all[dataset_all['DX'] == 3]
    control = dataset_all[dataset_all['DX'] == 1]

    dataset_all.drop(columns='DX', inplace=True)
    adhd.drop(columns='DX', inplace=True)
    control.drop(columns='DX', inplace=True)

    if return_complete:
        return dataset_all, adhd, control, df_complete
    else:
        return dataset_all, adhd, control
def deal_mcar(df):
    """Deal with missing data with a missing-completely-at-random (MCAR) pattern."""
    # number of instances
    num_instances = df.shape[0]
    # number of rows containing missing values
    num_missing_instances = df.isnull().sum(axis=1).astype(bool).sum()
    # missing percentage
    missing_percentage = num_missing_instances / num_instances
    print("Missing percentage is {}".format(missing_percentage))

    if missing_percentage < 0.05:
        recommend = 'list deletion'
    else:
        Xy_incomplete = df.values
        # mean
        Xy_filled_mean = Imputer(missing_values=np.nan,
                                 strategy='mean').fit_transform(Xy_incomplete)
        score_mean = compute_imputation_score(Xy_filled_mean)
        print("Imputation score of mean is {}".format(score_mean))
        # mode
        Xy_filled_mode = Imputer(missing_values=np.nan,
                                 strategy='most_frequent').fit_transform(Xy_incomplete)
        score_mode = compute_imputation_score(Xy_filled_mode)
        print("Imputation score of mode is {}".format(score_mode))
        # knn
        with NoStdStreams():
            Xy_filled_knn = KNN().fit_transform(Xy_incomplete)
        score_knn = compute_imputation_score(Xy_filled_knn)
        print("Imputation score of knn is {}".format(score_knn))
        # matrix factorization
        with NoStdStreams():
            Xy_filled_mf = MatrixFactorization().fit_transform(Xy_incomplete)
        score_mf = compute_imputation_score(Xy_filled_mf)
        print("Imputation score of matrix factorization is {}".format(score_mf))
        # multiple imputation
        with NoStdStreams():
            Xy_filled_ii = IterativeImputer().fit_transform(Xy_incomplete)
        score_ii = compute_imputation_score(Xy_filled_ii)
        print("Imputation score of multiple imputation is {}".format(score_ii))
        score_dict = {
            'mean': score_mean,
            'mode': score_mode,
            'knn': score_knn,
            'matrix factorization': score_mf,
            'multiple imputation': score_ii
        }
        print("Imputation method with the highest score is {}".format(
            max(score_dict, key=score_dict.get)))
        recommend = max(score_dict, key=score_dict.get)
    return recommend
def fi_complete(self, X, method='mf', **params):
    if method == 'mf':
        # rank = params['rank'] = 100
        self.X_filled = MatrixFactorization(params['rank']).complete(X)
    if method == 'knn':
        # Use 3 nearest rows which have a feature to fill in each row's missing features
        # k = params['k'] = 3
        self.X_filled = KNN(params['k']).complete(X)
    if method == 'soft':
        # Instead of solving the nuclear norm objective directly,
        # induce sparsity using singular value thresholding
        self.X_filled = SoftImpute().complete(X)
def complex_imputation(df, method='mice', neighbors=3):
    """
    Inputs:
    df -- dataframe of incomplete data
    method -- method of imputation
        - 'knn': Imputes using K Nearest Neighbors of completed rows
        - 'soft_impute': Imputes using iterative soft thresholding of SVD decompositions
        - 'mice': Imputes using Multiple Imputation by Chained Equations method
        - 'nuclear_nm': Imputation using Exact Matrix Completion via Convex Optimization method
        - 'matrix_factorization': Imputes by factorization of matrix in low-rank U and V
          with L1 sparsity on U elements and L2 sparsity on V elements
        - 'iterative_svd': Imputes based on iterative low-rank SVD decomposition
    neighbors -- parameter for KNN imputation

    Output: Completed matrix
    """
    # Create matrix of features
    X_incomplete = df.values
    # Normalize matrix by std and mean (0 mean, 1 variance); keep the fitted
    # scaler so it can be inverted after imputation
    biscaler = BiScaler()
    X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

    if method == 'knn':
        X_complete = KNN(neighbors).complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'soft_impute':
        X_complete_normalized = SoftImpute().complete(X_incomplete_normalized)
        X_complete = biscaler.inverse_transform(X_complete_normalized)
        return fill_values(df, X_complete)
    if method == 'mice':
        X_complete = MICE().complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'nuclear_nm':
        X_complete = NuclearNormMinimization().complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'matrix_factorization':
        X_complete = MatrixFactorization().complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'iterative_svd':
        X_complete = IterativeSVD().complete(X_incomplete)
        return fill_values(df, X_complete)
def run_impute(self, X, state='train'):
    if state == 'train':
        self.train_data['ave'] = np.zeros([X.shape[0], X.shape[1]])
        for imp_method in self.impute_method:
            if imp_method == 'mean':
                imp_ope = SimpleFill()
            if imp_method == 'KNN':
                imp_ope = KNN()
            if imp_method == 'IterativeSVD':
                imp_ope = IterativeSVD()
            if imp_method == 'MatrixFactorization':
                imp_ope = MatrixFactorization()
            X_filled = imp_ope.fit_transform(X)
            self.train_data[imp_method] = X_filled
            self.impute_operator[imp_method] = imp_ope
            self.train_data['ave'] += X_filled
        self.train_data['ave'] /= len(self.impute_method)
    return 0
def determine_impute(df):
    """Iterate over several imputation methods and return the MSE of each."""
    algorithms = [SimpleFill(), KNN(1), KNN(2), KNN(3), KNN(4), KNN(5),
                  IterativeSVD(), MatrixFactorization()]
    MSE = {}
    df_incomplete = create_test_df(df, 0.7, list(T40_dict.keys()))
    for i, alg in enumerate(algorithms):
        print(alg)
        X_complete = impute_df(df_incomplete, alg)
        alg_mse = ((df - X_complete) ** 2).sum().mean()
        print(str(i) + alg.__class__.__name__, alg_mse)
        MSE[str(i) + alg.__class__.__name__] = alg_mse
    return MSE
def __init__(self, data, predict):
    self.df = data
    self.predict = predict
    self.X = None
    self.y = None
    self.X_scale = None
    self.X_train = None
    self.X_test = None
    self.y_train = None
    self.y_test = None
    self.incomplete_data = None
    self.clean_data = None
    self.methods = [SimpleFill(), KNN(1), KNN(2), KNN(3), KNN(4), KNN(5),
                    IterativeSVD(), MatrixFactorization()]
def clean_missing(df, features):
    """Clean missing values in the dataset.

    Parameters
    ----------
    df : DataFrame
    features : List
        List of feature names.

    Returns
    -------
    features_new : List
        List of feature names after cleaning.
    Xy_filled : array-like
        Numpy array where missing values have been cleaned.
    """
    display(HTML('<h4>Clean Missing Data ...</h4>'))
    # Xy = np.concatenate((X, y.reshape((y.shape[0], 1))), axis=1)
    df_preprocessed, features_new = missing_preprocess(df, features)
    print("")
    print("Choose the missing mechanism [a/b/c/d]:")
    print("a.MCAR b.MAR c.MNAR d.Skip")
    time.sleep(0.05)
    ans = input()
    if ans == 'a':
        recommend = deal_mcar(df_preprocessed)
    elif ans == 'b':
        recommend = deal_mar(df_preprocessed)
    elif ans == 'c':
        recommend = deal_mnar(df_preprocessed)
    else:
        print("Default MAR")
        recommend = deal_mar(df_preprocessed)
    print("")
    display(HTML('<bold>Recommended Approach!</bold>'))
    print("The recommended approach is {}".format(recommend))
    time.sleep(0.05)
    ans = input("Do you want to apply the recommended approach? [y/n]")
    print("")
    if ans == 'y':
        if recommend == 'mean':
            print("Applying mean imputation ...")
            Xy_filled = SimpleImputer(missing_values=np.nan,
                                      strategy='mean').fit_transform(df_preprocessed.values)
            print("Missing values cleaned!")
        elif recommend == 'mode':
            print("Applying mode imputation ...")
            Xy_filled = SimpleImputer(missing_values=np.nan,
                                      strategy='most_frequent').fit_transform(df_preprocessed.values)
            print("Missing values cleaned!")
        elif recommend == 'knn':
            print("Applying knn imputation ...")
            with NoStdStreams():
                Xy_filled = KNN().fit_transform(df_preprocessed.values)
            print("Missing values cleaned!")
        elif recommend == 'matrix factorization':
            print("Applying matrix factorization ...")
            with NoStdStreams():
                Xy_filled = MatrixFactorization().fit_transform(df_preprocessed.values)
            print("Missing values cleaned!")
        elif recommend == 'multiple imputation':
            print("Applying multiple imputation ...")
            with NoStdStreams():
                Xy_filled = IterativeImputer().fit_transform(df_preprocessed.values)
            print("Missing values cleaned!")
        else:
            print("Error: Approach not available!")
    else:
        print("")
        print("Choose the approach you want to apply [a/b/c/d/e/skip]:")
        print("a.Mean b.Mode c.K Nearest Neighbor d.Matrix Factorization e.Multiple Imputation")
        ans = input()
        print("")
        if ans == 'a':
            print("Applying mean imputation ...")
            Xy_filled = SimpleImputer(missing_values=np.nan,
                                      strategy='mean').fit_transform(df_preprocessed.values)
        elif ans == 'b':
            print("Applying mode imputation ...")
            Xy_filled = SimpleImputer(missing_values=np.nan,
                                      strategy='most_frequent').fit_transform(df_preprocessed.values)
        elif ans == 'c':
            print("Applying knn imputation ...")
            with NoStdStreams():
                Xy_filled = KNN().fit_transform(df_preprocessed.values)
        elif ans == 'd':
            print("Applying matrix factorization ...")
            with NoStdStreams():
                Xy_filled = MatrixFactorization().fit_transform(df_preprocessed.values)
        elif ans == 'e':
            print("Applying multiple imputation ...")
            with NoStdStreams():
                Xy_filled = IterativeImputer().fit_transform(df_preprocessed.values)
        else:
            print("Applying default method mean imputation ...")
            Xy_filled = SimpleImputer(missing_values=np.nan,
                                      strategy='mean').fit_transform(df_preprocessed.values)
        print("Missing values cleaned!")
    return features_new, Xy_filled
def benchmark_complete(data, ending_density=.02, step=.01):
    '''
    Input: Data array to benchmark on, the ending density to return results,
    the step between density imputations
    Output: Dataframe of output density and RMSE for each method with respect
    to each input density
    '''
    # Removes a randomly chosen minimum value greater than zero in each
    # iteration (checking density as it goes).

    # density range to run
    nonzeroscount = np.count_nonzero(data)
    sizel = data.shape
    totalentr = sizel[0] * sizel[1]
    end = ending_density  # final density to test
    begin = (nonzeroscount / totalentr)  # beginning density of the given matrix

    # initialize lists to store results
    density_in = []
    RMSE_empca_scores = []
    RMSE_wpca_scores = []
    RMSE_sfi_scores = []
    RMSE_siv_scores = []
    RMSE_sni_scores = []
    RMSE_smi_scores = []
    RMSE_szi_scores = []
    RMSE_wmiC_scores = []
    RMSE_wmiP_scores = []
    Density_empca = []
    Density_wpca = []
    Density_sfi = []
    Density_siv = []
    Density_sni = []
    Density_smi = []
    Density_szi = []
    Density_wmiC = []
    Density_wmiP = []

    # randomly remove values from the known matrix and try to impute them
    for d in reversed(np.arange(end, begin, step)):
        otum = data.T.copy()

        # begin density check
        nonzeroscount = np.count_nonzero(otum)
        sizel = otum.shape
        totalentr = sizel[0] * sizel[1]
        while np.float64((nonzeroscount / totalentr)) > d:
            # remove a min-frequency OTU and then check density
            j = np.random.randint(0, len(otum[:][:]) - 1)
            # make sure the row is not all zero (an all-zero row causes a singular matrix)
            if sum(list(otum[j][:])) < 1:
                continue
            m = min(i for i in list(otum[j][:]) if i > 0)
            # make sure removing the value will not result in a zero row
            if sum(list(otum[j][:])) == m:
                continue
            otum[j][list(otum[j][:]).index(m)] = 0
            # check density to break
            nonzeroscount = float(np.count_nonzero(otum))
            sizel = otum.shape
            totalentr = float(sizel[0]) * float(sizel[1])

        # coerce to float and report the new density
        print("Data table of %f generated" % d)
        otum = otum.T.astype(np.float64)

        # mark unknowns as NaN for fancyimpute; avoid a singular matrix by taking the transpose
        otum2 = otum.T.copy()
        otum2 = otum2.astype(np.float64)
        otum2[otum2 == 0] = np.nan

        # WPCA and EMPCA: build weighted matrix
        weight = otum.copy()
        for i in range(len(otum2.T)):
            for j in range(len(otum2.T[i])):
                if otum2.T[i][j] == 0:
                    weight[i][j] = 1
                else:
                    weight[i][j] = 1000
        print("Running EMPCA")
        EMPCAi = EMPCA(n_components=3).fit_reconstruct(otum.copy(), weight)
        print("Running WPCA")
        WPCAi = WPCA(n_components=3).fit_reconstruct(otum.copy(), weight)

        # fancyimpute methods and zero fill
        print("Nuclear Norm")
        sni = NuclearNormMinimization(min_value=(np.amin(otum2)),
                                      max_value=(np.amax(otum2))).complete(otum2.copy())
        print("Running Soft Impute")
        sfi = SoftImpute(shrinkage_value=None,
                         convergence_threshold=0.00001,
                         max_iters=1000,
                         max_rank=min(otum2.shape),
                         n_power_iterations=1,
                         init_fill_method="zero",
                         min_value=(np.amin(otum2)),
                         max_value=(np.amax(otum2)),
                         normalizer=None,
                         verbose=False).complete(otum2.copy())
        print("Running Iterative SVD")
        siv = IterativeSVD(rank=(min(otum2.shape) - 1),
                           convergence_threshold=0.00001,
                           max_iters=1000,
                           gradual_rank_increase=True,
                           svd_algorithm="arpack",
                           init_fill_method="zero",
                           min_value=(np.amin(otum2)),
                           max_value=(np.amax(otum2)),
                           verbose=False).complete(otum2.copy())
        print("Running Matrix Factorization")
        smi = MatrixFactorization(rank=(min(otum2.shape) - 1),
                                  initializer=np.random.randn,
                                  learning_rate=0.01,
                                  patience=5,
                                  l1_penalty=0.05,
                                  l2_penalty=0.05,
                                  min_improvement=0.01,
                                  max_gradient_norm=5,
                                  optimization_algorithm="adam",
                                  min_value=(np.amin(otum2)),
                                  max_value=(np.amax(otum2)),
                                  verbose=False).complete(otum2.copy())
        print("Imputing by filling with zeros for base comparison")
        szi = base.zeros(otum2.copy())
        print("Weighted Mean Interpolation without phylo-distance")
        wmiC = base.wmi_wrapper(X=otum2.copy())
        print("Weighted Mean Interpolation with phylo-distance")
        phylo = pd.read_csv(
            'data/Matched_Pheno_and_Phylo_Data/matched_phylo.csv/matched_phylo.csv')
        wmiP = base.wmi_wrapper(X=otum2.copy(), D_j=phylo)

        # save the results
        # density in (after removed values)
        density_in.append(error.get_density(otum))
        # density imputed
        Density_empca.append(error.get_density(EMPCAi))
        Density_wpca.append(error.get_density(WPCAi))
        Density_sfi.append(error.get_density(sfi))
        Density_siv.append(error.get_density(siv))
        Density_sni.append(error.get_density(sni))
        Density_smi.append(error.get_density(smi))
        Density_szi.append(error.get_density(szi))
        Density_wmiC.append(error.get_density(wmiC))
        Density_wmiP.append(error.get_density(wmiP))
        # RMSE of imputed values
        # mask so RMSE is only checked between imputed values and removed values
        missing_mask = np.isnan(otum2.T)
        RMSE_empca_scores.append(error.RMSE(data, EMPCAi, missing_mask))
        RMSE_wpca_scores.append(error.RMSE(data, WPCAi, missing_mask))
        RMSE_sfi_scores.append(error.RMSE(data, sfi.T, missing_mask))
        RMSE_siv_scores.append(error.RMSE(data, siv.T, missing_mask))
        RMSE_sni_scores.append(error.RMSE(data, sni.T, missing_mask))
        RMSE_smi_scores.append(error.RMSE(data, smi.T, missing_mask))
        RMSE_szi_scores.append(error.RMSE(data, szi.T, missing_mask))
        RMSE_wmiC_scores.append(error.RMSE(data, wmiC.T, missing_mask))
        RMSE_wmiP_scores.append(error.RMSE(data, wmiP.T, missing_mask))

    RMSEmapping = pd.DataFrame({
        'Density': list(map(int, density_in)),
        'EMPCA': RMSE_empca_scores,
        'Matrix Factorization': RMSE_smi_scores,
        'WPCA': RMSE_wpca_scores,
        'Soft Impute': RMSE_sfi_scores,
        'Iterative SVD': RMSE_siv_scores,
        'Nuclear Norm Minimization': RMSE_sni_scores,
        'Zeros Replace Unknown': RMSE_szi_scores,
        'Weighted-Mean Interpolation Correlation': RMSE_wmiC_scores,
        'Weighted-Mean Interpolation Phylo': RMSE_wmiP_scores
    })
    RMSEmapping.set_index(['Density'], inplace=True)

    Out_density = pd.DataFrame({
        'density': list(map(int, density_in)),
        'EMPCA': Density_empca,
        'Matrix Factorization': Density_smi,
        'WPCA': Density_wpca,
        'Soft Impute': Density_sfi,
        'Iterative SVD': Density_siv,
        'Nuclear Norm Minimization': Density_sni,
        'Zeros Replace Unknown': Density_szi,
        'Weighted-Mean Interpolation Correlation': Density_wmiC,
        'Weighted-Mean Interpolation Phylo': Density_wmiP
    })
    Out_density.set_index(['density'], inplace=True)

    return Out_density, RMSEmapping
# Median
matrix_imp_median = matrix.copy()
medians = matrix.median()
for col in matrix.columns[matrix.isnull().any()]:
    matrix_imp_median.loc[matrix[col].isnull(), col] = medians[col]

# KNN
imputer = KNNImputer(n_neighbors=2, weights="uniform")
matrix_imp_KNN = pd.DataFrame(imputer.fit_transform(matrix),
                              index=matrix.index,
                              columns=matrix.columns)

# MF
matrix_imp_MF = pd.DataFrame(
    MatrixFactorization().fit_transform(matrix),
    index=matrix.index,
    columns=matrix.columns,
)
matrix_imp_MF = matrix_imp_MF.clip(lower=0)
matrix_imp_MF.to_parquet(matrix_imputed_file)

# Reduce redundancy in variables by keeping only the most "differentiated"
# parent for each variable
matrix_red_var = matrix_imp_MF.copy()
for var in matrix_imp_MF.columns:
    try:
        child, parent = var.split("/")
    except ValueError:  # variable has no parent
        continue
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from fancyimpute import MatrixFactorization

if __name__ == '__main__':
    data = pd.read_csv('user_rating_data.csv')
    u_user_id = np.array(list(data['user_id'].unique()))
    u_book_id = np.array(list(data['book_id'].unique()))

    def u_map(x):
        return np.where(u_user_id == x)[0][0]

    def b_map(x):
        return np.where(u_book_id == x)[0][0]

    data['book_id'] = data['book_id'].apply(b_map)
    data['user_id'] = data['user_id'].apply(u_map)

    matrix = coo_matrix((np.array(data['rating']),
                         (np.array(data['user_id']), np.array(data['book_id']))))
    sparse = matrix.toarray()
    sparse = np.where(sparse == 0, np.nan, sparse)

    full_m = MatrixFactorization().fit_transform(sparse)

    np.save('sparse_matrix', sparse)
    np.save('full_matrix', full_m)
    np.save('u_user_id', u_user_id)
    np.save('u_book_id', u_book_id)
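A minimal sketch of how the arrays saved by the script above could be used to look up an imputed rating for a raw (user_id, book_id) pair. The predicted_rating helper and the .npy file names mirror the script but are hypothetical additions, not part of the original code.

import numpy as np

# Load the dense completed matrix and the id lookup arrays saved above.
full_m = np.load('full_matrix.npy')
u_user_id = np.load('u_user_id.npy')
u_book_id = np.load('u_book_id.npy')

def predicted_rating(user_id, book_id):
    """Return the imputed rating for a raw user/book id pair (hypothetical helper)."""
    i = np.where(u_user_id == user_id)[0][0]  # row index used by u_map
    j = np.where(u_book_id == book_id)[0][0]  # column index used by b_map
    return full_m[i, j]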
def impute_mf(X):
    return MatrixFactorization().complete(X)
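complete() is the older fancyimpute entry point; newer releases expose the same behaviour through the scikit-learn style fit_transform. A minimal sketch assuming a recent fancyimpute and a NumPy array whose missing entries are marked with NaN:

import numpy as np
from fancyimpute import MatrixFactorization

X = np.array([[1.0, 2.0, np.nan],
              [3.0, np.nan, 6.0],
              [np.nan, 8.0, 9.0]])

# Newer API: fit_transform returns a copy of X with the NaN entries filled.
X_filled = MatrixFactorization(rank=2).fit_transform(X)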
X, _, y = generator.generate_data_logistic(1024, min_mult=0.0, max_mult=1.0)

# X_incomplete has the same values as X except a subset have been replaced with NaN
X_incomplete, missing_mask = generator.generate_missing(X, 0.1, np.nan)

# Use 3 nearest rows which have a feature to fill in each row's missing features
X_filled_knn = KNN(k=3).fit_transform(X_incomplete)

# matrix completion using MICE
X_filled_mice = IterativeImputer().fit_transform(X_incomplete)

# matrix completion using Iterative SVD
X_filled_svd = IterativeSVD(rank=3).fit_transform(X_incomplete)

# matrix completion using Matrix Factorization
X_filled_mf = MatrixFactorization(learning_rate=0.01,
                                  rank=3,
                                  l2_penalty=0,
                                  min_improvement=1e-6).fit_transform(X_incomplete)

# matrix completion using Mean Fill
X_filled_meanfill = SimpleFill(fill_method='mean').fit_transform(X_incomplete)

# matrix completion using Median Fill
X_filled_medianfill = SimpleFill(fill_method='median').fit_transform(X_incomplete)

# matrix completion using Zero Fill
X_filled_zerofill = SimpleFill(fill_method='zero').fit_transform(X_incomplete)

# matrix completion using Min Fill
X_filled_minfill = SimpleFill(fill_method='min').fit_transform(X_incomplete)

# matrix completion using Sampled Fill
X_filled_randomfill = SimpleFill(fill_method='random').fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding
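The snippet breaks off after the comment introducing singular value thresholding; in fancyimpute that technique corresponds to SoftImpute, so the call that presumably follows would look like the line below (a sketch under that assumption, not the original code).

# matrix completion using Soft Impute (singular value thresholding)
X_filled_softimpute = SoftImpute().fit_transform(X_incomplete)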
def test_matrix_factorization_with_low_rank_random_matrix():
    solver = MatrixFactorization(learning_rate=0.02, rank=5)
    XY_completed = solver.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(XY,
                                          XY_completed,
                                          missing_mask,
                                          name="MatrixFactorization")
    assert missing_mae < 0.1, "Error too high!"