Example #1
def test_matrix_factorization_with_low_rank_random_matrix():
    solver = MatrixFactorization(rank=3, l1_penalty=0, l2_penalty=0)
    XY_completed = solver.complete(XY_incomplete)
    _, missing_mae = reconstruction_error(XY,
                                          XY_completed,
                                          missing_mask,
                                          name="MatrixFactorization")
    assert missing_mae < 0.01, "Error too high!"
def test_matrix_factorization_with_low_rank_random_matrix():
    solver = MatrixFactorization(learning_rate=0.01,
                                 rank=3,
                                 l2_penalty=0,
                                 min_improvement=1e-6)
    XY_completed = solver.complete(XY_incomplete)
    _, missing_mae = reconstruction_error(XY,
                                          XY_completed,
                                          missing_mask,
                                          name="MatrixFactorization")
    assert missing_mae < 0.1, "Error too high!"
def test_matrix_factorization_with_low_rank_random_matrix():
    solver = MatrixFactorization(
        rank=3,
        l1_penalty=0,
        l2_penalty=0)
    XY_completed = solver.complete(XY_incomplete)
    _, missing_mae = reconstruction_error(
        XY,
        XY_completed,
        missing_mask,
        name="MatrixFactorization")
    assert missing_mae < 0.01, "Error too high!"
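The tests in this listing reference module-level fixtures (XY, XY_incomplete, missing_mask) and a reconstruction_error helper without showing them. Below is a minimal sketch of what such fixtures could look like; the shapes, the noise-free low-rank construction, and the print format are assumptions, not the original test setup.

import numpy as np

# Hypothetical fixtures: a rank-3 matrix with ~10% of entries hidden.
np.random.seed(0)
XY = np.random.randn(500, 3).dot(np.random.randn(3, 50))
missing_mask = np.random.rand(*XY.shape) < 0.1
XY_incomplete = XY.copy()
XY_incomplete[missing_mask] = np.nan

def reconstruction_error(XY, XY_completed, missing_mask, name=""):
    # MAE over all entries, and over only the held-out (missing) entries.
    all_mae = np.abs(XY - XY_completed).mean()
    missing_mae = np.abs(XY[missing_mask] - XY_completed[missing_mask]).mean()
    print("{}: all_mae={:.4f}, missing_mae={:.4f}".format(name, all_mae, missing_mae))
    return all_mae, missing_mae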
Example #4
def deal_mar(df):
    """Deal with missing data with missing at random pattern."""

    Xy_incomplete = df.values

    # knn
    with NoStdStreams():
        Xy_filled_knn = KNN().fit_transform(Xy_incomplete)
    score_knn = compute_imputation_score(Xy_filled_knn)
    print("Imputation score of knn is {}".format(score_knn))
    # matrix factorization
    with NoStdStreams():
        Xy_filled_mf = MatrixFactorization().fit_transform(Xy_incomplete)
    score_mf = compute_imputation_score(Xy_filled_mf)
    print("Imputation score of matrix factorization is {}".format(score_mf))
    # multiple imputation
    with NoStdStreams():
        Xy_filled_ii = IterativeImputer().fit_transform(Xy_incomplete)
    score_ii = compute_imputation_score(Xy_filled_ii)
    print("Imputation score of multiple imputation is {}".format(score_ii))

    score_dict = {'knn': score_knn,
                  'matrix factorization': score_mf, 'multiple imputation': score_ii}
    print("Imputation method with the highest socre is {}".format(max(score_dict, key=score_dict.get)))
    recommend = max(score_dict, key=score_dict.get)
    return recommend
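deal_mar (and the other deal_* helpers in this listing) lean on two names the snippets never define: NoStdStreams, which silences solver progress output, and compute_imputation_score, which scores a filled matrix. Here is a hedged sketch of plausible implementations; the cross-validated random-forest score is an assumption about how the real helper works, not its actual definition.

import os
import sys
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

class NoStdStreams:
    """Context manager that temporarily silences stdout."""
    def __enter__(self):
        self._stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')
    def __exit__(self, exc_type, exc_value, traceback):
        sys.stdout.close()
        sys.stdout = self._stdout

def compute_imputation_score(Xy_filled):
    # Assumed convention: last column is the target, the rest are features.
    X, y = Xy_filled[:, :-1], Xy_filled[:, -1].astype(int)
    clf = RandomForestClassifier(n_estimators=20, random_state=0)
    return cross_val_score(clf, X, y, cv=3).mean()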
Example #5
def MatrixFactorizationImpute(X_incomplete,
                              X_complete,
                              M,
                              p,
                              rank=10,
                              pen=1e-5):
    X_incomplete_new = replace0tonan(X_incomplete, M)
    start_time = time.time()
    X_incomplete_new = np.array(X_incomplete_new)
    X_complete = np.array(X_complete)
    M = np.array(M)
    X_filled_mf = MatrixFactorization(
        learning_rate=0.01, rank=rank,
        l2_penalty=pen).fit_transform(X_incomplete_new)
    X_filled_mf[np.where(np.isnan(X_filled_mf))] = 0
    mse = np.sum(np.square(X_filled_mf - X_complete)) / np.sum(1 - M)
    print(str(rank) + "_" + str(pen))
    print(mse)
    print("MatrixFactorizationImpute costed time: %4.4f" %
          (time.time() - start_time))
    print("\n")

    paras = str(p) + "/" + str(rank) + "_" + str(pen)
    path = "mfimpute" + "/" + paras + "/"
    if not os.path.exists(path):
        os.makedirs(path)
    with open(os.path.join(path, str(mse)), "w") as f:
        f.write(str(mse))
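MatrixFactorizationImpute assumes a replace0tonan helper that converts masked-out zeros back to NaN so the solver treats them as missing. A minimal sketch follows, assuming M is a 0/1 mask in which 1 marks an observed entry (consistent with the np.sum(1 - M) denominator above).

import numpy as np

def replace0tonan(X_incomplete, M):
    # Entries with mask value 0 are missing; mark them NaN for the solver.
    X = np.array(X_incomplete, dtype=float)
    X[np.array(M) == 0] = np.nan
    return X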
Example #6
    def fit(self, df):
        logging.info("Initializing Pipeline")
        isTraining = True
        self.isTraining = isTraining
        self.adf = df
        self.df = df.copy()

        self.numeric_cols = getNumericColumns(df)
        self.cat_cols = getCategorialColumns(df)
        self.DEPENDENT_VARIABLE = getDependentVariable()

        self.cat_cols_useless = ["encounter_id", "hospital_id", "patient_id", "icu_id"]
        self.cat_cols_minus = [c for c in self.cat_cols if c not in ["clusterId", "hospital_death", "encounter_id", "hospital_id", "patient_id"]]
        self.cat_cols_minus_useless = [c for c in self.cat_cols if c not in ["clusterId", "encounter_id", "hospital_id", "patient_id", "icu_id"]]
        self.cols_to_dummy = [c for c in self.cat_cols_minus_useless if c != "hospital_death"]

        self.num_mean = SimpleImputer(strategy="median")
        self.cat_freq = SimpleImputer(strategy="most_frequent")
        self.rs = RobustScaler()
        self.pt = PowerTransformer()
        self.ohe = OneHotEncoder(handle_unknown='ignore' , sparse=False)
        self.outlierKNN = KNN()

        self.num_means = [MatrixFactorization() for i in range(4)]
        self.cat_freqs = [SimpleImputer(strategy="most_frequent") for i in range(4)]
        #self.label_encoders = defaultdict(LabelEncoder)
        self.label_encoders = WOEEncoder()
        self.later_num_transform = PowerTransformer()

        self.X = self.df.drop([self.DEPENDENT_VARIABLE], axis=1)
        self.y = self.df[self.DEPENDENT_VARIABLE]
        return self.GetTransformedData(isTraining)
Example #7
def prep_clfs(feature):
    if feature == 'tmcq':
        log_reg_clf = make_pipeline(LogisticRegression(random_state=56))

        rf_clf = make_pipeline(
            RandomForestClassifier(n_jobs=-1, random_state=56))

        gb_clf = make_pipeline(GradientBoostingClassifier(random_state=56))

        xgb_clf = make_pipeline(
            XGBClassifier(max_depth=3,
                          learning_rate=0.1,
                          random_state=56,
                          n_jobs=-1))
    else:
        log_reg_clf = make_pipeline(
            ImputeTransform(strategy=MatrixFactorization()),
            LogisticRegression(random_state=56))

        rf_clf = make_pipeline(
            ImputeTransform(strategy=MatrixFactorization()),
            RandomForestClassifier(n_jobs=-1, random_state=56))

        gb_clf = make_pipeline(ImputeTransform(strategy=MatrixFactorization()),
                               GradientBoostingClassifier(random_state=56))

        xgb_clf = make_pipeline(
            ImputeTransform(strategy=MatrixFactorization()),
            XGBClassifier(max_depth=3,
                          learning_rate=0.1,
                          random_state=56,
                          n_jobs=-1))
    classifier_dict = {
        'LogReg': {
            'clf': log_reg_clf
        },
        'RandomForest': {
            'clf': rf_clf
        },
        'GradientBoosting': {
            'clf': gb_clf
        },
        'XGB': {
            'clf': xgb_clf
        }
    }
    return classifier_dict
Example #8
 def complete(self, data: pd.DataFrame):
     df = data.copy()
     cols = list(df)
     if len(cols) < self.rank:  # cap the factorization rank at the column count
         self.rank = len(cols)
     df = pd.DataFrame(MatrixFactorization(rank=self.rank, verbose=False).fit_transform(df))
     df.columns = cols
     return df
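This complete method reads like part of a small DataFrame wrapper around MatrixFactorization. Below is a hedged reconstruction of the enclosing class with a usage example; the class name and constructor are inventions for illustration.

import numpy as np
import pandas as pd
from fancyimpute import MatrixFactorization

class MFImputer:  # hypothetical owner of the method above
    def __init__(self, rank=10):
        self.rank = rank

    def complete(self, data: pd.DataFrame):
        df = data.copy()
        cols = list(df)
        if len(cols) < self.rank:  # cap the rank at the column count
            self.rank = len(cols)
        df = pd.DataFrame(MatrixFactorization(rank=self.rank, verbose=False).fit_transform(df))
        df.columns = cols
        return df

df = pd.DataFrame(np.random.randn(50, 4), columns=list("abcd"))
df.iloc[::7, 2] = np.nan                    # poke some holes
filled = MFImputer(rank=10).complete(df)    # rank gets capped at 4 columns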
Example #9
def clean_missing(df, features, setting):
    """Clean missing values in the dataset.
    Parameters
    ----------
    df : DataFrame
    features : List
        List of feature names.
    setting : str
        Assumed missing-data mechanism: 'mcar', 'mar', or 'mnar'.
    Returns
    -------
    features_new : List
        List of feature names after cleaning.
    Xy_filled : array-like
        Numpy array where missing values have been cleaned.
    """

    df_preprocessed, features_new = missing_preprocess(df, features)
    if setting == 'mcar':
        recommend = deal_mcar(df_preprocessed)
    elif setting == 'mar':
        recommend = deal_mar(df_preprocessed)
    elif setting == 'mnar':
        recommend = deal_mnar(df_preprocessed)
    else:
        print("Default MAR")
        recommend = deal_mar(df_preprocessed)

    if recommend == 'mean':
        print("Applying mean imputation ...")
        Xy_filled = SimpleImputer(missing_values=np.nan,
                                  strategy='mean').fit_transform(
                                      df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'mode':
        print("Applying mode imputation ...")
        Xy_filled = SimpleImputer(missing_values=np.nan,
                                  strategy='most_frequent').fit_transform(
                                      df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'knn':
        print("Applying knn imputation ...")
        with NoStdStreams():
            Xy_filled = KNN().fit_transform(df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'matrix factorization':
        print("Applying matrix factorization ...")
        with NoStdStreams():
            Xy_filled = MatrixFactorization().fit_transform(
                df_preprocessed.values)
        print("Missing values cleaned!")
    elif recommend == 'multiple imputation':
        print("Applying multiple imputation ...")
        with NoStdStreams():
            Xy_filled = IterativeImputer().fit_transform(
                df_preprocessed.values)
        print("Missing values cleaned!")
    else:
        raise ValueError("Approach not available!")
    return features_new, Xy_filled
Example #10
def test_matrix_factorization_with_low_rank_random_matrix():
    initialize_random_seed()  # for reproducibility
    solver = MatrixFactorization(learning_rate=0.01,
                                 rank=3,
                                 l2_penalty=0,
                                 min_improvement=1e-6,
                                 verbose=False)
    XY_completed = solver.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(XY,
                                          XY_completed,
                                          missing_mask,
                                          name="MatrixFactorization")
    assert missing_mae < 0.1, "Error too high!"

    initialize_random_seed()  # for reproducibility
    solver = MatrixFactorization(learning_rate=0.01,
                                 rank=3,
                                 l2_penalty=0,
                                 min_improvement=1e-6,
                                 verbose=False)
    XY_completed = solver.fit(XY_incomplete, missing_mask)
    _, missing_mae = reconstruction_error(XY,
                                          XY_completed,
                                          missing_mask,
                                          name="MatrixFactorization")
    assert missing_mae < 0.1, "Error too high!"

    XY_completed = solver.transform(XY_incomplete, missing_mask)
    _, missing_mae = reconstruction_error(XY,
                                          XY_completed,
                                          missing_mask,
                                          name="MatrixFactorization")
    assert missing_mae < 0.1, "Error too high!"
Example #11
def prep_data(df, dataset, scale='before', return_complete=False):
    df_complete = df.copy()
    df_complete.loc[:,:] = MatrixFactorization().complete(df)
    if dataset == 'TMCQ':
        cols = ['Y1_P_TMCQ_ACTIVITY',
            'Y1_P_TMCQ_AFFIL',
            'Y1_P_TMCQ_ANGER',
            'Y1_P_TMCQ_FEAR',
            'Y1_P_TMCQ_HIP',
            'Y1_P_TMCQ_IMPULS',
            'Y1_P_TMCQ_INHIBIT',
            'Y1_P_TMCQ_SAD',
            'Y1_P_TMCQ_SHY',
            'Y1_P_TMCQ_SOOTHE',
            'Y1_P_TMCQ_ASSERT',
            'Y1_P_TMCQ_ATTFOCUS',
            'Y1_P_TMCQ_LIP',
            'Y1_P_TMCQ_PERCEPT',
            'Y1_P_TMCQ_DISCOMF',
            'Y1_P_TMCQ_OPENNESS',
            'DX']
        dataset_all = df_complete[cols]
    elif dataset == 'neuro':
        cols = ['STOP_SSRTAVE_Y1',
                 'DPRIME1_Y1',
                 'DPRIME2_Y1',
                 'SSBK_NUMCOMPLETE_Y1',
                 'SSFD_NUMCOMPLETE_Y1',
                 'V_Y1',
                 'Y1_CLWRD_COND1',
                 'Y1_CLWRD_COND2',
                 'Y1_DIGITS_BKWD_RS',
                 'Y1_DIGITS_FRWD_RS',
                 'Y1_TRAILS_COND2',
                 'Y1_TRAILS_COND3',
                 'CW_RES',
                 'TR_RES',
                 'Y1_TAP_SD_TOT_CLOCK',
                 'DX']
        scaler = StandardScaler()
        dataset = df_complete[cols]
        if scale == 'before':
            dataset_all = dataset.copy()
            dataset_all.iloc[:,0:-1] = scaler.fit_transform(dataset.iloc[:,0:-1])
        else:
            dataset_all = dataset.copy()
    adhd = dataset_all[dataset_all['DX'] == 3]
    control = dataset_all[dataset_all['DX'] == 1]

    dataset_all.drop(columns='DX', inplace=True)
    adhd.drop(columns='DX', inplace=True)
    control.drop(columns='DX', inplace=True)

    if return_complete:
        return dataset_all, adhd, control, df_complete
    else:
        return dataset_all, adhd, control
Example #12
def deal_mcar(df):
    """Deal with missing data with missing completely at random pattern."""
    # number of instances
    num_instances = df.shape[0]

    # number of rows containing missing
    num_missing_instances = df.isnull().sum(axis=1).astype(bool).sum()

    # missing percentage
    missing_percentage = num_missing_instances / num_instances
    print("Missing percentage is {}".format(missing_percentage))

    if missing_percentage < 0.05:
        recommend = 'list deletion'
    else:
        Xy_incomplete = df.values
        # mean
        Xy_filled_mean = SimpleImputer(
            missing_values=np.nan,
            strategy='mean').fit_transform(Xy_incomplete)
        score_mean = compute_imputation_score(Xy_filled_mean)
        print("Imputation score of mean is {}".format(score_mean))
        # mode
        Xy_filled_mode = SimpleImputer(
            missing_values=np.nan,
            strategy='most_frequent').fit_transform(Xy_incomplete)
        score_mode = compute_imputation_score(Xy_filled_mode)
        print("Imputation score of mode is {}".format(score_mode))
        # knn
        with NoStdStreams():
            Xy_filled_knn = KNN().fit_transform(Xy_incomplete)
        score_knn = compute_imputation_score(Xy_filled_knn)
        print("Imputation score of knn is {}".format(score_knn))
        # matrix factorization
        with NoStdStreams():
            Xy_filled_mf = MatrixFactorization().fit_transform(Xy_incomplete)
        score_mf = compute_imputation_score(Xy_filled_mf)
        print(
            "Imputation score of matrix factorization is {}".format(score_mf))
        # multiple imputation
        with NoStdStreams():
            Xy_filled_ii = IterativeImputer().fit_transform(Xy_incomplete)
        score_ii = compute_imputation_score(Xy_filled_ii)
        print("Imputation score of multiple imputation is {}".format(score_ii))

        score_dict = {
            'mean': score_mean,
            'mode': score_mode,
            'knn': score_knn,
            'matrix factorization': score_mf,
            'multiple imputation': score_ii
        }
        print("Imputation method with the highest socre is {}".format(
            max(score_dict, key=score_dict.get)))
        recommend = max(score_dict, key=score_dict.get)
    return recommend
Example #13
 def fi_complete(self, X, method='mf', **params):
     if method == 'mf':
         #rank = params['rank']=100
         self.X_filled = MatrixFactorization(params['rank']).complete(X)
     if method == 'knn':
         # Use 3 nearest rows which have a feature to fill in each row's missing features
         #k = params['k'] = 3
         self.X_filled = KNN(params['k']).complete(X)
     if method == 'soft':
         # Instead of solving the nuclear norm objective directly,
         # induce sparsity using singular value thresholding
         self.X_filled = SoftImpute().complete(X)
Example #14
def complex_imputation(df, method='mice', neighbors=3):
    """
	Inputs:
	df -- dataframe of incomplete data
	method -- method of imputation
		- 'knn': Imputes using K Nearest Neighbors of completed rows
		- 'soft_impute': Imputes using iterative soft thresholding of SVD decompositions
		- 'mice': Imputes using Multiple Imputation by Chained Equations method
		- 'nuclear_nm': Imputation using Exact Matrix Completion via Convex Optimization method
		- 'matrix_factorization': Imputes by factorization of matrix in low-rank U and V
								  with L1 sparsity on U elements and L2 sparsity on V elements
		- 'iterative_svd': Imputes based on iterative low-rank SVD decomposition
	neighbors -- parameter for KNN imputation
	
	Output:
	Completed matrix
	"""
    # Create matrix of features
    X_incomplete = df.values
    # Normalize matrix by std and mean (0 mean, 1 variance);
    # keep the fitted scaler so the normalization can be inverted later
    biscaler = BiScaler()
    X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

    if method == 'knn':
        X_complete = KNN(neighbors).complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'soft_impute':
        X_complete_normalized = SoftImpute().complete(X_incomplete_normalized)
        X_complete = biscaler.inverse_transform(X_complete_normalized)
        return fill_values(df, X_complete)

    if method == 'mice':
        X_complete = MICE().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'nuclear_nm':
        X_complete = NuclearNormMinimization().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'matrix_factorization':
        X_complete = MatrixFactorization().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'iterative_svd':
        X_complete = IterativeSVD().complete(X_incomplete)
        return fill_values(df, X_complete)
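A usage sketch for complex_imputation. It assumes the older fancyimpute release that still ships MICE and the .complete() API, and a fill_values helper that writes the completed matrix back into a DataFrame; both the helper and the data here are assumptions for illustration.

import numpy as np
import pandas as pd

def fill_values(df, X_complete):
    # Assumed helper: rebuild the DataFrame around the completed matrix.
    return pd.DataFrame(X_complete, index=df.index, columns=df.columns)

df = pd.DataFrame(np.random.randn(200, 6))
df = df.mask(np.random.rand(*df.shape) < 0.1)  # punch 10% holes
df_mf = complex_imputation(df, method='matrix_factorization')
df_knn = complex_imputation(df, method='knn', neighbors=3)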
Example #15
 def run_impute(self, X, state='train'):
     if state == 'train':
         self.train_data['ave'] = np.zeros([X.shape[0], X.shape[1]])
         for imp_method in self.impute_method:
             if imp_method == 'mean':
                 imp_ope = SimpleFill()
             if imp_method == 'KNN':
                 imp_ope = KNN()
             if imp_method == 'IterativeSVD':
                 imp_ope = IterativeSVD()
             if imp_method == 'MatrixFactorization':
                 imp_ope = MatrixFactorization()
             X_filled = imp_ope.fit_transform(X)
             self.train_data[imp_method] = X_filled
             self.impute_operator[imp_method] = imp_ope
             self.train_data['ave'] += X_filled
         self.train_data['ave'] /= len(self.impute_method)
     return 0
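run_impute reads like a method of a class that runs several solvers and also keeps their element-wise average as an ensemble imputation. The following is a hedged sketch of the attributes the method assumes; the class name is an invention.

from fancyimpute import SimpleFill, KNN, IterativeSVD, MatrixFactorization

class EnsembleImputer:  # hypothetical owner of run_impute above
    def __init__(self, impute_method=('mean', 'KNN', 'IterativeSVD', 'MatrixFactorization')):
        self.impute_method = list(impute_method)  # which solvers to run
        self.train_data = {}       # filled matrix per method, plus their average
        self.impute_operator = {}  # fitted solver per method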
Example #16
def determine_impute(df):
    """Iterates various imputation methods to find lower MSE"""
    algorithms = [
        SimpleFill(),
        KNN(1),
        KNN(2),
        KNN(3),
        KNN(4),
        KNN(5),
        IterativeSVD(),
        MatrixFactorization()
    ]
    MSE = {}
    df_incomplete = create_test_df(df, 0.7, list(T40_dict.keys()))
    for i, alg in enumerate(algorithms):
        print(alg)
        X_complete = impute_df(df_incomplete, alg)
        alg_mse = ((df - X_complete)**2).sum().mean()
        print(str(i) + alg.__class__.__name__, alg_mse)
        MSE[str(i) + alg.__class__.__name__] = alg_mse
    return MSE
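determine_impute depends on two helpers not shown in this listing, plus a T40_dict whose keys are assumed to name the columns to corrupt. Plausible sketches follow, assuming create_test_df hides a fraction of the values in the listed columns and impute_df runs one solver over the frame.

import numpy as np
import pandas as pd

def create_test_df(df, frac, cols, seed=0):
    # Hypothetical helper: hide `frac` of the values in `cols` so imputation
    # accuracy can be measured against the original df.
    rng = np.random.default_rng(seed)
    df_test = df.copy()
    for col in cols:
        idx = rng.choice(df.index, size=int(frac * len(df)), replace=False)
        df_test.loc[idx, col] = np.nan
    return df_test

def impute_df(df_incomplete, alg):
    # Run a fancyimpute solver and keep the DataFrame structure.
    return pd.DataFrame(alg.fit_transform(df_incomplete.values),
                        index=df_incomplete.index,
                        columns=df_incomplete.columns)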
Example #17
 def __init__(self, data, predict):
     self.df = data
     self.predict = predict
     self.X = None
     self.y = None
     self.X_scale = None
     self.X_train = None
     self.X_test = None
     self.y_train = None
     self.y_test = None
     self.incomplete_data = None
     self.clean_data = None
     self.methods = [
         SimpleFill(),
         KNN(1),
         KNN(2),
         KNN(3),
         KNN(4),
         KNN(5),
         IterativeSVD(),
         MatrixFactorization()
     ]
Example #18
def clean_missing(df,features):
    """Clean missing values in the dataset.

    Parameters
    ----------

    df : DataFrame

    features : List
        List of feature names.

    Returns
    -------

    features_new : List
        List of feature names after cleaning.

    Xy_filled : array-like
        Numpy array where missing values have been cleaned.
    """
    display(HTML('<h4>Clean Missing Data ...</h4>'))
#     Xy = np.concatenate((X,y.reshape((y.shape[0],1))), axis=1)
    df_preprocessed, features_new = missing_preprocess(df, features)
    print("")
    print("Choose the missing mechanism [a/b/c/d]:")
    print("a.MCAR b.MAR c.MNAR d.Skip")
    time.sleep(0.05)
    ans = input()
    if ans == 'a':
        recommend = deal_mcar(df_preprocessed)
    elif ans == 'b':
        recommend = deal_mar(df_preprocessed)
    elif ans == 'c':
        recommend = deal_mnar(df_preprocessed)
    else:
        print("Default MAR")
        recommend = deal_mar(df_preprocessed)
    print("")
    display(HTML('<b>Recommended Approach!</b>'))
    print("The recommended approach is {}".format(recommend))
    time.sleep(0.05)
    ans = input("Do you want to apply the recommended approach? [y/n]")
    print("")
    if ans == 'y':
        if recommend == 'mean':
            print("Applying mean imputation ...")
            Xy_filled = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(df_preprocessed.values)
            print("Missing values cleaned!")
        elif recommend == 'mode':
            print("Applying mode imputation ...")
            Xy_filled = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit_transform(df_preprocessed.values)
            print("Missing values cleaned!")
        elif recommend == 'knn':
            print("Applying knn imputation ...")
            with NoStdStreams():
                Xy_filled = KNN().fit_transform(df_preprocessed.values)
            print("Missing values cleaned!")
        elif recommend == 'matrix factorization':
            print("Applying matrix factorization ...")
            with NoStdStreams():
                Xy_filled = MatrixFactorization().fit_transform(df_preprocessed.values)
            print("Missing values cleaned!")
        elif recommend == 'multiple imputation':
            print("Applying multiple imputation ...")
            with NoStdStreams():
                Xy_filled = IterativeImputer().fit_transform(df_preprocessed.values)
            print("Missing values cleaned!")
        else:
            print("Error: Approach not available!")
        
    else:
        print("")
        print("Choose the approach you want to apply [a/b/c/d/e/skip]:")
        print("a.Mean b.Mode c.K Nearest Neighbor d.Matrix Factorization e. Multiple Imputation")
        ans = input()
        print("")
        if ans == 'a':
            print("Applying mean imputation ...")
            Xy_filled = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(df_preprocessed.values)
        elif ans == 'b':
            print("Applying mode imputation ...")
            Xy_filled = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit_transform(df_preprocessed.values)
        elif ans == 'c':
            print("Applying knn imputation ...")
            with NoStdStreams():
                Xy_filled = KNN().fit_transform(df_preprocessed.values)
        elif ans == 'd':
            print("Applying matrix factorization ...")
            with NoStdStreams():
                Xy_filled = MatrixFactorization().fit_transform(df_preprocessed.values)
        elif ans == 'e':
            print("Applying multiple imputation ...")
            with NoStdStreams():
                Xy_filled = IterativeImputer().fit_transform(df_preprocessed.values)
        else:
            print("Applying default method mean imputation ...")
            Xy_filled = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(df_preprocessed.values)
            
        print("Missing values cleaned!")
    return features_new, Xy_filled
Example #19
    def benchmark_complete(data, ending_density=.02, step=.01):
        '''
        Input: data array to benchmark on, the ending density at which to
        return results, and the step between density imputations

        Output: DataFrames of output density and RMSE for each method with
        respect to each input density
        '''
        # removes a randomly chosen minimum value greater than zero in each
        # iteration, checking density as it goes
        nonzeroscount = np.count_nonzero(data)
        sizel = data.shape
        totalentr = sizel[0] * sizel[1]
        end = ending_density  # final density to test
        begin = (nonzeroscount / totalentr)  # beginning density of the given matrix

        # initialize lists to store results
        density_in = []
        RMSE_empca_scores = []
        RMSE_wpca_scores = []
        RMSE_sfi_scores = []
        RMSE_siv_scores = []
        RMSE_sni_scores = []
        RMSE_smi_scores = []
        RMSE_szi_scores = []
        RMSE_wmiC_scores = []
        RMSE_wmiP_scores = []
        Density_empca = []
        Density_wpca = []
        Density_sfi = []
        Density_siv = []
        Density_sni = []
        Density_smi = []
        Density_szi = []
        Density_wmiC = []
        Density_wmiP = []

        # randomly remove values from the known matrix and try to impute them

        for d in reversed(np.arange(end, begin, step)):
            otum = data.T.copy()

            #begin density check
            nonzeroscount = np.count_nonzero(otum)
            sizel = otum.shape
            totalentr = sizel[0] * sizel[1]

            while np.float64((nonzeroscount / totalentr)) > d:
                #remove a min frequency OTU and then check density
                j = np.random.randint(0, len(otum[:][:]) - 1)
                #make sure row is not all zero (all zero row causes singular matrix)
                if sum(list(otum[j][:])) < 1:
                    continue
                m = min(i for i in list(otum[j][:]) if i > 0)
                #make sure removing value will not result in zero row
                if sum(list(otum[j][:])) == m:
                    continue
                otum[j][list(otum[j][:]).index(m)] = 0
                # check density to break
                nonzeroscount = float(np.count_nonzero(otum))
                sizel = otum.shape
                totalentr = float(sizel[0]) * float(sizel[1])

            # coerce to float and print the new density
            print("Data table of %f generated" % d)
            otum = otum.T.astype(np.float64)

            # make zero unknown for fancy impute, avoid singular matrix by taking transpose
            otum2 = otum.T.copy()
            otum2 = otum2.astype(np.float64)
            otum2[otum2 == 0] = np.nan  #make unknown nan

            #WPCA and EMPCA

            # build weight matrix: removed (now NaN) entries get low weight
            weight = otum.copy()
            for i in range(len(otum2.T)):
                for j in range(len(otum2.T[i])):
                    if np.isnan(otum2.T[i][j]):
                        weight[i][j] = 1
                    else:
                        weight[i][j] = 1000

            print("Running EMPCA")
            EMPCAi = EMPCA(n_components=3).fit_reconstruct(otum.copy(), weight)
            print("Running WPCA")
            WPCAi = WPCA(n_components=3).fit_reconstruct(otum.copy(), weight)

            # fancy impute and zeros
            print("Nuclear Norm")
            sni = NuclearNormMinimization(min_value=(np.amin(otum2)),
                                          max_value=(np.amax(otum2))).complete(
                                              otum2.copy())
            print("Running Soft Impute")
            sfi = SoftImpute(shrinkage_value=None,
                             convergence_threshold=0.00001,
                             max_iters=1000,
                             max_rank=min(otum2.shape),
                             n_power_iterations=1,
                             init_fill_method="zero",
                             min_value=(np.amin(otum2)),
                             max_value=(np.amax(otum2)),
                             normalizer=None,
                             verbose=False).complete(otum2.copy())
            print("Running Iterative SVD")
            siv = IterativeSVD(rank=(min(otum2.shape) - 1),
                               convergence_threshold=0.00001,
                               max_iters=1000,
                               gradual_rank_increase=True,
                               svd_algorithm="arpack",
                               init_fill_method="zero",
                               min_value=(np.amin(otum2)),
                               max_value=(np.amax(otum2)),
                               verbose=False).complete(otum2.copy())
            print("Running Matrix Factorization")
            smi = MatrixFactorization(rank=(min(otum2.shape) - 1),
                                      initializer=np.random.randn,
                                      learning_rate=0.01,
                                      patience=5,
                                      l1_penalty=0.05,
                                      l2_penalty=0.05,
                                      min_improvement=0.01,
                                      max_gradient_norm=5,
                                      optimization_algorithm="adam",
                                      min_value=(np.amin(otum2)),
                                      max_value=(np.amax(otum2)),
                                      verbose=False).complete(otum2.copy())
            print("Imputing by filling with zeros for base comparison")
            szi = base.zeros(otum2.copy())
            print("Weighted Mean Interpolation without phylo-distance")
            wmiC = base.wmi_wrapper(X=otum2.copy())
            print("Weighted Mean Interpolation with phylo-distance")
            phylo = pd.read_csv(
                'data/Matched_Pheno_and_Phylo_Data/matched_phylo.csv/matched_phylo.csv'
            )
            wmiP = base.wmi_wrapper(X=otum2.copy(), D_j=phylo)

            # save the results

            # input density (after values were removed)
            density_in.append(error.get_density(otum))

            # density imputed
            Density_empca.append(error.get_density(EMPCAi))
            Density_wpca.append(error.get_density(WPCAi))
            Density_sfi.append(error.get_density(sfi))
            Density_siv.append(error.get_density(siv))
            Density_sni.append(error.get_density(sni))
            Density_smi.append(error.get_density(smi))
            Density_szi.append(error.get_density(szi))
            Density_wmiC.append(error.get_density(wmiC))
            Density_wmiP.append(error.get_density(wmiP))

            # RMSE of imputed values
            missing_mask = np.isnan(
                otum2.T
            )  # masking to only check RMSE between values imputed and values removed
            RMSE_empca_scores.append(error.RMSE(data, EMPCAi, missing_mask))
            RMSE_wpca_scores.append(error.RMSE(data, WPCAi, missing_mask))
            RMSE_sfi_scores.append(error.RMSE(data, sfi.T, missing_mask))
            RMSE_siv_scores.append(error.RMSE(data, siv.T, missing_mask))
            RMSE_sni_scores.append(error.RMSE(data, sni.T, missing_mask))
            RMSE_smi_scores.append(error.RMSE(data, smi.T, missing_mask))
            RMSE_szi_scores.append(error.RMSE(data, szi.T, missing_mask))
            RMSE_wmiC_scores.append(error.RMSE(data, wmiC.T, missing_mask))
            RMSE_wmiP_scores.append(error.RMSE(data, wmiP.T, missing_mask))

        RMSEmapping = pd.DataFrame({
            'Density': list(map(int, density_in)),
            'EMPCA': RMSE_empca_scores,
            'Matrix Factorization': RMSE_smi_scores,
            'WPCA': RMSE_wpca_scores,
            'Soft Impute': RMSE_sfi_scores,
            'Iterative SVD': RMSE_siv_scores,
            'Nuclear Norm Minimization': RMSE_sni_scores,
            'Zeros Replace Unknown': RMSE_szi_scores,
            'Weighted-Mean Interpolation Correlation': RMSE_wmiC_scores,
            'Weighted-Mean Interpolation Phylo': RMSE_wmiP_scores
        })
        RMSEmapping.set_index(['Density'], inplace=True)
        Out_density = pd.DataFrame({
            'density': list(map(int, density_in)),
            'EMPCA': Density_empca,
            'Matrix Factorization': Density_smi,
            'WPCA': Density_wpca,
            'Soft Impute': Density_sfi,
            'Iterative SVD': Density_siv,
            'Nuclear Norm Minimization': Density_sni,
            'Zeros Replace Unknown': Density_szi,
            'Weighted-Mean Interpolation Correlation': Density_wmiC,
            'Weighted-Mean Interpolation Phylo': Density_wmiP
        })
        Out_density.set_index(['density'], inplace=True)

        return Out_density, RMSEmapping
Example #20
# Median
matrix_imp_median = matrix.copy()
medians = matrix.median()
for col in matrix.columns[matrix.isnull().any()]:
    matrix_imp_median.loc[matrix[col].isnull(), col] = medians[col]

# KNN
imputer = KNNImputer(n_neighbors=2, weights="uniform")
matrix_imp_KNN = pd.DataFrame(imputer.fit_transform(matrix),
                              index=matrix.index,
                              columns=matrix.columns)

# MF
matrix_imp_MF = pd.DataFrame(
    MatrixFactorization().fit_transform(matrix),
    index=matrix.index,
    columns=matrix.columns,
)

matrix_imp_MF = matrix_imp_MF.clip(lower=0)
matrix_imp_MF.to_parquet(matrix_imputed_file)

# Reduce redundancy in variables by getting only the most "differentiated"
# parent for each variable
matrix_red_var = matrix_imp_MF.copy()
for var in matrix_imp_MF.columns:
    try:
        child, parent = var.split("/")
    except ValueError:  # variable has no parent
        continue
Example #21
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from fancyimpute import MatrixFactorization

if __name__ == '__main__':
    data = pd.read_csv('user_rating_data.csv')
    u_user_id = np.array(list(data['user_id'].unique()))
    u_book_id = np.array(list(data['book_id'].unique()))

    def u_map(x):
        return np.where(u_user_id == x)[0][0]

    def b_map(x):
        return np.where(u_book_id == x)[0][0]

    data['book_id'] = data['book_id'].apply(b_map)
    data['user_id'] = data['user_id'].apply(u_map)
    matrix = coo_matrix(
        (np.array(data['rating']), (np.array(data['user_id']),
                                    np.array(data['book_id']))))
    sparse = matrix.toarray()
    sparse = np.where(sparse == 0, np.nan, sparse)
    full_m = MatrixFactorization().fit_transform(sparse)
    np.save('sparse_matrix', sparse)
    np.save('full_matrix', full_m)
    np.save('u_user_id', u_user_id)
    np.save('u_book_id', u_book_id)
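With the completed matrix and id maps saved, a predicted rating can be looked up by mapping raw ids back to matrix indices. A short usage sketch following the file names in the script above:

import numpy as np

full_m = np.load('full_matrix.npy')
u_user_id = np.load('u_user_id.npy')
u_book_id = np.load('u_book_id.npy')

def predict_rating(user_id, book_id):
    # Map raw ids to the row/column positions used when building the matrix.
    u = np.where(u_user_id == user_id)[0][0]
    b = np.where(u_book_id == book_id)[0][0]
    return full_m[u, b]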
Example #22
def impute_mf(X):
    return MatrixFactorization().complete(X)
Example #23
X, _, y = generator.generate_data_logistic(1024, min_mult=0.0, max_mult=1.0)
# X_incomplete has the same values as X except a subset have been replaced with NaN
X_incomplete, missing_mask = generator.generate_missing(X, 0.1, np.nan)

# Use 3 nearest rows which have a feature to fill in each row's missing features
X_filled_knn = KNN(k=3).fit_transform(X_incomplete)

# matrix completion using MICE
X_filled_mice = IterativeImputer().fit_transform(X_incomplete)

# matrix completion using Iterative SVD
X_filled_svd = IterativeSVD(rank=3).fit_transform(X_incomplete)

# matrix completion using Matrix Factorization
X_filled_mf = MatrixFactorization(learning_rate=0.01,
                                  rank=3,
                                  l2_penalty=0,
                                  min_improvement=1e-6).fit_transform(X_incomplete)

# matrix completion using Mean Fill
X_filled_meanfill = SimpleFill(fill_method='mean').fit_transform(X_incomplete)
# matrix completion using Median Fill
X_filled_medianfill = SimpleFill(fill_method='median').fit_transform(X_incomplete)
# matrix completion using Zero Fill
X_filled_zerofill = SimpleFill(fill_method='zero').fit_transform(X_incomplete)
# matrix completion using Min Fill
X_filled_minfill = SimpleFill(fill_method='min').fit_transform(X_incomplete)
# matrix completion using Sampled Fill
X_filled_randomfill = SimpleFill(fill_method='random').fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly, induce sparsity
# using singular value thresholding (SoftImpute)
X_filled_softimpute = SoftImpute().fit_transform(X_incomplete)

def test_matrix_factorization_with_low_rank_random_matrix():
    solver = MatrixFactorization(learning_rate=0.02, rank=5)
    XY_completed = solver.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(XY, XY_completed, missing_mask, name="MatrixFactorization")
    assert missing_mae < 0.1, "Error too high!"