Example #1
def test_solver_fill_methods_with_low_rank_random_matrix():
    for fill_method in ("zero", "mean", "median", "min", "random"):
        imputer = SimpleFill(fill_method=fill_method)
        XY_completed = imputer.fit_transform(XY_incomplete)
        _, missing_mae = reconstruction_error(
            XY,
            XY_completed,
            missing_mask,
            name="Solver with fill_method=%s" % fill_method)
        assert missing_mae < 5, "Error too high for Solver with %s fill method!" % fill_method
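The test above depends on fixtures defined elsewhere in the test suite. A minimal sketch of plausible definitions, assuming a low-rank matrix with roughly 10% of entries masked out (the names XY, XY_incomplete, missing_mask, and reconstruction_error are assumptions, not the suite's actual code):

import numpy as np
from fancyimpute import SimpleFill

n, m, inner_rank = 200, 20, 4
XY = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m))
missing_mask = np.random.rand(n, m) < 0.1
XY_incomplete = XY.copy()
XY_incomplete[missing_mask] = np.nan

def reconstruction_error(XY, XY_completed, missing_mask, name=""):
    # returns (overall MAE, MAE restricted to the missing entries)
    diff = np.abs(XY - XY_completed)
    return diff.mean(), diff[missing_mask].mean()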
Example #2
    def __init__(self, fill_method='zero', fill_missing=True, **kwargs):
        """Imputs NaN's using various filling methods like mean, zero, median, min, random


        Args:
            fill_method: How NaN's will be exchanged. Possible values: 'mean', 'zero', 'median', 'min', 'random'
            fill_missing: If True, transformer will fill NaN values by filling method
        """
        super().__init__()
        self.fill_missing = fill_missing
        self.filler = SimpleFill(fill_method)
Example #3
def imputeMethodMedain(result,
                       originData,
                       missData,
                       missRate,
                       missPattern,
                       dataType='continuous'):
    imputationMethod = "median"
    try:
        imputedData = SimpleFill("median").fit_transform(missData)
        if dataType != 'continuous':

            mark = [
                temp[0] for temp in pd.DataFrame(np.unique(missData)).dropna(
                    axis=0).values
            ]
            imputedData = modifier(imputedData, mark)
        result = addResult(result, missRate, missPattern, imputationMethod,
                           evaluate.RMSE(originData, imputedData),
                           MAE(originData, imputedData),
                           masked_mape_np(originData, imputedData))
    except Exception as e:
        print(e)
        imputedData = 'none'
        result = addResult(result, missRate, missPattern, imputationMethod,
                           np.inf, np.inf, np.inf)
    return result, imputedData
Example #4
 def __mean(self, test_data):
     """
     wrap fancyimpute-mean
     """
     test_data = mvp.df2np(test_data, [], self.verbose)
     complete_data = SimpleFill(fill_method="mean").complete(test_data)
     return complete_data
Example #5
class FillNan(BaseTransformer):
    def __init__(self, fill_method='zero', fill_missing=True, **kwargs):
        """Imputs NaN's using various filling methods like mean, zero, median, min, random


        Args:
            fill_method: How NaN's will be exchanged. Possible values: 'mean', 'zero', 'median', 'min', 'random'
            fill_missing: If True, transformer will fill NaN values by filling method
        """
        super().__init__()
        self.fill_missing = fill_missing
        self.filler = SimpleFill(fill_method)

    def transform(self, X):
        """
        Args:
            X: DataFrame with NaN's
        Returns:
            Dictionary with one key - 'X' corresponding to given DataFrame but without nan's

        """
        if self.fill_missing:
            X = self.filler.complete(X)
        return {'X': X}

    def load(self, filepath):
        self.filler = joblib.load(filepath)
        return self

    def persist(self, filepath):
        joblib.dump(self.filler, filepath)
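A minimal usage sketch for the transformer above (assumes a pre-0.4 fancyimpute, where SimpleFill exposes complete(); the toy DataFrame is hypothetical):

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [4.0, 5.0, np.nan]})
filler = FillNan(fill_method='mean')
result = filler.transform(df)  # {'X': array with NaNs replaced by column means}
print(result['X'])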
Example #6
def fill_missing_values(df):
	df = drop_high_missing_features(df)
	is_missing = pd.isnull(df).sum().sum()
	if is_missing:
		arr_complete = SimpleFill().complete(df)
		df = pd.DataFrame(arr_complete, columns = df.columns)	
	return df
Example #7
def baseline_inpute(X_incomplete, method='mean', level=0):

    if method == 'mean':
        X_filled_mean = SimpleFill().fit_transform(X_incomplete)
        return X_filled_mean
    elif method == 'knn':
        k = [3, 10, 50][level]
        X_filled_knn = KNN(k=k, verbose=False).fit_transform(X_incomplete)
        return X_filled_knn
    elif method == 'svd':
        rank = [
            np.ceil((X_incomplete.shape[1] - 1) / 10),
            np.ceil((X_incomplete.shape[1] - 1) / 5), X_incomplete.shape[1] - 1
        ][level]
        X_filled_svd = IterativeSVD(rank=int(rank),
                                    verbose=False).fit_transform(X_incomplete)
        return X_filled_svd
    elif method == 'mice':
        max_iter = [3, 10, 50][level]
        X_filled_mice = IterativeImputer(
            max_iter=max_iter).fit_transform(X_incomplete)
        return X_filled_mice
    elif method == 'spectral':
        # the default sparsity level is chosen relative to the maximum
        # singular value; here it is set heuristically
        sparsity = [0.5, None, 3][level]
        X_filled_spectral = SoftImpute(
            shrinkage_value=sparsity).fit_transform(X_incomplete)
        return X_filled_spectral
    else:
        raise NotImplementedError
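A usage sketch for baseline_inpute, assuming a NumPy array with NaNs marking the missing entries:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 8)
X[rng.rand(100, 8) < 0.2] = np.nan

X_mean = baseline_inpute(X, method='mean')
X_knn = baseline_inpute(X, method='knn', level=1)  # k=10
X_svd = baseline_inpute(X, method='svd', level=0)  # rank = ceil((d-1)/10)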
Example #8
    def impute_using_statistics(df, method='min'):
        """
        Imputes the missing values by the selected statistical property of each column

        :param df: The input dataframe that contains missing values
        :param method: The imputation method (min by default)
            "zero": fill missing entries with zeros
            "mean": fill with column means
            "median" : fill with column medians
            "min": fill with min value per column
            "random": fill with gaussian noise according to mean/std of column
        :return: the imputed dataframe
        """
        sf = SimpleFill(method)
        imputed_matrix = sf.complete(df.values)
        imputed_df = pd.DataFrame(imputed_matrix, df.index, df.columns)
        return imputed_df
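A usage sketch for the helper above, shown here as if it were a standalone function and assuming a pre-0.4 fancyimpute where SimpleFill exposes complete():

import numpy as np
import pandas as pd

df = pd.DataFrame({'x': [1.0, np.nan, 5.0], 'y': [np.nan, 2.0, 4.0]})
imputed_df = impute_using_statistics(df, method='median')
print(imputed_df)  # same index/columns, NaNs replaced by column medians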
Example #9
    def _get_imputer(self):
        if self.strategy == "simple":
            return SimpleFill()

        return {
            "knn": KNN,
            "mice": MICE,
            "matrix": MatrixFactorization,
            "soft": SoftImpute,
        }[self.strategy](verbose=False)
Example #10
    def impute_missing_values(self, value_set, strategy):
        """
        对原始数据矩阵进行填充
        :param value_set: 待处理的原始数据矩阵
        :param strategy: 1:剔除缺失值 2:高频值填充 3:属性相关关系填充 4:数据对象相似性填充
        :return: 进行填充过的数据矩阵,类型为list: (col1, col2, ...)
        """
        # Strategy 1: drop samples that contain missing values
        if strategy == 1:
            new_value_set = []
            for data_sample in value_set:
                new_data_sample = []
                if None in data_sample or 'NA' in data_sample:
                    continue
                else:
                    for data in data_sample:
                        new_data_sample.append(float(data))
                new_value_set.append(new_data_sample)
            value_array = np.array(new_value_set)

        elif strategy in [2, 3, 4]:
            # Convert value_set to a numpy array, replacing missing values with np.nan
            new_value_set = []
            for data_sample in value_set:
                new_data_sample = []
                for data in data_sample:
                    if data and data != 'NA':
                        new_data_sample.append(float(data))
                    else:
                        new_data_sample.append(np.nan)
                new_value_set.append(new_data_sample)
            value_array = np.array(new_value_set)

            # Strategy 2: fill with the most frequent value; since all attributes
            # are probability-like numeric values, the mean is used instead
            if strategy == 2:
                value_array = SimpleFill(
                    fill_method="mean").complete(value_array)

            # Strategy 3: fill using attribute correlations, based on the three most correlated columns
            elif strategy == 3:
                value_array = MICE(n_nearest_columns=3).complete(value_array)

            # Strategy 4: fill based on data-object similarity, using the 10 most similar rows
            elif strategy == 4:
                for batch in range(len(value_array) // 1000 + 1):
                    value_array[batch*1000 : min(batch*1000+1000, len(value_array))] = \
                        KNN(k = 10).complete(value_array[batch*1000 : min(batch*1000+1000, len(value_array))])
        else:
            raise ArgInputError("The strategy should be in (1,2,3,4)!")

        # Split the filled matrix into a list of per-feature columns
        feature_col_list = []
        for i in range(len(value_array[0])):
            feature_col_list.append(value_array[:, i].tolist())
        return feature_col_list
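A usage sketch for the strategies above, on a toy value_set whose missing entries are None or 'NA' (obj stands for a hypothetical instance of the enclosing class; strategy 2 replaces missing entries with column means):

value_set = [
    ['0.1', '0.9', 'NA'],
    ['0.4', None, '0.2'],
    ['0.7', '0.3', '0.5'],
]
cols = obj.impute_missing_values(value_set, strategy=2)
# cols is a list of per-feature columns, e.g. cols[0] == [0.1, 0.4, 0.7]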
Example #11
 def __init__(self, method, **kwargs):
     self.clf = None
     self.method = method
     if method == "SoftImpute":
         self.clf = SoftImpute(**kwargs)
     elif method == "KNN":
         self.clf = KNN(**kwargs)
     elif method == "Naive":
         self.clf = SimpleFill()
     elif method == 'II':
         raise NotImplementedError('NOT TESTED')
         self.clf = IterativeImputer(min_value=0)
     else:
         raise ("Not Implemented method")
def load_data(p_miss, dataset="drive", mode="mcar", para=0.5, train=None, rand_seed=42):
    np.random.seed(rand_seed)

    with open("data/" + dataset + "_x", "rb") as file:
        data_x = pickle.load(file)
    with open("data/" + dataset + "_y", "rb") as file:
        data_y = pickle.load(file)

    n = data_x.shape[0]
    p = data_x.shape[1]

    perc_miss = p_miss
    xmiss = np.copy(data_x)

    if mode == "mcar":
        xmiss_flat = xmiss.flatten()
        miss_pattern = np.random.choice(n*p, int(np.floor(n*p*perc_miss)), replace=False)
        xmiss_flat[miss_pattern] = np.nan
        xmiss = xmiss_flat.reshape([n, p])  # in xmiss, the missing values are represented by nans
    elif mode == "mar":
        fixed_len = int(np.floor(p/3))
        prob = para*np.mean(data_x[:, :fixed_len], 1)
        prob = sigmoid(prob, 0.5)
        for i in range(n):
            mask_tmp = np.random.choice([1, 0], size=p, p=[1 - prob[i], prob[i]])
            for j in range(fixed_len, p):
                if mask_tmp[j] == 0:
                    xmiss[i, j] = np.nan
        print("missing rate: ", np.sum(np.isnan(xmiss.flatten()))/(n*p))
    else:
        raise Exception("mode is not valid")

    mask = np.isfinite(xmiss)  # binary mask that marks the observed (non-missing) values

    xhat_0 = np.copy(xmiss)
    xhat_0[np.isnan(xmiss)] = 0

    x_filled = SimpleFill().fit_transform(xmiss)

    print("MSE mean imputation full data: " + str(mse(x_filled, data_x, mask)))

    if train == True:
        part = int(np.floor(n/2))
        return (n-part), p, xmiss[part:,:], xhat_0[part:,:], mask[part:,:], data_x[part:,:], data_y[part:,:]
    elif train == False:
        part = int(np.floor(n/2))
        return part, p, xmiss[:part,:], xhat_0[:part,:], mask[:part,:], data_x[:part,:], data_y[:part,:]
    elif train is None:
        return n, p, xmiss, xhat_0, mask, data_x, data_y
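load_data calls sigmoid and mse helpers that are not shown. Plausible definitions consistent with how they are called; the exact formulas are assumptions:

import numpy as np

def sigmoid(x, offset=0.5):
    # squash the MAR missingness scores into (0, 1)
    return 1.0 / (1.0 + np.exp(-(x - offset)))

def mse(x_filled, x_true, mask):
    # error over the imputed entries only (mask marks observed values)
    return np.mean((x_filled[~mask] - x_true[~mask]) ** 2)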
Example #13
def residualize_baseline(df, baseline_vars=[]):
    if len(baseline_vars) == 0:
        baseline_vars = ['Age', 'Sex']
    # remove baseline vars
    baseline = df[baseline_vars]
    data = df.copy()
    data.drop(baseline_vars, axis=1, inplace=True)
    lr = LinearRegression()
    if data.isnull().sum().sum() > 0:
        imputed = SimpleFill().fit_transform(data)
        data = pd.DataFrame(imputed, index=data.index, columns=data.columns)
    for v in data:
        y = data[v]
        lr.fit(baseline, y)
        data[v] = y - lr.predict(baseline)
    return data
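A usage sketch, assuming a DataFrame that contains the baseline columns plus the variables to residualize:

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from fancyimpute import SimpleFill

df = pd.DataFrame(np.random.randn(50, 4), columns=['Age', 'Sex', 'v1', 'v2'])
df.loc[3, 'v1'] = np.nan  # triggers the SimpleFill imputation branch
residuals = residualize_baseline(df)  # v1, v2 with Age/Sex regressed out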
Example #14
def determine_impute(df):
    """Iterates various imputation methods to find lower MSE"""
    algorithms = [
        SimpleFill(),
        KNN(1),
        KNN(2),
        KNN(3),
        KNN(4),
        KNN(5),
        IterativeSVD(),
        MatrixFactorization()
    ]
    MSE = {}
    df_incomplete = create_test_df(df, 0.7, list(T40_dict.keys()))
    for i, alg in enumerate(algorithms):
        print(alg)
        X_complete = impute_df(df_incomplete, alg)
        alg_mse = ((df - X_complete)**2).sum().mean()
        print(str(i) + alg.__class__.__name__, alg_mse)
        MSE[str(i) + alg.__class__.__name__] = alg_mse
    return MSE
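determine_impute relies on create_test_df and impute_df helpers that are not shown; a plausible sketch (the masking scheme and signatures are assumptions):

import numpy as np
import pandas as pd

def create_test_df(df, frac, columns):
    # knock out a random fraction of the entries in the given columns
    out = df.copy()
    for c in columns:
        mask = np.random.rand(len(out)) < frac
        out.loc[mask, c] = np.nan
    return out

def impute_df(df_incomplete, alg):
    filled = alg.fit_transform(df_incomplete.values)
    return pd.DataFrame(filled, index=df_incomplete.index,
                        columns=df_incomplete.columns)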
Example #15
 def __init__(self, data, predict):
     self.df = data
     self.predict = predict
     self.X = None
     self.y = None
     self.X_scale = None
     self.X_train = None
     self.X_test = None
     self.y_train = None
     self.y_test = None
     self.incomplete_data = None
     self.clean_data = None
     self.methods = [
         SimpleFill(),
         KNN(1),
         KNN(2),
         KNN(3),
         KNN(4),
         KNN(5),
         IterativeSVD(),
         MatrixFactorization()
     ]
Example #16
 def run_impute(self, X, state='train'):
     if state == 'train':
         self.train_data['ave'] = np.zeros([X.shape[0], X.shape[1]])
         for imp_method in self.impute_method:
             if imp_method == 'mean':
                 imp_ope = SimpleFill()
             elif imp_method == 'KNN':
                 imp_ope = KNN()
             elif imp_method == 'IterativeSVD':
                 imp_ope = IterativeSVD()
             elif imp_method == 'MatrixFactorization':
                 imp_ope = MatrixFactorization()
             X_filled = imp_ope.fit_transform(X)
             self.train_data[imp_method] = X_filled
             self.impute_operator[imp_method] = imp_ope
             self.train_data['ave'] += X_filled
         self.train_data['ave'] /= len(self.impute_method)
     return 0
Example #17
def prepareImputation(df):
    afterImputation = pd.DataFrame(SimpleFill().fit_transform(
        df.loc[:, df.columns != 0]))
    for c in afterImputation.columns:
        df[c + 1] = afterImputation[c].values
    return df
Example #18
        new_dataset = pd.concat([test_data, train_data], axis=0)
        train_data = train_data.values
        test_data = test_data.values
        print('train datasize:', train_data.shape, ' test datasize: ',
              test_data.shape)

        corrupted_holdout = test_data.copy()
        corrupted_holdout[:, :RNA_size] = np.nan
        df_combine = pd.DataFrame(
            np.concatenate([corrupted_holdout, train_data], axis=0))
        print('name:', cancertype, ' missing rate:', missing_perc,
              'train datasize:', train_data.shape, ' test datasize: ',
              test_data.shape)

        ############## Mean method
        X_filled = SimpleFill(fill_method="mean").fit_transform(df_combine)
        RNA_txt = pd.DataFrame(X_filled[:, :RNA_size],
                               index=shuffle_cancer.index,
                               columns=shuffle_cancer.columns[:RNA_size])
        RNA_txt.to_csv(datadir + '/filled_data/Mean_' + cancertype +
                       str(missing_perc * 100) + '_' + str(sample_count) +
                       '.csv')

        nz = test_data[:, :RNA_size].size
        nnm_mse = np.sqrt((np.linalg.norm(
            (X_filled[:test_data.shape[0], :RNA_size] -
             test_data[:, :RNA_size]))**2) / nz)
        print("Mean method, RMSE: %f" % nnm_mse)
        loss_list_Mean[cancer_c, perc, sample_count - 1] = nnm_mse

        ##############SVD
Example #19
     imputedData = modifier(imputedData, mark)
     score = evaluate.RMSE(originData, imputedData)
     ii_misc[0].append(score)
     ii_misc[1].append(MAE(originData, imputedData))
     ii_misc[2].append(masked_mape_np(originData, imputedData))
     ii_misc[3].append(TF(originData, imputedData))
     logger.info(
         "fi IterativeImputer missing rate:{},RMSE:{}".format(
             i, score))
 except Exception:
     ii_misc[0].append(np.inf)
     ii_misc[1].append(np.inf)
     ii_misc[2].append(np.inf)
     ii_misc[3].append(np.inf)
 try:
     imputedData = SimpleFill("median").fit_transform(missData)
     imputedData = modifier(imputedData, mark)
     score = evaluate.RMSE(originData, imputedData)
     median_misc[0].append(score)
     median_misc[1].append(MAE(originData, imputedData))
     median_misc[2].append(masked_mape_np(originData, imputedData))
     median_misc[3].append(TF(originData, imputedData))
     logger.info("fi median missing rate:{},RMSE:{}".format(
         i, score))
 except Exception:
     median_misc[0].append(np.inf)
     median_misc[1].append(np.inf)
     median_misc[2].append(np.inf)
     median_misc[3].append(np.inf)
 try:
     imputedData = impyute.imputation.cs.random(missData)
Example #20
from fancyimpute import (BiScaler, KNN, NuclearNormMinimization, SoftImpute,
                         SimpleFill)

n = 200
m = 20
inner_rank = 4
X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m))
print("Mean squared element: %0.4f" % (X**2).mean())

# X is a data matrix which we're going to randomly drop entries from
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

meanFill = SimpleFill("mean")
X_filled_mean = meanFill.fit_transform(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
Example #21
def run(folder, name, patients, run_all, save_imputed):
    random_seed = 123
    np.random.seed(seed=random_seed)

    X_corrupt = load_file(folder, name)
    name = name.split('.csv')[0]
    print(name)

    end = X_corrupt.shape[0]
    print(end)
    X = np.genfromtxt('./data/completeCasesBoxCox.csv', delimiter=',',
                      skip_header=1)[:end, 1:]

    scores = {}
    simple_mean_X = SimpleFill(fill_method='mean').complete(X_corrupt)
    scores['simple_mean'] = evaluate(simple_mean_X, X, X_corrupt)

    simple_median_X = SimpleFill(fill_method='median').complete(X_corrupt)
    scores['simple_median'] = evaluate(simple_median_X, X, X_corrupt)

    random_X = SimpleFill(fill_method='random').complete(X_corrupt)
    scores['random'] = evaluate(random_X, X, X_corrupt)

    # SVD sweep over ranks 1..24
    svd_results = {}
    for rank in range(1, 25):
        svd_X = IterativeSVD(rank=rank).complete(X_corrupt)
        svd_results[rank] = svd_X
        scores['svd_%d' % rank] = evaluate(svd_X, X, X_corrupt)

    # SoftImpute with the default shrinkage, then a sweep over shrinkage values
    si_X = SoftImpute().complete(X_corrupt)
    scores['si'] = evaluate(si_X, X, X_corrupt)

    si_results = {}
    for shrinkage_name, shrinkage_value in [('half', 0.5), ('1', 1), ('2', 2),
                                            ('4', 4), ('8', 8), ('16', 16),
                                            ('32', 32), ('64', 64), ('128', 128)]:
        si_s_X = SoftImpute(shrinkage_value=shrinkage_value).complete(X_corrupt)
        si_results[shrinkage_name] = si_s_X
        scores['si_s_%s' % shrinkage_name] = evaluate(si_s_X, X, X_corrupt)

    if save_imputed:
        np.savetxt('./output/sweeps/' + name + '_simple_mean.csv',
                   simple_mean_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_simple_median.csv',
                   simple_median_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_simple_random.csv',
                   random_X, delimiter=',', newline='\n')
        for rank, svd_X in svd_results.items():
            np.savetxt('./output/sweeps/' + name + '_svd_%d.csv' % rank,
                       svd_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_si.csv',
                   si_X, delimiter=',', newline='\n')
        for shrinkage_name, si_s_X in si_results.items():
            np.savetxt('./output/sweeps/' + name + '_si_s_%s.csv' % shrinkage_name,
                       si_s_X, delimiter=',', newline='\n')

    if run_all:
        mice_X = MICE().complete(X_corrupt)
        scores['MICE'] = evaluate(mice_X, X, X_corrupt)

        # lambda_reg sweep for MICE with a Bayesian ridge column model;
        # the names match the original suffixes (25 -> 0.25, ..., 001 -> 0.0001)
        lambda_regs = [('25', 0.25), ('10', 0.1), ('1', 0.01),
                       ('01', 0.001), ('001', 0.0001)]
        mice_col_results = {}
        for lambda_name, lambda_reg in lambda_regs:
            imputed = MICE(
                model=BayesianRidgeRegression(lambda_reg=lambda_reg)).complete(X_corrupt)
            mice_col_results[lambda_name] = imputed
            scores['MICE_col_lambda_reg_%s' % lambda_name] = evaluate(
                imputed, X, X_corrupt)

        mice_pmm_X = MICE(impute_type='pmm').complete(X_corrupt)
        scores['MICE_pmm'] = evaluate(mice_pmm_X, X, X_corrupt)

        mice_pmm_results = {}
        for lambda_name, lambda_reg in lambda_regs:
            imputed = MICE(
                impute_type='pmm',
                model=BayesianRidgeRegression(lambda_reg=lambda_reg)).complete(X_corrupt)
            mice_pmm_results[lambda_name] = imputed
            scores['MICE_pmm_lambda_reg_%s' % lambda_name] = evaluate(
                imputed, X, X_corrupt)

        # KNN sweep over neighborhood sizes
        knn_results = {}
        for k in [1, 3, 9, 15, 30, 81, 243, 751, 2000, 6000]:
            knn_X = KNN(k=k).complete(X_corrupt)
            knn_results[k] = knn_X
            scores['knn_%d' % k] = evaluate(knn_X, X, X_corrupt)

        if save_imputed:
            np.savetxt('./output/sweeps/' + name + '_MICE.csv',
                       mice_X, delimiter=',', newline='\n')
            for lambda_name, imputed in mice_col_results.items():
                np.savetxt('./output/sweeps/' + name +
                           '_mice_col_lambda_reg_%s.csv' % lambda_name,
                           imputed, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_mice_pmm_X.csv',
                       mice_pmm_X, delimiter=',', newline='\n')
            for lambda_name, imputed in mice_pmm_results.items():
                np.savetxt('./output/sweeps/' + name +
                           '_mice_pmm_lambda_reg_%s.csv' % lambda_name,
                           imputed, delimiter=',', newline='\n')
            for k, knn_X in knn_results.items():
                np.savetxt('./output/sweeps/' + name + '_knn_%d.csv' % k,
                           knn_X, delimiter=',', newline='\n')
    print(scores)
    scores_df = pd.DataFrame(list(scores.items()), columns=['Method', 'Score'])
    scores_df = scores_df.set_index('Method')
    scores_df.to_csv('./output/scores/' + folder + '/' + name + '.csv')
Example #22
X_filled_knn = KNN(k=3).fit_transform(X_incomplete)

# matrix completion using MICE
X_filled_mice = IterativeImputer().fit_transform(X_incomplete)

# matrix completion using Iterative SVD
X_filled_svd = IterativeSVD(rank=3).fit_transform(X_incomplete)

# matrix completion using Matrix Factorization
X_filled_mf = MatrixFactorization(learning_rate=0.01,
                                  rank=3,
                                  l2_penalty=0,
                                  min_improvement=1e-6).fit_transform(X_incomplete)

# matrix completion using Mean Fill
X_filled_meanfill = SimpleFill(fill_method='mean').fit_transform(X_incomplete)
# matrix completion using Median Fill
X_filled_medianfill = SimpleFill(fill_method='median').fit_transform(X_incomplete)
# matrix completion using Zero Fill
X_filled_zerofill = SimpleFill(fill_method='zero').fit_transform(X_incomplete)
# matrix completion using Min Fill
X_filled_minfill = SimpleFill(fill_method='min').fit_transform(X_incomplete)
# matrix completion using Sampled Fill
X_filled_randomfill = SimpleFill(fill_method='random').fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding
X_incomplete_normalized = BiScaler().fit_transform(X_incomplete)
X_filled_softimpute = SoftImpute().fit_transform(X_incomplete_normalized)

# print mean squared error for the imputation methods above
Example #23
def impute_mean(X):
    return SimpleFill("mean").complete(X)
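Note that complete() is the pre-0.4 fancyimpute API; releases from 0.4 on expose the scikit-learn style fit_transform() instead, so the same helper under the newer API would read:

def impute_mean(X):
    return SimpleFill("mean").fit_transform(X)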
Example #24
        re_X = re_X.astype(int)
        X_filled_knn = modifier(X_filled_knn, s)
        X_filled_knn = X_filled_knn.astype(int)
        logger.info("knn MSE:{}".format(MSE(imputedData, X_filled_knn)))
        logger.info("knn res MSE:{}".format(MSE(imputedData, re_X)))
        logger.info("res  change MSE:{}".format(MSE(X_filled_knn, re_X)))

        # X_filled_ii = IterativeImputer().fit_transform(mm_missData)
        # re_X = inp.revise(X_filled_ii, miss_location,
        #                   model=os.path.join(modelSavePath, '{}.pkl'.format(modelName)))
        # X_filled_ii = restore(min_max_scaler=min_max_scaler,s=s,data=X_filled_ii)
        # re_X = restore(min_max_scaler=min_max_scaler, s=s, data=re_X)
        # logger.info("ii MSE:{}".format(MSE(imputedData, X_filled_ii)))
        # logger.info("ii res MSE:{}".format(MSE(imputedData,  re_X)))

        X_filled_sf = SimpleFill().fit_transform(missData)
        re_X = inp.revise(modifier(X_filled_sf, s),
                          miss_location,
                          model=os.path.join(modelSavePath,
                                             '{}.pkl'.format(modelName)))
        re_X = modifier(re_X, s)
        re_X = re_X.astype(int)
        X_filled_sf = modifier(X_filled_sf, s)
        X_filled_sf = X_filled_sf.astype(int)
        logger.info("sf MSE:{}".format(MSE(imputedData, X_filled_sf)))
        logger.info("sf res MSE:{}".format(MSE(imputedData, re_X)))
        logger.info("res  change MSE:{}".format(MSE(X_filled_sf, re_X)))

        X_filled_me = SimpleFill("median").fit_transform(missData)
        re_X = inp.revise(modifier(X_filled_me, s),
                          miss_location,
Example #25
import numpy as np
from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, SimpleFill

n = 200
m = 20
inner_rank = 4
X = np.dot(np.random.randn(n, inner_rank), np.random.randn(inner_rank, m))
print("Mean squared element: %0.4f" % (X ** 2).mean())

# X is a data matrix which we're going to randomly drop entries from
missing_mask = np.random.rand(*X.shape) < 0.1
X_incomplete = X.copy()
# missing entries indicated with NaN
X_incomplete[missing_mask] = np.nan

meanFill = SimpleFill("mean")
X_filled_mean = meanFill.complete(X_incomplete)

# Use 3 nearest rows which have a feature to fill in each row's missing features
knnImpute = KNN(k=3)
X_filled_knn = knnImpute.complete(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)

# Instead of solving the nuclear norm objective directly,
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
Example #26
def preprocess1(dataset, mf_imputer, labelencoder, delete_rows=True):
    dataset['Gender'] = dataset['Gender'].map(lambda x: 1 if x == 'Male' else 0 if x == 'Female' else x)
    dataset['Gender'] = mf_imputer.fit_transform(dataset[['Gender']]).ravel()
    dataset['Married'] = dataset['Married'].map(lambda x: 1 if x == 'Yes' else 0 if x == 'No' else x)

    dataset['Dependents'] = dataset['Dependents'].map(lambda x: 4 if x == '3+' else x)
    dataset['Dependents'] = pd.to_numeric(dataset['Dependents'], errors='coerce')

    dataset['Self_Employed'] = dataset['Self_Employed'].map(lambda x: 1 if x == 'Yes' else 0 if x == 'No' else x)
    dataset['Property_Area'] = labelencoder.fit_transform(dataset['Property_Area'])
    dataset['Education'] = labelencoder.fit_transform(dataset['Education'])

    dataset['Gender'] = pd.to_numeric(dataset['Gender'], errors='coerce').astype(np.int8)
    dataset['Dependents'] = pd.to_numeric(dataset['Dependents'], errors='coerce').astype(np.int8)

    printValueCount(dataset)
    cols = dataset.columns
    from fancyimpute import (
        BiScaler,
        KNN,
        NuclearNormMinimization,
        SoftImpute,
        SimpleFill
    )

    X_filled_knn = KNN(k=3).complete(dataset)
    X_filled_mean = SimpleFill("mean").complete(dataset)
    X_filled_softimpute = SoftImpute().complete(dataset)

    simplefill_mse = ((X_filled_mean - dataset) ** 2).mean()
    # print("SimpleFill MSE: %f" % simplefill_mse)

    knn_mse = ((X_filled_knn - dataset) ** 2).mean()
    # print("KNN: %f" % knn_mse)

    softImpute_mse = ((X_filled_softimpute - dataset) ** 2).mean()
    # print("SoftImpute MSE: %f" % softImpute_mse)

    dataset = getDataFrame(X_filled_knn, cols)
   # printValueCount(dataset)

    return dataset
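   # NOTE: the code below this return statement is never executed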
   # df.sex = df.sex.map({'female': 1, 'male': 0})

    # dataset['Dependents'] = mf_imputer.fit_transform(dataset[['Dependents']]).ravel()
    # Impute Dependents using married people
    for row, v in dataset['Dependents'].iteritems():
        if pd.isnull(v):
            #print(row, dataset.loc[row, 'Married'])
            if (dataset.loc[row, 'Married'] == 0):
                dataset.loc[row, 'Dependents'] = 0
            else:
                dataset.loc[row, 'Dependents'] = 1

    # Impute married missing values using dependents


    dataset['Dependents'] = dataset['Dependents'].map(
        lambda x: int(x) if str(x).isalnum() and not str(x).isalpha() else raise_('Number format exception'))
    for row, v in dataset['Married'].iteritems():
        if pd.isnull(v) or v == "":
            if (dataset.loc[row, 'Dependents'] > 0):
                dataset.loc[row, 'Married'] = 1
            else:
                dataset.loc[row, 'Married'] = 0

    # dataset['Married'].groupby(dataset['Dependents']).value_counts()

    # Impute missing values of Self_Employed

    #plt.figure(figsize=(16, 6))
    #sns.boxplot(x=dataset['Self_Employed'], y=dataset['ApplicantIncome'])
    # plt.yscale("log")
    #plt.title('Self employed wise boxplot of income')
    #plt.xticks(rotation=90);

    #plt.figure(figsize=(16, 6))
    #sns.boxplot(x=dataset['Self_Employed'], y=dataset['CoapplicantIncome'])
    # plt.yscale("log")
    #plt.title('Self employed wise boxplot of CoapplicantIncome')
    #plt.xticks(rotation=90);

    income_mean = dataset['ApplicantIncome'].groupby(dataset['Self_Employed']).mean()
    for row, v in dataset['Self_Employed'].iteritems():
        if pd.isnull(v) or v == "":
            print(row, v)
            if dataset.loc[row, 'ApplicantIncome'] > income_mean[1]:
                dataset.loc[row, 'Self_Employed'] = 1
            else:
                dataset.loc[row, 'Self_Employed'] = 0

    # Impute missing values of LoanAmount
    #f, ax = plt.subplots(1, 2, figsize=(14, 6))
    #ax1, ax2 = ax.flatten()
    #sns.distplot(dataset['LoanAmount'].fillna(dataset['LoanAmount'].mean()), color='r', ax=ax1)
    #ax1.set_title('Distrbution of LoanAmount')
    #sns.boxplot(x=dataset['LoanAmount'], ax=ax2)
    #ax2.set_ylabel('')
    #ax2.set_title('Boxplot of LoanAmount')

    # Remove outliers for ApplicantIncome: keep rows with income < 15000
    #sns.distplot(dataset['ApplicantIncome'])
    if (delete_rows == True):
        dataset = dataset[dataset['ApplicantIncome'] < 15000]
    #sns.distplot(dataset['ApplicantIncome'])

    # Remove outliers for LoanAmount: keep rows with amount < 450
    #sns.distplot(dataset['LoanAmount'].fillna(dataset['LoanAmount'].mean()))
    if (delete_rows == True):
        dataset = dataset[dataset['LoanAmount'] < 450]
    else:
        dataset['LoanAmount'] = dataset['LoanAmount'].map(lambda x: 100 if x != x or x is None or x is np.nan else x)
    #sns.distplot(dataset['LoanAmount'].fillna(dataset['LoanAmount'].mean()))

    #sns.jointplot(x=dataset['LoanAmount'], y=dataset['Property_Area'], color='g')  # Do not have any relation
    #sns.jointplot(x=dataset['LoanAmount'], y=dataset['Loan_Amount_Term'], color='g')  # Do not have any relation
    #sns.jointplot(x=dataset['LoanAmount'], y=dataset['Credit_History'], color='g')  # Do not have any relation
    #sns.jointplot(x=dataset['LoanAmount'], y=dataset['ApplicantIncome'], color='g')  # Do not have any relation

    #sns.distplot(dataset['Loan_Amount_Term'].fillna(dataset['Loan_Amount_Term'].mean()))
    #sns.jointplot(x=dataset['Loan_Amount_Term'], y=dataset['Property_Area'], color='g')  # Do not have any relation
    #sns.jointplot(x=dataset['Loan_Amount_Term'], y=dataset['LoanAmount'], color='g')  # Do not have any relation
    #sns.jointplot(x=dataset['Loan_Amount_Term'], y=dataset['Credit_History'], color='g')  # Do not have any relation
    #sns.jointplot(x=dataset['Loan_Amount_Term'], y=dataset['ApplicantIncome'], color='g')  # Do not have any relation

    # It is not significantly related to any other column, so fill with the mean
    dataset['Loan_Amount_Term'].fillna(dataset['Loan_Amount_Term'].mean(), inplace=True)

    # Impute Credit_History
    #sns.distplot(dataset['Credit_History'].fillna(dataset['Credit_History'].mean()))
    #sns.jointplot(x=dataset['Credit_History'], y=dataset['Property_Area'], color='g')  # Do not have any relation
    #sns.jointplot(x=dataset['Credit_History'], y=dataset['LoanAmount'], color='g')  # Do not have any relation
    #sns.jointplot(x=dataset['Credit_History'], y=dataset['Loan_Amount_Term'], color='g')  # Do not have any relation
    #sns.jointplot(x=dataset['Credit_History'], y=dataset['ApplicantIncome'], color='g')  # Do not have any relation

    # It is not significantly related to any other column, so fill with the most frequent value
    dataset['Credit_History'] = mf_imputer.fit_transform(dataset[['Credit_History']]).ravel()
    printValueCount(dataset, 5)
    return dataset
Example #27
# from hyperopt import hp
from ray import tune
from ray.tune.suggest.hyperopt import HyperOptSearch
from utils.handle_missingdata import gene_missingdata
# space = {
#     "lr": hp.loguniform("lr", 1e-10, 0.1),
#     "momentum": hp.uniform("momentum", 0.1, 0.9),
# }

# baseline imputation methods
from ycimpute.imputer import mice
from ycimpute.utils import evaluate
from utils.base_impute import random_inpute
from fancyimpute import IterativeImputer, SimpleFill
imputation = {
    'median': SimpleFill("median").fit_transform,
    'random': random_inpute,
    'mice': mice.MICE().complete,
    'ii': IterativeImputer().fit_transform
}
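The dict maps method names to imputation callables, so a filler can be picked by key. A minimal dispatch sketch (X_missing is a hypothetical array with NaNs):

import numpy as np

X_missing = np.array([[1.0, np.nan], [3.0, 4.0]])
X_filled = imputation['median'](X_missing)  # dispatch by name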


class TAI(Solver):
    def __init__(self,
                 theta=5,
                 epochs=50,
                 use_cuda=False,
                 batch_size=64,
                 early_stop=1e-06,
                 normalizer='zero_score',
                 iterations=30,
Example #28
    for negative_log_regularization_weight in [1, 2, 3]:
        regularization_weight = 10.0 ** -negative_log_regularization_weight
        table.add_entry(
            solver=MICE(
                n_nearest_columns=25,
                n_imputations=20,
                n_burn_in=10,
                model=BayesianRidgeRegression(lambda_reg=regularization_weight),
                init_fill_method="mean",
            ),

            name="MICE_%d" % negative_log_regularization_weight)

    for fill_method in ["mean", "median"]:
        table.add_entry(
            solver=SimpleFill(fill_method=fill_method),
            name="SimpleFill_%s" % fill_method)

    for k in [1, 5, 17]:
        table.add_entry(
            solver=DenseKNN(
                k=k,
                orientation="rows"),
            name="DenseKNN_k%d" % (k,))

    for shrinkage_value in [50, 200, 800]:
        # SoftImpute without rank constraints
        table.add_entry(
            solver=SoftImpute(
                shrinkage_value=shrinkage_value),
            name="SoftImpute_lambda%d" % (shrinkage_value,))
Example #29
    table = ResultsTable(images_dict=images_dict,
                         scale_rows=False,
                         center_rows=False)

    for negative_log_regularization_weight in [2, 3, 4]:
        regularization_weight = 10.0**-negative_log_regularization_weight
        table.add_entry(solver=IterativeImputer(
            n_nearest_columns=80,
            n_iter=50,
            n_burn_in=5,
        ),
                        name="IterativeImputer_%d" %
                        negative_log_regularization_weight)

    for fill_method in ["mean", "median"]:
        table.add_entry(solver=SimpleFill(fill_method=fill_method),
                        name="SimpleFill_%s" % fill_method)

    for k in [1, 3, 7]:
        table.add_entry(solver=KNN(k=k, orientation="rows"),
                        name="KNN_k%d" % (k, ))

    for shrinkage_value in [25, 50, 100]:
        # SoftImpute without rank constraints
        table.add_entry(solver=SoftImpute(shrinkage_value=shrinkage_value),
                        name="SoftImpute_lambda%d" % (shrinkage_value, ))

    for rank in [10, 20, 40]:
        table.add_entry(solver=IterativeSVD(rank=rank,
                                            init_fill_method="zero"),
                        name="IterativeSVD_rank%d" % (rank, ))
Example #30
import torch.utils.data
from pandas import isnull
from functools import partial
from logger import logger
from sklearn.preprocessing import StandardScaler
# base classes and model
from utils.tools import Solver
from dnn.autoencoder_test_partice import Autoencoder, ResAutoencoder, StockedAutoencoder, StockedResAutoencoder
from utils.normalizer import NORMALIZERS, RECOVER


# baseline imputation methods
from ycimpute.imputer import  mice
from utils.base_impute import random_inpute
from fancyimpute import IterativeImputer, SimpleFill
imputation = {
    'median': SimpleFill("median").fit_transform,
    'random': random_inpute,
    'mice': mice.MICE().complete,
    'ii': IterativeImputer().fit_transform,
}
AUTOENCODER_METHOD = {
    'Autoencoder': Autoencoder,
    'ResAutoencoder': ResAutoencoder,
    'StockedAutoencoder': StockedAutoencoder,
    'StockedResAutoencoder': StockedResAutoencoder,
}
LOSS = {
    'MSELoss': torch.nn.MSELoss(),
    'CrossEntropyLoss': torch.nn.CrossEntropyLoss(),
}


class TAI(Solver):
    # original parameters
    def __init__(
            self,
            theta=5,
            epochs=50,
            use_cuda=False,
            batch_size=64,
            early_stop=1e-06,
            normalizer='zero_score',
            iterations=30,