import fancyimpute
import numpy as np
import pandas as pd


def impute_ages(self, data):
    # Run MICE over every column, then rebuild a DataFrame with the
    # original column names (row order is preserved by .complete()).
    #drop_survived = data.drop(['Survived'], axis=1)
    column_titles = list(data)
    mice_results = fancyimpute.MICE().complete(np.array(data))
    results = pd.DataFrame(mice_results, columns=column_titles)
    #results['Survived'] = list(data['Survived'])
    return results
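
# Minimal usage sketch (hypothetical data, not part of the original example;
# assumes fancyimpute<=0.3, where MICE and .complete() exist). 'self' is
# unused by the body above, so the method is called as a plain function.
_demo = pd.DataFrame({'Age': [22.0, np.nan, 35.0],
                      'Fare': [7.25, 71.28, np.nan]})
print(impute_ages(None, _demo))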
def Imputer(method, n_nearest_columns=25):
    """Return an impute(data) closure for the requested method.

    n_nearest_columns was an undefined free variable in the original; it is
    exposed here as a parameter (default assumed).
    """
    if method == 'MICE':
        imputer = fancyimpute.MICE(
            n_nearest_columns=n_nearest_columns,
            min_value=0.0,
            verbose=False,
        )

        def impute(data):
            return imputer.complete(data)
    elif method in ('fancymean', 'zero', 'fancymedian', 'min', 'random'):
        imputer = fancyimpute.SimpleFill(
            min_value=0.0,
            fill_method=method.replace('fancy', ''),  # SimpleFill expects 'mean'/'median', not 'fancymean'
        )

        def impute(data):
            return imputer.complete(data)
    elif method in ('mean', 'median', 'most_frequent'):
        import sklearn.preprocessing
        imputer = sklearn.preprocessing.Imputer(
            strategy=method,
        )

        def impute(data):
            return imputer.fit_transform(data)
    elif method == 'drop':
        def impute(data):
            raise NotImplementedError
    else:
        raise ValueError('unknown imputation method: %r' % method)

    return impute
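
# Usage sketch for the Imputer factory above (hypothetical data; the sklearn
# branch assumes scikit-learn<0.22, where sklearn.preprocessing.Imputer
# still exists):
X = np.array([[1.0, 2.0], [np.nan, 3.0], [4.0, np.nan]])
impute = Imputer('mean')  # sklearn mean-imputation branch
print(impute(X))          # NaNs replaced by column means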
Example #3
def test_cross_validation_with_imputation():
    imputer = fancyimpute.MICE(n_imputations=2,
                               n_burn_in=1,
                               n_nearest_columns=25)
    train_data = (mhcflurry.dataset.Dataset.from_csv(
        get_path("data_kim2014", "bdata.2009.mhci.public.1.txt")).get_alleles(
            ["HLA-A0201", "HLA-A0202", "HLA-A0301"]))

    folds = cross_validation_folds(train_data,
                                   n_folds=3,
                                   imputer=imputer,
                                   drop_similar_peptides=True,
                                   alleles=["HLA-A0201", "HLA-A0202"])

    eq_(set(x.allele for x in folds), {"HLA-A0201", "HLA-A0202"})
    eq_(len(folds), 6)

    for fold in folds:
        eq_(fold.train.unique_alleles(), set([fold.allele]))
        eq_(fold.imputed_train.unique_alleles(), set([fold.allele]))
        eq_(fold.test.unique_alleles(), set([fold.allele]))

    models = HYPERPARAMETER_DEFAULTS.models_grid(activation=["tanh", "relu"],
                                                 layer_sizes=[[4]],
                                                 embedding_output_dim=[8],
                                                 n_training_epochs=[3])
    print(models)

    df = train_across_models_and_folds(folds, models)
    print(df)
    assert df.test_auc.mean() > 0.6
Example #4
def impute_value(self, df, method="MICE"):
    """
    Impute missing values with MICE (default), KNN, or a simple fill.
    """
    if method == "MICE":
        return fi.MICE(verbose=False).complete(df)
    elif method == "KNN":
        return fi.KNN(k=4, verbose=False).complete(df)
    else:
        return fi.SimpleFill().complete(df)
Example #5
import numpy as np
import pandas as pd
import fancyimpute


def impute(data, **kwargs):
    """Impute missing values; keyword arguments are forwarded unchanged to
    fancyimpute.MICE (e.g. n_imputations or other MICE arguments)."""
    impute_missing = data
    impute_missing_cols = list(impute_missing)
    filled_soft = fancyimpute.MICE(**kwargs).complete(np.array(impute_missing))
    results = pd.DataFrame(filled_soft, columns=impute_missing_cols)
    assert results.isnull().sum().sum() == 0, 'Not all NAs removed'
    return results
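
# Usage sketch (hypothetical data, not part of the original example):
# keyword arguments pass straight through to fancyimpute.MICE.
toy = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [np.nan, 5.0, 6.0]})
print(impute(toy, n_imputations=10, verbose=False))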
Example #6
    def score_rent(self):
        X_train, X_test, y_train, y_test, contin_cols = self.preprocessing()
        # separate continuous and categorical columns

        con_train = X_train[contin_cols]
        cat_train = X_train[X_train.columns.difference(contin_cols)]

        # fancyimpute MICE on the continuous train data
        mice = fancyimpute.MICE(verbose=0)
        con_train = np.asarray(con_train)
        con_train_mice = mice.complete(con_train)

        # fancyimpute KNN on the categorical train data
        cat_train = np.asarray(cat_train)
        cat_train_fancyknn = fancyimpute.KNN().complete(cat_train)
        cat_train_fancyknn = np.round(cat_train_fancyknn).astype(int)

        #apply boxcox transformation to continuous train data
        con_train_mice_bc = np.empty(con_train_mice.shape)
        from scipy import stats
        for i in range(len(contin_cols)):
            if np.argwhere(con_train_mice[:, i] < 0).size == 0:
                x = stats.boxcox(con_train_mice[:, i] + 1e-5)[0]
                x = np.asarray([x])
                con_train_mice_bc[:, i] = x
            else:
                con_train_mice_bc[:, i] = con_train_mice[:, i]

        # apply onehot to categorical train data
        enc = OneHotEncoder()
        enc = enc.fit(cat_train_fancyknn)
        oh = enc.transform(cat_train_fancyknn).toarray()
        cat_train_fancyknn_onehot = np.round(oh).astype(int)

        #concatenate imputed train data
        X_train_imp = np.concatenate(
            (cat_train_fancyknn_onehot, con_train_mice_bc), axis=1)

        # Feature selection using Lasso
        select_lassocv = SelectFromModel(LassoCV())
        select_lassocv = select_lassocv.fit(X_train_imp, y_train)

        # grid search over the Lasso regularization strength
        param_grid = {'alpha': np.logspace(-3, 0, 14)}
        print(param_grid)
        grid = GridSearchCV(Lasso(normalize=True, max_iter=1e6),
                            param_grid,
                            cv=10)

        # make_pipeline to prevent information leakage
        pipe_lassocv = make_pipeline(MinMaxScaler(), select_lassocv, grid)
        pipe_lassocv = pipe_lassocv.fit(X_train_imp, y_train)
        train_r2 = np.mean(
            cross_val_score(pipe_lassocv, X_train_imp, y_train, cv=5))
        return contin_cols, enc, pipe_lassocv, train_r2, X_test, y_test
Example #7
def impute(data):
    """Impute missing values in the Age, Deck, Embarked, and Fare features.
    """
    impute_missing = data.drop(['Survived', 'Train'], axis=1)
    impute_missing_cols = list(impute_missing)
    filled_soft = fancyimpute.MICE().complete(np.array(impute_missing))
    results = pd.DataFrame(filled_soft, columns=impute_missing_cols)
    results['Train'] = list(data['Train'])
    results['Survived'] = list(data['Survived'])
    assert results.isnull().sum().sum() == 0, 'Not all NAs removed'
    return results
Example #8
def imputate_continuous(data_train):
    '''Impute the continuous (non-object) columns with MICE, falling back
    to the raw values if MICE fails.'''
    continuous = data_train.columns[data_train.dtypes != "object"]
    index_train = data_train.index
    X_train = data_train[continuous].values  # .as_matrix() was removed in pandas 1.0
    try:
        X_train_fancy_mice = fancyimpute.MICE(verbose=0).complete(X_train)
        data_train_continuous = pd.DataFrame(X_train_fancy_mice,
                                             columns=continuous,
                                             index=index_train)
    except Exception:
        data_train_continuous = pd.DataFrame(X_train,
                                             columns=continuous,
                                             index=index_train)
    return data_train_continuous
Example #9
def parametric_input(df):
    '''
    Use MICE to fill missing continuous variables, adding indicator
    columns that record which values were missing.

    Returns a transformed dataframe.

    Parameters
    ----------
    df: pandas.DataFrame of float or int columns.
    '''
    # Fill values using MICE.
    mice_matrix = fancyimpute.MICE(n_imputations=50).complete(df.values)
    mice_df = pd.DataFrame(mice_matrix)
    mice_df.columns = df.columns
    mice_df.index = df.index

    # Add an indicator dataframe.
    ismissing_df = create_ismissing_df(df)
    return pd.concat([mice_df, ismissing_df], axis=1)
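
# create_ismissing_df is not defined in this example; a minimal sketch of
# what it plausibly does (one 0/1 indicator column per input column):
def create_ismissing_df(df):
    ismissing = df.isnull().astype(int)
    ismissing.columns = ['%s_ismissing' % c for c in df.columns]
    return ismissing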
Example #10
def test_imputation():
    imputer = fancyimpute.MICE(n_imputations=2,
                               n_burn_in=1,
                               n_nearest_columns=25)
    train_data = (mhcflurry.dataset.Dataset.from_csv(
        get_path("data_kim2014", "bdata.2009.mhci.public.1.txt")).get_alleles(
            ["HLA-A0201", "HLA-A0202", "HLA-A0301"]))

    folds = cross_validation_folds(train_data,
                                   n_folds=3,
                                   imputer=imputer,
                                   drop_similar_peptides=True,
                                   alleles=["HLA-A0201", "HLA-A0202"])

    eq_(set(x.allele for x in folds), {"HLA-A0201", "HLA-A0202"})
    eq_(len(folds), 6)

    for fold in folds:
        eq_(fold.train.unique_alleles(), set([fold.allele]))
        eq_(fold.imputed_train.unique_alleles(), set([fold.allele]))
        eq_(fold.test.unique_alleles(), set([fold.allele]))
Example #11
    def predict_rent(self):
        contin_cols, enc, pipe_lassocv, train_r2, X_test, y_test = self.score_rent(
        )
        con_test = X_test[contin_cols]
        cat_test = X_test[X_test.columns.difference(contin_cols)]

        # impute the test data the same way (continuous columns)
        mice = fancyimpute.MICE(verbose=0)
        con_test = np.asarray(con_test)
        con_test_mice = mice.complete(con_test)

        #categorical data
        cat_test = np.asarray(cat_test)
        cat_test_fancyknn = fancyimpute.KNN().complete(cat_test)
        cat_test_fancyknn = np.round(cat_test_fancyknn).astype(int)

        #apply boxcox transformation to continuous test data
        con_test_mice_bc = np.empty(con_test_mice.shape)
        from scipy import stats
        for i in range(len(contin_cols)):
            if np.argwhere(con_test_mice[:, i] < 0).size == 0:
                x = stats.boxcox(con_test_mice[:, i] + 1e-5)[0]
                x = np.asarray([x])
                con_test_mice_bc[:, i] = x
            else:
                con_test_mice_bc[:, i] = con_test_mice[:, i]

        # apply the train-fitted one-hot encoder to the categorical test data
        oh = enc.transform(cat_test_fancyknn).toarray()
        cat_test_fancyknn_onehot = np.round(oh).astype(int)
        print("Finished onehot")
        #concatenate imputed test data
        X_test_imp = np.concatenate(
            (cat_test_fancyknn_onehot, con_test_mice_bc), axis=1)

        # make prediction based on training model
        y_pred = pipe_lassocv.predict(X_test_imp)
        test_r2 = r2_score(y_test, y_pred)
        print(test_r2)
        return X_test, y_test, y_pred
Example #12
import fancyimpute as fi


def impute_value(df):
    mice_impute = fi.MICE().complete(df)
    return mice_impute


import numpy as np

if __name__ == '__main__':
    # Quick demo: knock out one entry and recover it with MICE.
    w = np.random.randn(10, 2)
    w_n = w.copy()
    w_n[3, 1] = np.nan
    w_c = fi.MICE().complete(w_n)
    print(w_c, w)
Example #13
def impute_value(df):
    mice_impute = fi.MICE().complete(df)
    return mice_impute
Example #14
def preprocess(trainfile,
               testfile,
               outputdir,
               useless_attr,
               miss_threshold,
               xstrategy,
               ymin,
               ymax,
               ystrategy,
               fill_method="MICE",
               normal01=True):
    """对XY进行数据预处理,矩阵补全、正则化标准化等。

    :param trainfile: string, 训练集(d_train_20180102.csv)的路径
    :param testfile: string, 测试集(d_test_A_20180102.csv)的路径
    :param outputdir: string, 预处理后文件保存的路径
    :param useless_attr: list, 需要删除的无用属性,比如[0, 1, 2, 3]
    :param miss_threshold: float, 属性确实严重忽略的阈值,百分比,比如0.7
    :param xstrategy: string, 对x中奇异点的处理方式{"replace", "nothing"}
    :param ymin: float, 对Y中点的最小值,小于这个值,即为奇异点
    :param ymax: float, 对Y中点的最大值,超过这个值,就是奇异点
    :param ystrategy: string, 对y中奇异点的处理方式("delete", "replace", "nothing")
    :param fill_method: string, 矩阵补全的策略,{"KNN", "SoftI", "MF", "MICE"}
    :param normal01: bool, 如果为真,则对结果进行归一化到01,否则,不归一化
    :return: list, 归一化之后的trainX, trainY, testX
    """
    # 0. Read the training and test sets
    train_XY = convert(trainfile)
    test_X = convert(testfile)
    print("Data sets loaded; starting preprocessing")

    # 1. Drop the useless attribute columns
    train_id = train_XY[:, 0:1]
    test_id = test_X[:, 0:1]
    train_XY = np.delete(train_XY, useless_attr, axis=1)
    test_X = np.delete(test_X, useless_attr, axis=1)
    n_test = test_X.shape[0]
    info1 = "1. 删除train_XY, test_X上的无用属性:%s, train_X.shape=%s, test_X.shape=%s"\
          %(str(useless_attr), str(train_XY.shape), str(test_X.shape))
    print(info1)

    # 2. Drop columns with too many missing values
    miss_mask = np.isnan(train_XY)
    n = miss_mask.shape[0]
    column_del = []  # list of columns to delete
    for i in range(miss_mask.shape[1]):
        miss_n = miss_mask[:, i].sum()
        if miss_n / n >= miss_threshold:
            column_del.append(i)
    train_XY = np.delete(train_XY, column_del, axis=1)
    test_X = np.delete(test_X, column_del, axis=1)
    info2 = "2. Dropped attributes missing in more than %f%% of rows from train_XY and test_X: %s" % (
        miss_threshold * 100, str(column_del))
    print(info2)

    # 3. Denoise y using manually set thresholds
    train_Y = train_XY[:, -1:]
    upper_mask = train_Y > ymax
    lower_mask = train_Y < ymin
    if ystrategy == "replace":
        train_Y[upper_mask] = ymax
        train_Y[lower_mask] = ymin
    elif ystrategy == "delete":
        index = np.array(np.arange(0, train_Y.shape[0], 1), ndmin=2).T
        chsn_mask = upper_mask | lower_mask
        train_XY = np.delete(train_XY, index[chsn_mask], axis=0)
        train_id = np.delete(train_id, index[chsn_mask], axis=0)
    elif ystrategy == "nothing":
        pass
    else:
        raise ValueError(r"'ystrategy'应该是{nothing, replace, delete}中的一个")
    train_Y = train_XY[:, -1:]
    print("3. 对trainY去噪(%s),trainXY.shape=%s" % (ystrategy, train_XY.shape))

    # 4. Process X, computing outlier thresholds via boxplot
    train_X = train_XY[:, :-1]
    all_X = np.concatenate([train_X, test_X], axis=0)
    attr_n = train_XY.shape[1] - 1
    attr_min_max = np.zeros(
        (attr_n, 2), dtype=np.float64)  # per-attribute min/max thresholds from the boxplot
    if xstrategy == "nothing":
        pass
    elif xstrategy == "replace":
        # Replace outliers in X with the extreme values
        for i in range(attr_n):
            # Shallow copy of each column: operating on crt_attr mutates all_X in place
            crt_attr = all_X[:, i:i + 1]
            miss = np.isnan(crt_attr)
            box_dic = plt.boxplot(crt_attr[~miss])
            crt_max = box_dic["caps"][0].get_ydata()[0]
            crt_min = box_dic["caps"][1].get_ydata()[0]
            if crt_max < crt_min:
                crt_min, crt_max = crt_max, crt_min
            attr_min_max[i, 0] = crt_min
            attr_min_max[i, 1] = crt_max
            crt_attr[miss] = 0
            upper_mask = crt_attr > crt_max
            lower_mask = crt_attr < crt_min
            upper_mask &= ~miss
            lower_mask &= ~miss

            crt_attr[upper_mask] = crt_max
            crt_attr[lower_mask] = crt_min
            crt_attr[miss] = np.nan
    else:
        raise ValueError("'xstrategy' must be one of {nothing, replace}")
    print("4. Denoised all of X (%s)." % xstrategy)

    # 5. Matrix completion
    completer = None
    if fill_method == "KNN":
        completer = fi.KNN(verbose=False)
    elif fill_method == "SoftI":
        completer = fi.SoftImpute(verbose=False)
    elif fill_method == "MF":
        completer = fi.MatrixFactorization(verbose=False)
    elif fill_method == "MICE":
        completer = fi.MICE(verbose=False)
    else:
        raise ValueError("'fill_method' must be one of {'KNN','SoftI','MF','MICE'}.")
    all_X_complete = completer.complete(all_X)
    print("5. Completed all_X via matrix completion (%s)." % fill_method)

    # train_X = all_X_complete[:-1000, :]
    # test_X = all_X_complete[-1000:, :]
    # 6. Standardize, then rescale to [0, 1]
    if normal01:
        X_nmler = StandardScaler()
        X_01 = MinMaxScaler()
        Y_nmler = StandardScaler()
        Y_01 = MinMaxScaler()

        X_nmler.fit(all_X_complete)
        Y_nmler.fit(train_Y)
        all_X_nml = X_nmler.transform(all_X_complete)
        train_Y_nml = Y_nmler.transform(train_Y)
        X_01.fit(all_X_nml)
        Y_01.fit(train_Y_nml)
        all_X_nml01 = X_01.transform(all_X_nml)
        train_Y_nml01 = Y_01.transform(train_Y_nml)
        final_train_X = all_X_nml01[:-n_test, :]
        final_test_X = all_X_nml01[-n_test:, :]
        final_train_Y = np.concatenate([train_Y_nml01, train_Y], axis=1)
    else:
        final_train_X = all_X_complete[:-n_test, :]
        final_test_X = all_X_complete[-n_test:, :]
        final_train_Y = train_Y
    print(r"6. 对all_X, train_Y归一化到01(%s)." % normal01)

    # 7. Save the data
    print("7. Saving data as: <set>_<kind>_<date>.csv (%s)." % outputdir)
    # timestamp = datetime.now().strftime("%Y%m%d%H%M")
    timestamp = "0000"
    np.savetxt(outputdir + r"\train_X_" + timestamp + ".csv",
               final_train_X,
               delimiter=",")
    np.savetxt(outputdir + r"\test_X_" + timestamp + ".csv",
               final_test_X,
               delimiter=",")
    np.savetxt(outputdir + r"\train_Y_" + timestamp + ".csv",
               final_train_Y,
               delimiter=",")
    np.savetxt(outputdir + r"\train_id_" + timestamp + ".csv",
               train_id.astype(np.int64),
               delimiter=",")
    np.savetxt(outputdir + r"\test_id_" + timestamp + ".csv",
               test_id.astype(np.int64),
               delimiter=",")
    # Return the normalized data, as the docstring promises
    return final_train_X, final_train_Y, final_test_X, train_id
Example #15
import numpy as np
import fancyimpute
import graphviz
from pandas import read_csv, DataFrame as df
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn import metrics
from sklearn import preprocessing
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import _tree

dataset = read_csv('realdata3.csv')
modifiedData = dataset.fillna(np.NaN)  # no-op as written: filling NaN with NaN changes nothing
print(modifiedData.head(5))
d = modifiedData

d1 = fancyimpute.MICE().complete(d)
newd = df(data=d1, index=d.index, columns=list(d.columns))
newd.to_csv('test2.csv')

#criterion = "entropy", max_depth = 7, min_samples_split=500, min_samples_leaf=500
outcome_var = 'BAD'
model = tree.DecisionTreeClassifier(criterion="entropy",
                                    max_depth=12,
                                    min_samples_split=500,
                                    min_samples_leaf=200)
predictor_var = [
    'LOAN', 'MORTDUE', 'REASON', 'VALUE', 'DELINQ', 'DEROG', 'CLAGE', 'Other',
    'Office', 'Sales', 'ProfExe'
]

X_train, X_test, y_train, y_test = train_test_split(
    newd[predictor_var], newd[outcome_var],
    test_size=0.2, random_state=0)  # the call was truncated in the source; the target and split arguments are assumed

Example #16
def fancy_impute(df, method='mice'):
    if method == 'knn':
        df = pd.DataFrame(data=fancyimpute.KNN(3).complete(df),
                          columns=df.columns, index=df.index)
    else:
        df = pd.DataFrame(data=fancyimpute.MICE().complete(df),
                          columns=df.columns, index=df.index)
    return df
Example #17

# List containing names of the features
column_list = ['Pregnancies','Glucose','BloodPressure','SkinThickness',
               'Insulin','BMI','DiabetesPedigreeFunction','Age','Outcome']
df = pd.read_csv('Pimadiabetes.csv', names = column_list, header = None)

# Treat 0 in the biological variables other than number of times pregnant and outcome as missing values
replace_cols = [i for i in column_list if i not in ['Outcome','Pregnancies']]
df[replace_cols] = df[replace_cols].replace(0, np.nan)

# Median Imputation Technique for BMI, BloodPressure, Glucose
df[['BMI','BloodPressure','Glucose']] = df[['BMI','BloodPressure','Glucose']].fillna(df.median())

# Multiple Imputation Technique for SkinThickness and Insulin
df[['Insulin','SkinThickness']] = imp.MICE().complete(df[['Insulin','SkinThickness']])  # 'imp' is fancyimpute (import fancyimpute as imp)

# Splitting dataframe into predictors and outcome
Y_data = df['Outcome']
X_data = df.drop(['Outcome'], axis=1)

# Splits up the data set into 80% train 20% test
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size = .2, random_state = 13)

# Dictionary of Classifiers 
dict_classifiers = {
    "Logistic Regression": LogisticRegression(),
    "AdaBoost": AdaBoostClassifier(n_estimators=100),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100,criterion='entropy')}  
Example #18
def preprocess(train_data, test_data, fill_times=1000, ignore_columns=['id']):
    """Preprocess the training and test data.

    1. Fill in missing values
    2. Normalize the data

    :param train_data: DataFrame, training data read with pandas
    :param test_data: DataFrame, test data read with pandas
    :return: X_train, y_train, X_test
    """
    feature_name = test_data.columns
    label_name = list(set(train_data.columns) - set(feature_name))
    train_label = train_data[label_name]
    train_feature = train_data[feature_name]
    test_feature = test_data
    test_feature = test_feature.set_index(test_feature.index + 1000)
    train_index = train_feature.index
    test_index = test_feature.index
    all_feature = pd.concat([train_feature, test_feature], axis=0)
    all_index = all_feature.index

    snp_columns = [
        'SNP1', 'SNP2', 'SNP3', 'SNP4', 'SNP5', 'SNP6', 'SNP7', 'SNP8', 'SNP9',
        'SNP10', 'SNP11', 'SNP12', 'SNP13', 'SNP14', 'SNP15', 'SNP16', 'SNP17',
        'SNP18', 'SNP19', 'SNP20', 'SNP21', 'SNP22', 'SNP23', 'SNP24', 'SNP25',
        'SNP26', 'SNP27', 'SNP28', 'SNP29', 'SNP30', 'SNP31', 'SNP32', 'SNP33',
        'SNP34', 'SNP35', 'SNP36', 'SNP37', 'SNP38', 'SNP39', 'SNP40', 'SNP41',
        'SNP42', 'SNP43', 'SNP44', 'SNP45', 'SNP46', 'SNP47', 'SNP48', 'SNP49',
        'SNP50', 'SNP51', 'SNP52', 'SNP53', 'SNP54', 'SNP55'
    ]
    other_columns = set(all_feature.columns) - set(snp_columns)
    other_columns = list(other_columns)
    all_feature[snp_columns] = all_feature[snp_columns].fillna(0)
    snp_feature = all_feature[snp_columns]
    snp_scale = MinMaxScaler()
    snp_scale.fit([[0], [3]])
    snp_feature01 = snp_scale.transform(snp_feature)
    snp_feature_final = pd.DataFrame(snp_feature01,
                                     columns=snp_columns,
                                     index=all_index)
    ######################
    # Missing-value filling
    ######################
    feature_complete = []
    t0 = time()
    for i in range(fill_times):
        mice_data = fi.MICE(verbose=False).complete(all_feature[other_columns])
        feature_complete.append(mice_data)
        remain_time = (time() - t0) / (i + 1) * (fill_times - i - 1)
        print("第 %2.d/%d 次填充, 剩余时间 %.0f s" % (i + 1, fill_times, remain_time))
    feature_complete = np.array(feature_complete)
    feature_filled = feature_complete.mean(axis=0)
    all_feature[other_columns] = feature_filled
    other_feature = all_feature[other_columns]
    int_column = [
        '年龄', '孕次', '产次', 'BMI分类', '收缩压', '舒张压', 'ALT', 'AST', 'Lpa', 'DM家族史',
        'ACEID'
    ]
    float2_column = [
        'BUN', 'ApoA1', 'CHO', 'wbc', '孕前体重', 'HDLC', 'Cr', 'RBP4', 'ApoB',
        '分娩时', '身高', '糖筛孕周', 'TG', 'LDLC', 'hsCRP'
    ]
    float5_column = ['孕前BMI']
    float6_column = ['VAR00007']
    other_feature.loc[:, int_column] = other_feature[int_column].round()
    other_feature.loc[:, float2_column] = other_feature[float2_column].round(2)
    other_feature.loc[:, float5_column] = other_feature[float5_column].round(5)
    other_feature.loc[:, float6_column] = other_feature[float6_column].round(6)
    ######################
    # Normalization
    ######################
    id_feature = other_feature[ignore_columns]

    feature_need_nml = other_feature.drop(ignore_columns, axis=1)
    feature_nml = scale(feature_need_nml.values, axis=0)
    feature_nml01 = minmax_scale(feature_nml, axis=0)
    other_feature_final = pd.DataFrame(feature_nml01,
                                       columns=feature_need_nml.columns,
                                       index=all_index)

    feature_final = pd.concat(
        [id_feature, snp_feature_final, other_feature_final], axis=1)
    train_feature = pd.DataFrame(feature_final.loc[train_index],  # .ix was removed in pandas 1.0
                                 columns=feature_name)
    train_label = pd.DataFrame(train_label, columns=["label"])
    test_feature = pd.DataFrame(feature_final.loc[test_index],
                                columns=feature_name)
    return train_feature, train_label, test_feature
Example #19
def impute(df):
    return pd.DataFrame(fancyimpute.MICE().complete(np.array(df)),
                        columns=list(df))
Example #20
        print('MF imputation')
        print(get_loss(X_c, X_mf_c, Y_c))


# MICE imputation
# Since MICE cannot handle singular matrices, we run it in batches

X_mice = []

# the data matrix of a single patient is singular, so we merge a batch of matrices before running MICE

n = len(X)
batch_size = 128
nb_batch = (n + batch_size - 1) // batch_size

for i in range(nb_batch):
    print('On batch {}'.format(i))
    x = np.concatenate(X[i * batch_size: (i + 1) * batch_size])
    y = np.concatenate(Y[i * batch_size: (i + 1) * batch_size])
    x_mice = fancyimpute.MICE(n_imputations=100, n_pmm_neighbors=20, verbose=False).complete(x)

    X_mice.append(x_mice)

X_mice = np.concatenate(X_mice, axis=0)
X_c = np.concatenate(X, axis=0)
Y_c = np.concatenate(Y, axis=0)

print('MICE imputation')
print(get_loss(X_c, X_mice, Y_c))
Example #21

# 3- pip install fancyimpute

# If it still causes problems, check the additional required dependencies; in my case it was ipykernel
# pip install ipykernel
# and then: pip install fancyimpute

import pandas as pd
import numpy as np

datoNum = data.select_dtypes(include=[np.floating]).values  # .as_matrix() and np.float are removed in newer pandas/numpy
fecha = data.select_dtypes(include=[object]).values

datoNum = pd.DataFrame(datoNum)
fecha = pd.DataFrame(fecha)

import fancyimpute
datoNumcomp = pd.DataFrame(fancyimpute.MICE().complete(datoNum))

datos_completos = pd.concat([fecha, datoNumcomp], axis=1)

datos_completos.columns = data.columns
datos_completos.index = data.index

datos_completos


datos_completos.isnull().any().any()

Example #22
def impute_mice(X):
    # fancyimpute is a pip-installable package offering kNN, nuclear norm
    # minimization, SoftImpute, MICE, matrix factorization, and iterative
    # low-rank SVD imputation.
    X_new = fancyimpute.MICE(n_imputations=100).complete(X)
    return X_new
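
# Quick self-check (hypothetical data, not part of the original example;
# assumes fancyimpute<=0.3, where MICE and .complete() exist):
import numpy as np
import fancyimpute
X = np.random.randn(20, 4)
X[2, 1] = np.nan
X[7, 3] = np.nan
assert not np.isnan(impute_mice(X)).any()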