def get_results_mice_imputation_includingy(X_incomplete, y):
    """MICE-style multiple imputation that also uses y while imputing X.

    Runs m independent posterior-sampling imputations of [X | y], fits a
    linear model on each imputed X (y is stripped back out so it is never
    a predictor of itself), and pools coefficients with Rubin's rules.

    Returns (Qbar, T, mice_errorbar): pooled coefficients, total variance,
    and the 95% normal-approximation error bar.
    """
    m = 5
    multiple_imputations = []
    # Loop-invariant: stack y as an extra column once so it informs the
    # imputation model.
    Xy = np.column_stack((X_incomplete, y))
    for i in range(m):
        # FIX: released scikit-learn has no `n_iter` parameter on
        # IterativeImputer; the keyword is `max_iter`.
        imputer = IterativeImputer(max_iter=100, sample_posterior=True,
                                   random_state=i)
        imputer.fit(Xy)
        data_imputed = imputer.transform(Xy)
        # We save only the imputed X because we do not want to use y to
        # predict y later on.
        X_imputed = data_imputed[:, :-1]
        multiple_imputations.append(X_imputed)

    # Fit a linear model per imputed dataset; collect the coefficient
    # estimates and their variances.
    m_coefs = []
    m_vars = []
    for i in range(m):
        estimator = LinearRegression()
        estimator.fit(multiple_imputations[i], y)
        y_predict = estimator.predict(multiple_imputations[i])
        m_coefs.append(estimator.coef_)
        m_vars.append(calculate_variance_of_beta_estimates(
            y, y_predict, multiple_imputations[i]))

    # Pool the per-imputation estimates by applying Rubin's rules.
    Qbar = calculate_Qbar(m_coefs)
    T = calculate_T(m_coefs, m_vars, Qbar)
    mice_errorbar = 1.96 * np.sqrt(T)  # 95% half-width
    return Qbar, T, mice_errorbar
def MultiIterBayesian(dataset):
    """Impute the test split with a BayesianRidge-backed IterativeImputer
    fitted on the train split, then report RMSE over the masked entries.

    A mask value of 0 in train_m / test_m marks a missing cell.
    """
    Dim = dataset['d']  # read for parity with siblings (unused here)
    trainX = dataset['train_x']
    testX = dataset['test_x']
    trainM = dataset['train_m']
    testM = dataset['test_m']

    # Blank out masked entries before fitting/transforming.
    train_X = trainX.copy()
    test_X = testX.copy()
    train_X[trainM == 0] = np.nan
    test_X[testM == 0] = np.nan

    # Bayesian imputation
    imputer = IterativeImputer(random_state=0, estimator=BayesianRidge())
    imputer.fit(train_X)
    imputed_X = imputer.transform(test_X)
    print('>>>BayesianRidge IterativeImputer result: \n')
    print(imputed_X)

    _all_rmse = compute_rmse(testX, imputed_X, testM)
    print('>>>all_rmse', _all_rmse)
    return _all_rmse
def test_iterative_imputer_early_stopping():
    """Early stopping: the imputer halts once tol is reached, rerunning
    with max_iter == n_iter_ reproduces the result, and tol=0 never stops
    early."""
    rng = np.random.RandomState(0)
    n, d = 50, 5
    # Rank-1 data: columns are perfectly correlated, so imputation
    # converges quickly.
    X = np.dot(rng.rand(n, 1), rng.rand(1, d))
    X_missing = X.copy()
    X_missing[rng.rand(n, d) < 0.5] = np.nan

    imputer = IterativeImputer(max_iter=100, tol=1e-3,
                               sample_posterior=False, verbose=1,
                               random_state=rng)
    X_filled_100 = imputer.fit_transform(X_missing)
    # one recorded imputation per feature per completed round
    assert len(imputer.imputation_sequence_) == d * imputer.n_iter_

    imputer = IterativeImputer(max_iter=imputer.n_iter_,
                               sample_posterior=False, verbose=1,
                               random_state=rng)
    X_filled_early = imputer.fit_transform(X_missing)
    assert_allclose(X_filled_100, X_filled_early, atol=1e-7)

    # tol=0 can never be satisfied, so all max_iter rounds must run
    imputer = IterativeImputer(max_iter=100, tol=0,
                               sample_posterior=False, verbose=1,
                               random_state=rng)
    imputer.fit(X_missing)
    assert imputer.n_iter_ == imputer.max_iter
def prepare_data_letter():
    """Load the letter-recognition CSV, encode letters A-Z as 0-25, split
    train/test by config.TEST_RATIO, impute with an imputer fitted on the
    training split only, and return (x_train, y_train, x_test, y_test)
    with the last column as the integer label."""
    data_path = os.path.join("data_raw", "fully_labeled",
                             "letter_recognition", "letter_recognition.csv")
    df = (pd.read_csv(data_path)
            .drop(columns="id")
            .replace(list("ABCDEFGHIJKLMNOPQRSTUVWXYZ"), list(range(26)))
            .astype(float))

    # random row-level split
    test_mask = np.random.rand(len(df)) < config.TEST_RATIO
    train_df, test_df = df[~test_mask], df[test_mask]

    # Impute missing values — fit on train only to avoid test leakage
    imputer = IterativeImputer(max_iter=10, random_state=0)
    imputer.fit(train_df)
    train_np = imputer.transform(train_df)
    test_np = imputer.transform(test_df)

    # last column holds the label
    x_train = train_np[:, :-1].astype(float)
    y_train = train_np[:, -1].astype(int)
    x_test = test_np[:, :-1].astype(float)
    y_test = test_np[:, -1].astype(int)
    return x_train, y_train, x_test, y_test
def main():
    """Build per-patient baseline observations from the raw time series,
    impute them (SOM clustering cannot handle missingness), and write
    BaselineObservations.csv."""
    time_series = pd.read_csv(data_path + "TimeSeries.csv")
    print("original dim", time_series.shape)
    # keep only the first 24 observation hours
    time_series = time_series[time_series.Hour < 25]
    # clean alphabetic noise out of the measurement columns (8th .. 2nd-last)
    time_series.iloc[:, 7:(len(time_series.columns) - 1)] = remove_alpha(
        time_series.iloc[:, 7:(len(time_series.columns) - 1)])
    time_series['PatientID2'] = time_series['PatientID']
    print(time_series.columns)

    # baseline = first observed row per patient
    aggregate_series = time_series.groupby('PatientID2').first()

    # percentage of missing values per column
    missingness = (aggregate_series.isnull().sum() * 100
                   / len(aggregate_series))
    # FIX: reindex() returns a new object; the original discarded it,
    # making the call a no-op. Keep the aligned result.
    missingness = missingness.reindex(aggregate_series.columns)

    # keep columns that are at least 80% populated
    aggregate_series = aggregate_series.loc[:, pd.notnull(
        aggregate_series).sum() > len(aggregate_series) * .80]

    # Impute missing values as SOM clustering does not accommodate
    # missingness
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(aggregate_series.iloc[:, 7:])
    aggregate_series.iloc[:, 7:] = imp.transform(aggregate_series.iloc[:, 7:])

    aggregate_series.to_csv(clustering_path + "BaselineObservations.csv",
                            index=False)
def fill_empty_fields(dataframe):
    """Impute every missing entry of `dataframe`.

    Returns the imputer's output — a numpy array, not a DataFrame, since
    IterativeImputer.transform returns an ndarray.
    """
    imputer = IterativeImputer(max_iter=100, random_state=0)
    return imputer.fit(dataframe).transform(dataframe)
def process_y(data):
    """Derive 1/3/5-year survival labels from fustat/futime, impute the
    censored (unknown) labels, and return (data, label_column_names).

    Per horizon: 0 = survived past it, 1 = event before it, NaN = censored
    before it. The NaNs are imputed (clipped to [0, 1]) and rounded to 0/1.
    The fustat/futime columns are dropped from the returned frame.
    """
    df = data[['fustat', 'futime']]
    years = [1, 3, 5]
    values = []  # [1-year, 3-year, 5-year] per row
    for i in range(df.shape[0]):
        # FIX: positional access — the original's df.loc[i] breaks whenever
        # the index is not a clean 0..n-1 range.
        stat, time = df.iloc[i]
        row = []
        for year in years:
            if time >= year * 365:
                row.append(0)
            elif stat == 1 and time < year * 365:
                row.append(1)
            elif stat == 0 and time < year * 365:
                # FIX: np.nan — the np.NaN alias was removed in NumPy 2.0
                row.append(np.nan)
        values.append(row)

    column_name = ['year_' + str(i) for i in years]
    year_survival_rate = pd.DataFrame(values, index=data.index,
                                      columns=column_name)
    data = data.join(year_survival_rate)

    # impute the censored labels, constrained to [0, 1], then round to 0/1
    imp = IterativeImputer(max_iter=20, random_state=random_seed,
                           min_value=0, max_value=1)
    imp.fit(data)
    arr_values = np.round(imp.transform(data))
    # FIX: give the result frame data's index so this assignment aligns;
    # the default RangeIndex silently produced NaNs whenever data's index
    # was not 0..n-1.
    data[column_name] = pd.DataFrame(arr_values, index=data.index,
                                     columns=data.columns)[column_name]

    del data['fustat']
    del data['futime']
    return data, column_name
def replace_nan_values(self):
    """Impute missing patient values in train/test with IterativeImputer.

    Roughly 30 patients carry NaN entries; rather than dropping those
    rows, each feature with gaps is modelled from the other features
    (see https://scikit-learn.org/stable/modules/impute.html#iterative-imputer).
    The imputer is fitted on the training data only and applied to both
    splits; categorical columns are rounded back to whole numbers since
    the imputer predicts floats. Returns (training_data, testing_data).
    """
    imputer = IterativeImputer(max_iter=50, random_state=0)
    imputer.fit(self.training_data)

    cols = self.training_data.columns  # same layout assumed for both splits
    self.training_data = pd.DataFrame(
        imputer.transform(self.training_data),
        index=self.training_data.index, columns=cols)
    self.testing_data = pd.DataFrame(
        imputer.transform(self.testing_data),
        index=self.testing_data.index, columns=cols)

    # imputation yields floats; restore integer-like categorical codes
    for cat in self.categorical_col:
        self.training_data[cat] = self.training_data[cat].round()
        self.testing_data[cat] = self.testing_data[cat].round()

    return self.training_data, self.testing_data
def test_iterative_imputer_catch_min_max_error(min_value, max_value, err_msg):
    """Invalid scalar / array-like combinations of min_value and max_value
    must make IterativeImputer.fit raise ValueError with err_msg."""
    data = np.random.random((10, 3))
    with pytest.raises(ValueError, match=err_msg):
        IterativeImputer(min_value=min_value, max_value=max_value).fit(data)
class IterativeInterpolate(BaseEstimator, TransformerMixin):
    """Thin sklearn-style wrapper around IterativeImputer.

    The custom estimator passed via `estimater` (name kept as-is for
    backward compatibility) is only honoured when `is_estimate` is True;
    otherwise the imputer's default estimator is used.
    """

    def __init__(self, estimater=None, is_estimate=False,
                 missing_values=np.nan, max_iter=10, random_state=None):
        self.estimater = estimater
        self.is_estimate = is_estimate
        self.missing_values = missing_values
        self.max_iter = max_iter
        self.random_state = random_state

    def fit(self, X, y=None):
        """Build and fit the internal imputer; returns self."""
        common = dict(missing_values=self.missing_values,
                      max_iter=self.max_iter,
                      random_state=self.random_state)
        if self.is_estimate:
            self.imp = IterativeImputer(estimator=self.estimater, **common)
        else:
            self.imp = IterativeImputer(**common)
        if y is None:
            self.imp.fit(X)
        else:
            self.imp.fit(X, y)
        return self

    def transform(self, X):
        """Impute the missing entries of X with the fitted imputer."""
        return self.imp.transform(X)
def get_results_mice_imputation(X_incomplete, y):
    """MICE-style multiple imputation of X (without using y), followed by
    per-imputation linear regression and Rubin's-rules pooling.

    Returns (Qbar, T, mice_errorbar): pooled coefficients, total variance,
    and the 95% normal-approximation error bar.
    """
    # Impute incomplete data m times with posterior sampling; each run's
    # final imputation is kept.
    m = 5
    multiple_imputations = []
    for i in range(m):
        # FIX: released scikit-learn has no `n_iter` parameter on
        # IterativeImputer; the keyword is `max_iter`.
        imputer = IterativeImputer(max_iter=100, sample_posterior=True,
                                   random_state=i)
        imputer.fit(X_incomplete)
        X_imputed = imputer.transform(X_incomplete)
        multiple_imputations.append(X_imputed)

    # Fit a model on each of the m imputed datasets and collect the
    # estimates and their variances.
    m_coefs = []
    m_vars = []
    for i in range(m):
        estimator = LinearRegression()
        estimator.fit(multiple_imputations[i], y)
        y_predict = estimator.predict(multiple_imputations[i])
        m_coefs.append(estimator.coef_)
        m_vars.append(calculate_variance_of_beta_estimates(
            y, y_predict, multiple_imputations[i]))

    # Pool the end estimates by applying Rubin's rules.
    Qbar = calculate_Qbar(m_coefs)
    T = calculate_T(m_coefs, m_vars, Qbar)
    mice_errorbar = 1.96 * np.sqrt(T)  # 95% half-width
    return Qbar, T, mice_errorbar
def impute(df, impute_columns):
    """Fill missing entries of `impute_columns` in `df` (mutated in place)
    and return the imputed column subset."""
    subset = df[impute_columns]
    imputer = IterativeImputer(max_iter=10, random_state=0)
    df[impute_columns] = imputer.fit(subset).transform(subset)
    return df[impute_columns]
def prepare_data_breast():
    """Load the breast_w CSV, drop the leading id column, map '?'→NaN and
    benign/malignant→0/1, split train/test by config.TEST_RATIO, impute
    with an imputer fitted on the training split only, and return
    (x_train, y_train, x_test, y_test) with the last column as label."""
    data_path = os.path.join("data_raw", "fully_labeled",
                             "breast_w", "breast_w.csv")
    df = pd.read_csv(data_path).iloc[:, 1:]
    df = (df.replace("?", np.nan)
            .replace("benign", 0)
            .replace("malignant", 1)
            .astype(float))

    # random row-level split
    test_mask = np.random.rand(len(df)) < config.TEST_RATIO
    train_df, test_df = df[~test_mask], df[test_mask]

    # Impute missing values — fit on train only to avoid test leakage
    imputer = IterativeImputer(max_iter=10, random_state=0)
    imputer.fit(train_df)
    train_np = imputer.transform(train_df)
    test_np = imputer.transform(test_df)

    x_train = train_np[:, :-1].astype(float)
    y_train = train_np[:, -1].astype(int)
    x_test = test_np[:, :-1].astype(float)
    y_test = test_np[:, -1].astype(int)
    return x_train, y_train, x_test, y_test
def iter_impute(data, subject=None, cols=None, rounding=3, max_iter=10):
    """Impute missing values across columns with IterativeImputer.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose first column identifies the subject.
    subject : list, optional
        Subjects to restrict imputation to; all rows when None.
    cols : list-like, optional
        Columns to impute; all but the first column when None.
    rounding : int
        Decimal places the imputed values are rounded to.
    max_iter : int
        Maximum imputation rounds.

    Returns
    -------
    pandas.DataFrame
        `data` with the selected cells replaced by imputed values.

    Raises
    ------
    Exception
        If fewer than two columns are selected (nothing to regress on).
    """
    # if cols is None, use every column except the (subject id) first one
    if cols is None:
        cols = data.columns[1:]

    # if subject is None, use all subjects
    if subject is None:
        inp = data[cols]
    else:
        # FIX: DataFrame.append was removed in pandas 2.0 — collect the
        # per-subject frames and concatenate once instead.
        frames = [get_subject(data, s, data.columns[0]).loc[:, cols]
                  for s in subject]
        inp = pandas.concat(frames) if frames else pandas.DataFrame()

    if len(inp.columns) < 2:
        raise Exception("Multiple variables must be given as input")

    # fit the imputer on the selection
    imputer = IterativeImputer(max_iter=max_iter)
    imputer.fit(inp)

    # impute and round to `rounding` decimals
    res = pandas.DataFrame(
        np.round(imputer.transform(inp), decimals=rounding),
        index=inp.index, columns=inp.columns)
    data.loc[res.index, res.columns] = res
    return data
def fill_missing_value_using_iterative(dataset):
    """Estimate missing feature values by iterative regression on the
    other (non-missing) features.

    The first column is treated as the target and passed through untouched;
    only the remaining feature columns are imputed. Missing-value counts
    are printed before and after imputation.

    :param dataset: pandas DataFrame (target in column 0)
    :return: numpy array with the target re-attached as column 0
    """
    # dataset[[1, 2, 3, 4, 5]] = dataset[[1, 2, 3, 4, 5]].replace(0, nan)
    values = dataset.values
    X = np.asarray(values[:, 1:], dtype=np.float64)
    y = np.expand_dims(values[:, 0], 1)
    # print total missing before
    print('Missing: %d' % sum(isnan(X).flatten()))
    # fit the imputer and transform
    imputer = IterativeImputer()
    imputer.fit(X)
    Xtrans = imputer.transform(X)
    # print total missing after
    print('Missing: %d' % sum(isnan(Xtrans).flatten()))
    print(X.shape)
    return np.hstack((y, Xtrans))
def MultiIterTrees(dataset):
    """Impute the test split with an ExtraTrees-backed IterativeImputer
    fitted on the train split, then report RMSE over the masked entries.

    A mask value of 0 in train_m / test_m marks a missing cell.
    """
    from sklearn.impute import IterativeImputer
    Dim = dataset['d']  # read for parity with siblings (unused here)
    trainX = dataset['train_x']
    testX = dataset['test_x']
    trainM = dataset['train_m']
    testM = dataset['test_m']

    # Blank out masked entries before fitting/transforming.
    train_X = trainX.copy()
    test_X = testX.copy()
    train_X[trainM == 0] = np.nan
    test_X[testM == 0] = np.nan

    # Extra-trees imputation
    imputer = IterativeImputer(
        random_state=0,
        estimator=ExtraTreesRegressor(n_estimators=10, random_state=0))
    imputer.fit(train_X)
    imputed_X = imputer.transform(test_X)
    print('>>>ExtraTreesRegressor IterativeImputer result: \n')
    print(imputed_X)

    _all_rmse = compute_rmse(testX, imputed_X, testM)
    print('>>>all_rmse', _all_rmse)
    return _all_rmse
def main():
    """Train an LSTM per classification outcome on the clustered ICU
    time-series, using grouped stratified 10-fold splits.

    NOTE(review): only the first fold is actually trained (the ``i == 0``
    guard below) — confirm whether full cross-validation was intended.
    """
    configs = json.load(
        open('MachineLearning/Models/LSTM/Configuration.json', 'r'))
    if not os.path.exists(configs['model']['save_dir']):
        os.makedirs(configs['model']['save_dir'])

    time_series = pd.read_csv(clustered_timeseries_path +
                              "TimeSeriesAggregatedClusteredDeltaTwoDays.csv")
    print(time_series.shape)

    # configs['data']['train_test_split'], #the split
    # configs['data']['columns_dynamic'] # the columns

    # Impute and Scale Data
    dynamic_features = configs['data']['dynamic_columns']
    grouping = configs['data']['grouping']  # column naming the patient/group
    imp = IterativeImputer(max_iter=10, random_state=0)
    imp.fit(time_series[dynamic_features])
    time_series[dynamic_features] = imp.transform(
        time_series[dynamic_features])
    time_series = scale(time_series, dynamic_features)

    X = time_series[dynamic_features]
    groups = np.array(time_series[grouping])

    for outcome in configs['data']['classification_outcome']:
        # labels must be integer for stratification
        y = time_series[outcome]
        y = y.astype(int)

        model = Model(configs['model']['name'] + outcome)
        print(grouping)
        print(len(set(time_series[grouping])))
        model.build_model(configs)

        i = 0
        for ffold_ind, (training_ind, testing_ind) in enumerate(
                stratified_group_k_fold(X, y, groups, k=10)):  # CROSS-VALIDATION
            training_groups, testing_groups = groups[training_ind], groups[
                testing_ind]
            this_y_train, this_y_val = y[training_ind], y[testing_ind]
            this_X_train, this_X_val = X.iloc[training_ind], X.iloc[
                testing_ind]
            # a group (patient) must never appear in both splits
            assert len(set(training_groups) & set(testing_groups)) == 0

            print(" X SHAPE: ", this_X_train.shape)
            print(" Y shape: ", this_y_train.shape)

            # NOTE(review): these two locals are unused; the reshape below
            # hard-codes 24 timesteps x 35 features — confirm they agree
            # with the configured feature count.
            input_timesteps = 24
            input_dim = 2

            if i == 0:
                # (NumberOfExamples, TimeSteps, FeaturesPerStep)
                model.train((this_X_train.values).reshape(-1, 24, 35),
                            (this_y_train.values).reshape(-1, 24, 1))
            i = i + 1
def iterative_imputer(X, args={}):
    """Missing-value insertion: estimate each missing attribute from the
    remaining attributes via iterative imputation.

    :param X: data to fit the imputer on
    :param args: keyword arguments forwarded to IterativeImputer
    :return: the fitted IterativeImputer instance
    """
    from sklearn.impute import IterativeImputer
    imputer = IterativeImputer(**args)
    return imputer.fit(X)  # fit() returns the imputer itself
def problem2_3_3(data):
    """Treat zeros in column 3 as missing, impute the table with a
    multivariate (iterative) imputer, and feed the repaired area column
    (index 2 of the imputed matrix) back through problem2_3_1."""
    # FIX: the original wrote through a chained selection
    # (data[3].loc[...] = ...), which pandas warns about and may not write
    # back; a single .loc assignment is the reliable form.
    data.loc[data[3] == 0, 3] = np.nan

    imp = IterativeImputer(missing_values=np.nan)
    imp.fit(data)
    newdata = np.round(imp.transform(data))

    area = newdata[:, 2].tolist()
    print("Use Multivariate:", problem2_3_1(area))
    return "as shown in the plots"
def iterative_inputer_integer(self, df):
    """Impute `df`, round every value to the nearest whole number, and
    return the result cast to int32 (input frame is left untouched)."""
    working = df.copy()
    imputer = IterativeImputer(max_iter=10, random_state=0)
    imputer.fit(working)
    filled = pd.DataFrame(np.round(imputer.transform(working)),
                          columns=working.columns)
    return filled.astype('int32')
def experiment_LinearRegression(df, df_full, score):
    """Impute `df` with a LinearRegression-backed IterativeImputer and
    record the r2 against the complete `df_full` plus the wall-clock time
    into the 'Linear Regression' row of `score` (mutated in place)."""
    t0 = time.time()
    imputer = IterativeImputer(estimator=LinearRegression(),
                               random_state=0, max_iter=10)
    imputer.fit(df)
    filled = pd.DataFrame(imputer.transform(df))
    score.loc['Linear Regression', 'r2_score'] = r2_score(df_full, filled)
    score.loc['Linear Regression', 'time'] = time.time() - t0
def test_iterative_imputer_one_feature(X):
    """With a single feature there is nothing to regress on, so the
    imputer must exit early (n_iter_ == 0) with or without missingness."""
    assert IterativeImputer().fit(X).n_iter_ == 0

    imputer = IterativeImputer()
    imputer.fit([[1], [2]])
    assert imputer.n_iter_ == 0
    imputer.fit([[1], [np.nan]])
    assert imputer.n_iter_ == 0
def use_imputation(df_list, train_x_columns):
    """Fit an IterativeImputer on the feature columns of the first frame
    and apply it to the same columns of every frame in df_list (each frame
    is mutated in place). Returns the list."""
    imputer = IterativeImputer(random_state=0, max_iter=30, verbose=2)
    imputer.fit(df_list[0][train_x_columns])
    for frame in df_list:
        frame[train_x_columns] = imputer.transform(frame[train_x_columns])
    return df_list
def imput_data_with_sklearn_imputer(df_daily):
    """Return a copy of df_daily with gaps filled by a BayesianRidge
    IterativeImputer; a MES (month-of-index) helper column is added as an
    extra predictor before fitting and stays in the returned frame."""
    filled = df_daily.copy()
    filled["MES"] = filled.index.month
    imputer = IterativeImputer(estimator=BayesianRidge(), random_state=1)
    imputer.fit(filled.values)
    filled.loc[:, :] = imputer.transform(filled.values)
    return filled
def fill_chunk(fit_df, transform_df):
    """Impute transform_df using a random-forest IterativeImputer fitted
    on fit_df; returns a DataFrame carrying transform_df's index and
    columns."""
    imputer = IterativeImputer(
        estimator=RandomForestRegressor(n_estimators=10, n_jobs=8),
        max_iter=5, random_state=0)
    imputer.fit(fit_df)
    return pd.DataFrame(data=imputer.transform(transform_df),
                        index=transform_df.index,
                        columns=transform_df.columns)
def main():
    """Load the WiDS 2020 data, drop columns that are mostly null, impute
    numeric columns iteratively and categorical columns with the mode,
    and return the combined frame."""
    df = get_raw_data()
    data_dict = pd.read_csv("data/WiDS Datathon 2020 Dictionary.csv")

    identifier_features = data_dict[data_dict["Category"] == "identifier"][
        "Variable Name"].tolist() + ["icu_id"]
    type__features = [
        "hospital_admit_source",
        "icu_admit_source",
        "icu_stay_type",
        "icu_type",
    ]
    redundant_features = ['readmission_status', 'apache_2_bodysystem']
    # NOTE(review): features_to_drop is assembled but never applied —
    # confirm whether a df.drop(columns=features_to_drop) was intended.
    features_to_drop = identifier_features + type__features + redundant_features

    # keep columns having at least 30% non-null values (i.e. drop columns
    # with more than 70% nulls)
    cut_off_percentage = 0.3
    n_of_nulls = int(cut_off_percentage * df.shape[0])
    df = df.dropna(axis=1, thresh=n_of_nulls)

    numeric_features = data_dict[
        data_dict["Data Type"] == "numeric"]["Variable Name"].tolist() + [
            "bmi", "apache_2_diagnosis", "apache_3j_diagnosis"
        ]
    skewed_numeric_features = df.columns[df.columns.isin(numeric_features)]
    numeric_df = df[skewed_numeric_features]

    imp = IterativeImputer(max_iter=3, verbose=0)
    imp.fit(numeric_df)
    imputed_df = pd.DataFrame(imp.transform(numeric_df),
                              columns=numeric_df.columns)

    categorical_features = data_dict[
        data_dict["Data Type"] != "numeric"]["Variable Name"].tolist()
    # bmi / apache_*_diagnosis are numeric despite the dictionary label
    categorical_features = [
        feature for feature in categorical_features
        if feature not in ["bmi", "apache_2_diagnosis", "apache_3j_diagnosis"]
    ]
    skewed_categorical_features = df.columns[df.columns.isin(
        categorical_features)]
    # FIX: take an explicit copy so the mode-fill below writes to our own
    # frame; the original used inplace fillna on a sliced view, which
    # raises SettingWithCopyWarning and can silently fail to write.
    categorical_df = df[skewed_categorical_features].copy()
    # fill nulls with the most frequent value; mode() returns a Series,
    # [0] takes its first (most common) entry
    for feature in skewed_categorical_features:
        categorical_df[feature] = categorical_df[feature].fillna(
            categorical_df[feature].mode()[0])

    # NOTE(review): imputed_df has a fresh RangeIndex while categorical_df
    # keeps df's index — rows only align if df's index is 0..n-1; confirm.
    complete_df = pd.concat([imputed_df, categorical_df], axis=1)
    return complete_df
def fit(self, X, y=None):
    """Perform co-clustering.

    Runs `n_init` random restarts of the single-fit routine and keeps the
    attributes of the run with the best (largest) criterion.

    Parameters
    ----------
    X : numpy array or scipy sparse matrix, shape=(n_samples, n_features)
        Matrix to be analyzed
    """
    random_state = check_random_state(self.random_state)
    # shape/dtype validation only — the validated array is discarded and
    # the original X is used below
    check_array(X, accept_sparse=True, dtype="numeric", order=None,
                copy=False, force_all_finite=True, ensure_2d=True,
                allow_nd=False, ensure_min_samples=self.n_row_clusters,
                ensure_min_features=self.n_col_clusters,
                warn_on_dtype=False, estimator=None)
    # NOTE(review): `warn_on_dtype` was removed from sklearn's check_array
    # (deprecated 0.21, gone in 0.23) — confirm the pinned version.
    # NOTE(review): `global indices` leaks module-level state and is not
    # thread-safe — confirm nothing else relies on it before refactoring.
    global indices
    indices = np.argwhere(np.isnan(X))
    if (len(indices)):
        # impute NaNs first: the clustering below needs a complete matrix
        imp = IterativeImputer(missing_values=np.nan, sample_posterior=False,
                               max_iter=10, tol=0.001, n_nearest_features=4,
                               initial_strategy='most_frequent')
        imp.fit(X)
        X = imp.transform(X)
    check_positive(X)
    X = X.astype(float)

    # snapshot the current best-so-far attributes before the restarts
    criterion = self.criterion
    criterions = self.criterions
    row_labels_ = self.row_labels_
    column_labels_ = self.column_labels_
    delta_kl_ = self.delta_kl_

    # one random restart per seed
    seeds = random_state.randint(np.iinfo(np.int32).max, size=self.n_init)
    for seed in seeds:
        self._fit_single(X, seed, y)
        if np.isnan(self.criterion):
            raise ValueError("matrix may contain negative or "
                             "unexpected NaN values")
        # remember attributes corresponding to the best criterion
        if (self.criterion > criterion):
            criterion = self.criterion
            criterions = self.criterions
            row_labels_ = self.row_labels_
            column_labels_ = self.column_labels_
            delta_kl_ = self.delta_kl_

    # update attributes with the best run's results
    self.criterion = criterion
    self.criterions = criterions
    self.row_labels_ = row_labels_
    self.column_labels_ = column_labels_
    self.delta_kl_ = delta_kl_

    return self
def train_model_iterative_fill(filename):
    """Fill missing indicator values group-by-group with an IterativeImputer.

    Reads a UTF-16 tab-separated table, optionally drops low-value rows,
    then for each PAGAL_KA_SUGRUPUOTI_SPEJIMUS group fits an imputer on
    that group's columns and writes the filled values back to disk with an
    '_updated' suffix. Returns 0 on completion.

    NOTE(review): `new_filename` is re-suffixed on every loop pass, so the
    '_updated' suffix accumulates across groups — confirm this is intended.
    """
    pd.options.mode.chained_assignment = None
    df = pd.read_csv(filename, encoding = 'utf-16', sep = '\t')
    groups = list(set(df[PAGAL_KA_SUGRUPUOTI_SPEJIMUS].astype(int)))
    estimators = [ExtraTreesRegressor(), BayesianRidge(),
                  KNeighborsRegressor(), DecisionTreeRegressor(),
                  RandomForestRegressor()]
    # the decision-trees regressor works best
    # chosen algorithm
    estimator = estimators[0]
    # whether small values should be discarded first
    new_filename = filename
    atmesti_mazas_reiksmes = True
    if atmesti_mazas_reiksmes:
        df = atmesti_mazas_tui(df)
        new_filename = filename.split('.')
        new_filename = new_filename[0] + '_be_mazu_tui.' + new_filename[1]
        df.to_csv(new_filename, sep = '\t', encoding = 'utf-16', index = False)
    for group in groups:
        print('Pildomas %s rodiklis' % group)
        maindf = pd.read_csv(new_filename, encoding = 'utf-16', sep = '\t')
        # keep only the rows carrying this PAGAL_KA_SUGRUPUOTI_SPEJIMUS value
        df = maindf.loc[maindf[PAGAL_KA_SUGRUPUOTI_SPEJIMUS] == group]
        X = shuffle(df)
        # drop values that cannot be converted to numbers
        for name in ATMESTI:
            X = X.drop(name, axis = 1)
        # drop fully empty columns (impossible to predict correctly
        # without a single example)
        for col in X:
            if X[col].isnull().all():
                X = X.drop(col, axis = 1)
        # only proceed if there is at least one row that could be filled
        if len(X) > 0:
            index = list(X.index)
            columns = list(X.columns.values)
            # create and train the imputation model
            imp = IterativeImputer(estimator = estimator, missing_values = np.nan)
            imp.fit(X)
            # fill the empty X values
            X = imp.transform(X)
            # transform() returns a numpy array, so convert back to a
            # pandas.DataFrame before merging
            X = pd.DataFrame(data = X, index = index, columns = columns)
            maindf.update(X)
        new_filename = new_filename.split('.')[0] + '_updated.' + new_filename.split('.')[1]
        # save the predictions
        maindf.to_csv(new_filename, sep = '\t', encoding = 'utf-16', index = False)
    # tidy up the resulting file
    tidy_up_file(new_filename)
    return 0
def imputation(df):
    """Replace NaNs in `df` with model-based estimates via iterative
    imputation; returns a new DataFrame with the same columns."""
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401
    from sklearn.impute import IterativeImputer
    imputer = IterativeImputer(max_iter=10, random_state=0)
    imputer.fit(df)
    return pd.DataFrame(imputer.transform(df), columns=df.columns)
def internal(self, col_list):
    """Iteratively impute the columns listed under col_list['internal'],
    persist the result to internal.csv, and return the imputed frame."""
    wanted = col_list.get('internal')
    subset = self.data[self.data.columns.intersection(wanted)]
    imputer = IterativeImputer(random_state=0)
    imputer.fit(subset)
    result = pd.DataFrame(imputer.transform(subset),
                          columns=subset.columns, index=subset.index)
    result.to_csv('internal.csv', index=False)
    return result
def test_iterative_imputer_verbose():
    """Smoke test: fit and transform must run cleanly at both verbosity
    levels (1 and 2)."""
    rng = np.random.RandomState(0)
    X = sparse_random_matrix(100, 3, density=0.10, random_state=rng).toarray()
    for verbose in (1, 2):
        imputer = IterativeImputer(missing_values=0, max_iter=1,
                                   verbose=verbose)
        imputer.fit(X)
        imputer.transform(X)
def test_iterative_imputer_no_missing():
    """An entirely-NaN column is excluded from the output, and
    fit().transform() matches fit_transform()."""
    rng = np.random.RandomState(0)
    X = rng.rand(100, 100)
    X[:, 0] = np.nan  # make the first column fully missing
    m1 = IterativeImputer(max_iter=10, random_state=rng)
    m2 = IterativeImputer(max_iter=10, random_state=rng)
    pred1 = m1.fit(X).transform(X)
    pred2 = m2.fit_transform(X)
    # the all-NaN first column should be excluded entirely
    assert_allclose(X[:, 1:], pred1)
    # both code paths must agree
    assert_allclose(pred1, pred2)
def test_iterative_imputer_transform_stochasticity():
    """sample_posterior=True makes repeated transforms differ; with
    deterministic settings the seed must not matter at transform time."""
    pytest.importorskip("scipy", minversion="0.17.0")
    rng1 = np.random.RandomState(0)
    rng2 = np.random.RandomState(1)
    X = sparse_random_matrix(100, 10, density=0.10,
                             random_state=rng1).toarray()

    # posterior sampling: two transforms of the same data should not match
    imputer = IterativeImputer(missing_values=0, max_iter=1,
                               sample_posterior=True, random_state=rng1)
    imputer.fit(X)
    X_fitted_1 = imputer.transform(X)
    X_fitted_2 = imputer.transform(X)
    # comparing the means is sufficient to detect the difference
    assert np.mean(X_fitted_1) != pytest.approx(np.mean(X_fitted_2))

    # Deterministic settings (no posterior sampling, all features, fixed
    # imputation order): transforms must be identical even across
    # differently seeded imputers.
    def make_deterministic(rng):
        return IterativeImputer(missing_values=0, max_iter=1,
                                sample_posterior=False,
                                n_nearest_features=None,
                                imputation_order='ascending',
                                random_state=rng)

    imputer1 = make_deterministic(rng1)
    imputer2 = make_deterministic(rng2)
    imputer1.fit(X)
    imputer2.fit(X)
    X_fitted_1a = imputer1.transform(X)
    X_fitted_1b = imputer1.transform(X)
    X_fitted_2 = imputer2.transform(X)
    assert_allclose(X_fitted_1a, X_fitted_1b)
    assert_allclose(X_fitted_1a, X_fitted_2)