def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    X = np.random.randn(10, 2)
    X[::2] = np.nan

    for strategy in ['mean', 'median', 'most_frequent']:
        imputer = Imputer(strategy=strategy)
        X_imputed = imputer.fit_transform(X)
        assert_equal(X_imputed.shape, (10, 2))
        X_imputed = imputer.fit_transform(sparse.csr_matrix(X))
        assert_equal(X_imputed.shape, (10, 2))
def _impute(features, imputer=True):
    """
    Helper function that uses the safest imputing method to remove null
    values, in terms of compatibility with the data size.

    @param features: the feature values that need to be imputed
    @type features: numpy.array
    @param imputer: whether or not the scikit imputing method should be used
    @type imputer: boolean
    @return: the modified feature values
    @rtype: numpy.array
    """
    if not imputer:
        # run the scikit imputer only if enabled (default)
        return np.nan_to_num(features)
    else:
        imp = Imputer(missing_values='NaN', strategy='mean', axis=0, verbose=2)
        try:
            impfeatures = imp.fit_transform(features)
        except ValueError as exc:
            # catch errors with illegal values (e.g. strings)
            log.warning("Exception trying to run scikit imputation: {}".format(exc))
            impfeatures = features
        # show size for debugging purposes
        # log.debug("Featurevectors {} after imputation: {}".format(impfeatures.shape, features))
        # we don't want the shape to change, so if it does, just replace NaNs
        # and infinities with zeros instead
        if impfeatures.shape == features.shape:
            features = impfeatures
        else:
            log.warning("Imputer failed, filtering NaN based on numpy converter")
            features = np.nan_to_num(features)
    return features
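# A minimal usage sketch for the _impute helper above (illustrative, not part
# of the original snippet): it assumes numpy as np, the scikit-learn Imputer
# and a configured `log` object are available, as in the surrounding code.
demo_features = np.array([[1.0, np.nan], [3.0, 4.0]])
imputed = _impute(demo_features)                     # mean imputation via Imputer
zero_filled = _impute(demo_features, imputer=False)  # plain NaN -> 0 fallback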
def setUp(self):
    self.cwd = os.getcwd()
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    self.attribute_types = [
        'numeric' if type(type_) != list else 'nominal'
        for name, type_ in dataset['attributes'][:-1]]
    self.categorical = [True if attribute == 'nominal' else False
                        for attribute in self.attribute_types]

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    ohe = OneHotEncoder(self.categorical)
    X_transformed = ohe.fit_transform(X)
    imp = Imputer(copy=False)
    X_transformed = imp.fit_transform(X_transformed)
    center = not scipy.sparse.isspmatrix(X_transformed)
    standard_scaler = StandardScaler(with_mean=center)
    X_transformed = standard_scaler.fit_transform(X_transformed)
    X_transformed = X_transformed.todense()

    # Transform the array which indicates the categorical metafeatures
    number_numerical = np.sum(~np.array(self.categorical))
    categorical_transformed = \
        [True] * (X_transformed.shape[1] - number_numerical) + \
        [False] * number_numerical
    self.categorical_transformed = categorical_transformed

    self.X = X
    self.X_transformed = X_transformed
    self.y = y
    self.mf = meta_features.metafeatures
    self.helpers = meta_features.helper_functions

    # Precompute some helper functions
    self.helpers.set_value("PCA", self.helpers["PCA"](
        self.X_transformed, self.y))
    self.helpers.set_value("MissingValues", self.helpers["MissingValues"](
        self.X, self.y, self.categorical))
    self.helpers.set_value("NumSymbols", self.helpers["NumSymbols"](
        self.X, self.y, self.categorical))
    self.helpers.set_value("ClassOccurences", self.helpers["ClassOccurences"](
        self.X, self.y))
    self.helpers.set_value("Skewnesses", self.helpers["Skewnesses"](
        self.X_transformed, self.y, self.categorical_transformed))
    self.helpers.set_value("Kurtosisses", self.helpers["Kurtosisses"](
        self.X_transformed, self.y, self.categorical_transformed))
def check_indicator(X, expected_imputed_features, axis):
    n_samples, n_features = X.shape
    imputer = Imputer(missing_values=-1, strategy='mean', axis=axis)
    imputer_with_in = clone(imputer).set_params(add_indicator_features=True)
    Xt = imputer.fit_transform(X)
    Xt_with_in = imputer_with_in.fit_transform(X)
    imputed_features_mask = X[:, expected_imputed_features] == -1
    n_features_new = Xt.shape[1]
    n_imputed_features = len(imputer_with_in.imputed_features_)
    assert_array_equal(imputer.imputed_features_, expected_imputed_features)
    assert_array_equal(imputer_with_in.imputed_features_,
                       expected_imputed_features)
    assert_equal(Xt_with_in.shape,
                 (n_samples, n_features_new + n_imputed_features))
    assert_array_equal(Xt_with_in, np.hstack((Xt, imputed_features_mask)))
    imputer_with_in = clone(imputer).set_params(add_indicator_features=True)
    assert_array_equal(Xt_with_in,
                       imputer_with_in.fit_transform(sparse.csc_matrix(X)).A)
    assert_array_equal(Xt_with_in,
                       imputer_with_in.fit_transform(sparse.csr_matrix(X)).A)
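# Hedged example call for check_indicator above (illustrative only; it assumes
# the development-branch Imputer with the add_indicator_features parameter used
# in that test). Here -1 marks missing entries, so columns 1 and 3 are the
# features expected to be imputed and mirrored as indicator columns.
X_demo = np.array([[1, -1, 2, -1],
                   [3,  4, 5,  6],
                   [7,  8, 9, -1]])
check_indicator(X_demo, np.array([1, 3]), axis=0)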
    X, Y, test_size=validation_size, random_state=seed)
X_train = pd.DataFrame(data=X_train, columns=columns)
X_validation = pd.DataFrame(data=X_validation, columns=columns)

# Handling missing values (NaN, Null):
# create additional new columns based on the columns where data was missing
# (True where a value was missing, False where not, i.e. 1 or 0)
missing_columns = [
    col for col in X_train.columns if X_train[col].isnull().any()
]
for col in missing_columns:
    X_train[col + '_missing_data'] = X_train[col].isnull()
original_data = X_train

# fill missing values with mean values
imputer = Imputer()
X_train = pd.DataFrame(data=imputer.fit_transform(X_train))
X_train.columns = original_data.columns

# make one column indicating where a missing point was, then drop the
# per-column missing_data markers
X_train['missing_values'] = numpy.zeros((len(X_train), 1))
for col in missing_columns:
    X_train['missing_values'] += X_train[col + '_missing_data']
    X_train = X_train.drop([col + '_missing_data'], axis=1)
X_train['Age'] = X_train['Age'].values.round()
X_train = X_train.values

# validation dataset
missing_columns = [
    col for col in X_validation.columns if X_validation[col].isnull().any()
]
for col in missing_columns:
    X_validation[col + '_missing_data'] = X_validation[col].isnull()
def setUp(self):
    self.cwd = os.getcwd()
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    self.attribute_types = [
        'numeric' if type(type_) != list else 'nominal'
        for name, type_ in dataset['attributes'][:-1]]
    self.categorical = [True if attribute == 'nominal' else False
                        for attribute in self.attribute_types]

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    # First, swap NaNs and zeros, because when converting an encoded
    # dense matrix to sparse, the values which are encoded to zero are lost
    X_sparse = X.copy()
    NaNs = ~np.isfinite(X_sparse)
    X_sparse[NaNs] = 0
    X_sparse = sparse.csr_matrix(X_sparse)

    ohe = OneHotEncoder(self.categorical)
    X_transformed = X_sparse.copy()
    X_transformed = ohe.fit_transform(X_transformed)
    imp = Imputer(copy=False)
    X_transformed = imp.fit_transform(X_transformed)
    standard_scaler = StandardScaler()
    X_transformed = standard_scaler.fit_transform(X_transformed)

    # Transform the array which indicates the categorical metafeatures
    number_numerical = np.sum(~np.array(self.categorical))
    categorical_transformed = \
        [True] * (X_transformed.shape[1] - number_numerical) + \
        [False] * number_numerical
    self.categorical_transformed = categorical_transformed

    self.X = X_sparse
    self.X_transformed = X_transformed
    self.y = y
    self.mf = meta_features.metafeatures
    self.helpers = meta_features.helper_functions

    # Precompute some helper functions
    self.helpers.set_value("PCA", self.helpers["PCA"](
        self.X_transformed, self.y))
    self.helpers.set_value("MissingValues", self.helpers["MissingValues"](
        self.X, self.y, self.categorical))
    self.mf.set_value("NumberOfMissingValues",
                      self.mf["NumberOfMissingValues"](
                          self.X, self.y, self.categorical))
    self.helpers.set_value("NumSymbols", self.helpers["NumSymbols"](
        self.X, self.y, self.categorical))
    self.helpers.set_value("ClassOccurences", self.helpers["ClassOccurences"](
        self.X, self.y))
    self.helpers.set_value("Skewnesses", self.helpers["Skewnesses"](
        self.X_transformed, self.y, self.categorical_transformed))
    self.helpers.set_value("Kurtosisses", self.helpers["Kurtosisses"](
        self.X_transformed, self.y, self.categorical_transformed))
test_path = '../input/test.csv'
test_data = pd.read_csv(test_path)
train_data = pd.read_csv(path)
total_data = train_data.append(test_data)

# exploring the data
print((total_data.isnull().sum()))  # finding columns that have null values

# getting rid of Cabin since most of its values are missing (687)
data = total_data.drop('Cabin', axis=1)  # drop Cabin because it is mostly blank

# replacing missing values in Age with the median age
droplist = [
    'PassengerId', 'Name', 'Sex', 'Ticket', 'Embarked', 'Survived', 'Pclass',
    'Parch', 'Fare', 'SibSp'
]
data1 = data.drop(droplist, axis=1)
imputed_age = my_age_imputer.fit_transform(data1)
# the imputer outputs a multi-dimensional array, so we need to convert it
# into a DataFrame before we can use it
age_corrected = pd.DataFrame({'ImputedAge': imputed_age[:, 0]})
data['ImputedAge'] = age_corrected
data.Embarked.fillna('S', inplace=True)  # filling NA with the mode of Embarked
data.Embarked = data.Embarked.replace(['S', 'Q', 'C'], [0, 1, 2])
corr = data.corr()
print(corr.Survived)  # checking data correlation

# In[ ]:

from matplotlib import pyplot as plt

# plotting histograms of Age and ImputedAge to see if the distribution is
# similar - the histogram shows the imputed data in the 25-30 range is
# somewhat higher
plt.hist(data.ImputedAge, range=[0, data.ImputedAge.max()],
# Create the data and the labels corresponding to the data
X = dataset.iloc[:, [0, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14]].values
y = dataset.iloc[:, 15].values

# Preprocess the data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing.imputation import Imputer

labelencode_array = [0, 2, 5, 6, 8, 9]
for i in labelencode_array:
    labelencoder = LabelEncoder()
    X[:, i] = labelencoder.fit_transform(X[:, i].astype(str))

imp = Imputer(missing_values=np.nan, strategy='mean')
X = imp.fit_transform(X)

# one-hot encode
onehotencode_array = [0, 48, 60, 64, 69, 83]
for i in onehotencode_array:
    onehotencoder_make = OneHotEncoder(categorical_features=[i])
    X = onehotencoder_make.fit_transform(X).toarray()
    X = X[:, 1:]

# Split the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
#print("test_features's num: ",test_features.columns.size) #print("feature's name: ",[col for col in test_features.columns # if col not in train_features.columns]) #train_features,test_features = train_features.align(test_features, # join='left', # axis = 1) missing_cols_train = [ col for col in train_features.columns if train_features[col].isnull().any() ] print('missing features:' + str(missing_cols_train)) #print(train_features.LotFrontage) # 缺失值处理 my_imputer = Imputer(strategy='median') train_features = my_imputer.fit_transform(train_features) test_features = my_imputer.transform(test_features) #print(train_features.LotFrontage) #print("features num : "+len(train_features.columns)) ## 训练数据集分割成训练集和测试集,用于测试 X_train, X_test, y_train, y_test = train_test_split(train_features, train_target, train_size=0.8, test_size=0.2, random_state=0) # 训练XGBOOST model = XGBRegressor(max_depth=7, learning_rate=0.1, Missing=None) model.fit(X_train, y_train, verbose=False)
    preds = model.predict(X_test)
    return mean_absolute_error(y_test, preds)


cols_with_missing = [
    col for col in X_train.columns if X_train[col].isnull().any()
]
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_test = X_test.drop(cols_with_missing, axis=1)
print("Mean Absolute Error from dropping columns with Missing Values:")
print(score_dataset(reduced_X_train, reduced_X_test, y_train, y_test))

from sklearn.preprocessing.imputation import Imputer

my_imputer = Imputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
imputed_X_train.columns = numeric_predictors.columns
imputed_X_test = my_imputer.transform(X_test)
print("Mean Absolute Error from Imputation:")
print(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test))

imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()
cols_with_missing = (col for col in X_train.columns
                     if X_train[col].isnull().any())
for col in cols_with_missing:
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
    imputed_X_test_plus[col + '_was_missing'] = imputed_X_test_plus[col].isnull()
            if count % 1000 == 0:
                print(count)
            val = noncat_matrix[x, y]
            if val - math.floor(val) != 0.0:
                for i in range(20):
                    if abs(abs(val) * i - math.ceil(abs(val) * i)) < 0.001:
                        X[x, 2 * y] = math.ceil(abs(val) * i)
                        X[x, 2 * y + 1] = i
    return X


# categorical features
print("building train")
train_cat_matr = train_df.ix[:, 0:CAT_COUNT].as_matrix()
imp = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
train_cat_matr = imp.fit_transform(train_cat_matr)
# imp2 = Imputer(missing_values='NaN', strategy='median')
train_noncat_matr = train_df.ix[:, CAT_COUNT:].fillna(0).as_matrix()
# train_noncat_matr = train_df.ix[:, CAT_COUNT:].as_matrix()
# train_noncat_matr = imp2.fit_transform(train_noncat_matr)
# allf = np.hstack((train_cat_matr, train_noncat_matr))

print("building test")
test_df.ix[:, 0:CAT_COUNT] = test_set_to_encode
test_cat_matr = test_df.ix[:, 0:CAT_COUNT].as_matrix()
test_cat_matr = imp.transform(test_cat_matr)
test_noncat_matr = test_df.ix[:, CAT_COUNT:].fillna(0).as_matrix()
# test_noncat_matr = test_df.ix[:, CAT_COUNT:].as_matrix()
# test_noncat_matr = imp2.transform(test_noncat_matr)
# test_extra_matr = build_extra_features(test_noncat_matr[:, :10])
predictors_without_categoricals = train_predictors.select_dtypes(
    exclude=['object'])
mae_without_categoricals = get_mae(predictors_without_categoricals, target)
mae_one_hot_encoded = get_mae(one_hot_encoded_training_predictors, target)
print('Mean Absolute Error when Dropping Categoricals: ' +
      str(int(mae_without_categoricals)))
print('Mean Absolute Error with One-Hot Encoding: ' +
      str(int(mae_one_hot_encoded)))

one_hot_encoded_training_predictors = pd.get_dummies(train_predictors)
one_hot_encoded_test_predictors = pd.get_dummies(test_predictors)
final_train, final_test = one_hot_encoded_training_predictors.align(
    one_hot_encoded_test_predictors, join='inner', axis=1)
print('Mean Absolute Error for Final train: ' +
      str(int(get_mae(final_train, target))))

from sklearn.preprocessing.imputation import Imputer

my_imputer = Imputer()
imputed_final_test = pd.DataFrame(my_imputer.fit_transform(final_test))
# keep the aligned column names (the inner join may have dropped columns
# that only appear in the raw one-hot-encoded test predictors)
imputed_final_test.columns = final_test.columns

forest_model = RandomForestRegressor(50)
forest_model.fit(final_train, target)
predicted_prices = forest_model.predict(imputed_final_test)
print(predicted_prices)
print()

submission = pd.DataFrame({'Id': test_data.Id, 'SalePrice': predicted_prices})
submission.to_csv('submission2.csv', index=False)
# drop columns with missing values
cols_with_missing = [col for col in X_train.columns
                     if X_train[col].isnull().any()]
reduced_X_train = X_train.drop(cols_with_missing, axis=1)
reduced_X_test = X_test.drop(cols_with_missing, axis=1)
print("Mean Absolute Error from dropping columns with Missing Values:")
print(score_dataset(reduced_X_train, reduced_X_test, y_train, y_test))

# imputer
my_imputer = Imputer()
# fit first, then transform (see the small standalone sketch after this snippet)
# fit: given only X_train, it runs the unsupervised part of the algorithm,
#      e.g. dimensionality reduction, feature extraction, or standardization
# transform: depends on the object; for an Imputer() it performs the
#      imputation, and for a StandardScaler() it performs the scaling
#      (which likewise requires a prior fit)
#print(len(X_train.columns))
imputed_X_train = my_imputer.fit_transform(X_train)
#print(len(imputed_X_train[0, :]))
imputed_X_test = my_imputer.transform(X_test)
print("Mean Absolute Error from Imputation:")
print(score_dataset(imputed_X_train, imputed_X_test, y_train, y_test))

# the imputed data with indicator columns
imputed_X_train_plus = X_train.copy()
imputed_X_test_plus = X_test.copy()
cols_with_missing = (col for col in X_train.columns
                     if X_train[col].isnull().any())
# instead of simply dropping columns with missing values, mark them:
# False where a value is present, True where it is missing
for col in cols_with_missing:
    imputed_X_train_plus[col + '_was_missing'] = imputed_X_train_plus[col].isnull()
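# Standalone sketch of the fit/transform split described in the comments above
# (illustrative only, not from the original notebook): the imputer learns the
# column mean on the training frame and reuses that learned mean on new data.
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer  # removed in scikit-learn >= 0.22; use SimpleImputer there

demo_train = pd.DataFrame({'a': [1.0, np.nan, 3.0]})
demo_test = pd.DataFrame({'a': [np.nan]})
demo_imputer = Imputer()
print(demo_imputer.fit_transform(demo_train))  # NaN -> 2.0, the training mean
print(demo_imputer.transform(demo_test))       # the same learned mean is applied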
def calculate_all_metafeatures(X, y, categorical, dataset_name,
                               calculate=None, dont_calculate=None):
    """Calculate all metafeatures."""
    helper_functions.clear()
    metafeatures.clear()
    mf_ = list()

    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    # TODO make sure this is done as efficiently as possible (no copy for
    # sparse matrices because of wrong sparse format)
    ohe = OneHotEncoder(categorical, sparse=True)
    X_transformed = ohe.fit_transform(X)
    imputer = Imputer(strategy='mean')
    X_transformed = imputer.fit_transform(X_transformed)
    standard_scaler = StandardScaler()
    X_transformed = standard_scaler.fit_transform(X_transformed)

    # TODO add possibility to not transform here
    if scipy.sparse.issparse(X_transformed):
        X_transformed = X_transformed.todense()

    # This is not only important for datasets which are somehow
    # sorted in a strange way, but also prevents lda from failing in
    # some cases.
    # Because this is advanced indexing, a copy of the data is returned!
    X_transformed = check_arrays(X_transformed, sparse_format='dense',
                                 allow_nans=False)[0]
    rs = np.random.RandomState(42)
    indices = np.arange(X_transformed.shape[0])
    rs.shuffle(indices)
    X_transformed = X_transformed[indices]
    y_transformed = y[indices]

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue

        if name in npy_metafeatures:
            X_ = X_transformed
            y_ = y_transformed
        else:
            X_ = X
            y_ = y

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                value = helper_functions[dependency](X_, y_, categorical)
                helper_functions.set_value(dependency, value)
                mf_.append(value)

        value = metafeatures[name](X_, y_)
        metafeatures.set_value(name, value)
        mf_.append(value)
        visited.add(name)

    mf_.sort(key=lambda t: t.name)
    mf_ = DatasetMetafeatures(dataset_name, mf_)
    return mf_