def imputation(df, strategy, cols_to_leave_out=None):
    """
    Method that imputes values to the missing places based on the median, mean, etc. of the data in the column

    Args:
        df: (dataframe), pandas dataframe containing data
        strategy: (str), method of imputation, e.g. median, mean, etc.
        cols_to_leave_out: (list), list of column labels to not include in imputation

    Returns:
        df: (dataframe), dataframe with NaN or missing values resolved via imputation
    """
    if cols_to_leave_out is None:
        df = pd.DataFrame(
            Imputer(missing_values='NaN', strategy=strategy, axis=0).fit_transform(df),
            columns=df.columns)
    else:
        df_include = df.drop(cols_to_leave_out, axis=1)
        df_hold_out = df.drop([c for c in df.columns if c not in cols_to_leave_out], axis=1)
        df_imputed = pd.DataFrame(
            Imputer(missing_values='NaN', strategy=strategy, axis=0).fit_transform(df_include),
            columns=df_include.columns)
        # Join the imputed dataframe with the held-out columns (e.g. columns containing strings)
        df = pd.concat([df_hold_out, df_imputed], axis=1)
    return df
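# Example use (a hedged sketch; the toy frame and column names are assumptions,
# and pandas plus the old-style sklearn.preprocessing.Imputer are assumed imported):
df = pd.DataFrame({'x': [1.0, np.nan, 3.0], 'y': [4.0, 5.0, np.nan], 'label': ['a', 'b', 'c']})
clean = imputation(df, strategy='median', cols_to_leave_out=['label'])
# 'label' is passed through untouched; the NaNs in 'x' and 'y' become column medians.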
def preprocess_data(self):
    # Step 1 - One Hot Encode
    self.get_categorical_columns()
    print('Step 2 - Categorical Column Identification Complete ...')
    self.x_train = pd.get_dummies(self.x_train,
                                  columns=self.categorical_columns,
                                  prefix='one_hot_encoded_')
    self.get_training_columns(self.x_train)
    # Hotfix for XGBoost: '<' is not allowed in XGBoost feature names
    for column in self.traincols:
        if "<" in column:
            self.x_train.rename(index=str,
                                columns={column: column.replace("<", "")},
                                inplace=True)
    self.get_training_columns(self.x_train)
    # Drop the last dummy column to avoid redundancy
    encoded_columns = [i for i in self.traincols if "one_hot_encoded_" in i][:-1]
    not_encoded_columns = [i for i in self.traincols if "one_hot_encoded_" not in i]
    self.x_train = self.x_train[self.union(encoded_columns, not_encoded_columns)]
    self.get_training_columns(self.x_train)
    print('Step 3 - One Hot Encoding Complete ...')

    # Step 2 - Null Value Impute
    imputer = Imputer(strategy='mean', copy=False)
    self.x_train = pd.DataFrame(data=imputer.fit_transform(self.x_train),
                                columns=self.traincols)
    print('Step 4 - Null Value Imputation Complete ...')

    # Step 3 - Feature Scaling
    sc_X = scaler(copy=False)
    self.x_train[not_encoded_columns] = sc_X.fit_transform(self.x_train[not_encoded_columns])
    print('Step 5 - Standardisation Complete ...')

    self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
        self.x_train, self.y_train, test_size=0.2, random_state=1)
    print('Step 6 - Train Test Splitting Complete ...')
    print('Shape:' + str(self.x_train.shape))
    return (self.df, self.x_train, self.y_train, self.x_test, self.y_test,
            self.traincols, self.categorical_columns)
def return_features_labels():
    global data
    global data_copy
    # As we can see from the different move options, we perform label encoding.
    mapping_for_moves = {'x': 1, 'o': 0}
    # For 'b' (blank), we put the mean of the data.
    # Positive is win, negative is lose
    mapping_for_wins = {'positive': 1, 'negative': 0}
    data.is_win = data.is_win.map(mapping_for_wins)
    data_copy.is_win = data_copy.is_win.map(mapping_for_wins)
    data = data.drop(columns=['is_win'])
    for i in data.columns:
        # Applying the map to all the columns except is_win.
        data[i] = data[i].map(mapping_for_moves)
    # Extracting features and labels
    features = data.values
    labels = data_copy.is_win.values
    # Filling missing values aka 'b' with the mean
    features = Imputer().fit_transform(features)
    features = features.astype(np.int)
    labels = labels.astype(np.int)
    return features, labels
def impute_and_scale_array(mat, scaling=None):
    """
    Impute missing values with mean and scale data included in numpy array.

    Parameters
    ----------
    mat : numpy array
        Array to scale
    scaling : string
        String describing type of scaling to apply.
        Options recognized: 'maxabs', 'minmax', 'std'.
        'maxabs' : scales data to range [-1, 1].
        'minmax' : scales data to range [0, 1].
        'std'    : scales data to normal variable with mean 0 and standard deviation 1.
        (Default: None, no scaling).

    Return
    ----------
    Returns the numpy array imputed with the mean value of the
    column and scaled by the method specified. If no scaling method is
    specified, it returns the imputed numpy array.
    """
    # imputer = Imputer(strategy='mean', axis=0, copy=False)
    # imputer = SimpleImputer(strategy='mean', copy=False)
    # Next line is from conditional import. axis=0 is default
    # in old version so it is not necessary.
    imputer = Imputer(strategy='mean', copy=False)
    imputer.fit_transform(mat)
    return scale_array(mat, scaling)
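# `scale_array` is not defined in this snippet. A minimal sketch of what it
# plausibly does, matching the docstring's three options (an assumption, not the
# original; the scalers are assumed imported from sklearn.preprocessing):
def scale_array(mat, scaling=None):
    if scaling == 'maxabs':
        return MaxAbsScaler().fit_transform(mat)    # range [-1, 1]
    if scaling == 'minmax':
        return MinMaxScaler().fit_transform(mat)    # range [0, 1]
    if scaling == 'std':
        return StandardScaler().fit_transform(mat)  # mean 0, std 1
    return mat                                      # None / unrecognized: no scaling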
def get_some_data():
    data = pd.read_csv('../input/melbourne-housing-snapshot/melb_data.csv')
    y = data.Price
    X = data[cols_to_use]
    my_imputer = Imputer()
    imputed_X = my_imputer.fit_transform(X)
    return imputed_X, y
def impute_and_scale(df, scaling='std'):
    """Impute missing values with mean and scale data included in pandas dataframe.

    Parameters
    ----------
    df : pandas dataframe
        dataframe to impute and scale
    scaling : 'maxabs' [-1,1], 'minmax' [0,1], 'std', or None, optional (default 'std')
        type of scaling to apply
    """
    df = df.dropna(axis=1, how='all')
    imputer = Imputer(strategy='mean')
    mat = imputer.fit_transform(df)
    if scaling is None or scaling.lower() == 'none':
        return pd.DataFrame(mat, columns=df.columns)
    if scaling == 'maxabs':
        scaler = MaxAbsScaler()
    elif scaling == 'minmax':
        scaler = MinMaxScaler()
    else:
        scaler = StandardScaler()
    mat = scaler.fit_transform(mat)
    df = pd.DataFrame(mat, columns=df.columns)
    return df
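# Example use (hedged; the toy frame is an assumption, pandas/numpy imported as pd/np):
df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [np.nan, np.nan, np.nan]})
out = impute_and_scale(df, scaling='minmax')
# Column 'b' (all-NaN) is dropped; 'a' is mean-imputed to [1, 2, 3], then scaled to [0, 0.5, 1].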
def test_categories_to_integers_grid_search(self):
    data = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                        "data", "adult_set.txt")
    df = pandas.read_csv(data, sep="\t")
    X = df.drop('income', axis=1)
    y = df['income']
    pipe = make_pipeline(CategoriesToIntegers(), LogisticRegression())
    self.assertRaise(lambda: test_sklearn_grid_search_cv(lambda: pipe, df),
                     ValueError)
    self.assertRaise(
        lambda: test_sklearn_grid_search_cv(
            lambda: pipe, X, y, categoriestointegers__single=[True, False]),
        ValueError, "Unable to find category value")
    pipe = make_pipeline(CategoriesToIntegers(),
                         Imputer(strategy='most_frequent'),
                         LogisticRegression())
    res = test_sklearn_grid_search_cv(
        lambda: pipe, X, y, categoriestointegers__single=[True, False],
        categoriestointegers__skip_errors=[True])
    self.assertIn('model', res)
    self.assertIn('score', res)
    self.assertGreater(res['score'], 0)
    self.assertLesser(res['score'], 1)
def __init__(self, params, max_iter=6, ini_fill=True, ini_strategy_reg='mean',
             tol=1e-3, model_reg="xgboost", model_clf="xgboost"):
    '''
    -max_iter: number of iterations
    -ini_fill: whether to perform a simple initial fill (False only works for xgb and lgb)
    -ini_strategy_reg: simple fill rule for continuous variables, mean or median
    -ini_strategy_clf: simple fill rule for categorical variables, only most_frequent
    -cat_index: indices of the categorical variables (int)
    -tol: threshold
    -model_reg: model used to predict missing values of continuous variables,
                can be xgboost, lightgbm, randomforest, knn
    -model_clf: model used to predict missing values of categorical variables
    '''
    self.params = params  # model parameters
    self.best_params = params  # stores the best model parameters
    self.ini_fill = ini_fill  # whether to do a simple initial fill of the missing part
    self.max_iter = max_iter  # maximum iterations; each iteration updates the missing part
    self.imputer_reg = Imputer(strategy=ini_strategy_reg)  # TODO pre-fill with the mean
    self.tol = tol  # error threshold; training stops once the error falls below it
    self.model_reg = model_reg  # regression model
def impute_array(X_fit, *X_s, missing_values=np.NaN, strategy="mean"):
    """
    :param X_fit: {array-like, sparse matrix} used to fit the imputer. This array is also imputed.
    :param X_s: the additional (optional) arrays that are imputed using the same imputer.
    :param missing_values: the value that will be substituted during the imputation.
    :param strategy: 'mean' (default) -> missing values are imputed with the mean value of the corresponding vector.
        'median' -> missing values are imputed with the median value of the corresponding vector.
        'mode' -> missing values are imputed with the mode of the corresponding vector.
        ('constant', value) -> missing values are imputed with the constant value provided as the second term of the tuple.
        None -> no-op (for internal use).
    :return: a list of imputed arrays, returned in the same order as they were provided.
    """
    if strategy is None:
        return [X_fit, *X_s]
    strategy, fill_value = (strategy if isinstance(strategy, tuple) and strategy[0] == 'constant'
                            else (strategy, None))
    strategy = dict(mode='most_frequent').get(strategy, strategy)
    imputer = Imputer(missing_values=missing_values, strategy=strategy, fill_value=fill_value)
    imputed = _restore_dtypes(imputer.fit_transform(X_fit), X_fit)
    if len(X_s) > 0:
        result = [imputed]
        for X in X_s:
            result.append(_restore_dtypes(imputer.transform(X), X))
        return result
    else:
        return imputed
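# Example use (hedged; the toy arrays are assumptions, and `Imputer` here must be
# aliased to the new-style SimpleImputer, since `fill_value` is passed):
train = np.array([[1.0, np.nan], [3.0, 4.0]])
test = np.array([[np.nan, np.nan]])
train_i, test_i = impute_array(train, test, strategy=('constant', 0.0))
# Both arrays have their NaNs replaced by 0.0, using the one imputer fitted on `train`.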
def data_preprocess(data):
    # your code here
    # example:
    label = LabelEncoder()
    label_count = 0
    for col in data:
        if data[col].dtype == 'object':
            if len(list(data[col].unique())) <= 2:
                # Train on data
                label.fit(data[col])
                # Transform data
                data[col] = label.transform(data[col])
                label_count += 1
    x = pd.get_dummies(data)
    scaler = Normalizer()
    imputer = Imputer(strategy='median')
    imputer.fit(x)
    x = imputer.transform(x)
    scaler.fit(x)
    x = scaler.transform(x)
    # your code end
    return x
def preprocessdata(dataset):
    categorical = pd.get_dummies(data=dataset, columns=['Embarked', 'Sex'])
    names = pd.DataFrame({"Names": dataset['Name']})
    column_titles = ['Mr.', 'Mrs.', 'Miss.', 'Master.', 'Rev.', 'Dr.', 'Col.',
                     'Mme.', 'Major.', 'Ms.', 'Lady.', 'Sir.', 'Mlle.', 'Capt.']
    titles = ['Mr\.', 'Mrs\.', 'Miss\.', 'Master\.', 'Rev\.', 'Dr\.', 'Col\.',
              'Mme\.', 'Major\.', 'Ms\.', 'Lady\.', 'Sir\.', 'Mlle\.', 'Capt\.']
    # Flag each title with a boolean column (the escaped strings are regexes for str.contains)
    for ColName, title in zip(column_titles, titles):
        names[ColName] = names['Names'].str.contains(title)
    names = names.drop(columns=['Names'])
    dataset = pd.concat([categorical, names], axis=1)
    dataset = dataset.drop(columns=['Name', 'Cabin', 'Ticket', 'PassengerId'])
    # Taking care of missing data
    imputer = Imputer(missing_values=np.nan, strategy='mean')
    imputer = imputer.fit(dataset)
    dataset = imputer.transform(dataset)
    # Feature Scaling
    sc_X = StandardScaler()
    dataset = sc_X.fit_transform(dataset)
    return dataset
def load_dataset(data_X, data_y, subsample_data, FIXED_SPLIT=True, TEST_SIZE=0.3,
                 RANDOM_STATE=42, LIMIT_SPLIT=1000000):
    if isinstance(data_X, pd.DataFrame):
        data_X = pd.get_dummies(data_X)
        data_X = data_X.values
    if np.any(np.isnan(data_X)):
        imputer = Imputer(strategy="median")
        imputer.fit(data_X)
        data_X = imputer.transform(data_X)
    if isinstance(data_y, pd.Series):
        data_y = data_y.values
    if not isinstance(data_X, np.ndarray) or not isinstance(data_y, np.ndarray):
        raise TypeError("Incompatible dataset type. Must be pandas or np.ndarray.")
    if subsample_data < 1.0 and FIXED_SPLIT is True:
        data_X, _, data_y, _ = train_test_split(data_X, data_y,
                                                train_size=subsample_data,
                                                random_state=RANDOM_STATE)
    data_X, data_y = check_X_y(data_X, data_y, accept_sparse=False)  # change to True when the sparse grammar is ready
    temp_folder = tempfile.mkdtemp()
    filename = os.path.join(temp_folder, 'autocve_joblib.mmap')
    if os.path.exists(filename):
        os.unlink(filename)
    _ = dump(data_X, filename)
    data_X_memmap = load(filename, mmap_mode='r')
    if FIXED_SPLIT is True:
        split = None
    else:
        TRAIN_SIZE = 1 - TEST_SIZE
        split = StratifiedShuffleSplit(train_size=(TRAIN_SIZE * subsample_data),
                                       test_size=(TEST_SIZE * subsample_data),
                                       random_state=RANDOM_STATE,
                                       n_splits=LIMIT_SPLIT).split(data_X_memmap, data_y)
    return data_X_memmap, data_y, split, filename, temp_folder, data_X.shape[1]
def replacing_missing_numeric(df='dataframe', df_test='dataframe2'):
    # Test missing values
    # missing_value(df_test, df_name='TEST', visualizse=False, head_count=10)

    ## Fill in missing values
    # Strategy = Median, variance is high so better to use Median
    train = df_test
    train_col = train.columns
    # Load the median Imputer fitted on the training data
    with open(r'Imputer_folder/_Imputer.pkl', 'rb') as f:
        imputer = pickle.load(f)
    test = imputer.transform(df_test)
    # Load the MinMaxScaler (feature_range=[0, 1]) fitted on the training data
    with open(r'Scalar/_ScalarImputer.pkl', 'rb') as f:
        scaler = pickle.load(f)
    test = scaler.transform(test)
    # print('Testing data shape: ', test.shape)
    new_df_test = pd.DataFrame(test, columns=train_col)
    new_df_test['SK_ID_CURR'] = df_test['SK_ID_CURR'].values
    # Test missing values
    # missing_value(new_df_test, df_name='TEST', visualizse=False, head_count=5)
    # print(' Observation : \n 1. Now there is no missing value in Train and test')
    return new_df_test
def get_some_data(data):
    y = data.suit
    X = data[cols_to_use]
    my_imputer = Imputer()
    imputed_X = my_imputer.fit_transform(X)
    return imputed_X, y
def get_some_data():
    cols_to_use = ['Distance', 'Landsize', 'BuildingArea']
    data = pd.read_csv('../input/melbourne-housing-snapshot/melb_data.csv')
    y = data.Price
    X = data[cols_to_use]
    my_imputer = Imputer()
    imputed_X = my_imputer.fit_transform(X)
    return imputed_X, y
def _imputation(self, data):
    imp = Imputer(strategy='median')
    attributes = ['Critic_Score', 'User_Score', 'Critic_Count', 'User_Count']
    # Impute each column separately with its own median
    for item in attributes:
        data[item] = imp.fit_transform(data[[item]]).ravel()
    return data
def process_data_pipeline(raw_data: pd.DataFrame,
                          num_feat: 'list of numbers',
                          categ_feat: 'list of strings' = None,
                          categ_feat_vals: 'list of strings' = None,
                          just_transform: bool = False,
                          just_pipeline: bool = False):
    num_pipeline = Pipeline([
        ('feat_sel', FeatureSelector(num_feat, True)),
        ('Grade', FeatureCreator(['OverallCond', 'OverallQual'], lambda x, y: x / y,
                                 as_dataframe=True, feat_name='Grade')),
        ('Age', FeatureCreator(['YrSold', 'YearBuilt'], lambda x, y: x - y,
                               as_dataframe=True, feat_name='Age')),
        ('RemodAge', FeatureCreator(['YrSold', 'YearRemodAdd'], lambda x, y: x - y,
                                    as_dataframe=True, feat_name='RemodAge')),
        ('TotalSF', FeatureCreator(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], lambda x, y: x + y,
                                   as_dataframe=True, feat_name='TotalSF')),
        ('drop_cat_feat', FeatureDropper(['YrSold', 'OverallCond'], as_dataframe=True)),
        ('imputer_mean', Imputer(strategy='mean')),
        ('std_scaler', RobustScaler())
    ])
    if categ_feat is None:
        if just_transform is True:
            return num_pipeline.transform(raw_data)
        return num_pipeline.fit_transform(raw_data)
    categ_cols = ([raw_data[col].unique() for col in categ_feat]
                  if categ_feat_vals is None else categ_feat_vals)
    cat_pipeline = Pipeline([
        ('feat_sel', FeatureSelector(categ_feat, True)),
        ('imputer_most_frequent', CategoricalImputer()),
        ('encode', OneHotEncoder(sparse=False) if categ_cols is None
         else OneHotEncoder(categories=categ_cols, sparse=False)),
    ])
    feat_union = FeatureUnion(transformer_list=[
        ('num_features', num_pipeline),
        ('cat_features', cat_pipeline),
    ])
    if just_pipeline is True:
        return feat_union
    if just_transform is True:
        return feat_union.transform(raw_data)
    return feat_union.fit_transform(raw_data)
def pact_score(Xtrain, Xvalid, Xtest, ytrain, yvalid, ytest, costs, verbose=False):
    columns = Xtrain.columns  # np.array(pd.read_pickle('../data/ed-trauma/'+"Xtrain_raw_cat.pickle").columns)
    Xtrain = pd.DataFrame(data=np.vstack((Xtrain, Xvalid)), columns=columns)
    ytrain = np.hstack((ytrain, yvalid))
    Xtest = pd.DataFrame(data=Xtest, columns=columns)
    ytest = ytest
    all_measurability = load_cost_dict()  # {fname: cost for fname, cost in zip(columns, costs)}

    # PACT Score
    pact_meas = {
        "shock_index": all_measurability['scenefirstpulse'] + all_measurability['scenefirstbloodpressure'],
        "age": all_measurability['age'],
        "not_mvc": all_measurability['causecode'],
        "gcs": sum(all_measurability[f'scenegcs{k}'] for k in ['eye', 'motor', 'verbal']),
        "intub": all_measurability['intub'],
        "cpr": all_measurability['cpr'],
    }
    Xtrain_pact = pd.DataFrame()
    Xtrain_pact["shock_index"] = Xtrain['scenefirstpulse'] / (Xtrain['scenefirstbloodpressure'] + 1)
    Xtrain_pact["age"] = Xtrain['age']
    bike_mv_mc_ped = np.array([2, 14, 15, 18])
    Xtrain_pact["not_mvc"] = ~np.isin(Xtrain['causecode'].values, bike_mv_mc_ped)
    Xtrain_pact["gcs"] = 15 - Xtrain['scenegcs']
    Xtrain_pact["cpr"] = Xtrain['cpr']
    Xtrain_pact["intub"] = Xtrain['intub']
    imp = Imputer()
    ss = StandardScaler()
    Xtrain_pact_imp = imp.fit_transform(Xtrain_pact.values.astype('float'))
    Xtrain_pact_ss = ss.fit_transform(Xtrain_pact_imp)
    Xtest_pact = pd.DataFrame()
    Xtest_pact["shock_index"] = Xtest['scenefirstpulse'] / (Xtest['scenefirstbloodpressure'] + 1)
    Xtest_pact["age"] = Xtest['age']
    Xtest_pact["not_mvc"] = ~np.isin(Xtest['causecode'].values, bike_mv_mc_ped)
    Xtest_pact["gcs"] = 15 - Xtest['scenegcs']
    Xtest_pact["cpr"] = Xtest['cpr']
    Xtest_pact["intub"] = Xtest['intub']
    Xtest_pact_imp = imp.transform(Xtest_pact.values.astype('float'))
    Xtest_pact_ss = ss.transform(Xtest_pact_imp)
    pact_lr = LogisticRegression()
    pact_lr.fit(Xtrain_pact_ss, ytrain)
    if verbose:
        print("PACT ROC", roc_auc_score(ytest, pact_lr.predict_proba(Xtest_pact_ss)[:, 1]))
    pact_cost = np.sum(list(pact_meas.values()))
    if verbose:
        print("PACT Cost", pact_cost)
    costvec = np.array([pact_meas[c] for c in Xtrain_pact.columns])
    exp = LinearExplainer
    model = LogisticRegression()
    DIO = knapsack.IncreasingCostRetainer(model, exp)
    DIO.fit(Xtrain_pact_ss, ytrain, costvec)
    DIO.score_models_proba(Xtest_pact_ss, ytest, roc_auc_score)
    preds = pact_lr.predict_proba(Xtest_pact_ss)[:, 1]
    return (pact_cost, roc_auc_score(ytest, preds), Xtest_pact_ss, pact_lr, DIO,
            Xtest_pact.columns, preds)
def test_imputer(self):
    try:
        model = Imputer(missing_values='NaN', strategy='mean', axis=0)
    except TypeError:
        model = Imputer(missing_values=np.nan, strategy='mean')
        model.axis = 0
    data = [[1, 2], [np.nan, 3], [7, 6]]
    model.fit(data)
    from onnxmltools.convert.coreml.convert import convert
    import coremltools  # noqa
    try:
        model_coreml = coremltools.converters.sklearn.convert(model)
    except ValueError as e:
        if 'not supported' in str(e):
            # Python 2.7 + scikit-learn 0.22
            return
    model_onnx = convert(model_coreml.get_spec())
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(np.array(data, dtype=np.float32), model, model_onnx,
                        basename="CmlImputerMeanFloat32")
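# The try/except above papers over the scikit-learn API change: the old
# sklearn.preprocessing.Imputer (with its 'NaN' string and axis argument) was
# removed in scikit-learn 0.22 in favour of sklearn.impute.SimpleImputer
# (added in 0.20). A common import-time shim for code like the snippets here:
try:
    from sklearn.impute import SimpleImputer as Imputer  # scikit-learn >= 0.20
except ImportError:
    from sklearn.preprocessing import Imputer  # older scikit-learn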
def test_multiindex_df(multiindex_dataframe_incomplete):
    """
    Get a dataframe from a multiindex dataframe with missing data
    """
    df = multiindex_dataframe_incomplete
    mapper = DataFrameMapper([([c], Imputer()) for c in df.columns], df_out=True)
    transformed = mapper.fit_transform(df)
    assert len(transformed) == len(multiindex_dataframe_incomplete)
    for c in df.columns:
        assert len(transformed[str(c)]) == len(df[c])
def impute_data(X, X_test):
    if np.any(np.isnan(X.values)) or np.any(np.isnan(X_test.values)):
        imputer = Imputer(strategy="median")
        imputer.fit(X)
        X = imputer.transform(X)
        X_test = imputer.transform(X_test)
    else:
        X = X.values  # TPOT operators need numpy format to be applied
        X_test = X_test.values
    return X, X_test
def createAuto(target):
    # window size: how many previous values of the target we take
    # (here 12, because the range goes from 1-12 without the 13)
    win = 13
    dataAuto = np.empty((len(target), win - 1))
    for i in range(1, win):
        dataAuto[:, i - 1] = shift2(target, i)
    dataAuto[np.isinf(dataAuto)] = np.nan
    # fill in the missing values with the mean of each column,
    # works on axis=0 (column-wise) by default
    imp = Imputer(missing_values=np.nan, strategy='mean')
    transformedDataAuto = imp.fit_transform(dataAuto)
    X_auto = transformedDataAuto
    return X_auto
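# `shift2` is not defined in this snippet. A plausible helper (an assumption,
# not the original): lag a 1-D series by `lag` steps, padding the head with inf
# so the isinf/Imputer steps above turn those slots into column means.
def shift2(arr, lag):
    out = np.empty(len(arr), dtype=float)
    out[:lag] = np.inf  # padded entries are later set to NaN and imputed
    out[lag:] = arr[:len(arr) - lag]
    return out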
def test_default_transformer():
    """
    If default=Transformer, non explicitly selected columns are applied this transformer.
    """
    df = pd.DataFrame({
        'a': [1, np.nan, 3],
    })
    mapper = DataFrameMapper([], default=Imputer())
    transformed = mapper.fit_transform(df)
    assert (transformed[:, 0] == np.array([1., 2., 3.])).all()
def impute_featureset(fset, strategy='constant', value=None, max_value=1e20,
                      inplace=False):
    """Replace NaN/Inf values with imputed values as defined by `strategy`.
    Output should satisfy `sklearn.validation.assert_all_finite` so that
    training a model will not produce an error.

    Parameters
    ----------
    strategy : str, optional
        The imputation strategy. Defaults to 'constant'.

        - 'constant': replace all missing with `value`
        - 'mean': replace all missing with mean along `axis`
        - 'median': replace all missing with median along `axis`
        - 'most_frequent': replace all missing with mode along `axis`

    value : float or None, optional
        Replacement value to use for `strategy='constant'`. Defaults to
        `None`, in which case a very large negative value is used (a
        good choice for e.g. random forests).

    max_value : float, optional
        Maximum (absolute) value above which values are treated as infinite.
        Used to prevent overflow when fitting `sklearn` models.

    inplace : bool, optional
        If True, fill in place. If False, return a copy.

    Returns
    -------
    pd.DataFrame
        Feature data frame with no missing/infinite values.
    """
    if not inplace:
        fset = fset.copy()
    fset.values[np.isnan(fset.values)] = np.inf  # avoid NaN comparison warnings
    fset.values[np.abs(fset.values) > max_value] = np.nan
    if strategy == 'constant':
        if value is None:
            # If no fill-in value is provided, use a large negative value
            value = -2. * np.nanmax(np.abs(fset.values))
        fset.fillna(value, inplace=True)
    elif strategy in ('mean', 'median', 'most_frequent'):
        imputer = Imputer(strategy=strategy)
        fset.values[:] = imputer.fit_transform(fset.values)
    else:
        raise NotImplementedError("Imputation strategy '{}' not "
                                  "recognized.".format(strategy))
    return fset
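# Example use (hedged; the toy frame is an assumption, pandas/numpy as pd/np).
# The large negative constant acts as an out-of-band "missing" marker that tree
# ensembles can split on directly:
fset = pd.DataFrame({'amp': [1.0, np.nan, 3.0], 'period': [np.inf, 2.0, 4.0]})
by_const = impute_featureset(fset, strategy='constant')  # fills with -2 * max|value|, here -8.0
by_median = impute_featureset(fset, strategy='median')   # fills with each column's median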
def data_cleaning(df):
    # Data cleaning for numbers
    imputer = Imputer(strategy="median")
    dfn = df.drop("ocean_proximity", axis=1)
    imputer.fit(dfn)
    Xn = pd.DataFrame(imputer.transform(dfn), columns=dfn.columns)
    # Data cleaning for text and categorical attributes
    encoder = LabelBinarizer(sparse_output=True)
    Xt = encoder.fit_transform(df["ocean_proximity"])
    return imputer.statistics_, Xn, Xt
def __init__(self, max_iter=10, ini_fill=True, ini_strategy_reg='mean',
             ini_strategy_clf='most_frequent', with_cat=False, cat_index=None,
             tol=1e-3, model_reg="knn", model_clf="knn"):
    '''
    -max_iter: number of iterations
    -ini_fill: whether to perform a simple initial fill (False only works for xgb and lgb)
    -ini_strategy_reg: simple fill rule for continuous variables, mean or median
    -ini_strategy_clf: simple fill rule for categorical variables, only most_frequent
    -cat_index: indices of the categorical variables (int)
    -tol: threshold
    -model_reg: model used to predict missing values of continuous variables,
                can be xgboost, lightgbm, randomforest, knn
    -model_clf: model used to predict missing values of categorical variables
    '''
    self.ini_fill = ini_fill
    self.max_iter = max_iter
    self.imputer_reg = Imputer(strategy=ini_strategy_reg)  # TODO pre-fill with the mean
    self.imputer_clf = Imputer(strategy=ini_strategy_clf)
    self.with_cat = with_cat
    self.cat_index = cat_index
    self.tol = tol
    self.model_reg = model_reg
    self.model_clf = model_clf
    if (not self.ini_fill) and (self.model_reg not in ('lightgbm', 'xgboost')) \
            and (self.model_clf not in ('lightgbm', 'xgboost')):
        raise ValueError("ini_fill = False only works when the model is lightgbm or xgboost")
def preprocess_numerics(dataframe, numerical_columns):
    """
    Preprocess numerical dataframe columns for a Restricted Boltzmann Machine.

    Parameters
    ----------
    dataframe : pd.DataFrame
        A Pandas DataFrame to be used for training an RBM on.
    numerical_columns : list of str
        A list of the column names to be treated as numerical values.

    Returns
    -------
    numerics : np.array
        A numpy array of the numerical columns scaled to [0,1].
    scaler : sklearn.preprocessing.MinMaxScaler
        The scikit-learn scaler used to transform the values.
    """
    # converts to numerical values where possible, replaces with NaN if not
    numerics = pd.DataFrame(dataframe[numerical_columns]._convert(numeric=True))
    # selects only columns with some numerical values
    numerics = numerics.select_dtypes([np.number])
    if not numerics.empty:
        to_impute = np.logical_not(np.isfinite(numerics))
        # avoids that annoying pandas warning
        numerics.is_copy = False
        # replaces infs with nans
        numerics[to_impute] = np.nan
        # replace NaNs with column means to leave min-max scaling unaffected
        array = Imputer().fit_transform(numerics)
        # scale values to the range [0,1]
        scaler = MinMaxScaler().fit(array)
        numerics = scaler.transform(array)
        # put our NaNs back in to be imputed by the RBM
        numerics[to_impute] = np.nan
    else:
        numerics = np.empty((dataframe.shape[0], 0))
        scaler = None
    return numerics, scaler
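# Example use (hedged; the toy frame is an assumption):
df = pd.DataFrame({'x': [1.0, np.inf, 3.0], 'label': ['a', 'b', 'c']})
scaled, scaler = preprocess_numerics(df, numerical_columns=['x'])
# scaled -> [[0.0], [nan], [1.0]]: finite values land in [0, 1], and the
# non-finite slot is handed back as NaN for the downstream RBM to impute.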
def impute_and_scale_array(mat, scaling=None):
    """Impute missing values with mean and scale data included in numpy array.

    Parameters
    ----------
    mat : numpy array
        array to scale
    scaling : 'maxabs', 'minmax', 'std', or None, optional (default None)
        type of scaling to apply
    """
    imputer = Imputer(strategy='mean', axis=0, copy=False)
    imputer.fit_transform(mat)
    # mat = imputer.fit_transform(mat)
    return scale_array(mat, scaling)
def get_rfc_grid(cv, dim_reduction_methods, scoring, random_state=None, n_jobs=1,
                 rfc_n_estimators_l=None):
    pipe = Pipeline([
        ("Fill_NaN", Imputer(strategy="median")),
        ('StdScaler', StandardScaler()),
        ('dim_reduction', SelectKBest(stats.ttest_ind)),
        ('classifier', RandomForestClassifier(random_state=random_state)),
    ])
    param_grid = {'dim_reduction': dim_reduction_methods}
    if rfc_n_estimators_l is not None:
        param_grid['classifier__n_estimators'] = rfc_n_estimators_l
    return GridSearchCV(
        estimator=pipe,
        param_grid=param_grid,
        scoring=scoring,
        cv=cv,
        n_jobs=n_jobs,
    )
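# Example use (hedged; X, y and the candidate reducers are assumptions). Note that
# sklearn grid search can swap whole estimator objects for a named pipeline step:
grid = get_rfc_grid(cv=5,
                    dim_reduction_methods=[SelectKBest(stats.ttest_ind, k=10),
                                           SelectKBest(stats.ttest_ind, k=20)],
                    scoring='roc_auc',
                    rfc_n_estimators_l=[100, 300])
grid.fit(X, y)  # searches over the 'dim_reduction' step and n_estimators jointly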
def makeit(self):
    self.housing_num = self.housing.drop(self.rem_attribs, axis=1)
    self.num_attribs = list(self.housing_num)
    self.num_pipeline = Pipeline([
        ('selector', DataFrameSelector(self.num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', self.attr_adder),
        ('std_scaler', StandardScaler()),
    ])
    self.cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(self.cat_attribs)),
        ('cat_encoder', OneHotEncoder(sparse=False)),
    ])
    self.full_pipeline = FeatureUnion(transformer_list=[
        ('num_pipeline', self.num_pipeline),
        ('cat_pipeline', self.cat_pipeline),
    ])
    self.train_labels = self.strat_train_set['median_house_value'].copy().to_numpy()
    self.strat_train_set = self.strat_train_set.drop('median_house_value', axis=1)
    self.train_features_prepared = self.full_pipeline.fit_transform(self.strat_train_set)
    self.test_labels = self.strat_test_set['median_house_value'].to_numpy()
    self.strat_test_set = self.strat_test_set.drop('median_house_value', axis=1)
    # Transform (not fit) the test set with the pipeline fitted on the training set
    self.test_features_prepared = self.full_pipeline.transform(self.strat_test_set)
    self.cat_encoder = self.cat_pipeline.named_steps['cat_encoder']
    self.cat_onehot_attribs = list(self.cat_encoder.categories_[0])
    self.headings = self.num_attribs + self.attr_adder.extras() + self.cat_onehot_attribs
    print('\n' + '=' * 80)
    # print('\nPipeline: {}'.format(self.attr_adder))
    print('Pipeline training array.shape:\t', self.train_features_prepared.shape)
    print('Pipeline test array.shape:\t\t', self.test_features_prepared.shape)
    return (self.train_labels, self.train_features_prepared,
            self.test_labels, self.test_features_prepared)