def impute_mean(df, attr):
    """Fill the missing entries of one DataFrame column using the mean strategy.

    The column is overwritten on ``df`` itself and the same DataFrame
    object is handed back.

    :param df: DataFrame containing the column to impute (modified in place).
    :param attr: label of the column to impute.
    :return: ``df`` with ``df[attr]`` replaced by the imputed values.
    """
    # Double brackets select a one-column frame, i.e. 2-D input for the imputer.
    column_frame = df[[attr]]
    mean_imputer = Imputer(missing_values="NaN", strategy="mean")
    mean_imputer.fit(column_frame)
    imputed = mean_imputer.transform(column_frame)
    # Flatten the (n, 1) result before assigning it back to the column.
    df[attr] = imputed.ravel()
    return df
def preprocessData(self, data):
    """Mean-impute NaN entries of ``data`` and standardize the result.

    :param data: array-like feature data that may contain NaN.
    :return: the imputed, scaled data converted to a nested Python list.
    """
    mean_imputer = Imputer(missing_values=np.nan, strategy='mean')
    mean_imputer.fit(data)
    # NaN values take on the mean learned during fit.
    filled = mean_imputer.transform(data)
    return preprocessing.scale(filled).tolist()
def to_predict_instance(self, X, partition_columns):
    """Build one prediction row per combination of preference values.

    For every entry of ``partition_columns`` that is a preference parameter,
    the distinct values found in ``X`` are collected and the cartesian
    product of those value lists is enumerated. Each combination yields one
    row laid out like ``X.columns``:

    * preference columns take their value from the combination;
    * other columns whose name contains ``#`` (encoded columns) are set to 0;
    * all remaining columns are set to NaN and then filled by a mean
      imputer fitted on ``X``.

    :param X: DataFrame providing the column layout, the candidate values
        and the data the imputer is fitted on.
    :param partition_columns: column labels considered as preferences.
    :return: list with one imputed row per combination.
    """
    values_for_preferences = [
        list(X[column].unique())
        for column in partition_columns
        if PreferenceProcessor.is_parameter_in_preferences(
            column, partition_columns)
    ]

    all_combinations = list(itertools.product(*values_for_preferences))
    if not all_combinations:
        # Nothing to build; also avoids fitting an imputer that is never used.
        return []

    # The fit depends only on X, so do it once rather than per combination.
    imputer = Imputer(missing_values=np.nan, strategy='mean', axis=0)
    imputer = imputer.fit(X)

    partition_list = list(partition_columns)
    instances = []
    for combination in all_combinations:
        instance = []
        for column in X.columns:
            if PreferenceProcessor.is_parameter_in_preferences(
                    column, partition_columns):
                # the column is a parameter within the preferences
                instance.append(combination[partition_list.index(column)])
            elif len(column.split("#")) > 1:
                # not in the preferences, and it is an encoded column
                instance.append(0)
            else:
                # not in the preferences and not encoded: left to the imputer
                instance.append(np.nan)
        instances.append(imputer.transform([instance])[0])
    return instances
def _impute(features, imputer=True):
    """
    Helper function that removes null values from the feature values
    without changing their shape.

    @param features: the feature values that need to be imputed
    @type features: numpy.array
    @param imputer: whether or not the scikit imputing method should be used
    @type imputer: boolean
    @return: the modified feature values
    @rtype: numpy.array
    """
    if imputer:
        mean_imputer = Imputer(missing_values='NaN', strategy='mean', axis=0,
                               verbose=2)
        try:
            imputed = mean_imputer.fit_transform(features)
        except ValueError as exc:
            # Raised for illegal values (e.g. strings): keep the input as is.
            log.warning("Exception trying to run scikit imputation: {}".format(exc))
            imputed = features

        # The shape must not change. If the imputer altered it, discard its
        # output and let numpy replace the NaNs and infinities instead.
        if imputed.shape != features.shape:
            log.warning("Imputer failed, filtering NaN based on numpy converter")
            return np.nan_to_num(features)
        return imputed

    # Imputer disabled: rely on numpy's converter only.
    return np.nan_to_num(features)
def setUp(self):
    """Load the ARFF fixture, preprocess it and precompute helper values.

    Stores the raw features (``self.X``), the one-hot encoded, imputed and
    scaled features (``self.X_transformed``), the targets (``self.y``) and
    the categorical masks for both representations, then seeds the
    metafeature helper registry with values computed from them.
    """
    self.cwd = os.getcwd()
    # Work from this module's directory so the relative fixture path resolves.
    os.chdir(os.path.dirname(__file__))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    feature_attributes = dataset['attributes'][:-1]
    self.attribute_types = [
        'nominal' if type(type_) == list else 'numeric'
        for _, type_ in feature_attributes]
    self.categorical = [kind == 'nominal' for kind in self.attribute_types]

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    X_transformed = OneHotEncoder(self.categorical).fit_transform(X)
    X_transformed = Imputer(copy=False).fit_transform(X_transformed)
    # Centering is only requested when the imputed matrix is not sparse.
    center = not scipy.sparse.isspmatrix(X_transformed)
    X_transformed = StandardScaler(with_mean=center).fit_transform(
        X_transformed)
    X_transformed = X_transformed.todense()

    # Transform the array which indicates the categorical metafeatures:
    # the leading entries are flagged categorical, the trailing
    # ``number_numerical`` entries are flagged numerical.
    number_numerical = np.sum(~np.array(self.categorical))
    self.categorical_transformed = (
        [True] * (X_transformed.shape[1] - number_numerical)
        + [False] * number_numerical)

    self.X = X
    self.X_transformed = X_transformed
    self.y = y
    self.mf = meta_features.metafeatures
    self.helpers = meta_features.helper_functions

    # Precompute some helper functions
    precomputed = [
        ("PCA", (self.X_transformed, self.y)),
        ("MissingValues", (self.X, self.y, self.categorical)),
        ("NumSymbols", (self.X, self.y, self.categorical)),
        ("ClassOccurences", (self.X, self.y)),
        ("Skewnesses",
         (self.X_transformed, self.y, self.categorical_transformed)),
        ("Kurtosisses",
         (self.X_transformed, self.y, self.categorical_transformed)),
    ]
    for helper_name, helper_args in precomputed:
        self.helpers.set_value(helper_name,
                               self.helpers[helper_name](*helper_args))
def clean(df, strategy='median'):
    '''Cleans DataFrame.

    Only the ``object`` and ``float64`` columns of ``df`` are kept. Missing
    values in the float columns are imputed with ``strategy``; object
    columns are passed through untouched.

    :param df: input DataFrame.
    :param strategy: imputation strategy handed to ``Imputer``.
    :return: new DataFrame with the object columns followed by the imputed
        float columns, carrying the row index of ``df``.
    '''
    object_df = df.select_dtypes(include=['object'])
    float_df = df.select_dtypes(include=['float64'])

    imputer = Imputer(strategy=strategy)
    imputer.fit(float_df)
    # Keep the original row labels: concat(axis=1) aligns on the index, so a
    # fresh 0..n-1 index would misalign rows whenever df is not indexed that
    # way (e.g. after filtering or shuffling).
    float_df = pd.DataFrame(imputer.transform(float_df),
                            columns=float_df.columns,
                            index=float_df.index)
    return pd.concat([object_df, float_df], axis=1)
def test_deprecated_imputer_axis():
    # Fitting with an explicit ``axis`` is expected to emit the deprecation
    # warning for both axis values.
    depr_message = ("Parameter 'axis' has been deprecated in 0.20 and will "
                    "be removed in 0.22. Future (and default) behavior is "
                    "equivalent to 'axis=0' (impute along columns). Row-wise "
                    "imputation can be performed with FunctionTransformer.")

    X = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    for axis in (0, 1):
        imputer = Imputer(missing_values=0, axis=axis)
        assert_warns_message(DeprecationWarning, depr_message,
                             imputer.fit, X)
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies.
    expected_shape = (10, 2)
    X = np.random.randn(*expected_shape)
    X[::2] = np.nan  # every other row is entirely missing

    for strategy in ['mean', 'median', 'most_frequent']:
        imputer = Imputer(strategy=strategy)

        dense_result = imputer.fit_transform(X)
        assert_equal(dense_result.shape, expected_shape)

        sparse_result = imputer.fit_transform(sparse.csr_matrix(X))
        assert_equal(sparse_result.shape, expected_shape)
def test_imputation_shape():
    # Verify the shapes of the imputed matrix for different strategies,
    # for dense input first and CSR input second.
    n_rows, n_cols = 10, 2
    X = np.random.randn(n_rows, n_cols)
    X[::2] = np.nan  # every other row is entirely missing

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = Imputer(strategy=strategy)
        for wrap in (lambda dense: dense, sparse.csr_matrix):
            X_imputed = imputer.fit_transform(wrap(X))
            assert_equal(X_imputed.shape, (n_rows, n_cols))
def feature_inf(my_feature, dim_feature):
    """Replace entries equal to ``np.inf`` using an imputer fitted on zeros.

    The imputer treats ``np.inf`` as the missing marker and learns its
    per-column means from an all-zero ``(2, n_columns)`` array, so the value
    it substitutes is the mean of zeros.

    :param my_feature: 2-D feature array (``shape[1]`` is read).
    :param dim_feature: ignored; it is immediately overwritten with
        ``my_feature.shape[1]`` and only kept for interface compatibility.
    :return: the transformed feature array.
    """
    from sklearn.preprocessing.imputation import Imputer

    dim_feature = my_feature.shape[1]
    zeros = np.asarray([0] * 2 * dim_feature).reshape(2, dim_feature)

    inf_imputer = Imputer(missing_values=np.inf, strategy='mean')
    inf_imputer.fit(zeros)
    # preprocessing step meant to get rid of infinite values
    return inf_imputer.transform(my_feature)
def preprocessData(self, data):
    '''
    Fill in missing values and scale the data (the scaling is needed for
    the SVM to function well).

    :param data: All of the original data.
    :return: Data that has been processed, as a nested list.
    '''
    nan_filler = Imputer(missing_values=np.nan, strategy='mean')
    nan_filler.fit(data)
    # After this step every nan has been replaced by the mean.
    without_nans = nan_filler.transform(data)
    standardized = preprocessing.scale(without_nans)
    return standardized.tolist()
def test_initialize_model_from_run(self):
    """Models rebuilt from a published run or its setup match the local one.

    A pipeline is run on task 11 and published; the model is then
    re-instantiated both from the run id and from the setup id, and the
    flows derived from those models are compared with the flow of the
    local pipeline.
    """
    clf = sklearn.pipeline.Pipeline(steps=[
        ('Imputer', Imputer(strategy='median')),
        ('VarianceThreshold', VarianceThreshold(threshold=0.05)),
        ('Estimator', GaussianNB())])
    task = openml.tasks.get_task(11)
    run = openml.runs.run_model_on_task(task, clf,
                                        avoid_duplicate_runs=False)
    run_ = run.publish()
    # Re-download the run so the ids below come from the server.
    run = openml.runs.get_run(run_.run_id)

    modelR = openml.runs.initialize_model_from_run(run.run_id)
    modelS = openml.setups.initialize_model(run.setup_id)

    flowR = openml.flows.sklearn_to_flow(modelR)
    flowS = openml.flows.sklearn_to_flow(modelS)
    flowL = openml.flows.sklearn_to_flow(clf)
    openml.flows.assert_flows_equal(flowR, flowL)
    openml.flows.assert_flows_equal(flowS, flowL)

    # assertEqual instead of the deprecated assertEquals alias, which no
    # longer exists on Python 3.12+.
    self.assertEqual(flowS.components['Imputer'].parameters['strategy'],
                     '"median"')
    self.assertEqual(
        flowS.components['VarianceThreshold'].parameters['threshold'],
        '0.05')
def test_imputation_pickle():
    """Check that a pickled and restored imputer transforms data identically."""
    import pickle

    size = 100
    X = sparse_random_matrix(size, size, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = Imputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        restored = pickle.loads(pickle.dumps(imputer))

        expected = imputer.transform(X.copy())
        actual = restored.transform(X.copy())
        assert_array_equal(expected, actual,
                           "Fail to transform the data after pickling "
                           "(strategy = %s)" % (strategy))
def test_imputation_pickle():
    # Test for pickling imputers: a pickle round trip must not change the
    # output of transform().
    import pickle

    n = 100
    X = sparse_random_matrix(n, n, density=0.10)

    def round_trip(estimator):
        # Serialize and immediately deserialize the fitted estimator.
        return pickle.loads(pickle.dumps(estimator))

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = Imputer(missing_values=0, strategy=strategy)
        imputer.fit(X)
        imputer_pickled = round_trip(imputer)

        assert_array_equal(
            imputer.transform(X.copy()),
            imputer_pickled.transform(X.copy()),
            "Fail to transform the data after pickling "
            "(strategy = %s)" % (strategy))
def get_pipeline(self, classifier):
    """Assemble a preprocessing + classifier pipeline for ``self.X``.

    One sub-pipeline is added to a ``FeatureUnion`` for each kind of dtype
    found in ``self.X.dtypes``:

    * float columns present: select numeric columns, median-impute, standard-scale;
    * ``category`` columns: select them, impute the most frequent value,
      one-hot encode;
    * ``bool`` columns: select them, impute the most frequent value.

    :param classifier: final estimator appended after the feature union.
    :return: the assembled ``Pipeline``.
    """
    # NOTE: Imputer is used below; SimpleImputer with the same strategies was
    # left commented out in the original as the alternative.
    column_dtypes = self.X.dtypes.values
    branches = []

    if float in column_dtypes:
        numeric_branch = make_pipeline(
            TypeSelector(np.number),
            Imputer(strategy="median"),
            StandardScaler())
        branches.append(("numeric_features", numeric_branch))

    if "category" in column_dtypes:
        categorical_branch = make_pipeline(
            TypeSelector("category"),
            Imputer(strategy="most_frequent"),
            OneHotEncoder())
        branches.append(("categorical_features", categorical_branch))

    if 'bool' in column_dtypes:
        boolean_branch = make_pipeline(
            TypeSelector("bool"),
            Imputer(strategy="most_frequent"))
        branches.append(("boolean_features", boolean_branch))

    steps = [
        ('colselector', ColumnSelector(columns=self.X.columns)),
        ('featureunion', FeatureUnion(transformer_list=branches)),
        ('classifier', classifier),
    ]
    return Pipeline(steps=steps)
def check_indicator(X, expected_imputed_features, axis):
    """Check the ``add_indicator_features`` option of the imputer.

    ``-1`` is the missing marker. A plain imputer and one with indicator
    features enabled are fitted on ``X``; the test asserts which features
    were reported as imputed, that the indicator variant outputs the plain
    result with the missing-value mask appended, and that CSC/CSR input
    gives the same output as dense input.

    :param X: dense 2-D input array.
    :param expected_imputed_features: indices expected in ``imputed_features_``.
    :param axis: imputation axis passed to ``Imputer``.
    """
    n_samples, _ = X.shape

    imputer = Imputer(missing_values=-1, strategy='mean', axis=axis)
    imputer_with_in = clone(imputer).set_params(add_indicator_features=True)

    Xt = imputer.fit_transform(X)
    Xt_with_in = imputer_with_in.fit_transform(X)
    imputed_features_mask = X[:, expected_imputed_features] == -1

    for fitted in (imputer, imputer_with_in):
        assert_array_equal(fitted.imputed_features_,
                           expected_imputed_features)

    # One extra indicator column is expected per imputed feature.
    expected_width = Xt.shape[1] + len(imputer_with_in.imputed_features_)
    assert_equal(Xt_with_in.shape, (n_samples, expected_width))
    assert_array_equal(Xt_with_in, np.hstack((Xt, imputed_features_mask)))

    # Fresh indicator imputer for the sparse inputs.
    imputer_with_in = clone(imputer).set_params(add_indicator_features=True)
    for to_sparse in (sparse.csc_matrix, sparse.csr_matrix):
        assert_array_equal(
            Xt_with_in,
            imputer_with_in.fit_transform(to_sparse(X)).A)
def test_mice_missing_at_transform():
    # A feature without missing values at fit time must be imputed by the
    # initial imputer alone when it has missing values at transform time.
    shape = (100, 10)
    Xtr = np.random.randint(low=0, high=3, size=shape)
    Xts = np.random.randint(low=0, high=3, size=shape)

    Xtr[:, 0] = 1  # definitely no missing values in 0th column
    Xts[0, 0] = 0  # definitely missing value in 0th column

    for strategy in ["mean", "median", "most_frequent"]:
        mice = MICEImputer(missing_values=0,
                           n_imputations=1,
                           n_burn_in=1,
                           initial_strategy=strategy)
        mice = mice.fit(Xtr)
        initial_imputer = Imputer(missing_values=0, strategy=strategy)
        initial_imputer = initial_imputer.fit(Xtr)

        # if there were no missing values at time of fit, then mice will
        # only use the initial imputer for that feature at transform
        mice_column = mice.transform(Xts)[:, 0]
        initial_column = initial_imputer.transform(Xts)[:, 0]
        assert np.all(mice_column == initial_column)
def test_local_run_metric_score(self):
    """Run an imputer + random forest pipeline on task 7 and check the
    locally computed evaluations."""
    # construct scikit-learn classifier
    steps = [
        ('imputer', Imputer(strategy='median')),
        ('estimator', RandomForestClassifier()),
    ]
    clf = Pipeline(steps=steps)

    # download task
    task = openml.tasks.get_task(7)

    # invoke OpenML run
    run = openml.runs.run_model_on_task(task, clf)

    self._test_local_evaluations(run)
def test_run_and_upload_decision_tree_pipeline(self):
    """Run and upload a pipeline whose estimator is a randomized search
    over a decision tree."""
    # Powers of two: 2..128 for min_samples_split, 1..64 for min_samples_leaf.
    param_distributions = {
        'min_samples_split': [2**x for x in range(1, 7 + 1)],
        'min_samples_leaf': [2**x for x in range(0, 6 + 1)]
    }
    imputer = Imputer(strategy='median')
    variance_threshold = VarianceThreshold()
    search = RandomizedSearchCV(DecisionTreeClassifier(),
                                param_distributions,
                                cv=3, n_iter=10)
    pipeline2 = Pipeline(steps=[('Imputer', imputer),
                                ('VarianceThreshold', variance_threshold),
                                ('Estimator', search)])
    self._run_and_upload(pipeline2, '62501')
def test__run_exists(self):
    """A run of a known setup on task 115 must be found by ``_run_exists``."""
    # would be better to not sentinel these clfs,
    # so we do not have to perform the actual runs
    # and can just check their status on line
    configurations = [('mean', 0.05), ('most_frequent', 0.1)]
    clfs = [
        sklearn.pipeline.Pipeline(steps=[
            ('Imputer', Imputer(strategy=strategy)),
            ('VarianceThreshold', VarianceThreshold(threshold=threshold)),
            ('Estimator', DecisionTreeClassifier(max_depth=4))])
        for strategy, threshold in configurations
    ]

    task = openml.tasks.get_task(115)

    for clf in clfs:
        try:
            # first populate the server with this run.
            # skip run if it was already performed.
            openml.runs.run_model_on_task(
                task, clf, avoid_duplicate_runs=True).publish()
        except openml.exceptions.PyOpenMLError:
            # run already existed. Great.
            pass

        flow = openml.flows.sklearn_to_flow(clf)
        flow_exists = openml.flows.flow_exists(flow.name,
                                               flow.external_version)
        self.assertGreater(flow_exists, 0)

        downloaded_flow = openml.flows.get_flow(flow_exists)
        setup_exists = openml.setups.setup_exists(downloaded_flow, clf)
        self.assertGreater(setup_exists, 0)

        run_ids = _run_exists(task.task_id, setup_exists)
        self.assertTrue(run_ids, msg=(run_ids, clf))
def test_imputation_pipeline_grid_search():
    # Test imputation within a pipeline + gridsearch: the search over the
    # imputer's strategy and axis only has to run without raising.
    n_samples = 100
    X = sparse_random_matrix(n_samples, n_samples, density=0.10)
    Y = sparse_random_matrix(n_samples, 1, density=0.10).toarray()

    steps = [
        ('imputer', Imputer(missing_values=0)),
        ('tree', tree.DecisionTreeRegressor(random_state=0)),
    ]
    param_grid = {
        'imputer__strategy': ["mean", "median", "most_frequent"],
        'imputer__axis': [0, 1]
    }

    search = GridSearchCV(Pipeline(steps), param_grid)
    search.fit(X, Y)
def test_imputation_copy():
    """Test imputation with copy=True."""
    size = 5

    def impute(data, params):
        # Fit a mean imputer (0 marks missing) on ``data`` and transform it.
        imputer = Imputer(missing_values=0, strategy="mean", **params)
        return imputer.fit(data).transform(data)

    # Test default behaviour and with copy=True
    for params in [{}, {'copy': True}]:
        X = sparse_random_matrix(size, size, density=0.75, random_state=0)

        # Sparse input
        Xt = impute(X, params)
        Xt[0, 0] = np.nan
        # Check that the objects are different and that they don't use
        # the same buffer
        assert_false(np.all(X.todense() == Xt))

        # Dense input
        X = X.todense()
        Xt = impute(X, params)
        Xt[0, 0] = np.nan
        # Check that the objects are different and that they don't use
        # the same buffer
        assert_false(np.all(X == Xt))
def test_run_on_dataset_with_missing_labels(self):
    # Check that _run_task_get_arffcontent works when one of the class
    # labels is only declared in the arff file, but is not present in the
    # actual data
    task = openml.tasks.get_task(2)
    class_labels = task.class_labels

    steps = [
        ('Imputer', Imputer(strategy='median')),
        ('Estimator', DecisionTreeClassifier()),
    ]
    model = Pipeline(steps=steps)

    arff_rows, _, _, _, _ = _run_task_get_arffcontent(model, task)

    # 2 folds, 5 repeats; keep in mind that this task comes from the test
    # server, the task on the live server is different
    self.assertEqual(len(arff_rows), 4490)

    # repeat, fold, row_id, 6 confidences, prediction and correct label
    expected_row_length = 12
    for row in arff_rows:
        self.assertEqual(len(row), expected_row_length)
def test_learning_curve_task(self):
    """Run two pipelines on a learning-curve task and check the per-sample
    evaluations of each run."""
    task_id = 801  # diabetes dataset
    num_test_instances = 6144  # for learning curve
    num_repeats = 1
    num_folds = 10
    num_samples = 8

    # NOTE: a plain GaussianNB() with fixture '62501' was also listed in
    # the original, but commented out.
    scaled_dummy = Pipeline(steps=[
        ('scaler', StandardScaler(with_mean=False)),
        ('dummy', DummyClassifier(strategy='prior'))])

    tree_search = Pipeline(steps=[
        ('Imputer', Imputer(strategy='median')),
        ('VarianceThreshold', VarianceThreshold()),
        ('Estimator', RandomizedSearchCV(
            DecisionTreeClassifier(),
            {'min_samples_split': [2**x for x in range(1, 7 + 1)],
             'min_samples_leaf': [2**x for x in range(0, 6 + 1)]},
            cv=3, n_iter=10))])

    # (classifier, random state fixture) pairs
    cases = [(scaled_dummy, '62501'), (tree_search, '62501')]

    for clf, rsv in cases:
        run = self._perform_run(task_id, num_test_instances, clf,
                                random_state_value=rsv)

        # todo: check if runtime is present
        self._check_sample_evaluations(run.sample_evaluations, num_repeats,
                                       num_folds, num_samples)
def test__prediction_to_row(self):
    """Check the layout of the rows produced by ``_prediction_to_row``.

    Each row is expected to hold repeat, fold, sample and row index, then
    one confidence per class label (floats in [0, 1] summing to 1), then
    two entries that are class labels.
    """
    repeat_nr = 0
    fold_nr = 0
    sample_nr = 0  # default for this task

    clf = sklearn.pipeline.Pipeline(steps=[
        ('Imputer', Imputer(strategy='mean')),
        ('VarianceThreshold', VarianceThreshold(threshold=0.05)),
        ('Estimator', GaussianNB())])

    task = openml.tasks.get_task(20)
    train, test = task.get_train_test_split_indices(repeat_nr, fold_nr)
    X, y = task.get_X_and_y()
    clf.fit(X[train], y[train])

    test_X = X[test]
    test_y = y[test]
    probaY = clf.predict_proba(test_X)
    predY = clf.predict(test_X)

    for idx in range(len(test_X)):
        arff_line = _prediction_to_row(
            repeat_nr, fold_nr, sample_nr, idx,
            task.class_labels[test_y[idx]],
            predY[idx], probaY[idx],
            task.class_labels, clf.classes_)

        self.assertIsInstance(arff_line, list)
        self.assertEqual(len(arff_line), 6 + len(task.class_labels))

        # The first four entries identify the prediction.
        header = (repeat_nr, fold_nr, sample_nr, idx)
        for position, expected in enumerate(header):
            self.assertEqual(arff_line[position], expected)

        # The confidences follow, one per class label.
        confidence_total = 0.0
        for att_idx in range(4, 4 + len(task.class_labels)):
            confidence = arff_line[att_idx]
            self.assertIsInstance(confidence, float)
            self.assertGreaterEqual(confidence, 0.0)
            self.assertLessEqual(confidence, 1.0)
            confidence_total += confidence
        self.assertAlmostEqual(confidence_total, 1.0)

        self.assertIn(arff_line[-1], task.class_labels)
        self.assertIn(arff_line[-2], task.class_labels)
def transform_data(self, housing_data):
    """Build the numeric, categorical and combined preprocessing pipelines.

    The target column ``median_house_value`` is dropped, the remaining
    columns are split into numeric and object-typed attributes, and three
    pipelines are stored on the instance:

    * ``self.num_pipeline``: select numeric attributes, median-impute,
      add combined attributes, standard-scale;
    * ``self.cat_pipeline``: select object attributes, one-hot encode
      (dense output);
    * ``self.full_pipeline``: feature union of the two.

    Nothing is fitted here and nothing is returned.

    :param housing_data: DataFrame containing ``median_house_value``.
    """
    data = housing_data.drop('median_house_value', axis=1)

    self.housing_num = data.select_dtypes(include=[np.number])
    self.num_attribs = list(self.housing_num)
    # Builtin ``object`` instead of ``np.object``: the alias was deprecated
    # in NumPy 1.20 and removed in 1.24; both denote the same dtype.
    self.cat_attribs = list(data.select_dtypes(include=[object]))

    self.num_pipeline = Pipeline([
        ('selector', DataFrameSelector(self.num_attribs)),
        ('imputer', Imputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_caller', StandardScaler()),
    ])

    self.cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(self.cat_attribs)),
        ('cat_encoder', OneHotEncoder(sparse=False)),
    ])

    self.full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", self.num_pipeline),
        ("cat_pipeline", self.cat_pipeline),
    ])
def modelo_4v():
    """Score the rows of a sample CSV with the 4-variable model.

    Reads the request argument ``datacsv``, loads
    ``../samples/<datacsv>.csv`` (tab separated), uses the first column as
    row identifier and columns 1-4 as features, mean-imputes and
    standard-scales the features, and predicts with the loaded model.

    :return: one line of text per row stating whether the prediction is
        above 0.5 ("Genera Valor") or not.
    :raises ValueError: if ``datacsv`` is missing or contains a path
        component.
    """
    import os

    print(request.args)
    loaded_model, graph = cargarModelo_4v()

    datatest_name = request.args.get("datacsv")
    # The name comes straight from the request and ends up in a file path:
    # refuse anything that is not a bare file name (e.g. "../../secret").
    if datatest_name is None or os.path.basename(datatest_name) != datatest_name:
        raise ValueError("invalid 'datacsv' parameter: %r" % (datatest_name,))
    data_path = '../samples/' + datatest_name + '.csv'
    dataset = pd.read_csv(data_path, delimiter='\t')

    X_ID = dataset.iloc[:, 0].values
    X_testing = dataset.iloc[:, 1:5].values

    # data imputation (null values)
    imp = Imputer()
    imp.fit(X_testing)
    X_test = imp.transform(X_testing)

    # NOTE(review): the scaler is fitted on the data being scored, not on
    # the training data - confirm this is intended.
    sc = StandardScaler()
    X_test = sc.fit_transform(X_test, )

    # prediction
    with graph.as_default():
        y_pred = loaded_model.predict(X_test)

    result_lines = []
    for row_id, score in zip(X_ID, y_pred):
        if score > 0.5:
            print(row_id, ' --> Genera Valor!')
            resultado = str(row_id) + ' --> Genera Valor!! '
        else:
            print(row_id, ' --> No genera Valor ')
            resultado = str(row_id) + ' --> No genera Valor '
        result_lines.append(resultado + '\n')

    # Join once instead of growing a string inside the loop.
    return ''.join(result_lines)
def test_imputation_pickle():
    # Test for pickling imputers (the three basic strategies plus MICE):
    # a pickle round trip must leave the transform output almost equal.
    import pickle

    size = 100
    X = sparse_random_matrix(size, size, density=0.10).todense()

    def build_imputer(strategy):
        # 'mice' selects the MICE imputer, anything else a basic Imputer.
        if strategy != 'mice':
            return Imputer(missing_values=0, strategy=strategy)
        return MICEImputer(missing_values=0, n_imputations=1, n_burn_in=1)

    for strategy in ["mean", "median", "most_frequent", "mice"]:
        imputer = build_imputer(strategy)
        imputer.fit(X)

        imputer_pickled = pickle.loads(pickle.dumps(imputer))

        expected = imputer.transform(X.copy())
        actual = imputer_pickled.transform(X.copy())
        assert_array_almost_equal(
            expected,
            actual,
            err_msg="Fail to transform the data after pickling "
            "(strategy = %s)" % (strategy))
def test_learning_curve_task_2(self):
    """Run a randomized-search decision tree pipeline on a learning-curve
    task and check its per-sample evaluations."""
    task_id = 801  # diabetes dataset
    num_test_instances = 6144  # for learning curve
    num_repeats, num_folds, num_samples = 1, 10, 8

    search_space = {
        'min_samples_split': [2**x for x in range(1, 7 + 1)],
        'min_samples_leaf': [2**x for x in range(0, 6 + 1)]
    }
    steps = [
        ('Imputer', Imputer(strategy='median')),
        ('VarianceThreshold', VarianceThreshold()),
        ('Estimator', RandomizedSearchCV(DecisionTreeClassifier(),
                                         search_space,
                                         cv=3, n_iter=10)),
    ]
    pipeline2 = Pipeline(steps=steps)

    run = self._perform_run(task_id, num_test_instances, pipeline2,
                            random_state_value='62501')
    self._check_sample_evaluations(run.sample_evaluations, num_repeats,
                                   num_folds, num_samples)
def test_imputation_copy():
    # Test imputation with copy: after transforming, the output is
    # modified and compared with the input to tell whether the two share
    # their data.
    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    def impute(data, **params):
        # Fit a mean imputer on ``data`` and return the transformed data.
        return Imputer(strategy="mean", **params).fit(data).transform(data)

    # Dense input: copy=True => copy, copy=False => no copy
    for copy_flag, check in ((True, assert_false), (False, assert_true)):
        X = X_orig.copy().toarray()
        Xt = impute(X, missing_values=0, copy=copy_flag)
        Xt[0, 0] = -1
        check(np.all(X == Xt))

    # Sparse input, first stored value used as the missing marker:
    # (convert to csc?, extra imputer parameters, expected outcome)
    sparse_cases = [
        # copy=True, sparse csr => copy
        (False, {'copy': True}, assert_false),
        # copy=False, sparse csr, axis=1 => no copy
        (False, {'copy': False, 'axis': 1}, assert_true),
        # copy=False, sparse csc, axis=0 => no copy
        (True, {'copy': False, 'axis': 0}, assert_true),
        # copy=False, sparse csr, axis=0 => copy
        (False, {'copy': False, 'axis': 0}, assert_false),
        # copy=False, sparse csc, axis=1 => copy
        (True, {'copy': False, 'axis': 1}, assert_false),
    ]
    for as_csc, params, check in sparse_cases:
        X = X_orig.copy()
        if as_csc:
            X = X.tocsc()
        Xt = impute(X, missing_values=X.data[0], **params)
        Xt.data[0] = -1
        check(np.all(X.data == Xt.data))

    # copy=False, sparse csr, axis=1, missing_values=0 => copy
    X = X_orig.copy()
    Xt = impute(X, missing_values=0, copy=False, axis=1)
    assert_false(sparse.issparse(Xt))
def _check_statistics(X, X_true, strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Runs the imputer along both axes, on dense and on sparse (CSC) input,
    and checks that:
        - the fitted statistics (mean, median, mode) match ``statistics``
        - the transformed data matches ``X_true``

    For axis=1 the transposed data is used; if ``statistics`` contains a
    NaN, ``transform`` is expected to raise ``ValueError`` instead.
    """
    # strategy / missing_values are filled in now, axis / sparse per check.
    err_msg = ("Parameters: strategy = %s, missing_values = %s, "
               "axis = {0}, sparse = {1}" % (strategy, missing_values))

    # Normal matrix, axis = 0
    dense_cols = Imputer(missing_values, strategy=strategy, axis=0)
    result = dense_cols.fit(X).transform(X.copy())
    assert_array_equal(dense_cols.statistics_, statistics,
                       err_msg.format(0, False))
    assert_array_equal(result, X_true, err_msg.format(0, False))

    # Normal matrix, axis = 1
    dense_rows = Imputer(missing_values, strategy=strategy, axis=1)
    dense_rows.fit(X.transpose())
    if not np.isnan(statistics).any():
        result = dense_rows.transform(X.copy().transpose())
        assert_array_equal(result, X_true.transpose(),
                           err_msg.format(1, False))
    else:
        assert_raises(ValueError, dense_rows.transform,
                      X.copy().transpose())

    # Sparse matrix, axis = 0
    sparse_cols = Imputer(missing_values, strategy=strategy, axis=0)
    sparse_cols.fit(sparse.csc_matrix(X))
    result = sparse_cols.transform(sparse.csc_matrix(X.copy()))
    if sparse.issparse(result):
        result = result.toarray()
    assert_array_equal(sparse_cols.statistics_, statistics,
                       err_msg.format(0, True))
    assert_array_equal(result, X_true, err_msg.format(0, True))

    # Sparse matrix, axis = 1
    sparse_rows = Imputer(missing_values, strategy=strategy, axis=1)
    sparse_rows.fit(sparse.csc_matrix(X.transpose()))
    if not np.isnan(statistics).any():
        result = sparse_rows.transform(
            sparse.csc_matrix(X.copy().transpose()))
        if sparse.issparse(result):
            result = result.toarray()
        assert_array_equal(result, X_true.transpose(),
                           err_msg.format(1, True))
    else:
        assert_raises(ValueError, sparse_rows.transform,
                      sparse.csc_matrix(X.copy().transpose()))
count += 1 if count % 1000 == 0: print(count) val = noncat_matrix[x, y] if val - math.floor(val) != 0.0: for i in range(20): if abs(abs(val) * i - math.ceil(abs(val) * i)) < 0.001: X[x, 2 * y] = math.ceil(abs(val) * i) X[x, 2 * y + 1] = i return X # категории print("building train") train_cat_matr = train_df.ix[:, 0:CAT_COUNT].as_matrix() imp = Imputer(missing_values="NaN", strategy="most_frequent", axis=0) train_cat_matr = imp.fit_transform(train_cat_matr) # imp2 = Imputer(missing_values='NaN', strategy='median') train_noncat_matr = train_df.ix[:, CAT_COUNT:].fillna(0).as_matrix() # train_noncat_matr = train_df.ix[:, CAT_COUNT:].as_matrix() # train_noncat_matr = imp2.fit_transform(train_noncat_matr) # allf = np.hstack((train_cat_matr, train_noncat_matr)) print("building test") test_df.ix[:, 0:CAT_COUNT] = test_set_to_encode test_cat_matr = test_df.ix[:, 0:CAT_COUNT].as_matrix() test_cat_matr = imp.transform(test_cat_matr) test_noncat_matr = test_df.ix[:, CAT_COUNT:].fillna(0).as_matrix() # test_noncat_matr = test_df.ix[:, CAT_COUNT:].as_matrix() # test_noncat_matr = imp2.transform(test_noncat_matr)
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing.imputation import Imputer
from matplotlib import pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Median imputer created up front (named for the age values).
my_age_imputer = Imputer(strategy='median')

# Any results you write to the current directory are saved as output.

# In[ ]:

# loading data into dataframe variables
path = '../input/train.csv'
test_path = '../input/test.csv'
test_data = pd.read_csv(test_path)
train_data = pd.read_csv(path)

# Stack train and test rows. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in 2.0; sort=False keeps the column
# order that append produced.
total_data = pd.concat([train_data, test_data], sort=False)

# exploring the data
print((total_data.isnull().sum()))  # finding columns that have null values

# getting rid of Cabin since most of its values are missing (687)
data = total_data.drop('Cabin', axis=1)  # drop Cabin because it is mostly blank
def setUp(self):
    """Load the ARFF fixture as sparse data and precompute helper values.

    Stores the CSR feature matrix (``self.X``), its one-hot encoded,
    imputed and scaled version (``self.X_transformed``), the targets
    (``self.y``) and the categorical masks for both representations, then
    seeds the helper and metafeature registries with values computed from
    them.
    """
    self.cwd = os.getcwd()
    # Work from this module's directory so the relative fixture path resolves.
    os.chdir(os.path.dirname(__file__))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    feature_attributes = dataset['attributes'][:-1]
    self.attribute_types = [
        'nominal' if type(type_) == list else 'numeric'
        for _, type_ in feature_attributes]
    self.categorical = [kind == 'nominal' for kind in self.attribute_types]

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    # Non-finite entries (NaNs) are replaced by zeros before converting to
    # sparse; per the original note, values encoded as zero are lost when
    # an encoded dense matrix is converted to sparse.
    X_sparse = sparse.csr_matrix(np.where(np.isfinite(X), X, 0))

    X_transformed = OneHotEncoder(self.categorical).fit_transform(
        X_sparse.copy())
    X_transformed = Imputer(copy=False).fit_transform(X_transformed)
    X_transformed = StandardScaler().fit_transform(X_transformed)

    # Transform the array which indicates the categorical metafeatures:
    # the leading entries are flagged categorical, the trailing
    # ``number_numerical`` entries are flagged numerical.
    number_numerical = np.sum(~np.array(self.categorical))
    self.categorical_transformed = (
        [True] * (X_transformed.shape[1] - number_numerical)
        + [False] * number_numerical)

    self.X = X_sparse
    self.X_transformed = X_transformed
    self.y = y
    self.mf = meta_features.metafeatures
    self.helpers = meta_features.helper_functions

    # Precompute some helper functions: (registry, name, call arguments),
    # evaluated and stored in this order.
    precomputed = [
        (self.helpers, "PCA", (self.X_transformed, self.y)),
        (self.helpers, "MissingValues",
         (self.X, self.y, self.categorical)),
        (self.mf, "NumberOfMissingValues",
         (self.X, self.y, self.categorical)),
        (self.helpers, "NumSymbols", (self.X, self.y, self.categorical)),
        (self.helpers, "ClassOccurences", (self.X, self.y)),
        (self.helpers, "Skewnesses",
         (self.X_transformed, self.y, self.categorical_transformed)),
        (self.helpers, "Kurtosisses",
         (self.X_transformed, self.y, self.categorical_transformed)),
    ]
    for registry, entry_name, entry_args in precomputed:
        registry.set_value(entry_name, registry[entry_name](*entry_args))
def _check_statistics(X, X_true, strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Test:
        - along the two axes
        - with dense and sparse arrays

    Check that:
        - the statistics (mean, median, mode) are correct
        - the missing values are imputed correctly

    For axis=1 the transposed data is used; if ``statistics`` contains a
    NaN, ``transform`` is expected to raise ``ValueError`` instead.
    """
    err_msg = "Parameters: strategy = %s, missing_values = %s, " \
              "axis = {0}, sparse = {1}" % (strategy, missing_values)

    def densify(matrix):
        # Convert a sparse result to an ndarray; dense results pass through.
        return matrix.toarray() if sparse.issparse(matrix) else matrix

    # Dense input first, then the same checks on CSC input.
    for is_sparse in (False, True):
        wrap = sparse.csc_matrix if is_sparse else (lambda array: array)

        # axis = 0
        imputer = Imputer(missing_values, strategy=strategy, axis=0)
        imputer.fit(wrap(X))
        X_trans = densify(imputer.transform(wrap(X.copy())))
        assert_array_equal(imputer.statistics_, statistics,
                           err_msg.format(0, is_sparse))
        assert_array_equal(X_trans, X_true, err_msg.format(0, is_sparse))

        # axis = 1
        imputer = Imputer(missing_values, strategy=strategy, axis=1)
        imputer.fit(wrap(X.transpose()))
        if np.isnan(statistics).any():
            assert_raises(ValueError, imputer.transform,
                          wrap(X.copy().transpose()))
        else:
            X_trans = densify(
                imputer.transform(wrap(X.copy().transpose())))
            assert_array_equal(X_trans, X_true.transpose(),
                               err_msg.format(1, is_sparse))
X_train, X_validation, Y_train, Y_validation = train_test_split( X, Y, test_size=validation_size, random_state=seed) X_train = pd.DataFrame(data=X_train, columns=columns) X_validation = pd.DataFrame(data=X_validation, columns=columns) # handling missing values (NaN, Null) # creates additonal new columns based on calumns where missing data was (fill those columns with 1 and 0) # True where missing value was, False where not (1 or 0) missing_columns = [ col for col in X_train.columns if X_train[col].isnull().any() ] for col in missing_columns: X_train[col + '_missing_data'] = X_train[col].isnull() original_data = X_train # fill missing values with mean values imputer = Imputer() X_train = pd.DataFrame(data=imputer.fit_transform(X_train)) X_train.columns = original_data.columns # make one column indicating where wasmissing point, drop missing_columns X_train['missing_values'] = numpy.zeros((len(X_train), 1)) for col in missing_columns: X_train['missing_values'] += X_train[col + '_missing_data'] X_train = X_train.drop([col + '_missing_data'], axis=1) X_train['Age'] = X_train['Age'].values.round() X_train = X_train.values # validation dataset missing_columns = [ col for col in X_validation.columns if X_validation[col].isnull().any() ] for col in missing_columns: