def test_feature_stacker():
    """Sanity-check FeatureUnion stacking a PCA and a univariate selector."""
    dataset = load_iris()
    X = dataset.data
    X -= X.mean(axis=0)
    y = dataset.target
    n_samples = X.shape[0]

    pca = RandomizedPCA(n_components=2)
    select = SelectKBest(k=1)
    union = FeatureUnion([("pca", pca), ("select", select)])
    union.fit(X, y)
    stacked = union.transform(X)
    assert_equal(stacked.shape, (n_samples, 3))

    # The leading columns must match the PCA output and the trailing column
    # the single feature kept by the selector.
    assert_array_almost_equal(stacked[:, :-1], pca.fit_transform(X))
    assert_array_equal(stacked[:, -1], select.fit_transform(X, y).ravel())

    # Sparse input must yield the same stacked features as dense input.
    X_sparse = sparse.csr_matrix(X)
    sparse_stacked = union.fit_transform(X_sparse, y)
    assert_array_almost_equal(stacked, sparse_stacked.toarray())

    # Parameters of a member transformer can be set through the union.
    union.set_params(select__k=2)
    refit_shape = union.fit_transform(X, y).shape
    assert_equal(refit_shape, (n_samples, 4))
def test_set_feature_union_steps():
    """FeatureUnion must pick up transformer steps replaced after creation."""

    def named_mult(factor):
        # Mult instance that additionally reports one feature name "x<factor>"
        est = Mult(factor)
        est.get_feature_names = lambda: ['x%d' % factor]
        return est

    def check(union, values, names):
        # transformed output and prefixed feature names must both match
        assert_array_equal(values, union.transform(np.asarray([[1]])))
        assert_equal(names, union.get_feature_names())

    mult2 = named_mult(2)
    mult3 = named_mult(3)
    mult5 = named_mult(5)

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    check(ft, [[2, 3]], ['m2__x2', 'm3__x3'])

    # Replace the whole list by plain attribute assignment
    ft.transformer_list = [('m5', mult5)]
    check(ft, [[5]], ['m5__x5'])

    # Replace the whole list through set_params
    ft.set_params(transformer_list=[('mock', mult3)])
    check(ft, [[3]], ['mock__x3'])

    # Replace a single named step through set_params
    ft.set_params(mock=mult5)
    check(ft, [[5]], ['mock__x5'])
def test_set_feature_union_steps():
    """Swapping FeatureUnion steps must be reflected by transform and names."""
    double, triple, quintuple = Mult(2), Mult(3), Mult(5)
    double.get_feature_names = lambda: ["x2"]
    triple.get_feature_names = lambda: ["x3"]
    quintuple.get_feature_names = lambda: ["x5"]

    union = FeatureUnion([("m2", double), ("m3", triple)])
    output = union.transform(np.asarray([[1]]))
    assert_array_equal([[2, 3]], output)
    assert_equal(["m2__x2", "m3__x3"], union.get_feature_names())

    # 1) overwrite the attribute directly
    union.transformer_list = [("m5", quintuple)]
    output = union.transform(np.asarray([[1]]))
    assert_array_equal([[5]], output)
    assert_equal(["m5__x5"], union.get_feature_names())

    # 2) overwrite the full list via set_params
    union.set_params(transformer_list=[("mock", triple)])
    output = union.transform(np.asarray([[1]]))
    assert_array_equal([[3]], output)
    assert_equal(["mock__x3"], union.get_feature_names())

    # 3) overwrite one step, addressed by its name, via set_params
    union.set_params(mock=quintuple)
    output = union.transform(np.asarray([[1]]))
    assert_array_equal([[5]], output)
    assert_equal(["mock__x5"], union.get_feature_names())
def test_feature_union():
    """Sanity-check FeatureUnion stacking a truncated SVD and a selector."""
    dataset = load_iris()
    X = dataset.data
    X -= X.mean(axis=0)
    y = dataset.target
    n_samples = X.shape[0]

    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    members = [("svd", svd), ("select", select)]

    union = FeatureUnion(list(members))
    union.fit(X, y)
    X_transformed = union.transform(X)
    assert_equal(X_transformed.shape, (n_samples, 3))

    # Leading columns come from the SVD, the last one from the selector.
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # Sparse input must give the same result.  A fresh union is built here,
    # wrapping the same svd / select instances as above.
    union = FeatureUnion(list(members))
    X_sparse = sparse.csr_matrix(X)
    sparse_out = union.fit_transform(X_sparse, y)
    assert_array_almost_equal(X_transformed, sparse_out.toarray())

    # Nested parameters are reachable through the union.
    union.set_params(select__k=2)
    assert_equal(union.fit_transform(X, y).shape, (n_samples, 4))

    # A member that lacks fit_transform must still be usable.
    union = FeatureUnion([("mock", TransfT())] + members)
    X_transformed = union.fit_transform(X, y)
    assert_equal(X_transformed.shape, (n_samples, 8))
def test_feature_union():
    """Sanity-check FeatureUnion: stacking, sparse input, clone, errors."""
    dataset = load_iris()
    X = dataset.data
    X -= X.mean(axis=0)
    y = dataset.target
    n_samples = X.shape[0]

    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)

    union = FeatureUnion([("svd", svd), ("select", select)])
    union.fit(X, y)
    X_transformed = union.transform(X)
    assert_equal(X_transformed.shape, (n_samples, 3))

    # Leading columns come from the SVD, the last one from the selector.
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # Sparse input must give the same result.  A fresh union is built here,
    # wrapping the same svd / select instances as above.
    union = FeatureUnion([("svd", svd), ("select", select)])
    X_sparse = sparse.csr_matrix(X)
    sparse_out = union.fit_transform(X_sparse, y)
    assert_array_almost_equal(X_transformed, sparse_out.toarray())

    # Cloning must not warn and must produce new member instances.
    cloned = assert_no_warnings(clone, union)
    first_original = union.transformer_list[0][1]
    first_cloned = cloned.transformer_list[0][1]
    assert_false(first_original is first_cloned)

    # Nested parameters are reachable through the union.
    union.set_params(select__k=2)
    assert_equal(union.fit_transform(X, y).shape, (n_samples, 4))

    # A member that lacks fit_transform must still be usable.
    union = FeatureUnion(
        [("mock", Transf()), ("svd", svd), ("select", select)])
    X_transformed = union.fit_transform(X, y)
    assert_equal(X_transformed.shape, (n_samples, 8))

    # A member without transform must be rejected at construction time.
    bad_members = [("transform", Transf()), ("no_transform", NoTrans())]
    assert_raises_regex(TypeError,
                        'All estimators should implement fit and '
                        'transform.*\\bNoTrans\\b',
                        FeatureUnion,
                        bad_members)

    # The constructor also accepts a tuple of (name, transformer) tuples.
    union = FeatureUnion((("svd", svd), ("select", select)))
    union.fit(X, y)
def test_feature_union():
    """Sanity-check FeatureUnion: stacking, sparse input, clone, errors.

    The data is centred on a private copy: ``iris`` is a module-level
    fixture here, and centring ``iris.data`` in place would silently change
    the input of every other test that reads it.
    """
    # basic sanity check for feature union
    X = iris.data.copy()
    X -= X.mean(axis=0)
    y = iris.target
    svd = TruncatedSVD(n_components=2, random_state=0)
    select = SelectKBest(k=1)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    assert X_transformed.shape == (X.shape[0], 3)

    # check if it does the expected thing
    assert_array_almost_equal(X_transformed[:, :-1], svd.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())

    # test if it also works for sparse input
    # (a new union is built around the same svd / select instances)
    fs = FeatureUnion([("svd", svd), ("select", select)])
    X_sp = sparse.csr_matrix(X)
    X_sp_transformed = fs.fit_transform(X_sp, y)
    assert_array_almost_equal(X_transformed, X_sp_transformed.toarray())

    # Test clone
    fs2 = assert_no_warnings(clone, fs)
    assert fs.transformer_list[0][1] is not fs2.transformer_list[0][1]

    # test setting parameters
    fs.set_params(select__k=2)
    assert fs.fit_transform(X, y).shape == (X.shape[0], 4)

    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("svd", svd), ("select", select)])
    X_transformed = fs.fit_transform(X, y)
    assert X_transformed.shape == (X.shape[0], 8)

    # test error if some elements do not support transform
    assert_raises_regex(TypeError,
                        'All estimators should implement fit and '
                        'transform.*\\bNoTrans\\b',
                        FeatureUnion,
                        [("transform", Transf()), ("no_transform", NoTrans())])

    # test that init accepts tuples
    fs = FeatureUnion((("svd", svd), ("select", select)))
    fs.fit(X, y)
class FeatureUnionDataFrame(TransformerMixin):
    """Thin wrapper around ``FeatureUnion`` whose output is a DataFrame.

    All constructor arguments are forwarded unchanged to ``FeatureUnion``.
    ``transform`` wraps the union's output in a ``pandas.DataFrame`` whose
    columns are the union's feature names.
    """

    def __init__(self, *args, **kwargs):
        # The wrapped union holds all the state of this object.
        self.fu = FeatureUnion(*args, **kwargs)

    def fit(self, X, y=None, **kwargs):
        """Fit the wrapped union on ``X`` / ``y`` and return ``self``."""
        self.fu.fit(X, y, **kwargs)
        return self

    def transform(self, X, y=None, **fit_params):
        """Transform ``X`` and return the result as a DataFrame.

        ``y`` and ``fit_params`` are accepted for signature compatibility
        and are not used.
        """
        return pd.DataFrame(self.fu.transform(X),
                            columns=self.fu.get_feature_names())

    def get_feature_names(self):
        """Return the feature names reported by the wrapped union."""
        return self.fu.get_feature_names()

    def set_params(self, **kwargs):
        """Forward ``kwargs`` to the wrapped union and return ``self``.

        Returning ``self`` (instead of ``None``) lets callers chain calls,
        e.g. ``est.set_params(...).fit(X)``.
        """
        self.fu.set_params(**kwargs)
        return self

    def get_params(self, deep=False):
        """Return the wrapped union's parameters (``deep`` is forwarded)."""
        return self.fu.get_params(deep)
def test_set_feature_union_step_none():
    """Setting a FeatureUnion step to None removes it from the output."""
    mult2, mult3 = Mult(2), Mult(3)
    mult2.get_feature_names = lambda: ['x2']
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    union = FeatureUnion([('m2', mult2), ('m3', mult3)])

    def check_output(expected):
        # fit().transform() and fit_transform() must agree
        assert_array_equal(expected, union.fit(X).transform(X))
        assert_array_equal(expected, union.fit_transform(X))

    check_output([[2, 3]])
    assert_equal(['m2__x2', 'm3__x3'], union.get_feature_names())

    # disable the first step
    union.set_params(m2=None)
    check_output([[3]])
    assert_equal(['m3__x3'], union.get_feature_names())

    # disable the remaining step as well: no output columns are left
    union.set_params(m3=None)
    check_output([[]])
    assert_equal([], union.get_feature_names())

    # a disabled step can be re-enabled
    union.set_params(m3=mult3)
    assert_array_equal([[3]], union.fit(X).transform(X))
def test_set_feature_union_step_drop(drop):
    """Steps set to the ``drop`` marker are left out of the union output."""
    mult2, mult3 = Mult(2), Mult(3)
    mult2.get_feature_names = lambda: ['x2']
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    def check_output(union, expected):
        # fit().transform() and fit_transform() must agree
        assert_array_equal(expected, union.fit(X).transform(X))
        assert_array_equal(expected, union.fit_transform(X))

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    check_output(ft, [[2, 3]])
    assert_equal(['m2__x2', 'm3__x3'], ft.get_feature_names())

    # drop the first step
    ft.set_params(m2=drop)
    check_output(ft, [[3]])
    assert_equal(['m3__x3'], ft.get_feature_names())

    # drop the remaining step as well: no output columns are left
    ft.set_params(m3=drop)
    check_output(ft, [[]])
    assert_equal([], ft.get_feature_names())

    # a dropped step can be restored
    ft.set_params(m3=mult3)
    assert_array_equal([[3]], ft.fit(X).transform(X))

    # the marker is also accepted when the union is constructed
    ft = FeatureUnion([('m2', drop), ('m3', mult3)])
    check_output(ft, [[3]])
    assert_equal(['m3__x3'], ft.get_feature_names())
def test_feature_union(weights):
    """Grid-search a pipeline whose first step is a FeatureUnion.

    For every combination of scaling factors and transformer weights the
    expected union output is computed directly and handed to a
    ``CheckXClassifier``, which is searched together with the matching
    ``union__*`` parameters.
    """
    X = np.ones((10, 5))
    y = np.zeros(10)

    union = FeatureUnion(
        [
            ("tr0", ScalingTransformer()),
            ("tr1", ScalingTransformer()),
            ("tr2", ScalingTransformer()),
        ]
    )

    factors = [(2, 3, 5), (2, 4, 5), (2, 4, 6), (2, 4, None), (None, None, None)]
    grid = []
    for constants, w in product(factors, weights or [None]):
        p = {}
        for n, c in enumerate(constants):
            if c is None:
                p["tr%d" % n] = None
            elif n == 2:
                # The 3rd transformer (index 2) is always replaced by a whole
                # estimator.  The previous check `n == 3` could never be true
                # for 3-element tuples, so this path was never exercised.
                p["tr%d" % n] = ScalingTransformer(c)
            else:
                p["tr%d__factor" % n] = c
        sol = union.set_params(transformer_weights=w, **p).transform(X)
        if w is not None:
            p["transformer_weights"] = w
        p2 = {"union__" + k: [v] for k, v in p.items()}
        p2["est"] = [CheckXClassifier(sol[0])]
        grid.append(p2)

    # Need to recreate the union after setting estimators to `None` above
    union = FeatureUnion(
        [
            ("tr0", ScalingTransformer()),
            ("tr1", ScalingTransformer()),
            ("tr2", ScalingTransformer()),
        ]
    )

    pipe = Pipeline([("union", union), ("est", CheckXClassifier())])
    gs = dcv.GridSearchCV(pipe, grid, refit=False, cv=2)

    # Warnings raised during the search are recorded and discarded.
    with warnings.catch_warnings(record=True):
        gs.fit(X, y)
def test_feature_union(weights):
    """Grid-search a pipeline whose first step is a FeatureUnion.

    For every combination of scaling factors and transformer weights the
    expected union output is computed directly and handed to a
    ``CheckXClassifier``, which is searched together with the matching
    ``union__*`` parameters.
    """
    X = np.ones((10, 5))
    y = np.zeros(10)

    union = FeatureUnion([('tr0', ScalingTransformer()),
                          ('tr1', ScalingTransformer()),
                          ('tr2', ScalingTransformer())])

    factors = [(2, 3, 5), (2, 4, 5), (2, 4, 6),
               (2, 4, None), (None, None, None)]
    grid = []
    for constants, w in product(factors, weights or [None]):
        p = {}
        for n, c in enumerate(constants):
            if c is None:
                p['tr%d' % n] = None
            elif n == 2:
                # The 3rd transformer (index 2) is always replaced by a whole
                # estimator.  The previous check `n == 3` could never be true
                # for 3-element tuples, so this path was never exercised.
                p['tr%d' % n] = ScalingTransformer(c)
            else:
                p['tr%d__factor' % n] = c
        sol = union.set_params(transformer_weights=w, **p).transform(X)
        if w is not None:
            p['transformer_weights'] = w
        p2 = {'union__' + k: [v] for k, v in p.items()}
        p2['est'] = [CheckXClassifier(sol[0])]
        grid.append(p2)

    # Need to recreate the union after setting estimators to `None` above
    union = FeatureUnion([('tr0', ScalingTransformer()),
                          ('tr1', ScalingTransformer()),
                          ('tr2', ScalingTransformer())])

    pipe = Pipeline([('union', union), ('est', CheckXClassifier())])
    gs = dcv.GridSearchCV(pipe, grid, refit=False, cv=2)

    # Warnings raised during the search are recorded and discarded.
    with warnings.catch_warnings(record=True):
        gs.fit(X, y)
def test_set_feature_union_step_drop(get_names):
    """Steps set to ``"drop"`` are skipped, and doing so emits no warning.

    ``get_names`` is the name of the feature-name accessor under test
    (``"get_feature_names"`` or ``"get_feature_names_out"``).

    The "no warning" checks use ``warnings.catch_warnings()`` with an
    ``"error"`` filter instead of ``pytest.warns(None)``, which is
    deprecated in pytest 7 and rejected by pytest 8.  Any warning raised
    inside such a block now fails the test at the point where it is raised.
    """
    import warnings

    mult2 = Mult(2)
    mult3 = Mult(3)

    if get_names == "get_feature_names":
        mult2.get_feature_names = lambda: ["x2"]
        mult3.get_feature_names = lambda: ["x3"]
    else:  # get_feature_names_out
        mult2.get_feature_names_out = lambda input_features: ["x2"]
        mult3.get_feature_names_out = lambda input_features: ["x3"]

    X = np.asarray([[1]])

    ft = FeatureUnion([("m2", mult2), ("m3", mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert_array_equal(["m2__x2", "m3__x3"], getattr(ft, get_names)())

    # The feature-name assertions below are kept outside the strict blocks:
    # only set_params / fit / transform are required to be warning-free.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        ft.set_params(m2="drop")
        assert_array_equal([[3]], ft.fit(X).transform(X))
        assert_array_equal([[3]], ft.fit_transform(X))
    assert_array_equal(["m3__x3"], getattr(ft, get_names)())

    with warnings.catch_warnings():
        warnings.simplefilter("error")
        ft.set_params(m3="drop")
        assert_array_equal([[]], ft.fit(X).transform(X))
        assert_array_equal([[]], ft.fit_transform(X))
    assert_array_equal([], getattr(ft, get_names)())

    with warnings.catch_warnings():
        warnings.simplefilter("error")
        # check we can change back
        ft.set_params(m3=mult3)
        assert_array_equal([[3]], ft.fit(X).transform(X))

    with warnings.catch_warnings():
        warnings.simplefilter("error")
        # Check 'drop' step at construction time
        ft = FeatureUnion([("m2", "drop"), ("m3", mult3)])
        assert_array_equal([[3]], ft.fit(X).transform(X))
        assert_array_equal([[3]], ft.fit_transform(X))
    assert_array_equal(["m3__x3"], getattr(ft, get_names)())
def test_set_feature_union_step_drop():
    """Steps set to ``'drop'`` are skipped, and doing so emits no warning.

    The "no warning" checks use ``warnings.catch_warnings()`` with an
    ``"error"`` filter instead of ``pytest.warns(None)``, which is
    deprecated in pytest 7 and rejected by pytest 8.  Any warning raised
    inside such a block now fails the test at the point where it is raised.
    """
    import warnings

    mult2 = Mult(2)
    mult2.get_feature_names = lambda: ['x2']
    mult3 = Mult(3)
    mult3.get_feature_names = lambda: ['x3']
    X = np.asarray([[1]])

    ft = FeatureUnion([('m2', mult2), ('m3', mult3)])
    assert_array_equal([[2, 3]], ft.fit(X).transform(X))
    assert_array_equal([[2, 3]], ft.fit_transform(X))
    assert ['m2__x2', 'm3__x3'] == ft.get_feature_names()

    # The feature-name assertions below are kept outside the strict blocks:
    # only set_params / fit / transform are required to be warning-free.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        ft.set_params(m2='drop')
        assert_array_equal([[3]], ft.fit(X).transform(X))
        assert_array_equal([[3]], ft.fit_transform(X))
    assert ['m3__x3'] == ft.get_feature_names()

    with warnings.catch_warnings():
        warnings.simplefilter("error")
        ft.set_params(m3='drop')
        assert_array_equal([[]], ft.fit(X).transform(X))
        assert_array_equal([[]], ft.fit_transform(X))
    assert [] == ft.get_feature_names()

    with warnings.catch_warnings():
        warnings.simplefilter("error")
        # check we can change back
        ft.set_params(m3=mult3)
        assert_array_equal([[3]], ft.fit(X).transform(X))

    with warnings.catch_warnings():
        warnings.simplefilter("error")
        # Check 'drop' step at construction time
        ft = FeatureUnion([('m2', 'drop'), ('m3', mult3)])
        assert_array_equal([[3]], ft.fit(X).transform(X))
        assert_array_equal([[3]], ft.fit_transform(X))
    assert ['m3__x3'] == ft.get_feature_names()
class MCPTFeatureEvaluator(BaseEstimator, SelectorMixin):
    """Univariate feature screening and selection on a pandas DataFrame.

    ``fit`` preprocesses the frame per dtype (optional imputation, label
    encoding of categorical/object columns), passes predictors and target to
    ``univariate.screen_univariate`` and stores the resulting table in
    ``self.information``.  ``_get_support_mask`` turns the p-value columns of
    that table into a boolean mask over the original columns.

    Parameters
    ----------
    convert_datetime : bool (default: False)
        Stored on the instance; not read anywhere in this class.
    selector_recipe : {'univariate_unbiased', 'univariate_cscv'}
        Rule used by ``_get_support_mask`` to select variables.
    impute : bool (default: False)
        If True, imputation steps are added to the preprocessing pipelines.
    verbose : bool (default: False)
        If True, timing information is printed during ``fit`` and the flag
        is forwarded to ``univariate.screen_univariate``.
    copy : bool (default: True)
        If True, ``fit`` copies ``X`` before casting ``cols_to_discrete``
        in place, so the caller's DataFrame is left untouched.  If False,
        the caller's DataFrame may be modified.

    Attributes set by ``fit``
    -------------------------
    original_predictor_columns, numeric_cols, bool_cols, category_cols,
    object_cols, datetime_cols, preprocess_features, preprocessed_columns,
    column_type_mask, method, measure, target, n_bins_x, n_bins_y,
    n_mcpt_reps, n_cscv_folds, information.
    """

    def __init__(self, convert_datetime=False,
                 selector_recipe='univariate_unbiased', impute=False,
                 verbose=False, copy=True):
        # NOTE(review): kept as an assert so the exception type callers may
        # rely on (AssertionError) does not change; asserts vanish under -O.
        assert selector_recipe in \
            ['univariate_unbiased', 'univariate_cscv'], \
            "Provided 'selector_recipe' is not valid!"
        self.convert_datetime = convert_datetime
        self.selector_recipe = selector_recipe
        self.impute = impute
        self.verbose = verbose
        self.copy = copy

    def _build_preprocessor(self):
        """Return the per-dtype FeatureUnion used to preprocess ``X``.

        Reads ``self.impute``, ``self.category_cols`` and
        ``self.object_cols``.
        """
        if self.impute:
            return FeatureUnion(transformer_list=[
                ("numeric_features", make_pipeline(
                    TypeSelector(np.number),
                    Imputer(strategy="median")
                )),
                ("boolean_features", make_pipeline(
                    TypeSelector("bool"),
                    Imputer(strategy="most_frequent")
                )),
                ("categorical_features", make_pipeline(
                    TypeSelector("category"),
                    MostFrequentImputer(),
                    MultiColumnLabelEncoder(self.category_cols)
                )),
                ("object_features", make_pipeline(
                    TypeSelector("object"),
                    MostFrequentImputer(),
                    MultiColumnLabelEncoder(self.object_cols)
                )),
                ("datetime_features", make_pipeline(
                    TypeSelector("datetime"),
                    Imputer(strategy="most_frequent")
                )),
            ])

        # don't impute missing values: less execution time
        return FeatureUnion(transformer_list=[
            ("numeric_features", make_pipeline(
                TypeSelector(np.number)
            )),
            ("boolean_features", make_pipeline(
                TypeSelector("bool")
            )),
            ("categorical_features", make_pipeline(
                TypeSelector("category"),
                MultiColumnLabelEncoder(self.category_cols)
            )),
            ("object_features", make_pipeline(
                TypeSelector("object"),
                MultiColumnLabelEncoder(self.object_cols)
            )),
            ("datetime_features", make_pipeline(
                TypeSelector("datetime")
            )),
        ])

    def fit(self, X, y, cols_to_discrete=None, method='discrete',
            measure='mi', n_bins_x=5, n_bins_y=5, n_reps=100,
            cscv_folds=None, target='cpu'):
        """Screen every predictor in ``X`` against the target.

        Parameters
        ----------
        X : pandas.DataFrame
            Input data.
        y : pandas.Series, numpy.ndarray or column label
            Target values, or the label of the target column inside ``X``.
            An ndarray is matched to the rows of ``X`` by position.
        cols_to_discrete : list or None (default: None)
            Columns of ``X`` to cast to ``object`` before preprocessing.
        method, measure, n_bins_x, n_bins_y, target
            Stored on the instance and forwarded to
            ``univariate.screen_univariate``.  ``measure == 'mi'`` labels the
            score column 'MI'; any other value labels it 'UR'.
        n_reps : int (default: 100)
            Non-negative number of repetitions, forwarded as ``n_reps``.
            With 0, no p-value columns are produced.
        cscv_folds : int or None (default: None)
            Forwarded as ``cscv_folds``.  Defaults to 4 (with a warning)
            when the recipe contains 'cscv' and no value is given.

        Returns
        -------
        self
        """
        # make sure n_reps is valid
        assert isinstance(n_reps, int) and n_reps >= 0, \
            "n_reps must be an integer greater than or equal to 0."
        if n_reps == 0:
            warnings.warn("n_reps=0 ... selector_recipe is not applicable.",
                          RuntimeWarning)

        # make sure that a DataFrame has been passed in
        assert isinstance(X, pd.DataFrame)

        if isinstance(y, (pd.Series, np.ndarray)):
            # y must have one value per row of X
            assert len(y) == X.shape[0]
            if isinstance(y, np.ndarray):
                # Wrap the array in a named Series that shares X's index.
                # `assign` aligns on the index, so a default RangeIndex would
                # produce NaN targets whenever X has any other index.
                y_name = 'y_auto'
                y = pd.Series(y, index=X.index, name='y_auto')
            else:
                # capture the name from the y pandas Series
                y_name = y.name
            # Capture original predictor columns prior to adding y into X
            self.original_predictor_columns = X.columns
            # add y into X so that it can be pre-processed correctly
            X = X.assign(y_auto=y).rename(columns={'y_auto': y_name})
        else:
            # make sure that 'y' is a column in X
            assert y in X.columns, \
                "The target column 'y' does not exist in 'X'!"
            y_name = y
            # Capture the original columns without removing the target
            self.original_predictor_columns = X.columns

        # Convenience check: remove columns holding a single value, as the
        # information calculations will not really be accurate for them.
        for col in X.columns:
            if X[col].nunique() == 1:
                if col == y_name:
                    raise RuntimeError(
                        "The target variable has an insufficient "
                        "number of unique values!")
                X = X.drop([col], axis=1)
                # .format() keeps this working for non-string column labels
                warnings.warn(
                    "Removing column '{}' due to non-unique values"
                    .format(col),
                    RuntimeWarning)

        # convert any columns to discrete that are indicated in the fit call
        if cols_to_discrete is not None:
            if self.copy:
                # the cast below writes into X; protect the caller's frame
                X = X.copy()
            try:
                X[cols_to_discrete] = X[cols_to_discrete].astype(object)
            except KeyError:
                cols_error = list(set(cols_to_discrete) - set(X.columns))
                raise KeyError("The DataFrame does not include the following "
                               "columns to discretize: %s" % (cols_error,))

        t0 = time()

        # Capture the column names per dtype.  These are used later in the
        # presentation of the results.
        self.numeric_cols = TypeSelector(np.number, True).fit_transform(X)
        self.bool_cols = TypeSelector("bool", True).fit_transform(X)
        self.category_cols = TypeSelector("category", True).fit_transform(X)
        self.object_cols = TypeSelector("object", True).fit_transform(X)
        self.datetime_cols = TypeSelector("datetime", True).fit_transform(X)
        self.preprocessed_columns = []
        self.column_type_mask = []

        if self.verbose:
            t1 = time()
            print("Type selector time: {0:.3g} sec".format(t1 - t0))

        t2 = time()

        # Generate the feature union with all the possible different dtypes
        self.preprocess_features = self._build_preprocessor()

        # Pipelines for dtypes that are absent from the data would raise when
        # the union runs, so they are set to None.  Present dtypes contribute
        # their columns (in union order) and a type tag per column, so that
        # the proper algorithm can be used downstream
        # (cont/cont, discrete/cont, discrete/discrete).
        dtype_groups = [
            ('numeric_features', self.numeric_cols, 'numeric'),
            ('boolean_features', self.bool_cols, 'discrete'),
            ('categorical_features', self.category_cols, 'discrete'),
            ('object_features', self.object_cols, 'discrete'),
            ('datetime_features', self.datetime_cols, 'datetime'),
        ]
        for step_name, cols, type_tag in dtype_groups:
            if len(cols) == 0:
                self.preprocess_features.set_params(**{step_name: None})
            else:
                self.preprocessed_columns += list(cols)
                self.column_type_mask += [type_tag] * len(cols)

        X = self.preprocess_features.fit_transform(X)

        if self.verbose:
            t3 = time()
            print("Preprocess time: {0:.3g} sec".format(t3 - t2))

        # Convert back to a DataFrame so the target can be extracted by name.
        # The target was preprocessed inline with all the other variables.
        X = pd.DataFrame(data=X, columns=self.preprocessed_columns)

        # extract y from X after preprocessing and remove it from X
        y = X[y_name]
        X = X.drop([y_name], axis=1)

        # NOTE(review): a boolean target is not treated as discrete here,
        # although boolean predictors are tagged 'discrete' above - confirm.
        target_is_discrete = (y_name in self.category_cols) or \
            (y_name in self.object_cols)

        # Drop the target's entry from the column list and the type mask
        self.preprocessed_columns = np.array(self.preprocessed_columns)
        self.column_type_mask = np.array(self.column_type_mask)
        predictor_idx = np.where(self.preprocessed_columns != y_name)[0]
        self.column_type_mask = self.column_type_mask[predictor_idx]
        self.preprocessed_columns = self.preprocessed_columns[predictor_idx]

        # Next steps ... run the screening
        self.method = method
        self.measure = measure
        self.target = target
        self.n_bins_x = n_bins_x
        self.n_bins_y = n_bins_y
        self.n_mcpt_reps = n_reps
        if cscv_folds is None and ('cscv' in self.selector_recipe):
            self.n_cscv_folds = 4
            warnings.warn(
                "No value provided for 'cscv_folds'. "
                "Setting to '4' CSCV folds by default.",
                RuntimeWarning)
        else:
            self.n_cscv_folds = cscv_folds

        kwargs = {'method': self.method,
                  'measure': self.measure,
                  'target_is_discrete': target_is_discrete,
                  'column_type_mask': self.column_type_mask,
                  'n_bins_x': self.n_bins_x,
                  'n_bins_y': self.n_bins_y,
                  'n_reps': self.n_mcpt_reps,
                  'cscv_folds': self.n_cscv_folds,
                  'target': self.target,
                  'verbose': self.verbose}

        if self.selector_recipe in ['univariate_unbiased', 'univariate_cscv']:
            info_matrix = univariate.screen_univariate(X.values, y.values,
                                                       **kwargs)
        else:
            raise ValueError(
                "Unknown selector_recipe: %r" % (self.selector_recipe,))

        var_series = pd.Series(self.preprocessed_columns, name='Variables')
        measure_name = 'MI' if self.measure == 'mi' else 'UR'

        if self.n_mcpt_reps > 0:
            col_names = [measure_name, 'Solo p-value', 'Unbiased p-value']
        else:
            col_names = [measure_name]
        if self.n_cscv_folds is not None:
            col_names += ['P(<=median)']

        self.information = pd.DataFrame(info_matrix, columns=col_names)
        self.information.insert(0, 'Variable', var_series)
        self.information = self.information.sort_values(by=measure_name,
                                                        ascending=False)
        self.information = self.information.reset_index(drop=True)

        return self

    def _get_support_mask(self):
        """Return a boolean mask over ``original_predictor_columns``.

        With ``n_reps == 0`` at fit time every column is kept.  Otherwise a
        column is kept when its 'Unbiased p-value' is below 0.05 and, for the
        'univariate_cscv' recipe, its 'P(<=median)' is at most 0.2.
        """
        check_is_fitted(self, 'information')

        orig_cols = self.original_predictor_columns

        if self.n_mcpt_reps <= 0:
            # self.n_mcpt_reps = 0 .. return all original columns
            return np.ones(len(orig_cols), dtype=bool)

        info = self.information
        if self.selector_recipe == "univariate_unbiased":
            variable_mask = (info['Unbiased p-value'] < 0.05).values
        elif self.selector_recipe == "univariate_cscv":
            below_median = (info['P(<=median)'] <= 0.2).values
            significant = (info['Unbiased p-value'] < 0.05).values
            variable_mask = below_median & significant
        else:
            raise ValueError(
                "Unknown selector_recipe: %r" % (self.selector_recipe,))

        # Set membership is O(1) per column; the explicit dtype keeps the
        # mask boolean even when there are no columns at all.
        selected = set(info['Variable'][variable_mask].values)
        return np.array([col in selected for col in orig_cols], dtype=bool)
# For training data, must remove the empty lines to ensure accurate training (X_train, raw_train, y_train) = preprocessing_removeEmpty(raw_train, y_train) print('Number of lines training set: ' + str(len(X_train))) (X_test, raw_test, y_test) = preprocessing_removeEmpty(raw_test, y_test) print('Number of lines testing set: ' + str(len(X_test))) unigram_vectorizer = TfidfVectorizer(ngram_range=(1, 1), min_df=1) temp_uni_tfidf = unigram_vectorizer.fit_transform(X_train).toarray() # n_features = len(unigram_vectorizer.get_feature_names()) n_features = 100 multigrams_vectorizer = TfidfVectorizer(ngram_range=(2, 3), min_df=2, max_features=n_features) comb_vectorizer = FeatureUnion([("uni_vec", unigram_vectorizer), ("multi_vec", multigrams_vectorizer)]) comb_vectorizer.set_params(multi_vec=None) X_train_tfidf = comb_vectorizer.fit_transform(X_train).toarray() feature_names = comb_vectorizer.get_feature_names() print("num_features: " + str(len(feature_names))) # print(feature_names[:50]) print("features extracted & tfidf transformed.") # Transform documents to document-term matrix. (.transform) - No learning involved as it is test data # For test data # X_test_tfidf = vectorizer.transform(X_test).toarray() X_test_tfidf = comb_vectorizer.transform(X_test).toarray() # model = svm.SVC(kernel="linear",C=100,cache_size=5000,probability=True) print('Creating Model...') model = svm.LinearSVC(C=10, multi_class='ovr') # model = svm.SVC(kernel="linear",C=10,decision_function_shape='ovr',cache_size=5000,probability=True)
def extract_features(X, sfreq, selected_funcs, funcs_params=None, n_jobs=1,
                     return_as_df=False):
    """Extraction of temporal or spectral features from epoched EEG signals.

    Parameters
    ----------
    X : ndarray, shape (n_epochs, n_channels, n_times)
        Array of epoched EEG data. Must contain at least one epoch.

    sfreq : float
        Sampling rate of the data.

    selected_funcs : list of str
        The elements of `selected_funcs` are aliases for the feature
        functions which will be used to extract features from the data.
        (See `mne_features` documentation for a complete list of available
        feature functions).

    funcs_params : dict or None (default: None)
        If not None, dict of optional parameters to be passed to the feature
        functions. Each key of the `funcs_params` dict should be of the form:
        [alias_feature_function]__[optional_param]
        (for example: `higuchi_fd__kmax`).

    n_jobs : int (default: 1)
        Number of CPU cores used when parallelizing the feature extraction.
        If given a value of -1, all cores are used.

    return_as_df : bool (default: False)
        If True, the extracted features will be returned as a Pandas
        DataFrame. The column index is a MultiIndex (see `pd.MultiIndex`)
        which contains the alias of each feature function which was used.
        If False, the features are returned as a 2d Numpy array.

    Returns
    -------
    array-like, shape (n_epochs, n_features)

    Raises
    ------
    ValueError
        If `sfreq` is not positive or if `X` contains no epoch.
    """
    if sfreq <= 0:
        raise ValueError('Sampling rate `sfreq` must be positive.')
    n_epochs = X.shape[0]
    if n_epochs == 0:
        # Fail early with a clear message; without this check the stacking
        # step below fails on an empty list of per-epoch results.
        raise ValueError('`X` should contain at least one epoch.')

    univariate_funcs = get_univariate_funcs(sfreq)
    bivariate_funcs = get_bivariate_funcs(sfreq)
    feature_funcs = univariate_funcs.copy()
    feature_funcs.update(bivariate_funcs)
    sel_funcs = _check_func_names(selected_funcs, feature_funcs.keys())

    # Feature extraction: one transformer per selected feature function
    _tr = [(n, FeatureFunctionTransformer(func=feature_funcs[n]))
           for n in sel_funcs]
    extractor = FeatureUnion(transformer_list=_tr)
    if funcs_params is not None:
        extractor.set_params(**funcs_params)

    # Each epoch is processed independently, possibly in parallel
    res = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(_apply_extractor)(extractor, X[j, :, :])
        for j in range(n_epochs))
    Xnew = np.vstack(res)

    if return_as_df:
        return _format_as_dataframe(Xnew, extractor.get_feature_names())
    else:
        return Xnew
def extract_features(X, sfreq, selected_funcs, funcs_params=None, n_jobs=1,
                     ch_names=None, return_as_df=False):
    """Extraction of temporal or spectral features from epoched EEG signals.

    Parameters
    ----------
    X : ndarray, shape (n_epochs, n_channels, n_times)
        Array of epoched EEG data. Must contain at least one epoch.

    sfreq : float
        Sampling rate of the data.

    selected_funcs : list of str or tuples
        The elements of ``selected_funcs`` are either strings or tuples of
        the form ``(str, callable)``. If an element is of type ``str``, it is
        the alias of a feature function. The aliases are built from the
        feature functions' names by removing ``compute_``. For instance, the
        alias of the feature function :func:`compute_ptp_amp` is ``ptp_amp``.
        (See the documentation of mne-features). If an element is of type
        ``tuple``, the first element of the tuple should be a string
        (name/alias given to a user-defined feature function) and the second
        element should be a callable (a user-defined feature function which
        accepts Numpy arrays with shape ``(n_channels, n_times)``). The
        names/aliases given to user-defined feature functions should not
        intersect the aliases used by mne-features. If the name given to a
        user-defined feature function is already used as an alias in
        mne-features, an error will be raised.

    funcs_params : dict or None (default: None)
        If not None, dict of optional parameters to be passed to the feature
        functions. Each key of the ``funcs_params`` dict should be of the
        form: ``[alias_feature_function]__[optional_param]`` (for example:
        ``higuchi_fd__kmax``).

    n_jobs : int (default: 1)
        Number of CPU cores used when parallelizing the feature extraction.
        If given a value of -1, all cores are used.

    ch_names : list of str or None (default: None)
        If not None, list containing the names of each input channel. Its
        length must equal ``X.shape[1]``.

    return_as_df : bool (default: False)
        If True, the extracted features will be returned as a Pandas
        DataFrame. The column index is a MultiIndex (see
        :class:`~pandas.MultiIndex`) which contains the alias of each feature
        function which was used. If False, the features are returned as a 2d
        Numpy array.

    Returns
    -------
    array-like, shape (n_epochs, n_features)

    Raises
    ------
    ValueError
        If ``sfreq`` is not positive, if ``X`` contains no epoch or if the
        length of ``ch_names`` does not match the number of channels.
    """
    # Cheap argument checks come first, before any feature function is built.
    if sfreq <= 0:
        raise ValueError('Sampling rate `sfreq` must be positive.')
    n_epochs = X.shape[0]
    if n_epochs == 0:
        # The feature names are read from the first per-epoch result below,
        # so at least one epoch is required.
        raise ValueError('`X` should contain at least one epoch.')
    if ch_names is not None and len(ch_names) != X.shape[1]:
        raise ValueError('`ch_names` should be of length %s' % X.shape[1])

    univariate_funcs = get_univariate_funcs(sfreq)
    bivariate_funcs = get_bivariate_funcs(sfreq)
    feature_funcs = univariate_funcs.copy()
    feature_funcs.update(bivariate_funcs)
    sel_funcs = _check_funcs(selected_funcs, feature_funcs)

    # Feature extraction: one transformer per selected feature function
    _tr = [(n, FeatureFunctionTransformer(func=func)) for n, func in sel_funcs]
    extractor = FeatureUnion(transformer_list=_tr)
    if funcs_params is not None:
        extractor.set_params(**funcs_params)

    # Each epoch is processed independently, possibly in parallel.  Every
    # result is indexed as (features, feature_names) below.
    res = joblib.Parallel(n_jobs=n_jobs)(joblib.delayed(_apply_extractor)(
        extractor, X[j, :, :], ch_names, return_as_df)
        for j in range(n_epochs))
    feature_names = res[0][1]
    res = list(zip(*res))[0]
    Xnew = np.vstack(res)

    if return_as_df:
        return _format_as_dataframe(Xnew, feature_names)
    else:
        return Xnew
kf = KFold(len(X_trainDF),5,shuffle=True,random_state=55) while compteur < Iteration: print compteur C = 10**(uniform(-6,-2)) p = uniform(3,6) npca = randrange(5, 30) which_feature = {k:int(proba.random()) for k in Feature.transformer_weights.keys()} which_feature['HOGFeature'] = 1 which_feature['SobelFeature'] = 1 Feature.transformer_weights = which_feature param = {'SobelFeature__PCA__n_components':npca, 'RawImage__PCA__n_components':npca, 'HOGFeature__PCA__n_components':npca} Feature.set_params(**param) scores = []; rocauctr = []; rocaucval = [] print 'Debut cross-validation' for train_index, val_index in kf: X_trDF, X_valDF = X_trainDF.iloc[train_index], X_trainDF.iloc[val_index] y_trDF, y_valDF = y_trainDF.iloc[train_index], y_trainDF.iloc[val_index] X_tr = Feature.fit_transform(X_trDF) y_tr = np.array(y_trDF)[:,np.newaxis] X_val = Feature.transform(X_valDF) y_val = np.array(y_valDF)[:,np.newaxis] model = LogisticRegression(penalty='l2',C = C, class_weight = {0:1,1:p})