def test_column_transformer_get_feature_names():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer([('trans', Trans(), [0, 1])])
    # raise correct error when not fitted
    assert_raises(NotFittedError, ct.get_feature_names)
    # raise correct error when no feature names are available
    ct.fit(X_array)
    assert_raise_message(
        AttributeError,
        "Transformer trans (type Trans) does not provide "
        "get_feature_names", ct.get_feature_names)

    # working example
    X = np.array([[{'a': 1, 'b': 2}, {'a': 3, 'b': 4}],
                  [{'c': 5}, {'c': 6}]], dtype=object).T
    ct = ColumnTransformer(
        [('col' + str(i), DictVectorizer(), i) for i in range(2)])
    ct.fit(X)
    assert_equal(ct.get_feature_names(), ['col0__a', 'col0__b', 'col1__c'])

    # passthrough transformers not supported
    ct = ColumnTransformer([('trans', 'passthrough', [0, 1])])
    ct.fit(X)
    assert_raise_message(NotImplementedError,
                         'get_feature_names is not yet supported',
                         ct.get_feature_names)

    ct = ColumnTransformer([('trans', DictVectorizer(), 0)],
                           remainder='passthrough')
    ct.fit(X)
    assert_raise_message(NotImplementedError,
                         'get_feature_names is not yet supported',
                         ct.get_feature_names)

    # drop transformer
    ct = ColumnTransformer([('col0', DictVectorizer(), 0),
                            ('col1', 'drop', 1)])
    ct.fit(X)
    assert_equal(ct.get_feature_names(), ['col0__a', 'col0__b'])
def load_credita():
    path = os.path.join('datasets', 'credit-a.arff')
    raw_data = loadarff(path)
    df = pd.DataFrame(raw_data[0])
    y = df.pop('class')
    X = df

    y_label_encoder = LabelEncoder()
    y = y_label_encoder.fit_transform(y)

    # fill missing numerical values
    X.fillna(X.mean(), inplace=True)

    # fill missing categorical values
    categ_cols = X.select_dtypes(include=['category', object]).columns
    for col in categ_cols:
        X[col].replace(b'?', X[col].mode()[0], inplace=True)

    # standardize numerical features
    num_cols = X.select_dtypes(include=['number']).columns
    mm_scaler = MinMaxScaler()
    X[num_cols] = mm_scaler.fit_transform(X[num_cols])

    # use one transformer per feature to preserve its name in the generated
    # features, since new feature names are based on the transformer's name
    transformers = [(col, OneHotEncoder(drop='first'), [col])
                    for col in categ_cols]
    col_transformer = ColumnTransformer(transformers, remainder='passthrough')
    X_arr = col_transformer.fit_transform(X)
    X = pd.DataFrame(X_arr, columns=col_transformer.get_feature_names())

    return X, y
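# The per-column trick above works because ColumnTransformer prefixes every
# generated feature with its transformer's name. A minimal, self-contained
# illustration (the toy columns 'A9'/'A10' are hypothetical, and this assumes
# scikit-learn < 1.0, where get_feature_names still exists):
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

toy = pd.DataFrame({'A9': ['f', 't', 't'], 'A10': ['u', 'v', 'u']})
per_col = ColumnTransformer(
    [(col, OneHotEncoder(drop='first'), [col]) for col in toy.columns])
per_col.fit(toy)
# -> ['A9__x0_t', 'A10__x0_v']: each name keeps its source column, unlike a
# single shared encoder, which would yield opaque names like 'onehot__x0_t'.
print(per_col.get_feature_names())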
class BoutDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        ## - one-hot encode hyp_time_col
        ## - scale sedentary, light, medium, vigorous
        ## - make sure activities are in order
        ## - bout_train = BoutDataset(df_per_hour.loc[train_ids.pid])
        ## - bout_test = BoutDataset(df_per_hour.loc[test_ids.pid])

        # One-hot encode the hyp_time_col
        self.columnTransformer = ColumnTransformer(
            [('hour', OneHotEncoder(handle_unknown='ignore', sparse=False),
              ['hour'])],
            remainder='passthrough')
        index = df.index
        df = self.columnTransformer.fit_transform(df)
        df = pd.DataFrame(df,
                          columns=self.columnTransformer.get_feature_names(),
                          index=index)

        filter_columns = df.columns.str.startswith('hour')
        columns = df.columns[filter_columns].str.split('__x0_').str.join('_')
        # append last columns to renamed beginning columns
        columns = columns.append(df.columns[len(columns):])
        df.columns = columns

        # reorder
        reorder_columns = [
            'sedentary_bins', 'light_bins', 'medium_bins', 'vigorous_bins'
        ]
        reorder_columns.extend(df.columns[filter_columns].tolist())
        df = df[reorder_columns]

        # Scale
        self.scaler = preprocessing.StandardScaler()
        scaled = self.scaler.fit_transform(df[[
            'sedentary_bins', 'light_bins', 'medium_bins', 'vigorous_bins'
        ]])
        df[['sedentary_bins', 'light_bins', 'medium_bins',
            'vigorous_bins']] = scaled

        # Set class object
        self.df = df.sort_index()
        self.df = self.df[[
            'sedentary_bins', 'light_bins', 'medium_bins', 'vigorous_bins'
        ]]
        self.hours_in_day = 24

    def __len__(self):
        return len(self.df)

    def __getitem__(self, pid):
        # Select sample
        X = self.df.loc[pid].values
        X = np.expand_dims(X, axis=0)
        y = self.df.loc[pid].values
        y = np.expand_dims(y, axis=0)
        return X, y
def apply_haar_smooth(data):
    transformers = [
        ('orig', PassthroughTransformer(), ['pct_change__close']),
        ('haar_smooth', HaarSmoothTransformer(.4), ['pct_change__close']),
    ]
    ct = ColumnTransformer(transformers=transformers, remainder='drop',
                           n_jobs=-1)
    return pd.DataFrame(data=ct.fit_transform(data),
                        columns=ct.get_feature_names())
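# PassthroughTransformer and HaarSmoothTransformer are project-specific
# classes not shown here. For ct.get_feature_names() above to work, each must
# implement get_feature_names(); a minimal sketch of the passthrough variant,
# under that assumption (not the project's actual code):
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class PassthroughTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        # remember the selected column names for get_feature_names()
        self.cols_ = list(X.columns)
        return self

    def transform(self, X):
        # return the selected columns unchanged
        return np.asarray(X)

    def get_feature_names(self):
        return self.cols_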
class FeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Ugly, but otherwise col_transformer.get_feature_names() doesn't work
        StandardScaler.get_feature_names = get_empty_feature_names
        FunctionTransformer.get_feature_names = get_empty_feature_names
        OrdinalEncoder.get_feature_names = get_empty_feature_names
        SimpleImputer.get_feature_names = get_empty_feature_names
        RobustScaler.get_feature_names = get_empty_feature_names

        # Transformer which returns its input unchanged
        identity = FunctionTransformer(func=lambda x: x, validate=False)
        # Transformer which computes 1/x
        reciprocal = FunctionTransformer(func=lambda x: 1 / x, validate=False)

        # ColumnTransformer allows different columns or column subsets of the
        # input to be transformed separately and the results combined into a
        # single feature space.
        self.col_transformer = ColumnTransformer(
            [
                # (name, transformer, column(s))
                # == categorical ==
                # OneHotEncoder - M categories in column -> M columns
                ("Transmission Type", OneHotEncoder(), ["Transmission Type"]),
                # OrdinalEncoder - encodes categories as integers
                ("Vehicle Size",
                 OrdinalEncoder([['Compact', 'Midsize', 'Large']]),
                 ["Vehicle Size"]),
                # == numerical ==
                # Leave column as it is
                ("Number of Doors", identity, ["Number of Doors"]),
                ("Engine HP", identity, ["Engine HP"]),
                # calculate 1/x
                ("city mpg trans", reciprocal, ["city mpg"]),
                # Leave column as it is
                ("Year", identity, ["Year"]),
            ],
            remainder='drop'  # Drop all other remaining columns
        )

    def fit(self, X):
        self.col_transformer.fit(X)
        return self

    def transform(self, X):
        return self.col_transformer.transform(X)

    def get_feature_names(self):
        return self.col_transformer.get_feature_names()
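# get_empty_feature_names is referenced above but not defined in this snippet.
# A plausible sketch (an assumption, not the original helper): each patched
# transformer handles a single column, so returning one empty string makes
# ColumnTransformer emit the transformer's own name (e.g. 'Year__') as the
# feature name while keeping the name count aligned with the output columns.
def get_empty_feature_names(self):
    return ['']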
class OneHotEncoderPrim(primitive):
    # can handle missing values: turns NaNs into an extra category
    def __init__(self, random_state=0):
        super(OneHotEncoderPrim, self).__init__(name='OneHotEncoder')
        self.id = 4
        self.hyperparams = []
        self.type = 'data preprocess'
        self.description = (
            "Encode categorical integer features as a one-hot numeric array. "
            "The input to this transformer should be an array-like of "
            "integers or strings, denoting the values taken on by "
            "categorical (discrete) features. The features are encoded using "
            "a one-hot (aka ‘one-of-K’ or ‘dummy’) encoding scheme. This "
            "creates a binary column for each category and returns a sparse "
            "matrix or dense array. By default, the encoder derives the "
            "categories based on the unique values in each feature. "
            "Alternatively, you can also specify the categories manually. "
            "The OneHotEncoder previously assumed that the input features "
            "take on values in the range [0, max(values)). This behaviour is "
            "deprecated. This encoding is needed for feeding categorical "
            "data to many scikit-learn estimators, notably linear models and "
            "SVMs with the standard kernels.")
        self.hyperparams_run = {'default': True}
        self.preprocess = None
        self.cat_cols = None
        self.accept_type = 'b'

    def can_accept(self, data):
        return self.can_accept_b(data)

    def is_needed(self, data):
        # data = handle_data(data)
        cols = data['X'].columns
        num_cols = data['X']._get_numeric_data().columns
        cat_cols = list(set(cols) - set(num_cols))
        if len(cat_cols) == 0:
            return False
        return True

    def fit(self, data):
        data = handle_data(data)
        if not self.is_needed(data):
            return
        x = deepcopy(data['X'])
        cols = data['X'].columns
        num_cols = data['X']._get_numeric_data().columns
        self.cat_cols = list(set(cols) - set(num_cols))
        x[self.cat_cols] = x[self.cat_cols].fillna('NaN')
        self.preprocess = ColumnTransformer([
            ("one_hot", OneHotEncoder(handle_unknown='ignore'),
             self.cat_cols)
        ])
        x[self.cat_cols] = x[self.cat_cols].astype(str)
        self.preprocess.fit(x)

    def produce(self, data):
        output = handle_data(data)
        if not self.is_needed(output):
            final_output = {0: output}
            return final_output
        output['X'][self.cat_cols] = output['X'][self.cat_cols].fillna('NaN')
        result = self.preprocess.transform(output['X'])
        if isinstance(result, csr_matrix):
            result = result.toarray()
        output['X'] = pd.DataFrame(
            result,
            columns=self.preprocess.get_feature_names()).infer_objects()
        # .ix was removed from pandas; .loc handles the boolean mask
        output['X'] = output['X'].loc[:, ~output['X'].columns.duplicated()]
        final_output = {0: output}
        return final_output
def test_feature_names_empty_columns(empty_col):
    pd = pytest.importorskip('pandas')

    df = pd.DataFrame({"col1": ["a", "a", "b"], "col2": ["z", "z", "z"]})

    ct = ColumnTransformer(transformers=[
        ("ohe", OneHotEncoder(), ["col1", "col2"]),
        ("empty_features", OneHotEncoder(), empty_col),
    ])

    ct.fit(df)
    assert ct.get_feature_names() == ['ohe__x0_a', 'ohe__x0_b', 'ohe__x1_z']
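# empty_col is a pytest parameter not shown in this excerpt. A plausible
# parametrization (an assumption, not necessarily the suite's actual one)
# would exercise the different ways of selecting zero columns:
#
# @pytest.mark.parametrize("empty_col", [[], np.array([], dtype=int),
#                                        lambda x: []])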
def load_sick():
    raw_data = loadarff('datasets/sick.arff')
    df = pd.DataFrame(raw_data[0])
    y = df.pop('class')
    X = df

    X.drop('TBG', axis=1, inplace=True)  # all NaN, useless
    implicit_cols = [col for col in X.columns if col.endswith('_measured')]
    X.drop(implicit_cols, axis=1, inplace=True)

    # Replace NaN values
    X.fillna(X.mean(), inplace=True)
    X['sex'].replace(b'?', X['sex'].mode()[0], inplace=True)

    # Standardize numerical features
    num_cols = X.select_dtypes(include=['number']).columns
    scaler = MinMaxScaler()
    X[num_cols] = scaler.fit_transform(X[num_cols])

    # Encode categorical features
    categ_cols = X.select_dtypes(include=['category', object]).columns
    categ_cols = categ_cols.drop('referral_source')

    # we use a dict where each feature has an entry with its encoder
    # for future or inverse transformations
    label_encoders = defaultdict(LabelEncoder)
    X[categ_cols] = X[categ_cols].apply(
        lambda x: label_encoders[x.name].fit_transform(x))

    ohe_encoder = OneHotEncoder()
    # save for future or inverse transformations
    ohe_transformer = ColumnTransformer(
        [('referral_source', ohe_encoder, ['referral_source'])],
        remainder='passthrough')
    X_arr = ohe_transformer.fit_transform(X)
    X = pd.DataFrame(X_arr, columns=ohe_transformer.get_feature_names())

    # X_train, X_test, y_train, y_test = train_test_split(
    #     X, y, test_size=0.2, random_state=RANDOM_STATE)

    candidates = []
    for eps in range(1, 11):
        for ms in range(4, 21):
            model = DBSCAN(eps=eps / 10, min_samples=ms).fit(X)
            counts = np.unique(model.labels_, return_counts=True)[1]
            if len(counts) == 3:
                print(model, counts)
                candidates.append(model)
class FeatureColumnTransformer(DfTransformer):
    def __init__(self, transformers, remainder="passthrough", n_jobs=-1):
        self.name = "FeatureColumnTransformer"
        super().log_start(self.name)
        self.transformers = transformers
        self.remainder = remainder
        self.n_jobs = n_jobs
        self.column_transformer = ColumnTransformer(
            transformers=self.transformers,
            remainder=self.remainder,
            n_jobs=self.n_jobs)
        self.columns = None
        self.column_types = None

    def fit(self, X, y=None):
        self.column_transformer.fit(X)
        return self

    def transform(self, X, y=None):
        X_concat = self.column_transformer.transform(X)
        self.columns = self.column_transformer.get_feature_names()
        self.rename_df_columns()
        X_concat = pd.DataFrame(X_concat, index=X.index, columns=self.columns)
        X_concat_df = self.redefine_column_types(X, X_concat)
        super().log_end(self.name)
        return X_concat_df

    def rename_df_columns(self):
        # strip the 'transformer__' prefix that ColumnTransformer adds
        for i, col in enumerate(self.columns):
            self.columns[i] = col.split(sep="__")[-1]

    def redefine_column_types(self, X_input, X_output):
        for feature in X_input.columns:
            if feature in X_output.columns:
                X_output[feature] = X_output[feature].astype(
                    X_input[feature].dtypes.name)
        return X_output
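# A usage sketch for FeatureColumnTransformer (the transformers list and
# column names here are hypothetical): the wrapper returns a DataFrame whose
# prefixed names like 'scale__age' have been stripped back to 'age', with the
# original dtypes restored for any column that survived the transformation.
#
# fct = FeatureColumnTransformer([('scale', StandardScaler(), ['age'])])
# df_out = fct.fit(df).transform(df)  # columns: ['age', <passthrough cols>]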
def test_ColumnTransformer():
    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.preprocessing import OneHotEncoder
    from ML_in_business.hw6.TransformerLib import MyTempEncoder, tempEstimator
    from sklearn import set_config

    X = pd.DataFrame(
        {'city': ['London', 'London', 'Paris', 'Sallisaw'],
         'title': ["His Last Bow", "How Watson Learned the Trick",
                   "A Moveable Feast", "The Grapes of Wrath"],
         'expert_rating': [5, 3, 4, 5],
         'user_rating': [4, 5, 4, 3]})

    # column_trans = ColumnTransformer(
    #     [('city_category', OneHotEncoder(dtype='int'), ['city']),
    #      ('title_bow', CountVectorizer(), 'title')],
    #     remainder='drop')

    # column_trans = Pipeline([
    #     ('selector', MyTempEncoder())
    # ])

    column_trans = ColumnTransformer(
        [
            # ('city_category', OneHotEncoder(dtype='int'), ['city']),
            ('myEncoder', tempEstimator('AAAA'), ['title'])
        ],
        remainder='passthrough'  # or remainder='drop'
    )

    # HTML representation of the pipeline
    # set_config(display='diagram')
    set_config(display='text')
    column_trans

    column_trans.fit_transform(X)
    names = column_trans.get_feature_names()
    arr = column_trans.transform(X)
    assert True
def feature_generation(df):
    """
    Create useful new features, such as:
        - A boolean variable indicating whether the call came from 911/066.
        - Categorical variables transformed with OneHotEncoder.
    :param df: DataFrame from which the new variables are generated
    :return:
    """
    # Create the boolean variable
    print("Creating boolean variable.")
    df["bool_llamada"] = np.where((df.tipo_entrada == "LLAMADA DEL 911") |
                                  (df.tipo_entrada == "LLAMADA DEL 066"),
                                  1, 0)

    print("Transforming discrete variables...")
    # Apply OneHotEncoder to the categorical variables
    transformers = [('one_hot', OneHotEncoder(), [
        'delegacion_inicio', 'incidente_c4', 'tipo_entrada', 'espacio_del_dia'
    ])]
    col_trans = ColumnTransformer(transformers,
                                  remainder="passthrough",
                                  n_jobs=-1)

    # Sort the dataframe chronologically
    df = df.sort_values(
        by=["año_creacion", "mes_creacion", "dia_creacion", "hora_simple"])

    X = col_trans.fit_transform(df.drop(columns="label"))
    y = df.label.values.reshape(X.shape[0], )
    print("Successfully transformed the discrete variables.")
    print(X.shape)

    print("Converting to dataframe...")
    X = X.todense()
    df = pd.DataFrame(X, columns=col_trans.get_feature_names())
    df['label'] = y

    return df, X, y
class DataFrameOHETransformer(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names=None):
        self.fnames = feature_names
        self.col_transf = None
        self.fit_est = None
        self.features = None

    def fit(self, X, y=None):
        ohes = []
        for feature in self.fnames:
            ohes.append((feature, OneHotEncoder(dtype='int'), [feature]))
        self.col_transf = ColumnTransformer(ohes, remainder='drop')
        self.col_transf.fit(X, y)
        return self

    def fit_transform(self, X, y=None, **fit_params):
        return self.fit(X, y).transform(X, y)

    def transform(self, X, y=None):
        tf = pandas.DataFrame(self.col_transf.transform(X),
                              columns=self.col_transf.get_feature_names(),
                              index=X.index)
        return pandas.concat([tf, X.drop(self.fnames, axis=1)], axis=1)
def make_features(input_df, target_col, keep_cols=None, ma_lags=None,
                  ma_cols=None, n_samples=None) -> pd.DataFrame:
    transformers = list()
    if keep_cols:
        transformers.extend([('passthrough', PassthroughTransformer(),
                              keep_cols)])
    if ma_lags and ma_cols:
        transformers.extend([('ma' + str(n), MovingAverageTransformer(n),
                              ma_cols) for n in ma_lags])
    transformers.extend([('target', PercentChangeTransformer(),
                          [target_col])])
    ct = ColumnTransformer(transformers=transformers, remainder='drop',
                           n_jobs=-1)
    arr = ct.fit_transform(input_df)
    arr = strip_nan_rows(arr)
    if n_samples:
        arr = keep_last_n_rows(arr, n_samples)
    return pd.DataFrame(data=arr, columns=list(ct.get_feature_names()))
class FeatureTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        # Ugly, but otherwise col_transformer.get_feature_names() doesn't work
        StandardScaler.get_feature_names = get_empty_feature_names
        FunctionTransformer.get_feature_names = get_empty_feature_names
        OrdinalEncoder.get_feature_names = get_empty_feature_names
        SimpleImputer.get_feature_names = get_empty_feature_names
        RobustScaler.get_feature_names = get_empty_feature_names

        identity = FunctionTransformer(func=lambda x: x, validate=False)
        reciprocal = FunctionTransformer(func=lambda x: 1 / x, validate=False)

        self.col_transformer = ColumnTransformer(
            [
                # categorical
                ("Transmission Type", OneHotEncoder(), ["Transmission Type"]),
                ("Vehicle Size",
                 OrdinalEncoder([['Compact', 'Midsize', 'Large']]),
                 ["Vehicle Size"]),
                # numerical
                ("city mpg", reciprocal, ["city mpg"]),
                ("Year", identity, ["Year"]),
                ("Engine HP", identity, ["Engine HP"]),
            ],
            remainder='drop')

    def fit(self, X):
        self.col_transformer.fit(X)
        return self

    def transform(self, X):
        return self.col_transformer.transform(X)

    def get_feature_names(self):
        return self.col_transformer.get_feature_names()
list_cat = [
    2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 21, 22, 23, 24, 25, 27, 28,
    29, 30, 31, 32, 33, 35, 39, 40, 41, 42, 53, 55, 57, 58, 60, 63, 64, 65,
    72, 73, 74, 78, 79
]
list_num = [
    0, 1, 3, 4, 17, 18, 19, 20, 26, 34, 36, 37, 38, 43, 44, 45, 46, 47, 48,
    49, 50, 51, 52, 54, 56, 59, 61, 62, 66, 67, 68, 69, 70, 71, 75, 76, 77, 80
]

ct = ColumnTransformer([('oneHot',
                         OneHotEncoder(categories='auto', sparse=False),
                         list_cat)])
ct_result = pd.DataFrame(ct.fit_transform(data))
ct_result.columns = ct.get_feature_names()
ct_result.insert(0, "Id", ct_result.index + 1)

# merge the categorical dataframe with the numerical dataframe on Id
# and store the result in the processed full-data dataframe
numeric_df = data.iloc[:, list_num]
p_full_data = pd.merge(ct_result,
                       numeric_df,
                       left_on='Id',
                       right_on='Id',
                       how='inner')

# Split the data set into training and testing subsets
def load_credita(weighting=None, **extra_kwargs):
    cv_splits = []

    # preprocess the first fold, keeping statistics for the next folds
    train_path = os.path.join('datasetsCBR', 'credit-a',
                              'credit-a.fold.000000.train.arff')
    test_path = os.path.join('datasetsCBR', 'credit-a',
                             'credit-a.fold.000000.test.arff')
    df_train = pd.DataFrame(loadarff(train_path)[0])
    df_test = pd.DataFrame(loadarff(test_path)[0])
    X = df_train.append(df_test)
    y = X.pop('class')

    y_label_encoder = LabelEncoder()
    y = y_label_encoder.fit_transform(y)

    # fill missing numerical values
    means = X.mean()
    X.fillna(means, inplace=True)

    # fill missing categorical values
    categ_cols = X.select_dtypes(include=['category', object]).columns
    modes = X[categ_cols].mode()
    for col in categ_cols:
        X[col].replace(b'?', modes[col][0], inplace=True)

    # standardize numerical features
    num_cols = X.select_dtypes(include=['number']).columns
    mm_scaler = MinMaxScaler()
    X[num_cols] = mm_scaler.fit_transform(X[num_cols])

    # use one transformer per feature to preserve its name in the generated
    # features, since new feature names are based on the transformer's name
    transformers = [(col, OneHotEncoder(drop='first'), [col])
                    for col in categ_cols]
    col_transformer = ColumnTransformer(transformers, remainder='passthrough')
    X_arr = col_transformer.fit_transform(X)
    X = pd.DataFrame(X_arr, columns=col_transformer.get_feature_names())

    p = len(df_train)
    X_train, X_test, y_train, y_test = X[:p], X[p:], y[:p], y[p:]

    # feature weighting
    if weighting == 'mutual_info':
        weights = mutual_info(X, y)
        # apply weights to features
        X_train *= weights
        X_test *= weights
    elif weighting == 'relief':
        weights = relief(X, y)
        # apply weights to features
        X_train *= weights
        X_test *= weights

    cv_splits.append((X_train, X_test, y_train, y_test))

    # preprocess the rest of the folds
    for i in range(1, K_FOLDS):
        train_path = os.path.join('datasetsCBR', 'credit-a',
                                  f'credit-a.fold.00000{str(i)}.train.arff')
        test_path = os.path.join('datasetsCBR', 'credit-a',
                                 f'credit-a.fold.00000{str(i)}.test.arff')
        df_train = pd.DataFrame(loadarff(train_path)[0])
        df_test = pd.DataFrame(loadarff(test_path)[0])
        X = df_train.append(df_test)
        y = X.pop('class')
        y = y_label_encoder.transform(y)

        # fill missing numerical values
        X.fillna(means, inplace=True)

        # fill missing categorical values
        for col in categ_cols:
            X[col].replace(b'?', modes[col][0], inplace=True)

        # normalize numerical features
        X[num_cols] = mm_scaler.transform(X[num_cols])

        # one-hot encode
        X_arr = col_transformer.transform(X)
        X = pd.DataFrame(X_arr, columns=col_transformer.get_feature_names())

        p = len(df_train)
        X_train, X_test, y_train, y_test = X[:p], X[p:], y[:p], y[p:]

        # feature weighting
        if weighting == 'mutual_info':
            weights = mutual_info(X_train, y_train)
            # apply weights to features
            X_train *= weights
            X_test *= weights
        elif weighting == 'relief':
            weights = relief(X_train, y_train)
            # apply weights to features
            X_train *= weights
            X_test *= weights

        cv_splits.append((X_train, X_test, y_train, y_test))

    return cv_splits
# In[4]:

"""
Apply DWT smoothing.
"""

transformers = [
    ('haar_smooth', HaarSmoothTransformer(.05),
     list(feature_data_train.columns)),
    ('orig', PassthroughTransformer(), ['target__close']),
]
ct = ColumnTransformer(transformers=transformers, n_jobs=-1)

smooth_arr_train = ct.fit_transform(feature_data_train)
smooth_data_train = pd.DataFrame(smooth_arr_train,
                                 columns=ct.get_feature_names())

smooth_arr_test = ct.fit_transform(feature_data_test)
smooth_data_test = pd.DataFrame(smooth_arr_test,
                                columns=ct.get_feature_names())

smooth_data_train.plot()
plt.show()


# In[5]:

"""
Make time-series data.
"""

X_train, y_train = data_to_supervised(input_df=smooth_data_train,
                                      target_ix=-1,
                                      Tx=Tx,
def data_manipulation(data_set):
    """
    Prepare the dataset for training the regression model.
    This function takes the csv file location as a parameter.

    Parameters
    ----------
    data_set: str
        data file location

    Returns
    -------
    data_set: pd.DataFrame
        dataframe with all categorical data converted to numerical data
    """
    print("Loading the Dataset...")
    df = pd.read_csv(data_set)

    print("Data Manipulation...")
    # remove all the null values from the dataset
    df.dropna(inplace=True)
    # create the variable that contains the combined source-destination
    source_destination = df.source + '-' + df.destination
    # create the column and assign the variable created above to it
    df['source_destination'] = source_destination
    # since we created a separate column for source-destination, drop the originals
    df.drop(['source', 'destination'], axis=1, inplace=True)
    # create a dictionary to convert cab_type data to numbers
    cab_type = {'Lyft': 0, 'Uber': 1}
    # map the above dictionary onto the cab_type column
    df.cab_type = df['cab_type'].map(cab_type)
    # drop the id column since it has a lot of unique values
    df.drop('id', axis=1, inplace=True)

    print("One Hot Encoding...")
    # create a list of categorical labels
    categorical_label = ['product_id', 'source_destination', 'name']
    # create a OneHotEncoder object
    one_hot_encoding = OneHotEncoder()
    # build the ColumnTransformer using the OneHotEncoder as a transformer
    transformer = ColumnTransformer(
        [('one_hot', one_hot_encoding, categorical_label)],
        remainder='passthrough')
    # fit the transformer with the dataframe
    transform_df = transformer.fit_transform(df).toarray()
    # get the feature names
    columns = transformer.get_feature_names()
    new_column = []
    # strip the 12-character 'one_hot__x*_' prefix from the encoded columns
    # (valid for single-digit feature indices) and keep the last 5
    # passthrough column names as they are
    for i in range(len(columns) - 5):
        new_column.append(columns[i][12:])
    for i in range(len(columns) - 5, len(columns)):
        new_column.append(columns[i])

    print("Transforming data...")
    # create a dataset using the transformed dataframe
    data_set = pd.DataFrame(transform_df)
    # replace the columns with the new_column list we created
    data_set.columns = new_column
    # return the manipulated dataset
    return data_set
def examples():
    from sklearn.pipeline import Pipeline
    from sklearn.svm import SVC
    from sklearn.decomposition import PCA
    estimators = [('reduce_dim', PCA()), ('clf', SVC())]
    pipe = Pipeline(estimators)
    print(pipe)
    print(pipe.steps[0])
    print(pipe.named_steps['reduce_dim'])
    pipe.set_params(clf__C=10)
    print(pipe.named_steps['clf'])

    ###################################################
    # Grid search over the parameters inside the pipeline (important)
    from sklearn.model_selection import GridSearchCV
    param_grid = dict(reduce_dim__n_components=[2, 5, 10],
                      clf__C=[0.1, 10, 100])
    grid_search = GridSearchCV(pipe, param_grid=param_grid)
    print(grid_search)

    ###################################################
    # Grid search over the pipeline's steps themselves (important)
    from sklearn.linear_model import LogisticRegression
    param_grid = dict(reduce_dim=[None, PCA(5), PCA(10)],
                      clf=[SVC(), LogisticRegression()],
                      clf__C=[0.1, 10, 100])  # candidates can form a list
    grid_search = GridSearchCV(pipe, param_grid=param_grid)
    print(grid_search)

    ###################################################
    from sklearn.pipeline import make_pipeline
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.preprocessing import Binarizer
    pipe = make_pipeline(Binarizer(), MultinomialNB())
    print(pipe)

    ###################################################
    # Use memory caching to avoid recomputing transformers
    from tempfile import mkdtemp
    from shutil import rmtree
    from sklearn.decomposition import PCA
    from sklearn.svm import SVC
    from sklearn.pipeline import Pipeline
    estimators = [('reduce_dim', PCA()), ('clf', SVC())]
    cachedir = mkdtemp()
    pipe = Pipeline(estimators, memory=cachedir)
    print(pipe)
    # Clear the cache directory when you don't need it anymore
    rmtree(cachedir)

    #####################################################
    # Transforming the target in regression
    import numpy as np
    from sklearn.datasets import load_boston
    from sklearn.compose import TransformedTargetRegressor
    from sklearn.preprocessing import QuantileTransformer
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    boston = load_boston()
    X = boston.data
    y = boston.target
    transformer = QuantileTransformer(output_distribution='normal')
    regressor = LinearRegression()
    regr = TransformedTargetRegressor(regressor=regressor,
                                      transformer=transformer)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    regr.fit(X_train, y_train)
    print('R2 score: {0:.2f}'.format(regr.score(X_test, y_test)))
    raw_target_regr = LinearRegression().fit(X_train, y_train)
    print('R2 score: {0:.2f}'.format(raw_target_regr.score(X_test, y_test)))

    ##########################################################
    # Per-column preprocessing
    import pandas as pd
    X = pd.DataFrame({
        'city': ['London', 'London', 'Paris', 'Sallisaw'],
        'title': [
            "His Last Bow", "How Watson Learned the Trick",
            "A Moveable Feast", "The Grapes of Wrath"
        ],
        'expert_rating': [5, 3, 4, 5],
        'user_rating': [4, 5, 4, 3]
    })
    from sklearn.compose import ColumnTransformer
    from sklearn.feature_extraction.text import CountVectorizer
    column_trans = ColumnTransformer(
        [('city_category', CountVectorizer(analyzer=lambda x: [x]), 'city'),
         ('title_bow', CountVectorizer(), 'title')],
        remainder='drop')
    print(column_trans.fit(X))
    print(column_trans.get_feature_names())
    print(column_trans.transform(X).toarray())
train_df = pd.DataFrame(train,
                        columns=numerical_features +
                        list(preprocessor.named_transformers_.cat))
train_df.head()

ct = ColumnTransformer([
    ('oh_enc', OneHotEncoder(sparse=False), [8, 9, 10, 11]),
])
d_1he = ct.fit_transform(Xtrain_new)
d_encoded_data = pd.DataFrame(d_1he, columns=ct.get_feature_names())
d_encoded_data.drop([
    'oh_enc__x0_2016', 'oh_enc__x1_1', 'oh_enc__x2_0', 'oh_enc__x3_0',
    'oh_enc__x4_0', 'oh_enc__x5_fall'
], inplace=True, axis=1)

df_concat = pd.concat([
    Xtrain_new.reset_index(drop=True),
    d_encoded_data.reset_index(drop=True)
], axis=1)
df_concat.drop(
    ['season', 'year', 'month', 'hours', 'is_business_day', 'is_holiday'],
    inplace=True, axis=1)
X_trained = df_concat[:dataInt.shape[0]]

# Numerical features
ct_num = ColumnTransformer([
    ('stdScal', StandardScaler(), [
        'temp_1', 'temp_2', 'mean_national_temp', 'humidity_1', 'humidity_2',
        'consumption_secondary_1', 'consumption_secondary_2',
        'consumption_secondary_3'
    ])
], remainder='passthrough')
X_tr = ct_num.fit_transform(numerical_features)
print(X_df.shape, y.shape)

column_trans = ColumnTransformer(
    [('system_category', OneHotEncoder(dtype='int'), ['systems']),
     ('genre_category', OneHotEncoder(dtype='int'), ['genres']),
     ('playModes_category', OneHotEncoder(dtype='int'), ['playModes']),
     ('themes_category', OneHotEncoder(dtype='int'), ['themes']),
     ('series_category', OneHotEncoder(dtype='int'), ['series']),
     ('playerPerspectives', OneHotEncoder(dtype='int'),
      ['playerPerspectives']),
     ('TfIdf', TfidfVectorizer(stop_words='english'), 'gameDescription')],
    remainder='drop')

column_trans.fit(X_df)
column_trans.get_feature_names()
X = column_trans.transform(X_df).toarray()
print(X)

# Split data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.30,
                                                    random_state=42,
                                                    stratify=y)

#####################
### RANDOM FOREST ###
#####################
# Model (can also use a single decision tree)
X = pd.DataFrame({
    'city': ['London', 'London', 'Paris', 'Sallisaw'],
    'title': [
        "His Last Bow", "How Watson Learned the Trick",
        "A Moveable Feast", "The Grapes of Wrath"
    ],
    'expert_rating': [5, 3, 4, 5],
    'user_rating': [4, 5, 4, 3]
})

column_trans = ColumnTransformer(
    [
        # OneHotEncoder requires 2D data as input, thus we pass the column
        # name as a list of strings, as is the case with most transformers.
        ('city category', OneHotEncoder(dtype='int'), ['city']),
        # CountVectorizer takes a 1D array as input, thus the column is
        # passed as a plain string.
        ('title bow', CountVectorizer(), 'title')
    ],
    # The 'remainder' parameter determines whether to ignore (drop) the
    # remaining columns. They can be kept with remainder='passthrough', or
    # remainder can be set to an estimator that transforms them, e.g.
    # remainder=MinMaxScaler().
    remainder='drop')

column_trans.fit(X)
print(column_trans.get_feature_names())

# The make_column_transformer function is a useful alternative, as it
# assigns names automatically
col_tran = make_column_transformer((OneHotEncoder(), ['city']),
                                   (CountVectorizer(), 'title'),
                                   remainder=MinMaxScaler())
print(col_tran)
def test_transformer_get_feature_names(self):
    transformers = [('transformer name', BaseTransformer(), self.test_cols)]
    ct = ColumnTransformer(transformers=transformers)
    ct.fit(self.data)
    self.assertListEqual(
        ['transformer name__' + col for col in self.test_cols],
        ct.get_feature_names())
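# BaseTransformer is defined elsewhere in this test suite. For the assertion
# above to hold, its get_feature_names() must return the columns it was
# fitted on; a minimal sketch under that assumption (not the suite's actual
# class):
from sklearn.base import BaseEstimator, TransformerMixin

class BaseTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.columns_ = list(X.columns)
        return self

    def transform(self, X):
        return X[self.columns_]

    def get_feature_names(self):
        return self.columns_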
def get_data(self, dataset='Adult', random_number=42):
    if isinstance(dataset, str):
        dataset_key = self.map_name2id[dataset]
    else:
        dataset_key = str(dataset)

    number_instances = []
    number_attributes = []
    number_features = []

    def get_class_attribute_name(df):
        for i in range(len(df.columns)):
            if str(df.columns[i]).startswith('class@'):
                return str(df.columns[i])

    def get_sensitive_attribute_id(df, sensitive_attribute_name):
        for i in range(len(df.columns)):
            if str(df.columns[i]) == sensitive_attribute_name:
                return i

    key = dataset_key
    if type(dataset_key) == type(None):
        key = list(self.map_dataset.keys())[random.randint(
            0, len(self.map_dataset) - 1)]

    data_path = './google_drive_data'
    if not os.path.isdir(data_path):
        print("Downloading Datasets ...")
        download_file_from_google_drive("19Qj3T9Yt_hQ4bM0Ac9D2MS7x507sTJRU",
                                        'DFS_datasets.zip')
        with zipfile.ZipFile('DFS_datasets.zip') as zf:
            zf.extractall('google_drive_data')
        os.remove('DFS_datasets.zip')

        print("Downloading Query Optimizer Models ...")
        download_file_from_google_drive("1lxbcs9vS6U8t-5II2qpx0OIv08EON7NL",
                                        'DFS_models.zip')
        with zipfile.ZipFile('DFS_models.zip') as zf:
            zf.extractall('google_drive_models')
        os.remove('DFS_models.zip')

    value = self.map_dataset[key]

    with open(data_path + "/dfs_datasets/" + str(key) + ".arff") as f:
        df = a2p.load(f)

    number_instances.append(df.shape[0])
    number_attributes.append(df.shape[1])

    y = copy.deepcopy(df[get_class_attribute_name(df)])
    X = df.drop(columns=[get_class_attribute_name(df)])

    categorical_features = []
    continuous_columns = []
    for type_i in range(len(X.columns)):
        if X.dtypes[type_i] == object:
            categorical_features.append(type_i)
        else:
            continuous_columns.append(type_i)

    sensitive_attribute_id = get_sensitive_attribute_id(X, value)
    # print(sensitive_attribute_id)

    X_datat = X.values
    for x_i in range(X_datat.shape[0]):
        for y_i in range(X_datat.shape[1]):
            if type(X_datat[x_i][y_i]) == type(None):
                if X.dtypes[y_i] == object:
                    X_datat[x_i][y_i] = 'missing'
                else:
                    X_datat[x_i][y_i] = np.nan

    X_temp, X_test, y_temp, y_test = train_test_split(
        X_datat, y.values.astype('str'), test_size=0.2,
        random_state=random_number, stratify=y.values.astype('str'))
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_temp, y_temp, test_size=0.25, random_state=random_number,
        stratify=y_temp)

    cat_sensitive_attribute_id = -1
    for c_i in range(len(categorical_features)):
        if categorical_features[c_i] == sensitive_attribute_id:
            cat_sensitive_attribute_id = c_i
            break

    my_transformers = []
    if len(categorical_features) > 0:
        ct = ColumnTransformer([
            ("onehot", OneHotEncoder(handle_unknown='ignore', sparse=False),
             categorical_features)
        ])
        my_transformers.append(("o", ct))
    if len(continuous_columns) > 0:
        scale = ColumnTransformer([
            ("scale",
             Pipeline([('impute',
                        SimpleImputer(missing_values=np.nan,
                                      strategy='mean')),
                       ('scale', MinMaxScaler())]), continuous_columns)
        ])
        my_transformers.append(("s", scale))

    pipeline = FeatureUnion(my_transformers)
    pipeline.fit(X_train)
    X_train = pipeline.transform(X_train)
    X_validation = pipeline.transform(X_validation)
    X_test = pipeline.transform(X_test)

    number_features.append(X_train.shape[1])

    all_columns = []
    for ci in range(len(X.columns)):
        all_columns.append(str(X.columns[ci]).split('@')[0])
    X.columns = all_columns

    names = ct.get_feature_names()
    for c in continuous_columns:
        names.append(str(X.columns[c]))

    # map generated names like 'onehot__x3_some_category' back to
    # '<original column name>_some_category'
    for n_i in range(len(names)):
        if names[n_i].startswith('onehot__x'):
            tokens = names[n_i].split('_')
            category = ''
            for ti in range(3, len(tokens)):
                category += '_' + tokens[ti]
            cat_id = int(names[n_i].split('_')[2].split('x')[1])
            names[n_i] = str(
                X.columns[categorical_features[cat_id]]) + category

    sensitive_ids = []
    all_names = ct.get_feature_names()
    for fname_i in range(len(all_names)):
        if all_names[fname_i].startswith('onehot__x' +
                                         str(cat_sensitive_attribute_id) +
                                         '_'):
            sensitive_ids.append(fname_i)

    le = preprocessing.LabelEncoder()
    le.fit(y_train)
    y_train = le.fit_transform(y_train)
    y_validation = le.transform(y_validation)
    y_test = le.transform(y_test)

    return (X_train, X_validation, X_test, y_train, y_validation, y_test,
            names, sensitive_ids)
xshape = X_train.shape[1]
if one_hot:
    ct = ColumnTransformer([
        ("onehot", OneHotEncoder(handle_unknown='ignore', sparse=False),
         [1, 3, 5, 6, 7, 8, 9, 13])
    ])
    scale = ColumnTransformer([("scale", MinMaxScaler(),
                                continuous_columns)])
    pipeline = FeatureUnion([("o", ct), ("s", scale)])
    X_train = pipeline.fit_transform(X_train)
    xshape = X_train.shape[1]
    print(xshape)
    X_test = pipeline.transform(X_test)
    print(ct.get_feature_names())

    names = ct.get_feature_names()
    for c in continuous_columns:
        names.append(str(X.columns[c]))
    pickle.dump(names,
                open("/home/felix/phd/ranking_exeriments/names.p", "wb"))
    print(np.array(names))

# ranking by accuracy
ranking_model = ExtraTreesClassifier(n_estimators=n_estimators,
                                     random_state=0)
ranking_model.fit(X_train, y_train)
accuracy_ranking = ranking_model.feature_importances_
pickle.dump(
    accuracy_ranking,
def get_fair_data1(dataset_key=None):
    map_dataset = {}
    map_dataset['31'] = 'foreign_worker@{yes,no}'
    map_dataset['802'] = 'sex@{female,male}'
    map_dataset['1590'] = 'sex@{Female,Male}'
    map_dataset['1461'] = 'AGE@{True,False}'
    map_dataset['42193'] = 'race_Caucasian@{0,1}'
    map_dataset['1480'] = 'V2@{Female,Male}'
    # map_dataset['804'] = 'Gender@{0,1}'
    map_dataset['42178'] = 'gender@STRING'
    map_dataset['981'] = 'Gender@{Female,Male}'
    map_dataset['40536'] = 'samerace@{0,1}'
    map_dataset['40945'] = 'sex@{female,male}'
    map_dataset['451'] = 'Sex@{female,male}'
    # map_dataset['945'] = 'sex@{female,male}'
    map_dataset['446'] = 'sex@{Female,Male}'
    map_dataset['1017'] = 'sex@{0,1}'
    map_dataset['957'] = 'Sex@{0,1,4}'
    map_dataset['41430'] = 'SEX@{True,False}'
    map_dataset['1240'] = 'sex@{Female,Male}'
    map_dataset['1018'] = 'sex@{Female,Male}'
    # map_dataset['55'] = 'SEX@{male,female}'
    map_dataset['38'] = 'sex@{F,M}'
    map_dataset['1003'] = 'sex@{male,female}'
    map_dataset['934'] = 'race@{black,white}'

    number_instances = []
    number_attributes = []
    number_features = []

    def get_class_attribute_name(df):
        for i in range(len(df.columns)):
            if str(df.columns[i]).startswith('class@'):
                return str(df.columns[i])

    def get_sensitive_attribute_id(df, sensitive_attribute_name):
        for i in range(len(df.columns)):
            if str(df.columns[i]) == sensitive_attribute_name:
                return i

    key = dataset_key
    if type(dataset_key) == type(None):
        key = list(map_dataset.keys())[random.randint(
            0, len(map_dataset) - 1)]
    value = map_dataset[key]

    with open(Config.get('data_path') + "/downloaded_arff/" + str(key) +
              ".arff") as f:
        df = a2p.load(f)

    print("dataset: " + str(key))

    number_instances.append(df.shape[0])
    number_attributes.append(df.shape[1])

    y = copy.deepcopy(df[get_class_attribute_name(df)])
    X = df.drop(columns=[get_class_attribute_name(df)])

    categorical_features = []
    continuous_columns = []
    for type_i in range(len(X.columns)):
        if X.dtypes[type_i] == object:
            categorical_features.append(type_i)
        else:
            continuous_columns.append(type_i)

    sensitive_attribute_id = get_sensitive_attribute_id(X, value)
    print(sensitive_attribute_id)

    X_datat = X.values
    for x_i in range(X_datat.shape[0]):
        for y_i in range(X_datat.shape[1]):
            if type(X_datat[x_i][y_i]) == type(None):
                if X.dtypes[y_i] == object:
                    X_datat[x_i][y_i] = 'missing'
                else:
                    X_datat[x_i][y_i] = np.nan

    X_train, X_test, y_train, y_test = train_test_split(
        X_datat, y.values.astype('str'), test_size=0.5, random_state=42,
        stratify=y.values.astype('str'))
    '''
    X_train, X_test, y_train, y_test = train_test_split(
        X_datat[0:200, :], y.values[0:200].astype('str'), test_size=0.5,
        random_state=42, stratify=y.values[0:200].astype('str'))
    '''

    cat_sensitive_attribute_id = -1
    for c_i in range(len(categorical_features)):
        if categorical_features[c_i] == sensitive_attribute_id:
            cat_sensitive_attribute_id = c_i
            break

    my_transformers = []
    if len(categorical_features) > 0:
        ct = ColumnTransformer([
            ("onehot", OneHotEncoder(handle_unknown='ignore', sparse=False),
             categorical_features)
        ])
        my_transformers.append(("o", ct))
    if len(continuous_columns) > 0:
        scale = ColumnTransformer([
            ("scale",
             Pipeline([('impute',
                        SimpleImputer(missing_values=np.nan,
                                      strategy='mean')),
                       ('scale', MinMaxScaler())]), continuous_columns)
        ])
        my_transformers.append(("s", scale))

    pipeline = FeatureUnion(my_transformers)
    pipeline.fit(X_train)
    X_train = pipeline.transform(X_train)
    X_test = pipeline.transform(X_test)

    number_features.append(X_train.shape[1])

    all_columns = []
    for ci in range(len(X.columns)):
        all_columns.append(str(X.columns[ci]).split('@')[0])
    X.columns = all_columns

    names = ct.get_feature_names()
    for c in continuous_columns:
        names.append(str(X.columns[c]))

    for n_i in range(len(names)):
        if names[n_i].startswith('onehot__x'):
            tokens = names[n_i].split('_')
            category = ''
            for ti in range(3, len(tokens)):
                category += '_' + tokens[ti]
            cat_id = int(names[n_i].split('_')[2].split('x')[1])
            names[n_i] = str(
                X.columns[categorical_features[cat_id]]) + category
    print(names)

    sensitive_ids = []
    all_names = ct.get_feature_names()
    for fname_i in range(len(all_names)):
        if all_names[fname_i].startswith('onehot__x' +
                                         str(cat_sensitive_attribute_id) +
                                         '_'):
            sensitive_ids.append(fname_i)

    le = preprocessing.LabelEncoder()
    le.fit(y_train)
    y_train = le.fit_transform(y_train)
    y_test = le.transform(y_test)

    return (X_train, X_test, y_train, y_test, names, sensitive_ids, key,
            sensitive_attribute_id)
    remainder='passthrough')
train_new = ct_num.fit_transform(numeric_features)

# TEST
test_new = ct_num.fit(numeric_features)

# Handle the categorical variables
ct = ColumnTransformer([
    ('oh_enc', OneHotEncoder(sparse=False), [8, 9, 10, 11, 12, 13]),
])
d_1he = ct.fit_transform(train_new)

# Get feature names of the encoded columns
# ct.get_feature_names()

# Convert the numpy array into a pandas dataframe
d_encoded_data = pd.DataFrame(d_1he, columns=ct.get_feature_names())
d_encoded_data.drop([
    'oh_enc__x0_2016', 'oh_enc__x1_1', 'oh_enc__x2_0', 'oh_enc__x3_0',
    'oh_enc__x4_0', 'oh_enc__x5_fall'
], inplace=True, axis=1)

# Concatenate the encoded dataframe with the original dataframe
df_concat = pd.concat(
    [train_new.reset_index(drop=True),
     d_encoded_data.reset_index(drop=True)],
    axis=1)

# Drop the season, year, month, hours, is_business_day and is_holiday
# columns as they are now encoded
df_concat.drop(
    ['season', 'year', 'month', 'hours', 'is_business_day', 'is_holiday'],
    inplace=True,
print("..Training Result:") print(f"....acc: {accuracy_score(y_train, pred_train)}") print(f"....precision: {precision_score(y_train, pred_train)}") print(f"....recall: {recall_score(y_train, pred_train)}") print(f"....f1: {f1_score(y_train, pred_train)}") print("..Testing Result:") print(f"....acc: {accuracy_score(y_test, pred_test)}") print(f"....precision: {precision_score(y_test, pred_test)}") print(f"....recall: {recall_score(y_test, pred_test)}") print(f"....f1: {f1_score(y_test, pred_test)}") # %% plot the decision tree and look for important features from sklearn.tree import plot_tree plot_tree(clf, filled=True, max_depth=6, feature_names=ct.get_feature_names()) # %% apply logistic regerssion classifier and check results from sklearn.linear_model import LogisticRegression clf = LogisticRegression() clf.fit(x_train, y_train) pred_train = clf.predict(x_train) pred_test = clf.predict(x_test) print(clf.__class__.__name__) print("..Training Result:") print(f"....acc: {accuracy_score(y_train, pred_train)}") print(f"....precision: {precision_score(y_train, pred_train)}") print(f"....recall: {recall_score(y_train, pred_train)}")
pc = preprocessing_config

# In[4]:

transforms = [
    ('passthrough', PassthroughTransformer(), pc['passthrough']),
    ('ma03', MovingAverageTransformer(3), pc['moving_average']),
    ('ma06', MovingAverageTransformer(6), pc['moving_average']),
    ('ma12', MovingAverageTransformer(12), pc['moving_average']),
    ('ma24', MovingAverageTransformer(24), pc['moving_average']),
    ('ma48', MovingAverageTransformer(48), pc['moving_average']),
    ('make_target', PercentChangeTransformer(), [pc['target']]),
]
ct = ColumnTransformer(transforms, remainder='drop', n_jobs=-1)
ct = ct.fit(data)
features = ct.get_feature_names()
features

# In[5]:

arr = ct.transform(data)
arr = arr[~np.isnan(arr).any(axis=1)]
arr.view()

# In[6]:

plt.figure()
plt.plot(arr[:, features.index('passthrough__close')])
plt.title('close')

plt.figure()
plt.plot(arr[:, features.index('make_target__close')])
X_train = pipeline.transform(X_train)
X_validation = pipeline.transform(X_validation)
X_test = pipeline.transform(X_test)

number_features = X_train.shape[1]
print(name_dataset + ": instances = " + str(number_instances) +
      " attributes = " + str(number_attributes) +
      " features = " + str(number_features))

all_columns = []
for ci in range(len(X.columns)):
    all_columns.append(str(X.columns[ci]).split('@')[0])
X.columns = all_columns

names = ct.get_feature_names()
for c in continuous_columns:
    names.append(str(X.columns[c]))

for n_i in range(len(names)):
    if names[n_i].startswith('onehot__x'):
        tokens = names[n_i].split('_')
        category = ''
        for ti in range(3, len(tokens)):
            category += '_' + tokens[ti]
        cat_id = int(names[n_i].split('_')[2].split('x')[1])
        names[n_i] = str(X.columns[categorical_features[cat_id]]) + category
print(names)