def test_column_transformer_sparse_array():
    X_sparse = sparse.eye(3, 2).tocsr()

    # no distinction between 1D and 2D
    X_res_first = X_sparse[:, 0]
    X_res_both = X_sparse

    for col in [0, [0], slice(0, 1)]:
        for remainder, res in [('drop', X_res_first),
                               ('passthrough', X_res_both)]:
            ct = ColumnTransformer([('trans', Trans(), col)],
                                   remainder=remainder,
                                   sparse_threshold=0.8)
            assert sparse.issparse(ct.fit_transform(X_sparse))
            assert_allclose_dense_sparse(ct.fit_transform(X_sparse), res)
            assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                         res)

    for col in [[0, 1], slice(0, 2)]:
        ct = ColumnTransformer([('trans', Trans(), col)],
                               sparse_threshold=0.8)
        assert sparse.issparse(ct.fit_transform(X_sparse))
        assert_allclose_dense_sparse(ct.fit_transform(X_sparse), X_res_both)
        assert_allclose_dense_sparse(ct.fit(X_sparse).transform(X_sparse),
                                     X_res_both)

def test_column_transformer_special_strings():
    # one 'drop' -> ignore
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer(
        [('trans1', Trans(), [0]), ('trans2', 'drop', [1])])
    exp = np.array([[0.], [1.], [2.]])
    assert_array_equal(ct.fit_transform(X_array), exp)
    assert_array_equal(ct.fit(X_array).transform(X_array), exp)

    # all 'drop' -> return shape 0 array
    ct = ColumnTransformer(
        [('trans1', 'drop', [0]), ('trans2', 'drop', [1])])
    assert_array_equal(ct.fit(X_array).transform(X_array).shape, (3, 0))
    assert_array_equal(ct.fit_transform(X_array).shape, (3, 0))

    # 'passthrough'
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T
    ct = ColumnTransformer(
        [('trans1', Trans(), [0]), ('trans2', 'passthrough', [1])])
    exp = X_array
    assert_array_equal(ct.fit_transform(X_array), exp)
    assert_array_equal(ct.fit(X_array).transform(X_array), exp)

    # None itself / other string is not valid
    for val in [None, 'other']:
        ct = ColumnTransformer(
            [('trans1', Trans(), [0]), ('trans2', val, [1])])
        assert_raise_message(TypeError, "All estimators should implement",
                             ct.fit_transform, X_array)
        assert_raise_message(TypeError, "All estimators should implement",
                             ct.fit, X_array)

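# A minimal usage sketch of the special string specifiers exercised above:
# besides an estimator, each (name, transformer, columns) triple also accepts
# the strings 'drop' and 'passthrough'. Data and names here are illustrative.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

X = np.array([[0., 1., 2.], [2., 4., 6.]]).T
ct = ColumnTransformer([
    ('scaled', StandardScaler(), [0]),  # transform column 0
    ('unused', 'drop', [1]),            # silently remove column 1
])
print(ct.fit_transform(X).shape)  # (3, 1): only the scaled column remains
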
def test_column_transformer_callable_specifier():
    # assert that function gets the full array / dataframe
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_first = np.array([[0, 1, 2]]).T

    def func(X):
        assert_array_equal(X, X_array)
        return [0]

    ct = ColumnTransformer([('trans', Trans(), func)], remainder='drop')
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)

    pd = pytest.importorskip('pandas')
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])

    def func(X):
        assert_array_equal(X.columns, X_df.columns)
        assert_array_equal(X.values, X_df.values)
        return ['first']

    ct = ColumnTransformer([('trans', Trans(), func)], remainder='drop')
    assert_array_equal(ct.fit_transform(X_df), X_res_first)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_first)

def test_column_transformer_remainder():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_second = np.array([2, 4, 6]).reshape(-1, 1)
    X_res_both = X_array

    # default drop
    ct = ColumnTransformer([('trans1', Trans(), [0])])
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # specify passthrough
    ct = ColumnTransformer([('trans', Trans(), [0])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # column order is not preserved (passed through added to end)
    ct = ColumnTransformer([('trans1', Trans(), [1])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1])
    assert_array_equal(ct.fit(X_array).transform(X_array),
                       X_res_both[:, ::-1])
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [0])

    # passthrough when all actual transformers are skipped
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_second)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

    # error on invalid arg
    ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop', 'passthrough', "
        "or estimator.", ct.fit, X_array)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop', 'passthrough', "
        "or estimator.", ct.fit_transform, X_array)

    # check default for make_column_transformer
    ct = make_column_transformer(([0], Trans()))
    assert ct.remainder == 'drop'

def test_column_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T

    X_res_first1D = np.array([0, 1, 2])
    X_res_second1D = np.array([2, 4, 6])
    X_res_first = X_res_first1D.reshape(-1, 1)
    X_res_both = X_array

    cases = [
        # single column 1D / 2D
        (0, X_res_first),
        ([0], X_res_first),
        # list-like
        ([0, 1], X_res_both),
        (np.array([0, 1]), X_res_both),
        # slice
        (slice(0, 1), X_res_first),
        (slice(0, 2), X_res_both),
        # boolean mask
        (np.array([True, False]), X_res_first),
    ]

    for selection, res in cases:
        ct = ColumnTransformer([('trans', Trans(), selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_array), res)
        assert_array_equal(ct.fit(X_array).transform(X_array), res)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer([('trans', Trans(), lambda x: selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_array), res)
        assert_array_equal(ct.fit(X_array).transform(X_array), res)

    ct = ColumnTransformer([('trans1', Trans(), [0]),
                            ('trans2', Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2

    # test with transformer_weights
    transformer_weights = {'trans1': .1, 'trans2': 10}
    both = ColumnTransformer([('trans1', Trans(), [0]),
                              ('trans2', Trans(), [1])],
                             transformer_weights=transformer_weights)
    res = np.vstack([transformer_weights['trans1'] * X_res_first1D,
                     transformer_weights['trans2'] * X_res_second1D]).T
    assert_array_equal(both.fit_transform(X_array), res)
    assert_array_equal(both.fit(X_array).transform(X_array), res)
    assert len(both.transformers_) == 2

    both = ColumnTransformer([('trans', Trans(), [0, 1])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_array), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_array).transform(X_array),
                       0.1 * X_res_both)
    assert len(both.transformers_) == 1

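# Quick hedged reference sketch for the column specifiers covered by the
# cases above (toy data, illustrative names): a list of ints, a slice, a
# boolean mask and a callable all select the same single column here.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler

X = np.array([[0., 1., 2.], [2., 4., 6.]]).T
for spec in [[0], slice(0, 1), np.array([True, False]), lambda X: [0]]:
    ct = ColumnTransformer([('sel', MinMaxScaler(), spec)], remainder='drop')
    assert ct.fit_transform(X).shape == (3, 1)
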
def test_column_transformer_negative_column_indexes():
    X = np.random.randn(2, 2)
    X_categories = np.array([[1], [2]])
    X = np.concatenate([X, X_categories], axis=1)

    ohe = OneHotEncoder(categories='auto')

    tf_1 = ColumnTransformer([('ohe', ohe, [-1])], remainder='passthrough')
    tf_2 = ColumnTransformer([('ohe', ohe, [2])], remainder='passthrough')
    assert_array_equal(tf_1.fit_transform(X), tf_2.fit_transform(X))

def test_column_transformer_cloning():
    X_array = np.array([[0., 1., 2.], [2., 4., 6.]]).T

    ct = ColumnTransformer([('trans', StandardScaler(), [0])])
    ct.fit(X_array)
    assert_false(hasattr(ct.transformers[0][1], 'mean_'))
    assert_true(hasattr(ct.transformers_[0][1], 'mean_'))

    ct = ColumnTransformer([('trans', StandardScaler(), [0])])
    ct.fit_transform(X_array)
    assert_false(hasattr(ct.transformers[0][1], 'mean_'))
    assert_true(hasattr(ct.transformers_[0][1], 'mean_'))

def test_column_transformer_remainder_numpy(key):
    # test different ways that columns are specified with passthrough
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)

def test_make_column_transformer_pandas():
    pd = pytest.importorskip('pandas')
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    norm = Normalizer()
    ct1 = ColumnTransformer([('norm', Normalizer(), X_df.columns)])
    ct2 = make_column_transformer((norm, X_df.columns))
    assert_almost_equal(ct1.fit_transform(X_df), ct2.fit_transform(X_df))

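# make_column_transformer is a convenience wrapper that auto-generates the
# name of each step from the transformer's class. A hedged sketch, assuming
# the (transformer, columns) argument order used in the test above:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import Normalizer, StandardScaler

ct = make_column_transformer(
    (Normalizer(), [0]),
    (StandardScaler(), [1]),
)
print([name for name, _, _ in ct.transformers])
# ['normalizer', 'standardscaler'] -- lowercased class names
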
def test_column_transformer_remainder_pandas(key):
    # test different ways that columns are specified with passthrough
    pd = pytest.importorskip('pandas')
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)

def test_column_transformer_no_remaining_remainder_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    ct = ColumnTransformer([('trans1', Trans(), [0, 1, 2])],
                           remainder=DoubleTrans())
    assert_array_equal(ct.fit_transform(X_array), X_array)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_array)
    assert len(ct.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'

def test_column_transformer_remainder():
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_second = np.array([2, 4, 6]).reshape(-1, 1)
    X_res_both = X_array

    # default passthrough
    ct = ColumnTransformer([('trans', Trans(), [0])])
    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)

    # specify to drop remaining columns
    ct = ColumnTransformer([('trans1', Trans(), [0])], remainder='drop')
    assert_array_equal(ct.fit_transform(X_array), X_res_first)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_first)

    # column order is not preserved (passed through added to end)
    ct = ColumnTransformer([('trans1', Trans(), [1])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_both[:, ::-1])
    assert_array_equal(ct.fit(X_array).transform(X_array),
                       X_res_both[:, ::-1])

    # passthrough when all actual transformers are skipped
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_array), X_res_second)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_second)

    # error on invalid arg
    ct = ColumnTransformer([('trans1', Trans(), [0])], remainder=1)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop' or 'passthrough'",
        ct.fit, X_array)
    assert_raise_message(
        ValueError,
        "remainder keyword needs to be one of 'drop' or 'passthrough'",
        ct.fit_transform, X_array)

def test_column_transformer_empty_columns(pandas, column):
    # test case that ensures that the column transformer does also work when
    # a given transformer doesn't have any columns to work on
    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_res_both = X_array

    if pandas:
        pd = pytest.importorskip('pandas')
        X = pd.DataFrame(X_array, columns=['first', 'second'])
    else:
        X = X_array

    ct = ColumnTransformer([('trans1', Trans(), [0, 1]),
                            ('trans2', Trans(), column)])
    assert_array_equal(ct.fit_transform(X), X_res_both)
    assert_array_equal(ct.fit(X).transform(X), X_res_both)
    assert len(ct.transformers_) == 2
    assert isinstance(ct.transformers_[1][1], Trans)

    ct = ColumnTransformer([('trans1', Trans(), column),
                            ('trans2', Trans(), [0, 1])])
    assert_array_equal(ct.fit_transform(X), X_res_both)
    assert_array_equal(ct.fit(X).transform(X), X_res_both)
    assert len(ct.transformers_) == 2
    assert isinstance(ct.transformers_[0][1], Trans)

    ct = ColumnTransformer([('trans', Trans(), column)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X), X_res_both)
    assert_array_equal(ct.fit(X).transform(X), X_res_both)
    assert len(ct.transformers_) == 2  # including remainder
    assert isinstance(ct.transformers_[0][1], Trans)

    fixture = np.array([[], [], []])
    ct = ColumnTransformer([('trans', Trans(), column)], remainder='drop')
    assert_array_equal(ct.fit_transform(X), fixture)
    assert_array_equal(ct.fit(X).transform(X), fixture)
    assert len(ct.transformers_) == 2  # including remainder
    assert isinstance(ct.transformers_[0][1], Trans)

def test_column_transformer_sparse_threshold():
    X_array = np.array([['a', 'b'], ['A', 'B']], dtype=object).T
    # above data has sparsity of 4 / 8 = 0.5

    # apply threshold even if all sparse
    col_trans = ColumnTransformer([('trans1', OneHotEncoder(), [0]),
                                   ('trans2', OneHotEncoder(), [1])],
                                  sparse_threshold=0.2)
    res = col_trans.fit_transform(X_array)
    assert not sparse.issparse(res)
    assert not col_trans.sparse_output_

    # mixed -> sparsity of (4 + 2) / 8 = 0.75
    for thres in [0.75001, 1]:
        col_trans = ColumnTransformer(
            [('trans1', OneHotEncoder(sparse=True), [0]),
             ('trans2', OneHotEncoder(sparse=False), [1])],
            sparse_threshold=thres)
        res = col_trans.fit_transform(X_array)
        assert sparse.issparse(res)
        assert col_trans.sparse_output_

    for thres in [0.75, 0]:
        col_trans = ColumnTransformer(
            [('trans1', OneHotEncoder(sparse=True), [0]),
             ('trans2', OneHotEncoder(sparse=False), [1])],
            sparse_threshold=thres)
        res = col_trans.fit_transform(X_array)
        assert not sparse.issparse(res)
        assert not col_trans.sparse_output_

    # if nothing is sparse -> no sparse
    for thres in [0.33, 0, 1]:
        col_trans = ColumnTransformer(
            [('trans1', OneHotEncoder(sparse=False), [0]),
             ('trans2', OneHotEncoder(sparse=False), [1])],
            sparse_threshold=thres)
        res = col_trans.fit_transform(X_array)
        assert not sparse.issparse(res)
        assert not col_trans.sparse_output_

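# Rule of thumb exercised above: the stacked output stays sparse only when
# some transformer produced a sparse matrix AND the overall density is below
# sparse_threshold. A minimal hedged sketch (OneHotEncoder emits sparse
# output by default in the scikit-learn versions these tests target):
import numpy as np
from scipy import sparse
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X = np.array([['a', 'b'], ['A', 'B']], dtype=object).T  # one-hot density 0.5
dense_ct = ColumnTransformer([('ohe', OneHotEncoder(), [0, 1])],
                             sparse_threshold=0.2)   # 0.5 >= 0.2 -> densify
sparse_ct = ColumnTransformer([('ohe', OneHotEncoder(), [0, 1])],
                              sparse_threshold=0.8)  # 0.5 < 0.8 -> keep sparse
assert not sparse.issparse(dense_ct.fit_transform(X))
assert sparse.issparse(sparse_ct.fit_transform(X))
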
def test_column_transformer_no_estimators():
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).astype('float').T
    ct = ColumnTransformer([], remainder=StandardScaler())

    params = ct.get_params()
    assert params['remainder__with_mean']

    X_trans = ct.fit_transform(X_array)
    assert X_trans.shape == X_array.shape
    assert len(ct.transformers_) == 1
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][2] == [0, 1, 2]

def test_column_transformer_drop_all_sparse_remainder_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder=SparseMatrixTrans())

    X_trans = ct.fit_transform(X_array)
    assert sparse.issparse(X_trans)

    # SparseMatrixTrans creates 3 features for each column, thus:
    assert X_trans.shape == (3, 3)
    assert_array_equal(X_trans.toarray(), np.eye(3))
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], SparseMatrixTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])

def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])
    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    assert_array_equal(ct.fit_transform(X_list), expected_result)
    assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)

def test_column_transformer_drops_all_remainder_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    # columns are doubled when remainder = DoubleTrans
    X_res_both = 2 * X_array.copy()[:, 1:3]

    ct = ColumnTransformer([('trans1', 'drop', [0])],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])

def test_column_transformer_remainder_pandas(key):
    # test different ways that columns are specified with passthrough
    pd = pytest.importorskip('pandas')
    if isinstance(key, six.string_types) and key == 'pd-index':
        key = pd.Index(['first'])

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])
    X_res_both = X_array

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder='passthrough')
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'passthrough'
    assert_array_equal(ct.transformers_[-1][2], [1])

def test_column_transformer_remainder_transformer(key):
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T
    X_res_both = X_array.copy()

    # second and third columns are doubled when remainder = DoubleTrans
    X_res_both[:, 1:3] *= 2

    ct = ColumnTransformer([('trans1', Trans(), key)],
                           remainder=DoubleTrans())

    assert_array_equal(ct.fit_transform(X_array), X_res_both)
    assert_array_equal(ct.fit(X_array).transform(X_array), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], DoubleTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])

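# remainder accepts a full estimator, not only 'drop'/'passthrough'; the
# fitted clone is appended to transformers_ under the name 'remainder'.
# A minimal hedged sketch using a stock scikit-learn transformer:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

X = np.array([[0., 1., 2.], [2., 4., 6.], [8., 6., 4.]]).T
ct = ColumnTransformer([('std', StandardScaler(), [0])],
                       remainder=MinMaxScaler())
ct.fit_transform(X)
name, trans, cols = ct.transformers_[-1]
print(name, type(trans).__name__, cols)  # remainder MinMaxScaler [1, 2]
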
def test_column_transformer_list():
    X_list = [
        [1, float('nan'), 'a'],
        [0, 0, 'b']
    ]
    expected_result = np.array([
        [1, float('nan'), 1, 0],
        [-1, 0, 0, 1],
    ])
    ct = ColumnTransformer([
        ('numerical', StandardScaler(), [0, 1]),
        ('categorical', OneHotEncoder(), [2]),
    ])

    with pytest.warns(DataConversionWarning):
        # TODO: this warning is not very useful in this case, would be good
        # to get rid of it
        assert_array_equal(ct.fit_transform(X_list), expected_result)
        assert_array_equal(ct.fit(X_list).transform(X_list), expected_result)

def test_column_transformer_sparse_remainder_transformer():
    X_array = np.array([[0, 1, 2], [2, 4, 6], [8, 6, 4]]).T

    ct = ColumnTransformer([('trans1', Trans(), [0])],
                           remainder=SparseMatrixTrans())

    X_trans = ct.fit_transform(X_array)
    assert sparse.issparse(X_trans)
    # SparseMatrixTrans creates 3 features for each column. There is
    # one column in ``transformers``, thus:
    assert X_trans.shape == (3, 3 + 1)

    exp_array = np.hstack(
        (X_array[:, 0].reshape(-1, 1), np.eye(3)))
    assert_array_equal(X_trans.toarray(), exp_array)

    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert isinstance(ct.transformers_[-1][1], SparseMatrixTrans)
    assert_array_equal(ct.transformers_[-1][2], [1, 2])

# %%
band_gap_num = band_gap.drop('Compound', axis=1)

pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std', StandardScaler()),
])
band_gap_tr = pipe.fit_transform(band_gap_num)

# %%
num_attribs = list(band_gap_num)
cat_attribs = ['Compound']

full_pipe = ColumnTransformer([('num', pipe, num_attribs),
                               ('cat', OrdinalEncoder(), cat_attribs)])
band_gap_prepared = full_pipe.fit_transform(band_gap)

# %%
# OrdinalEncoder().categories
# band_gap_prepared_df = pd.DataFrame(band_gap_prepared)
# band_gap_prepared_df.head(10)

# %%
lin_reg = LinearRegression()
lin_reg.fit(band_gap_prepared, band_gap_label)

# %%
band_gap_prediction = lin_reg.predict(band_gap_prepared)
zip_sample = zip(band_gap_prediction, band_gap_label)
for i, j in zip_sample:
    print(i, j)

bg_mse = mean_squared_error(band_gap_prediction, band_gap_label)
bg_rmse = np.sqrt(bg_mse)

""" import pandas as pd base = pd.read_csv('census.csv') previsores = base.iloc[:, 0:14].values classe = base.iloc[:, 14].values from sklearn.preprocessing import OneHotEncoder, LabelEncoder from sklearn.compose import ColumnTransformer onehotencorder = ColumnTransformer(transformers=[("OneHot", OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13])], remainder='passthrough') previsores = onehotencorder.fit_transform(previsores).toarray() labelencorder_classe = LabelEncoder() classe = labelencorder_classe.fit_transform(classe) from sklearn.preprocessing import StandardScaler scaler = StandardScaler() previsores = scaler.fit_transform(previsores) from sklearn.model_selection import train_test_split previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split( previsores, classe, test_size=0.15, random_state=0) #importa biblioteca from sklearn.neighbors import KNeighborsClassifier classificador = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
# Importing the libraries
import numpy as np
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

# Encoding Categorical Data
# Encoding the Independent Variable
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [3])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X))

# avoiding dummy variable trap
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0)

# Training the Multiple Linear Regression model on the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Predicting the Test set results
y_pred = regressor.predict(X_test)

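# The manual slice X = X[:, 1:] above drops the first dummy column to avoid
# the dummy variable trap. A hedged alternative sketch: scikit-learn versions
# from 0.21 on can drop the redundant level inside the encoder itself.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

X_toy = np.array([['NY', 1.0], ['CA', 2.0], ['FL', 3.0]], dtype=object)
ct_drop = ColumnTransformer(
    [('encoder', OneHotEncoder(drop='first'), [0])],
    remainder='passthrough')
# 3 categories -> 2 dummy columns, plus 1 passthrough column
print(ct_drop.fit_transform(X_toy).shape)  # (3, 3)
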
# convert texts to numbers
housing_cat = housing[['ocean_proximity']]
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
ordinal_encoder.categories_

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    # ('attribs_adder', Combined)
    ('std_scaler', StandardScaler())
])

num_attribs = list(housing_num)
cat_attribs = ['ocean_proximity']

full_pipeline = ColumnTransformer([
    ('num', num_pipeline, num_attribs),
    ('cat', OneHotEncoder(), cat_attribs)
])

housing_prepared = full_pipeline.fit_transform(housing)

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

# Encoding categorical data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

#labelEncoder_x = LabelEncoder()
#x[:, 0] = labelEncoder_x.fit_transform(x[:, 0])

# Create dummy variables for the countries
columnTransformer = ColumnTransformer([('lel', OneHotEncoder(), [0])],
                                      remainder='passthrough')
x = columnTransformer.fit_transform(x).astype(float)

# Encoding dependent variable
labelEncoder_y = LabelEncoder()
y = labelEncoder_y.fit_transform(y)

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()

class train_lnphi:
    def __init__(self):
        # Force CPU
        # os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
        return

    # Read using pandas
    def load_lnphi_data(self, lnphi_path, datafile_name):
        # previously data = "data_const_T_20200716-230921.csv"
        csv_path = os.path.join(lnphi_path, datafile_name)
        self.lnphi = pd.read_csv(
            csv_path,
            delimiter=',',
            names=['a_mix', 'b_mix', 'b_i', 'sum', 'lnphi'])
        print('Loading done. Shape: {}'.format(str(self.lnphi.shape)))

    # Drop out-of-range lnphi instances
    def lnphi_range(self, min, max):
        self.lnphi.drop(
            self.lnphi.loc[(self.lnphi.loc[:, 'lnphi'] < min) |
                           (self.lnphi.loc[:, 'lnphi'] > max)].index,
            inplace=True)
        print('Drop lnphi out of range done. Shape: {}'.format(
            str(self.lnphi.shape)))

    def split_data(self):
        self.X = self.lnphi.loc[:, 'a_mix':'sum']
        self.y = self.lnphi.loc[:, 'lnphi']
        # Split data -> (train_full, test)
        self.X_train_full, self.X_test, self.y_train_full, self.y_test = \
            train_test_split(self.X, self.y, test_size=0.2, random_state=42)
        # Split train_full -> (train, valid)
        self.X_train, self.X_valid, self.y_train, self.y_valid = \
            train_test_split(self.X_train_full, self.y_train_full,
                             test_size=0.2, random_state=42)
        print('Splitting done.')

    def feature_eng(self):
        # Label transform pipeline
        self.label_scaler = MinMaxScaler()
        self.label_num_pipeline = Pipeline(
            [('label minmax scaler', self.label_scaler)])
        self.y_train_prepared = self.label_num_pipeline.fit_transform(
            self.y_train.values.reshape(-1, 1))
        self.y_valid_prepared = self.label_num_pipeline.transform(
            self.y_valid.values.reshape(-1, 1))
        self.y_test_prepared = self.label_num_pipeline.transform(
            self.y_test.values.reshape(-1, 1))

        # Attribute transform pipeline
        self.attr_scaler = MinMaxScaler()
        num_pipeline = Pipeline([
            # ('std scaler', self.attr_std_scaler)
            ('min_max_scaler', self.attr_scaler)
        ])
        num_attribs = list(self.X_train)
        self.full_pipeline = ColumnTransformer(
            [('num', num_pipeline, num_attribs)])
        self.X_train_prepared = self.full_pipeline.fit_transform(self.X_train)
        self.X_valid_prepared = self.full_pipeline.transform(self.X_valid)
        self.X_test = self.full_pipeline.transform(self.X_test)
        print('Feature Eng done.')

    def model_construct(self, n_layers, n_nodes):
        n_inputs = self.X_train_prepared.shape[1]
        self.model = tf.keras.Sequential()
        self.model.add(
            tf.keras.layers.Dense(
                n_nodes,
                activation=tf.keras.layers.LeakyReLU(alpha=0.1),
                input_shape=[n_inputs]))
        for _ in range(n_layers - 1):
            self.model.add(
                tf.keras.layers.Dense(
                    n_nodes,
                    activation=tf.keras.layers.LeakyReLU(alpha=0.1)))
        self.model.add(tf.keras.layers.Dense(1))
        # Remove lr if scheduler in use?
        self.model.compile(loss='mse',
                           optimizer=keras.optimizers.Adam(),
                           metrics=[
                               'mse', 'mae',
                               tf.keras.metrics.MeanAbsolutePercentageError()
                           ])

    def train_model(self, batch_size, n_layers, n_nodes, epochs,
                    initial_epoch, log_save_dir, name_prefix):
        # Logs callback
        model_name = (name_prefix + '_' + str(batch_size) + '_' +
                      str(n_layers) + '_' + str(n_nodes) + '_' +
                      str(epochs) + '_')
        try:
            logdir = self.logdir
        except AttributeError:
            print('New logdir created.')
            self.logdir = log_save_dir + ".\\logs\\scalars\\" + model_name + \
                str(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
            logdir = self.logdir
        if not os.path.exists(logdir):
            os.makedirs(logdir)
        tensorboard_callback = keras.callbacks.TensorBoard(
            log_dir=logdir,
            histogram_freq=0,  # How often to log histogram visualizations
            write_graph=True,
            update_freq='epoch',
            # Keep profile_batch at 0; otherwise a TensorBoard bug hides the
            # training loss.
            profile_batch=0,
            embeddings_freq=0,  # How often to log embedding visualizations
        )

        # Learning rate schedule as callback
        def scheduler(epoch):
            if epoch < 10:
                return 0.001
            else:
                return 0.001 * tf.math.exp(0.5 * (10 - epoch))
            '''if 0.001 * tf.math.exp(0.1 * (10 - epoch)) < 1E-5:
                return 1E-5
            else:
                return 0.001 * tf.math.exp(0.1 * (10 - epoch))'''

        #lr_scheduler_callback = tf.keras.callbacks.LearningRateScheduler(scheduler, verbose=1)
        #reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=5, min_lr=0.0001)

        # Early stop
        # TODO: maybe make the early stopping criterion proportional
        early_stop = tf.keras.callbacks.EarlyStopping(monitor='mse',
                                                      min_delta=0.001,
                                                      patience=3)

        # Callback save
        model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
            filepath=logdir,  # +'.\\{epoch:.02d}-{mse:.2f}',
            verbose=1,
            save_weights_only=False,
            monitor='mse',  # Not sure
            mode='auto',
            save_best_only=True)

        # Store version info as file in directory
        def get_git_revision_hash():
            return subprocess.check_output(['git', 'rev-parse', 'HEAD'])

        with open(logdir + '.\\version_info.txt', 'a', newline='') as file:
            file.write('model_name' + ' ' + str(get_git_revision_hash()) +
                       '\n')

        # Store attributes from data transformation
        # Delete the previous file if it exists
        try:
            os.remove(logdir + '.\\full_pipeline_' + model_name + '.pkl')
        except OSError:
            pass
        with open(logdir + '.\\full_pipeline_' + model_name + '.pkl',
                  'wb') as f:
            pickle.dump(self.full_pipeline, f)
            pickle.dump(self.label_num_pipeline, f)

        # "history" holds a record of the loss and metric values during
        # training
        history = self.model.fit(
            self.X_train_prepared,
            self.y_train_prepared,
            initial_epoch=initial_epoch,
            epochs=epochs,
            callbacks=[tensorboard_callback, model_checkpoint_callback],
            validation_data=(self.X_valid_prepared, self.y_valid_prepared),
            shuffle=True,
            batch_size=batch_size,
            verbose=2)

        # Save the entire model together with its training config
        self.model.save(logdir + '.\\' + model_name + '{}'.format(
            str(datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))))

        endTime = datetime.datetime.now()
        print('Ended at ' + str(endTime))
        print('end')

def test_column_transformer_dataframe():
    pd = pytest.importorskip('pandas')

    X_array = np.array([[0, 1, 2], [2, 4, 6]]).T
    X_df = pd.DataFrame(X_array, columns=['first', 'second'])

    X_res_first = np.array([0, 1, 2]).reshape(-1, 1)
    X_res_both = X_array

    cases = [
        # String keys: label based
        # scalar
        ('first', X_res_first),
        # list
        (['first'], X_res_first),
        (['first', 'second'], X_res_both),
        # slice
        (slice('first', 'second'), X_res_both),
        # int keys: positional
        # scalar
        (0, X_res_first),
        # list
        ([0], X_res_first),
        ([0, 1], X_res_both),
        (np.array([0, 1]), X_res_both),
        # slice
        (slice(0, 1), X_res_first),
        (slice(0, 2), X_res_both),
        # boolean mask
        (np.array([True, False]), X_res_first),
        (pd.Series([True, False], index=['first', 'second']), X_res_first),
    ]

    for selection, res in cases:
        ct = ColumnTransformer([('trans', Trans(), selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

        # callable that returns any of the allowed specifiers
        ct = ColumnTransformer([('trans', Trans(), lambda X: selection)],
                               remainder='drop')
        assert_array_equal(ct.fit_transform(X_df), res)
        assert_array_equal(ct.fit(X_df).transform(X_df), res)

    ct = ColumnTransformer([('trans1', Trans(), ['first']),
                            ('trans2', Trans(), ['second'])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    ct = ColumnTransformer([('trans1', Trans(), [0]),
                            ('trans2', Trans(), [1])])
    assert_array_equal(ct.fit_transform(X_df), X_res_both)
    assert_array_equal(ct.fit(X_df).transform(X_df), X_res_both)
    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    # test with transformer_weights
    transformer_weights = {'trans1': .1, 'trans2': 10}
    both = ColumnTransformer([('trans1', Trans(), ['first']),
                              ('trans2', Trans(), ['second'])],
                             transformer_weights=transformer_weights)
    res = np.vstack([transformer_weights['trans1'] * X_df['first'],
                     transformer_weights['trans2'] * X_df['second']]).T
    assert_array_equal(both.fit_transform(X_df), res)
    assert_array_equal(both.fit(X_df).transform(X_df), res)
    assert len(both.transformers_) == 2
    assert ct.transformers_[-1][0] != 'remainder'

    # test multiple columns
    both = ColumnTransformer([('trans', Trans(), ['first', 'second'])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'

    both = ColumnTransformer([('trans', Trans(), [0, 1])],
                             transformer_weights={'trans': .1})
    assert_array_equal(both.fit_transform(X_df), 0.1 * X_res_both)
    assert_array_equal(both.fit(X_df).transform(X_df), 0.1 * X_res_both)
    assert len(both.transformers_) == 1
    assert ct.transformers_[-1][0] != 'remainder'

    # ensure pandas object is passed through
    class TransAssert(BaseEstimator):
        def fit(self, X, y=None):
            return self

        def transform(self, X, y=None):
            assert_true(isinstance(X, (pd.DataFrame, pd.Series)))
            if isinstance(X, pd.Series):
                X = X.to_frame()
            return X

    ct = ColumnTransformer([('trans', TransAssert(), 'first')],
                           remainder='drop')
    ct.fit_transform(X_df)
    ct = ColumnTransformer([('trans', TransAssert(), ['first', 'second'])])
    ct.fit_transform(X_df)

    # integer column spec + integer column names -> still use positional
    X_df2 = X_df.copy()
    X_df2.columns = [1, 0]
    ct = ColumnTransformer([('trans', Trans(), 0)], remainder='drop')
    assert_array_equal(ct.fit_transform(X_df2), X_res_first)
    assert_array_equal(ct.fit(X_df2).transform(X_df2), X_res_first)

    assert len(ct.transformers_) == 2
    assert ct.transformers_[-1][0] == 'remainder'
    assert ct.transformers_[-1][1] == 'drop'
    assert_array_equal(ct.transformers_[-1][2], [1])

# compute the missing values for all rows of the 1st and 2nd column
imputer.fit(x[:, 1:3])
# replace all rows of the 1st and 2nd column with the imputed version
x[:, 1:3] = imputer.transform(x[:, 1:3])
# print(x)
# print("".join(['-' for i in range(40)]))

# Transform and Encode Categorical Data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# use OneHotEncoder to transform the 0th column, keep the others unchanged
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])],
                       remainder='passthrough')
x = np.array(ct.fit_transform(x))  # transform x and convert it to an np array
# print(x)
# print("".join(['-' for i in range(40)]))

# Transform and Encode The Dependent Variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)
# print(y)
# print("".join(['-' for i in range(40)]))

# Split dataset into Training and Testing sets
from sklearn.model_selection import train_test_split
# split 80/20 and seed the random state with 1
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1)
print(X_train)

# Encoding categorical data
# Categorical variable for country
labelencoder_X_1 = LabelEncoder()
X[:, 1] = labelencoder_X_1.fit_transform(X[:, 1])

# Categorical variable for gender
labelencoder_X_2 = LabelEncoder()
X[:, 2] = labelencoder_X_2.fit_transform(X[:, 2])

# Since country is not an ordinal variable we need to create three dummy
# variables
ct = ColumnTransformer([("Geography", OneHotEncoder(), [1])],
                       remainder='passthrough')
X = ct.fit_transform(X)

# We need to remove one dummy variable to avoid the dummy variable trap
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

""" Applying XGBoost """

###############################################################################
###############################################################################
# We will perform a 10-fold cross-validation and train the neural network with
# the two different strategies previously presented.

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=10)

cv_results_imbalanced = []
cv_time_imbalanced = []
cv_results_balanced = []
cv_time_balanced = []
for train_idx, valid_idx in skf.split(X_train, y_train):
    X_local_train = preprocessor.fit_transform(X_train.iloc[train_idx])
    y_local_train = y_train.iloc[train_idx].values.ravel()
    X_local_test = preprocessor.transform(X_train.iloc[valid_idx])
    y_local_test = y_train.iloc[valid_idx].values.ravel()

    elapsed_time, roc_auc = fit_predict_imbalanced_model(
        X_local_train, y_local_train, X_local_test, y_local_test)
    cv_time_imbalanced.append(elapsed_time)
    cv_results_imbalanced.append(roc_auc)

    elapsed_time, roc_auc = fit_predict_balanced_model(
        X_local_train, y_local_train, X_local_test, y_local_test)
    cv_time_balanced.append(elapsed_time)
    cv_results_balanced.append(roc_auc)

###############################################################################

## Encoding categorical data
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# LabelEncoder is not required anymore; ColumnTransformer can be used instead.
# It can still be handy for inspecting the categories:
#labelEncoder_Country = LabelEncoder()
#x[:, 0] = labelEncoder_Country.fit_transform(x[:, 0])

# There are multiple categories, so we need to use a ColumnTransformer
columnTransformer = ColumnTransformer(transformers=[
    ('one_hot_encoder', OneHotEncoder(categories='auto'), [0])
], remainder='passthrough')
x = columnTransformer.fit_transform(x)

labelEncoder_Purchased = LabelEncoder()
y = labelEncoder_Purchased.fit_transform(y)

## Split dataset to train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=0)

## Feature scaling
# There are two ways: Standardisation and Normalisation
from sklearn.preprocessing import StandardScaler

df
df = df.iloc[:, :].values
df

m_status = LabelEncoder()
df[:, 2] = m_status.fit_transform(df[:, 2])
df

df = pd.DataFrame(df)
df

ct = ColumnTransformer(transformers=[('encode', OneHotEncoder(), [2])],
                       remainder='passthrough')
df = ct.fit_transform(df)
df

df = pd.DataFrame(df)
df
# pd.get_dummies(df)

# Rename all columns as df1..df19
df.rename(columns={0: 'df1', 1: 'df2', 2: 'df3', 3: 'df4', 4: 'df5',
                   5: 'df6', 6: 'df7', 7: 'df8', 8: 'df9', 9: 'df10',
                   10: 'df11', 11: 'df12', 12: 'df13', 13: 'df14',
                   14: 'df15', 15: 'df16', 16: 'df17', 17: 'df18',
                   18: 'df19'})
df

""" Created on Sat Aug 22 20:52:32 2020 @author: renan """ import pandas as pd base = pd.read_csv('census.csv') previsores = base.iloc[:, 0:14].values classe = base.iloc[:, 14].values from sklearn.preprocessing import LabelEncoder, OneHotEncoder from sklearn.compose import ColumnTransformer column_tranformer = ColumnTransformer( [('one_hot_encoder', OneHotEncoder(), [1, 3, 5, 6, 7, 8, 9, 13])], remainder='passthrough') previsores = column_tranformer.fit_transform(previsores).toarray() labelencoder_classe = LabelEncoder() classe = labelencoder_classe.fit_transform(classe) from sklearn.preprocessing import StandardScaler scaler = StandardScaler() previsores = scaler.fit_transform(previsores) from sklearn.model_selection import train_test_split previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split( previsores, classe, test_size=0.15, random_state=0)
# A model that predicts how many goals the away club scores against each
# opposing club
dataset_away = pd.read_csv('result_data_A.csv', header=None,
                           encoding="shift-jis")
X_away = dataset_away.iloc[:, [0, 5, 6]].values
Y_away = dataset_away.iloc[:, 7:8].values
print("data import clear")

# Class for transforming column data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
cd = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), [1, 2])],
                       remainder="passthrough")
X_home = (cd.fit_transform(X_home)).toarray()
X_away = (cd.fit_transform(X_away)).toarray()
print("encoding clear")

# RandomForest is imported from the ensemble module
from sklearn.ensemble import RandomForestRegressor
# n_estimators specifies how many trees (models) the forest is split into
regressor_home = RandomForestRegressor(n_estimators=10, random_state=0)
regressor_away = RandomForestRegressor(n_estimators=10, random_state=0)
regressor_home.fit(X_home, Y_home)
regressor_away.fit(X_away, Y_away)
print("learning clear")

# The match-up we want to predict

data.drop('bool_of_active', axis=1, inplace=True)
data

# In[9]:

data.drop('step_count', axis=1, inplace=True)
data

# In[10]:

from sklearn.compose import ColumnTransformer

# The last arg ([0]) is the list of columns you want to transform in this step
ct = ColumnTransformer(
    [("mood", OneHotEncoder(), [0])],
    remainder="passthrough"
)
x = ct.fit_transform(data)
x

# In[11]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4,
                                                    random_state=0)

# In[12]:

from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
model = nb.fit(X_train, y_train)

# convert categorical columns into numerical columns:
features = pd.get_dummies(features)

# Split your data into training and test sets:
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.20, random_state=23)

# standardize/normalize numerical features:
numerical_features = features.select_dtypes(include=['float64', 'int64'])
numerical_columns = numerical_features.columns
ct = ColumnTransformer([("only numeric", StandardScaler(),
                         numerical_columns)],
                       remainder='passthrough')

# Fit the ColumnTransformer instance ct to the training data and transform it
# in the same step with ColumnTransformer.fit_transform(). Assign the result
# to a variable called features_train_scaled:
features_train_scaled = ct.fit_transform(features_train)

# Transform the test data features_test using the trained ColumnTransformer
# instance ct. Assign the result to a variable called features_test_scaled:
features_test_scaled = ct.transform(features_test)

# Create an instance of my_model:
my_model = Sequential()

# Create the input layer
input = InputLayer(input_shape=(features.shape[1], ))

# Add the input layer:
my_model.add(input)

# Add one hidden layer:
my_model.add(Dense(64, activation="relu"))

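# The fit_transform / transform split above is the standard guard against
# data leakage: scaling statistics are learned on the training set only and
# merely re-applied to the test set. A minimal hedged sketch of the idea:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

train = np.array([[0.0], [10.0]])
test = np.array([[5.0]])
ct_demo = ColumnTransformer([('num', StandardScaler(), [0])])
ct_demo.fit_transform(train)    # learns mean=5, std=5 from train only
print(ct_demo.transform(test))  # [[0.]] -- test reuses the train statistics
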
admissionData = admissionData.drop(["Serial No."], axis=1)

labels = admissionData.iloc[:, -1]

# remove uni rating and TOEFL score - unethical?
# remove serial no. and research - irrelevant info
features = admissionData.iloc[:, [0, 3, 4, 5, 6]]

# split dataset into train and test
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

# scale/normalise dataset features
ct = ColumnTransformer([("normalize", Normalizer(), [0, 1, 2, 3])],
                       remainder='passthrough')
features_train = ct.fit_transform(features_train)
features_test = ct.transform(features_test)

learning_rate = 0.001
num_epochs = 20

# create neural network
# admissionsModel = build_model(features_train, learning_rate)  # rewrite this function
# admissionsModel.fit(features_train, labels_train, epochs=20, batch_size=1, verbose=1)

history1 = fit_model(build_model(features_train, learning_rate),
                     features_train, labels_train, learning_rate, num_epochs)

# need to return the fitted model into a graph somehow here
plt.savefig('perf_graph.png')

labelEncoder_previsores = LabelEncoder()
previsores[:, 1] = labelEncoder_previsores.fit_transform(previsores[:, 1])
previsores[:, 3] = labelEncoder_previsores.fit_transform(previsores[:, 3])
previsores[:, 5] = labelEncoder_previsores.fit_transform(previsores[:, 5])
previsores[:, 6] = labelEncoder_previsores.fit_transform(previsores[:, 6])
previsores[:, 7] = labelEncoder_previsores.fit_transform(previsores[:, 7])
previsores[:, 8] = labelEncoder_previsores.fit_transform(previsores[:, 8])
previsores[:, 9] = labelEncoder_previsores.fit_transform(previsores[:, 9])
previsores[:, 13] = labelEncoder_previsores.fit_transform(previsores[:, 13])

# One Hot Encoder
oneHotEncoder = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'),
      [1, 3, 5, 6, 7, 8, 9, 13])],
    remainder='passthrough')
previsores = oneHotEncoder.fit_transform(previsores).toarray()

# Y
labelEncoder_classe = LabelEncoder()
classe = labelEncoder_classe.fit_transform(classe)

# Scaling the data
##### Partial scaling #####
# scalerCols = previsores[:, 102:]
# scaler = StandardScaler()
# previsores[:, 102:] = scaler.fit_transform(scalerCols)

##### Full scaling #####
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

# Splitting the data

# Handling missing data
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

# category encoder
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
labelencoder_x = LabelEncoder()
x[:, 0] = labelencoder_x.fit_transform(x[:, 0])
transform = ColumnTransformer([("Country", OneHotEncoder(), [0])],
                              remainder="passthrough")
x = transform.fit_transform(x)

labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# split the dataset into train and test datasets
# a fixed random_state makes the split reproducible (the same result other
# people get when they set it to the same value)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=0)

# Feature Scaling
# feature scaling is fitted on the training set (including the non-numeric
# features); the test set must not be re-fitted, only transformed with the
# already-fitted scaler
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)

)

column_trans = ColumnTransformer(
    [
        ("binned_numeric", KBinsDiscretizer(n_bins=10),
         ["VehAge", "DrivAge"]),
        ("onehot_categorical", OneHotEncoder(),
         ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
        ("passthrough_numeric", "passthrough", ["BonusMalus"]),
        ("log_scaled_numeric", log_scale_transformer, ["Density"]),
    ],
    remainder="drop",
)
X = column_trans.fit_transform(df)

# Insurance companies are interested in modeling the Pure Premium, that is
# the expected total claim amount per unit of exposure for each policyholder
# in their portfolio:
df["PurePremium"] = df["ClaimAmount"] / df["Exposure"]

# This can be indirectly approximated by a 2-step modeling: the product of the
# Frequency times the average claim amount per claim:
df["Frequency"] = df["ClaimNb"] / df["Exposure"]
df["AvgClaimAmount"] = df["ClaimAmount"] / np.fmax(df["ClaimNb"], 1)

with pd.option_context("display.max_columns", 15):
    print(df[df.ClaimAmount > 0].head())

# %%

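# Why the 2-step factorization works: for a policy with at least one claim,
# Frequency * AvgClaimAmount = (ClaimNb / Exposure) * (ClaimAmount / ClaimNb)
# = ClaimAmount / Exposure = PurePremium, and both sides are zero when
# ClaimNb == 0 (assuming claim-free policies have zero ClaimAmount).
# A hedged numeric check on toy values:
import numpy as np

claim_nb = np.array([0, 1, 3])
exposure = np.array([1.0, 0.5, 2.0])
claim_amount = np.array([0.0, 1000.0, 600.0])

pure_premium = claim_amount / exposure
frequency = claim_nb / exposure
avg_claim = claim_amount / np.fmax(claim_nb, 1)
assert np.allclose(pure_premium, frequency * avg_claim)
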
    if cat_attr[i] == True:
        print(feature_nam[i])
        val = feature_nam[i]
        features_1[val].fillna(features_1[val].value_counts().index[0],
                               inplace=True)

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

preprocess = ColumnTransformer([
    ("num", num_pipeline, num_attr),
    ("cat", OneHotEncoder(), cat_attr),
])

features_prepared = preprocess.fit_transform(features_1)
features_prepared_2 = preprocess.fit_transform(features_2)

# Set up train and test arrays
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features_prepared,
                                                    target_1, random_state=0)

#=====================================================================================================#
# Prediction models
print(); print('=============== Prediction Models ===============')

nam_model = []
type_model = []

#========== SVC ==========#
from sklearn.svm import SVC

def noprep(dataset, dirt, numeric_features, categorical_features,
           delim=',', indexdrop=False):
    index_features = ['_dmIndex_', '_PartInd_']
    data = pd.read_csv(dirt + dataset + '.csv',
                       delimiter=delim)  # pandas.DataFrame
    print(data.columns)
    data = data.astype({'_dmIndex_': 'int', '_PartInd_': 'int'})
    numeric_features = list(
        set(data.select_dtypes(include=["number"])) - set(index_features) -
        set(['income_flag']))
    categorical_features = list(
        set(data.select_dtypes(exclude=["number"])) - set(['income_flag']))
    index_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='constant',
                                         fill_value=-1))])
    #y_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
    #                                ('orden', OrdinalEncoder())])
    numeric_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='median'))])
    categorical_transformer = Pipeline(
        steps=[('imputer', SimpleImputer(strategy='constant',
                                         fill_value='missing')),
               ('onehot', OneHotEncoder(sparse=False))])
    preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer, numeric_features),
                      ('cat', categorical_transformer, categorical_features),
                      ('index', index_transformer, index_features)])
    data["income_flag"] = data["income_flag"].astype('category')
    data["income_flag"] = data["income_flag"].cat.codes
    # data["income_flag"] = data.where(data["income_flag"] == , 0)
    # data["income_flag"] = data.where(data["income_flag"] == 4, 1)
    data = preprocessor.fit_transform(data)
    data = pd.DataFrame(data)
    col = data.columns.values
    print(col)
    X = data.drop(col[-3:], axis=1)
    # pd.DataFrame(X).to_csv('X_vanilla.csv')
    X_train = data[data[col[-1]] > 0].drop(col[-3:], axis=1)
    X_test = data[data[col[-1]] == 0].drop(col[-3:], axis=1)
    print(data.shape)
    ####################################################################
    #y = data["y"]
    #lb = preprocessing.LabelBinarizer()
    #y = lb.fit_transform(y)
    y = data[col[-3]]
    y_train = data[data[col[-1]] > 0][col[-3]]
    y_test = data[data[col[-1]] == 0][col[-3]]
    ##################################################################
    feat_type = []  # dict()
    xcol = X.columns.values
    for cl in xcol:
        if cl in categorical_features:
            feat_type.append(1)
        else:
            feat_type.append(0)
    # X_train_auto, X_test_auto, y_train_auto, y_test_auto = \
    #     sklearn.model_selection.train_test_split(X, y, test_size=0.2,
    #                                              random_state=1)
    return data, X, y, X_train, y_train, X_test, y_test, feat_type

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
#imputer = Imputer(missing_values='NaN', strategy='mean', axis=0)
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# Encoding categorical values
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

LabelEncoder_X = LabelEncoder()
X[:, 0] = LabelEncoder_X.fit_transform(X[:, 0])

columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [0])],
                                      remainder='passthrough')
X = np.array(columnTransformer.fit_transform(X), dtype=np.str)

LabelEncoder_y = LabelEncoder()
y = LabelEncoder_y.fit_transform(y)

# columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(), [3])],
#                                       remainder='passthrough')
# y = np.array(columnTransformer.fit_transform(y), dtype=np.str)

# Splitting the dataset into test and training sets
# n.b. train size + test size = 1
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)

# convert gender and country to numeric data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

label_encoder_x_1 = LabelEncoder()
X[:, 2] = label_encoder_x_1.fit_transform(X[:, 2])

transformer = ColumnTransformer(
    transformers=[
        ("OneHot",         # Just a name
         OneHotEncoder(),  # The transformer class
         [1])              # The column(s) to be applied on.
    ],
    remainder='passthrough'  # do not apply anything to the remaining columns
)
X = transformer.fit_transform(X.tolist())
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
Sscale = StandardScaler()
X_train = Sscale.fit_transform(X_train)
X_test = Sscale.transform(X_test)

# importing keras
from keras.models import Sequential

    'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time'
]]

# One-hot encoding to convert categorical features into vectors
x = pd.get_dummies(x)

X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=1)

ct = ColumnTransformer([('numeric', StandardScaler(), [
    'age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets',
    'serum_creatinine', 'serum_sodium', 'time'
])], remainder='passthrough')  # why not 'standardize'?
X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)

# Label encoding for the categorical outcome
le = LabelEncoder()
Y_train = le.fit_transform(Y_train.astype(str))
Y_test = le.transform(Y_test.astype(str))

# Convert labels into categorical type
Y_train = to_categorical(Y_train)
Y_test = to_categorical(Y_test)

# Model
model = Sequential()
model.add(InputLayer(input_shape=(X_train.shape[1], )))
model.add(Dense(12, activation='relu'))
model.add(

# ### Encoding categorical variables

# In[ ]:

# Encoding independent variables
# Import Class --> Create Object --> Fit Object to Data --> Transform Data
from sklearn.compose import ColumnTransformer  # import class
from sklearn.preprocessing import OneHotEncoder  # import class

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'),
                                      [0])],
                       remainder='passthrough')  # create object
X = np.array(ct.fit_transform(X))  # fit object to data and transform data
print(X)

# In[ ]:

# Encoding the dependent variable because this is a classification problem.
# The dependent variable is categorical.
# Import Class --> Create Object --> Fit Object to Data --> Transform Data
from sklearn.preprocessing import LabelEncoder  # import class
le = LabelEncoder()  # create object
y = le.fit_transform(y)  # fit and transform
print(y)

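# The split above follows a common convention: OneHotEncoder for feature
# columns (no spurious ordering) and LabelEncoder for the 1-D target. A
# hedged sketch with toy data (note: sparse= was renamed sparse_output= in
# scikit-learn 1.2):
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

X_cat = np.array([['red'], ['green'], ['red']])
y_cat = np.array(['yes', 'no', 'yes'])

X_enc = OneHotEncoder(sparse=False).fit_transform(X_cat)  # one column/level
y_enc = LabelEncoder().fit_transform(y_cat)               # integer codes
print(X_enc.shape, y_enc)  # (3, 2) [1 0 1]
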
def __init__(self, explanation, model, dataset, true_y, classes,
             features, locale, categorical_features, true_y_dataset):
    """Initialize the Error Analysis Dashboard Input.

    :param explanation: An object that represents an explanation.
    :type explanation: ExplanationMixin
    :param model: An object that represents a model.
        It is assumed that for the classification case
        it has a method of predict_proba() returning
        the prediction probabilities for each class and
        for the regression case a method of predict()
        returning the prediction value.
    :type model: object
    :param dataset: A matrix of feature vector examples
        (# examples x # features), the same samples used to build the
        explanation. Will overwrite any set on explanation object
        already. Must have fewer than 100000 rows and fewer than 1000
        columns.
    :type dataset: numpy.array or list[][] or pandas.DataFrame
    :param true_y: The true labels for the provided explanation.
        Will overwrite any set on explanation object already.
    :type true_y: numpy.array or list[]
    :param classes: The class names.
    :type classes: numpy.array or list[]
    :param features: Feature names.
    :type features: numpy.array or list[]
    :param locale: The language in which to display the dashboard.
    :type locale: str
    :param categorical_features: The categorical feature names.
    :type categorical_features: list[str]
    :param true_y_dataset: The true labels for the provided dataset.
        Only needed if the explanation has a sample of instances from
        the original dataset. Otherwise specify true_y parameter only.
    :type true_y_dataset: numpy.array or list[]
    """
    self._model = model
    original_dataset = dataset
    if isinstance(dataset, pd.DataFrame):
        self._dataset = dataset.to_json()
    else:
        self._dataset = dataset
    if true_y_dataset is None:
        self._true_y = true_y
    else:
        self._true_y = true_y_dataset
    self._categorical_features = categorical_features
    self._categories = []
    self._categorical_indexes = []
    self._is_classifier = (model is not None and
                           hasattr(model, SKLearn.PREDICT_PROBA) and
                           model.predict_proba is not None)
    self._dataframeColumns = None
    self.dashboard_input = {}
    # List of explanations, key of explanation type is "explanation_type"
    self._mli_explanations = explanation.data(-1)["mli"]
    local_explanation = self._find_first_explanation(
        ExplanationDashboardInterface.MLI_LOCAL_EXPLANATION_KEY)
    global_explanation = self._find_first_explanation(
        ExplanationDashboardInterface.MLI_GLOBAL_EXPLANATION_KEY)
    ebm_explanation = self._find_first_explanation(
        ExplanationDashboardInterface.MLI_EBM_GLOBAL_EXPLANATION_KEY)
    dataset_explanation = self._find_first_explanation(
        ExplanationDashboardInterface.MLI_EXPLANATION_DATASET_KEY)
    if hasattr(explanation, 'method'):
        self.dashboard_input[
            ExplanationDashboardInterface.EXPLANATION_METHOD] = \
            explanation.method
    predicted_y = None
    feature_length = None
    if dataset_explanation is not None:
        if dataset is None or len(dataset) != len(true_y):
            dataset = dataset_explanation[
                ExplanationDashboardInterface.MLI_DATASET_X_KEY]
        if true_y is None:
            true_y = dataset_explanation[
                ExplanationDashboardInterface.MLI_DATASET_Y_KEY]
    elif len(dataset) != len(true_y):
        dataset = explanation._eval_data
    if isinstance(dataset, pd.DataFrame) and hasattr(dataset, 'columns'):
        self._dataframeColumns = dataset.columns
    try:
        list_dataset = self._convert_to_list(dataset)
    except Exception as ex:
        ex_str = _format_exception(ex)
        raise ValueError(
            "Unsupported dataset type, inner error: {}".format(ex_str))
    if dataset is not None and model is not None:
        try:
            predicted_y = model.predict(dataset)
        except Exception as ex:
            ex_str = _format_exception(ex)
            msg = ("Model does not support predict method for given"
                   " dataset type, inner error: {}").format(ex_str)
            raise ValueError(msg)
        try:
            predicted_y = self._convert_to_list(predicted_y)
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError("Model prediction output of unsupported type,"
                             " inner error: {}".format(ex_str))
    if predicted_y is not None:
        self.dashboard_input[
            ExplanationDashboardInterface.PREDICTED_Y] = predicted_y
    row_length = 0
    if list_dataset is not None:
        row_length, feature_length = np.shape(list_dataset)
        if row_length > 100000:
            raise ValueError("Exceeds maximum number of rows"
                             " for visualization (100000)")
        if feature_length > 1000:
            raise ValueError("Exceeds maximum number of features for"
                             " visualization (1000). Please regenerate the"
                             " explanation using fewer features or"
                             " initialize the dashboard without passing a"
                             " dataset.")
        self.dashboard_input[
            ExplanationDashboardInterface.TRAINING_DATA] = \
            _serialize_json_safe(list_dataset)
        self.dashboard_input[
            ExplanationDashboardInterface.IS_CLASSIFIER] = \
            self._is_classifier
    local_dim = None
    if true_y is not None and len(true_y) == row_length:
        self.dashboard_input[
            ExplanationDashboardInterface.TRUE_Y] = \
            self._convert_to_list(true_y)
    if local_explanation is not None:
        try:
            local_explanation["scores"] = self._convert_to_list(
                local_explanation["scores"])
            if np.shape(local_explanation["scores"])[-1] > 1000:
                raise ValueError("Exceeds maximum number of features for"
                                 " visualization (1000). Please regenerate"
                                 " the explanation using fewer features.")
            local_explanation["intercept"] = self._convert_to_list(
                local_explanation["intercept"])
            # We can ignore perf explanation data.
            # Note if it is added back at any point,
            # the numpy values will need to be converted to python,
            # otherwise serialization fails.
            local_explanation["perf"] = None
            self.dashboard_input[
                ExplanationDashboardInterface.LOCAL_EXPLANATIONS] = \
                local_explanation
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError("Unsupported local explanation type,"
                             " inner error: {}".format(ex_str))
        if list_dataset is not None:
            local_dim = np.shape(local_explanation["scores"])
            if len(local_dim) != 2 and len(local_dim) != 3:
                raise ValueError(
                    "Local explanation expected to be a 2D or 3D list")
            if len(local_dim) == 2 and (local_dim[1] != feature_length or
                                        local_dim[0] != row_length):
                raise ValueError("Shape mismatch: local explanation"
                                 " length differs from dataset")
            if len(local_dim) == 3 and (local_dim[2] != feature_length or
                                        local_dim[1] != row_length):
                raise ValueError("Shape mismatch: local explanation"
                                 " length differs from dataset")
    if local_explanation is None and global_explanation is not None:
        try:
            global_explanation["scores"] = self._convert_to_list(
                global_explanation["scores"])
            if 'intercept' in global_explanation:
                global_explanation["intercept"] = self._convert_to_list(
                    global_explanation["intercept"])
            self.dashboard_input[
                ExplanationDashboardInterface.GLOBAL_EXPLANATION] = \
                global_explanation
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError("Unsupported global explanation type,"
                             " inner error: {}".format(ex_str))
    if ebm_explanation is not None:
        try:
            self.dashboard_input[
                ExplanationDashboardInterface.EBM_EXPLANATION] = \
                ebm_explanation
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError(
                "Unsupported ebm explanation type: {}".format(ex_str))
    if features is None and hasattr(explanation, 'features') \
            and explanation.features is not None:
        features = explanation.features
    if features is not None:
        features = self._convert_to_list(features)
        if feature_length is not None and len(features) != feature_length:
            raise ValueError("Feature vector length mismatch:"
                             " feature names length differs"
                             " from local explanations dimension")
        self.dashboard_input[FEATURE_NAMES] = features
    if classes is None and hasattr(explanation, 'classes') \
            and explanation.classes is not None:
        classes = explanation.classes
    if classes is not None:
        classes = self._convert_to_list(classes)
        if local_dim is not None and len(classes) != local_dim[0]:
            raise ValueError("Class vector length mismatch:"
                             " class names length differs from"
                             " local explanations dimension")
        self.dashboard_input[
            ExplanationDashboardInterface.CLASS_NAMES] = classes
    if model is not None and hasattr(model, SKLearn.PREDICT_PROBA) \
            and model.predict_proba is not None and dataset is not None:
        try:
            probability_y = model.predict_proba(dataset)
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError("Model does not support predict_proba method"
                             " for given dataset type,"
                             " inner error: {}".format(ex_str))
        try:
            probability_y = self._convert_to_list(probability_y)
        except Exception as ex:
            ex_str = _format_exception(ex)
            raise ValueError(
                "Model predict_proba output of unsupported type,"
                " inner error: {}".format(ex_str))
        self.dashboard_input[
            ExplanationDashboardInterface.PROBABILITY_Y] = probability_y
    if locale is not None:
        self.dashboard_input[ExplanationDashboardInterface.LOCALE] = locale
    if self._categorical_features:
        category_dictionary = {}
        features = self.dashboard_input[FEATURE_NAMES]
        self._categorical_indexes = [
            features.index(feature)
            for feature in self._categorical_features
        ]
        from sklearn.compose import ColumnTransformer
        from sklearn.preprocessing import OrdinalEncoder
        ordinal_enc = OrdinalEncoder()
        ct = ColumnTransformer(
            [('ord', ordinal_enc, self._categorical_indexes)],
            remainder='drop')
        self.string_ind_data = ct.fit_transform(original_dataset)
        transformer_categories = ct.transformers_[0][1].categories_
        for category_arr, category_index in zip(transformer_categories,
                                                self._categorical_indexes):
            category_values = category_arr.tolist()
            self._categories.append(category_values)
            category_dictionary[category_index] = category_values
        self.dashboard_input[
            ExplanationDashboardInterface.CATEGORICAL_MAP] = \
            category_dictionary
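# A minimal, self-contained sketch (toy data invented) of the pattern used
# above: an OrdinalEncoder wrapped in a ColumnTransformer with
# remainder='drop' integer-encodes only the categorical columns, and the
# fitted encoder's categories_ provide the index -> category-values map
# that the dashboard serializes as CATEGORICAL_MAP.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

toy_data = np.array([['a', 1.0], ['b', 2.0], ['a', 3.0]], dtype=object)
categorical_indexes = [0]
ct = ColumnTransformer([('ord', OrdinalEncoder(), categorical_indexes)],
                       remainder='drop')
string_ind_data = ct.fit_transform(toy_data)
category_dictionary = {
    index: arr.tolist()
    for arr, index in zip(ct.transformers_[0][1].categories_,
                          categorical_indexes)
}
print(category_dictionary)  # {0: ['a', 'b']}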
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

data = pd.read_csv('Dataset/50_Startups.csv')  # set the path accordingly
x = data.iloc[:, :-1].values
y = data.iloc[:, -1].values
# one-hot encode the categorical 'State' column (the last feature column)
# and pass the remaining numeric columns through unchanged
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [-1])],
                       remainder='passthrough')
x = np.array(ct.fit_transform(x))
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=0)
regressor = LinearRegression()
regressor.fit(x_train, y_train)
y_pred = regressor.predict(x_test)
# print predicted and actual values side by side, rounded to 2 decimals
np.set_printoptions(precision=2)
print(
    np.concatenate(
        (y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)),
        axis=1))
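# A minimal sketch (toy data invented, not part of the script above)
# illustrating the column order that ColumnTransformer produces there:
# with remainder='passthrough', the one-hot encoded columns come first and
# the untouched columns are appended after them.
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

toy = np.array([[1.0, 'NY'], [2.0, 'CA'], [3.0, 'NY']], dtype=object)
ct_demo = ColumnTransformer([('encoder', OneHotEncoder(), [-1])],
                            remainder='passthrough')
out = ct_demo.fit_transform(toy)
# columns: ['CA' dummy, 'NY' dummy, original numeric column]
print(out)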
# imports assumed by this snippet (not shown in the original);
# TARGET is assumed to be a module-level constant naming the label column
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def prepare_data(train_df_raw, test_df_raw, data_prep_dict):
    '''
    Function to process raw data into the required modelling data
    Inputs:
    1. train_df_raw - DataFrame
    2. test_df_raw - DataFrame
    3. data_prep_dict - Dictionary mapping each preprocessing step to the
       columns it applies to
    Outputs:
    1. train_df_processed_out - DataFrame
    2. test_df_processed_out - DataFrame
    '''
    # quick check to apply data processing on both train and test combined
    # train_df_raw = pd.concat([train_df_raw, test_df_raw], axis=0)

    # sidestep SimpleImputer errors by manually assigning missing values
    train_df_raw['Holding_Policy_Duration'].fillna('-1', inplace=True)
    test_df_raw['Holding_Policy_Duration'].fillna('-1', inplace=True)
    train_df_raw.fillna('missing', inplace=True)
    test_df_raw.fillna('missing', inplace=True)

    # modify data values to convert categorical raw attributes
    # to potential numeric features
    train_df_raw.replace({'14+': '14'}, inplace=True)
    train_df_raw['Holding_Policy_Duration'] = train_df_raw[
        'Holding_Policy_Duration'].astype(float)
    test_df_raw.replace({'14+': '14'}, inplace=True)
    test_df_raw['Holding_Policy_Duration'] = test_df_raw[
        'Holding_Policy_Duration'].astype(float)

    # freeze data types
    train_df_raw[data_prep_dict['one_hot_encode']] = train_df_raw[
        data_prep_dict['one_hot_encode']].astype(str)
    test_df_raw[data_prep_dict['one_hot_encode']] = test_df_raw[
        data_prep_dict['one_hot_encode']].astype(str)

    # target encode required attributes using train-set means only
    for target_encode_col in data_prep_dict['target_encode']:
        encoding_dict = train_df_raw.groupby(
            target_encode_col)[TARGET].mean().to_dict()
        train_df_raw[target_encode_col] = train_df_raw[target_encode_col].map(
            encoding_dict)
        test_df_raw[target_encode_col] = test_df_raw[target_encode_col].map(
            encoding_dict)

    # fill missing Region Codes
    # city_code_means = train_df_raw.groupby(['City_Code'])[TARGET].mean().reset_index()
    # test_df_raw['Region_Code'] = test_df_raw.apply(
    #     lambda row: city_code_means[TARGET][city_code_means.City_Code ==
    #                                         row['City_Code']].values[0]
    #     if row['Region_Code'] not in train_df_raw['Region_Code'].unique()
    #     else row['Region_Code'],
    #     axis=1)

    # define the set of transformation steps per raw attribute in the data;
    # note: the sparse= keyword was renamed sparse_output= in scikit-learn 1.2
    column_transformer_1 = ColumnTransformer(
        [('one_hot_encode', OneHotEncoder(sparse=False, drop='if_binary'),
          data_prep_dict['one_hot_encode'])],
        remainder='passthrough', verbose=True)
    # fit the column transformer on train data only
    train_df_processed = column_transformer_1.fit_transform(train_df_raw)
    # apply the fitted column transformer to the test data
    test_df_processed = column_transformer_1.transform(test_df_raw)

    # convert numpy arrays into pandas dataframes for further analysis;
    # note: get_feature_names() became get_feature_names_out() in
    # scikit-learn 1.0
    train_df_processed_1 = pd.DataFrame(
        train_df_processed, columns=column_transformer_1.get_feature_names())
    test_df_processed_1 = pd.DataFrame(
        test_df_processed, columns=column_transformer_1.get_feature_names())

    column_transformer_2 = ColumnTransformer(
        [('passthrough', 'passthrough', [
            col for col in train_df_processed_1.columns
            if col not in data_prep_dict['standard_scale']
        ]),
         ('standard_scale', StandardScaler(),
          data_prep_dict['standard_scale'])],
        remainder='passthrough', verbose=True)
    # fit the second column transformer on train data only
    train_df_processed_2 = column_transformer_2.fit_transform(
        train_df_processed_1)
    # apply it to the test data
    test_df_processed_2 = column_transformer_2.transform(test_df_processed_1)

    # recreate column names in the correct order,
    # to understand feature importances
    train_df_processed_out = pd.DataFrame(
        train_df_processed_2,
        columns=[
            col for col in train_df_processed_1.columns
            if col not in data_prep_dict['standard_scale']
        ] + data_prep_dict['standard_scale'])
    test_df_processed_out = pd.DataFrame(
        test_df_processed_2,
        columns=[
            col for col in train_df_processed_1.columns
            if col not in data_prep_dict['standard_scale']
        ] + data_prep_dict['standard_scale'])

    # progress logger
    print('Data preparation completed, returning processed data')
    return train_df_processed_out, test_df_processed_out
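# Hypothetical usage sketch for prepare_data. The column names in
# data_prep_dict below are invented for illustration
# ('Holding_Policy_Duration' is the only column the function itself
# requires); TARGET must be defined at module level, and train_df /
# test_df are assumed to be loaded already.
TARGET = 'Response'
data_prep_dict = {
    'one_hot_encode': ['Accomodation_Type', 'Is_Spouse'],
    'target_encode': ['City_Code', 'Region_Code'],
    'standard_scale': ['Upper_Age', 'Lower_Age', 'Reco_Policy_Premium'],
}
train_ready, test_ready = prepare_data(train_df, test_df, data_prep_dict)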
import pandas as pd
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


class DogeDataLoader:
    def __init__(self, filename, categorical_cols, target_col, seq_length,
                 batch_size, preprocessor=True, prediction_window=1):
        '''
        :param filename: path to the csv dataset
        :param categorical_cols: names of the categorical columns;
            pass an empty list if there are none
        :param target_col: name of the target column
        :param seq_length: window length to use as model input
        :param prediction_window: window length to predict
        :param preprocessor: whether to normalize the data
        :param batch_size: batch size
        '''
        self.data = self.read_and_preprocess(filename)
        self.categorical_cols = categorical_cols
        # {target_col} rather than set(target_col): calling set() on a
        # string would produce a set of its characters, not the column name
        self.numerical_cols = list(
            set(self.data.columns) - set(categorical_cols) - {target_col})
        self.target_col = target_col
        self.seq_length = seq_length
        self.prediction_window = prediction_window
        self.batch_size = batch_size
        self.preprocessor = preprocessor
        self.preprocess = ColumnTransformer(
            [
                ("scaler", StandardScaler(), self.numerical_cols),
                # ("encoder", OneHotEncoder(), self.categorical_cols)
            ],
            remainder="passthrough")

    def read_and_preprocess(self, filename):
        # Read the raw csv
        df = pd.read_csv(filename)
        # Rows arrive newest-first; reverse into chronological order
        df = df[::-1].reset_index(drop=True)
        # 'Change %' column: strip percent signs and thousands separators
        df['Change %'] = df['Change %'].str.replace("%", "")
        df['Change %'] = pd.to_numeric(df['Change %'].str.replace(",", ""))
        # 'Vol.' column: values carry a unit suffix; a trailing 'B' marks
        # billions, which are rescaled to match the (assumed) millions
        vols = [el for el in df['Vol.']]
        for num, el in enumerate(vols):
            isB = el[-1] == 'B'
            try:
                el = float(el[:-1])
            except ValueError:
                # fall back to the previous row's volume on a parse failure
                print("Value Error at row ", num)
                el = vols[num - 1]
            if isB:
                el = el * 1000
            vols[num] = el
        df['Vol.'] = vols
        # the Date column is not used as a feature
        df.pop('Date')
        return df

    def preprocess_data(self):
        '''
        Split the data chronologically and apply the preprocessing pipeline
        '''
        X = self.data.drop(self.target_col, axis=1)
        y = self.data[self.target_col]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, train_size=0.8, shuffle=False)
        if self.preprocessor:
            X_train = self.preprocess.fit_transform(X_train)
            # only transform the test set: refitting the scaler here would
            # leak test-set statistics
            X_test = self.preprocess.transform(X_test)
        if self.target_col:
            return X_train, X_test, y_train.values, y_test.values
        return X_train, X_test

    def frame_series(self, X, y=None):
        '''
        Prepare the data for time series prediction
        :param X: set of features
        :param y: target values to predict
        :return: TensorDataset
        '''
        nb_obs, nb_features = X.shape
        features, target = [], []
        for i in range(1, nb_obs - self.seq_length - self.prediction_window):
            features.append(
                torch.FloatTensor(X[i:i + self.seq_length, :]).unsqueeze(0))
        # concatenate once, after the loop
        features_var = torch.cat(features)
        if y is not None:
            for i in range(1,
                           nb_obs - self.seq_length - self.prediction_window):
                target.append(
                    torch.tensor(y[i + self.seq_length:i + self.seq_length +
                                   self.prediction_window]))
            target_var = torch.cat(target)
            return TensorDataset(features_var, target_var)
        return TensorDataset(features_var)

    def get_loaders(self):
        '''
        Preprocess and frame the dataset
        :return: DataLoaders for the training and testing data
        '''
        X_train, X_test, y_train, y_test = self.preprocess_data()
        train_dataset = self.frame_series(X_train, y_train)
        test_dataset = self.frame_series(X_test, y_test)
        train_iter = DataLoader(train_dataset, batch_size=self.batch_size,
                                shuffle=False, drop_last=True)
        test_iter = DataLoader(test_dataset, batch_size=self.batch_size,
                               shuffle=False, drop_last=True)
        return train_iter, test_iter
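# Hypothetical usage sketch for DogeDataLoader. The csv path and target
# column are invented; read_and_preprocess expects an investing.com-style
# export with 'Price', 'Vol.', 'Change %' and 'Date' columns.
loader = DogeDataLoader(filename='DOGE_Historical_Data.csv',
                        categorical_cols=[],
                        target_col='Price',
                        seq_length=30,
                        batch_size=64)
train_iter, test_iter = loader.get_loaders()
for batch_X, batch_y in train_iter:
    # batch_X: (batch_size, seq_length, n_features); batch_y: (batch_size,)
    # with the default prediction_window=1
    break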
import numpy as np
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('50_Startups.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 4].values

# encoding the categorical column; note OneHotEncoder can encode string
# columns directly, so the LabelEncoder step is redundant and kept only
# to match the original workflow
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 3] = labelencoder_X.fit_transform(X[:, 3])

from sklearn.compose import ColumnTransformer
ct = ColumnTransformer(transformers=[('state', OneHotEncoder(), [3])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X), dtype='float')

# avoiding the dummy variable trap
X = X[:, 1:]

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

# Feature Scaling (left disabled in the original; LinearRegression does
# not require it)
# from sklearn.preprocessing import StandardScaler
# sc_X = StandardScaler()
# X_train = sc_X.fit_transform(X_train)
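# A minimal alternative sketch (assumes scikit-learn >= 0.21): OneHotEncoder's
# drop='first' option removes one dummy column per feature directly, so both
# the LabelEncoder step and the manual X = X[:, 1:] slice above become
# unnecessary. X_raw below is the hypothetical unencoded feature matrix.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct_drop = ColumnTransformer(
    transformers=[('state', OneHotEncoder(drop='first'), [3])],
    remainder='passthrough')
# X_raw = dataset.iloc[:, :-1].values
# X_encoded = np.array(ct_drop.fit_transform(X_raw), dtype='float')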
#Part 1 Data Preprocessing
import numpy as np
import pandas as pd

# importing the dataset
dataset = pd.read_csv("Churn_Modelling.csv")
X = dataset.iloc[:, 3:-1].values
y = dataset.iloc[:, -1].values

#Encoding independent variables
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
country = LabelEncoder()
gender = LabelEncoder()
X[:, 1] = country.fit_transform(X[:, 1])
X[:, 2] = gender.fit_transform(X[:, 2])

from sklearn.compose import ColumnTransformer
transformer = ColumnTransformer([('encoder', OneHotEncoder(), [1])],
                                remainder='passthrough')
# np.float was removed in NumPy 1.24; the builtin float is equivalent here
X = np.array(transformer.fit_transform(X), dtype=float)
# drop one dummy column to avoid the dummy variable trap
X = X[:, 1:]

#splitting the data into train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)

#Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

#Part 2 Building ANN
import keras
from keras.layers import Dense
from keras.models import Sequential
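# A minimal alternative sketch (assumes scikit-learn >= 0.23 for
# drop='if_binary'): a single ColumnTransformer can one-hot encode the
# country column (index 1) and the binary gender column (index 2) straight
# from their string values, making the two LabelEncoder steps above
# unnecessary; gender stays a single 0/1 column.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

encoder_ct = ColumnTransformer(
    [('cats', OneHotEncoder(drop='if_binary'), [1, 2])],
    remainder='passthrough')
# X_raw = dataset.iloc[:, 3:-1].values  # before any label encoding
# X_enc = np.array(encoder_ct.fit_transform(X_raw), dtype=float)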