def test_imputation_error_sparse_0(strategy):
    # check that errors are raised when missing_values == 0 and the input is sparse
    X = np.ones((3, 5))
    X[0] = 0
    X = sparse.csc_matrix(X)

    imputer = SimpleImputer(strategy=strategy, missing_values=0)
    with pytest.raises(ValueError, match="Provide a dense array"):
        imputer.fit(X)

    imputer.fit(X.toarray())
    with pytest.raises(ValueError, match="Provide a dense array"):
        imputer.transform(X)
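The test above pins down the sparse/zero ambiguity; a minimal sketch (not part of the test suite) of the dense workaround the error message points to — with missing_values=0 the implicit zeros of a sparse matrix are ambiguous, so SimpleImputer only accepts a dense array:

import numpy as np
from scipy import sparse
from sklearn.impute import SimpleImputer

X = sparse.csc_matrix(np.array([[0., 1.], [2., 0.], [3., 4.]]))
imputer = SimpleImputer(missing_values=0, strategy="mean")
# imputer.fit(X) would raise ValueError asking for a dense array
X_imputed = imputer.fit_transform(X.toarray())  # densify first, as the message suggests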
def test_imputation_const_mostf_error_invalid_types(strategy, dtype):
    # Test imputation on non-numeric data using "most_frequent" and "constant"
    # strategy
    X = np.array([
        [np.nan, np.nan, "a", "f"],
        [np.nan, "c", np.nan, "d"],
        [np.nan, "b", "d", np.nan],
        [np.nan, "c", "d", "h"],
    ], dtype=dtype)

    err_msg = "SimpleImputer does not support data"
    with pytest.raises(ValueError, match=err_msg):
        imputer = SimpleImputer(strategy=strategy)
        imputer.fit(X).transform(X)
def test_imputation_pickle():
    # Test for pickling imputers.
    import pickle

    X = sparse_random_matrix(100, 100, density=0.10)

    for strategy in ["mean", "median", "most_frequent"]:
        imputer = SimpleImputer(missing_values=0, strategy=strategy)
        imputer.fit(X)

        imputer_pickled = pickle.loads(pickle.dumps(imputer))

        assert_array_almost_equal(
            imputer.transform(X.copy()),
            imputer_pickled.transform(X.copy()),
            err_msg="Fail to transform the data after pickling "
                    "(strategy = %s)" % (strategy)
        )
def test_imputation_copy():
    # Test imputation with copy
    X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)

    # copy=True, dense => copy
    X = X_orig.copy().toarray()
    imputer = SimpleImputer(missing_values=0, strategy="mean", copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert not np.all(X == Xt)

    # copy=True, sparse csr => copy
    X = X_orig.copy()
    imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=True)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert not np.all(X.data == Xt.data)

    # copy=False, dense => no copy
    X = X_orig.copy().toarray()
    imputer = SimpleImputer(missing_values=0, strategy="mean", copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt[0, 0] = -1
    assert_array_almost_equal(X, Xt)

    # copy=False, sparse csc => no copy
    X = X_orig.copy().tocsc()
    imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert_array_almost_equal(X.data, Xt.data)

    # copy=False, sparse csr => copy
    X = X_orig.copy()
    imputer = SimpleImputer(missing_values=X.data[0], strategy="mean", copy=False)
    Xt = imputer.fit(X).transform(X)
    Xt.data[0] = -1
    assert not np.all(X.data == Xt.data)
def data_preprocessing(dataset):
    # import data
    # dataset = pd.read_csv('data/train.csv')
    X = dataset.iloc[:, 2:13].values
    Y = dataset.iloc[:, 1].values

    # replace missing data; SimpleImputer expects 2D input,
    # so a single column must be sliced as X[:, 3:4] rather than X[:, 3]
    from sklearn.impute import SimpleImputer
    imputer = SimpleImputer(strategy="mean", missing_values=np.nan)
    imputer = imputer.fit(X[:, 3:4])
    # X = imputer.fit_transform(X[:, 5:6])  # testing out new code
    X[:, 3:4] = imputer.transform(X[:, 3:4])
def _check_statistics(X, X_true, strategy, statistics, missing_values):
    """Utility function for testing imputation for a given strategy.

    Test with dense and sparse arrays.

    Check that:
        - the statistics (mean, median, mode) are correct
        - the missing values are imputed correctly"""
    err_msg = "Parameters: strategy = %s, missing_values = %s, " \
              "sparse = {0}" % (strategy, missing_values)

    assert_ae = assert_array_equal
    if X.dtype.kind == 'f' or X_true.dtype.kind == 'f':
        assert_ae = assert_array_almost_equal

    # Normal matrix
    # (SimpleImputer's constructor arguments are keyword-only in recent releases)
    imputer = SimpleImputer(missing_values=missing_values, strategy=strategy)
    X_trans = imputer.fit(X).transform(X.copy())
    assert_ae(imputer.statistics_, statistics, err_msg=err_msg.format(False))
    assert_ae(X_trans, X_true, err_msg=err_msg.format(False))

    # Sparse matrix
    imputer = SimpleImputer(missing_values=missing_values, strategy=strategy)
    imputer.fit(sparse.csc_matrix(X))
    X_trans = imputer.transform(sparse.csc_matrix(X.copy()))
    if sparse.issparse(X_trans):
        X_trans = X_trans.toarray()
    assert_ae(imputer.statistics_, statistics, err_msg=err_msg.format(True))
    assert_ae(X_trans, X_true, err_msg=err_msg.format(True))
def test_imputation_most_frequent_objects(marker):
    # Test imputation using the most-frequent strategy.
    X = np.array([
        [marker, marker, "a", "f"],
        [marker, "c", marker, "d"],
        [marker, "b", "d", marker],
        [marker, "c", "d", "h"],
    ], dtype=object)

    X_true = np.array([
        ["c", "a", "f"],
        ["c", "d", "d"],
        ["b", "d", "d"],
        ["c", "d", "h"],
    ], dtype=object)

    imputer = SimpleImputer(missing_values=marker, strategy="most_frequent")
    X_trans = imputer.fit(X).transform(X)

    assert_array_equal(X_trans, X_true)
Using the dataset.reindex() and dataset.isnull() functions, introduce missing data and retrieve the indices of the missing values. Then replace the missing values with 0, for example. Then delete those missing values.
'''
df = df.fillna(0)
print(df)
'''
Import the appropriate libraries. Import the dataset. Replace the missing values with the mean (SimpleImputer).
'''
s = SimpleImputer(missing_values=np.nan, strategy='mean')
#df1.iloc[:, :] = s.fit_transform(df1)
s = s.fit(df1[['Niv_Etude_Bac']])
df1['Niv_Etude_Bac'] = s.transform(df1[['Niv_Etude_Bac']])
s = s.fit(df1[['enfant_a_Charge']])
df1['enfant_a_Charge'] = s.transform(df1[['enfant_a_Charge']])
print(df1)
'''
Encode the categorical values (LabelEncoder).
'''
t = LabelEncoder()
df1['Solvable'] = t.fit_transform(df1['Solvable'])
print(df1)
'''
Split the dataset into training and test sets (Training and Test set).
'''
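The last instruction announces the split but the snippet stops there; a minimal sketch using the df1 names above (the 80/20 ratio and random_state are assumptions):

from sklearn.model_selection import train_test_split

X = df1.drop('Solvable', axis=1)  # assumes all remaining columns are predictors
y = df1['Solvable']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)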
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

base = pd.read_csv('../data_bases/credit_data.csv')
base.loc[base.age < 0, 'age'] = 40.92

# 1) Separate the variable types (features and class)
previsores = base.iloc[:, 1:4].values
classe = base.iloc[:, 4].values

# 2) Apply the preprocessing
# Replace the missing values with the mean of the feature values
# (previsores has only 3 columns, so the imputer is fitted on all of them;
# the leftover 1:4 slice from the original base skipped column 0)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer = imputer.fit(previsores)
previsores = imputer.transform(previsores)

# Scaling
scaler = StandardScaler()
previsores = scaler.fit_transform(previsores)

# Split the training data from the test data
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.25, random_state=0)

# 3) Apply the training
classificador = RandomForestClassifier(
    n_estimators=40, criterion='entropy', random_state=0)  # n_estimators = number of trees
classificador.fit(previsores_treinamento, classe_treinamento)
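The snippet imports confusion_matrix and accuracy_score but stops after fitting; a minimal sketch of the evaluation step, reusing the names above:

previsoes = classificador.predict(previsores_teste)
print(accuracy_score(classe_teste, previsoes))
print(confusion_matrix(classe_teste, previsoes))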
# load the csv file as a data frame
dataframe = pd.read_csv('Pohang.csv')
X = dataframe.iloc[:, :-1].values
y = dataframe.iloc[:, 8].values

# Rescale data (between 0 and 1)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit_transform(X)

# Taking care of missing data
from sklearn.impute import SimpleImputer
# creating an object of the SimpleImputer class as "imputer"
# (the deprecated verbose argument is dropped)
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# the 1:8 slice includes the lower bound but not the upper bound
imputer = imputer.fit(rescaledX[:, 1:8])
rescaledX[:, 1:8] = imputer.transform(rescaledX[:, 1:8])

# =============================================================================
# summarize the class distribution
target = dataframe.values[:, -1]
counter = Counter(target)
for k, v in counter.items():
    per = v / len(target) * 100
    print('Class=%s, Count=%s, Percentage=%.3f%%' % (k, v, per))

# summarize class distribution
print(X.shape, y.shape, Counter(y))

# Implementing SMOTE for the Imbalanced data in Multi-class classification
import pandas as pd
from torch import nn, optim
import torch.nn.functional as F
import torch.utils.data as data
import torch
import numpy as np
import io
from sklearn.model_selection import train_test_split

dataset = pd.read_csv('result_set-a.csv')
dataset = dataset.sample(frac=1)

from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(dataset)
dataset = imp.transform(dataset)

x = dataset[:, 0:41]
y = dataset[:, 41]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

from xgboost import XGBClassifier
classifier = XGBClassifier()
classifier.fit(x_train, y_train)

# sklearn.externals.joblib was removed from scikit-learn; import joblib directly
import joblib

# Save the model as a pickle in a file
joblib.dump(classifier, 'model.pkl')

y_pred = classifier.predict(x_test)
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# import the CSV data into a DataFrame
dataset = pd.read_csv('Machine Learning basics/Data Preprocessing/Data.csv')
# matrix of independent variables
X = dataset.iloc[:, :-1].values
# array of dependent variable (output)
y = dataset.iloc[:, -1].values

# handling missing data
imputer = SimpleImputer()
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# encoding categories:
# categorical data must be represented as numbers
# because ML models are mathematical
# some categorical data doesn't have order nor hierarchy,
# it just needs a numerical format
# (OneHotEncoder's categorical_features argument was removed;
# ColumnTransformer is the current way to one-hot encode a single column)
ct = ColumnTransformer([('encoder', OneHotEncoder(), [0])], remainder='passthrough')
X = ct.fit_transform(X)
y = LabelEncoder().fit_transform(y)

# splitting the dataset into training and test sets
def imputer(df, dfv, dfk, target_col, imputer_dict):
    result = {}
    for i in imputer_dict:
        if imputer_dict[i]['Indicator'] == 'deleterows':
            if df[i].isna().sum() > 0:
                # pandas Series has no .isfinite(); keep the non-missing rows with notna()
                df = df[df[i].notna()]
                dfv = dfv[dfv[i].notna()]
                dfk = dfk[dfk[i].notna()]
        if imputer_dict[i]['Indicator'] == True:
            if df[i].isna().sum() > 0:
                df[i + '_null_ind'] = np.where(df[i].isna(), 1, 0)
                dfv[i + '_null_ind'] = np.where(dfv[i].isna(), 1, 0)
                dfk[i + '_null_ind'] = np.where(dfk[i].isna(), 1, 0)
        if imputer_dict[i]['mvi'] in ['mean', 'median', 'most_frequent']:
            imp = SimpleImputer(missing_values=np.nan,
                                strategy=imputer_dict[i]['mvi'],
                                add_indicator=False,
                                fill_value=None)
            imp.fit(df[[i]])
            result[i] = imp
            df.loc[:, i] = result[i].transform(df[[i]])
            dfv.loc[:, i] = result[i].transform(dfv[[i]])
            dfk.loc[:, i] = result[i].transform(dfk[[i]])
        if imputer_dict[i]['mvi'] == 'far_val':
            # impute with a value far outside the observed range
            result[i] = df[i].max() * 100
            df[i] = np.where(df[i].isna(), result[i], df[i])
            dfv[i] = np.where(dfv[i].isna(), result[i], dfv[i])
            dfk[i] = np.where(dfk[i].isna(), result[i], dfk[i])

    # IterativeImputer (used for any column not handled above)
    imp = IterativeImputer(
        max_iter=3,
        estimator=ExtraTreesRegressor(),  # hyperparameter; Bayesian ridge, KNN etc. are alternatives
        n_nearest_features=5  # maximum number of columns considered to predict a missing value
    )
    dfvc = dfv.copy()
    dfv[target_col] = np.nan
    dfkc = dfk.copy()
    dfk[target_col] = np.nan
    dfcolumns = df.columns
    imp.fit(df)
    df = pd.DataFrame(imp.transform(df))
    df.columns = dfcolumns
    dfv = pd.DataFrame(imp.transform(dfv))
    dfv.columns = dfcolumns
    dfk = pd.DataFrame(imp.transform(dfk))
    dfk.columns = dfcolumns
    dfv[target_col] = np.array(dfvc[target_col])
    dfk[target_col] = np.array(dfkc[target_col])  # restore from the copy, mirroring dfv
    for i in imputer_dict:
        if imputer_dict[i]['mvi'] == 'iterativeimputer':
            result[i] = imp
    print("Completed imputer - ", datetime.datetime.now())
    return df, dfv, dfk, result
def transform(df):
    # mean-impute every column (the deprecated verbose argument is dropped)
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer = imputer.fit(df)
    df = imputer.transform(df)
    return df
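As written, transform() returns a bare NumPy array and silently drops the DataFrame's column labels; a variant that preserves them (a sketch, assuming the input is a pandas DataFrame):

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

def transform_keep_columns(df):
    # same mean imputation, but the result keeps the DataFrame's labels
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    return pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)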
# drop rows that have fewer than 4 non-NaN values
df.dropna(thresh=4)

# only drop rows where NaN appears in specific columns (here: 'C')
df.dropna(subset=['C'])
print(df)

############################
# IMPUTING MISSING VALUES
############################
imr = SimpleImputer(missing_values=np.nan, strategy="mean")
imr = imr.fit(df)
imputed_data = imr.transform(df.values)
print(imputed_data)

############################
# HANDLING CATEGORICAL DATA
############################
df = pd.DataFrame([
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1'],
])
df.columns = ['color', 'size', 'price', 'classlabel']
""" ax = sns.heatmap(df.isnull()).set_title("Wykrywanie brakujących wartości") """W tym zbiorze danych w aż 7 kolumnach występują brakujące wartości. Istnieje wiele sposobów radzenia sobie z brakujacymi danymi tzw. **missing data**. Techniki te najprościej podzielić na takie w których usuwa się dane (pojedyncze obserwacje lub całe kolumny) oraz na takie, w których imputuje się dane (ang. "impute"). W zależności od tego z jakimi danymi mamy doczynienia, czy są to szeregi czasowe, czy klasyczne dane, czy ciągłe, czy binarne itd. dobieramy odpowiednie metody. Gender - płeć jest pierwszą kolumną w której występują brakujące dane. Ze względu na małą ilość brakujących danych (12) decyduję się je usunąć. Analogicznie Married. Self_Employed jest zmienną która zawiera wiele brakujących danych. Z tego względu decyduje się nie usuwać tych wartości. Dla pozostałych 3 kolumn, które miały wyraźnie wiecej obserwacji brakujacych niż inne stosuje statystyczną strategie polegającą na uzupełnieniu danych najpopularniejszą wartościa dla danej zmiennej objaśniającej. Jest to prosty, ale skuteczny sposób radzi sobie zarówno ze zmiennymi ciągłymi jak i kategorycznymi. """ #usuwanie brakujących wartości df = df.dropna(subset=["Gender", "Married", "Dependents", "Loan_Amount_Term"]) # zastepowanie brakujacych wartosco najczesciej wystepujacymi imp_mean = SimpleImputer(strategy='most_frequent') imp_mean.fit(df) imputed_data_india = imp_mean.transform(df) imputed_data_india = pd.DataFrame(imputed_data_india, columns=col_names) imputed_data_india['Dependents'] = imputed_data_india['Dependents'].replace( ['3+'], 3) imputed_data_india['Dependents'] = imputed_data_india['Dependents'].astype(int) df = imputed_data_india #Konwersja typów df['Gender'] = df['Gender'].astype(str) df['Married'] = df['Married'].astype(str) df['Education'] = df['Education'].astype(str) df['Self_Employed'] = df['Self_Employed'].astype(str) df['ApplicantIncome'] = df['ApplicantIncome'].astype(int) df['CoapplicantIncome'] = df['CoapplicantIncome'].astype(int) df['LoanAmount'] = df['LoanAmount'].astype(int) df['Loan_Amount_Term'] = df['Loan_Amount_Term'].astype(int)
import numpy as np
from sklearn.impute import SimpleImputer

data = [[1, 2], [np.nan, 3], [7, 6]]
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(data)
print(imputer.transform(data))
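The fit/transform pair on the same data collapses to a single call:

print(SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(data))
# [[1. 2.]
#  [4. 3.]
#  [7. 6.]]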
# -*- coding: utf-8 -*- """ Created on Wed Apr 4 00:58:56 2021 @author: Hp """ import numpy as np import pandas as pd dataset = pd.read_csv('clasificacion2.csv') x = dataset.iloc[:, :7].values y = dataset.iloc[:, 7].values from sklearn.preprocessing import LabelEncoder labelencoder_y = LabelEncoder() y = labelencoder_y.fit_transform(y) from sklearn.impute import SimpleImputer imputer = SimpleImputer(missing_values=np.nan, strategy='mean') imputer.fit(x[:, :7]) x[:, :7] = imputer.transform(x[:, :7]) print(x) from sklearn.preprocessing import StandardScaler from sklearn import model_selection sc = StandardScaler() X_train, X_test, y_train, y_test = model_selection.train_test_split( x, y, test_size=0.2) X_train[:, :7] = sc.fit_transform(X_train[:, :7]) X_test[:, :7] = sc.transform(X_test[:, :7])
class STMBplus_auto(SupervisedLearnerPrimitiveBase[Inputs, Outputs, Params, Hyperparams]): """ A primitive that performs supervised structured feature selection to reduce input feature dimension. Input to this primitive should be a matrix of tabular numerical/categorical data, consisting of columns of features, and an array of labels. Output will be a reduced data matrix with metadata updated. """ metadata = metadata_base.PrimitiveMetadata({ 'id': '9d1a2e58-5f97-386c-babd-5a9b4e9b6d6c', 'version': rpi_d3m_primitives.__coreversion__, 'name': 'STMBplus_auto feature selector', 'keywords': ['Feature Selection'], 'description': 'This primitive is a structured feature selection function based on the independence test', 'source': { 'name': rpi_d3m_primitives.__author__, 'contact': 'mailto:[email protected]', 'uris': [ 'https://github.com/zijun-rpi/d3m-primitives/blob/master/STMBplus_auto.py', 'https://github.com/zijun-rpi/d3m-primitives.git' ] }, 'installation': [{ 'type': metadata_base.PrimitiveInstallationType.PIP, 'package': 'rpi_d3m_primitives', 'version': rpi_d3m_primitives.__version__ }], 'python_path': 'd3m.primitives.feature_selection.simultaneous_markov_blanket.AutoRPI', 'algorithm_types': [ metadata_base.PrimitiveAlgorithmType. MINIMUM_REDUNDANCY_FEATURE_SELECTION ], 'primitive_family': metadata_base.PrimitiveFamily.FEATURE_SELECTION }) def __init__( self, *, hyperparams: Hyperparams, random_seed: int = 0, docker_containers: typing.Union[typing.Dict[ str, base.DockerContainer]] = None ) -> None: super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers) self._index = None self._problem_type = 'classification' self._training_inputs = None self._training_outputs = None self._fitted = False self._cate_flag = None self._LEoutput = preprocessing.LabelEncoder() # label encoder self._Imputer = SimpleImputer(missing_values=np.nan, strategy='mean') # imputer self._nbins = self.hyperparams['nbins'] self._Kbins = preprocessing.KBinsDiscretizer( n_bins=self._nbins, encode='ordinal', strategy='uniform' ) #self.hyperparams['Discretizer_Strategy']) #KbinsDiscretizer ## TO DO # select columns via semantic types # remove preprocessing def set_training_data(self, *, inputs: Inputs, outputs: Outputs) -> None: # set problem type metadata = outputs.metadata column_metadata = metadata.query((metadata_base.ALL_ELEMENTS, 0)) semantic_types = column_metadata.get('semantic_types', []) if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types: self._problem_type = 'classification' # set training labels self._LEoutput.fit(outputs) self._training_outputs = self._LEoutput.transform(outputs) else: self._problem_type = 'regression' # convert cateforical values to numerical values in training data metadata = inputs.metadata [m, n] = inputs.shape self._training_inputs = np.zeros((m, n)) self._cate_flag = np.zeros((n, )) for column_index in metadata.get_elements( (metadata_base.ALL_ELEMENTS, )): if column_index is metadata_base.ALL_ELEMENTS: continue column_metadata = metadata.query( (metadata_base.ALL_ELEMENTS, column_index)) semantic_types = column_metadata.get('semantic_types', []) if 'https://metadata.datadrivendiscovery.org/types/CategoricalData' in semantic_types: LE = preprocessing.LabelEncoder() LE = LE.fit(inputs.iloc[:, column_index]) self._training_inputs[:, column_index] = LE.transform( inputs.iloc[:, column_index]) self._cate_flag[column_index] = 1 elif 'http://schema.org/Text' in semantic_types: pass else: temp = list(inputs.iloc[:, 
column_index].values) for i in np.arange(len(temp)): if bool(temp[i]): self._training_inputs[i, column_index] = float(temp[i]) else: self._training_inputs[i, column_index] = float('nan') if not np.count_nonzero( np.isnan(self._training_inputs[:, column_index]) ) == 0: # if there is missing values if np.count_nonzero( np.isnan(self._training_inputs[:, column_index]) ) == m: # all missing self._training_inputs[:, column_index] = np.zeros( m, ) # replace with all zeros self._fitted = False def fit(self, *, timeout: float = None, iterations: int = None) -> None: if self._fitted: return CallResult(None) if self._training_inputs.any() == None or self._training_outputs.any( ) == None: raise ValueError('Missing training data, or missing values exist.') ## impute missing values self._Imputer.fit(self._training_inputs) self._training_inputs = self._Imputer.transform(self._training_inputs) # [m,n] = self._training_inputs.shape # for column_index in range(n): # if len(np.unique(self._training_inputs[:,column_index])) == 1: # self._cate_flag[column_index] = 1 ## discretize non-categorical values disc_training_inputs = self._training_inputs if not len(np.where(self._cate_flag == 0)[0]) == 0: self._Kbins.fit( self._training_inputs[:, np.where( self._cate_flag == 0)[0]]) #find non-categorical values temp = self._Kbins.transform( self._training_inputs[:, np.where(self._cate_flag == 0)[0]]) disc_training_inputs[:, np.where(self._cate_flag == 0)[0]] = temp #start from zero Trainset = RelationSet(self._training_inputs, self._training_outputs.reshape(-1, 1)) discTrainset = RelationSet(disc_training_inputs, self._training_outputs.reshape(-1, 1)) validSet, smallTrainSet = Trainset.split( self._training_inputs.shape[0] // 4) smallDiscTrainSet = discTrainset.split( self._training_inputs.shape[0] // 4)[1] model = STMB(Trainset, discTrainset, self._problem_type, test_set=Trainset) index = model.select_features() self._index = [] [ m, ] = index.shape for ii in np.arange(m): if not len(np.unique( self._training_inputs[:, index[ii].item()])) == 1: self._index.append(index[ii].item()) self._fitted = True return CallResult(None) def produce( self, *, inputs: Inputs, timeout: float = None, iterations: int = None ) -> base.CallResult[Outputs]: # inputs: m x n numpy array if self._fitted: output = inputs.iloc[:, self._index] output.metadata = utils.select_columns_metadata( inputs.metadata, columns=self._index) return CallResult(output) else: raise ValueError('Model should be fitted first.') def get_params(self) -> None: pass def set_params(self) -> None: pass
# all rows included, but from the columns we only need the 7th (6th index)
print('Original:' '\n=========')
print(dataset)

# ===================
# DATA PREPROCESSING |
# ===================

# -------------
# MISSING DATA |
# -------------
imputer = SimpleImputer(missing_values=np.nan, strategy='constant')
# handles missing data & replaces NaN values
# the 'constant' strategy replaces missing values with fill_value (for string/object datatypes)
imputer = imputer.fit(X[:, 1:6])
# fits the imputer on X
# fitting on the training data only avoids data leakage during cross validation
X[:, 1:6] = imputer.transform(X[:, 1:6])
# imputes all missing values in X
print("\nImputed:" "\n========")
print(X)

# -----------------------------------------------------------------------
# CONVERT CATEGORICAL TEXT DATA INTO MODEL-UNDERSTANDABLE NUMERICAL DATA |
# -----------------------------------------------------------------------
labelencoder_X = LabelEncoder()
# encodes target labels with values between 0 and n_classes - 1
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# fit label encoder and return encoded labels
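The leakage concern mentioned in the comments above is usually handled by putting the imputer inside a Pipeline, so that cross-validation refits it on each training fold; a minimal sketch (the estimator and the X_encoded/y names are placeholders, not from this script):

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

pipe = Pipeline([
    ('impute', SimpleImputer(missing_values=np.nan, strategy='constant')),
    ('model', LogisticRegression()),  # placeholder estimator
])
# cross_val_score(pipe, X_encoded, y) would refit the imputer inside each fold,
# so no imputation statistics leak from the held-out fold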
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from openpyxl import Workbook, load_workbook

arac = load_workbook("ham veri.xlsx")
sheet = arac.active
sheet = arac['işlenmiş veri']

dataset = pd.read_excel('ham veri.xlsx')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 5:7])
X[:, 5:7] = imputer.transform(X[:, 5:7])
print(X)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0, 1, 3, 4])],
                       remainder='passthrough')
X = np.array(ct.fit_transform(X))
print(X)

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
for year in range(2014, 2020): yearDf = pd.read_csv('data/playerTotals' + str(year) + '.csv', sep=',') names += [name + ' ' + str(year) for name in yearDf['name']] df = pd.concat([df, yearDf], axis=0, ignore_index=True) X = df.values #%% fill in blank columns from sklearn.impute import SimpleImputer print(df.isna().any()) imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0) imputer.fit(X) X = imputer.transform(X) #%% add shot selection stats df = pd.DataFrame(data=X, columns=df.columns) shooting_ranges = [ 'At Rim', '3 to <10 ft', '10 to <16 ft', '16 ft to <3-pt', '3-pt' ] playerFGA = pd.Series(np.zeros(len(df.index))) for shot in shooting_ranges: playerFGA = playerFGA + df[shot + ' FGA'] for shot in shooting_ranges: df[shot + ' FGA%'] = df[shot + ' FGA'] / playerFGA #%% save complete player totals df.to_csv('data/completePlayerTotals.csv')
    '0s=suspect Blood Donor': 0
})

# Split the data by class
blood_donor = dataset.iloc[:533, :].values
suspect_blood_donor = dataset.iloc[533:540, :].values
hepatitis = dataset.iloc[540:564, :].values
fibrosis = dataset.iloc[564:585, :].values
cirrhosis = dataset.iloc[585:, :].values

# Handle the missing data per class
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
imputer = imputer.fit(blood_donor[:, 3:])
blood_donor[:, 3:] = imputer.transform(blood_donor[:, 3:])
imputer = imputer.fit(hepatitis[:, 3:])
hepatitis[:, 3:] = imputer.transform(hepatitis[:, 3:])
imputer = imputer.fit(fibrosis[:, 3:])
fibrosis[:, 3:] = imputer.transform(fibrosis[:, 3:])
imputer = imputer.fit(cirrhosis[:, 3:])
cirrhosis[:, 3:] = imputer.transform(cirrhosis[:, 3:])

# merge the arrays
z = np.concatenate(
    (blood_donor, suspect_blood_donor, hepatitis, fibrosis, cirrhosis), axis=0)
x = z[:, 1:]
def main(): datafile = input('Filename: ') print('\nKeep in mind that bad lines are skipped in original data format.') if datafile != 'test_plus': df = pd.read_csv(datafile, sep=" ", header=None, error_bad_lines=False) print(df.shape) #imp = SimpleImputer(missing_values=np.nan, strategy='mean') #imp = imp.fit(df) #df = imp.transform(df) df = df.dropna() print('\nChosen filename: ', datafile) if datafile == "dataset_04.txt": df.columns = [ 'frame_number', 'T1_x', 'T1_y', '1_mx', '1_my', '1_ex', '1_ey', '2_mx', '2_my', '2_ex', '2_ey', '3_mx', '3_my', '3_ex', '3_ey', '4_mx', '4_my', '4_ex', '4_ey', 'orientation', 'label' ] elif datafile == 'test_1.txt': df.columns = [ 'frame_number', 'T1_x', 'T1_y', 'angle1', 'angle2', 'angle3', 'angle4', 'cu1', 'cu2', 'cu3', 'cu4', 'chL1', 'chL2', 'chL3', 'chL4', 'theta1', 'theta2', 'theta3', 'theta4', 'AL1', 'AL2', 'AL3', 'AL4', 'orientation', 'label' ] #df = pd.DataFrame({'frame_number':df['frame_number'], 'T1_x': df['T1_x'], 'T1_y': df['T1_y'], 'angles': df['angle1']*df['angle2']*df['angle3']*df['angle4'], 'curvatures': df['cu1']*df['cu2']*df['cu3']*df['cu4']*pow(1000,4), 'chLs': df['chL1']*df['chL2']*df['chL3']*df['chL4'], 'thetas': df['theta1']*df['theta2']*df['theta3']*df['theta4'], 'ALs': df['AL1']*df['AL2']*df['AL3']*df['AL4'], 'orientation': df['orientation'], 'label': df['label']}) print(df.head()) elif datafile == 'test_plus': df1 = pd.read_csv('test_start_end_points.txt', sep=" ", header=None, error_bad_lines=False) df2 = pd.read_csv('test_for_mid_points.txt', sep=" ", header=None, error_bad_lines=False) df1 = df1.replace([np.inf, -np.inf], np.nan) df1 = df1.fillna(df1.mean()) df2 = df2.replace([np.inf, -np.inf], np.nan) df2 = df2.fillna(df2.mean()) df1 = df1.dropna() df2 = df2.dropna() df1.columns = [ 'frame_number', 'T1_x', 'T1_y', 'angle1', 'angle2', 'angle3', 'angle4', 'cu1', 'cu2', 'cu3', 'cu4', 'chL1', 'chL2', 'chL3', 'chL4', 'd_tot', 'theta1', 'theta2', 'theta3', 'theta4', 'AL1', 'AL2', 'AL3', 'AL4', 'A1', 'A2', 'A3', 'A4', 'd11', 'd21', 'd31', 'd41', 'A1_tot', 'A2_tot', 'A3_tot', 'A4_tot', 'h1', 'h2', 'h3', 'h4', 'orientation', 'label' ] df2.columns = [ 'frame_number', 'T1_x', 'T1_y', 'angle1', 'angle2', 'angle3', 'angle4', 'cu1', 'cu2', 'cu3', 'cu4', 'chL1', 'chL2', 'chL3', 'chL4', 'theta1', 'theta2', 'theta3', 'theta4', 'AL1', 'AL2', 'AL3', 'AL4', 'A1', 'A2', 'A3', 'A4', 'orientation', 'label' ] df = df1 df['mid1'] = df2['angle1'] df['mid2'] = df2['angle2'] df['mid3'] = df2['angle3'] df['mid4'] = df2['angle4'] df = df.drop([ 'cu1', 'cu2', 'cu3', 'cu4', 'd_tot', 'theta1', 'theta2', 'theta3', 'theta3', 'A1_tot', 'A2_tot', 'A3_tot', 'A4_tot', 'h1', 'h2', 'h3', 'h4', 'orientation' ], axis=1) elif datafile == 'test_start_end_points.txt' or 'test_for_mid_points.txt': df.columns = [ 'frame_number', 'T1_x', 'T1_y', 'angle1', 'angle2', 'angle3', 'angle4', 'cu1', 'cu2', 'cu3', 'cu4', 'chL1', 'chL2', 'chL3', 'chL4', 'd_tot', 'theta1', 'theta2', 'theta3', 'theta4', 'AL1', 'AL2', 'AL3', 'AL4', 'A1', 'A2', 'A3', 'A4', 'd11', 'd21', 'd31', 'd41', 'A1_tot', 'A2_tot', 'A3_tot', 'A4_tot', 'h1', 'h2', 'h3', 'h4', 'orientation', 'label' ] #df = pd.DataFrame({'frame_number':df['frame_number'], 'T1_x': df['T1_x'], 'T1_y': df['T1_y'], 'angles': df['angle1']*df['angle2']*df['angle3']*df['angle4'], 'curvatures': df['cu1']*df['cu2']*df['cu3']*df['cu4'], 'chLs': df['chL1']*df['chL2']*df['chL3']*df['chL4'], 'thetas': df['theta1']*df['theta2']*df['theta3']*df['theta4'], 'ALs': df['AL1']*df['AL2']*df['AL3']*df['AL4'], 'As': df['A1']*df['A2']*df['A3']*df['A4'], 
'orientation': df['orientation'], 'label': df['label']}) print(df.head()) print('Dataset features: ', df.columns) print( '--------------------------------------------------------------------------------------' ) print('\nData information section.') print('\nDataFrame description:') print(df.describe()) print('\nDataFrame info:') print(df.info()) print( '--------------------------------------------------------------------------------------' ) data_manipulation = input('Use data manipulation? Type "Yes" or "No": ') if data_manipulation == "Yes": manipulation_type = input( '\nWhich manipulation type do you want to use? Type answer here: ') if manipulation_type == "Angle-S1": print('Calculating direct angles between end points.') df = feature_derivation.angle(df) elif manipulation_type == "Angle-S2": print( 'Calculating direct angles between end points AND substracting them from flow orientation.' ) df = feature_derivation.angle(df) df = feature_derivation.orientation_strategy(df) else: print( '\nDesired manipulation method has not yet been implemented. Exiting program.' ) exit() else: print('\nData will not be manipulated in any way.') print( '--------------------------------------------------------------------------------------' ) strat = input( '\nWhich stratagem do we use? 1: [50-50 + 50-50], 2: [50-50 + 10-90], 3: [10-90 + 10-90]: ' ) num_sample = input('\nWhat is the standard sample size? ') if int(strat) == 1: print('\nStratagem 1 chosen.') positives = df[df['label'] == 1] positives = positives.sample(n=int(int(num_sample) * 0.5), random_state=1) zeros = df[df['label'] == 0] zeros = zeros.sample(n=int(int(num_sample) * 0.5), random_state=1) frames = [positives, zeros] df = pd.concat(frames) labels = df['label'] print(df.columns) inputs = df.drop(['frame_number', 'T1_x', 'T1_y', 'label'], 1) input_train, input_test, labels_train, labels_test = train_test_split( inputs, labels, test_size=0.20, random_state=0) elif int(strat) == 2: print('\nStratagem 2 chosen.') positives = df[df['label'] == 1] positives = positives.sample(n=int(int(num_sample) * 0.5), random_state=1) print(positives.shape) zeros = df[df['label'] == 0] zeros_s = zeros.sample(n=int(int(num_sample) * 0.5), random_state=1) print(zeros_s.shape) frames = [positives, zeros_s] df = pd.concat(frames) labels = df['label'] inputs = df.drop(['frame_number', 'T1_x', 'T1_y', 'label'], 1) input_train, input_test, labels_train, labels_test = train_test_split( inputs, labels, test_size=0.20, random_state=0) positives_2 = positives.sample(n=int(int(num_sample) * 0.1), random_state=1) print(positives_2.shape) #zeros_2 = df['label'] #print(zeros_2.shape) zeros_2 = zeros.sample(n=int(int(num_sample) * 0.9), random_state=1, replace=False) print(zeros_2.shape) frames_2 = [positives_2, zeros_2] temp_df = pd.concat(frames_2) labels_2 = temp_df['label'] inputs_2 = temp_df.drop(['frame_number', 'T1_x', 'T1_y', 'label'], 1) input_train_2, input_test, labels_train_2, labels_test = train_test_split( inputs_2, labels_2, test_size=0.20, random_state=0) elif int(strat) == 3: print('\nStratagem 3 chosen.') positives = df[df['label'] == 1] positives = positives.sample(n=int(int(num_sample) * 0.1), random_state=1) zeros = df[df['label'] == 0] zeros = zeros.sample(n=int(int(num_sample) * 0.9), random_state=1) frames = [positives, zeros] df = pd.concat(frames) labels = df['label'] inputs = df.drop(['frame_number', 'T1_x', 'T1_y', 'label'], 1) input_train, input_test, labels_train, labels_test = train_test_split( inputs, labels, test_size=0.20, random_state=0) 
else: print('\nInvalid stratagem. Exiting program.') exit() i = 0 imp = SimpleImputer(missing_values=np.nan, strategy='mean') imp = imp.fit(input_train) input_train = imp.transform(input_train) input_test = imp.transform(input_test) solver_type = input('\nWhich ML solver do we use: ') print("\n------------------------------------------------------------") while i < 2: if i == 0: print('\nRunning the model with raw data (no scaling)...') #9,10,11,8, if i != 0: do_pca = input('Do the PCA? Type "Yes" or "No": ') if do_pca == "Yes": pca = PCA(n_components=2, svd_solver='arpack', random_state=0) principalComponents = pca.fit_transform(input_train) #labels_2 = pd.concat(labels_train, labels_train) label_df = pd.DataFrame({ 'label': labels_train, 'label_2': labels_train }) print('PCA', principalComponents.shape) print('labels', labels_train.shape) #print(labels_train.columns) finalDf = np.hstack((principalComponents, label_df)) #finalDf = pd.concat(principalComponents, labels_train) #save_pca = input('\nSave PCA data? Type "Yes" or "No": ') #if save_pca == "Yes": #print(finalDf.head()) #print(finalDf.transpose().head()) # np.savetxt(r'C:\Users\oskar\eclipse-workspace\fEX\T1-feature-extraction-master\pca_data_test1.txt', finalDf, fmt='%d', delimiter=' ') print( "\n------------------------------------------------------------" ) print("\nNumber of features in the original data: ", pca.n_features_) print( pd.DataFrame( pca.components_.transpose(), index=[ 'angle1', 'angle2', 'angle3', 'angle4', 'cu1', 'cu2', 'cu3', 'cu4', 'chL1', 'chL2', 'chL3', 'chL4', 'd_tot', 'theta1', 'theta2', 'theta3', 'theta4', 'AL1', 'AL2', 'AL3', 'AL4', 'A1', 'A2', 'A3', 'A4', 'd11', 'd21', 'd31', 'd41', 'A1_tot', 'A2_tot', 'A3_tot', 'A4_tot', 'h1', 'h2', 'h3', 'h4', 'orientation', 'mid1', 'mid2', 'mid3', 'mid4' ], columns=['PC-1', 'PC-2'])) print("Data variance ratio after PCA tranform: ", pca.explained_variance_ratio_) print( "\n------------------------------------------------------------" ) vis_pca = input( '\nVisualize PCA components? 
Type "Yes" or "No": ') if vis_pca == "Yes": fig = plt.figure(figsize=(8, 8)) ax = fig.add_subplot(1, 1, 1) ax.set_xlabel('Principal Component 1', fontsize=15) ax.set_ylabel('Principal Component 2', fontsize=15) ax.set_title('PCA: 2 component projection', fontsize=20) targets = [0, 1] colors = ['orangered', 'dodgerblue'] #print(finalDf.head()) for target, color in zip(targets, colors): #finalDf_2 = finalDf[np.where(finalDf[:,2] == target)] finalDf_2 = finalDf[finalDf[:, 2] == target] if color == 'r': alfa = 1.0 normi = 1.0 else: alfa = 0.80 normi = 1.0 ax.scatter(finalDf_2[:, 0], finalDf_2[:, 1], c=color, alpha=alfa, marker='o', s=5.0, norm=normi) ax.legend(targets) ax.grid() plt.show() clf = MLPClassifier(solver=solver_type, alpha=1e-5, hidden_layer_sizes=(9, 10, 2), random_state=0, max_iter=20000, shuffle=True) clf.fit(input_train, labels_train) predicted = clf.predict(input_test) conf_matrix = metrics.confusion_matrix(labels_test, predicted) tn, fp, fn, tp = metrics.confusion_matrix(labels_test, predicted).ravel() scores = cross_val_score(clf, input_test, labels_test, cv=5) print('Fitting of training data complete.') print('Predicting based on test data.') print("\nTraining set score: %f" % clf.score(input_train, labels_train)) print("Test set score: %f" % clf.score(input_test, labels_test)) print('Confusion matrix: \n') print('TN:', tn) print('TP: ', tp) print('FN: ', fn) print('FP:', fp) print('CROSS VALIDATION SCORES:', scores) print("\n------------------------------------------------------------") if i == 0: print('\nScaling following features', inputs.columns) input_train = StandardScaler().fit_transform(input_train) input_test = StandardScaler().fit_transform(input_test) print('\nRunning the same model with scaled data...') i = i + 1 print('\nFinished.')
# Read the credit_data dataset and assign it to df
df = pd.read_csv('CSVs/credit_data.csv')

# Replace the anomalous ages with the mean of the normal ones
df.loc[df.age < 0, 'age'] = df.age[df.age > 0].mean()

# Select the feature and class columns
previsores = df.iloc[:, 1:4].values
classe = df.iloc[:, 4].values

# Instantiate the imputer class
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Learn the statistics from the existing values
imputer = imputer.fit(previsores)

# Fill in the cells that have missing values
previsores = imputer.transform(previsores)

# Instantiate the scaling class
scaler = StandardScaler()

# Standardize the values to the same scale
previsores = scaler.fit_transform(previsores)
"""
TRAIN/TEST SPLIT
"""
# Import the library for splitting the dataset into training and test sets
from sklearn.model_selection import train_test_split
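The import above anticipates the split itself; a minimal sketch (the 25% test size mirrors the earlier credit_data snippet and is otherwise an assumption):

previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(
    previsores, classe, test_size=0.25, random_state=0)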
import matplotlib.pyplot as plt import pandas as pd """Importing Data Set""" dataset = pd.read_csv('Data.csv') x = dataset.iloc[:, :-1].values y = dataset.iloc[:, -1].values print(x) print(y) """Taking care of missing data""" from sklearn.impute import SimpleImputer imputer = SimpleImputer(missing_values=np.nan, strategy='median') imputer.fit(x[:, 1:3]) x[:, 1:3] = imputer.transform(x[:, 1:3]) print(x) """#Encoding Categorical data ###Encoding the independent variable """ from sklearn.compose import ColumnTransformer from sklearn.preprocessing import OneHotEncoder ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0])], remainder='passthrough') x = np.array(ct.fit_transform(x)) print(x) """###Encoding for dependent variables"""
os.chdir("C:/Venkat/Personal/Trainings/Datasets/") #creation of data frames from csv titanic_train = pd.read_csv("Titanic_train.csv") print(titanic_train.info()) #preprocessing stage #The SimpleImputer class is for continues numerical values, but also supports categorical data represented #as string values or pandas categoricals when using the 'most_frequent' or 'constant' strategy: #SimpleImputer(strategy="most_frequent") #impute missing values for continuous features imputable_cont_features = ['Age', 'Fare'] #cont_imputer = preprocessing.Imputer() not working in this version python. cont_imputer = SimpleImputer() # default Mean cont_imputer.fit(titanic_train[imputable_cont_features]) print(cont_imputer.statistics_) titanic_train[imputable_cont_features] = cont_imputer.transform( titanic_train[imputable_cont_features]) #impute missing values for categorical features #cat_imputer = CategoricalImputer cat_imputer = SimpleImputer(strategy="most_frequent") cat_imputer.fit(titanic_train[['Embarked']]) #print(cat_imputer.fill_) print(cat_imputer.statistics_) titanic_train[['Embarked']] = cat_imputer.transform(titanic_train[['Embarked' ]]) le_embarked = preprocessing.LabelEncoder() le_embarked.fit(titanic_train['Embarked'])
def main():
    # data load
    df = pd.read_csv('./data/' + file_model + '.csv', header=0)
    ID = df.iloc[:, 0]
    y = df.iloc[:, -1]
    X = df.iloc[:, 1:-1]

    # preprocessing-1: one-hot encoding
    X_ohe = pd.get_dummies(X, dummy_na=True, columns=ohe_cols)
    X_ohe = X_ohe.dropna(axis=1, how='all')
    X_ohe_columns = X_ohe.columns.values

    # preprocessing-2: null imputation
    imp = SimpleImputer()
    imp.fit(X_ohe)
    X_ohe = pd.DataFrame(imp.transform(X_ohe), columns=X_ohe_columns)
    print(X_ohe.shape)

    # preprocessing-3: feature selection
    selector = RFECV(estimator=RandomForestClassifier(n_estimators=100, random_state=0),
                     step=0.05)
    selector.fit(X_ohe, y)
    X_ohe_selected = selector.transform(X_ohe)
    X_ohe_selected = pd.DataFrame(X_ohe_selected,
                                  columns=X_ohe_columns[selector.support_])
    print(X_ohe_selected.shape)
    X_ohe_selected.head()

    # preprocessing-4: preprocessing of the score data along with the model dataset
    if len(file_score) > 0:
        # load score data
        dfs = pd.read_csv('./data/' + file_score + '.csv', header=0)
        IDs = dfs.iloc[:, [0]]
        Xs = dfs.iloc[:, 1:-1]
        Xs_ohe = pd.get_dummies(Xs, dummy_na=True, columns=ohe_cols)
        cols_m = pd.DataFrame(None, columns=X_ohe_columns, dtype=float)
        # make the columns consistent with the model dataset
        Xs_exp = pd.concat([cols_m, Xs_ohe])
        Xs_exp.loc[:, list(set(X_ohe_columns) - set(Xs_ohe.columns.values))] = \
            Xs_exp.loc[:, list(set(X_ohe_columns) - set(Xs_ohe.columns.values))].fillna(0)
        Xs_exp = Xs_exp.drop(
            list(set(Xs_ohe.columns.values) - set(X_ohe_columns)), axis=1)
        # re-order the score data columns (DataFrame.reindex_axis was removed; use reindex)
        Xs_exp = Xs_exp.reindex(X_ohe_columns, axis=1)
        Xs_exp = pd.DataFrame(imp.transform(Xs_exp), columns=X_ohe_columns)
        Xs_exp_selected = Xs_exp.loc[:, X_ohe_columns[selector.support_]]

    # modeling (Series.as_matrix was removed from pandas; use .values)
    clf.fit(X_ohe_selected, y.values.ravel())
    joblib.dump(clf, './model/' + model_name + '.pkl')
    results = cross_val_score(clf, X_ohe_selected, y, scoring='roc_auc', cv=5)
    print('cv score:', np.average(results), '+-', np.std(results))

    # scoring
    if len(file_score) > 0:
        score = pd.DataFrame(clf.predict_proba(Xs_exp_selected)[:, 1],
                             columns=['pred_score'])
        IDs.join(score).to_csv('./data/' + model_name + '_' + file_score +
                               '_with_pred.csv', index=False)

    # model profile (note: this rebinds imp from the imputer to the importance table)
    imp = pd.DataFrame([clf.named_steps['est'].feature_importances_],
                       columns=X_ohe_columns[selector.support_])
    imp.T.to_csv('./data/' + model_name + '_feature_importances.csv', index=True)
import numpy as np #Fetching the training dataset import pandas as pd train_data = pd.read_excel('../Dataset/Train_dataset(1).xlsx', 'Train_dataset') test_data = pd.read_excel('../Dataset/Test_dataset_1.xlsx') test_27March = pd.read_excel('../Dataset/Train_dataset(1).xlsx', 'Train_27March') #Checking for missing values #print(train_data.isnull().sum()) #Imputing values through simple method like median of the column imp = SimpleImputer(missing_values=np.nan, strategy='median') imp.fit(train_data[[ 'Children', 'Diuresis', 'Platelets', 'HBB', 'd-dimer', 'Heart rate', 'HDL cholesterol', 'Insurance', 'FT/month' ]]) X = pd.DataFrame( imp.transform(train_data[[ 'Children', 'Diuresis', 'Platelets', 'HBB', 'd-dimer', 'Heart rate', 'HDL cholesterol', 'Insurance', 'FT/month' ]])) train_data[[ 'Children', 'Diuresis', 'Platelets', 'HBB', 'd-dimer', 'Heart rate', 'HDL cholesterol', 'Insurance', 'FT/month' ]] = imp.transform(train_data[[ 'Children', 'Diuresis', 'Platelets', 'HBB', 'd-dimer', 'Heart rate', 'HDL cholesterol', 'Insurance', 'FT/month' ]]) # Imputing categorical data by mode
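The final comment announces mode imputation of the categorical data, but the code stops there; a minimal sketch of that step (the column list is hypothetical, since the snippet does not name the categorical columns):

cat_cols = ['Gender']  # hypothetical column list, not from the original snippet
cat_imp = SimpleImputer(strategy='most_frequent')
train_data[cat_cols] = cat_imp.fit_transform(train_data[cat_cols])
test_data[cat_cols] = cat_imp.transform(test_data[cat_cols])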
# Import Data
X = pd.read_csv("input/X.csv")
print(X.shape)
y = pd.read_csv("input/y.csv").squeeze("columns")  # read_csv's squeeze argument was removed
print(y.shape)
train_metadata = pd.read_csv('input/metadata_train.csv')  # read train data
test_metadata = pd.read_csv('input/metadata_test.csv')  # read test data

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=.2, stratify=y, random_state=1)  # split X, y into 80% train / 20% valid

my_imputer = SimpleImputer(strategy="median")
my_imputer.fit(X_train)
scaler = StandardScaler()
scaler.fit(my_imputer.transform(X_train))  # fit the scaler on the imputed training data
X_train = pd.DataFrame(scaler.transform(my_imputer.transform(X_train)))
X_valid = pd.DataFrame(scaler.transform(my_imputer.transform(X_valid)))

def opt(X_train, y_train, X_test, y_test, trial):
    # param_list
    n_estimators = trial.suggest_int('n_estimators', 0, 1000)
    max_depth = trial.suggest_int('max_depth', 1, 20)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 20)
    # learning_rate = trial.suggest_discrete_uniform('learning_rate', 0.01, 0.1, 0.01)
    scale_pos_weight = trial.suggest_int('scale_pos_weight', 1, 100)
    subsample = trial.suggest_discrete_uniform('subsample', 0.5, 0.9, 0.1)
    colsample_bytree = trial.suggest_discrete_uniform(
# plt.show()

from pandas.plotting import scatter_matrix  # pandas.tools.plotting was removed
attributes = ["median_house_value", "median_income", "total_rooms", "housing_median_age"]
# scatter_matrix(housing[attributes], figsize=(12, 8))
# plt.show()

housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

housing_num = housing.drop("ocean_proximity", axis=1)

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy="median")
imputer.fit(housing_num)
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns)

# from sklearn.preprocessing import LabelEncoder
# encoder = LabelEncoder()
housing_cat = housing["ocean_proximity"]
# housing_cat_encoded = encoder.fit_transform(housing_cat)

# from sklearn.preprocessing import OneHotEncoder
# encoder = OneHotEncoder()
# housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1,1))

from sklearn.preprocessing import LabelBinarizer
encoder = LabelBinarizer()
# %% housing_data = strat_train_set.drop("median_house_value", axis=1) housing_labels = strat_train_set["median_house_value"].copy() # clean data # housing_data.dropna(subset=["total_bedrooms"]) # first option -> drop rows with missing value # housing_data.drop("total_bedroooms", axis = 1) # second option -> drop whole attribute # median = housing_data["total_bedrooms"].median() # housing_data["total_bedrooms"].fillna(median, inplace = True ) # %% imputer = SimpleImputer(strategy="median") housing_data_num = housing_data.drop("ocean_proximity", axis=1) imputer.fit(housing_data_num) imputer.statistics_ # %% housing_data_num.median().values # %% transform_table = imputer.transform(housing_data_num) housing_tr = pd.DataFrame(transform_table, columns=housing_data_num.columns) # %% encoder = LabelEncoder() housing_cat = housing_data["ocean_proximity"] housing_cat_encoded = encoder.fit_transform(housing_cat) housing_cat_encoded
def fit_categorical_imputer(train_data: pd.DataFrame, categorical_features: List[str]): categorical_imputer = SimpleImputer(strategy="most_frequent", copy=True) categorical_imputer.fit(train_data[categorical_features]) return categorical_imputer
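A usage sketch for the helper above (the DataFrame and column names are illustrative, not from the original project):

import numpy as np
import pandas as pd

train_df = pd.DataFrame({"color": ["red", np.nan, "red"],
                         "size": ["S", "M", np.nan]})  # hypothetical data
cat_imputer = fit_categorical_imputer(train_df, ["color", "size"])
train_df[["color", "size"]] = cat_imputer.transform(train_df[["color", "size"]])
print(train_df)  # NaNs replaced by the per-column most frequent value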
"test_8": test_8, "test_9": test_9, "test_10": test_10, "test_11": test_11, "test_12": test_12, "test_13": test_13, "test_14": test_14, "test_charge": test_charge }) ### normalize data train_X = dataset_train.drop('train_charge', axis=1) test_X = dataset_test.drop('test_charge', axis=1) #print(test_X) sc.fit(train_X) train_X_normalized = pd.DataFrame(sc.fit_transform(train_X), columns=train_X.columns.values) test_X_normalized = sc.transform(test_X) #print(test_X_normalized) train_Y = dataset_train['train_charge'] test_Y = dataset_test['test_charge'] #print(train_X_normalized) # print(train_Y) ### RFR regressor fit ### ''' random_state:指定模型随机状态,确保每次生成的模型是相同的 n_jods:进程个数(-1为用所有的CPU进行计算,默认为None,即为1) 最大特征数为所有特征数一半时为佳 最大深度:当特征较少或数据集较少时可选择默认
def preprocess(data_dir='datasets/', imputation_type='mean'): # X is the complete data matrix missing_gt = 60 print(os.getcwd()) aps_training_df = pd.read_csv(data_dir + 'aps_failure_training_set.csv', error_bad_lines=False) aps_test_df = pd.read_csv(data_dir + 'aps_failure_test_set.csv', error_bad_lines=False) print(aps_training_df.shape, aps_test_df.shape) print("=======Remove duplicates=================") aps_training_df.drop_duplicates(inplace=True) print(aps_training_df.shape, aps_test_df.shape) # aps_training_df = aps_training_df[aps_training_df['class']=='pos'] print(aps_training_df.shape) print(aps_training_df.columns) #print(aps_training_df.isin(['na']).mean() * 100) print(aps_training_df.head()) print('replacing na values to null=========') aps_training_df.replace(r'na', np.nan, regex=False, inplace=True) aps_test_df.replace(r'na', np.nan, regex=False, inplace=True) print('===removing more than ' + str(missing_gt) + '% missing column=========') percent_missing = aps_training_df.isnull().sum() * 100 / len( aps_training_df) missing_value_df = pd.DataFrame({ 'column_name': aps_training_df.columns, 'percent_missing': percent_missing }) missing_value_df.sort_values('percent_missing', inplace=True, ascending=False) missing_gt_x = missing_value_df[ missing_value_df['percent_missing'] > missing_gt].column_name print(missing_gt_x) aps_training_df = aps_training_df.drop(missing_gt_x, axis=1) aps_test_df = aps_test_df.drop(missing_gt_x, axis=1) print( "============================Remove missing rows from Traning data==========================================" ) percent_missing_pos = ( aps_training_df[aps_training_df['class'] == 'pos'].isnull().sum() / len(aps_training_df[aps_training_df['class'] == 'pos'])) * 100 missing_value_df_pos = pd.DataFrame({ 'column_name': aps_training_df.columns, 'percent_missing': percent_missing_pos }) missing_value_df_pos.sort_values('percent_missing', inplace=True, ascending=False) prcent_row_missing = aps_training_df.isnull().sum(axis=1) * 100 / 170 prcnt_50_row_missing = prcent_row_missing[prcent_row_missing > 50] print('More than 50% missing rows:::', len(prcnt_50_row_missing)) #aps_training_df.drop(prcnt_50_row_missing.index, axis=0, inplace=True) print( "============================Remove missing rows from Test data ==========================================" ) percent_missing_pos = ( aps_test_df[aps_test_df['class'] == 'pos'].isnull().sum() / len(aps_test_df[aps_test_df['class'] == 'pos'])) * 100 missing_value_df_pos = pd.DataFrame({ 'column_name': aps_test_df.columns, 'percent_missing': percent_missing_pos }) missing_value_df_pos.sort_values('percent_missing', inplace=True, ascending=False) prcent_row_missing = aps_test_df.isnull().sum(axis=1) * 100 / 170 prcnt_50_row_missing = prcent_row_missing[prcent_row_missing > 50] print('More than 50% missing rows:::', len(prcnt_50_row_missing)) #aps_test_df.drop(prcnt_50_row_missing.index, axis=0, inplace=True) #intersection_list = list(set(missing_value_df_pos.index) & set(prcnt_50_row_missing.index)) #print("==Intersection list::::::::::",intersection_list) #aps_training_df.drop(intersection_list, axis=1, inplace=True) #aps_test_df.drop(intersection_list, axis=1, inplace=True) print("Training and Test data-set shape after dropping features is ", aps_training_df.shape, aps_test_df.shape) # Print number of positive classes and number of negative classes in the training data-set print("Number of positive classes = ", sum(aps_training_df['class'] == 'pos')) print("Number of negative classes = ", 
sum(aps_training_df['class'] == 'neg')) print("*******************") print("===================Drop outliers=================") from scipy import stats def drop_numerical_outliers(df, z_thresh=3): # Constrains will contain `True` or `False` depending on if it is a value below the threshold. constrains = df.select_dtypes(include=[np.number]) \ .apply(lambda x: np.abs(stats.zscore(x)) < z_thresh) \ .all(axis=1) # Drop (inplace) values set to be rejected return df.drop(df.index[~constrains]) # train = X_train # train['failure'] = Y_train #outlier = drop_numerical_outliers(aps_training_df) #print(outlier['class'].value_counts()) #print(outlier.shape) # Extract features and labels from the training and test data-set y_train = aps_training_df[['class']].values x_train = aps_training_df.drop('class', axis=1) y_test = aps_test_df.loc[:, 'class'].values x_test = aps_test_df.drop('class', axis=1) columns = x_train.columns print('=======================remove duplicates====================') print('=============Missing Imputation=========') # Fill missing data in training and test data-set if (imputation_type == 'median'): imputer = SimpleImputer(strategy='median') imputer.fit(x_train.values) x_train = imputer.transform(x_train.values) x_test = imputer.transform(x_test.values) elif (imputation_type == 'knn'): imputer = KNNImputer(n_neighbors=3) imputer.fit(x_train.values) x_train = imputer.transform(x_train.values) x_test = imputer.transform(x_test.values) elif (imputation_type == 'mean'): imputer = SimpleImputer(strategy='mean') imputer.fit(x_train.values) x_train = imputer.transform(x_train.values) x_test = imputer.transform(x_test.values) else: x_train.fillna(-1, inplace=True) x_test.fillna(-1, inplace=True) scaler = MinMaxScaler() scaler.fit(x_train) x_train = scaler.transform(x_train) x_test = scaler.transform(x_test) x_train_df = pd.DataFrame(x_train, columns=columns) y_train_df = pd.DataFrame(y_train, columns=['failure']) x_test_df = pd.DataFrame(x_test, columns=columns) y_test_df = pd.DataFrame(y_test, columns=['failure']) #x_train_df, x_test_df = feature_selection(x_train_df,y_train_df,x_test_df) df_train_data = pd.concat([y_train_df, x_train_df], axis=1) df_train_data.to_csv("datasets/train_processed_aps_" + imputation_type + ".csv") print(df_train_data.shape) df_test_data = pd.concat([y_test_df, x_test_df], axis=1) df_test_data.to_csv("datasets/test_processed_aps_" + imputation_type + ".csv") print(df_test_data.shape) df_train_data['failure'] = df_train_data['failure'].replace(['neg', 'pos'], [0, 1]) df_test_data['failure'] = df_test_data['failure'].replace(['neg', 'pos'], [0, 1]) #gc.collect() print("Pre processing completed") return df_train_data, df_test_data
print(model.score(x_test, y_test)) print('\n') #Do Predictions-------------------------------------------------------------- #read test file new = pd.read_csv(r'testdata.csv') #replace missing values with NaN new = new.replace('[?]', numpy.nan, regex=True) #replace numerical missing values mean imputer = SimpleImputer(missing_values=numpy.nan, strategy='mean') # fill A2 and A14 imputer = imputer.fit(new[['A2']]) new[['A2']] = imputer.transform(new[['A2']]) imputer = imputer.fit(new[['A14']]) new[['A14']] = imputer.transform(new[['A14']]) #replace non numerical missing values using forward filling new = new.ffill(axis=0) #label encode new['A1'] = label_encoder.fit_transform(new['A1']) new['A3'] = label_encoder.fit_transform(new['A3']) new['A4'] = label_encoder.fit_transform(new['A4']) new['A6'] = label_encoder.fit_transform(new['A6']) new['A9'] = label_encoder.fit_transform(new['A9']) new['A15'] = label_encoder.fit_transform(new['A15'])
import numpy as np import pandas as pd from sklearn.model_selection import train_test_split from sklearn.tree import DecisionTreeRegressor from sklearn.impute import SimpleImputer # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'], random_state=42) imputer = SimpleImputer(strategy="median") imputer.fit(training_features) training_features = imputer.transform(training_features) testing_features = imputer.transform(testing_features) # Average CV score on the training set was: -6.068083376489832e+16 exported_pipeline = DecisionTreeRegressor(max_depth=3, min_samples_leaf=4, min_samples_split=18) # Fix random state in exported estimator if hasattr(exported_pipeline, 'random_state'): setattr(exported_pipeline, 'random_state', 42) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
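TPOT exports the imputation step manually; an equivalent formulation that bundles it with the estimator (a sketch, behaviorally the same under the settings shown above):

from sklearn.pipeline import make_pipeline

pipeline_version = make_pipeline(
    SimpleImputer(strategy="median"),
    DecisionTreeRegressor(max_depth=3, min_samples_leaf=4,
                          min_samples_split=18, random_state=42),
)
pipeline_version.fit(training_features, training_target)
results = pipeline_version.predict(testing_features)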
def main(): np.set_printoptions( threshold=10) # Ndarray display threshold to avoid hiding some columns print('HOUSING_PATH=', HOUSING_PATH) print('HOUSING_URL=', HOUSING_URL) fetch_housing_data(HOUSING_URL, HOUSING_PATH) print('After fetch_housing_data') housing = load_housing_data(HOUSING_PATH) print('After load_housing_data') print(housing.head()) # INFO statement print("\nINFO statement:") print(housing.info()) # Value counts print("\nValue counts:") print(housing["ocean_proximity"].value_counts()) # "describe" statement for summary print("\nDESCRIBE statement:") print(housing.describe()) # Plot data #housing.hist(bins=50,figsize =(20,15)) #plt.show() # Test set sampling - random vs stratification housing["income_cat"] = pd.cut(housing["median_income"], bins=[0., 1.5, 3.0, 4.5, 6.0, np.inf], labels=[1, 2, 3, 4, 5]) housing["income_cat"].hist() #plt.show() # Random test set # Not necessarily the best sampling method. Ex. Sex can influence the median income -> right fraction of male/female is critical rand_train_set, rand_test_set = train_test_split(housing, test_size=0.2, random_state=42) # Stratification of data print("\nStratify housing data:") split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42) print(split.split(housing, housing["income_cat"])) print(len(list(split.split(housing, housing["income_cat"])))) ic = 0 for train_index, test_index in split.split(housing, housing["income_cat"]): ic += 1 print("ic = ", ic) print(len(train_index), train_index) print(len(test_index), test_index) #sys.exit() strat_train_set = housing.loc[train_index] strat_test_set = housing.loc[test_index] #strat_full_set=housing #rts = (rand_test_set["income_cat"].value_counts()/len(rand_test_set)).sort_index() #sts = (strat_test_set["income_cat"].value_counts()/len(strat_test_set)).sort_index() #sfs = (strat_full_set["income_cat"].value_counts()/len(strat_full_set)).sort_index() #print('rand_test: \n{0}'.format(rts)) #print('strat_test: \n{0}'.format(sts)) #print('strat_full: \n{0}'.format(sfs)) # Separate predictors and labels print("\nSeparate predictors and labels:") housing = strat_train_set.drop("median_house_value", axis=1) # <- Predictor data housing_labels = strat_train_set["median_house_value"].copy() # <- Labels housing_cat = housing[["ocean_proximity"]] # Non-numeric categories print("housing_cat.head(10) = {}".format(housing_cat.head(10))) ''' Sklearn - Simple imputer ''' imputer = SimpleImputer(strategy="median") housing_num_only = housing.drop("ocean_proximity", axis=1) imputer.fit(housing_num_only) print("imputer.statistics_ = {0}".format(imputer.statistics_)) print("housing_num_only.median() = {0}".format(housing_num_only.median())) X = imputer.transform(housing_num_only) housing_tr = pd.DataFrame(X, columns=housing_num_only.columns) print('housing_tr.info() : ') print(housing_tr.info()) ''' Encording ''' print("housing_cat_encoded = {0}".format(housing_cat[:10])) # Ordinal encoder : replace categorical attributes into numbers # Issue with this method is the "distance" between the numerical values ##print("\nOrdinal encoder:") ##ordinal_encoder = OrdinalEncoder() ##housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat) ##print("housing_cat_encoded = {0}".format(housing_cat_encoded[:10])) ##print("ordinal_encoder.categories_ = {0}".format(ordinal_encoder.categories_)) # One-hot encorder: Split categories and label only 0 or 1 # This way can avoid "distance" problem of the ordinal encorder # Output is a SiPy sparse matrix. 
    print("\nOne-hot encoder:")
    cat_encoder = OneHotEncoder()
    housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
    #print("housing_cat_1hot = {0}".format(housing_cat_1hot))
    #print("housing_cat_1hot.toarray() = {0}".format(housing_cat_1hot.toarray()))

    # Attribute adder
    attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
    housing_extra_attribs = attr_adder.transform(housing.values)

    # Transformation pipeline
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('attribs_adder', CombinedAttributesAdder()),
        ('std_scaler', StandardScaler()),
    ])
    #print("num_pipeline = {0}".format(type(num_pipeline)))
    housing_num_tr = num_pipeline.fit_transform(housing_num_only)
    #print("housing_num_tr = {0}".format(housing_num_tr))

    num_attribs = list(housing_num_only)
    cat_attribs = ["ocean_proximity"]
    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", OneHotEncoder(), cat_attribs),
    ])
    print("housing.head(5): ", housing.head(5))
    housing_prepared = full_pipeline.fit_transform(housing)
    print("housing_prepared: ", pd.DataFrame(housing_prepared).iloc[:5])

    ''' Training and evaluating on the training set '''
    # Perform linear regression
    print('Linear regression:')
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)
    #print("housing.head(5): ", housing.head(5))
    '''
    some_data = housing.iloc[:5]
    some_labels = housing_labels.iloc[:5]
    print("len(some_data): ", len(some_data))
    print("some_data: ", some_data)
    print("len(some_labels): ", len(some_labels))
    print("some_labels: ", some_labels)
    some_data_prepared = full_pipeline.transform(some_data)  # Output is a numpy array
    #print("some_data_prepared: ", pd.DataFrame(some_data_prepared))
    print("Labels: ", list(some_labels))
    '''
    print("Labels: ", housing_labels)
    #print("Labels: ", list(housing_labels))
    #np.set_printoptions(threshold=np.inf)

    # Compute RMSE
    #some_predictions = lin_reg.predict(some_data_prepared)
    #print("Predictions:", type(some_predictions))
    #print("len(Predictions):", len(some_predictions))
    #print("Predictions:", some_predictions)
    #lin_mse = mean_squared_error(some_labels, some_predictions)
    housing_predictions = lin_reg.predict(housing_prepared)
    print("len(housing_predictions):", len(housing_predictions))
    print("Predictions:", housing_predictions)
    lin_mse = mean_squared_error(housing_labels, housing_predictions)
    lin_rmse = np.sqrt(lin_mse)
    print('lin_rmse = {0}\n'.format(lin_rmse))

    # Perform decision tree regression
    print('Decision tree regressor:')
    tree_reg = DecisionTreeRegressor()
    tree_reg.fit(housing_prepared, housing_labels)

    # Compute RMSE
    housing_predictions = tree_reg.predict(housing_prepared)
    print("Predictions:", housing_predictions)
    tree_mse = mean_squared_error(housing_labels, housing_predictions)
    tree_rmse = np.sqrt(tree_mse)
    print('tree_rmse = {0}\n'.format(tree_rmse))
    print('')

    ''' Cross-validation '''
    print('*** Cross-validation ***')
    print(' Decision tree:')
    scores = cross_val_score(tree_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
    tree_rmse_scores = np.sqrt(-scores)
    display_scores(tree_rmse_scores)
    print('')

    print(' Linear regression:')
    lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels,
                                 scoring="neg_mean_squared_error", cv=10)
    lin_rmse_scores = np.sqrt(-lin_scores)
    display_scores(lin_rmse_scores)
    print('')
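    # Illustrative aside (not from the original script): scikit-learn scorers
    # follow a "greater is better" convention, so "neg_mean_squared_error"
    # yields negated MSE values and np.sqrt(-scores) recovers RMSE, as done
    # above and below. demo_neg_mse is an invented array for demonstration.
    demo_neg_mse = np.array([-4.0e9, -9.0e9])
    print("demo RMSE:", np.sqrt(-demo_neg_mse))  # -> [63245.55..., 94868.32...]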
    print(' Random forest regressor:')
    forest_reg = RandomForestRegressor()
    forest_reg.fit(housing_prepared, housing_labels)
    housing_predictions = forest_reg.predict(housing_prepared)
    print("Predictions:", housing_predictions)
    forest_mse = mean_squared_error(housing_labels, housing_predictions)
    forest_rmse = np.sqrt(forest_mse)
    print('forest_rmse = {0}\n'.format(forest_rmse))
    scores = cross_val_score(forest_reg, housing_prepared, housing_labels,
                             scoring="neg_mean_squared_error", cv=10)
    forest_rmse_scores = np.sqrt(-scores)
    display_scores(forest_rmse_scores)
    print('')

    ''' Fine-tuning '''
    param_grid = [
        {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
        {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
    ]
    forest_reg = RandomForestRegressor()
    grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                               scoring='neg_mean_squared_error',
                               return_train_score=True)
    grid_search.fit(housing_prepared, housing_labels)
    print('grid_search.best_params_ = {}'.format(grid_search.best_params_))
    print('grid_search.best_estimator_', grid_search.best_estimator_)
    cvres = grid_search.cv_results_
    for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
        print(np.sqrt(-mean_score), params)

    feature_importances = grid_search.best_estimator_.feature_importances_
    print('feature_importances: \n{0}'.format(feature_importances))
    extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
    cat_encoder = full_pipeline.named_transformers_["cat"]
    cat_one_hot_attribs = list(cat_encoder.categories_[0])
    attributes = num_attribs + extra_attribs + cat_one_hot_attribs
    print('sorted(fi, attr):')
    ftri_attribs = sorted(zip(feature_importances, attributes), reverse=True)
    for fa in ftri_attribs:
        print(fa)

    ''' Evaluate the system on the test set '''
    print('Evaluating the test set')
    final_model = grid_search.best_estimator_
    X_test = strat_test_set.drop("median_house_value", axis=1)
    Y_test = strat_test_set["median_house_value"].copy()
    X_test_prepared = full_pipeline.transform(X_test)
    final_predictions = final_model.predict(X_test_prepared)
    final_mse = mean_squared_error(Y_test, final_predictions)
    print('final_mse = ', final_mse)
    final_rmse = np.sqrt(final_mse)  # => evaluates to 47,730.2
    print('final_rmse = ', final_rmse)
    return
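# Optional follow-up sketch (an assumption, not part of the script above): a
# 95% confidence interval around the final test RMSE, computed from the
# per-sample squared errors with scipy.stats. rmse_confidence_interval is an
# illustrative helper name.
import numpy as np
from scipy import stats

def rmse_confidence_interval(y_true, y_pred, confidence=0.95):
    # t-interval on the mean squared error, then mapped to the RMSE scale
    squared_errors = (np.asarray(y_pred) - np.asarray(y_true)) ** 2
    low, high = stats.t.interval(confidence, len(squared_errors) - 1,
                                 loc=squared_errors.mean(),
                                 scale=stats.sem(squared_errors))
    return np.sqrt(low), np.sqrt(high)

# e.g. rmse_confidence_interval(Y_test, final_predictions) inside main()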