import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the raw data set from disk.
dataset = pd.read_csv("Dataset/Data.csv")

# Feature matrix: every column except the last; target: column index 3.
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 3].values
print(X)
print(Y)

# Taking care of missing data: replace each NaN with the mean of the other
# observations in the same column.
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22; its
# replacement `SimpleImputer` has no `axis` argument and always works
# column-wise (the old `axis=0` behaviour).
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
# The original called `imputer.axis(X)`, which tries to call an integer and
# raises TypeError. Fit on columns 1 and 2 only and write the imputed values
# back in place, leaving column 0 untouched.
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
# Feature matrix: every column except the last; target: column index 3.
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# ----------------------------------------------------------------------------------
# Managing missing values
# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22.
# `SimpleImputer` replaces it: NaN is its default missing-value marker and it
# always imputes column-wise, so only the strategy has to be given. Options
# are passed as keyword arguments to the constructor instead of being set as
# attributes afterwards.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="mean")
imputer = imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])

# ----------------------------------------------------------------------------------
# Encoding X and y in numeric form
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Map the values of column 0 to integer codes; the fitted encoder is kept so
# the codes can be mapped back through `labelencoder_X.classes_`.
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

# `OneHotEncoder(categorical_features=[0])` was removed in scikit-learn 0.22.
# A ColumnTransformer selects column 0 for one-hot encoding instead and passes
# the remaining columns through unchanged. The encoded columns come first,
# followed by the passthrough columns, which is the same layout the old
# encoder produced.
onehotencoder = OneHotEncoder()
column_transformer = ColumnTransformer(
    [("onehot", onehotencoder, [0])],
    remainder="passthrough",
    sparse_threshold=0,  # always return a dense array (replaces .toarray())
)
# ColumnTransformer fits a clone of `onehotencoder`; the fitted instance is
# available as `column_transformer.named_transformers_["onehot"]`.
# X is cast to float so the result has the same dtype the old
# `.toarray()` call returned.
X = column_transformer.fit_transform(X).astype(float)

# Encode the target labels as integer codes.
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)

# ----------------------------------------------------------------------------------
# Splitting the dataset into the Training set and Test set