def process_missing_data(housing):
    """Fill missing values in ``housing`` with the median of each column.

    Three common ways of dealing with missing data:

    1. Drop the rows that contain the missing value.
    2. Drop the whole attribute (column).
    3. Fill in a value (0, the mean, the median, ...).

    With ``pandas.DataFrame`` these map onto ``dropna()``, ``drop()`` and
    ``fillna()``::

        housing.dropna(subset=["total_bedrooms"])     # option 1
        housing.drop("total_bedrooms", axis=1)        # option 2
        median = housing["total_bedrooms"].median()   # compute the median
        housing["total_bedrooms"].fillna(median)      # option 3

    With scikit-learn's ``Imputer``::

        imputer = Imputer(strategy="median")
        housing_num = housing.drop("ocean_proximity", axis=1)
        imputer.fit(housing_num)
        X = imputer.transform(housing_num)

    This function applies the scikit-learn variant. ``pd`` and ``Imputer``
    are not imported here and must be available at module scope.

    Args:
        housing: DataFrame with an ``"ocean_proximity"`` column. That column
            is dropped before imputation (presumably because it is the only
            non-numeric attribute -- confirm against the dataset).

    Returns:
        A DataFrame holding the imputed values, with the columns and row
        index of ``housing`` minus ``"ocean_proximity"``.
    """
    imputer = Imputer(strategy="median")
    housing_num = housing.drop("ocean_proximity", axis=1)

    # Estimators learn their statistics with ``fit``; ``fix`` does not exist.
    imputer.fit(housing_num)
    X = imputer.transform(housing_num)

    # ``transform`` returns a plain array, so re-attach the labels. Keeping
    # the index lets the result be aligned with the original frame.
    housing_tr = pd.DataFrame(
        X, columns=housing_num.columns, index=housing_num.index
    )
    return housing_tr
def read_data():
    """Load ``/datasets/Data.csv`` and return scaled train/test splits.

    Pipeline:

    1. Features are every column except the last; the target is column 3.
    2. Missing values in feature columns 1 and 2 are replaced by the
       column mean.
    3. Feature column 0 is label-encoded and then one-hot encoded; the
       target is label-encoded.
    4. The data is split 80/20 with ``random_state=0``.
    5. Features are standardised using statistics learned from the
       training split only.

    ``pd``, ``Imputer``, ``LabelEncoder``, ``OneHotEncoder``,
    ``train_test_split`` and ``StandardScaler`` are not imported here and
    must be available at module scope.

    Returns:
        tuple: ``(X_train, X_test, Y_train, Y_test)``.
    """
    dataset = pd.read_csv('/datasets/Data.csv')
    X = dataset.iloc[:, :-1].values
    Y = dataset.iloc[:, 3].values

    # Handle missing data: learn the column means, then actually apply them.
    imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
    imputer = imputer.fit(X[:, 1:3])
    X[:, 1:3] = imputer.transform(X[:, 1:3])

    # Encode the categorical feature in column 0.
    labelencoder_X = LabelEncoder()
    X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
    onehotencoder = OneHotEncoder(categorical_features=[0])
    X = onehotencoder.fit_transform(X).toarray()

    # Encode the target labels.
    labelencoder_Y = LabelEncoder()
    Y = labelencoder_Y.fit_transform(Y)

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=0
    )

    # Fit the scaler on the training split only; re-fitting on the test
    # split would scale it with different statistics than the training data.
    sc_X = StandardScaler()
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)

    return X_train, X_test, Y_train, Y_test
# Categorical Data

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Importing the dataset: features are every column except the last,
# the target is column 3.
dataset = pd.read_csv("Data.csv")
x = dataset.iloc[:, :-1].values
y = dataset.iloc[:, 3].values

# Handling missing data: replace missing values in feature columns 1 and 2
# with the column mean.
imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
# ``fit`` must be called on the instance; the class has no ``fix`` method.
imputer = imputer.fit(x[:, 1:3])
x[:, 1:3] = imputer.transform(x[:, 1:3])

# Encoding Categorical Data
# Encoding the Independent Variable: label-encode column 0, then one-hot it.
labelencoder_x = LabelEncoder()
x[:, 0] = labelencoder_x.fit_transform(x[:, 0])
onehotencoder = OneHotEncoder(categorical_features=[0])
x = onehotencoder.fit_transform(x).toarray()

# Encoding the Dependent Variable
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)