# Example #1
# 0
def process_missing_data(housing):
    """Fill missing values in the housing data with per-column medians.

    Background -- three common ways to deal with missing values:
      1. Drop the rows that contain them.
      2. Drop the whole attribute (column).
      3. Fill them in (with 0, the mean, the median, ...).

    With pandas.DataFrame these map to dropna(), drop() and fillna():
        housing.dropna(subset=["total_bedrooms"])    # option 1
        housing.drop("total_bedrooms", axis=1)       # option 2
        median = housing["total_bedrooms"].median()  # compute the median
        housing["total_bedrooms"].fillna(median)     # option 3

    With scikit-learn's Imputer:
        imputer = Imputer(strategy="median")
        housing_num = housing.drop("ocean_proximity", axis=1)
        imputer.fit(housing_num)
        X = imputer.transform(housing_num)

    Args:
        housing: DataFrame with an "ocean_proximity" column. That column is
            dropped before imputing (presumably because it is the only
            non-numeric attribute -- confirm against the dataset).

    Returns:
        A DataFrame with the same columns and index as ``housing`` minus
        "ocean_proximity", holding the values produced by the imputer.
    """
    imputer = Imputer(strategy="median")
    housing_num = housing.drop("ocean_proximity", axis=1)
    # The estimator method is fit(); the original called the non-existent
    # fix(), which raised AttributeError before anything was imputed.
    imputer.fit(housing_num)
    X = imputer.transform(housing_num)
    # transform() hands back a plain array, so restore both the column
    # labels and the original row index to keep rows aligned with `housing`.
    housing_tr = pd.DataFrame(X,
                              columns=housing_num.columns,
                              index=housing_num.index)
    return housing_tr
# Example #2
# 0
def read_data():
    """Load '/datasets/Data.csv' and return preprocessed train/test splits.

    Steps, in order:
      1. Read the CSV; features are every column but the last, the target
         is the column at position 3.
      2. Replace NaN in feature columns 1 and 2 with the column mean.
      3. Label-encode, then one-hot encode, feature column 0.
      4. Label-encode the target.
      5. Split 80/20 with a fixed random_state.
      6. Standardise the features using statistics from the training split.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    dataset = pd.read_csv('/datasets/Data.csv')
    X = dataset.iloc[:, :-1].values
    Y = dataset.iloc[:, 3].values

    imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
    # fit() is the estimator method (the original called a non-existent
    # fix()); the fitted imputer must then be applied, otherwise the NaNs
    # remain in X and the later steps operate on incomplete data.
    imputer = imputer.fit(X[:, 1:3])
    X[:, 1:3] = imputer.transform(X[:, 1:3])

    labelencoder_X = LabelEncoder()
    X[:, 0] = labelencoder_X.fit_transform(X[:, 0])

    onehotencoder = OneHotEncoder(categorical_features=[0])
    X = onehotencoder.fit_transform(X).toarray()
    labelencoder_Y = LabelEncoder()
    Y = labelencoder_Y.fit_transform(Y)

    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=0.2,
                                                        random_state=0)

    sc_X = StandardScaler()
    X_train = sc_X.fit_transform(X_train)
    # Reuse the scaling learned on the training split; re-fitting on the
    # test split would scale it with different statistics.
    X_test = sc_X.transform(X_test)
    return X_train, X_test, Y_train, Y_test
#Categorical Data

#Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#Importing the dataset
# Data.csv is read relative to the current working directory.
dataset = pd.read_csv("Data.csv")
# Features: every column except the last one, as a plain array.
x = dataset.iloc[:, :-1].values
# Target: the column at position 3.
# NOTE(review): this equals the last column only if the file has exactly
# four columns -- confirm against Data.csv.
y = dataset.iloc[:, 3].values

#Taking care of missing data
from sklearn.preprocessing import Imputer
# Replace NaN entries with the mean of their column (axis=0).
imputer = Imputer(missing_values="NaN", strategy="mean", axis=0)
# fit() has to be called on the instance created above; the original called
# a non-existent fix() on the Imputer class itself.
imputer = imputer.fit(x[:, 1:3])
# Only feature columns 1 and 2 are imputed, in place.
x[:, 1:3] = imputer.transform(x[:, 1:3])

#Encoding Categorical Data
#Encoding the Independent Variable
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Step 1: replace the labels in feature column 0 with integer codes.
labelencoder_x = LabelEncoder()
x[:, 0] = labelencoder_x.fit_transform(x[:, 0])
# Step 2: one-hot encode that same column (index 0) and rebind x to the
# encoder's output converted with toarray().
onehotencoder = OneHotEncoder(categorical_features=[0])
x = onehotencoder.fit_transform(x).toarray()

#Encoding the Dependent Variable
# The target only gets integer codes; no one-hot step is applied to it.
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)