示例#1
0
def normalize(features, start=4, stop=45):
    """Min-max scale a positional slice of feature columns.

    Parameters
    ----------
    features : pd.DataFrame
        Input frame. Columns at positions [start:stop] are scaled to [0, 1].
    start, stop : int, optional
        Positional bounds of the column slice to normalize. The defaults
        (4, 45) preserve the previously hard-coded behavior.

    Returns
    -------
    pd.DataFrame
        The scaled columns (with 'total_cases' removed if it fell inside
        the slice) plus the untouched 'month', 'weekofyear' and 'odd_year'
        columns copied over from ``features``.
    """
    cols = features.columns[start:stop]
    scaled = MinMaxScaler().fit_transform(features[cols])
    features_n = pd.DataFrame(scaled, columns=cols, index=features.index)

    # The target column must never be scaled in as a feature.
    if 'total_cases' in features_n.columns:
        features_n = features_n.drop(columns=['total_cases'])

    # Calendar features are kept on their original (unscaled) values.
    for col in ('month', 'weekofyear', 'odd_year'):
        features_n[col] = features[col]

    return features_n
示例#2
0
# Replace missing values in the 'Age' attribute with the column mean.
# NOTE(review): sklearn's `Imputer` was removed in scikit-learn 0.22;
# `sklearn.impute.SimpleImputer(strategy='mean')` is the drop-in successor.
imp = Imputer(strategy='mean')
data['Age'] = imp.fit_transform(data[['Age']])
index = data.columns

# Sanity check: no missing values should remain.
# print(data.isnull().sum())

# Normalization: MinMaxScaler returns a bare ndarray, so rebuild the
# DataFrame and restore the original column labels saved in `index`.
data = MinMaxScaler().fit_transform(data)
data = pd.DataFrame(data)
data.columns = index

# Split attributes (X) from the target class (y).
X = data.drop(['Survived'], axis=1)
y = data['Survived']

# Flag outliers with an Isolation Forest (predict() returns -1 for them).
FS = IsolationForest()
FS.fit(X)

# FS = EllipticEnvelope()
# FS.fit(X)

outliers = FS.predict(X)

# Row positions of the detected outliers. A comprehension avoids the old
# `for index, num in enumerate(...)` loop, which clobbered the `index`
# name still holding the saved column labels.
drop = [i for i, num in enumerate(outliers.tolist()) if num == -1]
示例#3
0
        Ticket.append("X")
# One-hot encode the ticket prefixes collected above.
df["Ticket"] = Ticket
df = pd.get_dummies(df, columns=["Ticket"], prefix="T")

# Cabin: bucket by its first character ('X' when missing), then one-hot.
first_letters = []
for cabin in df["Cabin"]:
    first_letters.append('X' if pd.isnull(cabin) else cabin[0])
df["Cabin"] = pd.Series(first_letters)
df = pd.get_dummies(df, columns=["Cabin"], prefix="Cabin")

# Embarked and Pclass: one-hot as well. Pclass is numeric, so cast it to
# a categorical dtype first so get_dummies will expand it.
df = pd.get_dummies(df, columns=["Embarked"], prefix="Em")
df["Pclass"] = df["Pclass"].astype("category")
df = pd.get_dummies(df, columns=["Pclass"], prefix="Pc")

# The raw Name column carries no direct model signal here; drop it.
df.drop(labels=["Name"], axis=1, inplace=True)

# Verify no remaining missing values, then preview the frame.
na_check(df)
df.head()

# Scale every column to [0, 1]. The result is a plain ndarray.
# NOTE(review): the scaler is fit on train+test combined — a mild form of
# data leakage; consider fitting on the training rows only.
df = MinMaxScaler().fit_transform(df)

# Split the combined array back into train / test partitions by row count.
train_num = train_Y.shape[0]
train_X, test_X = df[:train_num], df[train_num:]

# 使用三種模型 : 邏輯斯迴歸 / 梯度提升機 / 隨機森林, 參數使用 Random Search 尋找
from sklearn.linear_model import LogisticRegression