def python_random_split(data, ratio=0.75, seed=42):
    """Pandas random splitter.

    Randomly shuffles and splits the input DataFrame.

    Args:
        data (pd.DataFrame): Pandas DataFrame to be split.
        ratio (float or list): Ratio for splitting data. A single float
            produces a train/test pair where the float is the train
            fraction; a list of floats produces one portion per entry.
            List ratios that do not sum to 1 are normalized upstream.
        seed (int): Seed for the random shuffle.

    Returns:
        list: Splits of the input data as pd.DataFrame.
    """
    multi_split, ratio = process_split_ratio(ratio)

    # Single-ratio case: delegate straight to sklearn's splitter.
    if not multi_split:
        return sk_split(data, test_size=None, train_size=ratio, random_state=seed)

    # Multi-ratio case: the helper tags rows with a 'split_index' column,
    # which callers should not see — strip it from every portion.
    parts = split_pandas_data_with_ratios(data, ratio, shuffle=True, seed=seed)
    return [part.drop('split_index', axis=1) for part in parts]
def main():
    """Train a random forest on the heart-disease CSV and report results.

    Loads features/labels, holds out 30% for testing, fits a
    depth-limited random forest, then prints the accuracy and writes
    feature-importance and tree visualizations to the output directory.
    """
    csv_path = 'heart.csv'
    out_dir = "test-output/"
    features, labels = Read_data(csv_path)

    # 30% hold-out split; fixed random_state keeps the split reproducible.
    x_train, x_test, y_train, y_test = sk_split(
        features, labels, test_size=0.3, random_state=10)

    # Cap tree depth and leaf size to limit overfitting.
    forest = clf_rtree(max_depth=5, min_samples_leaf=4,
                       n_estimators=10, random_state=10)
    forest.fit(x_train, y_train)

    predictions = forest.predict(x_test)

    # True/false positive breakdown, sensitivity, ROC.
    t_f_postive(y_test, predictions)

    # Accuracy on the held-out set.
    scor = forest.score(x_test, y_test)
    print('scor: ', scor)

    # Per-feature influence on the model.
    var_influence(forest, x_test, out_dir)

    # Render the forest's decision trees.
    draw_tree(forest, x_train.columns, out_dir)
def train(model, x, y):
    """Compile and train a Keras binary classifier, then report results.

    Splits (x, y) 70/30, one-hot encodes the binary labels, trains with
    Adam + categorical crossentropy while logging each epoch to
    'model_save/training.csv' and periodically checkpointing the best
    weights, then plots the training curve and runs analysis.

    Args:
        model: An uncompiled Keras model accepting the feature matrix.
        x: Feature data.
        y: Binary (0/1) target labels.
    """
    record_path = 'model_save/training.csv'

    # 30% hold-out split; fixed random_state keeps the split reproducible.
    train_x, test_x, train_y, test_y = sk_split(
        x, y, test_size=0.3, random_state=10)

    # One-hot encode the two-class labels for categorical crossentropy.
    train_y = keras.utils.to_categorical(train_y, 2)
    test_y = keras.utils.to_categorical(test_y, 2)

    adam = Adam(lr=0.05, decay=3e-4)
    model.compile(loss='categorical_crossentropy', optimizer=adam,
                  metrics=['accuracy'])

    # Log every epoch to CSV so the run can be visualized afterwards.
    csv_logger = CSVLogger(record_path)

    filepath = 'model_save'
    # Save the best weights (by validation metric) every 10 epochs.
    checkpointer = ModelCheckpoint(
        filepath=filepath + '/weights-{val_acc:.2f}.hdf5',
        verbose=1, save_best_only=True, period=10)

    # BUG FIX: this fit call was commented out, so the model was never
    # trained, training.csv was never written, and no checkpoints were
    # saved — which breaks train_plot() below.
    model.fit(train_x, train_y, batch_size=20, epochs=500,
              validation_data=(test_x, test_y), verbose=1,
              callbacks=[csv_logger, checkpointer])

    train_plot(record_path)
    analizy(filepath, x, y)