def split_train_val_test(df):
    """Split ``df`` into train, validation and test frames.

    Two successive calls to ``get_train_test_1fold`` are made: the first
    sets the test portion aside, the second divides the remainder into
    training and validation portions. Every portion is then passed
    through ``keep_index_and_1diagnose_columns`` together with the
    ``'Instance labels'`` column name.

    Args:
        df: The frame to split.

    Returns:
        A ``(train, val, test)`` tuple of the reduced portions.
    """
    label_column = 'Instance labels'

    remainder, test_part = get_train_test_1fold(df)
    train_part, val_part = get_train_test_1fold(remainder)

    reduced_train, reduced_test, reduced_val = (
        keep_index_and_1diagnose_columns(part, label_column)
        for part in (train_part, test_part, val_part)
    )
    return reduced_train, reduced_val, reduced_test
def construct_train_test_cv(df, nr_cv, split):
    """Build the train, validation and test sets for one CV split.

    The frame is first divided into a train+validation part and a test
    part using the index collections selected by ``split``; the
    train+validation part is then divided again the same way into
    training and validation parts. Each part is finally passed through
    ``keep_index_and_1diagnose_columns`` with the ``'Instance labels'``
    column name.

    Args:
        df: The frame to split.
        nr_cv: Value forwarded to ``split_data_cv`` for both splits.
        split: Position of the split to pick from the index collections.

    Returns:
        A ``(train_set, val_set, test_set)`` tuple.
    """
    # NOTE(review): this two-argument call, whose results are indexed by
    # ``split``, does not match the ``split_data_cv`` visible in this
    # file (five required parameters) -- confirm which helper is meant.
    outer_train_idx, outer_test_idx = split_data_cv(df, nr_cv)
    trainval_frame, test_frame = get_rows_from_indices(
        df, outer_train_idx[split], outer_test_idx[split])

    inner_train_idx, inner_val_idx = split_data_cv(trainval_frame, nr_cv)
    train_frame, val_frame = get_rows_from_indices(
        trainval_frame, inner_train_idx[split], inner_val_idx[split])

    label_column = 'Instance labels'
    train_set, val_set, test_set = (
        keep_index_and_1diagnose_columns(frame, label_column)
        for frame in (train_frame, val_frame, test_frame)
    )
    return train_set, val_set, test_set
def split_data_cv(df, splits_nr, current_split, random_seed, diagnose_col,
                  ratio_to_keep=None):
    """Return the train and validation frames for one cross-validation split.

    ``df`` is first filtered with ``filter_rows_on_class`` for
    ``diagnose_col``; the filtered frame is then handed to
    ``split_test_train_cv`` and the index sets at position
    ``current_split`` are used to select the rows.

    Args:
        df: The full frame.
        splits_nr: Number of splits, forwarded to ``split_test_train_cv``.
        current_split: Position of the split to materialize.
        random_seed: Forwarded as ``random_state`` to the splitter.
        diagnose_col: Class name used to filter the rows of ``df``.
        ratio_to_keep: Not used by this function; kept so existing
            callers that pass it keep working.

    Returns:
        A ``(train, val)`` tuple, each passed through
        ``keep_index_and_1diagnose_columns`` with ``'instance labels'``.
    """
    df_train_val = filter_rows_on_class(df, class_name=diagnose_col)
    train_inds_coll, val_inds_coll = split_test_train_cv(
        df_train_val, splits_nr, test_ratio=0.2, random_state=random_seed)

    # The split is computed on the filtered frame, so its positions must be
    # applied to that same frame. Applying them to the unfiltered ``df``
    # selects unrelated rows whenever the filter dropped anything.
    # (Assumes the splitter returns positions relative to its input, as the
    # use of ``.iloc`` implies -- TODO confirm.)
    df_train = df_train_val.iloc[train_inds_coll[current_split]]
    df_val = df_train_val.iloc[val_inds_coll[current_split]]

    df_train_final = keep_index_and_1diagnose_columns(df_train,
                                                      'instance labels')
    df_val_final = keep_index_and_1diagnose_columns(df_val, 'instance labels')
    return df_train_final, df_val_final
def prepare_mura_set(df_train_val, test_df_all_classes, class_name):
    """Prepare the train, validation and test sets for one class.

    ``df_train_val`` is divided by ``split_train_val_set`` (only its third
    and fourth results are used), the resulting frames and
    ``test_df_all_classes`` are filtered for ``class_name`` by
    ``filter_all_set_for_class``, and each filtered frame is passed
    through ``keep_index_and_1diagnose_columns`` with
    ``'instance labels'``. The shape of every resulting set is printed.

    Args:
        df_train_val: Frame holding the combined train/validation data.
        test_df_all_classes: Frame holding the test data for all classes.
        class_name: Class to filter all three sets for.

    Returns:
        A ``(train, val, test)`` tuple of the final frames.
    """
    split_result = split_train_val_set(df_train_val)
    _, _, train_all_classes, val_all_classes = split_result

    class_train, class_val, class_test = filter_all_set_for_class(
        train_all_classes, val_all_classes, test_df_all_classes, class_name)

    label_column = 'instance labels'
    final_train, final_val, final_test = (
        keep_index_and_1diagnose_columns(frame, label_column)
        for frame in (class_train, class_val, class_test)
    )

    report = (
        ('Training set: ', final_train),
        ('Validation set: ', final_val),
        ('Classification testing set: ', final_test),
    )
    for caption, frame in report:
        print(caption + str(frame.shape))

    return final_train, final_val, final_test
def split_filter_data(config, df):
    """Split a dataframe into train, validation and test subsets and
    filter unnecessary columns.

    Args:
        config: Mapping that provides ``'results_path'`` and
            ``'class_name'``.
        df: The frame to split.

    Returns:
        A ``(train, val, test)`` tuple. When ``class_name`` is not
        ``None`` each subset has been passed through
        ``ld.keep_index_and_1diagnose_columns`` with the column name
        ``class_name + '_loc'``; otherwise the subsets are returned as
        produced by ``ld.get_train_test``.

    Raises:
        KeyError: If ``config`` lacks one of the required keys.
    """
    results_path = config['results_path']
    class_name = config['class_name']

    print("Splitting data ...")
    df_train, df_val, df_test = ld.get_train_test(
        df, random_state=1, do_stats=False, res_path=results_path,
        label_col=class_name)

    # Without a class name there is no label column to filter on. Build the
    # column name only after this check: concatenating ``None`` with a
    # string raises ``TypeError``, and skipping the filter must still leave
    # something defined to return.
    if class_name is None:
        return df_train, df_val, df_test

    label_patches = class_name + '_loc'
    df_train_filtered_cols = ld.keep_index_and_1diagnose_columns(
        df_train, label_patches)
    df_val_filtered_cols = ld.keep_index_and_1diagnose_columns(
        df_val, label_patches)
    df_test_filtered_cols = ld.keep_index_and_1diagnose_columns(
        df_test, label_patches)
    return df_train_filtered_cols, df_val_filtered_cols, df_test_filtered_cols
def filter_rows_and_columns(df, class_name):
    """Filter ``df`` by class, then reduce its columns.

    The frame is filtered with ``filter_rows_on_class`` for
    ``class_name`` and the result is passed through
    ``keep_index_and_1diagnose_columns`` with ``'instance labels'``.

    Args:
        df: The frame to filter.
        class_name: Class name forwarded to ``filter_rows_on_class``.

    Returns:
        Whatever ``keep_index_and_1diagnose_columns`` returns for the
        filtered frame.
    """
    class_rows = filter_rows_on_class(df, class_name=class_name)
    reduced = keep_index_and_1diagnose_columns(class_rows, 'instance labels')
    return reduced