def main():
    if len(sys.argv) == 3:
        database_filepath, model_filepath = sys.argv[1:]
        print('Loading data...\n DATABASE: {}'.format(database_filepath))
        X, y, category_names = load_data(database_filepath)

        mlss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.33, random_state=42)
        for train_index, test_index in mlss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y.values[train_index], y.values[test_index]
            y_train = pd.DataFrame(y_train, columns=category_names)
            y_test = pd.DataFrame(y_test, columns=category_names)

        print('Building model...')
        model = build_model()

        print('Training model...')
        model.fit(X_train, y_train)

        print('Evaluating model...')
        evaluate_model(model, X_test, y_test, category_names)

        print('Saving model...\n MODEL: {}'.format(model_filepath))
        save_model(model, model_filepath)

        print('Trained model saved!')
    else:
        print('Please provide the filepath of the disaster messages database '
              'as the first argument and the filepath of the pickle file to '
              'save the model to as the second argument. \n\nExample: python '
              'train_classifier.py ../data/DisasterResponse.db classifier.pkl')
def evaluate(model):
    callbacks = [
        EarlyStopping(
            # Stop training when loss is no longer improving
            monitor="loss",
            # "no longer improving" being defined as "no better than 1e-5 less"
            min_delta=1e-5,
            # "no longer improving" being further defined as "for at least 2 epochs"
            patience=2,
            verbose=1,
        )
    ]
    kfold = MultilabelStratifiedShuffleSplit(n_splits=4, random_state=seed, test_size=.2)
    scores = np.empty(0)
    for train, test in kfold.split(X, Y):
        X_train, X_test = X[train], X[test]
        Y_train, Y_test = Y[train], Y[test]
        model.fit(
            X_train,
            Y_train,
            epochs=1,
            # epochs=50,
            callbacks=callbacks,
        )  # change epochs to 14
        Y_pred = model.predict(X_test)
        control_mask = X_test['cp_type'] == 'ctl_vehicle'
        Y_pred[control_mask, :] = 0
        # scores = np.append(scores, model.evaluate(X_test, Y_test))
        scores = np.append(scores, log_loss(Y_test, Y_pred, labels=[0, 1]))
    return scores.mean()
def MultiStratifiedShuffleSplit(images, annotations, test_size):
    # count categories per image
    categories_per_image = defaultdict(Counter)
    max_id = 0
    for ann in annotations:
        categories_per_image[ann['image_id']][ann['category_id']] += 1
        if ann['category_id'] > max_id:
            max_id = ann['category_id']

    # prepare list with count of category objects per image
    all_categories = []
    for cat in categories_per_image.values():
        pair = []
        for i in range(1, max_id + 1):
            pair.append(cat[i])
        all_categories.append(pair)

    # multilabel-stratified-split
    strat_split = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=2020)
    for train_index, test_index in strat_split.split(images, all_categories):
        x = [images[i] for i in train_index]
        y = [images[i] for i in test_index]
        print('Train:', len(x), 'images, valid:', len(y))
    return x, y
def data_split(x, y, test_size=0.5, random_state=30):
    msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    for train_index, test_index in msss.split(x, y):
        X_train, X_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
    return X_train, X_test, y_train, y_test
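# Minimal usage sketch for the data_split helper above (not part of the original
# snippet). It assumes NumPy-array inputs with y as a binary multi-label indicator
# matrix; the toy data below is made up purely for illustration.
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

X_demo = np.arange(20).reshape(10, 2)  # 10 samples, 2 features
y_demo = np.array([[1, 0, 1], [0, 1, 0], [1, 1, 0], [0, 0, 1], [1, 0, 0],
                   [0, 1, 1], [1, 0, 1], [0, 1, 0], [1, 1, 0], [0, 0, 1]])  # 3 labels

X_tr, X_te, y_tr, y_te = data_split(X_demo, y_demo, test_size=0.3, random_state=0)
print(X_tr.shape, X_te.shape)  # label proportions stay roughly equal across the two parts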
def train_test_multilabel_stratified_shuffle_split(dataset, test_size=0.2, random_state=42):
    y = target_to_numpy(dataset['Target'])
    msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    train_idx, valid_idx = list(msss.split(X=dataset, y=y))[0]
    return train_idx, valid_idx
def stratified_train_test_split(X, y):
    msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=config.TEST_SIZE, random_state=42)
    for train_index, test_index in msss.split(X, y):
        # split() yields positional indices, so this assumes X and y carry a default RangeIndex
        X_train, X_test = X.loc[train_index], X.loc[test_index]
        y_train, y_test = y.loc[train_index], y.loc[test_index]
    return X_train, X_test, y_train, y_test
def split2_stratified(df: pd.DataFrame, target_names: List[str], test_size: float,
                      random_state: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
    multi_lbl_arr = df[target_names]
    i_train, i_test = next(msss.split(np.zeros(df.shape[0]), multi_lbl_arr))
    idx_train = [df.index[i] for i in i_train]
    idx_test = [df.index[i] for i in i_test]
    return (copy.deepcopy(df.loc[idx_train, :]),
            copy.deepcopy(df.loc[idx_test, :]))
def multi_label_split_based_on_percentage(df, n_splits, test_percentage, unique_id_column,
                                          target_column, seed):
    """
    :param df: The dataframe in which 1 row = 1 class for multi-label classification
    :param n_splits: how many splits to create
    :param test_percentage: how much should be the test percentage split?
    :param unique_id_column: the column which uniquely identifies the dataframe
    :param target_column: the classes column (multi labels). It has to be numeric
    :param seed: 42
    :return: train and validation dataframes, same as df but with a fold column
    """
    # store unique ids
    unique_ids = df[unique_id_column].unique()

    # find unique classes
    unique_classes = df[target_column].unique()

    # convert the target column into multi-label format
    one_hot_labels = []
    for uid in unique_ids:
        classes = df[df[unique_id_column] == uid][target_column].values
        x = np.eye(len(unique_classes))[classes.astype(int)].sum(0)
        one_hot_labels.append(x)

    # https://github.com/trent-b/iterative-stratification#multilabelstratifiedshufflesplit
    msss = MultilabelStratifiedShuffleSplit(n_splits=n_splits,
                                            train_size=1 - test_percentage,
                                            test_size=test_percentage,
                                            random_state=seed)

    # create train and validation splits
    train_df = pd.DataFrame()
    val_df = pd.DataFrame()

    # X is unique id
    for fold, (train_index, val_index) in enumerate(msss.split(unique_ids, one_hot_labels)):
        train_data = df[df[unique_id_column].isin(unique_ids[train_index])].copy(deep=True)
        val_data = df[df[unique_id_column].isin(unique_ids[val_index])].copy(deep=True)
        train_data["fold"] = fold
        val_data["fold"] = fold
        train_df = pd.concat([train_df, train_data], ignore_index=True)
        val_df = pd.concat([val_df, val_data], ignore_index=True)
    return train_df, val_df
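# Hypothetical call to the helper above (not from the original snippet). The column
# names "image_id" / "class_id" and the toy rows are illustrative only; any long-format
# annotation table with one row per object should work the same way.
import numpy as np
import pandas as pd

toy = pd.DataFrame({
    "image_id": ["a", "a", "b", "b", "c", "c", "d", "d"],
    "class_id": [0, 1, 0, 2, 1, 2, 0, 1],
})
train_part, val_part = multi_label_split_based_on_percentage(
    toy, n_splits=1, test_percentage=0.25,
    unique_id_column="image_id", target_column="class_id", seed=42)
print(train_part["image_id"].nunique(), val_part["image_id"].nunique())  # e.g. 3 and 1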
def train_test_split(X, y, test_size=0.33):
    from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

    msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=0)
    for train_index, test_index in msss.split(X, y):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
    return (X_train, y_train), (X_test, y_test)
def stratified_split():
    print("Splitting train data...")
    msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=92)
    train_df = pd.read_csv("../input/train.csv")
    train_df_orig = train_df.copy()
    X = train_df["id"].tolist()
    y = train_df['attribute_ids'].tolist()
    y = [make_label(cur_y) for cur_y in y]
    for train_index, test_index in msss.split(X, y):
        new_train_df = train_df_orig.loc[train_df_orig.index.intersection(train_index)].copy()
        new_valid_df = train_df_orig.loc[train_df_orig.index.intersection(test_index)].copy()
    new_train_df.to_csv("./data/train_split_90pc.csv", index=False)
    new_valid_df.to_csv("./data/valid_split_10pc.csv", index=False)
    print("Successfully finished!")
def train_test_split(df, target, classes):
    X = []
    Y = []
    for index in range(len(df)):
        image_data = df.loc[index]
        X.append(image_data[target])
        Y.append(image_data[classes].values.tolist())
    X = np.array(X)
    Y = np.array(Y)

    mskf = MultilabelStratifiedShuffleSplit(n_splits=2, test_size=0.15, random_state=0)
    for train_index, test_index in mskf.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
    return X_train, y_train, X_test, y_test
def MultilabelStratifiedShuffleSplit(self, n_splits, test_size, random_state):
    partitions = []
    msss = MultilabelStratifiedShuffleSplit(n_splits=n_splits,
                                            test_size=test_size,
                                            random_state=random_state)
    for train_index, test_index in msss.split(
            self.train_labels["Id"].index.values,
            self.train_labels.drop(columns=['Id', 'Target']).values):
        partition = {}
        partition["train"] = self.train_labels.Id.values[train_index]
        partition["validation"] = self.train_labels.Id.values[test_index]
        partitions.append(partition)
    return partitions
def split_data(threshold):
    from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

    df = pd.read_csv(
        '/Users/kevinmaikjablonka/Dropbox (LSMO)/proj75_mofcolor/ml/data/all.csv'
    )
    excluded = []
    kept = []
    THRESHOLD = 0.03
    if threshold != 255:
        for i, row in df.iterrows():
            if row['color_cleaned_x'] in color_threshold_dict[threshold]:
                kept.append(row)
            else:
                excluded.append(row)
        df_rel = pd.DataFrame(kept)
        df = df_rel.drop_duplicates(subset=CHEMICAL_FEATURES)
    else:
        df = df.drop_duplicates(subset=CHEMICAL_FEATURES)

    r_binned = bin_column(df['r'].values)
    g_binned = bin_column(df['g'].values)
    b_binned = bin_column(df['b'].values)

    mlss = MultilabelStratifiedShuffleSplit(n_splits=1,
                                            train_size=0.85,
                                            test_size=0.15,
                                            random_state=RANDOM_SEED)
    for train_idx, test_idx in mlss.split(
            df, np.hstack([r_binned, g_binned, b_binned])):
        pass

    df_train = df.iloc[train_idx].sample(len(train_idx))
    df_test = df.iloc[test_idx].sample(len(test_idx))

    df_train.to_csv(
        '/Users/kevinmaikjablonka/Dropbox (LSMO)/proj75_mofcolor/ml/data/development_set.csv',
        index=False,
    )
    df_test.to_csv(
        '/Users/kevinmaikjablonka/Dropbox (LSMO)/proj75_mofcolor/ml/data/holdout_set.csv',
        index=False,
    )
def make_strat_folds(df, n_folds: int) -> pd.DataFrame:
    """
    makes iterative stratification of multi label data
    Source: https://github.com/trent-b/iterative-stratification
    """
    msss = MultilabelStratifiedShuffleSplit(n_splits=n_folds, test_size=0.2, random_state=42)
    train_df_orig = df.copy()
    X = train_df_orig['ImageId'].tolist()
    cls_counts = Counter(cls for classes in df['attribute_ids'].str.split() for cls in classes)
    # note: these are ragged lists of label ids; MultilabelStratifiedShuffleSplit expects a
    # binary indicator matrix, so a binarization step (e.g. MultiLabelBinarizer) may be needed
    y = train_df_orig['attribute_ids'].str.split().tolist()
    # print(X, y)
    for train_index, test_index in msss.split(X, y):
        print("TRAIN:", train_index, "TEST:", test_index)
        train_df = train_df_orig.loc[train_df_orig.index.intersection(train_index)].copy()
        valid_df = train_df_orig.loc[train_df_orig.index.intersection(test_index)].copy()
    return train_df, valid_df
def stratification(label_dir):
    print('Stratification...')

    # Define the weights, the SNOMED CT code for the normal class, and equivalent SNOMED CT codes.
    normal_class = '426783006'
    equivalent_classes = [['713427006', '59118001'],
                          ['284470004', '63593006'],
                          ['427172004', '17338001']]

    # Find the label files.
    label_files = load_label_files(label_dir)

    # Load the labels and classes.
    label_classes, labels_onehot, labels = load_labels(label_files, normal_class, equivalent_classes)

    temp = [[] for _ in range(len(labels_onehot))]
    indexes, values = np.where(np.array(labels_onehot).astype(int) == 1)
    for k, v in zip(indexes, values):
        temp[k].append(v)
    labels_int = temp

    X = np.zeros(len(labels_onehot))
    y = labels_onehot

    msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=0)
    for train_index, val_index in msss.split(X, y):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

    print('Saving split index...')
    datasets_distribution(labels_int, [train_index, val_index])
    savemat('model_training/split.mat', {
        'train_index': train_index,
        'val_index': val_index
    })
    print('Stratification done.')
def split(dataset_path, test_size, stratification):
    df = get_csv(dataset_path, name="train")
    img_ids = df["image_id"]

    if stratification == "sklearn":
        train_set, valid_set = train_test_split(df[KEYS],
                                                test_size=test_size,
                                                random_state=SEED,
                                                shuffle=True)
    elif stratification == "sklearn_stratified":
        df['subset'] = np.nan
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=SEED)
        train_indcs, valid_indcs = next(splitter.split(X=img_ids, y=df[KEYS]))
        train_set = df.loc[df.index.intersection(train_indcs)].copy()
        valid_set = df.loc[df.index.intersection(valid_indcs)].copy()
        df.iloc[train_indcs, -1] = 'train'
        df.iloc[valid_indcs, -1] = 'valid'
        df.to_csv(os.path.join(dataset_path, 'train_stratified.csv'), index=None)
    elif stratification == "iterstrat":
        splitter = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=SEED)
        train_indcs, valid_indcs = next(splitter.split(X=img_ids, y=df[KEYS]))
        train_set = df.loc[df.index.intersection(train_indcs)].copy()
        valid_set = df.loc[df.index.intersection(valid_indcs)].copy()
    elif stratification == "skmultilearn":
        splitter = IterativeStratification(
            n_splits=2,
            order=2,
            sample_distribution_per_fold=[test_size, 1.0 - test_size])
        train_indcs, valid_indcs = next(splitter.split(X=img_ids, y=df[KEYS]))
        train_set = df.loc[df.index.intersection(train_indcs)].copy()
        valid_set = df.loc[df.index.intersection(valid_indcs)].copy()
    else:
        raise ValueError("Try something else :)")

    return train_set, valid_set
def split_dataframe_stratify_n(df, split, test_size, eval_size, target_names, random_state=None):
    assert isinstance(target_names, list) and len(target_names) >= 1
    if split is None:
        return Splited(train=df, eval=None, test=None)
    elif split is False:
        msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
        multi_lbl_arr = df[target_names].values
        i_train, i_test = next(msss.split(np.zeros(df.shape[0]), multi_lbl_arr))
        idx_train = [df.index[i] for i in i_train]
        idx_test = [df.index[i] for i in i_test]
        df_test = df.loc[idx_test, :]
        df_train = df.loc[idx_train, :]
        return Splited(train=df_train, eval=None, test=df_test)
    elif split is True:
        msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_state)
        multi_lbl_arr = df[target_names].values
        i_train_eval, i_test = next(msss.split(np.zeros(df.shape[0]), multi_lbl_arr))
        idx_train_eval = [df.index[i] for i in i_train_eval]
        idx_test = [df.index[i] for i in i_test]
        df_test = df.loc[idx_test, :]

        msss2 = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=eval_size, random_state=random_state)
        multi_lbl_arr2 = df.loc[idx_train_eval, target_names].values
        i_train, i_eval = next(msss2.split(np.zeros(df.shape[0] - len(i_test)), multi_lbl_arr2))
        idx_train = [idx_train_eval[i] for i in i_train]
        idx_eval = [idx_train_eval[i] for i in i_eval]
        df_eval = df.loc[idx_eval, :]
        df_train = df.loc[idx_train, :]
        return Splited(train=df_train, eval=df_eval, test=df_test)
def data_prep(self, N_splits=10, Test_size=0.3, Val_size=0.2, Batch_size=32):
    rand_sta = 333
    msss = MultilabelStratifiedShuffleSplit(n_splits=N_splits, test_size=Test_size, random_state=rand_sta)
    # msss = MultilabelStratifiedKFold(n_splits=n_fold, random_state=rand_sta)
    train_list = []
    test_list = []
    for train_index, test_index in msss.split(self.X, self.Y):
        train_list.append(train_index)
        test_list.append(test_index)

    x_train_tmp = self.X.to_numpy()[train_list[0]]
    y_train_tmp = self.Y.to_numpy()[train_list[0]]
    l_train_tmp = self.L.to_numpy()[train_list[0]]
    x_test = self.X.to_numpy()[test_list[0]]
    y_test = self.Y.to_numpy()[test_list[0]]
    l_test = self.L.to_numpy()[test_list[0]]

    msss_cv = MultilabelStratifiedShuffleSplit(n_splits=N_splits, test_size=Val_size, random_state=rand_sta)
    train_list = []
    val_list = []
    for train_index, val_index in msss_cv.split(x_train_tmp, y_train_tmp):
        train_list.append(train_index)
        val_list.append(val_index)

    x_train = x_train_tmp[train_list[0]]
    y_train = y_train_tmp[train_list[0]]
    l_train = l_train_tmp[train_list[0]]
    x_val = x_train_tmp[val_list[0]]
    y_val = y_train_tmp[val_list[0]]
    l_val = l_train_tmp[val_list[0]]

    self.x_train = x_train
    self.y_train = y_train
    self.x_val = x_val
    self.y_val = y_val
    self.x_test = x_test
    self.y_test = y_test
    self.l_train = l_train
    self.l_test = l_test
    self.l_val = l_val
    self.batch_size = Batch_size
    self.makedivisible_to_all()
def split_files(segment_names, labels, durations, segments_timestamps, config):
    r"""Make stratified (multilabel) training, testing and validation split of the segments,
        and add a prefix to the segment name to indicate to which set it belongs.

        Args:
            segment_names (np.ndarray): List of segment audio files.
            labels (np.ndarray): audio segments labels
            durations (np.ndarray): audio segments labelled portion durations.
            segments_timestamps (np.ndarray): Time stamps of the audio events in the segments.
            config (dict): Configuration dictionary

        Returns:
            The updated names of the audio segments.
    """
    # Stratified split
    tr_sss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=53)
    train_index, dev_test_index = next(tr_sss.split(np.zeros(labels.shape[0]), labels))

    dev_test_sss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=32)
    dev_index, test_index = next(
        dev_test_sss.split(np.zeros(labels[dev_test_index].shape[0]), labels[dev_test_index]))

    def array_to_list(array):
        r"""Recursive implementation of tolist function for numpy array"""
        if isinstance(array, np.ndarray):
            return array_to_list(array.tolist())
        elif isinstance(array, list):
            return [array_to_list(item) for item in array]
        elif isinstance(array, tuple):
            return tuple(array_to_list(item) for item in array)
        else:
            return array

    for idx, file in enumerate(segment_names):
        with open(os.path.join(config['output_folder'], os.path.splitext(file)[0] + '.json'), 'w') as json_file:
            json.dump({'label': labels[idx].tolist(),
                       'durations': durations[idx].tolist(),
                       'timestamps': array_to_list(segments_timestamps[idx].tolist())},
                      json_file)

    with open(os.path.join(config['output_folder'], 'config.json'), 'w') as json_file:
        json.dump({'classes': config['classes'],
                   'sampling_rate': config['sampling_rate'],
                   'length_segments_s': config['length_segments_s'],
                   'Training_files': array_to_list(segment_names[train_index]),
                   'Testing_files': array_to_list(segment_names[dev_test_index][dev_index]),
                   'Validation_files': array_to_list(segment_names[dev_test_index][test_index])},
                  json_file)
def save_train_dev_test_split(train_path, dev_path, test_path, stft_magnitudes, stft_phases,
                              mel_spectrograms, labels, segment_names, durations):
    r"""Compute stratified (multilabel) training, testing and validation split and save the
        split features to hdf5.

        Args:
            train_path (str): path to hdf5 file to save the data of the training set
            dev_path (str): path to hdf5 file to save the data of the testing set.
            test_path (str): path to hdf5 file to save the data of the validation set.
            stft_magnitudes (np.ndarray): Segments magnitudes
            stft_phases (np.ndarray): Segments phases
            mel_spectrograms (np.ndarray): Segments mel Spectrograms
            labels (np.ndarray): Segments labels
            segment_names (np.ndarray): Segments filenames
            durations (np.ndarray): Segments labelled portion durations
    """
    tr_sss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=53)
    train_index, dev_test_index = next(tr_sss.split(np.zeros(stft_magnitudes.shape[0]), labels))

    dev_test_sss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=32)
    dev_index, test_index = next(
        dev_test_sss.split(np.zeros(stft_magnitudes[dev_test_index].shape[0]), labels[dev_test_index]))

    save_features(train_path,
                  stft_magnitudes=stft_magnitudes[train_index],
                  stft_phases=stft_phases[train_index],
                  mel_spectrograms=mel_spectrograms[train_index],
                  labels=labels[train_index],
                  segment_names=segment_names[train_index],
                  durations=durations[train_index])

    save_features(dev_path,
                  stft_magnitudes=stft_magnitudes[dev_test_index][dev_index],
                  stft_phases=stft_phases[dev_test_index][dev_index],
                  mel_spectrograms=mel_spectrograms[dev_test_index][dev_index],
                  labels=labels[dev_test_index][dev_index],
                  segment_names=segment_names[dev_test_index][dev_index],
                  durations=durations[dev_test_index][dev_index])

    save_features(test_path,
                  stft_magnitudes=stft_magnitudes[dev_test_index][test_index],
                  stft_phases=stft_phases[dev_test_index][test_index],
                  mel_spectrograms=mel_spectrograms[dev_test_index][test_index],
                  labels=labels[dev_test_index][test_index],
                  segment_names=segment_names[dev_test_index][test_index],
                  durations=durations[dev_test_index][test_index])
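# Standalone illustration of the two-stage split used in the two functions above (not part
# of the original snippets): a first 80/20 split, then the 20% portion is halved, giving
# roughly 80/10/10 overall. The toy label matrix is made up; only the proportions matter.
import numpy as np
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

rng = np.random.default_rng(0)
labels_demo = (rng.random((100, 4)) > 0.6).astype(int)  # 100 segments, 4 event classes

first = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=53)
train_idx, dev_test_idx = next(first.split(np.zeros(labels_demo.shape[0]), labels_demo))

second = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=32)
dev_idx, test_idx = next(second.split(np.zeros(len(dev_test_idx)), labels_demo[dev_test_idx]))

print(len(train_idx), len(dev_test_idx[dev_idx]), len(dev_test_idx[test_idx]))  # ~80, ~10, ~10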
def __init__(self, split, args, transforms=None, test_transforms=None, channels="g",
             debug=False, n_samples=None):
    self.split = split
    self.transforms = transforms
    self.test_transforms = test_transforms if test_transforms else None
    self.image_channels = channels
    self.full_size = args.full_size
    self.debug = debug
    self.n_classes = 28
    self.resize = tfms.Resize(args.img_size, args.img_size) if args.img_size is not None else None
    self.base_path = args.primary_datapath if not args.full_size else args.fullsize_datapath
    self.n_samples = n_samples
    if self.debug:
        self.n_samples = 128

    # check for valid image mode
    if not (set(self.image_channels) <= set("rgby")):
        raise ValueError("Invalid image channels selection.")

    # split the training set into training and validation
    if split in ["train", "val", "trainval"]:
        with open(os.path.join(self.base_path, 'train.csv'), 'r') as f:
            csvreader = csv.reader(f)
            data = list(csvreader)[1:]
        label_lookup = {k: np.array(v.split(' ')) for k, v in data}
        ids = sorted(list(label_lookup.keys()))
        lbls = [self.encode_label(label_lookup[k]) for k in ids]
        ids = np.asarray(ids).reshape(-1, 1)
        lbls = np.asarray(lbls)

        msss = MultilabelStratifiedShuffleSplit(n_splits=1,
                                                train_size=args.trainval_ratio,
                                                test_size=None,
                                                random_state=0)
        train_inds, val_inds = list(msss.split(ids, lbls))[0]
        train_ids = ids[train_inds].flatten().tolist()
        val_ids = ids[val_inds].flatten().tolist()
        ids = ids.flatten().tolist()

        # if using external data, add it
        self.source_lookup = {i: "trainval" for i in ids}
        if args.use_external:
            with open(os.path.join(args.primary_datapath, 'external.csv'), 'r') as f:
                csvreader = csv.reader(f)
                external_data = list(csvreader)[1:]
            external_label_lookup = {k: np.array(v.split(' ')) for k, v in external_data}
            external_ids = sorted(list(external_label_lookup.keys()))
            self.source_lookup.update({i: "external" for i in external_ids})
            label_lookup.update(external_label_lookup)
            ids = ids + external_ids
            train_ids = train_ids + external_ids

        # select data
        if self.split == "train":
            self.data = [(i, label_lookup[i]) for i in train_ids]
        elif self.split == "val":
            self.data = [(i, label_lookup[i]) for i in val_ids]
        elif self.split == "trainval":
            self.data = [(i, label_lookup[i]) for i in ids]
    elif self.split == "test":
        with open(os.path.join(self.base_path, 'sample_submission.csv'), 'r') as f:
            lines = list(csv.reader(f))[1:]
        test_ids = [line[0] for line in lines]
        self.data = [(i, None) for i in test_ids]
        self.test_ids = test_ids
        self.source_lookup = {i: "test" for i in test_ids}
    else:
        raise Exception("Invalid dataset split.")

    # subsampling
    if self.n_samples is not None and self.n_samples < len(self.data):
        self.data = random.sample(self.data, self.n_samples)

    # class and example weighting
    if self.split == "train" or self.split == "trainval":
        labels = [self.encode_label(l[1]) for l in self.data]
        self.class_weights = np.sum(labels, axis=0).astype(np.float32)
        self.class_weights[self.class_weights == 0] = np.inf
        self.class_weights = self.class_weights[self.class_weights != np.inf].max() / self.class_weights
        self.class_weights = self.class_weights / self.n_classes
        self.example_weights = np.asarray(labels) * self.class_weights[np.newaxis, :]
        self.example_weights = np.sum(self.example_weights, axis=1)
        self.class_weights = torch.tensor(self.class_weights, dtype=torch.float32)
        self.example_weights = torch.tensor(self.example_weights, dtype=torch.float32)

    # set the image normalization
    p_mean = [0.08033423981012082, 0.05155526791740866, 0.05359709020876417, 0.0811968791288488]
    p_std = [0.1313705843029108, 0.08728413305330673, 0.13922084421796302, 0.12760922364487468]
    t_mean = [0.05860568283679439, 0.04606191081626742, 0.03982708801568723, 0.06027994646558575]
    t_std = [0.10238559670323068, 0.08069846376704155, 0.10501834094962233, 0.09908335311368136]
    e_mean = [0.03775239471734739, 0.04191453443041034, 0.007705539179783242, 0.0942332991656135]
    e_std = [0.05167756366610396, 0.061291035726105815, 0.019559849511340346, 0.13389048820718571]

    if self.image_channels == "g":
        p_mean, p_std = p_mean[2], p_std[2]
        t_mean, t_std = t_mean[2], t_std[2]
        e_mean, e_std = e_mean[2], e_std[2]
    elif self.image_channels == "rgb":
        p_mean, p_std = p_mean[:3], p_std[:3]
        t_mean, t_std = t_mean[:3], t_std[:3]
        e_mean, e_std = e_mean[:3], e_std[:3]
    elif self.image_channels == "rgby":
        pass
    else:
        raise NotImplementedError("Unsupported image channels selection.")

    self.primary_normalization = tfms.Normalize(mean=p_mean, std=p_std)
    self.test_normalization = tfms.Normalize(mean=t_mean, std=t_std)
    self.external_normalization = tfms.Normalize(mean=e_mean, std=e_std)
def training(model, fold, args):
    # restore from last checkpoint
    # all model weights are restored, but not the learning rate.
    if os.path.exists(os.path.join(config.weights, config.model_name, str(fold), "checkpoint.pth.tar")):
        best_model = torch.load(os.path.join(config.weights, config.model_name, str(fold), "checkpoint.pth.tar"))
        model.load_state_dict(best_model["state_dict"])

    # logging
    log = Logger()
    log.open(os.path.join(config.logs_dir, "%s_log_train.txt" % config.model_name), mode="a")
    log.write(
        "\n---------------------------- [START %s] %s\n\n" %
        (datetime.now().strftime('%Y-%m-%d %H:%M:%S'), '-' * 20))
    log.write(
        '----------------------|--------- Train ---------|-------- Valid ---------|-------Best '
        'Results-------|----------|\n')
    log.write(
        'mode iter epoch | loss f1_macro | loss f1_macro | loss f1_macro | time '
        ' |\n')
    log.write(
        '----------------------------------------------------------------------------------------------------------'
        '----\n')

    # training params
    optimizer = optim.SGD(model.parameters(),
                          lr=config.learning_rate_start,
                          momentum=0.9,
                          weight_decay=config.weight_decay)
    if config.loss_name == 'ce':
        criterion = nn.BCEWithLogitsLoss().cuda()
    elif config.loss_name == 'focal':
        criterion = FocalLoss().cuda()
    elif config.loss_name == 'f1':
        criterion = F1Loss().cuda()
    else:
        raise ValueError('unknown loss name {}'.format(config.loss_name))
    best_results = [np.inf, 0]
    val_metrics = [np.inf, 0]
    scheduler = lr_scheduler.StepLR(optimizer,
                                    step_size=config.learning_rate_decay_epochs,
                                    gamma=config.learning_rate_decay_rate)
    start = timer()

    # load dataset
    all_files = pd.read_csv(config.train_csv)
    image_names = all_files['Id']
    labels_strs = all_files['Target']
    image_labels = []
    for cur_label_str in labels_strs:
        cur_label = np.eye(config.num_classes, dtype=float)[np.array(list(map(int, cur_label_str.split(' '))))].sum(axis=0)
        image_labels.append(cur_label)
    image_labels = np.stack(image_labels, axis=0)

    msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=config.val_percent, random_state=0)
    for train_index, val_index in msss.split(image_names, image_labels):
        train_image_names = image_names[train_index]
        train_image_labels = image_labels[train_index]
        val_image_names = image_names[val_index]
        val_image_labels = image_labels[val_index]

    train_gen = HumanDataset(train_image_names, train_image_labels, config.train_dir, mode="train")
    sampler = WeightedRandomSampler(weights=get_sample_weights()[train_index],
                                    num_samples=int(len(all_files) * (1 - config.val_percent)))
    train_loader = DataLoader(train_gen, batch_size=config.batch_size, pin_memory=True, num_workers=4, sampler=sampler)
    # train_loader = DataLoader(train_gen, batch_size=config.batch_size, shuffle=True, pin_memory=True, num_workers=4)
    val_gen = HumanDataset(val_image_names, val_image_labels, config.train_dir, augument=False, mode="train")
    val_loader = DataLoader(val_gen, batch_size=config.batch_size, shuffle=False, pin_memory=True, num_workers=4)

    # train
    for epoch in range(0, config.epochs):
        # training & evaluating
        scheduler.step(epoch)
        get_learning_rate(optimizer)
        train_metrics = train(train_loader, model, criterion, optimizer, epoch, val_metrics, best_results, start)
        val_metrics = evaluate(val_loader, model, criterion, epoch, train_metrics, best_results, start)

        # check results
        is_best_loss = val_metrics[0] < best_results[0]
        best_results[0] = min(val_metrics[0], best_results[0])
        is_best_f1 = val_metrics[1] > best_results[1]
        best_results[1] = max(val_metrics[1], best_results[1])

        # save model
        save_checkpoint({
            "epoch": epoch + 1,
            "model_name": config.model_name,
            "state_dict": model.state_dict(),
            "best_loss": best_results[0],
            "optimizer": optimizer.state_dict(),
            "fold": fold,
            "best_f1": best_results[1],
        }, is_best_loss, is_best_f1, fold)

        # print logs
        print('\r', end='', flush=True)
        log.write(
            logging_pattern % (
                "best", epoch, epoch,
                train_metrics[0], train_metrics[1],
                val_metrics[0], val_metrics[1],
                str(best_results[0])[:8], str(best_results[1])[:8],
                time_to_str((timer() - start), 'min')
            )
        )
        log.write("\n")
        time.sleep(0.01)
"Label"].str.split("|").apply(lambda x: [int(i) for i in x]) train_kaggle_public = pd.concat([train_df, public_hpa_df_except16_0], ignore_index=True, sort=False) mlb = MultiLabelBinarizer() X = train_kaggle_public['ID'] y = train_kaggle_public['Label'].str.split("|").apply( lambda x: [int(i) for i in x]) df_ohe = pd.DataFrame(mlb.fit_transform(y), columns=mlb.classes_) df_ohe_np = df_ohe.to_numpy() msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0) for train_index, test_index in msss.split(X, df_ohe_np): X_train, X_test = X[train_index], X[test_index] y_train, y_test = y[train_index], y[test_index] train_data = HPADataset(X_train, y_train, mode="train", tfms=get_transforms(data_type='train')) test_data = HPADataset(X_test, y_test, mode="test", tfms=get_transforms(data_type='valid')) full_data = HPADataset(X,
def get_dataflow(is_train=True):
    train_df = pd.read_csv(os.path.join('/data/kaggle/HPA', 'train.csv'))
    # train_df = oversample(train_df)
    labels = [[int(i) for i in s.split()] for s in train_df['Target']]
    fnames = train_df['Id'].tolist()
    fnames = [os.path.join(config.TRAIN_DATASET, f) for f in fnames]
    sparse_label = [
        np.eye(config.NUM_CLASS, dtype=float)[np.array(la)].sum(axis=0)
        for la in labels
    ]

    extra_df = pd.read_csv(
        os.path.join('/data/kaggle/HPA', 'HPAv18RGBY_WithoutUncertain_wodpl.csv'))
    # extra_df = oversample(extra_df)
    extra_labels = [[int(i) for i in s.split()] for s in extra_df['Target']]
    extra_labels = [
        np.eye(config.NUM_CLASS, dtype=float)[np.array(la)].sum(axis=0)
        for la in extra_labels
    ]
    extra_fnames = extra_df['Id'].tolist()
    extra_fnames = [
        os.path.join(config.EXTRA_DATASET, f) for f in extra_fnames
    ]
    fnames = fnames + extra_fnames
    sparse_label = sparse_label + extra_labels
    fnames = np.array(fnames)
    sparse_label = np.array(sparse_label)

    msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
    for train_index, test_index in msss.split(fnames, sparse_label):
        x_train, x_test = fnames[train_index], fnames[test_index]
        y_train, y_test = sparse_label[train_index], sparse_label[test_index]
    holdout_data = list(zip(x_test, y_test))

    # 5 fold the rest
    mskf = MultilabelStratifiedKFold(n_splits=5, random_state=1)
    for fold_num, (train_index, test_index) in enumerate(mskf.split(x_train, y_train)):
        if fold_num == config.FOLD:
            foldx_train, foldx_test = x_train[train_index], x_train[test_index]
            foldy_train, foldy_test = y_train[train_index], y_train[test_index]
            break

    train_data = list(zip(foldx_train, foldy_train))
    val_data = list(zip(foldx_test, foldy_test))
    train_data = oversample_2(train_data)

    pseudo_df = pd.read_csv(os.path.join('/data/kaggle/HPA', 'LB623.csv'))
    pseudo_fnames = pseudo_df['Id'].tolist()
    pseudo_fnames = [
        os.path.join(config.TEST_DATASET, f) for f in pseudo_fnames
    ]
    # pseudo_labels = np.load("./SOTA.npy")
    # pseudo_labels = [np.array(_) for _ in pseudo_labels]
    pseudo_labels = [[int(i) for i in s.split()] for s in pseudo_df['Predicted']]
    pseudo_labels = [
        np.eye(config.NUM_CLASS, dtype=float)[np.array(la)].sum(axis=0)
        for la in pseudo_labels
    ]
    pseudo_data = list(zip(pseudo_fnames, pseudo_labels))
    train_data = train_data + pseudo_data

    print("train: ", len(train_data), len(val_data))

    if not is_train:
        return val_data

    ds = DataFromList(train_data, shuffle=True)
    ds = BatchData(MapData(ds, preprocess), config.BATCH)
    ds = PrefetchDataZMQ(ds, 6)
    return ds
def get_dataflow(is_train=True):
    train_df = pd.read_csv(os.path.join('/data/kaggle/HPA', 'train.csv'))
    labels = [[int(i) for i in s.split()] for s in train_df['Target']]
    binary_label = []
    for la in labels:
        if MODEL_LABEL in la:
            binary_label.append([1])
        else:
            binary_label.append([0])
    fnames = train_df['Id'].tolist()
    fnames = [os.path.join(config.TRAIN_DATASET, f) for f in fnames]
    sparse_label = [
        np.eye(config.NUM_CLASS, dtype=float)[np.array(la)].sum(axis=0)
        for la in binary_label
    ]

    if config.EXTRA:
        extra_df = pd.read_csv(
            os.path.join('/data/kaggle/HPA', 'HPAv18RBGY_wodpl.csv'))
        extra_labels = [[int(i) for i in s.split()] for s in extra_df['Target']]
        binary_label = []
        # binarize the extra labels the same way as the main labels
        for la in extra_labels:
            if MODEL_LABEL in la:
                binary_label.append([1])
            else:
                binary_label.append([0])
        extra_labels = [
            np.eye(config.NUM_CLASS, dtype=float)[np.array(la)].sum(axis=0)
            for la in binary_label
        ]
        extra_fnames = extra_df['Id'].tolist()
        extra_fnames = [
            os.path.join(config.EXTRA_DATASET, f) for f in extra_fnames
        ]
        fnames = fnames + extra_fnames
        sparse_label = sparse_label + extra_labels
        # extra_data = list(zip(extra_fnames, extra_labels))

    fnames = np.array(fnames)
    sparse_label = np.array(sparse_label)
    print(fnames.shape[0])

    msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
    for train_index, test_index in msss.split(fnames, sparse_label):
        x_train, x_test = fnames[train_index], fnames[test_index]
        y_train, y_test = sparse_label[train_index], sparse_label[test_index]

    train_data = list(zip(x_train, y_train))
    val_data = list(zip(x_test, y_test))

    if not is_train:
        return val_data

    ds = DataFromList(train_data, shuffle=True)
    ds = BatchData(MapData(ds, preprocess), config.BATCH)
    ds = PrefetchDataZMQ(ds, 6)
    return ds
                      input_shape=(HEIGHT, WIDTH, 3))
    x = base_model.output
    x = Dropout(0.125)(x)
    y_pred = Dense(6, activation='sigmoid')(x)
    return Model(inputs=base_model.input, outputs=y_pred)


# In[16]:

# Submission Placeholder
submission_predictions_b2 = []

# Multi Label Stratified Split stuff
msss = MultilabelStratifiedShuffleSplit(n_splits=20, test_size=TEST_SIZE, random_state=SEED)
X = train_df.index
Y = train_df.Label.values

# Get train and test index
msss_splits = next(msss.split(X, Y))
train_idx = msss_splits[0]
valid_idx = msss_splits[1]


# In[17]:

# Loop through Folds of Multi Label Stratified Split
# for epoch, msss_splits in zip(range(0, 9), msss.split(X, Y)):
#     # Get train and test index
#     train_idx = msss_splits[0]
                  'vowel': 0.30,
                  'consonant': 0.30
              },
              metrics={
                  'root': ['accuracy', tf.keras.metrics.Recall()],
                  'vowel': ['accuracy', tf.keras.metrics.Recall()],
                  'consonant': ['accuracy', tf.keras.metrics.Recall()]
              })

# Model Summary
print(model.summary())

# Multi Label Stratified Split stuff...
msss = MultilabelStratifiedShuffleSplit(n_splits=EPOCHS, test_size=TEST_SIZE, random_state=SEED)

# CustomReduceLRonPlateau function
best_val_loss = np.Inf

def CustomReduceLRonPlateau(model, history, epoch):
    global best_val_loss

    # ReduceLR Constants
    monitor = 'val_root_loss'
    patience = 5
    factor = 0.75
    min_lr = 1e-5
def get_loaders(path: str,
                image_size: int,
                n_splits: int = 1,
                test_size: float = 0.1,
                batch_size: int = 128,
                num_workers: int = 4,
                external: bool = False,
                use_sampler: bool = False) -> Tuple[AttrDict, List[AttrDict]]:
    df = pd.read_csv(f'{path}/train.csv')
    df_external = pd.read_csv(f'{path}/external.csv')

    X = np.array(df.Id)
    y = np.array([HumanProteinDataset.parse_target(target) for target in df.Target])

    msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=42)
    train, valid = list(msss.split(X, y))[0]

    df_train, df_valid = df.iloc[train], df.iloc[valid]
    df_test = pd.read_csv(f'{path}/sample_submission.csv')

    if external:
        df_train = pd.concat([df_train, df_external], axis=0)

    transforms_train, transforms_test, transforms_test_aug = _get_transforms(image_size)

    dataset_train = HumanProteinDataset(df_train, f'{path}/train', transforms=transforms_train)
    dataset_train_test = HumanProteinDataset(df_train, f'{path}/train', transforms=transforms_test)
    dataset_train_aug = HumanProteinDataset(df_train, f'{path}/train', transforms=transforms_test_aug)

    dataset_valid = HumanProteinDataset(df_valid, f'{path}/train', transforms=transforms_test)
    dataset_valid_aug = HumanProteinDataset(df_valid, f'{path}/train', transforms=transforms_test_aug)

    dataset_test = HumanProteinDataset(df_test, f'{path}/test', train_mode=False, transforms=transforms_test)
    dataset_test_aug = HumanProteinDataset(df_test, f'{path}/test', train_mode=False, transforms=transforms_test_aug)

    default_loaders = AttrDict()
    default_loaders.train = DataLoader(dataset_train, batch_size, num_workers=num_workers)
    default_loaders.train_test = DataLoader(dataset_train_test, batch_size, num_workers=num_workers)
    default_loaders.train_aug = DataLoader(dataset_train_aug, batch_size, num_workers=num_workers)
    default_loaders.valid = DataLoader(dataset_valid, batch_size, pin_memory=True, num_workers=num_workers)
    default_loaders.valid_aug = DataLoader(dataset_valid_aug, batch_size, pin_memory=True, num_workers=num_workers)
    default_loaders.test = DataLoader(dataset_test, batch_size, pin_memory=True, num_workers=num_workers)
    default_loaders.test_aug = DataLoader(dataset_test_aug, batch_size, pin_memory=True, num_workers=num_workers)

    if n_splits == 1:
        sampler = _get_sampler(df_train) if use_sampler else None
        loaders = AttrDict()
        loaders.train = DataLoader(dataset_train, batch_size, not use_sampler, sampler, num_workers=num_workers)
        loaders.valid = default_loaders.valid
        loaders.valid_aug = default_loaders.valid_aug
        return default_loaders, [loaders]

    folds = []
    for train, valid in _k_fold(df_train, n_splits):
        fold_train, fold_valid = df_train.iloc[train], df_train.iloc[valid]

        dataset_train = HumanProteinDataset(fold_train, f'{path}/train', transforms=transforms_train)
        dataset_valid = HumanProteinDataset(fold_valid, f'{path}/train', transforms=transforms_test)
        dataset_valid_aug = HumanProteinDataset(fold_valid, f'{path}/train', transforms=transforms_test_aug)

        sampler = _get_sampler(fold_train) if use_sampler else None

        loaders = AttrDict()
        loaders.train = DataLoader(dataset_train, batch_size, not use_sampler, sampler, num_workers=num_workers)
        loaders.valid = DataLoader(dataset_valid, batch_size, pin_memory=True, num_workers=num_workers)
        loaders.valid_aug = DataLoader(dataset_valid_aug, batch_size, pin_memory=True, num_workers=num_workers)

        folds.append(loaders)

    return default_loaders, folds
unique_classes = df["class_id"].unique()

# %% --------------------
one_hot_labels = []
for img_id in unique_image_ids:
    classes = df[df["img_id"] == img_id]["class_id"].values
    x = np.eye(len(unique_classes))[classes.astype(int)].sum(0)
    one_hot_labels.append(x)

one_hot_labels = np.array(one_hot_labels)

# %% --------------------
n_splits = 3
# mskf = MultilabelStratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2021)
mskf = MultilabelStratifiedShuffleSplit(n_splits=n_splits, train_size=0.5, test_size=0.5, random_state=2021)

# %% --------------------
train_df = pd.DataFrame()
val_df = pd.DataFrame()

# %% --------------------
# X is unique image_id
for fold, (train_index, val_index) in enumerate(mskf.split(unique_image_ids, one_hot_labels)):
    train_data = df[df["img_id"].isin(unique_image_ids[train_index])].copy(deep=True)
    val_data = df[df["img_id"].isin(unique_image_ids[val_index])].copy(deep=True)
def main():
    df = pd.read_csv("data.csv", sep=";")

    model = get_model(IMG_SIZE)
    model.compile(
        optimizer=Adam(lr=0.016),
        loss={"season": 'mean_absolute_error'},
        loss_weights={"season": 1},
        metrics={"season": ["mean_absolute_error", tf.keras.metrics.Recall()]})

    # Model summary
    print(model.summary())

    # Multi Label Stratified Split stuff...
    msss = MultilabelStratifiedShuffleSplit(n_splits=EPOCHS, test_size=TEST_SIZE)
    X_train = df["PATH"].values
    y_columns = [x for x in df.columns if x.startswith("SEASON")]
    Y_train = df[y_columns].to_numpy()

    for epoch, msss_split in zip(range(EPOCHS), msss.split(X_train, Y_train)):
        print('=========== EPOCH {}'.format(epoch))
        train_ids = msss_split[0]
        valid_ids = msss_split[1]
        print('Train Length: {0} First 10 indices: {1}'.format(len(train_ids), train_ids[:10]))
        print('Valid Length: {0} First 10 indices: {1}'.format(len(valid_ids), valid_ids[:10]))

        train_df = df.loc[train_ids]
        X_train_data = train_df["PATH"].values
        y_columns = [x for x in train_df.columns if x.startswith("SEASON")]
        Y_train_data = train_df[y_columns].to_numpy()
        data_generator_train = TrainDataGenerator(X_train_data,
                                                  Y_train_data,
                                                  train_ids,
                                                  batch_size=16,
                                                  img_size=IMG_SIZE)

        valid_df = df.loc[valid_ids]
        X_valid_data = valid_df["PATH"].values
        Y_valid_data = valid_df[y_columns].to_numpy()
        data_generator_val = TrainDataGenerator(X_valid_data,
                                                Y_valid_data,
                                                valid_ids,
                                                batch_size=16,
                                                img_size=IMG_SIZE)

        TRAIN_STEPS = int(len(data_generator_train))
        VALID_STEPS = int(len(data_generator_val))
        print('Train Generator Size: {0}'.format(len(data_generator_train)))
        print('Validation Generator Size: {0}'.format(len(data_generator_val)))

        model.fit_generator(generator=data_generator_train,
                            validation_data=data_generator_val,
                            steps_per_epoch=TRAIN_STEPS,
                            validation_steps=VALID_STEPS,
                            epochs=1,
                            callbacks=[
                                ModelCheckpointFull(RUN_NAME + 'model_' + str(epoch) + '.h5')
                            ],
                            verbose=1)

        # Set and Concat Training History
        temp_history = model.history.history
        if epoch == 0:
            history = temp_history
        else:
            for k in temp_history:
                history[k] = history[k] + temp_history[k]

        # Custom ReduceLRonPlateau
        CustomReduceLRonPlateau(model, history, epoch)

        # Cleanup
        del data_generator_train, data_generator_val, train_ids, valid_ids
        gc.collect()