def get_trn_val_split(x, y, val_pct=0.15):
    """Split ``x`` and ``y`` into validation and training parts.

    The validation indices come from ``get_cv_idxs(len(x), val_pct=val_pct)``.

    Args:
        x: Inputs. A ``list`` is split here with plain indexing; any other
            type is delegated to ``split_by_idx``.
        y: Targets, split with the same indices as ``x``.
        val_pct: Forwarded unchanged to ``get_cv_idxs``.

    Returns:
        For list input, ``[(x_val, x_trn), (y_val, y_trn)]`` where every part
        is a list. Otherwise, whatever ``split_by_idx(val_idxs, x, y)``
        returns.
    """
    val_idxs = get_cv_idxs(len(x), val_pct=val_pct)
    if not isinstance(x, list):
        return split_by_idx(val_idxs, x, y)

    # Build the lookup set once: testing membership against the raw index
    # sequence for every element would rescan it each time.
    held_out = set(val_idxs)
    return [
        (
            [arr[i] for i in val_idxs],
            [arr[i] for i in range(len(arr)) if i not in held_out],
        )
        for arr in (x, y)
    ]
def _get_filename_sets(self, resize_folder:str):
    """Collect file names under ``self.path``, subsample them and split them.

    Steps:
      1. List files via ``self._find_files_recursively(self.path, self.file_exts)``.
      2. Drop every path containing ``'/<resize_folder>/'``.
      3. Strip the ``str(self.path) + '/'`` prefix text from each remaining path.
      4. Keep each file when a uniform random draw is below ``self.keep_pct``
         (after calling ``self._update_np_random_seed()``).
      5. Pick validation indices with ``get_cv_idxs`` using
         ``val_pct = min(0.01 / self.keep_pct, 0.1)``.

    Args:
        resize_folder: Folder name whose contents are excluded from the listing.

    Returns:
        The result of ``split_by_idx(val_idxs, fnames, fnames)``; the same
        array of file names is passed twice.
    """
    exclude_str = '/' + resize_folder + '/'
    root_prefix = str(self.path) + '/'
    paths = self._find_files_recursively(self.path, self.file_exts)
    # Literal substring test: the folder name is not a regular expression, so
    # characters such as '.' or '+' in it must not be treated as metacharacters.
    fnames_full = [
        Path(str(fname).replace(root_prefix, ''))
        for fname in paths
        if exclude_str not in str(fname)
    ]
    self._update_np_random_seed()
    keeps = np.random.rand(len(fnames_full)) < self.keep_pct
    # np.asarray instead of np.array(..., copy=False): converting a list always
    # needs a copy, and NumPy >= 2.0 raises ValueError for copy=False then.
    fnames = np.asarray(fnames_full)[keeps]
    val_idxs = get_cv_idxs(len(fnames), val_pct=min(0.01/self.keep_pct, 0.1))
    return split_by_idx(val_idxs, np.array(fnames), np.array(fnames))
def add_family_survived_self(df, set_idx, val_idx):
    """Fill ``df['FamilySurvived']`` in place.

    The column is first reset to NaN. ``df`` is then split into a validation
    and a training part with ``split_by_idx(val_idx, df)``.

    * Validation rows: ``add_family_survived`` (defined elsewhere) is run on a
      copy of the validation part, given the per-``LastName`` sums of
      ``Survived`` computed from the training part only. The resulting
      ``FamilySurvived`` values are written back at ``df.loc[val_idx]``.
    * Training rows whose index is in ``set_idx``: the value is the sum of
      ``Survived`` over training rows sharing the row's ``LastName``, minus
      the row's own ``Survived``.
    * Training rows not in ``set_idx`` keep NaN.

    Args:
        df: DataFrame with ``LastName`` and ``Survived`` columns; mutated.
        set_idx: Iterable of index labels of training rows to fill.
        val_idx: Indices of the validation rows, passed to ``split_by_idx``
            and also used with ``df.loc``.
            NOTE(review): this assumes the positions ``split_by_idx`` expects
            and the labels ``df.loc`` expects coincide - confirm.

    Returns:
        None.
    """
    # np.nan rather than the np.NAN alias, which NumPy 2.0 removed.
    df['FamilySurvived'] = np.nan
    ((val, train), ) = split_by_idx(val_idx, df)
    family_survived = train[['LastName', 'Survived']].groupby('LastName').sum()
    val_copy = val.copy()
    add_family_survived(family_survived, val_copy)
    df.loc[val_idx, 'FamilySurvived'] = val_copy['FamilySurvived']

    set_idx = set(set_idx)
    # Setting the value inside train itself is more complicated: for each row
    # the result must be computed as if that row did not exist, so the row's
    # own Survived value is subtracted from its family's total.
    last_names = train['LastName']
    survived = train['Survived']
    for index, row in train.iterrows():
        if index in set_idx:
            family_total = survived[last_names == row.LastName].sum()
            df.loc[index, 'FamilySurvived'] = family_total - row.Survived
def from_data_frame(cls, path, folder, df, val_idxs, suffix='', bs=64, tfms=(None, None), test_df=None, num_workers=8):
    """Build an instance from one DataFrame plus validation indices.

    ``df`` is split with ``split_by_idx(val_idxs, df)``, which must yield
    exactly one ``(validation, training)`` pair. The two parts are then
    handed to ``cls.from_data_frames`` together with the remaining arguments,
    which are forwarded unchanged.

    Returns:
        Whatever ``cls.from_data_frames`` returns.
    """
    (valid_part, train_part), = split_by_idx(val_idxs, df)
    forwarded = dict(
        suffix=suffix,
        bs=bs,
        tfms=tfms,
        test_df=test_df,
        num_workers=num_workers,
    )
    return cls.from_data_frames(path, folder, train_part, valid_part, **forwarded)