示例#1
0
def get_trn_val_split(x, y, val_pct=0.15):
    """Split ``x`` and ``y`` into validation and training parts.

    The validation indices are obtained from
    ``get_cv_idxs(len(x), val_pct=val_pct)``.

    Args:
        x: Inputs. When this is a ``list`` the split is done here, element
            by element; any other type is delegated to ``split_by_idx``.
        y: Targets, indexable with the same integer positions as ``x``.
        val_pct: Forwarded unchanged to ``get_cv_idxs`` (presumably the
            fraction of items held out for validation -- confirm against
            that helper).

    Returns:
        For list input, ``[(x_val, x_trn), (y_val, y_trn)]`` where every
        part is a plain list. Otherwise, whatever
        ``split_by_idx(val_idxs, x, y)`` returns.
    """
    val_idxs = get_cv_idxs(len(x), val_pct=val_pct)
    if not isinstance(x, list):
        return split_by_idx(val_idxs, x, y)

    # Hash the validation indices once so the membership test below is O(1)
    # per element instead of a scan over every validation index.
    held_out = set(val_idxs)
    # Index with arr[i] (rather than iterating arr) because only x is known
    # to be a list; y may be any integer-indexable container.
    return [([arr[i] for i in val_idxs],
             [arr[i] for i in range(len(arr)) if i not in held_out])
            for arr in (x, y)]
 def _get_filename_sets(self, resize_folder:str):
     """Gather file names under ``self.path`` and split them by index.

     Steps, in order:
       1. List files via ``self._find_files_recursively(self.path,
          self.file_exts)``.
       2. Drop every path containing ``'/<resize_folder>/'``.
       3. Strip the ``str(self.path) + '/'`` prefix text from each path.
       4. Re-seed via ``self._update_np_random_seed()`` and keep each name
          when a uniform random draw is below ``self.keep_pct``.
       5. Draw validation indices with
          ``val_pct = min(0.01 / self.keep_pct, 0.1)``.

     Args:
         resize_folder: Name of the folder whose contents are excluded.
             It is matched literally, not as a regular expression.

     Returns:
         The result of ``split_by_idx(val_idxs, names, names)`` where
         ``names`` is the array of kept relative paths (the exact shape of
         that result is defined by ``split_by_idx`` -- confirm there).
     """
     exclude_str = '/' + resize_folder + '/'
     root_prefix = str(self.path) + '/'
     paths = self._find_files_recursively(self.path, self.file_exts)
     # Literal substring test: a folder name containing regex
     # metacharacters must not change which paths are excluded.
     fnames_full = [Path(str(fname).replace(root_prefix, ''))
                    for fname in paths
                    if exclude_str not in str(fname)]
     self._update_np_random_seed()
     keeps = np.random.rand(len(fnames_full)) < self.keep_pct
     # np.asarray instead of np.array(..., copy=False): on NumPy 2.x the
     # latter raises ValueError because a list can never be wrapped
     # without copying.
     fnames = np.asarray(fnames_full)[keeps]
     val_idxs = get_cv_idxs(len(fnames), val_pct=min(0.01/self.keep_pct, 0.1))
     return split_by_idx(val_idxs, np.array(fnames), np.array(fnames))
示例#3
0
def add_family_survived_self(df, set_idx, val_idx):
    """Fill ``df['FamilySurvived']`` in place from the training portion.

    ``df`` is split with ``split_by_idx(val_idx, df)`` into a validation
    part and a training part. Validation rows receive the value that
    ``add_family_survived`` writes into a copy of the validation frame
    (presumably derived from the per-``LastName`` survivor totals passed to
    it -- confirm against that helper). Training rows whose index label is
    in ``set_idx`` receive the number of survivors sharing their
    ``LastName`` within the training part, excluding the row itself.

    Args:
        df: DataFrame with ``LastName`` and ``Survived`` columns; modified
            in place.
        set_idx: Iterable of index labels of training rows to fill.
        val_idx: Indices handed to ``split_by_idx`` and also used as labels
            in ``df.loc`` (NOTE(review): this assumes positions and labels
            coincide, e.g. a default integer index -- confirm).

    Returns:
        None.
    """
    # np.nan is the spelling available in every NumPy release.
    df['FamilySurvived'] = np.nan
    ((val, train), ) = split_by_idx(val_idx, df)

    family_survived = train[['LastName', 'Survived']].groupby('LastName').sum()
    val_copy = val.copy()
    add_family_survived(family_survived, val_copy)
    df.loc[val_idx, 'FamilySurvived'] = val_copy['FamilySurvived']

    # Use a set for O(1) membership tests, without rebinding the parameter.
    wanted = set(set_idx)

    # Filling the training rows themselves is more involved: each row's
    # value must be computed as if that row did not exist, so its own
    # 'Survived' is subtracted from its family's total.
    for index, row in train.iterrows():
        if index not in wanted:
            continue
        same_family = train['LastName'] == row.LastName
        df.loc[index, 'FamilySurvived'] = (
            train[same_family]['Survived'].sum() - row.Survived)
示例#4
0
 def from_data_frame(cls,
                     path,
                     folder,
                     df,
                     val_idxs,
                     suffix='',
                     bs=64,
                     tfms=(None, None),
                     test_df=None,
                     num_workers=8):
     """Build an instance from one DataFrame and a set of validation indices.

     ``df`` is split with ``split_by_idx(val_idxs, df)`` into a validation
     frame and a training frame, and construction is then delegated to
     ``cls.from_data_frames``.

     Args:
         path, folder: Forwarded positionally to ``cls.from_data_frames``.
         df: The DataFrame to split.
         val_idxs: Indices identifying the validation rows.
         suffix, bs, tfms, test_df, num_workers: Forwarded unchanged as
             keyword arguments to ``cls.from_data_frames``.

     Returns:
         Whatever ``cls.from_data_frames`` returns.
     """
     # split_by_idx is expected to yield exactly one (validation, training)
     # pair for a single input; the strict unpacking enforces that.
     (val_df, trn_df), = split_by_idx(val_idxs, df)
     forwarded = dict(suffix=suffix,
                      bs=bs,
                      tfms=tfms,
                      test_df=test_df,
                      num_workers=num_workers)
     return cls.from_data_frames(path, folder, trn_df, val_df, **forwarded)