def get_files(path, extensions=None, recurse=True, folders=None, followlinks=True):
    "Get all the files in `path` with optional `extensions`, optionally with `recurse`, only in `folders`, if specified."
    path = Path(path)
    folders = L(folders)
    if extensions is not None:
        extensions = set(uniqueify(extensions))
        extensions = {e.lower() for e in extensions}
    if recurse:
        res = []
        for i, (p, d, f) in enumerate(os.walk(path, followlinks=followlinks)):  # returns (dirpath, dirnames, filenames)
            if len(folders) != 0 and i == 0: d[:] = [o for o in d if o in folders]
            else: d[:] = [o for o in d if not o.startswith(".")]
            if len(folders) != 0 and i == 0 and "." not in folders: continue
            res += _get_files(p, f, extensions)
    else:
        f = [o.name for o in os.scandir(path) if o.is_file()]
        res = _get_files(path, f, extensions)
    return L(res)
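# Hedged usage sketch for get_files: the 'data' directory with 'train'/'valid' subfolders of
# image files is hypothetical, but the calls match the signature above. Extensions are compared
# case-insensitively and hidden directories are skipped during recursion.
# imgs = get_files('data', extensions={'.jpg', '.png'}, recurse=True, folders=['train', 'valid'])
# everything = get_files('data/train', recurse=False)   # all files directly inside 'data/train'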
def make_xla_child_learner(rank, sync_valid, learner_args, add_args, ctrl_args):
    "create a learner using passed parameters"
    device = xm.xla_device()
    world_size = xm.xrt_world_size()
    dls = build_distributed_dataloaders(learner_args.pop('base_dls'), rank, world_size, sync_valid=sync_valid)
    model = learner_args.pop('wrapped_model').to(device)
    master_cbs = learner_args.pop('master_cbs')
    if master_cbs is None: master_cbs = L()
    learner = Learner(dls, model, **learner_args)
    learner.__stored_args__ = {**learner.__stored_args__, **add_args}
    learner.to_multi_xla(device, rank, sync_valid=sync_valid)
    if not ctrl_args['use_progress'] and 'progress' in L(learner.cbs).attrgot('name'):
        learner.remove_cbs(ProgressCallback)
    if rank == 0: learner.add_cbs(master_cbs)
    return learner
def _tokenize_files(func, files, path, output_dir=None, output_names=None, n_workers=defaults.cpus,
                    rules=None, tok=None, encoding='utf8', skip_if_exists=False):
    "Tokenize text `files` in parallel using `n_workers`"
    if tok is None: tok = WordTokenizer()
    output_dir = Path(ifnone(output_dir, path.parent / f'{path.name}_tok'))
    if skip_if_exists and output_dir.exists(): return output_dir
    output_dir.mkdir(exist_ok=True)
    if output_names is None: output_names = L(output_dir / f.relative_to(path) for f in files)
    # rules = partial(Path.readlines, encoding=encoding) + L(ifnone(rules, defaults.text_proc_rules.copy()))
    rules = partial(Path.read_text, encoding=encoding) + L(ifnone(rules, defaults.text_proc_rules.copy()))
    lengths, counter = {}, Counter()
    for i, tok in parallel_tokenize(files, tok, rules, n_workers=n_workers):
        out = func(i, output_dir)
        out.mk_write(' '.join(tok))
        lengths[str(files[i].relative_to(path))] = len(tok)
        counter.update(tok)
    save_pickle(output_dir / fn_lengths_pkl, lengths)
    save_pickle(output_dir / fn_counter_pkl, counter)
    return output_dir
def plot_sched(self:Recorder, keys=None, figsize=None):
    keys = self.hps.keys() if keys is None else L(keys)
    rows, cols = (len(keys)+1)//2, min(2, len(keys))
    figsize = figsize or (6*cols, 4*rows)
    _, axs = plt.subplots(rows, cols, figsize=figsize)
    axs = axs.flatten() if len(keys) > 1 else L(axs)
    for p, ax in zip(keys, axs):
        ax.plot(self.hps[p])
        ax.set_ylabel(p)
def on_change_ds(self, change=None):
    ds = self.dd_ds.index
    cat = self.dd_cats.value
    iwi = self.iwis[ds]
    info = L([o for o in iwi if o[1] == cat])
    fns = info.sorted(2, reverse=True).itemgot(0)
    self.iw.set_fns(fns)
    dd_children = L(self.iw.widget.children).itemgot(1)
    for dd_child in dd_children:
        dd_child.observe(self.on_change_dd_item, 'value')
def bayes_predict(self:Learner, item, rm_type_tfms=None, with_input=False, sample_size=10, reduce=True):
    "gets a sample distribution of predictions and computes entropy"
    dl = self.dls.test_dl([item], rm_type_tfms=rm_type_tfms, num_workers=0)
    # modify get_preds to get distributed samples
    collect_preds = []
    collect_targs = []
    collect_dec_preds = []
    collect_inp = None
    cbs = [MCDropoutCallback()]
    with self.no_bar():
        for j in range(sample_size):
            inp, preds, _, dec_preds = self.get_preds(dl=dl, with_input=True, with_decoded=True, cbs=cbs)
            i = getattr(self.dls, 'n_inp', -1)
            inp = (inp,) if i == 1 else tuplify(inp)
            dec = self.dls.decode_batch(inp + tuplify(dec_preds))[0]
            dec_inp, dec_targ = map(detuplify, [dec[:i], dec[i:]])
            # res = dec_targ,dec_preds[0],preds[0]
            if with_input and collect_inp is None:  # collect inp first iter only
                collect_inp = dec_inp
            collect_targs.append(dec_targ)
            collect_dec_preds.append(dec_preds[0])
            collect_preds.append(preds[0])
    dist_preds = torch.stack(collect_preds)
    dist_dec_preds = L(collect_dec_preds).map(lambda o: o.item())
    dist_targs = L(collect_targs)
    res1 = (dist_targs, dist_dec_preds, dist_preds)
    mean_pred = dist_preds.mean(dim=0)
    ent = entropy(dist_preds.unsqueeze(1)).item()
    best_guess = torch.argmax(mean_pred).item()
    best_prob = mean_pred[best_guess].item()
    best_cat = self.dls.vocab[best_guess]
    res2 = (ent, best_prob, best_guess, best_cat)
    if reduce:
        if len(dist_targs.unique()) > 1: targ = Counter(dist_targs)
        else: targ = dist_targs.unique()[0]
        if len(dist_dec_preds.unique()) > 1: dec_pred = Counter(dist_dec_preds)
        else: dec_pred = dist_dec_preds.unique()[0]
        res1 = (targ, dec_pred, mean_pred)
    res = res1 + res2
    if with_input: res = (collect_inp,) + res
    return res
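# Hedged usage sketch for bayes_predict: `learn` and `img` are placeholders for a trained
# Learner (with dropout layers) and a single input item. With reduce=True and with_input=False
# the call returns a 7-tuple: reduced target, reduced decoded prediction, mean prediction
# tensor, entropy, best probability, best class index and best class name.
# targ, dec_pred, mean_pred, ent, best_prob, best_idx, best_cat = learn.bayes_predict(img, sample_size=20)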
def __init__(self, items=None, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs):
    super().__init__(dl_type=dl_type)
    self.tls = L(tls if tls else [TfmdListsX(items, t, **kwargs) for t in L(ifnone(tfms, [None]))])
    self.n_inp = ifnone(n_inp, max(1, len(self.tls) - 1))
def wrapper_tokenizer_class(df, tok, text_col: str = 'text', print_len: int = 40):
    "Wrapper around Tokenizer (easy to use e.g. from_df)"
    toke = Tokenizer.from_df(df=df, text_cols=text_col, tok=tok)
    toke.tok.setup(L(toke.kwargs['df'][text_col].tolist()))
    tokenizer_res = first(toke.tok(L(toke.kwargs['df'][text_col].tolist()[:print_len])))
    toke_obj = toke.__dict__
    return tokenizer_res, toke_obj
def setup_fit_cbs(rank, fit_args):
    "add master cbs to cbs fit args if rank 0"
    master_cbs = L(fit_args.pop('master_cbs'))
    if rank != 0: master_cbs = L()
    if 'cbs' in fit_args: cbs = L(fit_args.pop('cbs'))
    else: cbs = L()
    if len(master_cbs) > 0 or len(cbs) > 0:
        fit_args['cbs'] = [*cbs, *master_cbs]
    return fit_args
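# Hedged illustration of setup_fit_cbs: master callbacks only survive on rank 0; elsewhere they
# are dropped and only the ordinary cbs remain. `SomeCB` is a placeholder callback class.
# setup_fit_cbs(0, {'master_cbs': [SomeCB()], 'cbs': []})   # -> {'cbs': [SomeCB()]}
# setup_fit_cbs(1, {'master_cbs': [SomeCB()], 'cbs': []})   # -> {}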
def transducer_splitter(m, adahessian=False):
    def ps_of_sub(mod):
        return [p for p in mod.parameters() if p.requires_grad]
    if adahessian:
        l = L([*m.joint.param_groups(), *m.predictor.param_groups()])
        for p in l: print(p.shape)
        print(len(l))
        return l
    else:
        ps = L(m.param_groups())
        return ps
def __init__(self, image_dir='images', mask_dir=None, config=None, path=None, ensemble_dir=None,
             label_fn=None, metrics=None, loss_fn=None, cbs=None, ds_kwargs={}, stats=None, files=None):
    self.config = config or Config()
    self.stats = stats
    self.path = Path(path) if path is not None else Path('.')
    self.metrics = metrics or [Iou()]  # Dice_f1()
    self.loss_fn = loss_fn or WeightedSoftmaxCrossEntropy(axis=1)
    self.cbs = cbs or [SaveModelCallback(monitor='iou'), ElasticDeformCallback]  # ShowGraphCallback
    self.ensemble_dir = ensemble_dir or self.path / 'ensemble'
    self.files = L(files) or get_image_files(self.path / image_dir, recurse=False)
    assert len(self.files) > 0, f'Found {len(self.files)} images in "{image_dir}". Please check your images and image folder'
    if any([mask_dir, label_fn]):
        if label_fn: self.label_fn = label_fn
        else: self.label_fn = get_label_fn(self.files[0], self.path / mask_dir)
        # Check if corresponding masks exist
        mask_check = [self.label_fn(x).is_file() for x in self.files]
        chk_str = f'Found {len(self.files)} images in "{image_dir}" and {sum(mask_check)} masks in "{mask_dir}".'
        assert len(self.files) == sum(mask_check) and len(self.files) > 0, f'Please check your images and masks (and folders). {chk_str}'
        print(chk_str)
    else:
        self.label_fn = label_fn
    self.n_splits = min(len(self.files), self.max_splits)
    for key, value in get_default_shapes(self.arch).items():
        ds_kwargs.setdefault(key, value)
    self.ds_kwargs = ds_kwargs
    self.item_tfms = [Brightness(max_lighting=self.light)]
    self.models = {}
    self.recorder = {}
    self._set_splits()
    self.ds = RandomTileDataset(self.files, label_fn=self.label_fn, create_weights=False, **self.mw_kwargs, **self.ds_kwargs)
    self.in_channels = self.ds.get_data(max_n=1)[0].shape[-1]
    self.df_val, self.df_ens, self.df_model, self.ood = None, None, None, None
def __init__(self, coll, idxs=None, cache=None, tfm=noop):
    if idxs is None: idxs = L.range(coll)
    self.coll = coll
    self.idxs = idxs
    self.tfm = tfm
    if cache is not None: self._get = functools.lru_cache(maxsize=cache)(self._get)
def reset_changes(self, b):
    self.update_message = False
    dd_children = L(self.iw.widget.children).itemgot(1)
    for dd_child in dd_children:
        dd_child.value = '<Keep>'
    self.check_pending_changes()
    self.update_message = True
def extract_level(spectrum_blocks: L, dtype=np.float32) -> pd.DataFrame:
    """Receives a mapping `spectrum_blocks` and returns a DataFrame with the levels as values,
    the frequencies as columns and the block number as index.
    """
    assert len(spectrum_blocks), "The spectrum block list is empty"
    spectrum_blocks = spectrum_blocks.itemgot(1)
    block = spectrum_blocks[0]
    assert block.type in (63, 64, 67, 68), "The input blocks are not spectral blocks"
    rows = len(spectrum_blocks)
    min_level = block.offset - 127.5
    if block.type in (63, 67):
        cols = block.ndata
        frequencies = getattr(block, "frequencies")
        return pd.DataFrame(
            _extract_uncompressed(spectrum_blocks, rows, cols, min_level, dtype),
            columns=frequencies,
        )
    else:
        cols = block.norig
        thresh = block.thresh - 1
        block_data = [b.block_data for b in spectrum_blocks]
        frequencies = np.linspace(block.start_mega, block.stop_mega, num=cols)
        levels = cy_extract_compressed(block_data, rows, cols, thresh, min_level)
        if dtype != np.float32:
            levels = levels.astype(dtype)
        return pd.DataFrame(levels, columns=frequencies)
def delete_items(self):
    ds_index = self.dd_ds.index   # use train(0) or valid(1)
    iwi = self.iwis[ds_index]     # get filenames, targs, loss for train/valid
    items = L(iwi).itemgot(0)     # get filenames for train/valid
    for_deletion = self.delete().copy()
    for idx in for_deletion:
        self.delete_dataset_item(items, iwi, idx)
def convert_params(o:list) -> list:
    """
    Converts `o` into PyTorch-compatible param groups

    `o` should be a set of layer-groups that should be split in the optimizer

    Example:

    ```python
    def splitter(m): return convert_params([[m.a], [m.b]])
    ```

    Where `m` is a model defined as:

    ```python
    class RegModel(Module):
        def __init__(self): self.a,self.b = nn.Parameter(torch.randn(1)),nn.Parameter(torch.randn(1))
        def forward(self, x): return x*self.a + self.b
    ```
    """
    if not isinstance(o[0], dict):
        splitter = []
        for group in o:
            if not isinstance(group[0], nn.parameter.Parameter):
                group = L(group).map(params)
            splitter.append({'params': group})
        return splitter
    return o
def __init__(self, items, tfms, use_list=None, do_setup=True, split_idx=None, train_setup=True,
             splits=None, types=None, verbose=False, dl_type=None):
    super().__init__(items, use_list=use_list)
    if dl_type is not None: self._dl_type = dl_type  # potentially unused
    self.splits = L([slice(None), []] if splits is None else splits).map(mask2idxs)
    if isinstance(tfms, TfmdListsX): tfms = tfms.tfms
    if isinstance(tfms, PipelineX): do_setup = False  # This is relevant, equivalent to PipelineX
    self.tfms = PipelineX(tfms, split_idx=split_idx)
    store_attr('types,split_idx')
    if do_setup:
        pv(f"Setting up {self.tfms}", verbose)
        self.setup(train_setup=train_setup)
def pre_xla_inference(self: Learner):
    ctrl_args = {}
    progress_removed = False
    if 'progress' in L(self.cbs).attrgot('name'):
        self.remove_cbs(ProgressCallback)
        progress_removed = True
    ctrl_args['use_progress'] = progress_removed
    return ctrl_args
def add_master_cb(self: Learner, cb):
    "add a master callback"
    if not hasattr(self, '_master_cbs'): self._master_cbs = L()
    if isinstance(cb, type): cb = cb()
    # cb.learn = self
    # setattr(self, cb.name, cb)
    self._master_cbs.append(cb)
def round_multiple(x, mult, round_down=False):
    "Round `x` to nearest multiple of `mult`"
    def _f(x_): return (int if round_down else round)(x_ / mult) * mult
    res = L(x).map(_f)
    return res if is_listy(x) else res[0]
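# Worked examples for round_multiple (plain arithmetic, no extra assumptions): 63/32 rounds to 2,
# so the nearest multiple is 64; with round_down=True, int(63/32) is 1, giving 32. Listy input
# returns one rounded value per element.
assert round_multiple(63, 32) == 64
assert round_multiple(63, 32, round_down=True) == 32
assert round_multiple([50, 70], 32) == [64, 64]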
def fit_flat_cos(self:Learner, n_epoch, lr=None, div_final=1e5, pct_start=0.75, wd=None, cbs=None, reset_opt=False):
    "Fit `self.model` for `n_epoch` at flat `lr` before a cosine annealing."
    if self.opt is None: self.create_opt()
    self.opt.set_hyper('lr', self.lr if lr is None else lr)
    lr = np.array([h['lr'] for h in self.opt.hypers])
    scheds = {'lr': combined_cos(pct_start, lr, lr, lr/div_final)}
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
def after_fit(self):
    tlogger = make_notifier()
    tlogger.info("{}: end of training after {} epochs with loss {}".format(
        self.model_name, self.epoch + 1,
        L(self.recorder.values[0:]).itemgot(1)[-1],
    ))
def reclassify_items(self):
    self.update_message = False
    ds_index = self.dd_ds.index   # use train(0) or valid(1)
    iwi = self.iwis[ds_index]     # get filenames, targs, loss for train/valid
    items = L(iwi).itemgot(0)     # get filenames for train/valid
    for idx, new_cat in self.change():
        if new_cat != self.dd_cats.value:  # new_cat is not equal to the existing cat
            self.reclassify_item(items, iwi, idx, new_cat)
def fit_one_cycle(self:Learner, n_epoch, lr_max=None, div=25., div_final=1e5, pct_start=0.25, wd=None,
                  moms=None, cbs=None, reset_opt=False):
    "Fit `self.model` for `n_epoch` using the 1cycle policy."
    if self.opt is None: self.create_opt()
    self.opt.set_hyper('lr', self.lr if lr_max is None else lr_max)
    lr_max = np.array([h['lr'] for h in self.opt.hypers])
    scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
              'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
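# Hedged usage sketch for fit_one_cycle: `learn` is a placeholder for any Learner. The learning
# rate rises from lr_max/div to lr_max over the first pct_start of training, then follows a
# cosine down to lr_max/div_final, while momentum runs the opposite (high-low-high) schedule.
# learn.fit_one_cycle(5, lr_max=3e-3)                  # 5 epochs with the default schedule shape
# learn.fit_one_cycle(5, lr_max=3e-3, pct_start=0.5)   # spend half the run in the warm-up phase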
def set_hyper(self, k, v):
    "Set the value(s) in `v` for hyper-parameter `k`"
    if isinstance(v, slice):
        if v.start: v = even_mults(v.start, v.stop, len(self.param_lists))
        else: v = [v.stop/10]*(len(self.param_lists)-1) + [v.stop]
    v = L(v, use_list=None)
    if len(v)==1: v = v*len(self.param_lists)
    assert len(v) == len(self.hypers), f"Trying to set {len(v)} values for {k} but there are {len(self.param_lists)} parameter groups."
    self._set_hyper(k, v)
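# Hedged sketch of how set_hyper expands a slice, assuming an optimizer `opt` with three
# parameter groups: a two-ended slice spreads even multiples across the groups, while a
# one-ended slice gives the earlier groups one tenth of the final value.
# opt.set_hyper('lr', slice(1e-5, 1e-3))   # -> [1e-5, 1e-4, 1e-3] via even_mults
# opt.set_hyper('lr', slice(1e-3))         # -> [1e-4, 1e-4, 1e-3]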
def after_epoch(self):
    if (self.epoch + 1) % 10 == 0:
        tlogger = make_notifier()
        tlogger.info("{}: epoch {}/{} with loss {}".format(
            self.model_name, self.epoch + 1, self.n_epoch,
            L(self.recorder.values[0:]).itemgot(1)[-1],
        ))
def __init__(self, ds_idx=1, dl=None, with_input=False, with_decoded=False, cbs=None, reorder=True):
    self.cbs = L(cbs)
    store_attr('ds_idx,dl,with_input,with_decoded,reorder')
def apply_sentence_piecer(df, text_col: str = 'text', vocab_sz: int = 200, print_len: int = 40):
    "Apply SentencePieceTokenizer"
    txt = L(df[text_col].tolist())
    sp = SentencePieceTokenizer(vocab_sz)
    sp.setup(txt)
    return next(iter(sp(txt[:print_len])))
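# Hedged usage sketch for apply_sentence_piecer: a pandas DataFrame with a 'text' column is
# assumed; SentencePiece training can fail on a tiny corpus if vocab_sz is set too high, so a
# small value is used in this sketch.
# import pandas as pd
# df = pd.DataFrame({'text': ['first example sentence', 'second example sentence']})
# toks = apply_sentence_piecer(df, text_col='text', vocab_sz=30, print_len=2)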
def fit_sgdr(self:Learner, n_cycles, cycle_len, lr_max=None, cycle_mult=2, cbs=None, reset_opt=False, wd=None):
    "Fit `self.model` for `n_cycles` of `cycle_len` using SGDR."
    if self.opt is None: self.create_opt()
    self.opt.set_hyper('lr', self.lr if lr_max is None else lr_max)
    lr_max = np.array([h['lr'] for h in self.opt.hypers])
    n_epoch = cycle_len * (cycle_mult**n_cycles-1)//(cycle_mult-1)
    pcts = [cycle_len * cycle_mult**i / n_epoch for i in range(n_cycles)]
    scheds = [SchedCos(lr_max, 0) for _ in range(n_cycles)]
    scheds = {'lr': combine_scheds(pcts, scheds)}
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
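# Worked check of the SGDR epoch arithmetic above (plain Python, no Learner needed): with
# cycle_len=1, cycle_mult=2 and n_cycles=3 the cycles last 1, 2 and 4 epochs, 7 in total.
cycle_len, cycle_mult, n_cycles = 1, 2, 3
n_epoch = cycle_len * (cycle_mult**n_cycles - 1) // (cycle_mult - 1)
pcts = [cycle_len * cycle_mult**i / n_epoch for i in range(n_cycles)]
assert n_epoch == 7 and pcts == [1/7, 2/7, 4/7]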
def after_validate(self):
    to_rm = L(cb for cb in self.learn.cbs if getattr(cb, 'remove_on_fetch', False))
    with self.learn.removed_cbs(to_rm + self.cbs) as learn:
        self.preds = learn.get_preds(ds_idx=self.ds_idx, dl=self.dl,
                                     with_input=self.with_input, with_decoded=self.with_decoded,
                                     inner=True, reorder=self.reorder)