Example #1
def get_files(path,
              extensions=None,
              recurse=True,
              folders=None,
              followlinks=True):
    "Get all the files in `path` with optional `extensions`, optionally with `recurse`, only in `folders`, if specified."
    path = Path(path)
    folders = L(folders)
    if extensions is not None:
        extensions = set(uniqueify(extensions))
        extensions = {e.lower() for e in extensions}
    if recurse:
        res = []
        # os.walk yields (dirpath, dirnames, filenames)
        for i, (p, d, f) in enumerate(os.walk(path, followlinks=followlinks)):
            if len(folders) != 0 and i == 0:
                d[:] = [o for o in d if o in folders]
            else:
                d[:] = [o for o in d if not o.startswith(".")]
            if len(folders) != 0 and i == 0 and "." not in folders:
                continue
            res += _get_files(p, f, extensions)
    else:
        f = [o.name for o in os.scandir(path) if o.is_file()]
        res = _get_files(path, f, extensions)
    return L(res)
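
A minimal usage sketch (the path and the extension set here are hypothetical):

# Collect only .jpg/.png files under data/images, skipping hidden
# directories; returns a fastcore `L` of `Path` objects.
files = get_files('data/images', extensions={'.jpg', '.png'}, recurse=True)
print(len(files), files[:3])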
Example #2
def make_xla_child_learner(rank, sync_valid, learner_args, add_args,
                           ctrl_args):
    "create a learner using passed parameters"
    device = xm.xla_device()
    world_size = xm.xrt_world_size()
    dls = build_distributed_dataloaders(learner_args.pop('base_dls'),
                                        rank,
                                        world_size,
                                        sync_valid=sync_valid)

    model = learner_args.pop('wrapped_model').to(device)
    master_cbs = learner_args.pop('master_cbs')
    if master_cbs is None:
        master_cbs = L()
    learner = Learner(dls, model, **learner_args)
    learner.__stored_args__ = {**learner.__stored_args__, **add_args}

    learner.to_multi_xla(device, rank, sync_valid=sync_valid)

    if not ctrl_args['use_progress'] and 'progress' in L(learner.cbs).attrgot('name'):
        learner.remove_cbs(ProgressCallback)

    if rank == 0:
        learner.add_cbs(master_cbs)

    return learner
Example #3
def _tokenize_files(func,
                    files,
                    path,
                    output_dir=None,
                    output_names=None,
                    n_workers=defaults.cpus,
                    rules=None,
                    tok=None,
                    encoding='utf8',
                    skip_if_exists=False):
    "Tokenize text `files` in parallel using `n_workers`"
    if tok is None: tok = WordTokenizer()
    output_dir = Path(ifnone(output_dir, path.parent / f'{path.name}_tok'))
    if skip_if_exists and output_dir.exists(): return output_dir
    output_dir.mkdir(exist_ok=True)
    if output_names is None:
        output_names = L(output_dir / f.relative_to(path) for f in files)
    # rules = partial(Path.readlines, encoding=encoding) + L(ifnone(rules, defaults.text_proc_rules.copy()))
    rules = partial(Path.read_text, encoding=encoding) + L(
        ifnone(rules, defaults.text_proc_rules.copy()))

    lengths, counter = {}, Counter()
    # NB: the loop variable `tok` shadows the tokenizer passed in above
    for i, tok in parallel_tokenize(files, tok, rules, n_workers=n_workers):
        out = func(i, output_dir)
        out.mk_write(' '.join(tok))
        lengths[str(files[i].relative_to(path))] = len(tok)
        counter.update(tok)

    save_pickle(output_dir / fn_lengths_pkl, lengths)
    save_pickle(output_dir / fn_counter_pkl, counter)
    return output_dir
Example #4
def plot_sched(self:Recorder, keys=None, figsize=None):
    keys = self.hps.keys() if keys is None else L(keys)
    rows,cols = (len(keys)+1)//2, min(2, len(keys))
    figsize = figsize or (6*cols,4*rows)
    _, axs = plt.subplots(rows, cols, figsize=figsize)
    axs = axs.flatten() if len(keys) > 1 else L(axs)
    for p,ax in zip(keys, axs):
        ax.plot(self.hps[p])
        ax.set_ylabel(p)
Example #5
 def on_change_ds(self, change=None):
     ds = self.dd_ds.index
     cat = self.dd_cats.value
     iwi = self.iwis[ds]
     info = L([o for o in iwi if o[1] == cat])
     fns = info.sorted(2, reverse=True).itemgot(0)
     self.iw.set_fns(fns)
     dd_children = L(self.iw.widget.children).itemgot(1)
     for dd_child in dd_children:
         dd_child.observe(self.on_change_dd_item, 'value')
Example #6
def bayes_predict(self:Learner,item, rm_type_tfms=None, with_input=False,
                  sample_size=10,reduce=True):
    "gets a sample distribution of predictions and computes entropy"
    dl = self.dls.test_dl([item], rm_type_tfms=rm_type_tfms, num_workers=0)

    # modify get_preds to get distributed samples
    collect_preds = []
    collect_targs = []
    collect_dec_preds = []
    collect_inp = None
    cbs = [MCDropoutCallback()]
    with self.no_bar():
        for j in range(sample_size):
            inp,preds,_,dec_preds = self.get_preds(dl=dl, with_input=True,
                                                   with_decoded=True,
                                                   cbs=cbs)
            i = getattr(self.dls, 'n_inp', -1)
            inp = (inp,) if i==1 else tuplify(inp)
            dec = self.dls.decode_batch(inp + tuplify(dec_preds))[0]
            dec_inp,dec_targ = map(detuplify, [dec[:i],dec[i:]])
            # res = dec_targ,dec_preds[0],preds[0]
            if with_input and collect_inp is None:  # collect inp on first iter only
                collect_inp = dec_inp
            collect_targs.append(dec_targ)
            collect_dec_preds.append(dec_preds[0])
            collect_preds.append(preds[0])
    dist_preds = torch.stack(collect_preds)
    dist_dec_preds = L(collect_dec_preds).map(lambda o: o.item())
    dist_targs = L(collect_targs)
    res1 = (dist_targs, dist_dec_preds, dist_preds)

    mean_pred = dist_preds.mean(dim=0)
    ent = entropy(dist_preds.unsqueeze(1)).item()
    best_guess = torch.argmax(mean_pred).item()
    best_prob = mean_pred[best_guess].item()
    best_cat = self.dls.vocab[best_guess]
    res2 = (ent, best_prob, best_guess, best_cat)

    if reduce:
        if len(dist_targs.unique()) > 1:
            targ = Counter(dist_targs)
        else:
            targ = dist_targs.unique()[0]

        if len(dist_dec_preds.unique()) > 1:
            dec_pred = Counter(dist_dec_preds)
        else:
            dec_pred = dist_dec_preds.unique()[0]
        res1 = (targ, dec_pred, mean_pred)

    res = res1 + res2
    if with_input:
        res = (collect_inp,) + res
    return res
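
A hedged usage sketch, assuming a trained classification `learn` with dropout layers and a single raw `item` (both hypothetical); with `reduce=True` the result unpacks as:

# Hypothetical usage of the Monte Carlo dropout prediction above.
targ, dec_pred, mean_pred, ent, best_prob, best_guess, best_cat = \
    learn.bayes_predict(item, sample_size=10, reduce=True)
print(f'{best_cat} (p={best_prob:.3f}, entropy={ent:.3f})')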
Example #7
 def __init__(self,
              items=None,
              tfms=None,
              tls=None,
              n_inp=None,
              dl_type=None,
              **kwargs):
     super().__init__(dl_type=dl_type)
     self.tls = L(
         tls if tls else
         [TfmdListsX(items, t, **kwargs) for t in L(ifnone(tfms, [None]))])
     self.n_inp = ifnone(n_inp, max(1, len(self.tls) - 1))
Example #8
def wrapper_tokenizer_class(df,
                            tok,
                            text_col: str = 'text',
                            print_len: int = 40):
    "Wrapper around Tokenizer (easy to use e.g. from_df)"

    toke = Tokenizer.from_df(df=df, text_cols=text_col, tok=tok)
    toke.tok.setup(L(toke.kwargs['df'][text_col].tolist()))
    tokenizer_res = first(
        toke.tok(L(toke.kwargs['df'][text_col].tolist()[:print_len])))
    toke_obj = toke.__dict__
    return tokenizer_res, toke_obj
Example #9
def setup_fit_cbs(rank, fit_args):
    "add master cbs to cbs fit args if rank 0"
    master_cbs = L(fit_args.pop('master_cbs'))
    if rank != 0:
        master_cbs = L()
    if 'cbs' in fit_args:
        cbs = L(fit_args.pop('cbs'))
    else:
        cbs = L()
    if len(master_cbs) > 0 or len(cbs) > 0:
        fit_args['cbs'] = [*cbs, *master_cbs]
    return fit_args
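
A small sketch of the rank gating, using illustrative fastai callbacks (the argument dicts are hypothetical):

# Only rank 0 keeps the master callbacks; dict() copies so pop() does not
# mutate the shared args.
args = {'master_cbs': [SaveModelCallback()], 'cbs': []}
setup_fit_cbs(0, dict(args))  # -> {'cbs': [SaveModelCallback]} on the master rank
setup_fit_cbs(1, dict(args))  # -> {} on other ranks: master cbs are dropped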
Example #10
def transducer_splitter(m, adahessian=False):
    def ps_of_sub(mod):
        return [p for p in mod.parameters() if p.requires_grad]

    if adahessian:
        l = L([*m.joint.param_groups(), *m.predictor.param_groups()])
        for p in l:
            print(p.shape)
        print(len(l))
        return l
    else:
        ps = L(m.param_groups())
        return ps
Example #11
    def __init__(self,
                 image_dir='images',
                 mask_dir=None,
                 config=None,
                 path=None,
                 ensemble_dir=None,
                 label_fn=None,
                 metrics=None,
                 loss_fn=None,
                 cbs=None,
                 ds_kwargs={},
                 stats=None,
                 files=None):

        self.config = config or Config()
        self.stats = stats
        self.path = Path(path) if path is not None else Path('.')
        self.metrics = metrics or [Iou()]  #Dice_f1()
        self.loss_fn = loss_fn or WeightedSoftmaxCrossEntropy(axis=1)
        self.cbs = cbs or [
            SaveModelCallback(monitor='iou'), ElasticDeformCallback
        ]  #ShowGraphCallback
        self.ensemble_dir = ensemble_dir or self.path / 'ensemble'

        self.files = L(files) or get_image_files(self.path / image_dir,
                                                 recurse=False)
        assert len(self.files) > 0, \
            f'Found {len(self.files)} images in "{image_dir}". Please check your images and image folder.'
        if any([mask_dir, label_fn]):
            if label_fn: self.label_fn = label_fn
            else:
                self.label_fn = get_label_fn(self.files[0],
                                             self.path / mask_dir)
            #Check if corresponding masks exist
            mask_check = [self.label_fn(x).is_file() for x in self.files]
            chk_str = f'Found {len(self.files)} images in "{image_dir}" and {sum(mask_check)} masks in "{mask_dir}".'
            assert len(self.files) == sum(mask_check) and len(self.files) > 0, \
                f'Please check your images and masks (and folders). {chk_str}'
            print(chk_str)

        else:
            self.label_fn = label_fn

        self.n_splits = min(len(self.files), self.max_splits)
        for key, value in get_default_shapes(self.arch).items():
            ds_kwargs.setdefault(key, value)
        self.ds_kwargs = ds_kwargs
        self.item_tfms = [Brightness(max_lighting=self.light)]
        self.models = {}
        self.recorder = {}
        self._set_splits()
        self.ds = RandomTileDataset(self.files,
                                    label_fn=self.label_fn,
                                    create_weights=False,
                                    **self.mw_kwargs,
                                    **self.ds_kwargs)
        self.in_channels = self.ds.get_data(max_n=1)[0].shape[-1]
        self.df_val, self.df_ens, self.df_model, self.ood = None, None, None, None
Example #12
 def __init__(self, coll, idxs=None, cache=None, tfm=noop):
     if idxs is None: idxs = L.range(coll)
     self.coll = coll
     self.idxs = idxs
     self.tfm = tfm
     if cache is not None:
         self._get = functools.lru_cache(maxsize=cache)(self._get)
Example #13
 def reset_changes(self, b):
     self.update_message = False
     dd_children = L(self.iw.widget.children).itemgot(1)
     for dd_child in dd_children:
         dd_child.value = '<Keep>'
     self.check_pending_changes()
     self.update_message = True
Example #14
def extract_level(spectrum_blocks: L, dtype=np.float32) -> pd.DataFrame:
    """Receives a mapping `spectrum_blocks` and returns the Matrix with the Levels as values, Frequencies as columns and Block Number as index.
    :param pivoted: If False, optionally returns an unpivoted version of the Matrix
    """
    assert len(spectrum_blocks), "The spectrum block list is empty"
    spectrum_blocks = spectrum_blocks.itemgot(1)
    block = spectrum_blocks[0]
    assert block.type in (63, 64, 67, 68), "The input blocks are not spectral blocks"
    rows = len(spectrum_blocks)
    min_level = block.offset - 127.5
    if block.type in (63, 67):
        cols = block.ndata
        frequencies = getattr(block, "frequencies")
        return pd.DataFrame(
            _extract_uncompressed(spectrum_blocks, rows, cols, min_level,
                                  dtype),
            columns=frequencies,
        )
    else:
        cols = block.norig
        thresh = block.thresh - 1
        block_data = [b.block_data for b in spectrum_blocks]
        frequencies = np.linspace(block.start_mega, block.stop_mega, num=cols)
        levels = cy_extract_compressed(block_data, rows, cols, thresh,
                                       min_level)
        if dtype != np.float32:
            levels = levels.astype(dtype)
        return pd.DataFrame(levels, columns=frequencies)
Example #15
 def delete_items(self):
     ds_index = self.dd_ds.index # use train(0) or valid(1)
     iwi = self.iwis[ds_index] # get filenames,targs,loss for train/valid
     items = L(iwi).itemgot(0) # get filenames for train/valid
     for_deletion = self.delete().copy()
     for idx in for_deletion:
         self.delete_dataset_item(items, iwi, idx)
Example #16
def convert_params(o:list) -> list:
    """
    Converts `o` into PyTorch-compatible param groups

    `o` should be a set of layer groups to be split in the optimizer

    Example:

    ```python
    def splitter(m): return convert_params([[m.a], [m.b]])
    ```

    Where `m` is a model defined as:

    ```python
    class RegModel(Module):
      def __init__(self): self.a,self.b = nn.Parameter(torch.randn(1)),nn.Parameter(torch.randn(1))
      def forward(self, x): return x*self.a + self.b
    ```
    """
    if not isinstance(o[0], dict):
        splitter = []
        for group in o:
            if not isinstance(group[0], nn.parameter.Parameter):
                group = L(group).map(params)
            splitter.append({'params':group})
        return splitter
    return o
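
For the `RegModel` above, a quick sketch of the resulting structure:

m = RegModel()
convert_params([[m.a], [m.b]])
# -> [{'params': [m.a]}, {'params': [m.b]}], one dict per optimizer group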
Example #17
    def __init__(self,
                 items,
                 tfms,
                 use_list=None,
                 do_setup=True,
                 split_idx=None,
                 train_setup=True,
                 splits=None,
                 types=None,
                 verbose=False,
                 dl_type=None):
        super().__init__(items, use_list=use_list)
        if dl_type is not None: self._dl_type = dl_type

        #potentially unused
        self.splits = L([slice(None), []] if splits is None else splits).map(mask2idxs)
        if isinstance(tfms, TfmdListsX): tfms = tfms.tfms
        if isinstance(tfms, PipelineX): do_setup = False

        # This is relevant, equivalent to PipelineX
        self.tfms = PipelineX(tfms, split_idx=split_idx)

        store_attr('types,split_idx')
        if do_setup:
            pv(f"Setting up {self.tfms}", verbose)
            self.setup(train_setup=train_setup)
Example #18
def pre_xla_inference(self: Learner):
    ctrl_args = {}
    progress_removed = False
    if 'progress' in L(self.cbs).attrgot('name'):
        self.remove_cbs(ProgressCallback)
        progress_removed = True
    ctrl_args['use_progress'] = progress_removed
    return ctrl_args
Example #19
def add_master_cb(self: Learner, cb):
    "add a master callback"
    if not hasattr(self, '_master_cbs'):
        self._master_cbs = L()
    if isinstance(cb, type): cb = cb()
    #     cb.learn = self
    #     setattr(self, cb.name, cb)
    self._master_cbs.append(cb)
Example #20
def round_multiple(x, mult, round_down=False):
    "Round `x` to nearest multiple of `mult`"

    def _f(x_):
        return (int if round_down else round)(x_ / mult) * mult

    res = L(x).map(_f)
    return res if is_listy(x) else res[0]
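
A sketch of the behaviour (outputs shown in comments):

round_multiple(63, 32)                   # -> 64, the nearest multiple
round_multiple(63, 32, round_down=True)  # -> 32, truncating instead of rounding
round_multiple([50, 70], 32)             # -> [64, 64], mapped over listy input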
Example #21
def fit_flat_cos(self:Learner, n_epoch, lr=None, div_final=1e5, pct_start=0.75, wd=None,
                 cbs=None, reset_opt=False):
    "Fit `self.model` for `n_epoch` at flat `lr` before a cosine annealing."
    if self.opt is None: self.create_opt()
    self.opt.set_hyper('lr', self.lr if lr is None else lr)
    lr = np.array([h['lr'] for h in self.opt.hypers])
    scheds = {'lr': combined_cos(pct_start, lr, lr, lr/div_final)}
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
Example #22
 def after_fit(self):
     tlogger = make_notifier()
     tlogger.info(
         "{}: Ende des Trainings nach {} Epochen mit Loss {}".format(
             self.model_name,
             self.epoch + 1,
             L(self.recorder.values[0:]).itemgot(1)[-1],
         ))
Example #23
 def reclassify_items(self):
     self.update_message = False
     ds_index = self.dd_ds.index # use train(0) or valid(1)
     iwi = self.iwis[ds_index] # get filenames,targs,loss for train/valid
     items = L(iwi).itemgot(0) # get filenames for train/valid
     for idx,new_cat in self.change():
         if new_cat != self.dd_cats.value: # new_cat is not equal to existing cat
             self.reclassify_item(items, iwi, idx, new_cat)
Example #24
def fit_one_cycle(self:Learner, n_epoch, lr_max=None, div=25., div_final=1e5, pct_start=0.25, wd=None,
                  moms=None, cbs=None, reset_opt=False):
    "Fit `self.model` for `n_epoch` using the 1cycle policy."
    if self.opt is None: self.create_opt()
    self.opt.set_hyper('lr', self.lr if lr_max is None else lr_max)
    lr_max = np.array([h['lr'] for h in self.opt.hypers])
    scheds = {'lr': combined_cos(pct_start, lr_max/div, lr_max, lr_max/div_final),
              'mom': combined_cos(pct_start, *(self.moms if moms is None else moms))}
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
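
With the defaults the learning rate climbs from `lr_max/25` to `lr_max` over the first 25% of training, then anneals to `lr_max/1e5`, while momentum follows the inverse path. A hypothetical call on an existing learner:

learn.fit_one_cycle(5, lr_max=1e-3)  # 5 epochs of 1cycle, peak lr 1e-3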
Example #25
 def set_hyper(self, k, v):
     "Set the value(s) in `v` for hyper-paramter `k`"
     if isinstance(v, slice):
         if v.start: v = even_mults(v.start, v.stop, len(self.param_lists))
         else: v = [v.stop/10]*(len(self.param_lists)-1) + [v.stop]
     v = L(v, use_list=None)
     if len(v)==1: v = v*len(self.param_lists)
     assert len(v) == len(self.hypers), f"Trying to set {len(v)} values for {k} but there are {len(self.param_lists)} parameter groups."
     self._set_hyper(k, v)
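
A sketch of the slice semantics on a hypothetical 3-group optimizer:

opt.set_hyper('lr', slice(1e-5, 1e-3))  # -> [1e-5, 1e-4, 1e-3] via even_mults
opt.set_hyper('lr', slice(1e-3))        # -> [1e-4, 1e-4, 1e-3], stop/10 for all but last
opt.set_hyper('lr', 1e-2)               # -> [1e-2, 1e-2, 1e-2], scalars are broadcast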
Example #26
 def after_epoch(self):
     if (self.epoch + 1) % 10 == 0:
         tlogger = make_notifier()
         tlogger.info("{}: Epoche {}/{} mit Loss {}".format(
             self.model_name,
             self.epoch + 1,
             self.n_epoch,
             L(self.recorder.values[0:]).itemgot(1)[-1],
         ))
Example #27
 def __init__(self,
              ds_idx=1,
              dl=None,
              with_input=False,
              with_decoded=False,
              cbs=None,
              reorder=True):
     self.cbs = L(cbs)
     store_attr('ds_idx,dl,with_input,with_decoded,reorder')
Example #28
def apply_sentence_piecer(df,
                          text_col: str = 'text',
                          vocab_sz: int = 200,
                          print_len: int = 40):
    "Apply SentencePieceTokenizer"

    txt = L(df[text_col].tolist())
    sp = SentencePieceTokenizer(vocab_sz)
    sp.setup(txt)
    return next(iter(sp(txt[:print_len])))
Example #29
def fit_sgdr(self:Learner, n_cycles, cycle_len, lr_max=None, cycle_mult=2, cbs=None, reset_opt=False, wd=None):
    "Fit `self.model` for `n_cycles` of `cycle_len` using SGDR."
    if self.opt is None: self.create_opt()
    self.opt.set_hyper('lr', self.lr if lr_max is None else lr_max)
    lr_max = np.array([h['lr'] for h in self.opt.hypers])
    n_epoch = cycle_len * (cycle_mult**n_cycles-1)//(cycle_mult-1)
    pcts = [cycle_len * cycle_mult**i / n_epoch for i in range(n_cycles)]
    scheds = [SchedCos(lr_max, 0) for _ in range(n_cycles)]
    scheds = {'lr': combine_scheds(pcts, scheds)}
    self.fit(n_epoch, cbs=ParamScheduler(scheds)+L(cbs), reset_opt=reset_opt, wd=wd)
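
A worked example of the schedule arithmetic, with hypothetical values `n_cycles=3, cycle_len=1, cycle_mult=2`:

# n_epoch = 1 * (2**3 - 1) // (2 - 1) = 7 total epochs
# pcts    = [1/7, 2/7, 4/7], i.e. cosine cycles of 1, 2 and 4 epochs
learn.fit_sgdr(3, 1, lr_max=1e-3)  # assumes an existing `learn`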
Example #30
 def after_validate(self):
     to_rm = L(cb for cb in self.learn.cbs
               if getattr(cb, 'remove_on_fetch', False))
     with self.learn.removed_cbs(to_rm + self.cbs) as learn:
         self.preds = learn.get_preds(ds_idx=self.ds_idx,
                                      dl=self.dl,
                                      with_input=self.with_input,
                                      with_decoded=self.with_decoded,
                                      inner=True,
                                      reorder=self.reorder)