def _bounded_data2param(self, data, lb=0., ub=1.):
    data = enforce_float_tensor(data)
    if lb is None and ub is None:  # Unbounded
        return data

    # Check for None before converting, so a missing bound is never
    # passed to npt.tensor().
    lb = -np.inf if lb is None else npt.tensor(lb, min_ndim=0)
    ub = np.inf if ub is None else npt.tensor(ub, min_ndim=0)

    # Keep data strictly inside the bounds before transforming.
    data = torch.clamp(
        data,
        min=float(lb + self.epsilon),
        max=float(ub - self.epsilon)
    )
    if lb == -np.inf:  # Upper-bounded only
        # data[data > ub - self.epsilon] = ub - self.epsilon
        return torch.log(ub - data)
    elif ub == np.inf:  # Lower-bounded only
        # data[data < lb + self.epsilon] = lb + self.epsilon
        return torch.log(data - lb)
    else:  # Bounded on both sides: logit of the rescaled data
        # data[data < lb + self.epsilon] = lb + self.epsilon
        # data[data > ub - self.epsilon] = ub - self.epsilon
        p = (data - lb) / (ub - lb)
        return torch.log(p) - torch.log(1. - p)
def register_bounded_parameter(self, name, data, lb=0., ub=1.):
    lb = npt.tensor(lb, min_ndim=0)
    ub = npt.tensor(ub, min_ndim=0)
    data = enforce_float_tensor(data)
    self._params_bounded[name] = {'lb': lb, 'ub': ub}
    param = self._bounded_data2param(data, lb, ub)
    self.register_parameter(
        '_bounded_' + name,
        nn.Parameter(param, requires_grad=True))
    self.params_bounded.__dict__[name] = None  # just a reminder
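# --- Usage sketch (not part of the library) -----------------------------
# A minimal, hypothetical example of registering a bounded parameter on
# a BoundedModule subclass (`ExampleModel` and the name 'p' are made up):
#
#     class ExampleModel(BoundedModule):
#         def __init__(self):
#             super().__init__()
#             # Stored internally as the unconstrained nn.Parameter
#             # '_bounded_p'; the bounded value is recovered with
#             # _bounded_param2data().
#             self.register_bounded_parameter(
#                 'p', torch.zeros(3) + 0.5, lb=0., ub=1.)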
def __init__(self, data, lb=0., ub=1., skip_loading_lbub=False,
             requires_grad=True, **kwargs):
    super().__init__(**kwargs)
    self.lb = npt.tensor(lb)
    self.ub = npt.tensor(ub)
    self.skip_loading_lbub = skip_loading_lbub
    self._param = nn.Parameter(self.data2param(data),
                               requires_grad=requires_grad)
    if self._param.ndim == 0:
        # Warn rather than raise: a 0-dim parameter still works,
        # it is just less convenient to index.
        warnings.warn('Use ndim>0 to allow consistent use of [:]. '
                      'If ndim=0, use paramname.v to access the '
                      'value.')
def _bounded_param2data(self, param, lb=0., ub=1.):
    param = enforce_float_tensor(param)
    # Check for None before converting, so a missing bound is never
    # passed to npt.tensor().
    if lb is None and ub is None:  # Unbounded
        return param
    elif lb is None:  # Upper-bounded only: ub - exp(param) < ub
        return npt.tensor(ub, min_ndim=0) - torch.exp(param)
    elif ub is None:  # Lower-bounded only: lb + exp(param) > lb
        return npt.tensor(lb, min_ndim=0) + torch.exp(param)
    else:  # Bounded on both sides: sigmoid(param) rescaled to (lb, ub)
        lb = npt.tensor(lb, min_ndim=0)
        ub = npt.tensor(ub, min_ndim=0)
        return (1. / (1. + torch.exp(-param))) * (ub - lb) + lb
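# --- Worked example (hypothetical values) --------------------------------
# _bounded_data2param and _bounded_param2data are inverses (up to the
# epsilon-clamping). For lb=0, ub=1 the forward map is the logit,
#     param = log(p) - log(1 - p),  p = (data - lb) / (ub - lb),
# and the backward map is the sigmoid rescaled to (lb, ub):
#     data = sigmoid(param) * (ub - lb) + lb
# e.g., data=0.5 -> param=0.0 -> data=0.5;
#       data=0.9 -> param=log(9) ~= 2.197 -> data=0.9.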
def get_named_bounded_params(
        self,
        named_bounded_params: Dict[str, BoundedParameter] = None,
        exclude: Iterable[str] = ()
) -> (Iterable[str], np.ndarray, np.ndarray, np.ndarray, np.ndarray,
      np.ndarray):
    """
    :param named_bounded_params:
    :param exclude:
    :return: names, v, grad, lb, ub, requires_grad
    """
    if named_bounded_params is None:
        d = odict([
            (k, v) for k, v in self.named_modules()
            if (isinstance(v, OverriddenParameter)  # BoundedParameter
                and k not in exclude)
        ])
    else:
        d = named_bounded_params

    names = []
    v = []
    lb = []
    ub = []
    grad = []
    requires_grad = []
    for name, param in d.items():
        v0 = param.v.flatten()
        if param._param.grad is None:
            g0 = torch.zeros_like(v0)
        else:
            g0 = param._param.grad.flatten()
        l0 = npt.tensor(param.lb).expand_as(param.v).flatten()
        u0 = npt.tensor(param.ub).expand_as(param.v).flatten()
        for i, (v1, g1, l1, u1) in enumerate(zip(v0, g0, l0, u0)):
            v.append(npy(v1))
            grad.append(npy(g1))
            lb.append(npy(l1))
            ub.append(npy(u1))
            requires_grad.append(npy(param._param.requires_grad))
            if v0.numel() > 1:
                names.append(name + '%d' % i)
            else:
                names.append(name)
    v = np.stack(v)
    lb = np.stack(lb)
    ub = np.stack(ub)
    grad = -np.stack(grad)  # minimizing; so take negative
    requires_grad = np.stack(requires_grad)
    return names, v, grad, lb, ub, requires_grad
def data2param(self, data) -> torch.Tensor:
    lb = self.lb
    ub = self.ub
    data = enforce_float_tensor(data)

    if lb is None and ub is None:  # Unbounded
        return data
    elif lb is None:  # Upper-bounded only
        data[data > ub - self.epsilon] = ub - self.epsilon
        return torch.log(ub - data)
    elif ub is None:  # Lower-bounded only
        data[data < lb + self.epsilon] = lb + self.epsilon
        return torch.log(data - lb)
    elif npt.tensor(lb == ub).all():  # Degenerate: fixed at lb == ub
        return torch.zeros_like(data)
    else:  # Bounded on both sides
        too_small = data < lb + self.epsilon
        try:
            data[too_small] = lb + self.epsilon
        except RuntimeError:
            # lb is elementwise: assign only the offending entries
            data[too_small] = (lb + self.epsilon)[too_small]
        too_big = data > ub - self.epsilon
        try:
            data[too_big] = ub - self.epsilon
        except RuntimeError:
            data[too_big] = (ub - self.epsilon)[too_big]
        p = (data - lb) / (ub - lb)
        return torch.log(p) - torch.log(1. - p)
def _load_from_state_dict(
        self, state_dict, prefix, local_metadata, strict,
        missing_keys, unexpected_keys, error_msgs):
    lb_name = prefix + '_lb'
    ub_name = prefix + '_ub'
    param_name = prefix + '_param'
    data_name = prefix + '_data'

    if self.skip_loading_lbub:
        # Keep the current bounds; discard any stored ones.
        if lb_name in state_dict:
            state_dict.pop(lb_name)
        if ub_name in state_dict:
            state_dict.pop(ub_name)
    else:
        if lb_name in state_dict:
            self.lb = npt.tensor(state_dict.pop(lb_name))
        if ub_name in state_dict:
            self.ub = npt.tensor(state_dict.pop(ub_name))

    if data_name in state_dict:
        # A checkpoint may store the bounded data under '_data';
        # convert it to the unconstrained parameterization first.
        state_dict[param_name] = self.data2param(
            state_dict.pop(data_name).detach().clone())
    return super()._load_from_state_dict(
        state_dict, prefix, local_metadata, strict,
        missing_keys, unexpected_keys, error_msgs)
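# --- Behavior note (derived from the code above) -------------------------
# With skip_loading_lbub=True, a checkpoint saved under different bounds
# keeps this module's current lb/ub and only the parameter values load.
# With skip_loading_lbub=False, lb/ub are restored from the checkpoint
# before any '_data' entry is converted, so data2param() uses the
# loaded bounds.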
def param2data(self, param):
    lb = self.lb
    ub = self.ub
    param = enforce_float_tensor(param)

    if lb is None and ub is None:  # Unbounded
        return param
    elif lb is None:
        return torch.tensor(ub) - torch.exp(param)
    elif ub is None:
        return lb + torch.exp(param)
    elif npt.tensor(lb == ub).all():
        return torch.zeros_like(param) + lb
    else:
        return (1 / (1 + torch.exp(-param))) * (ub - lb) + lb  # noqa
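# --- Usage sketch (not part of the library) -------------------------------
# Assuming this class is the BoundedParameter referenced in
# get_named_bounded_params, a bounded parameter round-trips through
# data2param/param2data:
#
#     bp = BoundedParameter(torch.tensor([0.2, 0.8]), lb=0., ub=1.)
#     bp._param   # unconstrained nn.Parameter seen by the optimizer
#     bp.v        # bounded value, i.e., param2data(bp._param)
#
# `.v` follows the warning message in __init__; the exact accessor on
# OverriddenParameter may differ.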
def dat2p_dat(
        self, ch_tr_dim: np.ndarray, dur_tr: np.ndarray,
        ev_tr_dim: np.ndarray
) -> (torch.Tensor, torch.Tensor, np.ndarray, np.ndarray,
      torch.Tensor, torch.Tensor):
    """
    :param ch_tr_dim: [tr, dim]
    :param dur_tr: [tr]
    :param ev_tr_dim: [tr, dim]
    :return: n_cond_dur_ch[cond, dur, ch],
        ev_cond_fr_dim_meanvar[dcond, fr, dim, (mean, var)],
        ev_cond_dim[dcond, dim], dcond_tr[tr], durs[dur], ddur_tr[tr]
    """
    nt0 = self.nt0
    dt0 = self.dt0
    n_ch_flat = self.n_ch
    subsample_factor = self.subsample_factor
    nt = int(nt0 // subsample_factor)

    durs, ddur_tr = np.unique(dur_tr, return_inverse=True)
    ddur_tr = ddur_tr.astype(int)  # np.int is removed in NumPy >= 1.24
    n_dur = len(durs)
    durs = torch.tensor(durs)
    ddur_tr = torch.tensor(ddur_tr, dtype=torch.long)

    ch_tr_flat = consts.ch_by_dim2ch_flat(ch_tr_dim)
    ev_cond_dim, dcond_tr = np.unique(
        ev_tr_dim, return_inverse=True, axis=0)
    n_cond_flat = len(ev_cond_dim)

    ev_cond_fr_dim = torch.tensor(ev_cond_dim)[:, None, :].expand(
        [-1, nt, -1])
    ev_cond_fr_dim_meanvar = torch.stack(
        [ev_cond_fr_dim, torch.zeros_like(ev_cond_fr_dim)], -1)

    # Count trials in each (condition, duration, choice) cell.
    n_cond_dur_ch = npt.tensor(
        npg.aggregate(
            np.stack([dcond_tr, npy(ddur_tr), ch_tr_flat]),
            1., 'sum', [n_cond_flat, n_dur, n_ch_flat]))
    return (n_cond_dur_ch, ev_cond_fr_dim_meanvar, ev_cond_dim,
            dcond_tr, durs, ddur_tr)
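# --- Worked example (hypothetical values) ---------------------------------
# npg.aggregate with a stacked [3, n_tr] group index builds a 3-D
# histogram, counting trials per (condition, duration, choice) cell:
#
#     import numpy as np
#     import numpy_groupies as npg
#     dcond_tr = np.array([0, 0, 1])
#     ddur_tr = np.array([0, 1, 1])
#     ch_tr = np.array([1, 1, 0])
#     counts = npg.aggregate(np.stack([dcond_tr, ddur_tr, ch_tr]),
#                            1., 'sum', [2, 2, 2])
#     # counts[cond, dur, ch]: cells (0, 0, 1), (0, 1, 1), and
#     # (1, 1, 0) each equal 1; all other cells are 0.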
def optimize(
        model: ModelType,
        fun_data: FunDataType,
        fun_loss: FunLossType,
        plotfuns: PlotFunsType,
        optimizer_kind='Adam',
        max_epoch=100,
        patience=20,  # How many epochs to wait before quitting
        thres_patience=0.001,  # How much it should improve within patience
        learning_rate=.5,
        reduce_lr_by=0.5,
        reduced_lr_on_epoch=0,
        reduce_lr_after=50,
        reset_lr_after=100,
        to_plot_progress=True,
        show_progress_every=5,  # number of epochs
        to_print_grad=True,
        n_fold_valid=1,
        epoch_to_check=None,  # CHECKED
        comment='',
        **kwargs  # to ignore unnecessary kwargs
) -> (float, dict, dict, List[float], List[float]):
    """
    :param model:
    :param fun_data: (mode='all'|'train'|'valid'|'train_valid'|'test',
        fold_valid=0, epoch=0, n_fold_valid=1) -> (data, target)
    :param fun_loss: (out, target) -> loss
    :param plotfuns: [(str, fun)] where fun takes dict d with keys
        'data_*', 'target_*', 'out_*', 'loss_*',
        where * = 'train', 'valid', etc.
    :param optimizer_kind:
    :param max_epoch:
    :param patience:
    :param thres_patience:
    :param learning_rate:
    :param reduce_lr_by:
    :param reduced_lr_on_epoch:
    :param reduce_lr_after:
    :param reset_lr_after:
    :param to_plot_progress:
    :param show_progress_every:
    :param to_print_grad:
    :param n_fold_valid:
    :param epoch_to_check:
    :param comment:
    :param kwargs:
    :return: loss_test, best_state, d, losses_train, losses_valid
        where d contains 'data_*', 'target_*', 'out_*', and 'loss_*',
        where * is 'train_valid', 'test', and 'all'.
    """
    def get_optimizer(model, lr):
        if optimizer_kind == 'SGD':
            return optim.SGD(model.parameters(), lr=lr)
        elif optimizer_kind == 'Adam':
            return optim.Adam(model.parameters(), lr=lr)
        elif optimizer_kind == 'LBFGS':
            return optim.LBFGS(model.parameters(), lr=lr)
        else:
            raise NotImplementedError()

    learning_rate0 = learning_rate
    optimizer = get_optimizer(model, learning_rate)

    best_loss_epoch = 0
    best_loss_valid = np.inf
    best_state = model.state_dict()
    best_losses = []

    # CHECKED storing and loading states
    state0 = None
    loss0 = None
    data0 = None
    target0 = None
    out0 = None
    outs0 = None

    def array2str(v):
        return ', '.join(['%1.2g' % v1 for v1 in v.flatten()[:10]])

    def print_targ_out(target0, out0, outs0, loss0):
        print('target:\n' + array2str(target0))
        print('outs:\n' + '\n'.join(
            ['[%s]' % array2str(v) for v in outs0]))
        print('out:\n' + array2str(out0))
        print('loss: ' + '%g' % loss0)

    def fun_outs(model, data):
        # Intermediate outputs (before/after lapse) used by the
        # epoch_to_check consistency checks below.
        p_bef_lapse0 = model.dtb(*data)[0].detach().clone()
        p_aft_lapse0 = model.lapse(p_bef_lapse0).detach().clone()
        return [p_bef_lapse0, p_aft_lapse0]

    def are_all_equal(outs, outs0):
        for i, (out1, out0) in enumerate(zip(outs, outs0)):
            if (out1 != out0).any():
                warnings.warn(
                    'output %d different!\nmax diff = %g'
                    % (i, (out1 - out0).abs().max()))
                print('--')
    # losses_train[epoch] = average cross-validated loss for the epoch
    losses_train = []
    losses_valid = []
    if to_plot_progress:
        writer = SummaryWriter(comment=comment)
    t_st = time.time()
    epoch = 0
    try:
        for epoch in range(max([max_epoch, 1])):
            losses_fold_train = []
            losses_fold_valid = []
            for i_fold in range(n_fold_valid):
                # NOTE: Core part
                data_train, target_train = fun_data(
                    'train', i_fold, epoch, n_fold_valid)
                model.train()
                if optimizer_kind == 'LBFGS':
                    def closure():
                        optimizer.zero_grad()
                        out_train = model(data_train)
                        loss = fun_loss(out_train, target_train)
                        loss.backward()
                        return loss
                    if max_epoch > 0:
                        optimizer.step(closure)
                    out_train = model(data_train)
                    loss_train1 = fun_loss(out_train, target_train)
                    raise NotImplementedError(
                        'Restoring best state is not implemented yet')
                else:
                    optimizer.zero_grad()
                    out_train = model(data_train)
                    loss_train1 = fun_loss(out_train, target_train)
                    # DEBUGGED: optimizer.step() must not be taken
                    # before storing best_loss or best_state
                losses_fold_train.append(loss_train1)

                if n_fold_valid == 1:
                    out_valid = npt.tensor(npy(out_train))
                    loss_valid1 = npt.tensor(npy(loss_train1))
                    data_valid = data_train
                    target_valid = target_train
                    # DEBUGGED: Unless directly assigned, target_valid
                    # != target_train when n_fold_valid = 1, which
                    # doesn't make sense. Suggests a bug in fun_data
                    # when n_fold = 1
                else:
                    model.eval()
                    data_valid, target_valid = fun_data(
                        'valid', i_fold, epoch, n_fold_valid)
                    out_valid = model(data_valid)
                    loss_valid1 = fun_loss(out_valid, target_valid)
                    model.train()
                losses_fold_valid.append(loss_valid1)

            loss_train = torch.mean(torch.stack(losses_fold_train))
            loss_valid = torch.mean(torch.stack(losses_fold_valid))
            losses_train.append(npy(loss_train))
            losses_valid.append(npy(loss_valid))

            if to_plot_progress:
                writer.add_scalar('loss_train', loss_train,
                                  global_step=epoch)
                writer.add_scalar('loss_valid', loss_valid,
                                  global_step=epoch)

            # --- Store best loss
            # NOTE: storing losses/states must happen BEFORE taking a step!
            if loss_valid < best_loss_valid:
                # is_best = True
                best_loss_epoch = deepcopy(epoch)
                best_loss_valid = npt.tensor(npy(loss_valid))
                best_state = model.state_dict()
            # Appended every epoch, so best_losses[-patience] is the
            # best loss as of `patience` epochs ago.
            best_losses.append(best_loss_valid)

            # CHECKED storing and loading state
            if epoch == epoch_to_check:
                loss0 = loss_valid.detach().clone()
                state0 = model.state_dict()
                data0 = deepcopy(data_valid)
                target0 = deepcopy(target_valid)
                out0 = out_valid.detach().clone()
                outs0 = fun_outs(model, data0)
                loss001 = fun_loss(out0, target0)
                # CHECKED: loss001 must equal loss0
                print('loss001 - loss0: %g' % (loss001 - loss0))
                print_targ_out(target0, out0, outs0, loss0)
                print('--')

            def print_loss():
                t_el = time.time() - t_st
                print('%1.0f sec/%d epochs = %1.1f sec/epoch, Ltrain: %f, '
                      'Lvalid: %f, LR: %g, best: %f, epochB: %d'
                      % (t_el, epoch + 1, t_el / (epoch + 1),
                         loss_train, loss_valid, learning_rate,
                         best_loss_valid, best_loss_epoch))

            if epoch % show_progress_every == 0:
                model.train()
                data_train_valid, target_train_valid = fun_data(
                    'train_valid', i_fold, epoch, n_fold_valid)
                out_train_valid = model(data_train_valid)
                loss_train_valid = fun_loss(out_train_valid,
                                            target_train_valid)
                print_loss()
                if to_plot_progress:
                    d = {
                        'data_train': data_train,
                        'data_valid': data_valid,
                        'data_train_valid': data_train_valid,
                        'out_train': out_train.detach(),
                        'out_valid': out_valid.detach(),
                        'out_train_valid': out_train_valid.detach(),
                        'target_train': target_train.detach(),
                        'target_valid': target_valid.detach(),
                        'target_train_valid': target_train_valid.detach(),
                        'loss_train': loss_train.detach(),
                        'loss_valid': loss_valid.detach(),
                        'loss_train_valid': loss_train_valid.detach(),
                    }
                    for k, f in odict(plotfuns).items():
                        fig, d = f(model, d)
                        if fig is not None:
                            writer.add_figure(k, fig, global_step=epoch)

            # --- Learning rate reduction and patience
            # if epoch == reduced_lr_on_epoch + reset_lr_after
            # if epoch == reduced_lr_on_epoch + reduce_lr_after and (
            #         best_loss_valid
            #         > best_losses[-reduce_lr_after] - thres_patience
            # ):
            if epoch > 0 and epoch % reset_lr_after == 0:
                learning_rate = learning_rate0
            elif epoch > 0 and epoch % reduce_lr_after == 0:
                learning_rate *= reduce_lr_by
                optimizer = get_optimizer(model, learning_rate)
                reduced_lr_on_epoch = epoch

            if epoch >= patience and (
                    best_loss_valid
                    > best_losses[-patience] - thres_patience
            ):
                print('Ran out of patience!')
                if to_print_grad:
                    print_grad(model)
                break

            # --- Take a step
            if optimizer_kind != 'LBFGS':
                # steps are not taken above for n_fold_valid == 1, so
                # take a step here, after storing the best state
                loss_train.backward()
                if to_print_grad and epoch == 0:
                    print_grad(model)
                if max_epoch > 0:
                    optimizer.step()
    except Exception as ex:
        from lib.pylabyk.cacheutil import is_keyboard_interrupt
        if not is_keyboard_interrupt(ex):
            raise ex
        print('fit interrupted by user at epoch %d' % epoch)

        from lib.pylabyk.localfile import LocalFile, datetime4filename
        localfile = LocalFile()
        cache = localfile.get_cache('model_data_target')
        data_train_valid, target_train_valid = fun_data(
            'all', 0, 0, n_fold_valid)
        cache.set({
            'model': model,
            'data_train_valid': data_train_valid,
            'target_train_valid': target_train_valid
        })
        cache.save()

    print_loss()
    if to_plot_progress:
        writer.close()

    if epoch_to_check is not None:
        # Must print the same output as the previous call to print_targ_out
        print_targ_out(target0, out0, outs0, loss0)
        model.load_state_dict(state0)
        state1 = model.state_dict()
        for (key0, param0), (key1, param1) in zip(
                state0.items(), state1.items()
        ):  # type: ((str, torch.Tensor), (str, torch.Tensor))
            if (param0 != param1).any():
                with torch.no_grad():
                    warnings.warn(
                        'Strange! loaded %s = %s\n'
                        '!= stored %s = %s\n'
                        'loaded - stored = %s'
                        % (key1, param1, key0, param0, param1 - param0))

        data, target = fun_data('valid', 0, epoch_to_check, n_fold_valid)
        if not torch.is_tensor(data):
            p_unequal = torch.tensor([
                (v1 != v0).double().mean()
                for v1, v0 in zip(data, data0)
            ])
            if (p_unequal > 0).any():
                print('Strange! loaded data != stored data0\n'
                      'Proportion: %s' % p_unequal)
            else:
                print('All loaded data == stored data')
        elif (data != data0).any():
            print('Strange! loaded data != stored data0')
        else:
            print('All loaded data == stored data')

        if (target != target0).any():
            print('Strange! loaded target != stored target0')
        else:
            print('All loaded target == stored target')
        print_targ_out(target0, out0, outs0, loss0)

        # with torch.no_grad():
        #     out01 = model(data0)
        #     loss01 = fun_loss(out01, target0)
        model.train()
        # with torch.no_grad():  # CHECKED
        #     outs1 = fun_outs(model, data)
        #     are_all_equal(outs1, outs0)
        out1 = model(data)
        if (out0 != out1).any():
            warnings.warn(
                'Strange! out from loaded params != stored out\n'
                'Max abs(loaded - stored): %g'
                % (out1 - out0).abs().max())
            print('--')
        else:
            print('out from loaded params = stored out')

        loss01 = fun_loss(out0, target0)
        print_targ_out(target0, out0, outs0, loss01)
        if loss0 != loss01:
            warnings.warn(
                'Strange! loss01 = %g simply computed again with out0, '
                'target0\n'
                '!= stored loss0 = %g\n'
                'loaded - stored: %g\n'
                'Therefore, fun_loss, out0, or target0 has changed!'
                % (loss01, loss0, loss01 - loss0))
            print('--')
        else:
            print('loss0 == loss01, simply computed again with out0, '
                  'target0')

        loss1 = fun_loss(out1, target)
        if loss0 != loss1:
            warnings.warn(
                'Strange! loss1 = %g from loaded params\n'
                '!= stored loss0 = %g\n'
                'loaded - stored: %g'
                % (loss1, loss0, loss1 - loss0))
            print('--')
        else:
            print('loss1 = %g = loss0 = %g' % (loss1, loss0))

        loss10 = fun_loss(out1, target0)
        if loss0 != loss10:
            warnings.warn(
                'Strange! loss10 = %g from loaded params and stored '
                'target0\n'
                '!= stored loss0 = %g\n'
                'loaded - stored: %g'
                % (loss10, loss0, loss10 - loss0))
            print('--')
        else:
            print('loss10 = %g = loss0 = %g' % (loss10, loss0))
        print('--')

    model.load_state_dict(best_state)
    d = {}
    for mode in ['train_valid', 'valid', 'test', 'all']:
        data, target = fun_data(mode, 0, 0, n_fold_valid)
        out = model(data)
        loss = fun_loss(out, target)
        d.update({
            'data_' + mode: data,
            'target_' + mode: target,
            'out_' + mode: npt.tensor(npy(out)),
            'loss_' + mode: npt.tensor(npy(loss)),
        })
    if d['loss_valid'] != best_loss_valid:
        print('d[loss_valid] = %g from loaded best_state\n'
              '!= best_loss_valid = %g\n'
              'd[loss_valid] - best_loss_valid = %g'
              % (d['loss_valid'], best_loss_valid,
                 d['loss_valid'] - best_loss_valid))
        print('--')

    if isinstance(model, OverriddenParameter):
        print(model.__str__())
    elif isinstance(model, BoundedModule):
        pprint(model._parameters_incl_bounded)
    else:
        pprint(model.state_dict())

    return d['loss_test'], best_state, d, losses_train, losses_valid
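# --- Usage sketch (not part of the library) --------------------------------
# A minimal, hypothetical call to optimize(); `ExampleModel`, `fun_data`,
# and the loss below are stand-ins for the caller's own model, data
# splitter, and loss function.
#
# def fun_data(mode='all', fold_valid=0, epoch=0, n_fold_valid=1):
#     # Return (data, target) for the requested split.
#     ...
#
# def fun_loss(out, target):
#     # e.g., a negative log likelihood over predicted probabilities
#     return -(target * torch.log(out)).sum()
#
# loss_test, best_state, d, losses_train, losses_valid = optimize(
#     model=ExampleModel(),
#     fun_data=fun_data,
#     fun_loss=fun_loss,
#     plotfuns={},
#     optimizer_kind='Adam',
#     max_epoch=100,
#     n_fold_valid=1,
# )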