def __init__(self, batch_size=256, seed=-1, shuffle_level=2,
             allow_rollback=True, labels=None, log_path=None, verbose=3):
    super(MainLoop, self).__init__()
    self._labels = labels
    self._main_task = None
    self._task = []
    self._subtask = []
    self._evaltask = []
    self._task_when = {}  # mapping from `Task` to `Timer`
    self._task_freq = {}  # mapping from `Task` to `Timer`
    self._allow_rollback = bool(allow_rollback)
    self._verbose = int(verbose)
    # create default RNG (no randomization)
    self._rng = struct()
    self._rng.randint = lambda *args, **kwargs: None
    # set batch
    self.set_batch(batch_size=batch_size, seed=seed,
                   shuffle_level=shuffle_level)
    self._callback = CallbackList()
    # ====== for the checkpoint ====== #
    self._save_path = None
    self._save_obj = None
    self._save_variables = []
    self._best_object = None
    self._save_history = True
    # ====== maximum stored checkpoint ====== #
    self._checkpoint_increasing = True
    self._checkpoint_max = -1
    self._current_checkpoint_count = 0
def __init__(self, batch_size=256, seed=-1, shuffle_level=0,
             allow_rollback=True, labels=None, log_path=None, verbose=3):
    super(MainLoop, self).__init__()
    self._labels = labels
    self._main_task = None
    self._task = []
    self._subtask = []
    self._evaltask = []
    self._task_when = {}  # mapping from `Task` to `Timer`
    self._task_freq = {}  # mapping from `Task` to `Timer`
    self._allow_rollback = bool(allow_rollback)
    self._verbose = int(verbose)
    # create default RNG (no randomization)
    self._rng = struct()
    self._rng.randint = lambda *args, **kwargs: None
    # set batch
    self.set_batch(batch_size=batch_size, seed=seed,
                   shuffle_level=shuffle_level)
    self._callback = CallbackList()
    # ====== for the checkpoint ====== #
    self._save_path = None
    self._save_obj = None
    self._save_variables = []
    self._best_object = None
    self._save_history = True
    # ====== maximum stored checkpoint ====== #
    self._checkpoint_increasing = True
    self._checkpoint_max = -1
    self._current_checkpoint_count = 0
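# A minimal usage sketch for the constructor above (seed and batch values are
# illustrative only):
#
#   loop = MainLoop(batch_size=256, seed=12082518, shuffle_level=2,
#                   allow_rollback=True, verbose=3)
#   loop.set_batch(batch_size=128)  # batch settings can be re-tuned later
#
# With the default seed=-1 the stub RNG (whose `randint` returns None) is
# kept, so no randomization happens until a seed >= 0 is provided.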
def set_batch(self, batch_size=None, seed=-1, shuffle_level=None):
    if batch_size is not None:
        self._batch_size = batch_size
    if seed is None or seed >= 0:
        if seed is not None:
            self._rng = np.random.RandomState(seed)
        else:
            self._rng = struct()
            self._rng.randint = lambda x: None
            self._rng.rand = get_rng().rand
    if shuffle_level is not None:
        self._shuffle_level = min(max(int(shuffle_level), 0), 2)
    return self
def set_batch(self, batch_size=None, seed=-1, shuffle_level=None):
    if batch_size is not None:
        self._batch_size = batch_size
        self._nb_samples_per_epoch = min([len(i) for i in self._data])
    if seed is None or seed >= 0:
        if seed is not None:
            self._rng = np.random.RandomState(seed)
        else:
            self._rng = struct()
            self._rng.randint = lambda x: None
            self._rng.rand = RNG_GENERATOR.rand
    if shuffle_level is not None:
        self._shuffle_level = min(max(int(shuffle_level), 0), 2)
    return self
def set_batch(self, batch_size=None, seed=-1, shuffle_level=None): """ Parameters ---------- batch_size: int size of each batch return when iterate this Data seed: None, int if None, no shuffling is performed while iterating, if < 0, do not change the current seed if >= 0, enable randomization with given seed start: int, float if int, start indicates the index of starting data points to iterate. If float, start is the percentage of data to start. end: int, float ending point of the interation shuffle_level: int 0: only shuffle the order of each batch 1: shuffle the order of batches and inside each batch as well. 2: includes level 0 and 1, and custom shuffling (strongest form) """ if batch_size is not None: self._batch_size = batch_size if seed >= 0 or seed is None: if seed is not None: self._rng = np.random.RandomState(seed) else: self._rng = struct() self._rng.randint = lambda *args, **kwargs: None if shuffle_level is not None: shuffle_level = min(max(int(shuffle_level), 0), 2) self._shuffle_level = shuffle_level # ====== set_batch for Tasks ====== # if self._task is not None: self._task.set_batch(batch_size=batch_size, seed=seed, shuffle_level=shuffle_level) for i in self._subtask.itervalues(): i.set_batch(batch_size=batch_size, seed=seed, shuffle_level=shuffle_level) for i in self._crosstask.itervalues(): i.set_batch(batch_size=batch_size, seed=seed, shuffle_level=shuffle_level) return self
def __init__(self, batch_size=256, seed=-1, shuffle_level=0):
    super(MainLoop, self).__init__()
    self._task = None
    self._subtask = {}  # run 1 epoch after given frequency
    self._crosstask = {}  # randomly run 1 iter given probability
    # create default RNG (no randomization)
    self._rng = struct()
    self._rng.randint = lambda *args, **kwargs: None
    # set batch
    self.set_batch(batch_size=batch_size, seed=seed,
                   shuffle_level=shuffle_level)
    self._callback = CallbackList()
    self._save_path = None
    self._save_hist = None
    self._save_obj = None
def set_batch(self, batch_size=None, seed=-1, shuffle_level=None): """ Parameters ---------- batch_size: int size of each batch return when iterate this Data seed: None, int if None, no shuffling is performed while iterating, if < 0, do not change the current seed if >= 0, enable randomization with given seed start: int, float if int, start indicates the index of starting data points to iterate. If float, start is the percentage of data to start. end: int, float ending point of the interation shuffle_level: int 0: only shuffle the order of each batch 1: shuffle the order of batches and inside each batch as well. 2: includes level 0 and 1, and custom shuffling (strongest form) """ if batch_size is not None: self._batch_size = batch_size if seed is None or seed >= 0: if seed is not None: self._rng = np.random.RandomState(seed) else: self._rng = struct() self._rng.randint = lambda *args, **kwargs: None if shuffle_level is not None: shuffle_level = min(max(int(shuffle_level), 0), 2) self._shuffle_level = shuffle_level # ====== set_batch for Tasks ====== # for i in self._task: i.set_batch(batch_size=batch_size, seed=seed, shuffle_level=shuffle_level) for i in self._subtask: i.set_batch(batch_size=batch_size, seed=seed, shuffle_level=shuffle_level) return self
def create_iteration():
    seed = self._seed
    self._seed = None
    if seed is not None:
        rng = np.random.RandomState(seed)
    else:
        # deterministic RandomState
        rng = struct()
        rng.randint = lambda x: None
        rng.permutation = lambda x: slice(None, None)
    # ====== easy access many private variables ====== #
    sequential = self._sequential
    start, end = self._start, self._end
    batch_size = self._batch_size
    distribution = np.asarray(self._distribution)
    # shuffle order of data (good for sequential mode)
    idx = rng.permutation(len(self._data))
    data = self._data[idx] if isinstance(idx, slice) \
        else [self._data[i] for i in idx]
    distribution = distribution[idx]
    shape = [i.shape[0] for i in data]
    # ====== prepare distribution information ====== #
    # number of samples to be traversed
    n = np.asarray([i * (_apply_approx(j, end) - _apply_approx(j, start))
                    for i, j in zip(distribution, shape)])
    n = np.round(n).astype(int)
    # normalize the distribution (based on new sample count n of each data)
    distribution = n / n.sum()
    distribution = _approximate_continuos_by_discrete(distribution)
    # somewhat heuristic: rescale distribution to get more benefit from cache
    if distribution.sum() <= len(data):
        distribution = distribution * 3
    # distribution is now the actual batch size of each data
    distribution = (batch_size * distribution).astype(int)
    assert distribution.sum() % batch_size == 0, 'wrong distribution size!'
    # predefined (start, end) pair of each batch (e.g. (0, 256), (256, 512))
    idx = list(range(0, batch_size + distribution.sum(), batch_size))
    idx = list(zip(idx, idx[1:]))
    # Dummy return to initialize everything
    yield None
    #####################################
    # 1. optimized parallel code.
    if not sequential:
        # first iterators
        it = [iter(dat.set_batch(bs, seed=rng.randint(10e8),
                                 start=start, end=end,
                                 shuffle_level=self._shuffle_level))
              for bs, dat in zip(distribution, data)]
        # iterator
        while sum(n) > 0:
            batch = []
            for i, x in enumerate(it):
                if n[i] <= 0:
                    continue
                try:
                    x = next(x)[:n[i]]
                    n[i] -= x.shape[0]
                    batch.append(x)
                except StopIteration:  # one iterator stopped
                    it[i] = iter(data[i].set_batch(
                        distribution[i], seed=rng.randint(10e8),
                        start=start, end=end,
                        shuffle_level=self._shuffle_level))
                    x = next(it[i])[:n[i]]
                    n[i] -= x.shape[0]
                    batch.append(x)
            # got final batch
            batch = np.vstack(batch)
            # no idea why random permutation is much faster than shuffle
            if self._shuffle_level > 0:
                batch = batch[rng.permutation(batch.shape[0])]
            # return the iterations
            for i, j in idx[:int(ceil(batch.shape[0] / batch_size))]:
                yield batch[i:j]
    #####################################
    # 2. optimized sequential code.
    else:
        # first iterators
        batch_size = distribution.sum()
        it = [iter(dat.set_batch(batch_size, seed=rng.randint(10e8),
                                 start=start, end=end,
                                 shuffle_level=self._shuffle_level))
              for dat in data]
        current_data = 0
        # iterator
        while sum(n) > 0:
            if n[current_data] <= 0:
                current_data += 1
            try:
                x = next(it[current_data])[:n[current_data]]
                n[current_data] -= x.shape[0]
            except StopIteration:  # one iterator stopped
                it[current_data] = iter(data[current_data].set_batch(
                    batch_size, seed=rng.randint(10e8),
                    start=start, end=end,
                    shuffle_level=self._shuffle_level))
                x = next(it[current_data])[:n[current_data]]
                n[current_data] -= x.shape[0]
            # shuffle x
            if self._shuffle_level > 0:
                x = x[rng.permutation(x.shape[0])]
            for i, j in idx[:int(ceil(x.shape[0] / self._batch_size))]:
                yield x[i:j]
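# A worked example of the batch-size arithmetic in `create_iteration` above,
# assuming two datasets with 1000 and 3000 samples to traverse and
# batch_size=256 (all numbers are illustrative):
#
#   n            = [1000, 3000]
#   distribution = n / n.sum()      # -> [0.25, 0.75]
#   # _approximate_continuos_by_discrete maps this to integers, e.g. [1, 3]
#   distribution = 256 * [1, 3]     # per-dataset batch sizes [256, 768]
#
# Each merged mega-batch then holds 256 + 768 = 1024 samples, which the
# precomputed (start, end) pairs in `idx` split back into four batches of
# 256, satisfying the `distribution.sum() % batch_size == 0` assertion.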
def _check_tag(var):
    if not hasattr(var, 'tag'):
        var.tag = struct()
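# `_check_tag` lazily attaches an empty `struct` namespace so that arbitrary
# metadata can later be stored on a variable; a hypothetical usage:
#
#   _check_tag(var)
#   var.tag.name = 'loss'   # safe: `tag` is guaranteed to exist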
from six.moves import range, zip, cPickle

import numpy as np

from odin import (SIG_TRAIN_ROLLBACK, SIG_TRAIN_SAVE, SIG_TRAIN_STOP)
from odin.config import RNG_GENERATOR
from odin import fuel
from odin.fuel.dataset import Dataset
from odin.utils import struct, as_tuple, is_number

from .callbacks import *

# ===========================================================================
# Helper
# ===========================================================================
_SAVE_TASK = struct()
_SAVE_TASK.name = "save"


def __format_string(nb_of_float):
    x = ["{:.4f}"] * int(nb_of_float)
    return ";".join(x)


def standard_trainer(train_data, valid_data, X, y_train, y_score, y_target,
                     parameters,