def _multi_kfold_scoring(dataset, algo, L=10, k=2): """ Performs multiple scorings of the given dataset. Parameters ---------- dataset : rankeval.dataset.Dataset The dataset instance. algo : function See :func:`bias_variance`. L : int Number of iterations k : int Number of folds. Returns ------- score : numpy.ndarray A matrix num_instances x L. """ progress_bar = IntProgress(min=0, max=L, description="Computing L scores") display(progress_bar) scores = np.zeros( (dataset.n_instances, L), dtype=np.float32) for l in range(L): progress_bar.value += 1 scores[:,l] = _kfold_scoring(dataset, k, algo) progress_bar.bar_style = "success" progress_bar.close() return scores
class PBinJ(object):
    """Initialize multiple progress bars for tracking nested stages of a
    fitting routine."""

    def __init__(self, n=1, value=0, status='{}', color='b', width='60%',
                 height='22px'):
        self.displayed = False
        self.style_bar(n=n, value=value, status=status, color=color,
                       width=width, height=height)

    def style_bar(self, n=1, value=0, status='{}', color='b', width='60%',
                  height='22px'):
        colordict = {'g': 'success', 'b': '', 'r': 'danger', 'y': 'warning',
                     'c': 'info'}
        self.bar = IntProgress(min=0, max=n, value=value,
                               bar_style=colordict[color])
        self.status = status
        self.bar.width = width
        self.bar.height = height

    def reset_bar(self, color=False):
        # `color` is accepted for API compatibility but unused
        self.update(value=0)

    def update(self, value=None, status=None):
        if not self.displayed:
            display(self.bar)
            self.displayed = True
        if status is not None:
            # strings are iterable, so exclude them before unpacking
            if hasattr(status, '__iter__') and not isinstance(status, str):
                status = self.status.format(*status)
            else:
                status = self.status.format(status)
            self.bar.description = status
        if value is not None:
            self.bar.value = value + 1

    def clear(self):
        self.bar.close()
        self.displayed = False
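# --- usage sketch (hypothetical) -------------------------------------------
# A minimal notebook sketch for the class above: a bar over 100 steps with a
# formatted status message. The loop body is a stand-in for real work.
bar = PBinJ(n=100, status='step {} of 100')
for step in range(100):
    bar.update(value=step, status=step + 1)
bar.clear()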
def new_k_means(data, k, plot=True):
    n_runs = 50
    # This will display a progress bar during the k-means runs
    f = IntProgress(description=f'KM (k={k}):', min=0, max=n_runs)
    display(f)
    # initializing the array where we collect all cluster assignments
    cluster_collection = np.zeros((n_runs, data.shape[0]), dtype=np.int32)
    # initializing the array where we collect all risk values
    risk_collection = np.zeros(n_runs)
    for i in range(n_runs):
        f.value += 1
        centroids, clusters = k_means(data, k, random_seed=i, plot=False)
        risk_collection[i] = empirical_risk(data, clusters, centroids)
        cluster_collection[i, :] = clusters
    # find the best cluster assignment and print the lowest found empirical risk
    min_ind = np.argmin(risk_collection)
    max_ind = np.argmax(risk_collection)
    if plot:
        print("Cluster division with lowest empirical risk")
        plotting(data, clusters=cluster_collection[min_ind, :])
        print("Cluster division with highest empirical risk")
        plotting(data, clusters=cluster_collection[max_ind, :])
    print('min empirical risk is ', np.min(risk_collection))
    # remove the progress bar
    f.close()
    return cluster_collection[min_ind, :], risk_collection
class PBinJ(object):
    """Initialize multiple progress bars for tracking nested stages of a
    fitting routine."""

    def __init__(self, n=1, value=0, status="{}", color="r", width="50%",
                 height="25px"):
        self.displayed = False
        self.style_bar(n=n, value=value, status=status, color=color,
                       width=width, height=height)

    def style_bar(self, n=1, value=0, status="{}", color="r", width="50%",
                  height="25px"):
        colordict = {"g": "#16a085", "b": "#4168B7", "r": "#e74c3c",
                     "y": "#f39c12"}
        self.bar = IntProgress(min=0, max=n, value=value)
        self.status = status
        # NOTE: `color`, `width` and `height` are legacy ipywidgets
        # attributes; recent versions style the bar via `layout`/`style`.
        self.bar.color = colordict[color]
        self.bar.width = width
        self.bar.height = height

    def reset_bar(self):
        self.update(value=0)

    def update(self, value=None, status=None):
        if not self.displayed:
            display(self.bar)
            self.displayed = True
        if status is not None:
            # strings are iterable, so exclude them before unpacking
            if hasattr(status, "__iter__") and not isinstance(status, str):
                status = self.status.format(*status)
            else:
                status = self.status.format(status)
            self.bar.description = status
        if value is not None:
            self.bar.value = value + 1

    def clear(self):
        self.bar.close()
def __get_max_gap_length(self, b):
    """
    Compute the max gap length of a masked time series
    :param b: Progress bar object
    """
    # TODO This function should be parallelised!
    bands, rows, cols = self.mask.shape
    max_gap_length = np.zeros((rows, cols), np.int16)

    if not isinstance(b, QProgressBar):
        progress_bar = IntProgress(
            value=0,
            min=0,
            max=10,
            step=1,
            description='Computing max gap length...',
            bar_style='',  # 'success', 'info', 'warning', 'danger' or ''
            orientation='horizontal',
            style={'description_width': 'initial'},
            layout={'width': '50%'}
        )
        display(progress_bar)
    else:
        b.setEnabled(True)

    for i in range(rows):
        if isinstance(b, QProgressBar):
            b.setFormat('Computing maximum gap length...')
            b.setValue(int((i * 10.) / rows))
        else:
            progress_bar.value = int((i * 10.) / rows)
        for j in range(cols):
            # consecutive False (masked) runs are gaps; track the longest
            for key, group in i_groupby(self.mask.data[:, i, j]):
                if not key:
                    _gap_length = len(list(group))
                    if _gap_length > max_gap_length[i, j]:
                        max_gap_length[i, j] = _gap_length

    if isinstance(b, QProgressBar):
        b.setValue(0)
        b.setEnabled(False)
    else:
        # Remove progress bar
        progress_bar.close()
        del progress_bar

    # Create xarray DataArray
    _max_gap_length = xr.DataArray(max_gap_length,
                                   coords=[self.mask.latitude.data,
                                           self.mask.longitude.data],
                                   dims=['latitude', 'longitude'])
    max_gap_length = None
    self.max_gap_length = _max_gap_length
def statistical_significance(datasets, model_a, model_b, metrics,
                             n_perm=100000):
    """
    This method computes the statistical significance of the performance
    difference between model_a and model_b.

    Parameters
    ----------
    datasets : list of Dataset
        The datasets to use for analyzing the behaviour of the models using
        the given metrics
    model_a : RTEnsemble
        The first model considered.
    model_b : RTEnsemble
        The second model considered.
    metrics : list of Metric
        The metrics to use for the analysis
    n_perm : int
        Number of permutations for the randomization test.

    Returns
    -------
    stat_sig : xarray.DataArray
        A DataArray containing the statistical significance of the
        performance difference between the two models on the given datasets.
    """
    progress_bar = IntProgress(min=0, max=len(datasets) * len(metrics),
                               description="Iterating datasets and metrics")
    display(progress_bar)

    data = np.zeros(shape=(len(datasets), len(metrics), 2), dtype=np.float32)
    for idx_dataset, dataset in enumerate(datasets):
        y_pred_a = model_a.score(dataset, detailed=False)
        y_pred_b = model_b.score(dataset, detailed=False)
        for idx_metric, metric in enumerate(metrics):
            progress_bar.value += 1
            metrics_a = metric.eval(dataset, y_pred_a)[1]
            metrics_b = metric.eval(dataset, y_pred_b)[1]
            p1, p2 = _randomization(metrics_a, metrics_b, n_perm=n_perm)
            data[idx_dataset][idx_metric][0] = p1
            data[idx_dataset][idx_metric][1] = p2

    progress_bar.bar_style = "success"
    progress_bar.close()

    performance = xr.DataArray(
        data,
        name='Statistical Significance',
        coords=[datasets, metrics, ['one-sided', 'two-sided']],
        dims=['dataset', 'metric', 'p-value'])
    return performance
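# --- usage sketch (hypothetical) -------------------------------------------
# A minimal sketch of calling the routine above. `test_set`, `lgbm_model`,
# `xgb_model` and `NDCG(cutoff=10)` are hypothetical stand-ins for rankeval
# objects created elsewhere.
stat_sig = statistical_significance([test_set], lgbm_model, xgb_model,
                                    [NDCG(cutoff=10)], n_perm=100000)
# p-values below 0.05 indicate a significant difference at the 5% level
print(stat_sig.sel(**{'p-value': 'two-sided'}))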
def _kfold_scoring(dataset, k, algo):
    """
    Scores the given dataset with the given algo using k-fold train/test.

    Parameters
    ----------
    dataset : rankeval.dataset.Dataset
        The dataset instance.
    k : int
        Number of folds.
    algo : function
        See :func:`bias_variance`.

    Returns
    -------
    score : numpy.ndarray
        A vector of num_instances scores.
    """
    progress_bar = IntProgress(min=0, max=k, description="Processing k folds")
    display(progress_bar)

    scores = np.zeros(dataset.n_instances, dtype=np.float32)
    query_sizes = dataset.get_query_sizes()

    # shuffle queries
    shuffled_qid = np.random.permutation(dataset.n_queries)
    chunk_query_size = int(math.ceil(dataset.n_queries / float(k)))

    for p in range(0, dataset.n_queries, chunk_query_size):
        progress_bar.value += 1
        # p-th fold is used for testing (np.bool is deprecated, use bool)
        test_rows = np.full(dataset.n_instances, fill_value=False, dtype=bool)
        for q in shuffled_qid[p: p + chunk_query_size]:
            test_rows[dataset.query_offsets[q]:dataset.query_offsets[q + 1]] = True
        # other folds are used for training
        train_rows = np.logical_not(test_rows)
        train_q = np.full(dataset.n_queries, fill_value=True, dtype=bool)
        train_q[shuffled_qid[p: p + chunk_query_size]] = False

        # get algorithm predictions
        fold_scores = algo(
            dataset.X[train_rows],
            dataset.y[train_rows],
            query_sizes[train_q],
            dataset.X[test_rows]
        )
        # update scores for the current fold
        scores[test_rows] = fold_scores

    progress_bar.bar_style = "success"
    progress_bar.close()

    return scores
def progress_bar(generator, mx):
    """Wrap a generator, advancing an IntProgress bar of size `mx` as items
    are yielded."""
    prog = IntProgress(value=0, max=mx)
    display(prog)
    for e in generator:
        yield e
        prog.value += 1
    prog.close()
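# --- usage sketch ----------------------------------------------------------
# Wrap any iterable whose length is known up front; the sleep is a stand-in
# for per-item work.
import time

items = range(20)
for item in progress_bar(iter(items), mx=len(items)):
    time.sleep(0.1)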
def display(self, width=0, height=0, ray=False, timeout=120):
    """Display PyMol session

    :param width: width in pixels (0 uses current viewport)
    :param height: height in pixels (0 uses current viewport)
    :param ray: use ray tracing (if running PyMOL headless, this parameter
        has no effect and ray tracing is always used)
    :param timeout: timeout in seconds

    Returns
    -------
    fig : IPython.display.Image
    """
    from IPython.display import Image
    from IPython.display import display
    from ipywidgets import IntProgress

    progress_max = int((timeout * 20)**0.5)
    progress = None
    # NOTE: tempfile.mktemp is insecure; kept here because the PyMOL server
    # needs a plain path it can write to.
    filename = tempfile.mktemp('.png')
    try:
        self._server.png(filename, width, height, -1, int(ray))
        # poll for the rendered file with an increasing sleep interval
        for i in range(1, progress_max):
            if os.path.exists(filename):
                break
            if progress is None:
                progress = IntProgress(min=0, max=progress_max)
                display(progress)
            progress.value += 1
            time.sleep(i / 10.0)
        if not os.path.exists(filename):
            raise RuntimeError('timeout exceeded')
        return Image(filename)
    finally:
        if progress is not None:
            progress.close()
        try:
            os.unlink(filename)
        except OSError:
            pass
def in_progress(seq, msg="Progress: [%(processed)d / %(total)d]",
                length=None):
    """ Iterate over sequence, yielding item with progress widget displayed.
        This is useful if you need to process a sequence of items with some
        time consuming operations

        .. note:: This works only in Jupyter Notebook

        .. note:: This function requires *ipywidgets* package to be installed

        :param seq: sequence to iterate on.
        :param str msg: (optional) message template to display. The integer
                        variables 'processed' and 'total' are available in
                        the template, where 'processed' is the number of
                        items processed and 'total' is the total number of
                        items in seq.
        :param int length: (optional) if seq is a generator, or it is not
                           possible to apply 'len(seq)' to 'seq', then this
                           argument is required and its value will be used
                           as the total number of items in seq.

        Example::

            import time
            for i in in_progress(range(10)):
                time.sleep(1)
    """
    from IPython.display import display
    from ipywidgets import IntProgress

    if length is None:
        length = len(seq)

    progress = IntProgress(value=0, min=0, max=length,
                           description=msg % {'processed': 0,
                                              'total': length})
    display(progress)

    for i, item in enumerate(seq, 1):
        progress.value = i
        progress.description = msg % {'processed': i, 'total': length}
        yield item

    progress.close()
def get_press_series(spliter, color):
    paddings = 4
    white_width = 17 + 2 * paddings
    black_width = 16 + 2 * paddings
    height = 106
    width = 884

    print('Start extracting keypress series ...')
    print(f'    White width: {white_width}px')
    print(f'    Black width: {black_width}px')

    for name in spliter:
        black_coor = None
        N = y_org[name].shape[0]
        # find a frame where all 36 black-key boundaries are detectable
        for p in X_path[name]:
            img = cv2.imread(p)
            black_coor = get_black_boundaries(img)
            if len(black_coor) == 36:
                break

        bar = IntProgress(max=88 * N)
        display(bar)
        for k in range(88):
            last = -1
            for i in range(N):
                if y_org[name][i][k] > 0:
                    if last == -1:
                        last = i
                if y_org[name][i][k] <= 0 or i == N - 1:
                    if last != -1:
                        # include frame i itself when the press runs to the
                        # last frame (the original dropped it)
                        end = i - 1 if y_org[name][i][k] <= 0 else i
                        if k in black_mask:
                            add_series(name, 'black', last, end, k,
                                       paddings, black_coor)
                        else:
                            add_series(name, 'white', last, end, k, paddings)
                        last = -1
                bar.value += 1
        bar.close()
        print(f'{name} set loading finished ...')
        print('    Pressed white keys: ' + str(len(X_series[name]['white'])))
        print('    Pressed black keys: ' + str(len(X_series[name]['black'])))
class lstm_data_batch: def __init__(self, type='train', color='white', NCHW=True, shuffle=True, need_bar=True, max_num=-1): self.type = type self.color = color self.NCHW = NCHW self.need_bar = need_bar if max_num == -1: self.max_num = len(X_series[type][color]) else: self.max_num = max_num self.order = np.arange(self.max_num) if shuffle: random.shuffle(self.order) if need_bar: self.bar = IntProgress(max=self.max_num) display(self.bar) def __iter__(self): self.index = 0 return self def __next__(self): if self.index >= self.max_num: if self.need_bar: self.bar.close() raise StopIteration ind = self.order[self.index] X_return = X_series[self.type][self.color][ind] y_return = y_series[self.type][self.color][ind] if self.NCHW: X_return = np.transpose(X_return, (0, 3, 1, 2)) self.index += 1 if self.need_bar: self.bar.value += 1 return (np.array(X_return), np.array(y_return))
class data_batch: def __init__(self, size, NCHW=True, concatenate=False): if size != 'single' and size != 'bundle': raise ValueError("Expected 'single' or 'bundle'") if concatenate: raise NotImplementedError self.len = len(X_path_list) self.bundle = (size == 'bundle') self.NCHW = NCHW self.concatenate = concatenate def __len__(self): return self.len def __iter__(self): self.index = 0 self.bar = IntProgress(max=self.len) display(self.bar) return self def __next__(self): if self.index >= self.len: self.bar.close() raise StopIteration else: img_path = X_path_list[self.index] self.index += 1 self.bar.value += 1 img = cv2.imread(img_path) white_keys = get_white_keys( img, (bundle_paddings if self.bundle else single_paddings)) black_keys = get_black_keys( img, black_coor, (bundle_paddings if self.bundle else single_paddings)) if self.NCHW: white_keys = np.transpose(white_keys, (0, 3, 1, 2)) black_keys = np.transpose(black_keys, (0, 3, 1, 2)) return white_keys, black_keys
class ProgressBar(object): def __init__(self, N=100, smoothing=0.1, interval=1): """Progress bar for an integer number of steps. Parameters ---------- N : int Number of steps. smoothing : float Smoothing factor used for estimating time. A smaller value averages more steps. interval : float Time interval in seconds to update display. Example ------- >>> bar = ProgressBar(100) >>> for i in range(100): ... print(i) ... bar.update() ... ... del bar Methods ------- update Increment progress. """ self.value = 0 self.max = N self.alpha = max(0, min(1, smoothing)) self.interval = interval t = time.time() self.start_time = t self.last_update = t self.t0 = t self.t = 0. if notebook: from ipywidgets import IntProgress, HTML from IPython.display import display self.bar = IntProgress(max=N) self.html = HTML(value=self._repr_html_()) display(self.bar, self.html) def __str__(self): # Time remaining rem = max(0, self.max - self.value) t_rem = datetime.timedelta(seconds=self.t * rem) t_avg = datetime.timedelta(seconds=self.t) t_tot = datetime.timedelta(seconds=self.t0 - self.start_time) p = min(20, int(20 * self.value / self.max)) bar = '[' + p * '=' + (20 - p) * ' ' + ']' return f'{bar} {self.value}/{self.max} {t_rem} {t_avg} {t_tot}' def _repr_html_(self): # Time remaining rem = max(0., self.max - self.value) t_rem = datetime.timedelta(seconds=self.t * rem) t_avg = datetime.timedelta(seconds=self.t) t_tot = datetime.timedelta(seconds=self.t0 - self.start_time) return f""" <table> <tr> <th>Progress:</th> <td>{self.value}/{self.max}</td> </tr> <tr> <th>Remaining time:</th> <td>{t_rem}</td></tr> <tr> <th>Average time:</th> <td>{t_avg}</td> </tr> <tr> <th>Total time:</th> <td>{t_tot}</td> </tr> </table> """ def update(self): """Increment progress.""" self.value += 1 # Time since last update t = time.time() t, self.t0 = t - self.t0, t # Time per update if self.value < 10: # Average self.t = (t + (self.value - 1) * self.t) / self.value else: # Exponential smoothing self.t = self.alpha * t + (1 - self.alpha) * self.t self.display() def display(self): if self.t0 - self.last_update > self.interval: if notebook: self.html.value = self._repr_html_() self.bar.value = self.value self.last_update = self.t0 else: print(self, flush=True) def __del__(self): """Close progress bar.""" if notebook: self.bar.close() self.html.close() close = __del__
class data_batch:
    def __init__(self, type='train', size='single', color='white',
                 batch_size=64, need_velocity=True, NCHW=True, shuffle=True,
                 concatenate=False, max_num=-1):
        self.size = size
        self.type = type
        self.color = color
        self.batch_size = batch_size
        self.NCHW = NCHW
        self.max_num = max_num
        self.pressed = []
        self.unpressed = []
        self.num_pressed = 0
        self.num_unpressed = 0
        self.need_velocity = need_velocity
        self.concatenate = concatenate
        if self.type == 'train':
            for i, x in enumerate(y[color][type]):
                if x > 0:
                    self.pressed.append(i)
                    self.num_pressed += 1
                else:
                    self.unpressed.append(i)
                    self.num_unpressed += 1
            if shuffle:
                random.shuffle(self.pressed)
                random.shuffle(self.unpressed)
            if self.max_num == -1:
                self.max_num = len(self.unpressed)
                self.iter_num = len(self.unpressed) * 2
            else:
                self.max_num = self.max_num // 2
                self.iter_num = max_num
        else:
            self.max_num = len(y[color][type])
            self.iter_num = len(y[color][type])
        self.bar = IntProgress(max=self.iter_num)
        display(self.bar)

    def __iter__(self):
        self.index = 0
        return self

    def __next__(self):
        ind = np.array([])
        if self.type == 'train':
            # balanced batches: half pressed, half unpressed keys
            start = self.index * self.batch_size // 2
            end = (self.index + 1) * self.batch_size // 2
            if start >= self.max_num:
                self.bar.close()
                raise StopIteration
            if end >= self.max_num:
                end = self.max_num
                start = end - self.batch_size // 2
            self.index += 1
            s = start % self.num_pressed
            t = end % self.num_pressed
            if start // self.num_pressed == end // self.num_pressed:
                ind = np.append(ind, np.array(self.pressed[s:t]))
            else:
                ind = np.append(ind, np.array(self.pressed[s:]))
                ind = np.append(ind, np.array(self.pressed[:t]))
            ind = np.append(ind, np.array(self.unpressed[start:end]))
            ind = ind.flatten().astype('int64')
        else:
            start = self.index * self.batch_size
            end = start + self.batch_size
            if start >= self.max_num:
                self.bar.close()
                raise StopIteration
            if end >= self.max_num:
                end = self.max_num
                start = end - self.batch_size
            self.index += 1
            ind = np.arange(start, end)
        np.random.shuffle(ind)
        X_return = X[self.size][self.color][self.type][ind]
        if self.concatenate:
            # concatenate frame differences (pre and post) as extra channels
            arr = X[self.size][self.color][self.type]
            pre = X_pre[self.size][self.color][self.type]
            post = X_post[self.size][self.color][self.type]
            ret_pre = []
            ret_post = []
            for x in ind:
                ret_pre.append(cv2.subtract(arr[x], pre[x]))
                ret_post.append(cv2.subtract(post[x], arr[x]))
            ret_pre = np.array(ret_pre)
            ret_post = np.array(ret_post)
            X_return = np.concatenate((ret_pre, X_return, ret_post), axis=3)
        if self.NCHW:
            X_return = np.transpose(X_return, (0, 3, 1, 2))
        y_return = y[self.color][self.type][ind]
        if not self.need_velocity:
            # np.int is deprecated; plain int works as a dtype
            y_return = (y_return > 0).astype(int)
        self.bar.value += self.batch_size
        return (X_return, y_return, ind)
def in_progress(seq, msg="Progress: [%(processed)d / %(total)d]",
                length=None, close=True):
    """ Iterate over sequence, yielding item with progress widget displayed.
        This is useful if you need to process a sequence of items with some
        time consuming operations

        .. note:: This works only in Jupyter Notebook

        .. note:: This function requires *ipywidgets* package to be installed

        :param seq: sequence to iterate on.
        :param str msg: (optional) message template to display. The following
                        variables could be used in this template:

                        - processed
                        - total
                        - time_total
                        - time_per_item
                        - time_remaining
        :param int length: (optional) if seq is a generator, or it is not
                           possible to apply 'len(seq)' to 'seq', then this
                           argument is required and its value will be used
                           as the total number of items in seq.

        Example::

            import time
            for i in in_progress(range(10)):
                time.sleep(1)
    """
    from IPython.display import display
    from ipywidgets import IntProgress
    import time

    if length is None:
        length = len(seq)

    start_time = time.time()

    progress = IntProgress(
        value=0, min=0, max=length,
        description=msg % {
            'processed': 0,
            'total': length,
            'time_total': 0.0,
            'time_per_item': 0.0,
            'time_remaining': 0.0,
        })
    display(progress)

    for i, item in enumerate(seq, 1):
        progress.value = i
        yield item  # Do the job
        i_end_time = time.time()
        progress.description = msg % {
            'processed': i,
            'total': length,
            'time_total': i_end_time - start_time,
            'time_per_item': (i_end_time - start_time) / i,
            'time_remaining': ((i_end_time - start_time) / i) * (length - i),
        }
    if close:
        progress.close()
def train(self, phases=['train', 'val'], color='black', learning_rate=1e-3,
          weight_lambda=0.0005, num_epoch=5, max_num=-1,
          best_path='model_best.tar', current_path='model_latest.tar',
          tsb_writer=None, tag='', decay_every=10, save_model=True):
    model = self.model
    criterion = nn.MSELoss()
    optimizer = optim.Adam(self.model.parameters(), lr=learning_rate,
                           weight_decay=weight_lambda)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=decay_every,
                                          gamma=0.05)
    since = time.time()
    best_model = None
    best_loss = None
    best_path = time.strftime('[%Y%m%d]%H-%M-%S') + best_path
    current_path = time.strftime('[%Y%m%d]%H-%M-%S') + current_path
    print(f'The best model will be saved to {best_path} ...')
    print(f'The latest model will be saved to {current_path} ...')

    for epoch in range(num_epoch):
        print('Epoch {}/{}'.format(epoch + 1, num_epoch), end='')
        self.epoch_total += 1
        _loss = dict()
        _diff = dict()
        # iterate over the requested phases; the loop variable must not
        # shadow the `phases` argument (the original did, breaking later
        # epochs and the tsb_writer check)
        for phase in phases:
            if phase == 'train':
                scheduler.step()
                model.train()
            else:
                model.eval()
            running_loss = 0.0
            running_diff = 0.0
            total = dataset.get_lstm_data_num(
                phase, color) if max_num == -1 else max_num
            bar = IntProgress(max=total)
            display(bar)
            for i, (inputs, labels) in enumerate(
                    dataset.lstm_data_batch(type=phase, color=color,
                                            max_num=total, need_bar=False)):
                _labels = labels
                inputs = torch.Tensor(inputs)
                labels = torch.Tensor(np.array([labels]) / 63.5 - 1)
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                outputs = model(inputs)
                labels = torch.reshape(labels, [1])
                loss = criterion(outputs, labels)
                if phase == 'train':
                    loss.backward()
                    optimizer.step()
                running_loss += loss.item()
                running_diff += np.abs(
                    (outputs.cpu().detach().numpy()[0] + 1) * 63.5 - _labels)
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                bar.value += 1
                if i % 32 == 0:
                    bar.description = f'{bar.value} / {total}'
            bar.close()
            epoch_loss = running_loss / total
            epoch_diff = running_diff / total
            if epoch % 5 == 0:
                print('{} Loss: {:.4f}, L1 Diff: {:.4f}'.format(
                    phase, epoch_loss, epoch_diff))
            _loss[phase] = epoch_loss
            _diff[phase] = epoch_diff
            if phase == 'val' and (best_loss is None
                                   or epoch_loss < best_loss):
                best_loss = epoch_loss
                best_model = copy.deepcopy(model.state_dict())
                if save_model:
                    torch.save(model.state_dict(), best_path)
            if save_model:
                torch.save(model.state_dict(), current_path)
        if tsb_writer and 'val' in phases and 'train' in phases:
            tsb_writer.add_scalars(f'{tag}/Loss', {
                'val': _loss['val'],
                'train': _loss['train']
            }, self.epoch_total)
            tsb_writer.add_scalars(f'{tag}/L1 Diff', {
                'val': _diff['val'],
                'train': _diff['train']
            }, self.epoch_total)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val loss: {:4f}'.format(best_loss))
    self.model = model
def _analytics(self, b):
    """
    Uses the self.user_qa_selection OrderedDictionary to extract the
    corresponding QA values and create a mask of dimensions:
        (number of qa layers, time steps, cols(lat), rows(lon))
    Additionally computes the temporal mask and the max gap length
    """
    if not isinstance(b, QProgressBar):
        progress_bar = IntProgress(
            value=0,
            min=0,
            max=len(self.user_qa_selection),
            step=1,
            description='',
            bar_style='',  # 'success', 'info', 'warning', 'danger' or ''
            orientation='horizontal',
            style={'description_width': 'initial'},
            layout={'width': '50%'}
        )
        display(progress_bar)

    n_qa_layers = len(self.user_qa_selection)

    # Get the name of the first data var to extract its shape
    for k, v in self.ts.data.data_vars.items():
        break

    # Create mask xarray
    _time, _latitude, _longitude = self.ts.data.data_vars[k].shape
    mask = np.zeros((n_qa_layers, _time, _latitude, _longitude), np.int8)

    qa_layer = self.qa_def.QualityLayer.unique()
    # QA layer used to create mask
    _qa_layer = getattr(self.ts.qa, f"qa{qa_layer[0]}")

    for i, user_qa in enumerate(self.user_qa_selection):
        if isinstance(b, QProgressBar):
            b.setValue(i)
            b.setFormat(f"Masking by QA {user_qa}")
        else:
            progress_bar.value = i
            progress_bar.description = f"Masking by QA {user_qa}"
        user_qa_fieldname = user_qa.replace(" ", "_").replace("/", "_")

        for j, qa_value in enumerate(self.user_qa_selection[user_qa]):
            qa_value_field_name = qa_value.replace(" ", "_")
            qa_flag_val = self.qa_def[(self.qa_def.Name == user_qa) &
                                      (self.qa_def.Description == qa_value)].Value.iloc[0]
            if j == 0:
                mask[i] = (_qa_layer[user_qa_fieldname] == qa_flag_val)
            else:
                mask[i] = np.logical_or(
                    mask[i], _qa_layer[user_qa_fieldname] == qa_flag_val)

    if isinstance(b, QProgressBar):
        b.setValue(0)
        b.setEnabled(False)
    else:
        # Remove progress bar
        progress_bar.close()
        del progress_bar

    # Collapse the per-layer masks into a single boolean mask
    mask = xr.DataArray(np.all(mask, axis=0),
                        coords=[v.time.data, v.latitude.data,
                                v.longitude.data],
                        dims=['time', 'latitude', 'longitude'])
    mask.attrs = v.attrs
    self.mask = mask
    # Remove local multi-layer mask variable
    del mask

    # Create the percentage of data available mask
    # Get the per-pixel per-time step binary mask
    pct_data_available = (self.mask.sum(axis=0) * 100.0) / _time
    pct_data_available.latitude.data = v.latitude.data
    pct_data_available.longitude.data = v.longitude.data
    # Set the pct_data_available object
    self.pct_data_available = pct_data_available

    # Using the computed mask get the max gap length
    self.__get_max_gap_length(b)
def tree_wise_performance(datasets, models, metrics, step=10):
    """
    This method implements the analysis of the model on a tree-wise basis
    (part of the effectiveness analysis category).

    Parameters
    ----------
    datasets : list of Dataset
        The datasets to use for analyzing the behaviour of the model using
        the given metrics and models
    models : list of RTEnsemble
        The models to analyze
    metrics : list of Metric
        The metrics to use for the analysis
    step : int
        Step-size identifying evenly spaced number of trees for evaluating
        the top-k model performance. (e.g., step=100 means the method will
        evaluate the model performance at 100, 200, 300, etc. trees).

    Returns
    -------
    metric_scores : xarray.DataArray
        A DataArray containing the metric scores of each model using the
        given metrics on the given datasets. The metric scores are
        cumulatively reported tree by tree, i.e., top 10 trees, top 20,
        etc., with a step-size between the number of trees as highlighted
        by the step parameter.
    """
    def get_tree_steps(model_trees):
        # range() is immutable in Python 3, so materialize it as a list
        trees = list(range(step - 1, model_trees, step))
        # Add last tree to the steps
        if not trees or trees[-1] != model_trees - 1:
            trees.append(model_trees - 1)
        return np.array(trees)

    max_num_trees = 0
    for model in models:
        if model.n_trees > max_num_trees:
            max_num_trees = model.n_trees
    tree_steps = get_tree_steps(max_num_trees)

    data = np.full(shape=(len(datasets), len(models), len(tree_steps),
                          len(metrics)),
                   fill_value=np.nan, dtype=np.float32)

    progress_bar = IntProgress(
        min=0,
        max=len(datasets) * len(metrics) *
            sum([len(get_tree_steps(model.n_trees)) for model in models]),
        description="Computing metrics")
    display(progress_bar)

    for idx_dataset, dataset in enumerate(datasets):
        for idx_model, model in enumerate(models):
            y_pred, partial_y_pred, y_leaves = \
                model.score(dataset, detailed=True)

            # the document scores are accumulated along the various top-k
            # (in order to avoid useless re-scoring)
            y_pred = np.zeros(dataset.n_instances)
            for idx_top_k, top_k in enumerate(get_tree_steps(model.n_trees)):
                # compute the document scores using only top-k trees of
                # the model on the given dataset
                idx_tree_start = idx_top_k * step
                idx_tree_stop = top_k + 1
                y_pred += partial_y_pred[:, idx_tree_start:idx_tree_stop].sum(axis=1)

                # compute the metric score using the predicted document scores
                for idx_metric, metric in enumerate(metrics):
                    progress_bar.value += 1
                    metric_score, _ = metric.eval(dataset, y_pred)
                    data[idx_dataset][idx_model][idx_top_k][idx_metric] = metric_score

    progress_bar.bar_style = "success"
    progress_bar.close()

    performance = xr.DataArray(data,
                               name='Tree-Wise Performance',
                               coords=[datasets, models, tree_steps + 1,
                                       metrics],
                               dims=['dataset', 'model', 'k', 'metric'])
    return performance
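# --- usage sketch (hypothetical) -------------------------------------------
# A minimal sketch of calling the routine above. `test_set` (a rankeval
# Dataset), `model` (an RTEnsemble) and `NDCG(cutoff=10)` (a rankeval Metric)
# are stand-ins for objects created elsewhere.
perf = tree_wise_performance([test_set], [model], [NDCG(cutoff=10)], step=50)
# score of the model when truncated to its first 50 trees
print(perf.sel(k=50))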
def train(net, data, epochs=10, batch_size=10, seq_length=50, lr=0.001,
          clip=5, val_frac=0.1, print_every=10):
    ''' Training a network

        Arguments
        ---------
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    '''
    net.train()

    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # create training and validation data
    val_idx = int(len(data) * (1 - val_frac))
    data, val_data = data[:val_idx], data[val_idx:]

    if net.train_on_gpu:
        net.cuda()

    counter = 0
    n_chars = len(net.chars)

    progress = IntProgress(
        min=0,
        max=epochs * len(list(get_batches(data, batch_size, seq_length))),
        description="Training...")
    display(progress)

    for e in range(epochs):
        # initialize hidden state
        h = net.init_hidden(batch_size)

        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            progress.value += 1

            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            if net.train_on_gpu:
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output, h = net(inputs, h)

            # calculate the loss and perform backprop
            loss = criterion(output,
                             targets.view(batch_size * seq_length).long())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem
            # in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)

                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])

                    inputs, targets = x, y
                    if net.train_on_gpu:
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(
                        output, targets.view(batch_size * seq_length).long())

                    val_losses.append(val_loss.item())

                # reset to train mode after iterating through validation data
                net.train()

                print("Epoch: {}/{}...".format(e + 1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))
    progress.close()
    print("Finished training.")
def train(self, batch_size=64, learning_rate=1e-3, num_epochs=5, max_num=-1,
          best_path='keyboard_model_best.tar',
          current_path='keyboard_model_latest.tar', decay_every=10,
          save_model=True, dirs=[0]):
    model = self.model
    criterion = nn.MSELoss()
    optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=decay_every,
                                          gamma=0.05)
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = None

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch + 1, num_epochs))

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()  # Set model to evaluate mode

            running_loss = 0.0

            # Iterate over data.
            max_num_for_this_epoch = max_num if phase == 'train' else -1
            total = dataset.get_num_of_data(
                phase) if max_num == -1 else max_num
            bar = IntProgress(max=total)
            display(bar)
            for inputs, labels in dataset.data_batch(
                    type=phase, batch_size=batch_size,
                    max_num=max_num_for_this_epoch, dirs=dirs):
                inputs = torch.Tensor(inputs)
                labels = torch.Tensor(labels)
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                outputs = model(inputs)
                labels = torch.reshape(labels, [-1, 8])
                loss = criterion(outputs, labels)

                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # statistics
                running_loss += loss.item() * batch_size

                # free unoccupied GPU memory; there is no CPU counterpart
                # (the original called the non-existent torch.cpu.empty_cache)
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()
                bar.value += batch_size
                bar.description = f'{bar.value} / {total}'
            bar.close()
            epoch_loss = running_loss / dataset.get_num_of_data(phase)
            print('{} Loss: {:.4f}'.format(phase, epoch_loss))

            # deep copy the model
            if phase == 'val' and (best_loss is None
                                   or epoch_loss < best_loss):
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), best_path)
                print(f'The best model has been saved to {best_path} ...')
            torch.save(model.state_dict(), current_path)
            print(f'The current model has been saved to {current_path} ...')
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val loss: {:4f}'.format(best_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)
    self.model = model
def _randomization(metric_scores_a, metric_scores_b, n_perm=100000):
    """
    This method computes the randomization test as described in [1].

    Parameters
    ----------
    metric_scores_a : numpy array
        Vector of per-query metric scores for the IR system A.
    metric_scores_b : numpy array
        Vector of per-query metric scores for the IR system B.
    n_perm : int
        Number of permutations evaluated in the randomization test.

    Returns
    -------
    metric_scores : (float, float)
        A tuple (p-value_1, p-value_2) being respectively the one-sided and
        two-sided p-values.

    References
    ----------
    .. [1] Smucker, Mark D., James Allan, and Ben Carterette. "A comparison
        of statistical significance tests for information retrieval
        evaluation." In Proceedings of the sixteenth ACM conference on
        Conference on information and knowledge management, pp. 623-632.
        ACM, 2007.
    """
    progress_bar = IntProgress(min=0, max=10,
                               description="Randomization Test")
    display(progress_bar)

    # find the best system
    metric_scores_a_mean = np.mean(metric_scores_a)
    metric_scores_b_mean = np.mean(metric_scores_b)

    best_metrics = metric_scores_a
    worst_metrics = metric_scores_b
    if metric_scores_a_mean < metric_scores_b_mean:
        best_metrics = metric_scores_b
        worst_metrics = metric_scores_a

    difference = np.mean(best_metrics) - np.mean(worst_metrics)
    abs_difference = np.abs(difference)

    p1 = 0.0  # one-sided
    p2 = 0.0  # two-sided
    N = float(len(metric_scores_a))

    a_sum = np.sum(best_metrics)
    b_sum = np.sum(worst_metrics)

    # repeat n_perm times
    for i in range(n_perm):
        # advance the bar every tenth of the permutations (integer division
        # avoids a float modulus)
        if i % (n_perm // 10) == 0:
            progress_bar.value += 1

        # select a random subset
        sel = np.random.choice([False, True], len(metric_scores_a))

        a_sel_sum = np.sum(best_metrics[sel])
        b_sel_sum = np.sum(worst_metrics[sel])

        # compute avg performance of randomized models
        a_mean = (a_sum - a_sel_sum + b_sel_sum) / N
        b_mean = (b_sum - b_sel_sum + a_sel_sum) / N

        # performance difference
        delta = a_mean - b_mean

        if delta >= difference:
            p1 += 1.
        if np.abs(delta) >= abs_difference:
            p2 += 1.

    progress_bar.bar_style = "success"
    progress_bar.close()

    p1 /= n_perm
    p2 /= n_perm

    return p1, p2
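# --- usage sketch ----------------------------------------------------------
# A minimal sketch comparing two synthetic per-query score vectors; the
# numbers are fabricated purely to exercise the test.
import numpy as np

rng = np.random.RandomState(42)
scores_a = rng.uniform(0.4, 0.8, size=200)               # system A, per query
scores_b = scores_a + rng.normal(0.02, 0.05, size=200)   # slightly better B
p_one_sided, p_two_sided = _randomization(scores_a, scores_b, n_perm=10000)
print(p_one_sided, p_two_sided)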
def bias_variance(datasets=[], algos=[], metrics=[], L=10, k=2):
    """
    This method computes the bias vs. variance decomposition of the error.
    The approach used here is based on the works of [Webb05]_ and [Dom05]_.

    Each instance of the dataset is scored `L` times. A single scoring is
    achieved by splitting the dataset at random into `k` folds. Each fold
    is scored by the model `M` trained on the remainder folds. [Webb05]_
    recommends the use of 2 folds.

    If metric is MSE then the standard decomposition is used. The Bias for
    an instance `x` is defined as the mean squared error of the `L` trained
    models w.r.t. the true label `y`, denoted with
    :math:`{\\sf E}_{L} [M(x) - y]^2`. The Variance for an instance `x` is
    measured across the `L` trained models:
    :math:`{\\sf E}_{L} [M(x) - {\\sf E}_{L} M(x)]^2`. Both are averaged
    over all instances in the dataset.

    If metric is any of the IR quality measures, we resort to the bias
    variance decomposition of the mean squared error of the given metric
    w.r.t. its ideal value, e.g., for the case of NDCG,
    :math:`{\\sf E}_{L} [1 - NDCG]^2`. Note that a formal bias/variance
    decomposition for these measures has not been proposed yet.

    Parameters
    ----------
    datasets : list of rankeval.dataset.Dataset
        The dataset instances.
    algos : list of function
        Each element should be a wrapper of a learning algorithm. The
        function should accept four parameters: `train_X`, `train_Y`,
        `train_q`, `test_X`.

        - `train_X`: numpy.ndarray storing a 2-D matrix of size
          num_docs x num_features
        - `train_Y`: numpy.ndarray storing a vector of document's
          relevance labels
        - `train_q`: numpy.ndarray storing a vector of query lengths
        - `test_X`: numpy.ndarray as for `train_X`

        A model is trained on `train_X`, `train_Y`, `train_q`, and used to
        score `test_X`. A numpy.ndarray with such scores must be returned.
    metrics : list of rankeval.metrics.metric.Metric
        The metrics used to compute the error (MSE or an IR quality metric).
    L : int
        Number of iterations.
    k : int
        Number of folds.

    Returns
    -------
    bias_variance : xarray.DataArray
        A DataArray containing the bias/variance decomposition of the error
        for any given dataset, algorithm and metric.

    References
    ----------
    .. [Webb05] Webb, Geoffrey I., and Paul Conilione. "Estimating bias
       and variance from data." Pre-publication manuscript (`pdf
       <http://www.csse.monash.edu/webb/-Files/WebbConilione06.pdf>`_)
       (2005).
    .. [Dom05] Domingos P. A unified bias-variance decomposition. In
       Proceedings of 17th International Conference on Machine Learning
       2000 (pp. 231-238).
    """
    assert k >= 2
    assert L >= 2
    assert len(datasets) > 0
    assert len(metrics) > 0
    for metric in metrics:
        assert isinstance(metric, Metric)

    progress_bar = IntProgress(min=0,
                               max=len(datasets) * len(metrics) * len(algos),
                               description="Iterating datasets and metrics")
    display(progress_bar)

    data = np.zeros(shape=(len(datasets), len(metrics), len(algos), 3),
                    dtype=np.float32)
    for idx_dataset, dataset in enumerate(datasets):
        for idx_algo, algo in enumerate(algos):
            for idx_metric, metric in enumerate(metrics):
                progress_bar.value += 1

                scores = _multi_kfold_scoring(dataset, algo=algo, L=L, k=k)

                avg_error = 0.
                avg_bias = 0.
                avg_var = 0.
                if not isinstance(metric, MSE):
                    # mse over metric, assume error is 1-metric
                    # (not exactly Domingos' paper)
                    q_scores = np.empty((dataset.n_queries, L),
                                        dtype=np.float32)
                    for i in range(L):
                        q_scores[:, i] = metric.eval(dataset=dataset,
                                                     y_pred=scores[:, i])[1]
                    avg_error = np.mean((q_scores - 1.)**2.)
                    avg_pred = np.mean(q_scores, axis=1)
                    avg_bias = np.mean((avg_pred - 1.)**2.)
                    avg_var = np.mean(
                        (q_scores - avg_pred.reshape((-1, 1)))**2.
                    )
                else:
                    # mse
                    avg_error = np.mean(
                        (scores - dataset.y.reshape((-1, 1)))**2.)
                    avg_pred = np.mean(scores, axis=1)
                    avg_bias = np.mean((avg_pred - dataset.y)**2.)
                    avg_var = np.mean(
                        (scores - avg_pred.reshape((-1, 1)))**2.
                    )

                data[idx_dataset][idx_metric][idx_algo][0] = avg_error
                data[idx_dataset][idx_metric][idx_algo][1] = avg_bias
                data[idx_dataset][idx_metric][idx_algo][2] = avg_var

    progress_bar.bar_style = "success"
    progress_bar.close()

    performance = xr.DataArray(data,
                               name='Bias/Variance Decomposition',
                               coords=[datasets, metrics,
                                       [a.__name__ for a in algos],
                                       ['Error', 'Bias', 'Variance']],
                               dims=['dataset', 'metric', 'algo', 'error'])
    return performance
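# --- `algo` wrapper sketch (hypothetical) ----------------------------------
# The docstring of bias_variance describes the contract an `algo` function
# must satisfy. A minimal sketch with a pointwise regressor, assuming
# scikit-learn is installed; a pointwise model ignores the query structure,
# so train_q is unused here.
from sklearn.ensemble import GradientBoostingRegressor

def gbrt_algo(train_X, train_Y, train_q, test_X):
    model = GradientBoostingRegressor(n_estimators=100)
    model.fit(train_X, train_Y)
    return model.predict(test_X)

# bias_variance reports algos by __name__, e.g. 'gbrt_algo':
# result = bias_variance([dataset], [gbrt_algo], [NDCG(cutoff=10)], L=10, k=2)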
def get_press_series(spliter, color, difference, paddings=2): global tmp_imgs white_width = 17 + 2 * paddings black_width = 16 + 2 * paddings height = 106 width = 884 print('Start extracting keypress series ...') print(f' White width: {white_width}px') print(f' Black width: {black_width}px') print(f' Height: {height}px') print('') for name in spliter: black_coor = None N = y_org[name].shape[0] for p in X_path[name]: img = cv2.imread(p) black_coor = get_black_boundaries(img) if len(black_coor) == 36: break y_trans = np.transpose(y_org[name], (1, 0)) print('Pre-loading images ...') bar = IntProgress(max=N) display(bar) for i in range(N): img = pad_img(cv2.imread(X_path[name][i]), paddings) tmp_imgs.append(img) bar.value += 1 bar.close() bar = IntProgress(max=88) display(bar) for k in range(88): if k in black_mask: col = 'black' else: col = 'white' if col not in color: continue _y = y_trans[k] _y = np.argwhere(_y > 0).flatten() if _y.shape[0] == 0: continue last = _y[0] _n = len(_y) for i in range(_n): if i % 32 == 0: bar.description = f'{i}/{_n}' if i != 0 and _y[i] != _y[i - 1] + 1: if col == 'black': add_series(name, col, last, _y[i - 1], k, paddings, black_coor, difference) else: add_series(name, col, last, _y[i - 1], k, paddings, difference) last = _y[i] if i == _n - 1 and last != -1: if col == 'black': add_series(name, col, last, _y[i], k, paddings, black_coor, difference) else: add_series(name, col, last, _y[i], k, paddings, difference) bar.value += 1 bar.close() del tmp_imgs tmp_imgs = [] print(f'{name} set loading finished ...') print(' Pressed white keys: ' + str(len(X_series[name]['white']))) print(' Pressed black keys: ' + str(len(X_series[name]['black'])))
def seperate(spliter, color, size):
    single_paddings = 2
    bundle_paddings = 10
    white_single_width = 17 + 2 * single_paddings
    white_bundle_width = 17 + 2 * bundle_paddings
    black_single_width = 16 + 2 * single_paddings
    black_bundle_width = 16 + 2 * bundle_paddings
    height = 106
    width = 884

    print('Start separating keyboard ...')
    print(f'    White single width: {white_single_width}px')
    print(f'    Black single width: {black_single_width}px')
    print(f'    White bundle width: {white_bundle_width}px')
    print(f'    Black bundle width: {black_bundle_width}px')

    for name in spliter:
        black_coor = None
        # find a frame where all 36 black-key boundaries are detectable
        for p in X_path[name]:
            img = cv2.imread(p)
            black_coor = get_black_boundaries(img)
            if len(black_coor) == 36:
                break

        X['single']['white'][name] = []
        X['single']['black'][name] = []
        X['bundle']['white'][name] = []
        X['bundle']['black'][name] = []
        y['white'][name] = []
        y['black'][name] = []

        bar = IntProgress(max=len(X_path[name]))
        display(bar)
        for i, p in enumerate(X_path[name]):
            white_tmp_mask = None
            black_tmp_mask = None
            # keep a small random fraction of frames with all keys unmasked
            if random.random() > 0.005:
                white_tmp_mask, black_tmp_mask = get_masks(y_org[name][i])
            else:
                white_tmp_mask = np.arange(52)
                black_tmp_mask = np.arange(36)
            img = cv2.imread(p)
            if 'single' in size:
                if 'white' in color:
                    get_white_keys(X['single']['white'][name], img,
                                   white_tmp_mask, paddings=single_paddings)
                if 'black' in color:
                    get_black_keys(X['single']['black'][name], img,
                                   black_coor, black_tmp_mask,
                                   paddings=single_paddings)
            if 'bundle' in size:
                if 'white' in color:
                    get_white_keys(X['bundle']['white'][name], img,
                                   white_tmp_mask, paddings=bundle_paddings)
                if 'black' in color:
                    get_black_keys(X['bundle']['black'][name], img,
                                   black_coor, black_tmp_mask,
                                   paddings=bundle_paddings)
            for ind in white_mask[white_tmp_mask]:
                y['white'][name].append(y_org[name][i][ind])
            for ind in black_mask[black_tmp_mask]:
                y['black'][name].append(y_org[name][i][ind])
            bar.value += 1
            del img
        bar.close()

        print('In ' + name + ' set: ')
        for kind in color:
            for k2 in size:
                X[k2][kind][name] = np.array(X[k2][kind][name])
            y[kind][name] = np.array(y[kind][name])
            print('    # of pressed ' + kind + ' key: ' +
                  str(np.sum(y[kind][name] > 0)))
            print('    # of unpressed ' + kind + ' key: ' +
                  str(np.sum(y[kind][name] <= 0)))
def interpolate(self, tile_size=256, n_workers=1, threads_per_worker=8,
                memory_limit='14GB', progressBar=None):
    """
    Interpolates the data of a time series object using the interpolation
    method or methods selected by the user (or, outside a notebook, the
    pre-selected data variable and method).
    """
    if self.mask is None:
        # Nothing to interpolate against without a QA mask
        # (the original `pass` here was almost certainly meant as a guard)
        return

    if self.isNotebook is True:
        # Set up progress bar
        _items = len(self.interpolation_methods.value)
        # For every interpolation method selected by the user
        _item = 0
        progress_bar = IntProgress(
            value=0,
            min=0,
            max=_items,
            step=1,
            description='',
            bar_style='',  # 'success', 'info', 'warning', 'danger' or ''
            orientation='horizontal',
            style={'description_width': 'initial'},
            layout={'width': '75%'})
        display(progress_bar)
        progress_bar.value = _item

        # Get temp dataset to perform the interpolation
        data_var = self.data_vars.value
        # Interpolation methods
        interpolation_methods = self.interpolation_methods.value
    else:
        data_var = self.selected_data_var
        interpolation_methods = [self.selected_interpolation_method]

    tmp_ds = getattr(self.ts.data, data_var).copy(deep=True)
    # Store original data type
    dtype = tmp_ds.data.dtype

    # Get fill value and the mask of fill-value pixels
    fill_value = tmp_ds.attrs['nodatavals'][0]
    mask_fill_value = (tmp_ds == fill_value)
    mask_fill_value = (mask_fill_value * fill_value).astype(dtype)

    # Apply mask
    tmp_ds *= self.mask
    # Set NaN where there are zeros
    tmp_ds = tmp_ds.where(tmp_ds != 0)
    # Where there were fill values, set the value again to
    # fill value to avoid not having data to interpolate
    tmp_ds += mask_fill_value

    # Where there are less than 20% of observations, use fill value
    min_n_obs = int(tmp_ds.shape[0] * 0.2)
    tmp_ds = tmp_ds.where(self.mask.sum(axis=0) > min_n_obs, fill_value)

    for method in interpolation_methods:
        if self.isNotebook is True:
            progress_bar.value = _item
            progress_bar.description = (f"Interpolation of {data_var}"
                                        f" using {method}")

        if method == 'smoothn':
            # First, we need a linear interpolation
            tmp_interpol_ds = tmp_ds.interpolate_na(dim='time',
                                                    method='linear')

            # Smoothing
            s = float(self.smooth_factor.value)
            tmp_masked = np.ma.masked_equal(
                tmp_interpol_ds.data * self.mask, 0)
            tmp_smoothed = smoothn(
                tmp_masked,
                isrobust=True,
                s=s, TolZ=1e-6, axis=0)[0]
            del tmp_masked

            # Overwrite data
            tmp_interpol_ds.data = tmp_smoothed
        else:
            tmp_interpol_ds = tmp_ds.interpolate_na(dim='time',
                                                    method=method)

        # Set data type to match the original (non-interpolated)
        tmp_interpol_ds.data = tmp_interpol_ds.data.astype(dtype)
        # Copy metadata attributes
        tmp_interpol_ds.attrs = tmp_ds.attrs

        # Save to file
        fname = f"{self.product}.{self.version}.{data_var}.{method}.tif"
        output_dir = os.path.join(self.source_dir, data_var[1::],
                                  'interpolated')
        if os.path.exists(output_dir) is False:
            os.mkdir(output_dir)

        fname = os.path.join(output_dir, fname)
        save_dask_array(fname=fname, data=tmp_interpol_ds,
                        data_var=data_var, method=method,
                        tile_size=tile_size, n_workers=n_workers,
                        threads_per_worker=threads_per_worker,
                        memory_limit=memory_limit,
                        progressBar=progressBar)

        if self.isNotebook is True:
            _item += 1

    if self.isNotebook is True:
        # Remove progress bar
        progress_bar.close()
        del progress_bar
class data_batch:
    def __init__(self, type='train', size='single', color='white',
                 batch_size=64, need_velocity=True, NCHW=True, shuffle=True,
                 max_num=-1):
        self.size = size
        self.type = type
        self.color = color
        self.batch_size = batch_size
        self.NCHW = NCHW
        self.max_num = max_num
        self.pressed = []
        self.unpressed = []
        self.num_pressed = 0
        self.num_unpressed = 0
        self.need_velocity = need_velocity
        for i, x in enumerate(y[color][type]):
            if x > 0:
                self.pressed.append(i)
                self.num_pressed += 1
            else:
                self.unpressed.append(i)
                self.num_unpressed += 1
        if shuffle:
            random.shuffle(self.pressed)
            random.shuffle(self.unpressed)
        if self.max_num == -1:
            self.max_num = len(self.unpressed)
            self.iter_num = len(self.unpressed) * 2
        else:
            self.max_num = self.max_num // 2
            self.iter_num = max_num
        self.bar = IntProgress(max=self.iter_num)
        display(self.bar)

    def __iter__(self):
        self.index = 0
        return self

    def __next__(self):
        # balanced batches: half pressed, half unpressed keys
        start = self.index * self.batch_size // 2
        end = (self.index + 1) * self.batch_size // 2
        if start >= self.max_num:
            self.bar.close()
            raise StopIteration
        if end >= self.max_num:
            end = self.max_num
            start = end - self.batch_size // 2
        self.index += 1
        ind = np.array([])
        s = start % self.num_pressed
        t = end % self.num_pressed
        if start // self.num_pressed == end // self.num_pressed:
            ind = np.append(ind, np.array(self.pressed[s:t]))
        else:
            ind = np.append(ind, np.array(self.pressed[s:]))
            ind = np.append(ind, np.array(self.pressed[:t]))
        ind = np.append(ind, np.array(self.unpressed[start:end]))
        ind = ind.flatten().astype('int64')
        X_return = X[self.size][self.color][self.type][ind]
        if self.NCHW:
            X_return = np.transpose(X_return, (0, 3, 1, 2))
        y_return = y[self.color][self.type][ind]
        if not self.need_velocity:
            # np.int is deprecated; plain int works as a dtype
            y_return = (y_return > 0).astype(int)
        self.bar.value += self.batch_size
        return (X_return, y_return)
def sample_mcmc(model, h, x0=None, burnin=1000, n_samples=10000,
                sample_rate=10, g=None, noiseless_sample=False,
                progress_bar=False):
    """
    Sample points (theta) from either a Gaussian process model or simulator
    using the Metropolis-Hastings algorithm. Default proposal density, g,
    is a Gaussian with diagonal covariance; covariances set to a small
    value based on the range of possible parameter settings for each
    dimension.

    Args:
        (models.GP) OR (simulators.Simulator) model: GP model of the
            discrepancy, OR Simulator instance with callable f(),
            noiseless_f()
        (float) h: bandwidth for KDE.
        (np.ndarray) x0: initial starting point.
        (int) burnin: number of burn-in samples.
        (int) sample_rate: number of iterations between kept samples.
        (callable) g: proposal density.
        (bool) noiseless_sample: whether to call noiseless_f or f (when
            `model' is a Simulator).
        (bool) progress_bar: whether to show progress bar in Jupyter
            notebook.
    Returns:
        (np.ndarray) samples: with shape (n_samples, input_dim).
    """
    input_dim = model.input_dim
    bounds = model.bounds

    # function proportional to predictive distribution
    if isinstance(model, GP):
        f = lambda x: norm.cdf(
            (h - model.mu(x)) / np.sqrt(model.v(x) + model.obs_noise))
    elif isinstance(model, Simulator):
        # std. dev. of obs noise is stored in simulator, so no np.sqrt;
        # the whole difference is divided by obs_noise (the original
        # misplaced the closing parenthesis)
        if noiseless_sample:
            f = lambda x: norm.cdf(
                (h - model.noiseless_f(x)) / model.obs_noise)
        else:
            f = lambda x: norm.cdf((h - model.f(x)) / model.obs_noise)
    else:
        raise ValueError('pass simulator or GP model as first argument.')

    if x0 is None:
        x0 = np.array([np.random.uniform(b1, b2)
                       for (b1, b2) in bounds]).reshape(1, input_dim)
    if g is None:
        cov = []
        for (b1, b2) in bounds:
            cov.append(0.025 * (b2 - b1))
        cov = np.diag(np.array(cov)).reshape(input_dim, input_dim)
        g = lambda xt: np.random.multivariate_normal(
            xt.squeeze(), cov).reshape(1, input_dim)

    progress_bar = progress_bar and 'jupyter' in os.environ['_']

    # ================================================
    # Burn-in period =================================
    if progress_bar:
        prog = IntProgress(value=0, max=burnin, description='Burn-in')
        display(prog)

    x = np.array(x0)
    for i in range(burnin):
        cand = g(x)  # candidate point
        if not model.within_bounds(cand):
            continue
        a = f(cand) / f(x)  # acceptance ratio
        if np.random.rand() < a:  # accept/reject
            x = np.copy(cand)
        if progress_bar:
            prog.value += 1

    # ================================================
    # Begin sampling =================================
    if progress_bar:
        prog.close()
        prog = IntProgress(value=0, max=n_samples, description='Sampling')
        display(prog)

    samples = []
    i = 0
    while len(samples) < n_samples:
        cand = g(x)  # candidate point
        if not model.within_bounds(cand):
            continue
        a = f(cand) / f(x)  # acceptance ratio
        if a < 0:
            continue
        if np.random.rand() < a:  # accept/reject
            x = np.copy(cand)
        if (i % sample_rate) == 0:
            samples.append(np.copy(x))
        if progress_bar:
            prog.value += 1
        i += 1

    if progress_bar:
        prog.close()

    return np.array(samples).reshape(n_samples, input_dim)
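# --- usage sketch (hypothetical) -------------------------------------------
# A minimal sketch under stated assumptions: `Simulator` is the package's
# base class imported elsewhere, and subclassing it without extra
# constructor setup is valid here, which may not hold for the real package.
class ToySimulator(Simulator):
    input_dim = 1
    bounds = [(-3.0, 3.0)]
    obs_noise = 0.5

    def f(self, x):
        return np.ravel(x)[0] ** 2  # stand-in discrepancy

    def noiseless_f(self, x):
        return np.ravel(x)[0] ** 2

    def within_bounds(self, x):
        (b1, b2), = self.bounds
        return b1 <= np.ravel(x)[0] <= b2

samples = sample_mcmc(ToySimulator(), h=1.0, burnin=100, n_samples=500,
                      sample_rate=5)
print(samples.mean(), samples.std())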