Example no. 1
def _multi_kfold_scoring(dataset, algo, L=10, k=2):
    """
    Performs multiple scorings of the given dataset.

    Parameters
    ----------
    dataset : rankeval.dataset.Dataset
        The dataset instance.
    algo : function
        See :func:`bias_variance`.
    L : int
        Number of iterations.
    k : int
        Number of folds.

    Returns
    -------
    score : numpy.ndarray
        A matrix num_instances x L.
    """
    progress_bar = IntProgress(min=0, max=L, description="Computing L scores")
    display(progress_bar)    

    scores = np.zeros((dataset.n_instances, L), dtype=np.float32)

    for l in range(L):
        progress_bar.value += 1

        scores[:,l] = _kfold_scoring(dataset, k, algo)

    progress_bar.bar_style = "success"
    progress_bar.close()
    
    return scores
Example no. 2
class PBinJ(object):
    """ initialize multiple progress bars for tracking nested stages of fitting routine
    """
    def __init__(self, n=1, value=0, status='{}', color='b', width='60%', height='22px'):
        self.displayed = False
        self.style_bar(n=n, value=value, status=status, color=color, width=width, height=height)

    def style_bar(self, n=1, value=0, status='{}', color='b', width='60%', height='22px'):
        colordict = {'g': 'success', 'b': '', 'r': 'danger', 'y': 'warning', 'c': 'info'}
        bar_style = colordict[color]
        self.bar = IntProgress(min=0, max=n, value=value, bar_style=bar_style)
        self.status = status
        self.bar.bar_style = bar_style
        self.bar.width = width
        self.bar.height = height

    def reset_bar(self):
        self.update(value=0)

    def update(self, value=None, status=None):
        if not self.displayed:
            display(self.bar)
            self.displayed = True
        if status is not None:
            if hasattr(status, '__iter__') and not isinstance(status, str):
                status = self.status.format(*status)
            else:
                status = self.status.format(status)
            self.bar.description = status
        if value is not None:
            self.bar.value = value + 1

    def clear(self):
        self.bar.close()
        self.displayed = False
Example no. 3
def new_k_means(data, k, plot=True):
    # This will display a progress bar during k-mean execution
    f = IntProgress(description=f'KM (k={k}):', min=0, max=50)
    display(f)

    # initializing the array where we collect all cluster assignments
    cluster_collection = np.zeros((50, data.shape[0]), dtype=np.int32)
    # initializing the array where we collect all risk values
    risk_collection = np.zeros(50)

    for i in range(50):
        f.value += 1
        centroids, clusters = k_means(data, k, random_seed=i, plot=False)
        risk_collection[i] = empirical_risk(data, clusters, centroids)
        cluster_collection[i, :] = clusters

    # find the best cluster assignment and print the lowest found empirical risk
    min_ind = np.argmin(risk_collection)
    max_ind = np.argmax(risk_collection)
    if plot:
        print("Cluster division with lowest empirical risk")
        plotting(data, clusters=cluster_collection[min_ind, :])
        print("Cluster division with highest empirical risk")
        plotting(data, clusters=cluster_collection[max_ind, :])

        print('min empirical risk is ', np.min(risk_collection))

    # Let's remove progress bar
    f.close()
    return cluster_collection[min_ind, :], risk_collection
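
A minimal usage sketch, assuming `data` is a 2-D NumPy array and that the `k_means`, `empirical_risk`, and `plotting` helpers used by this snippet are in scope:

best_clusters, risks = new_k_means(data, k=3, plot=False)
print('Lowest empirical risk:', risks.min())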
Example no. 4
class PBinJ(object):
    """ initialize multiple progress bars for tracking nested stages of fitting routine
    """

    def __init__(self, n=1, value=0, status="{}", color="r", width="50%", height="25px"):
        self.displayed = False
        self.style_bar(n=n, value=value, status=status, color=color, width=width, height=height)

    def style_bar(self, n=1, value=0, status="{}", color="r", width="50%", height="25px"):
        colordict = {"g": "#16a085", "b": "#4168B7", "r": "#e74c3c", "y": "#f39c12"}
        self.bar = IntProgress(min=0, max=n, value=value)
        self.status = status
        self.bar.color = colordict[color]  # the 'color' attribute exists only on older ipywidgets; newer versions use bar.style.bar_color
        self.bar.width = width
        self.bar.height = height

    def reset_bar(self):
        self.update(value=0)

    def update(self, value=None, status=None):
        if not self.displayed:
            display(self.bar)
            self.displayed = True
        if status is not None:
            if hasattr(status, "__iter__") and not isinstance(status, str):
                status = self.status.format(*status)
            else:
                status = self.status.format(status)
            self.bar.description = status
        if value is not None:
            self.bar.value = value + 1

    def clear(self):
        self.bar.close()
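
A brief usage sketch that applies to either PBinJ variant; the two-field status template and the loop bounds are illustrative assumptions:

bar = PBinJ(n=50, status='stage {} | step {}')
for stage in range(3):
    for step in range(50):
        bar.update(value=step, status=(stage, step))  # tuple fills both template fields
    bar.reset_bar()
bar.clear()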
Example no. 5
    def __get_max_gap_length(self, b):
        """
        Compute the max gap length of a masked time series
        :param b: Progress bar object
        """
        # TODO
        # This function should be parallelised!

        bands, rows, cols = self.mask.shape
        max_gap_length = np.zeros((rows, cols), np.int16)

        if not isinstance(b, QProgressBar):
            progress_bar = IntProgress(
                value=0,
                min=0,
                max=10,
                step=1,
                description='Computing max gap length...',
                bar_style='', # 'success', 'info', 'warning', 'danger' or ''
                orientation='horizontal',
                style={'description_width': 'initial'},
                layout={'width': '50%'}
            )
            display(progress_bar)
        else:
            b.setEnabled(True)

        for i in range(rows):
            if isinstance(b, QProgressBar):
                b.setFormat('Computing maximum gap length...')
                b.setValue(int((i*10.)/rows))
            else:
                progress_bar.value = int((i*10.)/rows)

            for j in range(cols):
                for key, group in i_groupby(self.mask.data[:,i,j]):
                    if key is False:
                        _gap_length = len(list(group))
                        if _gap_length > max_gap_length[i,j]:
                            max_gap_length[i,j] = _gap_length

        if isinstance(b, QProgressBar):
            b.setValue(0)
            b.setEnabled(False)
        else:
            # Remove progress bar
            progress_bar.close()
            del progress_bar

        # Create xarray DataArray
        _max_gap_length = xr.DataArray(max_gap_length,
                            coords=[self.mask.latitude.data,
                                    self.mask.longitude.data],
                            dims=['latitude', 'longitude'])

        max_gap_length = None

        self.max_gap_length = _max_gap_length
Example no. 6
def statistical_significance(datasets,
                             model_a,
                             model_b,
                             metrics,
                             n_perm=100000):
    """
    This method computes the statistical significance of the performance difference between model_a and model_b.

    Parameters
    ----------
    datasets : list of Dataset
        The datasets to use for analyzing the behaviour of the model using the given metrics and models
    model_a : RTEnsemble
        The first model considered.
    model_b : RTEnsemble
        The second model considered.
    metrics : list of Metric
        The metrics to use for the analysis
    n_perm : int
        Number of permutations for the randomization test.

    Returns
    -------
    stat_sig : xarray.DataArray
        A DataArray containing the statistical significance of the performance difference
        between any pair of models on the given dataset.
    """

    progress_bar = IntProgress(min=0,
                               max=len(datasets) * len(metrics),
                               description="Iterating datasets and metrics")
    display(progress_bar)

    data = np.zeros(shape=(len(datasets), len(metrics), 2), dtype=np.float32)
    for idx_dataset, dataset in enumerate(datasets):
        y_pred_a = model_a.score(dataset, detailed=False)
        y_pred_b = model_b.score(dataset, detailed=False)
        for idx_metric, metric in enumerate(metrics):
            progress_bar.value += 1

            metrics_a = metric.eval(dataset, y_pred_a)[1]
            metrics_b = metric.eval(dataset, y_pred_b)[1]

            p1, p2 = _randomization(metrics_a, metrics_b, n_perm=n_perm)

            data[idx_dataset][idx_metric][0] = p1
            data[idx_dataset][idx_metric][1] = p2

    progress_bar.bar_style = "success"
    progress_bar.close()

    performance = xr.DataArray(
        data,
        name='Statistical Significance',
        coords=[datasets, metrics, ['one-sided', 'two-sided']],
        dims=['dataset', 'metric', 'p-value'])

    return performance
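
A hedged call sketch; `test_dataset`, the two RTEnsemble models, and `ndcg_10` are placeholder names for rankeval objects loaded elsewhere:

stat_sig = statistical_significance([test_dataset], model_a, model_b,
                                    [ndcg_10], n_perm=10000)
print(stat_sig.sel({'p-value': 'two-sided'}))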
Example no. 7
def _kfold_scoring(dataset, k, algo):
    """
    Scores the given dataset with the given algo using k-fold train/test.

    Parameters
    ----------
    dataset : rankeval.dataset.Dataset
        The dataset instance.
    k : int
        Number of folds.
    algo : function
        See :func:`bias_variance`.

    Returns
    -------
    score : numpy.ndarray
        A vector of num_instances scores.
    """
    progress_bar = IntProgress(min=0, max=k, description="Processing k folds")
    display(progress_bar)    

    scores = np.zeros(dataset.n_instances, dtype=np.float32)
    query_sizes = dataset.get_query_sizes()
    # shuffle queries
    shuffled_qid = np.random.permutation(dataset.n_queries)
    chunk_query_size = int(math.ceil(dataset.n_queries/float(k)))
    for p in range(0, dataset.n_queries, chunk_query_size):
        progress_bar.value += 1

        # p-th fold is used for testing
        test_rows = np.full(dataset.n_instances,
                            fill_value=False,
                            dtype=bool)
        for q in shuffled_qid[p: p + chunk_query_size]:
            test_rows[dataset.query_offsets[q]:dataset.query_offsets[q+1]] = True
        # other folds are used for training
        train_rows = np.logical_not(test_rows)

        train_q = np.full(dataset.n_queries,
                          fill_value=True,
                          dtype=bool)
        train_q[shuffled_qid[p: p+chunk_query_size]] = False

        # get algorithm predictions
        fold_scores = algo(
            dataset.X[train_rows],
            dataset.y[train_rows],
            query_sizes[train_q],
            dataset.X[test_rows]
        )
        # update scores for the current fold
        scores[test_rows] = fold_scores
        
    progress_bar.bar_style = "success"
    progress_bar.close()
        
    return scores
Example no. 8
def progress_bar(generator, mx):
    prog = IntProgress(value=0, max=mx)
    display(prog)

    for e in generator:
        yield e
        prog.value += 1

    prog.close()
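
A short usage sketch, wrapping any iterable whose length is known in advance:

import time

for item in progress_bar(range(20), mx=20):
    time.sleep(0.1)  # stand-in for real per-item work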
Example no. 9
    def display(self, width=0, height=0, ray=False, timeout=120):
        """Display PyMol session

        :param width: width in pixels (0 uses current viewport)
        :param height: height in pixels (0 uses current viewport)
        :param ray: use ray tracing (if running PyMOL headless, this parameter
        has no effect and ray tracing is always used)
        :param timeout: timeout in seconds

        Returns
        -------
        fig : IPython.display.Image

        """
        from IPython.display import Image
        from IPython.display import display
        from ipywidgets import IntProgress

        progress_max = int((timeout * 20)**0.5)
        progress = None
        filename = tempfile.mktemp('.png')  # mktemp is deprecated; kept here because the PyMOL server writes the file itself

        try:
            self._server.png(filename, width, height, -1, int(ray))

            for i in range(1, progress_max):
                if os.path.exists(filename):
                    break

                if progress is None:
                    progress = IntProgress(min=0, max=progress_max)
                    display(progress)

                progress.value += 1
                time.sleep(i / 10.0)

            if not os.path.exists(filename):
                raise RuntimeError('timeout exceeded')

            return Image(filename)
        finally:
            if progress is not None:
                progress.close()

            try:
                os.unlink(filename)
            except OSError:
                pass
Example no. 10
def in_progress(seq, msg="Progress: [%(processed)d / %(total)d]", length=None):
    """ Iterate over sequence, yielding item with progress widget displayed.
        This is useful if you need to precess sequence of items with some
        time consuming operations

        .. note::

            This works only in Jupyter Notebook

        .. note::

            This function requires *ipywidgets* package to be installed

        :param seq: sequence to iterate on.
        :param str msg: (optional) message template to display.
                        available to use 'processed' and 'total' integer vars,
                        where 'processed' is number of items processed and
                        'total' is total number of items in seq.
        :param int length: (optional) if seq is a generator, or it is not
                           possible to apply 'len(seq)' to 'seq',
                           then this argument is required and its value will
                           be used as total number of items in seq.

        Example::

            import time
            for i in in_progress(range(10)):
                time.sleep(1)
    """
    from IPython.display import display
    from ipywidgets import IntProgress

    if length is None:
        length = len(seq)

    progress = IntProgress(value=0, min=0, max=length,
                           description=msg % {'processed': 0,
                                              'total': length})
    display(progress)

    for i, item in enumerate(seq, 1):
        progress.value = i
        progress.description = msg % {'processed': i, 'total': length}
        yield item

    progress.close()
Example no. 11
def get_press_series(spliter, color):

    paddings = 4
    white_width = 17 + 2 * paddings
    black_width = 16 + 2 * paddings
    height = 106
    width = 884

    print('Start extracting keypress series ...')
    print(f'  White width: {white_width}px')
    print(f'  Black width: {black_width}px')

    for name in spliter:
        black_coor = None
        N = y_org[name].shape[0]
        for p in X_path[name]:
            img = cv2.imread(p)
            black_coor = get_black_boundaries(img)
            if len(black_coor) == 36:
                break
        bar = IntProgress(max=88 * N)
        display(bar)
        for k in range(88):
            last = -1
            for i in range(N):
                if y_org[name][i][k] > 0:
                    if last == -1:
                        last = i
                if y_org[name][i][k] <= 0 or i == N - 1:
                    if last != -1:
                        if k in black_mask:
                            add_series(name, 'black', last, i - 1, k, paddings,
                                       black_coor)
                        else:
                            add_series(name, 'white', last, i - 1, k, paddings)
                        last = -1
                bar.value += 1
        bar.close()
        print(f'{name} set loading finished ...')
        print('  Pressed white keys: ' + str(len(X_series[name]['white'])))
        print('  Pressed black keys: ' + str(len(X_series[name]['black'])))
Example no. 12
class lstm_data_batch:
    def __init__(self,
                 type='train',
                 color='white',
                 NCHW=True,
                 shuffle=True,
                 need_bar=True,
                 max_num=-1):
        self.type = type
        self.color = color
        self.NCHW = NCHW
        self.need_bar = need_bar
        if max_num == -1:
            self.max_num = len(X_series[type][color])
        else:
            self.max_num = max_num
        self.order = np.arange(self.max_num)
        if shuffle:
            random.shuffle(self.order)
        if need_bar:
            self.bar = IntProgress(max=self.max_num)
            display(self.bar)

    def __iter__(self):
        self.index = 0
        return self

    def __next__(self):
        if self.index >= self.max_num:
            if self.need_bar:
                self.bar.close()
            raise StopIteration
        ind = self.order[self.index]
        X_return = X_series[self.type][self.color][ind]
        y_return = y_series[self.type][self.color][ind]
        if self.NCHW:
            X_return = np.transpose(X_return, (0, 3, 1, 2))
        self.index += 1
        if self.need_bar:
            self.bar.value += 1
        return (np.array(X_return), np.array(y_return))
Example no. 13
class data_batch:
    def __init__(self, size, NCHW=True, concatenate=False):
        if size not in ('single', 'bundle'):
            raise ValueError("Expected 'single' or 'bundle'")
        if concatenate:
            raise NotImplementedError
        self.len = len(X_path_list)
        self.bundle = (size == 'bundle')
        self.NCHW = NCHW
        self.concatenate = concatenate

    def __len__(self):
        return self.len

    def __iter__(self):
        self.index = 0
        self.bar = IntProgress(max=self.len)
        display(self.bar)
        return self

    def __next__(self):
        if self.index >= self.len:
            self.bar.close()
            raise StopIteration
        else:
            img_path = X_path_list[self.index]
            self.index += 1
            self.bar.value += 1
        img = cv2.imread(img_path)
        white_keys = get_white_keys(
            img, (bundle_paddings if self.bundle else single_paddings))
        black_keys = get_black_keys(
            img, black_coor,
            (bundle_paddings if self.bundle else single_paddings))
        if self.NCHW:
            white_keys = np.transpose(white_keys, (0, 3, 1, 2))
            black_keys = np.transpose(black_keys, (0, 3, 1, 2))
        return white_keys, black_keys
Example no. 14
class ProgressBar(object):
    def __init__(self, N=100, smoothing=0.1, interval=1):
        """Progress bar for an integer number of steps.

        Parameters
        ----------
        N : int
            Number of steps.
        smoothing : float
            Smoothing factor used for estimating time.
            A smaller value averages more steps.
        interval : float
            Time interval in seconds to update display.

        Example
        -------
        >>> bar = ProgressBar(100)
        >>> for i in range(100):
        ...     print(i)
        ...     bar.update()
        ...
        >>> del bar

        Methods
        -------
        update
            Increment progress.
        """
        self.value = 0
        self.max = N
        self.alpha = max(0, min(1, smoothing))
        self.interval = interval

        t = time.time()
        self.start_time = t
        self.last_update = t
        self.t0 = t
        self.t = 0.

        if notebook:
            from ipywidgets import IntProgress, HTML
            from IPython.display import display
            self.bar = IntProgress(max=N)
            self.html = HTML(value=self._repr_html_())
            display(self.bar, self.html)

    def __str__(self):
        # Time remaining
        rem = max(0, self.max - self.value)
        t_rem = datetime.timedelta(seconds=self.t * rem)
        t_avg = datetime.timedelta(seconds=self.t)
        t_tot = datetime.timedelta(seconds=self.t0 - self.start_time)

        p = min(20, int(20 * self.value / self.max))
        bar = '[' + p * '=' + (20 - p) * ' ' + ']'
        return f'{bar} {self.value}/{self.max} {t_rem} {t_avg} {t_tot}'

    def _repr_html_(self):
        # Time remaining
        rem = max(0., self.max - self.value)
        t_rem = datetime.timedelta(seconds=self.t * rem)
        t_avg = datetime.timedelta(seconds=self.t)
        t_tot = datetime.timedelta(seconds=self.t0 - self.start_time)
        return f"""
            <table>
                <tr>
                    <th>Progress:</th>
                    <td>{self.value}/{self.max}</td>
                </tr>
                <tr>
                    <th>Remaining time:</th>
                    <td>{t_rem}</td></tr>
                <tr>
                    <th>Average time:</th>
                    <td>{t_avg}</td>
                </tr>
                <tr>
                    <th>Total time:</th>
                    <td>{t_tot}</td>
                </tr>
            </table>
        """

    def update(self):
        """Increment progress."""
        self.value += 1

        # Time since last update
        t = time.time()
        t, self.t0 = t - self.t0, t

        # Time per update
        if self.value < 10:
            # Average
            self.t = (t + (self.value - 1) * self.t) / self.value
        else:
            # Exponential smoothing
            self.t = self.alpha * t + (1 - self.alpha) * self.t
        self.display()

    def display(self):
        if self.t0 - self.last_update > self.interval:
            if notebook:
                self.html.value = self._repr_html_()
                self.bar.value = self.value
                self.last_update = self.t0
            else:
                print(self, flush=True)

    def __del__(self):
        """Close progress bar."""
        if notebook:
            self.bar.close()
            self.html.close()

    close = __del__
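
Note the design choice at the end: aliasing `close` to `__del__` means the widgets are dismissed both when `close()` is called explicitly and when the object is garbage-collected, which is what makes the `del bar` idiom in the docstring work.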
Example no. 15
class data_batch:
    def __init__(self,
                 type='train',
                 size='single',
                 color='white',
                 batch_size=64,
                 need_velocity=True,
                 NCHW=True,
                 shuffle=True,
                 concatenate=False,
                 max_num=-1):
        self.size = size
        self.type = type
        self.color = color
        self.batch_size = batch_size
        self.NCHW = NCHW
        self.max_num = max_num
        self.pressed = []
        self.unpressed = []
        self.num_pressed = 0
        self.num_unpressed = 0
        self.need_velocity = need_velocity
        self.concatenate = concatenate
        if self.type == 'train':
            for i, x in enumerate(y[color][type]):
                if x > 0:
                    self.pressed.append(i)
                    self.num_pressed += 1
                else:
                    self.unpressed.append(i)
                    self.num_unpressed += 1
            if shuffle:
                random.shuffle(self.pressed)
                random.shuffle(self.unpressed)
            if self.max_num == -1:
                self.max_num = len(self.unpressed)
                self.iter_num = len(self.unpressed) * 2
            else:
                self.max_num = self.max_num // 2
                self.iter_num = max_num
        else:
            self.max_num = len(y[color][type])
            self.iter_num = len(y[color][type])
        self.bar = IntProgress(max=self.iter_num)
        display(self.bar)

    def __iter__(self):
        self.index = 0
        return self

    def __next__(self):
        ind = np.array([])
        if self.type == 'train':
            start = self.index * self.batch_size // 2
            end = (self.index + 1) * self.batch_size // 2
            if start >= self.max_num:
                self.bar.close()
                raise StopIteration
            if end >= self.max_num:
                end = self.max_num
                start = end - self.batch_size // 2
            self.index += 1
            s = start % self.num_pressed
            t = end % self.num_pressed
            if start // self.num_pressed == end // self.num_pressed:
                ind = np.append(ind, np.array(self.pressed[s:t]))
            else:
                ind = np.append(ind, np.array(self.pressed[s:]))
                ind = np.append(ind, np.array(self.pressed[:t]))
            ind = np.append(ind, np.array(self.unpressed[start:end]))
            ind = ind.flatten().astype('int64')
        else:
            start = self.index * self.batch_size
            end = start + self.batch_size
            if start >= self.max_num:
                self.bar.close()
                raise StopIteration
            if end >= self.max_num:
                end = self.max_num
                start = end - self.batch_size
            self.index += 1
            ind = np.arange(start, end)
        np.random.shuffle(ind)
        X_return = X[self.size][self.color][self.type][ind]
        if self.concatenate:
            arr = X[self.size][self.color][self.type]
            pre = X_pre[self.size][self.color][self.type]
            post = X_post[self.size][self.color][self.type]
            ret_pre = []
            ret_post = []
            for x in ind:
                ret_pre.append(cv2.subtract(arr[x], pre[x]))
                ret_post.append(cv2.subtract(post[x], arr[x]))
            ret_pre = np.array(ret_pre)
            ret_post = np.array(ret_post)
            X_return = np.concatenate((ret_pre, X_return, ret_post), axis=3)
        if self.NCHW:
            X_return = np.transpose(X_return, (0, 3, 1, 2))
        y_return = y[self.color][self.type][ind]
        if not self.need_velocity:
            y_return = (y_return > 0).astype(int)
        self.bar.value += self.batch_size
        return (X_return, y_return, ind)
Example no. 16
def in_progress(seq,
                msg="Progress: [%(processed)d / %(total)d]",
                length=None,
                close=True):
    """ Iterate over sequence, yielding item with progress widget displayed.
        This is useful if you need to precess sequence of items with some
        time consuming operations

        .. note::

            This works only in Jupyter Notebook

        .. note::

            This function requires *ipywidgets* package to be installed

        :param seq: sequence to iterate on.
        :param str msg: (optional) message template to display.
                        Following variables could be used in this template:
                            - processed
                            - total
                            - time_total
                            - time_per_item
                            - time_remaining
        :param int length: (optional) if seq is a generator, or it is not
                           possible to apply 'len(seq)' to 'seq',
                           then this argument is required and its value will
                           be used as total number of items in seq.
        :param bool close: (optional) if True (default), close the progress
                           widget after the iteration completes.

        Example::

            import time
            for i in in_progress(range(10)):
                time.sleep(1)
    """
    from IPython.display import display
    from ipywidgets import IntProgress
    import time

    if length is None:
        length = len(seq)

    start_time = time.time()

    progress = IntProgress(value=0,
                           min=0,
                           max=length,
                           description=msg % {
                               'processed': 0,
                               'total': length,
                               'time_total': 0.0,
                               'time_per_item': 0.0,
                               'time_remaining': 0.0,
                           })
    display(progress)

    for i, item in enumerate(seq, 1):
        progress.value = i

        # i_start_time = time.time()

        yield item  # Do the job

        i_end_time = time.time()

        progress.description = msg % {
            'processed': i,
            'total': length,
            'time_total': i_end_time - start_time,
            'time_per_item': (i_end_time - start_time) / i,
            'time_remaining': ((i_end_time - start_time) / i) * (length - i),
        }

    if close:
        progress.close()
Example no. 17
    def train(self,
              phases=['train', 'val'],
              color='black',
              learning_rate=1e-3,
              weight_lambda=0.0005,
              num_epoch=5,
              max_num=-1,
              best_path='model_best.tar',
              current_path='model_latest.tar',
              tsb_writer=None,
              tag='',
              decay_every=10,
              save_model=True):
        model = self.model
        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.model.parameters(),
                               lr=learning_rate,
                               weight_decay=weight_lambda)
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=decay_every,
                                              gamma=0.05)
        since = time.time()
        best_model = None
        best_loss = None

        best_path = time.strftime('[%Y%m%d]%H-%M-%S') + best_path
        current_path = time.strftime('[%Y%m%d]%H-%M-%S') + current_path

        print(f'The best model will be saved to {best_path} ...')
        print(f'The latest model will be saved to {current_path} ...')

        for epoch in range(num_epoch):

            print('Epoch {}/{}'.format(epoch + 1, num_epoch), end='')
            self.epoch_total += 1
            _loss = dict()
            _diff = dict()

            for phase in phases:
                if phase == 'train':
                    scheduler.step()
                    model.train()
                else:
                    model.eval()

                running_loss = 0.0
                running_diff = 0.0
                total = dataset.get_lstm_data_num(
                    phase, color) if max_num == -1 else max_num

                bar = IntProgress(max=total)
                display(bar)

                for i, (inputs, labels) in enumerate(
                        dataset.lstm_data_batch(type=phase,
                                                color=color,
                                                max_num=total,
                                                need_bar=False)):

                    _labels = labels
                    inputs = torch.Tensor(inputs)
                    labels = torch.Tensor(np.array([labels]) / 63.5 - 1)
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    optimizer.zero_grad()

                    outputs = model(inputs)
                    labels = torch.reshape(labels, [1])
                    loss = criterion(outputs, labels)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                    running_loss += loss.item()
                    running_diff += np.abs(
                        (outputs.cpu().detach().numpy()[0] + 1) * 63.5 -
                        _labels)

                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

                    bar.value += 1
                    if i % 32 == 0:
                        bar.description = f'{bar.value} / {total}'

                bar.close()

                epoch_loss = running_loss / total
                epoch_diff = running_diff / total

                if epoch % 5 == 0:
                    print('{} Loss: {:.4f}, L1 Diff: {:.4f}'.format(
                        phase, epoch_loss, epoch_diff))

                _loss[phase] = epoch_loss
                _diff[phase] = epoch_diff

                if phase == 'val' and (best_loss is None
                                       or epoch_loss < best_loss):
                    best_loss = epoch_loss
                    best_model = copy.deepcopy(model.state_dict())
                    if save_model:
                        torch.save(model.state_dict(), best_path)

                if save_model:
                    torch.save(model.state_dict(), current_path)

            if tsb_writer and 'val' in phases and 'train' in phases:
                tsb_writer.add_scalars(f'{tag}/Loss', {
                    'val': _loss['val'],
                    'train': _loss['train']
                }, self.epoch_total)
                tsb_writer.add_scalars(f'{tag}/L1 Diff', {
                    'val': _diff['val'],
                    'train': _diff['train']
                }, self.epoch_total)

        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        print('Best val loss: {:4f}'.format(best_loss))

        self.model = model
Example no. 18
    def _analytics(self, b):
        """
        Uses the self.user_qa_selection OrderedDictionary to extract
        the corresponding QA values and create a mask of dimensions:
            (number of qa layers, time steps, cols(lat), rows(lon))
        Additionally computes the temporal mask and the max gap length
        """
        if not isinstance(b, QProgressBar):
            progress_bar = IntProgress(
                value=0,
                min=0,
                max=len(self.user_qa_selection),
                step=1,
                description='',
                bar_style='', # 'success', 'info', 'warning', 'danger' or ''
                orientation='horizontal',
                style={'description_width': 'initial'},
                layout={'width': '50%'}
            )
            display(progress_bar)

        n_qa_layers = len(self.user_qa_selection)

        # Get the name of the first data var to extract its shape
        for k, v in self.ts.data.data_vars.items():
            break

        # Create mask xarray
        _time, _latitude, _longitude = self.ts.data.data_vars[k].shape
        mask = np.zeros((n_qa_layers, _time, _latitude, _longitude),
                        np.int8)

        qa_layer = self.qa_def.QualityLayer.unique()

        # QA layer used to create mask
        _qa_layer = getattr(self.ts.qa, f"qa{qa_layer[0]}")

        for i, user_qa in enumerate(self.user_qa_selection):

            if isinstance(b, QProgressBar):
                b.setValue(i)
                b.setFormat(f"Masking by QA {user_qa}")
            else:
                progress_bar.value = i
                progress_bar.description = f"Masking by QA {user_qa}"

            user_qa_fieldname = user_qa.replace(" ", "_").replace("/", "_")

            for j, qa_value in enumerate(self.user_qa_selection[user_qa]):
                qa_value_field_name = qa_value.replace(" ", "_")

                qa_flag_val = self.qa_def[(self.qa_def.Name == user_qa) & 
                        (self.qa_def.Description == qa_value)].Value.iloc[0]

                if j == 0:
                    mask[i] = (_qa_layer[user_qa_fieldname] == qa_flag_val)
                else:
                    mask[i] = np.logical_or(
                            mask[i], _qa_layer[user_qa_fieldname] == qa_flag_val)

        if isinstance(b, QProgressBar):
            b.setValue(0)
            b.setEnabled(False)
        else:
            # Remove progress bar
            progress_bar.close()
            del progress_bar

        #self.__temp_mask = mask
        #mask = xr.DataArray(np.all(self.__temp_mask, axis=0),
        mask = xr.DataArray(np.all(mask, axis=0),
                            coords=[v.time.data,
                                    v.latitude.data,
                                    v.longitude.data],
                            dims=['time', 'latitude', 'longitude'])

        mask.attrs = v.attrs

        self.mask = mask
        # Remove local multi-layer mask variable
        mask = None
        del mask

        # Create the percentage of data available mask
        # Get the per-pixel per-time step binary mask
        pct_data_available = (self.mask.sum(axis=0) * 100.0) / _time
        pct_data_available.latitude.data = v.latitude.data
        pct_data_available.longitude.data = v.longitude.data
        # Set the pct_data_available object
        self.pct_data_available = pct_data_available

        # Using the computed mask get the max gap length
        self.__get_max_gap_length(b)
Example no. 19
def tree_wise_performance(datasets, models, metrics, step=10):
    """
    This method implements the analysis of the model on a tree-wise basis
    (part of the effectiveness analysis category).

    Parameters
    ----------
    datasets : list of Dataset
        The datasets to use for analyzing the behaviour of the model using
        the given metrics and models
    models : list of RTEnsemble
        The models to analyze
    metrics : list of Metric
        The metrics to use for the analysis
    step : int
        Step-size identifying evenly spaced number of trees for evaluating
        the top-k model performance.
        (e.g., step=100 means the method will evaluate the model performance
        at 100, 200, 300, etc. trees).


    Returns
    -------
    metric_scores : xarray.DataArray
        A DataArray containing the metric scores of each model using the given
        metrics on the given datasets.
        The metric scores are cumulatively reported tree by tree, i.e., top 10
        trees, top 20, etc., with a step-size between the number of trees
        as highlighted by the step parameter.

    """
    def get_tree_steps(model_trees):
        # materialize as a list: range objects have no append in Python 3
        trees = list(range(step - 1, model_trees, step))
        # Add last tree to the steps
        if trees[-1] != model_trees - 1:
            trees.append(model_trees - 1)
        return np.array(trees)

    max_num_trees = 0
    for model in models:
        if model.n_trees > max_num_trees:
            max_num_trees = model.n_trees

    tree_steps = get_tree_steps(max_num_trees)

    data = np.full(shape=(len(datasets), len(models), len(tree_steps),
                          len(metrics)), fill_value=np.nan, dtype=np.float32)


    progress_bar = IntProgress(min=0, max=len(datasets)*len(metrics)*
                               sum([len(get_tree_steps(model.n_trees)) for model in models ]), 
                               description="Computing metrics")
    display(progress_bar)    


    for idx_dataset, dataset in enumerate(datasets):
        for idx_model, model in enumerate(models):
            y_pred, partial_y_pred, y_leaves = \
                model.score(dataset, detailed=True)

            # the document scores are accumulated along for the various top-k
            # (in order to avoid useless re-scoring)
            y_pred = np.zeros(dataset.n_instances)

            for idx_top_k, top_k in enumerate(get_tree_steps(model.n_trees)):

                # compute the document scores using only top-k trees of
                # the model on the given dataset
                idx_tree_start = idx_top_k * step
                idx_tree_stop = top_k + 1

                y_pred += partial_y_pred[:, idx_tree_start:idx_tree_stop].sum(axis=1)

                # compute the metric score using the predicted document scores
                for idx_metric, metric in enumerate(metrics):
                    progress_bar.value += 1

                    metric_score, _ = metric.eval(dataset, y_pred)
                    data[idx_dataset][idx_model][idx_top_k][idx_metric] = metric_score

    progress_bar.bar_style = "success"
    progress_bar.close()

    performance = xr.DataArray(data,
                               name='Tree-Wise Performance',
                               coords=[datasets, models, tree_steps+1, metrics],
                               dims=['dataset', 'model', 'k', 'metric'])
    return performance
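
A hedged call sketch; the dataset, the two RTEnsemble models, and the metric are placeholder names for rankeval objects loaded elsewhere:

perf = tree_wise_performance([test_dataset], [model_a, model_b],
                             [ndcg_10], step=50)
print(perf.sel(dataset=test_dataset, metric=ndcg_10))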
Example no. 20
def train(net,
          data,
          epochs=10,
          batch_size=10,
          seq_length=50,
          lr=0.001,
          clip=5,
          val_frac=0.1,
          print_every=10):
    ''' Training a network 
    
        Arguments
        ---------
        
        net: CharRNN network
        data: text data to train the network
        epochs: Number of epochs to train
        batch_size: Number of mini-sequences per mini-batch, aka batch size
        seq_length: Number of character steps per mini-batch
        lr: learning rate
        clip: gradient clipping
        val_frac: Fraction of data to hold out for validation
        print_every: Number of steps for printing training and validation loss
    
    '''
    net.train()

    opt = torch.optim.Adam(net.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # create training and validation data
    val_idx = int(len(data) * (1 - val_frac))
    data, val_data = data[:val_idx], data[val_idx:]

    if (net.train_on_gpu):
        net.cuda()

    counter = 0
    n_chars = len(net.chars)

    progress = IntProgress(
        min=0,
        max=epochs * len(list(get_batches(data, batch_size, seq_length))),
        description="Training...")
    display(progress)

    for e in range(epochs):

        # initialize hidden state
        h = net.init_hidden(batch_size)

        for x, y in get_batches(data, batch_size, seq_length):
            counter += 1
            progress.value += 1

            # One-hot encode our data and make them Torch tensors
            x = one_hot_encode(x, n_chars)
            inputs, targets = torch.from_numpy(x), torch.from_numpy(y)

            if (net.train_on_gpu):
                inputs, targets = inputs.cuda(), targets.cuda()

            # Creating new variables for the hidden state, otherwise
            # we'd backprop through the entire training history
            h = tuple([each.data for each in h])

            # zero accumulated gradients
            net.zero_grad()

            # get the output from the model
            output, h = net(inputs, h)

            # calculate the loss and perform backprop
            loss = criterion(output,
                             targets.view(batch_size * seq_length).long())
            loss.backward()
            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            nn.utils.clip_grad_norm_(net.parameters(), clip)
            opt.step()

            # loss stats
            if counter % print_every == 0:
                # Get validation loss
                val_h = net.init_hidden(batch_size)
                val_losses = []
                net.eval()
                for x, y in get_batches(val_data, batch_size, seq_length):
                    # One-hot encode our data and make them Torch tensors
                    x = one_hot_encode(x, n_chars)
                    x, y = torch.from_numpy(x), torch.from_numpy(y)

                    # Creating new variables for the hidden state, otherwise
                    # we'd backprop through the entire training history
                    val_h = tuple([each.data for each in val_h])

                    inputs, targets = x, y
                    if (net.train_on_gpu):
                        inputs, targets = inputs.cuda(), targets.cuda()

                    output, val_h = net(inputs, val_h)
                    val_loss = criterion(
                        output,
                        targets.view(batch_size * seq_length).long())

                    val_losses.append(val_loss.item())

                net.train()  # reset to train mode after iterating through validation data

                print("Epoch: {}/{}...".format(e + 1, epochs),
                      "Step: {}...".format(counter),
                      "Loss: {:.4f}...".format(loss.item()),
                      "Val Loss: {:.4f}".format(np.mean(val_losses)))

    progress.close()
    print("Finished training.")
Example no. 21
def in_progress(seq, msg="Progress: [%(processed)d / %(total)d]",
                length=None, close=True):
    """ Iterate over sequence, yielding item with progress widget displayed.
        This is useful if you need to precess sequence of items with some
        time consuming operations

        .. note::

            This works only in Jupyter Notebook

        .. note::

            This function requires *ipywidgets* package to be installed

        :param seq: sequence to iterate on.
        :param str msg: (optional) message template to display.
                        Following variables could be used in this template:
                            - processed
                            - total
                            - time_total
                            - time_per_item
                            - time_remaining
        :param int length: (optional) if seq is a generator, or it is not
                           possible to apply 'len(seq)' to 'seq',
                           then this argument is required and its value will
                           be used as total number of items in seq.
        :param bool close: (optional) if True (default), close the progress
                           widget after the iteration completes.

        Example::

            import time
            for i in in_progress(range(10)):
                time.sleep(1)
    """
    from IPython.display import display
    from ipywidgets import IntProgress
    import time

    if length is None:
        length = len(seq)

    start_time = time.time()

    progress = IntProgress(
        value=0, min=0, max=length, description=msg % {
            'processed': 0,
            'total': length,
            'time_total': 0.0,
            'time_per_item': 0.0,
            'time_remaining': 0.0,
        }
    )
    display(progress)

    for i, item in enumerate(seq, 1):
        progress.value = i

        # i_start_time = time.time()

        yield item  # Do the job

        i_end_time = time.time()

        progress.description = msg % {
            'processed': i,
            'total': length,
            'time_total': i_end_time - start_time,
            'time_per_item': (i_end_time - start_time) / i,
            'time_remaining': ((i_end_time - start_time) / i) * (length - i),
        }

    if close:
        progress.close()
Example no. 22
    def train(self,
              batch_size=64,
              learning_rate=1e-3,
              num_epochs=5,
              max_num=-1,
              best_path='keyboard_model_best.tar',
              current_path='keyboard_model_latest.tar',
              decay_every=10,
              save_model=True,
              dirs=[0]):

        model = self.model
        criterion = nn.MSELoss()
        optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=decay_every,
                                              gamma=0.05)

        since = time.time()

        best_model_wts = copy.deepcopy(model.state_dict())
        best_loss = None

        for epoch in range(num_epochs):

            print('Epoch {}/{}'.format(epoch + 1, num_epochs))

            # Each epoch has a training and validation phase
            for phase in ['train', 'val']:
                if phase == 'train':
                    scheduler.step()
                    model.train()  # Set model to training mode
                else:
                    model.eval()  # Set model to evaluate mode

                running_loss = 0.0
                # Iterate over data.

                max_num_for_this_epoch = max_num if phase == 'train' else -1

                total = dataset.get_num_of_data(
                    phase) if max_num == -1 else max_num

                bar = IntProgress(max=total)

                display(bar)

                for inputs, labels in dataset.data_batch(
                        type=phase,
                        batch_size=batch_size,
                        max_num=max_num_for_this_epoch,
                        dirs=dirs):

                    inputs = torch.Tensor(inputs)
                    labels = torch.Tensor(labels)

                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    # zero the parameter gradients
                    optimizer.zero_grad()

                    # forward
                    outputs = model(inputs)
                    labels = torch.reshape(labels, [-1, 8])
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                    # statistics
                    running_loss += loss.item() * batch_size

                    # free unoccupied GPU memory (the CPU allocator needs no
                    # explicit cache eviction)
                    if torch.cuda.is_available():
                        torch.cuda.empty_cache()

                    bar.value += batch_size
                    bar.description = f'{bar.value} / {total}'

                bar.close()

                epoch_loss = running_loss / dataset.get_num_of_data(phase)

                print('{} Loss: {:.4f}'.format(phase, epoch_loss))

            # deep copy the model
            if phase == 'val' and (best_loss is None
                                   or epoch_loss < best_loss):
                best_loss = epoch_loss
                best_model_wts = copy.deepcopy(model.state_dict())
                torch.save(model.state_dict(), best_path)
                print(f'The best model has been saved to {best_path} ...')

            torch.save(model.state_dict(), current_path)
            print(f'Current model has been saved to {current_path} ...')

            print()

        time_elapsed = time.time() - since
        print('Training complete in {:.0f}m {:.0f}s'.format(
            time_elapsed // 60, time_elapsed % 60))
        print('Best val loss: {:4f}'.format(best_loss))

        # load best model weights
        model.load_state_dict(best_model_wts)
        self.model = model
Example no. 23
def _randomization(metric_scores_a, metric_scores_b, n_perm=100000):
    """
    This method computes the randomization test as described in [1].

    Parameters
    ----------
    metric_scores_a : numpy array
        Vector of per-query metric scores for the IR system A.
    metric_scores_b : numpy array
        Vector of per-query metric scores for the IR system B.
    n_perm : int
        Number of permutations evaluated in the randomization test.

    Returns
    -------
    metric_scores : (float, float)
        A tuple (p-value_1, p-value_2) being respectively the one-sided and two-sided p-values.

    References
    ----------
    .. [1] Smucker, Mark D., James Allan, and Ben Carterette.
        "A comparison of statistical significance tests for information retrieval evaluation."
        In Proceedings of the sixteenth ACM conference on Conference on information and knowledge management, pp. 623-632. ACM, 2007.
    """
    progress_bar = IntProgress(min=0, max=10, description="Randomization Test")
    display(progress_bar)    

    # find the best system
    metric_scores_a_mean = np.mean(metric_scores_a)
    metric_scores_b_mean = np.mean(metric_scores_b)

    best_metrics = metric_scores_a
    worst_metrics = metric_scores_b
    if metric_scores_a_mean < metric_scores_b_mean:
        best_metrics = metric_scores_b
        worst_metrics = metric_scores_a

    difference = np.mean(best_metrics) - np.mean(worst_metrics)
    abs_difference = np.abs(difference)

    p1 = 0.0  # one-sided
    p2 = 0.0  # two-sided
    N = float(len(metric_scores_a))

    a_sum = np.sum(best_metrics)
    b_sum = np.sum(worst_metrics)

    # repeat n_perm times
    for i in range(n_perm):
        if i % (n_perm // 10) == 0:
            progress_bar.value += 1
        
        # select a random subset
        sel = np.random.choice([False, True], len(metric_scores_a))

        a_sel_sum = np.sum(best_metrics[sel])
        b_sel_sum = np.sum(worst_metrics[sel])

        # compute avg performance of randomized models
        a_mean = (a_sum - a_sel_sum + b_sel_sum) / N
        b_mean = (b_sum - b_sel_sum + a_sel_sum) / N

        # performance difference
        delta = a_mean - b_mean

        if delta >= difference:
            p1 += 1.
        if np.abs(delta) >= abs_difference:
            p2 += 1.

    progress_bar.bar_style = "success"
    progress_bar.close()

    p1 /= n_perm
    p2 /= n_perm

    return p1, p2
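
A quick usage sketch with synthetic per-query scores; the data below is purely illustrative:

import numpy as np

scores_a = np.random.uniform(0.5, 0.9, size=50)              # hypothetical per-query metric, system A
scores_b = scores_a + np.random.normal(0.02, 0.05, size=50)  # system B, slightly better on average
p_one, p_two = _randomization(scores_a, scores_b, n_perm=10000)
print(f'one-sided p={p_one:.4f}, two-sided p={p_two:.4f}')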
Example no. 24
def bias_variance(datasets=[], algos=[], metrics=[], L=10, k=2):
    """
    This method computes the bias vs. variance decomposition of the error.
    The approach used here is based on the works of [Webb05]_ and [Dom05]_.

    Each instance of the dataset is scored `L` times.
    A single scoring is achieved by splitting the dataset at random into
    `k` folds. Each fold is scored by the model `M` trained on the remaining folds.
    [Webb05]_ recommends the use of 2 folds.

    If metric is MSE then the standard decomposition is used.
    The Bias for an instance `x` is defined as the mean squared error of the `L` trained models
    w.r.t. the true label `y`, denoted with :math:`{\\sf E}_{L} [M(x) - y]^2`. 
    The Variance for an instance `x` is measured across the `L` trained models: 
    :math:`{\\sf E}_{L} [M(x) - {\\sf E}_{L} M(x)]^2`. 
    Both are averaged over all instances in the dataset.

    If metric is any of the IR quality measures, we resort to the bias variance
    decomposition of the mean squared error of the given metric w.r.t. its ideal value,
    e.g., for the case of NDCG, :math:`{\\sf E}_{L} [1 - NDCG]^2`. 
    Recall that a formal bias/variance decomposition has not yet been proposed for such measures.


    Parameters
    ----------
    datasets : list of rankeval.dataset.Dataset
        The dataset instances.
    algos : list of function
        Each function should be a wrapper of a learning algorithm.
        It should accept four parameters: `train_X`, `train_Y`, `train_q`, `test_X`.
            - `train_X`: numpy.ndarray storing a 2-D matrix of size num_docs x num_features
            - `train_Y`: numpy.ndarray storing a vector of document's relevance labels
            - `train_q`: numpy.ndarray storing a vector of query lengths
            - `test_X`: numpy.ndarray as for `train_X`

        A model is trained on `train_X`, `train_Y`, `train_q`, and used to score `test_X`.
        A numpy.ndarray with such scores must be returned.
    metrics : list of rankeval.metrics.metric.Metric
        The metrics used to compute the error. MSE triggers the standard
        decomposition; other metrics use the decomposition described above.
    L : int
        Number of iterations
    k : int
        Number of folds.

    Returns
    -------
    bias_variance : xarray.DataArray
        A DataArray containing the bias/variance decomposition of the error
        for any given dataset, algorithm and metric.

    References
    ----------
    .. [Webb05] Webb, Geoffrey I., and Paul Conilione. "Estimating bias and variance from data." 
            Pre-publication manuscript (`pdf <http://www.csse.monash.edu/webb/-Files/WebbConilione06.pdf>`_) (2005).
    .. [Dom05] Domingos P. A unified bias-variance decomposition. 
            In Proceedings of 17th International Conference on Machine Learning 2000 (pp. 231-238).
    """
    assert k >= 2
    assert L >= 2
    assert len(datasets) > 0
    assert len(metrics) > 0
    for metric in metrics:
        assert isinstance(metric, Metric)

    progress_bar = IntProgress(min=0, max=len(datasets)*len(metrics)*len(algos),
                               description="Iterating datasets and metrics")
    display(progress_bar)    

    data = np.zeros(shape=(len(datasets), len(metrics), len(algos), 3), dtype=np.float32)
    for idx_dataset, dataset in enumerate(datasets):
        for idx_algo, algo in enumerate(algos):
            for idx_metric, metric in enumerate(metrics):
                progress_bar.value += 1
                
                scores = _multi_kfold_scoring(dataset, algo=algo, L=L, k=k)
                
                avg_error = 0.
                avg_bias = 0.
                avg_var = 0.
                if not isinstance(metric, MSE):
                    # mse over metric, assume error is 1-metric
                    # not exactly domingos paper
                    q_scores = np.empty((dataset.n_queries, L), dtype=np.float32) 
                    for i in range(L):
                        q_scores[:,i] = metric.eval(dataset=dataset, y_pred=scores[:,i])[1]            
                    avg_error = np.mean( (q_scores-1.)**2. )
                    avg_pred  = np.mean(q_scores, axis=1)
                    avg_bias  = np.mean((avg_pred - 1.)**2.)
                    avg_var   = np.mean( (q_scores-avg_pred.reshape((-1,1)))**2. )
                else:
                    # mse
                    avg_error = np.mean( (scores-dataset.y.reshape((-1,1)))**2. )
                    avg_pred  = np.mean(scores, axis=1)
                    avg_bias  = np.mean((avg_pred - dataset.y)**2.)
                    avg_var   = np.mean( (scores-avg_pred.reshape((-1,1)))**2. )

                data[idx_dataset, idx_metric, idx_algo, 0] = avg_error
                data[idx_dataset, idx_metric, idx_algo, 1] = avg_bias
                data[idx_dataset, idx_metric, idx_algo, 2] = avg_var
                

    progress_bar.bar_style = "success"
    progress_bar.close()

    performance = xr.DataArray(data,
                               name='Bias/Variance Decomposition',
                               coords=[datasets, metrics, [a.__name__ for a in algos], 
                               ['Error', 'Bias', 'Variance']],
                               dims=['dataset', 'metric', 'algo', 'error'])

    return performance
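
To make the `algos` contract above concrete, here is a minimal sketch of a
compatible wrapper, assuming scikit-learn is available; the name `gbrt_algo`
and its hyper-parameters are illustrative, not part of rankeval:

from sklearn.ensemble import GradientBoostingRegressor

def gbrt_algo(train_X, train_Y, train_q, test_X):
    # A pointwise model: query lengths (train_q) are accepted but unused.
    model = GradientBoostingRegressor(n_estimators=100, max_depth=3)
    model.fit(train_X, train_Y)
    return model.predict(test_X)

# e.g.: bias_variance(datasets=[ds], algos=[gbrt_algo], metrics=[MSE()], L=10, k=2)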
Exemplo n.º 25
def get_press_series(spliter, color, difference, paddings=2):
    global tmp_imgs

    white_width = 17 + 2 * paddings
    black_width = 16 + 2 * paddings
    height = 106
    width = 884

    print('Start extracting keypress series ...')
    print(f'  White width: {white_width}px')
    print(f'  Black width: {black_width}px')
    print(f'  Height: {height}px')
    print('')

    for name in spliter:
        # Scan frames until one is found where all 36 black-key
        # boundaries can be detected; use it as the reference layout.
        black_coor = None
        N = y_org[name].shape[0]
        for p in X_path[name]:
            img = cv2.imread(p)
            black_coor = get_black_boundaries(img)
            if len(black_coor) == 36:
                break
        # Transpose labels to shape (num_keys, num_frames).
        y_trans = np.transpose(y_org[name], (1, 0))

        print('Pre-loading images ...')

        bar = IntProgress(max=N)
        display(bar)
        for i in range(N):
            img = pad_img(cv2.imread(X_path[name][i]), paddings)
            tmp_imgs.append(img)
            bar.value += 1
        bar.close()

        bar = IntProgress(max=88)
        display(bar)

        for k in range(88):
            # black_mask holds the indices of the 36 black keys among the 88.
            if k in black_mask:
                col = 'black'
            else:
                col = 'white'
            if col not in color:
                continue
            _y = y_trans[k]
            # Frame indices in which key k is pressed.
            _y = np.argwhere(_y > 0).flatten()
            if _y.shape[0] == 0:
                continue
            last = _y[0]
            _n = len(_y)
            for i in range(_n):
                if i % 32 == 0:
                    bar.description = f'{i}/{_n}'
                # A gap between consecutive indices closes the current press segment.
                if i != 0 and _y[i] != _y[i - 1] + 1:
                    if col == 'black':
                        add_series(name, col, last, _y[i - 1], k, paddings,
                                   black_coor, difference)
                    else:
                        add_series(name, col, last, _y[i - 1], k, paddings,
                                   difference)
                    last = _y[i]
                if i == _n - 1 and last != -1:
                    if col == 'black':
                        add_series(name, col, last, _y[i], k, paddings,
                                   black_coor, difference)
                    else:
                        add_series(name, col, last, _y[i], k, paddings,
                                   difference)
            bar.value += 1

        bar.close()

        del tmp_imgs
        tmp_imgs = []

        print(f'{name} set loading finished ...')
        print('  Pressed white keys: ' + str(len(X_series[name]['white'])))
        print('  Pressed black keys: ' + str(len(X_series[name]['black'])))
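
The inner loop above is essentially run-length extraction over the pressed-frame
indices; the following standalone sketch captures the same logic (the name
`press_segments` is illustrative, not part of the original code):

import numpy as np

def press_segments(labels):
    # Return (start, end) frame pairs of contiguous pressed runs in a
    # 1-D label vector, where pressed means label > 0.
    idx = np.argwhere(labels > 0).flatten()
    segments = []
    if idx.size == 0:
        return segments
    start = idx[0]
    for prev, cur in zip(idx[:-1], idx[1:]):
        if cur != prev + 1:  # a gap closes the current run
            segments.append((start, prev))
            start = cur
    segments.append((start, idx[-1]))
    return segments

# press_segments(np.array([0, 1, 1, 0, 0, 1])) -> [(1, 2), (5, 5)]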
Exemplo n.º 26
def seperate(spliter, color, size):

    single_paddings = 2
    bundle_paddings = 10
    white_single_width = 17 + 2 * single_paddings
    white_bundle_width = 17 + 2 * bundle_paddings
    black_single_width = 16 + 2 * single_paddings
    black_bundle_width = 16 + 2 * bundle_paddings
    height = 106
    width = 884

    print('Start separating keyboard ...')
    print(f'  White single width: {white_single_width}px')
    print(f'  Black single width: {black_single_width}px')
    print(f'  White bundle width: {white_bundle_width}px')
    print(f'  Black bundle width: {black_bundle_width}px')

    for name in spliter:
        black_coor = None
        for p in X_path[name]:
            img = cv2.imread(p)
            black_coor = get_black_boundaries(img)
            if len(black_coor) == 36:
                break
        X['single']['white'][name] = []
        X['single']['black'][name] = []
        X['bundle']['white'][name] = []
        X['bundle']['black'][name] = []
        y['white'][name] = []
        y['black'][name] = []

        bar = IntProgress(max=len(X_path[name]))
        display(bar)

        for i, p in enumerate(X_path[name]):
            white_tmp_mask = None
            black_tmp_mask = None
            # Most of the time, subsample keys according to the labels;
            # with small probability (0.5%) keep all 52 white and 36 black keys.
            if random.random() > 0.005:
                white_tmp_mask, black_tmp_mask = get_masks(y_org[name][i])
            else:
                white_tmp_mask = np.arange(52)
                black_tmp_mask = np.arange(36)
            img = cv2.imread(p)
            if 'single' in size:
                if 'white' in color:
                    get_white_keys(X['single']['white'][name],
                                   img,
                                   white_tmp_mask,
                                   paddings=single_paddings)
                if 'black' in color:
                    get_black_keys(X['single']['black'][name],
                                   img,
                                   black_coor,
                                   black_tmp_mask,
                                   paddings=single_paddings)
            if 'bundle' in size:
                if 'white' in color:
                    get_white_keys(X['bundle']['white'][name],
                                   img,
                                   white_tmp_mask,
                                   paddings=bundle_paddings)
                if 'black' in color:
                    get_black_keys(X['bundle']['black'][name],
                                   img,
                                   black_coor,
                                   black_tmp_mask,
                                   paddings=bundle_paddings)
            for ind in white_mask[white_tmp_mask]:
                y['white'][name].append(y_org[name][i][ind])
            for ind in black_mask[black_tmp_mask]:
                y['black'][name].append(y_org[name][i][ind])
            bar.value += 1
            del img
        bar.close()

        print('In ' + name + ' set: ')
        for kind in color:
            for k2 in size:
                X[k2][kind][name] = np.array(X[k2][kind][name])
            y[kind][name] = np.array(y[kind][name])
            print('  # of pressed ' + kind + ' keys: ' +
                  str(np.sum(y[kind][name] > 0)))
            print('  # of unpressed ' + kind + ' keys: ' +
                  str(np.sum(y[kind][name] <= 0)))
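
Given the geometry printed above (52 white keys of 17px across an 884px-wide
strip), the per-key crop presumably looks like the sketch below; `crop_white_key`
is illustrative and not the original `get_white_keys`:

def crop_white_key(img, key_index, paddings=2, key_width=17):
    # Crop one white-key patch (plus horizontal paddings) from a
    # keyboard strip of 52 white keys laid out left to right.
    left = max(key_index * key_width - paddings, 0)
    right = min((key_index + 1) * key_width + paddings, img.shape[1])
    return img[:, left:right]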
Exemplo n.º 27
    def interpolate(self,
                    tile_size=256,
                    n_workers=1,
                    threads_per_worker=8,
                    memory_limit='14GB',
                    progressBar=None):
        """
        Interpolates the data of a time series object using
        the method or methods provided
        :param method: list of interpolation methods
        """
        if self.mask is None:
            # NOTE: no-op guard; self.mask is used unconditionally below.
            pass

        if self.isNotebook is True:
            # Set up the progress bar: one step per interpolation
            # method selected by the user.
            _items = len(self.interpolation_methods.value)
            _item = 0
            progress_bar = IntProgress(
                value=0,
                min=0,
                max=_items,
                step=1,
                description='',
                bar_style='',  # 'success', 'info', 'warning', 'danger' or ''
                orientation='horizontal',
                style={'description_width': 'initial'},
                layout={'width': '75%'})
            display(progress_bar)
            progress_bar.value = _item

            # Get temp dataset to perform the interpolation
            data_var = self.data_vars.value

            # Interpolation methods
            interpolation_methods = self.interpolation_methods.value

        else:
            data_var = self.selected_data_var
            interpolation_methods = [self.selected_interpolation_method]

        tmp_ds = getattr(self.ts.data, data_var).copy(deep=True)

        # Store original data type
        dtype = tmp_ds.data.dtype

        # Get the fill value and build a mask holding it
        fill_value = tmp_ds.attrs['nodatavals'][0]
        mask_fill_value = (tmp_ds == fill_value)
        mask_fill_value = (mask_fill_value * fill_value).astype(dtype)

        # Apply mask
        tmp_ds *= self.mask
        # Set NaN where there are zeros
        tmp_ds = tmp_ds.where(tmp_ds != 0)

        # Where there were fill values, set the value again to the
        # fill value to avoid having no data to interpolate
        tmp_ds += mask_fill_value

        # Where fewer than 20% of the observations are valid, keep the fill value
        min_n_obs = int(tmp_ds.shape[0] * 0.2)
        tmp_ds = tmp_ds.where(self.mask.sum(axis=0) > min_n_obs, fill_value)

        for method in interpolation_methods:
            if self.isNotebook is True:
                progress_bar.value = _item
                progress_bar.description = (f"Interpolation of {data_var}"
                                            f" using {method}")

            if method == 'smoothn':
                # First, we need a linear interpolation
                tmp_interpol_ds = tmp_ds.interpolate_na(dim='time',
                                                        method='linear')

                # Optional observation weighting (disabled):
                #idx = np.nonzero(tmp_interpol_ds.data)
                #w = tmp_ds.copy(deep=True).data
                #w[idx] *= 2
                # Smoothing
                s = float(self.smooth_factor.value)
                tmp_masked = np.ma.masked_equal(
                    tmp_interpol_ds.data * self.mask, 0)

                tmp_smoothed = smoothn(
                    tmp_masked,
                    #W=tmp_masked * 2, isrobust=True,
                    isrobust=True,
                    s=s,
                    TolZ=1e-6,
                    axis=0)[0]

                del tmp_masked
                # Overwrite data
                tmp_interpol_ds.data = tmp_smoothed

            else:
                tmp_interpol_ds = tmp_ds.interpolate_na(dim='time',
                                                        method=method)

            # Set data type to match the original (non-interpolated)
            tmp_interpol_ds.data = tmp_interpol_ds.data.astype(dtype)
            # Copy metadata attributes
            tmp_interpol_ds.attrs = tmp_ds.attrs

            # Save to file
            fname = f"{self.product}.{self.version}.{data_var}.{method}.tif"
            output_dir = os.path.join(self.source_dir, data_var[1::],
                                      'interpolated')

            if not os.path.exists(output_dir):
                os.mkdir(output_dir)
            fname = os.path.join(output_dir, fname)

            save_dask_array(fname=fname,
                            data=tmp_interpol_ds,
                            data_var=data_var,
                            method=method,
                            tile_size=tile_size,
                            n_workers=n_workers,
                            threads_per_worker=threads_per_worker,
                            memory_limit=memory_limit,
                            progressBar=progressBar)

            if self.isNotebook is True:
                _item += 1

        if self.isNotebook is True:
            # Remove progress bar
            progress_bar.close()
            del progress_bar
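
The core call in the non-smoothn branch is xarray's `interpolate_na`; a minimal,
self-contained illustration of that call (the toy data here is made up):

import numpy as np
import pandas as pd
import xarray as xr

time = pd.date_range('2020-01-01', periods=5)
da = xr.DataArray([1.0, np.nan, np.nan, 4.0, 5.0],
                  coords={'time': time}, dims='time')
filled = da.interpolate_na(dim='time', method='linear')
print(filled.values)  # [1. 2. 3. 4. 5.]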
Exemplo n.º 28
class data_batch:
    def __init__(self,
                 type='train',
                 size='single',
                 color='white',
                 batch_size=64,
                 need_velocity=True,
                 NCHW=True,
                 shuffle=True,
                 max_num=-1):
        self.size = size
        self.type = type
        self.color = color
        self.batch_size = batch_size
        self.NCHW = NCHW
        self.max_num = max_num
        self.pressed = []
        self.unpressed = []
        self.num_pressed = 0
        self.num_unpressed = 0
        self.need_velocity = need_velocity
        for i, x in enumerate(y[color][type]):
            if x > 0:
                self.pressed.append(i)
                self.num_pressed += 1
            else:
                self.unpressed.append(i)
                self.num_unpressed += 1
        if shuffle:
            random.shuffle(self.pressed)
            random.shuffle(self.unpressed)
        # Batches are balanced: half pressed and half unpressed samples.
        if self.max_num == -1:
            self.max_num = len(self.unpressed)
            self.iter_num = len(self.unpressed) * 2
        else:
            self.max_num = self.max_num // 2
            self.iter_num = max_num
        self.bar = IntProgress(max=self.iter_num)
        display(self.bar)

    def __iter__(self):
        self.index = 0
        return self

    def __next__(self):
        start = self.index * self.batch_size // 2
        end = (self.index + 1) * self.batch_size // 2
        if start >= self.max_num:
            self.bar.close()
            raise StopIteration
        if end >= self.max_num:
            end = self.max_num
            start = end - self.batch_size // 2
        self.index += 1
        ind = np.array([])
        # Pressed indices wrap around (modulo num_pressed) because there are
        # usually far fewer pressed than unpressed samples.
        s = start % self.num_pressed
        t = end % self.num_pressed
        if start // self.num_pressed == end // self.num_pressed:
            ind = np.append(ind, np.array(self.pressed[s:t]))
        else:
            ind = np.append(ind, np.array(self.pressed[s:]))
            ind = np.append(ind, np.array(self.pressed[:t]))
        ind = np.append(ind, np.array(self.unpressed[start:end]))
        ind = ind.flatten().astype('int64')
        X_return = X[self.size][self.color][self.type][ind]
        if self.NCHW:
            X_return = np.transpose(X_return, (0, 3, 1, 2))
        y_return = y[self.color][self.type][ind]
        if not self.need_velocity:
            # np.int was removed from NumPy; use an explicit integer dtype.
            y_return = (y_return > 0).astype(np.int64)
        self.bar.value += self.batch_size
        return (X_return, y_return)
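
The index bookkeeping in `__next__` implements balanced batching: half pressed
and half unpressed samples per batch. A simplified standalone sketch of the
same idea, with illustrative names:

import numpy as np

def balanced_batches(labels, batch_size, seed=0):
    # Yield index batches with half positive (label > 0) and half
    # negative samples, mirroring data_batch's balancing strategy.
    rng = np.random.default_rng(seed)
    pos = np.flatnonzero(labels > 0)
    neg = np.flatnonzero(labels <= 0)
    rng.shuffle(pos)
    rng.shuffle(neg)
    half = batch_size // 2
    for i in range(0, min(len(pos), len(neg)) - half + 1, half):
        yield np.concatenate([pos[i:i + half], neg[i:i + half]])

# Example: 4 pressed and 6 unpressed labels, batches of 4 indices each.
labels = np.array([1, 0, 1, 0, 0, 1, 0, 1, 0, 0])
for batch in balanced_batches(labels, batch_size=4):
    print(batch)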
Exemplo n.º 29
def sample_mcmc(model,
                h,
                x0=None,
                burnin=1000,
                n_samples=10000,
                sample_rate=10,
                g=None,
                noiseless_sample=False,
                progress_bar=False):
    """
        Sample points (theta) from either a Gaussian process model or simulator using the
          Metropolis-Hastings algorithm.

        Default proposal density, g, is a Gaussian with diagonal covariance; covariances set to 
          a small value based on the range of possible parameter settings for each dimension.

        Args:
            (models.GP) OR (simulators.Simulator) model:
                GP model of the discrepancy, OR Simulator instance with callable f(), noiseless_f()

               (float)                h:   bandwidth for KDE.
          (np.ndarray)               x0:   initial starting point.
                 (int)           burnin:   number of burn-in samples.
                 (int)      sample_rate:   how many iterations sampling. 
            (callable)                g:   proposal density.
                (bool) noiseless_sample:   whether to call noiseless_f or f (when `model' is a Simulator).
                (bool)     progress_bar:   whether to show progress bar in Jupyter notebook.

        Returns: 
          (np.ndarray)          samples:   with shape (n_samples, input_dim).
    """
    input_dim = model.input_dim
    bounds = model.bounds

    # function proportional to predictive distribution
    if isinstance(model, GP):
        f = lambda x: norm.cdf(
            (h - model.mu(x)) / np.sqrt(model.v(x) + model.obs_noise))
    elif isinstance(model, Simulator):
        # The std. dev. of the obs noise is stored in the simulator, so no np.sqrt.
        # Note the parentheses: the whole residual (h - f(x)) is standardized.
        if noiseless_sample:
            f = lambda x: norm.cdf(
                (h - model.noiseless_f(x)) / model.obs_noise)
        else:
            f = lambda x: norm.cdf((h - model.f(x)) / model.obs_noise)
    else:
        raise ValueError('pass simulator or GP model as first argument.')

    if x0 is None:
        x0 = np.array([np.random.uniform(b1, b2)
                       for (b1, b2) in bounds]).reshape(1, input_dim)

    if g is None:
        cov = []
        for (b1, b2) in bounds:
            cov.append(0.025 * (b2 - b1))
        cov = np.diag(np.array(cov)).reshape(input_dim, input_dim)

        g = lambda xt: np.random.multivariate_normal(xt.squeeze(), cov
                                                     ).reshape(1, input_dim)

    # Show the bar only when running inside Jupyter ($_ holds the launcher path).
    progress_bar = progress_bar and 'jupyter' in os.environ.get('_', '')

    # ================================================
    # Burn-in period =================================
    if progress_bar:
        prog = IntProgress(value=0, max=burnin, description='Burn-in')
        display(prog)

    x = np.array(x0)
    for i in range(burnin):
        cand = g(x)  # candidate point
        if not model.within_bounds(cand):
            continue

        a = f(cand) / f(x)  # acceptance ratio
        if np.random.rand() < a:  # accept/reject
            x = np.copy(cand)

        if progress_bar:
            prog.value += 1

    # ================================================
    # Begin sampling =================================
    if progress_bar:
        prog.close()
        prog = IntProgress(value=0, max=n_samples, description='Sampling')
        display(prog)

    samples = []
    i = 0
    while len(samples) < n_samples:
        cand = g(x)  # candidate point
        if not model.within_bounds(cand):
            continue

        a = f(cand) / f(x)  # acceptance ratio
        if a < 0:  # defensive; a ratio of cdf values is never negative
            continue

        if np.random.rand() < a:  # accept/reject
            x = np.copy(cand)

        if (i % sample_rate) == 0:
            samples.append(np.copy(x))
            if progress_bar:
                prog.value += 1

        i += 1

    if progress_bar:
        prog.close()

    return np.array(samples).reshape(n_samples, input_dim)
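
For readers unfamiliar with the accept/reject loop above, here is a
self-contained toy version targeting a 1-D standard normal; all names are
illustrative, and burn-in and thinning are omitted for brevity:

import numpy as np
from scipy.stats import norm

def toy_mh(n_samples=1000, step=0.5, seed=0):
    # Toy Metropolis-Hastings sampler targeting a standard normal density.
    rng = np.random.default_rng(seed)
    f = norm.pdf  # target density (in general, known only up to a constant)
    x = 0.0
    samples = []
    while len(samples) < n_samples:
        cand = x + step * rng.standard_normal()  # Gaussian random-walk proposal
        a = f(cand) / f(x)                       # acceptance ratio
        if rng.random() < a:                     # accept/reject
            x = cand
        samples.append(x)
    return np.array(samples)

draws = toy_mh()
print(draws.mean(), draws.std())  # roughly 0 and 1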