Example #1
 def test_fork(self):
     """
     Test fork detection.
     """
     cuda.current_context()  # force cuda initialize
     # fork in process that also uses CUDA
     ctx = mp.get_context('fork')
     q = ctx.Queue()
     proc = ctx.Process(target=fork_test, args=[q])
     proc.start()
     exc = q.get()
     proc.join()
     # there should be an exception raised in the child process
     self.assertIsNotNone(exc)
     self.assertIn('CUDA initialized before forking', str(exc))
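The test above relies on a module-level `fork_test` helper that is not shown. A minimal sketch of what such a helper presumably does, assuming `cuda` and `np` are imported as in the surrounding tests and that Numba raises `CudaDriverError` when CUDA was initialized before a fork:

def fork_test(q):
    from numba.cuda.cudadrv.error import CudaDriverError
    try:
        cuda.to_device(np.arange(1))   # any CUDA use in the forked child
    except CudaDriverError as e:
        q.put(e)                       # the parent expects this exception
    else:
        q.put(None)                    # no error: the parent test will fail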
Example #2
 def test_attached_non_primary(self):
     # Emulate non-primary context creation by 3rd party
     the_driver = driver.driver
     hctx = driver.drvapi.cu_context()
     the_driver.cuCtxCreate(byref(hctx), 0, 0)
     try:
         cuda.current_context()
     except RuntimeError as e:
         # Expecting an error about non-primary CUDA context
         self.assertIn("Numba cannot operate on non-primary CUDA context ",
                       str(e))
     else:
         self.fail("No RuntimeError raised")
     finally:
         the_driver.cuCtxDestroy(hctx)
Example #3
    def test_max_pending_bytes(self):
        # get deallocation manager and flush it
        ctx = cuda.current_context()
        deallocs = ctx.deallocations
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)

        mi = ctx.get_memory_info()

        max_pending = 10**6  # 1MB
        old_ratio = config.CUDA_DEALLOCS_RATIO
        try:
            # change to a smaller ratio
            config.CUDA_DEALLOCS_RATIO = max_pending / mi.total
            # due to round off error (floor is used in calculating _max_pending_bytes)
            # it can be off by 1.
            self.assertAlmostEqual(deallocs._max_pending_bytes, max_pending, delta=1)

            # allocate half the max size
            # this will not trigger deallocation
            cuda.to_device(np.ones(max_pending // 2, dtype=np.int8))
            self.assertEqual(len(deallocs), 1)

            # allocate the remaining bytes up to the limit
            # this will not trigger deallocation
            cuda.to_device(np.ones(deallocs._max_pending_bytes - deallocs._size, dtype=np.int8))
            self.assertEqual(len(deallocs), 2)

            # another byte to trigger .clear()
            cuda.to_device(np.ones(1, dtype=np.int8))
            self.assertEqual(len(deallocs), 0)
        finally:
            # restore old ratio
            config.CUDA_DEALLOCS_RATIO = old_ratio
Example #4
    def test_ipc_handle(self):
        # prepare data for IPC
        arr = np.arange(10, dtype=np.intp)
        devarr = cuda.to_device(arr)

        # create IPC handle
        ctx = cuda.current_context()
        ipch = ctx.get_ipc_handle(devarr.gpu_data)

        # manually prepare for serialization as bytes
        handle_bytes = bytes(ipch.handle)
        size = ipch.size

        # spawn new process for testing
        ctx = mp.get_context('spawn')
        result_queue = ctx.Queue()
        args = (handle_bytes, size, result_queue)
        proc = ctx.Process(target=base_ipc_handle_test, args=args)
        proc.start()
        succ, out = result_queue.get()
        if not succ:
            self.fail(out)
        else:
            np.testing.assert_equal(arr, out)
        proc.join(3)
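The spawned target `base_ipc_handle_test` is not listed here. A hedged sketch of such a helper, assuming it reopens the handle with Numba's `cuda.open_ipc_array` context manager and reports a `(success, payload)` pair through the queue, matching the protocol the test expects:

def base_ipc_handle_test(handle_bytes, size, result_queue):
    dtype = np.dtype(np.intp)
    try:
        with cuda.open_ipc_array(handle_bytes, shape=size // dtype.itemsize,
                                 dtype=dtype) as darr:
            arr = darr.copy_to_host()   # copy the shared allocation back to host
    except Exception as e:
        result_queue.put((False, str(e)))
    else:
        result_queue.put((True, arr))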
Example #5
    def test_max_pending_bytes(self):
        # get deallocation manager and flush it
        ctx = cuda.current_context()
        deallocs = ctx.deallocations
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)

        mi = ctx.get_memory_info()

        max_pending = 10**6  # 1MB
        old_ratio = config.CUDA_DEALLOCS_RATIO
        try:
            # change to a smaller ratio
            config.CUDA_DEALLOCS_RATIO = max_pending / mi.total
            self.assertEqual(deallocs._max_pending_bytes, max_pending)

            # deallocate half the max size
            cuda.to_device(np.ones(max_pending // 2, dtype=np.int8))
            self.assertEqual(len(deallocs), 1)

            # deallocate the remaining bytes up to the limit
            cuda.to_device(np.ones(max_pending - deallocs._size, dtype=np.int8))
            self.assertEqual(len(deallocs), 2)

            # another byte to trigger .clear()
            cuda.to_device(np.ones(1, dtype=np.int8))
            self.assertEqual(len(deallocs), 0)
        finally:
            # restore old ratio
            config.CUDA_DEALLOCS_RATIO = old_ratio
Example #6
    def test_ipc_handle_serialization(self):
        # prepare data for IPC
        arr = np.arange(10, dtype=np.intp)
        devarr = cuda.to_device(arr)

        # create IPC handle
        ctx = cuda.current_context()
        ipch = ctx.get_ipc_handle(devarr.gpu_data)

        # pickle
        buf = pickle.dumps(ipch)
        ipch_recon = pickle.loads(buf)
        self.assertIs(ipch_recon.base, None)
        self.assertEqual(tuple(ipch_recon.handle), tuple(ipch.handle))
        self.assertEqual(ipch_recon.size, ipch.size)

        # spawn new process for testing
        ctx = mp.get_context('spawn')
        result_queue = ctx.Queue()
        args = (ipch, result_queue)
        proc = ctx.Process(target=serialize_ipc_handle_test, args=args)
        proc.start()
        succ, out = result_queue.get()
        if not succ:
            self.fail(out)
        else:
            np.testing.assert_equal(arr, out)
        proc.join(3)
Example #7
    def test_mapped_contextmanager(self):
        # Check that temporarily mapped memory is unregistered immediately,
        # such that it can be re-mapped at any time
        class MappedException(Exception):
            pass

        arr = np.zeros(1)
        ctx = cuda.current_context()
        ctx.deallocations.clear()
        with self.check_ignored_exception(ctx):
            with cuda.mapped(arr) as marr:
                pass
            with cuda.mapped(arr) as marr:
                pass
            # Should also work inside a `defer_cleanup` block
            with cuda.defer_cleanup():
                with cuda.mapped(arr) as marr:
                    pass
                with cuda.mapped(arr) as marr:
                    pass
            # Should also work when breaking out of the block due to an exception
            try:
                with cuda.mapped(arr) as marr:
                    raise MappedException
            except MappedException:
                with cuda.mapped(arr) as marr:
                    pass
Example #8
 def _array_helper(self, addr, datasize, shape,
                   strides, dtype, finalizer=None):
     ctx = cuda.current_context()
     ptr = ctypes.c_uint64(int(addr))
     mem = cuda.driver.MemoryPointer(ctx, ptr, datasize,
                                     finalizer=finalizer)
     return cuda.cudadrv.devicearray.DeviceNDArray(shape, strides, dtype,
                                                   gpu_data=mem)
Example #9
 def the_work():
     dtype = np.dtype(np.intp)
     darr = handle.open_array(cuda.current_context(),
                              shape=handle.size // dtype.itemsize,
                              dtype=dtype)
     # copy the data to host
     arr = darr.copy_to_host()
     handle.close()
     return arr
Example #10
def _as_numba_devarray(intaddr, nelem, dtype):
    dtype = np.dtype(dtype)
    addr = ctypes.c_uint64(intaddr)
    elemsize = dtype.itemsize
    datasize = elemsize * nelem
    memptr = cuda.driver.MemoryPointer(context=cuda.current_context(),
                                       pointer=addr, size=datasize)
    return cuda.devicearray.DeviceNDArray(shape=(nelem,), strides=(elemsize,),
                                          dtype=dtype, gpu_data=memptr)
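A hypothetical use of the helper above: re-wrap an existing Numba allocation by its raw device address (in practice the integer address would come from another CUDA library). Note that the original owner must stay alive while the view is in use, because the wrapped `MemoryPointer` is created without a finalizer and does not own the memory.

src = cuda.to_device(np.arange(8, dtype=np.float32))
addr = src.device_ctypes_pointer.value            # raw device pointer as an int
view = _as_numba_devarray(addr, nelem=8, dtype=np.float32)
print(view.copy_to_host())                        # same contents as `src`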
Example #11
    def test_context_memory(self):
        mem = cuda.current_context().get_memory_info()

        self.assertIsInstance(mem.free, numbers.Number)
        self.assertEqual(mem.free, mem[0])

        self.assertIsInstance(mem.total, numbers.Number)
        self.assertEqual(mem.total, mem[1])

        self.assertLessEqual(mem.free, mem.total)
Example #12
def get_gpus_mem():
    gpus = cuda.gpus.lst
    mem_list = []
    for gpu in gpus:
        with gpu:
            meminfo = cuda.current_context().get_memory_info()
        mem_list.append(int(meminfo[0]))
    sort_gpus = [x for (y, x) in sorted(zip(mem_list, gpus), reverse=True)]
    sort_gmem = [y for (y, x) in sorted(zip(mem_list, gpus), reverse=True)]
    # return [sort_gpus[0]], [sort_gmem[0]]
    return sort_gpus, sort_gmem
Example #13
 def test_max_pending_count(self):
     # get deallocation manager and flush it
     deallocs = cuda.current_context().deallocations
     deallocs.clear()
     self.assertEqual(len(deallocs), 0)
     # deallocate to maximum count
     for i in range(config.CUDA_DEALLOCS_COUNT):
         cuda.to_device(np.arange(1))
         self.assertEqual(len(deallocs), i + 1)
     # one more to trigger .clear()
     cuda.to_device(np.arange(1))
     self.assertEqual(len(deallocs), 0)
Example #14
def wrap_fq(atoms, qbin=.1, sum_type='fq'):
    # get information for FQ transformation
    q = atoms.get_positions()
    q = q.astype(np.float32)
    n = len(q)
    if sum_type == 'fq':
        scatter_array = atoms.get_array('F(Q) scatter')
    else:
        scatter_array = atoms.get_array('PDF scatter')
    qmax_bin = scatter_array.shape[1]

    # get  number of allocated nodes
    n_nodes = count_nodes()
    print('nodes', n_nodes)

    # get info on our gpu setup and available memory
    mem_list = gpu_avail(n_nodes)
    mem_list.append(cuda.current_context().get_memory_info()[0])

    # starting buffers
    n_cov = 0

    # create list of tasks
    m_list = []
    while n_cov < n:
        for mem in mem_list:
            m = gpu_fq_atoms_allocation(n, qmax_bin, mem)
            if m > n - n_cov:
                m = n - n_cov
            m_list.append(m)
            if n_cov >= n:
                break
            n_cov += m
            if n_cov >= n:
                break

    # Make certain that we have covered all the atoms
    assert sum(m_list) == n

    reports = mpi_fq(n_nodes, m_list, q, scatter_array, qbin)

    fq = np.zeros(qmax_bin)
    for ele in reports:
        fq[:] += ele
    na = np.average(scatter_array, axis=0) ** 2 * n
    old_settings = np.seterr(all='ignore')
    fq = np.nan_to_num(1 / na * fq)
    np.seterr(**old_settings)
    return fq
Example #15
 def test_attached_primary(self):
     # Emulate primary context creation by 3rd party
     the_driver = driver.driver
     hctx = driver.drvapi.cu_context()
     the_driver.cuDevicePrimaryCtxRetain(byref(hctx), 0)
     try:
         ctx = driver.Context(weakref.proxy(self), hctx)
         ctx.push()
         # Check that the context from numba matches the created primary
         # context.
         my_ctx = cuda.current_context()
         self.assertEqual(my_ctx.handle.value, ctx.handle.value)
     finally:
         ctx.pop()
         the_driver.cuDevicePrimaryCtxRelease(0)
Example #16
    def test_basic(self):
        harr = np.arange(5)
        darr1 = cuda.to_device(harr)
        deallocs = cuda.current_context().deallocations
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)
        with cuda.defer_cleanup():
            darr2 = cuda.to_device(harr)
            del darr1
            self.assertEqual(len(deallocs), 1)
            del darr2
            self.assertEqual(len(deallocs), 2)
            deallocs.clear()
            self.assertEqual(len(deallocs), 2)

        deallocs.clear()
        self.assertEqual(len(deallocs), 0)
Example #17
    def test_host_alloc_driver(self):
        n = 32
        mem = cuda.current_context().memhostalloc(n, mapped=True)

        dtype = np.dtype(np.uint8)
        ary = np.ndarray(shape=n // dtype.itemsize, dtype=dtype, buffer=mem)

        magic = 0xAB
        driver.device_memset(mem, magic, n)

        self.assertTrue(np.all(ary == magic))

        ary.fill(n)

        recv = np.empty_like(ary)

        driver.device_to_host(recv, mem, ary.size)

        self.assertTrue(np.all(ary == recv))
        self.assertTrue(np.all(recv == n))
Example #18
 def test_ownership(self):
     # Get the deallocation queue
     ctx = cuda.current_context()
     deallocs = ctx.deallocations
     # Flush all deallocations
     deallocs.clear()
     self.assertEqual(len(deallocs), 0)
     # Make new device array
     d_arr = cuda.to_device(np.arange(100))
     # Convert it
     cvted = cuda.as_cuda_array(d_arr)
     # Drop reference to the original object such that
     # only `cvted` has a reference to it.
     del d_arr
     # There shouldn't be any new deallocations
     self.assertEqual(len(deallocs), 0)
     # Try to access the memory and verify its content
     np.testing.assert_equal(cvted.copy_to_host(), np.arange(100))
     # Drop last reference to the memory
     del cvted
     self.assertEqual(len(deallocs), 1)
     # Flush
     deallocs.clear()
Example #19
    def test_exception(self):
        harr = np.arange(5)
        darr1 = cuda.to_device(harr)
        deallocs = cuda.current_context().deallocations
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)

        class CustomError(Exception):
            pass

        with self.assertRaises(CustomError):
            with cuda.defer_cleanup():
                darr2 = cuda.to_device(harr)
                del darr2
                self.assertEqual(len(deallocs), 1)
                deallocs.clear()
                self.assertEqual(len(deallocs), 1)
                raise CustomError
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)
        del darr1
        self.assertEqual(len(deallocs), 1)
        deallocs.clear()
        self.assertEqual(len(deallocs), 0)
Example #20
 def test_stream(self):
     ctx = cuda.current_context()
     stream = ctx.create_stream()
     with self.check_ignored_exception(ctx):
         del stream
Example #21
def get_device_total_memory(index=0):
    """
    Return total memory of CUDA device with index
    """
    with cuda.gpus[index]:
        return cuda.current_context().get_memory_info()[1]
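A hypothetical usage sketch: report the total memory of every visible device, reusing the helper above.

from numba import cuda

for dev in cuda.list_devices():
    total = get_device_total_memory(dev.id)
    print('GPU %d: %.1f GiB total' % (dev.id, total / 2**30))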
Example #22
 def test_device_memory(self):
     ctx = cuda.current_context()
     mem = ctx.memalloc(32)
     with self.check_ignored_exception(ctx):
         del mem
Example #23
    logger.info(fn.__doc__)
    logger.info("Cardinality: {}".format(n))
    logger.info("Dimensionality: {}".format(d))
    return fn(n, d)


# computes required device memory: data + labels + dists + centroids
# n = cardinality, d = dimensionality, c = number of clusters
req_mem = lambda n, d, c: (n * d * 4 + n * 2 * 4 + c * d * 4)

# HOST memory max
MAX_ALLOWED_HOST_MEM = hostmemory * 2 ** 30
MAX_ALLOWED_HOST_MEM = int(MAX_ALLOWED_HOST_MEM)

# compute device memory
c = cuda.current_context()
free_mem, total_mem = c.get_memory_info()
MAX_ALLOWED_DEVICE_MEM = thresholdgpu * total_mem  # threshold default is 0.97
MAX_ALLOWED_DEVICE_MEM = int(MAX_ALLOWED_DEVICE_MEM)

logger.info("Will occupy maximum of {} MB in" " device memory.".format(MAX_ALLOWED_DEVICE_MEM / (1024.0 ** 2)))

# cardinality = [100, 250, 500, 750,
#                1e3, 2.5e3, 5e3, 7.5e3,
#                1e4, 2.5e4, 5e4, 7.5e4,
#                1e5, 2.5e5, 5e5, 7.5e5,
#                1e6, 2.5e6, 5e6, 7.5e6,
#                1e7]
cardinality = [100, 1e3, 5e3, 1e4, 5e4, 1e5, 5e5, 1e6, 5e6, 1e7]
cardinality = map(int, cardinality)
Example #24
def cuda_current_context():
    ctx = cuda.current_context()
    return ctx
Example #25
 def test_mapped_memory(self):
     ctx = cuda.current_context()
     mem = ctx.memhostalloc(32, mapped=True)
     with self.check_ignored_exception(ctx):
         del mem
Example #26
def cc_X_or_above(major, minor):
    if not config.ENABLE_CUDASIM:
        return cuda.current_context().device.compute_capability >= (major, minor)
    else:
        return True
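A typical use for such a helper is gating tests on compute capability; a hypothetical example with the standard `unittest` module:

import unittest

@unittest.skipUnless(cc_X_or_above(5, 3), "requires compute capability >= 5.3")
class TestHalfPrecision(unittest.TestCase):
    def test_fp16_kernel(self):
        ...   # a test that relies on fp16 hardware support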
Example #27
 def switch_gpu():
     with cuda.gpus[1]:
         return cuda.current_context().device.id
Example #28
 def test_event(self):
     ctx = cuda.current_context()
     event = ctx.create_event()
     with self.check_ignored_exception(ctx):
         del event
Example #29
def cc_X_or_above(major, minor):
    if not config.ENABLE_CUDASIM:
        return cuda.current_context().device.compute_capability >= (major,
                                                                    minor)
    else:
        return True
Example #30
def gpu_stump(
    T_A, m, T_B=None, ignore_trivial=True, device_id=0, normalize=True, p=2.0
):
    """
    Compute the z-normalized matrix profile with one or more GPU devices

    This is a convenience wrapper around the Numba `cuda.jit` `_gpu_stump` function
    which computes the matrix profile according to GPU-STOMP. The default number of
    threads-per-block is set to `512` and may be changed by setting the global parameter
    `config.STUMPY_THREADS_PER_BLOCK` to an appropriate number based on your GPU
    hardware.

    Parameters
    ----------
    T_A : numpy.ndarray
        The time series or sequence for which to compute the matrix profile

    m : int
        Window size

    T_B : numpy.ndarray, default None
        The time series or sequence that will be used to annotate T_A. For every
        subsequence in T_A, its nearest neighbor in T_B will be recorded. Default is
        `None` which corresponds to a self-join.

    ignore_trivial : bool, default True
        Set to `True` if this is a self-join. Otherwise, for AB-join, set this
        to `False`. Default is `True`.

    device_id : int or list, default 0
        The (GPU) device number to use. The default value is `0`. A list of
        valid device ids (int) may also be provided for parallel GPU-STUMP
        computation. A list of all valid device ids can be obtained by
        executing `[device.id for device in numba.cuda.list_devices()]`.

    normalize : bool, default True
        When set to `True`, this z-normalizes subsequences prior to computing distances.
        Otherwise, this function gets re-routed to its complementary non-normalized
        equivalent set in the `@core.non_normalized` function decorator.

    p : float, default 2.0
        The p-norm to apply for computing the Minkowski distance. This parameter is
        ignored when `normalize == True`.

    Returns
    -------
    out : numpy.ndarray
        The first column consists of the matrix profile, the second column
        consists of the matrix profile indices, the third column consists of
        the left matrix profile indices, and the fourth column consists of
        the right matrix profile indices.

    See Also
    --------
    stumpy.stump : Compute the z-normalized matrix profile
    stumpy.stumped : Compute the z-normalized matrix profile with a distributed dask
        cluster
    stumpy.scrump : Compute an approximate z-normalized matrix profile

    Notes
    -----
    `DOI: 10.1109/ICDM.2016.0085 \
    <https://www.cs.ucr.edu/~eamonn/STOMP_GPU_final_submission_camera_ready.pdf>`__

    See Table II, Figure 5, and Figure 6

    Timeseries, T_A, will be annotated with the distance location
    (or index) of all its subsequences in another times series, T_B.

    Return: For every subsequence, Q, in T_A, you will get a distance
    and index for the closest subsequence in T_B. Thus, the array
    returned will have length T_A.shape[0]-m+1. Additionally, the
    left and right matrix profiles are also returned.

    Note: Unlike in the Table II where T_A.shape is expected to be equal
    to T_B.shape, this implementation is generalized so that the shapes of
    T_A and T_B can be different. In the case where T_A.shape == T_B.shape,
    then our algorithm reduces down to the same algorithm found in Table II.

    Additionally, unlike STAMP where the exclusion zone is m/2, the default
    exclusion zone for STOMP is m/4 (See Definition 3 and Figure 3).

    For self-joins, set `ignore_trivial = True` in order to avoid the
    trivial match.

    Note that left and right matrix profiles are only available for self-joins.

    Examples
    --------
    >>> from numba import cuda
    >>> if __name__ == "__main__":
    ...     all_gpu_devices = [device.id for device in cuda.list_devices()]
    ...     stumpy.gpu_stump(
    ...         np.array([584., -11., 23., 79., 1001., 0., -19.]),
    ...         m=3,
    ...         device_id=all_gpu_devices)
    array([[0.11633857113691416, 4, -1, 4],
           [2.694073918063438, 3, -1, 3],
           [3.0000926340485923, 0, 0, 4],
           [2.694073918063438, 1, 1, -1],
           [0.11633857113691416, 0, 0, -1]], dtype=object)
    """
    if T_B is None:  # Self join!
        T_B = T_A
        ignore_trivial = True

    T_A, M_T, Σ_T = core.preprocess(T_A, m)
    T_B, μ_Q, σ_Q = core.preprocess(T_B, m)

    if T_A.ndim != 1:  # pragma: no cover
        raise ValueError(
            f"T_A is {T_A.ndim}-dimensional and must be 1-dimensional. "
            "For multidimensional STUMP use `stumpy.mstump` or `stumpy.mstumped`"
        )

    if T_B.ndim != 1:  # pragma: no cover
        raise ValueError(
            f"T_B is {T_B.ndim}-dimensional and must be 1-dimensional. "
            "For multidimensional STUMP use `stumpy.mstump` or `stumpy.mstumped`"
        )

    core.check_window_size(m, max_size=min(T_A.shape[0], T_B.shape[0]))

    if ignore_trivial is False and core.are_arrays_equal(T_A, T_B):  # pragma: no cover
        logger.warning("Arrays T_A, T_B are equal, which implies a self-join.")
        logger.warning("Try setting `ignore_trivial = True`.")

    if ignore_trivial and core.are_arrays_equal(T_A, T_B) is False:  # pragma: no cover
        logger.warning("Arrays T_A, T_B are not equal, which implies an AB-join.")
        logger.warning("Try setting `ignore_trivial = False`.")

    n = T_B.shape[0]
    k = T_A.shape[0] - m + 1
    l = n - m + 1
    excl_zone = int(
        np.ceil(m / config.STUMPY_EXCL_ZONE_DENOM)
    )  # See Definition 3 and Figure 3

    T_A_fname = core.array_to_temp_file(T_A)
    T_B_fname = core.array_to_temp_file(T_B)
    M_T_fname = core.array_to_temp_file(M_T)
    Σ_T_fname = core.array_to_temp_file(Σ_T)
    μ_Q_fname = core.array_to_temp_file(μ_Q)
    σ_Q_fname = core.array_to_temp_file(σ_Q)

    out = np.empty((k, 4), dtype=object)

    if isinstance(device_id, int):
        device_ids = [device_id]
    else:
        device_ids = device_id

    profile = [None] * len(device_ids)
    indices = [None] * len(device_ids)

    for _id in device_ids:
        with cuda.gpus[_id]:
            if (
                cuda.current_context().__class__.__name__ != "FakeCUDAContext"
            ):  # pragma: no cover
                cuda.current_context().deallocations.clear()

    step = 1 + l // len(device_ids)

    # Start process pool for multi-GPU request
    if len(device_ids) > 1:  # pragma: no cover
        mp.set_start_method("spawn", force=True)
        pool = mp.Pool(processes=len(device_ids))
        results = [None] * len(device_ids)

    QT_fnames = []
    QT_first_fnames = []

    for idx, start in enumerate(range(0, l, step)):
        stop = min(l, start + step)

        QT, QT_first = core._get_QT(start, T_A, T_B, m)
        QT_fname = core.array_to_temp_file(QT)
        QT_first_fname = core.array_to_temp_file(QT_first)
        QT_fnames.append(QT_fname)
        QT_first_fnames.append(QT_first_fname)

        if len(device_ids) > 1 and idx < len(device_ids) - 1:  # pragma: no cover
            # Spawn and execute in child process for multi-GPU request
            results[idx] = pool.apply_async(
                _gpu_stump,
                (
                    T_A_fname,
                    T_B_fname,
                    m,
                    stop,
                    excl_zone,
                    M_T_fname,
                    Σ_T_fname,
                    QT_fname,
                    QT_first_fname,
                    μ_Q_fname,
                    σ_Q_fname,
                    k,
                    ignore_trivial,
                    start + 1,
                    device_ids[idx],
                ),
            )
        else:
            # Execute last chunk in parent process
            # Only parent process is executed when a single GPU is requested
            profile[idx], indices[idx] = _gpu_stump(
                T_A_fname,
                T_B_fname,
                m,
                stop,
                excl_zone,
                M_T_fname,
                Σ_T_fname,
                QT_fname,
                QT_first_fname,
                μ_Q_fname,
                σ_Q_fname,
                k,
                ignore_trivial,
                start + 1,
                device_ids[idx],
            )

    # Clean up process pool for multi-GPU request
    if len(device_ids) > 1:  # pragma: no cover
        pool.close()
        pool.join()

        # Collect results from spawned child processes if they exist
        for idx, result in enumerate(results):
            if result is not None:
                profile[idx], indices[idx] = result.get()

    os.remove(T_A_fname)
    os.remove(T_B_fname)
    os.remove(M_T_fname)
    os.remove(Σ_T_fname)
    os.remove(μ_Q_fname)
    os.remove(σ_Q_fname)
    for QT_fname in QT_fnames:
        os.remove(QT_fname)
    for QT_first_fname in QT_first_fnames:
        os.remove(QT_first_fname)

    for idx in range(len(device_ids)):
        profile_fname = profile[idx]
        indices_fname = indices[idx]
        profile[idx] = np.load(profile_fname, allow_pickle=False)
        indices[idx] = np.load(indices_fname, allow_pickle=False)
        os.remove(profile_fname)
        os.remove(indices_fname)

    for i in range(1, len(device_ids)):
        # Update all matrix profiles and matrix profile indices
        # (global, left, right) and store in profile[0] and indices[0]
        for col in range(profile[0].shape[1]):  # pragma: no cover
            cond = profile[0][:, col] < profile[i][:, col]
            profile[0][:, col] = np.where(cond, profile[0][:, col], profile[i][:, col])
            indices[0][:, col] = np.where(cond, indices[0][:, col], indices[i][:, col])

    out[:, 0] = profile[0][:, 0]
    out[:, 1:4] = indices[0][:, :]

    threshold = 10e-6
    if core.are_distances_too_small(out[:, 0], threshold=threshold):  # pragma: no cover
        logger.warning(f"A large number of values are smaller than {threshold}.")
        logger.warning("For a self-join, try setting `ignore_trivial = True`.")

    return out
Example #31
 def switch_gpu():
     with cuda.gpus[1]:
         return cuda.current_context().device.id
Example #32
def _get_context():
    pid = multiprocessing.current_process().pid
    ctxid = cuda.current_context().handle.value
    return pid, ctxid
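A hypothetical way to exercise this helper: call it in the parent and in a spawned worker and check that the two processes report different context handles.

import multiprocessing

if __name__ == '__main__':
    ctx = multiprocessing.get_context('spawn')
    with ctx.Pool(1) as pool:
        child_pid, child_ctxid = pool.apply(_get_context)
    parent_pid, parent_ctxid = _get_context()
    assert child_pid != parent_pid
    assert child_ctxid != parent_ctxid   # each process owns its own context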
Example #33
 def test_device_memory(self):
     ctx = cuda.current_context()
     mem = ctx.memalloc(32)
     with self.check_ignored_exception(ctx):
         del mem
Example #34
 def test_mapped_memory(self):
     ctx = cuda.current_context()
     mem = ctx.memhostalloc(32, mapped=True)
     with self.check_ignored_exception(ctx):
         del mem
Example #35
 def cc(self):
     return cuda.current_context().device.compute_capability
Example #36
def get_device_memory_info():
    """
    Returns the total amount of global memory on the device in bytes
    """
    meminfo = cuda.current_context().get_memory_info()
    return meminfo[1]
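`get_memory_info()` returns a `(free, total)` pair that also exposes `.free` and `.total` attributes (see Example #11), so a free-memory report is a one-line change. A minimal usage sketch:

from numba import cuda

meminfo = cuda.current_context().get_memory_info()
print('free : %.0f MiB' % (meminfo.free / 2**20))
print('total: %.0f MiB' % (meminfo.total / 2**20))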
Example #37
 def destroy(self):
     context = cuda.current_context(self.gpu)
     context.reset()
     # delete variables in self.
     gc.collect(1)
def iam_lots_gpu_compute(csv_filename="", patch_size=[1,2,4,8],
                         blending_weights=[0.65,0.2,0.1,0.05], num_sample=[512],
                         alpha=0.5, thrsh_patches = True, bin_tresh=0.5, save_jpeg=True,
                         delete_intermediary=False, nawm_preprocessing=False):
    '''
    FUNCTION'S SUMMARY:

    Main function of the LOTS-IAM-GPU algorithm. This function produces (i.e. saves)
    age maps that indicate the level of irregularity of voxels in brain FLAIR MRI. This
    function reads a list of FLAIR MR images (NIfTI), ICV masks (NIfTI), CSF masks (NIfTI),
    NAWM masks (NIfTI), and Cortical masks (NIfTI) from a CSV file and produces the
    corresponding age maps. Please note that this version only accepts NIfTI (.nii/.nii.gz) files.

    NOTE: NAWM and Cortical masks are optional. They will be used if they are included
    in the CSV file.

    Format of the CSV input file (NOTE: spaces are used to make the format clearer):

        path_to_mri_codebase_folder, mri_code_name, path_FLAIR, path_ICV, path_CSF,
        path_NAWM (optional), path_Cortical (optional)

    Example (NOTE: spaces are used to make the format clearer):

        /dir/MRI_DB/, MRI001, /dir/MRI_DB/MRI001/FLAIR.nii.gz, /dir/MRI_DB/MRI001/ICV.nii.gz,
        /dir/MRI_DB/MRI001/CSF.nii.gz, /dir/MRI_DB/MRI001/NAWM.nii.gz (optional),
        /dir/MRI_DB/MRI001/Cortex.nii.gz (optional)

    By default, the age maps are calculated by using four different sizes of source/target
    patches (i.e. 1x1, 2x2, 4x4, and 8x8) and 64 target samples. Furthermore, all intermediary
    files are saved in .mat (Matlab) and JPEG files.


    INPUT PARAMETERS:

    This function's behavior can be set by using input parameters below.

        1. output_filedir   : Path of directory for saving all results. Format of the path:
                              "output_path/name_of_experiment"

        2. csv_filename     : Name of a CSV input file which contains list all files to be
                              processed by the LOTS-IAM-GPU. Example: "input.csv"

        3. patch_size       : Size of source/target patches for IAM's computation. Default:
                              [1,2,4,8] to calculate age maps from four different sizes of
                              source/target patches i.e. 1x1, 2x2, 4x4, and 8x8. The sizes
                              of source/target patches must be in the form of python's list.

        4. blending_weights : Weights used for blending age maps produced by different sizes of
                              source/target patches. The weights must be given as a Python list,
                              sum to 1, and have the same length as `patch_size`.

        5. num_sample       : A list of numbers used for randomly sampling target patches to be
                              used in the LOTS-IAM-GPU calculation. Default: [512]. Available
                              values: [64, 128, 256, 512, 1024, 2048]. Some important notes:

                                a. Smaller number will make computation faster.
                                b. Input the numbers as a list to automatically produce
                                   age maps by using all different numbers of target patches.
                                   The software will automatically create different output
                                   folders for different number of target samples.
                                c. For this version, only 64, 128, 256, 512, 1024, and 2048
                                   can be used as input numbers (error will be raised if other
                                   numbers are used).

        6. alpha            : Weight of distance function to blend maximum difference and
                              average difference between source and target patches. Default:
                              0.5. Input value should be between 0 and 1 (i.e. floating points).
                              The current distance function being used is:

                                  d = (alpha . |max(s - t)|) + ((1 - alpha) . |mean(s - t)|)

                              where d is distance value, s is source patch, and t is target patch.

        7. bin_tresh        : Threshold value for cutting off probability values of brain masks,
                              if probability masks are given instead of binary masks.

        8. save_jpeg        : True  --> Save all JPEG files for visualisation.
                              False --> Do not save the JPEG files.

        9. delete_intermediary : False --> Save all intermediary files (i.e. JPEG/.mat files).
                                 True  --> Delete all intermediary files, saving some space on
                                           the hard disk drive.

    OUTPUT:

    The software will automatically create a new folder provided in "output_filedir" variable.
    Please make sure that the directory is accessible and writable.

    Inside the experiment’s folder, each patient/MRI mri_code will have its own folder. By default,
    there are 6 sub-folders:
    1. 1: Contains age maps of each slice generated by using 1x1 patch.
    2. 2: Contains age maps of each slice generated by using 2x2 patch.
    3. 4: Contains age maps of each slice generated by using 4x4 patch.
    4. 8: Contains age maps of each slice generated by using 8x8 patch.
    5. IAM_combined_python: Contains two sub-folders:
        a. Patch: contains visualisation of age maps of each slices in JPEG files, and
        b. Combined: contains visualisation of the final output of LOTS-IAM-GPU’s computation.
    6. IAM_GPU_nifti_python: Contains one Matlab (.mat) file and three NIfTI files (.nii.gz):
        a. all_slice_dat.mat: processed mri_code of all slices in Matlab file,
        b. IAM_GPU_COMBINED.nii.gz: the original age map values,
        c. IAM_GPU_GN.nii.gz: the final age map values (i.e. GN and penalty), and
        d. IAM_GPU_GN_postprocessed.nii.gz: the final age map values plus post-processing
           (only if NAWM mask is provided).

    Note: If parameter value of `delete_intermediary` is `True`, then all folders listed above
    will be deleted, except for folder `IAM_GPU_nifti_python` and its contents.

    MORE HELP:

    Please read README.md file provided in:
        https://github.com/febrianrachmadi/lots-iam-gpu

    VERSION (dd/mm/yyyy):
    - 31/05/2018b: NAWM and Cortical brain masks are now optional input (will be used if available).
    - 31/05/2018a: Fix header information of the LOTS-IAM-GPU's result.
    - 08/05/2018 : Add lines to cutting off probability mask and deleting intermediary folders.
    - 07/05/2018 : Initial release code.
    '''

    ## Check availability of input files and output path
    if csv_filename == "":
        raise ValueError("Please set output folder's name and CSV mri_code filename. See: help(iam_lots_gpu)")
        return 0

    ## Check compatibility between 'patch_size' and 'blending_weights'
    if len(patch_size) != len(blending_weights):
        raise ValueError("Lengths of 'patch_size' and 'blending_weights' variables are not the same. Length of 'patch_size' is " + str(len(patch_size)) + ", while 'blending_weights' is " + str(len(blending_weights)) + ".")
        return 0

    ## If intermediary files to be deleted, don't even try to save JPEGs
    if delete_intermediary:
        save_jpeg = False

    ''' Set number of mean samples automatically '''
    ''' num_samples_all = [64, 128, 256, 512, 1024, 2048] '''
    ''' num_mean_samples_all = [16, 32, 32, 64, 128, 128] '''
    num_samples_all = num_sample
    num_mean_samples_all = []
    for sample in num_samples_all:
        if sample == 64:
            num_mean_samples_all.append(16)
        elif sample == 128:
            num_mean_samples_all.append(32)
        elif sample == 256:
            num_mean_samples_all.append(32)
        elif sample == 512:
            num_mean_samples_all.append(64)
        elif sample == 1024:
            num_mean_samples_all.append(128)
        elif sample == 2048:
            num_mean_samples_all.append(128)
        else:
            raise ValueError("Number of samples must be either 64, 128, 256, 512, 1024 or 2048!")
            return 0

    print("--- PARAMETERS - CHECKED ---")
    print('CSV mri_code filename: ' + csv_filename)
    print('Patch size(s): ' + str(patch_size))
    print('Number of samples (all): ' + str(num_samples_all))
    print('Number of mean samples (all): ' + str(num_mean_samples_all))
    print('Save JPEGs? ' + str(save_jpeg))
    print("--- PARAMETERS - CHECKED ---\n")

    for ii_s in range(0, len(num_samples_all)):
        num_samples = num_samples_all[ii_s]
        num_mean_samples = num_mean_samples_all[ii_s]
        print('Number of samples for IAM: ' + str(num_samples))
        print('Number of mean samples for IAM: ' + str(num_mean_samples))

        with open(csv_filename, newline='') as csv_file:
            num_subjects = len(csv_file.readlines())
            print('Number of subject(s): ' + str(num_subjects))

        with open(csv_filename, newline='', encoding="utf-8-sig") as csv_file:
            reader = csv.reader(csv_file)

            timer_idx = 0
            elapsed_times_all = np.zeros((num_subjects))
            elapsed_times_patch_all = np.zeros((num_subjects, len(patch_size)))
            for row in reader:
                mri_code = row[2]

                dirOutput = row[1]
                print('Output dir: ' + dirOutput + '\n--')

                try:
                    os.makedirs(dirOutput)
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        raise


                print('--\nNow processing mri_code: ' + mri_code)

                inputSubjectDir = row[0]
                print('Input filename (full path): ' + inputSubjectDir)

                ''' Create output folder(s) '''
                dirOutData = dirOutput + '/' + mri_code
                dirOutDataCom = dirOutput + '/' + mri_code + '/IAM_combined_python/'
                dirOutDataPatch = dirOutput + '/' + mri_code + '/IAM_combined_python/Patch/'
                dirOutDataCombined = dirOutput + '/' + mri_code + '/IAM_combined_python/Combined/'
                try:
                    print(dirOutDataCom)
                    os.makedirs(dirOutDataCom)
                    os.makedirs(dirOutDataPatch)
                    os.makedirs(dirOutDataCombined)
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        raise

                mri_data = sio.loadmat(row[0])     # Loading FLAIR
                mri_data = mri_data["flair"]
                [x_len, y_len, z_len] = mri_data.shape

                one_mri_data = timer()
                for xy in range(0, len(patch_size)):
                    print('>>> Processing patch-size: ' + str(patch_size[xy]) + ' <<<\n')

                    try:
                        os.makedirs(dirOutData + '/' + str(patch_size[xy]))
                    except OSError as e:
                        if e.errno != errno.EEXIST:
                            raise

                    one_patch = timer()
                    for zz in range(0, mri_data.shape[2]):
                        print('---> Slice number: ' + str(zz) + ' <---')

                        '''
                        KEY POINT: PRE-PROCESSING P.2 - START
                        -------------------------------------
                        This version still does per slice operation for extracting brain tissues.
                        Two important variables used in the next part of the code are:
                        1. mask_slice --->  Combination of ICV & CSF masks. It is used to find valid source patches
                                            for LOTS-IAM-GPU computation (i.e. brain tissues' source patches).
                        2. brain_slice -->  Brain tissues' information from FLAIR slice.
                        '''

                        mask_slice = np.nan_to_num(mri_data[:, :, zz])
                        mask_slice[mask_slice > 0] = 1

                        brain_slice = np.nan_to_num(mri_data[:, :, zz])

                        '''
                        -----------------------------------
                        KEY POINT: PRE-PROCESSING P.2 - END
                        '''

                        ## Show brain slice to be used for computation
                        #fig, ax = plt.subplots()
                        #cax = ax.imshow(icv_slice, cmap="jet")
                        #cbar = fig.colorbar(cax)
                        #fig.show()
                        #plt.savefig("plot.jpg")


                        # Vol distance threshold
                        vol_slice = np.count_nonzero(brain_slice) / (x_len * y_len)                         ## Proportion of brain slice compared to full image
                        print('DEBUG-Patch: brain_slice - ' + str(np.count_nonzero(brain_slice)) +
                              ', x_len * y_len - ' + str(x_len * y_len) + ', vol: ' + str(vol_slice))       ## x_len/y_len = 512 here

                        # Patch's sampling number threshold
                        TRSH = 0.50
                        if patch_size[xy] == 1:
                            if vol_slice < 0.010: TRSH = 0
                            elif vol_slice < 0.035: TRSH = 0.15
                            elif vol_slice < 0.070 and vol_slice >= 0.035: TRSH = 0.60
                            elif vol_slice >= 0.070: TRSH = 0.80
                        elif patch_size[xy] == 2:
                            if vol_slice < 0.010: TRSH = 0
                            elif vol_slice < 0.035: TRSH = 0.15
                            elif vol_slice < 0.070 and vol_slice >= 0.035: TRSH = 0.60
                            elif vol_slice >= 0.070: TRSH = 0.80
                        elif patch_size[xy] == 4 or patch_size[xy] == 8:
                            if vol_slice < 0.035: TRSH = 0

                        print('DEBUG-Patch: Size - ' + str(patch_size[xy]) + ', slice - ' + str(zz) +
                              ', vol: ' + str(vol_slice) + ', TRSH: ' + str(TRSH))

                        counter_y = int(y_len / patch_size[xy])                                             ## counter_y = 512 if patch of size 1 and image of size 512x512
                        counter_x = int(x_len / patch_size[xy])
                        source_patch_len = counter_x * counter_y                                            ## How many source patches are needed (e.g. for 1, we need one for each pixel)
                        age_values_all = np.zeros(source_patch_len)                                         ## Age Map that will be filled with the actual values

                        valid = 0
                        if ((vol_slice >= 0.008 and vol_slice < 0.035) and (patch_size[xy] == 1 or patch_size[xy] == 2)) or \
                            ((vol_slice >= 0.035 and vol_slice < 0.065) and (patch_size[xy] == 1 or patch_size[xy] == 2 or \
                             patch_size[xy] == 4)) or (vol_slice > 0.065):
                            valid = 1

                            ## Creating grid-patch 'xy-by-xy'
                            #  -- Column
                            y_c = np.ceil(patch_size[xy] / 2)
                            y_c_sources = np.zeros(int(y_len / patch_size[xy]))
                            for iy in range(0, int(y_len / patch_size[xy])):
                                y_c_sources[iy] = (iy * patch_size[xy]) + y_c - 1


                            #  -- Row
                            x_c = np.ceil(patch_size[xy] / 2)
                            x_c_sources = np.zeros(int(x_len / patch_size[xy]))
                            for ix in range(0, int(x_len / patch_size[xy])):
                                x_c_sources[ix] = (ix * patch_size[xy]) + x_c - 1


                            ''' Extracting Source Patches '''
                            area_source_patch = np.zeros([1,patch_size[xy],patch_size[xy]])
                            center_source_patch = np.zeros([1,2])
                            icv_source_flag = np.zeros([source_patch_len])
                            icv_source_flag_valid = np.ones([source_patch_len])
                            index_mapping = np.ones([source_patch_len]) * -1


                            flag = 1
                            index = 0
                            index_source= 0

                            if patch_size[xy] == 1:
                                area_source_patch = brain_slice[mask_slice == 1]
                                area_source_patch = area_source_patch.reshape([area_source_patch.shape[0], 1, 1])
                                index = source_patch_len
                                index_source = area_source_patch.shape[0]
                                icv_source_flag = mask_slice.flatten()
                                positive_indices = (np.where(brain_slice.flatten() > 0))[0]
                                index = 0
                                for i in positive_indices:
                                    index_mapping[i] = index
                                    index += 1

                            else:
                                area_source_patch = []
                                for isc in range(0, counter_x):
                                    for jsc in range(0, counter_y):
                                            icv_source_flag[index] = mask_slice[int(x_c_sources[isc]), int(y_c_sources[jsc])]
                                            if icv_source_flag[index] == 1:
                                                temp = get_area(x_c_sources[isc], y_c_sources[jsc],
                                                                patch_size[xy], patch_size[xy], brain_slice)
                                                area_source_patch.append(temp.tolist())
                                                index_mapping[index] = index_source
                                                index_source += 1

                                            index += 1
                                area_source_patch = np.asarray(area_source_patch)




                            icv_source_flag_valid = icv_source_flag_valid[0:index_source]
                            age_values_valid = np.zeros(index_source)

                            """ TO DELETE, IT'S JUST FOR DISSERTATION
                            for i in range(area_source_patch.shape[2]):
                                plt.imshow(area_source_patch[i]) #Needs to be in row,col order
                                plt.savefig("test.jpg")
                            """


                            ''' Extracting Target Patches '''
                            target_patches = []
                            index_debug = 0
                            random_array = np.random.randint(10, size=(x_len, y_len))
                            index_possible = np.zeros(brain_slice.shape)
                            index_possible[(mask_slice != 0) & (random_array > TRSH*10)] = 1
                            index_possible = np.argwhere(index_possible)


                            for index_chosen in index_possible:
                                x, y = index_chosen
                                area = get_area(x, y, patch_size[xy], patch_size[xy], brain_slice)
                                if area.size == patch_size[xy] * patch_size[xy]:
                                    if np.random.randint(low=1, high=10)/10 < (100/(x*y)) * num_samples:
                                        pass
                                    target_patches.append(area)
                                    index_debug += 1


                            target_patches_np = get_shuffled_patches(target_patches, num_samples)
                            target_patches_np = target_patches_np[0:num_samples,:,:]
                            print('Sampling finished: ' + ' with: ' + str(index_debug) + ' samples from: ' + str(x_len * y_len))
                            area = []

                            ''''''
                            ''' Reshaping array mri_code '''
                            area_source_patch_cuda_all = np.reshape(area_source_patch,(area_source_patch.shape[0],
                                                            area_source_patch.shape[1] * area_source_patch.shape[2]))
                            target_patches_np_cuda_all = np.reshape(target_patches_np, (target_patches_np.shape[0],
                                                            target_patches_np.shape[1] * target_patches_np.shape[2]))

                            #if patch_size[xy] == 2:
                            #    code.interact(local=dict(globals(), **locals()))

                            melvin = timer()
                            source_len = icv_source_flag_valid.shape[0]
                            loop_len = 512 # def: 512
                            loop_num = int(np.ceil(source_len / loop_len))
                            print('\nLoop Information:')
                            print('Total number of source patches: ' + str(source_len))
                            print('Number of voxels processed in one loop: ' + str(loop_len))
                            print('Number of loop needed: ' + str(loop_num))
                            print('Check GPU memory: ' + str(cuda.current_context().get_memory_info()))

                            for il in range(0, loop_num):
                                ''' Debug purposed printing '''
                                print('.', end='')
                                if np.remainder(il+1, 32) == 0:
                                    print(' ' + str(il+1) + '/' + str(loop_num)) # Print newline

                                ''' Only process sub-array '''
                                source_patches_loop = area_source_patch_cuda_all[il*loop_len:(il*loop_len)+loop_len,:]

                                '''  SUBTRACTION '''
                                sub_result_gm = cuda.device_array((source_patches_loop.shape[0],
                                                                   target_patches_np_cuda_all.shape[0],
                                                                   target_patches_np_cuda_all.shape[1]))
                                TPB = (4,256)
                                BPGx = int(math.ceil(source_patches_loop.shape[0] / TPB[0]))
                                BPGy = int(math.ceil(target_patches_np_cuda_all.shape[0] / TPB[1]))
                                BPGxy = (BPGx,BPGy)
                                cu_sub_st[BPGxy,TPB](source_patches_loop, target_patches_np_cuda_all, sub_result_gm)

                                '''  MAX-MEAN-ABS '''
                                sub_max_mean_result = cuda.device_array((source_patches_loop.shape[0],
                                                                         target_patches_np_cuda_all.shape[0],2))
                                cu_max_mean_abs[BPGxy,TPB](sub_result_gm, sub_max_mean_result)
                                sub_result_gm = 0  # Free memory

                                '''  DISTANCE '''
                                distances_result = cuda.device_array((source_patches_loop.shape[0],
                                                                      target_patches_np_cuda_all.shape[0]))
                                cu_distances[BPGxy,TPB](sub_max_mean_result,
                                                        icv_source_flag_valid[il*loop_len:(il*loop_len)+loop_len],
                                                        distances_result, alpha)
                                sub_max_mean_result = 0  # Free memory

                                ''' SORT '''
                                TPB = 256
                                BPG = int(math.ceil(distances_result.shape[0] / TPB))
                                cu_sort_distance[BPG,TPB](distances_result)

                                ''' MEAN (AGE-VALUE) '''
                                idx_start = 8 # Starting index of mean calculation (to avoid bad example)
                                distances_result_for_age = distances_result[:,idx_start:idx_start+num_mean_samples]
                                distances_result = 0  # Free memory
                                cu_age_value[BPG,TPB](distances_result_for_age,
                                                      age_values_valid[il*loop_len:(il*loop_len)+loop_len])
                                distances_result_for_age = 0  # Free memory
                                del source_patches_loop  # Free memory
                                #code.interact(local=dict(globals(), **locals()))
                            print(' - Finished!\n')
                            print(timer() - melvin)
                            raise Exception()
                        ''' Mapping from age_value_valid to age value_all '''
                        if valid == 1:
                            index = 0
                            for idx_val in index_mapping:
                                if idx_val != -1:
                                    age_values_all[index] = age_values_valid[int(idx_val)]
                                index += 1

                        ''' Normalisation to probabilistic map (0...1) '''
                        if (np.max(age_values_all) - np.min(age_values_all)) == 0:
                            all_mean_distance_normed = age_values_all
                        else:
                            all_mean_distance_normed = np.divide((age_values_all - np.min(age_values_all)),
                                (np.max(age_values_all) - np.min(age_values_all)))

                        ''' SAVE Result (JPG) '''
                        slice_age_map = np.zeros([counter_x,counter_y])
                        index = 0
                        for ix in range(0, counter_x):
                            for iy in range(0, counter_y):
                                slice_age_map[ix,iy] = all_mean_distance_normed[index]
                                index += 1

                        ## Save mri_data
                        sio.savemat(dirOutData + '/' + str(patch_size[xy]) + '/' + str(zz) + '_dat.mat',
                                    {'slice_age_map':slice_age_map})

                        print('Check GPU memory: ' + str(cuda.current_context().get_memory_info()))
                        print('GPU flushing..\n--\n')
                        numba.cuda.profile_stop()
                    elapsed_times_patch_all[timer_idx,xy] = timer() - one_patch
                    print('IAM for MRI ID: ' + mri_code + ' with patch size: ' + str(patch_size[xy])
                          + ' elapsed for: ' + str(elapsed_times_patch_all[timer_idx,xy]))

                elapsed_times_all[timer_idx] = timer() - one_mri_data
                print('IAM for MRI ID: ' + mri_code + ' elapsed for: ' + str(elapsed_times_all[timer_idx]))
                timer_idx += 1

                ''' Save all elapsed times '''
                sio.savemat(dirOutput + '/elapsed_times_all_' + str(num_samples) + 's' + str(num_mean_samples) + 'm.mat',
                            {'elapsed_times_all':elapsed_times_all})
                sio.savemat(dirOutput + '/elapsed_times_patch_all_' + str(num_samples) + 's' + str(num_mean_samples) + 'm.mat',
                            {'elapsed_times_patch_all':elapsed_times_patch_all})
                ''' IAM's (GPU Part) Computation ENDS here '''

                '''
                KEY POINT: IAM's Combination, Penalisation, and Post-processing - START
                -----------------------------------------------------------------------
                Part 0 - Saving output results in .mat and JPEG files.
                Part 1 - Combination of multiple age maps.
                Part 2 - Global normalisation and penalisation of age maps based on brain tissues.
                Part 3 - Post-processing.

                Hint: You can search the keys of Part 0/1/2/3.
                '''
                combined_age_map_mri = np.zeros((x_len, y_len, z_len))
                combined_age_map_mri_mult = np.zeros((x_len, y_len, z_len))
                combined_age_map_mri_mult_normed = np.zeros((x_len, y_len, z_len))
                for zz in range(0, mri_data.shape[2]):
                    mri_slice = mri_data[:,:,zz]
                    mask_slice = np.nan_to_num(mri_slice)
                    mask_slice[mask_slice > 0] = 1
                    penalty_slice = np.nan_to_num(mri_slice)   ### PENALTY

                    slice_age_map_all = np.zeros((len(patch_size), x_len, y_len))

                    dirOutData = dirOutput + '/' + mri_code
                    for xy in range(0, len(patch_size)):
                        mat_contents = sio.loadmat(dirOutData + '/' + str(patch_size[xy]) + '/' + str(zz) + '_dat.mat')
                        slice_age_map = mat_contents['slice_age_map']
                        slice_age_map_res = cv2.resize(slice_age_map, None, fx=patch_size[xy],
                                                       fy=patch_size[xy], interpolation=cv2.INTER_CUBIC)
                        slice_age_map_res = skifilters.gaussian(slice_age_map_res,sigma=0.5,truncate=2.0)
                        #if zz== 20:
                        #    code.interact(local=dict(globals(), **locals()))
                        slice_age_map_res = np.multiply(mask_slice, slice_age_map_res)
                        slice_age_map_all[xy,:,:] = slice_age_map_res
                    slice_age_map_all = np.nan_to_num(slice_age_map_all)


                    if save_jpeg:
                        ''' >>> Part 0 <<<'''
                        ''' Show all age maps (one per patch size) and save the figure '''
                        fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
                        fig.set_size_inches(10, 10)
                        fig.suptitle('All Patches Gaussian Filtered', fontsize=16)

                        axes[0,0].set_title('Patch 1 x 1')
                        im1 = axes[0,0].imshow(np.rot90(slice_age_map_all[0,:,:]), cmap="jet", vmin=0, vmax=1)
                        divider1 = make_axes_locatable(axes[0,0])
                        cax1 = divider1.append_axes("right", size="7%", pad=0.05)
                        cbar1 = plt.colorbar(im1, ticks=[0, 0.5, 1], cax=cax1)

                        if len(patch_size) > 1:
                            axes[0,1].set_title('Patch 2 x 2')
                            im2 = axes[0,1].imshow(np.rot90(slice_age_map_all[1,:,:]), cmap="jet", vmin=0, vmax=1)
                            divider2 = make_axes_locatable(axes[0,1])
                            cax2 = divider2.append_axes("right", size="7%", pad=0.05)
                            cbar2 = plt.colorbar(im2, ticks=[0, 0.5, 1], cax=cax2)

                            if len(patch_size) > 2:
                                axes[1,0].set_title('Patch 4 x 4')
                                im3 = axes[1,0].imshow(np.rot90(slice_age_map_all[2,:,:]), cmap="jet", vmin=0, vmax=1)
                                divider3 = make_axes_locatable(axes[1,0])
                                cax3 = divider3.append_axes("right", size="7%", pad=0.05)
                                cbar3 = plt.colorbar(im3, ticks=[0, 0.5, 1], cax=cax3)

                                if len(patch_size) > 3:
                                    axes[1,1].set_title('Patch 8 x 8')
                                    im4 = axes[1,1].imshow(np.rot90(slice_age_map_all[3,:,:]), cmap="jet", vmin=0, vmax=1)
                                    divider4 = make_axes_locatable(axes[1,1])
                                    cax4 = divider4.append_axes("right", size="7%", pad=0.05)
                                    cbar4 = plt.colorbar(im4, ticks=[0, 0.5, 1], cax=cax4)

                        plt.tight_layout()
                        plt.subplots_adjust(top=0.95)

                        ''' >>> Part 0 <<<'''
                        ''' Save mri_data in *_all.jpg '''
                        dirOutData = dirOutput + '/' + mri_code + '/IAM_combined_python/Patch/'
                        fig.savefig(dirOutData + str(zz) + '_all.jpg', dpi=100)
                        print('Saving files: ' + dirOutData + str(zz) + '_all.jpg')
                        plt.close()

                    ''' >>> Part 1 <<< '''
                    ''' Combined all patches age map information '''
                    combined_age_map = 0
                    for bi in range(len(patch_size)):
                        combined_age_map += np.multiply(blending_weights[bi],slice_age_map_all[bi,:,:])
                    combined_age_map_mri[:,:,zz] = combined_age_map

                    ''' Global Normalisation - saving needed mri_data '''
                    combined_age_map_mri_mult[:,:,zz] = np.multiply(np.multiply(combined_age_map, penalty_slice), mask_slice)  ### PENALTY
                    normed_only = np.divide((combined_age_map_mri[:,:,zz] - np.min(combined_age_map_mri[:,:,zz])),\
                                            (np.max(combined_age_map_mri[:,:,zz]) - np.min(combined_age_map_mri[:,:,zz])))
                    normed_mult = np.multiply(np.multiply(normed_only, penalty_slice), mask_slice)  ### PENALTY
                    normed_mult_normed = np.divide((normed_mult - np.min(normed_mult)), \
                                            (np.max(normed_mult) - np.min(normed_mult)))
                    combined_age_map_mri_mult_normed[:,:,zz] = normed_mult_normed

                    ''' Save mri_data in *.mat '''
                    dirOutData = dirOutput + '/' + mri_code + '/IAM_combined_python/Patch/'
                    print('Saving files: ' + dirOutData + 'c' + str(zz) + '_combined.mat\n')
                    sio.savemat(dirOutData + 'c' + str(zz) + '_combined.mat', {'slice_age_map_all':slice_age_map_all,
                                                                'combined_age_map':normed_only,
                                                                'mri_slice_mul_normed':normed_mult_normed,
                                                                'combined_mult':combined_age_map_mri_mult[:,:,zz]})

                ''' >>> Part 2 <<< '''
                ''' Penalty + Global Normalisation (GN) '''
                combined_age_map_mri_normed = np.divide((combined_age_map_mri - np.min(combined_age_map_mri)),\
                                            (np.max(combined_age_map_mri) - np.min(combined_age_map_mri)))
                combined_age_map_mri_mult_normed = np.divide((combined_age_map_mri_mult - np.min(combined_age_map_mri_mult)),\
                                            (np.max(combined_age_map_mri_mult) - np.min(combined_age_map_mri_mult)))

                if save_jpeg:
                    for zz in range(0, mri_data.shape[2]):
                        fig2, axes2 = plt.subplots(1, 3)
                        fig2.set_size_inches(16,5)

                        axes2[0].set_title('Combined and normalised')
                        im1 = axes2[0].imshow(np.rot90(np.nan_to_num(combined_age_map_mri_normed[:,:,zz])), cmap="jet", vmin=0, vmax=1)
                        divider1 = make_axes_locatable(axes2[0])
                        cax1 = divider1.append_axes("right", size="7%", pad=0.05)
                        cbar1 = plt.colorbar(im1, ticks=[0, 0.5, 1], cax=cax1)

                        axes2[1].set_title('Combined, penalised and normalised')
                        im2 = axes2[1].imshow(np.rot90(np.nan_to_num(combined_age_map_mri_mult_normed[:,:,zz])), cmap="jet", vmin=0, vmax=1)
                        divider2 = make_axes_locatable(axes2[1])
                        cax2 = divider2.append_axes("right", size="7%", pad=0.05)
                        cbar2 = plt.colorbar(im2, ticks=[0, 0.5, 1], cax=cax2)

                        axes2[2].set_title('Original MRI slice')
                        im3 = axes2[2].imshow(np.rot90(np.nan_to_num(mri_data[:,:,zz])), cmap="gray")
                        divider3 = make_axes_locatable(axes2[2])
                        cax3 = divider3.append_axes("right", size="7%", pad=0.05)
                        cbar3 = plt.colorbar(im3, cax=cax3)

                        plt.tight_layout()
                        # Make space for title
                        plt.subplots_adjust(top=0.95)

                        ''' Save mri_data in *_combined.jpg '''
                        dirOutData = dirOutput + '/' + mri_code + '/IAM_combined_python/Combined/'
                        fig2.savefig(dirOutData + str(zz) + '_combined.jpg', dpi=100)
                        print('Saving files: ' + dirOutData + str(zz) + '_combined.jpg')
                        plt.close()

                ''' Save mri_data in *.mat '''
                sio.savemat(dirOutDataCom + '/all_slice_dat.mat', {'combined_age_map_all_slice':combined_age_map_mri,
                                                   'mri_slice_mul_all_slice':combined_age_map_mri_mult,
                                                   'combined_age_map_mri_normed':combined_age_map_mri_normed,
                                                   'combined_age_map_mri_mult_normed':combined_age_map_mri_mult_normed})

                '''
                combined_age_map_mri_img = nib.Nifti1Image(combined_age_map_mri_normed, mri_nii.affine)
                nib.save(combined_age_map_mri_img, str(dirOutDataFin + '/IAM_GPU_COMBINED.nii.gz'))

                combined_age_map_mri_GN_img = nib.Nifti1Image(combined_age_map_mri_mult_normed, mri_nii.affine)
                nib.save(combined_age_map_mri_GN_img, str(dirOutDataFin + '/IAM_GPU_GN.nii.gz'))
                '''

                ''' >>> Part 3 <<< '''
                ''' Post-processing '''
                ''' COMMENTED OUT BECAUSE NOT AVAILABLE
                if nawm_available and ~nawm_preprocessing:
                    combined_age_map_mri_mult_normed = np.multiply(combined_age_map_mri_mult_normed,nawm_mri_code)
                    combined_age_map_mri_GN_img = nib.Nifti1Image(combined_age_map_mri_mult_normed, mri_nii.affine)
                    nib.save(combined_age_map_mri_GN_img, str(dirOutDataFin + '/IAM_GPU_GN_postprocessed.nii.gz'))
                '''
                '''
                ---------------------------------------------------------------------
                KEY POINT: IAM's Combination, Penalisation, and Post-processing - END
                '''

                if delete_intermediary:
                    shutil.rmtree(dirOutDataCom, ignore_errors=True)
                    for xy in range(0, len(patch_size)):
                        shutil.rmtree(dirOutput + '/' + mri_code + '/' + str(patch_size[xy]), ignore_errors=True)

                del temp
                del center_source_patch, icv_source_flag
                del icv_source_flag_valid, index_mapping
                del area_source_patch, target_patches_np   # Free memory
                del area_source_patch_cuda_all, target_patches_np_cuda_all   # Free memory
                gc.collect()

        ## Print the elapsed time information
        print('\n--\nSpeed statistics of this run..')
        print('mean elapsed time   : ' + str(np.mean(elapsed_times_all)) + ' seconds')
        print('std elapsed time    : ' + str(np.std(elapsed_times_all)) + ' seconds')
        print('median elapsed time : ' + str(np.median(elapsed_times_all)) + ' seconds')
        print('min elapsed time    : ' + str(np.min(elapsed_times_all)) + ' seconds')
        print('max elapsed time    : ' + str(np.max(elapsed_times_all)) + ' seconds')
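The per-patch loop above checks cuda.current_context().get_memory_info() and then flushes the GPU. A minimal sketch of that check-and-flush step as a reusable helper, assuming only Numba's public CUDA API:

from numba import cuda

def report_and_flush_gpu(tag=''):
    """Print free/total device memory and flush pending deallocations (sketch)."""
    ctx = cuda.current_context()
    free, total = ctx.get_memory_info()   # bytes free / total on the current device
    print('[' + tag + '] GPU memory free/total: ' + str(free) + ' / ' + str(total))
    ctx.deallocations.clear()             # release buffers queued for deallocation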
Example #39
0
def gpu_stump(T_A, m, T_B=None, ignore_trivial=True, device_id=0):
    """
    Compute the matrix profile with GPU-STOMP

    This is a convenience wrapper around the Numba `cuda.jit` `_gpu_stump` function
    which computes the matrix profile according to GPU-STOMP.

    Parameters
    ----------
    T_A : ndarray
        The time series or sequence for which to compute the matrix profile

    m : int
        Window size

    T_B : (optional) ndarray
        The time series or sequence that contains your query subsequences
        of interest. Default is `None` which corresponds to a self-join.

    ignore_trivial : bool
        Set to `True` if this is a self-join. Otherwise, for AB-join, set this
        to `False`. Default is `True`.

    device_id : int or list
        The (GPU) device number to use. The default value is `0`. A list of
        valid device ids (int) may also be provided for parallel GPU-STUMP
        computation. A list of all valid device ids can be obtained by
        executing `[device.id for device in cuda.list_devices()]`.

    Returns
    -------
    out : ndarray
        The first column consists of the matrix profile, the second column
        consists of the matrix profile indices, the third column consists of
        the left matrix profile indices, and the fourth column consists of
        the right matrix profile indices.

    Notes
    -----
    `DOI: 10.1109/ICDM.2016.0085 \
    <https://www.cs.ucr.edu/~eamonn/STOMP_GPU_final_submission_camera_ready.pdf>`__

    See Table II, Figure 5, and Figure 6

    The time series, T_B, will be annotated with the distance location
    (or index) of all its subsequences in another time series, T_A.

    Return: For every subsequence, Q, in T_B, you will get a distance
    and index for the closest subsequence in T_A. Thus, the array
    returned will have length T_B.shape[0]-m+1. Additionally, the
    left and right matrix profiles are also returned.

    Note: Unlike in the Table II where T_A.shape is expected to be equal
    to T_B.shape, this implementation is generalized so that the shapes of
    T_A and T_B can be different. In the case where T_A.shape == T_B.shape,
    then our algorithm reduces down to the same algorithm found in Table II.

    Additionally, unlike STAMP where the exclusion zone is m/2, the default
    exclusion zone for STOMP is m/4 (See Definition 3 and Figure 3).

    For self-joins, set `ignore_trivial = True` in order to avoid the
    trivial match.

    Note that left and right matrix profiles are only available for self-joins.
    """
    if T_B is None:  # Self join!
        T_B = T_A
        ignore_trivial = True

    # Swap T_A and T_B for GPU implementation
    # This keeps the API identical to and compatible with `stumpy.stump`
    tmp_T = T_A
    T_A = T_B
    T_B = tmp_T

    T_A, M_T, Σ_T = core.preprocess(T_A, m)
    T_B, μ_Q, σ_Q = core.preprocess(T_B, m)

    if T_A.ndim != 1:  # pragma: no cover
        raise ValueError(
            f"T_A is {T_A.ndim}-dimensional and must be 1-dimensional. "
            "For multidimensional STUMP use `stumpy.mstump` or `stumpy.mstumped`"
        )

    if T_B.ndim != 1:  # pragma: no cover
        raise ValueError(
            f"T_B is {T_B.ndim}-dimensional and must be 1-dimensional. "
            "For multidimensional STUMP use `stumpy.mstump` or `stumpy.mstumped`"
        )

    core.check_dtype(T_A)
    core.check_dtype(T_B)

    core.check_window_size(m)

    if ignore_trivial is False and core.are_arrays_equal(
            T_A, T_B):  # pragma: no cover
        logger.warning("Arrays T_A, T_B are equal, which implies a self-join.")
        logger.warning("Try setting `ignore_trivial = True`.")

    if ignore_trivial and core.are_arrays_equal(
            T_A, T_B) is False:  # pragma: no cover
        logger.warning(
            "Arrays T_A, T_B are not equal, which implies an AB-join.")
        logger.warning("Try setting `ignore_trivial = False`.")

    n = T_B.shape[0]
    k = T_A.shape[0] - m + 1
    l = n - m + 1
    excl_zone = int(np.ceil(m / 4))  # See Definition 3 and Figure 3

    T_A_fname = core.array_to_temp_file(T_A)
    T_B_fname = core.array_to_temp_file(T_B)
    M_T_fname = core.array_to_temp_file(M_T)
    Σ_T_fname = core.array_to_temp_file(Σ_T)
    μ_Q_fname = core.array_to_temp_file(μ_Q)
    σ_Q_fname = core.array_to_temp_file(σ_Q)

    out = np.empty((k, 4), dtype=object)

    if isinstance(device_id, int):
        device_ids = [device_id]
    else:
        device_ids = device_id

    profile = [None] * len(device_ids)
    indices = [None] * len(device_ids)

    for _id in device_ids:
        with cuda.gpus[_id]:
            if (cuda.current_context().__class__.__name__ !=
                    "FakeCUDAContext"):  # pragma: no cover
                cuda.current_context().deallocations.clear()

    step = 1 + l // len(device_ids)

    # Start process pool for multi-GPU request
    if len(device_ids) > 1:  # pragma: no cover
        mp.set_start_method("spawn", force=True)
        p = mp.Pool(processes=len(device_ids))
        results = [None] * len(device_ids)

    QT_fnames = []
    QT_first_fnames = []

    for idx, start in enumerate(range(0, l, step)):
        stop = min(l, start + step)

        QT, QT_first = _get_QT(start, T_A, T_B, m)
        QT_fname = core.array_to_temp_file(QT)
        QT_first_fname = core.array_to_temp_file(QT_first)
        QT_fnames.append(QT_fname)
        QT_first_fnames.append(QT_first_fname)

        if len(device_ids
               ) > 1 and idx < len(device_ids) - 1:  # pragma: no cover
            # Spawn and execute in child process for multi-GPU request
            results[idx] = p.apply_async(
                _gpu_stump,
                (
                    T_A_fname,
                    T_B_fname,
                    m,
                    stop,
                    excl_zone,
                    M_T_fname,
                    Σ_T_fname,
                    QT_fname,
                    QT_first_fname,
                    μ_Q_fname,
                    σ_Q_fname,
                    k,
                    ignore_trivial,
                    start + 1,
                    device_ids[idx],
                ),
            )
        else:
            # Execute last chunk in parent process
            # Only parent process is executed when a single GPU is requested
            profile[idx], indices[idx] = _gpu_stump(
                T_A_fname,
                T_B_fname,
                m,
                stop,
                excl_zone,
                M_T_fname,
                Σ_T_fname,
                QT_fname,
                QT_first_fname,
                μ_Q_fname,
                σ_Q_fname,
                k,
                ignore_trivial,
                start + 1,
                device_ids[idx],
            )

    # Clean up process pool for multi-GPU request
    if len(device_ids) > 1:  # pragma: no cover
        p.close()
        p.join()

        # Collect results from spawned child processes if they exist
        for idx, result in enumerate(results):
            if result is not None:
                profile[idx], indices[idx] = result.get()

    os.remove(T_A_fname)
    os.remove(T_B_fname)
    os.remove(M_T_fname)
    os.remove(Σ_T_fname)
    os.remove(μ_Q_fname)
    os.remove(σ_Q_fname)
    for QT_fname in QT_fnames:
        os.remove(QT_fname)
    for QT_first_fname in QT_first_fnames:
        os.remove(QT_first_fname)

    for idx in range(len(device_ids)):
        profile_fname = profile[idx]
        indices_fname = indices[idx]
        profile[idx] = np.load(profile_fname, allow_pickle=False)
        indices[idx] = np.load(indices_fname, allow_pickle=False)
        os.remove(profile_fname)
        os.remove(indices_fname)

    for i in range(1, len(device_ids)):
        # Update all matrix profiles and matrix profile indices
        # (global, left, right) and store in profile[0] and indices[0]
        for col in range(profile[0].shape[1]):  # pragma: no cover
            cond = profile[0][:, col] < profile[i][:, col]
            profile[0][:, col] = np.where(cond, profile[0][:, col],
                                          profile[i][:, col])
            indices[0][:, col] = np.where(cond, indices[0][:, col],
                                          indices[i][:, col])

    out[:, 0] = profile[0][:, 0]
    out[:, 1:4] = indices[0][:, :]

    threshold = 10e-6
    if core.are_distances_too_small(out[:, 0],
                                    threshold=threshold):  # pragma: no cover
        logger.warning(
            f"A large number of values are smaller than {threshold}.")
        logger.warning("For a self-join, try setting `ignore_trivial = True`.")

    return out
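A hypothetical self-join call of the wrapper above, with made-up data and assuming a CUDA-capable GPU plus the stumpy internals (`core`, `_gpu_stump`, `_get_QT`) imported alongside it:

import numpy as np

T = np.random.rand(64)     # made-up 1-D time series (float64)
mp = gpu_stump(T, m=8)     # T_B=None, so this is a self-join
print(mp[:, 0])            # matrix profile distances
print(mp[:, 1])            # matrix profile indices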
Example #40
0
 def cc(self):
     return cuda.current_context().device.compute_capability
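Outside of a class, the same information can be read directly; a small sketch (the compute capability is a (major, minor) tuple):

from numba import cuda

major, minor = cuda.current_context().device.compute_capability
print('compute capability: sm_' + str(major) + str(minor))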
Example #41
0
 def test_stream(self):
     ctx = cuda.current_context()
     stream = ctx.create_stream()
     with self.check_ignored_exception(ctx):
         del stream
Example #42
0
def gpu_aamp(T_A, m, T_B=None, ignore_trivial=True, device_id=0):
    """
    Compute the non-normalized (i.e., without z-normalization) matrix profile with one
    or more GPU devices

    This is a convenience wrapper around the Numba `cuda.jit` `_gpu_aamp` function
    which computes the non-normalized matrix profile according to a modified
    version of GPU-STOMP.

    Parameters
    ----------
    T_A : ndarray
        The time series or sequence for which to compute the matrix profile

    m : int
        Window size

    T_B : ndarray, default None
        The time series or sequence that contains your query subsequences
        of interest. Default is `None` which corresponds to a self-join.

    ignore_trivial : bool, default True
        Set to `True` if this is a self-join. Otherwise, for AB-join, set this
        to `False`. Default is `True`.

    device_id : int or list, default 0
        The (GPU) device number to use. The default value is `0`. A list of
        valid device ids (int) may also be provided for parallel GPU-STUMP
        computation. A list of all valid device ids can be obtained by
        executing `[device.id for device in numba.cuda.list_devices()]`.

    Returns
    -------
    out : ndarray
        The first column consists of the matrix profile, the second column
        consists of the matrix profile indices, the third column consists of
        the left matrix profile indices, and the fourth column consists of
        the right matrix profile indices.

    Notes
    -----
    `arXiv:1901.05708 \
    <https://arxiv.org/pdf/1901.05708.pdf>`__

    See Algorithm 1

    Note that we have extended this algorithm for AB-joins as well.

    `DOI: 10.1109/ICDM.2016.0085 \
    <https://www.cs.ucr.edu/~eamonn/STOMP_GPU_final_submission_camera_ready.pdf>`__

    See Table II, Figure 5, and Figure 6
    """
    if T_B is None:  # Self join!
        T_B = T_A
        ignore_trivial = True

    T_A, T_A_subseq_isfinite = core.preprocess_non_normalized(T_A, m)
    T_B, T_B_subseq_isfinite = core.preprocess_non_normalized(T_B, m)

    T_A_subseq_squared = np.sum(core.rolling_window(T_A * T_A, m), axis=1)
    T_B_subseq_squared = np.sum(core.rolling_window(T_B * T_B, m), axis=1)

    if T_A.ndim != 1:  # pragma: no cover
        raise ValueError(
            f"T_A is {T_A.ndim}-dimensional and must be 1-dimensional. "
            "For multidimensional STUMP use `stumpy.mstump` or `stumpy.mstumped`"
        )

    if T_B.ndim != 1:  # pragma: no cover
        raise ValueError(
            f"T_B is {T_B.ndim}-dimensional and must be 1-dimensional. "
            "For multidimensional STUMP use `stumpy.mstump` or `stumpy.mstumped`"
        )

    core.check_window_size(m, max_size=min(T_A.shape[0], T_B.shape[0]))

    if ignore_trivial is False and core.are_arrays_equal(
            T_A, T_B):  # pragma: no cover
        logger.warning("Arrays T_A, T_B are equal, which implies a self-join.")
        logger.warning("Try setting `ignore_trivial = True`.")

    if ignore_trivial and core.are_arrays_equal(
            T_A, T_B) is False:  # pragma: no cover
        logger.warning(
            "Arrays T_A, T_B are not equal, which implies an AB-join.")
        logger.warning("Try setting `ignore_trivial = False`.")

    n = T_B.shape[0]
    k = T_A.shape[0] - m + 1
    l = n - m + 1
    excl_zone = int(np.ceil(m / 4))  # See Definition 3 and Figure 3

    T_A_fname = core.array_to_temp_file(T_A)
    T_B_fname = core.array_to_temp_file(T_B)
    T_A_subseq_isfinite_fname = core.array_to_temp_file(T_A_subseq_isfinite)
    T_B_subseq_isfinite_fname = core.array_to_temp_file(T_B_subseq_isfinite)
    T_A_subseq_squared_fname = core.array_to_temp_file(T_A_subseq_squared)
    T_B_subseq_squared_fname = core.array_to_temp_file(T_B_subseq_squared)

    out = np.empty((k, 4), dtype=object)

    if isinstance(device_id, int):
        device_ids = [device_id]
    else:
        device_ids = device_id

    profile = [None] * len(device_ids)
    indices = [None] * len(device_ids)

    for _id in device_ids:
        with cuda.gpus[_id]:
            if (cuda.current_context().__class__.__name__ !=
                    "FakeCUDAContext"):  # pragma: no cover
                cuda.current_context().deallocations.clear()

    step = 1 + l // len(device_ids)

    # Start process pool for multi-GPU request
    if len(device_ids) > 1:  # pragma: no cover
        mp.set_start_method("spawn", force=True)
        p = mp.Pool(processes=len(device_ids))
        results = [None] * len(device_ids)

    QT_fnames = []
    QT_first_fnames = []

    for idx, start in enumerate(range(0, l, step)):
        stop = min(l, start + step)

        QT, QT_first = core._get_QT(start, T_A, T_B, m)
        QT_fname = core.array_to_temp_file(QT)
        QT_first_fname = core.array_to_temp_file(QT_first)
        QT_fnames.append(QT_fname)
        QT_first_fnames.append(QT_first_fname)

        if len(device_ids
               ) > 1 and idx < len(device_ids) - 1:  # pragma: no cover
            # Spawn and execute in child process for multi-GPU request
            results[idx] = p.apply_async(
                _gpu_aamp,
                (
                    T_A_fname,
                    T_B_fname,
                    m,
                    stop,
                    excl_zone,
                    T_A_subseq_isfinite_fname,
                    T_B_subseq_isfinite_fname,
                    T_A_subseq_squared_fname,
                    T_B_subseq_squared_fname,
                    QT_fname,
                    QT_first_fname,
                    k,
                    ignore_trivial,
                    start + 1,
                    device_ids[idx],
                ),
            )
        else:
            # Execute last chunk in parent process
            # Only parent process is executed when a single GPU is requested
            profile[idx], indices[idx] = _gpu_aamp(
                T_A_fname,
                T_B_fname,
                m,
                stop,
                excl_zone,
                T_A_subseq_isfinite_fname,
                T_B_subseq_isfinite_fname,
                T_A_subseq_squared_fname,
                T_B_subseq_squared_fname,
                QT_fname,
                QT_first_fname,
                k,
                ignore_trivial,
                start + 1,
                device_ids[idx],
            )

    # Clean up process pool for multi-GPU request
    if len(device_ids) > 1:  # pragma: no cover
        p.close()
        p.join()

        # Collect results from spawned child processes if they exist
        for idx, result in enumerate(results):
            if result is not None:
                profile[idx], indices[idx] = result.get()

    os.remove(T_A_fname)
    os.remove(T_B_fname)
    os.remove(T_A_subseq_isfinite_fname)
    os.remove(T_B_subseq_isfinite_fname)
    os.remove(T_A_subseq_squared_fname)
    os.remove(T_B_subseq_squared_fname)
    for QT_fname in QT_fnames:
        os.remove(QT_fname)
    for QT_first_fname in QT_first_fnames:
        os.remove(QT_first_fname)

    for idx in range(len(device_ids)):
        profile_fname = profile[idx]
        indices_fname = indices[idx]
        profile[idx] = np.load(profile_fname, allow_pickle=False)
        indices[idx] = np.load(indices_fname, allow_pickle=False)
        os.remove(profile_fname)
        os.remove(indices_fname)

    for i in range(1, len(device_ids)):
        # Update all matrix profiles and matrix profile indices
        # (global, left, right) and store in profile[0] and indices[0]
        for col in range(profile[0].shape[1]):  # pragma: no cover
            cond = profile[0][:, col] < profile[i][:, col]
            profile[0][:, col] = np.where(cond, profile[0][:, col],
                                          profile[i][:, col])
            indices[0][:, col] = np.where(cond, indices[0][:, col],
                                          indices[i][:, col])

    out[:, 0] = profile[0][:, 0]
    out[:, 1:4] = indices[0][:, :]

    threshold = 10e-6
    if core.are_distances_too_small(out[:, 0],
                                    threshold=threshold):  # pragma: no cover
        logger.warning(
            f"A large number of values are smaller than {threshold}.")
        logger.warning("For a self-join, try setting `ignore_trivial = True`.")

    return out
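A hypothetical AB-join call of gpu_aamp, with made-up arrays and assuming a CUDA device plus the stumpy internals referenced above:

import numpy as np

T_A = np.random.rand(128)   # made-up target series
T_B = np.random.rand(64)    # made-up query series
out = gpu_aamp(T_A, m=16, T_B=T_B, ignore_trivial=False)
print(out[:, 0])            # non-normalized matrix profile; remaining columns are indices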
Example #43
0
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

# Helper libraries
import numpy as np
import matplotlib.pyplot as plt
from time import time

print(tf.__version__)


#### USE CPU if you wish etc. 

#python -m pip install numba
from numba import cuda 
device = cuda.get_current_device()
device.reset()
cuda.current_context().trashing.clear()  # note: recent Numba versions expose the pending-deallocation list as ctx.deallocations

cpu = False  # set to True to force TensorFlow onto the CPU

if cpu:
  # my_devices = tf.config.experimental.list_physical_devices(device_type='CPU')
  # tf.config.experimental.set_visible_devices(devices=my_devices, device_type='CPU')
  # For anyone using tf 2.1, the commented-out approach above does not seem to work.
  print("Using CPU!")
  tf.config.set_visible_devices([], 'GPU')

tf.debugging.set_log_device_placement(True)


fashion_mnist = keras.datasets.fashion_mnist
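As a quick sanity check (a sketch against the public tf.config API), one can confirm that no GPU remains visible to TensorFlow after hiding it above:

# Expect an empty list when cpu is True, i.e. the GPU has been hidden from TensorFlow.
print(tf.config.get_visible_devices('GPU'))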
Example #44
0
 def test_event(self):
     ctx = cuda.current_context()
     event = ctx.create_event()
     with self.check_ignored_exception(ctx):
         del event
Example #45
0
 def test_initialized_in_context(self):
     # If we have a CUDA context, it should already have initialized its
     # memory manager.
     self.assertTrue(cuda.current_context().memory_manager.initialized)
Example #46
0
def gpu_stump(
    T_A,
    m,
    T_B=None,
    ignore_trivial=True,
    threads_per_block=THREADS_PER_BLOCK,
    device_id=0,
):
    """
    Compute the matrix profile with GPU-STOMP

    This is a convenience wrapper around the Numba `cuda.jit` `_gpu_stump` function
    which computes the matrix profile according to GPU-STOMP.

    Parameters
    ----------
    T_A : ndarray
        The time series or sequence for which to compute the matrix profile
    m : int
        Window size
    T_B : ndarray
        The time series or sequence that contains your query subsequences
        of interest. Default is `None` which corresponds to a self-join.

    ignore_trivial : bool
        Set to `True` if this is a self-join. Otherwise, for AB-join, set this
        to `False`. Default is `True`.

    threads_per_block : int
        The number of GPU threads to use for all kernels. The default value is
        set in `THREADS_PER_BLOCK=512`.

    device_id : int or list
        The (GPU) device number to use. The default value is `0`. A list of
        valid device ids (int) may also be provided for parallel GPU-STUMP
        computation. A list of all valid device ids can be obtained by
        executing `[device.id for device in cuda.list_devices()]`.

    Returns
    -------
    out : ndarray
        The first column consists of the matrix profile, the second column
        consists of the matrix profile indices, the third column consists of
        the left matrix profile indices, and the fourth column consists of
        the right matrix profile indices.

    Notes
    -----

    `DOI: 10.1109/ICDM.2016.0085 \
    <https://www.cs.ucr.edu/~eamonn/STOMP_GPU_final_submission_camera_ready.pdf>`__

    See Table II, Figure 5, and Figure 6

    The time series, T_B, will be annotated with the distance location
    (or index) of all its subsequences in another time series, T_A.

    Return: For every subsequence, Q, in T_B, you will get a distance
    and index for the closest subsequence in T_A. Thus, the array
    returned will have length T_B.shape[0]-m+1. Additionally, the
    left and right matrix profiles are also returned.

    Note: Unlike in the Table II where T_A.shape is expected to be equal
    to T_B.shape, this implementation is generalized so that the shapes of
    T_A and T_B can be different. In the case where T_A.shape == T_B.shape,
    then our algorithm reduces down to the same algorithm found in Table II.

    Additionally, unlike STAMP where the exclusion zone is m/2, the default
    exclusion zone for STOMP is m/4 (See Definition 3 and Figure 3).

    For self-joins, set `ignore_trivial = True` in order to avoid the
    trivial match.

    Note that left and right matrix profiles are only available for self-joins.
    """

    T_A = np.asarray(T_A)
    core.check_dtype(T_A)
    core.check_nan(T_A)
    if T_B is None:  # Self join!
        T_B = T_A
        ignore_trivial = True
    T_B = np.asarray(T_B)

    core.check_dtype(T_B)
    core.check_nan(T_B)
    core.check_window_size(m)

    if ignore_trivial is False and core.are_arrays_equal(
            T_A, T_B):  # pragma: no cover
        logger.warning("Arrays T_A, T_B are equal, which implies a self-join.")
        logger.warning("Try setting `ignore_trivial = True`.")

    if ignore_trivial and core.are_arrays_equal(
            T_A, T_B) is False:  # pragma: no cover
        logger.warning(
            "Arrays T_A, T_B are not equal, which implies an AB-join.")
        logger.warning("Try setting `ignore_trivial = False`.")

    # Swap T_A and T_B for GPU implementation
    # This keeps the API identical to and compatible with `stumpy.stump`
    tmp_T = T_A
    T_A = T_B
    T_B = tmp_T

    n = T_B.shape[0]
    k = T_A.shape[0] - m + 1
    l = n - m + 1
    excl_zone = int(np.ceil(m / 4))  # See Definition 3 and Figure 3

    M_T, Σ_T = core.compute_mean_std(T_A, m)
    μ_Q, σ_Q = core.compute_mean_std(T_B, m)

    out = np.empty((k, 4), dtype=object)

    if isinstance(device_id, int):
        device_ids = [device_id]
    else:
        device_ids = device_id

    profile = [None] * len(device_ids)
    indices = [None] * len(device_ids)

    for _id in device_ids:
        cuda.select_device(_id)
        if (cuda.current_context().__class__.__name__ !=
                "FakeCUDAContext"):  # pragma: no cover
            cuda.current_context().deallocations.clear()

    step = 1 + l // len(device_ids)

    for idx, start in enumerate(range(0, l, step)):
        stop = min(l, start + step)

        QT, QT_first = _get_QT(start, T_A, T_B, m)
        profile[idx], indices[idx] = _gpu_stump(
            T_A,
            T_B,
            m,
            stop,
            excl_zone,
            M_T,
            Σ_T,
            QT,
            QT_first,
            μ_Q,
            σ_Q,
            k,
            ignore_trivial,
            start + 1,
            threads_per_block,
            device_ids[idx],
        )

    for i in range(1, len(device_ids)):
        # Update all matrix profiles and matrix profile indices
        # (global, left, right) and store in profile[0] and indices[0]
        for col in range(profile[0].shape[1]):  # pragma: no cover
            cond = profile[0][:, col] < profile[i][:, col]
            profile[0][:, col] = np.where(cond, profile[0][:, col],
                                          profile[i][:, col])
            indices[0][:, col] = np.where(cond, indices[0][:, col],
                                          indices[i][:, col])

    out[:, 0] = profile[0][:, 0]
    out[:, 1:4] = indices[0][:, :]

    threshold = 10e-6
    if core.are_distances_too_small(out[:, 0],
                                    threshold=threshold):  # pragma: no cover
        logger.warning(
            f"A large number of values are smaller than {threshold}.")
        logger.warning("For a self-join, try setting `ignore_trivial = True`.")

    return out
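Per the docstring, device_id may also be a list of GPU ids; a hypothetical multi-GPU call with made-up data, assuming the listed devices exist:

import numpy as np
from numba import cuda

T = np.random.rand(256)                                  # made-up time series
all_ids = [device.id for device in cuda.list_devices()]  # every visible GPU
out = gpu_stump(T, m=16, device_id=all_ids)              # one chunk of the profile per device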
Example #47
0
__author__ = 'christopher'
if __name__ == '__main__':
    from mpi4py import MPI
    from numba import cuda

    comm = MPI.Comm.Get_parent()
    rank = comm.Get_rank()
    meminfo = int(cuda.current_context().get_memory_info()[0])
    cuda.close()

    comm.gather(sendobj=meminfo, root=0)
    comm.Disconnect()
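The worker above expects to be launched via MPI spawn; a hypothetical parent-side driver (the worker file name and process count are assumptions) could look like:

import sys
from mpi4py import MPI

n_workers = 2   # assumed number of GPU workers to spawn
comm = MPI.COMM_SELF.Spawn(sys.executable,
                           args=['gpu_meminfo_worker.py'],   # hypothetical path to the worker script
                           maxprocs=n_workers)
free_bytes = comm.gather(sendobj=None, root=MPI.ROOT)        # matches the child's gather(..., root=0)
comm.Disconnect()
print(free_bytes)   # free GPU memory (bytes) reported by each worker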
Example #48
0
import time as tim

import numpy as np
import scipy.io as sci
from numba import cuda
from numba.cuda.random import create_xoroshiro128p_states

g_reg = cuda.to_device(regions,stream=stream)
g_ste = cuda.to_device(ste,stream=stream)
rng_states = create_xoroshiro128p_states(13456, seed=int(tim.time()))
g_steps = cuda.to_device(steps,stream=stream)
g_del = cuda.to_device(tt,stream=stream)
ranwalk = np.empty((10000,116,116), dtype=g_steps.dtype)
delay = np.empty((10000,116,116), dtype=g_steps.dtype)
output = np.empty(shape=g_steps.shape, dtype=g_steps.dtype)
delay_o = np.empty(shape=g_steps.shape, dtype=g_steps.dtype)

for i in range(10000):
   
    random_walk[116, 116](g_prob, g_time, g_reg, g_steps, g_del, g_ste, rng_states)
    print(i)    
    #print("g_steps size:", g_steps.size, " output size: ", output.size)
    cuda.current_context().synchronize()   # wait for the kernel launch above to finish
    #time.sleep(0.7)
    #print("synchronized")
    output = g_steps.copy_to_host(stream=stream)
    delay_o = g_del.copy_to_host(stream=stream)
    stream.synchronize()   # ensure the asynchronous copies have completed before reading the host arrays
    #print("ovde zaglaviv")
    sd = output
    sc = delay_o
    ranwalk[i,:,:] = sd.reshape((116,116))
    delay[i,:,:] = sc.reshape((116,116))
    del rng_states
    #del g_steps
    rng_states = create_xoroshiro128p_states(116*116, seed=np.uint64(tim.time()))
    #g_steps = cuda.to_device(steps)
    
sci.savemat(name +'_randomwalk_steps.mat', {'ranwalk':ranwalk})