Example #1
    def __init__(self, plink_file, scratch_dir, overwrite=False):
        self.options = tf.python_io.TFRecordOptions(
            tf.python_io.TFRecordCompressionType.ZLIB)
        self.plink_file = plink_file
        self.scratch_dir = scratch_dir

        # read plink data
        print('\nReading PLINK data...')
        self.bim, self.fam, G = read_plink(plink_file)
        print('Done')

        # write tf.records
        if overwrite:
            G_df = dd.from_dask_array(da.transpose(G))
            G_df = G_df.fillna(value=1)  # replace missing genotypes with 1
            G_df = G_df.astype(np.int8)
            tf_records_filenames = G_df.apply(self._write_records,
                                              axis=1).compute()
            print('Done')
        else:
            root, dirs, files = next(os.walk(scratch_dir))
            tf_records_filenames = [
                os.path.join(root, f) for f in files if f.endswith('.tfrecords')
            ]

        # split into training and test batches
        self.train_files, self.test_files = train_test_split(
            tf_records_filenames, test_size=0.20, random_state=42)
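The _write_records helper applied row-wise above is not part of this snippet. A minimal sketch of what it could look like, assuming the same TF1 tf.python_io API and that each row is a pandas Series of int8 genotypes whose .name is the sample index (both assumptions, not taken from the source):

    def _write_records(self, row):
        # Hypothetical helper: serialize one sample's genotypes to its own TFRecord file.
        path = os.path.join(self.scratch_dir, '{}.tfrecords'.format(row.name))
        with tf.python_io.TFRecordWriter(path, options=self.options) as writer:
            example = tf.train.Example(features=tf.train.Features(feature={
                'genotypes': tf.train.Feature(
                    int64_list=tf.train.Int64List(value=row.values.tolist()))
            }))
            writer.write(example.SerializeToString())
        return path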
Example #2
    def __data_generation(self, indexes):
        'Generates data containing batch_size samples'  # X : (n_samples, *dim, n_channels)

        # Fetch features and normalize them
        Xnorm = da.from_zarr(self.features)[indexes, ]
        Xnorm = (Xnorm - self.norm_data['mean']) / self.norm_data['std']

        # Transpose if needed
        if self.transpose:
            Xnorm = da.transpose(Xnorm, axes=[0, 2, 1])

        # Generate data
        X = da.reshape(Xnorm, [len(indexes), *self.dim]).compute()
        y = self.labels[indexes].copy()

        # Return X,y pairs now (without mixup)
        if not self.use_mixup or (self.set_type != 'train'):
            return X, self.to_categorical(y, num_classes=self.num_classes)

        # Mixup
        mixed_x, y_a, y_b, lamb = self.mixup_batch(X,
                                                   y,
                                                   alpha=self.mixup_alpha)
        batch_data_in = mixed_x  # X_mixup
        y_a = self.to_categorical(y_a, num_classes=self.num_classes)
        y_b = self.to_categorical(y_b, num_classes=self.num_classes)
        batch_data_out = lamb * y_a + (1 - lamb) * y_b  # y_mixup

        return batch_data_in, batch_data_out
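The mixup_batch helper called above is not shown. A minimal sketch of a standard mixup implementation matching that call signature (only the name and arguments come from the snippet; the body is an assumption, and np is assumed to be imported):

    def mixup_batch(self, x, y, alpha=0.2):
        # Sample a mixing coefficient and a random permutation of the batch,
        # then blend each sample with its permuted partner.
        lamb = np.random.beta(alpha, alpha) if alpha > 0 else 1.0
        index = np.random.permutation(x.shape[0])
        mixed_x = lamb * x + (1.0 - lamb) * x[index]
        return mixed_x, y, y[index], lamb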
Example #3
def cal(x, client):
    st = time.time()

    # Distributed scheduler
    # with dask.set_options(get=dask.threaded.get):
    with dask.set_options(get=client.get):
        A = da.transpose(x)
        B = da.dot(x, A)
        C = da.dot(B, B)

        print(C.compute())

    # Default scheduler
    # with dask.set_options(get=dask.threaded.get):
    #     A = da.transpose(x)
    #     B = da.dot(x, A)
    #     C = da.dot(B, B)
    #     print(C.compute())

    # Manually set a global thread pool
    # from multiprocessing.pool import ThreadPool
    # with dask.set_options(pool=ThreadPool(4)):
    #     A = da.transpose(x)
    #     B = da.dot(x, A)
    #     C = da.dot(B, B)
    #     print(C.compute(num_workers=4))

    print('time: ', time.time() - st)
    return 0
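dask.set_options(get=...) has been removed from recent Dask releases; scheduler selection now goes through dask.config.set or an active distributed.Client. A rough modern sketch of the same benchmark (the array shape and local cluster are illustrative):

import time
import dask
import dask.array as da
from dask.distributed import Client

def cal_modern(x):
    st = time.time()
    A = da.transpose(x)
    B = da.dot(x, A)
    C = da.dot(B, B)

    # An active distributed Client is picked up as the default scheduler.
    print(C.compute())

    # Or force the local threaded scheduler for comparison:
    with dask.config.set(scheduler='threads'):
        print(C.compute())

    print('time:', time.time() - st)

client = Client()                                 # hypothetical local cluster
x = da.random.random((2000, 2000), chunks=500)
cal_modern(x)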
Example #4
def test_reshape_data_kwargs_values(
    data,
    given_dims,
    return_dims,
    other_args,
    getitem_ops_for_expected,
    transposer,
):
    actual = reshape_data(
        data=data,
        given_dims=given_dims,
        return_dims=return_dims,
        **other_args,
    )

    expected = data[getitem_ops_for_expected]

    if transposer is not None:
        if isinstance(data, np.ndarray):
            expected = np.transpose(expected, transposer)
        else:
            expected = da.transpose(expected, transposer)

    # Check that the output data is the same type as the input
    assert type(actual) == type(expected)

    if isinstance(data, da.core.Array):
        actual = actual.compute()
        expected = expected.compute()

    # Check actual data
    np.testing.assert_array_equal(actual, expected)
Example #5
def power_dask(data, x_init):
    A = da.matmul(data, da.transpose(data))
    A = A.compute()  # materialize the Gram matrix once before iterating
    T = 150
    y = x_init
    for t in range(T):
        v = np.matmul(A, y)
        y = v / np.linalg.norm(v)
    return y
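A small usage sketch for the power iteration above; the array sizes are arbitrary, and the result approximates the unit-norm leading eigenvector of data @ data.T:

import numpy as np
import dask.array as da

data = da.random.random((1000, 200), chunks=(250, 200))
x0 = np.random.random(1000)
v = power_dask(data, x0)   # unit-norm estimate of the dominant eigenvector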
Example #6
 def train_set_minibatches(self, batch_size=10):
     '''
     Yield batches of samples from the studies in the test and train datasets.
     '''
     for study, (bim, fam, G) in self.train_studies.items():
         for batch in minibatch(da.transpose(G), batch_size=batch_size):
             gene_matrix = batch.to_dask_dataframe()
             yield gene_matrix.fillna(gene_matrix.mean(axis=0), axis=0)
Example #7
def reconstruct_im(data_3D, dark_ref, return_dask=False):
    core_count = multiprocessing.cpu_count()
    data_shape = data_3D.shape
    con_shape = tuple((np.asarray(data_shape[0:2]) * np.asarray((0.5, 2))).astype(int))
    xvals = int(data_shape[-1] ** 0.5)
    data3d_dask = da.from_array(data_3D, chunks=(-1, -1, "auto"))
    data_shape = data3d_dask.shape
    mean_dark_ref = np.mean(dark_ref.astype(float), axis=-1)
    d3r = da.transpose(data3d_dask, (2, 0, 1))
    d3s = d3r - mean_dark_ref
    d3D_dref = da.transpose(d3s, (1, 2, 0))
    top_part = d3D_dref[0 : con_shape[0], :, :]
    bot_part = d3D_dref[con_shape[0] : data_shape[0], :, :]
    top_part_rs = top_part[::-1, ::-1, :]
    data3d_arranged = da.concatenate([bot_part, top_part_rs], axis=1)
    shape4d = (con_shape[0], con_shape[1], xvals, xvals)
    data4d_dask = da.reshape(data3d_arranged, shape4d)
    if return_dask:
        return data4d_dask
    else:
        data4D = data4d_dask.compute(num_workers=core_count)
        return data4D
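A usage sketch with synthetic shapes that satisfy the function's implicit constraints (an even first dimension and a square number of scan positions in the last dimension); the shapes are illustrative only:

import numpy as np

data_3D = np.random.random((64, 32, 256))    # 256 = 16**2 scan positions
dark_ref = np.random.random((64, 32, 8))     # averaged over its last axis
data4d = reconstruct_im(data_3D, dark_ref)   # -> shape (32, 64, 16, 16)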
Example #8
 def shape(cls, dataset, gridded=False):
     array = dataset.data[dataset.vdims[0].name]
     if not any(cls.irregular(dataset, kd) for kd in dataset.kdims):
         names = [kd.name for kd in dataset.kdims
                  if kd.name in array.dims][::-1]
         if not all(d in names for d in array.dims):
             array = np.squeeze(array)
         array = array.transpose(*names)
     shape = array.shape
     if gridded:
         return shape
     else:
         return (np.prod(shape), len(dataset.dimensions()))
Example #9
def update_velocities(position, velocity, mass, G, epsilon):
    """Calculate the interactions between all particles and update the velocities.
    
    Args:
    position (dask array): dask array of all particle positions in cartesian coordinates.
    velocity (dask array): dask array of all particle velocities in cartesian coordinates.
    mass (dask array): dask array of all particle masses.
    G (float): gravitational constant.
    epsilon (float): softening parameter.
    
    Returns:
    velocity: updated particle velocities in cartesian coordinates.
    """
    dx = da.subtract.outer(position[:, 0], position[:, 0])
    dy = da.subtract.outer(position[:, 1], position[:, 1])
    dz = da.subtract.outer(position[:, 2], position[:, 2])
    r2 = da.square(dx) + da.square(dy) + da.square(dz) + da.square(epsilon)
    #
    coef = -G * mass[:]
    ax = coef * dx
    ay = coef * dy
    az = coef * dz
    #
    ax_scaled = da.divide(ax, r2)
    ay_scaled = da.divide(ay, r2)
    az_scaled = da.divide(az, r2)
    #
    total_ax = da.nansum(ax_scaled, axis=1)
    total_ay = da.nansum(ay_scaled, axis=1)
    total_az = da.nansum(az_scaled, axis=1)
    #
    velocity_x = da.diag(da.add.outer(da.transpose(velocity)[0], total_ax))
    velocity_y = da.diag(da.add.outer(da.transpose(velocity)[1], total_ay))
    velocity_z = da.diag(da.add.outer(da.transpose(velocity)[2], total_az))
    #
    velocity = np.column_stack((velocity_x.compute(), velocity_y.compute(),
                                velocity_z.compute()))
    return velocity
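A minimal driver for the update above using small random dask arrays; the chunk sizes and constants are illustrative:

import dask.array as da

position = da.random.random((500, 3), chunks=(250, 3))
velocity = da.random.random((500, 3), chunks=(250, 3))
mass = da.random.random(500, chunks=250)
new_velocity = update_velocities(position, velocity, mass, G=6.674e-11, epsilon=1e-3)
print(new_velocity.shape)   # (500, 3) numpy array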
Example #10
def transpose_grid_array(
    grid: GridArray,
    axes: Optional[list] = None,
) -> GridArray:
    """Reverses or permutes the axes of a `GridArray`.

    Parameters
    ----------
    axes: ``list``, optional
         List of integers and/or strings that identify the permutation of the
         axes. The i'th axis of the returned `GridArray` will correspond to the
         axis numbered/labeled axes[i] of the input. If not specified, the
         order of the axes is reversed.

    Returns
    -------
    :class:`nata.containers.GridArray`:
        Transpose of ``grid``.

    Examples
    --------
    Transpose a three-dimensional array.

    >>> from nata.containers import GridArray
    >>> import numpy as np
    >>> data = np.arange(96).reshape((8, 4, 3))
    >>> grid = GridArray.from_array(data)
    >>> grid.transpose().shape
    (3, 4, 8)
    >>> grid.transpose(axes=[0,2,1]).shape
    (8, 3, 4)

    """

    # get transpose axes
    tr_axes = get_transpose_axes(grid, axes)

    if len(set(tr_axes)) != grid.ndim:
        raise ValueError("invalid transpose axes")

    return GridArray.from_array(
        da.transpose(grid.to_dask(), axes=tr_axes),
        name=grid.name,
        label=grid.label,
        unit=grid.unit,
        axes=[grid.axes[axis] for axis in tr_axes],
        time=grid.time,
    )
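get_transpose_axes is not shown in this snippet. A sketch of what it presumably does, assuming each entry of grid.axes exposes a name attribute (the body is a guess, not the nata implementation):

def get_transpose_axes(grid, axes=None):
    # Default: reverse the axis order, mirroring numpy's transpose semantics.
    if axes is None:
        return tuple(reversed(range(grid.ndim)))
    # Otherwise map axis labels (strings) to positional indices, pass ints through.
    names = [ax.name for ax in grid.axes]
    return tuple(a if isinstance(a, int) else names.index(a) for a in axes)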
Example #11
def get_activations(activation_function, batch_gen):
    """
    Computes the activations of a data set at one layer of the model in a 
    "delayed" way (for memory and computation efficiency) and return them as a
    dask array. 

    See: https://docs.dask.org/en/latest/delayed.html
    """

    layer_shape = K.int_shape(activation_function.outputs[0])[1:]
    layer_dim = np.prod(K.int_shape(activation_function.outputs[0])[1:])
    n_images = batch_gen.n_images
    n_aug = batch_gen.aug_per_im
    batch_size = batch_gen.batch_size

    # Delayed computation of the activations of a batch
    @dask.delayed
    def batch_activation():
        batch_images, _ = next(batch_gen())
        return activation_function([batch_images, 0])[0]

    # Delayed iteration over the data set
    activations_delayed = [batch_activation() for _
            in range(batch_gen.n_batches)]
    activations_da_list = [da.from_delayed(
            activation_delayed,
            shape=(batch_size * n_aug, ) + layer_shape,
            dtype=K.floatx())
        for activation_delayed in activations_delayed]
    activations_da = da.concatenate(activations_da_list, axis=0)

    # The last batch can be smaller
    activations_da = activations_da[:n_images * n_aug]

    # Reshape the activations such that 
    # shape = (n_diff_images, layer_dim, n_aug)
    activations_da = da.reshape(activations_da, 
                                (activations_da.shape[0], layer_dim))
    activations_da = da.transpose(da.reshape(activations_da.T, 
                                             (layer_dim, n_images, n_aug)),
                                  (1, 0, 2))

    return activations_da
Example #12
    def calc_moments(self):
        with h5py.File(self.infile, 'r', rdcc_nbytes=1000 * 1000 * 1000) as f:
            data = da.from_array(f['data'],
                                 chunks=(-1, 256, -1, -1))  # CNHW layout
            data = da.transpose(data, (1, 2, 3, 0))
            dtype = data.dtype

            if dtype != np.float32:
                print(
                    'WARNING: data will be saved as float32 but input is float64!'
                )

            if self.mean is None:
                arr = data
                with ProgressBar():
                    self.mean, self.std = da.compute(arr.mean(axis=[0, 1, 2]),
                                                     arr.std(axis=[0, 1, 2]),
                                                     num_workers=8)
            else:
                self.mean, self.std = np.asarray(
                    self.mean, dtype=dtype), np.asarray(self.std, dtype=dtype)

            print('mean: {}, std: {}'.format(list(self.mean), list(self.std)))

            if self.log1p_norm:
                data_z_norm = (data - self.mean) / self.std
                data_log1p = da.sign(data_z_norm) * da.log1p(
                    da.fabs(data_z_norm))

                if self.mean_log1p is None:
                    arr = data_log1p
                    with ProgressBar():
                        self.mean_log1p, self.std_log1p = da.compute(
                            arr.mean(axis=[0, 1, 2]),
                            arr.std(axis=[0, 1, 2]),
                            num_workers=8)
                else:
                    self.mean_log1p, self.std_log1p = np.asarray(
                        self.mean_log1p,
                        dtype=dtype), np.asarray(self.std_log1p, dtype=dtype)

                print('mean_log1p: {}, std_log1p: {}'.format(
                    list(self.mean_log1p), list(self.std_log1p)))
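The log1p normalization above is the signed transform y = sign(z) * log1p(|z|), which compresses heavy tails while staying invertible. A tiny numpy illustration of the transform and its exact inverse:

import numpy as np

z = np.array([-50.0, -1.0, 0.0, 1.0, 50.0])
y = np.sign(z) * np.log1p(np.fabs(z))        # forward transform used above
z_back = np.sign(y) * np.expm1(np.fabs(y))   # exact inverse
assert np.allclose(z, z_back)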
Example #13
def test_linear_operators():
    A = da.random.random((100, 50), chunks=20)
    Adlo = linop.DaskLinearOperator(A)
    assert Adlo.size == A.size
    assert Adlo.shape == A.shape
    assert Adlo.chunks == A.chunks
    assert Adlo.numblocks == A.numblocks
    assert Adlo.dtype == A.dtype

    try:
        linop.DLOSymmetric(A)
    except AssertionError:
        print('fail on dims')

    try:
        linop.DLOSymmetric(da.random.random((100, 100), chunks=(10, 20)))
    except AssertionError:
        print('fail on chunks')

    Asymm = linop.DLOSymmetric(da.random.random((100, 100), chunks=10))
    assert Asymm.numblocks == (10, 10)

    Adn = linop.DLODense(A)
    assert Adn.numblocks == A.numblocks

    Adiag = linop.DLODiagonal(da.diag(da.random.random((100, 100), chunks=50)))
    assert Adiag.numblocks == (2, 2)
    assert Adiag.data.numblocks == (2, )

    Agm = linop.DLOGram(A)
    assert Agm.numblocks == (A.numblocks[1], A.numblocks[1])
    Agm2 = linop.DLOGram(da.transpose(A))
    assert Agm.shape == Agm2.shape

    Agm = linop.DLORegularizedGram(A)
    assert Agm.regularization == 1
Example #14
        print('using dummy annot')
        with h5py.File(alnfile + '.h5', 'r') as hf:
            align_array = hf['MSA2array']
            print('array shape', align_array.shape)
            dummy_annot = {'dummy_gene': {'qstart': 1, 'qend': align_array.shape[1] - 1, 'evalue': 0}}
            annotation = pd.DataFrame.from_dict(dummy_annot, orient='index')
    print('selecting informative sites')


    def retcounts(row):
        return np.unique(row, return_counts=True)

    with h5py.File(alnfile +'.h5', 'r') as hf:
        align_array = hf['MSA2array']
        array = da.from_array(align_array)
        array = da.transpose(array)
        #create a df of the columns
        daskdf = dd.from_dask_array(array)
        daskdf['unique'] = daskdf.apply(retcounts, axis=1)
        res = list(daskdf['unique'].compute())
        print('compiling sites')
        sites = {col: dict(zip(list(unique[0]), list(unique[1]))) for col, unique in enumerate(res)}
        informativesites = set(s for s in sites if len(set(sites[s].keys()) - set([b'-', b'N'])) > 1)

    print('done')
    print('informative columns:' , len(informativesites))

    #associate informative sites to a codon
    codon_dict = {}
    print( 'grouping codons')
    for i,r in annotation.iterrows():
Example #15
def cov_mult(conv_matrix, cov_matrix):
    conv_matrix = da.transpose(conv_matrix)
    return da.matmul(da.matmul(conv_matrix, cov_matrix),
                     da.transpose(conv_matrix))
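The function returns conv_matrix.T @ cov_matrix @ conv_matrix, i.e. it propagates a covariance matrix through a linear map. A quick check with small random dask arrays (the shapes are arbitrary):

import dask.array as da

conv = da.random.random((4, 6), chunks=2)
cov = da.random.random((4, 4), chunks=2)
out = cov_mult(conv, cov).compute()   # shape (6, 6)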
Example #16
nny = np.random.uniform(0, 10)
nnz = np.random.uniform(0, 10)
theta = np.random.uniform(0, 2 * np.pi)

nx = 1 / np.sqrt(nnx**2 + nny**2 + nnz**2) * nnx
ny = 1 / np.sqrt(nnx**2 + nny**2 + nnz**2) * nny
nz = 1 / np.sqrt(nnx**2 + nny**2 + nnz**2) * nnz

R = rotation(nx, ny, nz, theta)

mesh2 = np.dot(R, mesh1)

sys.exit()

#mesh2 = da.transpose(da.dot(R,da.transpose(subcat['Position'])))
mesh2 = da.transpose(da.dot(R, da.transpose(mesh1)))

sys.exit()

proj1 = np.fft.fftshift(mesh1.preview(axes=[0, 1], Nmesh=nmesh))
proj1 = proj1[num:-num, num:-num]

# Generate a random projection angle
theta = np.random.uniform(0, np.pi, size=num_maps)
phi = np.random.uniform(0, 2 * np.pi, size=num_maps)

theta_hat = np.array(
    [np.cos(theta) * np.cos(phi),
     np.cos(theta) * np.sin(phi), -np.sin(theta)]).T
phi_hat = np.array([-np.sin(phi), np.cos(phi), np.zeros(num_maps)]).T
Example #17
def load_single(file,
                drop_ghost=True,
                use_dask=True,
                var_list="all",
                ini_file=None):
    """Load a single step file and generate an xarray Dataset

    Parameters
    ----------
    file : str or Path
        Location of the file to load
    drop_ghost : bool, optional
        Drop all of the ghost cells, by default True
    use_dask : bool, optional
        Load the arrays lazily as dask arrays, by default True
    var_list : List, optional
        Load only a specific set of variables, by default 'all'
    ini_file : str, optional
        Optional input .ini file whose contents are added to the dataset attributes

    Returns
    -------
    xarray Dataset
    """

    if var_list == "all":
        var_list = [
            "density",
            "pressure",
            "sound_speed",
            "x_velocity",
            "y_velocity",
            "ghost_cell",
            "deposited_energy",
            "deposited_power",
        ]

    data_vars = {}
    space_dims = ("i", "j")

    if not file.endswith(".h5"):
        raise Exception("Step files must be .h5 files")

    h5 = h5py.File(file, "r")

    for v in var_list:
        try:
            h5[f"/{v}"].shape
        except KeyError:
            continue

        if use_dask:
            chunk_size = h5[f"/{v}"].shape
            array = da.from_array(h5[f"/{v}"], chunks=chunk_size)
            array = da.transpose(array)
        else:
            array = h5[f"/{v}"][()].T.astype(np.float32)

        try:
            long_name = var_dict[v]["long_name"]
        except Exception:
            long_name = ""

        try:
            description = h5[f"/{v}"].attrs["description"].decode("utf-8")
        except Exception:
            description = ""

        try:
            standard_name = var_dict[v]["standard_name"]
        except Exception:
            standard_name = ""

        try:
            units = h5[f"/{v}"].attrs["units"].decode("utf-8")
        except Exception:
            units = ""

        data_vars[f"{v}"] = xr.Variable(
            space_dims,
            array,
            attrs={
                "units": units,
                "description": description,
                "long_name": long_name,
                "standard_name": standard_name,
            },
        )

    x = h5[f"/x"][()].T.astype(np.float32)
    x_units = h5[f"/x"].attrs["units"].decode("utf-8")
    y = h5[f"/y"][()].T.astype(np.float32)

    # Get the cell centers
    dy = (np.diff(x[0, :]) / 2.0)[0]
    dx = (np.diff(y[:, 0]) / 2.0)[0]

    # cell center locations
    xc = x[:-1, 0] + dx
    yc = y[0, :-1] + dy

    coords = {
        "time": h5[f"/time"][()].astype(np.float32),
        "x": (["i"], xc),
        "y": (["j"], yc),
    }

    time_units = h5[f"/time"].attrs["units"].decode("utf-8")

    # Get the details about the CATO build
    info_attr = {}
    info = [
        "build_type",
        "compile_hostname",
        "compile_os",
        "compiler_flags",
        "compiler_version",
        "git_changes",
        "git_hash",
        "git_ref",
        "version",
    ]
    for v in info:
        try:
            info_attr[v] = h5["/cato_info"].attrs[f"{v}"].decode("utf-8")
        except Exception:
            pass

    attr_dict = info_attr
    attr_dict["time_units"] = time_units
    attr_dict["space_units"] = x_units

    if ini_file:
        input_dict = read_ini(ini_file)
        attr_dict.update(input_dict)

    ds = xr.Dataset(data_vars=data_vars, coords=coords, attrs=attr_dict)
    if ini_file:
        try:
            ds.attrs["title"] = ds.attrs["general_title"]
        except Exception:
            pass

    if drop_ghost:
        try:
            ds = ds.where(ds["ghost_cell"] == 0, drop=True)
            return ds.drop("ghost_cell")
        except KeyError:
            return ds
    else:
        return ds
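A minimal call sketch; the file name is hypothetical and var_dict / read_ini are assumed to be defined elsewhere in the module:

ds = load_single("step_0001.h5", drop_ghost=True, use_dask=True)
print(ds["density"].shape, ds.attrs.get("time_units"))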
Example #18
    kernels_mean = np.random.random((total_kernels, 3**2 * input_channels))
    cov_list = [
        random_cov(3**2 * input_channels) for number in range(total_kernels)
    ]
    kernels_cov = np.stack(cov_list)

    X = da.from_array(X)
    kernels_mean = da.from_array(kernels_mean)
    kernels_cov = da.from_array(kernels_cov)

    batch_out = []
    for i in range(batch_size):
        kernel_out = []
        for j in range(total_kernels):
            mean = da.matmul(kernels_mean[j, :], X[i, :, :])
            cov = da.matmul(da.transpose(X[i, :, :]),
                            da.matmul(kernels_cov[j, :, :], X[i, :, :]))
            z = mvn_random_DASK(mean, cov, total_samples, input_size**2)
            g = relu(z)
            mean_g = da.mean(g, axis=1)
            kernel_out.append(mean_g)
        kernels_out = da.stack(kernel_out, axis=0)
        batch_out.append(kernels_out)
    batches_out = da.stack(batch_out, axis=0)
    print('task graph complete')
    mean_g.visualize(rankdir="LR",
                     filename="task_graph_mean_g.pdf",
                     cmap='viridis')
    kernels_out.visualize(rankdir="LR", filename="task_graph_conv_out.pdf")

    batches_out.visualize(rankdir="LR", filename="task_graph_batches_out.pdf")
Example #19
def _stage_3(
    B: Array,
    YP: Array,
    X: Array,
    Y: Array,
    contigs: Array,
    variant_chunk_start: NDArray,
) -> Optional[Array]:
    """Stage 3 - Leave-one-chromosome-out (LOCO) Estimation

    This stage will use the coefficients for the optimal model in
    stage 2 to re-estimate predictions in a LOCO scheme. This scheme
    involves omitting coefficients that correspond to all variant
    blocks for a single chromosome in the stage 2 model and then
    recomputing predictions without those coefficients.

    For more details, see the "LOCO predictions" section of the Supplementary Methods
    in [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2).
    """
    assert B.ndim == 2
    assert YP.ndim == 4
    assert X.ndim == 2
    assert Y.ndim == 2
    # Check that chunking across samples is the same for all arrays
    assert B.numblocks[0] == YP.numblocks[2] == X.numblocks[0] == Y.numblocks[0]
    assert YP.chunks[2] == X.chunks[0] == Y.chunks[0]
    # Extract shape statistics
    sample_chunks = Y.chunks[0]
    n_covar = X.shape[1]
    n_variant_block, n_alpha_1 = YP.shape[:2]
    n_indvar = n_covar + n_variant_block * n_alpha_1
    n_sample_block = Y.numblocks[0]
    n_sample, n_outcome = Y.shape

    # Determine unique contigs to create LOCO estimates for
    contigs = np.asarray(contigs, like=contigs)
    unique_contigs = np.unique(contigs)  # type: ignore[no-untyped-call]
    if hasattr(unique_contigs, "compute"):
        unique_contigs = unique_contigs.compute()
    n_contig = len(unique_contigs)
    if n_contig <= 1:
        # Return nothing w/o at least 2 contigs
        return None

    assert n_variant_block == len(variant_chunk_start)
    # Create vector of size `n_variant_block` where value
    # at index i corresponds to contig for variant block i
    variant_block_contigs = contigs[variant_chunk_start]

    # Transform coefficients (B) such that trailing dimensions
    # contain right half of matrix product for prediction:
    # (n_sample_block * n_indvar, n_outcome) ->
    # (n_outcome, n_sample_block, n_indvar)
    B = da.stack([B.blocks[i] for i in range(n_sample_block)], axis=0)
    assert_block_shape(B, n_sample_block, 1, 1)
    assert_chunk_shape(B, 1, n_indvar, n_outcome)
    assert_array_shape(B, n_sample_block, n_indvar, n_outcome)
    B = da.transpose(B, (2, 0, 1))
    assert_block_shape(B, 1, n_sample_block, 1)
    assert_chunk_shape(B, n_outcome, 1, n_indvar)
    assert_array_shape(B, n_outcome, n_sample_block, n_indvar)

    # Decompose coefficients (B) so that variant blocks can be sliced:
    # BX -> (n_outcome, n_sample_block, n_covar)
    # BYP -> (n_outcome, n_sample_block, n_variant_block, n_alpha_1)
    BX = B[..., :n_covar]
    assert_array_shape(BX, n_outcome, n_sample_block, n_covar)
    BYP = B[..., n_covar:]
    assert_array_shape(BYP, n_outcome, n_sample_block, n_variant_block * n_alpha_1)
    BYP = BYP.reshape((n_outcome, n_sample_block, n_variant_block, n_alpha_1))
    assert_block_shape(BYP, 1, n_sample_block, 1, 1)
    assert_chunk_shape(BYP, n_outcome, 1, n_variant_block, n_alpha_1)
    assert_array_shape(BYP, n_outcome, n_sample_block, n_variant_block, n_alpha_1)

    # Transform base predictions (YP) such that trailing dimensions
    # contain left half of matrix product for prediction as well
    # as variant blocks to slice on:
    # (n_variant_block, n_alpha_1, n_sample, n_outcome) ->
    # (n_outcome, n_sample, n_variant_block, n_alpha_1)
    YP = da.transpose(YP, (3, 2, 0, 1))
    assert_block_shape(YP, 1, n_sample_block, n_variant_block, 1)
    assert_chunk_shape(YP, n_outcome, sample_chunks[0], 1, n_alpha_1)
    assert_array_shape(YP, n_outcome, n_sample, n_variant_block, n_alpha_1)

    def apply(X: Array, YP: Array, BX: Array, BYP: Array) -> Array:
        # Collapse selected variant blocks and alphas into single
        # new covariate dimension
        assert YP.shape[2] == BYP.shape[2]
        n_group_covar = n_covar + BYP.shape[2] * n_alpha_1

        BYP = BYP.reshape((n_outcome, n_sample_block, -1))
        BG = da.concatenate((BX, BYP), axis=-1)
        BG = BG.rechunk((-1, None, -1))
        assert_block_shape(BG, 1, n_sample_block, 1)
        assert_chunk_shape(BG, n_outcome, 1, n_group_covar)
        assert_array_shape(BG, n_outcome, n_sample_block, n_group_covar)

        YP = YP.reshape((n_outcome, n_sample, -1))
        XYP = da.broadcast_to(X, (n_outcome, n_sample, n_covar))
        XG = da.concatenate((XYP, YP), axis=-1)
        XG = XG.rechunk((-1, None, -1))
        assert_block_shape(XG, 1, n_sample_block, 1)
        assert_chunk_shape(XG, n_outcome, sample_chunks[0], n_group_covar)
        assert_array_shape(XG, n_outcome, n_sample, n_group_covar)

        YG = da.map_blocks(
            # Block chunks:
            # (n_outcome, sample_chunks[0], n_group_covar) @
            # (n_outcome, n_group_covar, 1) [after transpose]
            lambda x, b: x @ b.transpose((0, 2, 1)),
            XG,
            BG,
            chunks=(n_outcome, sample_chunks, 1),
        )
        assert_block_shape(YG, 1, n_sample_block, 1)
        assert_chunk_shape(YG, n_outcome, sample_chunks[0], 1)
        assert_array_shape(YG, n_outcome, n_sample, 1)
        YG = da.squeeze(YG, axis=-1).T
        assert_block_shape(YG, n_sample_block, 1)
        assert_chunk_shape(YG, sample_chunks[0], n_outcome)
        assert_array_shape(YG, n_sample, n_outcome)
        return YG

    # For each contig, generate predictions for all sample+outcome
    # combinations using only betas from stage 2 results that
    # correspond to *other* contigs (i.e. LOCO)
    YC = []
    for contig in unique_contigs:
        # Define a variant block mask of size `n_variant_block`
        # determining which blocks correspond to this contig
        variant_block_mask = variant_block_contigs == contig
        if hasattr(variant_block_mask, "compute"):
            variant_block_mask = variant_block_mask.compute()
        BYPC = BYP[:, :, ~variant_block_mask, :]
        YPC = YP[:, :, ~variant_block_mask, :]
        YGC = apply(X, YPC, BX, BYPC)
        YC.append(YGC)
    YC = da.stack(YC, axis=0)
    assert_array_shape(YC, n_contig, n_sample, n_outcome)

    return YC
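A toy illustration of the per-contig masking used in the loop above: variant blocks belonging to the held-out contig are dropped before recomputing predictions (the values are made up):

import numpy as np

variant_block_contigs = np.array([1, 1, 2, 2, 3])  # contig label per variant block
mask = variant_block_contigs == 2                  # blocks on the held-out contig
kept = np.flatnonzero(~mask)                       # blocks feeding this contig's LOCO prediction
print(kept)                                        # [0 1 4]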
Example #20
 def test_set(self):
     gene_matrix = da.concatenate(
         [G for (bim, fam, G) in self.test_studies.values()], axis=0)
     gene_matrix = da.transpose(gene_matrix).to_dask_dataframe()
     return gene_matrix.fillna(gene_matrix.mean(axis=0), axis=0)
Example #21
    def _daread(
        img: Path,
        offsets: List[np.ndarray],
        read_lengths: np.ndarray,
        chunk_by_dims: List[str] = [
            Dimensions.SpatialZ,
            Dimensions.SpatialY,
            Dimensions.SpatialX,
        ],
        S: int = 0,
    ) -> Tuple[da.core.Array, str]:
        """
        Read a LIF image file as a delayed dask array where certain dimensions act as
        the chunk size.

        Parameters
        ----------
        img: Path
            The filepath to read.
        offsets: List[numpy.ndarray]
            A List of numpy ndarrays offsets, see _compute_offsets for more details.
        read_lengths: numpy.ndarray
            A 1D numpy array of read lengths, the index is the scene index
        chunk_by_dims: List[str]
            The dimensions to use as the for mapping the chunks / blocks.
            Default: [Dimensions.SpatialZ, Dimensions.SpatialY, Dimensions.SpatialX]
            Note: SpatialY and SpatialX will always be added to the list if not present.
        S: int
            If the image has different dimensions on any scene from another, the dask
            array construction will fail.
            In that case, use this parameter to specify a specific scene to construct a
            dask array for.
            Default: 0 (select the first scene)

        Returns
        -------
        img: dask.array.core.Array
            The constructed dask array where certain dimensions are chunked.
        dims: str
            The dimension order as a string.
        """
        # Get image dims indices
        lif = LifFile(filename=img)
        image_dim_indices = LifReader._dims_shape(lif=lif)

        # Catch inconsistent scene dimension sizes
        if len(image_dim_indices) > 1:
            # Choose the provided scene
            try:
                image_dim_indices = image_dim_indices[S]
                log.info(
                    f"File contains variable dimensions per scene, "
                    f"selected scene: {S} for data retrieval."
                )
            except IndexError:
                raise exceptions.InconsistentShapeError(
                    f"The LIF image provided has variable dimensions per scene. "
                    f"Please provide a valid index to the 'S' parameter to create a "
                    f"dask array for the index provided. "
                    f"Provided scene index: {S}. Scene index range: "
                    f"0-{len(image_dim_indices)}."
                )
        else:
            # If the list is length one that means that all the scenes in the image
            # have the same dimensions
            # Just select the first dictionary in the list
            image_dim_indices = image_dim_indices[0]

        # Uppercase dimensions provided to chunk by dims
        chunk_by_dims = [d.upper() for d in chunk_by_dims]

        # Always add Y and X dims to chunk by dims because that is how LIF files work
        if Dimensions.SpatialY not in chunk_by_dims:
            log.info(
                "Adding the Spatial Y dimension to chunk by dimensions as it was not "
                "found."
            )
            chunk_by_dims.append(Dimensions.SpatialY)
        if Dimensions.SpatialX not in chunk_by_dims:
            log.info(
                "Adding the Spatial X dimension to chunk by dimensions as it was not "
                "found."
            )
            chunk_by_dims.append(Dimensions.SpatialX)

        # Setup read dimensions for an example chunk
        first_chunk_read_dims = {}
        for dim, (dim_begin_index, dim_end_index) in image_dim_indices.items():
            # Only add the dimension if the dimension isn't a part of the chunk
            if dim not in chunk_by_dims:
                # Add to read dims
                first_chunk_read_dims[dim] = dim_begin_index

        # Read first chunk for information used by dask.array.from_delayed
        sample, sample_dims = LifReader._get_array_from_offset(
            im_path=img,
            offsets=offsets,
            read_lengths=read_lengths,
            meta=lif.xml_root,
            read_dims=first_chunk_read_dims,
        )

        # Get the shape for the chunk and operating shape for the dask array
        # We also collect the chunk and non chunk dimension ordering so that we can
        # swap the dimensions after we block the dask array together.
        sample_chunk_shape = []
        operating_shape = []
        non_chunk_dimension_ordering = []
        chunk_dimension_ordering = []
        for i, dim_info in enumerate(sample_dims):
            # Unpack dim info
            dim, size = dim_info

            # If the dim is part of the specified chunk dims then append it to the
            # sample, and, append the dimension to the chunk dimension ordering
            if dim in chunk_by_dims:
                sample_chunk_shape.append(size)
                chunk_dimension_ordering.append(dim)

            # Otherwise, append the dimension to the non chunk dimension ordering, and,
            # append the true size of the image at that dimension
            else:
                non_chunk_dimension_ordering.append(dim)
                operating_shape.append(
                    image_dim_indices[dim][1] - image_dim_indices[dim][0]
                )

        # Convert shapes to tuples and combine the non and chunked dimension orders as
        # that is the order the data will actually come out of the read data as
        sample_chunk_shape = tuple(sample_chunk_shape)
        blocked_dimension_order = (
            non_chunk_dimension_ordering + chunk_dimension_ordering
        )

        # Fill out the rest of the operating shape with dimension sizes of 1 to match
        # the length of the sample chunk. When dask.block happens it fills the
        # dimensions from inner-most to outer-most with the chunks as long as the
        # dimension is size 1. Basically, we are adding empty dimensions to the
        # operating shape that will be filled by the chunks from dask
        operating_shape = tuple(operating_shape) + (1,) * len(sample_chunk_shape)

        # Create empty numpy array with the operating shape so that we can iter through
        # and use the multi_index to create the readers.
        lazy_arrays = np.ndarray(operating_shape, dtype=object)

        # We can enumerate over the multi-indexed array and construct read_dims
        # dictionaries by simply zipping together the ordered dims list and the current
        # multi-index plus the begin index for that plane. We then set the value of the
        # array at the same multi-index to the delayed reader using the constructed
        # read_dims dictionary.
        dims = [d for d in Dimensions.DefaultOrder]
        begin_indicies = tuple(image_dim_indices[d][0] for d in dims)
        for i, _ in np.ndenumerate(lazy_arrays):
            # Add the LIF file begin index for each dimension to the array dimension
            # index
            this_chunk_read_indicies = (
                current_dim_begin_index + curr_dim_index
                for current_dim_begin_index, curr_dim_index in zip(begin_indicies, i)
            )

            # Zip the dims with the read indices
            this_chunk_read_dims = dict(
                zip(blocked_dimension_order, this_chunk_read_indicies)
            )

            # Remove the dimensions that we want to chunk by from the read dims
            for d in chunk_by_dims:
                if d in this_chunk_read_dims:
                    this_chunk_read_dims.pop(d)

            # Add delayed array to lazy arrays at index
            lazy_arrays[i] = da.from_delayed(
                delayed(LifReader._imread)(
                    img, offsets, read_lengths, lif.xml_root, this_chunk_read_dims
                ),
                shape=sample_chunk_shape,
                dtype=sample.dtype,
            )

        # Convert the numpy array of lazy readers into a dask array and fill the inner
        # most empty dimensions with chunks
        merged = da.block(lazy_arrays.tolist())

        # Because we have set certain dimensions to be chunked and others not
        # we will need to transpose back to original dimension ordering
        # Example being, if the original dimension ordering was "SZYX" and we want to
        # chunk by "S", "Y", and "X" we created an array with dimensions ordering "ZSYX"
        transpose_indices = []
        transpose_required = False
        for i, d in enumerate(Dimensions.DefaultOrder):
            new_index = blocked_dimension_order.index(d)
            if new_index != i:
                transpose_required = True
                transpose_indices.append(new_index)
            else:
                transpose_indices.append(i)

        # Only run if the transpose is actually required
        # The default case is "Z", "Y", "X", which _usually_ doesn't need to be
        # transposed because that is _usually_
        # The normal dimension order of the LIF file anyway
        if transpose_required:
            merged = da.transpose(merged, tuple(transpose_indices))

        # Because dimensions outside of Y and X can be in any order and present or not
        # we also return the dimension order string.
        return merged, "".join(dims)
Example #22
def _stage_2(
    YP: Array,
    X: Array,
    Y: Array,
    alphas: Optional[NDArray] = None,
    normalize: bool = True,
    _glow_adj_alpha: bool = False,
    _glow_adj_scaling: bool = False,
) -> Tuple[Array, Array]:
    """Stage 2 - WGR Meta Regression

    This stage will train separate ridge regression models for each outcome
    using the predictions from stage 1 for that same outcome as features. These
    predictions are then evaluated based on R2 score to determine an optimal
    "meta" estimator (see `_stage_1` for the "base" estimator description). Results
    then include only predictions and coefficients from this optimal model.

    For more details, see the level 1 regression model described in step 1
    of [Mbatchou et al. 2020](https://www.biorxiv.org/content/10.1101/2020.06.19.162354v2).
    """
    assert YP.ndim == 4
    assert X.ndim == 2
    assert Y.ndim == 2
    # Check that chunking across samples is the same for all arrays
    assert YP.numblocks[2] == X.numblocks[0] == Y.numblocks[0]
    assert YP.chunks[2] == X.chunks[0] == Y.chunks[0]
    # Assert single chunks for covariates and outcomes
    assert X.numblocks[1] == Y.numblocks[1] == 1
    # Extract shape statistics
    n_variant_block, n_alpha_1 = YP.shape[:2]
    n_sample_block = Y.numblocks[0]
    n_sample, n_outcome = Y.shape
    n_covar = X.shape[1]
    n_indvar = n_covar + n_variant_block * n_alpha_1
    sample_chunks = Y.chunks[0]

    if normalize:
        assert_block_shape(YP, n_variant_block, 1, n_sample_block, 1)
        assert_chunk_shape(YP, 1, n_alpha_1, sample_chunks[0], n_outcome)
        # See: https://github.com/projectglow/glow/issues/260
        if _glow_adj_scaling:
            YP = da.map_blocks(
                lambda x: (x - x.mean(axis=2, keepdims=True))
                / x.std(axis=2, keepdims=True),
                YP,
            )
        else:
            YP = (YP - YP.mean(axis=2, keepdims=True)) / YP.std(axis=2, keepdims=True)
    # Transpose for refit on level 1 predictions
    YP = YP.transpose((3, 2, 0, 1))
    assert_array_shape(YP, n_outcome, n_sample, n_variant_block, n_alpha_1)

    if alphas is None:
        # See: https://github.com/projectglow/glow/issues/255
        if _glow_adj_alpha:
            alphas = get_alphas(n_variant_block * n_alpha_1 * n_outcome)
        else:
            alphas = get_alphas(n_variant_block * n_alpha_1)
    n_alpha_2 = alphas.size

    YR = []
    BR = []
    for i in range(n_outcome):
        # Slice and reshape to new 2D covariate matrix;
        # The order of raveling in trailing dimensions is important
        # and later reshapes will assume variants, alphas order
        XPB = YP[i].reshape((n_sample, n_variant_block * n_alpha_1))
        # Prepend covariates and chunk along first dim only
        XPB = da.concatenate((X, XPB), axis=1)
        XPB = XPB.rechunk(chunks=(None, -1))
        assert_array_shape(XPB, n_sample, n_indvar)
        assert XPB.numblocks == (n_sample_block, 1)
        # Extract outcome vector
        YB = Y[:, [i]]
        assert XPB.ndim == YB.ndim == 2
        # Fit and predict folds for each parameter
        BB, YPB = _ridge_regression_cv(XPB, YB, alphas, n_zero_reg=n_covar)[-2:]
        assert_array_shape(BB, n_alpha_2, n_sample_block * n_indvar, 1)
        assert_array_shape(YPB, n_alpha_2, n_sample, 1)
        BR.append(BB)
        YR.append(YPB)

    # Concatenate predictions along outcome dimension
    YR = da.concatenate(YR, axis=2)
    assert_block_shape(YR, 1, n_sample_block, n_outcome)
    assert_chunk_shape(YR, n_alpha_2, sample_chunks[0], 1)
    assert_array_shape(YR, n_alpha_2, n_sample, n_outcome)
    # Move samples to last dim so all others are batch
    # dims for R2 calculations
    YR = da.transpose(YR, (0, 2, 1))
    assert_array_shape(YR, n_alpha_2, n_outcome, n_sample)
    YR = YR.rechunk((-1, -1, None))
    assert_block_shape(YR, 1, 1, n_sample_block)
    assert YR.shape[1:] == Y.T.shape

    # Concatenate betas along outcome dimension
    BR = da.concatenate(BR, axis=2)
    assert_block_shape(BR, 1, n_sample_block, n_outcome)
    assert_chunk_shape(BR, n_alpha_2, n_indvar, 1)
    assert_array_shape(BR, n_alpha_2, n_sample_block * n_indvar, n_outcome)

    # Compute R2 scores within each sample block for each outcome + alpha
    R2 = da.stack(
        [
            r2_score(YR.blocks[..., i], Y.T.blocks[..., i])
            # Avoid warnings on R2 calculations for blocks with single rows
            if YR.chunks[-1][i] > 1 else da.full(YR.shape[:-1], np.nan)
            for i in range(n_sample_block)
        ]
    )
    assert_array_shape(R2, n_sample_block, n_alpha_2, n_outcome)
    # Coerce to finite or nan before nan-aware mean
    R2 = da.where(da.isfinite(R2), R2, np.nan)
    # Find highest mean alpha score for each outcome across blocks
    R2M = da.nanmean(R2, axis=0)
    assert_array_shape(R2M, n_alpha_2, n_outcome)
    # Identify index for the alpha value with the highest mean score
    R2I = da.argmax(R2M, axis=0)
    assert_array_shape(R2I, n_outcome)

    # Choose the predictions corresponding to the model with best score
    YRM = da.stack([YR[R2I[i], i, :] for i in range(n_outcome)], axis=-1)
    YRM = YRM.rechunk((None, -1))
    assert_block_shape(YRM, n_sample_block, 1)
    assert_chunk_shape(YRM, sample_chunks[0], n_outcome)
    assert_array_shape(YRM, n_sample, n_outcome)
    # Choose the betas corresponding to the model with the best score
    BRM = da.stack([BR[R2I[i], :, i] for i in range(n_outcome)], axis=-1)
    BRM = BRM.rechunk((None, -1))
    assert_block_shape(BRM, n_sample_block, 1)
    assert_chunk_shape(BRM, n_indvar, n_outcome)
    assert_array_shape(BRM, n_sample_block * n_indvar, n_outcome)
    return BRM, YRM
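A toy illustration of the model-selection step above: average the per-block R2 scores and pick, for each outcome, the alpha with the highest mean (the shapes are hypothetical):

import numpy as np

R2 = np.random.random((4, 3, 2))         # (n_sample_block, n_alpha_2, n_outcome)
R2M = np.nanmean(R2, axis=0)             # mean score per alpha and outcome
best_alpha_idx = np.argmax(R2M, axis=0)  # one winning alpha index per outcome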
Example #23
            X.compute(), kernels_mean.compute(), kernels_cov.compute(),
            batch_size, total_kernels, input_size)

    times = []  # list for storing execution times
    cluster = 'localhost:8001'  # address of compute cluster
    with Client(cluster) as client:  # connect to the compute cluster
        for n in range(itrs):  # itrs runs
            start = time.time()  # save start time
            batch_out = []  # create list for batch output
            for i in range(batch_size):  # for each image
                kernel_out = []  # create list for kernel outputs
                mean = da.matmul(kernels_mean,
                                 X[i, :, :])  # compute all kernel means
                for j in range(total_kernels):  # for each kernel
                    cov = da.matmul(
                        da.transpose(X[i, :, :]),  # compute covariance
                        da.matmul(kernels_cov[j, :, :], X[i, :, :]))
                    z = mvn_random_DASK(
                        mean[j, :], cov, total_samples,
                        input_size**2)  # sample from transformed distribution
                    g = relu(z)  # pass samples through relu
                    mean_g = da.mean(
                        g, axis=1)  # compute ensemble mean from samples
                    kernel_out.append(
                        mean_g)  # add ensemble mean to kernel outputs list
                kernels_out = da.stack(kernel_out,
                                       axis=0)  # stack all kernel outputs
                batch_out.append(
                    kernels_out
                )  # add stacked kernel outputs to batch output list
            batches_out = da.stack(batch_out,
Example #24
    assert_eq(dm, m)


functions = [
    lambda x: x,
    lambda x: da.expm1(x),
    lambda x: 2 * x,
    lambda x: x / 2,
    lambda x: x**2,
    lambda x: x + x,
    lambda x: x * x,
    lambda x: x[0],
    lambda x: x[:, 1],
    lambda x: x[:1, None, 1:3],
    lambda x: x.T,
    lambda x: da.transpose(x, (1, 2, 0)),
    lambda x: x.sum(),
    lambda x: x.dot(np.arange(x.shape[-1])),
    lambda x: x.dot(np.eye(x.shape[-1])),
    lambda x: da.tensordot(x, np.ones(x.shape[:2]), axes=[(0, 1), (0, 1)]),
    lambda x: x.sum(axis=0),
    lambda x: x.max(axis=0),
    lambda x: x.sum(axis=(1, 2)),
    lambda x: x.astype(np.complex128),
    lambda x: x.map_blocks(lambda x: x * 2),
    lambda x: x.round(1),
    lambda x: x.reshape((x.shape[0] * x.shape[1], x.shape[2])),
    lambda x: abs(x),
    lambda x: x > 0.5,
    lambda x: x.rechunk((4, 4, 4)),
    lambda x: x.rechunk((2, 2, 1)),
Example #25
File: result.py  Project: IOMRC/piv
    def ds(self):
        if self._ds is None:
            file_exists = os.path.exists(self._result_file)

            reprocess = not file_exists or self._reprocess

            if reprocess:
                if file_exists:
                    print('Old file exists ' + self._result_file)
                    #print('Removing old file ' + self._result_file)
                    #shutil.rmtree(self._result_file)

                ds_data = OrderedDict()

                to_seconds = np.vectorize(
                    lambda x: x.seconds + x.microseconds / 1E6)

                print('Processing binary data...')
                xx, yy, zz = self._loadgrid()
                if xx is None:
                    if self._from_nc:
                        print('Processing existing netcdf...')
                        fn = self._result_file[:-5] + '_QC_raw.nc'
                        if os.path.exists(fn):
                            ds_temp = xr.open_dataset(self._result_file[:-5] +
                                                      '_QC_raw.nc',
                                                      chunks={'time': 50})
                            u = da.transpose(ds_temp['U'].data,
                                             axes=[3, 0, 1, 2])
                            v = da.transpose(ds_temp['V'].data,
                                             axes=[3, 0, 1, 2])
                            w = da.transpose(ds_temp['W'].data,
                                             axes=[3, 0, 1, 2])
                            tt = ds_temp['time']
                            te = (tt - tt[0]) / np.timedelta64(1, 's')
                            xx = ds_temp['x'].values
                            yy = ds_temp['y'].values
                            zz = ds_temp['z'].values
                        else:
                            print('USING OLD ZARR DATA')
                            ds_temp = xr.open_zarr(self._result_file)
                            u = da.transpose(ds_temp['U'].data,
                                             axes=[3, 0, 1, 2])
                            v = da.transpose(ds_temp['V'].data,
                                             axes=[3, 0, 1, 2])
                            w = da.transpose(ds_temp['W'].data,
                                             axes=[3, 0, 1, 2])
                            tt = ds_temp['time']
                            te = (tt - tt[0]) / np.timedelta64(1, 's')
                            xx = ds_temp['x'].values
                            yy = ds_temp['y'].values
                            zz = ds_temp['z'].values
                            print('ERROR: No NetCDF data found for ' +
                                  self._xml_file)
                            #return None
                            # print(u.shape)

                else:
                    tt, uvw = self._loaddata(xx, yy, zz)
                    if tt is None:
                        print('ERROR: No binary data found for ' +
                              self._xml_file)
                        return None

                    # calculate the elapsed time from the Timestamp objects and then convert to datetime64 datatype
                    te = to_seconds(tt - tt[0])
                    tt = pd.to_datetime(tt)
                    uvw = uvw.persist()
                    u = uvw[:, :, :, :, 0]
                    v = uvw[:, :, :, :, 1]
                    w = uvw[:, :, :, :, 2]


#                    u = xr.DataArray(uvw[:,:,:,:,0], coords=[tt, xx, yy, zz], dims=['time','x', 'y', 'z'],
#                                     name='U', attrs={'standard_name': 'sea_water_x_velocity', 'units': 'm s-1'})
#                    v = xr.DataArray(uvw[:,:,:,:,1], coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'],
#                                     name='V', attrs={'standard_name': 'sea_water_x_velocity', 'units': 'm s-1'})
#                    w = xr.DataArray(uvw[:,:,:,:,2], coords=[tt, xx, yy, zz], dims=['time', 'x', 'y', 'z'],
#                                     name='W', attrs={'standard_name': 'upward_sea_water_velocity', 'units': 'm s-1'})

                if xx is None:
                    print('No data found')
                    return None

                u = u.persist()
                v = v.persist()
                w = w.persist()

                dx = float(xx[1] - xx[0])
                dy = float(yy[1] - yy[0])
                dz = float(zz[1] - zz[0])

                if self._norm_dims:
                    exp = self._result_root.split('/')[4]
                    runSheet = pd.read_csv('~/RunSheet-%s.csv' % exp)
                    runSheet = runSheet.set_index('RunID')
                    runDetails = runSheet.loc[int(self.run_id[-2:])]

                    T = runDetails['T (s)']
                    h = runDetails['h (m)']
                    D = runDetails['D (m)']

                    ww = te / T
                    om = 2. * np.pi / T
                    d_s = (2. * 1E-6 / om)**0.5
                    bl = 3. * np.pi / 4. * d_s

                    if exp == 'Exp6':
                        if D == 0.1:
                            dy_c = (188. + 82.) / 2
                            dx_c = 39.25
                            cx = dx_c / 1000.
                            cy = dy_c / 1000.
                        else:
                            dy_c = (806. + 287.) / 2. * 0.22
                            dx_c = 113 * 0.22
                            cx = dx_c / 1000.
                            cy = dy_c / 1000.
                    elif exp == 'Exp8':
                        dy_c = 624 * 0.22
                        dx_c = 15
                        cx = dx_c / 1000.
                        cy = dy_c / 1000.
                    xn = (xx + (D / 2. - cx)) / D
                    yn = (yy - cy) / D
                    zn = zz / h

                    xnm, ynm = np.meshgrid(xn, yn)
                    rr = np.sqrt(xnm**2. + ynm**2)
                    cylMask = rr < 0.5

                    nanPlane = np.ones(cylMask.shape)
                    nanPlane[cylMask] = np.nan
                    nanPlane = nanPlane.T
                    nanPlane = nanPlane[np.newaxis, :, :, np.newaxis]

                    u = u * nanPlane
                    v = v * nanPlane
                    w = w * nanPlane

                    if D == 0.1:
                        xInds = xn > 3.
                    else:
                        xInds = xn > 2.

                    blInd = np.argmax(zn > bl / h)
                    blPlane = int(round(blInd))

                    Ue = u[:, xInds, :, :]
                    Ue_bar = da.nanmean(Ue, axis=(1, 2, 3)).compute()
                    Ue_bl = da.nanmean(Ue[:, :, :, blPlane],
                                       axis=(1, 2)).compute()

                    inds = ~np.isnan(Ue_bl)

                    xv = ww[inds] % 1.
                    xv = xv + np.random.normal(scale=1E-6, size=xv.shape)
                    yv = Ue_bl[inds]
                    xy = np.stack([
                        np.concatenate([xv - 1., xv, xv + 1.]),
                        np.concatenate([yv, yv, yv])
                    ]).T
                    xy = xy[xy[:, 0].argsort(), :]
                    xi = np.linspace(-0.5, 1.5, len(xv) // 8)
                    n = np.nanmax(xy[:, 1])
                    # print(n)
                    # fig,ax = pl.subplots()
                    # ax.scatter(xy[:,0],xy[:,1]/n)
                    # print(xy)
                    spl = si.LSQUnivariateSpline(xy[:, 0],
                                                 xy[:, 1] / n,
                                                 t=xi,
                                                 k=3)
                    roots = spl.roots()
                    der = spl.derivative()
                    slope = der(roots)
                    inds = np.min(np.where(slope > 0))
                    dt = (roots[inds] % 1.).mean() - 0.5

                    tpx = np.arange(0, 0.5, 0.001)
                    U0_bl = np.abs(spl(tpx + dt).min() * n)
                    ws = ww - dt
                    Ue_spl = spl((ws - 0.5) % 1.0 + dt) * n * -1.0

                    #maxima = spl.derivative().roots()
                    #Umax = spl(maxima)
                    #UminIdx = np.argmin(Umax)
                    #U0_bl = np.abs(Umax[UminIdx]*n)

                    #ww_at_min = maxima[UminIdx]
                    #ws = ww - ww_at_min + 0.25

                    inds = ~np.isnan(Ue_bar)

                    xv = ww[inds] % 1.
                    xv = xv + np.random.normal(scale=1E-6, size=xv.shape)
                    yv = Ue_bar[inds]
                    xy = np.stack([
                        np.concatenate([xv - 1., xv, xv + 1.]),
                        np.concatenate([yv, yv, yv])
                    ]).T
                    xy = xy[xy[:, 0].argsort(), :]
                    xi = np.linspace(-0.5, 1.5, len(xv) // 8)
                    n = np.nanmax(xy[:, 1])
                    spl = si.LSQUnivariateSpline(xy[:, 0],
                                                 xy[:, 1] / n,
                                                 t=xi,
                                                 k=4)
                    maxima = spl.derivative().roots()
                    Umax = spl(maxima)
                    UminIdx = np.argmin(Umax)
                    U0_bar = np.abs(Umax[UminIdx] * n)

                    ww = xr.DataArray(ww, coords=[
                        tt,
                    ], dims=[
                        'time',
                    ])
                    ws = xr.DataArray(ws - 0.5, coords=[
                        tt,
                    ], dims=[
                        'time',
                    ])

                    xn = xr.DataArray(xn, coords=[
                        xx,
                    ], dims=[
                        'x',
                    ])
                    yn = xr.DataArray(yn, coords=[
                        yy,
                    ], dims=[
                        'y',
                    ])
                    zn = xr.DataArray(zn, coords=[
                        zz,
                    ], dims=[
                        'z',
                    ])

                    Ue_bar = xr.DataArray(Ue_bar,
                                          coords=[
                                              tt,
                                          ],
                                          dims=[
                                              'time',
                                          ])
                    Ue_bl = xr.DataArray(Ue_bl, coords=[
                        tt,
                    ], dims=[
                        'time',
                    ])
                    Ue_spl = xr.DataArray(Ue_spl,
                                          coords=[
                                              tt,
                                          ],
                                          dims=[
                                              'time',
                                          ])

                    ds_data['ww'] = ww
                    ds_data['ws'] = ws

                    ds_data['xn'] = xn
                    ds_data['yn'] = yn
                    ds_data['zn'] = zn

                    ds_data['Ue_bar'] = Ue_bar
                    ds_data['Ue_bl'] = Ue_bl
                    ds_data['Ue_spl'] = Ue_spl

                te = xr.DataArray(te, coords=[tt], dims=['time'])

                dims = ['time', 'x', 'y', 'z']
                coords = [tt, xx, yy, zz]

                ds_data['U'] = xr.DataArray(
                    u, coords=coords, dims=dims, name='U',
                    attrs={'standard_name': 'sea_water_x_velocity',
                           'units': 'm s-1'})
                ds_data['V'] = xr.DataArray(
                    v, coords=coords, dims=dims, name='V',
                    attrs={'standard_name': 'sea_water_y_velocity',
                           'units': 'm s-1'})
                ds_data['W'] = xr.DataArray(
                    w, coords=coords, dims=dims, name='W',
                    attrs={'standard_name': 'upward_sea_water_velocity',
                           'units': 'm s-1'})
                ds_data['te'] = te

                # stdV = da.nanstd(v)
                # stdW = da.nanstd(w)
                # thres=7.
                if 'U0_bl' in locals():
                    condition = (da.fabs(v) / U0_bl >
                                 1.5) | (da.fabs(w) / U0_bl > 0.6)
                    for var in ['U', 'V', 'W']:
                        ds_data[var].data = da.where(condition, np.nan,
                                                     ds_data[var].data)

                piv_step_frame = float(
                    self._xml_root.findall('piv/stepFrame')[0].text)

                print('Calculating tensor')
                # j = jacobianConv(ds.U, ds.V, ds.W, dx, dy, dz, sigma=1.5)
                j = jacobianDask(u, v, w, piv_step_frame, dx, dy, dz)
                print('Done')
                #j = da.from_array(j,chunks=(20,-1,-1,-1,-1,-1))

                #                j = jacobianDask(uvw[:,:,:,:,0],uvw[:,:,:,:,1], uvw[:,:,:,:,2], piv_step_frame, dx, dy, dz)
                jT = da.transpose(j, axes=[0, 1, 2, 3, 5, 4])

                #                j = j.persist()
                #                jT = jT.persist()

                jacobianNorm = da.sqrt(
                    da.nansum(da.nansum(j**2., axis=-1), axis=-1))

                # Split the velocity-gradient tensor into its symmetric
                # (strain-rate) and antisymmetric (rotation) parts:
                # S = (J + J^T) / 2, R = (J - J^T) / 2
                strainTensor = (j + jT) / 2.
                vorticityTensor = (j - jT) / 2.

                strainTensorNorm = da.sqrt(
                    da.nansum(da.nansum(strainTensor**2., axis=-1), axis=-1))
                vorticityTensorNorm = da.sqrt(
                    da.nansum(da.nansum(vorticityTensor**2., axis=-1),
                              axis=-1))
                divergence = (j[:, :, :, :, 0, 0] + j[:, :, :, :, 1, 1] +
                              j[:, :, :, :, 2, 2])
                # print(divergence)
                # Vorticity vector from the rotation tensor, assuming
                # j[..., comp, dim] = d(u_comp)/d(x_dim), e.g.
                # omega_x = dW/dy - dV/dz
                omx = vorticityTensor[:, :, :, :, 2, 1] * 2.
                omy = vorticityTensor[:, :, :, :, 0, 2] * 2.
                omz = vorticityTensor[:, :, :, :, 1, 0] * 2.

                divNorm = divergence / jacobianNorm

                #                divNorm = divNorm.persist()

                #                divNorm_mean = da.nanmean(divNorm)
                #                divNorm_std = da.nanstd(divNorm)

                dims = ['x', 'y', 'z']
                comp = ['u', 'v', 'w']

                ds_data['jacobian'] = xr.DataArray(
                    j,
                    coords=[tt, xx, yy, zz, comp, dims],
                    dims=['time', 'x', 'y', 'z', 'comp', 'dims'],
                    name='jacobian')

                ds_data['jacobianNorm'] = xr.DataArray(
                    jacobianNorm,
                    coords=[tt, xx, yy, zz],
                    dims=['time', 'x', 'y', 'z'],
                    name='jacobianNorm')

                ds_data['strainTensor'] = xr.DataArray(
                    strainTensor,
                    coords=[tt, xx, yy, zz, comp, dims],
                    dims=['time', 'x', 'y', 'z', 'comp', 'dims'],
                    name='strainTensor')

                ds_data['vorticityTensor'] = xr.DataArray(
                    vorticityTensor,
                    coords=[tt, xx, yy, zz, comp, dims],
                    dims=['time', 'x', 'y', 'z', 'comp', 'dims'],
                    name='vorticityTensor')

                ds_data['vorticityNorm'] = xr.DataArray(
                    vorticityTensorNorm,
                    coords=[tt, xx, yy, zz],
                    dims=['time', 'x', 'y', 'z'],
                    name='vorticityNorm')

                ds_data['strainNorm'] = xr.DataArray(
                    strainTensorNorm,
                    coords=[tt, xx, yy, zz],
                    dims=['time', 'x', 'y', 'z'],
                    name='strainNorm')

                ds_data['divergence'] = xr.DataArray(
                    divergence,
                    coords=[tt, xx, yy, zz],
                    dims=['time', 'x', 'y', 'z'],
                    name='divergence')

                ds_data['omx'] = xr.DataArray(omx,
                                              coords=[tt, xx, yy, zz],
                                              dims=['time', 'x', 'y', 'z'],
                                              name='omx')

                ds_data['omy'] = xr.DataArray(omy,
                                              coords=[tt, xx, yy, zz],
                                              dims=['time', 'x', 'y', 'z'],
                                              name='omy')

                ds_data['omz'] = xr.DataArray(omz,
                                              coords=[tt, xx, yy, zz],
                                              dims=['time', 'x', 'y', 'z'],
                                              name='omz')

                ds_data['divNorm'] = xr.DataArray(divNorm,
                                                  coords=[tt, xx, yy, zz],
                                                  dims=['time', 'x', 'y', 'z'],
                                                  name='divNorm')

                #                ds_data['divNorm_mean'] = xr.DataArray(divNorm_mean)
                #                ds_data['divNorm_std'] = xr.DataArray(divNorm_std)

                ds = xr.Dataset(ds_data)
                #                if self._from_nc:
                #                    for k,v in ds_temp.attrs.items():
                #                        ds.attrs[k]=v
                #ds = ds.chunk({'time': 20})

                self._append_CF_attrs(ds)
                self._append_attrs(ds)
                ds.attrs['filename'] = self._result_file

                if self._norm_dims:

                    KC = U0_bl * T / D
                    delta = (2. * np.pi * d_s) / h
                    S = delta / KC

                    ds.attrs['T'] = T
                    ds.attrs['h'] = h
                    ds.attrs['D'] = D
                    ds.attrs['U0_bl'] = U0_bl
                    ds.attrs['U0_bar'] = U0_bar
                    ds.attrs['KC'] = KC
                    ds.attrs['S'] = S
                    ds.attrs['Delta+'] = ((1E-6 * T)**0.5) / h
                    ds.attrs['Delta_l'] = 2 * np.pi * d_s
                    ds.attrs['Delta_s'] = d_s
                    ds.attrs['Re_D'] = U0_bl * D / 1E-6
                    ds.attrs['Beta'] = D**2. / (1E-6 * T)

                delta = (ds.attrs['dx'] * ds.attrs['dy'] *
                         ds.attrs['dz'])**(1. / 3.)
                dpx = (ds.attrs['pdx'] * ds.attrs['pdy'] *
                       ds.attrs['pdz'])**(1. / 3.)
                delta_px = delta / dpx
                dt = ds.attrs['piv_step_ensemble']

                #                divRMS = da.sqrt(da.nanmean((divergence * dt) ** 2.))
                #                divRMS = divRMS.persist()
                #                vorticityTensorNorm.persist()
                #                velocityError = divRMS/((3./(2.*delta_px**2.))**0.5)
                # print(da.percentile(ds_new['vorticityTensorNorm'].data.ravel(),99.))
                # print(ds_new['divRMS'])
                # print(ds_new['divNorm_mean'])
                #                vorticityError = divRMS/dt/da.percentile(vorticityTensorNorm.ravel(),99.)

                #                divNorm_mean = da.nanmean(divNorm)
                #                divNorm_std = da.nanstd(divNorm)

                # print("initial save")
                #ds.to_zarr(self._result_file,compute=False)
                #ds = xr.open_zarr(self._result_file)

                #                xstart = np.argmax(xx > 0.05)
                #                ystart = np.argmax(yy > 0.07)

                divRMS = da.sqrt(da.nanmean(
                    (divergence * dt)**2.))  #.compute()
                #divNorm = divergence / jacobianNorm
                #divNorm = divNorm.compute()
                #divNorm_mean = da.nanmean(divNorm).compute()
                #divNorm_std = da.nanstd(divNorm).compute()
                velocityError = divRMS / ((3. / (2. * delta_px**2.))**0.5)
                vortNorm = vorticityTensorNorm  #.compute()

                vorticityError = divRMS / dt / np.percentile(
                    vortNorm.ravel(), 99.)

                velocityError, vorticityError = da.compute(
                    velocityError, vorticityError)

                #ds.attrs['divNorm_mean'] = divNorm_mean
                #ds.attrs['divNorm_std'] = divNorm_std
                ds.attrs['velocityError'] = velocityError
                ds.attrs['vorticityError'] = vorticityError

                if self._norm_dims:
                    xInds = (xn > 0.5) & (xn < 2.65)
                    yInds = (yn > -0.75) & (yn < 0.75)
                else:
                    xInds = range(len(ds['x']))
                    yInds = range(len(ds['y']))
                vrms = (ds['V'][:, xInds, yInds, :]**2.).mean(
                    dim=['time', 'x', 'y', 'z'])**0.5
                wrms = (ds['W'][:, xInds, yInds, :]**2.).mean(
                    dim=['time', 'x', 'y', 'z'])**0.5
                ds.attrs['Vrms'] = float(vrms.compute())
                ds.attrs['Wrms'] = float(wrms.compute())

                #fig,ax = pl.subplots()
                #ax.plot(ds.ws,ds.Ue_spl/U0_bl,color='k')
                #ax.plot(ds.ws,ds.Ue_bl/U0_bl,color='g')
                #ax.set_xlabel(r'$t/T$')
                #ax.set_ylabel(r'$U_{bl}/U_0$')
                #fig.savefig(self._result_file[:-4] + 'png',dpi=125)
                #pl.close(fig)
                # print("second save")
                #ds.to_netcdf(self._result_file)
                ds.to_zarr(self._result_file, mode='w')

                print('Cached ' + self._result_file)

                #ds = xr.open_dataset(self._result_file,chunks={'time':20})
                ds = xr.open_zarr(self._result_file)
                ds.attrs['filename'] = self._result_file
            else:
                #ds = xr.open_dataset(self._result_file,chunks={'time':20})
                ds = xr.open_zarr(self._result_file)
                ds.attrs['filename'] = self._result_file

            self._ds = ds

        return self._ds
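
A minimal standalone sketch of the tensor-transpose step used above. jacobianDask itself is not shown in this example, so the gradient-based construction of j below is only an assumed stand-in (toy shapes, unit grid spacing); the da.transpose call and the strain/rotation split mirror the code above.

import dask.array as da

# toy velocity fields on a (time, x, y, z) grid
shape = (4, 8, 8, 8)
u = da.random.random(shape, chunks=(2, 8, 8, 8))
v = da.random.random(shape, chunks=(2, 8, 8, 8))
w = da.random.random(shape, chunks=(2, 8, 8, 8))

# J[..., i, k] = d(u_i)/d(x_k), built from spatial gradients (axes 1..3)
j = da.stack([da.stack(da.gradient(f, axis=(1, 2, 3)), axis=-1)
              for f in (u, v, w)], axis=-2)        # (t, x, y, z, 3, 3)

jT = da.transpose(j, axes=[0, 1, 2, 3, 5, 4])      # swap the two tensor axes
strainTensor = (j + jT) / 2.
vorticityTensor = (j - jT) / 2.
print(strainTensor.shape)                          # (4, 8, 8, 8, 3, 3)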
Exemplo n.º 26
0
fbinningClip = lambda x, bin2_iStEn, bin1_nAverage: da.mean(da.reshape(x[slice(*bin2_iStEn)], (-1, bin1_nAverage)), 1)
fbinning = lambda x, bin1_nAverage: da.mean(da.reshape(x, (-1, bin1_nAverage)), 1)
repeat3shift1 = lambda A2: [A2[t:(len(A2) - 2 + t)] for t in range(3)]
median3cols = lambda a, b, c: da.where(a < b, da.where(c < a, a, da.where(b < c, b, c)),
                                       da.where(a < c, a, da.where(c < b, b, c)))
median3 = lambda x: da.hstack((np.NaN, median3cols(*repeat3shift1(x)), np.NaN))
# not convertable to dask easily:
fVabs_old = lambda Gxyz, kVabs: np.polyval(kVabs.flat, np.sqrt(np.tan(fInclination(Gxyz))))
rep2mean = lambda x, bOk: np.interp(np.arange(len(x)), np.flatnonzero(bOk), x[bOk], np.NaN, np.NaN)
fForce2Vabs_fitted = lambda x: da.where(x > 2, 2, da.where(x < 1, 0.25 * x, 0.25 * x + 0.3 * (x - 1) ** 4))
fIncl2Force = lambda incl: da.sqrt(da.tan(incl))
fVabs = lambda Gxyz, kVabs: fForce2Vabs_fitted(fIncl2Force(fInclination(Gxyz)))
f = lambda fun, *args: fun(*args)
positiveInd = lambda i, L: np.int32(da.where(i < 0, L - i, i))
minInterval = lambda iLims1, iLims2, L: f(
    lambda iL1, iL2: da.transpose([max(iL1[:, 0], iL2[:, 0]), min(iL1[:, -1], iL2[:, -1])]), positiveInd(iLims1, L),
    positiveInd(iLims2, L))
fStEn2bool = lambda iStEn, length: da.hstack(
    [(da.ones(iEn2iSt, dtype=np.bool8) if b else da.zeros(iEn2iSt, dtype=np.bool8)) for iEn2iSt, b in da.vstack((
        da.diff(
            da.hstack(
                (
                    0,
                    iStEn.flat,
                    length))),
        da.hstack(
            (
                da.repeat(
                    [
                        (
                            False,
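
A small usage sketch for the binning helper defined above; the signal length and block size here are made up for illustration.

import numpy as np
import dask.array as da

x = da.from_array(np.arange(1000.0), chunks=100)
bin1_nAverage = 10

# block-average in non-overlapping windows of bin1_nAverage samples,
# equivalent to fbinning(x, bin1_nAverage)
x_binned = da.mean(da.reshape(x, (-1, bin1_nAverage)), 1)
print(x_binned.shape)   # (100,)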
Exemplo n.º 27
0
def activations(images, labels, batch_size, model, layer_regex, nodaug_params, 
                daug_params, include_input=False, class_invariance=False, 
                n_daug_rep=0,  norms=['fro']):
    """
    Computes metrics from the activations, such as the norm of the feature
    maps, data augmentation invariance, class invariance, etc.

    Parameters
    ----------
    images : h5py Dataset
        The set of images

    labels : h5py Dataset
        The ground truth labels

    batch_size : int
        Batch size

    model : Keras Model
        The model

    nodaug_params : dict
        Dictionary of data augmentation parameters for the baseline

    daug_params : dict
        Dictionary of data augmentation parameters

    include_input : bool
        If True, the input layer is considered for the analysis

    class_invariance : bool
        If True, the class invariance score is computed

    n_daug_rep : int
        If larger than 0, the data augmentation invariance score is computed,
        performing n_daug_rep repetitions of random augmentations

    norms : list
        List of keywords to specify the types of norms to compute on the 
        activations

    Returns
    -------
    results_dict : dict
        Dictionary containing some performance metrics
    """
    def _update_stats(mean_norm, std_norm, norm):
        mean_norm_batch = np.mean(norm, axis=0)
        std_norm_batch = np.std(norm, axis=0)
        mean_norm = init / float(end) * mean_norm + \
                    batch_size / float(end) * mean_norm_batch
        std_norm = init / float(end) * std_norm ** 2 + \
                    batch_size / float(end) * std_norm_batch ** 2 + \
                    (init * batch_size) / float(end ** 2) * \
                    (mean_norm - mean_norm_batch) ** 2
        std_norm = np.sqrt(std_norm)

        return mean_norm, std_norm

    def _frobenius_norm(activations):
        norm = np.linalg.norm(
                activations, ord='fro', 
                axis=tuple(range(1, len(activations.shape) - 1)))
        return norm

    def _inf_norm(activations):
        norm = np.max(np.abs(activations),
                      axis=tuple(range(1, len(activations.shape) - 1)))
        return norm

    model = del_extra_nodes(model)

    n_images = images.shape[0]
    n_batches_per_epoch = int(np.ceil(float(n_images) / batch_size))

    # Get relevant layers
    if include_input:
        layer_regex = '({}|.*input.*)'.format(layer_regex)
    else:
        layer_regex = layer_regex

    layers = [layer.name for layer in model.layers 
              if re.compile(layer_regex).match(layer.name)]

    # Initialize HDF5 to store the activations
#     filename = 'hdf5_aux_{}'.format(time.time())
#     activations_hdf5_aux = h5py.File(filename, 'w')
#     hdf5_aux = [filename]
# 
#     grp_activations = activations_hdf5_aux.create_group('activations')

    if class_invariance:
#         grp_labels = activations_hdf5_aux.create_group('labels')
        labels_true_da = []
        labels_pred_da = []
        predictions_da = []
#         labels_true = grp_labels.create_dataset(
#                 'labels_true', shape=(n_images, ), dtype=np.uint8)
#         labels_pred = grp_labels.create_dataset(
#                 'labels_pred', shape=(n_images, ), dtype=np.uint8)
#         predictions = grp_labels.create_dataset(
#                 'predictions', shape=labels.shape, dtype=K.floatx())
        idx_softmax = model.output_names.index('softmax')
        store_labels = True
    else:
        store_labels = False

    # Initialize results dictionary
    results_dict = {'activations_norm': {}, 'summary': {}, 
                    'class_invariance': {}, 'daug_invariance': {}} 

    # Iterate over the layers
    for layer_name in layers:

        # Create batch generator
        image_gen = get_generator(images, **nodaug_params)
        batch_gen = generate_batches(image_gen, images, labels, batch_size,
                                     aug_per_im=1, shuffle=False)

        layer = model.get_layer(layer_name)
        layer_shape = layer.output_shape[1:]
        n_channels = layer_shape[-1]

        if re.compile('.*input.*').match(layer_name):
            layer_name = 'input'

        print('\nLayer {}\n'.format(layer_name))

        # Create a Dataset for the activations of the layer
#         activations_layer = grp_activations.create_dataset(
#                 layer_name, shape=(n_images, ) + layer_shape, 
#                 dtype=K.floatx())
        # Create dask array for the activations of the layer
        activations_layer_da = []

        # Initialize placeholders in the results dict for the layer
        results_dict['activations_norm'].update({layer_name: 
            {n: {'mean': np.zeros(n_channels), 
                 'std': np.zeros(n_channels)} for n in norms}})
        layer_dict = results_dict['activations_norm'][layer_name]

        activation_function = K.function([model.input, 
                                          K.learning_phase()], 
                                         [layer.output])

        # Iterate over the data set in batches
        init = 0
        for batch_images, batch_labels in tqdm(
                batch_gen, total=n_batches_per_epoch):

            batch_size = batch_images.shape[0]
            end = init + batch_size

            # Store labels
            if store_labels:
                preds = model.predict_on_batch(batch_images)
                if isinstance(preds, list):
                    preds = preds[idx_softmax]
                labels_pred_da.append(da.from_array(
                    np.argmax(preds, axis=1)))
                labels_true_da.append(da.from_array(
                    np.argmax(batch_labels, axis=1)))
                predictions_da.append(da.from_array(preds))
#                 labels_pred[init:end] = np.argmax(preds, axis=1)
#                 labels_true[init:end] = np.argmax(batch_labels, axis=1)
#                 predictions[init:end, :] = preds

            # Get and store activations
            activations = activation_function([batch_images, 0])[0]
            activations_layer_da.append(da.from_array(
                activations, chunks=activations.shape))
#             activations_layer[init:end] = activations

            # Compute norms
            for norm_key in norms:
                mean_norm = layer_dict[norm_key]['mean']
                std_norm = layer_dict[norm_key]['std']
                if norm_key == 'fro':
                    norm = _frobenius_norm(activations)
                elif norm_key == 'inf':
                    norm = _inf_norm(activations)
                else:
                    raise NotImplementedError('Implemented norms are fro '
                            'and inf')
                mean_norm, std_norm = _update_stats(mean_norm, std_norm, 
                                                    norm)
                layer_dict[norm_key]['mean'] = mean_norm
                layer_dict[norm_key]['std'] = std_norm

            init = end
            if init == n_images:
                store_labels = False
                break

        # Concatenate dask arrays
        activations_layer_da = da.concatenate(activations_layer_da, axis=0)
        activations_layer_da = activations_layer_da.reshape((n_images, -1))
        d_activations = activations_layer_da.shape[-1]

        if class_invariance:
            print('\nComputing class invariance\n')
            labels_pred_da = da.concatenate(labels_pred_da)
            labels_true_da = da.concatenate(labels_true_da)
            predictions_da = da.concatenate(predictions_da)
            n_classes = len(np.unique(labels_true_da))

        # Compute MSE matrix of the activations
        r = da.reshape(da.sum(da.square(activations_layer_da), 
                                        axis=1), (-1, 1))
        mse_matrix_da = (r - 2 * da.dot(activations_layer_da,
                                     da.transpose(activations_layer_da)) \
                     + da.transpose(r)) / d_activations
        mse_matrix_da = mse_matrix_da.rechunk((mse_matrix_da.chunksize[0],
                                               mse_matrix_da.shape[-1]))

        # Compute class invariance
        time0 = time()
        results_dict['class_invariance'].update({layer_name: {}})
        class_invariance_scores_da = []
        if class_invariance:
#             mse_matrix_mean = da.mean(mse_matrix_da).compute()
            for cl in tqdm(range(n_classes)):
                labels_cl = labels_pred_da == cl
                labels_cl = labels_cl.compute()
                mse_class = mse_matrix_da[labels_cl, :][:, labels_cl]
                mse_class = mse_class.rechunk((-1, -1))
#                 mse_class_mean = da.mean(mse_class).compute()
#                 class_invariance_score = 1. - np.divide(
#                         mse_class_mean, mse_matrix_mean)
#                 results_dict['class_invariance'][layer_name].update(
#                         {cl: class_invariance_score})
                class_invariance_scores_da.append(
                        1. - da.divide(da.mean(mse_class),
                                       da.mean(mse_matrix_da)))

        # Compute data augmentation invariance
        print('\nComputing data augmentation invariance\n')
        mse_daug_da = []

        results_dict['daug_invariance'].update({layer_name: {}})

        for r in range(n_daug_rep):
            print('Repetition {}'.format(r))

            image_gen_daug = get_generator(images, **daug_params)
            batch_gen_daug = generate_batches(image_gen_daug, images, labels, 
                                              batch_size, aug_per_im=1, 
                                              shuffle=False)

            activations_layer_daug_da = []

            # Iterate over the data set in batches to compute activations
            init = 0
            # iterate over the *augmented* batch generator for this repetition
            for batch_images, batch_labels in tqdm(
                    batch_gen_daug, total=n_batches_per_epoch):

                batch_size = batch_images.shape[0]
                end = init + batch_size

                # Get and store activations
                activations = activation_function([batch_images, 0])[0]
                activations_layer_daug_da.append(da.from_array(
                    activations, chunks=activations.shape))

                init = end
                if init == n_images:
                    break

            activations_layer_daug_da = da.concatenate(
                    activations_layer_daug_da, axis=0)
            activations_layer_daug_da = activations_layer_daug_da.reshape(
                    (n_images, -1))
            activations_layer_daug_da = activations_layer_daug_da.rechunk(
                    (activations_layer_daug_da.chunksize[0],
                     activations_layer_daug_da.shape[-1]))

            # Compute MSE daug
            mse_daug_da.append(da.mean(da.square(activations_layer_da - \
                                                 activations_layer_daug_da), 
                                       axis=1))

        mse_daug_da = da.stack(mse_daug_da, axis=1)

        mse_sum = da.repeat(da.reshape(da.sum(mse_matrix_da, axis=1),
                                       (n_images, 1)), n_daug_rep, axis=1)

        daug_invariance_score_da = 1 - n_images * da.divide(mse_daug_da, mse_sum)

        time1 = time()

        # Compute dask results and update results dict
        results_dask = da.compute(class_invariance_scores_da,
                                  daug_invariance_score_da)

        time2 = time()

        results_dict['class_invariance'][layer_name].update(
                {cl: cl_inv_score 
                    for cl, cl_inv_score in enumerate(results_dask[0])})
        results_dict['daug_invariance'].update({layer_name: 
            {r: daug_inv_score 
                for r, daug_inv_score in enumerate(results_dask[1].T)}})
    # Compute summary statistics of the norms across the channels
    for layer, layer_dict in results_dict['activations_norm'].items():
        results_dict['summary'].update({layer: {}})
        for norm_key, norm_dict in layer_dict.items():
            results_dict['summary'][layer].update({norm_key: {
                'mean': np.mean(norm_dict['mean']), 
                'std': np.mean(norm_dict['std'])}})

    return results_dict
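
The MSE matrix above relies on the expansion ||a_i - a_j||^2 = ||a_i||^2 - 2 a_i.a_j + ||a_j||^2, averaged over the feature dimension. A standalone sketch of just that step, with toy activations:

import dask.array as da

# toy activations: 100 samples with 64 features each
acts = da.random.random((100, 64), chunks=(25, 64))

r = da.reshape(da.sum(da.square(acts), axis=1), (-1, 1))
mse_matrix = (r - 2 * da.dot(acts, da.transpose(acts))
              + da.transpose(r)) / acts.shape[-1]
print(mse_matrix.shape)   # (100, 100)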
Exemplo n.º 28
0
 def _adapt_chunking(self, array, sig_dims):
     n_dimension = array.ndim
     # Handle chunked signal dimensions by merging just in case
     sig_dim_idxs = [*range(n_dimension)[-sig_dims:]]
     if any([len(array.chunks[c]) > 1 for c in sig_dim_idxs]):
         original_n_chunks = [len(c) for c in array.chunks]
         array = array.rechunk({idx: -1 for idx in sig_dim_idxs})
         log.warning('Merging sig dim chunks as LiberTEM does not '
                     'support partitioning along the sig axes. '
                     f'Original n_blocks: {original_n_chunks}. '
                     f'New n_blocks: {[len(c) for c in array.chunks]}.')
     # Warn if there is no nav_dim chunking
     n_nav_chunks = [len(dim_chunking) for dim_chunking in array.chunks[:-sig_dims]]
     if set(n_nav_chunks) == {1}:
         log.warning('Dask array is not chunked in navigation dimensions, '
                     'cannot split into nav-partitions without loading the '
                     'whole dataset on each worker. '
                     f'Array shape: {array.shape}. '
                     f'Chunking: {array.chunks}. '
                     f'Array size: {array.nbytes / 1e6:.1f} MB.')
         # If we are here there is nothing else to do.
         return array
     # Orient the nav dimensions so that the zeroth dimension is the most
     # chunked; note that this changes the dataset nav_shape!
     if not self._preserve_dimension:
         n_nav_chunks = [len(dim_chunking) for dim_chunking in array.chunks[:-sig_dims]]
         nav_sort_order = np.argsort(n_nav_chunks)[::-1].tolist()
         sort_order = nav_sort_order + sig_dim_idxs
         if not np.equal(sort_order, np.arange(n_dimension)).all():
             original_shape = array.shape
             original_n_chunks = [len(c) for c in array.chunks]
             array = da.transpose(array, axes=sort_order)
             log.warning('Re-ordered nav_dimensions to improve partitioning, '
                         'create the dataset with preserve_dimensions=True '
                         'to suppress this behaviour. '
                         f'Original shape: {original_shape} with '
                         f'n_blocks: {original_n_chunks}. '
                         f'New shape: {array.shape} with '
                         f'n_blocks: {[len(c) for c in array.chunks]}.')
     # Handle chunked nav_dimensions
     # We can allow nav_dimensions to be fully chunked (one chunk per element)
     # up-to-but-not-including the first non-fully chunked dimension. After this point
     # we must merge/rechunk all subsequent nav dimensions to ensure continuity
     # of frame indexes in a flattened nav dimension. This should be removed
     # if/when we allow non-contiguous flat_idx Partitions
     nav_rechunk_dict = {}
     for dim_idx, dim_chunking in enumerate(array.chunks[:-sig_dims]):
         if set(dim_chunking) == {1}:
             continue
         else:
             merge_dimensions = [*range(dim_idx + 1, n_dimension - sig_dims)]
             for merge_i in merge_dimensions:
                 if len(array.chunks[merge_i]) > 1:
                     nav_rechunk_dict[merge_i] = -1
     if nav_rechunk_dict:
         original_n_chunks = [len(c) for c in array.chunks]
         array = array.rechunk(nav_rechunk_dict)
         log.warning('Merging nav dimension chunks according to scheme '
                     f'{nav_rechunk_dict} as we cannot maintain continuity '
                     'of frame indexing in the flattened navigation dimension. '
                     f'Original n_blocks: {original_n_chunks}. '
                     f'New n_blocks: {[len(c) for c in array.chunks]}.')
     # Merge remaining chunks, maintaining C-ordering, until we reach a target
     # chunk size or a minimum number of partitions corresponding to the
     # number of workers
     new_chunking, min_size, max_size = merge_until_target(array, self._min_size)
     if new_chunking != array.chunks:
         original_n_chunks = [len(c) for c in array.chunks]
         chunksizes = get_chunksizes(array)
         orig_min, orig_max = chunksizes.min(), chunksizes.max()
         array = array.rechunk(new_chunking)
         log.warning('Applying re-chunking to increase minimum partition size. '
                     f'n_blocks: {original_n_chunks} => {[len(c) for c in array.chunks]}. '
                     f'Min chunk size {orig_min / 1e6:.1f} => {min_size / 1e6:.1f} MB, '
                     f'Max chunk size {orig_max / 1e6:.1f} => {max_size / 1e6:.1f} MB.')
     return array
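
A standalone illustration of the reordering step above: the most-chunked navigation dimension is moved to the front with da.transpose while the signal dimensions stay last. The shapes are toy values, for illustration only.

import numpy as np
import dask.array as da

arr = da.zeros((4, 8, 16, 16), chunks=(4, 1, 16, 16))
sig_dims = 2
sig_dim_idxs = [*range(arr.ndim)[-sig_dims:]]

# sort nav dims by how many chunks they have, most-chunked first
n_nav_chunks = [len(c) for c in arr.chunks[:-sig_dims]]
nav_sort_order = np.argsort(n_nav_chunks)[::-1].tolist()
arr_t = da.transpose(arr, axes=nav_sort_order + sig_dim_idxs)
print(arr_t.shape, [len(c) for c in arr_t.chunks])   # (8, 4, 16, 16) [8, 1, 1, 1]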
Exemplo n.º 29
0
            X.compute(), kernels_mean.compute(), kernels_cov.compute(),
            batch_size, total_kernels, input_size)

    times = []
    with Client('localhost:8001') as client:
        for n in range(5):
            # client.restart() # resets cluster
            # Do something using 'client'
            start = time.time()
            batches = []
            for i in range(batch_size):
                kernel_out = []
                for j in range(total_kernels):
                    mean = da.matmul(kernels_mean[j, :], X[i, :, :])
                    cov = da.matmul(
                        da.transpose(X[i, :, :]),
                        da.matmul(kernels_cov[j, :, :], X[i, :, :]))
                    z = mvn_random_DASK(mean, cov, total_samples,
                                        input_size**2)
                    g = relu(z)
                    mean_g = da.mean(g, axis=1)
                    kernel_out.append(mean_g)
                kernels_out = da.stack(kernel_out, axis=0)
                batches.append(kernels_out.compute())
                print('task graph complete')
            batches_out_result = np.stack(batches, axis=0)

            print("compute done")
            times.append(time.time() - start)
            if validate:
                print(
Exemplo n.º 30
0
    pytest.importorskip("numba", minversion="0.40.0")


functions = [
    lambda x: x,
    lambda x: da.expm1(x),
    lambda x: 2 * x,
    lambda x: x / 2,
    lambda x: x ** 2,
    lambda x: x + x,
    lambda x: x * x,
    lambda x: x[0],
    lambda x: x[:, 1],
    lambda x: x[:1, None, 1:3],
    lambda x: x.T,
    lambda x: da.transpose(x, (1, 2, 0)),
    lambda x: x.sum(),
    lambda x: x.mean(),
    lambda x: x.moment(order=0),
    pytest.param(
        lambda x: x.std(),
        marks=pytest.mark.xfail(
            reason="fixed in https://github.com/pydata/sparse/pull/243"
        ),
    ),
    pytest.param(
        lambda x: x.var(),
        marks=pytest.mark.xfail(
            reason="fixed in https://github.com/pydata/sparse/pull/243"
        ),
    ),
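
These lambdas are evidently applied to dask arrays by a parametrized test harness that is not shown here; a quick standalone check of the da.transpose entry on a toy array:

import dask.array as da

x = da.random.random((2, 3, 4), chunks=(1, 3, 4))
func = lambda x: da.transpose(x, (1, 2, 0))
print(func(x).shape)   # (3, 4, 2)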
Exemplo n.º 31
0
    def _read_image(self, image_group, image_sub_group_key):
        """ Return a dictionary ready to parse of return to io module"""
        image_sub_group = image_group[image_sub_group_key]
        original_metadata = _parse_metadata(image_group, image_sub_group_key)
        original_metadata.update(self.original_metadata)
        if 'Detector' in original_metadata['BinaryResult'].keys():
            self.detector_name = _parse_detector_name(original_metadata)

        read_stack = (self.load_SI_image_stack or self.im_type == 'Image')
        h5data = image_sub_group['Data']
        # Get the scanning area shape of the SI from the images
        self.spatial_shape = h5data.shape[:-1]
        # Set the axes in frame, y, x order
        if self.lazy:
            data = da.transpose(
                da.from_array(
                    h5data,
                    chunks=h5data.chunks),
                axes=[2, 0, 1])
        else:
            # Workaround for a h5py bug https://github.com/h5py/h5py/issues/977
            # Change back to standard API once issue #977 is fixed.
            # Preallocate the numpy array and use read_direct method, which is
            # much faster in case of chunked data.
            data = np.empty(h5data.shape)
            h5data.read_direct(data)
            data = np.rollaxis(data, axis=2)

        pix_scale = original_metadata['BinaryResult'].get(
            'PixelSize', {'height': 1.0, 'width': 1.0})
        offsets = original_metadata['BinaryResult'].get(
            'Offset', {'x': 0.0, 'y': 0.0})
        original_units = original_metadata['BinaryResult'].get(
            'PixelUnitX', '')

        axes = []
        # if not reading the full image stack, keep only the first frame
        if not read_stack:
            data = data[0:1, ...]

        if data.shape[0] == 1:
            # Squeeze
            data = data[0, ...]
            i = 0
        else:
            frame_time = original_metadata['Scan']['FrameTime']
            frame_time, time_unit = self._convert_scale_units(
                frame_time, 's', 2 * data.shape[0])
            axes.append({'index_in_array': 0,
                         'name': 'Time',
                         'offset': 0,
                         'scale': frame_time,
                         'size': data.shape[0],
                         'units': time_unit,
                         'navigate': True})
            i = 1
        scale_x = self._convert_scale_units(
            pix_scale['width'], original_units, data.shape[i + 1])
        scale_y = self._convert_scale_units(
            pix_scale['height'], original_units, data.shape[i])
        offset_x = self._convert_scale_units(
            offsets['x'], original_units, data.shape[i + 1])
        offset_y = self._convert_scale_units(
            offsets['y'], original_units, data.shape[i])
        axes.extend([{'index_in_array': i,
                      'name': 'y',
                      'offset': offset_y[0],
                      'scale': scale_y[0],
                      'size': data.shape[i],
                      'units': scale_y[1],
                      'navigate': False},
                     {'index_in_array': i + 1,
                      'name': 'x',
                      'offset': offset_x[0],
                      'scale': scale_x[0],
                      'size': data.shape[i + 1],
                      'units': scale_x[1],
                      'navigate': False}
                     ])

        md = self._get_metadata_dict(original_metadata)
        md['Signal']['signal_type'] = 'image'
        if self.detector_name is not None:
            original_metadata['DetectorMetadata'] = _get_detector_metadata_dict(
                original_metadata,
                self.detector_name)
        if hasattr(self, 'map_label_dict'):
            if image_sub_group_key in self.map_label_dict:
                md['General']['title'] = self.map_label_dict[image_sub_group_key]

        return {'data': data,
                'axes': axes,
                'metadata': md,
                'original_metadata': original_metadata,
                'mapping': self._get_mapping(map_selected_element=False,
                                             parse_individual_EDS_detector_metadata=False)}