# Standard and third-party imports used throughout this section. The helpers
# volume.subsample_volume, simple_progress_bar, grid_to_idx, vector_to_volume,
# volume_to_vector, FileDataReader, and MemoryDataReader are assumed to be
# provided elsewhere in this package.
import os
import sys

import h5py
import numpy as np
from scipy.ndimage import gaussian_filter1d


def subsample_volume(input_name, factor=2, output_name=None):
    '''Spatially subsample each frame of a timeseries volume.

    :param input_name: Input h5 filename containing 'timeseries_volume' and
        'vol_shape' datasets.
    :param factor: Spatial subsampling factor.
    :param output_name: Output h5 filename. Defaults to
        '<input>_subsampled_<factor>x.h5'.
    :returns: The output filename. If the output file already exists, it is
        returned without recomputing.
    '''
    # Read input
    input_h5 = h5py.File(input_name, 'r')
    input_volume = input_h5['timeseries_volume']
    vol_shape = input_h5['vol_shape'][...]

    # Set up output
    if output_name is None:
        output_name = '%s_subsampled_%dx.h5' % (
            os.path.splitext(input_name)[0], factor)
    if os.path.exists(output_name):
        input_h5.close()
        return output_name
    output_h5 = h5py.File(output_name, 'w')

    output_volume = None
    for t in range(input_volume.shape[0]):
        x, new_vol_shape = volume.subsample_volume(input_volume[t, :],
                                                   vol_shape, factor)
        if output_volume is None:
            # Allocate the output dataset once the subsampled size is known.
            data_shape = (input_volume.shape[0], x.shape[0])
            output_volume = output_h5.create_dataset('timeseries_volume',
                                                     data_shape, 'f')
        output_volume[t, :] = x
        simple_progress_bar(t, input_volume.shape[0])

    print('Saving volume to', output_name)
    output_h5.create_dataset('vol_shape', data=new_vol_shape)
    output_h5.flush()
    output_h5.close()
    input_h5.close()
    return output_name
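# Usage sketch for subsample_volume (hypothetical filename). The input file is
# assumed to contain the 'timeseries_volume' and 'vol_shape' datasets used
# throughout this module:
#
#   out_name = subsample_volume('recording.h5', factor=2)
#   with h5py.File(out_name, 'r') as f:
#       print(f['vol_shape'][...], f['timeseries_volume'].shape)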
def matlabv7_to_h5(input_filename, data_name, output_filename=None):
    '''Convert a MATLAB v7.3 (HDF5-based) .mat file into this package's h5
    layout, writing 'vol_shape' and 'timeseries_volume' datasets.

    :param input_filename: Input .mat (v7.3) filename.
    :param data_name: Name of the timeseries variable inside the .mat file.
    :param output_filename: Output h5 filename. Defaults to '<input>.h5'.
    :returns: The output filename.
    '''
    if output_filename is None:
        output_filename = os.path.splitext(input_filename)[0] + '.h5'
    f = h5py.File(input_filename, 'r')
    matdata = f[data_name]
    fout = h5py.File(output_filename, 'w')

    # MATLAB stores arrays in column-major (Fortran) order, so the spatial
    # dimensions come out reversed relative to numpy's row-major convention.
    img_shape = matdata.shape[1:][::-1]
    vol_shape = np.array((1,) + img_shape)
    data_shape = (matdata.shape[0], np.prod(vol_shape))
    fout.create_dataset('vol_shape', data=vol_shape)
    output_volume = fout.create_dataset('timeseries_volume', data_shape, 'f')
    for t in range(matdata.shape[0]):
        simple_progress_bar(t, matdata.shape[0])
        # Reshape each frame into the volume, then flatten in Fortran order
        # to match MATLAB's memory layout.
        output_volume[t] = np.reshape(
            np.reshape(matdata[t], vol_shape[::-1]),
            np.prod(vol_shape), order='F')
    fout.close()
    f.close()
    return output_filename
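# Usage sketch for matlabv7_to_h5 (hypothetical filenames and variable name):
# convert a MATLAB v7.3 recording whose frames live in a variable 'images',
# then subsample the result.
#
#   h5_name = matlabv7_to_h5('session1.mat', 'images')
#   subsample_volume(h5_name, factor=2)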
def chunked_map_ooc(datareader, fn, targs=None, chunk_args=None, chunks=20,
                    output_filename=None, n_jobs=-1, aout=True, preproc=None,
                    vol_shape_arg=True, overwrite=True, transpose=False,
                    subsamp_factor=1, **kwargs):
    '''Apply a function to chunks of the vectorized dataset, out of core.

    :param datareader: DataReader over the input dataset
    :param fn: Function to apply to each frame of the dataset
    :param targs: Argument for each frame of the dataset
    :param chunk_args: Argument for each temporal chunk
    :param chunks: Number of chunks to split the data into
    :param n_jobs: Number of parallel jobs to use
    :param aout: Array output. Reduces the output of `fn` to an array, either
        in memory or in the h5 file `output_filename` if it is not None. If
        aout is True we return a DataReader, otherwise we return a list
        containing the function outputs.
    :param output_filename: h5 file to write output to if aout=True
    :param preproc: Optional function applied to each chunk before `fn`
    :param vol_shape_arg: Pass the volume shape as the second argument to
        `fn` (and to `preproc`)
    :param transpose: Chunk over voxels instead of timesteps
    :param subsamp_factor: Temporal subsampling factor used to size the output
    :param **kwargs: Additional kwargs to be passed to `fn`
    '''
    # Using numpy within multiprocessing segfaults on Mac.
    # See: http://mail.python.org/pipermail/python-ideas/2012-November/017932.html
    if sys.platform == 'darwin':
        n_jobs = 1
    # Package-local readers (this package's io module, not the stdlib).
    from io import FileDataReader, MemoryDataReader
    from joblib import Parallel, delayed

    out = None
    ntimesteps = datareader.ntimesteps
    nvoxels = datareader.nvoxels
    ntimesteps_out = int(
        np.floor(datareader.ntimesteps / float(subsamp_factor)))
    vol_shape = datareader.vol_shape
    if transpose:
        input_tchunks = np.array_split(np.arange(nvoxels), chunks)
    else:
        input_tchunks = np.array_split(np.arange(ntimesteps), chunks)

    tidx = 0
    for t, tchunk in enumerate(input_tchunks):
        simple_progress_bar(t, len(input_tchunks))
        if tchunk.size:
            # Select the appropriate per-frame or per-chunk arguments.
            if targs is not None:
                args = targs[tchunk]
            elif chunk_args is not None:
                args = [chunk_args[t]] * len(tchunk)
            else:
                args = None

            # Read a chunk of data.
            if transpose:
                data = datareader.read(None, tchunk)
            else:
                data = datareader.read(tchunk, None)
            if preproc:
                if vol_shape_arg:
                    data, vol_shape = preproc(data, datareader.vol_shape)
                else:
                    data = preproc(data).astype(data.dtype)

            # Compute fn in parallel over the frames of this chunk.
            #TODO(ben): clean up/simplify
            if fn is not None:
                if vol_shape_arg:
                    if args is not None:
                        tout = Parallel(n_jobs=n_jobs)(
                            delayed(fn)(datum, vol_shape, arg, **kwargs)
                            for datum, arg in zip(data, args))
                    else:
                        tout = Parallel(n_jobs=n_jobs)(
                            delayed(fn)(datum, vol_shape, **kwargs)
                            for datum in data)
                else:
                    if args is not None:
                        tout = Parallel(n_jobs=n_jobs)(
                            delayed(fn)(datum, arg, **kwargs)
                            for datum, arg in zip(data, args))
                    else:
                        tout = Parallel(n_jobs=n_jobs)(
                            delayed(fn)(datum, **kwargs) for datum in data)
            else:
                tout = [data]

            if aout:
                # Compress output into an array.
                tout = np.vstack(tout)
                if out is None:
                    if transpose:
                        shape = (tout.shape[0], nvoxels)
                    else:
                        shape = (ntimesteps_out,) + tout.shape[1:]
                    if output_filename is None:
                        out = np.zeros(shape, dtype=tout.dtype)
                    else:
                        f = h5py.File(output_filename, 'w')
                        f.create_dataset('vol_shape', data=vol_shape)
                        out = f.create_dataset('timeseries_volume', shape,
                                               dtype=tout.dtype)
                if transpose:
                    # In transpose mode each chunk fills columns, so advance
                    # the running index by the number of columns produced.
                    tchunk_out = np.arange(tidx, tidx + tout.shape[1])
                    out[:, tchunk_out] = tout
                    tidx += tout.shape[1]
                else:
                    tchunk_out = np.arange(tidx, tidx + tout.shape[0])
                    out[tchunk_out, :] = tout
                    tidx += tout.shape[0]
            else:
                # Keep output in a big list.
                if out is None:
                    out = []
                out.extend(tout)

    if aout:
        if output_filename is not None:
            f.close()
            outreader = FileDataReader(output_filename)
        else:
            outreader = MemoryDataReader(out, vol_shape)
        return outreader
    else:
        return out
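# Usage sketch for chunked_map_ooc. `blur_frame` is a hypothetical per-frame
# function (it assumes scipy.ndimage.gaussian_filter and the vector/volume
# helpers from this package); the filenames are illustrative only:
#
#   from scipy.ndimage import gaussian_filter
#
#   def blur_frame(frame, vol_shape, sigma=1.0):
#       vol = vector_to_volume(frame, vol_shape)
#       return volume_to_vector(gaussian_filter(vol, sigma))
#
#   reader = FileDataReader('recording.h5')
#   blurred = chunked_map_ooc(reader, blur_frame, chunks=50,
#                             output_filename='recording_blurred.h5',
#                             sigma=2.0)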
def load_data(input_filename, time_window=None, crops=None, big_data=False,
              subsample_time=1, output_filename=None, chunks=100,
              min_sub=False, max_proj=False, smooth=False):
    '''Load dataset into a DataReader object. Can be read into memory
    using .read().

    :param input_filename: Input h5 filename
    :param time_window: tuple of [start_time, end_time]; an end_time of -1
        means the last timestep
    :param subsample_time: Subsampling factor in time. Unless smooth=True,
        no smoothing is done before subsampling.
    :param crops: crop volumes according to a list of 3 tuples:
        [(xmin, xmax), (ymin, ymax), (zmin, zmax)]
    :param big_data: Write output to an h5 file if cropped, and return a
        FileDataReader
    :param output_filename: Output file to write if using big_data (optional)
    :param chunks: Read the dataset in this many chunks.
    :param min_sub: Subtract the per-voxel minimum from the dataset (requires
        a 'vol_min' dataset written by update_timeseries_stats)
    :param max_proj: Replace each frame with its maximum-intensity projection
    :param smooth: Whether to smooth frames with a temporal Gaussian filter
        (sigma = subsample_time / 4) before subsampling.
    '''
    if big_data and output_filename is None:
        output_filename = os.path.splitext(input_filename)[0] + '_cropped.h5'
    input_data = FileDataReader(input_filename)
    vol_shape = input_data.vol_shape
    time_window = time_window if time_window is not None else [
        0, input_data.ntimesteps
    ]
    time_window = np.array(time_window)
    if time_window[1] == -1:
        time_window[1] = input_data.ntimesteps
    time_list = np.arange(time_window[0], time_window[1], subsample_time)
    #XXX(ben): plain subsampling here discards frames rather than
    # averaging them; pass smooth=True to filter before subsampling.
    ntimesteps = len(time_list)
    do_crop = crops is not None and np.any(np.asarray(crops) > 0)
    do_time = ntimesteps != input_data.ntimesteps
    if min_sub:
        if 'vol_min' not in input_data.file:
            raise ValueError('Run update_timeseries_stats on %s' %
                             input_filename)
        vol_min = input_data.file['vol_min'][...]

    # Nothing to do: return the data unchanged.
    if not (do_crop or do_time or min_sub or max_proj):
        if big_data:
            return input_data
        else:
            return MemoryDataReader(input_data.read(), input_data.vol_shape)

    if do_crop:
        idx, new_vol_shape = grid_to_idx(vol_shape, crops[0], crops[1],
                                         crops[2])
    else:
        idx = slice(None)
        new_vol_shape = vol_shape
    if max_proj:
        new_vol_shape = np.array(new_vol_shape)
        new_vol_shape_full = new_vol_shape.copy()
        new_vol_shape[0] = 1
    nvoxels = np.prod(new_vol_shape)

    # Allocate memory or file
    if big_data:
        print('Writing output to', output_filename)
        output_file = h5py.File(output_filename, 'w')
        output_file.create_dataset('vol_shape', data=new_vol_shape)
        data = output_file.create_dataset('timeseries_volume',
                                          (ntimesteps, nvoxels),
                                          dtype=input_data.data.dtype)
    else:
        data = np.zeros((ntimesteps, nvoxels), dtype=input_data.data.dtype)

    # Read dataset into memory or file
    if subsample_time != 1 and smooth:
        # When smoothing, read at the full rate and subsample after
        # filtering. Any remainder that does not divide evenly into the
        # chunks is handled as one extra chunk.
        ntimesteps_last = (time_window[1] - time_window[0]) % (subsample_time
                                                               * chunks)
        tlist_full = np.arange(time_window[0], time_window[1])
        input_tchunks = np.array_split(
            tlist_full[:len(tlist_full) - ntimesteps_last], chunks)
        if ntimesteps_last > 0:
            input_tchunks.append(tlist_full[-ntimesteps_last:])
            chunks += 1
    else:
        input_tchunks = np.array_split(time_list, chunks)

    tidx = 0
    sigma = subsample_time / 4.0
    for i, input_tchunk in enumerate(input_tchunks):
        simple_progress_bar(i, chunks)
        if input_tchunk.size:  # array_split can produce empty arrays
            # h5 only supports one indexing vector
            dchunk = input_data.read(input_tchunk, slice(None))
            if dchunk.ndim == 1:
                dchunk = dchunk.reshape(1, -1)
            dchunk = dchunk[:, idx]
            if subsample_time != 1 and smooth:
                dchunk = gaussian_filter1d(dchunk, sigma=sigma,
                                           axis=0)[::subsample_time]
            if min_sub:
                dchunk -= vol_min[idx]
            output_tchunk = np.arange(tidx, tidx + dchunk.shape[0])
            tidx += dchunk.shape[0]
            if max_proj:
                for t in range(dchunk.shape[0]):
                    data[output_tchunk[t]] = volume_to_vector(
                        vector_to_volume(
                            dchunk[t],
                            new_vol_shape_full).max(2)[:, :, np.newaxis])
            else:
                data[output_tchunk, :] = dchunk

    # Return data as a DataReader object
    if big_data:
        output_file.close()
        out_data = FileDataReader(output_filename)
    else:
        out_data = MemoryDataReader(data, new_vol_shape)
    return out_data
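# Usage sketch for load_data (hypothetical filename and crop values): load a
# cropped, min-subtracted time window into memory. min_sub=True assumes
# update_timeseries_stats has already written a 'vol_min' dataset:
#
#   reader = load_data('recording.h5', time_window=[0, 1000],
#                      crops=[(10, 200), (10, 200), (0, 5)], min_sub=True)
#   X = reader.read()            # (ntimesteps, nvoxels) array
#   vol_shape = reader.vol_shape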