def _check_data_shape_dask(data, input_idxs):
    """Check data shape and adjust if necessary."""
    # Handle multiple datasets
    if data.ndim > 2 and data.shape[0] * data.shape[1] == input_idxs.shape[0]:
        data = da.reshape(data, (data.shape[0] * data.shape[1], data.shape[2]))
    # Also ravel single dataset
    elif data.shape[0] != input_idxs.size:
        data = da.ravel(data)

    # Ensure two dimensions
    if data.ndim == 1:
        data = da.reshape(data, (data.size, 1))

    return data
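# A minimal usage sketch (hypothetical shapes, not from the original module) of
# _check_data_shape_dask above: a (2, 3, 4) stack whose first two dimensions
# match the number of input indices is collapsed to (6, 4), and a 1-D array is
# promoted to a single-column 2-D array.
import numpy as np
import dask.array as da

input_idxs = np.zeros((6, 2))                          # dummy index array for 6 points
stacked = da.ones((2, 3, 4), chunks=(1, 3, 4))         # multiple datasets stacked
print(_check_data_shape_dask(stacked, input_idxs).shape)   # (6, 4)

flat = da.ones(6, chunks=3)                            # single 1-D dataset
print(_check_data_shape_dask(flat, input_idxs).shape)      # (6, 1)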
def _matvec(self, x):
    if self.reshape:
        x = da.reshape(x, self.dims)
    y = da.roll(x, shift=self.shift, axis=self.dir)
    y = y.rechunk(x.chunks)
    return y.ravel()
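# A standalone sketch (hypothetical dims/shift, not part of the operator class)
# of the roll-and-ravel pattern used by _matvec: reshape the flat input to its
# N-D dims, roll along one axis, then flatten back so the operator remains a
# 1-D -> 1-D mapping.
import dask.array as da

dims, shift, axis = (3, 4), 2, 1
x = da.arange(12, chunks=4)                                # flat model vector
y = da.roll(da.reshape(x, dims), shift=shift, axis=axis).ravel()
print(y.compute())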
def get_sample_from_bil_info(self, data, fill_value=np.nan, output_shape=None):
    if fill_value is None:
        fill_value = np.nan
    # FIXME: can this be made into a dask construct?
    cols, lines = np.meshgrid(np.arange(data['x'].size),
                              np.arange(data['y'].size))
    cols = da.ravel(cols)
    lines = da.ravel(lines)
    try:
        self.valid_input_index = self.valid_input_index.compute()
    except AttributeError:
        pass
    vii = self.valid_input_index.squeeze()
    try:
        self.index_array = self.index_array.compute()
    except AttributeError:
        pass

    # ia contains reduced (valid) indices of the source array, and has the
    # shape of the destination array
    ia = self.index_array
    rlines = lines[vii][ia]
    rcols = cols[vii][ia]

    slices = []
    mask_slices = []
    mask_2d_added = False
    coords = {}
    try:
        # FIXME: Use same chunk size as input data
        coord_x, coord_y = self.target_geo_def.get_proj_vectors_dask()
    except AttributeError:
        coord_x, coord_y = None, None

    for _, dim in enumerate(data.dims):
        if dim == 'y':
            slices.append(rlines)
            if not mask_2d_added:
                mask_slices.append(ia >= self.target_geo_def.size)
                mask_2d_added = True
            if coord_y is not None:
                coords[dim] = coord_y
        elif dim == 'x':
            slices.append(rcols)
            if not mask_2d_added:
                mask_slices.append(ia >= self.target_geo_def.size)
                mask_2d_added = True
            if coord_x is not None:
                coords[dim] = coord_x
        else:
            slices.append(slice(None))
            mask_slices.append(slice(None))
            try:
                coords[dim] = data.coords[dim]
            except KeyError:
                pass

    res = data.values[slices]
    res[mask_slices] = fill_value

    try:
        p_1 = res[:, :, 0]
        p_2 = res[:, :, 1]
        p_3 = res[:, :, 2]
        p_4 = res[:, :, 3]
    except IndexError:
        p_1 = res[:, 0]
        p_2 = res[:, 1]
        p_3 = res[:, 2]
        p_4 = res[:, 3]

    s__, t__ = self.bilinear_s, self.bilinear_t

    res = (p_1 * (1 - s__) * (1 - t__) +
           p_2 * s__ * (1 - t__) +
           p_3 * (1 - s__) * t__ +
           p_4 * s__ * t__)

    epsilon = 1e-6
    data_min = da.nanmin(data) - epsilon
    data_max = da.nanmax(data) + epsilon

    idxs = (res > data_max) | (res < data_min)
    res = da.where(idxs, fill_value, res)
    shp = self.target_geo_def.shape
    if data.ndim == 3:
        res = da.reshape(res, (res.shape[0], shp[0], shp[1]))
    else:
        res = da.reshape(res, (shp[0], shp[1]))

    res = DataArray(da.from_array(res, chunks=CHUNK_SIZE),
                    dims=data.dims, coords=coords)

    return res
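# A tiny standalone check (hypothetical values, not from the resampler) of the
# bilinear weighting used above: the four weights built from the fractional
# distances s__ and t__ always sum to 1, and the result is the weighted mix of
# the four neighbouring pixels p_1 .. p_4.
import numpy as np

s__, t__ = 0.25, 0.75
w = np.array([(1 - s__) * (1 - t__), s__ * (1 - t__), (1 - s__) * t__, s__ * t__])
assert np.isclose(w.sum(), 1.0)
p = np.array([10.0, 20.0, 30.0, 40.0])      # p_1 .. p_4, dummy pixel values
print(float(p @ w))                          # interpolated value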
def importData(self, samplesToRun, maxNJetBin = 11):
    #variables to train
    variables = self.getList()

    f = h5py.File(samplesToRun[0], "r")
    columnHeaders = f["EventShapeVar"].attrs["column_headers"]
    f.close()

    for v in variables:
        if not v in columnHeaders:
            print "Variable not found: %s"%v

    #load data files
    dsets = [h5py.File(filename, mode='r')['EventShapeVar'] for filename in samplesToRun]
    arrays = [da.from_array(dset, chunks=(65536, 1024)) for dset in dsets]
    x = da.concatenate(arrays, axis=0)

    #setup and get data
    dataColumns = np.array([np.flatnonzero(columnHeaders == v)[0] for v in variables])
    data = x[:,dataColumns]
    npyInputData = data.compute()
    #print data.shape

    #setup and get labels
    npyInputAnswers = np.zeros((npyInputData.shape[0], 2))
    if self.signal:
        npyInputAnswers[:,0] = 1
    else:
        npyInputAnswers[:,1] = 1

    #setup and get domains
    domainColumnNames = ["NGoodJets_double"]
    #maxNJetBin = 11
    domainColumns = np.array([np.flatnonzero(columnHeaders == v)[0] for v in domainColumnNames])
    inputDomains = x[:,domainColumns]
    tempInputDomains = inputDomains.astype(int)
    tempInputDomains = da.reshape(tempInputDomains, [-1])
    tempInputDomains[tempInputDomains > maxNJetBin] = maxNJetBin
    minNJetBin = tempInputDomains.min().compute()
    numDomains = maxNJetBin + 1 - minNJetBin
    tempInputDomains = tempInputDomains - tempInputDomains.min()
    d = np.zeros((npyInputData.shape[0], numDomains))
    d[np.arange(d.shape[0]), tempInputDomains] = 1

    #setup and get weights
    wgtColumnNames = ["Weight"]
    wgtColumns = np.array([np.flatnonzero(columnHeaders == v)[0] for v in wgtColumnNames])
    npyInputSampleWgts = x[:,wgtColumns].compute()

    #NJet
    npyNJet = np.zeros((npyInputData.shape[0], 1))
    for i in range(0, len(d)):
        nJet = minNJetBin
        for j in range(len(d[i])):
            if d[i][j] == 1:
                break
            else:
                nJet += 1
        npyNJet[i][0] = int(nJet)

    return {"data":npyInputData, "labels":npyInputAnswers, "domain":d, "Weight":npyInputSampleWgts, "nJet":npyNJet}
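# A hedged aside (not part of the original class): because each row of d is
# one-hot, the NJet reconstruction loop above is equivalent to a vectorized
# lookup of the position of the 1 in each row. Written against the same local
# names (d, minNJetBin) used inside importData.
npyNJet_vec = (minNJetBin + np.argmax(d, axis=1)).reshape(-1, 1)
# npyNJet_vec matches the npyNJet filled by the explicit loop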
                dtype=np.int32)\
            for i in range(0,neff,chunk_size)])
    print " >> Expected shape:", eventId.shape

    # m0
    branches = ["m0"]
    m0 = da.concatenate([\
        da.from_delayed(\
            load_single(tree,i,i+chunk_size, branches),\
            shape=(chunk_size,),\
            dtype=np.float32)\
        for i in range(0,neff,chunk_size)])
    print " >> Expected shape:", m0.shape

    # EB rescaled by m0
    m0_reshape = da.reshape(m0, [-1,1,1,1])
    X_m0 = scale*X/m0_reshape
    print " >> Expected shape:", X_m0.shape

    # Likelihood weights
    if j == 0:
        #h, bins = da.histogram(m0, bins=31, range=[80., 390.])
        h, bins = da.histogram(m0, bins=152, range=[82., 386.])
    else:
        #h, bins = da.histogram(m0, bins=8, range=[70., 150.])
        h, bins = da.histogram(m0, bins=38, range=[74., 150.])
    h = h.compute()
    h = h*np.float32(neff)/np.float32(h.sum())
    binsLow = bins[:-1]
    lhood = 1./h
    lhood = lhood/lhood.sum()
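    # A hedged aside (not code from the original script): one way the
    # normalized per-bin weights above could be mapped back to a per-event
    # weight is to look up each event's m0 bin; bins and lhood come from the
    # block above, the lookup itself is an assumption.
    m0_np = m0.compute()
    bin_idx = np.clip(np.digitize(m0_np, bins) - 1, 0, len(lhood) - 1)
    event_wgt = lhood[bin_idx]      # weight ~ 1 / (events in that m0 bin), normalized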
def generate4D_frms6(data_dir, bin_factor=2, workers=0):
    current_dir = os.getcwd()
    os.chdir(data_dir)
    data_class = st.util.Frms6Reader()
    tot_files = 0
    for file in glob.glob("*.frms6"):
        tot_files += 1
    filesizes = np.zeros((tot_files, 4), dtype=int)
    filenames = np.zeros(tot_files, dtype=object)
    ii = 0
    for file in glob.glob("*.frms6"):
        fname = data_dir + file
        dshape = np.asarray(data_class.getDataShape(fname), dtype=int)
        filesizes[ii, 0:3] = dshape
        filesizes[ii, -1] = fname[-7]
        filenames[ii] = fname
        ii += 1
    os.chdir(current_dir)
    if workers == 0:
        workers = int(1 + tot_files)
    cluster = dd.LocalCluster(n_workers=workers)
    client = dd.Client(cluster)
    draw_shape = (np.mean(filesizes[filesizes[:, -1] != 0, 0:3], axis=0)).astype(int)
    dref_shape = filesizes[filesizes[:, -1] == 0, 0:3][0]
    data_shape = np.copy(dref_shape)
    data_shape[-1] = (np.sum(filesizes[:, -2]) - np.amin(filesizes[:, -2])).astype(int)
    individual_shape = np.zeros(4, dtype=int)
    individual_shape[0:3] = draw_shape
    individual_shape[-1] = int(tot_files - 1)
    data3d_before = []
    ii = np.arange(tot_files)[filesizes[:, -1] == 0][0]
    dark_read = dask.delayed(data_class.readData)(
        filenames[ii],
        image_range=(0, dref_shape[-1]),
        pixels_x=dref_shape[0],
        pixels_y=dref_shape[1],
    )
    dark_data = da.from_delayed(dark_read, filesizes[ii, 0:3], np.float32)
    del ii
    mean_dark_ref = da.mean(dark_data, axis=-1)
    for jj in np.arange(1, tot_files):
        ii = np.arange(tot_files)[filesizes[:, -1] == jj][0]
        test_read = dask.delayed(data_class.readData)(
            filenames[ii],
            image_range=(0, draw_shape[-1]),
            pixels_x=draw_shape[0],
            pixels_y=draw_shape[1],
        )
        test_data = da.from_delayed(test_read, filesizes[ii, 0:3], np.float32)
        test_data = test_data.rechunk((-1, -1, 256))
        data3d_before.append(test_data)
    data3d_dask = da.concatenate(data3d_before, axis=-1)
    data_shape = data3d_dask.shape
    con_shape = tuple((np.asarray(data_shape[0:2]) * np.asarray((0.5, 2))).astype(int))
    xvals = int(data_shape[-1] ** 0.5)
    d3r = da.transpose(data3d_dask, (2, 0, 1))
    d3s = d3r - mean_dark_ref
    d3D_dref = da.transpose(d3s, (1, 2, 0))
    top_part = d3D_dref[0 : con_shape[0], :, :]
    bot_part = d3D_dref[con_shape[0] : data_shape[0], :, :]
    top_part_rs = top_part[::-1, ::-1, :]
    data3d_arranged = da.concatenate([bot_part, top_part_rs], axis=1)
    shape4d = (con_shape[0], con_shape[1], xvals, xvals)
    data4d_dask = da.reshape(data3d_arranged, shape4d)
    bin_nums = int((xvals / bin_factor) ** 2)
    xvals_bin = int(xvals / bin_factor)
    if np.logical_not((np.mod(xvals, bin_factor)).astype(bool)):
        yyb = np.arange(data4d_dask.shape[2])[::bin_factor]
        xxb = np.arange(data4d_dask.shape[3])[::bin_factor]
        data3d_binY = da.reshape(
            data4d_dask[:, :, yyb, :],
            (con_shape[0], con_shape[1], int(xvals * xvals_bin)),
        )
        for ybf in np.arange(1, bin_factor):
            data3d_binY = data3d_binY + da.reshape(
                data4d_dask[:, :, yyb + ybf, :],
                (con_shape[0], con_shape[1], int(xvals * xvals_bin)),
            )
        data4d_binY = da.reshape(
            data3d_binY, (con_shape[0], con_shape[1], xvals_bin, xvals)
        )
        data3d_binYX = da.reshape(
            data4d_binY[:, :, :, xxb], (con_shape[0], con_shape[1], bin_nums)
        )
        for xbf in np.arange(1, bin_factor):
            data3d_binYX = data3d_binYX + da.reshape(
                data4d_binY[:, :, :, xxb + xbf],
                (con_shape[0], con_shape[1], bin_nums),
            )
        data4d_bin = da.reshape(
            data3d_binYX, (con_shape[0], con_shape[1], xvals_bin, xvals_bin)
        )
        data4D = data4d_bin.compute()
    else:
        data4D = data4d_dask.compute()
    cluster.close()
    return data4D
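# A standalone sketch (synthetic 4x4 array, not part of the function above) of
# the strided-slice binning used in generate4D_frms6: summing the slices taken
# every bin_factor-th column at offsets 0..bin_factor-1 is equivalent to
# block-summing along that axis.
import numpy as np
import dask.array as da

bin_factor = 2
img = da.arange(16, dtype=np.float32, chunks=4).reshape(4, 4)
cols = np.arange(img.shape[1])[::bin_factor]                 # 0, 2
binned = sum(img[:, cols + off] for off in range(bin_factor))
print(binned.compute())   # each output column is the sum of a pair of input columns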
nrg_2k.chunks

# Reading in all load files at once
files = sorted(glob.glob('Texas/*.hdf5'))
dsets = [h5py.File(f)['/load'] for f in files]

# Reshaping as a numpy array to give yearly, daily and 15-minute intervals of power readings
arrs = [np.array(d).reshape((1, 365, 96)) for d in dsets[1:]]

# Stacking into 4 years
arrs_stacked = np.concatenate(arrs, axis = 0)

# Converting to a dask array, with the first dimension as year
da_arrs = da.from_array(arrs_stacked)

# Alternative workflow all in dask with no intermediate NumPy
da_arrs2 = [da.from_array(d) for d in dsets[1:]]
da_arrs2_stack = da.stack(da_arrs2)
da_arrs2_rshp = da.reshape(da_arrs2_stack, (4, 365, 96))

# Working with dask dataframes
import dask.dataframe as dd
df = dd.read_csv('WDI.csv')

# Looking at all indicator filters
np.array(df['Indicator Code'].unique())

fil1 = df['Indicator Code'] == 'SP.POP.0014.TO.ZS'
fil2 = df['Region'] == 'East Asia & Pacific'

# Filtering
df1 = df.loc[fil1 & fil2]

# Basic grouping and plotting output
def main(argv=None):

    # cluster = LocalCluster(dashboard_address=None)
    # client = Client(cluster, memory_limit='{}GB'.format(FLAGS.memory_limit),
    #                 processes=False)

    K.set_floatx('float32')

    chunk_size = FLAGS.chunk_size

    # Read data set
    hdf5_file = h5py.File(FLAGS.data_file, 'r')
    images, labels, _ = hdf52dask(hdf5_file, FLAGS.group, chunk_size,
                                  shuffle=FLAGS.shuffle, seed=FLAGS.seed,
                                  pct=FLAGS.pct)
    n_images = images.shape[0]
    n_batches = int(np.ceil(n_images / float(FLAGS.batch_size)))

    # Data augmentation parameters
    daug_params_file = get_daug_scheme_path(FLAGS.daug_params, FLAGS.data_file)
    daug_params = yaml.load(open(daug_params_file, 'r'),
                            Loader=yaml.FullLoader)
    nodaug_params_file = get_daug_scheme_path('nodaug.yml', FLAGS.data_file)
    nodaug_params = yaml.load(open(nodaug_params_file, 'r'),
                              Loader=yaml.FullLoader)

    # Initialize the network model
    model_filename = FLAGS.model
    model = load_model(model_filename)

    # Print the model summary
    model.summary()

    # Get relevant layers
    if FLAGS.store_input:
        layer_regex = '({}|.*input.*)'.format(FLAGS.layer_regex)
    else:
        layer_regex = FLAGS.layer_regex

    layers = [layer.name for layer in model.layers
              if re.compile(layer_regex).match(layer.name)]

    # Create batch generators
    n_daug_rep = FLAGS.n_daug_rep
    n_diff_per_batch = int(FLAGS.batch_size / n_daug_rep)
    image_gen_daug = get_generator(images, **daug_params)
    batch_gen_daug = batch_generator(image_gen_daug, images, labels,
                                     batch_size=n_diff_per_batch,
                                     aug_per_im=n_daug_rep, shuffle=False)
    image_gen_nodaug = get_generator(images, **nodaug_params)
    batch_gen_nodaug = batch_generator(image_gen_nodaug, images, labels,
                                       FLAGS.batch_size, aug_per_im=1,
                                       shuffle=False)

    # Outputs
    if FLAGS.output_dir == '-1':
        FLAGS.output_dir = os.path.dirname(FLAGS.model)

    output_hdf5 = h5py.File(
        os.path.join(FLAGS.output_dir, FLAGS.output_mse_matrix_hdf5), 'w')
    output_pickle = os.path.join(FLAGS.output_dir, FLAGS.output_pickle)
    df_init_idx = 0
    df = pd.DataFrame()

    # Iterate over the layers
    for layer_idx, layer_name in enumerate(layers):

        # Reload the model
        if layer_idx > 0:
            K.clear_session()
            model = load_model(model_filename)

        layer = model.get_layer(layer_name)

        # Rename input layer
        if re.compile('.*input.*').match(layer_name):
            layer_name = 'input'

        hdf5_layer = output_hdf5.create_group(layer_name)

        activation_function = K.function([model.input,
                                          K.learning_phase()],
                                         [layer.output])

        print('\nComputing pairwise similarity at layer {}'.format(layer_name))

        # Compute activations of original data (without augmentation)
        a_nodaug_da = get_activations(activation_function, batch_gen_nodaug)
        a_nodaug_da = da.squeeze(a_nodaug_da)
        a_nodaug_da = da.rechunk(a_nodaug_da,
                                 (chunk_size, ) + (a_nodaug_da.shape[1:]))
        dim_activations = a_nodaug_da.shape[1]

        # Compute matrix of similarities
        r = da.reshape(da.sum(da.square(a_nodaug_da), axis=1), (-1, 1))
        mse_matrix = (r - 2 * da.dot(a_nodaug_da,
                                     da.transpose(a_nodaug_da)) \
                     + da.transpose(r)) / dim_activations

        # Compute activations with augmentation
        a_daug_da = get_activations(activation_function, batch_gen_daug)
        a_daug_da = da.rechunk(a_daug_da, (chunk_size, dim_activations, 1))

        # Compute similarity of augmentations with respect to the
        # activations of the original data
        a_nodaug_da = da.repeat(da.reshape(a_nodaug_da,
                                           a_nodaug_da.shape + (1, )),
                                repeats=n_daug_rep, axis=2)
        a_nodaug_da = da.rechunk(a_nodaug_da, (chunk_size, dim_activations, 1))
        mse_daug = da.mean(da.square(a_nodaug_da - a_daug_da), axis=1)

        # Compute invariance score
        mse_sum = da.repeat(da.reshape(da.sum(mse_matrix, axis=1),
                                       (n_images, 1)),
                            repeats=n_daug_rep, axis=1)
        mse_sum = da.rechunk(mse_sum, (chunk_size, 1))
        invariance = 1 - n_images * da.divide(mse_daug, mse_sum)

        print('Dimensionality activations: {}x{}x{}'.format(
            n_images, dim_activations, n_daug_rep))

        # Store HDF5 file
        if FLAGS.output_mse_matrix_hdf5:
            mse_matrix_ds = hdf5_layer.create_dataset(
                'mse_matrix', shape=mse_matrix.shape,
                chunks=mse_matrix.chunksize, dtype=K.floatx())
            mse_daug_ds = hdf5_layer.create_dataset(
                'mse_daug', shape=mse_daug.shape,
                chunks=mse_daug.chunksize, dtype=K.floatx())
            invariance_ds = hdf5_layer.create_dataset(
                'invariance', shape=invariance.shape,
                chunks=invariance.chunksize, dtype=K.floatx())

            time_init = time()
            with ProgressBar(dt=1):
                da.store([mse_matrix, mse_daug, invariance],
                         [mse_matrix_ds, mse_daug_ds, invariance_ds])
            time_end = time()
            print('Elapsed time: {}'.format(time_end - time_init))

            invariance = np.ravel(
                np.asarray(output_hdf5[layer_name]['invariance']))
        else:
            time_init = time()
            invariance = da.ravel(invariance).compute()
            time_end = time()
            print('Elapsed time: {}'.format(time_end - time_init))

        # Update pandas data frame for plotting
        df_end_idx = df_init_idx + n_images * n_daug_rep
        d = pd.DataFrame({'Layer': layer_name,
                          'sample': np.repeat(np.arange(n_images), n_daug_rep),
                          'n_daug': np.tile(np.arange(n_daug_rep), n_images),
                          'invariance': invariance},
                         index=np.arange(df_init_idx, df_end_idx).tolist())
        df = df.append(d)
        # Advance the running index so the next layer's rows follow contiguously
        df_init_idx = df_end_idx

    pickle.dump(df, open(output_pickle, 'wb'))
    output_hdf5.close()
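# A small numeric check (synthetic data, not part of the original script) of
# the pairwise-MSE identity used above to build mse_matrix:
#   mse[i, j] = ||a_i - a_j||^2 / D = (r_i - 2 a_i . a_j + r_j) / D,  r_i = ||a_i||^2
import numpy as np

a = np.random.default_rng(0).random((5, 3))              # 5 samples, D = 3 activations
r = np.sum(a ** 2, axis=1, keepdims=True)
mse_fast = (r - 2 * a @ a.T + r.T) / a.shape[1]
mse_slow = np.array([[np.mean((ai - aj) ** 2) for aj in a] for ai in a])
assert np.allclose(mse_fast, mse_slow)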