import os

import dask.array as da


def save_arr(arr, storage_type, file_path, key='/data', axis=0,
             chunks_shape=None, compression=None):
    """Save a dask array to an hdf5 dataset or to a numpy file stack."""
    if storage_type == "hdf5":
        if chunks_shape:
            print(f'Using chunk shape {chunks_shape}')
            da.to_hdf5(file_path, key, arr, chunks=chunks_shape)
        elif compression == "gzip":
            # compression is only honored when no explicit chunk shape is given
            print('Using gzip compression')
            da.to_hdf5(file_path, key, arr, chunks=None, compression="gzip")
        else:
            print('Without compression')
            da.to_hdf5(file_path, key, arr, chunks=None)
    elif storage_type == "numpy":
        da.to_npy_stack(os.path.join(file_path, "npy/"), arr, axis=axis)
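For reference, a minimal round trip through save_arr; the paths, shapes, and chunk sizes here are illustrative, not taken from the original code:

import numpy as np
import dask.array as da

arr = da.from_array(np.arange(24).reshape(4, 6), chunks=(2, 6))

# hdf5 target with an explicit chunk shape
save_arr(arr, "hdf5", "out.h5", key="/data", chunks_shape=(2, 6))

# numpy stack target: one .npy file per axis-0 chunk lands under out_dir/npy/
# (out_dir must already exist for the nested directory to be created)
save_arr(arr, "numpy", "out_dir", axis=0)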
import os
from time import time

import dask.array as da
import h5py


def to_npy_stack(source_h5_path, dest_path, verbose=False, channel_len=1048576):
    """Convert an original h5 file to a npy stack.

    :param source_h5_path: path of the source hdf5 file
    :param dest_path: directory to write the npy stack into
    :param verbose: print timing information
    :param channel_len: number of samples per channel block
    """
    if verbose:
        start = time()
        print("Converting to npy stack")
    h5_file = h5py.File(source_h5_path, "r")
    # num_chans_per_block is assumed to be defined at module level in the
    # original code
    print(channel_len * num_chans_per_block)
    arr = da.from_array(h5_file["data"],
                        chunks=(2, 1, channel_len * num_chans_per_block))
    if not os.path.isdir(dest_path):
        os.mkdir(dest_path)
    if not os.path.isdir(dest_path + "/original"):
        os.mkdir(dest_path + "/original")
    da.to_npy_stack(dest_path + "/original", arr, axis=2)
    if verbose:
        end = time()
        print("Converted to npy stack in %.4f seconds." % (end - start))
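A stack written this way can be reopened lazily; a minimal sketch, assuming the dest_path used above:

import dask.array as da

# mmap_mode='r' memory-maps the chunk files instead of reading them eagerly
arr = da.from_npy_stack(dest_path + "/original", mmap_mode="r")
print(arr.shape, arr.chunks)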
def save_arr(arr, storage_type, file_path, key='/data', axis=0, chunks_shape=None):
    """Save an array to an hdf5 dataset or to a numpy file stack."""
    if storage_type == "hdf5":
        if chunks_shape:
            da.to_hdf5(file_path, key, arr, chunks=chunks_shape)
        else:
            da.to_hdf5(file_path, key, arr)
    elif storage_type == "numpy":
        da.to_npy_stack(os.path.join(file_path, "npy/"), arr, axis=axis)
import os
import shutil
from tempfile import mkdtemp

import numpy as np
import dask.array as da


def test_to_npy_stack():
    x = np.arange(5 * 10 * 10).reshape((5, 10, 10))
    d = da.from_array(x, chunks=(2, 4, 4))
    dirname = mkdtemp()
    try:
        da.to_npy_stack(dirname, d, axis=0)
        assert os.path.exists(os.path.join(dirname, '0.npy'))
        # axis 0 is chunked as (2, 2, 1), so 1.npy holds rows 2:4
        assert (np.load(os.path.join(dirname, '1.npy')) == x[2:4]).all()

        e = da.from_npy_stack(dirname)
        # eq is the dask test-suite comparison helper (assert_eq in newer dask)
        assert eq(d, e)
    finally:
        shutil.rmtree(dirname)
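For orientation, the directory written by to_npy_stack holds one numbered .npy file per chunk along the split axis plus a pickled info file with the chunk metadata; inspecting the test directory above should show:

import os

print(sorted(os.listdir(dirname)))   # ['0.npy', '1.npy', '2.npy', 'info']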
def split_array(arr, f, nb_blocks=None):
    """Split an array given its chunk shape.

    Output is an hdf5 file with as many datasets as chunks (hdf5 path,
    currently commented out) or a npy stack (active path).

    Arguments:
    ----------
        arr: dask array to split
        f: open h5py file to write the datasets into (hdf5 path only)
        nb_blocks: number of blocks we want to extract
    """
    # arr_list = get_arr_list(arr, nb_blocks)

    # for hdf5:
    # datasets = list()
    # for i, a in enumerate(arr_list):
    #     datasets.append(f.create_dataset('/data' + str(i), shape=a.shape))
    # return da.store(arr_list, datasets, compute=False)

    # for numpy storage
    return da.to_npy_stack('data/numpy_data', arr, axis=0)
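A minimal sketch of calling the active path; the array and chunking are illustrative. Note that da.to_npy_stack computes eagerly and returns None, unlike the commented hdf5 path, which would return a delayed store:

import numpy as np
import dask.array as da

arr = da.from_array(np.ones((8, 8)), chunks=(2, 8))
# writes data/numpy_data/{0..3}.npy plus an info file; f is unused on this
# path, and the data/ parent directory must already exist
split_array(arr, f=None)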
import json

import numpy as np
import tensorflow as tf
import dask.array as da
import dask.bag as db

# load_graph and clearstring are project helpers defined elsewhere in the
# original code; the def line of str_idx is reconstructed from its call site
# below, and UNK is a global defined elsewhere


def str_idx(corpus, dic, maxlen):
    X = np.zeros((len(corpus), maxlen))
    for i in range(len(corpus)):
        for no, k in enumerate(corpus[i].split()[:maxlen][::-1]):
            X[i, -1 - no] = dic.get(k, UNK)
    return X


sentiment_label = ['negative', 'positive']
g_sentiment = load_graph('sentiment.pb')
x_sentiment = g_sentiment.get_tensor_by_name('import/Placeholder:0')
logits_sentiment = g_sentiment.get_tensor_by_name('import/logits:0')
sess_sentiment = tf.InteractiveSession(graph=g_sentiment)

with open('fast-text-sentiment.json') as fopen:
    dict_sentiment = json.load(fopen)


def classify(texts):
    batch_x_text = [clearstring(t) for t in texts]
    batch_x = str_idx(batch_x_text, dict_sentiment['dictionary'], 100)
    output_sentiment = sess_sentiment.run(
        logits_sentiment, feed_dict={x_sentiment: batch_x})
    # return a numpy array: map_blocks expects array output, and da.stack
    # cannot stack plain Python strings
    return np.array([sentiment_label[l] for l in np.argmax(output_sentiment, 1)])


b = db.read_text('big-text.txt')
# da.stack cannot consume a dask bag directly, so materialize the lines
# into an object-dtype array first
stacked = da.from_array(np.array(b.compute(), dtype=object), chunks=20)
result = stacked.map_blocks(classify, dtype=object).rechunk(-1)
da.to_npy_stack('./', result)
def to_stack(p, d):
    """Stack the list of arrays d along a new axis and write it to path p."""
    x = da.stack(d, axis=0)
    da.to_npy_stack(p, x, axis=0)
    return x
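A quick usage sketch with made-up arrays and an illustrative path:

import numpy as np
import dask.array as da

blocks = [da.from_array(np.full((3, 3), i)) for i in range(4)]
stacked = to_stack('stack_dir', blocks)   # writes stack_dir/{0..3}.npy + info
print(stacked.shape)                      # (4, 3, 3)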
# (fragment: these first statements sit inside an elided loop over data_idx)
tmp_data_holder = h5file_holder[file_name][data_name]
tmp_dask_data_holder = da.from_array(
    tmp_data_holder[data_ends_list[data_idx][0]:data_ends_list[data_idx][1]],
    chunks='auto')
dataset_holder.append(tmp_dask_data_holder)

# Create dask arrays based on these h5 files
dataset = da.concatenate(dataset_holder, axis=0)

# Calculate the correlation matrix
num_dim = len(dataset.shape)
inner_prod_matrix = da.tensordot(
    dataset, dataset,
    axes=(list(range(1, num_dim)), list(range(1, num_dim))))

# Save the distance patch. Despite the .npy suffix, to_npy_stack treats
# this name as a directory to create and fill with chunk files.
name_to_save = address_output + "/distances/patch_{}_{}.npy".format(
    comm_rank - 1, comm_rank - 1)
da.to_npy_stack(name_to_save, inner_prod_matrix)

# comm.Barrier()  # There is no need to synchronize here

# Step Four: Calculate the off-diagonal patch

# Construct the data for the off-diagonal patch
patch_number = len(job_list[comm_rank - 1]) - 1

for _local_idx in range(1, patch_number):
    # The first patch calculated for each row is the diagonal patch.
    # Work out which patch is to be processed
    job_idx = job_list[comm_rank - 1][_local_idx]
    col_info_holder = data_source.batch_ends_local[job_idx[1]]  # for different horizontal patches
import dask.array as da
import numpy as np

dirname = '../data/'
filename = dirname + 'gyroidUniform.npy'

# Memory-map the file as raw float64 data. Note: if this is a true .npy
# file with a header, np.load(filename, mmap_mode='r') is the safer choice,
# since np.memmap ignores the header bytes.
np_map = np.memmap(filename, dtype=np.float64, mode='r', shape=(200, 200, 200))

# Wrap the memmap in a dask array with 25-element chunks along each axis
dask_arr = da.from_array(np_map, chunks=25)

# Split into blocks: to_npy_stack keeps only axis 0 chunked, so this
# writes eight 25x200x200 files, 0.npy through 7.npy
da.to_npy_stack(dirname, dask_arr, axis=0)

# Load the blocks back one by one
blocks = [np.load(dirname + '{}.npy'.format(i)) for i in range(8)]
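Rather than eight separate np.load calls, the stack can also be reopened lazily as a single dask array; a short sketch using the dirname above:

stack = da.from_npy_stack(dirname, mmap_mode='r')
print(stack.chunks)   # expected: ((25,) * 8, (200,), (200,))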
def find_filename_data(au_emote_dict_loc, classifier_loc, real_time_file_loc,
                       out_loc, out_q, filename, return_plot_data=False,
                       event_delta_seconds=1):
    """Find PSD data for a given file and dump it.

    :param au_emote_dict_loc: location of au_emote_dict, in json format
    :param classifier_loc: location of pickled classifier
    :param real_time_file_loc: location of real times
    :param out_loc: where to dump PSD data
    :param out_q: output queue for multiprocessing
    :param filename: file to find data of
    :param return_plot_data: whether this is used for plotting
    :param event_delta_seconds: seconds around an event to do PSD of
    """
    tqdm_num, filename = filename
    tqdm_num = (tqdm_num % 5) + 1
    au_emote_dict = json.load(open(au_emote_dict_loc))
    raw = map_raw(filename)
    if raw is None:
        return
    events, times, corr = get_events(filename, au_emote_dict, classifier_loc,
                                     real_time_file_loc)
    if times is None:
        times = []
    # print("{0} number of times: {1}".format(filename, len(times)))
    if times:
        predicDic = {time: predic for time, predic in zip(times, corr)}
        eventTimes = set(x[0] for x in events)
        picks = mne.pick_types(raw.info, ecog=True, ecg=True)
        if return_plot_data:
            return get_window_data(raw, times, corr, picks, eventTimes,
                                   tqdm_num, filename, return_plot_data,
                                   event_delta_seconds)
        freqs, temp_all_data, temp_labels, temp_times, test_data, test_times = \
            get_window_data(raw, times, corr, picks, eventTimes, tqdm_num,
                            filename, return_plot_data, event_delta_seconds)
        if freqs is not None:
            freqs = da.from_array(freqs, chunks=(100, ))
            filename_out_dir = os.path.join(
                out_loc, 'classifier_data',
                os.path.basename(filename).replace('.edf', ''))
            if not os.path.exists(filename_out_dir):
                os.makedirs(filename_out_dir)
            da.to_npy_stack(os.path.join(filename_out_dir, 'freqs'), freqs)
            conditional_dump(temp_all_data,
                             os.path.join(filename_out_dir, 'data'))
            conditional_dump(temp_labels,
                             os.path.join(filename_out_dir, 'labels'))
            # the original passed `filename` here, which looks like a typo
            # for `filename_out_dir`
            conditional_dump(temp_times,
                             os.path.join(filename_out_dir, 'times'))
            conditional_dump(test_data,
                             os.path.join(filename_out_dir, 'test_data'))
            conditional_dump(test_times,
                             os.path.join(filename_out_dir, 'test_times'))
    # if temp_all_data is not None:
    #     da.to_npy_stack(
    #         os.path.join(filename_out_dir, 'data'),
    #         da.from_array(temp_all_data, chunks=(10000, -1, -1)))
    # if temp_labels is not None:
    #     da.to_npy_stack(
    #         os.path.join(filename_out_dir, 'labels'),
    #         da.from_array(temp_labels, chunks=(10000, )))
    # if temp_times is not None:
    #     da.to_npy_stack(
    #         os.path.join(filename_out_dir, 'times'),
    #         da.from_array(temp_times, chunks=(10000, )))
    out_q.put((filename, len(times)))
def conditional_dump(array, location):
    """Write array as a npy stack at location, skipping None inputs."""
    if array is not None:
        da.to_npy_stack(location, array)
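A tiny usage sketch with illustrative directory names:

import numpy as np
import dask.array as da

conditional_dump(None, 'skipped_dir')   # no-op: nothing is written
conditional_dump(da.from_array(np.arange(10), chunks=5), 'written_dir')
# written_dir/ now holds 0.npy, 1.npy, and an info metadata file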
# (fragment: the head of this for loop over block_array is elided upstream)
        pattern=pattern,
).generator():
    b = block_array
    print("Done:", b.shape)


if __name__ == '__main__':
    if len(list(Path('.').glob('fail.zarr'))) == 0:
        from numcodecs import Blosc
        compressor = Blosc()
        x = da.random.random((100, 100, 3), chunks=(100, 100, 3))
        x.to_zarr('fail.zarr', compressor=compressor, overwrite=True)

    if len(list(Path('.').glob('fail.npystack'))) == 0:
        x = da.random.random((100, 100, 3), chunks=(100, 100, 3))
        da.to_npy_stack('fail.npystack', x)

    from dask.distributed import LocalCluster, Client
    # cluster = LocalCluster(host='0.0.0.0', n_workers=1, threads_per_worker=1)
    # client = Client(cluster)  # 'fail'
    client = Client(threads_per_worker=1)  # 'fail'
    # client = Client(processes=False, threads_per_worker=1)  # 'not fail'

    load_zarr()  # 'fail' with zarr
    # load_zarr('fail.npystack')  # 'not fail' with npystack
tst_predictions = []
model.eval()
for data in tqdm(test):
    with torch.no_grad():
        # load data onto the gpu, then forward pass
        images = data['image'].cuda(non_blocking=True)
        output = model(images)
        pred = nn.Sigmoid()(output)
        tst_predictions.append(pred.detach().cpu().numpy())

tst_predictions = np.concatenate(tst_predictions, axis=0)

# create an array of labels that are all zeros and fill in the values from a
# combination of the ground truth labels from the training and validation sets
# and the predicted labels for unlabeled indices

# convert gt_labels from strings to integers
predicted_labels = (gt_labels == 'informative').astype(np.uint8)
predicted_labels[unlabeled_indices] = (tst_predictions[:, 0] > 0.5).astype(np.uint8)

print('Saving predictions...')
np.save(os.path.join(savedir, "nn_predictions.npy"), predicted_labels)

print('Saving filepaths...')
filtered_fpaths = da.from_array(impaths[predicted_labels == 1].compute())
# despite the .npz suffix, to_npy_stack creates a directory with this name
da.to_npy_stack(os.path.join(savedir, 'nn_filtered_fpaths.npz'), filtered_fpaths)

print('Finished.')
# remove all the matched images from both hashes and impaths
hashes = np.delete(hashes, matches, axis=0)
impaths = np.delete(impaths, matches, axis=0)

# because this script can take a long time to complete, save checkpoint
# results for each dataset when its deduplication finishes; that leaves
# the option to resume later on
np.save(exemplar_fpath, np.array(exemplars))

# run the dataset-level deduplication on multiple groups at once;
# results for each group are saved in separate .npy files. If the
# .npy file already exists, the group is skipped, which makes it
# easier to add new datasets to the existing directory structure
with Pool(processes) as pool:
    pool.map(group_dedupe, list(zip(unq_datasets, groups_impaths)))

# now that all the patches from individual datasets are deduplicated,
# combine the separate .npy arrays into a single dask array and save it
exemplar_fpaths = glob(os.path.join(savedir, '*_exemplars.npy'))
deduplicated_fpaths = np.concatenate([np.load(fp) for fp in exemplar_fpaths])

# convert to dask and save (the .npz name becomes a stack directory)
deduplicated_fpaths = da.from_array(deduplicated_fpaths)
da.to_npy_stack(os.path.join(savedir, 'deduplicated_fpaths.npz'), deduplicated_fpaths)

# print the total number of deduplicated patches
print(f'{len(deduplicated_fpaths)} patches remaining after deduplication.')
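If the saved path list needs to be reloaded later, the stack directory written above can be reopened lazily; a short sketch, reusing savedir from the script:

fpaths = da.from_npy_stack(os.path.join(savedir, 'deduplicated_fpaths.npz'))
print(len(fpaths.compute()))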
def write_to_npy_stack(out_dir, arr):
    # assumes a modified to_npy_stack that returns the (collection class,
    # graph, keys) triple instead of computing eagerly; stock dask's
    # to_npy_stack computes immediately and returns None
    a__, b__, c__ = da.to_npy_stack(out_dir, arr, axis=0)
    _ = dask.base.compute_as_if_collection(a__, b__, c__)
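For comparison, a minimal sketch of the same write against the stock dask API, which builds and runs the graph in a single call:

import dask.array as da

# eager one-liner: computes immediately and returns None
da.to_npy_stack(out_dir, arr, axis=0)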