def __init__(self, labels, optical_flow_path, segments_path, batch_size=BATCH_SIZE, shuffle=True):
    self.batch_size = batch_size
    self.labels = labels
    self.optical_flow = da.from_npy_stack(optical_flow_path)
    self.segments = da.from_npy_stack(segments_path)
    self.shuffle = shuffle
    self.on_epoch_end()
def get_or_create_array(config, npy_stack_dir=None):
    """ Load or create a Dask array for tests. You can specify a test case too.

    If the file exists, the function returns the array.
    If chunk_shape is given, the function rechunks the array before returning it.
    If the file does not exist, it is created using the "shape" parameter.

    Arguments (from the config object):
    ----------
    file_path: File containing the array; will be created if it does not exist.
    chunk_shape:
    shape: Shape of the array to create if it does not exist.
    test_case: Test case. If None, returns the test array.
    nb_chunks: Number of chunks to treat in the test case.
    overwrite: Use the chunk_shape to create a new array, overwriting the file if file_path is already used.
    split_file: For the test case 'split'.
    """
    file_path = config.array_filepath
    if not os.path.isfile(file_path):
        raise FileNotFoundError()

    # get the file and rechunk logically using a chosen chunk shape, or the dask default
    if npy_stack_dir:
        arr = da.from_npy_stack(dirname=npy_stack_dir, mmap_mode=None)
    else:
        if config.chunks_shape:
            arr = get_dask_array_from_hdf5(file_path,
                                           logic_chunks_shape=config.chunks_shape)
        else:
            arr = get_dask_array_from_hdf5(file_path)  # TODO: see what happens
    return arr
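# A minimal usage sketch (not from the original source), assuming a namespace-style
# config object; the file path and chunk shape below are hypothetical values.
from types import SimpleNamespace

config = SimpleNamespace(array_filepath='data/test_array.hdf5',  # assumed HDF5 file
                         chunks_shape=(100, 100, 100))           # logical rechunk shape
arr = get_or_create_array(config)  # load from HDF5 with the chosen chunking
# arr = get_or_create_array(config, npy_stack_dir='data/test_stack')  # or from an .npy stack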
def measuring():
    global sky
    global dirty
    global psf
    list_schedule = []
    list_compute = []
    list_total = []
    list_load = []
    n = 5
    for i in range(1):
        start_time1 = time.time()
        #sky_npy, sky = load_data(os.path.split(os.path.split(os.getcwd())[0])[0] + '/sky.npy', n)
        #dirty_npy, dirty = load_data(os.path.split(os.path.split(os.getcwd())[0])[0] + '/dirty.npy', n)
        #psf_npy, psf = load_data(os.path.split(os.path.split(os.getcwd())[0])[0] + '/psf.npy', n)
        sky = da.from_npy_stack(os.getcwd() + '/skyStack')
        dirty = da.from_npy_stack(os.getcwd() + '/dirtyStack')
        psf = da.from_npy_stack(os.getcwd() + '/psfStack')
        end_time1 = time.time()

        start_time2 = time.time()
        scheduling()
        end_time2 = time.time()

        #pbar = ProgressBar()
        #with pbar, Profiler() as prof, ResourceProfiler() as rprof, CacheProfiler() as cprof:
        start_time3 = time.time()
        quad.compute()
        end_time3 = time.time()

        list_load.append(end_time1 - start_time1)
        list_schedule.append(end_time2 - start_time2)
        list_compute.append(end_time3 - start_time3)
        list_total.append(end_time3 - start_time1)
        #visualize([prof, rprof, cprof])

    print('number of dimensions: {}'.format(n))
    print('load time: {}'.format(round(sum(list_load) / len(list_load), 4)))
    print('scheduling time: {}'.format(round(sum(list_schedule) / len(list_schedule), 4)))
    print('compute time: {}'.format(round(sum(list_compute) / len(list_compute), 4)))
    print('total time: {}'.format(round(sum(list_total) / len(list_total), 4)))
def _tile_generator(self):
    if str(self.input_path).endswith('zarr'):
        level_array = da.from_zarr(str(self.input_path))
    else:
        level_array = da.from_npy_stack(str(self.input_path))
    block_array = level_array.blocks[0, 0].compute()
    yield block_array
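# A hypothetical variant (not in the original source) that walks the whole block
# grid instead of yielding only blocks[0, 0]; numblocks gives the number of chunks
# along each axis, assuming a 2-D level array.
def _tile_generator_all(self):
    if str(self.input_path).endswith('zarr'):
        level_array = da.from_zarr(str(self.input_path))
    else:
        level_array = da.from_npy_stack(str(self.input_path))
    for i in range(level_array.numblocks[0]):
        for j in range(level_array.numblocks[1]):
            yield level_array.blocks[i, j].compute()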
def dask_player(video, segmentation, optical_flow):
    Cache(2e6).register()  # Turn cache on globally
    video = cv2.VideoCapture(video)
    segmentation = da.from_npy_stack(segmentation)
    optical_flow = da.from_npy_stack(optical_flow)

    bar = Bar('Frame', max=video.get(cv2.CAP_PROP_FRAME_COUNT))
    bar.index = OFFSET
    video.set(cv2.CAP_PROP_POS_FRAMES, OFFSET)

    while video.isOpened():
        video_frame = video.read()[1]
        optical_flow_frame = optical_flow[bar.index].compute()
        segmentation_frame = segmentation[bar.index].compute()

        cv2.imshow('video', video_frame)
        cv2.imshow('segmentation', segmentation_frame)
        cv2.imshow('optical_flow', visualize_optical_flow(optical_flow_frame))

        import pdb
        pdb.set_trace()

        cv2.waitKey(27)
        bar.next()
def get_data(data_loc: str) -> tuple:
    data_folders = [
        os.path.join(data_loc, x) for x in os.listdir(data_loc)
        if 'cb46fd46' in x
    ]
    out_data = None
    out_labels = None
    out_times = None
    for data_folder in tqdm(data_folders):
        all_data_fol = os.path.join(data_folder, 'data')
        label_folder = os.path.join(data_folder, 'labels')
        times_folder = os.path.join(data_folder, 'times')

        if out_data is None:
            out_data = da.from_npy_stack(all_data_fol)
        else:
            out_data = da.concatenate(
                [out_data, da.from_npy_stack(all_data_fol)])

        if out_labels is None:
            out_labels = da.from_npy_stack(label_folder)
        else:
            out_labels = da.concatenate(
                [out_labels, da.from_npy_stack(label_folder)])

        if out_times is None:
            out_times = da.from_npy_stack(times_folder)
        else:
            out_times = da.concatenate(
                [out_times, da.from_npy_stack(times_folder)])

    return out_data.compute(), out_labels.compute(), out_times.compute()
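# A more compact variant of the loop above (a sketch, not from the original source):
# build each stack lazily and concatenate once per output, assuming every matching
# data_folder contains the same data/labels/times subfolders.
data = da.concatenate([da.from_npy_stack(os.path.join(f, 'data')) for f in data_folders])
labels = da.concatenate([da.from_npy_stack(os.path.join(f, 'labels')) for f in data_folders])
times = da.concatenate([da.from_npy_stack(os.path.join(f, 'times')) for f in data_folders])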
def test_to_npy_stack():
    x = np.arange(5 * 10 * 10).reshape((5, 10, 10))
    d = da.from_array(x, chunks=(2, 4, 4))

    dirname = mkdtemp()
    try:
        da.to_npy_stack(dirname, d, axis=0)
        assert os.path.exists(os.path.join(dirname, '0.npy'))
        assert (np.load(os.path.join(dirname, '1.npy')) == x[2:4]).all()

        e = da.from_npy_stack(dirname)
        assert eq(d, e)
    finally:
        shutil.rmtree(dirname)
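# A minimal round-trip sketch outside of a test (not from the sources above),
# assuming write access to a local directory named 'stack_dir'; to_npy_stack
# writes one .npy file per chunk along the chosen axis plus an 'info' file that
# from_npy_stack reads back.
import numpy as np
import dask.array as da

x = da.from_array(np.arange(500).reshape((5, 10, 10)), chunks=(2, 4, 4))
da.to_npy_stack('stack_dir', x, axis=0)   # writes 0.npy, 1.npy, 2.npy and info
y = da.from_npy_stack('stack_dir')        # lazily reloads with the same axis-0 chunking
assert (x.compute() == y.compute()).all()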
def benchmark_dask_map_block_1d_sparse_distributed(filename, cutoff=15, direct_output=True):
    global point_array
    func_name = sys._getframe().f_code.co_name
    results = []
    #cache = Chest(path=os.path.join(BASE_DIRECTORY, "cache"), available_memory=98e9)
    cluster_details = client.ncores()
    number_nodes = len(cluster_details.keys())
    number_cores = sum(cluster_details.values())

    with dask.set_options(get=client.get):
        start = time.time()
        point_array = da.from_npy_stack(filename)
        chunk_size = point_array.chunks[0][0]
        end_read = time.time()
        results.append(
            "%s,dask-distributed, %s, %d, %d, comet, read_file, %.4f" %
            (filename, func_name, number_nodes, number_cores, end_read - start))

        dist_matrix = da.zeros((point_array.shape[0], point_array.shape[0]),
                               chunks=(chunk_size, point_array.shape[0]))
        # map_block_distances operates on the global point_array
        da_res = dist_matrix.map_blocks(map_blocks_1d_sparse,
                                        chunks=(chunk_size, 3),
                                        dtype='int')
        res = da_res.compute()
        end_compute = time.time()
        results.append(
            "%s,dask-distributed, %s, %d, %d, comet, compute, %.4f" %
            (filename, func_name, number_nodes, number_cores, end_compute - end_read))
        results.append(
            "%s,dask-distributed, %s, %d, %d, comet, total, %.4f" %
            (filename, func_name, number_nodes, number_cores, end_compute - start))

    print("\n".join(results))
import numpy as np
from keras.models import load_model
from dask import array as da

hybrid_model = load_model('results/hybrid_model.hdf5')


def predict(block):
    if block.shape == (1, 1, 1, 1):
        return block
    return hybrid_model.predict(block)


if __name__ == '__main__':
    from multiprocessing.pool import ThreadPool
    import dask

    with dask.config.set(pool=ThreadPool(8)):
        segments = da.from_npy_stack('data/comma_ai/test_segments')[1:]  # drop first segment
        optical_flow = da.from_npy_stack('data/comma_ai/test_optical_flow')
        frame = da.concatenate(
            [optical_flow, segments.reshape((10797, 480, 640, 1))], axis=3)

        predicted_speeds = None
        for block in frame.blocks:
            if predicted_speeds is None:
                predicted_speeds = hybrid_model.predict(block.compute())
            else:
                predicted_speeds = np.concatenate(
                    [predicted_speeds, hybrid_model.predict(block.compute())])
            print('processed block')

        # duplicate first prediction
        with open('test.txt', 'w+') as f:
            np.savetxt(f, predicted_speeds)
    type=str,
    metavar='savedir',
    help='Directory in which to save model and evaluation plots')

#parse the arguments
args = parser.parse_args()
impaths_file = args.impaths_file
labels_fpath = args.labels_fpath
savedir = args.savedir

#make sure the savedir exists
if not os.path.isdir(savedir):
    os.mkdir(savedir)

#load the dask array
impaths = da.from_npy_stack(impaths_file)

#load the labels array
gt_labels = np.load(labels_fpath)

#sanity check that the number of labels and impaths are the same
assert (len(impaths) == len(gt_labels)), "Number of impaths and labels are different!"

#it's expected that the gt_labels were generated within a Jupyter notebook by
#using the corrector.py labeling utilities. in that case the labels are text
#with the possible options of "informative", "uninformative", and "none".
#those with the label "none" are considered the unlabeled set and we make
#predictions about their labels using the random forest that we train on the
#labeled images
good_indices = np.where(gt_labels == 'informative')[0]
bad_indices = np.where(gt_labels == 'uninformative')[0]
from progress.bar import Bar
import os
import cv2
import numpy as np
import pickle
from dask import array as da
from keras.models import load_model

optical_flow_model = load_model('results/optical_flow_baseline.hdf5')

if __name__ == '__main__':
    from multiprocessing.pool import ThreadPool
    import dask

    with dask.config.set(pool=ThreadPool(8)):
        optical_flow = da.from_npy_stack('data/comma_ai/test_optical_flow')

        predicted_speeds = None
        for block in optical_flow.blocks:
            if predicted_speeds is None:
                predicted_speeds = optical_flow_model.predict(block.compute())
            else:
                predicted_speeds = np.concatenate([
                    predicted_speeds,
                    optical_flow_model.predict(block.compute())
                ])
            print('processed block')

        # duplicate first prediction
        with open('test.txt', 'w+') as f:
            np.savetxt(f, predicted_speeds)
def learn_clusters(n_clust):
    client = Client(n_workers=4, processes=True)

    # 1. Learn clusters
    # Full set
    kmeans_path = 'Clustering/KMeans/n{}posts.joblib'.format(n_clust)
    array = da.from_npy_stack(npy_stack_path)
    kmeans = KMeans(n_clusters=n_clust)
    # Learn on a part of the set
    # array = np.load('Clustering/npy_post_vecs_part/0.npy')
    # kmeans = SKMeans(n_clusters=n_clust)
    print('Fitting')
    kmeans.fit(array)
    del array

    # Dump centroids to the disk
    # Dump as a sklearn object, for (maybe) faster prediction and fewer problems
    skmeans = SKMeans(n_clusters=n_clust)
    skmeans.cluster_centers_ = kmeans.cluster_centers_
    skmeans._n_threads = _openmp_effective_n_threads()
    dump(skmeans, kmeans_path)
    del kmeans, skmeans
    # dump(kmeans, kmeans_path)  # For learning on a part of the set
    # del kmeans
    print('Fitted')

    # 3. Turn posts into clusters
    kmeans_path = 'Clustering/KMeans/n{}posts.joblib'.format(n_clust)
    df = dd.read_parquet('preprocessed.parquet')
    df = df.map_partitions(df_to_vector_predict, kmeans_path, meta={
        'user_id': int,
        'post_id': int,
        'text': object,
        'type': str,
        'date': str,
        'cluster': int
    })
    df.to_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust))
    print('Clustered')

    # 2.5. Filter outdated posts out. (Next time, write the date of parsing to user_info)
    # For each user find his last like and filter out likes that are older than the last + half a year
    df = dd.read_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust))
    print('Original df len: {}'.format(len(df)))
    year = 31536000  # One year as a timestamp delta
    kyear = 20
    break_time = kyear * year  # 0.75*year - a quarter to a year
    last_like = df['date'].max().compute()  # The set was fully collected on 8 June 2020
    df = df[df['date'] > last_like - break_time]  # Pass only recent likes
    print('max date: {}'.format(df['date'].max().compute()))
    print('min date: {}'.format(df['date'].min().compute()))
    print('Filtered df len: {}'.format(len(df)))
    print('Likes have been filtered out by date')

    # 3. Group clusters by user_id and turn them into a single vector for each user
    # df = dd.read_parquet('Clustering/KMeans/n{}posts.parquet'.format(n_clust))  # INSTEAD OF FILTER!
    # - Count text_likes number for each user (and later merge with user_info)
    count = df.drop(columns=['post_id', 'type', 'date', 'cluster']).groupby(
        'user_id')['text'].count().compute()
    count.rename('text_likes', inplace=True)

    # Generate meta
    meta = {'user_id': int}
    for i in range(n_clust):
        meta[i] = float
    df = df.map_partitions(
        lambda df_part: kt.clusters_to_vector(df_part, n_clust), meta=meta)
    df.to_parquet(
        'Clustering/KMeans/n{}posts-cluster_vecs.parquet'.format(n_clust))

    # 5. Merge clusters and user_info dataframes. (Working with pandas frames)
    df_info = pd.read_csv('users_info.csv')
    df_info = df_info.merge(count, on='user_id', how='inner')
    del count
    df = pd.read_parquet(
        'Clustering/KMeans/n{}posts-cluster_vecs.parquet'.format(n_clust))
    df = df_info.merge(df, on='user_id', how='inner')  # Merging user's info and clusters. Maybe the mistake is here
    df.to_csv('Clustering/KMeans/n{}-final_dataset-{}year.csv'.format(
        n_clust, kyear))
    print('Final dataset has been saved')
    del df_info

    # Filter some users out
    # df = pd.read_csv('Clustering/KMeans/n{}-final_dataset.csv'.format(n_clust)).drop(columns=['Unnamed: 0'])  # TESTING
    df = df.loc[(df['text_likes'] > 100) & (df['text_likes'] < 1000)]
    df['bdate'] = df['bdate'].apply(
        lambda bd: time.mktime(datetime.strptime(bd, "%d.%m.%Y").timetuple()))

    # Clean up the dataset
    df = df.drop(columns=[
        'posts_n', 'text_likes', 'status', 'sex', 'smoking', 'alcohol',
        'parth_id', 'country', 'city', 'user_id'
    ]).dropna().reset_index(drop=True)

    # 6. Supervise a Linear Regression model
    regr = LinearRegression()
    R2 = train(df, regr)

    client.close()
    return R2
if __name__ == "__main__":
    from distributed import Client, LocalCluster
    import dask.dataframe as df
    import dask.array as da

    cluster = LocalCluster()
    client = Client(cluster)

    matches = da.from_npy_stack("/spell/data/")
    matches = df.from_array(matches)

    # IMPORTANT: this repartition to a single partition is optional; skip it if you want a partitioned write
    matches.repartition(npartitions=1).to_csv("predictions.csv")
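    # A sketch of the partitioned alternative (an assumption, not part of the original
    # script): without the repartition, dask writes one CSV per partition, replacing
    # '*' in the name with the partition index.
    # matches.to_csv("predictions-*.csv")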
volume_paths = glob(os.path.join(volume_dir, '*'))
print(f'Found {len(volume_paths)} volume files in {volume_dir}')

#extract the volume names
#NOTE: this is the same code used to generate the names
#from cross_section3d.py
volume_names = []
for vp in volume_paths:
    fext = vp.split('.')[-1] if vp[-5:] != 'nii.gz' else 'nii.gz'
    volume_names.append(vp.split('/')[-1].split(f'.{fext}')[0])

volume_names = np.array(volume_names)

#convert filtered to numpy straightaway
#dask.array doesn't have good support for string operations
filtered_impaths = da.from_npy_stack(filtered_impaths_file).compute()

#the first thing that we need to do is to isolate
#images from 3d source datasets. during creation we
#gave 2d files the handy identifier -LOC-2d-
source3d = np.where(
    np.core.defchararray.find(filtered_impaths, '-LOC-2d') == -1)
print(
    f'Isolated {len(source3d[0])} images from 3d volumes out of {len(filtered_impaths)}'
)

#overwrite filtered_impaths to save space
#and sort the results such that images from the same
#source datasets are grouped together
filtered_impaths = np.sort(filtered_impaths[source3d])