# c, s, a, b are the client, scheduler and two workers supplied by
# distributed's gen_cluster test harness; HDF5_LOCK and CombinedLock
# come from xarray's backend locking utilities.
def test_serializable_locks(c, s, a, b):
    def f(x, lock=None):
        with lock:
            return x + 1

    # note, the creation of Lock needs to be done inside a cluster
    for lock in [HDF5_LOCK, Lock(), Lock('filename.nc'),
                 CombinedLock([HDF5_LOCK]),
                 CombinedLock([HDF5_LOCK, Lock('filename.nc')])]:
        futures = c.map(f, list(range(10)), lock=lock)
        yield c.gather(futures)

        lock2 = pickle.loads(pickle.dumps(lock))
        assert type(lock) == type(lock2)

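# A minimal standalone sketch of the pickle round-trip the test exercises,
# assuming a throwaway local cluster (illustrative, not from the test file):
import pickle
from distributed import Client, Lock

client = Client(processes=False)          # in-process cluster for the sketch
lock = Lock('filename.nc')
clone = pickle.loads(pickle.dumps(lock))  # named locks survive serialization...
assert clone.name == lock.name            # ...and still refer to the same lock
client.close()
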
def __getitem__(self, item):
    path = os.path.join(self.path, item)
    try:
        # this will work if dask.distributed workers exist
        lock = Lock(path)
    except AttributeError:
        # otherwise default to the InterProcessLock used by zarr
        lock = InterProcessLock(path)
    return lock

@contextmanager  # assumed decorator: the bare yield in try/finally below makes this a context manager
def lock_for_conflicts(conflicts, base_name="pangeo-forge"):
    try:
        global_client = get_client()
        is_distributed = True
    except ValueError:
        # Don't bother with locks if we are not in a distributed context.
        # NOTE! This means we HAVE to use dask.distributed as our parallel
        # execution environment. This should be compatible with Prefect.
        is_distributed = False
    if is_distributed:
        locks = [Lock(f"{base_name}-{c}", global_client) for c in conflicts]
        for lock in locks:
            logger.debug(f"Acquiring lock {lock.name}...")
            lock.acquire()
            logger.debug(f"Acquired lock {lock.name}")
    else:
        logger.debug(f"Asked to lock {conflicts} but no Dask client found.")
    try:
        yield
    finally:
        if is_distributed:
            for lock in locks:
                lock.release()
                logger.debug(f"Released lock {lock.name}")

@contextmanager  # assumed decorator, as in the variant above
def lock_for_conflicts(conflicts, base_name="pangeo-forge", timeout=None):
    """
    Parameters
    ----------
    timeout : int, optional
        The time to wait *for each lock*.
    """
    try:
        global_client = get_client()
        is_distributed = True
    except ValueError:
        # Don't bother with locks if we are not in a distributed context.
        # NOTE! This means we HAVE to use dask.distributed as our parallel
        # execution environment. This should be compatible with Prefect.
        is_distributed = False
    if is_distributed:
        locks = [Lock(f"{base_name}-{c}", global_client) for c in conflicts]
        for lock in locks:
            logger.debug(f"Acquiring lock {lock.name}...")
            acquired = lock.acquire(timeout=timeout)
            if not acquired:
                logger.warning("Failed to acquire lock %s before timeout %s",
                               lock.name, timeout)
                raise ValueError(
                    f"Failed to acquire lock {lock.name} before timeout {timeout}")
            logger.debug(f"Acquired lock {lock.name}")
    else:
        logger.debug(f"Asked to lock {conflicts} but no Dask client found.")
    try:
        yield
    finally:
        if is_distributed:
            for lock in locks:
                lock.release()
                logger.debug(f"Released lock {lock.name}")

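# Usage sketch for the context manager above. `write_region` is a
# hypothetical callable standing in for the conflicting chunk write:
with lock_for_conflicts(["chunk-0", "chunk-1"], timeout=30):
    write_region()  # only one worker at a time may write the conflicting chunks
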
def touch_random_unused_file(base_dir: Path, ext: Optional[str] = None) -> Path:
    assert base_dir.is_dir()
    if ext is None:
        ext = ""
    elif ext[0] != ".":
        ext = f".{ext}"
    lock = Lock(f"dir_lock:{base_dir.name}")
    while not lock.acquire(timeout=5):
        pass
    # THREADSAFE
    name = f"{get_random_ascii_str(10)}{ext}"
    path = base_dir.joinpath(name)
    while path.is_file():
        name = f"{get_random_ascii_str(10)}{ext}"
        path = base_dir.joinpath(name)
    path.touch()
    # End Threadsafe
    lock.release()
    return path

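# A sketch of a safer shape for the critical section above (not the original
# code): try/finally guarantees the lock is released even if the body raises.
from dask.distributed import Lock

def touch_with_lock(path, lock_name):
    # hypothetical helper mirroring the critical section above
    lock = Lock(lock_name)
    while not lock.acquire(timeout=5):
        pass
    try:
        path.touch()
    finally:
        lock.release()
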
def _flush_to_batches(redis_key, name):
    lock = Lock(redis_key)  # TODO set timeout and handle
    if lock.acquire(timeout=1):
        try:
            processor = SchemaPreprocessor(name)
            batch_writer = BatchWriter(name)
            # Get the batch and remove the read range atomically
            with rd.pipeline() as pipe:
                pipe.multi()
                pipe.lrange(redis_key, 0, Config.batches.size - 1)
                pipe.ltrim(redis_key, Config.batches.size, -1)
                batch = pipe.execute()[0]
            batch_matrix = processor.json_blobs_to_matrix(batch)
            batch_writer.write_batch_matrix(batch_matrix)
        finally:
            lock.release()
    else:
        raise Reschedule()

def get_scheduler_lock(scheduler, path_or_file=None):
    """Get the appropriate lock for a certain situation based on the
    dask scheduler used.

    See Also
    --------
    dask.utils.get_scheduler_lock
    """
    if scheduler == 'distributed':
        from dask.distributed import Lock
        return Lock(path_or_file)
    elif scheduler == 'multiprocessing':
        return multiprocessing.Lock()
    elif scheduler == 'threaded':
        from dask.utils import SerializableLock
        return SerializableLock()
    else:
        return threading.Lock()

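# Usage sketch for the helper above (the scheduler string would normally
# come from the caller's dask configuration, not be hard-coded):
lock = get_scheduler_lock('threaded')   # -> dask.utils.SerializableLock
with lock:
    pass  # e.g. guard a write to a file format that is not thread-safe
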
def _get_distributed_lock(key):
    from dask.distributed import Lock
    return Lock(key)

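# Sketch of why a *named* distributed lock is enough for coordination: every
# task that constructs Lock with the same key contends for one scheduler-side
# lock. Assumes a local cluster; `guarded_append` is hypothetical.
from dask.distributed import Client, Lock

def guarded_append(i):
    with Lock('log.txt'):            # same name -> same cluster-wide lock
        with open('log.txt', 'a') as f:
            f.write(f"{i}\n")

client = Client(processes=False)     # throwaway in-process cluster
client.gather(client.map(guarded_append, range(4)))
client.close()
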
    # tail of the matrix-saving worker; its enclosing `def` is not part of this excerpt
    save_mats(count, runName, AA_mutation, nucleotide_mutation)
    print('DONE!')
    brake.set(False)
    return None

####### start the sankoff algo here #######################
print('starting sankoff')
# scale cluster
# scatter the blank tree and row index for each process
# remote_tree = client.scatter(tree)
remote_index = client.scatter(IDindex)

inq = Queue('inq')
outq = Queue('outq')
lock = Lock('x')
stopiter = Variable(False)
brake = Variable(True)

saver_started = False
workers_started = False

# start workers
for workers in range(NCORE * ncpu):
    w = client.submit(calculate_small_parsimony, inq=None, outq=None,
                      stopiter=stopiter, treefile=treefile,
                      bootstrap_replicates=bootstrap_replicates,
                      matfile=alnfile + '.h5', row_index=remote_index,
                      iolock=lock, verbose=False)
    fire_and_forget(w)
s = client.submit(collect_futures, queue=None, stopiter=stopiter,
                  brake=brake, runName=runName, nucleotides_only=False)

def quick_proc(ds, opts, label_raw, label, client, reference=None, pxmask=None):

    reference = imread(opts.reference) if reference is None else reference
    pxmask = imread(opts.pxmask) if pxmask is None else pxmask

    stack = ds.stacks[label_raw]
    # stk_del = ds.stacks['label_raw'].to_delayed().ravel()

    # get array names and shapes by correcting a single image (the last one)
    sample_res = _fast_correct(stack[-1:, ...].compute(scheduler='threading'),
                               opts=opts,
                               data_key=ds.data_pattern + '/' + label,
                               shots_grp=ds.shots_pattern,
                               peaks_grp=ds.data_pattern)
    # print({k: v.dtype for k, v in sample_res.items()})

    # initialize file structure
    for (file, subset), grp in ds.shots.groupby(['file', 'subset']):
        with h5py.File(file, 'a') as fh:
            for pattern, data in sample_res.items():
                path = pattern.replace('%', subset)
                # print('Initializing', file, path)
                fh.require_dataset(path, shape=(len(grp),) + data.shape[1:],
                                   dtype=data.dtype,
                                   chunks=(1,) + data.shape[1:],
                                   compression=opts.compression)
            fh[ds.data_pattern.replace('%', subset)].attrs['signal'] = label

    # array of integers corresponding to the chunk number
    chunk_label = np.concatenate(
        [np.repeat(ii, cs) for ii, cs in enumerate(stack.chunks[0])])

    # delay objects returning the image and info dictionary
    cmp_del = [dask.delayed(_fast_correct)(raw_chk, opts)
               for raw_chk in ds.raw_counts.to_delayed().ravel()]

    # file lock objects
    locks = {fn: Lock() for fn in ds.files}

    # make delay objects for writing results to file (= maximum side effects!)
    dels = []
    for chks, (cl, sht) in zip(cmp_del, ds.shots.groupby(chunk_label)):
        assert len(sht.drop_duplicates(['file', 'subset'])) == 1
        ii_to = sht.shot_in_subset.values
        dels.append(dask.delayed(nexus._save_single_chunk_multi)(
            chks, file=sht.file.values[0], subset=sht.subset.values[0],
            idcs=ii_to, lock=locks[sht.file.values[0]]))

    # random.shuffle(dels)  # shuffling tasks to minimize concurrent file access
    chunk_info = client.compute(dels, sync=True)

    return pd.DataFrame(chunk_info,
                        columns=['file', 'subset', 'path', 'shot_in_subset'])