def wait(self, futures):
    if self.in_worker:
        secede()
    results = self.client.gather(list(futures))
    if self.in_worker:
        rejoin()
    return [r.get() for r in results]
def generateCoeffs(coeffs):
    c = get_client()
    futures = [c.submit(coeff.generate) for coeff in coeffs]
    secede()
    c.gather(futures)
    rejoin()
    return coeffs
def retrieval_context(self):
    """Override ParallelBackendBase.retrieval_context to avoid deadlocks.

    This removes the thread from the worker's thread pool (using 'secede').
    Seceding avoids deadlock in nested parallelism settings.
    """
    # See 'joblib.Parallel.__call__' and 'joblib.Parallel.retrieve' for how
    # this is used.
    if hasattr(thread_state, 'execution_state'):
        # we are in a worker. Secede to avoid deadlock.
        secede()

    yield

    if hasattr(thread_state, 'execution_state'):
        rejoin()
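# The retrieval_context above captures the pattern in miniature: a task that
# blocks on its own subtasks must secede from the worker's thread pool, or
# the pool can fill up with blocked parents and deadlock. Below is a minimal,
# self-contained sketch of that pattern; the names (_double, _fan_out) are
# illustrative only and do not come from the snippets in this section.

from dask.distributed import Client, get_client, secede, rejoin

def _double(x):
    return 2 * x

def _fan_out(xs):
    client = get_client()        # client of the worker executing this task
    futures = client.map(_double, list(xs))
    secede()                     # free this worker thread while we block
    results = client.gather(futures)
    rejoin()                     # re-acquire a thread before continuing
    return sum(results)

if __name__ == '__main__':
    with Client(processes=False) as client:
        print(client.submit(_fan_out, range(4)).result())  # 12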
def run_experiment(self):
    """
    Run the experiment, including all scenarios and replications.
    """
    dask_client = get_client(timeout=600)
    # print("dask client: {}".format(dask_client))
    # try:
    secede()
    print("secede: {}".format(self.name))
    # seceded = True
    # except ValueError:
    #     seceded = False

    futures = []
    total_runs = len(self.scenarios) * self.num_replications
    for scenario in self.scenarios:
        scenario_name, configuration = scenario
        # replications_done = self.find_replications_done(scenario_name,
        #                                                 configuration)
        replications_done = 0
        # random number of steps for this scenario
        total_steps = random.randint(300, 700)
        for run_id in range(replications_done, self.num_replications):
            future = dask_client.submit(run_model, self.name, scenario_name,
                                        run_id, total_runs, total_steps,
                                        self.scratch_path)
            futures.append(future)

    loggers_info = dask_client.gather(futures)
    # if seceded:
    print("rejoin: {}".format(self.name))
    rejoin()

    # gather all the output dbs into a single db
    out_db_filepath = Logger.gather_databases(self.name, loggers_info)
    dest_file_path_name = os.path.join(OUTPUT_DIR,
                                       os.path.split(out_db_filepath)[1])
    if not os.path.exists(dest_file_path_name):
        shutil.move(out_db_filepath, dest_file_path_name)
    return dest_file_path_name
def generate(self):
    def generateConstCoeff(c, dofs, order):
        # Generate the coefficient
        c.generate()
        tmp = c.components
        # Constant part of the coefficient?
        if len(c.derivs) == 0:
            return tmp
        # Contract the indices from the phi expansions
        for d in c.derivs:
            dphis = dPhis(dofs, d)
            # use the 'order' argument rather than self.order, so the task
            # does not capture self in its closure
            a = tuple(range(order, order + d + 1))
            b = tuple(range(d + 1))
            tmp = np.tensordot(tmp, dphis, axes=(a, b))
        # Ignore zeros
        return tmp

    c = get_client()
    futures = [
        c.submit(generateConstCoeff, coeff, self.parametrization.dofs,
                 self.order)
        for coeff in self.constCoeffs
    ]
    secede()
    c.gather(futures)
    rejoin()

    # Add them together
    if len(futures) == 0:
        return
    self.components = futures[0].result()
    for i in range(1, len(futures)):
        self.components = self.components + futures[i].result()
def run_test_with_timeout(
    test_config: TestConfig,
    incoming_state: dict,
    hostnames: List[str],
    duration: int = 15,
) -> dict:
    """
    Calls run_test with a timeout and signals run_test to end gracefully
    if the timeout has completed.

    Args:
        test_config: Config of test to run
        incoming_state: Initial state to run actions/asserts in
        hostnames: List of runner hostnames
        duration: Optional timeout to run test within
            (I suppose this is to make it convenient to call in runners)

    Returns:
        New state after running actions and asserts
    """
    if duration is None or duration < 0:
        return run_test(test_config, incoming_state, hostnames)

    # NOTE: Use a dask cluster scheduler?
    client = get_client()
    # Used to prevent system deadlock since we are spawning 2 threads
    secede()

    # NOTE: may improve way of doing this
    timeout_signal_name = f"keep-going-{uuid.uuid4()}"
    keep_going = Variable(timeout_signal_name)
    keep_going.set(True)

    run_test_task: Future = client.submit(
        run_test,
        test_config=test_config,
        incoming_state=incoming_state,
        hostnames=hostnames,
        timeout_signal_name=timeout_signal_name,
    )
    LOGGER.debug("Test duration config: %d seconds", duration)

    def distributed_timeout():
        # If a timeout from a previous test did not complete, it will keep
        # running (it cannot be canceled). However, if it keeps running, it
        # can end another test early. This means it needs to receive a
        # signal to return.
        end_time = datetime.now() + timedelta(seconds=duration)
        while datetime.now() <= end_time and keep_going.get():
            time.sleep(test_config.get("secondsBetweenCycles", 1))

    timeout_task: Future = client.submit(distributed_timeout)

    # Wait for either the test or the timeout to finish.
    # Return the test result if it finishes first.
    # End the test if the timeout finishes first and return the state.
    start = datetime.now()
    wait([run_test_task, timeout_task], return_when="FIRST_COMPLETED")
    end = datetime.now()

    rejoin()
    LOGGER.debug("Test %s took %d seconds", test_config["name"],
                 (end - start).seconds)

    if run_test_task.done():
        keep_going.set(False)
        return run_test_task.result()
    elif timeout_task.done():
        LOGGER.debug("test task: %s", run_test_task)
        LOGGER.debug("timeout task: %s", timeout_task)
        LOGGER.info("Test %s timed out", test_config["name"])
        # NOTE: add timed out to summary?
        keep_going.set(False)
        return run_test_task.result()
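# The Variable-based cancellation used above can be isolated into a small
# sketch: a distributed Variable acts as a shared flag that one task polls
# and another task (or the caller) flips to request a graceful stop. The
# names below (_poll_until_stopped, 'keep-going-demo') are illustrative only.

import time
from dask.distributed import Client, Variable

def _poll_until_stopped(signal_name):
    keep_going = Variable(signal_name)
    cycles = 0
    while keep_going.get():      # read the shared flag each cycle
        cycles += 1
        time.sleep(0.1)
    return cycles

if __name__ == '__main__':
    with Client(processes=False) as client:
        flag = Variable('keep-going-demo')
        flag.set(True)
        fut = client.submit(_poll_until_stopped, 'keep-going-demo')
        time.sleep(1)
        flag.set(False)          # ask the task to end gracefully
        print('ran for', fut.result(), 'cycles')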
def coclustering(Z, nclusters_row, nclusters_col, errobj, niters, epsilon,
                 col_clusters_init=None, row_clusters_init=None,
                 run_on_worker=False):
    """
    Run the co-clustering, Dask implementation.

    :param Z: m x n data matrix
    :param nclusters_row: number of row clusters
    :param nclusters_col: number of column clusters
    :param errobj: convergence threshold for the objective function
    :param niters: maximum number of iterations
    :param epsilon: numerical parameter, avoids zero arguments in log
    :param row_clusters_init: initial row cluster assignment
    :param col_clusters_init: initial column cluster assignment
    :param run_on_worker: whether the function is submitted to a Dask worker
    :return: has converged, number of iterations performed, final row and
        column clustering, error value
    """
    client = get_client()

    Z = da.array(Z) if not isinstance(Z, da.Array) else Z

    [m, n] = Z.shape
    row_chunks, col_chunks = Z.chunksize

    row_clusters = da.array(row_clusters_init) \
        if row_clusters_init is not None \
        else _initialize_clusters(m, nclusters_row, chunks=row_chunks)
    col_clusters = da.array(col_clusters_init) \
        if col_clusters_init is not None \
        else _initialize_clusters(n, nclusters_col, chunks=col_chunks)
    R = _setup_cluster_matrix(nclusters_row, row_clusters)
    C = _setup_cluster_matrix(nclusters_col, col_clusters)

    e, old_e = 2 * errobj, 0
    s = 0
    converged = False

    Gavg = Z.mean()

    while (not converged) and (s < niters):
        logger.debug(f'Iteration # {s} ..')
        # Calculate cluster-based averages.
        # nel_clusters is a matrix with the number of elements per co-cluster,
        # originally computed as: da.dot(da.dot(R.T, da.ones((m, n))), C)
        nel_row_clusters = da.bincount(row_clusters, minlength=nclusters_row)
        nel_col_clusters = da.bincount(col_clusters, minlength=nclusters_col)
        logger.debug('num of populated clusters: row {}, col {}'.format(
            da.sum(nel_row_clusters > 0).compute(),
            da.sum(nel_col_clusters > 0).compute()))
        nel_clusters = da.outer(nel_row_clusters, nel_col_clusters)
        CoCavg = (da.matmul(da.matmul(R.T, Z), C) + Gavg * epsilon) / \
            (nel_clusters + epsilon)

        # Calculate distance based on row approximation
        d_row = _distance(Z, da.matmul(C, CoCavg.T), epsilon)
        # Assign to best row cluster
        row_clusters = da.argmin(d_row, axis=1)
        R = _setup_cluster_matrix(nclusters_row, row_clusters)

        # Calculate distance based on column approximation
        d_col = _distance(Z.T, da.matmul(R, CoCavg), epsilon)
        # Assign to best column cluster
        col_clusters = da.argmin(d_col, axis=1)
        C = _setup_cluster_matrix(nclusters_col, col_clusters)

        # Error value (actually just the column components)
        old_e = e
        minvals = da.min(d_col, axis=1)
        # power 1 divergence, power 2 euclidean
        e = da.sum(da.power(minvals, 1))
        row_clusters, R, col_clusters, C, e = client.persist(
            [row_clusters, R, col_clusters, C, e])

        if run_on_worker:
            # this is a workaround for e.compute() for a function that runs
            # on a worker with multiple threads
            # https://github.com/dask/distributed/issues/3827
            e = client.compute(e)
            secede()
            e = e.result()
            rejoin()
        else:
            e = e.compute()
        logger.debug(f'Error = {e:+.15e}, dE = {e - old_e:+.15e}')
        converged = abs(e - old_e) < errobj
        s = s + 1
    if converged:
        logger.debug(f'Coclustering converged in {s} iterations')
    else:
        logger.debug(f'Coclustering not converged in {s} iterations')
    return converged, s, row_clusters, col_clusters, e
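# The run_on_worker branch above works around dask/distributed#3827: calling
# .compute() from inside a task on a multi-threaded worker can deadlock, so
# the graph is handed back to the scheduler with client.compute() and the
# thread secedes while waiting on the resulting future. A stand-alone sketch
# of just that workaround (reduce_inside_task is a hypothetical task meant
# to be submitted to a worker):

import dask.array as da
from dask.distributed import get_client, secede, rejoin

def reduce_inside_task():
    client = get_client()
    total = da.ones((100, 100)).sum()   # any lazy dask collection
    fut = client.compute(total)         # returns a Future, does not block
    secede()                            # release the worker thread
    result = fut.result()               # safe to block now
    rejoin()
    return result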
def calc_cv_per_model(nr_particles, model_weights, N_BOOTSTR, test_w,
                      transitions, test_X, client):
    """
    Calculate the Coefficient of Variation.

    Parameters
    ----------
    nr_particles: int
        Number of particles to estimate the CV for
    model_weights: np.ndarray
        Array of model weights
    N_BOOTSTR: int
        Number of bootstrapped KDEs to take to estimate the CV
    test_w: List[np.ndarray]
        test_w[m] are the weights of the test points test_X[m] of model m
    transitions: List[Transition]
        List of transitions
    test_X: List[np.ndarray]
        test_X[m] are the test points with weights test_w[m]
    client: Client
        Client to execute on

    Returns
    -------
    cv, variations_at_X: float, List[np.ndarray]
        * cv is the mean variation
        * variations_at_X are the variations at the test_X
    """
    test_transitions = copy.deepcopy(transitions)

    # how many particles to draw for each model
    n_per_model = np.random.multinomial(nr_particles, model_weights)

    # N_BOOTSTR times, train test_transitions on n_per_model points, and
    # calculate the weights associated with test_X, for each model
    logger.debug("Start CV")
    futures = []
    for _ in range(N_BOOTSTR):
        futures.append(
            weights(n_per_model, transitions, test_transitions, test_X))
    logger.debug("Gathering futures")
    secede()
    chunked = client.gather(futures)
    rejoin()
    bootstr_w_at_test_X = [
        np.concatenate(chunk) for bs in chunked for chunk in bs
    ]
    del chunked
    per_model_w = [np.asarray(arr) for arr in zip(*bootstr_w_at_test_X)]

    # calculate the cv of the bootstrapped weights for each model
    variations_at_X = [st.variation(ws, axis=0) for ws in per_model_w]

    # normalize by number of samples per model
    model_weighted_variations_at_X = [
        var * n / n_per_model.sum()
        for var, n in zip(variations_at_X, n_per_model)
    ]

    # weight cvs by the point weights
    point_weighted_var_at_X = [
        var * w
        for var, w in zip(model_weighted_variations_at_X, test_w)
    ]

    # compute an "average coefficient of variation":
    # for each model, sum up the weighted cvs over the test points,
    # then take the sum over all models
    cv = sum(var.sum() for var in point_weighted_var_at_X)

    logger.debug("CV done")
    return float(cv), variations_at_X
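# For reference, st.variation above is scipy.stats.variation: the ratio of
# the standard deviation to the mean, taken here across the bootstrap axis.
# A tiny illustration with made-up shapes (5 bootstrap replicates of the
# weights at 3 test points):

import numpy as np
from scipy import stats as st

boot_w = np.random.default_rng(0).random((5, 3))
cv_at_X = st.variation(boot_w, axis=0)   # std/mean per test point
print(cv_at_X.shape)                     # (3,)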
def _rejoin_pool_dask():
    from dask.distributed import rejoin
    rejoin()