def run_indri(args, output, overwrite_threads=False):
    # Variable, get_client and get_worker come from dask.distributed;
    # get_loadinfo is a project-level helper.
    from subprocess import Popen, PIPE
    import os
    import time

    cancel = Variable('cancel', get_client())
    if cancel.get():
        return ('canceled', get_worker().address, 0, get_loadinfo())

    start = time.time()
    if overwrite_threads:
        # Leave one core free for the worker itself.
        processes = len(os.sched_getaffinity(0)) - 1
        args = (args[0], '-threads={}'.format(processes), *args[1:])

    with Popen(args, stdout=PIPE, stderr=PIPE) as proc:
        content = []
        for l in proc.stdout:
            content.append(l)
            # Poll the distributed cancel flag only every 1000 lines to
            # keep the overhead of Variable.get() low.
            if len(content) % 1000 != 0:
                continue
            if cancel.get():
                proc.kill()
                return ('killed', get_worker().address,
                        time.time() - start, get_loadinfo())

    with open(output, 'wb') as f:
        f.writelines(content)
    return ('completed', get_worker().address,
            time.time() - start, get_loadinfo())

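# Usage sketch (assumption, not from the source): driving run_indri from the
# client with a shared cancel flag. The scheduler address, Indri binary name
# and file paths are hypothetical placeholders.
from dask.distributed import Client, Variable

client = Client('tcp://scheduler:8786')
Variable('cancel', client).set(False)
future = client.submit(run_indri, ('IndriRunQuery', 'queries.xml'),
                       'run.out', overwrite_threads=True)
status, worker_addr, elapsed, load = future.result()
Variable('cancel', client).set(True)  # ask any still-running copies to stop
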
def _func_init_nccl(sessionId, uniqueId):
    """
    Initialize ncclComm_t on worker

    Parameters
    ----------
    sessionId : str
        session identifier from a comms instance
    uniqueId : array[byte]
        The NCCL unique Id generated from the client.
    """
    worker = get_worker()
    raft_comm_state = get_raft_comm_state(
        sessionId=sessionId, state_object=worker
    )
    wid = raft_comm_state["wid"]
    nWorkers = raft_comm_state["nworkers"]

    try:
        n = nccl()
        n.init(nWorkers, uniqueId, wid)
        raft_comm_state["nccl"] = n
    except Exception as e:
        worker.log_event(
            topic="error", msg=f"An error occurred initializing NCCL: {e}."
        )
        raise

def get_endpoints(addr_ports):
    # Create endpoints to all other workers.
    worker = get_worker()
    ucx = worker._ucx
    for address, port in addr_ports:
        if address != worker.address:
            # The host comes from the peer's Dask address; the UCX
            # listener port was reported separately by build_ucx().
            host, _ = parse_host_port(address)
            ucx.get_endpoint(host, port)

def build_ucx():
    # Create listener and cache on worker
    worker = get_worker()
    worker._callback_invoked = False

    def mock_callback(ep):
        get_worker()._callback_invoked = True

    ucx = UCX.get(mock_callback)
    worker._ucx = ucx
    return worker.address, ucx.listener_port()

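# Usage sketch (assumption, not from the source): wiring the two helpers
# together from the client. Client.run(build_ucx) returns a dict mapping
# each worker to its (address, listener_port) pair; passing those pairs to
# get_endpoints then connects every worker to all of its peers.
from dask.distributed import Client

client = Client('tcp://scheduler:8786')  # hypothetical scheduler address
addr_ports = list(client.run(build_ucx).values())
client.run(get_endpoints, addr_ports)
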
def xgboost_postprocess(pkl_path):
    with open(pkl_path, 'rb') as rf:
        obj = pickle.load(rf)

    try:
        worker = get_worker()
        # Look up the worker plugin whose registered name contains 'process'.
        dp = None
        for plg in worker.plugins:
            if 'process' in plg:
                dp = worker.plugins[plg]
                break
        if dp is None:
            raise ValueError('No process plugin registered')
    except Exception as e:
        logger.error(str(e), exc_info=True)
        raise e

    objects = obj['content']
    objects = postprocess(dp.postprocess_model, dp.classes, objects)
    # remove empty strings returned from postprocess
    objects = [i for i in objects if i != '']
    obj['xgboost_content'] = objects

    with open(pkl_path, 'wb') as wf:
        pickle.dump(obj, wf)
    return pkl_path

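# Registration sketch (assumption, not from the source): the lookup above
# matches any worker plugin whose name contains 'process'. A plugin exposing
# the postprocess_model and classes attributes the function reads might be
# registered like this; the class name and constructor values are
# hypothetical placeholders.
from dask.distributed import Client, WorkerPlugin

class ProcessPlugin(WorkerPlugin):
    def __init__(self, model, classes):
        self.postprocess_model = model
        self.classes = classes

client = Client('tcp://scheduler:8786')  # hypothetical
client.register_worker_plugin(ProcessPlugin(model=None, classes=()),
                              name='xgboost-process')
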
def _wrapped_function(function, *args, **kwargs):
    available_resources = kwargs['available_resources']
    per_worker_logging = kwargs.pop('per_worker_logging')
    gpu_assignments = kwargs.pop('gpu_assignments')

    # Set up the logging per worker if the flag is set to True.
    if per_worker_logging:
        # Each worker should have its own log file.
        kwargs['logger_path'] = '{}.log'.format(get_worker().id)

    if available_resources.number_of_gpus > 0:
        worker_id = distributed.get_worker().id
        available_resources._gpu_device_indices = (
            '0' if worker_id not in gpu_assignments
            else gpu_assignments[worker_id]
        )
        logging.info(
            f'Launching a job with access to GPUs '
            f'{available_resources._gpu_device_indices}'
        )

    return_value = _Multiprocessor.run(function, *args, **kwargs)
    return return_value

def _func_set_worker_as_nccl_root(sessionId, verbose):
    """
    Creates a persistent nccl uniqueId on the worker node.

    Parameters
    ----------
    sessionId : str
        Associated session to attach the unique ID to.
    verbose : bool
        Indicates whether or not to emit additional information

    Returns
    -------
    uniqueId : byte str
        NCCL uniqueId, associating this DASK worker as its root node.
    """
    worker = get_worker()
    if verbose:
        worker.log_event(
            topic="info",
            msg=f"Setting worker as NCCL root for session, '{sessionId}'",
        )

    nccl_uid = set_nccl_root(sessionId=sessionId, state_object=worker)

    if verbose:
        worker.log_event(
            topic="info", msg="Done setting worker as NCCL root."
        )

    return nccl_uid

def _wrapped_function(function, *args, **kwargs):
    available_resources = kwargs["available_resources"]
    per_worker_logging = kwargs.pop("per_worker_logging")
    gpu_assignments = kwargs.pop("gpu_assignments")

    # Set up the logging per worker if the flag is set to True.
    if per_worker_logging:
        # Each worker should have its own log file.
        os.makedirs("worker-logs", exist_ok=True)
        kwargs["logger_path"] = os.path.join(
            "worker-logs", f"{get_worker().id}.log"
        )

    if available_resources.number_of_gpus > 0:
        worker_id = distributed.get_worker().id
        available_resources._gpu_device_indices = (
            "0" if worker_id not in gpu_assignments
            else gpu_assignments[worker_id]
        )
        logger.info(f"Launching a job with access to GPUs "
                    f"{available_resources._gpu_device_indices}")

    return_value = _Multiprocessor.run(function, *args, **kwargs)
    return return_value

def evalOneMax(individual, lmbd=3):
    edge = '18_1'
    start_time = 57600
    end_time = 86400
    try:
        # Earlier revisions derived the rank from multiprocessing/scoop
        # worker identities or a random integer:
        # rank = mp.current_process()._identity[0]
        # rank = scoop.worker.decode("utf-8").replace(".", "").replace(":", "")
        # rank = random.randint(0, 100)
        rank = get_worker().id
    except ValueError:
        # get_worker() raises ValueError when not running on a dask worker.
        rank = 0
    return run_sim(lmbd, edge, start_time, end_time, rank, individual)

def dask_to_fst(*args, **kwargs):
    import sys
    import logging
    from dask.distributed import get_worker

    # logger name is the same for all workers
    logger = logging.getLogger(logging_basename)

    # add handlers if none are present for this worker
    if not len(logger.handlers):
        command_line_args = args[2]
        logger.setLevel(command_line_args.log_level)
        logging.captureWarnings(True)

        # handlers
        worker_id = str(get_worker().id).lower()
        stream_handler = logging.StreamHandler(sys.stdout)
        file_handler = logging.FileHandler('logs/' + worker_id, 'w')

        # levels
        stream_handler.setLevel(command_line_args.log_level)
        file_handler.setLevel(command_line_args.log_level)

        # format
        formatter_stream = logging.Formatter(worker_id + ' %(message)s')
        stream_handler.setFormatter(formatter_stream)
        formatter_file = logging.Formatter(
            '%(asctime)s - %(name)s in %(funcName)s - %(levelname)s - %(message)s'
        )
        file_handler.setFormatter(formatter_file)

        # add handlers
        logger.addHandler(stream_handler)
        logger.addHandler(file_handler)

    return to_fst(*args, **kwargs)

def _train_part(params, model_factory, list_of_parts, worker_address_to_port,
                return_model, time_out=120, **kwargs):
    local_worker_address = get_worker().address
    machine_list = ','.join([
        '%s:%d' % (urlparse(worker_address).hostname, port)
        for worker_address, port in worker_address_to_port.items()
    ])
    network_params = {
        'machines': machine_list,
        'local_listen_port': worker_address_to_port[local_worker_address],
        'time_out': time_out,
        'num_machines': len(worker_address_to_port)
    }
    params.update(network_params)

    # Concatenate many parts into one
    parts = tuple(zip(*list_of_parts))
    data = _concat(parts[0])
    label = _concat(parts[1])
    weight = _concat(parts[2]) if len(parts) == 3 else None

    try:
        model = model_factory(**params)
        model.fit(data, label, sample_weight=weight, **kwargs)
    finally:
        _safe_call(_LIB.LGBM_NetworkFree())

    return model if return_model else None

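# Dispatch sketch (assumption, not from the source): dask-lightgbm-style
# training launches one _train_part task per worker, pinned with workers=.
# `client`, `params`, `worker_address_to_port`, and `parts_by_worker`
# (mapping worker address -> its local data parts) are assumed to have
# been collected beforehand.
import lightgbm

futures = [
    client.submit(
        _train_part, params, lightgbm.LGBMClassifier, parts,
        worker_address_to_port, return_model=(i == 0),
        workers=[addr], allow_other_workers=False,
    )
    for i, (addr, parts) in enumerate(parts_by_worker.items())
]
model = next(m for m in client.gather(futures) if m is not None)
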
def __setstate__(self, state):
    # When we are running on a dask worker, functions are executed in a
    # different thread from the worker itself, even if there is only one
    # thread. To prevent problems with SQLite, we check if this is a
    # worker and if there is only one thread, in which case we can safely
    # ignore the fact that the database is accessed from a different
    # thread than where it is created.
    from dask.distributed import get_worker
    try:
        worker = get_worker()
    except ValueError:
        n_threads = -1
    else:
        n_threads = worker.nthreads

    database_path = state.pop('_sqlitedb_path_', None)
    database_readonly = state.pop('_sqlitedb_readonly_', False)
    self.__dict__ = state
    if database_path and not database_readonly:
        from ...database import SQLiteDB
        if os.path.exists(database_path):
            self.db = SQLiteDB(
                database_path,
                initialize='skip',
                readonly=database_readonly,
                check_same_thread=(n_threads != 1),
            )

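# Side note (illustration, not from the source): the try/except probe above
# is the standard way to detect whether code is running on a dask worker;
# get_worker() raises ValueError anywhere else.
from dask.distributed import get_worker

def on_dask_worker():
    try:
        get_worker()
        return True
    except ValueError:
        return False
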
def construct_linked_kb(eids):
    try:
        worker = get_worker()
        dp = None
        for plg in worker.plugins:
            if 'Linking' in plg:
                dp = worker.plugins[plg]
                break
        if dp is None:
            raise Exception('No linking plugin registered')
        if eids is None:
            return None

        results = []
        for eid in eids:
            entity = dp.linker.kb.cui_to_entity[eid]
            result = {
                'id': entity.concept_id,
                'name': entity.canonical_name,
                'aliases': tuple(entity.aliases),
                'types': tuple(entity.types),
                'description': entity.definition,
            }
            results.append(result)

        result_df = pd.DataFrame(results)
        return result_df
    except Exception as e:
        logger.error(str(e), exc_info=True)
        return None

def link(content, score_threshold=0.8):
    try:
        worker = get_worker()
        dp = None
        for plg in worker.plugins:
            if 'Linking' in plg:
                dp = worker.plugins[plg]
                break
        if dp is None:
            raise Exception('No linking plugin registered')

        linking_result = dp.nlp(content)
        ent_set = set()
        nonlinked_list = set()

        # We'll only add one copy of the entity mention per paragraph.
        for ent in linking_result.ents:
            linked = False
            for ent_id, score in ent._.kb_ents:
                if score > score_threshold:
                    linked = True
                    if ent_id in ent_set:
                        continue
                    ent_set.add(ent_id)
                    break
            if not linked:
                nonlinked_list.add(ent.text)

        ent_set = list(ent_set)
        nonlinked_list = list(nonlinked_list)
        return nonlinked_list, ent_set
    except Exception as e:
        logger.error(str(e), exc_info=True)
        return (None, None)

def func_chunk(chunk_ds):
    worker = get_worker()
    # 'memory_limit': '2GB',
    worker.memory_target_fraction = 0.95
    worker.memory_spill_fraction = 0.95
    worker.memory_pause_fraction = 0.95
    worker.memory_terminate_fraction = 0.95  # False

    # print('chunk started:', chunk_ds.lat[0].data, chunk_ds.lon[0].data, flush=True)
    res_ds = nested_groupby_apply(chunk_ds, ['lat', 'lon', 'prob'], func_pwc_mr_fd)

    # groupby removes the dimensions mentioned, so the resulting ds is
    # lower dimensional; unfortunately, map_blocks does not do that, and so
    # putting the sub-result datasets back together becomes technically
    # difficult:
    # chunk_fake_ds = make_fake_ds(chunk_ds).chunk(sub_chunk_dict)
    # sub_chunk_ds = chunk_ds.chunk(sub_chunk_dict)
    # res_ds = xr.map_blocks(func_pwc_mr_fd, sub_chunk_ds, template=chunk_fake_ds)

    # print(
    #     'chunk finished:',
    #     chunk_ds.lat[0].data, chunk_ds.lon[0].data, chunk_ds.prob[0].data,
    #     flush=True
    # )
    # write_to_logfile(
    #     'chunk finished,',
    #     "lat:", chunk_ds.lat[0].data,
    #     "lon:", chunk_ds.lon[0].data,
    #     "prob:", chunk_ds.prob[0].data
    # )
    return res_ds

def fake_remote_fct(
    docker_auth: DockerBasicAuth,
    service_key: str,
    service_version: str,
    input_data: TaskInputData,
    output_data_keys: TaskOutputDataSchema,
    log_file_url: AnyUrl,
    command: List[str],
) -> TaskOutputData:
    # get the task data
    worker = get_worker()
    task = worker.tasks.get(worker.get_current_task())
    assert task is not None
    print(f"--> task {task=} started")
    cancel_event = Event(TaskCancelEventName.format(task.key))

    # tell the client we are started
    start_event = Event(_DASK_EVENT_NAME)
    start_event.set()

    # sleep a bit in case someone is aborting us
    print("--> waiting for task to be aborted...")
    cancel_event.wait(timeout=10)
    if cancel_event.is_set():
        # NOTE: asyncio.CancelledError is not propagated back to the client...
        print("--> raising cancellation error now")
        raise TaskCancelledError

    return TaskOutputData.parse_obj({"some_output_key": 123})

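# Client-side sketch of the cancel handshake above (assumption, not from the
# source): wait for the start event the task sets, then fire its per-task
# cancel event. `future` is assumed to be the future returned by
# client.submit(fake_remote_fct, ...).
from distributed import Event

Event(_DASK_EVENT_NAME).wait(timeout=30)
Event(TaskCancelEventName.format(future.key)).set()
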
def dask_mapper(current_range):
    """
    Gets the paths to the file(s) in the current executor, then
    declares the headers found.

    Args:
        current_range (tuple): The current range of the dataset being
            processed on the executor.

    Returns:
        function: The map function to be executed on each executor,
            complete with all headers needed for the analysis.
    """
    # Retrieve the current worker local directory
    localdir = get_worker().local_directory

    # Get and declare headers on each worker
    headers_on_executor = [
        os.path.join(localdir, os.path.basename(filepath))
        for filepath in headers
    ]
    Utils.declare_headers(headers_on_executor)

    # Get and declare shared libraries on each worker
    shared_libs_on_ex = [
        os.path.join(localdir, os.path.basename(filepath))
        for filepath in shared_libraries
    ]
    Utils.declare_shared_libraries(shared_libs_on_ex)

    return mapper(current_range)

async def _func_ucp_create_endpoints(sessionId, worker_info):
    """
    Runs on each worker to create ucp endpoints to all other workers

    :param sessionId: uuid unique id for this instance
    :param worker_info: dict Maps worker address to rank & UCX port
    """
    dask_worker = get_worker()
    local_address = dask_worker.address

    eps = [None] * len(worker_info)
    count = 1

    for k in worker_info:
        if str(k) != str(local_address):
            ip, port = parse_host_port(k)
            ep = await ucp.create_endpoint(ip, worker_info[k]["p"])
            eps[worker_info[k]["r"]] = ep
            count += 1

    worker_state(sessionId)["ucp_eps"] = eps

def worker_load(h5pyFileName, localNames, svNames, elements, allSums):
    from dask.distributed import get_worker

    worker = get_worker()
    worker._structures = {}
    worker._true_forces = {}

    with h5py.File(h5pyFileName, 'r') as h5pyFile:
        for struct in localNames:
            worker._structures[struct] = {}
            for sv in svNames:
                worker._structures[struct][sv] = {}
                for elem in elements:
                    worker._structures[struct][sv][elem] = {}

                    group = h5pyFile[struct][sv][elem]
                    energyData = np.array(group['energy'][()], dtype=np.float32)
                    forcesData = np.array(group['forces'][()], dtype=np.float32)

                    if (allSums) and (len(forcesData.shape) == 4):
                        forcesData = forcesData.sum(axis=0)

                    worker._structures[struct][sv][elem]['energy'] = energyData
                    worker._structures[struct][sv][elem]['forces'] = forcesData

            tvF = h5pyFile[struct].attrs['forces']
            worker._true_forces[struct] = np.array(tvF, dtype=np.float32)

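# Preload sketch (assumption, not from the source): Client.run executes
# worker_load once on every worker so each one caches its slice of the HDF5
# data before compute tasks arrive. The file name and keys are hypothetical.
from dask.distributed import Client

client = Client('tcp://scheduler:8786')  # hypothetical
client.run(worker_load, 'structures.h5',
           localNames=['struct0'], svNames=['rho'],
           elements=['Mo'], allSums=False)
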
def get_raft_comm_state(sessionId, state_object=None):
    """
    Retrieves cuML comms state on the scheduler or worker node for the
    given sessionId, creating a new session if it does not exist. If no
    session id is given, returns the state dict for all sessions.

    Parameters
    ----------
    sessionId : str
        Session id to retrieve from the dask scheduler/worker instances
    state_object : object
        Object (either Worker or Scheduler) on which the raft comm
        state will be retrieved (or created)

    Returns
    -------
    session state : dict
        session state associated with sessionId
    """
    state_object = state_object if state_object is not None else get_worker()

    if not hasattr(state_object, "_raft_comm_state"):
        state_object._raft_comm_state = {}

    if (
        sessionId is not None
        and sessionId not in state_object._raft_comm_state
    ):
        state_object._raft_comm_state[sessionId] = {"ts": time.time()}

    if sessionId is not None:
        return state_object._raft_comm_state[sessionId]

    return state_object._raft_comm_state

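# Usage sketch (assumption, not from the source): reading session state from
# inside a task running on a worker. The session id is a placeholder; the
# "wid"/"nworkers" keys are populated by the init helpers in this module.
def my_rank(sessionId='session-1'):
    state = get_raft_comm_state(sessionId=sessionId, state_object=get_worker())
    return state.get("wid"), state.get("nworkers")
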
def _train_part(params, model_factory, list_of_parts, worker_addresses,
                return_model, local_listen_port=12400, time_out=120,
                **kwargs):
    network_params = build_network_params(worker_addresses,
                                          get_worker().address,
                                          local_listen_port, time_out)
    params.update(network_params)

    # Concatenate many parts into one
    parts = tuple(zip(*list_of_parts))
    data = concat(parts[0])
    label = concat(parts[1])
    weight = concat(parts[2]) if len(parts) == 3 else None

    try:
        model = model_factory(**params)
        model.fit(data, label, sample_weight=weight)
    finally:
        _safe_call(_LIB.LGBM_NetworkFree())

    return model if return_model else None

def call_anneal_method(remote_worker,
                       sampler_state,
                       lambdas,
                       noneq_trajectory_filename=None,
                       num_integration_steps=1,
                       return_timer=False,
                       return_sampler_state=False,
                       rethermalize=False,
                       compute_incremental_work=True):
    """
    This function calls LocallyOptimalAnnealing.anneal; since we can only
    map functions with parallelisms (no actors), we need to submit a
    function that calls the LocallyOptimalAnnealing.anneal method.
    """
    if remote_worker == 'remote':
        _class = distributed.get_worker()
    else:
        _class = remote_worker

    (incremental_work, new_sampler_state, timer, _pass,
     endstate_corrections) = _class.annealing_class.anneal(
        sampler_state=sampler_state,
        lambdas=lambdas,
        noneq_trajectory_filename=noneq_trajectory_filename,
        num_integration_steps=num_integration_steps,
        return_timer=return_timer,
        return_sampler_state=return_sampler_state,
        rethermalize=rethermalize,
        compute_incremental_work=compute_incremental_work)

    return (incremental_work, new_sampler_state, timer, _pass,
            endstate_corrections)

def fake_sidecar_fct(
    docker_auth: DockerBasicAuth,
    service_key: str,
    service_version: str,
    input_data: TaskInputData,
    output_data_keys: TaskOutputDataSchema,
    log_file_url: AnyUrl,
    command: List[str],
    expected_annotations: Dict[str, Any],
) -> TaskOutputData:
    sub = Sub(TaskCancelEvent.topic_name())

    # get the task data
    worker = get_worker()
    task = worker.tasks.get(worker.get_current_task())
    assert task is not None
    print(f"--> task {task=} started")
    assert task.annotations == expected_annotations

    # sleep a bit in case someone is aborting us
    print("--> waiting for task to be aborted...")
    for msg in sub:
        assert msg
        print(f"--> received cancellation msg: {msg=}")
        cancel_event = TaskCancelEvent.parse_raw(msg)  # type: ignore
        assert cancel_event
        if cancel_event.job_id == task.key:
            print("--> raising cancellation error now")
            raise asyncio.CancelledError("task cancelled")

    return TaskOutputData.parse_obj({"some_output_key": 123})

def _fit_local(params, model_factory, list_of_parts, worker_addresses,
               return_model, local_listen_port=12400, listen_time_out=120,
               **kwargs):
    network_params = build_network_params(worker_addresses,
                                          get_worker().address,
                                          local_listen_port, listen_time_out)
    params = {**params, **network_params}

    # Prepare data
    if len(list_of_parts[0]) == 3:
        data, labels, weight = zip(*list_of_parts)
        weight = concat(weight)
    else:
        data, labels = zip(*list_of_parts)
        weight = None

    data = concat(data)  # Concatenate many parts into one
    labels = concat(labels)

    try:
        classifier = model_factory(**params)
        classifier.fit(data, labels, sample_weight=weight)
    finally:
        _safe_call(_LIB.LGBM_NetworkFree())

    if return_model:
        return classifier
    else:
        return None

def process(self, inputs):
    '''Load the mortgage performance CSV into a cudf DataFrame.'''
    import cudf

    worker = None
    try:
        from dask.distributed import get_worker
        worker = get_worker()
    except (ValueError, ImportError):
        pass

    logname = convert(self.__class__.__name__)
    logmgr = MortgagePluginsLoggerMgr(worker, logname)
    logger = logmgr.get_logger()

    worker_name = ''
    if worker is not None:
        worker_name = 'WORKER {} '.format(worker.name)

    performance_path = self.conf['csvfile_perfdata']
    logger.info(worker_name + 'LOADING: {}'.format(performance_path))

    cols = list(self.addition.keys())
    dtypes = list(self.addition.values())
    mortgage_gdf = cudf.read_csv(performance_path,
                                 names=cols, dtype=dtypes,
                                 delimiter='|', skiprows=1)

    logmgr.cleanup()
    return mortgage_gdf

async def _func_ucp_create_endpoints(sessionId, worker_info):
    """
    Runs on each worker to create ucp endpoints to all other workers

    Parameters
    ----------
    sessionId : str
        uuid unique id for this instance
    worker_info : dict
        Maps worker addresses to NCCL ranks & UCX ports
    """
    eps = [None] * len(worker_info)
    count = 1

    for k in worker_info:
        ip, port = parse_host_port(k)
        ep = await get_ucx().get_endpoint(ip, worker_info[k]["port"])
        eps[worker_info[k]["rank"]] = ep
        count += 1

    raft_comm_state = get_raft_comm_state(
        sessionId=sessionId, state_object=get_worker()
    )
    raft_comm_state["ucp_eps"] = eps

def _func_store_initial_state(nworkers, sessionId, uniqueId, wid):
    raft_comm_state = get_raft_comm_state(
        sessionId=sessionId, state_object=get_worker()
    )
    raft_comm_state["nccl_uid"] = uniqueId
    raft_comm_state["wid"] = wid
    raft_comm_state["nworkers"] = nworkers

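# Bootstrap sketch (assumption, not from the source): the order these
# helpers imply is (1) store rank/size/uid on every worker, then (2) let
# each worker initialize NCCL. nccl.get_unique_id() follows the raft/cuml
# convention and may differ across versions; the session id is a
# placeholder, and `client` is assumed to be a connected dask Client.
uid = nccl.get_unique_id()
workers = list(client.scheduler_info()['workers'])
for rank, addr in enumerate(workers):
    client.run(_func_store_initial_state, len(workers), 'session-1', uid,
               rank, workers=[addr])
client.run(_func_init_nccl, 'session-1', uid)
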
def _func_build_handle(sessionId, streams_per_handle, verbose):
    """
    Builds a handle_t on the current worker given the initialized comms

    Parameters
    ----------
    sessionId : str
        id to reference state for current comms instance.
    streams_per_handle : int
        number of internal streams to create
    verbose : bool
        print verbose logging output
    """
    worker = get_worker()

    handle = Handle(streams_per_handle)

    raft_comm_state = get_raft_comm_state(
        sessionId=sessionId, state_object=worker
    )

    workerId = raft_comm_state["wid"]
    nWorkers = raft_comm_state["nworkers"]
    nccl_comm = raft_comm_state["nccl"]

    inject_comms_on_handle_coll_only(
        handle, nccl_comm, nWorkers, workerId, verbose
    )

    if verbose:
        worker.log_event(
            topic="info", msg="Finished injecting comms on handle."
        )

    raft_comm_state["handle"] = handle

def wrapped(doc: str, *args, **kwargs):
    worker = get_worker()

    try:
        nlp = worker.nlp
    except AttributeError:
        # Load the model once per worker and cache it on the Worker object.
        nlp = spacy.load(model)
        worker.nlp = nlp

    return func(nlp(doc), *args, **kwargs)

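# Decorator sketch (assumption, not from the source): the closure above
# implies a factory like this, so tasks can run spaCy functions without
# re-loading the model on every call. The model name is hypothetical.
import functools
import spacy
from dask.distributed import get_worker

def with_spacy(model):
    def decorator(func):
        @functools.wraps(func)
        def wrapped(doc, *args, **kwargs):
            worker = get_worker()
            try:
                nlp = worker.nlp
            except AttributeError:
                nlp = spacy.load(model)  # first call on this worker
                worker.nlp = nlp
            return func(nlp(doc), *args, **kwargs)
        return wrapped
    return decorator

@with_spacy('en_core_web_sm')
def count_entities(doc):
    return len(doc.ents)
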
async def _func_init_all(
    sessionId, uniqueId, comms_p2p, worker_info, verbose, streams_per_handle
):
    worker = get_worker()
    raft_comm_state = get_raft_comm_state(
        sessionId=sessionId, state_object=worker
    )
    raft_comm_state["nccl_uid"] = uniqueId
    raft_comm_state["wid"] = worker_info[worker.address]["rank"]
    raft_comm_state["nworkers"] = len(worker_info)

    if verbose:
        worker.log_event(topic="info", msg="Initializing NCCL.")
        start = time.time()

    _func_init_nccl(sessionId, uniqueId)

    if verbose:
        elapsed = time.time() - start
        worker.log_event(
            topic="info", msg=f"NCCL Initialization took: {elapsed} seconds."
        )

    if comms_p2p:
        if verbose:
            worker.log_event(topic="info", msg="Initializing UCX Endpoints")
            start = time.time()

        await _func_ucp_create_endpoints(sessionId, worker_info)

        if verbose:
            elapsed = time.time() - start
            msg = (
                f"Done initializing UCX endpoints. "
                f"Took: {elapsed} seconds.\nBuilding handle."
            )
            worker.log_event(topic="info", msg=msg)

        _func_build_handle_p2p(sessionId, streams_per_handle, verbose)

        if verbose:
            worker.log_event(topic="info", msg="Done building handle.")
    else:
        _func_build_handle(sessionId, streams_per_handle, verbose)