# NOTE: these import locations are an assumption; pickleutil has moved between
# IPython.utils, ipykernel, and ipyparallel across versions, so adjust to match
# the versions you have installed.
import subprocess
import sys
import time
import uuid

from ipyparallel import Client
from ipykernel import pickleutil

try:
    import dill
except ImportError:
    dill = None


def use_dill(self):
    """Expand serialization support with dill.

    Adds support for closures, etc.

    This calls ipykernel.pickleutil.use_dill() here and on each engine.
    """
    pickleutil.use_dill()
    return self.apply(pickleutil.use_dill)
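# Hedged usage sketch, not part of the original source: assuming use_dill()
# is a method on an ipyparallel DirectView, enabling dill on both the client
# and the engines lets closures survive serialization. The names below are
# illustrative and assume a running ipcluster.
def _demo_use_dill():
    rc = Client()
    dview = rc[:]
    dview.use_dill()
    offset = 10
    # The lambda closes over `offset`; plain pickle would reject it,
    # but dill serializes it cleanly.
    print(dview.map_sync(lambda x: x + offset, range(4)))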
def setup_parallel(parallel):
    """Return a blocking load-balanced view when running in parallel, else None."""
    if parallel:
        pickleutil.use_dill()
        #can_map.pop(FunctionType, None)
        #serialize.pickle = pickle
        print("Running in parallel")
        rc = Client()
        rc[:].use_dill()
        lview = rc.load_balanced_view()
        lview.block = True
    else:
        lview = None
    return lview
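# Hedged usage sketch, an assumption rather than part of the original source:
# with an ipcluster already running, setup_parallel(True) yields a blocking
# load-balanced view whose map() distributes work across the engines.
def _demo_setup_parallel():
    lview = setup_parallel(True)
    if lview is not None:
        base = 100
        # dill (enabled above) lets us ship a closure to the engines;
        # block=True means map() returns the results directly.
        print(lview.map(lambda x: x + base, range(8)))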
def __init__(self, scheduler, queue, num_jobs, cores_per_job=1, profile=None,
             start_wait=16, extra_params=None, retries=None, direct=False,
             wait_for_all_engines=False):
    self.stopped = False
    self.profile = profile
    num_jobs = int(num_jobs)
    cores_per_job = int(cores_per_job)
    start_wait = int(start_wait)

    if extra_params is None:
        extra_params = {}
    max_delay = start_wait * 60
    delay = 5
    max_tries = 10
    _create_base_ipython_dirs()
    if self.profile is None:
        self.has_throwaway = True
        self.profile = create_throwaway_profile()
    else:
        # ensure we have an .ipython directory to prevent issues
        # creating it during parallel startup
        cmd = [sys.executable, "-E", "-c",
               "from IPython import start_ipython; start_ipython()",
               "profile", "create", "--parallel"] + _get_profile_args(self.profile)
        subprocess.check_call(cmd)
        self.has_throwaway = False
    num_tries = 0

    self.cluster_id = str(uuid.uuid4())
    url_file = get_url_file(self.profile, self.cluster_id)

    # Retry cluster submission a bounded number of times in case the
    # scheduler transiently rejects the controller/engine jobs.
    while True:
        try:
            if extra_params.get("run_local"):
                _start_local(num_jobs, self.profile, self.cluster_id)
            else:
                _start(scheduler, self.profile, queue, num_jobs,
                       cores_per_job, self.cluster_id, extra_params)
            break
        except subprocess.CalledProcessError:
            if num_tries > max_tries:
                raise
            num_tries += 1
            time.sleep(delay)
    try:
        self.client = None
        if wait_for_all_engines:
            # Start using the cluster only when this many engines are up
            need_engines = num_jobs
        else:
            need_engines = 1
        slept = 0
        max_up = 0
        up = 0
        while up < need_engines:
            up = _nengines_up(url_file)
            print('\r{0} Engines running'.format(up), end="")
            if up < max_up:
                print("\nEngine(s) that were up have shutdown prematurely. "
                      "Aborting cluster startup.")
                _stop(self.profile, self.cluster_id)
                sys.exit(1)
            max_up = up
            time.sleep(delay)
            slept += delay
            if slept > max_delay:
                raise IOError("""

The cluster startup timed out. This could be for a couple of reasons. The
most common reason is that the queue you are submitting jobs to is
oversubscribed. You can check if this is what is happening by trying again,
and watching to see if jobs are in a pending state or a running state when
the startup times out. If they are in the pending state, that means we just
need to wait longer for them to start, which you can specify by passing the
--timeout parameter, in minutes.

The second reason is that there is a problem with the controller and engine
jobs being submitted to the scheduler. In the directory you ran from, you
should see files that are named YourScheduler_enginesABunchOfNumbers and
YourScheduler_controllerABunchOfNumbers. If you submit one of those files
manually to your scheduler (for example bsub < YourScheduler_controllerABunchOfNumbers),
you will get a more helpful error message that might help you figure out
what is going wrong.

The third reason is that you need to submit your bcbio_nextgen.py job itself
as a job; bcbio-nextgen needs to run on a compute node, not the login node.
So the command you use to run bcbio-nextgen should be submitted as a job to
the scheduler. You can diagnose this because the controller and engine jobs
will be in the running state, but the cluster will still time out.

Finally, it may be an issue with how the cluster is configured: the
controller and engine jobs are unable to talk to each other. They need to be
able to open ports on the machines each of them is running on in order to
work. You can diagnose this as the possible issue if you have submitted the
bcbio-nextgen job to the scheduler, the bcbio-nextgen main job and the
controller and engine jobs are all in a running state, and the cluster still
times out.

This is likely to be something that you'll have to talk to the administrators
of the cluster you are using about. If you need help debugging, please post
an issue here and we'll try to help you with the detective work:
https://github.com/roryk/ipython-cluster-helper/issues
""")
        print()
        self.client = Client(url_file, timeout=60)
        if direct:
            self.view = _get_direct_view(self.client, retries)
        else:
            self.view = _get_balanced_blocked_view(self.client, retries)
        self.view.clusterhelper = {"profile": self.profile,
                                   "cluster_id": self.cluster_id}
        if dill:
            pickleutil.use_dill()
            self.view.apply(pickleutil.use_dill)
    except:
        # Tear down the submitted jobs on any failure, then re-raise.
        self.stop()
        raise
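# Hedged usage sketch, an assumption rather than part of the original source:
# the class name ClusterView and its stop() method mirror
# ipython-cluster-helper's public API, but adjust both to whatever class this
# __init__ actually belongs to. run_local sidesteps the scheduler so the
# sketch can run on a single machine.
def _demo_cluster_view():
    cluster = ClusterView(scheduler="slurm", queue="general", num_jobs=2,
                          cores_per_job=1, extra_params={"run_local": True})
    try:
        # The wrapped view behaves like an ipyparallel view; the lambda
        # relies on dill having been enabled during startup.
        print(cluster.view.map(lambda x: x * x, range(8)))
    finally:
        cluster.stop()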