Example #1
File: view.py  Project: CaptainAL/Spyder
def use_dill(self):
    """Expand serialization support with dill

    adds support for closures, etc.

    This calls ipykernel.pickleutil.use_dill() here and on each engine.
    """
    pickleutil.use_dill()
    return self.apply(pickleutil.use_dill)
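
For context, the sketch below shows how a view's use_dill() might be called from client code. It is a minimal sketch assuming an ipyparallel cluster is already running and that Client and the direct view follow the ipyparallel API used in these examples; make_adder and its closure are purely illustrative.

# Minimal sketch: assumes an ipyparallel cluster has already been started
# (e.g. with `ipcluster start`), so Client() can connect to it.
from ipyparallel import Client

rc = Client()        # connect to the running cluster
dview = rc[:]        # direct view over all engines
dview.use_dill()     # enable dill locally and on every engine (the method above)

def make_adder(offset):
    # Returns a closure; plain pickle cannot serialize it, dill can.
    return lambda x: x + offset

results = dview.map_sync(make_adder(10), range(4))
print(results)       # [10, 11, 12, 13]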
Example #2
def setup_parallel(parallel):
    """Return a blocking load-balanced view if parallel execution was requested, else None."""
    if parallel:
        pickleutil.use_dill()   # dill-based serialization on the client
        #can_map.pop(FunctionType, None)
        #serialize.pickle = pickle
        print("Running in parallel")
        rc = Client()           # connect to the running ipyparallel cluster
        rc[:].use_dill()        # enable dill on every engine as well
        lview = rc.load_balanced_view()
        lview.block = True      # make map/apply on the view synchronous
    else:
        lview = None
    return lview
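
A short usage sketch for setup_parallel() follows. It assumes an ipyparallel cluster is already running and that the enclosing module has imported Client and pickleutil as above; heavy_task is an illustrative placeholder, not part of the original project.

# Hypothetical caller of setup_parallel(); heavy_task is a placeholder.
def heavy_task(x):
    return x * x

lview = setup_parallel(parallel=True)
if lview is not None:
    # blocking, load-balanced map across the engines
    results = lview.map(heavy_task, range(8))
else:
    results = [heavy_task(x) for x in range(8)]
print(list(results))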
Example #3
    def __init__(self, scheduler, queue, num_jobs, cores_per_job=1, profile=None,
                 start_wait=16, extra_params=None, retries=None, direct=False,
                 wait_for_all_engines=False):
        self.stopped = False
        self.profile = profile
        num_jobs = int(num_jobs)
        cores_per_job = int(cores_per_job)
        start_wait = int(start_wait)

        if extra_params is None:
            extra_params = {}
        max_delay = start_wait * 60
        delay = 5
        max_tries = 10
        _create_base_ipython_dirs()
        if self.profile is None:
            self.has_throwaway = True
            self.profile = create_throwaway_profile()
        else:
            # ensure we have an .ipython directory to prevent issues
            # creating it during parallel startup
            cmd = [sys.executable, "-E", "-c", "from IPython import start_ipython; start_ipython()",
                   "profile", "create", "--parallel"] + _get_profile_args(self.profile)
            subprocess.check_call(cmd)
            self.has_throwaway = False
        num_tries = 0

        self.cluster_id = str(uuid.uuid4())
        url_file = get_url_file(self.profile, self.cluster_id)

        while 1:
            try:
                if extra_params.get("run_local"):
                    _start_local(num_jobs, self.profile, self.cluster_id)
                else:
                    _start(scheduler, self.profile, queue, num_jobs, cores_per_job, self.cluster_id, extra_params)
                break
            except subprocess.CalledProcessError:
                if num_tries > max_tries:
                    raise
                num_tries += 1
                time.sleep(delay)

        try:
            self.client = None
            if wait_for_all_engines:
                # Start using cluster when this many engines are up
                need_engines = num_jobs
            else:
                need_engines = 1
            slept = 0
            max_up = 0
            up = 0
            while up < need_engines:
                up = _nengines_up(url_file)
                print('\r{0} Engines running'.format(up), end="")
                if up < max_up:
                    print ("\nEngine(s) that were up have shutdown prematurely. "
                           "Aborting cluster startup.")
                    _stop(self.profile, self.cluster_id)
                    sys.exit(1)
                max_up = up
                time.sleep(delay)
                slept += delay
                if slept > max_delay:
                    raise IOError("""

        The cluster startup timed out. This can happen for a few reasons. The
        most common reason is that the queue you are submitting jobs to is
        oversubscribed. You can check if this is what is happening by trying again
        and watching whether the jobs are in a pending state or a running state when
        the startup times out. If they are in the pending state, we just need to
        wait longer for them to start; you can allow more time by passing the
        --timeout parameter, in minutes.

        The second reason is that there is a problem with the controller and engine
        jobs being submitted to the scheduler. In the directory you ran from,
        you should see files named YourScheduler_enginesABunchOfNumbers and
        YourScheduler_controllerABunchOfNumbers. If you submit one of those files
        manually to your scheduler (for example bsub < YourScheduler_controllerABunchOfNumbers),
        you will get a more helpful error message that might help you figure out what
        is going wrong.

        The third reason is that you need to submit your bcbio_nextgen.py job itself as a job;
        bcbio-nextgen needs to run on a compute node, not the login node. So the
        command you use to run bcbio-nextgen should be submitted as a job to
        the scheduler. You can diagnose this because the controller and engine
        jobs will be in the running state, but the cluster will still timeout.

        Finally, it may be an issue with how the cluster is configured: the controller
        and engine jobs are unable to talk to each other. They need to be able to open
        ports on the machines they are running on in order to work. This is the likely
        issue if you have submitted the bcbio-nextgen job to the scheduler, the
        bcbio-nextgen main job and the controller and engine jobs are all in a running
        state, and the cluster still times out. This is likely something you will have
        to discuss with the administrators of the cluster you are using.

        If you need help debugging, please post an issue here and we'll try to help you
        with the detective work:

        https://github.com/roryk/ipython-cluster-helper/issues

                            """)
            print()
            self.client = Client(url_file, timeout=60)
            if direct:
                self.view = _get_direct_view(self.client, retries)
            else:
                self.view = _get_balanced_blocked_view(self.client, retries)
            self.view.clusterhelper = {"profile": self.profile,
                                       "cluster_id": self.cluster_id}
            if dill:
                pickleutil.use_dill()
                self.view.apply(pickleutil.use_dill)
        except:
            self.stop()
            raise
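
To put this constructor in context, here is a minimal usage sketch. The class name ClusterHelper is hypothetical (the excerpt does not show the enclosing class), and the scheduler and queue values are placeholders; the constructor arguments, the .view attribute, and the stop() call all appear in the code above.

# Hypothetical driver; ClusterHelper stands in for whatever class owns the
# __init__ shown above. The scheduler/queue names are placeholders.
def square(x):
    return x * x

cluster = ClusterHelper(scheduler="slurm", queue="general",
                        num_jobs=4, cores_per_job=1,
                        start_wait=16, direct=False,
                        wait_for_all_engines=True)
try:
    # .view is a blocked load-balanced (or direct) ipyparallel view
    results = cluster.view.map(square, range(16))
    print(list(results))
finally:
    cluster.stop()   # shut down engines, controller, and any throwaway profile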