Пример #1
0
    def start_worker(self):
        """Create (if needed) the worker RPC server, then build and run a Worker.

        The RPC server is only constructed when ``self.worker_rpc_server`` is
        still ``None``; it is bound to ``(self.ip, self.port)``.  The new
        ``Worker`` is stored on ``self.worker``, its ``run()`` is invoked, and
        the stored worker is returned.
        """
        if self.worker_rpc_server is not None:
            pass  # an RPC server was already attached; keep it
        else:
            bind_addr = (self.ip, self.port)
            self.worker_rpc_server = ThreadedColaRPCServer(bind_addr)

        self.worker = Worker(self)
        self.worker.run()
        return self.worker
Пример #2
0
 def start_worker(self):
     """Start a Worker for this context and return it.

     A ``ThreadedColaRPCServer`` bound to ``(self.ip, self.port)`` is
     created first, but only when no worker RPC server has been set yet.
     """
     needs_server = self.worker_rpc_server is None
     if needs_server:
         endpoint = (self.ip, self.port)
         self.worker_rpc_server = ThreadedColaRPCServer(endpoint)

     # Keep the worker on the instance before running it.
     self.worker = Worker(self)
     self.worker.run()

     return self.worker
Пример #3
0
class Context(object):
    """Runtime context holding addresses, shared state and master/worker handles.

    The context normalises the master and local addresses, owns a
    ``ContextManager`` whose shared dict is exposed as ``self.env``, and
    offers helpers to start a master or worker, run jobs and query a master
    either in-process or through ``client_call``.
    """

    def fix_addr(self, addr):
        """Return ``addr`` as ``host:port``, appending the configured worker port if absent."""
        if ':' in addr:
            return addr
        return '%s:%s' % (addr, main_conf.worker.port)

    def fix_ip(self, addr):
        """Return only the host part of ``addr`` (text before the first ':')."""
        if ':' not in addr:
            return addr
        return addr.split(':', 1)[0]

    def __init__(self, local_mode=False, is_master=False, master_addr=None,
                 is_client=False, working_dir=None, mkdirs=False,
                 ip=None, ips=None):
        """Build the context.

        :param local_mode: when False a ``master_addr`` is mandatory.
        :param is_master: whether this context represents the master node.
        :param master_addr: master ``host`` or ``host:port``.
        :param is_client: whether this context is a pure client.
        :param working_dir: root directory; defaults to ``<tmp>/cola``.
        :param mkdirs: create the default working directory if it is missing
            (only consulted when ``working_dir`` is not supplied).
        :param ip: local ``host`` or ``host:port``.
        :param ips: initial node list (``host`` or ``host:port`` entries).
        :raises ValueError: if ``local_mode`` is False and ``master_addr`` is None.
        """
        self.is_local_mode = local_mode
        self.is_master = is_master
        self.is_client = is_client

        self.master_addr = master_addr
        self.master_ip = self.master_addr
        if not self.is_local_mode:
            if self.master_addr is None:
                raise ValueError('Master address must be supplied when local_mode is False')

            if ':' not in self.master_addr:
                self.master_addr = '%s:%s' % (self.master_addr, main_conf.master.port)
            else:
                self.master_ip = self.master_addr.split(':', 1)[0]
            self.master_port = int(self.master_addr.split(':', 1)[1])

        self.working_dir = working_dir
        if self.working_dir is None:
            tmp = tempfile.gettempdir()
            self.working_dir = os.path.join(tmp, 'cola')
            if mkdirs and not os.path.exists(self.working_dir):
                os.makedirs(self.working_dir)

        self.ip = ip
        if self.ip is None:
            # Only reuse the master's address when one is actually known;
            # otherwise (local-mode master without master_addr) detect it,
            # instead of leaving self.ip as None and failing below.
            if self.is_master and self.master_ip is not None:
                self.ip = self.master_ip
            else:
                self.ip = get_ip()
                if self.is_local_mode and not self.ip:
                    self.ip = '127.0.0.1'
        if ':' in self.ip:
            self.addr = self.ip
            self.ip = self.ip.split(':', 1)[0]
        else:
            self.addr = '%s:%s' % (self.ip, main_conf.worker.port)
        self.port = int(self.addr.split(':', 1)[1])
        if self.master_addr is None:
            self.master_addr = '%s:%s' % (self.ip, main_conf.master.port)
        self.worker_addr = self.addr

        # Work on a copy so the caller's list is never mutated.
        nodes = list(ips) if ips is not None else []
        if not nodes:
            nodes.append(self.ip)
        self.addrs = [self.fix_addr(node) for node in nodes]
        self.ips = [self.fix_ip(node) for node in nodes]

        self.manager = ContextManager()
        self.manager.start(manager_init)
        self.env = self.manager.dict({'ip': self.ip,
                                      'addr': self.addr,
                                      'port': self.port,
                                      'root': self.working_dir,
                                      'is_local': self.is_local_mode,
                                      'master_ip': self.master_ip,
                                      'master_addr': self.master_addr,
                                      'job_desc': {}
                                      })
        self.logger = get_logger('cola_context')

        self.master_rpc_server = None
        self.worker_rpc_server = None

    def _local_master(self):
        """Return the in-process master object, or None when there is none.

        ``self.master`` only exists after ``start_master()`` ran, so it is
        looked up defensively instead of being accessed directly.
        """
        if not self.is_master:
            return None
        return getattr(self, 'master', None)

    def add_node(self, ip_or_addr):
        """Register a node given as ``host`` or ``host:port``; duplicates are ignored."""
        if ':' not in ip_or_addr:
            if ip_or_addr in self.ips:
                return
            self.ips.append(ip_or_addr)
            self.addrs.append(self.fix_addr(ip_or_addr))
        else:
            if ip_or_addr in self.addrs:
                return
            self.addrs.append(ip_or_addr)
            self.ips.append(self.fix_ip(ip_or_addr))

    def remove_node(self, ip_or_addr):
        """Unregister a node given as ``host`` or ``host:port``.

        Unknown nodes are ignored.  The companion entry in the other list is
        removed only if present, so the call never raises ``ValueError``.
        """
        if ':' not in ip_or_addr:
            if ip_or_addr not in self.ips:
                return
            self.ips.remove(ip_or_addr)
            companion_addr = self.fix_addr(ip_or_addr)
            if companion_addr in self.addrs:
                self.addrs.remove(companion_addr)
        else:
            if ip_or_addr not in self.addrs:
                return
            self.addrs.remove(ip_or_addr)
            # The bare IP lives in self.ips, not in self.addrs.
            companion_ip = self.fix_ip(ip_or_addr)
            if companion_ip in self.ips:
                self.ips.remove(companion_ip)

    def get_cola_dir(self):
        """Return the absolute directory containing this module."""
        return os.path.dirname(os.path.abspath(__file__))

    def _get_name_and_dir(self, working_dir, job_name,
                          overwrite=False, clear=False):
        """Pick the job name and working directory under ``working_dir``.

        * ``clear``: an existing ``<working_dir>/<job_name>`` is deleted and
          the original name/directory are returned.  Only that directory is
          removed, never numbered siblings.
        * ``overwrite`` (without ``clear``): a numeric suffix (1, 2, ...) is
          appended to the name until an unused directory is found.
        * neither: the original name/directory are returned untouched.

        :returns: ``(job_name, working_dir)`` tuple.
        """
        base_dir = working_dir
        src_working_dir = os.path.join(base_dir, job_name)

        if clear:
            if os.path.exists(src_working_dir):
                shutil.rmtree(src_working_dir)
            return job_name, src_working_dir
        if not overwrite:
            return job_name, src_working_dir

        candidate_name, candidate_dir = job_name, src_working_dir
        idx = 1
        while os.path.exists(candidate_dir):
            candidate_name = '%s%s' % (job_name, idx)
            candidate_dir = os.path.join(base_dir, candidate_name)
            idx += 1
        return candidate_name, candidate_dir

    def _clear_job_desc(self, job_name):
        """Drop ``job_name`` from ``self.env['job_desc']`` if it is present."""
        if job_name in self.env['job_desc']:
            del self.env['job_desc'][job_name]

    def _run_local_job(self, job_path, overwrite=False, rpc_server=None, settings=None):
        """Run the job at ``job_path`` in a background thread and wait for it.

        The wait loop ends when the thread dies, the job reports ``FINISHED``,
        the job stayed ``IDLE`` for more than ``MAX_IDLE_TIMES`` polls, or the
        join raises ``IOError``.  SIGINT/SIGTERM handlers are installed that
        shut the job (and ``rpc_server`` when given) down once.
        """
        job_desc = import_job_desc(job_path)
        if settings is not None:
            job_desc.update_settings(settings)
        base_name = job_desc.uniq_name
        self.env['job_desc'][base_name] = job_desc

        # One directory per local address, e.g. 127_0_0_1_<port>.
        addr_dirname = self.addr.replace('.', '_').replace(':', '_')
        working_dir = os.path.join(self.working_dir, 'worker', addr_dirname)
        clear = job_desc.settings.job.clear
        job_name, working_dir = self._get_name_and_dir(
            working_dir, base_name, overwrite=overwrite, clear=clear)

        clock = Clock()
        job = Job(self, job_path, job_name, job_desc=job_desc,
                  working_dir=working_dir, rpc_server=rpc_server,
                  manager=self.manager)
        t = threading.Thread(target=job.run, args=(True, ))
        t.start()

        stopped = multiprocessing.Event()

        def stop(signum, frame):
            # Ignore the signal in processes whose name does not contain 'main'.
            if 'main' not in multiprocessing.current_process().name.lower():
                return
            # Only the first signal triggers the shutdown.
            if stopped.is_set():
                return
            else:
                stopped.set()

            self.logger.debug("Catch interrupt signal, start to stop")
            job.shutdown()
            if rpc_server:
                rpc_server.shutdown()

        signal.signal(signal.SIGINT, stop)
        signal.signal(signal.SIGTERM, stop)

        idle_times = 0
        while t.is_alive():
            if job.get_status() == FINISHED:
                break
            if job.get_status() == IDLE:
                idle_times += 1
                if idle_times > MAX_IDLE_TIMES:
                    break
            else:
                idle_times = 0

            try:
                t.join(5)
            except IOError:
                break

        need_shutdown = False
        if not job.stopped.is_set() and job.get_status() == FINISHED:
            self.logger.debug('All objects have been fetched, try to finish job')
            need_shutdown = True
        elif not stopped.is_set() and not t.is_alive():
            need_shutdown = True
        elif not job.stopped.is_set() and job.get_status() == IDLE:
            self.logger.debug('No bundle or url to perform, try to finish job')
            need_shutdown = True

        if need_shutdown is True:
            job.shutdown()
            if rpc_server:
                rpc_server.shutdown()

        self.logger.debug('Job id:%s finished, spend %.2f seconds for running' % (
            job_name, clock.clock()))

    def run_job(self, job_path, overwrite=False, init_rpc=False, settings=None):
        """Run the job at ``job_path``.

        In local mode the job runs in-process via ``_run_local_job``.
        Otherwise the job directory is zipped into ``<working_dir>/zip`` and
        handed to ``self.master`` or ``self.worker``, whichever exists.

        :param init_rpc: create a ``ThreadedColaRPCServer`` on
            ``(self.ip, self.port)`` and pass it to the local job.
        """
        rpc_server = None
        if init_rpc:
            rpc_server = ThreadedColaRPCServer((self.ip, self.port))

        if self.is_local_mode:
            self._run_local_job(job_path, overwrite=overwrite,
                                rpc_server=rpc_server, settings=settings)
        else:
            job_name = import_job_desc(job_path).uniq_name

            def create_zip(working_dir):
                # NOTE(review): ``working_dir`` is not used here; the archive
                # always goes to <self.working_dir>/zip -- confirm intent.
                zip_dir = os.path.join(self.working_dir, 'zip')
                filename = job_name + '.zip'
                zip_file = os.path.join(zip_dir, filename)

                ZipHandler.compress(zip_file, job_path, type_filters=('pyc', ))
                return job_name

            if hasattr(self, 'master'):
                create_zip(os.path.join(self.working_dir, 'master'))
                self.master.run_job(job_name, unzip=True)
            elif hasattr(self, 'worker'):
                create_zip(os.path.join(self.working_dir, 'worker'))
                self.worker.prepare(job_name, unzip=True)
                self.worker.run_job(job_name)

    def start_master(self):
        """Create and run a ``Master``; returns None when not a master context."""
        if not self.is_master:
            return

        if self.master_rpc_server is None:
            self.master_rpc_server = ThreadedColaRPCServer((self.ip,
                                                            self.master_port))

        self.master = Master(self)
        self.master.run()

        return self.master

    def start_worker(self):
        """Create and run a ``Worker`` and return it."""
        if self.worker_rpc_server is None:
            self.worker_rpc_server = ThreadedColaRPCServer((self.ip,
                                                            self.port))
        self.worker = Worker(self)
        self.worker.run()

        return self.worker

    def kill_master(self):
        """Shut the master down, locally if it runs here, else remotely for clients."""
        master = self._local_master()
        if master is not None:
            master.shutdown()
        elif self.is_client:
            client_call(self.master_addr, 'shutdown')

    def list_workers(self):
        """Return the master's worker list (local call or ``client_call``)."""
        master = self._local_master()
        if master is not None:
            return master.list_workers()
        return client_call(self.master_addr, 'list_workers')

    def list_jobs(self):
        """Return ``{job_id: {'name': ..., 'status': 'running'|'stopped'}}``."""
        master = self._local_master()
        if master is not None:
            runnable_jobs = master.list_runnable_jobs()
            running_jobs = master.job_tracker.running_jobs
        else:
            runnable_jobs = client_call(self.master_addr, 'runnable_jobs')
            running_jobs = client_call(self.master_addr, 'running_jobs')

        jobs = {}
        # items() works on both Python 2 and Python 3 mappings.
        for job_id, job_name in runnable_jobs.items():
            status = 'running' if job_id in running_jobs else 'stopped'
            jobs[job_id] = {'name': job_name, 'status': status}
        return jobs

    def kill_job(self, job_id):
        """Ask the master (local or remote) to stop ``job_id``."""
        master = self._local_master()
        if master is not None:
            master.stop_job(job_id)
        else:
            client_call(self.master_addr, 'stop_job', job_id)

    def get_job_counter(self, job_id):
        """Return counter output from the local master, or the remote global counter."""
        master = self._local_master()
        if master is not None:
            # NOTE(review): the local path does not use ``job_id`` -- confirm.
            return master.counter_server.output()

        from cola.functions.counter import FUNC_PREFIX
        from cola.core.utils import get_rpc_prefix

        func_name = '%s%s' % (get_rpc_prefix(job_id, FUNC_PREFIX), 'get_global')

        return client_call(self.master_addr, func_name)

    def pack_job_error(self, job_id):
        """Delegate ``pack_job_error(job_id)`` to the local or remote master."""
        master = self._local_master()
        if master is not None:
            return master.pack_job_error(job_id)
        return client_call(self.master_addr, 'pack_job_error', job_id)
Пример #4
0
class Context(object):
    """Runtime context holding addresses, shared state and master/worker handles.

    The context normalises the master address, owns a ``ContextManager``
    whose shared dict is exposed as ``self.env``, and offers helpers to start
    a master or worker, run jobs and query a master either in-process or
    through ``client_call``.
    """

    def fix_addr(self, addr):
        """Return ``addr`` as ``host:port``, appending the configured worker port if absent."""
        if ':' in addr:
            return addr
        return '%s:%s' % (addr, main_conf.worker.port)

    def fix_ip(self, addr):
        """Return only the host part of ``addr`` (text before the first ':')."""
        if ':' not in addr:
            return addr
        return addr.split(':', 1)[0]

    def __init__(self,
                 local_mode=False,
                 is_master=False,
                 master_addr=None,
                 is_client=False,
                 working_dir=None,
                 mkdirs=False,
                 ip=None,
                 ips=None):
        """Build the context.

        :param local_mode: when False a ``master_addr`` is mandatory.
        :param is_master: whether this context represents the master node.
        :param master_addr: master ``host`` or ``host:port``.
        :param is_client: whether this context is a pure client.
        :param working_dir: root directory; defaults to ``<tmp>/cola``.
        :param mkdirs: create the default working directory if it is missing
            (only consulted when ``working_dir`` is not supplied).
        :param ip: local IP; detected when omitted.
        :param ips: initial node list; the given list object is kept as
            ``self.ips`` (not copied) and receives ``self.ip`` when empty.
        :raises ValueError: if ``local_mode`` is False and ``master_addr`` is None.
        """
        self.is_local_mode = local_mode
        self.is_master = is_master
        self.is_client = is_client

        self.master_addr = master_addr
        self.master_ip = self.master_addr
        if not self.is_local_mode:
            if self.master_addr is None:
                raise ValueError(
                    'Master address must be supplied when local_mode is False')

            if ':' not in self.master_addr:
                self.master_addr = '%s:%s' % (self.master_addr,
                                              main_conf.master.port)
            else:
                self.master_ip = self.master_addr.split(':', 1)[0]

        self.working_dir = working_dir
        if self.working_dir is None:
            tmp = tempfile.gettempdir()
            self.working_dir = os.path.join(tmp, 'cola')
            if mkdirs and not os.path.exists(self.working_dir):
                os.makedirs(self.working_dir)

        self.ip = ip
        if self.ip is None:
            # Only reuse the master's address when one is actually known;
            # otherwise (local-mode master without master_addr) detect it,
            # instead of building 'None:<port>' addresses below.
            if self.is_master and self.master_ip is not None:
                self.ip = self.master_ip
            else:
                self.ip = get_ip()
                if self.is_local_mode and not self.ip:
                    self.ip = '127.0.0.1'
        if self.master_addr is None:
            self.master_addr = '%s:%s' % (self.ip, main_conf.master.port)
        self.worker_addr = '%s:%s' % (self.ip, main_conf.worker.port)

        self.ips = ips if ips is not None else []
        if not self.ips:
            self.ips.append(self.ip)
        self.addrs = [self.fix_addr(_ip) for _ip in self.ips]

        self.manager = ContextManager()
        self.manager.start(manager_init)
        self.env = self.manager.dict({
            'ip': self.ip,
            'root': self.working_dir,
            'is_local': self.is_local_mode,
            'master_ip': self.master_ip,
            'job_desc': {}
        })
        self.logger = get_logger('cola_context')

        self.master_rpc_server = None
        self.worker_rpc_server = None

    def _local_master(self):
        """Return the in-process master object, or None when there is none.

        ``self.master`` only exists after ``start_master()`` ran, so it is
        looked up defensively instead of being accessed directly.
        """
        if not self.is_master:
            return None
        return getattr(self, 'master', None)

    def get_cola_dir(self):
        """Return the absolute directory containing this module."""
        return os.path.dirname(os.path.abspath(__file__))

    def _get_name_and_dir(self,
                          working_dir,
                          job_name,
                          overwrite=False,
                          clear=False):
        """Pick the job name and working directory under ``working_dir``.

        * ``clear``: an existing ``<working_dir>/<job_name>`` is deleted and
          the original name/directory are returned.  Only that directory is
          removed, never numbered siblings.
        * ``overwrite`` (without ``clear``): a numeric suffix (1, 2, ...) is
          appended to the name until an unused directory is found.
        * neither: the original name/directory are returned untouched.

        :returns: ``(job_name, working_dir)`` tuple.
        """
        base_dir = working_dir
        src_working_dir = os.path.join(base_dir, job_name)

        if clear:
            if os.path.exists(src_working_dir):
                shutil.rmtree(src_working_dir)
            return job_name, src_working_dir
        if not overwrite:
            return job_name, src_working_dir

        candidate_name, candidate_dir = job_name, src_working_dir
        idx = 1
        while os.path.exists(candidate_dir):
            candidate_name = '%s%s' % (job_name, idx)
            candidate_dir = os.path.join(base_dir, candidate_name)
            idx += 1
        return candidate_name, candidate_dir

    def _run_local_job(self,
                       job_path,
                       overwrite=False,
                       rpc_server=None,
                       settings=None):
        """Run the job at ``job_path`` in a background thread and wait for it.

        The wait loop ends when the thread dies, the job reports ``FINISHED``,
        the job stayed ``IDLE`` for more than ``MAX_IDLE_TIMES`` polls, or the
        join raises ``IOError``.  SIGINT/SIGTERM handlers are installed that
        shut the job (and ``rpc_server`` when given) down once.
        """
        job_desc = import_job_desc(job_path)
        if settings is not None:
            job_desc.update_settings(settings)
        base_name = job_desc.uniq_name
        self.env['job_desc'][base_name] = job_desc

        working_dir = os.path.join(self.working_dir, 'worker')
        clear = job_desc.settings.job.clear
        job_name, working_dir = self._get_name_and_dir(working_dir,
                                                       base_name,
                                                       overwrite=overwrite,
                                                       clear=clear)

        clock = Clock()
        job = Job(self,
                  job_path,
                  job_name,
                  job_desc=job_desc,
                  working_dir=working_dir,
                  rpc_server=rpc_server,
                  manager=self.manager)
        t = threading.Thread(target=job.run, args=(True, ))
        t.start()

        stopped = multiprocessing.Event()

        def stop(signum, frame):
            # Ignore the signal in processes whose name does not contain 'main'.
            if 'main' not in multiprocessing.current_process().name.lower():
                return
            # Only the first signal triggers the shutdown.
            if stopped.is_set():
                return
            else:
                stopped.set()

            self.logger.debug("Catch interrupt signal, start to stop")
            job.shutdown()
            if rpc_server:
                rpc_server.shutdown()

        signal.signal(signal.SIGINT, stop)
        signal.signal(signal.SIGTERM, stop)

        idle_times = 0
        while t.is_alive():
            if job.get_status() == FINISHED:
                break
            if job.get_status() == IDLE:
                idle_times += 1
                if idle_times > MAX_IDLE_TIMES:
                    break
            else:
                idle_times = 0

            try:
                t.join(5)
            except IOError:
                break

        need_shutdown = False
        if not job.stopped.is_set() and job.get_status() == FINISHED:
            self.logger.debug(
                'All objects have been fetched, try to finish job')
            need_shutdown = True
        elif not stopped.is_set() and not t.is_alive():
            need_shutdown = True
        elif not job.stopped.is_set() and job.get_status() == IDLE:
            self.logger.debug('No bundle or url to perform, try to finish job')
            need_shutdown = True

        if need_shutdown is True:
            job.shutdown()
            if rpc_server:
                rpc_server.shutdown()

        self.logger.debug(
            'Job id:%s finished, spend %.2f seconds for running' %
            (job_name, clock.clock()))

    def run_job(self,
                job_path,
                overwrite=False,
                init_rpc=False,
                settings=None):
        """Run the job at ``job_path``.

        In local mode the job runs in-process via ``_run_local_job`` and
        ``settings`` is forwarded to it.  Otherwise the job directory is
        zipped into ``<working_dir>/zip`` and handed to ``self.master`` or
        ``self.worker``, whichever exists.

        :param init_rpc: create a ``ThreadedColaRPCServer`` on the configured
            worker port and pass it to the local job.
        """
        rpc_server = None
        if init_rpc:
            rpc_server = ThreadedColaRPCServer(
                (self.ip, main_conf.worker.port))

        if self.is_local_mode:
            # ``settings`` must reach the job description update step.
            self._run_local_job(job_path,
                                overwrite=overwrite,
                                rpc_server=rpc_server,
                                settings=settings)
        else:
            job_name = import_job_desc(job_path).uniq_name

            def create_zip(working_dir):
                # NOTE(review): ``working_dir`` is not used here; the archive
                # always goes to <self.working_dir>/zip -- confirm intent.
                zip_dir = os.path.join(self.working_dir, 'zip')
                filename = job_name + '.zip'
                zip_file = os.path.join(zip_dir, filename)

                ZipHandler.compress(zip_file, job_path, type_filters=('pyc', ))
                return job_name

            if hasattr(self, 'master'):
                create_zip(os.path.join(self.working_dir, 'master'))
                self.master.run_job(job_name, unzip=True)
            elif hasattr(self, 'worker'):
                create_zip(os.path.join(self.working_dir, 'worker'))
                self.worker.prepare(job_name, unzip=True)
                self.worker.run_job(job_name)

    def start_master(self):
        """Create and run a ``Master``; returns None when not a master context."""
        if not self.is_master:
            return

        if self.master_rpc_server is None:
            self.master_rpc_server = ThreadedColaRPCServer(
                (self.ip, main_conf.master.port))

        self.master = Master(self)
        self.master.run()

        return self.master

    def start_worker(self):
        """Create and run a ``Worker`` and return it."""
        if self.worker_rpc_server is None:
            self.worker_rpc_server = ThreadedColaRPCServer(
                (self.ip, main_conf.worker.port))

        self.worker = Worker(self)
        self.worker.run()

        return self.worker

    def kill_master(self):
        """Shut the master down, locally if it runs here, else remotely for clients."""
        master = self._local_master()
        if master is not None:
            master.shutdown()
        elif self.is_client:
            client_call(self.master_addr, 'shutdown')

    def list_workers(self):
        """Return the master's worker list (local call or ``client_call``)."""
        master = self._local_master()
        if master is not None:
            return master.list_workers()
        return client_call(self.master_addr, 'list_workers')

    def list_jobs(self):
        """Return ``{job_id: {'name': ..., 'status': 'running'|'stopped'}}``."""
        master = self._local_master()
        if master is not None:
            runnable_jobs = master.list_runnable_jobs()
            running_jobs = master.job_tracker.running_jobs
        else:
            runnable_jobs = client_call(self.master_addr, 'runnable_jobs')
            running_jobs = client_call(self.master_addr, 'running_jobs')

        jobs = {}
        # items() works on both Python 2 and Python 3 mappings.
        for job_id, job_name in runnable_jobs.items():
            status = 'running' if job_id in running_jobs else 'stopped'
            jobs[job_id] = {'name': job_name, 'status': status}
        return jobs

    def kill_job(self, job_id):
        """Ask the master (local or remote) to stop ``job_id``."""
        master = self._local_master()
        if master is not None:
            master.stop_job(job_id)
        else:
            client_call(self.master_addr, 'stop_job', job_id)

    def get_job_counter(self, job_id):
        """Return counter output from the local master, or the remote global counter."""
        master = self._local_master()
        if master is not None:
            # NOTE(review): the local path does not use ``job_id`` -- confirm.
            return master.counter_server.output()

        from cola.functions.counter import FUNC_PREFIX
        from cola.core.utils import get_rpc_prefix

        func_name = '%s%s' % (get_rpc_prefix(job_id,
                                             FUNC_PREFIX), 'get_global')

        return client_call(self.master_addr, func_name)

    def pack_job_error(self, job_id):
        """Delegate ``pack_job_error(job_id)`` to the local or remote master."""
        master = self._local_master()
        if master is not None:
            return master.pack_job_error(job_id)
        return client_call(self.master_addr, 'pack_job_error', job_id)