Exemplo n.º 1
0
    def start_job(self, zip_filename, uncompress=True, client=None):
        """Start a job on the master and ask every watcher to start it too.

        When ``uncompress`` is true, the zip found under ``self.zip_dir`` is
        sent to each watcher whose host part differs from ``self.ip_address``
        and then unpacked into ``self.job_dir``. Otherwise the job directory
        is derived from ``zip_filename`` with its last extension removed.

        If at least one watcher is known, a ``MasterJobInfo`` is recorded in
        ``self.running_jobs`` under ``job.real_name``, ``loader.py`` (located
        next to this module) is spawned as a subprocess, and ``start_job`` is
        invoked on every watcher through ``client_call``.

        :param zip_filename: file name of the job archive.
        :param uncompress: whether to distribute and unpack the archive first.
        :param client: optional value forwarded to the loader via ``-c``.
        """
        if not uncompress:
            job_dir = os.path.join(self.job_dir,
                                   zip_filename.rsplit('.', 1)[0])
        else:
            archive = os.path.join(self.zip_dir, zip_filename)

            # Send the archive to every watcher not running on this host.
            for watcher in self.nodes_watchers:
                if watcher.split(':')[0] != self.ip_address:
                    FileTransportClient(watcher, archive).send_file()

            job_dir = ZipHandler.uncompress(archive, self.job_dir)

        job = import_job(job_dir)

        worker_port = job.context.job.port
        master_port = job.context.job.master_port
        hosts = [watcher.split(':')[0] for watcher in self.nodes_watchers]

        if not hosts:
            # No watchers registered: nothing to launch.
            return

        info = MasterJobInfo(master_port, hosts, worker_port)
        self.running_jobs[job.real_name] = info

        loader_script = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), 'loader.py')
        worker_addrs = ['%s:%s' % (host, worker_port) for host in hosts]

        command = ['python', loader_script]
        command += ['-j', job_dir]
        command += ['-i', self.ip_address]
        command += ['-n', ' '.join(worker_addrs)]
        if self.data_path is not None:
            command += ['-d', self.data_path]
        if self.force:
            command += ['-f']
        if client is not None:
            command += ['-c', client]

        # Keep the process handle on the job info record.
        info.popen = subprocess.Popen(command)

        # Tell each watcher to start the job on its side.
        for watcher in self.nodes_watchers:
            client_call(watcher, 'start_job', zip_filename, uncompress,
                        ignore=True)
Exemplo n.º 2
0
def runLocalJob(master, job_path):
    """Push a local job to the cola cluster and run it.

    Logs an error and returns early when ``job_path`` does not exist or when
    ``import_job`` raises ``ImportError``/``AttributeError`` for it. Otherwise
    the log and RPC servers are started, the job directory is zipped into a
    temporary directory, the archive is sent to ``master``, and ``start_job``
    is invoked on the master. The call then blocks on the RPC server thread.

    NOTE(review): ``client`` is not defined inside this function; presumably
    it is a module-level name -- verify against the rest of the file.
    """
    if not os.path.exists(job_path):
        logger.error('Job path not exists!')
        return

    # Importing the job up front validates the path before any server starts.
    try:
        import_job(job_path)
    except (ImportError, AttributeError):
        logger.error('Job path is illegal!')
        return

    start_log_server()
    rpc_thread = start_rpc_server()

    logger.info('Pushing job to cola cluster...')
    staging_dir = tempfile.mkdtemp()
    try:
        job_basename = os.path.basename(job_path)
        zip_filename = job_basename.replace(' ', '_') + '.zip'
        archive = os.path.join(staging_dir, zip_filename)

        ZipHandler.compress(archive, job_path, type_filters=("pyc", ))
        uploader = FileTransportClient(master, archive)
        uploader.send_file()

        logger.info('Push finished.')
    finally:
        # The staging directory is removed whether or not the push succeeded.
        shutil.rmtree(staging_dir)

    logger.info('Start to run job.')
    _client_call(master, 'start_job', zip_filename, True, client)
    rpc_thread.join()
Exemplo n.º 3
0
 def start_job(self, zip_filename, uncompress=True, client=None):
     """Launch the job locally and trigger it on every watcher.

     With ``uncompress`` set, the archive in ``self.zip_dir`` is transferred
     to each watcher whose host is not ``self.ip_address`` and unpacked into
     ``self.job_dir``; otherwise the job directory is ``zip_filename`` minus
     its last extension, under ``self.job_dir``.

     When watchers exist, the job is registered in ``self.running_jobs``,
     ``loader.py`` is started as a subprocess, and ``client_call`` asks each
     watcher to run ``start_job``.

     :param zip_filename: name of the job zip file.
     :param uncompress: distribute and unpack the zip before starting.
     :param client: optional value passed to the loader with ``-c``.
     """
     if uncompress:
         archive_path = os.path.join(self.zip_dir, zip_filename)

         # Transfer the zip file to watchers on other hosts.
         for watcher in self.nodes_watchers:
             watcher_host = watcher.partition(':')[0]
             if watcher_host != self.ip_address:
                 sender = FileTransportClient(watcher, archive_path)
                 sender.send_file()

         job_dir = ZipHandler.uncompress(archive_path, self.job_dir)
     else:
         job_dir_name = zip_filename.rsplit('.', 1)[0]
         job_dir = os.path.join(self.job_dir, job_dir_name)

     job = import_job(job_dir)

     worker_port = job.context.job.port
     master_port = job.context.job.master_port
     node_hosts = [w.partition(':')[0] for w in self.nodes_watchers]

     if node_hosts:
         job_info = MasterJobInfo(master_port, node_hosts, worker_port)
         self.running_jobs[job.real_name] = job_info

         here = os.path.dirname(os.path.abspath(__file__))
         loader_path = os.path.join(here, 'loader.py')
         worker_list = ' '.join(
             '%s:%s' % (host, worker_port) for host in node_hosts)

         argv = ['python', loader_path,
                 '-j', job_dir,
                 '-i', self.ip_address,
                 '-n', worker_list]
         if self.data_path is not None:
             argv.extend(['-d', self.data_path])
         if self.force:
             argv.append('-f')
         if client is not None:
             argv.extend(['-c', client])
         job_info.popen = subprocess.Popen(argv)

         # Call the watchers to start the job.
         for watcher in self.nodes_watchers:
             client_call(watcher, 'start_job', zip_filename, uncompress,
                         ignore=True)
Exemplo n.º 4
0
    def start_job(self, zip_filename, uncompress=True, client=None):
        """Prepare the job directory, spawn the loader and notify watchers.

        If ``uncompress`` is true the zip from ``self.zip_dir`` is sent to
        every watcher whose host differs from ``self.ip_address`` and then
        extracted into ``self.job_dir``. If it is false the job directory
        name is taken from ``zip_filename`` without its last extension.

        Nothing further happens when ``self.nodes_watchers`` is empty.
        Otherwise a ``MasterJobInfo`` is stored in ``self.running_jobs``,
        ``loader.py`` is launched with ``subprocess.Popen`` and each watcher
        receives a ``start_job`` call through ``client_call``.

        :param zip_filename: job archive file name.
        :param uncompress: whether to ship and extract the archive first.
        :param client: optional value handed to the loader as ``-c``.
        """
        if uncompress:
            zip_path = os.path.join(self.zip_dir, zip_filename)

            # Watchers on this host are skipped when shipping the archive.
            for target in self.nodes_watchers:
                if target.split(":")[0] == self.ip_address:
                    continue
                FileTransportClient(target, zip_path).send_file()

            job_dir = ZipHandler.uncompress(zip_path, self.job_dir)
        else:
            stem = zip_filename.rsplit(".", 1)[0]
            job_dir = os.path.join(self.job_dir, stem)

        job = import_job(job_dir)

        worker_port = job.context.job.port
        master_port = job.context.job.master_port
        node_ips = [target.split(":")[0] for target in self.nodes_watchers]

        if len(node_ips) == 0:
            return

        record = MasterJobInfo(master_port, node_ips, worker_port)
        self.running_jobs[job.real_name] = record

        module_dir = os.path.dirname(os.path.abspath(__file__))
        loader = os.path.join(module_dir, "loader.py")
        endpoints = ["%s:%s" % (ip, worker_port) for ip in node_ips]

        # Optional flags are appended only when the matching setting is set.
        data_args = [] if self.data_path is None else ["-d", self.data_path]
        force_args = ["-f"] if self.force else []
        client_args = [] if client is None else ["-c", client]
        launch = (
            ["python", loader, "-j", job_dir, "-i", self.ip_address,
             "-n", " ".join(endpoints)]
            + data_args + force_args + client_args
        )
        record.popen = subprocess.Popen(launch)

        # Ask every watcher to start the job.
        for target in self.nodes_watchers:
            client_call(target, "start_job", zip_filename, uncompress)
Exemplo n.º 5
0
 def pack_job_error(self, job_name):
     """Zip the packed error output of ``job_name`` and send it to the master.

     ``pack_local_job_error`` is called for the job's working directory and
     the directory it returns is compressed into
     ``<ip with dots as underscores>_<job_name>_errors.zip`` under
     ``self.zip_dir``. Any existing file of that name is removed first. The
     archive is then sent to ``self.master``.

     :param job_name: name of the job whose errors are packed.
     """
     job_working_dir = os.path.join(self.working_dir, job_name)
     errors_dir = pack_local_job_error(
         job_name, working_dir=job_working_dir, logger=self.logger)

     ip_tag = self.ctx.ip.replace('.', '_')
     archive_name = '%s_%s_errors.zip' % (ip_tag, job_name)
     archive_path = os.path.join(self.zip_dir, archive_name)

     # Remove any archive left over from an earlier run.
     if os.path.exists(archive_path):
         os.remove(archive_path)

     ZipHandler.compress(archive_path, errors_dir)
     sender = FileTransportClient(self.master, archive_path)
     sender.send_file()
Exemplo n.º 6
0
    def run(self, args):
        """Execute the job sub-command selected by ``args``.

        Exactly one of the following branches runs, checked in this order:

        * ``args.list`` is ``True``: log every job reported by the master.
        * ``args.kill`` is set: resolve it to a job id with
          ``_get_matched_job_name`` and kill that job if one matched.
        * ``args.upload`` is set: copy the job directory to the system temp
          directory, zip it (filtering ``pyc``), send the zip to the master
          and, when ``args.run == 'U'``, ask the master to run the job.

        Temporary files created for an upload are removed even when copying,
        compressing or sending fails.

        :param args: parsed command-line arguments; reads ``master``,
            ``list``, ``kill``, ``upload`` and ``run``.
        """
        master_addr = args.master
        ctx = Context(is_client=True, master_addr=master_addr)

        if args.list is True:
            jobs = ctx.list_jobs()
            self.logger.info('list jobs at master: %s' % ctx.master_addr)
            # items() rather than iteritems() so this runs on Python 2 and 3.
            for job_id, info in jobs.items():
                self.logger.info(
                    '====> job id: %s, job description: %s, status: %s' %
                    (job_id, info['name'], info['status']))
            if len(jobs) == 0:
                self.logger.info('no jobs exist')
        elif args.kill is not None:
            job_id = self._get_matched_job_name(ctx, args.kill)
            if job_id is not None:
                ctx.kill_job(job_id)
                self.logger.info('killed job: %s' % job_id)
        elif args.upload is not None:
            path = os.path.abspath(args.upload)
            if not os.path.exists(path):
                self.logger.error('upload path does not exist')
                return

            try:
                job_id = import_job_desc(path).uniq_name
            except Exception as e:
                self.logger.exception(e)
                self.logger.error('uploading job description failed')
                return

            new_upload_dir = os.path.join(tempfile.gettempdir(), job_id)
            temp_filename = os.path.join(tempfile.gettempdir(),
                                         job_id + '.zip')
            if os.path.exists(new_upload_dir):
                shutil.rmtree(new_upload_dir)

            # Copy, compress and send all sit inside the try so a failure at
            # any step still cleans up what was written to the temp dir.
            try:
                shutil.copytree(path, new_upload_dir)
                ZipHandler.compress(temp_filename,
                                    new_upload_dir,
                                    type_filters=('pyc', ))
                FileTransportClient(ctx.master_addr, temp_filename).send_file()
            finally:
                # Either artifact may be missing if an earlier step failed.
                if os.path.exists(temp_filename):
                    os.remove(temp_filename)
                if os.path.exists(new_upload_dir):
                    shutil.rmtree(new_upload_dir)
            self.logger.info('upload job <id: %s> finished' % job_id)

            if args.run == 'U':
                client_call(ctx.master_addr, 'run_job', job_id, True)
                self.logger.info('submit job <id: %s> to the cluster' % job_id)
Exemplo n.º 7
0
 def start_job(self, zip_filename, uncompress=True):
     """Start a job on the master and ask every watcher to start it too.

     When ``uncompress`` is true, the zip under ``self.zip_dir`` is sent to
     each watcher whose host part differs from ``self.ip_address`` and then
     unpacked into ``self.job_dir``. Otherwise the job directory is derived
     from ``zip_filename`` with its last extension removed.

     If at least one watcher is known, a ``MasterJobInfo`` is recorded in
     ``self.running_jobs`` under ``job.real_name``, ``loader.py`` (located
     next to this module) is spawned with the job directory and the worker
     addresses as arguments, and ``start_job`` is invoked on every watcher
     through ``client_call``.

     :param zip_filename: file name of the job archive.
     :param uncompress: whether to distribute and unpack the archive first.
     """
     if uncompress:
         zip_file = os.path.join(self.zip_dir, zip_filename)

         # transfer zip file to workers
         for watcher in self.nodes_watchers:
             if watcher.split(':')[0] == self.ip_address:
                 continue
             file_trans_client = FileTransportClient(watcher, zip_file)
             file_trans_client.send_file()

         job_dir = ZipHandler.uncompress(zip_file, self.job_dir)
     else:
         job_dir = os.path.join(self.job_dir, zip_filename.rsplit('.', 1)[0])

     job = import_job(job_dir)

     worker_port = job.context.job.port
     port = job.context.job.master_port
     nodes = [watcher.split(':')[0] for watcher in self.nodes_watchers]

     if nodes:
         info = MasterJobInfo(port, nodes, worker_port)
         self.running_jobs[job.real_name] = info

         dirname = os.path.dirname(os.path.abspath(__file__))
         f = os.path.join(dirname, 'loader.py')
         workers = ['%s:%s' % (node, worker_port) for node in nodes]

         # Pass an argument list: without shell=True a single command string
         # is treated as the executable name on POSIX and fails to launch.
         # A list also avoids quoting problems for paths containing spaces.
         subprocess.Popen(['python', f, job_dir] + workers)

         # call workers to start job
         for worker_watcher in self.nodes_watchers:
             client_call(worker_watcher, 'start_job', zip_filename, uncompress)
Exemplo n.º 8
0
    def run_job(self, job_name, unzip=False, wait_for_workers=False):
        """Register ``job_name`` on the master and drive it through its stages.

        Optionally waits until at least one worker is tracked (returning
        early if ``self.stopped`` is signalled during a wait) and optionally
        unzips the job. A ``JobMaster`` is then created, initialised and
        registered, the job zip is sent to each of its workers, and the
        ``prepare`` and ``run_job`` stage barriers are executed in turn. A
        failed barrier is logged as an error; a failed ``prepare`` skips the
        ``run_job`` stage.

        :param job_name: name of the job directory under ``self.job_dir``.
        :param unzip: call ``self._unzip(job_name)`` before importing.
        :param wait_for_workers: block until a worker is available.
        """
        if wait_for_workers:
            # Re-check every 3 seconds until a worker appears or stop is set.
            while not self.stopped.is_set():
                if len(self.worker_tracker.workers) > 0:
                    break
                if self.stopped.wait(3):
                    return

        if unzip:
            self._unzip(job_name)

        job_desc = import_job_desc(os.path.join(self.job_dir, job_name))
        worker_addrs = self.worker_tracker.workers.keys()
        job_master = JobMaster(self.ctx, job_name, job_desc, worker_addrs)
        job_master.init()
        self.job_tracker.register_job(job_name, job_master)
        self._register_runned_job(job_name, job_desc)

        # Send the job archive to every worker assigned to this job.
        archive = os.path.join(self.zip_dir, job_name + '.zip')
        for worker in job_master.workers:
            sender = FileTransportClient(worker, archive)
            sender.send_file()

        self.logger.debug(
            'entering the master prepare stage, job id: %s' % job_name)
        self.logger.debug('job available workers: %s' % job_master.workers)
        prepare_stage = Stage(job_master.workers, 'prepare')
        if not prepare_stage.barrier(True, job_name):
            self.logger.error("prepare for running failed")
            return

        self.logger.debug(
            'entering the master run_job stage, job id: %s' % job_name)
        run_stage = Stage(job_master.workers, 'run_job')
        if not run_stage.barrier(True, job_name):
            self.logger.error("run job failed, job id: %s" % job_name)