Example #1
import uuid

from pymesos import MesosSchedulerDriver


# `mocker` is provided by the pytest-mock plugin.
def test_kill_task(mocker):
    ID = str(uuid.uuid4())
    sched = mocker.Mock()
    framework = {'id': {'value': ID}}
    master = mocker.Mock()
    driver = MesosSchedulerDriver(sched, framework, master)
    driver._send = mocker.Mock()
    driver.killTask({"value": "my-task"})
    driver._send.assert_not_called()
    # Once a stream id is assigned, killTask emits a KILL call.
    driver._stream_id = str(uuid.uuid4())
    driver.killTask({"value": "my-task"})
    driver._send.assert_called_once_with({
        'type': 'KILL',
        'framework_id': {'value': ID},
        'kill': {
            'task_id': {'value': 'my-task'},
        }
    })
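The same setup recurs across similar driver tests, so it can be lifted into a fixture. A minimal sketch, assuming pytest with the pytest-mock plugin; the fixture name connected_driver is ours:

import uuid

import pytest
from pymesos import MesosSchedulerDriver


@pytest.fixture
def connected_driver(mocker):
    # Driver wired to a mock scheduler and master, with _send intercepted and
    # a stream id set, i.e. already in the "connected" state asserted above.
    framework = {'id': {'value': str(uuid.uuid4())}}
    driver = MesosSchedulerDriver(mocker.Mock(), framework, mocker.Mock())
    driver._send = mocker.Mock()
    driver._stream_id = str(uuid.uuid4())
    return driver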
Example #2
class ProcScheduler(Scheduler):
    def __init__(self):
        self.framework_id = None
        self.framework = self._init_framework()
        self.executor = None
        self.master = str(CONFIG.get("master", os.environ["MESOS_MASTER"]))
        self.driver = MesosSchedulerDriver(self, self.framework, self.master)
        self.procs_pending = {}
        self.procs_launched = {}
        self.slave_to_proc = {}
        self._lock = RLock()

    def _init_framework(self):
        framework = mesos_pb2.FrameworkInfo()
        framework.user = getpass.getuser()
        framework.name = repr(self)
        framework.hostname = socket.gethostname()
        return framework

    def _init_executor(self):
        executor = mesos_pb2.ExecutorInfo()
        executor.executor_id.value = "default"
        executor.command.value = "%s -m %s.executor" % (sys.executable, __package__)

        mem = executor.resources.add()
        mem.name = "mem"
        mem.type = mesos_pb2.Value.SCALAR
        mem.scalar.value = MIN_MEMORY

        cpus = executor.resources.add()
        cpus.name = "cpus"
        cpus.type = mesos_pb2.Value.SCALAR
        cpus.scalar.value = MIN_CPUS

        if "PYTHONPATH" in os.environ:
            var = executor.command.environment.variables.add()
            var.name = "PYTHONPATH"
            var.value = os.environ["PYTHONPATH"]

        executor.framework_id.value = str(self.framework_id.value)
        return executor

    def _init_task(self, proc, offer):
        task = mesos_pb2.TaskInfo()
        task.task_id.value = str(proc.id)
        task.slave_id.value = offer.slave_id.value
        task.name = repr(proc)
        task.executor.MergeFrom(self.executor)
        task.data = pickle.dumps(proc.params)

        cpus = task.resources.add()
        cpus.name = "cpus"
        cpus.type = mesos_pb2.Value.SCALAR
        cpus.scalar.value = proc.cpus

        mem = task.resources.add()
        mem.name = "mem"
        mem.type = mesos_pb2.Value.SCALAR
        mem.scalar.value = proc.mem

        return task

    def _filters(self, seconds):
        f = mesos_pb2.Filters()
        f.refuse_seconds = seconds
        return f

    def __repr__(self):
        return "%s[%s]: %s" % (self.__class__.__name__, os.getpid(), " ".join(sys.argv))

    def registered(self, driver, framework_id, master_info):
        with self._lock:
            logger.info("Framework registered with id=%s, master=%s" % (framework_id, master_info))
            self.framework_id = framework_id
            self.executor = self._init_executor()

    def resourceOffers(self, driver, offers):
        def get_resources(offer):
            cpus, mem = 0.0, 0.0
            for r in offer.resources:
                if r.name == "cpus":
                    cpus = float(r.scalar.value)
                elif r.name == "mem":
                    mem = float(r.scalar.value)
            return cpus, mem

        with self._lock:
            random.shuffle(offers)
            for offer in offers:
                if not self.procs_pending:
                    logger.debug("Reject offers forever for no pending procs, " "offers=%s" % (offers,))
                    driver.launchTasks(offer.id, [], self._filters(FOREVER))
                    continue

                cpus, mem = get_resources(offer)
                tasks = []
                for proc in self.procs_pending.values():
                    if cpus >= proc.cpus and mem >= proc.mem:
                        tasks.append(self._init_task(proc, offer))
                        del self.procs_pending[proc.id]
                        self.procs_launched[proc.id] = proc
                        cpus -= proc.cpus
                        mem -= proc.mem

                seconds = 5 + random.random() * 5
                driver.launchTasks(offer.id, tasks, self._filters(seconds))
                if tasks:
                    logger.info(
                        "Accept offer for procs, offer=%s, "
                        "procs=%s, filter_time=%s" % (offer, [int(t.task_id.value) for t in tasks], seconds)
                    )
                else:
                    logger.info("Retry offer for procs later, offer=%s, " "filter_time=%s" % (offer, seconds))

    def _call_finished(self, proc_id, success, message, data, slave_id=None):
        with self._lock:
            proc = self.procs_launched.pop(proc_id)
            if slave_id is not None:
                if slave_id in self.slave_to_proc:
                    self.slave_to_proc[slave_id].remove(proc_id)
            else:
                for slave_id, procs in self.slave_to_proc.iteritems():
                    if proc_id in procs:
                        procs.remove(proc_id)

            proc._finished(success, message, data)

    def statusUpdate(self, driver, update):
        with self._lock:
            proc_id = int(update.task_id.value)
            logger.info("Status update for proc, id=%s, state=%s" % (proc_id, update.state))
            if update.state == mesos_pb2.TASK_RUNNING:
                if update.slave_id.value in self.slave_to_proc:
                    self.slave_to_proc[update.slave_id.value].add(proc_id)
                else:
                    self.slave_to_proc[update.slave_id.value] = set([proc_id])

                proc = self.procs_launched[proc_id]
                proc._started()

            elif update.state >= mesos_pb2.TASK_FINISHED:
                slave_id = update.slave_id.value
                success = update.state == mesos_pb2.TASK_FINISHED
                message = update.message
                data = update.data and pickle.loads(update.data)
                self._call_finished(proc_id, success, message, data, slave_id)
                driver.reviveOffers()

    def offerRescinded(self, driver, offer_id):
        with self._lock:
            if self.procs_pending:
                logger.info("Revive offers for pending procs")
                driver.reviveOffers()

    def slaveLost(self, driver, slave_id):
        with self._lock:
            for proc_id in self.slave_to_proc.pop(slave_id, []):
                self._call_finished(proc_id, False, "Slave lost", None, slave_id)

    def error(self, driver, message):
        with self._lock:
            for proc in self.procs_pending.values():
                self._call_finished(proc.id, False, "Stopped", None)

            for proc in self.procs_launched.values():
                self._call_finished(proc.id, False, "Stopped", None)

        self.stop()

    def start(self):
        self.driver.start()

    def stop(self):
        assert not self.driver.aborted
        self.driver.stop()

    def submit(self, proc):
        if self.driver.aborted:
            raise RuntimeError("driver already aborted")

        with self._lock:
            if proc.id not in self.procs_pending:
                logger.info("Try submit proc, id=%s", (proc.id,))
                self.procs_pending[proc.id] = proc
                if len(self.procs_pending) == 1:
                    logger.info("Revive offers for pending procs")
                    self.driver.reviveOffers()
            else:
                raise ValueError("Proc with same id already submitted")

    def cancel(self, proc):
        if self.driver.aborted:
            raise RuntimeError("driver already aborted")

        with self._lock:
            if proc.id in self.procs_pending:
                del self.procs_pending[proc.id]
            elif proc.id in self.procs_launched:
                del self.procs_launched[proc.id]
                self.driver.killTask(mesos_pb2.TaskID(value=str(proc.id)))

            for slave_id, procs in self.slave_to_proc.items():
                procs.discard(proc.id)
                if not procs:
                    del self.slave_to_proc[slave_id]

    def send_data(self, pid, type, data):
        if self.driver.aborted:
            raise RuntimeError("driver already aborted")

        msg = pickle.dumps((pid, type, data))
        for slave_id, procs in self.slave_to_proc.iteritems():
            if pid in procs:
                self.driver.sendFrameworkMessage(self.executor.executor_id, mesos_pb2.SlaveID(value=slave_id), msg)
                return

        raise RuntimeError("Cannot find slave for pid %s" % (pid,))
Example #3
class ProcScheduler(Scheduler):
    def __init__(self):
        self.framework_id = None
        self.framework = self._init_framework()
        self.executor = None
        self.master = str(CONFIG.get('master', os.environ['MESOS_MASTER']))
        self.driver = MesosSchedulerDriver(self, self.framework, self.master)
        self.procs_pending = {}
        self.procs_launched = {}
        self.agent_to_proc = {}
        self._lock = RLock()

    def _init_framework(self):
        framework = dict(
            user=getpass.getuser(),
            name=repr(self),
            hostname=socket.gethostname(),
        )
        return framework

    def _init_executor(self):
        executor = dict(
            executor_id=dict(value='default'),
            framework_id=self.framework_id,
            command=dict(value='%s -m %s.executor' %
                         (sys.executable, __package__)),
            resources=[
                dict(
                    name='mem',
                    type='SCALAR',
                    scalar=dict(value=MIN_MEMORY),
                ),
                dict(name='cpus', type='SCALAR', scalar=dict(value=MIN_CPUS)),
            ],
        )

        if 'PYTHONPATH' in os.environ:
            executor['command']['environment'] = dict(variables=[
                dict(
                    name='PYTHONPATH',
                    value=os.environ['PYTHONPATH'],
                ),
            ])

        return executor

    def _init_task(self, proc, offer):
        resources = [
            dict(
                name='cpus',
                type='SCALAR',
                scalar=dict(value=proc.cpus),
            ),
            dict(
                name='mem',
                type='SCALAR',
                scalar=dict(value=proc.mem),
            )
        ]

        if proc.gpus > 0:
            resources.append(
                dict(
                    name='gpus',
                    type='SCALAR',
                    scalar=dict(value=proc.gpus),
                ))

        task = dict(
            task_id=dict(value=str(proc.id)),
            name=repr(proc),
            executor=self.executor,
            agent_id=offer['agent_id'],
            data=b2a_base64(pickle.dumps(proc.params)).strip(),
            resources=resources,
        )

        return task

    def _filters(self, seconds):
        f = dict(refuse_seconds=seconds)
        return f

    def __repr__(self):
        return "%s[%s]: %s" % (self.__class__.__name__, os.getpid(), ' '.join(
            sys.argv))

    def registered(self, driver, framework_id, master_info):
        with self._lock:
            logger.info('Framework registered with id=%s, master=%s' %
                        (framework_id, master_info))
            self.framework_id = framework_id
            self.executor = self._init_executor()

    def resourceOffers(self, driver, offers):
        def get_resources(offer):
            cpus, mem, gpus = 0.0, 0.0, 0
            for r in offer['resources']:
                if r['name'] == 'cpus':
                    cpus = float(r['scalar']['value'])
                elif r['name'] == 'mem':
                    mem = float(r['scalar']['value'])
                elif r['name'] == 'gpus':
                    gpus = int(r['scalar']['value'])

            return cpus, mem, gpus

        with self._lock:
            random.shuffle(offers)
            for offer in offers:
                if not self.procs_pending:
                    logger.debug('Reject offers forever for no pending procs, '
                                 'offers=%s' % (offers, ))
                    driver.declineOffer(offer['id'], self._filters(FOREVER))
                    continue

                cpus, mem, gpus = get_resources(offer)
                tasks = []
                for proc in list(self.procs_pending.values()):
                    if (cpus >= proc.cpus + MIN_CPUS
                            and mem >= proc.mem + MIN_MEMORY
                            and gpus >= proc.gpus):
                        tasks.append(self._init_task(proc, offer))
                        del self.procs_pending[proc.id]
                        self.procs_launched[proc.id] = proc
                        cpus -= proc.cpus
                        mem -= proc.mem
                        gpus -= proc.gpus

                seconds = 5 + random.random() * 5
                if tasks:
                    logger.info(
                        'Accept offer for procs, offer=%s, '
                        'procs=%s, filter_time=%s' %
                        (offer, [int(t['task_id']['value'])
                                 for t in tasks], seconds))
                    driver.launchTasks(offer['id'], tasks,
                                       self._filters(seconds))
                else:
                    logger.info('Retry offer for procs later, offer=%s, '
                                'filter_time=%s' % (offer, seconds))
                    driver.declineOffer(offer['id'], self._filters(seconds))

    def _call_finished(self, proc_id, success, message, data, agent_id=None):
        with self._lock:
            proc = self.procs_launched.pop(proc_id)
            if agent_id is not None:
                if agent_id in self.agent_to_proc:
                    self.agent_to_proc[agent_id].remove(proc_id)
            else:
                for agent_id, procs in list(self.agent_to_proc.items()):
                    if proc_id in procs:
                        procs.remove(proc_id)

            proc._finished(success, message, data)

    def statusUpdate(self, driver, update):
        with self._lock:
            proc_id = int(update['task_id']['value'])
            logger.info('Status update for proc, id=%s, state=%s' %
                        (proc_id, update['state']))
            agent_id = update['agent_id']['value']
            if update['state'] == 'TASK_RUNNING':
                if agent_id in self.agent_to_proc:
                    self.agent_to_proc[agent_id].add(proc_id)
                else:
                    self.agent_to_proc[agent_id] = set([proc_id])

                proc = self.procs_launched[proc_id]
                proc._started()

            elif update['state'] not in {
                    'TASK_STAGING', 'TASK_STARTING', 'TASK_RUNNING'
            }:
                success = (update['state'] == 'TASK_FINISHED')
                message = update.get('message')
                data = update.get('data')
                if data:
                    data = pickle.loads(a2b_base64(data))

                self._call_finished(proc_id, success, message, data, agent_id)
                driver.reviveOffers()

    def offerRescinded(self, driver, offer_id):
        with self._lock:
            if self.procs_pending:
                logger.info('Revive offers for pending procs')
                driver.reviveOffers()

    def executorLost(self, driver, executor_id, agent_id, status):
        agent_id = agent_id['value']
        with self._lock:
            for proc_id in self.agent_to_proc.pop(agent_id, []):
                self._call_finished(proc_id, False, 'Executor lost', None,
                                    agent_id)

    def slaveLost(self, driver, agent_id):
        agent_id = agent_id['value']
        with self._lock:
            for proc_id in self.agent_to_proc.pop(agent_id, []):
                self._call_finished(proc_id, False, 'Agent lost', None,
                                    agent_id)

    def error(self, driver, message):
        with self._lock:
            for proc in list(self.procs_pending.values()):
                self._call_finished(proc.id, False, message, None)

            for proc in list(self.procs_launched.values()):
                self._call_finished(proc.id, False, message, None)

        self.stop()

    def start(self):
        self.driver.start()

    def stop(self):
        assert not self.driver.aborted
        self.driver.stop()

    def submit(self, proc):
        if self.driver.aborted:
            raise RuntimeError('driver already aborted')

        with self._lock:
            if proc.id not in self.procs_pending:
                logger.info('Try submit proc, id=%s', proc.id)
                self.procs_pending[proc.id] = proc
                if len(self.procs_pending) == 1:
                    logger.info('Revive offers for pending procs')
                    self.driver.reviveOffers()
            else:
                raise ValueError('Proc with same id already submitted')

    def cancel(self, proc):
        if self.driver.aborted:
            raise RuntimeError('driver already aborted')

        with self._lock:
            if proc.id in self.procs_pending:
                del self.procs_pending[proc.id]
            elif proc.id in self.procs_launched:
                del self.procs_launched[proc.id]
                self.driver.killTask(dict(value=str(proc.id)))

            for agent_id, procs in list(self.agent_to_proc.items()):
                procs.discard(proc.id)
                if not procs:
                    del self.agent_to_proc[agent_id]

    def send_data(self, pid, type, data):
        if self.driver.aborted:
            raise RuntimeError('driver already aborted')

        msg = b2a_base64(pickle.dumps((pid, type, data)))
        for agent_id, procs in list(self.agent_to_proc.items()):
            if pid in procs:
                self.driver.sendFrameworkMessage(self.executor['executor_id'],
                                                 dict(value=agent_id), msg)
                return

        raise RuntimeError('Cannot find agent for pid %s' % (pid, ))
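Unlike the protobuf variant above, this dict-based scheduler ships pickled payloads as base64 text so they survive the JSON-based HTTP API. A small round-trip sketch of what _init_task encodes and the receiving side decodes:

import pickle
from binascii import a2b_base64, b2a_base64

params = {'cmd': ['echo', 'hello']}              # example payload
wire = b2a_base64(pickle.dumps(params)).strip()  # as placed in task['data']
assert pickle.loads(a2b_base64(wire)) == params  # as recovered on the decoding side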
Example #4
File: schedule.py Project: zouzias/dpark
class MesosScheduler(DAGScheduler):
    def __init__(self, master, options):
        DAGScheduler.__init__(self)
        self.master = master
        self.use_self_as_exec = options.self
        self.cpus = options.cpus
        self.mem = options.mem
        self.task_per_node = options.parallel or multiprocessing.cpu_count()
        self.group = options.group
        self.logLevel = options.logLevel
        self.options = options
        self.started = False
        self.last_finish_time = 0
        self.isRegistered = False
        self.executor = None
        self.driver = None
        self.out_logger = None
        self.err_logger = None
        self.lock = threading.RLock()
        self.init_job()

    def init_job(self):
        self.activeJobs = {}
        self.activeJobsQueue = []
        self.taskIdToJobId = {}
        self.taskIdToAgentId = {}
        self.jobTasks = {}
        self.agentTasks = {}

    def clear(self):
        DAGScheduler.clear(self)
        self.init_job()

    def start(self):
        if not self.out_logger:
            self.out_logger = self.start_logger(sys.stdout)
        if not self.err_logger:
            self.err_logger = self.start_logger(sys.stderr)

    def start_driver(self):
        name = '[dpark] ' + \
            os.path.abspath(sys.argv[0]) + ' ' + ' '.join(sys.argv[1:])
        if len(name) > 256:
            name = name[:256] + '...'
        framework = Dict()
        framework.user = getuser()
        if framework.user == 'root':
            raise Exception('dpark is not allowed to run as \'root\'')
        framework.name = name
        framework.hostname = socket.gethostname()
        framework.webui_url = self.options.webui_url

        self.driver = MesosSchedulerDriver(self,
                                           framework,
                                           self.master,
                                           use_addict=True)
        self.driver.start()
        logger.debug('Mesos Scheduler driver started')

        self.started = True
        self.last_finish_time = time.time()

        def check():
            while self.started:
                now = time.time()
                if (not self.activeJobs
                        and now - self.last_finish_time > MAX_IDLE_TIME):
                    logger.info('stop mesos scheduler after %d seconds idle',
                                now - self.last_finish_time)
                    self.stop()
                    break
                time.sleep(1)

        spawn(check)

    def start_logger(self, output):
        sock = env.ctx.socket(zmq.PULL)
        port = sock.bind_to_random_port('tcp://0.0.0.0')

        def collect_log():
            while not self._shutdown:
                if sock.poll(1000, zmq.POLLIN):
                    line = sock.recv()
                    output.write(line)

        spawn(collect_log)

        host = socket.gethostname()
        addr = 'tcp://%s:%d' % (host, port)
        logger.debug('log collector started at %s', addr)
        return addr

    @safe
    def registered(self, driver, frameworkId, masterInfo):
        self.isRegistered = True
        logger.debug('connect to master %s:%s, registered as %s',
                     masterInfo.hostname, masterInfo.port, frameworkId.value)
        self.executor = self.getExecutorInfo(str(frameworkId.value))

    @safe
    def reregistered(self, driver, masterInfo):
        logger.warning('re-connect to mesos master %s:%s', masterInfo.hostname,
                       masterInfo.port)

    @safe
    def disconnected(self, driver):
        logger.debug('framework is disconnected')

    def _get_container_image(self):
        return self.options.image

    @safe
    def getExecutorInfo(self, framework_id):
        info = Dict()
        info.framework_id.value = framework_id

        if self.use_self_as_exec:
            info.command.value = os.path.abspath(sys.argv[0])
            info.executor_id.value = sys.argv[0]
        else:
            info.command.value = '%s %s' % (
                sys.executable,
                os.path.abspath(
                    os.path.join(os.path.dirname(__file__), 'executor.py')))
            info.executor_id.value = 'default'

        info.command.environment.variables = variables = []

        v = Dict()
        variables.append(v)
        v.name = 'UID'
        v.value = str(os.getuid())

        v = Dict()
        variables.append(v)
        v.name = 'GID'
        v.value = str(os.getgid())

        container_image = self._get_container_image()
        if container_image:
            info.container.type = 'DOCKER'
            info.container.docker.image = container_image
            info.container.docker.parameters = parameters = []
            p = Dict()
            p.key = 'memory-swap'
            p.value = '-1'
            parameters.append(p)

            info.container.volumes = volumes = []
            for path in ['/etc/passwd', '/etc/group']:
                v = Dict()
                volumes.append(v)
                v.host_path = v.container_path = path
                v.mode = 'RO'

            for path in conf.MOOSEFS_MOUNT_POINTS:
                v = Dict()
                volumes.append(v)
                v.host_path = v.container_path = path
                v.mode = 'RW'

            for path in conf.DPARK_WORK_DIR.split(','):
                v = Dict()
                volumes.append(v)
                v.host_path = v.container_path = path
                v.mode = 'RW'

            def _mount_volume(volumes, host_path, container_path, mode):
                v = Dict()
                volumes.append(v)
                v.container_path = container_path
                v.mode = mode
                if host_path:
                    v.host_path = host_path

            if self.options.volumes:
                for volume in self.options.volumes.split(','):
                    fields = volume.split(':')
                    if len(fields) == 3:
                        host_path, container_path, mode = fields
                        mode = mode.upper()
                        assert mode in ('RO', 'RW')
                    elif len(fields) == 2:
                        host_path, container_path = fields
                        mode = 'RW'
                    elif len(fields) == 1:
                        container_path, = fields
                        host_path = ''
                        mode = 'RW'
                    else:
                        raise Exception('cannot parse volume %s' % volume)
                    _mount_volume(volumes, host_path, container_path, mode)

        info.resources = resources = []

        mem = Dict()
        resources.append(mem)
        mem.name = 'mem'
        mem.type = 'SCALAR'
        mem.scalar.value = EXECUTOR_MEMORY

        cpus = Dict()
        resources.append(cpus)
        cpus.name = 'cpus'
        cpus.type = 'SCALAR'
        cpus.scalar.value = EXECUTOR_CPUS

        Script = os.path.realpath(sys.argv[0])
        info.name = Script

        info.data = encode_data(
            marshal.dumps((Script, os.getcwd(), sys.path, dict(os.environ),
                           self.task_per_node, self.out_logger,
                           self.err_logger, self.logLevel, env.environ)))
        return info

    @safe
    def submitTasks(self, tasks):
        if not tasks:
            return

        job = SimpleJob(self, tasks, self.cpus, tasks[0].rdd.mem or self.mem)
        self.activeJobs[job.id] = job
        self.activeJobsQueue.append(job)
        self.jobTasks[job.id] = set()
        stage_scope = ''
        try:
            from dpark.web.ui.views.rddopgraph import StageInfo
            stage_scope = StageInfo.idToRDDNode[
                tasks[0].rdd.id].scope.call_site
        except Exception:
            pass
        stage = self.idToStage[tasks[0].stageId]
        stage.try_times += 1
        logger.info(
            'Got job %d with %d tasks for stage: %d(try %d times) '
            'at scope[%s] and rdd:%s', job.id, len(tasks), tasks[0].stageId,
            stage.try_times, stage_scope, tasks[0].rdd)

        need_revive = self.started
        if not self.started:
            self.start_driver()
        while not self.isRegistered:
            self.lock.release()
            time.sleep(0.01)
            self.lock.acquire()

        if need_revive:
            self.requestMoreResources()

    def requestMoreResources(self):
        logger.debug('reviveOffers')
        self.driver.reviveOffers()

    @safe
    def resourceOffers(self, driver, offers):
        rf = Dict()
        if not self.activeJobs:
            driver.suppressOffers()
            rf.refuse_seconds = 60 * 5
            for o in offers:
                driver.declineOffer(o.id, rf)
            return

        start = time.time()
        random.shuffle(offers)
        cpus = [self.getResource(o.resources, 'cpus') for o in offers]
        mems = [
            self.getResource(o.resources, 'mem') -
            (o.agent_id.value not in self.agentTasks and EXECUTOR_MEMORY or 0)
            for o in offers
        ]
        logger.debug('get %d offers (%s cpus, %s mem), %d jobs', len(offers),
                     sum(cpus), sum(mems), len(self.activeJobs))

        tasks = {}
        for job in self.activeJobsQueue:
            while True:
                launchedTask = False
                for i, o in enumerate(offers):
                    sid = o.agent_id.value
                    group = (self.getAttribute(o.attributes, 'group')
                             or 'None')
                    if (self.group or
                            group.startswith('_')) and group not in self.group:
                        continue
                    if self.agentTasks.get(sid, 0) >= self.task_per_node:
                        continue
                    if (mems[i] < self.mem + EXECUTOR_MEMORY
                            or cpus[i] < self.cpus + EXECUTOR_CPUS):
                        continue
                    t = job.slaveOffer(str(o.hostname), cpus[i], mems[i])
                    if not t:
                        continue
                    task = self.createTask(o, job, t)
                    tasks.setdefault(o.id.value, []).append(task)

                    logger.debug('dispatch %s into %s', t, o.hostname)
                    tid = task.task_id.value
                    self.jobTasks[job.id].add(tid)
                    self.taskIdToJobId[tid] = job.id
                    self.taskIdToAgentId[tid] = sid
                    self.agentTasks[sid] = self.agentTasks.get(sid, 0) + 1
                    cpus[i] -= min(cpus[i], t.cpus)
                    mems[i] -= t.mem
                    launchedTask = True

                if not launchedTask:
                    break

        used = time.time() - start
        if used > 10:
            logger.error('use too much time in resourceOffers: %.2fs', used)

        for o in offers:
            if o.id.value in tasks:
                driver.launchTasks(o.id, tasks[o.id.value])
            else:
                driver.declineOffer(o.id)

        logger.debug('reply with %d tasks, %s cpus %s mem left',
                     sum(len(ts) for ts in tasks.values()), sum(cpus),
                     sum(mems))

    @safe
    def offerRescinded(self, driver, offer_id):
        logger.debug('rescinded offer: %s', offer_id)
        if self.activeJobs:
            self.requestMoreResources()

    def getResource(self, res, name):
        for r in res:
            if r.name == name:
                return r.scalar.value
        return 0.0

    def getAttribute(self, attrs, name):
        for r in attrs:
            if r.name == name:
                return r.text.value

    def createTask(self, o, job, t):
        task = Dict()
        tid = '%s:%s:%s' % (job.id, t.id, t.tried)
        task.name = 'task %s' % tid
        task.task_id.value = tid
        task.agent_id.value = o.agent_id.value
        task.data = encode_data(
            compress(six.moves.cPickle.dumps((t, t.tried), -1)))
        task.executor = self.executor
        if len(task.data) > 1000 * 1024:
            logger.warning('task too large: %s %d', t, len(task.data))

        resources = task.resources = []

        cpu = Dict()
        resources.append(cpu)
        cpu.name = 'cpus'
        cpu.type = 'SCALAR'
        cpu.scalar.value = t.cpus

        mem = Dict()
        resources.append(mem)
        mem.name = 'mem'
        mem.type = 'SCALAR'
        mem.scalar.value = t.mem
        return task

    @safe
    def statusUpdate(self, driver, status):
        tid = status.task_id.value
        state = status.state
        logger.debug('status update: %s %s', tid, state)

        jid = self.taskIdToJobId.get(tid)
        _, task_id, tried = list(map(int, tid.split(':')))
        if state == 'TASK_RUNNING':
            if jid in self.activeJobs:
                job = self.activeJobs[jid]
                job.statusUpdate(task_id, tried, state)
            else:
                logger.debug('kill task %s as its job has gone', tid)
                self.driver.killTask(Dict(value=tid))

            return

        self.taskIdToJobId.pop(tid, None)
        if jid in self.jobTasks:
            self.jobTasks[jid].remove(tid)
        if tid in self.taskIdToAgentId:
            agent_id = self.taskIdToAgentId[tid]
            if agent_id in self.agentTasks:
                self.agentTasks[agent_id] -= 1
            del self.taskIdToAgentId[tid]

        if jid not in self.activeJobs:
            logger.debug('ignore task %s as its job has gone', tid)
            return

        job = self.activeJobs[jid]
        reason = status.get('message')
        data = status.get('data')
        if state in ('TASK_FINISHED', 'TASK_FAILED') and data:
            try:
                reason, result, accUpdate = six.moves.cPickle.loads(
                    decode_data(data))
                if result:
                    flag, data = result
                    if flag >= 2:
                        try:
                            data = urllib.request.urlopen(data).read()
                        except IOError:
                            # try again
                            data = urllib.request.urlopen(data).read()
                        flag -= 2
                    data = decompress(data)
                    if flag == 0:
                        result = marshal.loads(data)
                    else:
                        result = six.moves.cPickle.loads(data)
            except Exception as e:
                logger.warning('error when cPickle.loads(): %s, data:%s', e,
                               len(data))
                state = 'TASK_FAILED'
                return job.statusUpdate(task_id, tried, 'TASK_FAILED',
                                        'load failed: %s' % e)
            else:
                return job.statusUpdate(task_id, tried, state, reason, result,
                                        accUpdate)

        # killed, lost, load failed
        job.statusUpdate(task_id, tried, state, reason or data)

    def jobFinished(self, job):
        logger.debug('job %s finished', job.id)
        if job.id in self.activeJobs:
            self.last_finish_time = time.time()
            del self.activeJobs[job.id]
            self.activeJobsQueue.remove(job)
            for tid in self.jobTasks[job.id]:
                self.driver.killTask(Dict(value=tid))
            del self.jobTasks[job.id]

            if not self.activeJobs:
                self.agentTasks.clear()

        for tid, jid in six.iteritems(self.taskIdToJobId):
            if jid not in self.activeJobs:
                logger.debug('kill task %s, because it is orphan', tid)
                self.driver.killTask(Dict(value=tid))

    @safe
    def check(self):
        for job in self.activeJobs.values():
            if job.check_task_timeout():
                self.requestMoreResources()

    @safe
    def error(self, driver, message):
        logger.error('Mesos error message: %s', message)
        raise RuntimeError(message)

    # @safe
    def stop(self):
        if not self.started:
            return
        logger.debug('stop scheduler')
        self.started = False
        self.isRegistered = False
        self.driver.stop(False)
        self.driver = None

    def defaultParallelism(self):
        return 16

    def frameworkMessage(self, driver, executor_id, agent_id, data):
        logger.warning('[agent %s] %s', agent_id.value, data)

    def executorLost(self, driver, executor_id, agent_id, status):
        logger.warning('executor at %s %s lost: %s', agent_id.value,
                       executor_id.value, status)
        self.agentTasks.pop(agent_id.value, None)

    def slaveLost(self, driver, agent_id):
        logger.warning('agent %s lost', agent_id.value)
        self.agentTasks.pop(agent_id.value, None)

    def killTask(self, job_id, task_id, tried):
        tid = Dict()
        tid.value = '%s:%s:%s' % (job_id, task_id, tried)
        self.driver.killTask(tid)
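The scheduler threads job id, task id, and retry count through Mesos as one colon-separated task id: createTask and killTask build it, statusUpdate parses it back. A minimal round-trip:

job_id, task_id, tried = 3, 17, 1
tid = '%s:%s:%s' % (job_id, task_id, tried)           # as built in createTask
assert list(map(int, tid.split(':'))) == [3, 17, 1]   # as parsed in statusUpdate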
Example #5
class MesosScheduler(DAGScheduler):

    def __init__(self, master, options):
        DAGScheduler.__init__(self)
        self.master = master
        self.use_self_as_exec = options.self
        self.cpus = options.cpus
        self.mem = options.mem
        self.task_per_node = options.parallel or multiprocessing.cpu_count()
        self.group = options.group
        self.logLevel = options.logLevel
        self.options = options
        self.started = False
        self.last_finish_time = 0
        self.isRegistered = False
        self.executor = None
        self.driver = None
        self.out_logger = None
        self.err_logger = None
        self.lock = threading.RLock()
        self.init_job()

    def init_job(self):
        self.activeJobs = {}
        self.activeJobsQueue = []
        self.taskIdToJobId = {}
        self.taskIdToAgentId = {}
        self.jobTasks = {}
        self.agentTasks = {}

    def clear(self):
        DAGScheduler.clear(self)
        self.init_job()

    def start(self):
        if not self.out_logger:
            self.out_logger = self.start_logger(sys.stdout)
        if not self.err_logger:
            self.err_logger = self.start_logger(sys.stderr)

    def start_driver(self):
        name = '[dpark] ' + \
            os.path.abspath(sys.argv[0]) + ' ' + ' '.join(sys.argv[1:])
        if len(name) > 256:
            name = name[:256] + '...'
        framework = Dict()
        framework.user = getuser()
        if framework.user == 'root':
            raise Exception('dpark is not allowed to run as \'root\'')
        framework.name = name
        framework.hostname = socket.gethostname()
        framework.webui_url = self.options.webui_url

        self.driver = MesosSchedulerDriver(
            self, framework, self.master, use_addict=True
        )
        self.driver.start()
        logger.debug('Mesos Scheduler driver started')

        self.started = True
        self.last_finish_time = time.time()

        def check():
            while self.started:
                now = time.time()
                if (not self.activeJobs and
                        now - self.last_finish_time > MAX_IDLE_TIME):
                    logger.info('stop mesos scheduler after %d seconds idle',
                                now - self.last_finish_time)
                    self.stop()
                    break
                time.sleep(1)

        spawn(check)

    def start_logger(self, output):
        sock = env.ctx.socket(zmq.PULL)
        port = sock.bind_to_random_port('tcp://0.0.0.0')

        def collect_log():
            while not self._shutdown:
                if sock.poll(1000, zmq.POLLIN):
                    line = sock.recv()
                    output.write(line)

        spawn(collect_log)

        host = socket.gethostname()
        addr = 'tcp://%s:%d' % (host, port)
        logger.debug('log collector started at %s', addr)
        return addr

    @safe
    def registered(self, driver, frameworkId, masterInfo):
        self.isRegistered = True
        logger.debug('connect to master %s:%s, registered as %s',
                     masterInfo.hostname, masterInfo.port, frameworkId.value)
        self.executor = self.getExecutorInfo(str(frameworkId.value))

    @safe
    def reregistered(self, driver, masterInfo):
        logger.warning('re-connect to mesos master %s:%s',
                       masterInfo.hostname, masterInfo.port)

    @safe
    def disconnected(self, driver):
        logger.debug('framework is disconnected')

    @safe
    def getExecutorInfo(self, framework_id):
        info = Dict()
        info.framework_id.value = framework_id

        if self.use_self_as_exec:
            info.command.value = os.path.abspath(sys.argv[0])
            info.executor_id.value = sys.argv[0]
        else:
            info.command.value = '%s %s' % (
                sys.executable,
                os.path.abspath(
                    os.path.join(
                        os.path.dirname(__file__),
                        'executor.py'))
            )
            info.executor_id.value = 'default'

        info.command.environment.variables = variables = []

        v = Dict()
        variables.append(v)
        v.name = 'UID'
        v.value = str(os.getuid())

        v = Dict()
        variables.append(v)
        v.name = 'GID'
        v.value = str(os.getgid())

        if self.options.image:
            info.container.type = 'DOCKER'
            info.container.docker.image = self.options.image

            info.container.volumes = volumes = []
            for path in ['/etc/passwd', '/etc/group']:
                v = Dict()
                volumes.append(v)
                v.host_path = v.container_path = path
                v.mode = 'RO'

            for path in conf.MOOSEFS_MOUNT_POINTS:
                v = Dict()
                volumes.append(v)
                v.host_path = v.container_path = path
                v.mode = 'RW'

            for path in conf.DPARK_WORK_DIR.split(','):
                v = Dict()
                volumes.append(v)
                v.host_path = v.container_path = path
                v.mode = 'RW'

            if self.options.volumes:
                for volume in self.options.volumes.split(','):
                    fields = volume.split(':')
                    if len(fields) == 3:
                        host_path, container_path, mode = fields
                        mode = mode.upper()
                        assert mode in ('RO', 'RW')
                    elif len(fields) == 2:
                        host_path, container_path = fields
                        mode = 'RW'
                    elif len(fields) == 1:
                        container_path, = fields
                        host_path = ''
                        mode = 'RW'
                    else:
                        raise Exception('cannot parse volume %s' % volume)

                    if host_path:
                        mkdir_p(host_path)

                    # mount each user-specified volume
                    v = Dict()
                    volumes.append(v)
                    v.container_path = container_path
                    v.mode = mode
                    if host_path:
                        v.host_path = host_path

        info.resources = resources = []

        mem = Dict()
        resources.append(mem)
        mem.name = 'mem'
        mem.type = 'SCALAR'
        mem.scalar.value = EXECUTOR_MEMORY

        cpus = Dict()
        resources.append(cpus)
        cpus.name = 'cpus'
        cpus.type = 'SCALAR'
        cpus.scalar.value = EXECUTOR_CPUS

        Script = os.path.realpath(sys.argv[0])
        info.name = Script

        info.data = encode_data(marshal.dumps(
            (
                Script, os.getcwd(), sys.path, dict(os.environ),
                self.task_per_node, self.out_logger, self.err_logger,
                self.logLevel, env.environ
            )
        ))
        return info

    @safe
    def submitTasks(self, tasks):
        if not tasks:
            return

        job = SimpleJob(self, tasks, self.cpus, tasks[0].rdd.mem or self.mem)
        self.activeJobs[job.id] = job
        self.activeJobsQueue.append(job)
        self.jobTasks[job.id] = set()
        logger.info(
            'Got job %d with %d tasks: %s',
            job.id,
            len(tasks),
            tasks[0].rdd)

        need_revive = self.started
        if not self.started:
            self.start_driver()
        while not self.isRegistered:
            self.lock.release()
            time.sleep(0.01)
            self.lock.acquire()

        if need_revive:
            self.requestMoreResources()

    def requestMoreResources(self):
        logger.debug('reviveOffers')
        self.driver.reviveOffers()

    @safe
    def resourceOffers(self, driver, offers):
        rf = Dict()
        if not self.activeJobs:
            rf.refuse_seconds = 60 * 5
            for o in offers:
                driver.declineOffer(o.id, rf)
            return

        start = time.time()
        random.shuffle(offers)
        cpus = [self.getResource(o.resources, 'cpus') for o in offers]
        mems = [self.getResource(o.resources, 'mem')
                - (o.agent_id.value not in self.agentTasks
                    and EXECUTOR_MEMORY or 0)
                for o in offers]
        logger.debug('get %d offers (%s cpus, %s mem), %d jobs',
                     len(offers), sum(cpus), sum(mems), len(self.activeJobs))

        tasks = {}
        for job in self.activeJobsQueue:
            while True:
                launchedTask = False
                for i, o in enumerate(offers):
                    sid = o.agent_id.value
                    group = (
                        self.getAttribute(
                            o.attributes,
                            'group') or 'None')
                    if (self.group or group.startswith(
                            '_')) and group not in self.group:
                        continue
                    if self.agentTasks.get(sid, 0) >= self.task_per_node:
                        continue
                    if (mems[i] < self.mem + EXECUTOR_MEMORY
                            or cpus[i] < self.cpus + EXECUTOR_CPUS):
                        continue
                    t = job.slaveOffer(str(o.hostname), cpus[i], mems[i])
                    if not t:
                        continue
                    task = self.createTask(o, job, t)
                    tasks.setdefault(o.id.value, []).append(task)

                    logger.debug('dispatch %s into %s', t, o.hostname)
                    tid = task.task_id.value
                    self.jobTasks[job.id].add(tid)
                    self.taskIdToJobId[tid] = job.id
                    self.taskIdToAgentId[tid] = sid
                    self.agentTasks[sid] = self.agentTasks.get(sid, 0) + 1
                    cpus[i] -= min(cpus[i], t.cpus)
                    mems[i] -= t.mem
                    launchedTask = True

                if not launchedTask:
                    break

        used = time.time() - start
        if used > 10:
            logger.error('use too much time in resourceOffers: %.2fs', used)

        for o in offers:
            if o.id.value in tasks:
                driver.launchTasks(o.id, tasks[o.id.value])
            else:
                driver.declineOffer(o.id)

        logger.debug('reply with %d tasks, %s cpus %s mem left',
                     sum(len(ts) for ts in tasks.values()),
                     sum(cpus), sum(mems))

    @safe
    def offerRescinded(self, driver, offer_id):
        logger.debug('rescinded offer: %s', offer_id)
        if self.activeJobs:
            self.requestMoreResources()

    def getResource(self, res, name):
        for r in res:
            if r.name == name:
                return r.scalar.value
        return 0.0

    def getAttribute(self, attrs, name):
        for r in attrs:
            if r.name == name:
                return r.text.value

    def createTask(self, o, job, t):
        task = Dict()
        tid = '%s:%s:%s' % (job.id, t.id, t.tried)
        task.name = 'task %s' % tid
        task.task_id.value = tid
        task.agent_id.value = o.agent_id.value
        task.data = encode_data(
            compress(cPickle.dumps((t, t.tried), -1))
        )
        task.executor = self.executor
        if len(task.data) > 1000 * 1024:
            logger.warning('task too large: %s %d',
                           t, len(task.data))

        resources = task.resources = []

        cpu = Dict()
        resources.append(cpu)
        cpu.name = 'cpus'
        cpu.type = 'SCALAR'
        cpu.scalar.value = t.cpus

        mem = Dict()
        resources.append(mem)
        mem.name = 'mem'
        mem.type = 'SCALAR'
        mem.scalar.value = t.mem
        return task

    @safe
    def statusUpdate(self, driver, status):
        tid = status.task_id.value
        state = status.state
        logger.debug('status update: %s %s', tid, state)

        jid = self.taskIdToJobId.get(tid)
        _, task_id, tried = map(int, tid.split(':'))
        if state == 'TASK_RUNNING':
            if jid in self.activeJobs:
                job = self.activeJobs[jid]
                job.statusUpdate(task_id, tried, state)
            else:
                logger.debug('kill task %s as its job has gone', tid)
                self.driver.killTask(Dict(value=tid))

            return

        self.taskIdToJobId.pop(tid, None)
        if jid in self.jobTasks:
            self.jobTasks[jid].remove(tid)
        if tid in self.taskIdToAgentId:
            agent_id = self.taskIdToAgentId[tid]
            if agent_id in self.agentTasks:
                self.agentTasks[agent_id] -= 1
            del self.taskIdToAgentId[tid]

        if jid not in self.activeJobs:
            logger.debug('ignore task %s as its job has gone', tid)
            return

        job = self.activeJobs[jid]
        data = status.get('data')
        if state in ('TASK_FINISHED', 'TASK_FAILED') and data:
            try:
                reason, result, accUpdate = cPickle.loads(
                    decode_data(data))
                if result:
                    flag, data = result
                    if flag >= 2:
                        try:
                            data = urllib.urlopen(data).read()
                        except IOError:
                            # try again
                            data = urllib.urlopen(data).read()
                        flag -= 2
                    data = decompress(data)
                    if flag == 0:
                        result = marshal.loads(data)
                    else:
                        result = cPickle.loads(data)
            except Exception as e:
                logger.warning(
                    'error when cPickle.loads(): %s, data:%s', e, len(data))
                state = 'TASK_FAILED'
                return job.statusUpdate(
                    task_id, tried, 'TASK_FAILED', 'load failed: %s' % e)
            else:
                return job.statusUpdate(task_id, tried, state,
                                        reason, result, accUpdate)

        # killed, lost, load failed
        job.statusUpdate(task_id, tried, state, data)

    def jobFinished(self, job):
        logger.debug('job %s finished', job.id)
        if job.id in self.activeJobs:
            del self.activeJobs[job.id]
            self.activeJobsQueue.remove(job)
            for tid in self.jobTasks[job.id]:
                self.driver.killTask(Dict(value=tid))
            del self.jobTasks[job.id]
            self.last_finish_time = time.time()

            if not self.activeJobs:
                self.agentTasks.clear()

        for tid, jid in self.taskIdToJobId.iteritems():
            if jid not in self.activeJobs:
                logger.debug('kill task %s, because it is orphan', tid)
                self.driver.killTask(Dict(value=tid))

    @safe
    def check(self):
        for job in self.activeJobs.values():
            if job.check_task_timeout():
                self.requestMoreResources()

    @safe
    def error(self, driver, message):
        logger.warning('Mesos error message: %s', message)

    # @safe
    def stop(self):
        if not self.started:
            return
        logger.debug('stop scheduler')
        self.started = False
        self.isRegistered = False
        self.driver.stop(False)
        self.driver = None

    def defaultParallelism(self):
        return 16

    def frameworkMessage(self, driver, executor_id, agent_id, data):
        logger.warning('[agent %s] %s', agent_id.value, data)

    def executorLost(self, driver, executor_id, agent_id, status):
        logger.warning(
            'executor at %s %s lost: %s',
            agent_id.value,
            executor_id.value,
            status)
        self.agentTasks.pop(agent_id.value, None)

    def slaveLost(self, driver, agent_id):
        logger.warning('agent %s lost', agent_id.value)
        self.agentTasks.pop(agent_id.value, None)

    def killTask(self, job_id, task_id, tried):
        tid = Dict()
        tid.value = '%s:%s:%s' % (job_id, task_id, tried)
        self.driver.killTask(tid)
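For reference, a sketch of the offer shapes getResource and getAttribute walk. It assumes the addict package, whose Dict is what use_addict=True makes pymesos hand to these callbacks:

from addict import Dict

offer = Dict()
cpus = Dict(name='cpus', type='SCALAR')
cpus.scalar.value = 4.0
offer.resources = [cpus]
group = Dict(name='group')
group.text.value = 'batch'
offer.attributes = [group]

sched = object.__new__(MesosScheduler)  # bypass __init__ just to borrow the helpers
assert sched.getResource(offer.resources, 'cpus') == 4.0
assert sched.getResource(offer.resources, 'mem') == 0.0    # missing resource -> 0.0
assert sched.getAttribute(offer.attributes, 'group') == 'batch'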