Пример #1
0
    def __init__(self, cfg, espa_api, worklist):
        """Build the framework state and the Mesos client from ``cfg``.

        :param cfg: configuration object; only ``cfg.get(key)`` is used here.
        :param espa_api: API handle stored on ``self.espa`` and handed to
            ``get_products_to_process``.
        :param worklist: work container stored on ``self.workList`` and handed
            to ``get_products_to_process``.
        """
        # Collaborators supplied by the caller.
        self.cfg = cfg
        self.espa = espa_api
        self.workList = worklist
        self.runningList = {}

        # States this framework lists as healthy.
        self.healthy_states = [
            "TASK_STAGING", "TASK_STARTING", "TASK_RUNNING", "TASK_FINISHED"
        ]

        # Resource limits and task settings, read straight from the config.
        self.max_cpus = cfg.get('max_cpu')
        self.required_cpus = cfg.get('task_cpu')
        self.required_memory = cfg.get('task_mem')
        self.required_disk = cfg.get('task_disk')
        self.task_image = cfg.get('task_image')
        self.refuse_seconds = cfg.get('offer_refuse_seconds')
        self.request_count = cfg.get('product_request_count')
        self.products = cfg.get('product_frequency')

        # Mesos client: connect, authenticate, then hook up the callbacks.
        mesos_client = MesosClient(mesos_urls=[cfg.get('mesos_master')],
                                   frameworkName='ESPA Mesos Framework')
        # NOTE(review): presumably turns off TLS certificate verification --
        # confirm against the MesosClient implementation.
        mesos_client.verify = False
        mesos_client.set_credentials(cfg.get('mesos_principal'),
                                     cfg.get('mesos_secret'))
        for event, handler in ((MesosClient.SUBSCRIBED, self.subscribed),
                               (MesosClient.OFFERS, self.offer_received),
                               (MesosClient.UPDATE, self.status_update)):
            mesos_client.on(event, handler)
        self.client = mesos_client

        # put some work on the queue
        get_products_to_process(cfg, self.espa, self.workList)
Пример #2
0
 def __init__(self, url="http://127.0.0.1:5050"):
     """Set up counters and the Mesos client, then block until the framework ends.

     Registers ``subscribed``, ``offer_received`` and ``status_update`` as
     client callbacks, starts a ``Test.MesosFramework`` thread and then waits
     inside this constructor, polling the thread once per second. A
     KeyboardInterrupt during the wait calls ``self.shutdown()`` and ends
     the wait.

     :param url: Mesos master URL passed to ``MesosClient``.
     """
     logging.basicConfig()
     self.logger = logging.getLogger(__name__)
     self.logger.setLevel(logging.DEBUG)
     logging.getLogger('mesoshttp').setLevel(logging.DEBUG)

     # Task bookkeeping.
     self.launched_tasks = 0
     self.running_tasks = 0
     self.finished_tasks = 0
     self.killed_tasks = 0
     self.lost_tasks = []

     self.driver = None
     self.client = MesosClient(mesos_urls=[url],
                               frameworkId=None,
                               frameworkName='Python HTTP framework',
                               frameworkUser=getpass.getuser())
     self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
     self.client.on(MesosClient.OFFERS, self.offer_received)
     self.client.on(MesosClient.UPDATE, self.status_update)
     self.th = Test.MesosFramework(self.client)
     self.th.start()
     # Thread.isAlive() was removed in Python 3.9; is_alive() is the
     # spelling that works on both old and new interpreters.
     while self.th.is_alive():
         try:
             # Join with a timeout so Ctrl-C is noticed between polls.
             self.th.join(1)
         except KeyboardInterrupt:
             self.shutdown()
             break
Пример #3
0
    def __init__(self):
        """Create the Mesos client from the environment and run until it stops.

        The master URL list comes from the ``MESOS_URLS`` environment variable
        (comma separated, default ``http://127.0.0.1:5050``). When
        ``SERVICE_SECRET`` is set, its JSON content is passed to
        ``set_service_account``. The constructor then starts a
        ``Test.MesosFramework`` thread and blocks, polling it once per second;
        a KeyboardInterrupt calls ``self.shutdown()`` and ends the wait.
        """
        logging.basicConfig()
        self.logger = logging.getLogger(__name__)
        logging.getLogger('mesoshttp').setLevel(logging.DEBUG)

        self.driver = None
        # Note: leader.mesos address requires Mesos DNS
        #self.client = MesosClient(mesos_urls=['zk://leader.mesos:2181/mesos'])
        # If you are purely using Mesos, you should use explicit address of Master
        # Example: Zookeeper master discovery
        #self.client = MesosClient(mesos_urls=['zk://127.0.0.1:2181/mesos'])
        # Example: Directly address Mesos
        #self.client = MesosClient(mesos_urls=['http://127.0.0.1:5050'])

        # By default, use direct master addressing.
        # Allow for comma delimited URLs to be passed in via the MESOS_URLS
        # environment variable.
        master_urls = os.getenv('MESOS_URLS', 'http://127.0.0.1:5050')
        self.client = MesosClient(mesos_urls=master_urls.split(','))

        secret = os.getenv('SERVICE_SECRET')
        if secret:
            self.client.set_service_account(json.loads(secret))
        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)
        self.th = Test.MesosFramework(self.client)
        self.th.start()
        # Thread.isAlive() was removed in Python 3.9; use is_alive().
        while self.th.is_alive():
            try:
                # Join with a timeout so Ctrl-C is noticed between polls.
                self.th.join(1)
            except KeyboardInterrupt:
                self.shutdown()
                break
Пример #4
0
 def __init__(self, url="http://127.0.0.1:5050", name="HTTP framework", user="******"):
     """Prepare logging, per-instance state and the Mesos client.

     The framework thread is created here but not started.

     :param url: Mesos master URL passed to ``MesosClient``.
     :param name: framework name passed to ``MesosClient``.
     :param user: framework user passed to ``MesosClient``.
     """
     logging.basicConfig()
     logging.getLogger('mesoshttp').setLevel(logging.DEBUG)
     self.logger = logging.getLogger(__name__)

     # State filled in later by the callbacks.
     self.tasks = 0
     self.driver = None
     self.mesos_offer = None  # to store only one offer
     self.update = None
     self.task_id = None
     self.agent_id = None

     mesos_client = MesosClient(
         mesos_urls=[url],
         frameworkName=name,
         frameworkUser=user,
     )
     callbacks = (
         (MesosClient.SUBSCRIBED, self.subscribed),
         (MesosClient.OFFERS, self.offer_received),
         (MesosClient.UPDATE, self.status_update),
     )
     for event, handler in callbacks:
         mesos_client.on(event, handler)
     self.client = mesos_client
     self.th = Test.MesosFramework(mesos_client)
Пример #5
0
 def __init__(self):
     """Create the Mesos client, start the framework thread and wait for it.

     Registers ``subscribed``, ``offer_received`` and ``status_update`` as
     client callbacks, starts a ``Test.MesosFramework`` thread and blocks in
     this constructor, polling the thread once per second. A
     KeyboardInterrupt during the wait calls ``self.shutdown()`` and ends
     the wait.
     """
     logging.basicConfig()
     self.logger = logging.getLogger(__name__)
     logging.getLogger('mesoshttp').setLevel(logging.DEBUG)
     self.driver = None
     # NOTE(review): the master address is hard-coded; consider making it
     # configurable.
     self.client = MesosClient(mesos_urls=['http://52.87.159.219:5050'])
     self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
     self.client.on(MesosClient.OFFERS, self.offer_received)
     self.client.on(MesosClient.UPDATE, self.status_update)
     self.th = Test.MesosFramework(self.client)
     self.th.start()
     # Thread.isAlive() was removed in Python 3.9; use is_alive().
     while self.th.is_alive():
         try:
             # Join with a timeout so Ctrl-C is noticed between polls.
             self.th.join(1)
         except KeyboardInterrupt:
             self.shutdown()
             break
Пример #6
0
    def run_scheduler(self, mesos_master):
        """Run the Scale scheduler against a Mesos master, then exit the process.

        Builds the ``MesosClient``, configures authentication, role and
        capabilities from ``settings``, runs the scheduler, performs a
        shutdown and finally calls ``sys.exit`` with the combined status.

        :param mesos_master: The Mesos master URL to connect to
        :type mesos_master: string
        """
        logger.info("Scale rising...")
        self.scheduler = ScaleScheduler()
        self.scheduler.initialize()
        scheduler_mgr.hostname = socket.getfqdn()

        logger.info('Connecting to Mesos master at %s:', mesos_master)

        # By default use ZK for master detection
        self.client = MesosClient(
            # Connect to the master that was passed in (and logged above)
            # instead of silently ignoring the parameter.
            mesos_urls=[mesos_master],
            # We have to run tasks as root, so docker commands may be executed
            frameworkUser='******',
            frameworkName=settings.FRAMEWORK_NAME,
            frameworkHostname=scheduler_mgr.hostname,
            frameworkWebUI=settings.WEBSERVER_ADDRESS)
        if settings.SERVICE_SECRET:
            # We are in Enterprise mode and using service account
            self.client.set_service_account(json.loads(
                settings.SERVICE_SECRET))
        elif settings.PRINCIPAL and settings.SECRET:
            self.client.set_credentials(settings.PRINCIPAL, settings.SECRET)

        mesos_role = settings.MESOS_ROLE
        logger.info('Launching scheduler with role: %s', mesos_role)
        self.client.set_role(mesos_role)

        logger.info('Accepting offers from role: %s',
                    settings.ACCEPTED_RESOURCE_ROLE)

        self.client.add_capability('GPU_RESOURCES')

        try:
            self.scheduler.run(self.client)
            status = 0
        except BaseException:
            # Deliberately broad (same reach as the former bare ``except:``):
            # even KeyboardInterrupt/SystemExit must fall through to the
            # shutdown below.
            status = 1
            logger.exception('Mesos Scheduler Driver returned an exception')

        # Perform a shut down and return any non-zero status
        shutdown_status = self._shutdown()
        status = status or shutdown_status

        logger.info('Exiting...')
        sys.exit(status)
Пример #7
0
 def start(self):
     """Connect to ``MESOS_MASTER`` and block until the framework thread ends.

     Registers ``subscribed``, ``offer_received`` and ``status_update`` as
     client callbacks, starts an ``HpcFramework.MesosFramework`` thread and
     polls it once per second. ``self.running`` is True while waiting and
     False once the thread has stopped.
     """
     self.fprint("Start, connecting to:" + MESOS_MASTER)
     self.driver = None
     self.client = MesosClient(mesos_urls=[MESOS_MASTER])
     self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
     self.client.on(MesosClient.OFFERS, self.offer_received)
     self.client.on(MesosClient.UPDATE, self.status_update)
     self.client.frameworkName = "HPC Framework"
     self.th = HpcFramework.MesosFramework(self.client)
     self.th.start()
     # Thread.isAlive() was removed in Python 3.9; use is_alive().
     while self.th.is_alive():
         try:
             self.running = True
             self.th.join(1)
         except Exception as e:
             self.fprint("mesos framework exception" + str(e))
     self.running = False
     self.fprint("mesos framework stopped")
Пример #8
0
    def __init__(self,
                 script_path,
                 setup_path,
                 headnode,
                 ssl_thumbprint,
                 client_cert,
                 node_group=""):
        """Store the deployment settings and prepare the HPC and Mesos clients.

        Starts the heartbeat table and creates (but does not start) the
        framework thread.

        :param script_path: stored on ``self.script_path``.
        :param setup_path: stored on ``self.setup_path``.
        :param headnode: head node name(s); commas are replaced by ``_`` when
            building the framework name.
        :param ssl_thumbprint: stored on ``self.ssl_thumbprint``.
        :param client_cert: passed to ``HpcRestClient``.
        :param node_group: optional node group; when non-empty it is appended
            to the framework name and passed to ``HpcClusterManager``.
        """
        logging.basicConfig()
        self.logger = logging_aux.init_logger_aux("hpcframework",
                                                  "hpcframework.log")
        # signal.signal(signal.SIGINT, signal.SIG_IGN)
        logging.getLogger('mesoshttp').setLevel(logging.DEBUG)
        self.node_idle_check_table = {}
        self.script_path = script_path
        self.setup_path = setup_path
        self.headnode = headnode
        self.ssl_thumbprint = ssl_thumbprint
        self.node_group = node_group
        self.hpc_client = HpcRestClient(client_cert)
        self.heartbeat_table = hpc_cluster_manager.HpcClusterManager(
            self.hpc_client, node_group=self.node_group)
        # On Python 3 ``map`` is lazy, so ``map(kill, hosts)`` would never
        # call the kill function. A list comprehension runs the calls
        # immediately and still returns a list, as ``map`` did on Python 2.
        self.heartbeat_table.subscribe_node_closed_callback(
            lambda hostnames: [self._kill_task_by_hostname(hostname)
                               for hostname in hostnames])
        self.heartbeat_table.start()
        self.core_provisioning = 0.0
        self.driver = None  # type: MesosClient.SchedulerDriver
        framework_suffix = self.headnode.replace(',', '_')
        if self.node_group != "":
            framework_suffix = framework_suffix + '-' + self.node_group
        # NOTE(review): the master address is hard-coded.
        self.mesos_client = MesosClient(
            mesos_urls=['http://172.16.1.4:5050'],
            # mesos_urls=['zk://127.0.0.1:2181/mesos'],
            frameworkName="HPC-Pack-Framework-{}".format(framework_suffix))

        self.mesos_client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.mesos_client.on(MesosClient.OFFERS, self.offer_received)
        self.mesos_client.on(MesosClient.UPDATE, self.status_update)
        self.th = HpcpackFramwork.MesosFramework(self.mesos_client)
        self.stop = False
Пример #9
0
class Command(BaseCommand):
    """Command that launches the Scale scheduler
    """

    help = 'Launches the Scale scheduler'

    def handle(self, *args, **options):
        """See :meth:`django.core.management.base.BaseCommand.handle`.

        This method starts the scheduler.
        """

        # Register a listener to handle clean shutdowns
        signal.signal(signal.SIGTERM, self._onsigterm)

        # Set up global shutdown
        global GLOBAL_SHUTDOWN
        GLOBAL_SHUTDOWN = self._shutdown

        logger.info('Scale Scheduler %s', settings.VERSION)

        self.run_scheduler(settings.MESOS_MASTER)

    def run_scheduler(self, mesos_master):
        """Runs the Scale scheduler against a Mesos master, then exits the process.

        Builds the ``MesosClient``, configures authentication, role and
        capabilities from ``settings``, runs the scheduler, performs a
        shutdown and finally calls ``sys.exit`` with the combined status.

        :param mesos_master: The Mesos master URL to connect to
        :type mesos_master: string
        """
        logger.info("Scale rising...")
        self.scheduler = ScaleScheduler()
        self.scheduler.initialize()
        scheduler_mgr.hostname = socket.getfqdn()

        logger.info('Connecting to Mesos master at %s:', mesos_master)

        # By default use ZK for master detection
        self.client = MesosClient(
            # Connect to the master that was passed in (and logged above)
            # instead of silently ignoring the parameter.
            mesos_urls=[mesos_master],
            # We have to run tasks as root, so docker commands may be executed
            frameworkUser='******',
            frameworkName=settings.FRAMEWORK_NAME,
            frameworkHostname=scheduler_mgr.hostname,
            frameworkWebUI=settings.WEBSERVER_ADDRESS)
        if settings.SERVICE_SECRET:
            # We are in Enterprise mode and using service account
            self.client.set_service_account(json.loads(
                settings.SERVICE_SECRET))
        elif settings.PRINCIPAL and settings.SECRET:
            self.client.set_credentials(settings.PRINCIPAL, settings.SECRET)

        mesos_role = settings.MESOS_ROLE
        logger.info('Launching scheduler with role: %s', mesos_role)
        self.client.set_role(mesos_role)

        logger.info('Accepting offers from role: %s',
                    settings.ACCEPTED_RESOURCE_ROLE)

        self.client.add_capability('GPU_RESOURCES')

        try:
            self.scheduler.run(self.client)
            status = 0
        except BaseException:
            # Deliberately broad (same reach as the former bare ``except:``):
            # even KeyboardInterrupt/SystemExit must fall through to the
            # shutdown below.
            status = 1
            logger.exception('Mesos Scheduler Driver returned an exception')

        # Perform a shut down and return any non-zero status
        shutdown_status = self._shutdown()
        status = status or shutdown_status

        logger.info('Exiting...')
        sys.exit(status)

    def _onsigterm(self, signum, _frame):
        """See signal callback registration: :py:func:`signal.signal`.

        This callback performs a clean shutdown when a TERM signal is received.
        """
        logger.info('Scheduler command terminated due to signal: %i', signum)
        self._shutdown()
        sys.exit(1)

    def _shutdown(self):
        """Performs any clean up required by this command.

        :returns: The exit status code based on whether the shutdown operation was clean with no exceptions.
        :rtype: int
        """
        status = 0

        # The SIGTERM handler is registered before ``self.scheduler`` is
        # assigned, so the attribute may not exist yet; that is a clean
        # shutdown, not a failure.
        scheduler = getattr(self, 'scheduler', None)
        try:
            if scheduler:
                scheduler.shutdown()
        except BaseException:
            # Deliberately broad: a failing shutdown must be reported through
            # the status code instead of propagating.
            logger.exception('Failed to properly shutdown Scale scheduler.')
            status = 1

        return status
Пример #10
0
class HpcFramework(object):
    """Mesos framework that launches queued jobs on compatible offers.

    Jobs are dicts carrying at least ``task_id``, ``agent_id`` and
    ``resources`` entries. They wait in ``to_be_scheduled`` and move to
    ``scheduled`` once an offer has been accepted for them.
    """

    # Class-level lists: shared by every instance and reachable through the
    # class itself.
    messages = []
    to_be_scheduled = []
    scheduled = []

    # True while start() is waiting on the framework thread.
    running = False

    def fprint(self, msg):
        """Timestamp ``msg``, keep it in ``messages`` and print it."""
        msg = str(datetime.now()) + " HPC: " + msg
        self.messages.append(msg)
        print(msg)

    class MesosFramework(threading.Thread):
        """Thread that runs the blocking ``client.register()`` call."""

        def __init__(self, client):
            threading.Thread.__init__(self)
            self.client = client
            self.stop = False

        def run(self):
            try:
                self.client.register()
            except KeyboardInterrupt:
                # This thread class has no fprint() (that lives on the outer
                # HpcFramework), so report with print() instead of raising
                # AttributeError.
                print('Stop requested by user, stopping framework....')

    def __init__(self):
        self.fprint("init")

    def start(self):
        """Connect to ``MESOS_MASTER`` and block until the framework thread ends."""
        self.fprint("Start, connecting to:" + MESOS_MASTER)
        self.driver = None
        self.client = MesosClient(mesos_urls=[MESOS_MASTER])
        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)
        self.client.frameworkName = "HPC Framework"
        self.th = HpcFramework.MesosFramework(self.client)
        self.th.start()
        # Thread.isAlive() was removed in Python 3.9; use is_alive().
        while self.th.is_alive():
            try:
                self.running = True
                self.th.join(1)
            except Exception as e:
                self.fprint("mesos framework exception" + str(e))
        self.running = False
        self.fprint("mesos framework stopped")

    def shutdown(self):
        """Tear down the driver (if subscribed) and flag the client to stop."""
        self.fprint("shutdown")
        # driver stays None until the SUBSCRIBED callback has fired.
        if self.driver is not None:
            self.driver.tearDown()
        self.client.stop = True
        self.stop = True

    def subscribed(self, driver):
        """SUBSCRIBED callback: remember the driver."""
        self.fprint("subscribed")
        self.driver = driver

    def status_update(self, update):
        """UPDATE callback: log the new state and requeue finished jobs.

        A finished job is removed from ``scheduled`` and appended to the
        module-level ``pending_jobs``. Updates for tasks that are not in
        ``scheduled`` are only logged.
        """
        stat = update['status']
        state = stat['state']
        task_value = stat['task_id']['value']
        # Default of None: an update for a task we do not track must not
        # raise StopIteration out of the callback.
        job = next((x for x in self.scheduled
                    if x['task_id']['value'] == task_value), None)

        self.fprint("status_update " + task_value[:4] + " " + str(state))
        if state == "TASK_FINISHED":
            if job is not None:
                self.scheduled.remove(job)
                pending_jobs.append(job)
        elif state in ("TASK_STARTING", "TASK_RUNNING", "TASK_FAILED",
                       "TASK_KILLED", "TASK_ERROR", "TASK_DROPPED",
                       "TASK_UNREACHABLE"):
            # Known states that need no action here.
            pass
        else:
            self.fprint("unkown task state" + state)

    def compatible_offer(self, job, offer):
        """Return True when ``offer`` satisfies the job's SCALAR resources.

        RANGES requirements are not checked yet; any other resource type
        makes the offer incompatible.
        """
        of = offer.get_offer()
        for reqres in job['resources']:
            if reqres['type'] == "SCALAR":
                for avres in (x for x in of['resources']
                              if x['name'] == reqres['name']):
                    if reqres['scalar']['value'] > avres['scalar']['value']:
                        self.fprint(reqres['name'] + ": " +
                                    str(reqres['scalar']['value']) +
                                    "incompatible with offer " +
                                    str(avres['scalar']['value']))
                        return False
            elif reqres['type'] == "RANGES":
                self.fprint("TODO: Check ranges requirements")
            else:
                self.fprint("Unkown/invalid resource type: " +
                            str(reqres['type']))
                return False
        return True

    def offer_received(self, offers):
        """OFFERS callback: run the head of the queue on each compatible offer.

        Offers that cannot be used are declined.
        """
        self.fprint("offer_received" + (str(offers)))
        for offer in offers:
            if self.to_be_scheduled and self.compatible_offer(
                    self.to_be_scheduled[0], offer):
                job = self.to_be_scheduled[0]
                self.fprint("Scheduling  " + job['task_id']['value'][:4])
                self.to_be_scheduled.remove(job)
                self.scheduled.append(job)
                self.run_job(job, offer)
            else:
                offer.decline()

    def run_job(self, job, mesos_offer):
        """Bind ``job`` to the offer's agent and accept the offer with it."""
        offer = mesos_offer.get_offer()
        self.fprint("run_job: " + job['task_id']['value'][:4] + " on " +
                    offer['hostname'])
        job['agent_id']['value'] = offer['agent_id']['value']
        mesos_offer.accept([job])
Пример #11
0
class Test(object):
    """Sample Mesos framework: launches one ``sleep 30`` task per offer batch.

    The first offer of every batch is accepted with the sample task, the
    remaining offers are declined, and a task is killed as soon as it is
    reported as running.
    """

    class MesosFramework(threading.Thread):
        """Thread that runs the blocking ``client.register()`` call."""

        def __init__(self, client):
            threading.Thread.__init__(self)
            self.client = client
            self.stop = False

        def run(self):
            try:
                self.client.register()
            except KeyboardInterrupt:
                print('Stop requested by user, stopping framework....')

    def __init__(self):
        """Create the client, start the framework thread and wait for it.

        Blocks inside the constructor, polling the thread once per second.
        A KeyboardInterrupt calls ``shutdown()`` and ends the wait.
        """
        logging.basicConfig()
        self.logger = logging.getLogger(__name__)
        logging.getLogger('mesoshttp').setLevel(logging.DEBUG)
        self.driver = None
        self.client = MesosClient(mesos_urls=['http://127.0.0.1:5050'])
        #self.client = MesosClient(mesos_urls=['zk://127.0.0.1:2181/mesos'])
        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)
        self.th = Test.MesosFramework(self.client)
        self.th.start()
        # Thread.isAlive() was removed in Python 3.9; use is_alive().
        while self.th.is_alive():
            try:
                # Join with a timeout so Ctrl-C is noticed between polls.
                self.th.join(1)
            except KeyboardInterrupt:
                self.shutdown()
                break

    def shutdown(self):
        """Tear down the driver (if subscribed) and flag the client to stop."""
        print('Stop requested by user, stopping framework....')
        # Logger.warn() was removed in Python 3.13; warning() is the
        # supported name.
        self.logger.warning('Stop requested by user, stopping framework....')
        # driver stays None until the SUBSCRIBED callback has fired.
        if self.driver is not None:
            self.driver.tearDown()
        self.client.stop = True
        self.stop = True

    def subscribed(self, driver):
        """SUBSCRIBED callback: remember the driver."""
        self.logger.warning('SUBSCRIBED')
        self.driver = driver

    def status_update(self, update):
        """UPDATE callback: kill a task as soon as it reports TASK_RUNNING."""
        status = update['status']
        if status['state'] == 'TASK_RUNNING':
            self.driver.kill(status['agent_id']['value'],
                             status['task_id']['value'])

    def offer_received(self, offers):
        """OFFERS callback: use the first offer, decline all the others."""
        self.logger.warning('OFFER: %s', str(offers))
        for index, offer in enumerate(offers):
            if index == 0:
                self.run_job(offer)
            else:
                offer.decline()

    def run_job(self, mesos_offer):
        """Accept ``mesos_offer`` with a single sample task on its agent."""
        offer = mesos_offer.get_offer()
        print(str(offer))
        task = {
            'name': 'sample test',
            'task_id': {'value': uuid.uuid4().hex},
            'agent_id': {'value': offer['agent_id']['value']},
            'resources': [
                {
                    'name': 'cpus',
                    'type': 'SCALAR',
                    'scalar': {'value': 1},
                },
                {
                    'name': 'mem',
                    'type': 'SCALAR',
                    'scalar': {'value': 1000},
                },
            ],
            'command': {'value': 'sleep 30'},
            'container': {
                'type': 'MESOS',
                'mesos': {
                    'image': {
                        'type': 'DOCKER',
                        'docker': {'name': 'debian'},
                    },
                },
            },
        }

        mesos_offer.accept([task])
Пример #12
0
class Test(object):
    """Sample Mesos framework: runs one curl task per offer batch.

    The first offer of every batch is accepted with a Docker task that posts
    a message to a Slack webhook; the remaining offers are declined.
    """

    class MesosFramework(threading.Thread):
        """Thread that runs the blocking ``client.register()`` call."""

        def __init__(self, client):
            threading.Thread.__init__(self)
            self.client = client
            self.stop = False

        def run(self):
            try:
                self.client.register()
            except KeyboardInterrupt:
                print('Stop requested by user, stopping framework....')

    def __init__(self):
        """Create the client, start the framework thread and wait for it.

        Blocks inside the constructor, polling the thread once per second.
        A KeyboardInterrupt calls ``shutdown()`` and ends the wait.
        """
        logging.basicConfig()
        self.logger = logging.getLogger(__name__)
        logging.getLogger('mesoshttp').setLevel(logging.DEBUG)
        self.driver = None
        # NOTE(review): the master address is hard-coded.
        self.client = MesosClient(mesos_urls=['http://52.87.159.219:5050'])
        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)
        self.th = Test.MesosFramework(self.client)
        self.th.start()
        # Thread.isAlive() was removed in Python 3.9; use is_alive().
        while self.th.is_alive():
            try:
                # Join with a timeout so Ctrl-C is noticed between polls.
                self.th.join(1)
            except KeyboardInterrupt:
                self.shutdown()
                break

    def shutdown(self):
        """Tear down the driver (if subscribed) and flag the client to stop."""
        print('Stop requested by user, stopping framework....')
        # Logger.warn() was removed in Python 3.13; warning() is the
        # supported name.
        self.logger.warning('Stop requested by user, stopping framework....')
        # driver stays None until the SUBSCRIBED callback has fired.
        if self.driver is not None:
            self.driver.tearDown()
        self.client.stop = True
        self.stop = True

    def subscribed(self, driver):
        """SUBSCRIBED callback: remember the driver."""
        self.logger.warning('SUBSCRIBED')
        self.driver = driver

    def status_update(self, update):
        """UPDATE callback: intentionally does nothing.

        Killing the task once it reports TASK_RUNNING was disabled here.
        """
        return None

    def offer_received(self, offers):
        """OFFERS callback: use the first offer, decline all the others."""
        self.logger.warning('OFFER: %s', str(offers))
        for index, offer in enumerate(offers):
            if index == 0:
                self.run_job(offer)
            else:
                offer.decline()

    def run_job(self, mesos_offer):
        """Accept ``mesos_offer`` with a single curl task on its agent."""
        offer = mesos_offer.get_offer()
        print(str(offer))
        # SECURITY NOTE(review): the webhook URL below is a credential
        # committed in source; it should be rotated and loaded from
        # configuration. Left unchanged here to preserve behaviour.
        command = "curl -X POST 'https://hooks.slack.com/services/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' -H 'content-type: application/json; charset=UTF-8' -d '{\"text\" : \"Jan D.\"}'"
        task = {
            'name': 'sample test',
            'task_id': {'value': uuid.uuid4().hex},
            'agent_id': {'value': offer['agent_id']['value']},
            'resources': [
                {
                    'name': 'cpus',
                    'type': 'SCALAR',
                    'scalar': {'value': 1},
                },
                {
                    'name': 'mem',
                    'type': 'SCALAR',
                    'scalar': {'value': 64},
                },
            ],
            'command': {'value': command},
            'container': {
                'type': 'DOCKER',
                'docker': {'image': 'tutum/curl'},
            },
        }

        mesos_offer.accept([task])
Пример #13
0
class ESPAFramework(object):
    """Mesos framework that turns queued ESPA work items into Mesos tasks.

    Offers are accepted while work is available, the offer has the required
    cpu/memory/disk, and the configured core limit has not been reached.
    Tasks reported as running are tracked in ``runningList``.
    """

    def __init__(self, cfg, espa_api, worklist):
        """Build the framework state and the Mesos client from ``cfg``.

        :param cfg: configuration object; read through ``cfg.get(key)``.
        :param espa_api: ESPA API handle used for status updates.
        :param worklist: queue of work items; ``get(False)`` is called on it
            for every acceptable offer.
        """
        master = cfg.get('mesos_master')
        principal = cfg.get('mesos_principal')
        secret = cfg.get('mesos_secret')

        self.workList = worklist
        self.runningList = {}
        self.max_cpus = cfg.get('max_cpu')
        self.required_cpus = cfg.get('task_cpu')
        self.required_memory = cfg.get('task_mem')
        self.required_disk = cfg.get('task_disk')
        self.task_image = cfg.get('task_image')
        self.refuse_seconds = cfg.get('offer_refuse_seconds')
        self.request_count = cfg.get('product_request_count')
        self.products = cfg.get('product_frequency')
        # Any state outside this list is treated as abnormal by
        # status_update().
        self.healthy_states = [
            "TASK_STAGING", "TASK_STARTING", "TASK_RUNNING", "TASK_FINISHED"
        ]
        self.espa = espa_api
        self.cfg = cfg
        # Filled in by subscribed(); defined here so the attribute always
        # exists.
        self.driver = None

        self.client = MesosClient(mesos_urls=[master],
                                  frameworkName='ESPA Mesos Framework')
        # NOTE(review): presumably turns off TLS certificate verification --
        # confirm against the MesosClient implementation.
        self.client.verify = False
        self.client.set_credentials(principal, secret)
        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)

        # put some work on the queue
        get_products_to_process(cfg, self.espa, self.workList)

    def _getResource(self, res, name):
        """Return the scalar value of the first resource called ``name``.

        Returns ``0.0`` when no resource has that name.
        """
        for r in res:
            if r['name'] == name:
                return r['scalar']['value']
        return 0.0

    def _updateResource(self, res, name, value):
        """Subtract ``value`` in place from every resource called ``name``.

        Non-positive values are ignored.
        """
        if value <= 0:
            return
        for r in res:
            if r['name'] == name:
                r['scalar']['value'] -= value

    def _decline_all(self, offers):
        """Decline every offer in ``offers``."""
        for offer in offers:
            self.decline_offer(offer)

    def subscribed(self, driver):
        """SUBSCRIBED callback: remember the driver."""
        log.warning('SUBSCRIBED')
        self.driver = driver

    def core_limit_reached(self):
        """Return True when the running tasks use ``max_cpus`` cores or more.

        Utilisation is estimated as the number of running tasks times the
        per-task cpu requirement.
        """
        core_utilization = len(self.runningList) * self.required_cpus

        log.debug("Number of cores being used: {}".format(core_utilization))
        if core_utilization >= self.max_cpus:
            log.debug("Max number of cores being used. Max = {}".format(
                self.max_cpus))
            return True

        return False

    def accept_offer(self, offer):
        """Return True when ``offer`` covers the task's cpu, memory and disk.

        On acceptance the required amounts are subtracted from the offer's
        resources in place; a rejected offer is left untouched. A requirement
        of 0 is not checked.

        :param offer: offer dict exposing a ``resources`` list.
        """
        resources = offer.get('resources')
        requirements = (
            ("cpus", self.required_cpus),
            ("mem", self.required_memory),
            ("disk", self.required_disk),
        )
        for name, required in requirements:
            if required != 0 and required > self._getResource(resources, name):
                return False

        for name, required in requirements:
            self._updateResource(resources, name, required)
        return True

    def decline_offer(self, offer):
        """Decline ``offer`` with the configured ``refuse_seconds`` filter.

        :returns: True on success.
        :raises Exception: whatever ``offer.decline`` raised, after logging.
        """
        options = {'filters': {'refuse_seconds': self.refuse_seconds}}
        log.debug("declining offer: {} with options: {}".format(
            offer, options))
        try:
            offer.decline(options)
        except Exception as error:
            log.error(
                "Exception encountered declining offer: {}, error: {}".format(
                    offer, error))
            raise
        return True

    def offer_received(self, offers):
        """OFFERS callback: launch one task per acceptable offer.

        Every offer is declined when Mesos tasks are disabled or the core
        limit has been reached. Otherwise each acceptable offer takes one
        item from the work queue; offers without work, unacceptable offers
        and offers whose task creation failed are declined.

        :returns: an ``addict.Dict`` with ``offers.length``,
            ``offers.accepted`` and ``tasks.enabled``.
        """
        response = addict.Dict()
        response.offers.length = len(offers)
        response.offers.accepted = 0
        log.debug("Received {} new offers...".format(response.offers.length))

        # check to see if Mesos tasks are enabled
        if self.espa.mesos_tasks_disabled():
            # decline the offers to free up the resources
            log.debug("mesos tasks disabled, declining {} offers".format(
                len(offers)))
            self._decline_all(offers)
            response.tasks.enabled = False
            return response

        # check to see if core limit has been reached
        if self.core_limit_reached():
            # decline the offers to free up the resources
            log.debug(
                "Core utilization limit reached, declining {} offers".format(
                    len(offers)))
            self._decline_all(offers)
            response.tasks.enabled = False
            return response

        response.tasks.enabled = True

        for offer in offers:
            mesos_offer = offer.get_offer()
            if not self.accept_offer(mesos_offer):
                log.debug("Unacceptable offer, declining")
                self.decline_offer(offer)
                continue

            log.debug("Acceptable offer, checking for work to do")
            try:
                # Non-blocking get: raises Empty if no objects are present.
                work = self.workList.get(False)
                orderid = work.get('orderid')
                scene = work.get('scene')
                task_id = "{}_@@@_{}".format(orderid, scene)
                new_task = task.build(task_id, mesos_offer,
                                      self.task_image, self.required_cpus,
                                      self.required_memory,
                                      self.required_disk, work, self.cfg)
                log.debug("New Task definition: {}".format(new_task))
                offer.accept([new_task])
                self.espa.update_status(scene, orderid, 'tasked')
                response.offers.accepted += 1
            except Empty:
                log.debug("Work queue is empty, declining offer")
                self.decline_offer(offer)
            except Exception as e:
                # NOTE(review): the work item taken from the queue is not
                # put back when task creation fails -- confirm this is the
                # intended behaviour.
                log.error(
                    "Exception creating new task. offer: {}, exception: {}\n declining offer"
                    .format(offer, e))
                self.decline_offer(offer)

        log.debug("resourceOffer response: {}".format(response))
        return response

    def status_update(self, update):
        """UPDATE callback: track running tasks and report abnormal states.

        Task ids have the form ``<orderid>_@@@_<scene>``. Healthy states
        update ``runningList``; any other state marks the scene as errored
        through the ESPA API and drops the task from ``runningList``.

        :returns: an ``addict.Dict`` with ``task_id``, ``state``, ``status``
            and, for running tasks, ``list.name`` / ``list.status``.
        """
        # possible state values
        # http://mesos.apache.org/api/latest/java/org/apache/mesos/Protos.TaskState.html
        task_id = update['status']['task_id']['value']
        orderid, scene = task_id.split("_@@@_")
        state = update['status']['state']

        response = addict.Dict()
        response.task_id = task_id
        response.state = state

        if state in self.healthy_states:
            log.debug("status update for: {}  new status: {}".format(
                task_id, state))
            response.status = "healthy"

            if state == "TASK_RUNNING":
                response.list.name = "running"
                if task_id not in self.runningList:
                    self.runningList[task_id] = util.right_now()
                    response.list.status = "new"
                else:
                    response.list.status = "current"

            if state == "TASK_FINISHED":
                try:
                    del self.runningList[task_id]
                except KeyError:
                    log.debug(
                        "Received TASK_FINISHED update for {}, which wasn't in the runningList"
                        .format(task_id))

        else:  # something abnormal happened
            log.error("abnormal task state for: {}, full update: {}".format(
                task_id, update))
            response.status = "unhealthy"
            self.espa.set_scene_error(scene, orderid, update)
            self.runningList.pop(task_id, None)

        return response
Пример #14
0
class HpcpackFramwork(object):
    """Mesos framework that launches HPC Pack compute-node setup tasks.

    While the HPC REST client reports cores to grow, matching
    ``windows_server`` offers are accepted and a PowerShell setup task is
    launched on the offering agent; every other offer is declined.
    """

    class MesosFramework(threading.Thread):
        """Thread that runs ``client.register()``."""

        def __init__(self, client):
            threading.Thread.__init__(self)
            self.client = client
            self.stop = False

        def run(self):
            try:
                self.client.register()
            except KeyboardInterrupt:
                print('Stop requested by user, stopping framework....')

    def __init__(self,
                 script_path,
                 setup_path,
                 headnode,
                 ssl_thumbprint,
                 client_cert,
                 node_group="",
                 mesos_url='http://172.16.1.4:5050'):
        """Wire up the HPC client, the heartbeat table and the Mesos client.

        :param script_path: script passed to ``powershell -File``.
        :param setup_path: value of the script's ``-setupPath`` argument.
        :param headnode: value of the script's ``-headnode`` argument; also
            used (commas replaced by ``_``) in the framework name.
        :param ssl_thumbprint: value of the ``-sslthumbprint`` argument.
        :param client_cert: passed to ``HpcRestClient``.
        :param node_group: optional node group; when set, only offers whose
            ``node_group`` attribute matches (case-insensitively) are taken.
        :param mesos_url: URL of the Mesos master; defaults to the address
            that used to be hard-coded.
        """
        logging.basicConfig()
        self.logger = logging_aux.init_logger_aux("hpcframework",
                                                  "hpcframework.log")
        # signal.signal(signal.SIGINT, signal.SIG_IGN)
        logging.getLogger('mesoshttp').setLevel(logging.DEBUG)
        self.node_idle_check_table = {}
        self.script_path = script_path
        self.setup_path = setup_path
        self.headnode = headnode
        self.ssl_thumbprint = ssl_thumbprint
        self.node_group = node_group
        self.hpc_client = HpcRestClient(client_cert)
        self.heartbeat_table = hpc_cluster_manager.HpcClusterManager(
            self.hpc_client, node_group=self.node_group)
        # An explicit loop is used instead of map(): on Python 3 map() is
        # lazy, so the kill calls would never be executed.
        self.heartbeat_table.subscribe_node_closed_callback(
            self._kill_tasks_by_hostnames)
        self.heartbeat_table.start()
        self.core_provisioning = 0.0
        self.driver = None  # type: MesosClient.SchedulerDriver
        framework_suffix = self.headnode.replace(',', '_')
        if self.node_group != "":
            framework_suffix = framework_suffix + '-' + self.node_group
        self.mesos_client = MesosClient(
            mesos_urls=[mesos_url],
            # mesos_urls=['zk://127.0.0.1:2181/mesos'],
            frameworkName="HPC-Pack-Framework-{}".format(framework_suffix))

        self.mesos_client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.mesos_client.on(MesosClient.OFFERS, self.offer_received)
        self.mesos_client.on(MesosClient.UPDATE, self.status_update)
        self.th = HpcpackFramwork.MesosFramework(self.mesos_client)
        self.stop = False

    def start(self):
        """Start the framework thread and wait for it, handling Ctrl-C."""
        self.th.start()
        # is_alive() replaces isAlive(), which was removed in Python 3.9.
        while self.th.is_alive():
            try:
                self.th.join(1)
            except KeyboardInterrupt:
                self.shutdown()
                break

    def __encode_utf16b64(self, content):
        """Return ``content`` as base64 of its UTF-16 bytes without a BOM."""
        utf16 = content.encode('utf-16')
        utf16_nobom = utf16[2:] if utf16[0:2] == codecs.BOM_UTF16 else utf16
        utf16_b64 = base64.b64encode(utf16_nobom)
        return utf16_b64

    def shutdown(self):
        """Tear down the driver (if subscribed) and flag the client to stop."""
        print('Stop requested by user, stopping framework....')
        self.logger.warning('Stop requested by user, stopping framework....')
        # The driver only exists once SUBSCRIBED has been received.
        if self.driver is not None:
            self.driver.tearDown()
        self.mesos_client.stop = True
        self.stop = True

    def subscribed(self, driver):
        """Remember the scheduler driver handed over on subscription."""
        self.logger.warning('SUBSCRIBED')
        self.driver = driver

    def status_update(self, update):
        """Log a task status update."""
        # if update['status']['state'] == 'TASK_RUNNING':
        #     self.driver.kill(update['status']['agent_id']['value'], update['status']['task_id']['value'])
        self.logger.info("Update received:\n{}".format(str(update)))

    def _usable_cpus(self, offer_dict):
        """Return the offered cpus if the offer qualifies, else ``None``."""
        if 'attributes' not in offer_dict:
            return None
        attributes = offer_dict['attributes']
        if get_text(attributes, 'os') != 'windows_server':
            return None
        if self.node_group != "":
            offered_group = get_text(attributes, 'node_group')
            if offered_group.upper() != self.node_group.upper():
                return None

        cores = get_scalar(attributes, 'cores')
        cpus = get_scalar(offer_dict['resources'], 'cpus')
        # work around of MESOS-8631
        if cpus >= cores - 0.1:
            if not self.heartbeat_table.check_fqdn_collision(
                    offer_dict['hostname']):
                return cpus
        return None

    def offer_received(self, offers):
        """Accept offers while cores are still needed; decline the rest.

        Every offer is answered: anything not handled in the main pass
        (for example because of an exception) is declined in the
        ``finally`` block, retrying every 10 seconds on failure.
        ``KeyboardInterrupt`` and ``SystemExit`` propagate to the caller.

        :param offers: iterable of offer objects exposing ``get_offer()``,
            ``accept()`` and ``decline()``.
        """
        handled_offer = []  # type: List[Offer]

        try:
            # self.logger.info('OFFER: %s' % (str(offers)))
            if self.node_group == "":
                grow_decision = self.hpc_client.get_grow_decision()
            else:
                grow_decision = self.hpc_client.get_grow_decision(
                    self.node_group)

            if grow_decision is None:
                cores_to_grow = 0
                cores_in_provisioning = 0
            else:
                cores_in_provisioning = (
                    self.heartbeat_table.get_cores_in_provisioning())
                cores_to_grow = (
                    grow_decision.cores_to_grow - cores_in_provisioning)

            for offer in offers:  # type: Offer
                cpus = None
                if cores_to_grow > 0:
                    offer_dict = offer.get_offer()
                    self.logger.info(
                        "cores_to_grow: {}, cores_in_provisioning: {}, offer_received: {}"
                        .format(cores_to_grow, cores_in_provisioning,
                                (str(offer_dict))))
                    cpus = self._usable_cpus(offer_dict)

                if cpus is not None:
                    cores_to_grow -= cpus
                    self.accept_offer(offer)
                else:
                    self.decline_offer(offer)
                handled_offer.append(offer)

        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as ex:
            self.logger.exception(ex)
        finally:
            # We have to either accept or decline an offer
            # TODO: Retry in a separate thread
            # The loop is left with ``break`` rather than ``return``: a
            # ``return`` inside ``finally`` would discard the re-raised
            # KeyboardInterrupt/SystemExit.
            while True:
                try:
                    for offer in offers or []:
                        if offer not in handled_offer:
                            self.decline_offer(offer)
                            handled_offer.append(offer)
                    break
                except Exception as ex:
                    self.logger.exception(ex)
                    time.sleep(10)

    def decline_offer(self, offer):
        """Log and decline ``offer``."""
        self.logger.info("Decline offer %s" % offer.get_offer()['id']['value'])
        offer.decline()

    def accept_offer(self, offer):
        """Accept ``offer`` by launching the setup script on its agent.

        The task asks for ``cores - 0.1`` cpus (MESOS-8631 workaround) and
        all of the offered memory, then the agent is recorded in the
        heartbeat table.
        """
        offer_dict = offer.get_offer()
        self.logger.info("Offer %s meets HPC's requirement" %
                         offer_dict['id']['value'])
        self.logger.info("Accepting offer: {}".format(str(offer_dict)))
        agent_id = offer_dict['agent_id']['value']
        fqdn = offer_dict['hostname']
        task_id = uuid.uuid4().hex
        cpus = get_scalar(offer_dict['resources'], 'cpus')

        # work around of MESOS-8631
        if 'attributes' in offer_dict:
            cores = get_scalar(offer_dict['attributes'], 'cores')
        else:
            cores = cpus

        command = ('powershell -File ' + self.script_path + " -setupPath " +
                   self.setup_path + " -headnode " + self.headnode +
                   " -sslthumbprint " + self.ssl_thumbprint +
                   " > setupscript.log")
        task = {
            'name': 'hpc pack mesos cn',
            'task_id': {
                'value': task_id
            },
            'agent_id': {
                'value': agent_id
            },
            'resources': [
                {
                    'name': 'cpus',
                    'type': 'SCALAR',
                    # work around of MESOS-8631
                    'scalar': {
                        'value': cores - 0.1
                    }
                },
                {
                    'name': 'mem',
                    'type': 'SCALAR',
                    'scalar': {
                        'value': get_scalar(offer_dict['resources'], 'mem')
                    }
                }
            ],
            'command': {
                'value': command
            }
        }
        self.logger.debug("Sending command:\n{}".format(
            task['command']['value']))
        offer.accept([task])
        self.heartbeat_table.add_slaveinfo(fqdn, agent_id, task_id, cpus)

    def _kill_task(self, host):
        """Kill the task recorded on ``host`` (needs ``agent_id``/``task_id``)."""
        self.logger.debug("Killing task {} on host {}".format(
            host.task_id, host.fqdn))
        self.driver.kill(host.agent_id, host.task_id)

    def _kill_tasks_by_hostnames(self, hostnames):
        """Node-closed callback: kill the task of every listed hostname."""
        for hostname in hostnames:
            self._kill_task_by_hostname(hostname)

    def _kill_task_by_hostname(self, hostname):
        """Kill the task the heartbeat table associates with ``hostname``."""
        (task_id, agent_id) = self.heartbeat_table.get_task_info(hostname)
        if task_id != "":
            self.logger.debug("Killing task {} on host {}".format(
                task_id, hostname))
            self.driver.kill(agent_id, task_id)
        else:
            self.logger.warning(
                "Task info for host {} not found".format(hostname))
Пример #15
0
class Test(object):
    """Scriptable test harness around a mesoshttp ``MesosClient``.

    Keeps at most one pending offer and one status update so that a caller
    can wait for them and then run, kill, message or reconcile a task.
    """

    class MesosFramework(threading.Thread):
        """Thread that runs ``client.register()`` and records when it ends."""

        def __init__(self, client):
            threading.Thread.__init__(self)
            self.client = client
            self.exited = False

        def run(self):
            try:
                self.client.register()
            except KeyboardInterrupt:
                print('Stop requested by user, stopping framework....')
            print("MesosFramework: run end")
            self.exited = True

        def is_done(self):
            """Return True once ``run`` has finished."""
            return self.exited

    def __init__(self, url="http://127.0.0.1:5050", name="HTTP framework", user="******"):
        """Create the Mesos client and register the event callbacks.

        :param url: Mesos master URL.
        :param name: framework name passed to ``MesosClient``.
        :param user: framework user passed to ``MesosClient``.
        """
        logging.basicConfig()
        self.logger = logging.getLogger(__name__)
        self.tasks = 0
        #signal.signal(signal.SIGINT, signal.SIG_IGN)
        logging.getLogger('mesoshttp').setLevel(logging.DEBUG)
        self.driver = None
        self.mesos_offer = None  # to store only one offer
        self.update = None
        self.task_id = None
        self.agent_id = None
        self.client = MesosClient(mesos_urls=[url], frameworkName=name, frameworkUser=user)
        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)
        self.th = Test.MesosFramework(self.client)

    def reset_offer(self):
        """Return a deep copy of the stored offer and clear it."""
        ret = copy.deepcopy(self.mesos_offer)
        self.mesos_offer = None
        return ret

    def reset_update(self):
        """Return a deep copy of the stored update and clear it."""
        ret = copy.deepcopy(self.update)
        self.update = None
        return ret

    def start(self):
        """Start the framework thread."""
        self.th.start()

    def is_done(self):
        """Return True once the framework thread has finished."""
        return self.th.is_done()

    def wait_offer(self, sec):
        """Poll once per second, up to ``sec`` times, for a stored offer.

        :return: the stored offer, or ``None`` on timeout.
        """
        cnt = 0
        while self.mesos_offer is None and cnt < sec:
            print("Waiting for offer: %d" % cnt)
            cnt += 1
            gevent.sleep(1)
        if self.mesos_offer is None:
            print("Timeout waiting for offer")
        return self.mesos_offer
#        return self.wait4(self.mesos_offer, sec, "offer")

    def wait_update(self, sec):
        """Poll once per second, up to ``sec`` times, for a stored update.

        :return: the stored update, or ``None`` on timeout.
        """
        cnt = 0
        while self.update is None and cnt < sec:
            print("Waiting for update %d" % cnt)
            cnt += 1
            gevent.sleep(1)
        if self.update is None:
            print("Timeout waiting for update")
        return self.update
#        return self.wait4(self.update, sec, "update")

    def wait4(self, obj, sec, msg):
        """Wait up to ``sec`` seconds while ``obj`` is ``None``.

        ``obj`` is bound when the method is called, so a value that is
        ``None`` at call time stays ``None`` for the whole wait; use
        ``wait_offer`` / ``wait_update`` to observe attribute changes.
        """
        cnt = 0
        while obj is None and cnt < sec:
            print("Waiting for %s %s %d" % (msg, repr(obj), cnt))
            cnt += 1
            gevent.sleep(1)
        if obj is None:
            print("Timeout waiting for: %s" % msg)
        return obj

    def shutdown(self):
        """Tear down the driver (if subscribed) and flag the client to stop."""
        print('Stop requested by user, stopping framework....')
        # The driver only exists once SUBSCRIBED has been received.
        if self.driver is not None:
            self.driver.tearDown()
        self.client.stop = True

    def subscribed(self, driver):
        """Remember the scheduler driver handed over on subscription."""
        print('SUBSCRIBED')
        self.driver = driver

    def status_update(self, update):
        """Store the first update seen since the last ``reset_update``."""
        print("STATUS UPDATE: %s" % update['status']['state'])
        if self.update is None:
            self.update = update
            self.task_id = update['status']['task_id']
            self.agent_id = update['status']['agent_id']
            print("Use update: %s" % self.update)
#        if update['status']['state'] == 'TASK_RUNNING':
#            self.tasks+=1
#            self.logger.warn("Task %s (%d/%d): TASK_RUNNING" % (update['status']['agent_id']['value'],self.tasks,max_tasks))
#            self.driver.kill(update['status']['agent_id']['value'], update['status']['task_id']['value'])
#            if self.tasks >= max_tasks:
#                self.logger.warn("Max tasks %d lauched, shutdown now" % self.tasks)
#                self.shutdown()
#        elif update['status']['state'] == 'TASK_FINISHED':
#            self.logger.warn("Task %s: TASK_FINISHED" % update['status']['agent_id']['value'])
#            self.tasks+=1
#            if self.tasks >= self.max_tasks:
#                self.shutdown()
#        else:
#            self.logger.warn("Status update: %s" % update['status']['state'])

    def offer_received(self, offers):
        """Keep the first offer if none is stored; decline all others."""
        print("OFFERS: %s" % (str(offers)))
        for index, offer in enumerate(offers):
            if index == 0 and self.mesos_offer is None:
                self.mesos_offer = offer
                print("Use offer: %s" % self.mesos_offer.get_offer())
            else:
                print("Declining offer: %s" % offer.get_offer())
                offer.decline()

    def run_task(self, sec, o=None):
        """Launch a ``sleep <sec>`` task by accepting the stored offer.

        :param sec: seconds the task sleeps.
        :param o: optional offer dict supplying the agent id; defaults to
            the stored offer's own dict. The stored offer is accepted in
            either case.
        """
        offer = o if o else self.mesos_offer.get_offer()
        print("Run task on offer: %s" % str(offer))
        task = {
            'name': 'sample test',
            'task_id': {'value': uuid.uuid4().hex},
            'agent_id': {'value': offer['agent_id']['value']},
            'resources': [
                {
                    'name': 'cpus',
                    'type': 'SCALAR',
                    'scalar': {'value': 1}
                },
                {
                    'name': 'mem',
                    'type': 'SCALAR',
                    'scalar': {'value': 1000}
                }
            ],
            'command': {'value': 'sleep ' + str(sec)},
        }
        self.mesos_offer.accept([task])

    def kill_task(self, task_name=None, agent_name=None):
        """Kill a task; ids default to those of the stored update."""
        t = task_name if task_name else self.update['status']['task_id']['value']
        a = agent_name if agent_name else self.update['status']['agent_id']['value']
        print("kill task: %s on %s" % (t, a))
        self.driver.kill(a, t)

    def send_message(self, msg, agent_id=None, executor_id=None):
        """Send ``msg`` through the driver to an executor.

        :param msg: message payload.
        :param agent_id: target agent; defaults to the stored update's
            ``agent_id`` entry.
        :param executor_id: target executor; defaults to the stored
            update's ``executor_id`` entry.
        """
        # NOTE(review): status_update/kill_task read ids from
        # update['status'][...]; these top-level fallbacks may need the same
        # path - confirm against the update payload.
        a = agent_id if agent_id else self.update['agent_id']
        # The original tested the undefined name ``task_name`` here, which
        # raised NameError on every call.
        e = executor_id if executor_id else self.update['executor_id']
        print("Send message: %s" % msg)
        self.driver.message(a, e, msg)

    def reconcile(self):
        """Ask the driver to reconcile the task of the stored update."""
        print("Reconcile")
        task = {'task_id': self.task_id, 'agent_id': self.agent_id}
        self.driver.reconcile([task])
Пример #16
0
class Test(object):
    """Sample framework: runs one ``sleep 30`` task per offer batch.

    Constructing the object starts the framework thread and blocks in
    ``__init__`` until that thread ends or Ctrl-C is pressed.
    """

    class MesosFramework(threading.Thread):
        """Thread that runs ``client.register()``."""

        def __init__(self, client):
            threading.Thread.__init__(self)
            self.client = client
            self.stop = False

        def run(self):
            try:
                self.client.register()
            except KeyboardInterrupt:
                print('Stop requested by user, stopping framework....')

    def __init__(self):
        """Build the client from the environment and run until stopped.

        Environment variables read:
        ``MESOS_URLS`` - comma-delimited master URLs (default
        ``http://127.0.0.1:5050``); ``SERVICE_SECRET`` - optional JSON
        document passed to ``set_service_account``.
        """
        logging.basicConfig()
        self.logger = logging.getLogger(__name__)
        logging.getLogger('mesoshttp').setLevel(logging.DEBUG)

        self.driver = None
        # Note: leader.mesos address requires Mesos DNS
        #self.client = MesosClient(mesos_urls=['zk://leader.mesos:2181/mesos'])
        # If you are purely using Mesos, you should use explicit address of Master
        # Example: Zookeeper master discovery
        #self.client = MesosClient(mesos_urls=['zk://127.0.0.1:2181/mesos'])
        # Example: Directly address Mesos
        #self.client = MesosClient(mesos_urls=['http://127.0.0.1:5050'])

        # By default, use direct master addressing
        # Allow for comma delimited URLs to be passed in via the MESOS_URLS
        # environment variable
        master_urls = os.getenv('MESOS_URLS', 'http://127.0.0.1:5050')
        self.client = MesosClient(mesos_urls=master_urls.split(','))

        secret = os.getenv('SERVICE_SECRET')
        if secret:
            self.client.set_service_account(json.loads(secret))
        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)
        self.th = Test.MesosFramework(self.client)
        self.th.start()
        # is_alive() replaces isAlive(), which was removed in Python 3.9.
        while self.th.is_alive():
            try:
                self.th.join(1)
            except KeyboardInterrupt:
                self.shutdown()
                break

    def shutdown(self):
        """Flag the client to stop and tear down the driver if subscribed."""
        print('Stop requested by user, stopping framework....')
        self.logger.warning('Stop requested by user, stopping framework....')
        self.client.stop = True
        # The driver only exists once SUBSCRIBED has been received.
        if self.driver is not None:
            self.driver.tearDown()
        self.stop = True

    def subscribed(self, driver):
        """Remember the scheduler driver handed over on subscription."""
        self.logger.warning('SUBSCRIBED')
        self.driver = driver

    def status_update(self, update):
        """Kill every task as soon as it is reported as running."""
        if update['status']['state'] == 'TASK_RUNNING':
            self.driver.kill(update['status']['agent_id']['value'],
                             update['status']['task_id']['value'])

    def offer_received(self, offers):
        """Run a job on the first offer of the batch; decline the rest."""
        self.logger.warning('OFFER: %s' % (str(offers)))
        for index, offer in enumerate(offers):
            if index == 0:
                self.run_job(offer)
            else:
                offer.decline()

    def run_job(self, mesos_offer):
        """Accept ``mesos_offer`` with a ``sleep 30`` task in a debian image.

        :param mesos_offer: offer object exposing ``get_offer()`` and
            ``accept()``.
        """
        offer = mesos_offer.get_offer()
        print(str(offer))
        task = {
            'name': 'sample test',
            'task_id': {
                'value': uuid.uuid4().hex
            },
            'agent_id': {
                'value': offer['agent_id']['value']
            },
            'resources': [{
                'name': 'cpus',
                'type': 'SCALAR',
                'scalar': {
                    'value': 1
                }
            }, {
                'name': 'mem',
                'type': 'SCALAR',
                'scalar': {
                    'value': 1000
                }
            }],
            'command': {
                'value': 'sleep 30'
            },
            'container': {
                'type': 'MESOS',
                'mesos': {
                    'image': {
                        'type': 'DOCKER',
                        'docker': {
                            'name': 'debian'
                        }
                    }
                }
            }
        }

        mesos_offer.accept([task])
Пример #17
0
class Test(object):
    """Sample framework exercising kill, failure and reconcile paths.

    One short task is launched per offer batch; every fourth launch (starting
    with the first) exits with an error and every third running task is
    killed. Constructing the object blocks until the framework thread ends
    or Ctrl-C is pressed.
    """

    class MesosFramework(threading.Thread):
        """Thread that runs ``client.register()``."""

        def __init__(self, client):
            threading.Thread.__init__(self)
            self.client = client
            self.stop = False

        def run(self):
            try:
                self.client.register()
            except KeyboardInterrupt:
                print('Stop requested by user, stopping framework....')

    def __init__(self, url="http://127.0.0.1:5050"):
        """Create the client for ``url`` and run until stopped.

        :param url: Mesos master URL.
        """
        logging.basicConfig()
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)
        self.launched_tasks = 0
        self.running_tasks = 0
        self.finished_tasks = 0
        self.killed_tasks = 0
        self.lost_tasks = []
        #        self.failed_tasks = []
        #signal.signal(signal.SIGINT, signal.SIG_IGN)
        logging.getLogger('mesoshttp').setLevel(logging.DEBUG)
        self.driver = None
        self.client = MesosClient(mesos_urls=[url],
                                  frameworkId=None,
                                  frameworkName='Python HTTP framework',
                                  frameworkUser=getpass.getuser())
        self.client.on(MesosClient.SUBSCRIBED, self.subscribed)
        self.client.on(MesosClient.OFFERS, self.offer_received)
        self.client.on(MesosClient.UPDATE, self.status_update)
        self.th = Test.MesosFramework(self.client)
        self.th.start()
        # is_alive() replaces isAlive(), which was removed in Python 3.9.
        while self.th.is_alive():
            try:
                self.th.join(1)
            except KeyboardInterrupt:
                self.shutdown()
                break

    def shutdown(self):
        """Tear down the driver (if subscribed) and flag the client to stop."""
        print('Stop requested by user, stopping framework....')
        self.logger.info('Stop requested by user, stopping framework....')
        # The driver only exists once SUBSCRIBED has been received.
        if self.driver is not None:
            self.driver.tearDown()
        self.client.stop = True
        self.stop = True

    def subscribed(self, driver):
        """Remember the scheduler driver handed over on subscription."""
        self.logger.info('SUBSCRIBED')
        self.driver = driver

    def status_update(self, update):
        """Update the task counters and react to the reported state.

        ``max_tasks`` is not defined in this class; presumably a
        module-level setting - verify against the rest of the file.
        """
        status = update['status']['state']
        task = update['status']['task_id']['value']
        self.logger.info("Task %s: %s" % (task, status))
        if status == 'TASK_RUNNING':
            self.running_tasks += 1
            # Kill every third task that reaches the running state.
            if not self.running_tasks % 3:
                #            if not self.running_tasks%3 and task not in self.failed_tasks:
                self.logger.info("Killing task: %s" % task)
                self.driver.kill(update['status']['agent_id']['value'], task)
        elif status == 'TASK_FINISHED':
            self.finished_tasks += 1
            if self.finished_tasks >= max_tasks:
                self.logger.info("All %d tasks finished, shutdown now" %
                                 self.finished_tasks)
                self.shutdown()
        elif status == 'TASK_LOST':
            # Remember the task id so the next offer relaunches it.
            self.lost_tasks.append(task)
            self.logger.info("Reconcile on lost task")
            self.driver.reconcile([])
        elif status == 'TASK_FAILED':
            #            self.failed_tasks.append(task)
            self.logger.info("Reconcile on failed task")
            self.driver.reconcile([])
        else:
            self.logger.info("Status update: %s" % update['status']['state'])
        self.logger.info("(l%d/f%d/r%d/m%d)" %
                         (self.launched_tasks, self.finished_tasks,
                          self.running_tasks, max_tasks))

    def _next_task_name(self):
        """Return the oldest lost task id, or a fresh id if none is queued."""
        if len(self.lost_tasks) > 0:
            task_name = self.lost_tasks.pop(0)
            print("Relaunching lost task: %s" % task_name)
            return task_name
        return uuid.uuid4().hex

    def offer_received(self, offers):
        """Run a job on the first offer of the batch; decline the rest.

        A queued lost task is only taken off ``self.lost_tasks`` when there
        is an offer to run it on, so an empty batch no longer drops it.
        """
        self.logger.info('OFFER: %s' % (str(offers)))
        for index, offer in enumerate(offers):
            if index == 0:
                self.run_job(offer, self._next_task_name())
            else:
                offer.decline()


#    def run_job(self, mesos_offer, task_name = uuid.uuid4().hex):

    def run_job(self, mesos_offer, task_name):
        """Accept ``mesos_offer`` with a short task named ``task_name``.

        When ``launched_tasks`` is a multiple of 4 the command is
        ``sleep 2; exit 1`` (a failing task); otherwise it is ``sleep 3``.
        """
        offer = mesos_offer.get_offer()
        cmd = "sleep 3" if self.launched_tasks % 4 else 'sleep 2; exit 1'
        #        print(str(offer))
        task = {
            'name': 'sample http task',
            'task_id': {
                'value': task_name
            },
            'agent_id': {
                'value': offer['agent_id']['value']
            },
            'resources': [{
                'name': 'cpus',
                'type': 'SCALAR',
                'scalar': {
                    'value': 0.1
                }
            }, {
                'name': 'mem',
                'type': 'SCALAR',
                'scalar': {
                    'value': 100
                }
            }],
            'command': {
                'value': cmd
            },
        }

        mesos_offer.accept([task])
        self.launched_tasks += 1