Пример #1
0
def update_worker_status():
    """
    Report this worker's status to the master and schedule the next report.

    The payload is the machine status returned by Services, extended with
    the Docker container status and the list of local images. It is sent
    to the master as a JSON-encoded PUT request body.
    """
    # Re-arm the timer first, so the next report is already scheduled
    # before any of the work below runs.
    threading.Timer(5, update_worker_status).start()

    # Machine status plus Docker information for this worker.
    payload = Services.get_machine_status(Setting, CRole.WORKER)
    payload[Definition.REST.get_str_docker()] = \
        DockerService.get_containers_status()
    payload[Definition.REST.get_str_local_imgs()] = \
        DockerService.get_local_images()

    encoded = json.dumps(payload).encode('utf-8')

    pool = urllib3.PoolManager()
    try:
        master_url = Definition.Master.get_str_check_master(
            Setting.get_master_addr(),
            Setting.get_master_port(),
            Setting.get_token())
        response = pool.request('PUT', master_url, body=encoded)

        if response.status == 200:
            SysOut.debug_string("Reports status to master node complete.")
        else:
            SysOut.err_string("Cannot update worker status to the master!")

    except Exception as err:
        # Any failure while building or sending the request is reported
        # as the master being unreachable.
        SysOut.err_string("Master is not available!")
        print(err)
Пример #2
0
    def find_available_worker(self, container):
        """
        Rank the registered workers that could host a container.

        Args:
            container: image name looked up in each worker's list of
                local images.

        Returns:
            None when no workers are registered. Otherwise a list of
            ``((addr, port), load5, has_image)`` tuples, restricted to
            workers whose load (average over the last 5 minutes) is below
            0.5, ordered with workers that already hold the image first
            and by ascending load within each group. The list may be
            empty.
        """
        workers = LookUpTable.Workers.verbose()
        SysOut.debug_string("Found workers: " + str(workers))
        if not workers:
            return None

        candidates = []
        for worker in workers:
            curr_worker = workers[worker]
            load = curr_worker[Definition.get_str_load5()]

            # Drop workers at or above 50% load up front instead of
            # removing them from the list afterwards.
            if not float(load) < 0.5:
                continue

            has_image = container in curr_worker[
                Definition.REST.get_str_local_imgs()]
            candidates.append(
                ((curr_worker[Definition.get_str_node_addr()],
                  curr_worker[Definition.get_str_node_port()]),
                 load, has_image))

        # Sort first on availability of the image, then on load. The load
        # is compared numerically so that values stored as strings do not
        # end up in lexicographic order.
        candidates.sort(key=lambda x: (-x[2], float(x[1])))
        return candidates
Пример #3
0
    def on_put(self, req, res):
        """
        PUT: /status?token={None}

        Record a worker status report in the lookup table.

        Responds 401 when the token parameter is missing or does not
        match Setting.get_token(). Otherwise the request body is decoded,
        handed to LookUpTable.update_worker and answered with 200.
        """
        import ast
        import json

        if not Definition.get_str_token() in req.params:
            res.body = "Token is required."
            res.content_type = "String"
            res.status = falcon.HTTP_401
            return

        if req.params[Definition.get_str_token()] != Setting.get_token():
            res.body = "Invalid token ID."
            res.content_type = "String"
            res.status = falcon.HTTP_401
            return

        # Read only as many bytes as the client announced.
        raw = str(req.stream.read(req.content_length or 0), 'UTF-8')

        # SECURITY: this body arrives over the network and was previously
        # passed to eval(), which lets a client run arbitrary code in the
        # master process. It is now parsed as data only: JSON first, then
        # a Python literal as a fallback for payloads sent as a dict repr.
        # Anything that is neither raises instead of being executed.
        try:
            data = json.loads(raw)
        except ValueError:
            data = ast.literal_eval(raw)

        LookUpTable.update_worker(data)
        SysOut.debug_string("Update worker status ({0})".format(
            data[Definition.get_str_node_name()]))

        res.body = "Okay"
        res.content_type = "String"
        res.status = falcon.HTTP_200
Пример #4
0
 def collect_exited_containers(self):
     """
     Periodically delete containers that have exited.

     Runs forever: sleeps for self.gc_run_interval, fetches the container
     statuses from DockerService and asks DockerService to delete every
     container whose status is 'exited'. Containers that could not be
     deleted are reported through the debug log.
     """
     while True:
         sleep(self.gc_run_interval)

         # Short ids of all containers currently reported as exited.
         exited_sids = [
             entry.get(Definition.Container.Status.get_str_sid())
             for entry in DockerService.get_containers_status()
             if entry.get(
                 Definition.Container.Status.get_str_status()) == 'exited'
         ]

         for exited_sid in exited_sids:
             deleted = DockerService.delete_container(exited_sid)
             if deleted:
                 continue
             SysOut.debug_string(
                 "Could not delete target container: {}".format(exited_sid))
Пример #5
0
    def start_job(self, target, job_data):
        """
        Ask a worker to create a container for a job.

        Args:
            target: indexable whose first two items are formatted into
                the worker URL as host and port.
            job_data: JSON-serialisable job description, sent as the
                request body.

        Returns:
            The response body decoded as UTF-8 (logged as the container
            sid) when the worker answers 200, otherwise False.

        Raises:
            Whatever urlopen raises for connection problems or HTTP
            error responses; callers are expected to handle these.
        """
        # send request to worker
        worker_url = "http://{}:{}/docker?token=None&command=create".format(
            target[0], target[1])
        req_data = bytes(json.dumps(job_data), 'utf-8')

        # NOTE: might need increase in timeout to allow download of large container images!!!
        # The response is used as a context manager so the underlying
        # connection is always closed, including on the non-200 path.
        with urlopen(worker_url, req_data) as resp:
            if resp.getcode() != 200:
                return False
            sid = str(resp.read(), 'utf-8')  # container was created

        SysOut.debug_string("Received sid from container: " + sid)
        return sid
Пример #6
0
    def on_put(self, req, res):
        """
        PUT: /status?token={None}

        Handles two kinds of request, both of which must carry a token
        parameter:

        * a container announcing that it has finished, which removes the
          container from the lookup table;
        * a worker status report (JSON body), accepted only when the
          token matches Setting.get_token().
        """
        token_key = Definition.get_str_token()
        if token_key not in req.params:
            res.body = "Token is required."
            res.content_type = "String"
            res.status = falcon.HTTP_401
            return

        finished_key = Definition.Docker.get_str_finished()
        if finished_key in req.params:
            # a container is shutting down, update containers
            # TODO: add some kind of safety mechanism to really make sure no new requests have been sent to this container before acknowledging removal?
            removed = LookUpTable.remove_container(
                req.params.get(Definition.Container.get_str_con_image_name()),
                req.params.get(finished_key))
            if not removed:
                # NOTE: container will continue as before when it reads this response!
                format_response_string(
                    res, falcon.HTTP_400,
                    "Could not remove container from table!")
                return
            # NOTE: container will terminate as soon as it reads this response!
            format_response_string(res, falcon.HTTP_200,
                                   "Container successfully removed")
            return

        if req.params[token_key] != Setting.get_token():
            res.body = "Invalid token ID."
            res.content_type = "String"
            res.status = falcon.HTTP_401
            return

        # Worker status report: decode the JSON body and store it.
        body_bytes = req.stream.read(req.content_length or 0)
        worker_info = json.loads(str(body_bytes, 'utf-8'))

        LookUpTable.update_worker(worker_info)
        SysOut.debug_string("Update worker status ({0})".format(
            worker_info[Definition.get_str_node_name()]))

        res.body = "Okay"
        res.content_type = "String"
        res.status = falcon.HTTP_200
Пример #7
0
    def set_node_addr(addr=None):
        """
        Set the node address stored on Setting.

        When addr is truthy it is stored as given. Otherwise the address
        is taken from socket.gethostname() and, if that is not a valid
        IPv4/IPv6 address, from Services.get_host_name_i(). If neither
        yields a valid address, SysOut.terminate_string is called.
        """
        if addr:
            Setting.__node_addr = addr
            return None

        import socket
        from harmonicIO.general.services import Services

        def looks_like_ip(value):
            # Accept either address family.
            return (Services.is_valid_ipv4(value)
                    or Services.is_valid_ipv6(value))

        # First attempt: the host name reported by the socket module.
        Setting.__node_addr = socket.gethostname()
        SysOut.debug_string(Setting.__node_addr)
        if looks_like_ip(Setting.__node_addr):
            return None

        # Second attempt: the value reported by Services.
        Setting.__node_addr = Services.get_host_name_i()
        if looks_like_ip(Setting.__node_addr):
            return None

        SysOut.terminate_string("Cannot get node ip address!")
Пример #8
0
    def job_queuer(self):
        """
        Consume jobs from JobQueue forever and start their containers.

        For each job, the candidate workers are obtained from
        find_available_worker and start_job is called until the number of
        containers given by job_data['num'] has been started. When a
        worker fails (start_job returns a falsy value or raises), the
        next candidate is tried; when no candidates are left the job is
        marked FAILED. On success the job is marked READY and the list of
        container sids is stored in job_data. The job is always written
        back with LookUpTable.Jobs.update_job and acknowledged on the
        queue.
        """
        while True:
            job_data = JobQueue.q.get()
            num_of_conts = job_data.get('num')
            job_sids = []
            targets = self.find_available_worker(
                job_data.get(Definition.Container.get_str_con_image_name()))
            SysOut.debug_string("Candidate workers: " + str(targets))
            n = 0
            while len(job_sids) < num_of_conts:
                if not targets:
                    # find_available_worker returned None or an empty
                    # list: nothing can host the job. Fail it here rather
                    # than letting the indexing below raise and kill this
                    # thread before the job is updated and acknowledged.
                    job_data['job_status'] = JobStatus.FAILED
                    break

                target = targets[n][0]
                SysOut.debug_string("Attempting to send request to worker: " +
                                    str(target))
                try:
                    sid = self.start_job(target, job_data)
                except Exception:
                    # Exception rather than a bare except, so that
                    # SystemExit/KeyboardInterrupt are not swallowed.
                    SysOut.debug_string(
                        "Response from worker threw exception!")
                    if n < len(targets) - 1:  # other candidates are available
                        SysOut.usr_string(
                            "We got to other candidates available!!!!!!! -------------------------------------"
                        )
                        n += 1
                        continue
                    # break makes it stop trying to create new containers
                    # as soon as one fails with no candidates left.
                    job_data['job_status'] = JobStatus.FAILED
                    break

                if not sid:
                    # The worker answered but did not create a container.
                    if n < len(targets) - 1:  # other candidates are available
                        n += 1
                        continue
                    job_data['job_status'] = JobStatus.FAILED
                    break

                job_sids.append(sid)
                if len(job_sids) == num_of_conts:
                    job_data['job_status'] = JobStatus.READY
                    job_data[Definition.Container.Status.get_str_sid(
                    )] = job_sids  #TODO: add this in metatable

            ## NOTE: can get really ugly, need to cleanup containers that started (rollback) OR let user know how many were started instead?? or retry failed ones?
            LookUpTable.Jobs.update_job(job_data)
            JobQueue.q.task_done()
Пример #9
0
    # Build the connector to the master from the MASTER_DATA and SETTING
    # configuration mappings.
    sc = StreamConnector(MASTER_DATA["MASTER_ADDR"],
                         MASTER_DATA["MASTER_PORT"],
                         token=SETTING["TOKEN"],
                         std_idle_time=SETTING["IDLE_TIME"],
                         max_try=SETTING["MAX_TRY"],
                         source_name=SETTING["SOURCE_NAME"])

    # Report whether the master answers; terminate_string is called when
    # it does not.
    if sc.is_master_alive():
        SysOut.out_string(
            "Connection to the master ({0}:{1}) is successful.".format(
                MASTER_DATA["MASTER_ADDR"], MASTER_DATA["MASTER_PORT"]))
    else:
        SysOut.terminate_string("Master at ({0}:{1}) is not alive!".format(
            MASTER_DATA["MASTER_ADDR"], MASTER_DATA["MASTER_PORT"]))

    SysOut.debug_string(
        "Generating random order of data in {0} series.".format(ITEM_NUMBER))
    # get_random_data is defined elsewhere; here it yields the order to
    # stream in and the data keyed by object type.
    stream_order, d_list = get_random_data()

    # Stream according to the random order
    # Only the second item of each entry (the object type) is used.
    for _, obj_type in stream_order:

        # A fresh container is requested from the connector per item.
        d_container = sc.get_data_container()

        # Assign data to container
        d_container += d_list[obj_type]

        # PROCC_DATA is looked up by object type and by the fixed "OS" key
        # (presumably the image name and OS label -- TODO confirm).
        sc.send_data(PROCC_DATA[obj_type], PROCC_DATA["OS"], d_container)

    SysOut.out_string("Finish!")
Пример #10
0
    def __check_for_scale():
        """
        Write a debug line listing, for every key in the message queue,
        how many items are stored under it.
        """
        per_queue = [
            "({0} -> {1}) ".format(name, len(items))
            for name, items in MessagesQueue.__msg_queue.items()
        ]
        SysOut.debug_string("MSGs " + "".join(per_queue))
Пример #11
0
    def on_post(self, req, res):
        """
        POST: /streamRequest?token=None
        Invoked by the micro-batch driver inside a container. Depending
        on the queue state for the container's image, the response either
        carries the next queued message or acknowledges the container's
        registration/update.
        """

        def reply(status, text):
            # Plain-text response used by every non-data outcome.
            res.body = text
            res.content_type = "String"
            res.status = status

        params = req.params

        if Definition.get_str_token() not in params:
            reply(falcon.HTTP_401, "Token is required.")
            return

        addr_key = Definition.REST.Batch.get_str_batch_addr()
        port_key = Definition.REST.Batch.get_str_batch_port()
        status_key = Definition.REST.Batch.get_str_batch_status()
        image_key = Definition.Container.get_str_con_image_name()

        # All four parameters describing the container must be present.
        required = (addr_key, port_key, status_key, image_key)
        if not all(key in params for key in required):
            reply(falcon.HTTP_406, "Invalid parameters!")
            return

        # Port and status must both be numeric strings.
        if not (params[port_key].isdigit() and params[status_key].isdigit()):
            reply(falcon.HTTP_406, "Invalid data type!")
            return

        ret = LookUpTable.Containers.get_container_object(req)

        # If queue contain data, ignore update and stream from queue
        length = MessagesQueue.get_queues_length(ret[image_key])

        if not length:
            LookUpTable.Containers.update_container(ret)
            SysOut.debug_string("No item in queue!")
            reply(falcon.HTTP_200, "No item in queue")
            return

        if length > 0 and ret[status_key] == CStatus.AVAILABLE:
            # Hand the next queued message for this image to the caller.
            res.data = bytes(MessagesQueue.pop_queue(ret[image_key]))
            res.content_type = "Bytes"
            res.status = falcon.HTTP_203
            return

        # Register a new channel
        LookUpTable.Containers.update_container(ret)
        reply(falcon.HTTP_200, "OK")