示例#1
0
    def test_connect(self):
        server = get_server_on_this_node()
        server.start()

        time.sleep(1)
        worker = create_worker(server.address)
        worker.start()

        #    while True:
        #      print "main thread is pausing..."
        time.sleep(5)

        print "connecting to %s:%d" % server.address

        client = JSClient(server.address)
        req = ControlMessage()
        req.type = ControlMessage.GET_NODE_LIST_REQ

        buf = client.do_rpc(req, True)
        resp = ControlMessage()
        resp.ParseFromString(buf)

        self.assertEquals(resp.node_count, 1)
        #    print resp

        worker.stop()
        client.close()
        server.stop()
        print "Done"
 def setUp(self):
     self.controller = Controller(('localhost', 0))
     self.controller.start()
     print "controller bound to %s:%d" % self.controller.address
     self.client = JSClient(self.controller.address)
     self.server = RemoteController()
     self.server.connect(self.controller.address[0],
                         self.controller.address[1])
     self.workers = []
示例#3
0
 def setUp(self):
     self.controller = Controller(('localhost', 0))
     self.controller.start()
     print "controller bound to %s:%d" % self.controller.address
     self.client = JSClient(self.controller.address)
示例#4
0
class TestWebIntegration(unittest.TestCase):
    def setUp(self):
        self.controller = Controller(('localhost', 0))
        self.controller.start()
        print "controller bound to %s:%d" % self.controller.address
        self.client = JSClient(self.controller.address)

    def tearDown(self):
        self.client.close()
        self.controller.stop()

    def test_operator_cube(self):
        # Create a worker and give it enough time to heartbeat (i.e. register with the controller)
        jsnode_cmd = "./jsnoded -a localhost:%d --start -C ./config/datanode.conf" % (
            self.controller.address[1])
        print "starting", jsnode_cmd
        workerProc = subprocess.Popen(jsnode_cmd,
                                      shell=True,
                                      preexec_fn=os.setsid)
        time.sleep(2)

        # Tell the controller to deploy a topology (it will be deployed on the only worker)
        compID = 17
        req = ControlMessage()
        req.type = ControlMessage.ALTER
        req.alter.computationID = compID

        newTask = req.alter.toStart.add()
        newTask.op_typename = "ContinuousSendK"
        newTask.id.computationID = req.alter.computationID
        newTask.id.task = 2

        newCube = req.alter.toCreate.add()
        newCube.name = "a_test_cube"
        newCube.overwrite_old = True

        d = newCube.schema.dimensions.add()
        d.name = "text"
        d.type = Element.STRING
        d.tuple_indexes.append(0)

        a = newCube.schema.aggregates.add()
        a.name = "count"
        a.type = "count"
        a.tuple_indexes.append(1)

        edge = req.alter.edges.add()
        edge.src = 2
        edge.computation = compID
        edge.dest_cube = newCube.name

        buf = self.client.do_rpc(req, True)
        resp = ControlMessage()
        resp.ParseFromString(buf)
        self.assertEquals(resp.type, ControlMessage.OK)
        self.assertEquals(len(self.controller.computations), 1)
        # Wait for the topology to start running on the worker
        time.sleep(2)
        workerList = self.controller.get_nodes()
        assert (len(workerList) == 1)
        assert (len(workerList[0].assignments) == 1)
        self.assertEquals(workerList[0].assignments.values()[0].state,
                          WorkerAssignment.RUNNING)

        # GET the web interface page and make sure both the operator and cube appear
        getResp = urllib2.urlopen("http://localhost:8081/").read()
        self.assertTrue(newTask.op_typename in getResp)
        self.assertTrue(newCube.name in getResp)
        print getResp
        os.killpg(workerProc.pid, signal.SIGTERM)
class TestController(unittest.TestCase):
    def setUp(self):
        self.controller = Controller(('localhost', 0))
        self.controller.start()
        print "controller bound to %s:%d" % self.controller.address
        self.client = JSClient(self.controller.address)

    def tearDown(self):
        self.client.close()
        self.controller.stop()

    def test_connect(self):
        # Test the connection by a simple GET_NODES call
        req = ControlMessage()
        req.type = ControlMessage.GET_NODE_LIST_REQ
        buf = self.client.do_rpc(req, True)
        resp = ControlMessage()
        resp.ParseFromString(buf)
        self.assertEquals(resp.node_count, 0)

    def test_heartbeat(self):
        req = ControlMessage()
        req.type = ControlMessage.HEARTBEAT
        req.heartbeat.freemem_mb = 3900
        req.heartbeat.cpuload_pct = 90
        addr = self.client.sock.getsockname()
        req.heartbeat.dataplane_addr.address = addr[0]
        req.heartbeat.dataplane_addr.portno = addr[1]
        buf = self.client.do_rpc(req, False)
        # Since no response is expected, sleep a little to give the controller time to process message
        time.sleep(1)
        workerList = self.controller.get_nodes()
        self.assertEquals(len(workerList), 1)
        self.assertEquals(workerList[0].endpoint,
                          self.client.sock.getsockname())

    def test_worker_liveness(self):

        print "\n--- test worker liveness ---"
        # Use a smaller heartbeat interval to speed up this test
        hbInterval = 0.5
        self.controller.hbInterval = hbInterval
        worker1 = create_worker(self.controller.address, hbInterval)
        worker1.start()
        worker2 = create_worker(self.controller.address, hbInterval)
        worker2.start()
        time.sleep(hbInterval)

        # Initially the controller should see two alive workers
        workerList = self.controller.get_nodes()
        self.assertEquals(len(workerList), 2)
        self.assertEquals(workerList[0].state, CWorker.ALIVE)
        self.assertEquals(workerList[1].state, CWorker.ALIVE)

        # Stop sending heartbeats from one of the workers; it should be marked dead after
        # several hb intervals
        worker1.stop_heartbeat_thread()
        time.sleep(hbInterval * (CWorker.DEFAULT_HB_DEAD_INTERVALS + 1))
        workerList = self.controller.get_nodes()
        self.assertEquals(len(workerList), 1)
        self.assertEquals(workerList[0].state, CWorker.ALIVE)

        # Kill the second worker; it should be marked dead much faster since we're closing the socket
        worker2.stop()
        time.sleep(1)
        self.assertEquals(len(self.controller.get_nodes()), 0)

    def test_deploy(self):
        print "--- test deploy ---"
        # Create a worker and give it enough time to heartbeat (i.e. register with the controller)
        worker1 = create_worker(self.controller.address)
        worker1.start()
        worker2 = create_worker(self.controller.address)
        worker2.start()
        time.sleep(2)
        # Deploy a single-operator topology
        req = ControlMessage()
        req.type = ControlMessage.ALTER
        req.alter.computationID = 17
        newOp = req.alter.toStart.add()
        newOp.op_typename = OpType.UNIX
        cfg = newOp.config.add()
        cfg.opt_name = "cmd"
        cfg.val = "cat /etc/shells"
        newOp.id.computationID = req.alter.computationID
        newOp.id.task = 1
        # Bind this operator to the second worker
        workerEndpoint = worker2.controllerConn.getsockname()
        newOp.site.address = workerEndpoint[0]
        newOp.site.portno = workerEndpoint[1]

        buf = self.client.do_rpc(req, True)
        resp = ControlMessage()
        resp.ParseFromString(buf)
        self.assertEquals(resp.type, ControlMessage.OK)
        # Wait for the topology to start running; there should be one task on the
        # second worker and none on the first
        time.sleep(1)
        self.assertEquals(len(worker2.tasks), 1)
        self.assertEquals(len(worker1.tasks), 0)
        self.assertEquals(len(self.controller.computations), 1)

        req = ControlMessage()
        req.type = ControlMessage.STOP_COMPUTATION
        req.comp_to_stop = int(1)
        self.controller.stop_computation(resp, req)
        self.assertEquals(len(self.controller.computations), 0)
        cworker2 = self.controller.workers[
            worker2.controllerConn.getsockname()]
        self.assertTrue(cworker2 is not None)

        self.assertEquals(len(cworker2.assignments), 0)

        print "stopping workers"
        worker1.stop()
        worker2.stop()
示例#6
0
class TestClientReaderIntegration(unittest.TestCase):
    def setUp(self):
        self.controller = Controller(('localhost', 0))
        self.controller.start()
        print "controller bound to %s:%d" % self.controller.address

        self.client = JSClient(self.controller.address)

    def tearDown(self):
        self.controller.stop()
        self.client.close()
        os.close(self.jsnode_out[0])
        os.killpg(self.workerProc.pid, signal.SIGKILL)

    # returns the response to this request
    def make_deploy_request(self, graph):
        # copy graph to alter message
        req = ControlMessage()
        req.type = ControlMessage.ALTER
        req.alter.Clear()

        graph.add_to_PB(req.alter)

        return self.client.do_rpc(req, True)

    def validate_response(self, buf):
        resp = ControlMessage()
        resp.ParseFromString(buf)

        if resp.type == ControlMessage.ERROR:
            print "error from client", resp.error_msg
        self.assertEquals(resp.type, ControlMessage.OK)
        # Make sure the controller created state for this computation
        self.assertEquals(len(self.controller.computations), 1)

        # Wait for the topology to start running on the worker
        time.sleep(1)

    def make_local_worker(self):

        try:
            self.workerProc.terminate()
        except AttributeError:
            pass

        jsnode_cmd = "./jsnoded -a %s:%d --start " \
                     "-C ./config/datanode.conf &"
        jsnode_cmd = jsnode_cmd % self.controller.address
        print "starting", jsnode_cmd
        self.jsnode_out = mkstemp(suffix='client_reader_test_jsnode_dump',
                                  prefix='/tmp/jsnode')
        self.workerProc = subprocess.Popen(jsnode_cmd,
                                           shell=True,
                                           preexec_fn=os.setsid,
                                           stdout=self.jsnode_out[0],
                                           stderr=subprocess.STDOUT)
        time.sleep(2)

        # shouldn't have terminated yet
        # TODO why does this fail
        #self.assertIsNone(self.workerProc.poll())

    def test_reader(self):
        g = jsapi.QueryGraph()

        k = 40
        echoer = jsapi.SendK(g, k)

        resultReader = ClientDataReader()
        g.connectExternal(echoer, resultReader.prep_to_receive_data())

        self.make_local_worker()
        #self.controller.deploy(g)
        self.validate_response(self.make_deploy_request(g))

        # validate SendK by counting
        tuplesReceived = []
        map(tuplesReceived.append, resultReader)

        self.assertEquals(len(tuplesReceived), k)
        print "client reader test succeeded"
示例#7
0
 def connect(self, addr, port):
     """Create a JSClient for the (addr, port) endpoint and keep it as self.client."""
     endpoint = (addr, port)
     self.client = JSClient(endpoint)
示例#8
0
class RemoteController():
    """Client-side handle for talking to a controller process over a JSClient."""

    def __init__(self, netaddr=None):
        """Set up a connection from a client to a controller process.

        netaddr should be a tuple in the following format: (host_IP, host_port).
        When it is omitted no connection is made and connect() must be called
        before any other method.
        """
        self.node_cache = None

        # TODO make netaddr mandatory (why isn't it already?)
        if netaddr is not None:
            logger.info("connecting to %s", str(netaddr))
            self.connect(*netaddr)

    def connect(self, addr, port):
        """Create a JSClient for (addr, port) and keep it as self.client."""
        self.client = JSClient((addr, port))

    def all_nodes(self):
        """Returns a list of all nodes in the system.

        The list is fetched from the controller on the first call and cached
        in self.node_cache; later calls return the cached list.
        """
        if self.node_cache is None:
            self.node_cache = []
            req = ControlMessage()
            req.type = ControlMessage.GET_NODE_LIST_REQ

            resp = self.client.ctrl_rpc(req, True)

            # Copy each entry into its own NodeID rather than keeping
            # references into the response message.
            for nID in resp.nodes:
                nID2 = NodeID()
                nID2.CopyFrom(nID)
                self.node_cache.append(nID2)
        return self.node_cache

    def get_a_node(self):
        """Return the first node of all_nodes().

        Raises IndexError when the node list is empty.
        """
        return self.all_nodes()[0]

    def deploy(self, op_graph, cube=None, cube_placement=None):
        """Deploys an operator graph.

        If ``cube`` is given it is first instantiated on ``cube_placement``, or
        on every known node when no placement is given. ``cube_placement``
        without ``cube`` is rejected by an assertion. The controller's reply
        is printed.
        """
        if cube is not None:
            if cube_placement is not None:
                cube.instantiate_on(cube_placement)
            else:
                cube.instantiate_on(self.all_nodes())
        else:
            assert cube_placement is None

        logger.info("Sending create request to controller...")
        resp = self.client.ctrl_rpc(op_graph.get_deploy_pb(), True)

        # Single-argument form prints the same text under Python 2.
        print(resp)

    def deploy_pb(self, req):
        """Deploys an operator graph; returns an integer ID or an error message"""

        logger.info("Sending create request to controller...")
        resp = self.client.ctrl_rpc(req, True)
        if resp.type == ControlMessage.OK:
            logger.info("Started job %i", resp.started_comp_id)
            return resp.started_comp_id
        else:
            logger.error("Failed to start job:" + resp.error_msg.msg)
            return resp.error_msg.msg

    def stop_computation(self, comput_id):
        """Send a STOP_COMPUTATION request for ``comput_id`` and return the reply."""
        req = ControlMessage()
        req.type = ControlMessage.STOP_COMPUTATION
        req.comp_to_stop = int(comput_id)
        resp = self.client.ctrl_rpc(req, True)
        return resp
class TestOpIntegration(unittest.TestCase):
    def setUp(self):
        self.controller = Controller(('localhost', 0))
        self.controller.start()
        print "controller bound to %s:%d" % self.controller.address
        self.client = JSClient(self.controller.address)
        self.server = RemoteController()
        self.server.connect(self.controller.address[0],
                            self.controller.address[1])
        self.workers = []

    def tearDown(self):
        self.client.close()
        self.controller.stop()
        try:
            for worker in self.workers:
                worker.terminate()
            print "killing workers WORKED"
        except OSError:
            print "killing workers FAILED"
        self.workers = []

    def start_workers(self, num_workers):
        # Use at least 2 workers
        webPortMin = 8083
        workerProcs = []
        webPort = webPortMin
        for i in range(num_workers):
            # Create a worker
            jsnode_cmd = "./jsnoded -a localhost:%d -w %d --start -C ./config/local_bare.conf" % (
                self.controller.address[1], webPort)
            webPort += 1
            print "starting", jsnode_cmd
            workerProcs.append(
                subprocess.Popen(jsnode_cmd, shell=True, preexec_fn=os.setsid))
            time.sleep(1)

        self.workers.extend(workerProcs)
        # Give the workers time to register with the controller
        time.sleep(3)

    def verify_workers(self, num_workers):
        # Get the list of workers
        req = ControlMessage()
        req.type = ControlMessage.GET_NODE_LIST_REQ
        buf = self.client.do_rpc(req, True)
        resp = ControlMessage()
        resp.ParseFromString(buf)
        workersEp = resp.nodes
        self.assertEquals(len(workersEp), num_workers)

    def test_topk(self):
        num_workers = 2
        self.start_workers(num_workers)
        self.verify_workers(num_workers)

        root_node = self.server.get_a_node()
        assert isinstance(root_node, NodeID)
        all_nodes = self.server.all_nodes()

        for i in range(5):
            g = get_graph(root_node, all_nodes, rate=1000)
            req = g.get_deploy_pb()
            cid = self.server.deploy_pb(req)
            if type(cid) == types.IntType:
                print time.ctime(), "Computation running; ID =", cid
            else:
                print "computation failed", cid
                break
            time.sleep(3)
            workerList = self.controller.get_nodes()
            self.assertEquals(len(workerList), num_workers)
            #self.assertEquals(len(workerList[0].assignments), 1)
            for j in range(num_workers):
                self.assertEquals(workerList[j].assignments.values()[0].state,
                                  WorkerAssignment.RUNNING)
            self.server.stop_computation(cid)
            print time.ctime(), "Computation stopped; ID =", cid
            time.sleep(2)
示例#10
0
class TestOpIntegration(unittest.TestCase):
    def setUp(self):
        self.controller = Controller(('localhost', 0))
        self.controller.start()
        print "controller bound to %s:%d" % self.controller.address
        self.client = JSClient(self.controller.address)
        self.workers = list()

    def tearDown(self):
        self.client.close()
        self.controller.stop()
        map(lambda proc: os.killpg(proc.pid, signal.SIGTERM), self.workers)
        self.workers = list()

    def test_operator(self):
        # Create a worker and give it enough time to heartbeat (i.e. register with the controller)
        jsnode_cmd = "./jsnoded -a localhost:%d --start -C ./config/datanode.conf" % (
            self.controller.address[1])
        print "starting", jsnode_cmd
        workerProc = subprocess.Popen(jsnode_cmd,
                                      shell=True,
                                      preexec_fn=os.setsid)
        self.workers.append(workerProc)
        time.sleep(2)

        # Tell the controller to deploy a topology (it will be deployed on the only worker)
        compID = 17
        req = ControlMessage()
        req.type = ControlMessage.ALTER
        req.alter.computationID = compID
        newTask = req.alter.toStart.add()
        newTask.op_typename = "DummyReceiver"
        newTask.id.computationID = req.alter.computationID
        newTask.id.task = 2

        buf = self.client.do_rpc(req, True)
        resp = ControlMessage()
        resp.ParseFromString(buf)
        self.assertEquals(resp.type, ControlMessage.OK)
        # Make sure the controller created state for this computation
        self.assertEquals(len(self.controller.computations), 1)
        # Wait for the topology to start running on the worker
        time.sleep(2)
        workerList = self.controller.get_nodes()
        assert (len(workerList) == 1)
        self.assertEquals(len(workerList[0].assignments), 1)
        self.assertEquals(workerList[0].assignments.values()[0].state,
                          WorkerAssignment.RUNNING)

    def test_multiple_operators(self):
        # Create a worker and give it enough time to heartbeat (i.e. register with the controller)
        jsnode_cmd = "./jsnoded -a localhost:%d --start -C ./config/datanode.conf" % (
            self.controller.address[1])
        print "starting", jsnode_cmd
        workerProc = subprocess.Popen(jsnode_cmd,
                                      shell=True,
                                      preexec_fn=os.setsid)
        self.workers.append(workerProc)
        time.sleep(2)

        # Tell the controller to deploy multiple single-node topologies (they will all be
        # deployed on the only worker)
        numComps = 10
        req = ControlMessage()
        resp = ControlMessage()
        for compID in range(1, numComps + 1):
            req.type = ControlMessage.ALTER
            req.alter.Clear()
            req.alter.computationID = compID
            newTask = req.alter.toStart.add()
            newTask.id.computationID = compID
            newTask.id.task = 2
            newTask.op_typename = "StringGrep"
            opCfg = newTask.config.add()
            opCfg.opt_name = "pattern"
            opCfg.val = ".*"
            buf = self.client.do_rpc(req, True)
            resp.Clear()
            resp.ParseFromString(buf)
            self.assertEquals(resp.type, ControlMessage.OK)
            # Make sure the controller created state for each computation
            self.assertEquals(len(self.controller.computations), compID)

        # Wait for the topologies to start running on the worker
        time.sleep(2)
        workerList = self.controller.get_nodes()
        assert (len(workerList) == 1)
        for assignment in workerList[0].assignments.values():
            self.assertEquals(assignment.state, WorkerAssignment.RUNNING)

    def test_operator_chain(self):
        # Use at least 2 workers
        numWorkers = 5
        webPortMin = 8082
        workerProcs = []
        webPort = webPortMin
        for i in range(numWorkers):
            # Create a worker
            jsnode_cmd = "./jsnoded -a localhost:%d -w %d --start -C ./config/datanode.conf" % (
                self.controller.address[1], webPort)
            webPort += 1
            print "starting", jsnode_cmd
            workerProcs.append(
                subprocess.Popen(jsnode_cmd, shell=True, preexec_fn=os.setsid))
        self.workers.extend(workerProcs)
        # Give the workers time to register with the controller
        time.sleep(3)

        # Get the list of workers
        req = ControlMessage()
        req.type = ControlMessage.GET_NODE_LIST_REQ
        buf = self.client.do_rpc(req, True)
        resp = ControlMessage()
        resp.ParseFromString(buf)
        workersEp = resp.nodes
        self.assertEquals(len(workersEp), numWorkers)

        # Issue a query that runs an operator on each worker: send some tuples from a
        # source, filter them through the remaining workers and collect at the end.
        req = ControlMessage()
        assignedOps = []
        compID = 17
        # Make this number unique so we can check it later
        numTuples = 193
        for i in range(len(workersEp)):
            req.type = ControlMessage.ALTER
            req.alter.computationID = compID
            task = req.alter.toStart.add()
            task.id.computationID = compID
            task.id.task = i + 1  # start task numbers at 1
            # Pin the operator to the current worker
            task.site.address = workersEp[i].address
            task.site.portno = workersEp[i].portno

            if i == 0:
                # Send some tuples from the first worker; use a continuous sender so the chain
                # doesn't get torn down.
                task.op_typename = "ContinuousSendK"
                opCfg = task.config.add()
                opCfg.opt_name = "k"
                opCfg.val = str(numTuples)
                opCfg = task.config.add()
                opCfg.opt_name = "period"
                # Use a large period so we only have to check for one batch of tuples
                opCfg.val = str(10000)
            elif i == len(workersEp) - 1:
                # Collect tuples at the last worker
                task.op_typename = "DummyReceiver"
            else:
                # Insert no-op filters in between
                task.op_typename = "StringGrep"
                opCfg = task.config.add()
                opCfg.opt_name = "pattern"
                opCfg.val = ".*"
                opCfg2 = task.config.add()
                opCfg2.opt_name = "id"
                opCfg2.val = "0"

            assignedOps.append(task)

            if i > 0:
                # Create an edge from the previous operator to this one
                e = req.alter.edges.add()
                e.src = task.id.task - 1
                e.dest = task.id.task
                e.computation = compID

        # Deploy the query
        buf = self.client.do_rpc(req, True)
        resp = ControlMessage()
        resp.ParseFromString(buf)
        self.assertEquals(ControlMessage.OK, resp.type)
        # The controller overwrites the computation ID, so get it again
        self.assertEquals(1, len(self.controller.computations))
        compID = self.controller.computations.keys()[0]
        time.sleep(2)

        # Make sure the operators were started, one per worker
        webPort = webPortMin
        for i in range(len(workersEp)):
            # GET the web interface of each worker
            url = "http://" + workersEp[i].address + ":" + str(webPort) + "/"
            getResp = urllib2.urlopen(url).read()
            print getResp
            # Figure out which operator is on this worker based on identifying information
            for op in assignedOps[:]:
                opName = op.op_typename
                opId = "(" + str(compID) + "," + str(op.id.task) + ")"
                #print "SEARCHING FOR " + opName + " AND " + str(opId)
                if (opName in getResp) and (opId in getResp):
                    assignedOps.remove(op)
                    # If this is the final receiver, check the received tuple count
                    if opName == "DummyReceiver":
                        if not str(numTuples) in getResp:
                            print getResp
                        self.assertTrue(str(numTuples) in getResp)
                    break
            webPort += 1
        # We should have matched all operators, one per worker
        print "length at end is " + str(len(assignedOps))
        self.assertEquals(0, len(assignedOps))