def test_connect(self):
    """Start a server and one worker, then check the node list over RPC.

    The server must report exactly one node once the worker has had time
    to register. The worker, client and server are shut down in a
    ``finally`` block so a failing assertion or RPC does not leak them.
    """
    server = get_server_on_this_node()
    server.start()
    worker = None
    client = None
    try:
        time.sleep(1)
        started = create_worker(server.address)
        started.start()
        # Only remember the worker once it has actually been started, so
        # the cleanup below never stops a worker that never ran.
        worker = started
        # Give the worker time to register with the server.
        time.sleep(5)

        print("connecting to %s:%d" % server.address)
        client = JSClient(server.address)
        req = ControlMessage()
        req.type = ControlMessage.GET_NODE_LIST_REQ
        buf = client.do_rpc(req, True)
        resp = ControlMessage()
        resp.ParseFromString(buf)
        self.assertEqual(resp.node_count, 1)
    finally:
        # Same shutdown order as before: worker, then client, then server.
        if worker is not None:
            worker.stop()
        if client is not None:
            client.close()
        server.stop()
    print("Done")
def setUp(self):
    """Start a controller and connect both a raw client and a RemoteController to it.

    Also initialises ``self.workers`` as an empty list.
    """
    # Port 0 presumably lets the OS pick a free port; the bound address is
    # read back from the controller afterwards.
    listen_on = ('localhost', 0)
    self.controller = Controller(listen_on)
    self.controller.start()
    bound = self.controller.address
    print("controller bound to %s:%d" % bound)

    self.client = JSClient(bound)
    self.server = RemoteController()
    self.server.connect(bound[0], bound[1])
    self.workers = []
def setUp(self):
    """Start a controller on localhost and open a JSClient connection to it."""
    controller = Controller(('localhost', 0))
    self.controller = controller
    controller.start()
    # Report the address the controller actually ended up bound to.
    print("controller bound to %s:%d" % controller.address)
    self.client = JSClient(controller.address)
class TestWebIntegration(unittest.TestCase):
    """Checks that a deployed operator and cube show up on the worker's web page."""

    def setUp(self):
        """Start a controller, connect a client and reset the worker process list."""
        self.controller = Controller(('localhost', 0))
        self.controller.start()
        print("controller bound to %s:%d" % self.controller.address)
        self.client = JSClient(self.controller.address)
        # Worker processes spawned by a test; tearDown kills them so that a
        # failing assertion cannot leave a jsnoded process behind.
        self.workerProcs = []

    def tearDown(self):
        """Kill spawned workers, then close the client and stop the controller."""
        for proc in self.workerProcs:
            try:
                # Workers are started with os.setsid, so the Popen pid is also
                # the process-group id; signal the whole group.
                os.killpg(proc.pid, signal.SIGTERM)
            except OSError as err:
                print("could not kill worker %d: %s" % (proc.pid, err))
        self.workerProcs = []
        self.client.close()
        self.controller.stop()

    def test_operator_cube(self):
        """Deploy one operator feeding one cube and look for both in the web UI."""
        # Create a worker and give it enough time to heartbeat (i.e. register
        # with the controller)
        jsnode_cmd = "./jsnoded -a localhost:%d --start -C ./config/datanode.conf" % (
            self.controller.address[1])
        print("starting %s" % jsnode_cmd)
        workerProc = subprocess.Popen(jsnode_cmd, shell=True, preexec_fn=os.setsid)
        self.workerProcs.append(workerProc)
        time.sleep(2)

        # Tell the controller to deploy a topology (it will be deployed on the
        # only worker)
        compID = 17
        req = ControlMessage()
        req.type = ControlMessage.ALTER
        req.alter.computationID = compID

        newTask = req.alter.toStart.add()
        newTask.op_typename = "ContinuousSendK"
        newTask.id.computationID = req.alter.computationID
        newTask.id.task = 2

        newCube = req.alter.toCreate.add()
        newCube.name = "a_test_cube"
        newCube.overwrite_old = True
        d = newCube.schema.dimensions.add()
        d.name = "text"
        d.type = Element.STRING
        d.tuple_indexes.append(0)
        a = newCube.schema.aggregates.add()
        a.name = "count"
        a.type = "count"
        a.tuple_indexes.append(1)

        # Connect the operator (task 2) to the cube
        edge = req.alter.edges.add()
        edge.src = 2
        edge.computation = compID
        edge.dest_cube = newCube.name

        buf = self.client.do_rpc(req, True)
        resp = ControlMessage()
        resp.ParseFromString(buf)
        self.assertEqual(resp.type, ControlMessage.OK)
        self.assertEqual(len(self.controller.computations), 1)

        # Wait for the topology to start running on the worker
        time.sleep(2)
        workerList = self.controller.get_nodes()
        self.assertEqual(len(workerList), 1)
        self.assertEqual(len(workerList[0].assignments), 1)
        self.assertEqual(list(workerList[0].assignments.values())[0].state,
                         WorkerAssignment.RUNNING)

        # GET the web interface page and make sure both the operator and cube appear
        getResp = urllib2.urlopen("http://localhost:8081/").read()
        self.assertTrue(newTask.op_typename in getResp)
        self.assertTrue(newCube.name in getResp)
        print(getResp)
class TestController(unittest.TestCase):
    """Exercises the controller with in-process workers and a raw JSClient."""

    def setUp(self):
        """Start a controller on localhost and connect a client to it."""
        self.controller = Controller(('localhost', 0))
        self.controller.start()
        print("controller bound to %s:%d" % self.controller.address)
        self.client = JSClient(self.controller.address)

    def tearDown(self):
        """Close the client and stop the controller."""
        self.client.close()
        self.controller.stop()

    def test_connect(self):
        """A fresh controller reports zero nodes."""
        # Test the connection by a simple GET_NODES call
        req = ControlMessage()
        req.type = ControlMessage.GET_NODE_LIST_REQ
        buf = self.client.do_rpc(req, True)
        resp = ControlMessage()
        resp.ParseFromString(buf)
        self.assertEqual(resp.node_count, 0)

    def test_heartbeat(self):
        """A heartbeat sent from the client socket registers it as a worker."""
        req = ControlMessage()
        req.type = ControlMessage.HEARTBEAT
        req.heartbeat.freemem_mb = 3900
        req.heartbeat.cpuload_pct = 90
        addr = self.client.sock.getsockname()
        req.heartbeat.dataplane_addr.address = addr[0]
        req.heartbeat.dataplane_addr.portno = addr[1]
        self.client.do_rpc(req, False)
        # Since no response is expected, sleep a little to give the controller
        # time to process message
        time.sleep(1)
        workerList = self.controller.get_nodes()
        self.assertEqual(len(workerList), 1)
        self.assertEqual(workerList[0].endpoint, self.client.sock.getsockname())

    def test_worker_liveness(self):
        """Workers drop out of the node list when heartbeats stop or the socket closes."""
        print("\n--- test worker liveness ---")
        # Use a smaller heartbeat interval to speed up this test
        hbInterval = 0.5
        self.controller.hbInterval = hbInterval
        worker1 = create_worker(self.controller.address, hbInterval)
        worker1.start()
        worker2 = create_worker(self.controller.address, hbInterval)
        worker2.start()
        worker2_running = True
        try:
            time.sleep(hbInterval)
            # Initially the controller should see two alive workers
            workerList = self.controller.get_nodes()
            self.assertEqual(len(workerList), 2)
            self.assertEqual(workerList[0].state, CWorker.ALIVE)
            self.assertEqual(workerList[1].state, CWorker.ALIVE)

            # Stop sending heartbeats from one of the workers; it should be
            # marked dead after several hb intervals
            # NOTE(review): worker1 only has its heartbeat thread stopped and is
            # never fully stopped, exactly as before -- confirm whether it also
            # needs a stop() call.
            worker1.stop_heartbeat_thread()
            time.sleep(hbInterval * (CWorker.DEFAULT_HB_DEAD_INTERVALS + 1))
            workerList = self.controller.get_nodes()
            self.assertEqual(len(workerList), 1)
            self.assertEqual(workerList[0].state, CWorker.ALIVE)

            # Kill the second worker; it should be marked dead much faster since
            # we're closing the socket
            worker2_running = False
            worker2.stop()
            time.sleep(1)
            self.assertEqual(len(self.controller.get_nodes()), 0)
        finally:
            # Do not leave worker2 running when an assertion above failed.
            if worker2_running:
                worker2.stop()

    def test_deploy(self):
        """Deploy one operator pinned to the second worker, then stop the computation."""
        print("--- test deploy ---")
        # Create workers and give them enough time to heartbeat (i.e. register
        # with the controller)
        started = []
        try:
            for _ in range(2):
                worker = create_worker(self.controller.address)
                worker.start()
                started.append(worker)
            worker1, worker2 = started
            time.sleep(2)

            # Deploy a single-operator topology
            req = ControlMessage()
            req.type = ControlMessage.ALTER
            req.alter.computationID = 17
            newOp = req.alter.toStart.add()
            newOp.op_typename = OpType.UNIX
            cfg = newOp.config.add()
            cfg.opt_name = "cmd"
            cfg.val = "cat /etc/shells"
            newOp.id.computationID = req.alter.computationID
            newOp.id.task = 1
            # Bind this operator to the second worker
            workerEndpoint = worker2.controllerConn.getsockname()
            newOp.site.address = workerEndpoint[0]
            newOp.site.portno = workerEndpoint[1]

            buf = self.client.do_rpc(req, True)
            resp = ControlMessage()
            resp.ParseFromString(buf)
            self.assertEqual(resp.type, ControlMessage.OK)

            # Wait for the topology to start running; there should be one task
            # on the second worker and none on the first
            time.sleep(1)
            self.assertEqual(len(worker2.tasks), 1)
            self.assertEqual(len(worker1.tasks), 0)
            self.assertEqual(len(self.controller.computations), 1)

            # NOTE(review): the computation is stopped by ID 1 although 17 was
            # requested; presumably the controller assigns its own IDs -- confirm.
            req = ControlMessage()
            req.type = ControlMessage.STOP_COMPUTATION
            req.comp_to_stop = 1
            self.controller.stop_computation(resp, req)
            self.assertEqual(len(self.controller.computations), 0)

            cworker2 = self.controller.workers[worker2.controllerConn.getsockname()]
            self.assertTrue(cworker2 is not None)
            self.assertEqual(len(cworker2.assignments), 0)
        finally:
            # Stop whatever was started, even when an assertion failed.
            print("stopping workers")
            for worker in started:
                worker.stop()
class TestClientReaderIntegration(unittest.TestCase):
    """Runs a SendK operator on a local jsnoded worker and reads its output in the client."""

    def setUp(self):
        """Start a controller, connect a client and mark the worker as not yet started."""
        self.controller = Controller(('localhost', 0))
        self.controller.start()
        print("controller bound to %s:%d" % self.controller.address)
        self.client = JSClient(self.controller.address)
        # Both stay None until make_local_worker() runs, so tearDown can tell
        # whether there is anything to clean up.
        self.workerProc = None
        self.jsnode_out = None

    def tearDown(self):
        """Stop the controller and client, then release the worker's log fd and process."""
        self.controller.stop()
        self.client.close()
        if self.jsnode_out is not None:
            os.close(self.jsnode_out[0])
            self.jsnode_out = None
        if self.workerProc is not None:
            try:
                os.killpg(self.workerProc.pid, signal.SIGKILL)
            except OSError as err:
                # Do not let a vanished process group mask the test result.
                print("could not kill worker %d: %s" % (self.workerProc.pid, err))
            self.workerProc = None

    # returns the response to this request
    def make_deploy_request(self, graph):
        """Wrap ``graph`` in an ALTER message, send it, and return the raw reply."""
        # copy graph to alter message
        req = ControlMessage()
        req.type = ControlMessage.ALTER
        req.alter.Clear()
        graph.add_to_PB(req.alter)
        return self.client.do_rpc(req, True)

    def validate_response(self, buf):
        """Parse ``buf`` and assert that the deployment succeeded."""
        resp = ControlMessage()
        resp.ParseFromString(buf)
        if resp.type == ControlMessage.ERROR:
            print("error from client %s" % (resp.error_msg,))
        self.assertEqual(resp.type, ControlMessage.OK)
        # Make sure the controller created state for this computation
        self.assertEqual(len(self.controller.computations), 1)
        # Wait for the topology to start running on the worker
        time.sleep(1)

    def make_local_worker(self):
        """(Re)start a jsnoded worker whose output goes to a temp file."""
        if self.workerProc is not None:
            self.workerProc.terminate()
        if self.jsnode_out is not None:
            # Release the previous log fd before replacing it.
            os.close(self.jsnode_out[0])
            self.jsnode_out = None

        jsnode_cmd = "./jsnoded -a %s:%d --start " \
                     "-C ./config/datanode.conf &"
        jsnode_cmd = jsnode_cmd % self.controller.address
        print("starting %s" % jsnode_cmd)
        self.jsnode_out = mkstemp(suffix='client_reader_test_jsnode_dump',
                                  prefix='/tmp/jsnode')
        self.workerProc = subprocess.Popen(jsnode_cmd, shell=True,
                                           preexec_fn=os.setsid,
                                           stdout=self.jsnode_out[0],
                                           stderr=subprocess.STDOUT)
        time.sleep(2)
        # shouldn't have terminated yet
        # TODO why does this fail
        # NOTE(review): probably because the command ends in '&': the shell that
        # Popen tracks backgrounds jsnoded and exits at once, so poll() is not
        # None -- confirm.
        #self.assertIsNone(self.workerProc.poll())

    def test_reader(self):
        """SendK(k) connected to a ClientDataReader must deliver exactly k tuples."""
        g = jsapi.QueryGraph()
        k = 40
        echoer = jsapi.SendK(g, k)
        resultReader = ClientDataReader()
        g.connectExternal(echoer, resultReader.prep_to_receive_data())
        self.make_local_worker()
        #self.controller.deploy(g)
        self.validate_response(self.make_deploy_request(g))
        # validate SendK by counting
        tuplesReceived = list(resultReader)
        self.assertEqual(len(tuplesReceived), k)
        print("client reader test succeeded")
def connect(self, addr, port):
    """Create a JSClient for the controller at (addr, port) and store it on self.client."""
    endpoint = (addr, port)
    self.client = JSClient(endpoint)
class RemoteController(object):
    """Client-side handle for a controller process, talking to it through a JSClient.

    The node list is fetched once by all_nodes() and cached in ``node_cache``.
    """

    def __init__(self, netaddr=None):
        """
        Set up a connection from a client to a controller process.

        netaddr should be a tuple in the following format: (host_IP, host_port).
        When netaddr is None no connection is opened; call connect() before
        using any method that issues an RPC.
        """
        self.node_cache = None
        # TODO make netaddr mandatory (why isn't it already?)
        if netaddr is not None:
            logger.info("connecting to %s" % str(netaddr))
            self.connect(*netaddr)

    def connect(self, addr, port):
        """Open a JSClient connection to the controller at (addr, port)."""
        self.client = JSClient((addr, port))

    def all_nodes(self):
        """Returns a list of all nodes in the system.

        The list is requested from the controller on the first call only;
        later calls return the cached list.
        """
        if self.node_cache is None:
            self.node_cache = []
            req = ControlMessage()
            req.type = ControlMessage.GET_NODE_LIST_REQ
            resp = self.client.ctrl_rpc(req, True)
            for nID in resp.nodes:
                # Store copies rather than the entries of the response message.
                nID2 = NodeID()
                nID2.CopyFrom(nID)
                self.node_cache.append(nID2)
        return self.node_cache

    def get_a_node(self):
        """Returns the first known node; raises IndexError when there are none."""
        return self.all_nodes()[0]

    def deploy(self, op_graph, cube=None, cube_placement=None):
        """Deploys an operator graph.

        When ``cube`` is given it is instantiated on ``cube_placement``, or on
        all nodes when no placement is given. ``cube_placement`` without a
        ``cube`` is rejected by an assertion. The controller's reply is printed.
        """
        if cube is not None:
            if cube_placement is not None:
                # Was misspelled "instantiante_on", which raised AttributeError.
                cube.instantiate_on(cube_placement)
            else:
                cube.instantiate_on(self.all_nodes())
        else:
            assert cube_placement is None
        logger.info("Sending create request to controller...")
        resp = self.client.ctrl_rpc(op_graph.get_deploy_pb(), True)
        print(resp)

    def deploy_pb(self, req):
        """Deploys an operator graph; returns an integer ID or an error message"""
        logger.info("Sending create request to controller...")
        resp = self.client.ctrl_rpc(req, True)
        if resp.type == ControlMessage.OK:
            logger.info("Started job %i" % resp.started_comp_id)
            return resp.started_comp_id
        logger.error("Failed to start job:" + resp.error_msg.msg)
        return resp.error_msg.msg

    def stop_computation(self, comput_id):
        """Asks the controller to stop computation ``comput_id``; returns its reply."""
        req = ControlMessage()
        req.type = ControlMessage.STOP_COMPUTATION
        req.comp_to_stop = int(comput_id)
        return self.client.ctrl_rpc(req, True)
class TestOpIntegration(unittest.TestCase):
    """Repeatedly deploys and stops a multi-worker graph on jsnoded worker processes."""

    def setUp(self):
        """Start a controller and connect a raw client and a RemoteController to it."""
        self.controller = Controller(('localhost', 0))
        self.controller.start()
        print("controller bound to %s:%d" % self.controller.address)
        self.client = JSClient(self.controller.address)
        self.server = RemoteController()
        self.server.connect(self.controller.address[0], self.controller.address[1])
        self.workers = []

    def tearDown(self):
        """Close the client, stop the controller and kill every spawned worker."""
        # Local import: this block did not previously use the signal module.
        import signal

        self.client.close()
        self.controller.stop()
        for worker in self.workers:
            try:
                # Workers run under a shell in their own session (os.setsid),
                # so signal the whole process group; Popen.terminate() would
                # only signal the shell.
                os.killpg(worker.pid, signal.SIGTERM)
                print("killing workers WORKED")
            except OSError:
                print("killing workers FAILED")
        self.workers = []

    def start_workers(self, num_workers):
        """Spawn ``num_workers`` jsnoded processes on consecutive web ports from 8083."""
        webPortMin = 8083
        for offset in range(num_workers):
            # Create a worker
            jsnode_cmd = "./jsnoded -a localhost:%d -w %d --start -C ./config/local_bare.conf" % (
                self.controller.address[1], webPortMin + offset)
            print("starting %s" % jsnode_cmd)
            # Register each process right away so tearDown can kill it even if
            # a later spawn fails.
            self.workers.append(
                subprocess.Popen(jsnode_cmd, shell=True, preexec_fn=os.setsid))
            time.sleep(1)
        # Give the workers time to register with the controller
        time.sleep(3)

    def verify_workers(self, num_workers):
        """Assert that the controller reports exactly ``num_workers`` nodes."""
        # Get the list of workers
        req = ControlMessage()
        req.type = ControlMessage.GET_NODE_LIST_REQ
        buf = self.client.do_rpc(req, True)
        resp = ControlMessage()
        resp.ParseFromString(buf)
        workersEp = resp.nodes
        self.assertEqual(len(workersEp), num_workers)

    def test_topk(self):
        """Deploy and stop the graph five times; every worker must be RUNNING each time."""
        num_workers = 2
        self.start_workers(num_workers)
        self.verify_workers(num_workers)

        root_node = self.server.get_a_node()
        self.assertTrue(isinstance(root_node, NodeID))
        all_nodes = self.server.all_nodes()

        for _ in range(5):
            g = get_graph(root_node, all_nodes, rate=1000)
            req = g.get_deploy_pb()
            cid = self.server.deploy_pb(req)
            # deploy_pb returns an integer ID on success and an error message
            # otherwise; a failed deployment must fail the test rather than
            # end the loop quietly.
            if not isinstance(cid, types.IntType):
                self.fail("computation failed %s" % (cid,))
            print("%s Computation running; ID = %s" % (time.ctime(), cid))

            time.sleep(3)
            workerList = self.controller.get_nodes()
            self.assertEqual(len(workerList), num_workers)
            #self.assertEquals(len(workerList[0].assignments), 1)
            for worker in workerList:
                self.assertEqual(list(worker.assignments.values())[0].state,
                                 WorkerAssignment.RUNNING)

            self.server.stop_computation(cid)
            print("%s Computation stopped; ID = %s" % (time.ctime(), cid))
            time.sleep(2)
class TestOpIntegration(unittest.TestCase):
    """Deploys operators onto jsnoded worker processes and checks they run."""

    def setUp(self):
        """Start a controller, connect a client and reset the worker process list."""
        self.controller = Controller(('localhost', 0))
        self.controller.start()
        print("controller bound to %s:%d" % self.controller.address)
        self.client = JSClient(self.controller.address)
        self.workers = list()

    def tearDown(self):
        """Close the client, stop the controller and kill every worker process group."""
        self.client.close()
        self.controller.stop()
        for proc in self.workers:
            try:
                os.killpg(proc.pid, signal.SIGTERM)
            except OSError as err:
                # Keep going so one vanished worker does not leave the rest alive.
                print("could not kill worker %d: %s" % (proc.pid, err))
        self.workers = list()

    def _start_worker(self, webPort=None):
        """Spawn one jsnoded worker (optionally with web port ``webPort``) and register it for cleanup."""
        if webPort is None:
            jsnode_cmd = "./jsnoded -a localhost:%d --start -C ./config/datanode.conf" % (
                self.controller.address[1])
        else:
            jsnode_cmd = "./jsnoded -a localhost:%d -w %d --start -C ./config/datanode.conf" % (
                self.controller.address[1], webPort)
        print("starting %s" % jsnode_cmd)
        workerProc = subprocess.Popen(jsnode_cmd, shell=True, preexec_fn=os.setsid)
        # Register immediately so tearDown kills it even if the test fails next.
        self.workers.append(workerProc)
        return workerProc

    def test_operator(self):
        """A single DummyReceiver deployed on the only worker reaches RUNNING."""
        # Create a worker and give it enough time to heartbeat (i.e. register
        # with the controller)
        self._start_worker()
        time.sleep(2)

        # Tell the controller to deploy a topology (it will be deployed on the
        # only worker)
        compID = 17
        req = ControlMessage()
        req.type = ControlMessage.ALTER
        req.alter.computationID = compID
        newTask = req.alter.toStart.add()
        newTask.op_typename = "DummyReceiver"
        newTask.id.computationID = req.alter.computationID
        newTask.id.task = 2

        buf = self.client.do_rpc(req, True)
        resp = ControlMessage()
        resp.ParseFromString(buf)
        self.assertEqual(resp.type, ControlMessage.OK)
        # Make sure the controller created state for this computation
        self.assertEqual(len(self.controller.computations), 1)

        # Wait for the topology to start running on the worker
        time.sleep(2)
        workerList = self.controller.get_nodes()
        self.assertEqual(len(workerList), 1)
        self.assertEqual(len(workerList[0].assignments), 1)
        self.assertEqual(list(workerList[0].assignments.values())[0].state,
                         WorkerAssignment.RUNNING)

    def test_multiple_operators(self):
        """Ten single-operator computations deployed on one worker all reach RUNNING."""
        # Create a worker and give it enough time to heartbeat (i.e. register
        # with the controller)
        self._start_worker()
        time.sleep(2)

        # Tell the controller to deploy multiple single-node topologies (they
        # will all be deployed on the only worker)
        numComps = 10
        req = ControlMessage()
        resp = ControlMessage()
        for compID in range(1, numComps + 1):
            req.type = ControlMessage.ALTER
            req.alter.Clear()
            req.alter.computationID = compID
            newTask = req.alter.toStart.add()
            newTask.id.computationID = compID
            newTask.id.task = 2
            newTask.op_typename = "StringGrep"
            opCfg = newTask.config.add()
            opCfg.opt_name = "pattern"
            opCfg.val = ".*"

            buf = self.client.do_rpc(req, True)
            resp.Clear()
            resp.ParseFromString(buf)
            self.assertEqual(resp.type, ControlMessage.OK)
            # Make sure the controller created state for each computation
            self.assertEqual(len(self.controller.computations), compID)

        # Wait for the topologies to start running on the worker
        time.sleep(2)
        workerList = self.controller.get_nodes()
        self.assertEqual(len(workerList), 1)
        for assignment in workerList[0].assignments.values():
            self.assertEqual(assignment.state, WorkerAssignment.RUNNING)

    def test_operator_chain(self):
        """A sender -> filters -> receiver chain, one operator per worker, delivers its tuples."""
        # Use at least 2 workers
        numWorkers = 5
        webPortMin = 8082
        for offset in range(numWorkers):
            # Create a worker
            self._start_worker(webPortMin + offset)
        # Give the workers time to register with the controller
        time.sleep(3)

        # Get the list of workers
        req = ControlMessage()
        req.type = ControlMessage.GET_NODE_LIST_REQ
        buf = self.client.do_rpc(req, True)
        resp = ControlMessage()
        resp.ParseFromString(buf)
        workersEp = resp.nodes
        self.assertEqual(len(workersEp), numWorkers)

        # Issue a query that runs an operator on each worker: send some tuples
        # from a source, filter them through the remaining workers and collect
        # at the end.
        req = ControlMessage()
        assignedOps = []
        compID = 17
        # Make this number unique so we can check it later
        numTuples = 193
        req.type = ControlMessage.ALTER
        req.alter.computationID = compID
        lastIndex = len(workersEp) - 1
        for i, endpoint in enumerate(workersEp):
            task = req.alter.toStart.add()
            task.id.computationID = compID
            task.id.task = i + 1  # start task numbers at 1
            # Pin the operator to the current worker
            task.site.address = endpoint.address
            task.site.portno = endpoint.portno
            if i == 0:
                # Send some tuples from the first worker; use a continuous
                # sender so the chain doesn't get torn down.
                task.op_typename = "ContinuousSendK"
                opCfg = task.config.add()
                opCfg.opt_name = "k"
                opCfg.val = str(numTuples)
                opCfg = task.config.add()
                opCfg.opt_name = "period"
                # Use a large period so we only have to check for one batch of tuples
                opCfg.val = str(10000)
            elif i == lastIndex:
                # Collect tuples at the last worker
                task.op_typename = "DummyReceiver"
            else:
                # Insert no-op filters in between
                task.op_typename = "StringGrep"
                opCfg = task.config.add()
                opCfg.opt_name = "pattern"
                opCfg.val = ".*"
                opCfg2 = task.config.add()
                opCfg2.opt_name = "id"
                opCfg2.val = "0"
            assignedOps.append(task)
            if i > 0:
                # Create an edge from the previous operator to this one
                e = req.alter.edges.add()
                e.src = task.id.task - 1
                e.dest = task.id.task
                e.computation = compID

        # Deploy the query
        buf = self.client.do_rpc(req, True)
        resp = ControlMessage()
        resp.ParseFromString(buf)
        self.assertEqual(ControlMessage.OK, resp.type)
        # The controller overwrites the computation ID, so get it again
        self.assertEqual(1, len(self.controller.computations))
        compID = list(self.controller.computations.keys())[0]
        time.sleep(2)

        # Make sure the operators were started, one per worker
        for offset, endpoint in enumerate(workersEp):
            # GET the web interface of each worker
            url = "http://" + endpoint.address + ":" + str(webPortMin + offset) + "/"
            getResp = urllib2.urlopen(url).read()
            print(getResp)
            # Figure out which operator is on this worker based on identifying
            # information; iterate over a copy because matches are removed.
            for op in assignedOps[:]:
                opName = op.op_typename
                opId = "(" + str(compID) + "," + str(op.id.task) + ")"
                if (opName in getResp) and (opId in getResp):
                    assignedOps.remove(op)
                    # If this is the final receiver, check the received tuple
                    # count (the page itself was already printed above)
                    if opName == "DummyReceiver":
                        self.assertTrue(str(numTuples) in getResp)
                    break

        # We should have matched all operators, one per worker
        print("length at end is " + str(len(assignedOps)))
        self.assertEqual(0, len(assignedOps))