def schedule_link_join(self, time, graph, origin, destination):
    current_graph = self.get_current_graph(graph)
    new_graph = self.initialize_new_graph(current_graph)

    # collect previously removed links that connect origin and destination (in either direction)
    joining_links = []
    for link in new_graph.removed_links:
        if (link.source.name == origin and link.destination.name == destination) or \
           (link.source.name == destination and link.destination.name == origin):
            joining_links.append(link)

    # move them back into the active link set
    for link in joining_links:
        new_graph.removed_links.remove(link)
        new_graph.links.append(link)

    # re-attach the links to the service instances and bridges they originate from
    for l in joining_links:
        for node in new_graph.services:
            if l.source == node:
                for nodeinstance in new_graph.services[node]:
                    nodeinstance.links.append(l)
        for bridge in new_graph.bridges:
            if l.source == new_graph.bridges[bridge][0]:
                new_graph.bridges[bridge][0].links.append(l)

    self.recompute_and_store(new_graph, time)
    print_message("Link " + origin + "--" + destination + " scheduled to join at " + str(time))
    self.graph_changes.append((time, [graph, new_graph]))
def schedule_link_leave(self, time, graph, origin, destination):
    current_graph = self.get_current_graph(graph)
    new_graph = self.initialize_new_graph(current_graph)

    for l in new_graph.links:
        if (l.source.name == origin and l.destination.name == destination) or \
           (l.source.name == destination and l.destination.name == origin):
            new_graph.removed_links.append(l)

    for l in new_graph.removed_links:
        if l in new_graph.links:  # and has not been removed before
            new_graph.links.remove(l)
            for node in new_graph.services:
                for nodeinstance in new_graph.services[node]:
                    if l in nodeinstance.links:
                        nodeinstance.links.remove(l)
            for bridge in new_graph.bridges:
                if l in new_graph.bridges[bridge][0].links:
                    new_graph.bridges[bridge][0].links.remove(l)

    self.recompute_and_store(new_graph, time)
    print_message("Link " + origin + "--" + destination + " scheduled to leave at " + str(time))
    self.graph_changes.append((time, [graph, new_graph]))
def __init__(self, net_graph, event_scheduler):
    self.graph = net_graph  # type: NetGraph
    self.scheduler = event_scheduler  # type: EventScheduler
    self.active_paths = []  # type: List[NetGraph.Path]
    self.active_paths_ids = []  # type: List[int]
    self.flow_accumulator = {}  # type: Dict[str, List[List[int], int, int]]
    self.state_lock = Lock()
    self.last_time = 0
    # self.delayed_flows = 0

    EmulationCore.POOL_PERIOD = float(
        environ.get(ENVIRONMENT.POOL_PERIOD, str(EmulationCore.POOL_PERIOD)))
    EmulationCore.ITERATIONS_TO_INTEGRATE = int(
        environ.get(ENVIRONMENT.ITERATION_COUNT, str(EmulationCore.ITERATIONS_TO_INTEGRATE)))
    print_message("Pool Period: " + str(EmulationCore.POOL_PERIOD))
    # print_message("Iteration Count: " + str(EmulationCore.ITERATIONS_TO_INTEGRATE))

    self.check_flows_time_delta = 0

    # We need to give the callback a reference to ourselves (kind of hackish...)
    global emuManager
    emuManager = self

    if getenv('RUNTIME_EMULATION', 'true') != 'false':
        self.comms = CommunicationsManager(self.collect_flow, self.graph, self.scheduler)
def path_change(graphs):
    start = time()
    graph = graphs[0]
    new_graph = graphs[1]

    try:
        # is a service not reachable after this change? Then set packet loss to 100%
        to_remove = []
        for service in graph.paths:
            if not service in new_graph.paths and isinstance(service, NetGraph.Service):
                to_remove.append(service)
                change_loss(service, 1.0)
        for service in to_remove:
            del graph.paths[service]

        graph.links_by_index = new_links_by_index(new_graph.links_by_index, graph.links_by_index)  # update necessary??

        # apply paths that do exist now and *were* already in the last graph...
        new_paths_by_id = {}
        for service in new_graph.paths:
            if service in graph.paths:
                if isinstance(service, NetGraph.Service) and not service == graph.root:
                    current_bw = graph.paths[service].current_bandwidth
                    new_path = new_graph.paths[service]
                    with graph.paths[service].lock:
                        new_path.links = [graph.links_by_index[link.index] for link in new_path.links]
                        graph.paths[service] = new_path
                        # the new paths have the clean maximum computed. Here we need the bookkeeping of the old path.
                        graph.paths[service].current_bandwidth = current_bw
                        change_loss(service, graph.paths[service].drop)
                        change_latency(service, graph.paths[service].latency, graph.paths[service].jitter)
                        new_paths_by_id[new_path.id] = new_path
            # ... or not
            else:  # service is now reachable after not having been reachable
                if isinstance(service, NetGraph.Service):
                    # there is no old path entry to lock, so lock the new path while installing it
                    new_path = new_graph.paths[service]
                    with new_path.lock:
                        graph.paths[service] = new_path
                        graph.paths[service].links = update_links(
                            [link.index for link in graph.paths[service].links], graph.links_by_index)
                        graph.paths[service].current_bandwidth = 0
                        change_loss(service, graph.paths[service].drop)
                        change_latency(service, graph.paths[service].latency, graph.paths[service].jitter)
                        new_paths_by_id[new_path.id] = new_path

        graph.paths_by_id = new_paths_by_id
        graph.links_by_index = new_graph.links_by_index  # update necessary??

    except Exception as e:
        print_message("Error updating paths: " + str(e))

    end = time()
    print_message("recalculated in " + '{p:.4f}'.format(p=end - start))
def schedule_bridge_join(self, time, graph, name):
    current_graph = self.get_current_graph(graph)
    new_graph = self.initialize_new_graph(current_graph)

    bridge = new_graph.removed_bridges[name]
    del new_graph.removed_bridges[name]
    new_graph.bridges[name] = bridge

    self.recompute_and_store(new_graph, time)
    print_message("Bridge " + name + " scheduled to join back at " + str(time))
    self.graph_changes.append((time, [graph, new_graph]))
def schedule_new_link(self, time, graph, source, destination, latency, jitter, drop, bandwidth, network):
    current_graph = self.get_current_graph(graph)
    new_graph = self.initialize_new_graph(current_graph)

    new_graph.new_link(source, destination, latency, jitter, drop, bandwidth, network)

    self.recompute_and_store(new_graph, time)
    print_message("Link " + source + "--" + destination + " scheduled to newly join at " + str(time))
    self.graph_changes.append((time, [graph, new_graph]))
def schedule_bridge_leave(self, time, graph, name):
    current_graph = self.get_current_graph(graph)
    new_graph = self.initialize_new_graph(current_graph)

    # hack to still find the bridge in XMLGraphParser at startup time
    # it doesn't matter to have it there, because it will be overwritten by dynamic changes at runtime
    graph.removed_bridges[name] = new_graph.bridges[name]

    new_graph.removed_bridges[name] = new_graph.bridges[name]
    del new_graph.bridges[name]

    self.recompute_and_store(new_graph, time)
    print_message("Bridge " + name + " scheduled to leave at " + str(time))
    self.graph_changes.append((time, [graph, new_graph]))
def start_god_container(self, label):
    while True:
        try:
            # If we are bootstrapper:
            us = None
            while not us:
                containers = self.high_level_client.containers.list()
                for container in containers:
                    if "boot" + label in container.labels:
                        us = container
                sleep(1)

            boot_image = us.image

            inspect_result = self.low_level_client.inspect_container(us.id)
            env = inspect_result["Config"]["Env"]

            print_message("[Py (bootstrapper)] ip: " + str(socket.gethostbyname(socket.gethostname())))

            # create a "God" container that is in the host's Pid namespace
            self.high_level_client.containers.run(
                image=boot_image,
                command=["-g", label, str(us.id)],
                privileged=True,
                pid_mode="host",
                network="host",
                shm_size=int(os.getenv('SHM_SIZE', '8000000000')),
                remove=True,
                name="god_" + str(random.getrandbits(64)),  # grep friendly
                environment=env,
                volumes_from=[us.id],
                # network_mode="container:" + us.id,  # share the network stack with this container
                # network='test_overlay',
                labels=["god" + label],
                detach=True)
                # stderr=True,
                # stdout=True)

            print_named("bootstrapper", "Started God container. Waiting for experiment to finish...")
            pause()
            return

        except Exception as e:
            print_error(e)
            sleep(5)
            continue  # If we get any exceptions try again
def schedule_link_change(self, time, graph, origin, destination, bandwidth, latency, jitter, drop):
    current_graph = self.get_current_graph(graph)
    new_graph = self.initialize_new_graph(current_graph)

    for link in new_graph.links:
        if link.source.name == origin and link.destination.name == destination:
            link.bandwidth_bps = bandwidth if bandwidth >= 0 else link.bandwidth_bps
            link.latency = float(latency) if latency >= 0 else link.latency
            link.jitter = float(jitter) if jitter >= 0 else link.jitter
            link.drop = float(drop) if drop >= 0 else link.drop

    self.recompute_and_store(new_graph, time)
    print_message("Link " + origin + "--" + destination + " scheduled to change at " + str(time))
    self.graph_changes.append((time, [graph, new_graph]))
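
# Usage sketch (hypothetical helper, not part of the original code): negative
# values act as "leave unchanged" sentinels, mirroring how parse_schedule()
# below passes -1 for attributes that are absent from a <schedule> element.
# The node names and values here are illustrative only.
def _example_schedule_link_change(scheduler, graph):
    scheduler.schedule_link_change(30.0, graph, "client", "server",
                                   bandwidth=-1,   # keep current bandwidth
                                   latency=50.0,   # set a new latency
                                   jitter=-1,      # keep current jitter
                                   drop=0.01)      # set a new drop probability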
def parse_schedule(self, service, graph):
    """
    :param service: NetGraph.Service
    :return:
    """
    XMLtree = ET.parse(self.file)
    root = XMLtree.getroot()

    if root.tag != 'experiment':
        print_and_fail('Not a valid Kollaps topology file, root is not <experiment>')

    dynamic = None
    for child in root:
        if child.tag == 'dynamic':
            if dynamic is not None:
                print_and_fail("Only one <dynamic> block is allowed.")
            dynamic = child

    scheduler = EventScheduler()
    first_join = -1.0
    first_leave = float('inf')

    # if there is no dynamic block then this instance joins straight away
    if dynamic is None:
        scheduler.schedule_join(0.0)
        return scheduler

    seed(12345)
    replicas = []
    for i in range(service.replica_count):
        replicas.append([False, False, False])  # Joined = False, Disconnected = False, Used = False

    # indexes for replicas entries
    JOINED = 0
    DISCONNECTED = 1
    USED = 2

    # there is a dynamic block, so check if there is anything scheduled for us
    for event in dynamic:
        if event.tag != 'schedule':
            print_and_fail("Only <schedule> is allowed inside <dynamic>")

        # parse time of event
        time = 0.0
        try:
            time = float(event.attrib['time'])
            if time < 0.0:
                print_and_fail("time attribute must be a positive number")
        except ValueError as e:
            print_and_fail("time attribute must be a valid real number")

        if 'name' in event.attrib and 'time' in event.attrib and 'action' in event.attrib:
            node_name = event.attrib['name']

            bridge_names = []
            for bridge in list(graph.bridges.keys()) + list(graph.removed_bridges.keys()):
                bridge_names.append(bridge)

            # if a bridge is scheduled
            if node_name in bridge_names:
                if event.attrib['action'] == 'join':
                    scheduler.schedule_bridge_join(time, graph, node_name)
                elif event.attrib['action'] == 'leave':
                    scheduler.schedule_bridge_leave(time, graph, node_name)
                continue

            # parse name of service. only process actions that target us
            if node_name != service.name:
                continue

            # parse amount of replicas affected
            amount = 1
            if 'amount' in event.attrib:
                amount = int(event.attrib['amount'])

            # parse action
            if event.attrib['action'] == 'join':
                for i in range(amount):
                    available = False
                    id = 0
                    # Pick a random replica
                    while not available:
                        id = randrange(0, service.replica_count)
                        available = not replicas[id][JOINED]
                        if not service.reuse_ip:
                            available = available and not replicas[id][USED]
                    # Mark the state
                    replicas[id][JOINED] = True
                    if not service.reuse_ip:
                        replicas[id][USED] = True
                    # if its us, schedule the action
                    if service.replica_id == id:
                        scheduler.schedule_join(time)
                        print_message(service.name + " replica " + str(service.replica_id) +
                                      " scheduled to join at " + str(time))
                if first_join < 0.0:
                    first_join = time

            elif event.attrib['action'] == 'leave' or event.attrib['action'] == 'crash':
                for i in range(amount):
                    up = False
                    id = 0
                    # Pick a random replica
                    while not up:
                        id = randrange(0, service.replica_count)
                        up = replicas[id][JOINED]
                    # Mark the state
                    replicas[id][JOINED] = False
                    # if its us, schedule the action
                    if service.replica_id == id:
                        if event.attrib['action'] == 'leave':
                            scheduler.schedule_leave(time)
                            print_message(service.name + " replica " + str(service.replica_id) +
                                          " scheduled to leave at " + str(time))
                        elif event.attrib['action'] == 'crash':
                            scheduler.schedule_crash(time)
                            print_message(service.name + " replica " + str(service.replica_id) +
                                          " scheduled to crash at " + str(time))
                if first_leave > time:
                    first_leave = time

            elif event.attrib['action'] == 'reconnect':
                for i in range(amount):
                    disconnected = False
                    id = 0
                    # Pick a random replica
                    while not disconnected:
                        id = randrange(0, service.replica_count)
                        disconnected = replicas[id][DISCONNECTED]
                    # Mark the state
                    replicas[id][DISCONNECTED] = False
                    # if its us, schedule the action
                    if service.replica_id == id:
                        print_message(service.name + " replica " + str(service.replica_id) +
                                      " scheduled to reconnect at " + str(time))
                        scheduler.schedule_reconnect(time)

            elif event.attrib['action'] == 'disconnect':
                for i in range(amount):
                    connected = False
                    id = 0
                    # Pick a random replica
                    while not connected:
                        id = randrange(0, service.replica_count)
                        connected = replicas[id][JOINED] and not replicas[id][DISCONNECTED]
                    # Mark the state
                    replicas[id][DISCONNECTED] = True
                    # if its us, schedule the action
                    if service.replica_id == id:
                        print_message(service.name + " replica " + str(service.replica_id) +
                                      " scheduled to disconnect at " + str(time))
                        scheduler.schedule_disconnect(time)

            else:
                print_and_fail("Unrecognized action: " + event.attrib['action'] +
                               " , allowed actions are join, leave, crash, disconnect, reconnect")

        # Do something dynamically with a link
        elif 'origin' in event.attrib and 'dest' in event.attrib and 'time' in event.attrib:
            # parse origin and destination
            origin = event.attrib['origin']
            destination = event.attrib['dest']

            if 'action' in event.attrib:
                # link is joining or leaving
                if event.attrib['action'] == 'leave':
                    scheduler.schedule_link_leave(time, graph, origin, destination)
                elif event.attrib['action'] == 'join':
                    # Link is already defined but has been removed before
                    if not 'upload' in event.attrib or not 'latency' in event.attrib:
                        scheduler.schedule_link_join(time, graph, origin, destination)
                    # A completely new link with defined properties joins
                    elif not 'upload' in event.attrib and not 'latency' in event.attrib and not 'network' in event.attrib:
                        print_and_fail("Link description incomplete. For a new link, you must provide "
                                       "at least latency, upload, and network attributes.")
                    else:
                        bandwidth = event.attrib['upload']
                        latency = float(event.attrib['latency'])
                        drop = 0
                        if 'drop' in event.attrib:
                            drop = float(event.attrib['drop'])
                        jitter = 0
                        if 'jitter' in event.attrib:
                            jitter = float(event.attrib['jitter'])
                        network = event.attrib['network']
                        scheduler.schedule_new_link(time, graph, origin, destination,
                                                    latency, jitter, drop, bandwidth, network)
                        if 'download' in event.attrib:
                            bandwidth = event.attrib['download']
                            scheduler.schedule_new_link(time, graph, destination, origin,
                                                        latency, jitter, drop, bandwidth, network)
                else:
                    print_and_fail("Unrecognized action for link: " + event.attrib['action'] +
                                   ", allowed are join and leave")
            else:
                # properties of link are changing
                bandwidth = -1
                if 'upload' in event.attrib:
                    bandwidth = graph.bandwidth_in_bps(event.attrib['upload'])
                latency = -1
                if 'latency' in event.attrib:
                    latency = float(event.attrib['latency'])
                drop = -1
                if 'drop' in event.attrib:
                    drop = float(event.attrib['drop'])
                jitter = -1
                if 'jitter' in event.attrib:
                    jitter = float(event.attrib['jitter'])
                scheduler.schedule_link_change(time, graph, origin, destination,
                                               bandwidth, latency, jitter, drop)
        else:
            print_and_fail('<schedule> must have either name, time and action attributes,' +
                           ' or link origin dest and properties attributes')

    # deal with auto join
    if first_join < 0.0:
        print_message(service.name + " scheduled to join at " + str(0.0))
        scheduler.schedule_join(0.0)
    if first_leave < first_join:
        print_and_fail("Dynamic: service " + service.name + " leaves before having joined")

    scheduler.schedule_graph_changes()

    return scheduler
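
# Example of the <dynamic> block consumed by parse_schedule() (a minimal sketch;
# the element and attribute names follow the checks above, while the service,
# bridge and link names and all values are purely illustrative):
#
#   <experiment>
#     ...
#     <dynamic>
#       <schedule time="0.0"   name="server"   action="join"/>
#       <schedule time="10.0"  name="client"   action="join" amount="2"/>
#       <schedule time="30.0"  origin="client" dest="bridge1" latency="50" jitter="1" drop="0.01"/>
#       <schedule time="60.0"  name="bridge1"  action="leave"/>
#       <schedule time="90.0"  origin="client" dest="bridge1" action="leave"/>
#       <schedule time="120.0" name="client"   action="crash" amount="1"/>
#     </dynamic>
#   </experiment>
#
# A <schedule> entry with a name targets a service or bridge; one with origin/dest
# and no action changes link properties; one with origin/dest and an action makes
# a link leave or (re)join.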
def main():
    if len(sys.argv) < 4:
        print_and_fail("Missing arguments. emucore <topology> <container id>")
    else:
        topology_file = sys.argv[1]

    # For future reference: This topology file must not exceed 512KB otherwise docker refuses
    # to copy it as a config file, this has happened with the 2k scale-free topology...

    setup_container(sys.argv[2], sys.argv[3])

    # Because of the bootstrapper hack we cant get output from the emucore through standard docker logs...
    # sys.stdout = open("/var/log/need.log", "w")
    # sys.stderr = sys.stdout

    graph = NetGraph()

    parser = XMLGraphParser(topology_file, graph)
    parser.fill_graph()
    print_message("Done parsing topology")

    print_message("Resolving hostnames...")
    graph.resolve_hostnames()
    print_message("All hosts found!")

    print_message("Determining the root of the tree...")
    # Get our own ip address and set the root of the "tree"
    ownIP = get_own_ip(graph)
    graph.root = graph.hosts_by_ip[ip2int(ownIP)]

    if graph.root is None:
        print_and_fail("Failed to identify current service instance in topology!")
    print_message("We are " + graph.root.name + "@" + ownIP)

    print_identified(graph, "Calculating shortest paths...")
    graph.calculate_shortest_paths()

    print_message("Parsing dynamic event schedule...")
    scheduler = parser.parse_schedule(graph.root, graph)

    signal(SIGTERM, lambda signum, frame: exit(0))

    print_message("Initializing network emulation...")
    manager = EmulationCore(graph, scheduler)
    manager.initialize()

    print_identified(graph, "Waiting for command to start experiment")
    sys.stdout.flush()
    sys.stderr.flush()

    if getenv('RUNTIME_EMULATION', 'true') != 'false':
        # Enter the emulation loop
        manager.emulation_loop()
def receive_dashboard_commands(self):
    self.dashboard_socket.listen()
    while True:
        connection, addr = self.dashboard_socket.accept()
        connection.settimeout(5)
        try:
            data = connection.recv(1)
            if data:
                command = struct.unpack("<1B", data)[0]

                if command == CommunicationsManager.STOP_COMMAND:
                    connection.close()
                    with self.stop_lock:
                        print_message("Stopping experiment")
                        self.broadcast_groups = []
                        # TODO Stop is now useless, probably best to just replace with shutdown

                elif command == CommunicationsManager.SHUTDOWN_COMMAND:
                    print_message("Received Shutdown command")

                    msg = "packets: recv " + str(self.received) + ", prod " + str(self.produced)
                    print_identified(self.graph, msg)

                    connection.send(struct.pack("<3Q", self.produced, 50, self.received))
                    ack = connection.recv(1)
                    if len(ack) != 1:
                        print_error("Bad ACK len:" + str(len(ack)))
                        connection.close()
                        continue
                    if struct.unpack("<1B", ack)[0] != CommunicationsManager.ACK:
                        print_error("Bad ACK, not an ACK " + str(struct.unpack("<1B", ack)))
                        connection.close()
                        continue
                    connection.close()

                    with self.stop_lock:
                        # self.process_pool.terminate()
                        # self.process_pool.join()
                        self.dashboard_socket.close()
                        for s in self.broadcast_sockets:
                            s.close()
                        # self.sock.close()
                        PathEmulation.tearDown()
                        print_identified(self.graph, "Shutting down")
                        sys.stdout.flush()
                        sys.stderr.flush()
                        stop_experiment()
                        interrupt_main()
                        return

                elif command == CommunicationsManager.READY_COMMAND:
                    connection.send(struct.pack("<1B", CommunicationsManager.ACK))
                    connection.close()

                elif command == CommunicationsManager.START_COMMAND:
                    connection.close()
                    print_message("Starting Experiment!")
                    self.scheduler.start()

        except OSError as e:
            continue  # Connection timed out (most likely)
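
# Dashboard-side sketch (hypothetical helper, not part of the original code):
# sending the START command handled by receive_dashboard_commands() above.
# It uses the same single-byte "<1B" packing and the same TCP_PORT that
# stopExperiment() below uses for STOP/SHUTDOWN; host_ip is illustrative.
import socket
import struct

def _example_send_start(host_ip):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.settimeout(2)
    s.connect((host_ip, CommunicationsManager.TCP_PORT))
    s.send(struct.pack("<1B", CommunicationsManager.START_COMMAND))
    s.close()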
def stopExperiment():
    with DashboardState.lock:
        if DashboardState.stopping or not DashboardState.ready:
            return
        else:
            DashboardState.stopping = True

    produced = 0
    received = 0

    gaps = []
    to_kill = []

    for node in DashboardState.hosts:
        host = DashboardState.hosts[node]
        if node.supervisor:
            continue
        to_kill.append(host)
    to_stop = to_kill[:]

    # Stop all services
    while to_stop:
        host = to_stop.pop()
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.settimeout(2)
            s.connect((host.ip, CommunicationsManager.TCP_PORT))
            s.send(struct.pack("<1B", CommunicationsManager.STOP_COMMAND))
            s.close()
        except OSError as e:
            print_error(e)
            to_stop.insert(0, host)
            sleep(0.5)

    # Collect sent/received statistics and shutdown
    while to_kill:
        host = to_kill.pop()
        try:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            s.settimeout(2)
            s.connect((host.ip, CommunicationsManager.TCP_PORT))
            s.send(struct.pack("<1B", CommunicationsManager.SHUTDOWN_COMMAND))
            data = s.recv(64)
            if len(data) < struct.calcsize("<3Q"):
                s.close()
                print_message("Got less than 24 bytes for counters.")
                to_kill.insert(0, host)
                continue
            s.send(struct.pack("<1B", CommunicationsManager.ACK))
            s.close()

            data_tuple = struct.unpack("<3Q", data)
            produced += data_tuple[0]
            received += data_tuple[2]
            with DashboardState.lock:
                host.status = 'Down'
            continue
        except OSError as e:
            print_error("timed out\n" + str(e))
            to_kill.insert(0, host)
            sleep(0.5)

    with DashboardState.lock:
        print_named("dashboard", "packets: recv " + str(received) + ", prod " + str(produced))
        sys.stdout.flush()
        if produced > 0:
            DashboardState.lost_packets = 1 - (received / produced)
        else:
            DashboardState.lost_packets = 0
        DashboardState.stopping = False