def experimentCleanup(container_name):
    '''Cleanup for experiments'''
    perturbs.stopChaos(container_name)
    container = container_api.getContainer(container_name)
    monitoring.stopMonitoring(container)
    container.stop()
    print('🦀🦀🦀 experiments aborted, cleaned up')
def premade_external(name, cmd, pid=None):
    '''Apply one of the premade, auto-completed perturbations to a container.'''
    container = container_api.getContainer(name)
    if not cmd:
        # Abort: no perturbation command was given.
        print('Invalid cmd %s' % cmd)
        exit()
    sysfault.applyFault(container, cmd, pid)
    print('Started sysfault %s on %s' % (cmd, container.name))
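# Usage sketch (assumption: `cmd` is one of the premade fault strings returned
# by perturbs.getPremadeFaults(), and `pid` targets a process inside the
# container; the container name and pid below are illustrative):
#
#   for fault_cmd in perturbs.getPremadeFaults():
#       premade_external('my-container', fault_cmd, pid=1234)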
def start(name):
    '''Start monitoring the container with the given name.'''
    # First do some simple verification of the command.
    container = container_api.getContainer(name)
    if container_api.hasMonitoring(name):
        print('Container %s already has monitoring' % name)
        return
    # Now start monitoring.
    monitoring.startMonitoring(container)
def start(name, delay_enter, delay_exit, error, signal, syscall, when):
    '''Start a syscall perturbation on the container with the given name.'''
    container = container_api.getContainer(name)
    # Example: injects the fault ENOENT ('Error NO ENTity') on open the first
    # time and every second time after that.
    fault = sysfault.Fault(delay_enter=delay_enter, delay_exit=delay_exit,
                           error=error, signal=signal, syscall=syscall,
                           when=when)
    sysfault.applyFault(container, fault, None)
    print('Started sysfault %s on %s' % (fault, container.name))
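# Hypothetical usage sketch matching the example in the comment above: inject
# ENOENT on open, the first time and every second time after that. The
# concrete argument values (container name, delays, `when` syntax) are
# illustrative assumptions:
#
#   start('my-container', delay_enter=0, delay_exit=0, error='ENOENT',
#         signal=None, syscall='open', when='1+2')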
def stop(name):
    '''Stop monitoring the container with the given name.'''
    container = container_api.getContainer(name)
    monitoring.stopMonitoring(container)
def stopChaos(name):
    '''Stop sysfault perturbations on the container with the given name.'''
    container = container_api.getContainer(name)
    sysfault.clearFaults(container)
    print('Stopped sysfault on %s' % container.name)
def start(name, exp_time, pid_name, start_cmd, stop_cmd):
    '''Run the experiment campaign on the container with the given name.'''
    container_name = name
    perturbations = perturbs.getPremadeFaults()
    enumerated_perturbations = enumerate(perturbations)
    container_to_cleanup.append(container_name)
    signal.signal(signal.SIGTERM, signalCleanup)
    signal.signal(signal.SIGINT, signalCleanup)

    # 0. Create the experiment directory.
    realpath = os.path.realpath('')
    experiment_dir = '%s/%s' % (realpath, container_name + '_exp')
    folders = None
    try:
        os.mkdir(experiment_dir)
    except FileExistsError:
        # An existing experiment run was found; can we continue it?
        # List all result directories and convert to int so they sort numerically.
        folders = os.listdir(experiment_dir)
        folders = list(map(int, folders))
        folders.sort()
        if len(folders) == 0:
            newest = 0
        else:
            # Find the first perturbation index that has no result directory yet.
            # 0..179 is the assumed upper bound on perturbation indices.
            goal_list_of_folders = set(range(0, 180))
            current_list_of_folders = set(folders)
            newest = sorted(goal_list_of_folders - current_list_of_folders)[0]
        # Skip perturbations that were already completed.
        enumerated_perturbations = enumerate(perturbations[newest:], newest)
        print('🦀 Existing experiment detected, continuing from %s/%s'
              % (newest, perturbations[newest]))

    # 1. Select a perturbation and start monitoring.
    length = len(perturbations)
    for index, p in enumerated_perturbations:
        print('🦀🦀🦀 %d/%d running experiment perturbation %s'
              % (index + 1, length, p))
        if folders is not None and index in folders:
            # Skip this one, as its output folder already exists.
            print('🦀 already done, skipping')
            continue
        # A) Start the container.
        start_proc = runCmd(start_cmd.split(' '))
        # B) Assume we need to wait a bit for the above to start.
        time.sleep(20)
        # C) Start monitoring. Take the first matching process; field 1 is the pid.
        container = container_api.getContainer(container_name)
        pid_to_monitor = container_api.getProcessesByNameExternal(
            container_name, pid_name)[0][1]
        monitoring.startMonitoring(container, pid_to_monitor)
        start_time = currentTimeS()

        # 2. Wait a predetermined amount of time.
        printSleep(exp_time, info_str='for baseline#1')

        # 3. Start the perturbation. Take the first matching process; field 1 is the pid.
        pid_to_perturb = container_api.getProcessesByNameLocal(
            container_name, pid_name)[0][1]
        perturbs.premade_external(container_name, p, pid=pid_to_perturb)

        # 4. Wait a predetermined amount of time.
        printSleep(exp_time, info_str='for perturbation')

        # 5. Stop the perturbation.
        print('🦀 finished perturbing')
        perturbs.stopChaos(container_name)
        printSleep(exp_time, info_str='for baseline#2')

        # 6. Log results to files for future processing.
        print('🦀 logging files')
        end_time = currentTimeS()
        time_span = end_time - start_time
        output_dir = '%s/%s' % (experiment_dir, index)
        os.mkdir(output_dir)  # Create the output directory.

        filename = '%s/%s' % (output_dir, p)
        # SYSCALL
        with open(filename + '_syscall.csv', 'w', newline='') as file:
            metrics.syscallQuery(name, end_time, timespan=time_span, csvfile=file)
        # NETWORK HTTP
        with open(filename + '_nethttp.csv', 'w', newline='') as file:
            metrics.networkQuery(name, end_time, timespan=time_span, csvfile=file)
        # CPU
        with open(filename + '_cpu.csv', 'w', newline='') as file:
            metrics.cpuQuery(name, end_time, timespan=time_span, csvfile=file)
        # RAM
        with open(filename + '_mem.csv', 'w', newline='') as file:
            metrics.memQuery(name, end_time, timespan=time_span, csvfile=file)
        # NETWORK RECEIVE
        with open(filename + '_netrec.csv', 'w', newline='') as file:
            metrics.netreceiveQuery(name, end_time, timespan=time_span, csvfile=file)
        # NETWORK TRANSMIT
        with open(filename + '_netsend.csv', 'w', newline='') as file:
            metrics.nettransmitQuery(name, end_time, timespan=time_span, csvfile=file)
        # DISK IO
        with open(filename + '_io.csv', 'w', newline='') as file:
            metrics.ioQuery(name, end_time, timespan=time_span, csvfile=file)
        # HTTP LATENCY
        with open(filename + '_latency.csv', 'w', newline='') as file:
            metrics.latencyQuery(name, end_time, timespan=time_span, csvfile=file)

        # E) Stop monitoring.
        monitoring.stopMonitoring(container)
        # D) Stop the container.
        start_proc.kill()
        if stop_cmd != '':
            print(stop_cmd)
            stop_proc = runCmd(stop_cmd.split(' '))
            stop_proc.wait()

        # 7. If not done, go to #2 and repeat #2-#7.
        # This happens by default, since we are in a loop.

    print('🦀 stopping monitoring')
    monitoring.stopMonitoring(container)
    print('🦀🦀🦀 experiments completed')
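# Hypothetical invocation of the experiment runner above. The container name,
# process name, experiment duration, and start/stop commands are illustrative
# assumptions, not values shipped with the tool:
#
#   start('my-container', exp_time=300, pid_name='java',
#         start_cmd='docker-compose up -d',
#         stop_cmd='docker-compose down')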