def test_order1(self):

    async def aprint(x):
        print(x)

    def job(n):
        return Job(aprint(n), label=n)

    sub1, sub2, sub3, sub4 = Scheduler(), Scheduler(), Scheduler(), Scheduler()
    sched = Scheduler(
        Sequence(
            job('top'),
            sub1,
            job('middle'),
            sub2,
            sub3,
            sub4))
    for i in range(3):
        sub1.add(job(i+1))
        sub2.add(job(i+4))
        sub3.add(job(i+7))
        sub4.add(job(i+10))
    sub4.add(job(13))

    produce_png(sched, "test_png_order1")
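# --- illustrative sketch, not part of the original suite ---
# produce_png is a helper of this test suite; its body is not shown here,
# so this minimal version is an assumption, built only on the
# export_as_pngfile() call that does appear later in this document
def produce_png_sketch(scheduler, name):
    # render the scheduler's requirement graph into <name>.png
    # (requires graphviz to be installed)
    scheduler.export_as_pngfile(name)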
def test_shutdown_nested_timeout(self):
    # here we create 16 jobs, whose shutdown durations will be
    # 0.0 0.1 0.2 0.3 - 1.0 1.1 1.2 1.3
    # 2.0 2.1 2.2 2.3 - 3.0 3.1 3.2 3.3
    # so if we set shutdown_timeout = 0.9s, we should
    # still find counter == 12
    cardinal = 4                        # squared, so 16 jobs in total
    top = CounterScheduler(label="TOP", shutdown_timeout=0.9)
    subs = []
    for i in range(cardinal):
        sub = Scheduler(label=f"SUB {i}")
        subs.append(sub)
        sub.add(
            Sequence(*[
                CounterJob(top, (10 * i + j) / 10, aprint('ok'),
                           label=10 * i + j)
                for j in range(cardinal)
            ]))
    top.add(Sequence(*subs))

    self.assertEqual(top.counter, 0)
    self.assertTrue(top.run())
    self.assertEqual(top.counter, cardinal * cardinal)
    self.assertFalse(top.shutdown())
    self.assertEqual(top.counter, cardinal * (cardinal - 1))
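# --- illustrative sketch, not part of the original suite ---
# CounterJob and CounterScheduler are test helpers whose code is not shown
# here; a plausible minimal version, assuming asynciojobs' Job co_run /
# co_shutdown hooks: the scheduler-level counter goes up when a job runs
# and down when its shutdown completes, which is what the assertions
# above measure
import asyncio

from asynciojobs import Job, Scheduler

class CounterSchedulerSketch(Scheduler):
    def __init__(self, *args, **kwds):
        self.counter = 0
        super().__init__(*args, **kwds)

class CounterJobSketch(Job):
    def __init__(self, counter_sched, duration, corun, **kwds):
        self.counter_sched = counter_sched
        self.duration = duration            # how long shutdown takes
        super().__init__(corun, **kwds)

    async def co_run(self):
        # one more job has run
        self.counter_sched.counter += 1
        await super().co_run()

    async def co_shutdown(self):
        # shutdown succeeds - and decrements - only if it fits
        # within the scheduler's shutdown_timeout
        await asyncio.sleep(self.duration)
        self.counter_sched.counter -= 1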
def test_sequence6(self):
    "adding a sequence"
    sched = Scheduler()
    a1 = J(sl(0.1), label=1)
    a2 = J(sl(0.1), label=2)
    a3 = J(sl(0.1), label=3)
    sched.add(Seq(a1, a2, a3))
    self.assertTrue(sched.orchestrate())
def test_display(self):

    class FakeTask:
        def __init__(self):
            self._result = 0
            self._exception = None

    def annotate_job_with_fake_task(job, state, boom):
        task = FakeTask()
        if state == "done":
            task._state = asyncio.futures._FINISHED
            job._task = task
            job._running = True
        elif state == "running":
            task._state = "NONE"
            job._task = task
            job._running = True
        elif state == "scheduled":
            task._state = "NONE"
            job._task = task
            job._running = False
        else:
            pass
        # here we assume that a job that has raised an exception is
        # necessarily done
        if boom:
            if state in ("idle", "scheduled", "running"):
                print("incompatible combination boom x idle - ignored")
                return
            else:
                job._task._exception = True
        return job

    class J(AbstractJob):
        pass

    sched = Scheduler()
    previous = None
    for state in "idle", "scheduled", "running", "done":
        for boom in True, False:
            for critical in True, False:
                for forever in True, False:
                    j = J(critical=critical, forever=forever,
                          label="forever={} crit.={} status={} boom={}"
                          .format(forever, critical, state, boom),
                          required=previous)
                    if annotate_job_with_fake_task(j, state, boom):
                        sched.add(j)
                        previous = j
    sched.list()
def check_expansion(self, *deferred_expected_s):
    s = Scheduler()
    formatters = {}
    for deferred, _ in deferred_expected_s:
        formatters[deferred] = f = CaptureFormatter()
        f.start_capture()
        n = SshNode(localhostname(), username=localuser(), formatter=f)
        s.add(SshJob(node=n, commands=Run(deferred)))
    s.run()
    for deferred, expected in deferred_expected_s:
        captured = formatters[deferred].get_capture()
        self.assertEqual(captured, expected)
def test_capture(self):
    s = Scheduler()
    f = CaptureFormatter()
    n = SshNode(localhostname(), username=localuser(), formatter=f)
    s.add(SshJob(node=n,
                 commands=[
                     Run("echo LINE1"),
                     Run("echo LINE2"),
                 ]))
    f.start_capture()
    s.run()
    captured = f.get_capture()
    expected = "LINE1\nLINE2\n"
    self.assertEqual(captured, expected)
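# --- illustrative sketch, not part of the original suite ---
# the CaptureFormatter pattern above in its smallest form, assuming
# apssh's CaptureFormatter and a localhost that accepts ssh for the
# current user
from asynciojobs import Scheduler
from apssh import SshNode, SshJob, Run, CaptureFormatter

fmt = CaptureFormatter()
node = SshNode("localhost", formatter=fmt)
sched = Scheduler(SshJob(node=node, command=Run("echo captured")))
fmt.start_capture()
sched.run()
assert fmt.get_capture() == "captured\n"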
def test_nested_cycles(self):
    watch = Watch()

    def job(i):
        return Job(co_print_sleep(watch, .2, f"job {i}"),
                   label=f"job{i}")

    js1, js2, js3 = [job(i) for i in range(11, 14)]
    s2 = Scheduler(Sequence(js1, js2, js3))
    j1, j3 = job(1), job(3)
    s1 = Scheduler(Sequence(j1, s2, j3))
    self.assertTrue(s1.check_cycles())
    # create a cycle in the subgraph
    js1.requires(js3)
    self.assertFalse(s1.check_cycles())
    # restore the OK state
    js1.requires(js3, remove=True)
    self.assertTrue(s1.check_cycles())
    # add a cycle at toplevel
    j1.requires(j3)
    self.assertFalse(s1.check_cycles())
    # restore the OK state
    j1.requires(j3, remove=True)
    self.assertTrue(s1.check_cycles())
    # add one level down
    s3 = Scheduler()
    jss1, jss2, jss3 = [job(i) for i in range(111, 114)]
    Sequence(jss1, jss2, jss3, scheduler=s3)
    # surgery in s2; no cycles
    s2.remove(js2)
    s2.sanitize()
    s2.add(s3)
    s3.requires(js1)
    js3.requires(s3)
    self.assertTrue(s1.check_cycles())
    # add a cycle in s3
    js1.requires(js3)
    self.assertFalse(s1.check_cycles())
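# --- illustrative sketch, not part of the original suite ---
# check_cycles in isolation: a two-job loop makes the graph unrunnable,
# and check_cycles() reports that without running anything
from asynciojobs import Scheduler, Job

async def noop():
    pass

a, b = Job(noop(), label='a'), Job(noop(), label='b')
a.requires(b)
b.requires(a)
assert not Scheduler(a, b).check_cycles()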
def test_shutdown_nested(self):
    cardinal = 4                        # squared, so 16 jobs in total
    top = CounterScheduler(label="TOP")
    subs = []
    for i in range(cardinal):
        sub = Scheduler(label=f"SUB {i}")
        subs.append(sub)
        sub.add(
            Sequence(*[
                CounterJob(top, 0, aprint('ok'), label=10 * i + j)
                for j in range(cardinal)
            ]))
    top.add(Sequence(*subs))

    self.assertEqual(top.counter, 0)
    self.assertTrue(top.run())
    self.assertEqual(top.counter, cardinal * cardinal)
    self.assertTrue(top.shutdown())
    self.assertEqual(top.counter, 0)
def populate_sched(self, scheduler, jobs, nested=0, pack_job=1):
    if nested != 0:
        for cpt_job, job in enumerate(jobs):
            if cpt_job % pack_job == 0:
                # open a new innermost scheduler, wrapped in
                # (nested-1) additional scheduler layers
                core_sched = Scheduler()
                top_sched = core_sched
                current_sched = core_sched
                for i in range(nested-1):
                    top_sched = Scheduler()
                    top_sched.add(current_sched)
                    current_sched = top_sched
            core_sched.add(job)
            if cpt_job % pack_job == 0:
                scheduler.add(top_sched)
        # scheds = [Scheduler(job, scheduler=scheduler) for job in jobs]
    else:
        for job in jobs:
            scheduler.add(job)
    return scheduler
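# --- illustrative sketch, not part of the original suite ---
# what populate_sched builds, assuming the asynciojobs Scheduler/Job APIs
# used throughout this file; the call below mirrors the helper's
# signature with self dropped for the example
from asynciojobs import Scheduler, Job

async def noop():
    pass

sched = Scheduler()
jobs = [Job(noop(), label=i) for i in range(4)]
# hypothetical call site:
# populate_sched(sched, jobs, nested=2, pack_job=2)
# with nested=2 and pack_job=2, sched ends up with 2 top-level
# schedulers, each wrapping one inner scheduler that carries 2 jobs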
def one_run(tx_power, phy_rate, antenna_mask, channel, *,
            run_name=default_run_name, slicename=default_slicename,
            load_images=False, node_ids=None,
            parallel=None,
            verbose_ssh=False, verbose_jobs=False, dry_run=False):
    """
    Performs data acquisition on all nodes with the following settings

    Arguments:
        tx_power: in dBm, a string like 5, 10 or 14
        phy_rate: a string among 1, 54
        antenna_mask: a string among 1, 3, 7
        channel: a string like e.g. 1 or 40
        run_name: the name for a subdirectory where all data will be kept;
                  successive runs should use the same name
                  for further visualization
        slicename: the Unix login name (slice name) to enter the gateway
        load_images: a boolean specifying whether nodes
                     should be re-imaged first
        node_ids: a list of node ids to run the scenario on; strings or
                  ints are OK; defaults to all 37 nodes, i.e. the whole testbed
        parallel: the number of simultaneous jobs to run
                  1 means all data acquisition is sequential (default)
                  0 means maximum parallel
    """

    # set default for the nodes parameter
    node_ids = [int(id) for id in node_ids] \
        if node_ids is not None else default_node_ids

    #
    # dry-run mode
    # just display a one-liner with parameters
    #
    if dry_run:
        load_msg = "" if not load_images else " LOAD"
        nodes = " ".join(str(n) for n in node_ids)
        print("dry-run: {run_name}{load_msg} -"
              " t{tx_power} r{phy_rate} a{antenna_mask} ch{channel} -"
              " nodes {nodes}"
              .format(**locals()))
        # in dry-run mode we are done
        return True

    ###
    # create the logs directory based on input parameters
    run_root = naming_scheme(run_name, tx_power, phy_rate,
                             antenna_mask, channel, autocreate=True)

    # the nodes involved
    faraday = SshNode(hostname=default_gateway, username=slicename,
                      formatter=TimeColonFormatter(), verbose=verbose_ssh)
    # this is a python dictionary that allows to retrieve a node object
    # from an id
    node_index = {
        id: SshNode(gateway=faraday, hostname=fitname(id), username="******",
                    formatter=TimeColonFormatter(), verbose=verbose_ssh)
        for id in node_ids
    }

    # the global scheduler
    scheduler = Scheduler(verbose=verbose_jobs)

    ##########
    check_lease = SshJob(
        scheduler=scheduler,
        node=faraday,
        verbose=verbose_jobs,
        critical=True,
        command=Run("rhubarbe leases --check"),
    )

    # load images if requested
    green_light = check_lease
    if load_images:
        # the nodes that we **do not** use should be turned off
        # so if we have selected e.g. nodes 10 12 and 15, we will do
        # rhubarbe off -a ~10 ~12 ~15, meaning all nodes except 10, 12 and 15
        negated_node_ids = ["~{}".format(id) for id in node_ids]
        # replace green_light in this case
        green_light = SshJob(
            node=faraday,
            required=check_lease,
            critical=True,
            scheduler=scheduler,
            verbose=verbose_jobs,
            commands=[
                Run("rhubarbe", "off", "-a", *negated_node_ids),
                Run("rhubarbe", "load", "-i", "u16-ath-noreg", *node_ids),
                Run("rhubarbe", "wait", *node_ids)
            ]
        )

    ##########
    # setting up the wireless interface on all nodes
    #
    # this is a python feature known as a list comprehension
    # we just create as many SshJob instances as we have
    # (id, SshNode) couples in node_index
    # and gather them all in init_wireless_jobs
    # they all depend on green_light
    #
    # provide node-utilities with the ranges/units it expects
    frequency = channel_frequency[int(channel)]
    # tx_power_in_mBm not in dBm
    tx_power_driver = tx_power * 100

    if load_images:
        # the first init_wireless_jobs always has troubles...
        # do it twice the first time (nasty hack)
        init_wireless_jobs = [
            SshJob(
                scheduler=scheduler,
                required=green_light,
                node=node,
                verbose=verbose_jobs,
                label="init {}".format(id),
                commands=[
                    RunScript("node-utilities.sh", "init-ad-hoc-network",
                              wireless_driver, "foobar", frequency, phy_rate,
                              antenna_mask, tx_power_driver),
                    RunScript("node-utilities.sh", "init-ad-hoc-network",
                              wireless_driver, "foobar", frequency, phy_rate,
                              antenna_mask, tx_power_driver)
                ]
            )
            for id, node in node_index.items()]
    else:
        init_wireless_jobs = [
            SshJob(
                scheduler=scheduler,
                required=green_light,
                node=node,
                verbose=verbose_jobs,
                label="init {}".format(id),
                command=RunScript("node-utilities.sh", "init-ad-hoc-network",
                                  wireless_driver, "foobar", frequency,
                                  phy_rate, antenna_mask, tx_power_driver)
            )
            for id, node in node_index.items()]

    # then install and run olsr on fit nodes
    run_olsr = [
        SshJob(
            scheduler=scheduler,
            node=node,
            required=init_wireless_jobs,
            label="init and run olsr on fit nodes",
            verbose=verbose_jobs,
            command=RunScript("node-utilities.sh", "run-olsr")
        )
        for i, node in node_index.items()]

    # after that, run tcpdump on fit nodes; this job never ends...
    run_tcpdump = [
        SshJob(
            scheduler=scheduler,
            node=node,
            required=run_olsr,
            label="run tcpdump on fit nodes",
            verbose=verbose_jobs,
            commands=[
                Run("echo run tcpdump on fit{:02d}".format(i)),
                Run("tcpdump -U -i moni-{} -y ieee802_11_radio -w /tmp/fit{}.pcap"
                    .format(wireless_driver, i))
            ]
        )
        for i, node in node_index.items()]

    # let the wireless network settle
    settle_wireless_job = PrintJob(
        "Let the wireless network settle",
        sleep=settle_delay,
        scheduler=scheduler,
        required=run_olsr,
        label="settling")

    ##########
    # create all the ping jobs, i.e. max*(max-1)/2
    # this again is a python list comprehension
    # see the 2 for instructions at the bottom
    #
    # notice that these SshJob instances are not yet added
    # to the scheduler, we will add them later on
    # depending on the sequential/parallel strategy

    pings = [
        SshJob(
            node=nodei,
            required=settle_wireless_job,
            label="ping {} -> {}".format(i, j),
            verbose=verbose_jobs,
            commands=[
                Run("echo {} '->' {}".format(i, j)),
                RunScript("node-utilities.sh", "my-ping",
                          "10.0.0.{}".format(j), ping_timeout, ping_interval,
                          ping_size, ping_number,
                          ">", "PING-{:02d}-{:02d}".format(i, j)),
                Pull(remotepaths="PING-{:02d}-{:02d}".format(i, j),
                     localpath=str(run_root)),
            ]
        )
        # looping on the source; for now only fit01 is a source
        for i, nodei in node_index.items()
        # and on the destination
        for j, nodej in node_index.items()
        # and keep only half of the couples
        if (j > i) and (i == 1)
    ]

    # retrieve all pcap files from fit nodes
    retrieve_tcpdump = [
        SshJob(
            scheduler=scheduler,
            node=nodei,
            required=pings,
            label="retrieve pcap trace from fit{:02d}".format(i),
            verbose=verbose_jobs,
            commands=[
                RunScript("node-utilities.sh", "kill-olsr"),
                Run("sleep 1;pkill tcpdump; sleep 1"),
                RunScript("node-utilities.sh", "process-pcap", i),
                Run("echo retrieving pcap trace and result-{i}.txt from fit{i:02d}"
                    .format(i=i)),
                Pull(remotepaths=["/tmp/fit{}.pcap".format(i),
                                  "/tmp/result-{}.txt".format(i)],
                     localpath=str(run_root)),
            ]
        )
        for i, nodei in node_index.items()
    ]

    # xxx this is a little fishy
    # should we not just consider that the default is parallel=1 ?
    if parallel is None:
        # with the sequential strategy, we just need to
        # create a Sequence out of the list of pings
        # Sequence will add the required relationships
        scheduler.add(Sequence(*pings, scheduler=scheduler))
        # for running sequentially we impose no limit on the scheduler
        # which will be limited anyway by the very structure
        # of the requirement graph
        jobs_window = None
    else:
        # with the parallel strategy
        # we just need to insert all the ping jobs
        # as each already has its requirements set
        scheduler.update(pings)
        # this time the value in parallel is the one
        # to use as the jobs_limit; if 0 then inch'allah
        jobs_window = parallel

    # we are past the dry-run mode, so proceed to the actual experiment
    ok = scheduler.orchestrate(jobs_window=jobs_window)
    # give details if it failed
    if not ok:
        scheduler.debrief()

    # data acquisition is done, let's aggregate results
    # i.e. compute averages
    if ok:
        post_processor = Aggregator(run_root, node_ids, antenna_mask)
        post_processor.run()

    return ok
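# --- illustrative sketch, not part of the original script ---
# the sequential-vs-parallel decision above, in isolation, assuming
# asynciojobs' Scheduler/Sequence as used throughout; jobs_window=None
# means no limit, and the Sequence already serializes the jobs through
# their requirement links
from asynciojobs import Scheduler, Sequence, Job

async def ping_stub(i):
    print(f"ping {i}")

pings = [Job(ping_stub(i), label=i) for i in range(5)]
scheduler = Scheduler()
parallel = None                  # try 0 or 2 to switch strategies
if parallel is None:
    scheduler.add(Sequence(*pings))
    jobs_window = None
else:
    scheduler.update(pings)
    jobs_window = parallel
scheduler.run(jobs_window=jobs_window)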
def prepare_testbed_scheduler(                  # pylint: disable=r0913, r0914
        gateway: SshNode,
        load_flag: bool,
        experiment_scheduler: Scheduler,
        images_mapping,
        nodes_left_alone=None,
        sdrs_left_alone=None,
        phones_left_alone=None,
        verbose_jobs=False):
    """
    This function is designed as a standard way for experiments to warm up.
    Experimenters only need to write a scheduler that defines the behaviour
    of their core experiment; this function adds the steps that take care of

    a) checking for a valid lease,
    b) loading images on nodes, and
    c) turning off unused devices.

    It is generally desirable to write an experiment script that has a
    `--load/-l` boolean flag; typically, one would use the ``--load`` flag
    the first time that an experiment is launched during a given timeslot,
    while subsequent calls won't. That is the purpose of the ``load_flag``
    below; when set to False, only step a) is performed, otherwise the
    resulting scheduler goes for the full monty.

    Parameters:
      gateway: the ssh handle to the gateway
      load_flag(bool): if not set, only the lease is checked
      experiment_scheduler: core scheduler for the experiment
      images_mapping: a dictionary that specifies images to be loaded
        on nodes; see examples below
      nodes_left_alone: a list of node numbers that should be left intact,
        neither loaded nor turned off
      phones_left_alone: a list of phone numbers that should be left intact,
        i.e. not switched to airplane mode

    Returns:
      The overall scheduler where the input ``experiment_scheduler``
      is embedded.

    Examples:
      Specify a mapping like the following::

          images_mapping = {"ubuntu": [1, 4, 5], "gnuradio": [16]}

      Note that the format for ``images_mapping`` is flexible; if only one
      node is to be loaded, the iterable level is optional; also each node
      can be specified as an ``int``, a ``bytes`` or a ``str``, in which
      case non-numeric characters are ignored. So this is a legitimate
      requirement as well::

          images_mapping = {
              'openair-cn': 12 + 4,
              'openair-enodeb': ('fit32',),
              'ubuntu': {12, 'reboot1', '004', 'you-get-the-picture-34'},
          }
    """

    # handle default mutable args
    nodes_left_alone = set(nodes_left_alone) if nodes_left_alone else set()
    sdrs_left_alone = set(sdrs_left_alone) if sdrs_left_alone else set()
    phones_left_alone = set(phones_left_alone) if phones_left_alone else set()

    scheduler = Scheduler(label="Preparation")

    check_lease = SshJob(
        scheduler=scheduler,
        node=gateway,
        verbose=verbose_jobs,
        label="Check lease {}".format(gateway.username),
        command=Run("rhubarbe leases --check", label="rlease"),
    )

    # if no image loading is requested, we're done here
    if not load_flag:
        scheduler.add(experiment_scheduler)
        experiment_scheduler.requires(check_lease)
        return scheduler

    # otherwise, we want to do in parallel
    # (*) as many image-loading jobs as we have entries in images_mapping
    # (*) one job to turn off phones, nodes and usrps,
    #     as parallelizing these brings no speedup at all

    # todo: ideally we could also probe the testbed to figure out which
    # nodes are currently unavailable, and leave them alone as well; but well.

    # the jobs that we need to wait for before going on with the real stuff
    octopus = []

    loaded_nodes = set()
    for image, nodes in images_mapping.items():
        # let's be as flexible as possible
        # (1) an empty node list should be fine
        if not nodes:
            continue
        # (2) atomic types should be allowed
        if isinstance(nodes, (int, str, bytes)):
            nodes = [nodes]
        # (3) accept all forms of inputs
        nodes = {r2lab_id(node) for node in nodes}
        duplicates = loaded_nodes & nodes
        if duplicates:
            print("WARNING - nodes in {} have been assigned several images"
                  .format(duplicates))
        loaded_nodes.update(nodes)
        # from here on we need strings
        node_args = " ".join(str(node) for node in nodes)
        octopus.append(
            SshJob(
                gateway,
                scheduler=scheduler,
                required=check_lease,
                label="loading {} on {}".format(image, node_args),
                commands=[
                    Run("rhubarbe load -i {} {}".format(image, node_args)),
                    Run("rhubarbe wait {}".format(node_args)),
                ],
                verbose=verbose_jobs,
            ))

    ### turn off stuff
    # nodes
    dont_off_nodes = nodes_left_alone | loaded_nodes
    # do turn off usrp devices even on loaded nodes
    dont_off_sdrs = sdrs_left_alone
    # phones - there's no equivalent of the --all ~ notation with phones
    off_phones = set(range(1, PHONES+1)) \
        - {r2lab_id(ph) for ph in phones_left_alone}

    r2lab_includes = [find_local_embedded_script(x)
                      for x in ("faraday.sh", "r2labutils.sh")]

    if off_phones:
        octopus.append(
            SshJob(
                gateway,
                scheduler=scheduler,
                required=check_lease,
                critical=False,
                commands=[
                    RunScript(find_local_embedded_script("faraday.sh"),
                              "macphone{}".format(phone),
                              "r2lab-embedded/shell/macphone.sh", "phone-off",
                              label="turn off phone {}".format(phone),
                              includes=r2lab_includes)
                    for phone in off_phones
                ],
                verbose=verbose_jobs))

    octopus.append(
        SshJob(
            gateway,
            scheduler=scheduler,
            required=check_lease,
            label="Turn off unused devices",
            commands=[
                Run(_rhubarbe_command(verb="off", left_alone=dont_off_nodes)),
                Run(_rhubarbe_command(verb="usrpoff", left_alone=dont_off_sdrs)),
            ],
            verbose=verbose_jobs,
        ))

    # embed the experiment scheduler
    experiment_scheduler.requires(octopus)
    scheduler.add(experiment_scheduler)

    return scheduler
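# --- illustrative sketch, not part of the original module ---
# a minimal, hypothetical call site for prepare_testbed_scheduler,
# assuming the apssh/asynciojobs names used above; the slice name,
# image names and node numbers are made up for the example
from asynciojobs import Scheduler
from apssh import SshNode

gateway = SshNode(hostname="faraday.inria.fr", username="inria_myslice")
experiment = Scheduler(label="my experiment")
# ... populate `experiment` with the core jobs ...
overall = prepare_testbed_scheduler(
    gateway, load_flag=True, experiment_scheduler=experiment,
    images_mapping={"ubuntu": [1, 2], "gnuradio": 16},
    nodes_left_alone=[37])
overall.run()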
def main(argv):
    if len(argv) == 3:
        print("!! Unfinished routines !!")
    else:
        print("++ Using default settings ++")

    ###########################
    ## Local variables
    # platform = 'multiGPU'
    platform = 'distributed'
    gateway_user = '******'
    gateway_host = 'gw_host'
    node_username = '******'

    #########################################################
    ## Distributed requirements
    num_ps = 1
    num_workers = 2

    #########################################################
    gateway = SshNode(gateway_host, username=gateway_user)

    ##########################################################
    if platform == 'distributed':
        ## Jetson-TX2 cluster
        hosts = [cluster_ip_host]

    #########################################################
    ## Use the server node for processing the first-stage data mining
    server = ResourceManager._set_Node(master_host, master_user, gateway)

    ############################
    # Push the launch file (run_splitpoint)
    # with the parameter configuration onto the server
    # to execute the first stage on this host
    job_launch_S1 = SshJob(
        node=server,
        commands=[
            ## run the script located on the laptop
            RunScript("run_dataspworkers_mlp.sh",
                      platform, num_ps, num_workers),
            Run("echo Split Data DONE"),
        ],
    )

    #############################
    ## A collection of the PS nodes
    ps = [ResourceManager._set_Node(hosts[i], node_username, gateway)
          for i in range(num_ps)]

    #############################
    ## A collection of the worker nodes
    workers = [ResourceManager._set_Node(hosts[num_ps+i],
                                         node_username, gateway)
               for i in range(num_workers)]

    #########################################################
    ## Setting parameters for the first stage
    FEATURES_NAME = "FULL-W1_x1_x2_x3_x4_x5_x7_x8_Y1"
    SANDBOX = str("/data_B/datasets/drg-PACA/healthData/sandbox-"
                  + FEATURES_NAME)
    YEAR = str(2008)

    ## Stage 1
    # localdir = "/1_Mining-Stage/"
    # SP_Dir_X = str(SANDBOX+localdir+"BPPR-"+FEATURES_NAME+"-"+YEAR)

    #############################
    ## Setting parameters for the second stage
    S_PLOINT = str(3072)    # 1536
    # SP_ARGV = str(S_PLOINT+"-"+platform)
    SP_ARGV = platform+"-"+str(num_workers)
    SP2 = str(SANDBOX+"/2_Split-Point-"+SP_ARGV+"/")

    #############################
    ## BPPR directories
    dir_train = "/data_training/"
    dir_valid = "/data_valid/"
    dir_test = "/data_test/"

    ############################
    ## Worker data management
    worker_healthData = "/opt/diagnosenet/healthData/"
    worker_sandbox = str(worker_healthData+"/sandbox-"+FEATURES_NAME)
    worker_splitpoint = str(worker_sandbox+"/2_Split-Point-"+SP_ARGV+"/")
    worker_train = str(worker_splitpoint+dir_train)
    worker_valid = str(worker_splitpoint+dir_valid)
    worker_test = str(worker_splitpoint+dir_test)

    ############################
    ## Worker commands
    mkd_worker_sandbox = str("mkdir"+" "+worker_sandbox)
    mkd_worker_splitpoint = str("mkdir"+" "+worker_splitpoint)
    mkd_worker_train = str("mkdir"+" "+worker_train)
    mkd_worker_valid = str("mkdir"+" "+worker_valid)
    mkd_worker_test = str("mkdir"+" "+worker_test)

    #############################
    ## Create a JOB to build the sandbox on each worker
    job_build_sandbox = [
        SshJob(
            node=workers[i],
            commands=[
                RunString(mkd_worker_sandbox),
                RunString(mkd_worker_splitpoint),
                RunString(mkd_worker_train),
                RunString(mkd_worker_valid),
                RunString(mkd_worker_test),
                Run("echo SANDBOX ON WORKER DONE"),
            ],
        )
        for i in range(len(workers))]

    #############################
    ## Create the commands to transfer data
    scp = "scp"
    cmd_X_train_transfer = []
    cmd_y_train_transfer = []
    cmd_X_valid_transfer = []
    cmd_y_valid_transfer = []
    cmd_X_test_transfer = []
    cmd_y_test_transfer = []

    for i in range(num_workers):
        worker_host = str(node_username+"@"+hosts[num_ps+i]+":")
        num_file = str(i+1)

        ## Commands to transfer the training dataset
        X_train_splitted = str(SP2+dir_train+"X_training-"+FEATURES_NAME
                               + "-"+YEAR+"-"+num_file+".txt")
        cmd_X_train_transfer.append(
            str(scp+" "+X_train_splitted+" "+worker_host+worker_train))
        y_train_splitted = str(SP2+dir_train+"y_training-"+FEATURES_NAME
                               + "-"+YEAR+"-"+num_file+".txt")
        cmd_y_train_transfer.append(
            str(scp+" "+y_train_splitted+" "+worker_host+worker_train))

        ## Commands to transfer the validation dataset
        X_valid_splitted = str(SP2+dir_valid+"X_valid-"+FEATURES_NAME
                               + "-"+YEAR+"-"+num_file+".txt")
        cmd_X_valid_transfer.append(
            str(scp+" "+X_valid_splitted+" "+worker_host+worker_valid))
        y_valid_splitted = str(SP2+dir_valid+"y_valid-"+FEATURES_NAME
                               + "-"+YEAR+"-"+num_file+".txt")
        cmd_y_valid_transfer.append(
            str(scp+" "+y_valid_splitted+" "+worker_host+worker_valid))

        ## Commands to transfer the test dataset
        X_test_splitted = str(SP2+dir_test+"X_test-"+FEATURES_NAME
                              + "-"+YEAR+"-"+num_file+".txt")
        cmd_X_test_transfer.append(
            str(scp+" "+X_test_splitted+" "+worker_host+worker_test))
        y_test_splitted = str(SP2+dir_test+"y_test-"+FEATURES_NAME
                              + "-"+YEAR+"-"+num_file+".txt")
        cmd_y_test_transfer.append(
            str(scp+" "+y_test_splitted+" "+worker_host+worker_test))

    ############################
    ## Build a JOB for transferring data to each worker sandbox
    job_data_transfer = [
        SshJob(
            node=server,
            commands=[
                RunString(cmd_X_train_transfer[i]),
                RunString(cmd_y_train_transfer[i]),
                Run("echo SENDER TRAINING DATA DONE"),
                RunString(cmd_X_valid_transfer[i]),
                RunString(cmd_y_valid_transfer[i]),
                Run("echo SENDER VALID DATA DONE"),
                RunString(cmd_X_test_transfer[i]),
                RunString(cmd_y_test_transfer[i]),
                Run("echo SENDER TEST DATA DONE"),
            ],
        )
        for i in range(len(workers))]

    #########################################################
    ## Create a sequence orchestration scheduler instance upfront
    worker_seq = []

    ## Add the Stage-1 JOB into the scheduler
    worker_seq.append(Scheduler(Sequence(job_launch_S1)))

    ## Add the worker JOBs into the scheduler
    for i in range(len(workers)):
        worker_seq.append(Scheduler(Sequence(
            job_build_sandbox[i],
            job_data_transfer[i],
        )))

    #############################
    ## Old method
    ## Add the JOB PS replicas into the scheduler
    # worker_seq.append(Scheduler(Sequence(job_PS_replicas)))
    #
    ## Add the JOB WORKER replicas into the scheduler
    # worker_seq.append(Scheduler(Sequence(job_WORKER_replicas)))

    #############################
    ## Run the sequence JOBS
    # [seq.orchestrate() for seq in worker_seq]

    #########################################################
    #########################################################
    ## Push the launch file (run_secondstage_distributed)
    ## with the distributed parameters for each worker replica
    ## to run distributed training of the unsupervised embedding

    #############################
    ## Build a collection of TensorFlow hosts for the PS
    tf_ps = [str(hosts[i]+":2222") for i in range(num_ps)]
    # print("+++ tf_ps: {}".format(tf_ps))
    tf_ps = ','.join(tf_ps)

    #############################
    ## Build a collection of TensorFlow hosts for the workers
    tf_workers = [str(hosts[num_ps+i]+":2222") for i in range(num_workers)]
    # print("+++ tf_workers: {}".format(tf_workers))
    tf_workers = ','.join(tf_workers)

    job_PS_replicas = [
        SshJob(
            node=ps[i],
            commands=[
                ## Launch the local script to execute on the cluster
                # RunScript("run_secondstage_distributed.sh",
                #           platform, tf_ps, tf_workers,
                #           num_ps, num_workers, "ps", i),
                RunScript("run_thirdstage_distributed_mlp.sh",
                          platform, tf_ps, tf_workers,
                          num_ps, num_workers, "ps", i),
                Run("echo PS REPLICA DONE"),
            ],
        )
        for i in range(len(ps))]

    job_WORKER_replicas = [
        SshJob(
            node=workers[i],
            commands=[
                ## Launch the local script to execute on the cluster
                # RunScript("run_secondstage_distributed.sh",
                #           platform, tf_ps, tf_workers,
                #           num_ps, num_workers, "worker", i),
                RunScript("run_thirdstage_distributed_mlp.sh",
                          platform, tf_ps, tf_workers,
                          num_ps, num_workers, "worker", i),
                Run("echo WORKER REPLICA DONE"),
            ],
        )
        for i in range(len(workers))]

    #############################
    ### Simultaneous jobs
    s_distraining = Scheduler()
    for i in range(len(ps)):
        s_distraining.add(job_PS_replicas[i])
    for i in range(len(workers)):
        s_distraining.add(job_WORKER_replicas[i])

    s_distraining.run(jobs_window=int(num_ps+num_workers+1))
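# --- illustrative sketch, not part of the original script ---
# what the cluster-spec strings above evaluate to, with made-up hosts;
# TensorFlow's ClusterSpec conventionally takes "host:port" strings
hosts = ["10.0.0.1", "10.0.0.2", "10.0.0.3"]
num_ps, num_workers = 1, 2
tf_ps = ','.join(f"{hosts[i]}:2222" for i in range(num_ps))
tf_workers = ','.join(f"{hosts[num_ps+i]}:2222" for i in range(num_workers))
print(tf_ps)        # -> 10.0.0.1:2222
print(tf_workers)   # -> 10.0.0.2:2222,10.0.0.3:2222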
Pull(remotepaths = "PING-{:02d}-{:02d}".format(i, j), localpath="."), ] ) # looping on the source for i, nodei in node_index.items() # and on the destination for j, nodej in node_index.items() # and keep only half of the couples if j > i ] if args.parallel is None: # with the sequential strategy, we just need to # create a Sequence out of the list of pings # Sequence will add the required relationships scheduler.add(Sequence(*pings, scheduler=scheduler)) # for running sequentially we impose no limit on the scheduler # that will be limitied anyways by the very structure # of the required graph jobs_window = None else: # with the parallel strategy # we just need to insert all the ping jobs # as each already has its required OK scheduler.update(pings) # this time the value in args.parallel is the one # to use as the jobs_limit; if 0 then inch'allah jobs_window = args.parallel # finally - i.e. when all pings are done # we can list the current contents of our local directory
def one_run(*, protocol, interference,
            run_name=default_run_name, slicename=default_slicename,
            tx_power, phy_rate, antenna_mask, channel,
            load_images=False,
            node_ids=DEFAULT_NODE_IDS,
            src_ids=DEFAULT_SRC_IDS, dest_ids=DEFAULT_DEST_IDS,
            scrambler_id=DEFAULT_SCRAMBLER_ID,
            tshark=False, map=False, warmup=False,
            route_sampling=False, iperf=False,
            verbose_ssh=False, verbose_jobs=False, dry_run=False,
            run_number=None):
    """
    Performs data acquisition on all nodes with the following settings

    Arguments:
        tx_power: in dBm, a string like 5, 10 or 14;
          corresponds to the transmission power
        phy_rate: a string among 1, 54; corresponds to the wifi rate
        antenna_mask: a string among 1, 3, 7
        channel: a string like e.g. 1 or 40; corresponds to the channel
        protocol: a string among batman, olsr; the protocol to run
        interference: in amplitude percentage, a string like 15 or 20;
          the power of the noise generated in the spectrum;
          can be either None or "None" to mean no interference
        run_name: the name for a subdirectory where all data will be kept;
          successive runs should use the same name for further visualization
        slicename: the Unix login name (slice name) to enter the gateway
        load_images: a boolean specifying whether nodes
          should be re-imaged first
        node_ids: a list of node ids to run the scenario against;
          strings or ints are OK
        tshark: a boolean specifying whether we should format/parse the .pcap
        map: a boolean specifying whether we should fetch/parse
          the route tables of the nodes
        warmup: a boolean specifying whether we should run a ping before
          the experiment, to make sure the network has stabilized
        src_ids: a list of nodes to launch the pings from;
          strings or ints are OK
        ping_messages: the number of ping packets that will be generated
    """

    # set defaults for the nodes parameters
    node_ids = ([int(id) for id in node_ids]
                if node_ids is not None else DEFAULT_NODE_IDS)
    src_ids = ([int(id) for id in src_ids]
               if src_ids is not None else DEFAULT_SRC_IDS)
    dest_ids = ([int(id) for id in dest_ids]
                if dest_ids is not None else DEFAULT_DEST_IDS)

    # all nodes - i.e. including sources and destinations -
    # need to run the protocol
    node_ids = list(set(node_ids).union(set(src_ids).union(set(dest_ids))))

    if interference == "None":
        interference = None

    # open the result dir no matter what
    run_root = naming_scheme(
        run_name=run_name, protocol=protocol,
        interference=interference, autocreate=True)

    # a trace file named after the current time
    ref_time = apssh_time()
    trace = run_root / f"trace-{ref_time}"

    try:
        with trace.open('w') as feed:
            def log_line(line):
                time_line(line, file=feed)
            load_msg = f"{'WITH' if load_images else 'NO'} image loading"
            interference_msg = (f"interference={interference} "
                                f"from scrambler={scrambler_id}")
            nodes = " ".join(str(n) for n in node_ids)
            srcs = " ".join(str(n) for n in src_ids)
            dests = " ".join(str(n) for n in dest_ids)
            ping_labels = [
                f"PING {s} ➡︎ {d}"
                for s in src_ids
                # and on the destination
                for d in dest_ids
                if d != s
            ]

            log_line(f"output in {run_root}")
            log_line(f"trace in {trace}")
            log_line(f"protocol={protocol}")
            log_line(f"{load_msg}")
            log_line(f"{interference_msg}")
            log_line("----")
            log_line(f"Selected nodes : {nodes}")
            log_line(f"Sources : {srcs}")
            log_line(f"Destinations : {dests}")
            for label in ping_labels:
                log_line(f"{label}")
            log_line("----")
            for feature in ('warmup', 'tshark', 'map',
                            'route_sampling', 'iperf'):
                log_line(f"Feature {feature}: {locals()[feature]}")
    except Exception as exc:
        print(f"Cannot write into {trace} - aborting this run")
        print(f"Found exception {type(exc)} - {exc}")
        return False

    #
    # dry-run mode
    # just display a one-liner with parameters
    #
    prelude = "" if not dry_run else "dry_run:"
    with trace.open() as feed:
        print(f"**************** {ref_time} one_run #{run_number}:")
        for line in feed:
            print(prelude, line, sep='', end='')
    if dry_run:
        return True

    # the nodes involved
    faraday = SshNode(hostname=default_gateway, username=slicename,
                      formatter=TimeColonFormatter(), verbose=verbose_ssh)
    # this is a python dictionary that allows to retrieve a node object
    # from an id
    node_index = {
        id: SshNode(gateway=faraday, hostname=fitname(id), username="******",
                    formatter=TimeColonFormatter(), verbose=verbose_ssh)
        for id in node_ids
    }
    # extracts for sources and destinations
    src_index = {id: node for (id, node) in node_index.items()
                 if id in src_ids}
    dest_index = {id: node for (id, node) in node_index.items()
                  if id in dest_ids}

    if interference:
        node_scrambler = SshNode(
            gateway=faraday, hostname=fitname(scrambler_id),
            username="******",
            formatter=TimeColonFormatter(), verbose=verbose_ssh)

    # the global scheduler
    scheduler = Scheduler(verbose=verbose_jobs)

    ##########
    check_lease = SshJob(
        scheduler=scheduler,
        node=faraday,
        verbose=verbose_jobs,
        label="rhubarbe check lease",
        command=Run("rhubarbe leases --check", label="rlease"),
    )

    # load images if requested
    green_light = check_lease

    # at some point we did not load the scrambler if interference was None,
    # and that was a way to run faster loads with no interference;
    # but now we always load the scrambler node with gnuradio,
    # because when we do runs.py -i None 15 30 ...
    # the first call to one_run is with interference being None,
    # and it is still important to load the scrambler
    if load_images:
        # copy node_ids
        load_ids = node_ids[:]
        load_ids.append(scrambler_id)
        # the nodes that we **do not** use should be turned off
        # so if we have selected e.g. nodes 10 12 and 15, we will do
        # rhubarbe off -a ~10 ~12 ~15, meaning all nodes except 10, 12 and 15
        negated_node_ids = [f"~{id}" for id in load_ids]

        # we can do these three things in parallel
        ready_jobs = [
            SshJob(node=faraday, required=green_light,
                   scheduler=scheduler, verbose=verbose_jobs,
                   command=Run("rhubarbe", "off", "-a", *negated_node_ids,
                               label="turn off unused nodes")),
            SshJob(node=faraday, required=green_light,
                   scheduler=scheduler, verbose=verbose_jobs,
                   label="load batman image",
                   command=Run("rhubarbe", "load", "-i", "batman-olsr",
                               *node_ids,
                               label=f"load ubuntu on {node_ids}")),
            SshJob(node=faraday, required=green_light,
                   scheduler=scheduler, verbose=verbose_jobs,
                   label="load gnuradio image",
                   command=Run("rhubarbe", "load", "-i",
                               "batman-olsr-gnuradio", scrambler_id,
                               label=f"load gnuradio on {scrambler_id}")),
        ]

        # replace green_light in this case
        green_light = SshJob(
            node=faraday, required=ready_jobs,
            scheduler=scheduler, verbose=verbose_jobs,
            label="wait for nodes to come up",
            command=Run("rhubarbe", "wait", *load_ids))

    ##########
    # setting up the wireless interface on all nodes
    #
    # provide node-utilities with the ranges/units it expects
    frequency = channel_frequency[int(channel)]
    # tx_power_in_mBm not in dBm
    tx_power_driver = tx_power * 100

    # just in case some services failed in the previous experiment
    reset_failed_services_job = [
        SshJob(
            node=node,
            verbose=verbose_jobs,
            label="reset failed services",
            command=Run("systemctl reset-failed",
                        label="reset-failed services"))
        for id, node in node_index.items()
    ]
    reset_failed_services = Scheduler(
        *reset_failed_services_job,
        scheduler=scheduler,
        required=green_light,
        verbose=verbose_jobs,
        label="Reset failed services")

    init_wireless_sshjobs = [
        SshJob(
            node=node,
            verbose=verbose_jobs,
            label=f"init {id}",
            command=RunScript(
                "node-utilities.sh",
                f"init-ad-hoc-network-{WIRELESS_DRIVER}",
                WIRELESS_DRIVER, "foobar", frequency, phy_rate,
                antenna_mask, tx_power_driver,
                label="init ad-hoc network"),
        )
        for id, node in node_index.items()]
    init_wireless_jobs = Scheduler(
        *init_wireless_sshjobs,
        scheduler=scheduler,
        required=green_light,
        verbose=verbose_jobs,
        label="Initialisation of wireless chips")

    if interference:
        # run uhd_siggen with the chosen power
        init_scrambler_job = SshJob(
            scheduler=scheduler,
            required=green_light,
            forever=True,
            node=node_scrambler,
            verbose=verbose_jobs,
            # TODO: if the exit-signal patch is merged, add
            # exit_signal=["TERM"] to this Run object
            # and call uhd_siggen directly
            commands=[RunScript("node-utilities.sh",
                                "init-scrambler",
                                label="init scrambler"),
                      Run("systemd-run --unit=uhd_siggen -t",
                          f"uhd_siggen -a usrp -f {frequency}M",
                          f"--sine --amplitude 0.{interference}",
                          label="systemctl start uhd_siggen")
                      ]
        )

    green_light = [init_wireless_jobs, reset_failed_services]

    # then install and run batman on fit nodes
    run_protocol_job = [
        SshJob(
            # scheduler=scheduler,
            node=node,
            label=f"init and run {protocol} on fit node {id}",
            verbose=verbose_jobs,
            # CAREFUL: these ones use systemd-run
            # with the --service-type=forking option!
            command=RunScript("node-utilities.sh",
                              f"run-{protocol}",
                              label=f"run {protocol}"),
        )
        for id, node in node_index.items()]
    run_protocol = Scheduler(
        *run_protocol_job,
        scheduler=scheduler,
        required=green_light,
        verbose=verbose_jobs,
        label="init and run routing protocols")

    green_light = run_protocol

    # after that, run tcpdump on fit nodes; this job never ends...
    if tshark:
        run_tcpdump_job = [
            SshJob(
                # scheduler=scheduler_monitoring,
                node=node,
                forever=True,
                label=f"run tcpdump on fit node {id}",
                verbose=verbose_jobs,
                command=[
                    Run("systemd-run -t --unit=tcpdump",
                        f"tcpdump -U -i moni-{WIRELESS_DRIVER}",
                        f"-y ieee802_11_radio -w /tmp/fit{id}.pcap",
                        label=f"tcpdump {id}")
                ]
            )
            for id, node in node_index.items()
        ]
        run_tcpdump = Scheduler(
            *run_tcpdump_job,
            scheduler=scheduler,
            required=green_light,
            forever=True,
            verbose=verbose_jobs,
            label="Monitoring - tcpdumps")

    # let the wireless network settle
    settle_scheduler = Scheduler(
        scheduler=scheduler,
        required=green_light,
    )

    if warmup:
        # warmup pings don't need to be sequential, so let's
        # do all the nodes at the same time;
        # on a given node though, we'll ping the other ends sequentially
        # see the graph for more
        warmup_jobs = [
            SshJob(
                node=node_s,
                verbose=verbose_jobs,
                commands=[
                    RunScript("node-utilities.sh",
                              "my-ping", f"10.0.0.{d}",
                              warmup_ping_timeout,
                              warmup_ping_interval,
                              warmup_ping_size,
                              warmup_ping_messages,
                              f"warmup {s} ➡︎ {d}",
                              label=f"warmup {s} ➡︎ {d}")
                    for d in dest_index.keys()
                    if s != d
                ]
            )
            # for each selected experiment node
            for s, node_s in src_index.items()
        ]
        warmup_scheduler = Scheduler(
            *warmup_jobs,
            scheduler=settle_scheduler,
            verbose=verbose_jobs,
            label="Warmup pings")
        settle_wireless_job2 = PrintJob(
            "Let the wireless network settle after warmup",
            sleep=settle_delay_shorter,
            scheduler=settle_scheduler,
            required=warmup_scheduler,
            label=f"settling-warmup for {settle_delay_shorter} sec")

    # this is a little cheating; could have gone before the block above
    # but produces a nicer graphical output
    # we might want to help asynciojobs if it offered a means
    # to specify entry and exit jobs in a scheduler
    settle_wireless_job = PrintJob(
        "Let the wireless network settle",
        sleep=settle_delay_long,
        scheduler=settle_scheduler,
        label=f"settling for {settle_delay_long} sec")

    green_light = settle_scheduler

    if iperf:
        iperf_service_jobs = [
            SshJob(
                node=node_d,
                verbose=verbose_jobs,
                forever=True,
                commands=[
                    Run("systemd-run -t --unit=iperf",
                        "iperf -s -p 1234 -u",
                        label=f"iperf serv on {d}"),
                ],
            )
            for d, node_d in dest_index.items()
        ]
        iperf_serv_sched = Scheduler(
            *iperf_service_jobs,
            verbose=verbose_jobs,
            label="Iperf Servers",
            # for a nicer graphical output;
            # otherwise the exit arrow
            # from scheduler 'iperf mode'
            # to job 'settling for 60s'
            # would start from this box
            forever=True,
        )

        iperf_cli = [
            SshJob(
                node=node_s,
                verbose=verbose_jobs,
                commands=[
                    Run("sleep 7", label=""),
                    Run("iperf",
                        f"-c 10.0.0.{d} -p 1234",
                        f"-u -b {phy_rate}M -t 60",
                        f"-l 1024 > IPERF-{s:02d}-{d:02d}",
                        label=f"run iperf {s} ➡︎ {d}")
                ]
            )
            for s, node_s in src_index.items()
            for d, node_d in dest_index.items()
            if s != d
        ]
        iperf_cli_sched = Scheduler(
            Sequence(*iperf_cli),
            verbose=verbose_jobs,
            label="Iperf Clients")

        iperf_stop = [
            SshJob(node=node_d,
                   verbose=verbose_jobs,
                   label=f"Stop iperf on {d}",
                   command=Run("systemctl stop iperf"))
            for d, node_d in dest_index.items()
        ]
        iperf_stop_sched = Scheduler(
            *iperf_stop,
            required=iperf_cli_sched,
            verbose=verbose_jobs,
            label="Iperf server stop")

        iperf_fetch = [
            SshJob(node=node_s,
                   verbose=verbose_jobs,
                   command=Pull(
                       remotepaths=[f"IPERF-{s:02d}-{d:02d}"],
                       localpath=str(run_root),
                       label=f"fetch iperf {s} ➡︎ {d}")
                   )
            for s, node_s in src_index.items()
            for d, node_d in dest_index.items()
            if s != d
        ]
        iperf_fetch_sched = Scheduler(
            *iperf_fetch,
            required=iperf_stop_sched,
            verbose=verbose_jobs,
            label="Iperf fetch report")

        iperf_jobs = [iperf_serv_sched, iperf_cli_sched,
                      iperf_stop_sched, iperf_fetch_sched]
        iperf_sched = Scheduler(
            *iperf_jobs,
            scheduler=scheduler,
            required=green_light,
            verbose=verbose_jobs,
            label="Iperf Module")
        settle_wireless_job_iperf = PrintJob(
            "Let the wireless network settle",
            sleep=settle_delay_shorter,
            scheduler=scheduler,
            required=iperf_sched,
            label=f"settling-iperf for {settle_delay_shorter} sec")

        green_light = settle_wireless_job_iperf

    # create all the tracepath jobs from the first node in the list
    if map:
        map_jobs = [
            SshJob(
                node=node,
                label=f"Generating ROUTE file for proto {protocol} on node {id}",
                verbose=verbose_jobs,
                commands=[
                    RunScript("node-utilities.sh",
                              f"route-{protocol}",
                              f"> ROUTE-TABLE-{id:02d}",
                              label="get route table"),
                    Pull(remotepaths=[f"ROUTE-TABLE-{id:02d}"],
                         localpath=str(run_root),
                         label=""),
                ],
            )
            for id, node in node_index.items()
        ]
        map_scheduler = Scheduler(
            *map_jobs,
            scheduler=scheduler,
            required=green_light,
            verbose=verbose_jobs,
            label="Snapshoting route files")
        green_light = map_scheduler

    if route_sampling:
        route_sampling_jobs = [
            SshJob(
                node=node,
                label=f"Route sampling service for proto {protocol} on node {id}",
                verbose=False,
                forever=True,
                commands=[
                    Push(localpaths=["route-sample-service.sh"],
                         remotepath=".", label=""),
                    Run("chmod +x route-sample-service.sh", label=""),
                    Run("systemd-run -t --unit=route-sample",
                        "/root/route-sample-service.sh",
                        "route-sample",
                        f"ROUTE-TABLE-{id:02d}-SAMPLED",
                        protocol,
                        label="start route-sampling"),
                ],
            )
            for id, node in node_index.items()
        ]
        route_sampling_scheduler = Scheduler(
            *route_sampling_jobs,
            scheduler=scheduler,
            verbose=False,
            forever=True,
            label="Route Sampling services launch",
            required=green_light)

    ##########
    # create all the ping jobs, i.e. max*(max-1)/2
    # this again is a python list comprehension
    # see the 2 for instructions at the bottom
    #
    # notice that these SshJob instances are not yet added
    # to the scheduler, we will add them later on
    # depending on the sequential/parallel strategy

    pings_job = [
        SshJob(
            node=node_s,
            verbose=verbose_jobs,
            commands=[
                Run(f"echo actual ping {s} ➡︎ {d} using {protocol}",
                    label=f"ping {s} ➡︎ {d}"),
                RunScript("node-utilities.sh", "my-ping",
                          f"10.0.0.{d}",
                          ping_timeout, ping_interval,
                          ping_size, ping_messages,
                          f"actual {s} ➡︎ {d}",
                          ">", f"PING-{s:02d}-{d:02d}",
                          label=""),
                Pull(remotepaths=[f"PING-{s:02d}-{d:02d}"],
                     localpath=str(run_root),
                     label=""),
            ],
        )
        # for each selected experiment node
        for s, node_s in src_index.items()
        for d, node_d in dest_index.items()
        if s != d
    ]
    pings = Scheduler(
        scheduler=scheduler,
        label="PINGS",
        verbose=verbose_jobs,
        required=green_light)

    # stop the routing protocols
    stop_protocol_job = [
        SshJob(
            # scheduler=scheduler,
            node=node,
            # required=pings,
            label=f"kill routing protocol on {id}",
            verbose=verbose_jobs,
            command=RunScript("node-utilities.sh",
                              f"kill-{protocol}",
                              label=f"kill-{protocol}"),
        )
        for id, node in node_index.items()
    ]
    stop_protocol = Scheduler(
        *stop_protocol_job,
        scheduler=scheduler,
        required=pings,
        label="Stop routing protocols",
    )

    # retrieve all pcap files from fit nodes
    if tshark:
        retrieve_tcpdump_job = [
            SshJob(
                # scheduler=scheduler,
                node=nodei,
                # required=pings,
                label=f"retrieve pcap trace from fit{i:02d}",
                verbose=verbose_jobs,
                commands=[
                    Run("systemctl stop tcpdump",
                        label="stop tcpdump"),
                    # Run("systemctl reset-failed tcpdump"),
                    # RunScript("node-utilities.sh", "kill-tcpdump",
                    #           label="kill-tcpdump"),
                    Run(f"echo retrieving pcap trace"
                        f" and result-{i}.txt from fit{i:02d}",
                        label=""),
                    Pull(remotepaths=[f"/tmp/fit{i}.pcap"],
                         localpath=str(run_root), label=""),
                ],
            )
            for i, nodei in node_index.items()
        ]
        retrieve_tcpdump = Scheduler(
            *retrieve_tcpdump_job,
            scheduler=scheduler,
            required=pings,
            label="Retrieve tcpdump",
        )

    if route_sampling:
        retrieve_sampling_job = [
            SshJob(
                # scheduler=scheduler,
                node=nodei,
                # required=pings,
                label=f"retrieve sampling trace from fit{i:02d}",
                verbose=verbose_jobs,
                commands=[
                    # RunScript("node-utilities.sh",
                    #           "kill-route-sample", protocol,
                    #           label="kill route sample"),
                    # RunScript("route-sample-service.sh",
                    #           "kill-route-sample",
                    #           label="kill route sample"),
                    Run("systemctl stop route-sample",
                        label="stop route-sample"),
                    Run(f"echo retrieving sampling trace from fit{i:02d}",
                        label=""),
                    Pull(remotepaths=[f"ROUTE-TABLE-{i:02d}-SAMPLED"],
                         localpath=str(run_root), label=""),
                ],
            )
            for i, nodei in node_index.items()
        ]
        retrieve_sampling = Scheduler(
            *retrieve_sampling_job,
            scheduler=scheduler,
            required=pings,
            verbose=verbose_jobs,
            label="Stop & retrieve route sampling",
        )

    if tshark:
        parse_pcaps_job = [
            SshJob(
                # scheduler=scheduler,
                node=LocalNode(),
                # required=retrieve_tcpdump,
                label=f"parse pcap trace {run_root}/fit{i}.pcap",
                verbose=verbose_jobs,
                # commands=[RunScript("parsepcap.sh", run_root, i)]
                command=Run("tshark", "-2", "-r",
                            f"{run_root}/fit{i}.pcap",
                            "-R",
                            f"'(ip.dst==10.0.0.{i} && icmp)"
                            f" && radiotap.dbm_antsignal'",
                            "-Tfields",
                            "-e", "'ip.src'",
                            "-e", "'ip.dst'",
                            "-e", "'radiotap.dbm_antsignal'",
                            ">", f"{run_root}/result-{i}.txt",
                            label=f"parsing pcap from {i}"),
            )
            for i in node_ids
        ]
        parse_pcaps = Scheduler(
            *parse_pcaps_job,
            scheduler=scheduler,
            required=retrieve_tcpdump,
            label="Parse pcap",
        )

    if interference:
        kill_uhd_siggen = SshJob(
            scheduler=scheduler,
            node=node_scrambler,
            required=pings,
            label=f"killing uhd_siggen on the scrambler node {scrambler_id}",
            verbose=verbose_jobs,
            commands=[Run("systemctl", "stop", "uhd_siggen"),
                      # Run("systemctl reset-failed tcpdump"),
                      ],
        )
        kill_2_uhd_siggen = SshJob(
            scheduler=scheduler,
            node=faraday,
            required=kill_uhd_siggen,
            label=f"turning off usrp on the scrambler node {scrambler_id}",
            verbose=verbose_jobs,
            command=Run("rhubarbe", "usrpoff", scrambler_id),
        )

    pings.add(Sequence(*pings_job))
    # for running sequentially we impose no limit on the scheduler
    # which will be limited anyway by the very structure
    # of the requirement graph

    # safety check
    scheduler.export_as_pngfile(run_root / "experiment-graph")
    if dry_run:
        scheduler.list()
        return True

    # if not in dry-run mode, let's proceed to the actual experiment
    ok = scheduler.run()  # jobs_window=jobs_window)

    # close all ssh connections
    close_ssh_in_scheduler(scheduler)

    # give details if it failed
    if not ok:
        scheduler.debrief()
        scheduler.export_as_pngfile("debug")

    if ok and map:
        time_line("Creation of MAP files")
        post_processor = ProcessRoutes(run_root, src_ids, node_ids)
        post_processor.run()

    if ok and route_sampling:
        time_line("Creation of ROUTE SAMPLING files")
        post_processor = ProcessRoutes(run_root, src_ids, node_ids)
        post_processor.run_sampled()

    # data acquisition is done, let's aggregate results
    # i.e. compute averages
    # if ok and tshark:
    #     post_processor = Aggregator(run_root, node_ids, antenna_mask)
    #     post_processor.run()

    time_line("one_run done")
    return ok
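# --- illustrative sketch, not part of the original script ---
# the green_light idiom used throughout one_run, reduced to its core:
# each stage is a nested Scheduler that requires the previous stage,
# so the requirement chain reads stage 1 -> stage 2 -> stage 3
from asynciojobs import Scheduler, Job

async def step(name):
    print(name)

top = Scheduler()
green_light = Scheduler(Job(step("stage 1")),
                        scheduler=top, label="stage 1")
green_light = Scheduler(Job(step("stage 2")),
                        scheduler=top, required=green_light, label="stage 2")
green_light = Scheduler(Job(step("stage 3")),
                        scheduler=top, required=green_light, label="stage 3")
top.run()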
def test_format(self):
    s = Scheduler()
    f = TerminalFormatter("%Y:%H:%S - @host@:@line@", verbose=True)
    n = SshNode(localhostname(), username=localuser(), formatter=f)
    s.add(SshJob(node=n, commands=[Run("echo LINE1"), Run("echo LINE2")]))
    s.run()
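# --- illustrative note, not part of the original test ---
# with the format string above, each line of remote output is rendered
# as e.g. "2024:14:05 - localhost:LINE1"; @host@ and @line@ are the
# formatter placeholders documented in the --format help below, and the
# strftime directives (%Y, %H, %S) fill in the current time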
def main(self, *test_argv):    # pylint: disable=r0915,r0912,r0914,c0111
    self.parser = parser = argparse.ArgumentParser()
    # scope - on what hosts
    parser.add_argument(
        "-s", "--script", action='store_true', default=False,
        help=f"""If this flag is present, the first element of the remote
        command is assumed to be either the name of a local script, or,
        if this is not found, the body of a local script, that will be
        copied over before being executed remotely. In this case it should
        be executable.

        On the remote boxes it will be installed
        and run in the {default_remote_workdir} directory.
        """)
    parser.add_argument(
        "-i", "--includes", dest='includes', default=[], action='append',
        help="""for script mode only: a list of local files that are
        pushed remotely together with the local script, and in the same
        location; useful when you want to run remotely a shell script
        that sources other files; remember that on the remote end all
        files (scripts and includes) end up in the same location""")
    parser.add_argument(
        "-t", "--target", dest='targets', action='append', default=[],
        help="""
        specify targets (additive); at least one is required;
        each target can be either
        * a space-separated list of hostnames
        * the name of a file containing hostnames
        * the name of a directory containing files named after hostnames;
        see e.g. the --mark option
        """)
    parser.add_argument(
        "-x", "--exclude", dest='excludes', action='append', default=[],
        help="""
        like --target, but for specifying exclusions;
        for now no wildcard mechanism is supported here;
        also the order in which --target and --exclude options
        are mentioned does not matter;
        use --dry-run to only check for the list of applicable hosts
        """)
    # global settings
    parser.add_argument(
        "-w", "--window", type=int, default=0,
        help="""
        specify how many connections can run simultaneously;
        default is no limit
        """)
    parser.add_argument(
        "-c", "--connect-timeout", dest='timeout',
        type=float, default=default_timeout,
        help=f"specify connection timeout, default is {default_timeout}s")
    # ssh settings
    parser.add_argument(
        "-l", "--login", default=default_username,
        help=f"remote user name - default is {default_username}")
    parser.add_argument(
        "-k", "--key", dest='keys', default=None, action='append', type=str,
        help="""
        The default is for apssh to locate an ssh-agent
        through the SSH_AUTH_SOCK environment variable.
        If this cannot be found, or has an empty set of keys,
        then the user should specify private key file(s) - additive
        """)
    parser.add_argument(
        "-K", "--ok-if-no-key", default=False, action='store_true',
        help="""
        When no key can be found, apssh won't even bother
        to try and connect. With this option it proceeds
        even with no key available.
        """)
    parser.add_argument(
        "-g", "--gateway", default=None,
        help="""
        specify a gateway for 2-hops ssh
        - either hostname or username@hostname
        """)
    # how to store results
    # terminal
    parser.add_argument(
        "-r", "--raw-format", default=False, action='store_true',
        help="""
        produce raw result, incoming lines are shown as-is without hostname
        """)
    parser.add_argument(
        "-tc", "--time-colon-format", default=False, action='store_true',
        help="equivalent to --format '@time@:@host@:@line@'")
    parser.add_argument(
        "-f", "--format", default=None, action='store',
        help="""specify output format, which may include
        * `strftime` formats like e.g. %%H-%%M,
        and one of the following:
        * @user@ for the remote username,
        * @host@ for the target hostname,
        * @line@ for the actual line output
          (which contains the actual newline)
        * @time@ is a shorthand for %%H-%%M-%%S""")
    # filesystem
    parser.add_argument(
        "-o", "--out-dir", default=None,
        help="specify directory where to store results")
    parser.add_argument(
        "-d", "--date-time", default=None, action='store_true',
        help="use date-based directory to store results")
    parser.add_argument(
        "-m", "--mark", default=False, action='store_true',
        help="""
        available with the -d and -o options only.

        When specified, then for all nodes there will be a file created
        in the output subdir, named either
        0ok/<hostname> for successful nodes,
        or 1failed/<hostname> for the other ones.

        This mark file will contain a single line with the returned code,
        or 'None' if the node was not reachable at all
        """)
    # usual stuff
    parser.add_argument(
        "-n", "--dry-run", default=False, action='store_true',
        help="Only show details on selected hostnames")
    parser.add_argument(
        "-v", "--verbose", action='store_true', default=False)
    parser.add_argument(
        "-D", "--debug", action='store_true', default=False)
    parser.add_argument(
        "-V", "--version", action='store_true', default=False)

    # the commands to run
    parser.add_argument(
        "commands", nargs=argparse.REMAINDER, type=str,
        help="""
        command to run remotely.

        If the -s or --script option is provided, the first argument
        here should denote a (typically script) file **that must exist**
        on the local filesystem. This script is then copied over
        to the remote system and serves as the command for remote execution
        """)

    if test_argv:
        args = self.parsed_args = parser.parse_args(test_argv)
    else:
        args = self.parsed_args = parser.parse_args()

    # helpers
    if args.version:
        print(f"apssh version {apssh_version}")
        exit(0)

    # manual check for REMAINDER
    if not args.commands:
        print("You must provide a command to be run remotely")
        parser.print_help()
        exit(1)

    # load keys
    self.loaded_private_keys = load_private_keys(
        self.parsed_args.keys, args.verbose or args.debug)
    if not self.loaded_private_keys and not args.ok_if_no_key:
        print("Could not find any usable key - exiting")
        exit(1)

    # initialize a gateway proxy if --gateway is specified
    gateway = None
    if args.gateway:
        gwuser, gwhost = self.user_host(args.gateway)
        gateway = SshProxy(hostname=gwhost, username=gwuser,
                           keys=self.loaded_private_keys,
                           formatter=self.get_formatter(),
                           timeout=self.parsed_args.timeout,
                           debug=self.parsed_args.debug)

    proxies = self.create_proxies(gateway)
    if args.verbose:
        print_stderr(f"apssh is working on {len(proxies)} nodes")

    window = self.parsed_args.window

    # populate the scheduler
    scheduler = Scheduler(verbose=args.verbose)
    if not args.script:
        command_class = Run
        extra_kwds_args = {}
    else:
        # try RunScript
        command_class = RunScript
        extra_kwds_args = {'includes': args.includes}
        # but if the filename is not found then use RunString
        script = args.commands[0]
        if not Path(script).exists():
            if args.verbose:
                print("Warning: file not found '{}'\n"
                      "=> Using RunString instead".format(script))
            command_class = RunString

    for proxy in proxies:
        scheduler.add(
            SshJob(node=proxy,
                   critical=False,
                   command=command_class(*args.commands,
                                         **extra_kwds_args)))

    # pylint: disable=w0106
    scheduler.jobs_window = window
    if not scheduler.run():
        scheduler.debrief()
    results = [job.result() for job in scheduler.jobs]

    ##########
    # print on stdout the name of the output directory
    # useful mostly with -d :
    subdir = self.get_formatter().run_name \
        if isinstance(self.get_formatter(), SubdirFormatter) \
        else None
    if subdir:
        print(subdir)

    # details on the individual retcodes - a bit hacky
    if self.parsed_args.debug:
        for proxy, result in zip(proxies, results):
            print(f"PROXY {proxy.hostname} -> {result}")

    # marks
    names = {0: '0ok', None: '1failed'}
    if subdir and self.parsed_args.mark:
        # do we need to create the subdirs
        need_ok = [s for s in results if s == 0]
        if need_ok:
            os.makedirs(f"{subdir}/{names[0]}", exist_ok=True)
        need_fail = [s for s in results if s != 0]
        if need_fail:
            os.makedirs(f"{subdir}/{names[None]}", exist_ok=True)

        for proxy, result in zip(proxies, results):
            prefix = names[0] if result == 0 else names[None]
            mark_path = Path(subdir) / prefix / proxy.hostname
            with mark_path.open("w") as mark:
                mark.write(f"{result}\n")

    # xxx - when in gateway mode, the gateway proxy never gets disconnected,
    # which probably is just fine

    # return 0 only if all hosts have returned 0
    # otherwise, return 1
    failures = [r for r in results if r != 0]
    overall = 0 if not failures else 1
    return overall
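# --- illustrative note, not part of the original module ---
# how this main() can be driven from a test, assuming it is a method of
# the CLI class that owns get_formatter()/create_proxies(); the class
# name Apssh and the target host below are assumptions for the example
# cli = Apssh()
# retcod = cli.main("-l", "root", "-t", "localhost", "echo", "hello")
# retcod is 0 only if every selected host ran the command successfully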
def one_run(tx_power, phy_rate, antenna_mask, channel, interference, protocol, *, run_name=default_run_name, slicename=default_slicename, load_images=False, node_ids=None, verbose_ssh=False, verbose_jobs=False, dry_run=False, tshark=False, map=False, warmup=False, exp=default_exp, dest=default_node_ids, ping_number=default_ping_number, route_sampling=False): """ Performs data acquisition on all nodes with the following settings Arguments: tx_power: in dBm, a string like 5, 10 or 14. Correspond to the transmission power. phy_rate: a string among 1, 54. Correspond to the wifi rate. antenna_mask: a string among 1, 3, 7. channel: a string like e.g. 1 or 40. Correspond to the channel. protocol: a string among batman , olsr. Correspond to the protocol interference : in dBm, a string like 60 or 50. Correspond to the power of the noise generated in the root. run_name: the name for a subdirectory where all data will be kept successive runs should use the same name for further visualization slicename: the Unix login name (slice name) to enter the gateway load_images: a boolean specifying whether nodes should be re-imaged first node_ids: a list of node ids to run the scenario against; strings or ints are OK; defaults to the nodes [1, 4, 5, 12, 19, 22,27 ,31, 33, 37] tshark: a boolean specifying wether we should format/parse the .pcap. map: a boolean specifying wether we should fetch/parse the route tables of the nodes. warmup: a boolean specifying wether we should run a ping before the experiment to be certain of the stabilisation on the network. exp: a list of nodes from which we will launch the ping from. strings or ints are OK. default to the node [1] ping_number : The number of pings that will be generated """ # set default for the nodes parameter node_ids = [int(id) for id in node_ids ] if node_ids is not None else default_node_ids exp_ids = [int(id) for id in exp] if exp is not None else default_exp dest_ids = [int(id) for id in dest] if dest is not None else default_node_ids # # dry-run mode # just display a one-liner with parameters # if dry_run: print("************************************") print("\n") run_root = naming_scheme(protocol, run_name, tx_power, phy_rate, antenna_mask, channel, interference, autocreate=False) load_msg = "" if not load_images else " LOAD" nodes = " ".join(str(n) for n in node_ids) exps = " ".join(str(n) for n in exp) pingst = [ "PING{}-->{}".format(e, j) for e in exp_ids # and on the destination for j in node_ids if e != j #and not #(j in exp_ids and j < e) ] print( "dry-run:{protocol} {run_name}{load_msg} -" " t{tx_power} r{phy_rate} a{antenna_mask} ch{channel} I{interference}-" "nodes {nodes}" " exp {exps}".format(**locals())) print( "\nNodes from which the experiment will be launched : \n{}\nList of pings generated:\n" .format(exps)) print(pingst) print("\n") if warmup: print("Will do warmup pings\n") if tshark: print( "Will format data using tshark and will agregate the RSSI into one RSSI.txt file" ) if map: print( "Will fetch the routing tables of the node (when stabilited) and will agregate the results\n" ) if route_sampling: print("Will launch route sampling services on nodes") #print("Test creation of ROUTES files") #post_processor= ProcessRoutes(run_root, exp_ids, node_ids) #post_processor.run() #print("\nList of tracepaths generated:\n{}".format(tracepathst)) # in dry-run mode we are done ### # create the logs directory based on input parameters run_root = naming_scheme(protocol, run_name, tx_power, phy_rate, antenna_mask, channel, interference, 
    ###
    # create the logs directory based on input parameters
    run_root = naming_scheme(protocol, run_name, tx_power, phy_rate,
                             antenna_mask, channel, interference,
                             autocreate=False)
    if run_root.is_dir():
        purgedir(run_root)
    run_root = naming_scheme(protocol, run_name, tx_power, phy_rate,
                             antenna_mask, channel, interference,
                             autocreate=True)
    exp_info_file_name = run_root / "info.txt"
    with exp_info_file_name.open("w") as info_file:
        info_file.write("Selected nodes : \n")
        for node in node_ids[:-1]:
            info_file.write(f"{node} ")
        info_file.write(f"{node_ids[-1]}")
        info_file.write("\nSources : \n")
        for src in exp_ids[:-1]:
            info_file.write(f"{src} ")
        info_file.write(f"{exp_ids[-1]}")
        info_file.write("\nDestinations : \n")
        for dest in dest_ids[:-1]:
            info_file.write(f"{dest} ")
        info_file.write(f"{dest_ids[-1]}\n")

    # the nodes involved
    faraday = SshNode(hostname=default_gateway, username=slicename,
                      formatter=TimeColonFormatter(), verbose=verbose_ssh)
    # a python dictionary to retrieve a node object from an id
    node_index = {
        id: SshNode(gateway=faraday, hostname=fitname(id),
                    username="******",
                    formatter=TimeColonFormatter(), verbose=verbose_ssh)
        for id in node_ids
    }
    if interference != "None":
        node_scrambler = SshNode(gateway=faraday,
                                 hostname=fitname(scrambler_id),
                                 username="******",
                                 formatter=TimeColonFormatter(),
                                 verbose=verbose_ssh)
    # the global scheduler
    scheduler = Scheduler(verbose=verbose_jobs)
    # if tshark:
    #     scheduler_monitoring = Scheduler(verbose=verbose_jobs)
    # if interference != "None":
    #     scheduler_interferences = Scheduler(verbose=verbose_jobs)

    ##########
    check_lease = SshJob(
        scheduler=scheduler,
        node=faraday,
        verbose=verbose_jobs,
        critical=True,
        label="rhubarbe check lease",
        command=Run("rhubarbe leases --check", label="rlease"),
        # keep_connection = True
    )

    green_light = check_lease
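
    # aside: the "green light" idiom used throughout, as a minimal toy sketch
    # (plain asynciojobs, not part of the experiment): each stage points its
    # `required=` at whatever job currently acts as the green light, and
    # reassigning the variable re-chains everything created afterwards
    #
    #     from asynciojobs import Scheduler, Job
    #
    #     async def step(name):
    #         print(name)
    #
    #     s = Scheduler()
    #     green_light = Job(step("lease"), scheduler=s)
    #     loading = Job(step("load"), scheduler=s, required=green_light)
    #     green_light = loading    # stages created below now wait for loading
    #     s.run()
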
    # load images if requested
    if load_images:
        # the nodes that we **do not** use should be turned off
        # so if we have selected e.g. nodes 10 12 and 15, we will do
        # rhubarbe off -a ~10 ~12 ~15, meaning all nodes except 10, 12 and 15
        negated_node_ids = ["~{}".format(id) for id in node_ids]
        # add the id of the scrambler to the list, so as to load the gnuradio
        # image on it as well
        negated_node_ids.append("~{}".format(scrambler_id))
        load_ids = [int(id) for id in node_ids] \
            if node_ids is not None else default_node_ids
        load_ids.append(scrambler_id)
        # replace green_light in this case
        # we use a modified gnuradio image where uhd_siggen handles SIGTERM,
        # so that it can terminate properly
        green_light = SshJob(
            node=faraday,
            required=check_lease,
            # critical=True,
            scheduler=scheduler,
            verbose=verbose_jobs,
            label="rhubarbe load/wait on nodes {}".format(load_ids),
            commands=[
                Run("rhubarbe", "off", "-a", *negated_node_ids,
                    label="roff {}".format(negated_node_ids)),
                Run("rhubarbe", "load", *node_ids,
                    label="rload {}".format(node_ids)),
                Run("rhubarbe", "load", "-i", "gnuradio_batman", scrambler_id,
                    label="load gnuradio batman on {}".format(scrambler_id)),
                Run("rhubarbe", "wait", *load_ids, label="rwait"),
            ],
            # keep_connection = True
        )

    ##########
    # setting up the wireless interface on all nodes
    #
    # this is a python feature known as a list comprehension
    # we just create as many SshJob instances as we have
    # (id, SshNode) couples in node_index
    # and gather them all in init_wireless_jobs
    # they all depend on green_light
    # (a toy sketch of this pattern appears right after this section)
    #
    # provide node-utilities with the ranges/units it expects
    frequency = channel_frequency[int(channel)]
    # the driver expects the tx power in mBm, not dBm; tx_power may arrive
    # as a string, hence the int() conversion
    tx_power_driver = int(tx_power) * 100
    init_wireless_sshjobs = [
        SshJob(
            # scheduler=scheduler,
            # required=green_light,
            node=node,
            verbose=verbose_jobs,
            label="init {}".format(id),
            command=RunScript("node-utilities.sh",
                              "init-ad-hoc-network-{}".format(wireless_driver),
                              wireless_driver, "foobar", frequency, phy_rate,
                              antenna_mask, tx_power_driver,
                              label="init ad-hoc network"),
            # keep_connection = True
        )
        for id, node in node_index.items()
    ]
    init_wireless_jobs = Scheduler(
        *init_wireless_sshjobs,
        scheduler=scheduler,
        required=green_light,
        # critical=True,
        verbose=verbose_jobs,
        label="Initialisation of wireless chips")

    green_light_prot = init_wireless_jobs
    if interference != "None":
        # run uhd_siggen with the chosen power
        frequency_str = str(frequency / 1000) + "G"
        init_scrambler_job = [
            SshJob(
                forever=True,
                node=node_scrambler,
                verbose=verbose_jobs,
                label="init scrambler on node {}".format(scrambler_id),
                command=RunScript("node-utilities.sh", "init-scrambler",
                                  interference, frequency_str,
                                  label="init scrambler"),
                # keep_connection = True
            )
        ]
        init_scrambler = Scheduler(
            *init_scrambler_job,
            scheduler=scheduler,
            required=green_light,
            # forever=True,
            # critical=True,
            verbose=verbose_jobs,
            label="Running interference")

    # then install and run batman/olsr on fit nodes
    run_protocol_job = [
        SshJob(
            # scheduler=scheduler,
            node=node,
            # required=green_light_prot,
            label="init and run {} on fit node {}".format(protocol, i),
            verbose=verbose_jobs,
            command=RunScript("node-utilities.sh", "run-{}".format(protocol),
                              label="run {}".format(protocol)),
            # keep_connection = True
        )
        for i, node in node_index.items()
    ]
    run_protocol = Scheduler(
        *run_protocol_job,
        scheduler=scheduler,
        required=green_light_prot,
        # critical=True,
        verbose=verbose_jobs,
        label="init and run routing protocols")
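
    # the pattern above in a nutshell (toy sketch, plain asynciojobs, with
    # hypothetical names): a list comprehension builds one job per node, and
    # a nested Scheduler groups them so the whole batch hangs off a single
    # `required=` job
    #
    #     from asynciojobs import Scheduler, Job
    #
    #     async def init_node(i):
    #         print(f"init {i}")
    #
    #     top = Scheduler()
    #     green = Job(init_node(0), scheduler=top)
    #     batch = [Job(init_node(i)) for i in (1, 2, 3)]
    #     Scheduler(*batch, scheduler=top, required=green, label="all inits")
    #     top.run()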

    # after that, run tcpdump on fit nodes - this job never ends...
    if tshark:
        run_tcpdump_job = [
            SshJob(
                # scheduler=scheduler_monitoring,
                node=node,
                forever=True,
                label="run tcpdump on fit node {}".format(i),
                verbose=verbose_jobs,
                commands=[
                    RunScript("node-utilities.sh", "run-tcpdump",
                              wireless_driver, i,
                              label="run tcpdump")
                ],
                # keep_connection = True
            )
            for i, node in node_index.items()
        ]
        run_tcpdump = Scheduler(
            *run_tcpdump_job,
            scheduler=scheduler,
            required=run_protocol,
            forever=True,
            # critical=True,
            verbose=verbose_jobs,
            label="Monitoring (tcpdump) jobs")

    # let the wireless network settle
    settle_wireless_job = PrintJob(
        "Let the wireless network settle",
        sleep=settle_delay,
        scheduler=scheduler,
        required=run_protocol,
        label="settling for {} sec".format(settle_delay))

    green_light_experiment = settle_wireless_job

    if warmup:
        warmup_pings_job = [
            SshJob(
                node=nodei,
                # required=green_light_experiment,
                label="warmup ping {} -> {}".format(i, j),
                verbose=verbose_jobs,
                commands=[
                    Run("echo {} '->' {}".format(i, j),
                        label="ping {} '->' {}".format(i, j)),
                    RunScript("node-utilities.sh", "my-ping",
                              "10.0.0.{}".format(j), ping_timeout,
                              ping_interval, ping_size, ping_number,
                              label="")
                ],
                # keep_connection = True
            )
            # loop on the selected experiment nodes
            for e in exp_ids
            # on the sources (to get the correct sshnodes)
            for i, nodei in node_index.items()
            # and on the destinations
            for j, nodej in node_index.items()
            # keep only sources among the selected experiment nodes,
            # drop destinations equal to the source, and drop couples
            # that have already been done in the other direction
            if (i == e) and e != j and not (j in exp_ids and j < e)
        ]
        warmup_pings = Scheduler(
            Sequence(*warmup_pings_job),
            scheduler=scheduler,
            required=green_light_experiment,
            # critical=True,
            verbose=verbose_jobs,
            label="Warmup pings")
        settle_wireless_job2 = PrintJob(
            "Let the wireless network settle",
            sleep=settle_delay / 2,
            scheduler=scheduler,
            required=warmup_pings,
            label="settling-warmup for {} sec".format(settle_delay / 2))

        green_light_experiment = settle_wireless_job2

    ##########
    # create all the tracepath jobs from the first node in the list
    #
    if map:
        routes_job = [
            SshJob(
                node=nodei,
                # scheduler=scheduler,
                # required=green_light_experiment,
                label="Generating ROUTE file for prot {} on node {}"
                      .format(protocol, i),
                verbose=verbose_jobs,
                commands=[
                    RunScript("node-utilities.sh", "route-{}".format(protocol),
                              ">", "ROUTE-TABLE-{:02d}".format(i),
                              label="get route table"),
                    Pull(remotepaths="ROUTE-TABLE-{:02d}".format(i),
                         localpath=str(run_root), label="")
                ],
                # keep_connection = True
            )
            for i, nodei in node_index.items()
        ]
        routes = Scheduler(
            *routes_job,
            scheduler=scheduler,
            required=green_light_experiment,
            # critical=True,
            verbose=verbose_jobs,
            label="Snapshotting route files")
        green_light_experiment = routes

    if route_sampling:
        routes_sampling_job2 = [
            SshJob(
                node=nodei,
                label="Route sampling service for prot {} on node {}"
                      .format(protocol, i),
                verbose=False,
                # forever=True,
                commands=[
                    Push(localpaths=["route_sample_service.sh"],
                         remotepath=".", label=""),
                    Run("source", "route_sample_service.sh;",
                        "route-sample",
                        "ROUTE-TABLE-{:02d}-SAMPLED".format(i),
                        "{}".format(protocol),
                        label="run route sampling service"),
                ],
                # keep_connection = True
            )
            for i, nodei in node_index.items()
        ]
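
        # note: routes_sampling_job2 above is never attached to a scheduler,
        # so those jobs do not run; only the forever variant built below is
        # used. a forever job is not waited for - the scheduler cancels it
        # once all regular jobs are done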
"route-sample", "ROUTE-TABLE-{:02d}-SAMPLED".format(i), "{}".format(protocol), label="run route sampling service"), ], #keep_connection = True ) for i, nodei in node_index.items() ] routes_sampling = Scheduler( *routes_sampling_job, scheduler=scheduler, verbose=False, forever=True, #critical = True, label="Route Sampling services launch", required=green_light_experiment) ########## # create all the ping jobs, i.e. max*(max-1)/2 # this again is a python list comprehension # see the 2 for instructions at the bottom # # notice that these SshJob instances are not yet added # to the scheduler, we will add them later on # depending on the sequential/parallel strategy pings_job = [ SshJob( node=nodei, #required=green_light_experiment, label="ping {} -> {}".format(i, j), verbose=verbose_jobs, commands=[ Run("echo {} '->' {}".format(i, j), label="ping {}'->' {}".format(i, j)), RunScript("node-utilities.sh", "my-ping", "10.0.0.{}".format(j), ping_timeout, ping_interval, ping_size, ping_number, ">", "PING-{:02d}-{:02d}".format(i, j), label=""), Pull(remotepaths="PING-{:02d}-{:02d}".format(i, j), localpath=str(run_root), label=""), ], #keep_connection = True ) #for each selected experiment nodes for e in exp_ids # looping on the source (to get the correct sshnodes) for i, nodei in node_index.items() # and on the destination for j in dest_ids # and keep only sources that are in the selected experiment nodes and remove destination that are themselves # and remove the couples that have already be done if (i == e) and e != j and not (j in exp_ids and j < e) ] pings = Scheduler( scheduler=scheduler, label="PINGS", #critical = True, verbose=verbose_jobs, required=green_light_experiment) # retrieve all pcap files from fit nodes stop_protocol_job = [ SshJob( #scheduler=scheduler, node=nodei, #required=pings, label="kill routing protocol on fit{:02d}".format(i), verbose=verbose_jobs, #critical = True, commands=[ RunScript("node-utilities.sh", "kill-{}".format(protocol), label="kill-{}".format(protocol)), ], #keep_connection = False ) for i, nodei in node_index.items() ] stop_protocol = Scheduler( *stop_protocol_job, scheduler=scheduler, required=pings, #critical = True, label="Stop routing protocols", ) if tshark: retrieve_tcpdump_job = [ SshJob( #scheduler=scheduler, node=nodei, #required=pings, label="retrieve pcap trace from fit{:02d}".format(i), verbose=verbose_jobs, #critical = True, commands=[ # RunScript("node-utilities.sh", "kill-{}".format(protocol), label = "kill-{}".format(protocol)), RunScript("node-utilities.sh", "kill-tcpdump", label="kill-tcpdump"), #Run("sleep 1"), Run("echo retrieving pcap trace and result-{i}.txt from fit{i:02d}" .format(i=i), label=""), Pull(remotepaths=["/tmp/fit{}.pcap".format(i)], localpath=str(run_root), label=""), ], #keep_connection = True ) for i, nodei in node_index.items() ] retrieve_tcpdump = Scheduler( *retrieve_tcpdump_job, scheduler=scheduler, required=pings, #critical = True, label="Retrieve tcpdump", ) if route_sampling: retrieve_sampling_job = [ SshJob( #scheduler=scheduler, node=nodei, #required=pings, label="retrieve sampling trace from fit{:02d}".format(i), verbose=verbose_jobs, #critical = True, commands=[ #RunScript("node-utilities.sh", "kill-route-sample", protocol, # label = "kill route sample"), RunScript("route_sample_service.sh", "kill-route-sample", label="kill route sample"), Run("echo retrieving sampling trace from fit{i:02d}". 

    # stop the routing protocols on fit nodes
    stop_protocol_job = [
        SshJob(
            # scheduler=scheduler,
            node=nodei,
            # required=pings,
            label="kill routing protocol on fit{:02d}".format(i),
            verbose=verbose_jobs,
            # critical=True,
            commands=[
                RunScript("node-utilities.sh", "kill-{}".format(protocol),
                          label="kill-{}".format(protocol)),
            ],
            # keep_connection = False
        )
        for i, nodei in node_index.items()
    ]
    stop_protocol = Scheduler(
        *stop_protocol_job,
        scheduler=scheduler,
        required=pings,
        # critical=True,
        label="Stop routing protocols",
    )

    # retrieve all pcap files from fit nodes
    if tshark:
        retrieve_tcpdump_job = [
            SshJob(
                # scheduler=scheduler,
                node=nodei,
                # required=pings,
                label="retrieve pcap trace from fit{:02d}".format(i),
                verbose=verbose_jobs,
                # critical=True,
                commands=[
                    # RunScript("node-utilities.sh", "kill-{}".format(protocol),
                    #           label="kill-{}".format(protocol)),
                    RunScript("node-utilities.sh", "kill-tcpdump",
                              label="kill-tcpdump"),
                    # Run("sleep 1"),
                    Run("echo retrieving pcap trace and result-{i}.txt from fit{i:02d}"
                        .format(i=i), label=""),
                    Pull(remotepaths=["/tmp/fit{}.pcap".format(i)],
                         localpath=str(run_root), label=""),
                ],
                # keep_connection = True
            )
            for i, nodei in node_index.items()
        ]
        retrieve_tcpdump = Scheduler(
            *retrieve_tcpdump_job,
            scheduler=scheduler,
            required=pings,
            # critical=True,
            label="Retrieve tcpdump",
        )

    if route_sampling:
        retrieve_sampling_job = [
            SshJob(
                # scheduler=scheduler,
                node=nodei,
                # required=pings,
                label="retrieve sampling trace from fit{:02d}".format(i),
                verbose=verbose_jobs,
                # critical=True,
                commands=[
                    # RunScript("node-utilities.sh", "kill-route-sample", protocol,
                    #           label="kill route sample"),
                    RunScript("route_sample_service.sh", "kill-route-sample",
                              label="kill route sample"),
                    Run("echo retrieving sampling trace from fit{i:02d}"
                        .format(i=i), label=""),
                    Pull(remotepaths=["ROUTE-TABLE-{:02d}-SAMPLED".format(i)],
                         localpath=str(run_root), label=""),
                ],
                # keep_connection = True
            )
            for i, nodei in node_index.items()
        ]
        retrieve_sampling = Scheduler(
            *retrieve_sampling_job,
            scheduler=scheduler,
            required=pings,
            # critical=True,
            verbose=verbose_jobs,
            label="Retrieve & stop route sampling",
        )

    if tshark:
        parse_pcaps_job = [
            SshJob(
                # scheduler=scheduler,
                node=LocalNode(),
                # required=retrieve_tcpdump,
                label="parse pcap trace {path}/fit{node}.pcap"
                      .format(path=run_root, node=i),
                verbose=verbose_jobs,
                # commands=[RunScript("parsepcap.sh", run_root, i)]
                commands=[
                    Run("tshark", "-2", "-r",
                        "{path}/fit{node}.pcap".format(path=run_root, node=i),
                        "-R",
                        "'(ip.dst==10.0.0.{node} && icmp) && radiotap.dbm_antsignal'"
                        .format(node=i),
                        "-Tfields",
                        "-e", "'ip.src'",
                        "-e", "'ip.dst'",
                        "-e", "'radiotap.dbm_antsignal'",
                        ">",
                        "{path}/result-{node}.txt".format(path=run_root, node=i),
                        label="parse pcap locally")
                ],
                # keep_connection = True
            )
            for i in node_ids
        ]
        parse_pcaps = Scheduler(
            *parse_pcaps_job,
            scheduler=scheduler,
            required=retrieve_tcpdump,
            # critical=True,
            label="Parse pcap",
        )

    # TODO: turn off the USRP
    if interference != "None":
        kill_uhd_siggen = SshJob(
            scheduler=scheduler,
            node=node_scrambler,
            required=pings,
            label="killing uhd_siggen on the scrambler node {}"
                  .format(scrambler_id),
            verbose=verbose_jobs,
            # critical=True,
            commands=[Run("pkill", "uhd_siggen")],
            # keep_connection = True
        )
        kill_2_uhd_siggen = SshJob(
            scheduler=scheduler,
            node=faraday,
            required=kill_uhd_siggen,
            label="turning off usrp on the scrambler node {}"
                  .format(scrambler_id),
            verbose=verbose_jobs,
            commands=[
                Run("rhubarbe", "usrpoff", "fit{}".format(scrambler_id))
            ],
            # keep_connection = True
        )

    # if map:
    #     scheduler.add(Sequence(*tracepaths, scheduler=scheduler))
    # if warmup:
    #     scheduler.add(Sequence(*warmup_pings_job, scheduler=scheduler))
    pings.add(Sequence(*pings_job))
    # for running sequentially we impose no limit on the scheduler,
    # it is limited anyway by the very structure of the requirements graph
    # jobs_window = None

    if dry_run:
        scheduler.export_as_pngfile(run_root / "experiment_graph")
        return True

    # not in dry-run mode - let's proceed with the actual experiment
    ok = scheduler.orchestrate()  # jobs_window=jobs_window
    scheduler.shutdown()
    dot_file = run_root / "experiment_graph"
    if not dot_file.is_file():
        scheduler.export_as_dotfile(dot_file)
        # TODO: is this necessary? users who want a png can produce it themselves
        # call(["dot", "-Tpng", dot_file, "-o", run_root / "experiment_graph.png"])
    # ok = True
    # ok = False

    # give details if it failed
    if not ok:
        scheduler.debrief()
        scheduler.export_as_dotfile("debug")
    if ok and map:
        print("Creation of ROUTES files")
        post_processor = ProcessRoutes(run_root, exp_ids, node_ids)
        post_processor.run()
    if ok and route_sampling:
        post_processor = ProcessRoutes(run_root, exp_ids, node_ids)
        post_processor.run_sampled()
        print("END of creation for ROUTES FILES")
    # data acquisition is done, let's aggregate results
    # i.e. compute averages
    if ok and tshark:
        post_processor = Aggregator(run_root, node_ids, antenna_mask)
        post_processor.run()

    return ok
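
# a hypothetical invocation, for illustration only - the slice name and node
# ids below are made up and would require a valid reservation on the testbed:
#
#     one_run(tx_power=5, phy_rate=1, antenna_mask=1, channel=1,
#             interference="None", protocol="batman",
#             run_name="check-batman", slicename="inria_admin",
#             node_ids=[1, 4, 5], exp=[1], dest=[1, 4, 5],
#             warmup=True, tshark=False, map=False)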