def run(self, num_run):
    """Execute *num_run* benchmark experiments on this cluster.

    Initializes the shared configuration (fresh ``out_folders``, the
    ``delete_hdfs`` flag, total run count), then runs the benchmark once
    per iteration, refreshing the node list before each run.  After the
    first successful iteration ``delete_hdfs`` is flipped to ``'false'``
    so later iterations reuse the HDFS data instead of re-ingesting it.

    :param num_run: number of benchmark iterations to execute
    """
    with utils.open_cfg(mode='w') as cfg:
        cfg['out_folders'] = {}
        # Respect a delete_hdfs value already present in the config;
        # only default it to 'true' when unset.
        if 'delete_hdfs' not in cfg['main']:
            cfg['main']['delete_hdfs'] = 'true'
        cfg['main']['num_run'] = str(num_run)
    for i in range(num_run):
        # NOTE(review): only the 'spark' cluster runs experiments here;
        # other cluster ids fall through without any action.
        if self.cluster_id == c.CLUSTER_MAP['spark']:
            print(bold('Experiment ({}/{})'.format(i + 1, num_run)))
            try:
                self.retrieve_nodes()
                with utils.open_cfg(mode='w') as cfg:
                    cfg['main']['iter_num'] = str(i + 1)
                x_run.run_benchmark(self.nodes)
                if i == 0:
                    # Keep the ingested HDFS data for the remaining runs.
                    with utils.open_cfg(mode='w') as cfg:
                        cfg['main']['delete_hdfs'] = 'false'
            except (OSError, IOError) as exc:
                print('ERROR: {}\n\nSkipping Experiment ({}/{})'.format(
                    exc, i + 1, num_run))
def run_disabled(self, num_run):
    """Run *num_run* benchmark experiments (disabled variant of ``run``).

    Resets the configured output folders, forces ``delete_hdfs`` to
    ``'true'`` for the first iteration, and clears it after the first
    run so subsequent iterations reuse the HDFS data.

    :param num_run: number of benchmark iterations to execute
    """
    with utils.open_cfg(mode='w') as cfg:
        cfg['out_folders'] = {}
        cfg['main']['delete_hdfs'] = 'true'
    for iteration in range(num_run):
        # Guard clause: only the 'spark' cluster executes experiments.
        if self.cluster_id != c.CLUSTER_MAP['spark']:
            continue
        print(bold('Experiment ({}/{})'.format(iteration + 1, num_run)))
        try:
            self.retrieve_nodes()
            x_run.run_benchmark(self.nodes)
            if iteration == 0:
                # First run ingested the data; keep it from now on.
                with utils.open_cfg(mode='w') as cfg:
                    cfg['main']['delete_hdfs'] = 'false'
        except (OSError, IOError) as exc:
            print('ERROR: {}\n\nSkipping Experiment ({}/{})'.format(
                exc, iteration + 1, num_run))
def _list_cluster_nodes(driver, cluster_id):
    """Return the running libcloud nodes tagged with *cluster_id* for PROVIDER."""
    if PROVIDER == "AWS_SPOT":
        nodes = driver.list_nodes(
            ex_filters={'instance-state-name': ['running']})
        # BUG FIX: the original reboot branch filtered with
        # driver.ex_describe_tags(node) — a stale variable from an earlier
        # loop — instead of the comprehension variable n.
        return [
            n for n in nodes
            if driver.ex_describe_tags(n)['Value'] == cluster_id
        ]
    elif PROVIDER == "AZURE":
        nodes = driver.list_nodes(
            ex_resource_group=CONFIG_DICT["Azure"]["ResourceGroup"])
        return [
            n for n in nodes
            if n.extra["tags"]["ClusterId"] == cluster_id
        ]
    return []


def run_xspark(current_cluster, num_instance=NUM_INSTANCE, num_run=NUM_RUN,
               cluster_id=CLUSTER_ID, terminate=TERMINATE, run=RUN,
               reboot=REBOOT, assume_yes=False):
    """ Main function;
    * Launch spot request of NUMINSTANCE
    * Run Benchmark
    * Download Log
    * Plot data from log

    :param current_cluster: name of the cluster recorded in the config file
    :param num_instance: number of new instances to launch (0 = reuse existing)
    :param num_run: number of benchmark runs to execute when *run* is set
    :param cluster_id: tag value identifying the nodes of this cluster
    :param terminate: destroy all cluster nodes (and cancel spot requests)
    :param run: execute the benchmark
    :param reboot: reboot the cluster nodes before running
    :param assume_yes: skip interactive confirmation during launch
    """
    print(
        header(
            'run_xspark(num_instance={}, num_run={}, cluster_id={},terminate={}, run={}, reboot={})'
            .format(num_instance, num_run, cluster_id, terminate, run,
                    reboot)))

    # Record which cluster the rest of the tooling should operate on.
    cfg = utils.get_cfg()
    cfg['main'] = {}
    cfg.set('main', 'current_cluster', current_cluster)
    utils.write_cfg(cfg)

    # Build the libcloud driver for the configured provider.
    if PROVIDER == "AWS_SPOT":
        set_spot_drivers()
        cls = get_driver("ec2_spot_" + REGION.replace('-', '_'))
        driver = cls(AWS_ACCESS_ID, AWS_SECRET_KEY)
    elif PROVIDER == "AZURE":
        set_azurearm_driver()
        cls = get_driver("CustomAzureArm")
        driver = cls(tenant_id=AZ_TENANT_ID,
                     subscription_id=AZ_SUBSCRIPTION_ID,
                     key=AZ_APPLICATION_ID, secret=AZ_SECRET,
                     region=CONFIG_DICT["Azure"]["Location"])
    else:
        print("Unsupported provider", PROVIDER)
        return

    if num_instance > 0:
        # Create nodes
        if PROVIDER == "AWS_SPOT":
            nodes, spot_requests = launch.launch_libcloud(
                driver, num_instance, CONFIG_DICT, cluster_id, assume_yes)
        if PROVIDER == "AZURE":
            nodes = launch.launch_libcloud(driver, num_instance, CONFIG_DICT,
                                           cluster_id, assume_yes)
        # nodes is a list of "libcloud.compute.base.Node"
        print("CHECK SECURITY GROUP ALLOWED IP SETTINGS!!!")

        # Tag nodes so later list/filter calls can find this cluster.
        if PROVIDER == "AWS_SPOT":
            for node in nodes:
                driver.ex_create_tags(node, TAG[0])
        elif PROVIDER == "AZURE":
            for node in nodes:
                driver.ex_create_tags(
                    node, {"ClusterId": cluster_id
                           })  # was CONFIG_DICT["Azure"]["ClusterId"]

        instance_ids = [n.id for n in nodes]

        # Wait for all the nodes to become RUNNNING
        print("Waiting for nodes to run")
        launch.wait_for_running_libcloud(driver, instance_ids,
                                         copy.deepcopy(instance_ids))
        time.sleep(15)

        # Wait for all the nodes to be pingable
        print("Waiting for nodes to be pingable")
        launch.wait_ping_libcloud(driver, instance_ids,
                                  copy.deepcopy(instance_ids))

    if reboot:
        print("Rebooting instances...")
        # Retrieve running nodes of this cluster.
        nodes = _list_cluster_nodes(driver, cluster_id)
        # Reboot nodes
        for node in nodes:
            driver.reboot_node(node)
        # Wait for all the nodes to be pingable
        instance_ids = [n.id for n in nodes]
        launch.wait_ping_libcloud(driver, instance_ids,
                                  copy.deepcopy(instance_ids))

    if run:
        for i in range(num_run):
            # Re-list every iteration in case cluster membership changed.
            nodes = _list_cluster_nodes(driver, cluster_id)
            # nodes is a list of "libcloud.compute.base.Node"
            print("Found {} nodes".format(len(nodes)))
            x_run.run_benchmark(nodes)

    if terminate:
        print("Begin termination of instances and cleaning")
        # Cancel Spot Request
        if PROVIDER == "AWS_SPOT" and num_instance > 0:
            for s in spot_requests:
                driver.ex_cancel_spot_instance_request(s)
            print("Spot requests cancelled")
        ###################################################
        # Retrieve running nodes
        nodes = _list_cluster_nodes(driver, cluster_id)
        print("Found {} nodes".format(len(nodes)))
        # nodes is a list of "libcloud.compute.base.Node"
        # Destroy all nodes
        print("Destroying nodes")
        for node in nodes:
            driver.destroy_node(node)
        print(okgreen("All nodes destroyed"))
def main():
    """ Main function;
    * Launch spot request of NUMINSTANCE
    * Run Benchmark
    * Download Log
    * Plot data from log
    """
    session = boto3.Session(profile_name=CREDENTIAL_PROFILE)
    client = session.client('ec2', region_name=REGION)

    # Defined here so the TERMINATE branch never hits a NameError when
    # no new instances were launched in this invocation.
    spot_request_ids = []

    if NUM_INSTANCE > 0:
        spot_request_ids = launch.launch(client, NUM_INSTANCE, CONFIG_DICT)
        print("CHECK SECURITY GROUP ALLOWED IP SETTINGS!!!")

        # Wait for our spots to fulfill
        launch.wait_for_fulfillment(client, spot_request_ids,
                                    copy.deepcopy(spot_request_ids))
        spot_instance_response = client.describe_spot_instance_requests(
            SpotInstanceRequestIds=spot_request_ids)
        instance_ids = [
            result["InstanceId"]
            for result in spot_instance_response["SpotInstanceRequests"]
        ]
        client.create_tags(Resources=instance_ids, Tags=TAG)

        # Wait Running
        launch.wait_for_running(client, instance_ids,
                                copy.deepcopy(instance_ids))
        time.sleep(15)
        launch.wait_ping(client, instance_ids, copy.deepcopy(instance_ids))

    if REBOOT:
        print("Rebooting instances...")
        ec2 = session.resource('ec2', region_name=REGION)
        instances = ec2.instances.filter(Filters=[{
            'Name': 'instance-state-name',
            'Values': ['running']
        }, {
            'Name': 'tag:ClusterId',
            'Values': [CLUSTER_ID]
        }])
        instance_ids = [x.id for x in instances]
        client.reboot_instances(InstanceIds=instance_ids)
        launch.wait_ping(client, instance_ids, copy.deepcopy(instance_ids))

    if RUN:
        for _ in range(NUM_RUN):
            run.run_benchmark()

    if TERMINATE:
        # BUG FIX: the EC2 *client* has no .instances collection — the
        # original `client.instances.filter(...)` raised AttributeError.
        # Use the resource API, as the REBOOT branch already does.
        ec2 = session.resource('ec2', region_name=REGION)
        instances = ec2.instances.filter(Filters=[{
            'Name': 'instance-state-name',
            'Values': ['running']
        }, {
            'Name': 'tag:ClusterId',
            'Values': [CLUSTER_ID]
        }])
        instance_ids = [x.id for x in instances]
        # TODO get spot_request_ids for requests created in earlier runs;
        # currently only requests launched in this invocation are cancelled.
        launch.terminate(client, spot_request_ids, instance_ids)