Example #1
0
 def run(self, num_run):
     """Run the benchmark ``num_run`` times against the current cluster.

     Resets the shared config's output-folder map and seeds per-run settings,
     then executes the experiments one by one. HDFS deletion is enabled for
     the first iteration only and switched off after it, so later iterations
     reuse the data already loaded.

     :param num_run: number of benchmark iterations to execute
     """
     with utils.open_cfg(mode='w') as cfg:
         cfg['out_folders'] = {}
         # Default to deleting HDFS data unless a previous run already chose.
         if 'delete_hdfs' not in cfg['main']:
             cfg['main']['delete_hdfs'] = 'true'
         cfg['main']['num_run'] = str(num_run)
     for i in range(num_run):
         if self.cluster_id == c.CLUSTER_MAP['spark']:
             print(bold('Experiment ({}/{})'.format(i + 1, num_run)))
         try:
             self.retrieve_nodes()
             with utils.open_cfg(mode='w') as cfg:
                 cfg['main']['iter_num'] = str(i + 1)
             x_run.run_benchmark(self.nodes)
             if i == 0:
                 # First run has populated HDFS; keep the data from now on.
                 with utils.open_cfg(mode='w') as cfg:
                     cfg['main']['delete_hdfs'] = 'false'
         except (OSError, IOError) as exc:
             # Best-effort: report and move on to the next experiment.
             print('ERROR: {}\n\nSkipping Experiment ({}/{})'.format(
                 exc, i + 1, num_run))
Example #2
0
 def run_disabled(self, num_run):
     """Execute ``num_run`` benchmark iterations (simplified variant).

     Clears the output-folder map, forces HDFS deletion for the first
     iteration, and disables it once that first iteration has completed.

     :param num_run: number of benchmark iterations to execute
     """
     with utils.open_cfg(mode='w') as config:
         config['out_folders'] = {}
         config['main']['delete_hdfs'] = 'true'
     for run_idx in range(num_run):
         attempt = run_idx + 1
         if self.cluster_id == c.CLUSTER_MAP['spark']:
             print(bold('Experiment ({}/{})'.format(attempt, num_run)))
         try:
             self.retrieve_nodes()
             x_run.run_benchmark(self.nodes)
             if run_idx == 0:
                 # Data is loaded after the first pass; stop wiping HDFS.
                 with utils.open_cfg(mode='w') as config:
                     config['main']['delete_hdfs'] = 'false'
         except (OSError, IOError) as exc:
             print('ERROR: {}\n\nSkipping Experiment ({}/{})'.format(
                 exc, attempt, num_run))
Example #3
0
def run_xspark(current_cluster,
               num_instance=NUM_INSTANCE,
               num_run=NUM_RUN,
               cluster_id=CLUSTER_ID,
               terminate=TERMINATE,
               run=RUN,
               reboot=REBOOT,
               assume_yes=False):
    """ Main function;
    * Launch spot request of NUMINSTANCE
    * Run Benchmark
    * Download Log
    * Plot data from log

    :param current_cluster: cluster name written into the shared config
    :param num_instance: number of instances to launch (0 = reuse existing)
    :param num_run: number of benchmark iterations when `run` is truthy
    :param cluster_id: tag value used to select this cluster's nodes
    :param terminate: when truthy, destroy the cluster's nodes at the end
    :param run: when truthy, execute the benchmark `num_run` times
    :param reboot: when truthy, reboot the cluster's nodes first
    :param assume_yes: forwarded to launch.launch_libcloud (skip prompts)
    """
    print(
        header(
            'run_xspark(num_instance={}, num_run={}, cluster_id={},terminate={}, run={}, reboot={})'
            .format(num_instance, num_run, cluster_id, terminate, run,
                    reboot)))
    # Record the active cluster in the shared config for other modules.
    cfg = utils.get_cfg()
    cfg['main'] = {}
    cfg.set('main', 'current_cluster', current_cluster)
    utils.write_cfg(cfg)

    if PROVIDER == "AWS_SPOT":
        set_spot_drivers()
        cls = get_driver("ec2_spot_" + REGION.replace('-', '_'))
        driver = cls(AWS_ACCESS_ID, AWS_SECRET_KEY)
    elif PROVIDER == "AZURE":
        set_azurearm_driver()
        cls = get_driver("CustomAzureArm")
        driver = cls(tenant_id=AZ_TENANT_ID,
                     subscription_id=AZ_SUBSCRIPTION_ID,
                     key=AZ_APPLICATION_ID,
                     secret=AZ_SECRET,
                     region=CONFIG_DICT["Azure"]["Location"])

    else:
        print("Unsupported provider", PROVIDER)
        return

    if num_instance > 0:

        # Create nodes; AWS spot launches also return the spot requests so
        # they can be cancelled at termination time.
        if PROVIDER == "AWS_SPOT":
            nodes, spot_requests = launch.launch_libcloud(
                driver, num_instance, CONFIG_DICT, cluster_id, assume_yes)

        if PROVIDER == "AZURE":
            nodes = launch.launch_libcloud(driver, num_instance, CONFIG_DICT,
                                           cluster_id, assume_yes)

        # nodes is a list of "libcloud.compute.base.Node"

        print("CHECK SECURITY GROUP ALLOWED IP SETTINGS!!!")

        # Tag nodes so they can be selected by cluster_id later on.
        if PROVIDER == "AWS_SPOT":
            for node in nodes:
                driver.ex_create_tags(node, TAG[0])
        elif PROVIDER == "AZURE":
            for node in nodes:
                driver.ex_create_tags(
                    node, {"ClusterId": cluster_id
                           })  # was CONFIG_DICT["Azure"]["ClusterId"]

        instance_ids = [n.id for n in nodes]

        # Wait for all the nodes to become RUNNING
        print("Waiting for nodes to run")
        launch.wait_for_running_libcloud(driver, instance_ids,
                                         copy.deepcopy(instance_ids))

        time.sleep(15)

        # Wait for all the nodes to be pingable
        print("Waiting for nodes to be pingable")
        launch.wait_ping_libcloud(driver, instance_ids,
                                  copy.deepcopy(instance_ids))

    if reboot:
        print("Rebooting instances...")

        # Retrieve running nodes belonging to this cluster.
        if PROVIDER == "AWS_SPOT":
            nodes = driver.list_nodes(
                ex_filters={'instance-state-name': ['running']})
            # BUG FIX: the comprehension previously called
            # ex_describe_tags(node) with a stale/undefined name instead of
            # the comprehension variable `n` (compare the identical filters
            # in the run/terminate sections below).
            nodes = [
                n for n in nodes
                if driver.ex_describe_tags(n)['Value'] == cluster_id
            ]
        elif PROVIDER == "AZURE":
            nodes = driver.list_nodes(
                ex_resource_group=CONFIG_DICT["Azure"]["ResourceGroup"])
            nodes = [
                n for n in nodes if n.extra["tags"]["ClusterId"] == cluster_id
            ]

        # Reboot nodes
        for node in nodes:
            driver.reboot_node(node)

        # Wait for all the nodes to be pingable
        instance_ids = [n.id for n in nodes]
        launch.wait_ping_libcloud(driver, instance_ids,
                                  copy.deepcopy(instance_ids))

    if run:
        for i in range(num_run):
            # Re-list nodes every iteration: membership can change between
            # runs (spot reclamation, manual intervention).
            if PROVIDER == "AWS_SPOT":
                nodes = driver.list_nodes(
                    ex_filters={'instance-state-name': ['running']})
                nodes = [
                    n for n in nodes
                    if driver.ex_describe_tags(n)['Value'] == cluster_id
                ]
            elif PROVIDER == "AZURE":
                nodes = driver.list_nodes(
                    ex_resource_group=CONFIG_DICT["Azure"]["ResourceGroup"])
                nodes = [
                    n for n in nodes
                    if n.extra["tags"]["ClusterId"] == cluster_id
                ]

            # nodes is a list of "libcloud.compute.base.Node"
            print("Found {} nodes".format(len(nodes)))

            x_run.run_benchmark(nodes)

    if terminate:
        print("Begin termination of instances and cleaning")

        # Cancel Spot Request (only exists when we launched instances above)
        if PROVIDER == "AWS_SPOT" and num_instance > 0:
            for s in spot_requests:
                driver.ex_cancel_spot_instance_request(s)
            print("Spot requests cancelled")

        ###################################################

        # Retrieve running nodes
        if PROVIDER == "AWS_SPOT":
            nodes = driver.list_nodes(
                ex_filters={'instance-state-name': ['running']})
            nodes = [
                n for n in nodes
                if driver.ex_describe_tags(n)['Value'] == cluster_id
            ]
        elif PROVIDER == "AZURE":
            nodes = driver.list_nodes(
                ex_resource_group=CONFIG_DICT["Azure"]["ResourceGroup"])
            nodes = [
                n for n in nodes if n.extra["tags"]["ClusterId"] == cluster_id
            ]
        print("Found {} nodes".format(len(nodes)))

        # nodes is a list of "libcloud.compute.base.Node"

        # Destroy all nodes
        print("Destroying nodes")
        for node in nodes:
            driver.destroy_node(node)

        print(okgreen("All nodes destroyed"))
Example #4
0
def main():
    """ Main function;
    * Launch spot request of NUMINSTANCE
    * Run Benchmark
    * Download Log
    * Plot data from log
    """
    session = boto3.Session(profile_name=CREDENTIAL_PROFILE)
    client = session.client('ec2', region_name=REGION)

    # Defined here so the TERMINATE branch does not hit a NameError when
    # NUM_INSTANCE == 0 (no spot requests were created this invocation).
    spot_request_ids = []

    if NUM_INSTANCE > 0:
        spot_request_ids = launch.launch(client, NUM_INSTANCE, CONFIG_DICT)

        print("CHECK SECURITY GROUP ALLOWED IP SETTINGS!!!")

        # Wait for our spots to fulfill
        launch.wait_for_fulfillment(client, spot_request_ids,
                                    copy.deepcopy(spot_request_ids))

        spot_instance_response = client.describe_spot_instance_requests(
            SpotInstanceRequestIds=spot_request_ids)
        instance_ids = [
            result["InstanceId"]
            for result in spot_instance_response["SpotInstanceRequests"]
        ]

        client.create_tags(Resources=instance_ids, Tags=TAG)

        # Wait Running
        launch.wait_for_running(client, instance_ids,
                                copy.deepcopy(instance_ids))

        time.sleep(15)

        launch.wait_ping(client, instance_ids, copy.deepcopy(instance_ids))

    if REBOOT:
        print("Rebooting instances...")
        session = boto3.Session(profile_name=CREDENTIAL_PROFILE)
        ec2 = session.resource('ec2', region_name=REGION)
        instances = ec2.instances.filter(Filters=[{
            'Name': 'instance-state-name',
            'Values': ['running']
        }, {
            'Name': 'tag:ClusterId',
            'Values': [CLUSTER_ID]
        }])
        instance_ids = [x.id for x in instances]
        client.reboot_instances(InstanceIds=instance_ids)
        launch.wait_ping(client, instance_ids, copy.deepcopy(instance_ids))

    if RUN:
        for i in range(NUM_RUN):
            run.run_benchmark()

    if TERMINATE:
        # BUG FIX: a boto3 EC2 *client* has no `.instances` collection;
        # filtering instances requires the resource API, exactly as the
        # REBOOT branch above already does.
        ec2 = session.resource('ec2', region_name=REGION)
        instances = ec2.instances.filter(Filters=[{
            'Name': 'instance-state-name',
            'Values': ['running']
        }, {
            'Name': 'tag:ClusterId',
            'Values': [CLUSTER_ID]
        }])
        instance_ids = [x.id for x in instances]
        # TODO get spot_request_ids when this invocation did not launch them
        launch.terminate(client, spot_request_ids, instance_ids)